一个简单的 memcpy 循环，步长为 8-16-32 字节，没有 L1 缓存未命中，是什么导致后端周期停顿？

2024-6-20 • tag-icon

一个简单的 memcpy 循环，步长为 8-16-32 字节，没有 L1 缓存未命中，是什么导致后端周期停顿？

我试图了解单生产者单消费者队列算法中的CPU缓存性能，但在某些情况下无法查明性能下降的原因。以下简化的测试程序运行时几乎没有 L1 缓存未命中，但当其内存访问模式有些稀疏时，它会在 CPU 后端花费许多周期。在几乎没有 L1 未命中的情况下，是什么导致 CPU 后端在这种情况下停止？应该测量什么来查明原因？

我知道这个问题与其说是关于Linux或perf_event，不如说是关于CPU和缓存的架构。什么是更合适的堆栈交换？ Stackoverflow 更注重软件？ Serverfault 或 Superuser 也不针对这些主题。 Electronics stackexchange 并不完全针对 CPU 架构。虽然，根据这个元数据来自 2013 年，电子产品可能是最合适的。我将其发布在这里只是因为所有测试都是在 Linux 上完成的，并且根据经验，我认为有些专家可能知道这里发生了什么，例如 Gilles。也许，最好的办法是将其发布到 AMD 论坛上。但我无法在那里发布我的草稿，因为当我没有实际发布帖子时，出现错误：“检测到帖子泛滥（用户试图在 600 秒内发布 2 条以上的消息）”。难怪他们的论坛这么安静。

我的CPU是AMD Ryzen 5 PRO 4650G，它是Zen 2“Renoir”，具有192KiB L1d缓存。测试memcpy程序：

// demo_memcpy_test_speed-gap.c
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <cstring>

#define PACKET_SIZE 8 // 16 32 // <--- it is really the stride of the memcpy over the mem array
#define SIZE_TO_MEMCPY 8 // memcpy only the first 8 bytes of the "packet"

const static long long unsigned n_packets = 512; // use few packets, to fit in L2 etc
static long long unsigned repeat    = 1000*1000 * 2 * 2; // repeat many times to get enough stats in perf

const static long long unsigned n_max_data_bytes = n_packets * PACKET_SIZE;

#define CACHE_LINE_SIZE 64 // align explicitly just in case
alignas(CACHE_LINE_SIZE) uint8_t data_in  [n_max_data_bytes];
alignas(CACHE_LINE_SIZE) uint8_t data_out [n_packets][PACKET_SIZE];

int main(int argc, char* argv[])
{
printf("memcpy_test.c standard\n");
printf("PACKET_SIZE %d   SIZE_TO_MEMCPY %d\n", PACKET_SIZE, SIZE_TO_MEMCPY);

//
// warmup the memory
// i.e. access the memory to make sure Linux has set up the virtual mem tables
...

{
printf("\nrun memcpy\n");

long long unsigned n_bytes_copied = 0;
long long unsigned memcpy_ops     = 0;
start_setup = clock();
for (unsigned rep=0; rep<repeat; rep++) {
    uint8_t* data_in_ptr  = &data_in [0];

    for (unsigned long long i_packet=0; i_packet<n_packets; i_packet++) {
        // copy only SIZE_TO_MEMCPY of the in data array to the out

        uint8_t* data_out_ptr = &(data_out [i_packet][0]);

        memcpy(data_out_ptr, data_in_ptr, SIZE_TO_MEMCPY*sizeof(uint8_t));
        memcpy_ops++;
        n_bytes_copied   += SIZE_TO_MEMCPY;

        data_in_ptr  += PACKET_SIZE;
    }
}
end_setup   = clock();
cpu_time_used_setup = ((double) (end_setup - start_setup)) / CLOCKS_PER_SEC;
printf("memcpy() took %f seconds to execute\n", cpu_time_used_setup);

printf("%f Mops\n",   memcpy_ops/(1000000*cpu_time_used_setup));
printf("%llu bytes\n",  n_bytes_copied);
printf("%f Mbytes/s\n", n_bytes_copied/(1000000*cpu_time_used_setup));
}

} // end of main

它的构建是-O1为了获得一个真正有效的循环memcpy：

g++ -g -O1 ./demo_memcpy_test_speed-gap.c

循环的指令memcpy，如注释选项所示perf record：

sudo perf record -F 999 -e stalled-cycles-backend -- ./a.out
sudo perf report
...select main

设置为PACKET_SIZE8 时，代码非常高效：

       │     for (unsigned long long i_packet=0; i_packet<n_packets; i_packet++) {
       │11b:┌─→mov      %rbx,%rax
       │    │memcpy():
       │11e:│  mov      (%rcx,%rax,8),%rdx
100.00 │    │  mov      %rdx,(%rsi,%rax,8)
       │    │main():
       │    │  add      $0x1,%rax
       │    │  cmp      $0x200,%rax
       │    │↑ jne      11e
       │    │for (unsigned rep=0; rep<repeat; rep++) {
       │    │  sub      $0x1,%edi
       │    └──jne      11b

设置PACKET_SIZE为 1024 时，代码与 256 相同，只不过add $0x100..改为0x400：

       │       lea      _end,%rsi
       │140:┌─→mov      %rbp,%rdx
       │    │
       │    │  lea      data_in,%rax
       │    │
       │    │__fortify_function void *
       │    │__NTH (memcpy (void *__restrict __dest, const void *__restrict __src,
       │    │size_t __len))
       │    │{
       │    │return __builtin___memcpy_chk (__dest, __src, __len,
       │14a:│  mov      (%rax),%rcx
       │    │memcpy():
 96.31 │    │  mov      %rcx,(%rdx)
       │    │
  1.81 │    │  add      $0x400,%rax
  0.20 │    │  add      $0x400,%rdx
  1.12 │    │  cmp      %rsi,%rax
  0.57 │    │↑ jne      14a
       │    │  sub      $0x1,%edi
       │    └──jne      140

我运行它时PACKET_SIZE设置为 8、16、32 和其他值。性能计数为 8 和 32：

sudo perf stat -e task-clock,instructions,cycles,stalled-cycles-frontend,stalled-cycles-backend \
      -e L1-dcache-loads,L1-dcache-load-misses,L1-dcache-prefetches \
      -e l2_cache_accesses_from_dc_misses,l2_cache_hits_from_dc_misses,l2_cache_misses_from_dc_misses \
      -- ./a.out

PACKET_SIZE 8   SIZE_TO_MEMCPY 8
...
 Performance counter stats for './a.out':

            503.43 msec task-clock                       #    0.998 CPUs utilized
    10,323,618,071      instructions                     #    4.79  insn per cycle
                                                  #    0.01  stalled cycles per insn     (29.11%)
     2,154,694,815      cycles                           #    4.280 GHz                         (29.91%)
         5,148,993      stalled-cycles-frontend          #    0.24% frontend cycles idle        (30.70%)
        55,922,538      stalled-cycles-backend           #    2.60% backend cycles idle         (30.99%)
     4,091,862,625      L1-dcache-loads                  #    8.128 G/sec                       (30.99%)
            24,211      L1-dcache-load-misses            #    0.00% of all L1-dcache accesses   (30.99%)
            18,745      L1-dcache-prefetches             #   37.234 K/sec                       (30.37%)
            30,749      l2_cache_accesses_from_dc_misses #   61.079 K/sec                       (29.57%)
            21,046      l2_cache_hits_from_dc_misses     #   41.805 K/sec                       (28.78%)
             9,095      l2_cache_misses_from_dc_misses   #   18.066 K/sec                       (28.60%)

PACKET_SIZE 32   SIZE_TO_MEMCPY 8
...
 Performance counter stats for './a.out':

            832.83 msec task-clock                       #    0.999 CPUs utilized
    12,289,501,297      instructions                     #    3.46  insn per cycle
                                                  #    0.11  stalled cycles per insn     (29.42%)
     3,549,297,932      cycles                           #    4.262 GHz                         (29.64%)
         5,552,837      stalled-cycles-frontend          #    0.16% frontend cycles idle        (30.12%)
     1,349,663,970      stalled-cycles-backend           #   38.03% backend cycles idle         (30.25%)
     4,144,875,512      L1-dcache-loads                  #    4.977 G/sec                       (30.25%)
           772,968      L1-dcache-load-misses            #    0.02% of all L1-dcache accesses   (30.24%)
           539,481      L1-dcache-prefetches             #  647.767 K/sec                       (30.25%)
           532,879      l2_cache_accesses_from_dc_misses #  639.839 K/sec                       (30.24%)
           461,131      l2_cache_hits_from_dc_misses     #  553.690 K/sec                       (30.04%)
            14,485      l2_cache_misses_from_dc_misses   #   17.392 K/sec                       (29.55%)

L1 缓存未命中略有增加：从 8 字节时的 0%PACKET_SIZE上升到 32 字节时的 0.02%。但它能解释为什么后端停滞从 2.6% 跃升至 38% 吗？如果不是，那么还有什么会导致 CPU 后端停顿？

我知道较大的步幅意味着memcpy循环会更快地从一个 L1 缓存行移动到另一个 L1 缓存行。但是，如果这些行已经在缓存中，并且实际上没有 L1 未命中事件（如所报告的那样）perf，则为什么访问不同的缓存行会导致后端停顿？与CPU如何并行发出指令有关吗？也许，它不能发出同时访问不同缓存行的指令？

PACKET_SIZE下面的图表显示了最多 1024 字节的运行情况：

数据点上的数字显示了PACKET_SIZE运行的参数，即访问模式的步幅memcpy。 X 轴是每秒数百万次操作（Mops），一次“操作”= 1 memcpy。 Y 轴包含性能指标：丢失的 L1 访问百分比，以及后端和前端停滞的周期百分比。

在所有这些运行中，L2 访问实际上不会丢失，即l2_cache_misses_from_dc_misses度量总是非常低。只是为了完整性，根据阿南德科技Zen 2架构L1延迟为4个周期，L2延迟为12个周期。

我不确定为什么前端会停滞。但这就是perf报道的内容。我相信这是真的。因为前端停顿和后端停顿的效果是不一样的。如果您将PACKET_SIZE图上的运行与 256 和 1024 进行比较：它们的 L1 未命中大致相同； 256 大约有 77% 的周期在后端停滞，0% 在前端； 1024 则相反，77% 前端停滞，0% 后端停滞。然而，1024 的速度要慢得多，因为它每个周期发出的指令要少得多。 1024 次运行中约为 0.42，256 次运行中约为 1.28。

因此，当 CPU 在前端停滞时，每个周期发出的指令比在后端停滞时要少。我想这就是前端和后端的工作方式，即后端可以更加并行地运行。如果有人能够证实或纠正这个猜测，我们将不胜感激。然而，一个更重要的问题：为什么前端会停滞？前端应该只是解码指令。PACKET_SIZE设置为 256 或 1024 时，程序集并没有真正改变。那么，是什么导致前端在 1024 步幅时比 256 步幅时停顿更多呢？

所有运行中每个 Mops 的 IPC 绘图PACKET_SIZE：

8的运行PACKET_SIZE稍微偏离了更多 Mops 的路线，即比其他值的趋势更快。这一定是因为指令更有效。

相关内容