为什么内核 5.2 上的系统调用性能比内核 2.6 低很多?

为什么内核 5.2 上的系统调用性能比内核 2.6 低很多?

我正在测试不同内核上系统调用的性能(硬件相同):

测试代码

#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

/*
 * Micro-benchmark: issue the gettid system call `cnt` times in a tight loop
 * so the cost of entering and leaving the kernel dominates the run time.
 *
 * argc and argv are unused. Always returns 0.
 */
int main(int argc, char *argv[])
{
       unsigned long i;
       pid_t tid = 0;
       unsigned long cnt = 1000000;

       (void)argc;
       (void)argv;

       /* The index has the same type as cnt, so the loop condition involves
        * no signed/unsigned conversion and stays correct for any cnt. */
       for(i = 0; i < cnt; i++)
       {
               /* Invoke gettid through the generic syscall() wrapper. */
               tid = syscall(SYS_gettid);
       }

       /* The returned id is irrelevant; only the cost of the call matters. */
       (void)tid;
       return 0;
}

内核2.6的结果:

processor       : 3
vendor_id       : GenuineIntel
cpu family      : 6
model           : 55
model name      : Intel(R) Celeron(R) CPU  J1900  @ 1.99GHz
stepping        : 9
cpu MHz         : 2000.029
cache size      : 1024 KB
physical id     : 0
siblings        : 4
core id         : 3
cpu cores       : 4
apicid          : 6
initial apicid  : 6
fdiv_bug        : no
hlt_bug         : no
f00f_bug        : no
coma_bug        : no
fpu             : yes
fpu_exception   : yes
cpuid level     : 11
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe nx rdtscp lm constant_tsc arch_perfmon pebs bts xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 movbe popcnt lahf_lm 3dnowprefetch arat tpr_shadow vnmi flexpriority ept vpid
bogomips        : 3999.80
clflush size    : 64
cache_alignment : 64
address sizes   : 36 bits physical, 48 bits virtual
power management:

[HFOS] $ uname -a
Linux HFOS 2.6.32.10 #1 SMP Fri Sep 9 16:11:47 CST 2016 i686 pentium3 i386 GNU/Linux

[HFOS] $ strace -c ./sc
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
99.90    0.023803           0   1000000           gettid
 0.10    0.000023           1        40        39 open
 0.00    0.000000           0         1           read
 0.00    0.000000           0         1           close
 0.00    0.000000           0         1           execve
 0.00    0.000000           0         1         1 access
 0.00    0.000000           0         1           brk
 0.00    0.000000           0         5           mmap2
 0.00    0.000000           0        39        35 stat64
 0.00    0.000000           0         1           fstat64
 0.00    0.000000           0         1           set_thread_area
------ ----------- ----------- --------- --------- ----------------
100.00    0.023826               1000091        75 total

内核5.2的结果:

processor       : 3
vendor_id       : GenuineIntel
cpu family      : 6
model           : 55
model name      : Intel(R) Celeron(R) CPU  J1900  @ 1.99GHz
stepping        : 9
microcode       : 0x90a
cpu MHz         : 1332.848
cache size      : 1024 KB
physical id     : 0
siblings        : 4
core id         : 3
cpu cores       : 4
apicid          : 6
initial apicid  : 6
fpu             : yes
fpu_exception   : yes
cpuid level     : 11
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 movbe popcnt tsc_deadline_timer rdrand lahf_lm 3dnowprefetch epb pti ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid tsc_adjust smep erms dtherm arat
bugs            : cpu_meltdown spectre_v1 spectre_v2 mds msbds_only
bogomips        : 3998.40
clflush size    : 64
cache_alignment : 64
address sizes   : 36 bits physical, 48 bits virtual
power management:

localhost:ipc # uname -a
Linux localhost 5.2.8 #2 SMP Wed May 6 12:51:13 CST 2020 x86_64 GNU/Linux

strace -c ./sc
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
99.99    7.964185           7   1000000           gettid
 0.00    0.000384         384         1           execve
 0.00    0.000152          21         7           mmap
 0.00    0.000084          21         4           mprotect
 0.00    0.000045          22         2           openat
 0.00    0.000031          31         1           munmap
 0.00    0.000027          13         2           fstat
 0.00    0.000021          10         2           close
 0.00    0.000020          20         1         1 access
 0.00    0.000016          16         1           read
 0.00    0.000011          11         1           brk
 0.00    0.000010          10         1           arch_prctl
------ ----------- ----------- --------- --------- ----------------
100.00    7.964986           7   1000023         1 total

我很困惑为什么它在新内核上这么慢。请帮帮我,非常感谢。

设置 [mitigations=off] 后,系统调用性能几乎相同(执行时间几乎相同,但 strace 时间不同)。

localhost:~ # dmesg | grep iso
[    0.006693] Kernel/User page tables isolation: disabled on command line.

结果如下所示。

内核2.6:

[HFOS] $ uname -a
Linux HFOS 2.6.32.10 #1 SMP Fri Sep 9 16:11:47 CST 2016 i686 pentium3 i386 GNU/Linux
[HFOS] $ cat sc.c
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

/*
 * Micro-benchmark: call gettid `cnt` times back to back so that the
 * per-call system call overhead can be measured with time(1) or strace.
 *
 * argc and argv are unused. Always returns 0.
 */
int main(int argc, char *argv[])
{
    unsigned long i;
    pid_t tid = 0;
    unsigned long cnt = 100000000;

    (void)argc;
    (void)argv;

    /* i and cnt share one unsigned type: no mixed-sign comparison, and no
     * signed overflow if cnt is ever raised beyond INT_MAX. */
    for(i = 0; i < cnt; i++)
    {
        /* Invoke gettid through the generic syscall() wrapper. */
        tid = syscall(SYS_gettid);
    }

    /* Only the cost of the call is of interest, not its result. */
    (void)tid;
    return 0;
}

[HFOS] $ time ./sc

real    0m16.736s
user    0m5.529s
sys     0m11.204s

[HFOS] $ time strace -c ./sc
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
100.00    0.275026           0  10000000           gettid
  0.00    0.000000           0         1           read
  0.00    0.000000           0        40        39 open
  0.00    0.000000           0         1           close
  0.00    0.000000           0         1           execve
  0.00    0.000000           0         1         1 access
  0.00    0.000000           0         1           brk
  0.00    0.000000           0         5           mmap2
  0.00    0.000000           0        39        35 stat64
  0.00    0.000000           0         1           fstat64
  0.00    0.000000           0         1           set_thread_area
------ ----------- ----------- --------- --------- ----------------
100.00    0.275026              10000091        75 total

real    2m57.054s
user    0m28.704s
sys     2m27.259s



内核5.2:

localhost:test # uname -a
Linux localhost 5.2.8 #2 SMP Thu May 14 02:46:43 CST 2020 x86_64 GNU/Linux
localhost:test # cat sc.c
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

/*
 * Micro-benchmark: execute the gettid system call `cnt` times in a loop.
 * The program does no other work, so its run time reflects the cost of
 * the system call path.
 *
 * argc and argv are unused. Always returns 0.
 */
int main(int argc, char *argv[])
{
    unsigned long i;
    pid_t tid = 0;
    unsigned long cnt = 100000000;

    (void)argc;
    (void)argv;

    /* The counter uses the same unsigned type as the limit, which avoids
     * comparing a signed int against an unsigned long. */
    for(i = 0; i < cnt; i++)
    {
        /* Invoke gettid through the generic syscall() wrapper. */
        tid = syscall(SYS_gettid);
    }

    /* The thread id itself is never used. */
    (void)tid;
    return 0;
}

localhost:test # time ./sc

real    0m19.043s
user    0m8.501s
sys     0m10.532s

localhost:test # time strace -c ./sc
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
100.00   77.250398           7  10000000           gettid
  0.00    0.000405         405         1           execve
  0.00    0.000159          22         7           mmap
  0.00    0.000088          22         4           mprotect
  0.00    0.000048          24         2           openat
  0.00    0.000031          31         1           munmap
  0.00    0.000028          14         2           fstat
  0.00    0.000024          12         2           close
  0.00    0.000021          21         1         1 access
  0.00    0.000016          16         1           read
  0.00    0.000013          13         1           brk
  0.00    0.000012          12         1           arch_prctl
------ ----------- ----------- --------- --------- ----------------
100.00   77.251243           7  10000023         1 total

real    6m7.443s
user    0m55.590s
sys     6m23.482s


但UNIX域套接字性能没有变化。测试代码如下所示。

svr.c:

#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <arpa/inet.h>

#define TEST_SOCK_FILE  "/tmp/test.sock"

/*
 * Benchmark server on a UNIX domain stream socket.
 *
 * Binds TEST_SOCK_FILE and serves one client at a time. For every chunk
 * read from the client it writes back a 512-byte reply, until the client
 * sends a chunk whose first byte is 0x22, the peer closes the connection,
 * or an I/O error occurs. The number of replies sent is printed once per
 * connection.
 *
 * argc and argv are unused.
 *
 * Returns -1 when the socket cannot be created, bound or put into the
 * listening state, or when accept() fails. Otherwise it serves forever.
 */
int main(int argc, char **argv)
{
    int fd;
    int cfd;
    int r;
    int cnt = 0;
    socklen_t  sklen;
    struct sockaddr caddr;
    char rbuf[1024];
    char sbuf[512];
    struct sockaddr_un  svraddr;

    (void)argc;
    (void)argv;

    /* The reply carries no data, but it must not be indeterminate stack
     * memory when it is handed to write(). */
    memset(sbuf, 0, sizeof(sbuf));

    /* Remove a socket file left over from an earlier run before binding. */
    unlink(TEST_SOCK_FILE);

    /* Zero the whole address first so no uninitialized bytes reach bind(). */
    memset(&svraddr, 0, sizeof(svraddr));
    svraddr.sun_family = AF_UNIX;
    /* Pass the path as an argument, never as the format string. */
    snprintf(svraddr.sun_path, sizeof(svraddr.sun_path), "%s", TEST_SOCK_FILE);

    fd = socket(AF_UNIX, SOCK_STREAM, 0);
    if(fd < 0)
    {
        printf("Create socket failed : %s\n", strerror(errno));
        return -1;
    }

    if(bind(fd, (struct sockaddr *)&svraddr, sizeof(svraddr)) < 0)
    {
        printf("Bind socket failed : %s\n", strerror(errno));
        close(fd);
        return -1;
    }

    if(listen(fd, 10) < 0)
    {
        printf("Listen socket failed : %s\n", strerror(errno));
        close(fd);
        return -1;
    }

    while(1)
    {
        sklen = sizeof(caddr);
        memset(&caddr, 0, sizeof(caddr));
        cfd = accept(fd, &caddr, &sklen);
        if(cfd < 0)
        {
            printf("Accept failed : %s\n", strerror(errno));
            /* Release the listening socket on this exit path as well. */
            close(fd);
            return -1;
        }

        cnt = 0;
        while(1)
        {
            r = read(cfd, rbuf, sizeof(rbuf));
            if(r < 0)
            {
                printf("recv failed : %s\n", strerror(errno));
                break;
            }
            if(r == 0)
            {
                /* End of stream is not an error and errno is not set, so
                 * do not report it through strerror(). */
                printf("recv failed : peer closed connection\n");
                break;
            }
            /* A first byte of 0x22 is the client's end-of-test marker. */
            if(rbuf[0] == 0x22)
            {
                break;
            }
            r = write(cfd, sbuf, sizeof(sbuf));
            if(r <= 0)
            {
                printf("send failed : %s\n", strerror(errno));
                break;
            }
            cnt++;
        }

        printf("Recv packet : %d\n", cnt);
        close(cfd);
    }

    /* Not reached: the accept loop above only leaves through return. */
    close(fd);
    return 0;
}

cli.c:

#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <arpa/inet.h>


/*
 * Benchmark client for the UNIX domain socket round-trip test.
 *
 * Connects to /tmp/test.sock, performs `cnt` request/reply exchanges
 * (a 1024-byte write followed by a read of up to 512 bytes), then sends a
 * final chunk whose first byte is 0x22 to mark the end of the test. Prints
 * the exchange rate, measured with the one-second resolution of time().
 *
 * argc and argv are unused.
 *
 * Returns -1 when the socket cannot be created or connected, 0 otherwise.
 * An I/O failure during the exchanges ends the loop early but still
 * prints the statistics line.
 */
int main(int argc, char **argv)
{
    int fd;
    int r;
    int i;
    int cnt = 1000000;
    struct sockaddr_un unaddr;
    time_t ts, te, tu;
    char sbuf[1024];
    char rbuf[512];

    (void)argc;
    (void)argv;

    /* Only sbuf[0] carries meaning; zero the rest so no indeterminate
     * stack bytes are written to the socket. */
    memset(sbuf, 0, sizeof(sbuf));

    memset(&unaddr, 0, sizeof(unaddr));
    unaddr.sun_family = AF_UNIX;
    /* Bounded copy: sun_path is a small fixed-size array. */
    snprintf(unaddr.sun_path, sizeof(unaddr.sun_path), "%s", "/tmp/test.sock");

    /* The first argument is the address family, so it must be AF_UNIX.
     * Passing SOCK_STREAM there only worked because both constants happen
     * to have the same value on Linux. */
    fd = socket(AF_UNIX, SOCK_STREAM, 0);
    if(fd < 0)
    {
        printf("Create socket failed : %s\n", strerror(errno));
        return -1;
    }

    r = connect(fd, (struct sockaddr *)&unaddr, sizeof(unaddr));
    if(r < 0)
    {
        printf("Connect failed : %s\n", strerror(errno));
        close(fd);
        return -1;
    }

    ts = time(NULL);
    for(i = 0; i < cnt; i++)
    {
        sbuf[0] = 0x11;
        r = write(fd, sbuf, sizeof(sbuf));
        if(r <= 0)
        {
            printf("Send failed : %s\n", strerror(errno));
            break;
        }
        r = read(fd, rbuf, sizeof(rbuf));
        if(r < 0)
        {
            printf("Recv failed : %s\n", strerror(errno));
            break;
        }
        if(r == 0)
        {
            /* End of stream does not set errno; report it explicitly. */
            printf("Recv failed : peer closed connection\n");
            break;
        }
    }

    /* 0x22 in the first byte marks the end of the test. */
    sbuf[0] = 0x22;
    if(write(fd, sbuf, sizeof(sbuf)) <= 0)
    {
        printf("Send failed : %s\n", strerror(errno));
    }

    te = time(NULL);
    /* Clamp to one second so the division below can never be by zero. */
    tu = te > ts ? (te - ts) : 1;
    /* cnt / tu has type time_t; cast so the arguments match the format. */
    printf("PPS(%ld) : %d packet used %ld seconds\n", (long)(cnt / tu), cnt, (long)tu);

    close(fd);
    return 0;
}

结果如下所示:

localhost:test # uname -a
Linux localhost 5.2.8 #2 SMP Thu May 14 02:46:43 CST 2020 x86_64 GNU/Linux
localhost:test # ./svr &
[1] 955
localhost:test # ./cli
PPS(34482) : 1000000 packet used 29 seconds
Recv packet : 1000000
[HFOS] $ uname -a
Linux HFOS 2.6.32.10 #1 SMP Fri Sep 9 16:11:47 CST 2016 i686 pentium3 i386 GNU/Linux
[HFOS] $ ./svr &
[1] 32624
[HFOS] $ ./cli
Recv packet : 1000000
PPS(71428) : 1000000 packet used 14 seconds

答案1

与 2.6.32 内核相比,5.2 内核上系统调用性能较低的主要原因很可能是内核页表隔离(KPTI)以及其他与安全相关的变更。KPTI 要求在用户空间运行和在内核中运行时使用不同的页表。因此,每次系统调用都要切换两次页表,并带来一系列连锁后果,例如 TLB 刷新(在较旧的硬件上)。

去年发表了一篇追踪 Linux 内核性能变化的有趣论文;这篇博文对它作了较为详细的介绍,论文本身可以在 ACM DL 上获取(在六月底之前可公开访问)。

相关内容