我正在测试不同内核上系统调用的性能(硬件相同):
测试代码
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
/*
 * Micro-benchmark: issue SYS_gettid one million times via the raw
 * syscall(2) interface so every iteration really enters the kernel
 * (gettid has no vDSO fast path, so each call is a full transition).
 * Returns 0 unconditionally.
 */
int main(int argc, char *argv[])
{
    unsigned long i;               /* same type as cnt: avoids the original
                                      signed/unsigned comparison and would
                                      still be correct if cnt > INT_MAX */
    pid_t tid = 0;
    unsigned long cnt = 1000000;

    for (i = 0; i < cnt; i++)
    {
        tid = syscall(SYS_gettid);
    }

    (void)tid;                     /* silence "set but not used" warning */
    (void)argc;
    (void)argv;
    return 0;
}
内核2.6的结果:
processor : 3
vendor_id : GenuineIntel
cpu family : 6
model : 55
model name : Intel(R) Celeron(R) CPU J1900 @ 1.99GHz
stepping : 9
cpu MHz : 2000.029
cache size : 1024 KB
physical id : 0
siblings : 4
core id : 3
cpu cores : 4
apicid : 6
initial apicid : 6
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 11
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe nx rdtscp lm constant_tsc arch_perfmon pebs bts xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 movbe popcnt lahf_lm 3dnowprefetch arat tpr_shadow vnmi flexpriority ept vpid
bogomips : 3999.80
clflush size : 64
cache_alignment : 64
address sizes : 36 bits physical, 48 bits virtual
power management:
[HFOS] $ uname -a
Linux HFOS 2.6.32.10 #1 SMP Fri Sep 9 16:11:47 CST 2016 i686 pentium3 i386 GNU/Linux
[HFOS] $ strace -c ./sc
% time seconds usecs/call calls errors syscall
------ ----------- ----------- --------- --------- ----------------
99.90 0.023803 0 1000000 gettid
0.10 0.000023 1 40 39 open
0.00 0.000000 0 1 read
0.00 0.000000 0 1 close
0.00 0.000000 0 1 execve
0.00 0.000000 0 1 1 access
0.00 0.000000 0 1 brk
0.00 0.000000 0 5 mmap2
0.00 0.000000 0 39 35 stat64
0.00 0.000000 0 1 fstat64
0.00 0.000000 0 1 set_thread_area
------ ----------- ----------- --------- --------- ----------------
100.00 0.023826 1000091 75 total
内核5.2的结果:
processor : 3
vendor_id : GenuineIntel
cpu family : 6
model : 55
model name : Intel(R) Celeron(R) CPU J1900 @ 1.99GHz
stepping : 9
microcode : 0x90a
cpu MHz : 1332.848
cache size : 1024 KB
physical id : 0
siblings : 4
core id : 3
cpu cores : 4
apicid : 6
initial apicid : 6
fpu : yes
fpu_exception : yes
cpuid level : 11
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 movbe popcnt tsc_deadline_timer rdrand lahf_lm 3dnowprefetch epb pti ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid tsc_adjust smep erms dtherm arat
bugs : cpu_meltdown spectre_v1 spectre_v2 mds msbds_only
bogomips : 3998.40
clflush size : 64
cache_alignment : 64
address sizes : 36 bits physical, 48 bits virtual
power management:
localhost:ipc # uname -a
Linux localhost 5.2.8 #2 SMP Wed May 6 12:51:13 CST 2020 x86_64 GNU/Linux
strace -c ./sc
% time seconds usecs/call calls errors syscall
------ ----------- ----------- --------- --------- ----------------
99.99 7.964185 7 1000000 gettid
0.00 0.000384 384 1 execve
0.00 0.000152 21 7 mmap
0.00 0.000084 21 4 mprotect
0.00 0.000045 22 2 openat
0.00 0.000031 31 1 munmap
0.00 0.000027 13 2 fstat
0.00 0.000021 10 2 close
0.00 0.000020 20 1 1 access
0.00 0.000016 16 1 read
0.00 0.000011 11 1 brk
0.00 0.000010 10 1 arch_prctl
------ ----------- ----------- --------- --------- ----------------
100.00 7.964986 7 1000023 1 total
我很困惑为什么它在新内核上这么慢。请帮我。非常感谢。
补充:在内核启动参数中设置 mitigations=off 之后,两个内核上的系统调用性能几乎相同(裸跑的执行时间基本一致,但 strace 统计出的时间仍有差异)。
localhost:~ # dmesg | grep iso
[ 0.006693] Kernel/User page tables isolation: disabled on command line.
结果如下所示。
内核2.6:
[HFOS] $ uname -a
Linux HFOS 2.6.32.10 #1 SMP Fri Sep 9 16:11:47 CST 2016 i686 pentium3 i386 GNU/Linux
[HFOS] $ cat sc.c
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
/*
 * Micro-benchmark: issue SYS_gettid 100 million times via the raw
 * syscall(2) interface so every iteration enters the kernel.
 * Returns 0 unconditionally.
 */
int main(int argc, char *argv[])
{
    unsigned long i;               /* same type as cnt: the original `int i`
                                      vs `unsigned long cnt` comparison is a
                                      signed/unsigned mismatch */
    pid_t tid = 0;
    unsigned long cnt = 100000000;

    for (i = 0; i < cnt; i++)
    {
        tid = syscall(SYS_gettid);
    }

    (void)tid;                     /* silence "set but not used" warning */
    (void)argc;
    (void)argv;
    return 0;
}
[HFOS] $ time ./sc
real 0m16.736s
user 0m5.529s
sys 0m11.204s
[HFOS] $ time strace -c ./sc
% time seconds usecs/call calls errors syscall
------ ----------- ----------- --------- --------- ----------------
100.00 0.275026 0 10000000 gettid
0.00 0.000000 0 1 read
0.00 0.000000 0 40 39 open
0.00 0.000000 0 1 close
0.00 0.000000 0 1 execve
0.00 0.000000 0 1 1 access
0.00 0.000000 0 1 brk
0.00 0.000000 0 5 mmap2
0.00 0.000000 0 39 35 stat64
0.00 0.000000 0 1 fstat64
0.00 0.000000 0 1 set_thread_area
------ ----------- ----------- --------- --------- ----------------
100.00 0.275026 10000091 75 total
real 2m57.054s
user 0m28.704s
sys 2m27.259s
内核5.2:
localhost:test # uname -a
Linux localhost 5.2.8 #2 SMP Thu May 14 02:46:43 CST 2020 x86_64 GNU/Linux
localhost:test # cat sc.c
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
/*
 * Micro-benchmark (identical to the 2.6 run): 100 million raw
 * SYS_gettid syscalls. Returns 0 unconditionally.
 */
int main(int argc, char *argv[])
{
    unsigned long i;               /* match cnt's type — avoids the
                                      signed/unsigned comparison warning */
    pid_t tid = 0;
    unsigned long cnt = 100000000;

    for (i = 0; i < cnt; i++)
    {
        tid = syscall(SYS_gettid);
    }

    (void)tid;                     /* silence "set but not used" warning */
    (void)argc;
    (void)argv;
    return 0;
}
localhost:test # time ./sc
real 0m19.043s
user 0m8.501s
sys 0m10.532s
localhost:test # time strace -c ./sc
% time seconds usecs/call calls errors syscall
------ ----------- ----------- --------- --------- ----------------
100.00 77.250398 7 10000000 gettid
0.00 0.000405 405 1 execve
0.00 0.000159 22 7 mmap
0.00 0.000088 22 4 mprotect
0.00 0.000048 24 2 openat
0.00 0.000031 31 1 munmap
0.00 0.000028 14 2 fstat
0.00 0.000024 12 2 close
0.00 0.000021 21 1 1 access
0.00 0.000016 16 1 read
0.00 0.000013 13 1 brk
0.00 0.000012 12 1 arch_prctl
------ ----------- ----------- --------- --------- ----------------
100.00 77.251243 7 10000023 1 total
real 6m7.443s
user 0m55.590s
sys 6m23.482s
但UNIX域套接字性能没有变化。测试代码如下所示。
svr.c:
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <arpa/inet.h>
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#define TEST_SOCK_FILE "/tmp/test.sock"
/*
 * UNIX-domain echo server for the ping-pong benchmark.
 * Accepts one connection at a time; for each 1024-byte request it sends a
 * 512-byte reply, until the client marks the final packet with 0x22 in
 * byte 0. Prints the packet count per connection. Never returns except on
 * socket errors (returns -1).
 */
int main(int argc, char **argv)
{
    int fd;
    int cfd;
    int r;
    int cnt = 0;
    socklen_t sklen;
    struct sockaddr caddr;        /* NOTE(review): smaller than sockaddr_un,
                                     so accept() may truncate the peer path —
                                     harmless here, the address is never read */
    char rbuf[1024];
    char sbuf[512];
    struct sockaddr_un svraddr;

    (void)argc;
    (void)argv;

    /* Remove a stale socket file so bind() cannot fail with EADDRINUSE. */
    unlink(TEST_SOCK_FILE);

    /* Zero the whole address first: bind() is handed sizeof(svraddr), so
     * uninitialized padding bytes would otherwise be passed to the kernel. */
    memset(&svraddr, 0, sizeof(svraddr));
    svraddr.sun_family = AF_UNIX;
    /* Explicit "%s" format — never use a macro/variable as the format
     * string itself (defensive; CERT FIO30-C). */
    snprintf(svraddr.sun_path, sizeof(svraddr.sun_path), "%s", TEST_SOCK_FILE);

    fd = socket(AF_UNIX, SOCK_STREAM, 0);
    if (fd < 0)
    {
        printf("Create socket failed : %s\n", strerror(errno));
        return -1;
    }
    if (bind(fd, (struct sockaddr *)&svraddr, sizeof(svraddr)) < 0)
    {
        printf("Bind socket failed : %s\n", strerror(errno));
        close(fd);
        return -1;
    }
    if (listen(fd, 10) < 0)
    {
        printf("Listen socket failed : %s\n", strerror(errno));
        close(fd);
        return -1;
    }

    while (1)
    {
        sklen = sizeof(caddr);
        memset(&caddr, 0, sizeof(caddr));
        cfd = accept(fd, &caddr, &sklen);
        if (cfd < 0)
        {
            printf("Accept failed : %s", strerror(errno));
            return -1;
        }

        cnt = 0;
        while (1)
        {
            /* NOTE(review): SOCK_STREAM has no message boundaries; a short
             * read is possible in principle. It works here because each
             * side alternates one fixed-size send per recv. */
            r = read(cfd, rbuf, sizeof(rbuf));
            if (r <= 0)
            {
                printf("recv failed : %s\n", strerror(errno));
                break;
            }
            if (rbuf[0] == 0x22)   /* 0x22 = client's "stop" marker */
            {
                break;
            }
            r = write(cfd, sbuf, sizeof(sbuf));
            if (r <= 0)
            {
                printf("send failed : %s\n", strerror(errno));
                break;
            }
            cnt++;
        }
        printf("Recv packet : %d\n", cnt);
        close(cfd);
    }

    close(fd);
    return 0;
}
cli.c:
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <time.h>
#include <errno.h>
#include <sys/un.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
/*
 * UNIX-domain ping-pong client: sends `cnt` 1024-byte packets, reading a
 * 512-byte reply after each, then a final packet with byte 0 = 0x22 to
 * tell the server to stop. Prints the achieved packets-per-second.
 * NOTE(review): this file should also #include <string.h> for
 * strerror/strcpy — confirm against the build.
 */
int main(int argc, char **argv)
{
    int fd;
    int r;
    int i;
    int cnt = 1000000;
    struct sockaddr_un unaddr;
    time_t ts, te, tu;
    char sbuf[1024];
    char rbuf[512];

    (void)argc;
    (void)argv;

    /* Zero the address so no uninitialized padding reaches connect(). */
    memset(&unaddr, 0, sizeof(unaddr));
    unaddr.sun_family = AF_UNIX;
    strcpy(unaddr.sun_path, "/tmp/test.sock");

    /* BUG FIX: the first argument of socket() is the address family.
     * The original passed SOCK_STREAM here, which only worked by accident
     * because AF_UNIX == SOCK_STREAM == 1 on Linux. */
    fd = socket(AF_UNIX, SOCK_STREAM, 0);
    if (fd < 0)
    {
        printf("Create socket failed : %s\n", strerror(errno));
        return -1;
    }

    r = connect(fd, (struct sockaddr *)&unaddr, sizeof(unaddr));
    if (r < 0)
    {
        printf("Connect failed : %s\n", strerror(errno));
        close(fd);
        return -1;
    }

    ts = time(NULL);
    for (i = 0; i < cnt; i++)
    {
        sbuf[0] = 0x11;                     /* 0x11 = normal data marker */
        r = write(fd, sbuf, sizeof(sbuf));
        if (r <= 0)
        {
            printf("Send failed : %s\n", strerror(errno));
            break;
        }
        r = read(fd, rbuf, sizeof(rbuf));
        if (r <= 0)
        {
            printf("Recv failed : %s\n", strerror(errno));
            break;
        }
    }
    sbuf[0] = 0x22;                         /* 0x22 = stop marker */
    write(fd, sbuf, sizeof(sbuf));
    te = time(NULL);

    tu = te > ts ? (te - ts) : 1;           /* guard against division by 0 */
    /* BUG FIX: cnt / tu has type time_t (long), which the original printed
     * with %d, and tu (signed time_t) was printed with %lu — both are
     * format-specifier mismatches (undefined behavior). Cast explicitly. */
    printf("PPS(%ld) : %d packet used %ld seconds\n",
           (long)(cnt / tu), cnt, (long)tu);

    close(fd);
    return 0;
}
结果如下所示:
localhost:test # uname -a
Linux localhost 5.2.8 #2 SMP Thu May 14 02:46:43 CST 2020 x86_64 GNU/Linux
localhost:test # ./svr &
[1] 955
localhost:test # ./cli
PPS(34482) : 1000000 packet used 29 seconds
Recv packet : 1000000
[HFOS] $ uname -a
Linux HFOS 2.6.32.10 #1 SMP Fri Sep 9 16:11:47 CST 2016 i686 pentium3 i386 GNU/Linux
[HFOS] $ ./svr &
[1] 32624
[HFOS] $ ./cli
Recv packet : 1000000
PPS(71428) : 1000000 packet used 14 seconds
答案1
与 2.6.32 内核相比,5.2 内核上的系统调用性能较低的大部分原因可能是内核页表隔离以及其他与安全相关的变更。 KPTI 涉及在用户空间中运行和在内核中运行时使用不同的页表。因此,每个系统调用都会更改页表两次,并产生级联后果,例如 TLB 刷新(在较旧的硬件上)。
去年发表了一篇追踪 Linux 内核历年性能变化的有趣论文;这篇博文对其有较详细的介绍,论文本身可在 ACM 数字图书馆(ACM DL)获取(六月底之前免费公开)。