Linux 高 iowait,无块设备活动

Linux 高 iowait,无块设备活动

我很难找出导致我的服务器 iowait 过高的原因。

top:

%Cpu(s): 30.0 us,  5.4 sy,  0.0 ni, 53.9 id, 10.1 wa,  0.0 hi,  0.7 si,  0.0 st

iostat:

Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s avgrq-sz avgqu-sz   await r_await w_await  svctm  %util
sda               0.00     1.00    0.00    0.80     0.00     7.20    18.00     0.00    0.25    0.00    0.25   0.25   0.02

iotop:

Total DISK READ :       0.00 B/s | Total DISK WRITE :       0.00 B/s
Actual DISK READ:       0.00 B/s | Actual DISK WRITE:       0.00 B/s
  TID  PRIO  USER     DISK READ  DISK WRITE  SWAPIN     IO>    COMMAND
 4807 be/4 daemon      0.00 B/s    0.00 B/s  0.00 % 21.66 % httpd -k restart
 4522 be/4 daemon      0.00 B/s    0.00 B/s  0.00 % 10.76 % httpd -k restart
 4547 be/4 daemon      0.00 B/s    0.00 B/s  0.00 %  7.87 % httpd -k restart
 4603 be/4 daemon      0.00 B/s    0.00 B/s  0.00 %  7.66 % httpd -k restart
 4652 be/4 daemon      0.00 B/s    0.00 B/s  0.00 %  5.36 % httpd -k restart
 4671 be/4 daemon      0.00 B/s    0.00 B/s  0.00 %  4.83 % httpd -k restart

因此看起来所有 IO 活动都是由 httpd 生成的,但 sda 上没有任何活动,并且服务器上也没有其他硬盘或网络驱动器。

我已经在 httpd 上尝试过 strace:

Process 4975 attached
accept4(4, {sa_family=AF_INET6, sin6_port=htons(60547), inet_pton(AF_INET6, "::ffff:127.0.0.1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28], SOCK_CLOEXEC) = 9
getsockname(9, {sa_family=AF_INET6, sin6_port=htons(8080), inet_pton(AF_INET6, "::ffff:127.0.0.1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0
fcntl(9, F_GETFL)                       = 0x2 (flags O_RDWR)
fcntl(9, F_SETFL, O_RDWR|O_NONBLOCK)    = 0
read(9, "GET /category/toys/ HTTP/1.0\r"..., 8000) = 287
stat("/srv/www/website/category/toys/", 0x7ffff11072d0) = -1 ENOENT (No such file or directory)
lstat("/srv", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
lstat("/srv/www", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
lstat("/srv/www/website", {st_mode=S_IFDIR|0755, st_size=20480, ...}) = 0
lstat("/srv/www/website/category", 0x7ffff11072d0) = -1 ENOENT (No such file or directory)
stat("/srv/www/website/index.php", {st_mode=S_IFREG|0644, st_size=39274, ...}) = 0
setitimer(ITIMER_PROF, {it_interval={0, 0}, it_value={6000000, 0}}, NULL) = 0
rt_sigaction(SIGPROF, {0x7f4329cbc270, [PROF], SA_RESTORER|SA_RESTART, 0x7f432a7bd650}, {0x7f4329cbc270, [PROF], SA_RESTORER|SA_RESTART, 0x7f432a7bd650}, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [PROF], NULL, 8) = 0
getcwd("/", 4095)                       = 2
chdir("/srv/www/website")               = 0
setitimer(ITIMER_PROF, {it_interval={0, 0}, it_value={1000000, 0}}, NULL) = 0
fcntl(7, F_SETLK, {type=F_RDLCK, whence=SEEK_SET, start=1, len=1}) = 0
getcwd("/srv/www/website", 4096)        = 17
socket(PF_LOCAL, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 10
connect(10, {sa_family=AF_LOCAL, sun_path="/tmp/memcached2.sock"}, 110) = 0
sendto(10, "get memc.sess.key.ee3e1a44ced404"..., 52, MSG_NOSIGNAL, NULL, 0) = 52
recvfrom(10, "VALUE memc.sess.key.ee3e1a44ced4"..., 8196, MSG_NOSIGNAL, NULL, NULL) = 175
socket(PF_LOCAL, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 11
connect(11, {sa_family=AF_LOCAL, sun_path="/tmp/memcached.sock"}, 110) = 0
sendto(11, "get 389768020cec2648767316c7233f"..., 38, MSG_NOSIGNAL, NULL, 0) = 38
recvfrom(11, "VALUE 389768020cec2648767316c723"..., 8196, MSG_NOSIGNAL, NULL, NULL) = 8196
recvfrom(11, "\0\"\345\22\315\00230:\374\16u\205\315\360\23\375\00050\376\0009\353\0\250\342\n\213\353Si"..., 8196, MSG_NOSIGNAL, NULL, NULL) = 8196
recvfrom(11, "px#f =\7body><tr@\v\0d \4\1ul \4\1li\304\36\2"..., 8196, MSG_NOSIGNAL, NULL, NULL) = 4629
sendto(11, "quit\r\n", 6, MSG_NOSIGNAL, NULL, 0) = 6
shutdown(11, SHUT_WR)                   = 0
shutdown(11, SHUT_RD)                   = 0
close(11)                               = 0
writev(9, [{"HTTP/1.1 200 OK\r\nDate: Mon, 11 M"..., 246}, {"<!DOCTYPE html> <html lang=\"en\">"..., 107113}], 2) = 107359
chdir("/")                              = 0
setitimer(ITIMER_PROF, {it_interval={0, 0}, it_value={0, 0}}, NULL) = 0
sendto(10, "set memc.sess.key.ee3e1a44ced404"..., 175, MSG_NOSIGNAL, NULL, 0) = 175
recvfrom(10, "STORED\r\n", 8196, MSG_NOSIGNAL, NULL, NULL) = 8
sendto(10, "quit\r\n", 6, MSG_NOSIGNAL, NULL, 0) = 6
shutdown(10, SHUT_WR)                   = 0
shutdown(10, SHUT_RD)                   = 0
close(10)                               = 0
fcntl(7, F_SETLK, {type=F_UNLCK, whence=SEEK_SET, start=0, len=0}) = 0
setitimer(ITIMER_PROF, {it_interval={0, 0}, it_value={0, 0}}, NULL) = 0
writev(9, [{"<!-- static content | last modif"..., 60}], 1) = 60
times({tms_utime=40554, tms_stime=8627, tms_cutime=0, tms_cstime=0}) = 456014999
shutdown(9, SHUT_WR)                    = 0
poll([{fd=9, events=POLLIN}], 1, 2000)  = 1 ([{fd=9, revents=POLLIN|POLLHUP}])
read(9, "", 512)                        = 0
close(9)                                = 0
read(5, 0x7ffff1107697, 1)              = -1 EAGAIN (Resource temporarily unavailable)
accept4(4,

尽管重写规则将所有请求都定向到 index.php,它仍然对整个目录结构执行了相当多的 lstat 调用;但这些简单的 lstat 调用真的可能导致如此高的 iowait 吗?它还通过 unix 套接字与 memcached 服务器进行一些通信,但据我所知,unix 套接字通信不计入 iowait。

服务器只提供动态内容,每秒处理大约 200 个请求,但所有这些内容都缓存在 RAM 中(可以确定没有使用交换分区,内存为 64G),并且没有从磁盘读取任何内容,那么为什么 iowait 这么高?

我该如何诊断这个问题?有没有办法在多个 httpd 进程范围内,找出并测量那些在几乎没有块设备活动的情况下仍会增加 iowait 的调用?

更新 1: 服务器连接在 1GbE 网络上,流量约为 10Mbps,没有任何错误。

更新 2: strace -c -f -p:

% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 46.39    0.006063           1     10330           poll
 26.66    0.003485           0     10212        71 recvfrom
  7.98    0.001043           1       880           sendto
  5.03    0.000658           2       294       109 connect
  2.39    0.000313           1       294           socket
  2.10    0.000274           1       338           close
  1.45    0.000189           1       341        43 read
  1.44    0.000188           4        43           accept4
  1.40    0.000183           0       509           fcntl
  1.26    0.000165           3        52           writev
  0.83    0.000109           1       193           shutdown
  0.55    0.000072           1        84           chdir
  0.42    0.000055           0       168           setitimer
  0.41    0.000053           1        73           stat
  0.33    0.000043           1        43           getsockname
  0.25    0.000033           0       116           rt_sigaction
  0.24    0.000032           0        72           getcwd
  0.21    0.000028           0       109           getsockopt
  0.21    0.000027           0        60           lstat
  0.16    0.000021           0       106           setsockopt
  0.11    0.000014           0        43           times
  0.08    0.000010           0        42           rt_sigprocmask
  0.05    0.000007           7         1           select
  0.02    0.000003           1         5           access
  0.02    0.000002           0         6           brk
  0.00    0.000000           0         1           open
  0.00    0.000000           0         1           mmap
  0.00    0.000000           0         1           munmap
------ ----------- ----------- --------- --------- ----------------
100.00    0.013070                 24417       223 total

这是 strace -c 运行约一分钟的结果,但我仍然找不到任何可以产生如此高 iowait 的东西......

更新 3:

我已经更改了 httpd 配置,将 RewriteRules 从 Directory 范围移至 VirtualHost 范围,这样我就可以禁用 FollowSymLinks,并为每个请求省去一些额外的 lstat 调用,从原来的:

stat("/srv/www/website/category/toys/", 0x7ffff11072d0) = -1 ENOENT (No such file or directory)
lstat("/srv", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
lstat("/srv/www", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
lstat("/srv/www/website", {st_mode=S_IFDIR|0755, st_size=20480, ...}) = 0
lstat("/srv/www/website/category", 0x7ffff11072d0) = -1 ENOENT (No such file or directory)
stat("/srv/www/website/index.php", {st_mode=S_IFREG|0644, st_size=39274, ...}) = 0

减少为仅有:

stat("/srv/www/website/index.php", {st_mode=S_IFREG|0644, st_size=39274, ...}) = 0
lstat("/srv/www/website/index.php", {st_mode=S_IFREG|0644, st_size=39274, ...}) = 0

但这对 iowait 没有影响。我还注意到,在 httpd 重启后的大约 10 秒内(在此期间 http 请求一直在被处理),iowait 处于 0-1% 的水平,然后才上升 - 也许这是一条线索?

更新 4:

我已将最小备用服务器数(MinSpareServers)从 280 改为 64,iowait 也随之降到了 1%。这可能与记分板有关,正如 Cameron Kerr 提到的那样,或者与某种父子进程间的通信有关,但据我所知,“Apache httpd 首先尝试完全在内存中创建记分板”,而我的 apache 实例并未使用基于文件的记分板。

有谁知道为什么备用 prefork 进程的数量对 iowait 有如此大的影响(280 个备用进程为 20%,而 64 个备用进程为 2%)?

相关内容