我的 Ubuntu Server 20.04 在运行几个小时后总是卡死,风扇发烫,我甚至无法通过 ssh 连接到它。
重启后,它运行正常,没有 CPU 负载峰值或内存问题。服务器是全新安装的,实际上只运行一个空的 MongoDB 数据库。
我查看了 kern.log,发现重启之前它充斥着这些日志。
Dec 10 21:41:05 backend kernel: [102373.982171] rc rc0: receive overflow
Dec 10 21:41:05 backend kernel: [102373.986010] rc rc0: receive overflow
问题刚刚再次出现,这次我有机会查看了当时正在运行的进程:
/usr/sbin/rsyslogd -n -iNONE
/lib/systemd/systemd-journald
/usr/lib/accountsservice/accounts-daemon
你知道为什么这种情况会不断发生吗?
答案1
我在 Intel NUC 上遇到了同样的问题,用 strace 跟踪该进程后发现了几个有趣的文件:
openat(AT_FDCWD, "/run/log/journal/0322fcd160934520a68be3469a358ed3/system.journal", O_RDWR|O_NONBLOCK|O_CLOEXEC) = -1 ENOENT (No such file or directory)
gettid() = 393
timerfd_settime(17, TFD_TIMER_ABSTIME, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=13887, tv_nsec=175660000}}, NULL) = 0
epoll_wait(8, [], 62, 0) = 0
ftruncate(37, 16777216) = 0
gettid() = 393
epoll_wait(8, [{events=EPOLLIN, data={u32=3603220400, u64=94565898176432}}], 62, -1) = 1
read(9, "4,1594792,13834610419,-;rc rc0: "..., 8192) = 79
access("/sys/subsystem/rc/devices/rc0", F_OK) = -1 ENOENT (No such file or directory)
access("/sys/bus/rc/devices/rc0", F_OK) = -1 ENOENT (No such file or directory)
access("/sys/class/rc/rc0", F_OK) = 0
openat(AT_FDCWD, "/", O_RDONLY|O_CLOEXEC|O_PATH|O_DIRECTORY) = 23
openat(23, "sys", O_RDONLY|O_NOFOLLOW|O_CLOEXEC|O_PATH) = 36
newfstatat(36, "", {st_mode=S_IFDIR|0555, st_size=0, ...}, AT_EMPTY_PATH) = 0
close(23) = 0
openat(36, "class", O_RDONLY|O_NOFOLLOW|O_CLOEXEC|O_PATH) = 23
newfstatat(23, "", {st_mode=S_IFDIR|0755, st_size=0, ...}, AT_EMPTY_PATH) = 0
close(36) = 0
openat(23, "rc", O_RDONLY|O_NOFOLLOW|O_CLOEXEC|O_PATH) = 36
newfstatat(36, "", {st_mode=S_IFDIR|0755, st_size=0, ...}, AT_EMPTY_PATH) = 0
close(23) = 0
openat(36, "rc0", O_RDONLY|O_NOFOLLOW|O_CLOEXEC|O_PATH) = 23
newfstatat(23, "", {st_mode=S_IFLNK|0777, st_size=0, ...}, AT_EMPTY_PATH) = 0
readlinkat(36, "rc0", "../../devices/pnp0/00:01/rc/rc0", 4096) = 31
close(23) = 0
openat(36, "..", O_RDONLY|O_NOFOLLOW|O_CLOEXEC|O_PATH) = 23
close(36) = 0
openat(23, "..", O_RDONLY|O_NOFOLLOW|O_CLOEXEC|O_PATH) = 36
close(23) = 0
openat(36, "devices", O_RDONLY|O_NOFOLLOW|O_CLOEXEC|O_PATH) = 23
newfstatat(23, "", {st_mode=S_IFDIR|0755, st_size=0, ...}, AT_EMPTY_PATH) = 0
close(36) = 0
openat(23, "pnp0", O_RDONLY|O_NOFOLLOW|O_CLOEXEC|O_PATH) = 36
newfstatat(36, "", {st_mode=S_IFDIR|0755, st_size=0, ...}, AT_EMPTY_PATH) = 0
close(23) = 0
openat(36, "00:01", O_RDONLY|O_NOFOLLOW|O_CLOEXEC|O_PATH) = 23
newfstatat(23, "", {st_mode=S_IFDIR|0755, st_size=0, ...}, AT_EMPTY_PATH) = 0
close(36) = 0
openat(23, "rc", O_RDONLY|O_NOFOLLOW|O_CLOEXEC|O_PATH) = 36
newfstatat(36, "", {st_mode=S_IFDIR|0755, st_size=0, ...}, AT_EMPTY_PATH) = 0
close(23) = 0
openat(36, "rc0", O_RDONLY|O_NOFOLLOW|O_CLOEXEC|O_PATH) = 23
newfstatat(23, "", {st_mode=S_IFDIR|0755, st_size=0, ...}, AT_EMPTY_PATH) = 0
close(36) = 0
close(23) = 0
access("/sys/devices/pnp0/00:01/rc/rc0/uevent", F_OK) = 0
openat(AT_FDCWD, "/sys/devices/pnp0/00:01/rc/rc0/uevent", O_RDONLY|O_CLOEXEC) = 23
newfstatat(23, "", {st_mode=S_IFREG|0644, st_size=4096, ...}, AT_EMPTY_PATH) = 0
read(23, "NAME=rc-rc6-mce\nDRV_NAME=ite-cir"..., 4104) = 66
close(23) = 0
readlinkat(AT_FDCWD, "/sys/devices/pnp0/00:01/rc/rc0/subsystem", "../../../../../class/rc", 4096) = 23
openat(AT_FDCWD, "/run/udev/data/+rc:rc0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/run/log/journal/0322fcd160934520a68be3469a358ed3/system.journal", O_RDWR|O_NONBLOCK|O_CLOEXEC) = -1 ENOENT (No such file or directory)
通过查看这些文件,我找到了一种重现此问题的方法:
marius@nuc:~$ cat /sys/devices/pnp0/00\:01/rc/rc0/input4/name
ITE8708 CIR transceiver
marius@nuc:~$ cat /sys/devices/pnp0/00\:01/rc/rc0/protocols
rc-5 nec [rc-6] jvc sony rc-5-sz sanyo sharp mce_kbd xmp imon rc-mm [lirc]
NUC 位于电视下方,它带有一个红外接收器(我之前对此一无所知),会接收到电视遥控器发出的信号。按下遥控器上的任何按钮都会触发此问题。
我用 lsmod 找到了导致此问题的内核模块,并用 sudo modprobe -r ite_cir 禁用了它。此修复是暂时的,我还需要研究如何将该模块列入黑名单。
答案2
我在带有红外接收器的 Intel NUC 上运行 Linux Mint(不是 Ubuntu,但足够相似)。
我在这里找到了解决方案:https://support.tools/post/rc-rc0-overflow/
TLDR;如果不使用红外接收器,可以将其禁用并列入黑名单,从而解决该问题。您可以按如下方式操作:
# Disable IR kernel module
sudo modprobe -r ite_cir
# Check journal to make sure 'kernel: rc rc0: receive overflow' message is gone
sudo journalctl -f
# If disabling the IR module resolved your issue, you have to blacklist it to make it permanent.
# Edit this file: /etc/modprobe.d/blacklist.conf
sudo nano /etc/modprobe.d/blacklist.conf
# Add to it the following line:
blacklist ite_cir
# ctrl+o and return to save, ctrl+x to exit nano editor.
# Update initramfs
sudo update-initramfs -u
这为我解决了这个问题(至少对于当前安装的内核是如此;可能需要在下次内核更新后重复这些步骤,幸运的是,这是一个小的、可重复的操作)。
答案3
我使用了一些方法来解决 NUC 上的高 CPU 负载问题,这些方法确实有效。
#SOLUTION 1
sudo vim /etc/default/grub
# Add the following parameter to the GRUB_CMDLINE_LINUX_DEFAULT property
pci=nomsi
# Also try adding this parameter to the GRUB_CMDLINE_LINUX_DEFAULT and GRUB_CMDLINE_LINUX properties
pcie_aspm=off
#
sudo update-grub
sudo reboot
#SOLUTION 2 - Update the bios firmware
wget https://downloadmirror.intel.com/758735/BN0089.bio
sudo cp BN0089.bio /boot/
sudo cp BN0089.bio /root/
sudo reboot
#SOLUTION 3 (Usually this fixes the problem)
# Disable IR kernel module
sudo modprobe -r ite_cir
# Check journal to make sure 'kernel: rc rc0: receive overflow' message is gone
sudo journalctl -f
# If disabling the IR module resolved your issue, you have to blacklist it to make it permanent.
# Edit this file: /etc/modprobe.d/blacklist.conf
sudo vi /etc/modprobe.d/blacklist.conf
# Add to it the following line:
blacklist ite_cir
# Update initramfs
sudo update-initramfs -u