我怀疑我们的某个服务器应用程序已达到其最大打开文件限制。
应用程序在用户空间中以其自己的帐户运行。初始化脚本启动大量进程,这些进程又启动大量子进程和大量线程。
根据我在 /etc/security/limits.conf 中的设置:
USERNAME - nofile 2048
我怀疑应用程序已达到限制——通过查看临时文件目录,我发现那里有 2000 多个文件。
将限制提高到 4096 并重新启动应用程序后,我发现那里有超过 2100 个文件。
现在的问题是:如果应用程序达到 2048 的限制 - 为什么它没有记录在 /var/log/messages 中?
syslog-ng 是当前正在使用的 syslog 守护进程。
/etc/syslog-ng/syslog-ng.conf
# Global options for this configuration.
options { long_hostnames(off); sync(0); perm(0640); stats(3600); };
# Source "src": syslog-ng's own internal messages plus two local datagram
# sockets. Every log statement in this file reads from this source.
# NOTE(review): no kernel message source is listed here - confirm whether
# kernel messages are expected to reach these logs.
source src {
internal();
unix-dgram("/dev/log");
unix-dgram("/var/lib/ntp/dev/log");
};
# Filters: named predicates referenced by the log statements in this file.
# Kernel-facility messages that contain both "IN=" and "OUT=".
filter f_iptables { facility(kern) and match("IN=") and match("OUT="); };
# Console candidates: kernel warnings that are not f_iptables matches, or
# err-level messages outside authpriv.
# NOTE(review): this relies on and/or precedence - confirm the grouping is
# the intended one.
filter f_console { level(warn) and facility(kern) and not filter(f_iptables)
or level(err) and not facility(authpriv); };
# News facility, split by level, plus a catch-all for the facility.
filter f_newsnotice { level(notice) and facility(news); };
filter f_newscrit { level(crit) and facility(news); };
filter f_newserr { level(err) and facility(news); };
filter f_news { facility(news); };
# Mail facility, split by level, plus a catch-all for the facility.
filter f_mailinfo { level(info) and facility(mail); };
filter f_mailwarn { level(warn) and facility(mail); };
filter f_mailerr { level(err, crit) and facility(mail); };
filter f_mail { facility(mail); };
# Cron facility.
filter f_cron { facility(cron); };
# All local0..local7 facilities.
filter f_local { facility(local0, local1, local2, local3,
local4, local5, local6, local7); };
# Everything except news, mail, cron, authpriv and auth, and except
# f_iptables matches; used for /var/log/messages.
filter f_messages { not facility(news, mail, cron, authpriv, auth) and not filter(f_iptables); };
# warn/err/crit messages that are not f_iptables matches.
filter f_warn { level(warn, err, crit) and not filter(f_iptables); };
# Alert-level messages. NOTE(review): no log statement in the visible
# configuration references this filter.
filter f_alert { level(alert); };
# authpriv and auth facilities.
filter f_auth { facility(authpriv, auth); };
# Each destination below is paired with the log statement that feeds it
# from source "src" through one filter.
# f_console matches go to the /dev/tty10 pipe.
destination console { pipe("/dev/tty10" group(tty) perm(0620)); };
log { source(src); filter(f_console); destination(console); };
# f_console matches also go to the /dev/xconsole pipe.
destination xconsole { pipe("/dev/xconsole" group(tty) perm(0400)); };
log { source(src); filter(f_console); destination(xconsole); };
# auth/authpriv messages.
destination auth { file("/var/log/auth"); };
log { source(src); filter(f_auth); destination(auth); };
# crit-level news messages.
destination newscrit { file("/var/log/news/news.crit"); };
log { source(src); filter(f_newscrit); destination(newscrit); };
# err-level news messages.
destination newserr { file("/var/log/news/news.err"); };
log { source(src); filter(f_newserr); destination(newserr); };
# notice-level news messages get their own file. The log path must point at
# the newsnotice destination defined on the line above; routing it to newserr
# would leave news.notice unused and mix notices into news.err.
destination newsnotice { file("/var/log/news/news.notice"); };
log { source(src); filter(f_newsnotice); destination(newsnotice); };
# Mail messages split by level, each into its own file.
destination mailinfo { file("/var/log/mail.info"); };
log { source(src); filter(f_mailinfo); destination(mailinfo); };
destination mailwarn { file("/var/log/mail.warn"); };
log { source(src); filter(f_mailwarn); destination(mailwarn); };
# NOTE(review): fsync(yes) presumably forces a sync after each write -
# confirm against the syslog-ng documentation for this version.
destination mailerr { file("/var/log/mail.err" fsync(yes)); };
log { source(src); filter(f_mailerr); destination(mailerr); };
# All mail-facility messages, regardless of level.
destination mail { file("/var/log/mail"); };
log { source(src); filter(f_mail); destination(mail); };
# Cron-facility messages.
destination cron { file("/var/log/cron"); };
log { source(src); filter(f_cron); destination(cron); };
# local0..local7 messages.
destination localmessages { file("/var/log/localmessages"); };
log { source(src); filter(f_local); destination(localmessages); };
# General log: whatever f_messages lets through.
destination messages { file("/var/log/messages"); };
log { source(src); filter(f_messages); destination(messages); };
# f_iptables matches.
destination firewall { file("/var/log/firewall"); };
log { source(src); filter(f_iptables); destination(firewall); };
# warn/err/crit messages (see f_warn); fsync(yes) is set on this file too.
destination warn { file("/var/log/warn" fsync(yes)); };
log { source(src); filter(f_warn); destination(warn); };
答案1
您需要先确认文件描述符是否真的用完了。
运行您的进程,然后执行 cat /proc/<pid>/limits
查看其限制。
然后,您可以运行 ls -1 /proc/<pid>/fd | wc -l 来获取文件描述符计数。
请注意,每个进程都有自己的限制(例如父进程的各个子进程)。但是,线程显然共享调用进程的文件描述符表,因此线程和调用进程之间确实共享文件限制。
虽然您无法在 bash 中创建线程,但可以使用此程序来演示其效果。
/* Compiled with gcc -o upcount upcount.c -pthread */
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <string.h>
#include <unistd.h>
#include <err.h>
#include <sysexits.h>
#include <errno.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/wait.h>
/* Number of extra threads each child process starts in run_child() */
#define THREADS 3
/* Number of child processes forked by main() */
#define NUMCHILD 3
/* NOTE(review): not referenced anywhere in the visible code */
#define DEF_OPEN_LIMIT 256
/* The gimmick in this program is to constantly dup an FD
 * until we run out of file handles.
 *
 * basefd: an open descriptor that is duplicated over and over.
 *
 * The duplicated descriptors are deliberately never closed, so the
 * descriptor table of the process keeps filling up. The function returns
 * once dup() fails (the reason is printed on stderr) or after MAX_DUPS
 * duplicates have been created. Between two dup() calls it sleeps for
 * 100ms plus a pseudo-random delay below 400ms. */
void dup_fds(
    int basefd)
{
  enum { MAX_DUPS = 1048576 };
  int i;
  int *fds = calloc(MAX_DUPS, sizeof *fds);
  char etxt[256];
  /* rand_r() takes an unsigned int seed; derive it from the calling thread */
  unsigned int seed = (unsigned int)pthread_self();

  if (fds == NULL) {
    fprintf(stderr, "Cannot allocate descriptor table\n");
    return;
  }

  for (i = 0; i < MAX_DUPS; i++)
    fds[i] = -1;

  for (i = 0; i < MAX_DUPS; i++) {
    fds[i] = dup(basefd);
    if (fds[i] < 0) {
      strerror_r(errno, etxt, sizeof etxt);
      fprintf(stderr, "Cannot dup file: %s\n", etxt);
      break;
    }
    usleep(100000 + (rand_r(&seed) % 400000));
  }

  /* Only the bookkeeping array is released; the descriptors stay open. */
  free(fds);
}
/* Thread entry point: applies a pseudo-random soft RLIMIT_NOFILE and then
 * duplicates descriptor 1 until no more descriptors can be obtained.
 *
 * data: unused.
 * Returns NULL in every case; failures are reported with perror().
 */
void * run_thread(
    void *data)
{
  /* The limit set here is not independent: setrlimit() acts on the whole
   * process, so all threads end up sharing whichever value was set last. */
  struct rlimit ofiles;
  unsigned int seed;

  (void)data;

  /* Obtain the open files limit */
  if (getrlimit(RLIMIT_NOFILE, &ofiles) < 0) {
    perror("cannot get limits");
    return NULL;
  }

  /* Assign a random value to current limit. The seed comes from the thread
   * id; seeding with getpid() would merely recompute the value run_child()
   * has already applied. */
  seed = (unsigned int)pthread_self();
  ofiles.rlim_cur = 128 + (rand_r(&seed) % 896);

  /* Set the limit */
  if (setrlimit(RLIMIT_NOFILE, &ofiles) < 0) {
    perror("cannot set limits");
    return NULL;
  }

  dup_fds(1);
  return NULL;
}
/* Body of one forked child: applies a pseudo-random soft RLIMIT_NOFILE
 * (seeded with the pid), starts THREADS threads running run_thread(), eats
 * descriptors itself via dup_fds(1), joins the threads and exits.
 *
 * Never returns: it ends with exit(0), or with err() on failure.
 */
void run_child(
    void)
{
  int i, rc;
  unsigned int seed;
  struct rlimit ofiles;
  pthread_t threads[THREADS];

  /* Obtain the open files limit */
  if (getrlimit(RLIMIT_NOFILE, &ofiles) < 0)
    err(EX_OSERR, "Cannot obtain limits");

  /* Assign a random value to current limit */
  seed = (unsigned int)getpid();
  ofiles.rlim_cur = 128 + (rand_r(&seed) % 896);

  /* Set the limit */
  if (setrlimit(RLIMIT_NOFILE, &ofiles) < 0)
    err(EX_OSERR, "Cannot set limits");

  /* Create threads. pthread functions return the error number instead of
   * setting errno, so copy it over before err() formats the message. */
  for (i = 0; i < THREADS; i++) {
    rc = pthread_create(&threads[i], NULL, run_thread, NULL);
    if (rc != 0) {
      errno = rc;
      err(EX_OSERR, "Cannot spawn thread");
    }
  }

  dup_fds(1);

  for (i = 0; i < THREADS; i++) {
    rc = pthread_join(threads[i], NULL);
    if (rc != 0) {
      errno = rc;
      err(EX_OSERR, "Cannot join thread");
    }
  }

  exit(0);
}
/* Forks NUMCHILD children that each run run_child(), then reaps every child
 * that was actually started. Always returns 0; fork/wait failures are only
 * reported as warnings. */
int main(void)
{
  int i, s;
  int spawned = 0;
  pid_t pid;

  /* Spawn children */
  for (i = 0; i < NUMCHILD; i++) {
    pid = fork();
    if (pid < 0) {
      /* No child was created, so there is nothing to wait for later */
      warn("fork failed");
      continue;
    }
    if (pid == 0)
      run_child(); /* does not return */
    spawned++;
  }

  /* Reap exactly the children that were started */
  for (i = 0; i < spawned; i++) {
    if (wait(&s) < 0)
      warn("wait failed");
  }
  return 0;
}
这个程序产生了 3 个子进程,每个子进程各带 3 个线程。
$ ./upfilecnt & pstree -p $!
upfilecnt(12662)─┬─upfilecnt(12663)─┬─{upfilecnt}(12666)
│ ├─{upfilecnt}(12667)
│ └─{upfilecnt}(12668)
├─upfilecnt(12664)─┬─{upfilecnt}(12669)
│ ├─{upfilecnt}(12670)
│ └─{upfilecnt}(12671)
└─upfilecnt(12665)─┬─{upfilecnt}(12672)
├─{upfilecnt}(12673)
└─{upfilecnt}(12674)
每个子进程及其线程都在不断创建新的文件描述符,每次间隔 0.1 秒,再加上最多 0.4 秒的随机等待。
可以看到,从子进程来看,每个子进程都有一个独立的文件描述符表。
$ for i in 1266{3,4,5}; do ls -1 /proc/$i/fd | wc -l; done
637
646
636
然而这些子进程的线程都与子进程共享相同的数量。
# .. another invocation
$ for i in 134{11,14,15,10,12,13,16,17,18}; do ls -1 /proc/$i/fd | wc -l; done
438
438
438
430
430
430
433
433
433
还请注意,子进程可以有独立的限制。此程序还对每个子进程的调用设置了随机限制。
$ grep -h "Max open" /proc/1420{3,4,5}/limits
Max open files 504 4096 files
Max open files 502 4096 files
Max open files 372 4096 files
为了增加额外的乐趣,它还在每个线程中设置了随机的打开文件限制。但这个限制并不是每个线程各自独立的,而是由进程(子进程)及其所有线程共享。
grep -h "Max open" /proc/1420{3,4,5}/task/*/limits
Max open files 1011 4096 files
Max open files 1011 4096 files
Max open files 1011 4096 files
Max open files 1011 4096 files
Max open files 1009 4096 files
Max open files 1009 4096 files
Max open files 1009 4096 files
Max open files 1009 4096 files
Max open files 750 4096 files
Max open files 750 4096 files
Max open files 750 4096 files
Max open files 750 4096 files
答案2
您缺少这个源定义:
# messages from the kernel
file("/proc/kmsg" program_override("kernel: "));
那你就没事了!