rm 包含数百万个文件的目录

Question 1

值得尝试mountdata=writeback选项，以防止对文件系统进行日志记录。这应该只在删除时执行，但如果在删除操作期间服务器关闭或重新启动，则存在风险。

根据这一页，

一些应用程序在使用它时会表现出非常明显的速度提升。例如，当应用程序创建和删除大量小文件时，可以看到速度提升 (...)。

fstab该选项在挂载操作中或挂载操作期间设置，替换data=ordered为data=writeback。必须重新挂载包含要删除的文件的文件系统。

Answer

值得尝试mountdata=writeback选项，以防止对文件系统进行日志记录。这应该只在删除时执行，但如果在删除操作期间服务器关闭或重新启动，则存在风险。

根据这一页，

一些应用程序在使用它时会表现出非常明显的速度提升。例如，当应用程序创建和删除大量小文件时，可以看到速度提升 (...)。

fstab该选项在挂载操作中或挂载操作期间设置，替换data=ordered为data=writeback。必须重新挂载包含要删除的文件的文件系统。

Question 2

2021 年 8 月更新

这个答案继续吸引了很多关注，我觉得它已经过时了，现在有点多余了。

就性能而言，这样做find ... -delete很可能会产生可接受的结果。

我认为可能提高性能的一个方面是解决问题的“删除”部分而不是“列出”部分。

我试过了，但没用。但我觉得解释一下我做了什么以及为什么这么做是有用的。

在当今较新的内核中，通过使用内核中的 IO 子系统（参见man 2 io_uring_setup），实际上可以尝试异步执行取消链接 - 这意味着我们可以提交取消链接请求而无需等待或阻塞以查看结果。

该程序基本上读取一个目录，提交数百个unlinks而不等待结果，然后在系统处理完请求后立即收获结果。

它尝试执行dentls所做的操作，但使用了 IO uring。可以使用进行编译gcc -o dentls2 dentls2.c -luring。

#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <err.h>
#include <sched.h>

#include <sys/stat.h>
#include <sys/types.h>
#include <dirent.h>

#include <linux/io_uring.h>
#include <liburing.h>

/* Try to keep the queue size to under two pages as internally its stored in
 * the kernel as contiguously ordered pages. Basically the bigger you make it
 * the higher order it becomes and the less likely you'll have the contiguous
 * pages to support it, despite not hitting any user limits.
 * This reduces an ENOMEM here by keeping the queue size as order 1
 * Ring size internally is rougly 24 bytes per entry plus overheads I haven't
 * accounted for.
 */
#define QUEUE_SIZE 256

/* Globals to manage the queue */
static volatile int pending = 0;
static volatile int total_files = 0;

/* Probes kernel uring implementation and checks if action is 
 * supported inside the kernel */
static void probe_uring(
    struct io_uring *ring)
{
  struct io_uring_probe *pb = {0};

  pb = io_uring_get_probe_ring(ring);

  /* Can we perform IO uring unlink in this kernel ? */
  if (!io_uring_opcode_supported(pb, IORING_OP_UNLINKAT)) {
    free(pb);
    errno = ENOTSUP;
    err(EXIT_FAILURE, "Unable to configure uring");
  }

  free(pb);
}


/* Place a unlink call for the specified file/directory on the ring */
static int submit_unlink_request(
    int dfd,
    const char *fname,
    struct io_uring *ring)
{
  char *fname_cpy = strdup(fname);
  struct io_uring_sqe *sqe = NULL;

  /* Fetch a free submission entry off the ring */
  sqe = io_uring_get_sqe(ring);
  if (!sqe)
    /* Submission queue full */
    return 0;

  pending++;
  /* Format the unlink call for submission */
  io_uring_prep_rw(IORING_OP_UNLINKAT, sqe, dfd, fname_cpy, 0, 0);
  sqe->unlink_flags = 0;

  /* Set the data to just be the filename. Useful for debugging
   * at a later point */
  io_uring_sqe_set_data(sqe, fname_cpy);

  return 1;
}


/* Submit the pending queue, then reap the queue
 * clearing up room on the completion queue */
static void consume_queue(
    struct io_uring *ring)
{
  char *fn;
  int i = 0, bad = 0;
  int rc;
  struct io_uring_cqe **cqes = NULL;

  if (pending < 0)
    abort();

  cqes = calloc(pending, sizeof(struct io_uring_cqe *));
  if (!cqes)
    err(EXIT_FAILURE, "Cannot find memory for CQE pointers");

  /* Notify about submitted entries from the queue (this is a async call) */
  io_uring_submit(ring);

  /* We can immediately take a peek to see if we've anything completed */
  rc = io_uring_peek_batch_cqe(ring, cqes, pending);

  /* Iterate the list of completed entries. Check nothing crazy happened */
  for (i=0; i < rc; i++) {
    /* This returns the filename we set earlier */
    fn = io_uring_cqe_get_data(cqes[i]);

    /* Check the error code of the unlink calls */
    if (cqes[i]->res < 0) {
      errno = -cqes[i]->res;
      warn("Unlinking entry %s failed", fn);
      bad++;
    }

    /* Clear up our CQE */
    free(fn);
    io_uring_cqe_seen(ring, cqes[i]);
  }

  pending -= rc + bad;
  total_files += rc - bad;
  free(cqes);
}



/* Main start */
int main(
    const int argc,
    const char **argv)
{
  struct io_uring ring = {0};
  struct stat st = {0};
  DIR *target = NULL;
  int dfd;
  struct dirent *fn;

  /* Check initial arguments passed make sense */
  if (argc < 2)
    errx(EXIT_FAILURE, "Must pass a directory to remove files from.");

  /* Check path validity */
  if (lstat(argv[1], &st) < 0)
    err(EXIT_FAILURE, "Cannot access target directory");

  if (!S_ISDIR(st.st_mode)) 
    errx(EXIT_FAILURE, "Path specified must be a directory");

  /* Open the directory */
  target = opendir(argv[1]);
  if (!target)
    err(EXIT_FAILURE, "Opening the directory failed");
  dfd = dirfd(target);

  /* Create the initial uring for handling the file removals */
  if (io_uring_queue_init(QUEUE_SIZE, &ring, 0) < 0)
    err(EXIT_FAILURE, "Cannot initialize URING");

  /* Check the unlink action is supported */
  probe_uring(&ring);

  /* So as of writing this code, GETDENTS doesn't have URING support.
   * but checking the kernel mailing list indicates its in progress.
   * For now, we'll just do laymans readdir(). These days theres no 
   * actual difference between it and making the getdents() call ourselves.
   */
  while (fn = readdir(target)) {
    if (fn->d_type != DT_REG)
      /* Pay no attention to non-files */
      continue;

    /* Add to the queue until its full, try to consume it
     * once its full. 
     */
    while (!submit_unlink_request(dfd, fn->d_name, &ring)) {
      /* When the queue becomes full, consume queued entries */
      consume_queue(&ring);
      /* This yield is here to give the uring a chance to 
       * complete pending requests */
      sched_yield();
      continue;
    }
  }

  /* Out of files in directory to list. Just clear the queue */
  while (pending) {
    consume_queue(&ring);
    sched_yield();
  }

  printf("Total files: %d\n", total_files);

  io_uring_queue_exit(&ring);
  closedir(target);
  exit(0);
}

具有讽刺意味的是，结果与我的猜测相反，但是为什么呢？

包含 400 万个文件的 TMPFS

$ time ./dentls2 /tmp/many
Total files: 4000000

real    0m6.459s
user    0m0.360s
sys 0m24.224s

使用查找：

$ time find /tmp/many -type f -delete

real    0m9.978s
user    0m1.872s
sys 0m6.617s

包含 1000 万个文件的 BTRFS

$ time ./dentls2 ./many
Total files: 10000000

real    10m25.749s
user    0m2.214s
sys 16m30.865s

使用查找：

time find ./many -type f -delete

real    7m1.328s
user    0m9.209s
sys 4m42.000s

因此，看起来批量系统调用并没有实时改善性能。新系统dentls2花费了更多时间（四倍）工作，但性能却更差。因此，整体效率的净损失和更差的延迟。dentls2更糟糕。

造成这种情况的原因是 io_uring 产生内核调度线程来内部执行取消链接工作，但正在处理的目录 inode 一次只能由一个写入器修改。

基本上，我们使用uring创建了很多小线程，但只有一个线程被允许从目录中删除。我们刚刚创建了一堆争用，并消除了执行批量 IO 的优势。

使用 eBPF，您可以测量解除链接的频率并观察导致延迟的原因。

对于 BTRFS 来说，它的内核函数调用在被调用btrfs_commit_inode_delayed_inode时获取锁。unlink

和dentls2

# /usr/share/bcc/tools/funclatency btrfs_commit_inode_delayed_inode
    Tracing 1 functions for "btrfs_commit_inode_delayed_inode"... Hit Ctrl-C to end.

     nsecs               : count     distribution
         0 -> 1          : 0        |                                        |
         2 -> 3          : 0        |                                        |
         4 -> 7          : 0        |                                        |
         8 -> 15         : 0        |                                        |
        16 -> 31         : 0        |                                        |
        32 -> 63         : 0        |                                        |
        64 -> 127        : 0        |                                        |
       128 -> 255        : 0        |                                        |
       256 -> 511        : 18       |                                        |
       512 -> 1023       : 120      |                                        |
      1024 -> 2047       : 50982    |                                        |
      2048 -> 4095       : 2569467  |********************                    |
      4096 -> 8191       : 4936402  |****************************************|
      8192 -> 16383      : 1662380  |*************                           |
     16384 -> 32767      : 656883   |*****                                   |
     32768 -> 65535      : 85409    |                                        |
     65536 -> 131071     : 21715    |                                        |
    131072 -> 262143     : 9719     |                                        |
    262144 -> 524287     : 5981     |                                        |
    524288 -> 1048575    : 857      |                                        |
   1048576 -> 2097151    : 293      |                                        |
   2097152 -> 4194303    : 220      |                                        |
   4194304 -> 8388607    : 255      |                                        |
   8388608 -> 16777215   : 153      |                                        |
  16777216 -> 33554431   : 56       |                                        |
  33554432 -> 67108863   : 6        |                                        |
  67108864 -> 134217727  : 1        |                                        |

avg = 8533 nsecs, total: 85345432173 nsecs, count: 10000918

使用find ... -delete：

# /usr/share/bcc/tools/funclatency btrfs_commit_inode_delayed_inode
Tracing 1 functions for "btrfs_commit_inode_delayed_inode"... Hit Ctrl-C to end.
     nsecs               : count     distribution
         0 -> 1          : 0        |                                        |
         2 -> 3          : 0        |                                        |
         4 -> 7          : 0        |                                        |
         8 -> 15         : 0        |                                        |
        16 -> 31         : 0        |                                        |
        32 -> 63         : 0        |                                        |
        64 -> 127        : 0        |                                        |
       128 -> 255        : 0        |                                        |
       256 -> 511        : 34       |                                        |
       512 -> 1023       : 95       |                                        |
      1024 -> 2047       : 1005784  |****                                    |
      2048 -> 4095       : 8110338  |****************************************|
      4096 -> 8191       : 672119   |***                                     |
      8192 -> 16383      : 158329   |                                        |
     16384 -> 32767      : 42338    |                                        |
     32768 -> 65535      : 4667     |                                        |
     65536 -> 131071     : 3597     |                                        |
    131072 -> 262143     : 2860     |                                        |
    262144 -> 524287     : 216      |                                        |
    524288 -> 1048575    : 22       |                                        |
   1048576 -> 2097151    : 6        |                                        |
   2097152 -> 4194303    : 3        |                                        |
   4194304 -> 8388607    : 5        |                                        |
   8388608 -> 16777215   : 3        |                                        |

avg = 3258 nsecs, total: 32585481993 nsecs, count: 10000416

从直方图中可以看出，find在中平均花费 3258 纳秒，btrfs_commit_inode_delayed_inode但dentls2在函数中花费 8533 纳秒。

直方图还显示，总体而言，io_uring 线程等待锁的时间至少是其他线程的两倍，大多数调用花费 4096-8091 纳秒，而大多数调用find花费 2048-4095 纳秒。

Find 是单线程的并且不会争用锁，而 `dentls2 是多线程的（由于 uring），这会产生锁争用，并且所经历的延迟会反映在分析中。

结论

总而言之，在现代系统上（截至撰写本文时），在软件中能够使其运行速度比设定的更快的操作越来越少。

它曾经从磁盘读取一个大的缓冲区，您可以将昂贵的 IO 调用合并为一个大的顺序读取，而不是像小的 getdents() 缓冲区那样进行寻道 IO。

此外，由于其他改进，仅调用系统调用的开销更小，并且顺序/随机 I/O 访问时间得到重大改进，从而消除了我们过去遇到的大型 I/O 瓶颈。

在我的系统上，这个问题已经变得受内存/CPU 限制。BTRFS 上存在单访问器问题（至少如此），这会限制您每次访问单个 CPU/程序时每个目录的取消链接的速度。即使在使用 tmpfs 的理想情况下，尝试批量处理 IO 也只能获得微小的改进，而在实际文件系统上，情况通常更糟。

最重要的是，我们真的不再有这个问题了——花 4 个小时删除 1000 万个文件的日子已经一去不复返了。

只需做一些简单的事情find ... -delete。我尝试的任何优化似乎都无法在默认的简单设置上产生值得编码（或分析）的重大性能改进。

原始答案

虽然导致该问题的主要原因是 ext3 在处理数百万个文件时的性能问题，但该问题的实际根本原因却不同。

当需要列出目录时，将对目录调用 readdir()，从而生成文件列表。readdir 是一个 posix 调用，但此处使用的真正 Linux 系统调用称为“getdents”。Getdents 通过使用条目填充缓冲区来列出目录条目。

问题主要在于 readdir() 使用固定大小为 32Kb 的缓冲区来获取文件。随着目录越来越大（文件添加后大小也会增加），ext3 获取条目的速度会越来越慢，而且 readdir 的额外 32Kb 缓冲区大小仅足以包含目录中的一小部分条目。这导致 readdir 不断循环并不断调用昂贵的系统调用。

例如，在我创建的一个包含 260 多万个文件的测试目录中，运行“ls -1|wc-l”会显示许多 getdent 系统调用的大量 strace 输出。

$ strace ls -1 | wc -l
brk(0x4949000)                          = 0x4949000
getdents(3, /* 1025 entries */, 32768)  = 32752
getdents(3, /* 1024 entries */, 32768)  = 32752
getdents(3, /* 1025 entries */, 32768)  = 32760
getdents(3, /* 1025 entries */, 32768)  = 32768
brk(0)                                  = 0x4949000
brk(0x496a000)                          = 0x496a000
getdents(3, /* 1024 entries */, 32768)  = 32752
getdents(3, /* 1026 entries */, 32768)  = 32760
...

此外，在这个目录中花费的时间也相当多。

$ time ls -1 | wc -l
2616044

real    0m20.609s
user    0m16.241s
sys 0m3.639s

使这个过程更加高效的方法是使用更大的缓冲区手动调用 getdents。这可以显著提高性能。

现在，你不应该自己手动调用 getdents，因此不存在正常使用它的接口（查看 getdents 的手册页即可了解！），但是你能手动调用它并使您的系统调用更加高效。

这大大减少了获取这些文件所需的时间。我编写了一个程序来实现这一点。

/* I can be compiled with the command "gcc -o dentls dentls.c" */

#define _GNU_SOURCE

#include <dirent.h>     /* Defines DT_* constants */
#include <err.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

struct linux_dirent {
        long           d_ino;
        off_t          d_off;
        unsigned short d_reclen;
        char           d_name[256];
        char           d_type;
};

static int delete = 0;
char *path = NULL;

static void parse_config(
        int argc,
        char **argv)
{
    int option_idx = 0;
    static struct option loptions[] = {
      { "delete", no_argument, &delete, 1 },
      { "help", no_argument, NULL, 'h' },
      { 0, 0, 0, 0 }
    };

    while (1) {
        int c = getopt_long(argc, argv, "h", loptions, &option_idx);
        if (c < 0)
            break;

        switch(c) {
          case 0: {
              break;
          }
 
          case 'h': {
              printf("Usage: %s [--delete] DIRECTORY\n"
                     "List/Delete files in DIRECTORY.\n"
                     "Example %s --delete /var/spool/postfix/deferred\n",
                     argv[0], argv[0]);
              exit(0);                      
              break;
          }

          default:
          break;
        }
    }

    if (optind >= argc)
      errx(EXIT_FAILURE, "Must supply a valid directory\n");

    path = argv[optind];
}

int main(
    int argc,
    char** argv)
{

    parse_config(argc, argv);

    int totalfiles = 0;
    int dirfd = -1;
    int offset = 0;
    int bufcount = 0;
    void *buffer = NULL;
    char *d_type;
    struct linux_dirent *dent = NULL;
    struct stat dstat;

    /* Standard sanity checking stuff */
    if (access(path, R_OK) < 0) 
        err(EXIT_FAILURE, "Could not access directory");

    if (lstat(path, &dstat) < 0) 
        err(EXIT_FAILURE, "Unable to lstat path");

    if (!S_ISDIR(dstat.st_mode))
        errx(EXIT_FAILURE, "The path %s is not a directory.\n", path);

    /* Allocate a buffer of equal size to the directory to store dents */
    if ((buffer = calloc(dstat.st_size*3, 1)) == NULL)
        err(EXIT_FAILURE, "Buffer allocation failure");

    /* Open the directory */
    if ((dirfd = open(path, O_RDONLY)) < 0) 
        err(EXIT_FAILURE, "Open error");

    /* Switch directories */
    fchdir(dirfd);

    if (delete) {
        printf("Deleting files in ");
        for (int i=5; i > 0; i--) {
            printf("%u. . . ", i);
            fflush(stdout);
            sleep(1);
        }
        printf("\n");
    }

    while (bufcount = syscall(SYS_getdents, dirfd, buffer, dstat.st_size*3)) {
        offset = 0;
        dent = buffer;
        while (offset < bufcount) {
            /* Don't print thisdir and parent dir */
            if (!((strcmp(".",dent->d_name) == 0) || (strcmp("..",dent->d_name) == 0))) {
                d_type = (char *)dent + dent->d_reclen-1;
                /* Only print files */
                if (*d_type == DT_REG) {
                    printf ("%s\n", dent->d_name);
                    if (delete) {
                        if (unlink(dent->d_name) < 0)
                            warn("Cannot delete file \"%s\"", dent->d_name);
                    }
                    totalfiles++;
                }
            }
            offset += dent->d_reclen;
            dent = buffer + offset;
        }
    }
    fprintf(stderr, "Total files: %d\n", totalfiles);
    close(dirfd);
    free(buffer);

    exit(0);
}

虽然这不能解决根本问题（文件系统中有大量文件，但性能不佳）。但它可能比发布的许多替代方案要快得多。

出于事先考虑，应该删除受影响的目录，然后重新创建。目录的大小只会增加，并且由于目录的大小，即使里面有几个文件，其性能仍然很差。

编辑：我已经清理了不少。添加了一个选项，允许您在运行时在命令行上删除，并删除了一堆 treewalk 内容，老实说，回头看，这充其量是值得怀疑的。还显示会产生内存损坏。

你现在可以dentls --delete /my/path

新结果。基于包含 182 万个文件的目录。

## Ideal ls Uncached
$ time ls -u1 data >/dev/null

real    0m44.948s
user    0m1.737s
sys 0m22.000s

## Ideal ls Cached
$ time ls -u1 data >/dev/null

real    0m46.012s
user    0m1.746s
sys 0m21.805s


### dentls uncached
$ time ./dentls data >/dev/null
Total files: 1819292

real    0m1.608s
user    0m0.059s
sys 0m0.791s

## dentls cached
$ time ./dentls data >/dev/null
Total files: 1819292

real    0m0.771s
user    0m0.057s
sys 0m0.711s

有点惊讶它仍然如此有效！

Answer

2021 年 8 月更新

这个答案继续吸引了很多关注，我觉得它已经过时了，现在有点多余了。

就性能而言，这样做find ... -delete很可能会产生可接受的结果。

我认为可能提高性能的一个方面是解决问题的“删除”部分而不是“列出”部分。

我试过了，但没用。但我觉得解释一下我做了什么以及为什么这么做是有用的。

在当今较新的内核中，通过使用内核中的 IO 子系统（参见man 2 io_uring_setup），实际上可以尝试异步执行取消链接 - 这意味着我们可以提交取消链接请求而无需等待或阻塞以查看结果。

该程序基本上读取一个目录，提交数百个unlinks而不等待结果，然后在系统处理完请求后立即收获结果。

它尝试执行dentls所做的操作，但使用了 IO uring。可以使用进行编译gcc -o dentls2 dentls2.c -luring。

#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <err.h>
#include <sched.h>

#include <sys/stat.h>
#include <sys/types.h>
#include <dirent.h>

#include <linux/io_uring.h>
#include <liburing.h>

/* Try to keep the queue size to under two pages as internally its stored in
 * the kernel as contiguously ordered pages. Basically the bigger you make it
 * the higher order it becomes and the less likely you'll have the contiguous
 * pages to support it, despite not hitting any user limits.
 * This reduces an ENOMEM here by keeping the queue size as order 1
 * Ring size internally is rougly 24 bytes per entry plus overheads I haven't
 * accounted for.
 */
#define QUEUE_SIZE 256

/* Globals to manage the queue */
static volatile int pending = 0;
static volatile int total_files = 0;

/* Probes kernel uring implementation and checks if action is 
 * supported inside the kernel */
static void probe_uring(
    struct io_uring *ring)
{
  struct io_uring_probe *pb = {0};

  pb = io_uring_get_probe_ring(ring);

  /* Can we perform IO uring unlink in this kernel ? */
  if (!io_uring_opcode_supported(pb, IORING_OP_UNLINKAT)) {
    free(pb);
    errno = ENOTSUP;
    err(EXIT_FAILURE, "Unable to configure uring");
  }

  free(pb);
}


/* Place a unlink call for the specified file/directory on the ring */
static int submit_unlink_request(
    int dfd,
    const char *fname,
    struct io_uring *ring)
{
  char *fname_cpy = strdup(fname);
  struct io_uring_sqe *sqe = NULL;

  /* Fetch a free submission entry off the ring */
  sqe = io_uring_get_sqe(ring);
  if (!sqe)
    /* Submission queue full */
    return 0;

  pending++;
  /* Format the unlink call for submission */
  io_uring_prep_rw(IORING_OP_UNLINKAT, sqe, dfd, fname_cpy, 0, 0);
  sqe->unlink_flags = 0;

  /* Set the data to just be the filename. Useful for debugging
   * at a later point */
  io_uring_sqe_set_data(sqe, fname_cpy);

  return 1;
}


/* Submit the pending queue, then reap the queue
 * clearing up room on the completion queue */
static void consume_queue(
    struct io_uring *ring)
{
  char *fn;
  int i = 0, bad = 0;
  int rc;
  struct io_uring_cqe **cqes = NULL;

  if (pending < 0)
    abort();

  cqes = calloc(pending, sizeof(struct io_uring_cqe *));
  if (!cqes)
    err(EXIT_FAILURE, "Cannot find memory for CQE pointers");

  /* Notify about submitted entries from the queue (this is a async call) */
  io_uring_submit(ring);

  /* We can immediately take a peek to see if we've anything completed */
  rc = io_uring_peek_batch_cqe(ring, cqes, pending);

  /* Iterate the list of completed entries. Check nothing crazy happened */
  for (i=0; i < rc; i++) {
    /* This returns the filename we set earlier */
    fn = io_uring_cqe_get_data(cqes[i]);

    /* Check the error code of the unlink calls */
    if (cqes[i]->res < 0) {
      errno = -cqes[i]->res;
      warn("Unlinking entry %s failed", fn);
      bad++;
    }

    /* Clear up our CQE */
    free(fn);
    io_uring_cqe_seen(ring, cqes[i]);
  }

  pending -= rc + bad;
  total_files += rc - bad;
  free(cqes);
}



/* Main start */
int main(
    const int argc,
    const char **argv)
{
  struct io_uring ring = {0};
  struct stat st = {0};
  DIR *target = NULL;
  int dfd;
  struct dirent *fn;

  /* Check initial arguments passed make sense */
  if (argc < 2)
    errx(EXIT_FAILURE, "Must pass a directory to remove files from.");

  /* Check path validity */
  if (lstat(argv[1], &st) < 0)
    err(EXIT_FAILURE, "Cannot access target directory");

  if (!S_ISDIR(st.st_mode)) 
    errx(EXIT_FAILURE, "Path specified must be a directory");

  /* Open the directory */
  target = opendir(argv[1]);
  if (!target)
    err(EXIT_FAILURE, "Opening the directory failed");
  dfd = dirfd(target);

  /* Create the initial uring for handling the file removals */
  if (io_uring_queue_init(QUEUE_SIZE, &ring, 0) < 0)
    err(EXIT_FAILURE, "Cannot initialize URING");

  /* Check the unlink action is supported */
  probe_uring(&ring);

  /* So as of writing this code, GETDENTS doesn't have URING support.
   * but checking the kernel mailing list indicates its in progress.
   * For now, we'll just do laymans readdir(). These days theres no 
   * actual difference between it and making the getdents() call ourselves.
   */
  while (fn = readdir(target)) {
    if (fn->d_type != DT_REG)
      /* Pay no attention to non-files */
      continue;

    /* Add to the queue until its full, try to consume it
     * once its full. 
     */
    while (!submit_unlink_request(dfd, fn->d_name, &ring)) {
      /* When the queue becomes full, consume queued entries */
      consume_queue(&ring);
      /* This yield is here to give the uring a chance to 
       * complete pending requests */
      sched_yield();
      continue;
    }
  }

  /* Out of files in directory to list. Just clear the queue */
  while (pending) {
    consume_queue(&ring);
    sched_yield();
  }

  printf("Total files: %d\n", total_files);

  io_uring_queue_exit(&ring);
  closedir(target);
  exit(0);
}

具有讽刺意味的是，结果与我的猜测相反，但是为什么呢？

包含 400 万个文件的 TMPFS

$ time ./dentls2 /tmp/many
Total files: 4000000

real    0m6.459s
user    0m0.360s
sys 0m24.224s

使用查找：

$ time find /tmp/many -type f -delete

real    0m9.978s
user    0m1.872s
sys 0m6.617s

包含 1000 万个文件的 BTRFS

$ time ./dentls2 ./many
Total files: 10000000

real    10m25.749s
user    0m2.214s
sys 16m30.865s

使用查找：

time find ./many -type f -delete

real    7m1.328s
user    0m9.209s
sys 4m42.000s

因此，看起来批量系统调用并没有实时改善性能。新系统dentls2花费了更多时间（四倍）工作，但性能却更差。因此，整体效率的净损失和更差的延迟。dentls2更糟糕。

造成这种情况的原因是 io_uring 产生内核调度线程来内部执行取消链接工作，但正在处理的目录 inode 一次只能由一个写入器修改。

基本上，我们使用uring创建了很多小线程，但只有一个线程被允许从目录中删除。我们刚刚创建了一堆争用，并消除了执行批量 IO 的优势。

使用 eBPF，您可以测量解除链接的频率并观察导致延迟的原因。

对于 BTRFS 来说，它的内核函数调用在被调用btrfs_commit_inode_delayed_inode时获取锁。unlink

和dentls2

# /usr/share/bcc/tools/funclatency btrfs_commit_inode_delayed_inode
    Tracing 1 functions for "btrfs_commit_inode_delayed_inode"... Hit Ctrl-C to end.

     nsecs               : count     distribution
         0 -> 1          : 0        |                                        |
         2 -> 3          : 0        |                                        |
         4 -> 7          : 0        |                                        |
         8 -> 15         : 0        |                                        |
        16 -> 31         : 0        |                                        |
        32 -> 63         : 0        |                                        |
        64 -> 127        : 0        |                                        |
       128 -> 255        : 0        |                                        |
       256 -> 511        : 18       |                                        |
       512 -> 1023       : 120      |                                        |
      1024 -> 2047       : 50982    |                                        |
      2048 -> 4095       : 2569467  |********************                    |
      4096 -> 8191       : 4936402  |****************************************|
      8192 -> 16383      : 1662380  |*************                           |
     16384 -> 32767      : 656883   |*****                                   |
     32768 -> 65535      : 85409    |                                        |
     65536 -> 131071     : 21715    |                                        |
    131072 -> 262143     : 9719     |                                        |
    262144 -> 524287     : 5981     |                                        |
    524288 -> 1048575    : 857      |                                        |
   1048576 -> 2097151    : 293      |                                        |
   2097152 -> 4194303    : 220      |                                        |
   4194304 -> 8388607    : 255      |                                        |
   8388608 -> 16777215   : 153      |                                        |
  16777216 -> 33554431   : 56       |                                        |
  33554432 -> 67108863   : 6        |                                        |
  67108864 -> 134217727  : 1        |                                        |

avg = 8533 nsecs, total: 85345432173 nsecs, count: 10000918

使用find ... -delete：

# /usr/share/bcc/tools/funclatency btrfs_commit_inode_delayed_inode
Tracing 1 functions for "btrfs_commit_inode_delayed_inode"... Hit Ctrl-C to end.
     nsecs               : count     distribution
         0 -> 1          : 0        |                                        |
         2 -> 3          : 0        |                                        |
         4 -> 7          : 0        |                                        |
         8 -> 15         : 0        |                                        |
        16 -> 31         : 0        |                                        |
        32 -> 63         : 0        |                                        |
        64 -> 127        : 0        |                                        |
       128 -> 255        : 0        |                                        |
       256 -> 511        : 34       |                                        |
       512 -> 1023       : 95       |                                        |
      1024 -> 2047       : 1005784  |****                                    |
      2048 -> 4095       : 8110338  |****************************************|
      4096 -> 8191       : 672119   |***                                     |
      8192 -> 16383      : 158329   |                                        |
     16384 -> 32767      : 42338    |                                        |
     32768 -> 65535      : 4667     |                                        |
     65536 -> 131071     : 3597     |                                        |
    131072 -> 262143     : 2860     |                                        |
    262144 -> 524287     : 216      |                                        |
    524288 -> 1048575    : 22       |                                        |
   1048576 -> 2097151    : 6        |                                        |
   2097152 -> 4194303    : 3        |                                        |
   4194304 -> 8388607    : 5        |                                        |
   8388608 -> 16777215   : 3        |                                        |

avg = 3258 nsecs, total: 32585481993 nsecs, count: 10000416

从直方图中可以看出，find在中平均花费 3258 纳秒，btrfs_commit_inode_delayed_inode但dentls2在函数中花费 8533 纳秒。

直方图还显示，总体而言，io_uring 线程等待锁的时间至少是其他线程的两倍，大多数调用花费 4096-8091 纳秒，而大多数调用find花费 2048-4095 纳秒。

Find 是单线程的并且不会争用锁，而 `dentls2 是多线程的（由于 uring），这会产生锁争用，并且所经历的延迟会反映在分析中。

结论

总而言之，在现代系统上（截至撰写本文时），在软件中能够使其运行速度比设定的更快的操作越来越少。

它曾经从磁盘读取一个大的缓冲区，您可以将昂贵的 IO 调用合并为一个大的顺序读取，而不是像小的 getdents() 缓冲区那样进行寻道 IO。

此外，由于其他改进，仅调用系统调用的开销更小，并且顺序/随机 I/O 访问时间得到重大改进，从而消除了我们过去遇到的大型 I/O 瓶颈。

在我的系统上，这个问题已经变得受内存/CPU 限制。BTRFS 上存在单访问器问题（至少如此），这会限制您每次访问单个 CPU/程序时每个目录的取消链接的速度。即使在使用 tmpfs 的理想情况下，尝试批量处理 IO 也只能获得微小的改进，而在实际文件系统上，情况通常更糟。

最重要的是，我们真的不再有这个问题了——花 4 个小时删除 1000 万个文件的日子已经一去不复返了。

只需做一些简单的事情find ... -delete。我尝试的任何优化似乎都无法在默认的简单设置上产生值得编码（或分析）的重大性能改进。

原始答案

虽然导致该问题的主要原因是 ext3 在处理数百万个文件时的性能问题，但该问题的实际根本原因却不同。

当需要列出目录时，将对目录调用 readdir()，从而生成文件列表。readdir 是一个 posix 调用，但此处使用的真正 Linux 系统调用称为“getdents”。Getdents 通过使用条目填充缓冲区来列出目录条目。

问题主要在于 readdir() 使用固定大小为 32Kb 的缓冲区来获取文件。随着目录越来越大（文件添加后大小也会增加），ext3 获取条目的速度会越来越慢，而且 readdir 的额外 32Kb 缓冲区大小仅足以包含目录中的一小部分条目。这导致 readdir 不断循环并不断调用昂贵的系统调用。

例如，在我创建的一个包含 260 多万个文件的测试目录中，运行“ls -1|wc-l”会显示许多 getdent 系统调用的大量 strace 输出。

$ strace ls -1 | wc -l
brk(0x4949000)                          = 0x4949000
getdents(3, /* 1025 entries */, 32768)  = 32752
getdents(3, /* 1024 entries */, 32768)  = 32752
getdents(3, /* 1025 entries */, 32768)  = 32760
getdents(3, /* 1025 entries */, 32768)  = 32768
brk(0)                                  = 0x4949000
brk(0x496a000)                          = 0x496a000
getdents(3, /* 1024 entries */, 32768)  = 32752
getdents(3, /* 1026 entries */, 32768)  = 32760
...

此外，在这个目录中花费的时间也相当多。

$ time ls -1 | wc -l
2616044

real    0m20.609s
user    0m16.241s
sys 0m3.639s

使这个过程更加高效的方法是使用更大的缓冲区手动调用 getdents。这可以显著提高性能。

现在，你不应该自己手动调用 getdents，因此不存在正常使用它的接口（查看 getdents 的手册页即可了解！），但是你能手动调用它并使您的系统调用更加高效。

这大大减少了获取这些文件所需的时间。我编写了一个程序来实现这一点。

/* I can be compiled with the command "gcc -o dentls dentls.c" */

#define _GNU_SOURCE

#include <dirent.h>     /* Defines DT_* constants */
#include <err.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

struct linux_dirent {
        long           d_ino;
        off_t          d_off;
        unsigned short d_reclen;
        char           d_name[256];
        char           d_type;
};

static int delete = 0;
char *path = NULL;

static void parse_config(
        int argc,
        char **argv)
{
    int option_idx = 0;
    static struct option loptions[] = {
      { "delete", no_argument, &delete, 1 },
      { "help", no_argument, NULL, 'h' },
      { 0, 0, 0, 0 }
    };

    while (1) {
        int c = getopt_long(argc, argv, "h", loptions, &option_idx);
        if (c < 0)
            break;

        switch(c) {
          case 0: {
              break;
          }
 
          case 'h': {
              printf("Usage: %s [--delete] DIRECTORY\n"
                     "List/Delete files in DIRECTORY.\n"
                     "Example %s --delete /var/spool/postfix/deferred\n",
                     argv[0], argv[0]);
              exit(0);                      
              break;
          }

          default:
          break;
        }
    }

    if (optind >= argc)
      errx(EXIT_FAILURE, "Must supply a valid directory\n");

    path = argv[optind];
}

int main(
    int argc,
    char** argv)
{

    parse_config(argc, argv);

    int totalfiles = 0;
    int dirfd = -1;
    int offset = 0;
    int bufcount = 0;
    void *buffer = NULL;
    char *d_type;
    struct linux_dirent *dent = NULL;
    struct stat dstat;

    /* Standard sanity checking stuff */
    if (access(path, R_OK) < 0) 
        err(EXIT_FAILURE, "Could not access directory");

    if (lstat(path, &dstat) < 0) 
        err(EXIT_FAILURE, "Unable to lstat path");

    if (!S_ISDIR(dstat.st_mode))
        errx(EXIT_FAILURE, "The path %s is not a directory.\n", path);

    /* Allocate a buffer of equal size to the directory to store dents */
    if ((buffer = calloc(dstat.st_size*3, 1)) == NULL)
        err(EXIT_FAILURE, "Buffer allocation failure");

    /* Open the directory */
    if ((dirfd = open(path, O_RDONLY)) < 0) 
        err(EXIT_FAILURE, "Open error");

    /* Switch directories */
    fchdir(dirfd);

    if (delete) {
        printf("Deleting files in ");
        for (int i=5; i > 0; i--) {
            printf("%u. . . ", i);
            fflush(stdout);
            sleep(1);
        }
        printf("\n");
    }

    while (bufcount = syscall(SYS_getdents, dirfd, buffer, dstat.st_size*3)) {
        offset = 0;
        dent = buffer;
        while (offset < bufcount) {
            /* Don't print thisdir and parent dir */
            if (!((strcmp(".",dent->d_name) == 0) || (strcmp("..",dent->d_name) == 0))) {
                d_type = (char *)dent + dent->d_reclen-1;
                /* Only print files */
                if (*d_type == DT_REG) {
                    printf ("%s\n", dent->d_name);
                    if (delete) {
                        if (unlink(dent->d_name) < 0)
                            warn("Cannot delete file \"%s\"", dent->d_name);
                    }
                    totalfiles++;
                }
            }
            offset += dent->d_reclen;
            dent = buffer + offset;
        }
    }
    fprintf(stderr, "Total files: %d\n", totalfiles);
    close(dirfd);
    free(buffer);

    exit(0);
}

虽然这不能解决根本问题（文件系统中有大量文件，但性能不佳）。但它可能比发布的许多替代方案要快得多。

出于事先考虑，应该删除受影响的目录，然后重新创建。目录的大小只会增加，并且由于目录的大小，即使里面有几个文件，其性能仍然很差。

编辑：我已经清理了不少。添加了一个选项，允许您在运行时在命令行上删除，并删除了一堆 treewalk 内容，老实说，回头看，这充其量是值得怀疑的。还显示会产生内存损坏。

你现在可以dentls --delete /my/path

新结果。基于包含 182 万个文件的目录。

## Ideal ls Uncached
$ time ls -u1 data >/dev/null

real    0m44.948s
user    0m1.737s
sys 0m22.000s

## Ideal ls Cached
$ time ls -u1 data >/dev/null

real    0m46.012s
user    0m1.746s
sys 0m21.805s


### dentls uncached
$ time ./dentls data >/dev/null
Total files: 1819292

real    0m1.608s
user    0m0.059s
sys 0m0.791s

## dentls cached
$ time ./dentls data >/dev/null
Total files: 1819292

real    0m0.771s
user    0m0.057s
sys 0m0.711s

有点惊讶它仍然如此有效！

Question 3

是否可以将该文件系统中的所有其他文件备份到临时存储位置，重新格式化分区，然后恢复文件？

Answer

是否可以将该文件系统中的所有其他文件备份到临时存储位置，重新格式化分区，然后恢复文件？

Question 4

我没有进行过基准测试，但是这家伙：

rsync -a --delete ./emptyDirectoty/ ./hugeDirectory/

Answer

我没有进行过基准测试，但是这家伙：

rsync -a --delete ./emptyDirectoty/ ./hugeDirectory/

rm 包含数百万个文件的目录

答案1

答案2

包含 400 万个文件的 TMPFS

包含 1000 万个文件的 BTRFS

结论

原始答案

答案3

答案4

相关内容