如何在shell脚本中将数据追加到缓冲区？

Question 1

感谢斯蒂芬·查泽拉斯对“为什么echo和cat的执行时间有这么大的差别？”的回答：穆鲁的答案可以稍作改进，即只调用一次cat（不过，对于大数据量和大量循环迭代，这“一点”改进可能会变得相当可观；在我的系统上，此脚本所需的时间大约是循环脚本的 75%）：

#!/bin/sh
# `yes` prints the file name endlessly, `head` keeps the first 100 lines,
# and `xargs` passes those names to cat as arguments, so the contents of
# INPUT.file are appended to OUTPUT.file 100 times.
yes INPUT.file |
    head -n 100 |
    xargs cat >> OUTPUT.file

Answer

感谢斯蒂芬·查泽拉斯对“为什么echo和cat的执行时间有这么大的差别？”的回答：穆鲁的答案可以稍作改进，即只调用一次cat（不过，对于大数据量和大量循环迭代，这“一点”改进可能会变得相当可观；在我的系统上，此脚本所需的时间大约是循环脚本的 75%）：

#!/bin/sh
# `yes` prints the file name endlessly, `head` keeps the first 100 lines,
# and `xargs` passes those names to cat as arguments, so the contents of
# INPUT.file are appended to OUTPUT.file 100 times.
yes INPUT.file |
    head -n 100 |
    xargs cat >> OUTPUT.file

Question 2

考虑重定向循环本身：

#!/bin/sh
# Append INPUT.file to OUTPUT.file 100 times.
# The redirection is attached to the loop as a whole, so OUTPUT.file is
# opened once for the entire loop rather than once per iteration.
# $(seq 100) runs seq and iterates over its output; a bare `seq 100`
# would iterate over the two literal words "seq" and "100".
for jj in $(seq 100); do
    cat INPUT.file
done >> OUTPUT.file

Answer

考虑重定向循环本身：

#!/bin/sh
# Append INPUT.file to OUTPUT.file 100 times.
# The redirection is attached to the loop as a whole, so OUTPUT.file is
# opened once for the entire loop rather than once per iteration.
# $(seq 100) runs seq and iterates over its output; a bare `seq 100`
# would iterate over the two literal words "seq" and "100".
for jj in $(seq 100); do
    cat INPUT.file
done >> OUTPUT.file

Question 3

如果速度是您主要关心的问题，那么您可能会发现cat完成此任务的速度不够快。您可能希望将组成文件并行写入输出。

我快速写了一个并行版的cat，但有以下注意事项：

所有输入文件必须是常规文件（因此我们提前知道大小）。
fcat运行期间不要写入或截断输入文件。
输出文件必须尚不存在（既为了防止误操作，也为了避免浪费时间去读取我们即将覆盖的内容）。

显然，这只是一个快速的概念验证，健壮性还有待加强，但思路如下：

fcat.c：

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>


/* Per-input bookkeeping: one entry for each input file named on the
 * command line. */
struct in_fd {
    int fd;         /* descriptor returned by open() for this input */
    int err;        /* 0 on success, otherwise an errno-style error code */
    off_t start;    /* byte offset in the output where this input's data begins */
    struct stat s;  /* fstat() result; s.st_size is the number of bytes to copy */
};

/*
 * fcat - concatenate regular files into a new file, copying in parallel.
 *
 * Usage: fcat INFILE... OUTFILE
 *
 * Every INFILE must be a regular file, so that its size is known before
 * any copying starts.  OUTFILE must not already exist.  The output is
 * created at its final size and mapped into memory; each input is then
 * read directly into its own slice of the mapping, one OpenMP loop
 * iteration per input.
 *
 * Returns EXIT_SUCCESS when every input was copied completely and a
 * non-zero status otherwise.  If a failure happens after OUTFILE has been
 * created, OUTFILE is removed again.
 */
int main(int argc, char**argv)
{
    /* Need the program name, at least one input and the output name.
     * This is checked before argv is indexed with the decremented count. */
    if (argc < 3) {
        fprintf(stderr, "Usage: %s INFILE... OUTFILE\n",
                argc > 0 ? argv[0] : "fcat");
        return 1;
    }

    /* The last argument is the output; argv[1] .. argv[argc-1] are now
     * the inputs. */
    char *outfile = argv[--argc];

    /* Indexed like argv, so slot 0 is allocated but unused. */
    struct in_fd *infiles = calloc((size_t)argc, sizeof *infiles);
    if (!infiles) {
        perror("calloc");
        return EXIT_FAILURE;
    }

    /* Pass 1: open every input and record its size. */
#pragma omp parallel for
    for (int i = 1;  i < argc;  ++i) {
        struct in_fd *const input = infiles + i;
        char const *const filename = argv[i];
        input->err = 0;
        if ((input->fd = open(filename, O_RDONLY)) < 0) {
            input->err = errno;     /* save before perror() can change it */
            perror(filename);
            continue;
        }
        if (fstat(input->fd, &input->s)) {
            input->err = errno;
            perror(filename);
            continue;
        }
        if (!S_ISREG(input->s.st_mode)) {
            fprintf(stderr, "%s: not a regular file\n", filename);
            input->err = EINVAL;
        }
    }

    /* Pass 2 (serial): give up if any input failed, otherwise assign each
     * input its offset in the output and add up the total size. */
    off_t total = 0;
    for (int i = 1;  i < argc;  ++i) {
        if (infiles[i].err)
            return EXIT_FAILURE;
        infiles[i].start = total;
        total += infiles[i].s.st_size;
    }

    /* O_EXCL makes this fail if the output already exists.  Any
     * non-negative descriptor, including 0, is a valid result. */
    int out_fd = open(outfile, O_RDWR | O_CREAT | O_EXCL, 0666);
    if (out_fd < 0) {
        perror(outfile);
        return 1;
    }

    if (ftruncate(out_fd, total)) {
        perror(outfile);
        close(out_fd);
        unlink(outfile);
        return 1;
    }

    /* A zero-length mapping is an error, so only map when there is data;
     * with all inputs empty the (empty) output file is already complete. */
    char *out_mem = NULL;
    if (total > 0) {
        /* On Linux, you might wish to add MAP_HUGETLB */
        out_mem = mmap(NULL, (size_t)total, PROT_WRITE, MAP_SHARED, out_fd, 0);
        if (out_mem == MAP_FAILED) {
            perror(outfile);
            close(out_fd);
            unlink(outfile);
            return 1;
        }
    }
    /* The mapping remains valid after its descriptor is closed. */
    close(out_fd);

    /* Pass 3: read each input straight into its slice of the mapping. */
#pragma omp parallel for
    for (int i = 1;  i < argc;  ++i) {
        struct in_fd *const input = infiles + i;
        off_t offset = input->start;
        off_t remaining = input->s.st_size;
        input->err = 0;
        /* out_mem is only touched when remaining > 0, which implies
         * total > 0 and therefore a valid mapping. */
        while (remaining > 0) {
            ssize_t r = read(input->fd, out_mem + offset, (size_t)remaining);
            if (r < 0) {
                if (errno == EINTR) {
                    continue;       /* interrupted before any data: retry */
                }
                input->err = errno;
                perror(argv[i]);
                break;
            }
            if (r == 0) {
                /* End of file before st_size bytes were read: the input
                 * shrank after fstat().  Fail rather than loop forever. */
                fprintf(stderr, "%s: unexpected end of file\n", argv[i]);
                input->err = EIO;
                break;
            }
            offset += r;
            remaining -= r;
        }
        close(input->fd);
    }

    if (total > 0 && munmap(out_mem, (size_t)total)) {
        perror(outfile);
    }

    /* A partially copied output is worse than none: remove it on error. */
    int status = EXIT_SUCCESS;
    for (int i = 1;  i < argc;  ++i) {
        if (infiles[i].err) {
            unlink(outfile);
            status = EXIT_FAILURE;
            break;
        }
    }

    free(infiles);
    return status;
}

Makefile：

# Diagnostics, language standard, and the _GNU_SOURCE feature macro.
CFLAGS += -Wall -Wextra -std=c99 -D_GNU_SOURCE
# Debug info, optimisation, and OpenMP support.
CFLAGS += -g -O2 -fopenmp

# "all" is a phony target that depends on the fcat binary.
.PHONY: all
all: fcat

使用 12 个线程的计时结果显示，fcat 的运行时间为 0.2 秒，而 cat 的运行时间为 2.3 秒（各运行 3 次，使用热缓存，48 个文件，总计 138M）。

Answer