更快地对数据进行排序的方法

Question 1

假设您有足够的内存来读取文件，您可以尝试

perl -e 'use List::Util 'shuffle'; @k=shuffle(<>); print @k[0..999]' file.bed

由于您想执行 10000 次，因此我建议将重复过程集成到脚本中，并且打乱数组的索引而不是数组本身，以加快速度：

$ time perl -e 'use List::Util 'shuffle'; 
            @l=<>; for $i (1..10000){
               open(my $fh, ">","file.$i.bed"); 
               @r=shuffle(0..$#l); 
               print $fh @l[@r[0..999]]
            }' file.bed

real    1m12.444s
user    1m8.536s
sys     0m3.244s

上面从包含 37000 行的文件创建了 10000 个文件，每个文件 1000 行（您的示例文件重复了 1000 次）。正如您所看到的，在我的系统上花费了略多于一分钟的时间。

解释

use List::Util 'shuffle';：这会导入一个 Perl 模块，该模块提供shuffle()随机化数组的函数。
@l=<>;：将输入文件( <>)加载到数组中@l。
for $i (1..10000){}：运行 10000 次。
@r=shuffle(0..$#l);：$#l 是数组 @l 最后一个元素的索引（即元素数量减一），因此 @r 现在是数组 @l 的索引号（对应输入文件的各行）的随机排列列表。
open(my $fh, ">","file.$i.bed");：打开一个需要写入的文件file.$i.bed。$i取值范围为 1 到 10000。
print $fh @l[@r[0..999]]：获取打乱数组中的前 1000 个索引并打印相应的行（的元素@l）。

另一种方法是使用shuf(谢谢@frostschutz）：

$ time for i in {1..10000}; do shuf -n 1000 file.bed > file.$i.abed; done

real    1m9.743s
user    0m23.732s
sys     0m31.764s

Answer

假设您有足够的内存来读取文件，您可以尝试

perl -e 'use List::Util 'shuffle'; @k=shuffle(<>); print @k[0..999]' file.bed

由于您想执行 10000 次，因此我建议将重复过程集成到脚本中，并且打乱数组的索引而不是数组本身，以加快速度：

$ time perl -e 'use List::Util 'shuffle'; 
            @l=<>; for $i (1..10000){
               open(my $fh, ">","file.$i.bed"); 
               @r=shuffle(0..$#l); 
               print $fh @l[@r[0..999]]
            }' file.bed

real    1m12.444s
user    1m8.536s
sys     0m3.244s

上面从包含 37000 行的文件创建了 10000 个文件，每个文件 1000 行（您的示例文件重复了 1000 次）。正如您所看到的，在我的系统上花费了略多于一分钟的时间。

解释

use List::Util 'shuffle';：这会导入一个 Perl 模块，该模块提供shuffle()随机化数组的函数。
@l=<>;：将输入文件( <>)加载到数组中@l。
for $i (1..10000){}：运行 10000 次。
@r=shuffle(0..$#l);：$#l 是数组 @l 最后一个元素的索引（即元素数量减一），因此 @r 现在是数组 @l 的索引号（对应输入文件的各行）的随机排列列表。
open(my $fh, ">","file.$i.bed");：打开一个需要写入的文件file.$i.bed。$i取值范围为 1 到 10000。
print $fh @l[@r[0..999]]：获取打乱数组中的前 1000 个索引并打印相应的行（的元素@l）。

另一种方法是使用shuf(谢谢@frostschutz）：

$ time for i in {1..10000}; do shuf -n 1000 file.bed > file.$i.abed; done

real    1m9.743s
user    0m23.732s
sys     0m31.764s

Question 2

如果您想要一个基准测试来看看它能完成多快，请将其复制粘贴到10kshuffle.cpp并编译g++ 10kshuffle.cpp -o 10kshuffle。然后你可以运行它：

10kshuffle filename < inputfile

其中 filename 是用于输出文件的基本路径；输出文件将被命名为 filename.0、filename.1 等，每个文件都包含一次洗牌结果的前 1000 行。程序在运行过程中会打印出每个已写入文件的名称。

#include <cerrno>
#include <cstdlib>
#include <cstring>
#include <fcntl.h>
#include <fstream>
#include <iostream>
#include <string>
#include <sstream>
#include <unistd.h>
#include <vector>

using namespace std;

unsigned int randomSeed () {
    int in = open("/dev/urandom", O_RDONLY);
    if (!in) {
        cerr << strerror(errno);
        exit(1);
    }
    unsigned int x;
    read(in, &x, sizeof(x));
    close(in);
    return x;
}

int main (int argc, const char *argv[]) {
    char basepath[1024];
    strcpy(basepath,argv[1]);
    char *pathend = &basepath[strlen(basepath)];
// Read in.
    vector<char*> data;
    data.reserve(1<<16);
    while (!cin.eof()) {
        char *buf = new char[1024];
        cin.getline(buf,1023);
        data.push_back(buf);
    }

    srand(randomSeed());
    for (int n = 0; n < 10000; n++) {
        vector<char*> copy(data);
    // Fisher-Yates shuffle.
        int last = copy.size() - 1;
        for (int i = last; i > 0; i--) {
            int r = rand() % i;
            if (r == i) continue;
            char *t = copy[i];
            copy[i] = copy[r];
            copy[r] = t;
        }
    // Write out.
        sprintf(pathend, ".%d", n);
        ofstream file(basepath);
        for (int j = 0; j < 1000; j++) file << copy[j] << endl;
        cout << basepath << endl;
        file.close();
    }

    return 0;
}

在单个 3.5 Ghz 核心上，运行时间约为 20 秒：

   time ./10kshuffle tmp/test < data.txt
   tmp/test.0
   [...]
   tmp/test.9999
   real 19.95, user 9.46, sys 9.86, RSS 39408

data.txt 是将问题中的示例数据重复到 37000 行得到的。如果您希望输出文件中包含整个洗牌结果而不是仅前 1000 行，请将程序中写出前 1000 行的那个 for 循环更改为：

for (int j = 0; j < copy.size(); j++) file << copy[j] << endl;

Answer

如果您想要一个基准测试来看看它能完成多快，请将其复制粘贴到10kshuffle.cpp并编译g++ 10kshuffle.cpp -o 10kshuffle。然后你可以运行它：

10kshuffle filename < inputfile

其中 filename 是用于输出文件的基本路径；输出文件将被命名为 filename.0、filename.1 等，每个文件都包含一次洗牌结果的前 1000 行。程序在运行过程中会打印出每个已写入文件的名称。

#include <cerrno>
#include <cstdlib>
#include <cstring>
#include <fcntl.h>
#include <fstream>
#include <iostream>
#include <string>
#include <sstream>
#include <unistd.h>
#include <vector>

using namespace std;

unsigned int randomSeed () {
    int in = open("/dev/urandom", O_RDONLY);
    if (!in) {
        cerr << strerror(errno);
        exit(1);
    }
    unsigned int x;
    read(in, &x, sizeof(x));
    close(in);
    return x;
}

int main (int argc, const char *argv[]) {
    char basepath[1024];
    strcpy(basepath,argv[1]);
    char *pathend = &basepath[strlen(basepath)];
// Read in.
    vector<char*> data;
    data.reserve(1<<16);
    while (!cin.eof()) {
        char *buf = new char[1024];
        cin.getline(buf,1023);
        data.push_back(buf);
    }

    srand(randomSeed());
    for (int n = 0; n < 10000; n++) {
        vector<char*> copy(data);
    // Fisher-Yates shuffle.
        int last = copy.size() - 1;
        for (int i = last; i > 0; i--) {
            int r = rand() % i;
            if (r == i) continue;
            char *t = copy[i];
            copy[i] = copy[r];
            copy[r] = t;
        }
    // Write out.
        sprintf(pathend, ".%d", n);
        ofstream file(basepath);
        for (int j = 0; j < 1000; j++) file << copy[j] << endl;
        cout << basepath << endl;
        file.close();
    }

    return 0;
}

在单个 3.5 Ghz 核心上，运行时间约为 20 秒：

   time ./10kshuffle tmp/test < data.txt
   tmp/test.0
   [...]
   tmp/test.9999
   real 19.95, user 9.46, sys 9.86, RSS 39408

data.txt 是将问题中的示例数据重复到 37000 行得到的。如果您希望输出文件中包含整个洗牌结果而不是仅前 1000 行，请将程序中写出前 1000 行的那个 for 循环更改为：

for (int j = 0; j < copy.size(); j++) file << copy[j] << endl;

Question 3

因此，您的问题涉及 Unix 方面，但值得首先解决您的基本问题，然后尝试找到一种 Unix-y 方法来实现该解决方案。

您需要从行数未知的文件中创建 10,000 个样本，每个样本大小为 1,000。如果内存中可以容纳 10,000 x 1,000 行，则只需遍历文件内容一次即可完成此操作。如果您无法在内存中保存那么多行，但知道文件包含多少行，则仍然可以一次遍历完成。如果您不知道文件包含多少行，则需要额外遍历一次来计算行数。

在更困难的情况下，当您不知道行数时，该算法将对每个样本执行以下操作（并行地将样本保留在内存中）：

包括样本中的前 1,000 行
对于第 n 行（其中 n > 1000），以 1000 / n 的概率将其纳入样本，并从已选择的行中随机丢弃一行。（由于某些行之后可能会被丢弃，我们需要将样本保留在内存中直到输入结束）

实现第二步的一个优雅方法是在 [1, n] 中生成一个随机整数 k。如果 k <= 1000，则包含该行，并用它替换样本中现有的第 k 行。这是该算法的更标准的描述：http://en.wikipedia.org/wiki/Reservoir_sampling

如果您知道行数，R则：

从样本大小 s = 0 开始
以 (1000 - s) / (R - n + 1) 的概率包含第 n 行并立即输出（同时将样本大小 s 加一）

如何在 Unix 上执行此操作？awk似乎是互联网上这篇文章的答案（我不能保证其正确性，但代码就在那里）https://news.ycombinator.com/item?id=4840043

Answer