I have a fairly large list of files that I would like to group into smaller lists of roughly 5 GB each for further processing.
The list has the form "size" (from du), a tab, then the absolute path, like this (I have obfuscated the file names to protect the owner's privacy):
$ cat unix_StackExchange_question.txt
543732 1.txt
543736 2.txt
543736 3.txt
543736 4.txt
543740 5.txt
543740 6.txt
543740 7.txt
543744 8.txt
543748 9.txt
543748 10.txt
543752 11.txt
543760 12.txt
543768 13.txt
543772 14.txt
543780 15.txt
543780 16.txt
543784 17.txt
543784 18.txt
543796 19.txt
543860 20.txt
546068 21.txt
546384 22.txt
548576 23.txt
549600 24.txt
549624 25.txt
549660 26.txt
549680 27.txt
549700 28.txt
549704 29.txt
549712 30.txt
549712 31.txt
549716 32.txt
549716 33.txt
549716 34.txt
549720 35.txt
549720 36.txt
549720 37.txt
549724 38.txt
549732 39.txt
549736 40.txt
549740 41.txt
549740 42.txt
549744 43.txt
549744 44.txt
549744 45.txt
549748 46.txt
549748 47.txt
549752 48.txt
549752 49.txt
549752 50.txt
549756 51.txt
549760 52.txt
549760 53.txt
549764 54.txt
549764 55.txt
549772 56.txt
549772 57.txt
549812 58.txt
549828 59.txt
550676 60.txt
550900 61.txt
553036 62.txt
556656 63.txt
557020 64.txt
563208 65.txt
569180 66.txt
569228 67.txt
569248 68.txt
569248 69.txt
569248 70.txt
569252 71.txt
569252 72.txt
569256 73.txt
569256 74.txt
569260 75.txt
569260 76.txt
569260 77.txt
569260 78.txt
569264 79.txt
569268 80.txt
569268 81.txt
569272 82.txt
569272 83.txt
569272 84.txt
569276 85.txt
569280 86.txt
569284 87.txt
569284 88.txt
569284 89.txt
569288 90.txt
569288 91.txt
569288 92.txt
569288 93.txt
569288 94.txt
569292 95.txt
569292 96.txt
569296 97.txt
569300 98.txt
569332 99.txt
569356 100.txt
569508 101.txt
576092 102.txt
577420 103.txt
584208 104.txt
587428 105.txt
680280 106.txt
680292 107.txt
682412 108.txt
682424 109.txt
689768 110.txt
689784 111.txt
690116 112.txt
690124 113.txt
690132 114.txt
690136 115.txt
690140 116.txt
690156 117.txt
690188 118.txt
690188 119.txt
690188 120.txt
690188 121.txt
690188 122.txt
690188 123.txt
690188 124.txt
690188 125.txt
690188 126.txt
690188 127.txt
690188 128.txt
690188 129.txt
690188 130.txt
690188 131.txt
690188 132.txt
690188 133.txt
690188 134.txt
690188 135.txt
690188 136.txt
690188 137.txt
690188 138.txt
690188 139.txt
690188 140.txt
690192 141.txt
690192 142.txt
690192 143.txt
690192 144.txt
690200 145.txt
690200 146.txt
690200 147.txt
690200 148.txt
690200 149.txt
690200 150.txt
690200 151.txt
690200 152.txt
690200 153.txt
690200 154.txt
690200 155.txt
690200 156.txt
690204 157.txt
690204 158.txt
690204 159.txt
690204 160.txt
690204 161.txt
690204 162.txt
690204 163.txt
690204 164.txt
690204 165.txt
690204 166.txt
690204 167.txt
690204 168.txt
690204 169.txt
690204 170.txt
690208 171.txt
690272 172.txt
690288 173.txt
690348 174.txt
690364 175.txt
690400 176.txt
690412 177.txt
696052 178.txt
697400 179.txt
697416 180.txt
697436 181.txt
697452 182.txt
699616 183.txt
700284 184.txt
700952 185.txt
700968 186.txt
702124 187.txt
702656 188.txt
704088 189.txt
704712 190.txt
704800 191.txt
706052 192.txt
706156 193.txt
706184 194.txt
706564 195.txt
706660 196.txt
706672 197.txt
706676 198.txt
706692 199.txt
706708 200.txt
706720 201.txt
706716 202.txt
706720 203.txt
706724 204.txt
706720 205.txt
706724 206.txt
706724 207.txt
706724 208.txt
706720 209.txt
706724 210.txt
706724 211.txt
706724 212.txt
706724 213.txt
706724 214.txt
706724 215.txt
706720 216.txt
706728 217.txt
706728 218.txt
706724 219.txt
706732 220.txt
706780 221.txt
706888 222.txt
706896 223.txt
707404 224.txt
708716 225.txt
708812 226.txt
709376 227.txt
711440 228.txt
711516 229.txt
713376 230.txt
713412 231.txt
714332 232.txt
716332 233.txt
718600 234.txt
732196 235.txt
737300 236.txt
737300 237.txt
743436 238.txt
743440 239.txt
743444 240.txt
747304 241.txt
748636 242.txt
748640 243.txt
748640 244.txt
748648 245.txt
748644 246.txt
748648 247.txt
748648 248.txt
755276 249.txt
755948 250.txt
761264 251.txt
761268 252.txt
782544 253.txt
787724 254.txt
795236 255.txt
801968 256.txt
801980 257.txt
801992 258.txt
801996 259.txt
801996 260.txt
802000 261.txt
802004 262.txt
802004 263.txt
802004 264.txt
802012 265.txt
802016 266.txt
802028 267.txt
802032 268.txt
802032 269.txt
802032 270.txt
802040 271.txt
802044 272.txt
802044 273.txt
802048 274.txt
802052 275.txt
802060 276.txt
802060 277.txt
802064 278.txt
802068 279.txt
802072 280.txt
802076 281.txt
802080 282.txt
802084 283.txt
802084 284.txt
802084 285.txt
802096 286.txt
802100 287.txt
802132 288.txt
802136 289.txt
802204 290.txt
802288 291.txt
803828 292.txt
804116 293.txt
808492 294.txt
813204 295.txt
826780 296.txt
829888 297.txt
831328 298.txt
831616 299.txt
832740 300.txt
832940 301.txt
833180 302.txt
833796 303.txt
833832 304.txt
834040 305.txt
834276 306.txt
834284 307.txt
834384 308.txt
834420 309.txt
834448 310.txt
834452 311.txt
834456 312.txt
834460 313.txt
834452 314.txt
834456 315.txt
834456 316.txt
834460 317.txt
834464 318.txt
834460 319.txt
834456 320.txt
834460 321.txt
834460 322.txt
834460 323.txt
834456 324.txt
834460 325.txt
834468 326.txt
834460 327.txt
834456 328.txt
834464 329.txt
834468 330.txt
834468 331.txt
834476 332.txt
834468 333.txt
834528 334.txt
834632 335.txt
834748 336.txt
834864 337.txt
835744 338.txt
836008 339.txt
836124 340.txt
837464 341.txt
837488 342.txt
838640 343.txt
838840 344.txt
842068 345.txt
908696 346.txt
908744 347.txt
908744 348.txt
916972 349.txt
917092 350.txt
917096 351.txt
917100 352.txt
917116 353.txt
917116 354.txt
917120 355.txt
917120 356.txt
917124 357.txt
917124 358.txt
917132 359.txt
922580 360.txt
922580 361.txt
923056 362.txt
923056 363.txt
924040 364.txt
925392 365.txt
925448 366.txt
925448 367.txt
937824 368.txt
950020 369.txt
950032 370.txt
954788 371.txt
954804 372.txt
955920 373.txt
959628 374.txt
963432 375.txt
963448 376.txt
964024 377.txt
964040 378.txt
964040 379.txt
964056 380.txt
964064 381.txt
964080 382.txt
964096 383.txt
964100 384.txt
964100 385.txt
964100 386.txt
964100 387.txt
964100 388.txt
964100 389.txt
964100 390.txt
964100 391.txt
964100 392.txt
964100 393.txt
964100 394.txt
964100 395.txt
964100 396.txt
964100 397.txt
964100 398.txt
964100 399.txt
964100 400.txt
964104 401.txt
964104 402.txt
964104 403.txt
964104 404.txt
964108 405.txt
964108 406.txt
964108 407.txt
964108 408.txt
964112 409.txt
964112 410.txt
964112 411.txt
964116 412.txt
964116 413.txt
964116 414.txt
964116 415.txt
964116 416.txt
964116 417.txt
964116 418.txt
964116 419.txt
964116 420.txt
964116 421.txt
964116 422.txt
964116 423.txt
964116 424.txt
964116 425.txt
964116 426.txt
964116 427.txt
964116 428.txt
964116 429.txt
964116 430.txt
964120 431.txt
964120 432.txt
964120 433.txt
964124 434.txt
964124 435.txt
964128 436.txt
964172 437.txt
964188 438.txt
964192 439.txt
964204 440.txt
964296 441.txt
964312 442.txt
972764 443.txt
972780 444.txt
976292 445.txt
976304 446.txt
979696 447.txt
979712 448.txt
988492 449.txt
1057628 450.txt
1090748 451.txt
1092992 452.txt
1098340 453.txt
1099496 454.txt
1100484 455.txt
1100528 456.txt
1100724 457.txt
1101532 458.txt
1111008 459.txt
1111016 460.txt
1111040 461.txt
1111052 462.txt
1111052 463.txt
1111056 464.txt
1111064 465.txt
1111072 466.txt
1111080 467.txt
1111084 468.txt
1111092 469.txt
1111100 470.txt
1111100 471.txt
1111100 472.txt
1111108 473.txt
1111120 474.txt
1111124 475.txt
1111132 476.txt
1111136 477.txt
1111136 478.txt
1111140 479.txt
1111160 480.txt
1111168 481.txt
1111168 482.txt
1111180 483.txt
1111184 484.txt
1111192 485.txt
1111236 486.txt
1111256 487.txt
1111344 488.txt
1116348 489.txt
1120988 490.txt
1137784 491.txt
1137784 492.txt
1138120 493.txt
1138120 494.txt
1138380 495.txt
1138380 496.txt
1204392 497.txt
1488592 498.txt
1757076 499.txt
By the way: these are only the 500 largest files.
Is there an easy way to split this list into lists of files totalling roughly 5 GB each?
Edit: desired output of a possible solution
file1:
2500000 478.txt
2345872 400.txt
134325 300.txt
file2:
2134356 123.txt
2857439 4325.txt
file
Answer 1
It sounds like you are looking for a solution to the bin packing problem. As far as we know, there is no way to do that both optimally and quickly (but we also don't know that there isn't one; it is an open question).
But you can get close. One utility that does this is called datapacker. I haven't used that one; I found it by searching. I'm sure there are more.
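For the list format in the question, a minimal first-fit decreasing sketch in bash could look like the following (an illustration only, assuming du's default 1 KiB blocks, so the 5 GB limit is 5242880):
#!/bin/bash
# Illustration: first-fit decreasing over a "size<TAB>name" list.
# Sort by size descending and put each file into the first group
# that still has enough room; open a new group otherwise.
limit=5242880                     # ~5 GB in 1 KiB du blocks
declare -a remaining              # free space left in each group (KiB)
declare -a contents               # the lines assigned to each group
while read -r fsize fname; do
    placed=0
    for i in "${!remaining[@]}"; do
        if (( remaining[i] >= fsize )); then
            remaining[i]=$(( remaining[i] - fsize ))
            contents[i]+="$fsize $fname"$'\n'
            placed=1
            break
        fi
    done
    if (( ! placed )); then       # no group has room: open a new one
        remaining+=( $(( limit - fsize )) )
        contents+=( "$fsize $fname"$'\n' )
    fi
done < <(sort -rn unix_StackExchange_question.txt)
for i in "${!contents[@]}"; do    # print groups in the requested format
    printf 'file%d:\n%s\n' $(( i + 1 )) "${contents[i]}"
done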
Personally, I use one I wrote myself. (Note that all of these output the list of files that go into each bin; you can easily cat them together into a single text file.) The comments refer to bfpp, another program I wrote, which finds the optimal solution provided you only have a few (say, fewer than 10) files:
#!/usr/bin/perl -w
#
# Sane bin packer. Attempts to finds a way to arrange a set of items
# into the fewest bins. Uses first-fit decreasing, which will get within
# 11⁄9⋅OPT + 6⁄9 (where OPT is the optimal number of discs, as found by
# bfpp). Unlike bfpp, this program will finish quickly.
#
# Also, this program features many more non-ASCII characters than bfpp.
# This is probably its most important feature.
use strict;
use 5.010;
use Data::Dump qw(pp);
use File::Next;
use List::Util qw(sum);
use String::ShellQuote qw(shell_quote);
use Carp qw(confess croak carp);
sub usage($) {
say STDERR "Usage: $0 bin-size-megabytes file1 file2 ...";
exit shift;
}
sub parse_command_line() {
usage(1) if @ARGV < 3;
usage(0) if $ARGV[0] =~ /^--?[h?]/;
my $size = shift @ARGV;
given ($size) {
when (/^dvd5?$/) {
$size = 4011;
}
when (/^dvd9$/) {
$size = 7291;
}
when (/^bd(-r)?1?$/) {
$size = 21360;
}
when (/^bd(-r)?2$/) {
$size = 42720;
}
when (/^\d+$/) {
# do nothing, already number
}
default {
say STDERR "ERROR: Size must be number or known size constant (dvd, dvd9, bd, bd2)";
usage(1);
}
}
return {
bin_size => $size * 1024 * 1024,
items => get_item_info(@ARGV),
};
}
sub get_item_info {
my %items;
my ($in_group, $group_no) = (0, 0);
foreach my $item (@_) {
if ('(' eq $item ) {
$in_group and confess "Nested groups not supported";
$in_group = 1;
++$group_no;
next;
} elsif (')' eq $item) {
$in_group or croak "Can't close a group when none open";
$in_group = 0;
next;
}
my $key;
$in_group and $key = "!!__GROUP${group_no}__!!";
if (-f $item) {
defined $key or ($key = $item) =~ s/\..{2,4}$//;
push @{$items{$key}{members}}, $item;
$items{$key}{size} += -s _;
} elsif (-d $item) {
$key //= $item;
my $files = File::Next::files($item);
while (defined(my $file = $files->())) {
push @{$items{$key}{members}}, $file;
$items{$key}{size} += -s $file;
}
} else {
confess "Not sure how to handle $item (weird type or doesn't exist)"
}
}
$in_group and carp "WARNING: Group wasn't closed";
return \%items;
}
sub check_sanity($) {
my $info = shift;
my $_;
my $binsize = $info->{bin_size};
my @dontfit = grep $info->{items}{$_}{size} > $binsize,
keys %{$info->{items}};
if (@dontfit) {
say "\nWARNING! WARNING! WARNING! WARNING!";
say "The following items are larger than the bin size:";
say pp(\@dontfit);
say "This is going to lead to some funny results.\n";
say "WARNING! WARNING! WARNING! WARNING!\n";
}
return $info;
}
sub approximate_best {
my $info = shift;
my @sorted
= sort { $info->{items}{$::b}{size} <=> $info->{items}{$::a}{size} }
keys %{$info->{items}};
my @bins;
FILE: foreach my $file (@sorted) {
my $size = $info->{items}{$file}{size};
BIN: foreach my $bin (@bins) {
next BIN unless $bin->{remaining} >= $size;
push @{$bin->{contents}}, $file;
$bin->{remaining} -= $size;
next FILE;
}
# didn't find a bin, open a new one
push @bins,
{
remaining => $info->{bin_size} - $size,
contents => [$file],
};
}
$info->{bins} = \@bins;
return $info;
}
sub print_bins($) {
my $info = shift;
my $_;
my $bins = $info->{bins};
#<<< [Hide this mess from PerlTidy]
use locale;
my @bins_full = map { # for each disk
[ sort( # sort each disk's fileset
map { # for each fileset
@{$info->{items}{$_}{members}}
} @{$_->{contents}}
) ];
} @$bins;
#>>>
for (my $d = 0; $d < @bins_full; ++$d) {
print <<DISC
DISC #@{[$d + 1]}: (@{[ int($bins->[$d]{remaining}/1024/1024) ]} MiB empty)
@{[ join(qq{\n }, @{$bins_full[$d]}) ]}
DISC
}
say "As space-separated, escaped values (for shell):";
for (my $d = 0; $d < @bins_full; ++$d) {
say $d+1, q{: }, shell_quote @{$bins_full[$d]};
}
return undef;
}
# believe it or not, the below is code.
print_bins approximate_best check_sanity parse_command_line;
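The script expects the bin size in megabytes (or one of the built-in constants dvd, dvd9, bd, bd2) followed by at least two files or directories. A hypothetical invocation, assuming it has been saved as binpack.pl, might be:
# 5 GB bins, expressed in megabytes (the glob supplies the file arguments)
perl binpack.pl 5120 /path/to/files/*.txt
# or, using a built-in size constant, pack two directories onto dual-layer DVDs
perl binpack.pl dvd9 dir1 dir2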
Answer 2
Yes. With a bash script:
#!/bin/bash
groupsize=0
groupcnt=0
# sizes come from du, i.e. 1 KiB blocks, so 5000000 is roughly 5 GB
cat unix_StackExchange_question.txt | while read fsize fname
do
    # a size of 0 means we are starting a new group: print its header
    [ "$groupsize" == "0" ] && echo "Group : GROUP_$groupcnt"
    echo -en "\t $fname\n"
    ((groupsize+=$fsize))
    #cp $fname GROUP_$groupcnt
    # close the group once it has gone past the limit
    if [ "$groupsize" -gt "5000000" ]
    then
        ((groupcnt++))
        groupsize=0
    fi
done
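If you want the files physically copied into per-group directories rather than just listed, the commented cp line points in that direction; one sketch (assuming the GROUP_* directories still need to be created) would be to replace it with:
mkdir -p "GROUP_$groupcnt" && cp "$fname" "GROUP_$groupcnt"/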
Answer 3
awk -v sizelimit=5000000 -v outputfilename=shorter_list \
'BEGIN {target=outputfilename ".0000"}; '\
'{sum+=$1; '\
'if(sum>sizelimit) { file_index++; target=outputfilename "." sprintf("%04d",file_index); sum=$1;}; '\
'print $0 >target}' file
should do what you want. You will have to adjust the size limit, though (with du's default 1 KiB blocks, 5 GiB corresponds to sizelimit=5242880). I tested with smaller values, which may be useful for your own testing as well.
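Once it has run, a quick sanity check of the chunk totals could look like this (shorter_list.0000, shorter_list.0001, ... are the files the awk script writes):
# print each list followed by the sum of its first column (KiB)
for f in shorter_list.*; do
    printf '%s\t' "$f"
    awk '{sum += $1} END {print sum}' "$f"
done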
Answer 4
This should do it, but it is very slow (1 min 18 s for 500 entries):
#!/bin/bash
#reformatting the initial file to remove tab
SRC=`cat file.txt | expand`
outputs_dir="outputs"
if [ ! -d "$outputs_dir" ];then
mkdir "$outputs_dir"
else
echo "$outputs_dir exist"
#rm "$outputs_dir"/*
fi
#init file outputs array with 2 files first one is in case files is bigger than 5GB
foutputs_array=( "$outputs_dir"/leftover.txt "$outputs_dir"/file1.txt )
#init file size array. Each time a file will be added to an output file, its size will be added here.
# If its size doesn't fit in an existing file, it will be added to a new file. New file will be added to foutputs_array,...
# If it doesn't fit anywhere, it will go to leftover.
fsize_array=( "0" "0" )
#5GB limit
FLIMIT=5242880
FLAG=0
i=1
array_index=1
fitIn(){
local file_size=$1
local total_size=$2
#echo "summing.." >&2
sum=$(expr $file_size + $total_size)
#echo "sum=" "$sum" >&2
if [[ "$sum" -le "$FLIMIT" ]];then
echo 0
else
echo 1
fi
}
while read fsize fname
do
# echo "array_index=" $array_index ${fsize_array[@]} "fsize"$fsize ${fsize_array[$array_index]}
check_size=`fitIn $fsize ${fsize_array[$array_index]}`
# echo "check_size" $check_size
if [ "$fsize" -le "$FLIMIT" -a "$check_size" -eq "0" ];then
# echo "In limit"
FLAG=0
elif [ $fsize -le $FLIMIT ];then
# echo "In max limit"
FLAG=0
while [ $check_size -eq "1" ]
do
# echo $array_index $i
(( array_index++ ))
(( i++ ))
if [ -z ${fsize_array[$array_index]} ];then
fsize_array[$array_index]=0
fi
check_size=`fitIn $fsize ${fsize_array[$array_index]}`
done
# echo "new position" $array_index
foutputs_array[$array_index]="$outputs_dir"/file${i}.txt
else
echo "$fsize $fname doesn't fit anywhere!"
FLAG=1
array_index=0
fi
if [ $FLAG -eq 0 ];then
(( fsize_array[$array_index]+=$fsize ))
fi
echo "$fsize" "$fname" >> "${foutputs_array[$array_index]}"
array_index=1
i=1
done <<< "$SRC"