I have a fairly large list of files that I would like to group into smaller lists of roughly 5 GB each for further processing.
The list has the form "size" (from du), a tab, then the absolute path, like this (I have obfuscated the file names to protect the owner's privacy):
$ cat unix_StackExchange_question.txt
543732 1.txt
543736 2.txt
543736 3.txt
543736 4.txt
543740 5.txt
543740 6.txt
543740 7.txt
543744 8.txt
543748 9.txt
543748 10.txt
543752 11.txt
543760 12.txt
543768 13.txt
543772 14.txt
543780 15.txt
543780 16.txt
543784 17.txt
543784 18.txt
543796 19.txt
543860 20.txt
546068 21.txt
546384 22.txt
548576 23.txt
549600 24.txt
549624 25.txt
549660 26.txt
549680 27.txt
549700 28.txt
549704 29.txt
549712 30.txt
549712 31.txt
549716 32.txt
549716 33.txt
549716 34.txt
549720 35.txt
549720 36.txt
549720 37.txt
549724 38.txt
549732 39.txt
549736 40.txt
549740 41.txt
549740 42.txt
549744 43.txt
549744 44.txt
549744 45.txt
549748 46.txt
549748 47.txt
549752 48.txt
549752 49.txt
549752 50.txt
549756 51.txt
549760 52.txt
549760 53.txt
549764 54.txt
549764 55.txt
549772 56.txt
549772 57.txt
549812 58.txt
549828 59.txt
550676 60.txt
550900 61.txt
553036 62.txt
556656 63.txt
557020 64.txt
563208 65.txt
569180 66.txt
569228 67.txt
569248 68.txt
569248 69.txt
569248 70.txt
569252 71.txt
569252 72.txt
569256 73.txt
569256 74.txt
569260 75.txt
569260 76.txt
569260 77.txt
569260 78.txt
569264 79.txt
569268 80.txt
569268 81.txt
569272 82.txt
569272 83.txt
569272 84.txt
569276 85.txt
569280 86.txt
569284 87.txt
569284 88.txt
569284 89.txt
569288 90.txt
569288 91.txt
569288 92.txt
569288 93.txt
569288 94.txt
569292 95.txt
569292 96.txt
569296 97.txt
569300 98.txt
569332 99.txt
569356 100.txt
569508 101.txt
576092 102.txt
577420 103.txt
584208 104.txt
587428 105.txt
680280 106.txt
680292 107.txt
682412 108.txt
682424 109.txt
689768 110.txt
689784 111.txt
690116 112.txt
690124 113.txt
690132 114.txt
690136 115.txt
690140 116.txt
690156 117.txt
690188 118.txt
690188 119.txt
690188 120.txt
690188 121.txt
690188 122.txt
690188 123.txt
690188 124.txt
690188 125.txt
690188 126.txt
690188 127.txt
690188 128.txt
690188 129.txt
690188 130.txt
690188 131.txt
690188 132.txt
690188 133.txt
690188 134.txt
690188 135.txt
690188 136.txt
690188 137.txt
690188 138.txt
690188 139.txt
690188 140.txt
690192 141.txt
690192 142.txt
690192 143.txt
690192 144.txt
690200 145.txt
690200 146.txt
690200 147.txt
690200 148.txt
690200 149.txt
690200 150.txt
690200 151.txt
690200 152.txt
690200 153.txt
690200 154.txt
690200 155.txt
690200 156.txt
690204 157.txt
690204 158.txt
690204 159.txt
690204 160.txt
690204 161.txt
690204 162.txt
690204 163.txt
690204 164.txt
690204 165.txt
690204 166.txt
690204 167.txt
690204 168.txt
690204 169.txt
690204 170.txt
690208 171.txt
690272 172.txt
690288 173.txt
690348 174.txt
690364 175.txt
690400 176.txt
690412 177.txt
696052 178.txt
697400 179.txt
697416 180.txt
697436 181.txt
697452 182.txt
699616 183.txt
700284 184.txt
700952 185.txt
700968 186.txt
702124 187.txt
702656 188.txt
704088 189.txt
704712 190.txt
704800 191.txt
706052 192.txt
706156 193.txt
706184 194.txt
706564 195.txt
706660 196.txt
706672 197.txt
706676 198.txt
706692 199.txt
706708 200.txt
706720 201.txt
706716 202.txt
706720 203.txt
706724 204.txt
706720 205.txt
706724 206.txt
706724 207.txt
706724 208.txt
706720 209.txt
706724 210.txt
706724 211.txt
706724 212.txt
706724 213.txt
706724 214.txt
706724 215.txt
706720 216.txt
706728 217.txt
706728 218.txt
706724 219.txt
706732 220.txt
706780 221.txt
706888 222.txt
706896 223.txt
707404 224.txt
708716 225.txt
708812 226.txt
709376 227.txt
711440 228.txt
711516 229.txt
713376 230.txt
713412 231.txt
714332 232.txt
716332 233.txt
718600 234.txt
732196 235.txt
737300 236.txt
737300 237.txt
743436 238.txt
743440 239.txt
743444 240.txt
747304 241.txt
748636 242.txt
748640 243.txt
748640 244.txt
748648 245.txt
748644 246.txt
748648 247.txt
748648 248.txt
755276 249.txt
755948 250.txt
761264 251.txt
761268 252.txt
782544 253.txt
787724 254.txt
795236 255.txt
801968 256.txt
801980 257.txt
801992 258.txt
801996 259.txt
801996 260.txt
802000 261.txt
802004 262.txt
802004 263.txt
802004 264.txt
802012 265.txt
802016 266.txt
802028 267.txt
802032 268.txt
802032 269.txt
802032 270.txt
802040 271.txt
802044 272.txt
802044 273.txt
802048 274.txt
802052 275.txt
802060 276.txt
802060 277.txt
802064 278.txt
802068 279.txt
802072 280.txt
802076 281.txt
802080 282.txt
802084 283.txt
802084 284.txt
802084 285.txt
802096 286.txt
802100 287.txt
802132 288.txt
802136 289.txt
802204 290.txt
802288 291.txt
803828 292.txt
804116 293.txt
808492 294.txt
813204 295.txt
826780 296.txt
829888 297.txt
831328 298.txt
831616 299.txt
832740 300.txt
832940 301.txt
833180 302.txt
833796 303.txt
833832 304.txt
834040 305.txt
834276 306.txt
834284 307.txt
834384 308.txt
834420 309.txt
834448 310.txt
834452 311.txt
834456 312.txt
834460 313.txt
834452 314.txt
834456 315.txt
834456 316.txt
834460 317.txt
834464 318.txt
834460 319.txt
834456 320.txt
834460 321.txt
834460 322.txt
834460 323.txt
834456 324.txt
834460 325.txt
834468 326.txt
834460 327.txt
834456 328.txt
834464 329.txt
834468 330.txt
834468 331.txt
834476 332.txt
834468 333.txt
834528 334.txt
834632 335.txt
834748 336.txt
834864 337.txt
835744 338.txt
836008 339.txt
836124 340.txt
837464 341.txt
837488 342.txt
838640 343.txt
838840 344.txt
842068 345.txt
908696 346.txt
908744 347.txt
908744 348.txt
916972 349.txt
917092 350.txt
917096 351.txt
917100 352.txt
917116 353.txt
917116 354.txt
917120 355.txt
917120 356.txt
917124 357.txt
917124 358.txt
917132 359.txt
922580 360.txt
922580 361.txt
923056 362.txt
923056 363.txt
924040 364.txt
925392 365.txt
925448 366.txt
925448 367.txt
937824 368.txt
950020 369.txt
950032 370.txt
954788 371.txt
954804 372.txt
955920 373.txt
959628 374.txt
963432 375.txt
963448 376.txt
964024 377.txt
964040 378.txt
964040 379.txt
964056 380.txt
964064 381.txt
964080 382.txt
964096 383.txt
964100 384.txt
964100 385.txt
964100 386.txt
964100 387.txt
964100 388.txt
964100 389.txt
964100 390.txt
964100 391.txt
964100 392.txt
964100 393.txt
964100 394.txt
964100 395.txt
964100 396.txt
964100 397.txt
964100 398.txt
964100 399.txt
964100 400.txt
964104 401.txt
964104 402.txt
964104 403.txt
964104 404.txt
964108 405.txt
964108 406.txt
964108 407.txt
964108 408.txt
964112 409.txt
964112 410.txt
964112 411.txt
964116 412.txt
964116 413.txt
964116 414.txt
964116 415.txt
964116 416.txt
964116 417.txt
964116 418.txt
964116 419.txt
964116 420.txt
964116 421.txt
964116 422.txt
964116 423.txt
964116 424.txt
964116 425.txt
964116 426.txt
964116 427.txt
964116 428.txt
964116 429.txt
964116 430.txt
964120 431.txt
964120 432.txt
964120 433.txt
964124 434.txt
964124 435.txt
964128 436.txt
964172 437.txt
964188 438.txt
964192 439.txt
964204 440.txt
964296 441.txt
964312 442.txt
972764 443.txt
972780 444.txt
976292 445.txt
976304 446.txt
979696 447.txt
979712 448.txt
988492 449.txt
1057628 450.txt
1090748 451.txt
1092992 452.txt
1098340 453.txt
1099496 454.txt
1100484 455.txt
1100528 456.txt
1100724 457.txt
1101532 458.txt
1111008 459.txt
1111016 460.txt
1111040 461.txt
1111052 462.txt
1111052 463.txt
1111056 464.txt
1111064 465.txt
1111072 466.txt
1111080 467.txt
1111084 468.txt
1111092 469.txt
1111100 470.txt
1111100 471.txt
1111100 472.txt
1111108 473.txt
1111120 474.txt
1111124 475.txt
1111132 476.txt
1111136 477.txt
1111136 478.txt
1111140 479.txt
1111160 480.txt
1111168 481.txt
1111168 482.txt
1111180 483.txt
1111184 484.txt
1111192 485.txt
1111236 486.txt
1111256 487.txt
1111344 488.txt
1116348 489.txt
1120988 490.txt
1137784 491.txt
1137784 492.txt
1138120 493.txt
1138120 494.txt
1138380 495.txt
1138380 496.txt
1204392 497.txt
1488592 498.txt
1757076 499.txt
By the way: these are only the 500 largest files.
Is there an easy way to split this list into lists of files totalling roughly 5 GB each?
Edit: desired output of a possible solution
file1:
2500000 478.txt
2345872 400.txt
134325 300.txt
file2:
2134356 123.txt
2857439 4325.txt
file
Answer 1
It sounds like you are looking for a solution to the bin packing problem. As far as we know, there is no way to do that both optimally and quickly (but we also don't know that there isn't one; it is an open question).
But you can get close. One utility that does this is called datapacker. I haven't used that one; I found it by searching. I'm sure there are more.
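For the list format in the question, a minimal first-fit decreasing sketch in bash could look like the following (an illustration only, assuming du's default 1 KiB blocks, so the 5 GB limit is 5242880):
#!/bin/bash
# Illustration: first-fit decreasing over a "size<TAB>name" list.
# Sort by size descending and put each file into the first group
# that still has enough room; open a new group otherwise.
limit=5242880                     # ~5 GB in 1 KiB du blocks
declare -a remaining              # free space left in each group (KiB)
declare -a contents               # the lines assigned to each group
while read -r fsize fname; do
    placed=0
    for i in "${!remaining[@]}"; do
        if (( remaining[i] >= fsize )); then
            remaining[i]=$(( remaining[i] - fsize ))
            contents[i]+="$fsize $fname"$'\n'
            placed=1
            break
        fi
    done
    if (( ! placed )); then       # no group has room: open a new one
        remaining+=( $(( limit - fsize )) )
        contents+=( "$fsize $fname"$'\n' )
    fi
done < <(sort -rn unix_StackExchange_question.txt)
for i in "${!contents[@]}"; do    # print groups in the requested format
    printf 'file%d:\n%s\n' $(( i + 1 )) "${contents[i]}"
done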
Personally, I use one I wrote myself. (Note that all of these output the list of files that go into each bin; you can easily cat them together into a single text file.) The comments refer to bfpp, another program I wrote, which finds the optimal solution provided you only have a few (say, fewer than 10) files:
#!/usr/bin/perl -w
#
# Sane bin packer. Attempts to finds a way to arrange a set of items
# into the fewest bins. Uses first-fit decreasing, which will get within
# 11⁄9⋅OPT + 6⁄9 (where OPT is the optimal number of discs, as found by
# bfpp). Unlike bfpp, this program will finish quickly.
#
# Also, this program features many more non-ASCII characters than bfpp.
# This is probably its most important feature.
use strict;
use 5.010;
use Data::Dump qw(pp);
use File::Next;
use List::Util qw(sum);
use String::ShellQuote qw(shell_quote);
use Carp qw(confess croak carp);
sub usage($) {
say STDERR "Usage: $0 bin-size-megabytes file1 file2 ...";
exit shift;
}
sub parse_command_line() {
usage(1) if @ARGV < 3;
usage(0) if $ARGV[0] =~ /^--?[h?]/;
my $size = shift @ARGV;
given ($size) {
when (/^dvd5?$/) {
$size = 4011;
}
when (/^dvd9$/) {
$size = 7291;
}
when (/^bd(-r)?1?$/) {
$size = 21360;
}
when (/^bd(-r)?2$/) {
$size = 42720;
}
when (/^\d+$/) {
# do nothing, already number
}
default {
say STDERR "ERROR: Size must be number or known size constant (dvd, dvd9, bd, bd2)";
usage(1);
}
}
return {
bin_size => $size * 1024 * 1024,
items => get_item_info(@ARGV),
};
}
sub get_item_info {
my %items;
my ($in_group, $group_no) = (0, 0);
foreach my $item (@_) {
if ('(' eq $item ) {
$in_group and confess "Nested groups not supported";
$in_group = 1;
++$group_no;
next;
} elsif (')' eq $item) {
$in_group or croak "Can't close a group when none open";
$in_group = 0;
next;
}
my $key;
$in_group and $key = "!!__GROUP${group_no}__!!";
if (-f $item) {
defined $key or ($key = $item) =~ s/\..{2,4}$//;
push @{$items{$key}{members}}, $item;
$items{$key}{size} += -s _;
} elsif (-d $item) {
$key //= $item;
my $files = File::Next::files($item);
while (defined(my $file = $files->())) {
push @{$items{$key}{members}}, $file;
$items{$key}{size} += -s $file;
}
} else {
confess "Not sure how to handle $item (weird type or doesn't exist)"
}
}
$in_group and carp "WARNING: Group wasn't closed";
return \%items;
}
sub check_sanity($) {
my $info = shift;
my $_;
my $binsize = $info->{bin_size};
my @dontfit = grep $info->{items}{$_}{size} > $binsize,
keys %{$info->{items}};
if (@dontfit) {
say "\nWARNING! WARNING! WARNING! WARNING!";
say "The following items are larger than the bin size:";
say pp(\@dontfit);
say "This is going to lead to some funny results.\n";
say "WARNING! WARNING! WARNING! WARNING!\n";
}
return $info;
}
sub approximate_best {
my $info = shift;
my @sorted
= sort { $info->{items}{$::b}{size} <=> $info->{items}{$::a}{size} }
keys %{$info->{items}};
my @bins;
FILE: foreach my $file (@sorted) {
my $size = $info->{items}{$file}{size};
BIN: foreach my $bin (@bins) {
next BIN unless $bin->{remaining} >= $size;
push @{$bin->{contents}}, $file;
$bin->{remaining} -= $size;
next FILE;
}
# didn't find a bin, open a new one
push @bins,
{
remaining => $info->{bin_size} - $size,
contents => [$file],
};
}
$info->{bins} = \@bins;
return $info;
}
sub print_bins($) {
my $info = shift;
my $_;
my $bins = $info->{bins};
#<<< [Hide this mess from PerlTidy]
use locale;
my @bins_full = map { # for each disk
[ sort( # sort each disk's fileset
map { # for each fileset
@{$info->{items}{$_}{members}}
} @{$_->{contents}}
) ];
} @$bins;
#>>>
for (my $d = 0; $d < @bins_full; ++$d) {
print <<DISC
DISC #@{[$d + 1]}: (@{[ int($bins->[$d]{remaining}/1024/1024) ]} MiB empty)
@{[ join(qq{\n }, @{$bins_full[$d]}) ]}
DISC
}
say "As space-separated, escaped values (for shell):";
for (my $d = 0; $d < @bins_full; ++$d) {
say $d+1, q{: }, shell_quote @{$bins_full[$d]};
}
return undef;
}
# believe it or not, the below is code.
print_bins approximate_best check_sanity parse_command_line;
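The script expects the bin size in megabytes (or one of the built-in constants dvd, dvd9, bd, bd2) followed by at least two files or directories. A hypothetical invocation, assuming it has been saved as binpack.pl, might be:
# 5 GB bins, expressed in megabytes (the glob supplies the file arguments)
perl binpack.pl 5120 /path/to/files/*.txt
# or, using a built-in size constant, pack two directories onto dual-layer DVDs
perl binpack.pl dvd9 dir1 dir2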
Answer 2
Yes. With a bash script:
#!/bin/bash
groupsize=0
groupcnt=0
# sizes come from du, i.e. 1 KiB blocks, so 5000000 is roughly 5 GB
cat unix_StackExchange_question.txt | while read fsize fname
do
    # a size of 0 means we are starting a new group: print its header
    [ "$groupsize" == "0" ] && echo "Group : GROUP_$groupcnt"
    echo -en "\t $fname\n"
    ((groupsize+=$fsize))
    #cp $fname GROUP_$groupcnt
    # close the group once it has gone past the limit
    if [ "$groupsize" -gt "5000000" ]
    then
        ((groupcnt++))
        groupsize=0
    fi
done
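If you want the files physically copied into per-group directories rather than just listed, the commented cp line points in that direction; one sketch (assuming the GROUP_* directories still need to be created) would be to replace it with:
mkdir -p "GROUP_$groupcnt" && cp "$fname" "GROUP_$groupcnt"/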
Answer 3
awk -v sizelimit=5000000 -v outputfilename=shorter_list \
'BEGIN {target=outputfilename ".0000"}; '\
'{sum+=$1; '\
'if(sum>sizelimit) { file_index++; target=outputfilename "." sprintf("%04d",file_index); sum=$1;}; '\
'print $0 >target}' file
should do what you want. You will have to adjust the size limit, though (with du's default 1 KiB blocks, 5 GiB corresponds to sizelimit=5242880). I tested with smaller values, which may be useful for your own testing as well.
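Once it has run, a quick sanity check of the chunk totals could look like this (shorter_list.0000, shorter_list.0001, ... are the files the awk script writes):
# print each list followed by the sum of its first column (KiB)
for f in shorter_list.*; do
    printf '%s\t' "$f"
    awk '{sum += $1} END {print sum}' "$f"
done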
Answer 4
This should do it, but it is very slow (1 min 18 s for 500 entries):
#!/bin/bash
#reformatting the initial file to remove tab
SRC=`cat file.txt | expand`
outputs_dir="outputs"
if [ ! -d "$outputs_dir" ];then
mkdir "$outputs_dir"
else
echo "$outputs_dir exist"
#rm "$outputs_dir"/*
fi
#init file outputs array with 2 files first one is in case files is bigger than 5GB
foutputs_array=( "$outputs_dir"/leftover.txt "$outputs_dir"/file1.txt )
#init file size array. Each time a file will be added to an output file, its size will be added here.
# If its size doesn't fit in an existing file, it will be added to a new file. New file will be added to foutputs_array,...
# If it doesn't fit anywhere, it will go to leftover.
fsize_array=( "0" "0" )
#5GB limit
FLIMIT=5242880
FLAG=0
i=1
array_index=1
fitIn(){
local file_size=$1
local total_size=$2
#echo "summing.." >&2
sum=$(expr $file_size + $total_size)
#echo "sum=" "$sum" >&2
if [[ "$sum" -le "$FLIMIT" ]];then
echo 0
else
echo 1
fi
}
while read fsize fname
do
# echo "array_index=" $array_index ${fsize_array[@]} "fsize"$fsize ${fsize_array[$array_index]}
check_size=`fitIn $fsize ${fsize_array[$array_index]}`
# echo "check_size" $check_size
if [ "$fsize" -le "$FLIMIT" -a "$check_size" -eq "0" ];then
# echo "In limit"
FLAG=0
elif [ $fsize -le $FLIMIT ];then
# echo "In max limit"
FLAG=0
while [ $check_size -eq "1" ]
do
# echo $array_index $i
(( array_index++ ))
(( i++ ))
if [ -z ${fsize_array[$array_index]} ];then
fsize_array[$array_index]=0
fi
check_size=`fitIn $fsize ${fsize_array[$array_index]}`
done
# echo "new position" $array_index
foutputs_array[$array_index]="$outputs_dir"/file${i}.txt
else
echo "$fsize $fname doesn't fit anywhere!"
FLAG=1
array_index=0
fi
if [ $FLAG -eq 0 ];then
(( fsize_array[$array_index]+=$fsize ))
fi
echo "$fsize" "$fname" >> "${foutputs_array[$array_index]}"
array_index=1
i=1
done <<< "$SRC"