awk 模式匹配和合并文件

awk 模式匹配和合并文件

我有 3 个文件 1.csv、2.csv 和 3.csv

1.csv

TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1  
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791  
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373  
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2  
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85  

2.csv

SIMINN_ICELAND_TELECOM,ICELAND,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795  
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638

3.csv:

TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Calling_Party_Address_Blocking,79  
CABLE&WIRELESS_BARBADOS,BARBADOS,Calling_Party_Address_Blocking,30  
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,Calling_Party_Address_Blocking,6
SYNIVERSE_ANSI,UNITED_STATES,Calling_Party_Address_Blocking,12

我想合并文件,以便它打印输出文件,如下所示

TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1,NA,NA,Calling_Party_Address_Blocking,79
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791,NA,NA,Calling_Party_Address_Blocking,30
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795,NA,NA
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638,NA,NA
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85,NA,NA,NA,NA 
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,NA,NA,NA,NA,Calling_Party_Address_Blocking,6
SYNIVERSE_ANSI,UNITED_STATES,NA,NA,NA,NA,Calling_Party_Address_Blocking,12

答案1

如果您不介意输出顺序,这里有一个perl解决方案:

$ perl -F',' -anle '
    push @{$h{$F[0].",".$F[1]}{$ARGV}}, @F[2..$#F];
    push @file, $ARGV if eof;
    END {
        for $k (keys %h) {
            for (sort {$a <=> $b} @file) {
                if (defined($h{$k}{$_})) {
                    push @tmp, @{$h{$k}{$_}};
                } else {
                    push @tmp, qw(NA NA);
                }
            }
            print join ",",($k,@tmp);
            @tmp=();
        }
    }
' 1.csv 2.csv 3.csv
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,NA,NA,NA,NA,Calling_Party_Address_Blocking,6
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638,NA,NA
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791,NA,NA,Calling_Party_Address_Blocking,30
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795,NA,NA
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85,NA,NA,NA,NA
SYNIVERSE_ANSI,UNITED_STATES,NA,NA,NA,NA,Calling_Party_Address_Blocking,12
TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1,NA,NA,Calling_Party_Address_Blocking,79

答案2

awk

#!/usr/bin/awk -f

BEGIN {
    FS=",";
}

{  
    k=$1
    if (k in ar) {
        for (i = 3; i <= NF; i++) ar[k] = ar[k] "," $i
    } else {
        ar[k] = $0;
    }
}

END { 
    for (j in ar) print ar[j]
}

另存为merge.awk并使其可执行chmod +x merge.awk

它(应该)输出:

CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638
SYNIVERSE_ANSI,UNITED_STATES,Calling_Party_Address_Blocking,12
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,Calling_Party_Address_Blocking,6
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795
TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1,Calling_Party_Address_Blocking,79
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791,Calling_Party_Address_Blocking,30

当运行为merge.awk 1.csv 2.csv 3.csv或什至merge.awk *.csv.

答案3

awk -F ',' '
  FILENAME == ARGV[1] {
    a[++n] = ($1 FS $2);
    ar1[a[n]]=($3 FS $4);
    next}
  FILENAME == ARGV[2] {
    if(!(($1 FS $2) in ar1))
      {a[++n] = ($1 FS $2)};
    ar2[$1 FS $2]=($3 FS $4);
    next}
  FILENAME == ARGV[3] {
    if(!(($1 FS $2) in ar1))
      {a[++n] = ($1 FS $2)};
    ar3[$1 FS $2]=($3 FS $4);
    next}
  END {for(i=1; i<=n; i++)
    {if(!(a[i] in ar1))
      ar1[a[i]] = ("NA,NA")
    if(!(a[i] in ar2))
      ar2[a[i]] = ("NA,NA")
    if(!(a[i] in ar3))
      ar3[a[i]] = ("NA,NA")
    printf "%s,%s,%s,%s\n",
      a[i], ar1[a[i]], ar2[a[i]], ar3[a[i]]}}' 1.csv 2.csv 3.csv

上述命令的输出是

TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1,NA,NA,Calling_Party_Address_Blocking,79
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791,NA,NA,Calling_Party_Address_Blocking,30
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795,NA,NA
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638,NA,NA
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85,NA,NA,NA,NA
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,NA,NA,NA,NA,Calling_Party_Address_Blocking,6
SYNIVERSE_ANSI,UNITED_STATES,NA,NA,NA,NA,Calling_Party_Address_Blocking,12

相关内容