根据第一列合并多行，但所有列应保持独立

Question 1

# Merge rows that share column 1. Rows are printed in first-seen order and every row is padded to the table width, so columns stay aligned.
awk 'BEGIN{FS=OFS="\t"} NR==1{n=NF; print; next} {if(!($1 in a)){a[$1]=$1; ord[++m]=$1} if(NF>n){n=NF} for(i=2; i<=NF; i++){if($i!=""){f[$1,i]=$i}}} END{for(r=1; r<=m; r++){j=ord[r]; printf("%s", a[j]); for(k=2; k<=n; k++){printf("%s%s", OFS, f[j,k])} print ""}}' file

# Merge all rows that share the same first column into one row.
# Input:  tab-separated "file" whose first line is a header.
# Output: the header, then one merged row per key in first-seen order;
#         each row carries every column, with empty cells left empty.
awk 'BEGIN{FS=OFS="\t"}        # tab-separated input and output
     NR==1{                    # header row: print as-is, remember its width
       ncols=NF
       print
       next
     }
     {
       if(!($1 in a)){         # first occurrence of this key
         a[$1]=$1
         order[++n]=$1         # keep input order; for-in order is unspecified
       }
       if(NF>ncols){ncols=NF}  # tolerate data rows wider than the header
       for(i=2; i<=NF; i++){   # every column except the key
         if($i!=""){           # only non-empty cells are stored
           f[$1,i]=$i          # indexed by (key, column number)
         }
       }
     }
     END{
       for(r=1; r<=n; r++){
         j=order[r]
         printf("%s", a[j])             # key column
         for(k=2; k<=ncols; k++){       # always emit every column, empty or not
           printf("%s%s", OFS, f[j,k])
         }
         print ""
       }
     }' file

数组a包含第一列。数组f包含没有第一列的行。数组last包含当前行中最后一个元素的位置。

尚未测试。

Answer

# Merge rows that share column 1. Rows are printed in first-seen order and every row is padded to the table width, so columns stay aligned.
awk 'BEGIN{FS=OFS="\t"} NR==1{n=NF; print; next} {if(!($1 in a)){a[$1]=$1; ord[++m]=$1} if(NF>n){n=NF} for(i=2; i<=NF; i++){if($i!=""){f[$1,i]=$i}}} END{for(r=1; r<=m; r++){j=ord[r]; printf("%s", a[j]); for(k=2; k<=n; k++){printf("%s%s", OFS, f[j,k])} print ""}}' file

# Merge all rows that share the same first column into one row.
# Input:  tab-separated "file" whose first line is a header.
# Output: the header, then one merged row per key in first-seen order;
#         each row carries every column, with empty cells left empty.
awk 'BEGIN{FS=OFS="\t"}        # tab-separated input and output
     NR==1{                    # header row: print as-is, remember its width
       ncols=NF
       print
       next
     }
     {
       if(!($1 in a)){         # first occurrence of this key
         a[$1]=$1
         order[++n]=$1         # keep input order; for-in order is unspecified
       }
       if(NF>ncols){ncols=NF}  # tolerate data rows wider than the header
       for(i=2; i<=NF; i++){   # every column except the key
         if($i!=""){           # only non-empty cells are stored
           f[$1,i]=$i          # indexed by (key, column number)
         }
       }
     }
     END{
       for(r=1; r<=n; r++){
         j=order[r]
         printf("%s", a[j])             # key column
         for(k=2; k<=ncols; k++){       # always emit every column, empty or not
           printf("%s%s", OFS, f[j,k])
         }
         print ""
       }
     }' file

数组a包含第一列。数组f包含没有第一列的行。数组last包含当前行中最后一个元素的位置。

尚未测试。

Question 2

这是一种方法：

# Portable POSIX awk (no gawk-only arrays of arrays); one merged row per ID, printed in input order.
$ awk -F"\t" '{if(NR==1){ cols=NF; print; } else{if(!($1 in seen)){seen[$1]=1; ids[++n]=$1} for(i=2;i<=NF;i++){if(length($i)>0){data[$1,i]=$i}}}}END{for(r=1;r<=n;r++){ id=ids[r]; printf "%s",id; for(i=2;i<=cols;i++){printf "\t%s", data[id,i]} print ""}}' file 
OG  FC_AG_NICO  FC_AG_ZEA   FC_AG_BRAS  FC_MB_NICO  FC_MB_ZEA   FC_MB_BRAS  FC_TN_NICO  FC_TN_ZEA   FC_TN_BRAS  FC_SL_NICO  FC_SL_ZEA   FC_SL_BRAS  FC_SE_NICFC_SE_ZEA  FC_SE_BRAS 
OG0004400   -0.787302663    -0.710790578    0.663333543 -0.360799687    -0.0958126  0.056722264 -1.77626686 -0.971114297    0.707963822 -0.373838773    0.277055943 0.481626213 -1.659046364    -1.019969932
OG0004402   -0.304209641    -0.259080399    0.44366888  0.253000346 0.338511357 -0.121760564                -0.274550145    0.1933262   0.374095809 0.442748804 0.042958499

或者，更容易阅读：

# Merge all rows of tab-separated "file" that share the same ID (1st field).
# Prints the header, then one row per ID in input order with as many
# columns as the header. Works in any POSIX awk.
awk -F"\t" '{
                ## Print the headers and store the number of columns.
                if(NR==1){
                    cols=NF;
                    print;
                }
                else{
                    ## Remember each ID the first time it appears, so rows
                    ## are printed in input order ("for (id in ...)" order is
                    ## unspecified) and IDs with only empty cells are kept.
                    if(!($1 in seen)){
                        seen[$1]=1
                        ids[++n]=$1
                    }
                    ## Iterate over all columns, starting from the 2nd.
                    for(i=2;i<=NF;i++){
                        ## If this one is not empty, store it under the
                        ## portable (ID, column) subscript rather than the
                        ## gawk-only data[$1][i] form.
                        if(length($i)>0){
                            data[$1,i]=$i
                        }
                    }
                }
            }
            ## After reading everything, print one merged row per ID.
            END{
                for(r=1;r<=n;r++){
                    id=ids[r]
                    printf "%s",id;
                    for(i=2;i<=cols;i++){
                        printf "\t%s", data[id,i]
                    }
                    print ""
                }
            }' file

请注意，这假设对于每个 ID（第一个字段），每一列都在且仅在一行上有值。如果某个 ID 的某些列可能始终为空，则需要稍微不同的方法。

Answer

这是一种方法：

# Portable POSIX awk (no gawk-only arrays of arrays); one merged row per ID, printed in input order.
$ awk -F"\t" '{if(NR==1){ cols=NF; print; } else{if(!($1 in seen)){seen[$1]=1; ids[++n]=$1} for(i=2;i<=NF;i++){if(length($i)>0){data[$1,i]=$i}}}}END{for(r=1;r<=n;r++){ id=ids[r]; printf "%s",id; for(i=2;i<=cols;i++){printf "\t%s", data[id,i]} print ""}}' file 
OG  FC_AG_NICO  FC_AG_ZEA   FC_AG_BRAS  FC_MB_NICO  FC_MB_ZEA   FC_MB_BRAS  FC_TN_NICO  FC_TN_ZEA   FC_TN_BRAS  FC_SL_NICO  FC_SL_ZEA   FC_SL_BRAS  FC_SE_NICFC_SE_ZEA  FC_SE_BRAS 
OG0004400   -0.787302663    -0.710790578    0.663333543 -0.360799687    -0.0958126  0.056722264 -1.77626686 -0.971114297    0.707963822 -0.373838773    0.277055943 0.481626213 -1.659046364    -1.019969932
OG0004402   -0.304209641    -0.259080399    0.44366888  0.253000346 0.338511357 -0.121760564                -0.274550145    0.1933262   0.374095809 0.442748804 0.042958499

或者，更容易阅读：

# Merge all rows of tab-separated "file" that share the same ID (1st field).
# Prints the header, then one row per ID in input order with as many
# columns as the header. Works in any POSIX awk.
awk -F"\t" '{
                ## Print the headers and store the number of columns.
                if(NR==1){
                    cols=NF;
                    print;
                }
                else{
                    ## Remember each ID the first time it appears, so rows
                    ## are printed in input order ("for (id in ...)" order is
                    ## unspecified) and IDs with only empty cells are kept.
                    if(!($1 in seen)){
                        seen[$1]=1
                        ids[++n]=$1
                    }
                    ## Iterate over all columns, starting from the 2nd.
                    for(i=2;i<=NF;i++){
                        ## If this one is not empty, store it under the
                        ## portable (ID, column) subscript rather than the
                        ## gawk-only data[$1][i] form.
                        if(length($i)>0){
                            data[$1,i]=$i
                        }
                    }
                }
            }
            ## After reading everything, print one merged row per ID.
            END{
                for(r=1;r<=n;r++){
                    id=ids[r]
                    printf "%s",id;
                    for(i=2;i<=cols;i++){
                        printf "\t%s", data[id,i]
                    }
                    print ""
                }
            }' file

请注意，这假设对于每个 ID（第一个字段），每一列都在且仅在一行上有值。如果某个 ID 的某些列可能始终为空，则需要稍微不同的方法。

Question 3

此外，还有一种 awk 解法。

我原本担心用 for (f in fields) 遍历关联数组会打乱字段的输出顺序，但在对最多 20 个字段宽的数据做了一些测试后，似乎并不会。（注意：POSIX 并未规定 for-in 的遍历顺序，因此这一结果取决于所用的 awk 实现。）

假设表头位于第 1 行，数据已按第一列排序（如帖子所述），并且数据量很大、不希望一次全部读入内存：

# Streaming merge for tab-separated "file" whose rows are already grouped by
# column 1: only the current group is held in memory.
# Prints the header unchanged, then one merged row per group. Cells are kept
# verbatim (no numeric reformatting); values are summed only when the same
# column is filled on more than one row of a group.
awk 'BEGIN{FS=OFS="\t"}
    # Print the pending group as one tab-joined line, then reset it.
    function emit(   i, line){
      if(!pending) return
      line=out[1]
      for(i=2; i<=ncols; i++) line=line OFS out[i]   # index order, not for-in order
      print line
      split("", out)                                 # portable way to clear an array
      pending=0
    }
    NR==1{ncols=NF; print; next}                     # header row: pass through, remember width
    ($1 "")!=(old ""){emit(); old=$1}                # key changed (compared as strings): flush previous group
    {
      if(NF>ncols) ncols=NF                          # tolerate rows wider than the header
      out[1]=$1
      for(i=2; i<=NF; i++)
        if($i!="")                                   # empty cells never overwrite or add
          out[i]=(out[i]=="") ? $i : out[i]+$i       # first value verbatim, later ones summed
      pending=1
    }
    END{emit()}' file

这个输入

head    c1  c2  c3
H1  -0.71       
H1      2   
H1          3
H2  11  12  
H2          13

得到如下输出

head    c1      c2      c3
H1      -0.71   2       3
H2      11      12      13

演练

读取第一行，并把它的第一个字段记在 old 中

awk 'BEGIN{getline; split($0,out,"\t"); old=$1}

如果当前记录的第一个字段与上一条记录的不同（即出现了新的行标识），说明上一组的聚合已经完成：将其打印出来，清空聚合数组 out，并通过 old=$1 记下已进入新的一组

    old!=$1{for (o in out) printf "%s\t", out[o]; print""; delete out;old=$1}

将 $0 拆分到数组 tmp 中，然后遍历该数组，把 tmp 中的值累加到 out 中；第一列（行标识）除外，该列直接取值而不累加

    {split($0,tmp,"\t"); for (t in tmp) out[t]=(t==1)?tmp[t]:out[t]+tmp[t]}

在 END 中输出保存在 out 中的最后一组记录

    END{for (o in out) printf "%s\t", out[o];}' file

Answer