我有一个包含 1,505,496 行的大文本文件,格式如下 -
PAN rs1 G G
PAB rs1 G G
PAC rs1 G G
PAE rs1 G G
PAT rs1 G G
PAN rs2 T T
PAB rs2 T T
PAC rs2 T T
PAE rs2 T T
PAT rs2 T T
PAN rs3 A C
PAB rs3 A C
PAC rs3 A C
PAE rs3 A C
PAT rs3 A C
.
.
我希望得到如下所示的输出:
rs1 rs1 rs2 rs2 rs3 rs3 ....
PAN G G T T A C
PAB G G T T A C
PAC G G T T A C
PAE G G T T A C
PAT G G T T A C
共有 1153 个“rs”编号,每个“rs”都有 ref 和 alt 两个值(对每个 rs 而言,第一列为 ref,第二列为 alt)。
答案1
使用 GNU awk(需要其“数组的数组”特性):
$ cat tst.awk
# Pivot long-format records "rowId colId val1 val2" into a wide table:
# one output line per distinct rowId (field 1), and two adjacent columns
# (field 3 and field 4) per distinct colId (field 2).
# Rows and columns are emitted in order of first appearance in the input.
# Requires GNU awk (uses arrays of arrays).
BEGIN { OFS = "\t" }

# Register each row ID the first time it is seen.
!seenRow[$1]++ { rowIds[++numRows] = $1 }

# Register each column ID the first time it is seen. This uses its own
# lookup table: sharing one table with the row IDs would silently drop a
# column whose ID also occurs as a row ID.
!seenCol[$2]++ { colIds[++numCols] = $2 }

# Store the value pair already joined with OFS so it can be printed as-is.
{ vals[$1][$2] = $3 OFS $4 }

END {
    # Header line: an empty cell above the row IDs, then each column ID twice
    # (once per value column).
    printf "%s%s", "", OFS
    for (colNr = 1; colNr <= numCols; colNr++) {
        colId = colIds[colNr]
        printf "%s%s%s%s", colId, OFS, colId, (colNr < numCols ? OFS : ORS)
    }

    # One line per row ID.
    for (rowNr = 1; rowNr <= numRows; rowNr++) {
        rowId = rowIds[rowNr]
        printf "%s%s", rowId, OFS
        for (colNr = 1; colNr <= numCols; colNr++) {
            colId = colIds[colNr]
            # A row/column pair absent from the input still occupies two
            # cells; emitting two empty fields keeps later columns aligned.
            if (colId in vals[rowId])
                cell = vals[rowId][colId]
            else
                cell = "" OFS ""
            printf "%s%s", cell, (colNr < numCols ? OFS : ORS)
        }
    }
}
$ awk -f tst.awk file
rs1 rs1 rs2 rs2 rs3 rs3
PAN G G T T A C
PAB G G T T A C
PAC G G T T A C
PAE G G T T A C
PAT G G T T A C
答案2
gawk(使用多维数组功能):
# Pivot long-format records "sample rsN val1 val2" into one line per sample,
# with two tab-separated columns per rs number from 1 up to the largest rs
# number seen. Requires GNU awk (uses arrays of arrays).
#
# a[sample][N*2-1] holds field 3 and a[sample][N*2] holds field 4 for rsN.
{
    # Anchored regex: accept only fields that are exactly "rs" plus digits.
    if ($2 ~ /^rs[0-9]+$/)
    {
        # "+ 0" forces a number. substr() returns a string, and a string
        # comparison ranks "9" above "10", which would leave max_rs stuck
        # at 9 and truncate the output.
        idx = substr($2, 3) + 0
        max_rs = idx > max_rs ? idx : max_rs

        # Remember samples in order of first appearance; the traversal order
        # of "for (key in array)" is unspecified.
        if (!($1 in a))
            order[++n_entries] = $1

        a[$1][idx * 2 - 1] = $3
        a[$1][idx * 2] = $4
    }
}
END {
    # header: empty cell above the sample names, then each rs label twice
    printf "\t"
    for (i = 1; i <= max_rs; ++i) {
        printf "rs%d\trs%d\t", i, i
    }
    printf "\n"

    # entries
    for (r = 1; r <= n_entries; ++r) {
        entry = order[r]
        # Data is always passed as an argument, never as the format string,
        # so a "%" in the input cannot be misread as a format specifier.
        printf "%s\t", entry
        for (i = 1; i <= max_rs * 2; ++i) {
            # Missing cells print as empty without creating array elements.
            printf "%s\t", ((i in a[entry]) ? a[entry][i] : "")
        }
        printf "\n"
    }
}
它的工作原理是:将每个 rs[index] 的两列分别存储在 a[key][index*2-1] 和 a[key][index*2] 中,并在 END 块中统一输出;max_rs 用于记录最大的 rs 编号。