如何使用 awk 根据上一行和下一行的内容对某一列减去或加上 12?

如何使用 awk 根据上一行和下一行的内容对某一列减去或加上 12?

我有这个数据:

##sequence-region Q75T13 1 641
Q75T13,UniProtKB,Chain,1,641,.,.,.,ID
Q75T13,UniProtKB,Topological domain,1,60,.,.,.,Note=Cytoplasmic
Q75T13,UniProtKB,Transmembrane,61,85,.,.,.,Note=Helical
Q75T13,UniProtKB,Topological domain,86,641,.,.,.,Note=Lumenal


##sequence-region Q9BRR3 1 403
Q9BRR3,UniProtKB,Chain,1,403,.,.,.,ID
Q9BRR3,UniProtKB,Topological domain,1,22,.,.,.,Note=Lumenal
Q9BRR3,UniProtKB,Transmembrane,23,43,.,.,.,Note=Helical
Q9BRR3,UniProtKB,Topological domain,44,259,.,.,.,Note=Cytoplasmic

##sequence-region Q96FM1 1 250
Q96FM1,UniProtKB,Topological domain,120,135,.,.,.,Note=Cytoplasmic
Q96FM1,UniProtKB,Transmembrane,136,156,.,.,.,Note=Helical
Q96FM1,UniProtKB,Topological domain,157,169,.,.,.,Note=Lumenal
Q96FM1,UniProtKB,Transmembrane,170,190,.,.,.,Note=Helical
Q96FM1,UniProtKB,Topological domain,191,250,.,.,.,Note=Lumenal

我想知道 awk 代码会是什么样子:

对于包含单词 Lumenal 的行:如果它的上一行包含单词 Transmembrane,则将第 4 列减去 12 并打印这一 Lumenal 行;如果它的下一行包含单词 Transmembrane,则将第 5 列加上 12 并打印这一 Lumenal 行。最终文件将是:

Q75T13,UniProtKB,Topological domain,74,641,.,.,.,Note=Lumenal
Q9BRR3,UniProtKB,Topological domain,1,34,.,.,.,Note=Lumenal
Q96FM1,UniProtKB,Topological domain,145,169,.,.,.,Note=Lumenal
Q96FM1,UniProtKB,Topological domain,157,181,.,.,.,Note=Lumenal
Q96FM1,UniProtKB,Topological domain,179,250,.,.,.,Note=Lumenal

有人能帮我吗?我有点卡住了。我正在尝试使用 awk 和 grep

答案1

尝试以下命令:

root@u2004:~# cat test
##sequence-region Q75T13 1 641
Q75T13,UniProtKB,Chain,1,641,.,.,.,ID
Q75T13,UniProtKB,Topological domain,1,60,.,.,.,Note=Cytoplasmic
Q75T13,UniProtKB,Transmembrane,61,85,.,.,.,Note=Helical
Q75T13,UniProtKB,Topological domain,86,641,.,.,.,Note=Lumenal


##sequence-region Q9BRR3 1 403
Q9BRR3,UniProtKB,Chain,1,403,.,.,.,ID
Q9BRR3,UniProtKB,Topological domain,1,22,.,.,.,Note=Lumenal
Q9BRR3,UniProtKB,Transmembrane,23,43,.,.,.,Note=Helical
Q9BRR3,UniProtKB,Topological domain,44,259,.,.,.,Note=Cytoplasmic

##sequence-region Q96FM1 1 250
Q96FM1,UniProtKB,Topological domain,120,135,.,.,.,Note=Cytoplasmic
Q96FM1,UniProtKB,Transmembrane,136,156,.,.,.,Note=Helical
Q96FM1,UniProtKB,Topological domain,157,169,.,.,.,Note=Lumenal
Q96FM1,UniProtKB,Transmembrane,170,190,.,.,.,Note=Helical
Q96FM1,UniProtKB,Topological domain,191,250,.,.,.,Note=Lumenal
root@u2004:~# 
root@u2004:~# awk -F, -v OFS=, '{while(1){if($0~/Lumenal/){a=$0; $4-=12;p=$0; $0=a;$5+=12;n=$0; if(index(pre,"Transmembrane")>0)print p; if(getline>0){if(index($0,"Transmembrane"))print n; if($0~/Lumenal/){pre=$0; continue}}} break}} {pre=$0}' test
Q75T13,UniProtKB,Topological domain,74,641,.,.,.,Note=Lumenal
Q9BRR3,UniProtKB,Topological domain,1,34,.,.,.,Note=Lumenal
Q96FM1,UniProtKB,Topological domain,145,169,.,.,.,Note=Lumenal
Q96FM1,UniProtKB,Topological domain,157,181,.,.,.,Note=Lumenal
Q96FM1,UniProtKB,Topological domain,179,250,.,.,.,Note=Lumenal
root@u2004:~#

答案2

只需保留 3 行的滚动缓冲区并检查:

$ cat tst.awk
# tst.awk - keep a rolling window of three lines (before / here / ahead) and
# print adjusted copies of every line in the middle slot that contains
# "Lumenal":
#   - column 4 minus 12 when the line before it contains "Transmembrane"
#   - column 5 plus 12 when the line after it contains "Transmembrane"
# A line that satisfies both conditions is printed once per adjustment.

BEGIN {
    FS  = ","
    OFS = ","
}

# Each incoming record becomes the look-ahead line; the line read one
# record earlier is the one that gets judged.
{
    ahead = $0
    prt()
}

# After the final record the last line still sits in the middle slot, so
# judge it once more with an empty look-ahead.
END {
    prt()
}

# Judge the middle line against its neighbours, then slide the window.
function prt() {
    if ( here ~ /Lumenal/ ) {
        if ( before ~ /Transmembrane/ )
            emit(4, -12)

        if ( ahead ~ /Transmembrane/ )
            emit(5, 12)
    }

    before = here
    here   = ahead
    ahead  = ""
}

# Reload the middle line into $0 (so any earlier adjustment is discarded),
# add delta to field number col and print the rebuilt record.
function emit(col, delta) {
    $0 = here
    $col += delta
    print
}

$ awk -f tst.awk file
Q75T13,UniProtKB,Topological domain,74,641,.,.,.,Note=Lumenal
Q9BRR3,UniProtKB,Topological domain,1,34,.,.,.,Note=Lumenal
Q96FM1,UniProtKB,Topological domain,145,169,.,.,.,Note=Lumenal
Q96FM1,UniProtKB,Topological domain,157,181,.,.,.,Note=Lumenal
Q96FM1,UniProtKB,Topological domain,179,250,.,.,.,Note=Lumenal

答案3

在您将列分隔符更改为逗号之前,我已经解决了这个问题。第一项工作是将测试文件中的多个空格更改为制表符:

$ cat indata 
##sequence-region Q75T13 1 641
Q75T13  UniProtKB   Chain   1   641 .   .   .   ID
Q75T13  UniProtKB   Topological domain  1   60  .   .   .   Note=Cytoplasmic    
Q75T13  UniProtKB   Transmembrane   61  85  .   .   .   Note=Helical
Q75T13  UniProtKB   Topological domain  86  641 .   .   .   Note=Lumenal


##sequence-region Q9BRR3 1 403
Q9BRR3  UniProtKB   Chain   1   403 .   .   .   ID
Q9BRR3  UniProtKB   Topological domain  1   22  .   .   .   Note=Lumenal
Q9BRR3  UniProtKB   Transmembrane   23  43  .   .   .   Note=Helical
Q9BRR3  UniProtKB   Topological domain  44  259 .   .   .   Note=Cytoplasmic

##sequence-region Q96FM1 1 250
Q96FM1  UniProtKB   Topological domain  120 135 .   .   .   Note=Cytoplasmic
Q96FM1  UniProtKB   Transmembrane   136 156 .   .   .   Note=Helical
Q96FM1  UniProtKB   Topological domain  157 169 .   .   .   Note=Lumenal
Q96FM1  UniProtKB   Transmembrane   170 190 .   .   .   Note=Helical
Q96FM1  UniProtKB   Topological domain  191 250 .   .   .   Note=Lumenal

接下来是脚本。请注意,函数中 split 的第三个参数是制表符。

#!/bin/bash
#
# doit - print adjusted copies of the "Lumenal" rows of a tab-separated
# feature table.
#
# A line containing "Lumenal" is printed
#   * with column 4 reduced by 12 when the line before it contains
#     "Transmembrane", and
#   * with column 5 increased by 12 when the line after it contains
#     "Transmembrane".
# A line that satisfies both conditions is printed twice, once per
# adjustment. All other lines are dropped.
#
# Usage: doit [FILE]     (standard input is read when FILE is omitted)

#######################################
# Run the awk filter described above.
# Arguments: $1 - optional path of the tab-separated input file
# Outputs:   adjusted "Lumenal" rows on stdout
# Returns:   exit status of awk
#######################################
adjust_lumenal() {
  # ${1:+"$1"} passes the file name as exactly one word (spaces are safe) and
  # passes nothing at all when no name was given, so awk falls back to stdin.
  awk '
        # Print out_line with 12 added to column 5.
        # The names after the wide gap in the parameter list are locals.
        function add12(out_line,    iarr, arr, i) {
                # "\t" is spelled as an escape so the separator survives
                # editors and web pages that turn literal tabs into spaces.
                iarr = split( out_line, arr, "\t" )
                arr[5] = 12 + arr[5]
                printf( "%s", arr[1])
                for (i=2 ; i<=iarr ; i++) printf( "\t%s", arr[i] )
                printf( "\n" )
        }

        # Print out_line with 12 subtracted from column 4.
        function sub12(out_line,    iarr, arr, i) {
                iarr = split( out_line, arr, "\t" )
                arr[4] = arr[4] - 12
                printf( "%s", arr[1])
                for (i=2 ; i<=iarr ; i++) printf( "\t%s", arr[i] )
                printf( "\n" )
        }

        # test_line is the line read one record ago and last_line the one
        # before that; both start out empty, so the very first input line is
        # examined like any other once the second line arrives.
        test_line ~ /Lumenal/ {
                if (last_line ~ /Transmembrane/) sub12( test_line )
                if ($0  ~ /Transmembrane/) add12( test_line )
        }

        # Slide the window forward by one line.
        {
                last_line = test_line
                test_line = $0
        }

        # The final line has no successor, so only the look-behind rule
        # can apply to it.
        END {
                if (test_line ~ /Lumenal/) {
                        if (last_line ~ /Transmembrane/) sub12( test_line )
                }
        }
  ' ${1:+"$1"}
}

adjust_lumenal "$@"

下面是实际运行结果(是否有效,一试便知):

$ ./doit indata
Q75T13  UniProtKB   Topological domain  74  641 .   .   .   Note=Lumenal
Q9BRR3  UniProtKB   Topological domain  1   34  .   .   .   Note=Lumenal
Q96FM1  UniProtKB   Topological domain  145 169 .   .   .   Note=Lumenal
Q96FM1  UniProtKB   Topological domain  157 181 .   .   .   Note=Lumenal
Q96FM1  UniProtKB   Topological domain  179 250 .   .   .   Note=Lumenal

我还创建了处理逗号分隔文件的 doit2,它与 doit 的差异如下:

$ diff doit*
4c4
<       iarr = split( out_line, arr, "  " )
---
>       iarr = split( out_line, arr, "," )
7c7
<       for (i=2 ; i<=iarr ; i++) printf( "\t%s", arr[i] )
---
>       for (i=2 ; i<=iarr ; i++) printf( ",%s", arr[i] )
12c12
<       iarr = split( out_line, arr, "  " )
---
>       iarr = split( out_line, arr, "," )
15c15
<       for (i=2 ; i<=iarr ; i++) printf( "\t%s", arr[i] )
---
>       for (i=2 ; i<=iarr ; i++) printf( ",%s", arr[i] )

并使用 csv 文件:

$ ./doit2 comma
Q75T13,UniProtKB,Topological domain,74,641,.,.,.,Note=Lumenal
Q9BRR3,UniProtKB,Topological domain,1,34,.,.,.,Note=Lumenal
Q96FM1,UniProtKB,Topological domain,145,169,.,.,.,Note=Lumenal
Q96FM1,UniProtKB,Topological domain,157,181,.,.,.,Note=Lumenal
Q96FM1,UniProtKB,Topological domain,179,250,.,.,.,Note=Lumenal

相关内容