如何从多个文本文件中提取特定数据(使用字符串和分隔符)到 Excel 工作表?

如何从多个文本文件中提取特定数据(使用字符串和分隔符)到 Excel 工作表?

我有数千个文件(例如 EOG090W06LO_M0_mlc),如下所示,我想从这些文件中提取特定数据,例如

                 lnL            kappa (ts/tv)    omega (dN/dS)     tree length for dN   tree length for dS
EOG090W06LO  -5160.995083        1.50195           0.05318               0.6637               12.4810
EOG090W05ZG   ......             .......           ......               .......              ........
EOG090W06CS   ......             .......           ......               .......              ........

请指导我如何在 bash 或 python 中做到这一点。

示例文件 EOG090W06LO_M0_mlc

> Codon frequencies under model, for use in evolver (TTT TTC TTA TTG ...
> GGG):
>   0.01286657  0.01332375  0.01120763  0.01409444
>   0.00654743  0.00678008  0.00570324  0.00717226
>   0.01087585  0.01126231  0.00000000  0.00000000
>   0.00570731  0.00591011  0.00497145  0.00625197
>   0.01541211  0.01595975  0.01342496  0.01688291
>   0.00784278  0.00812146  0.00683158  0.00859123
>   0.01302755  0.01349046  0.01134786  0.01427079
>   0.00683646  0.00707938  0.00595501  0.00748887
>   0.02626538  0.02719867  0.02287888  0.02877192
>   0.01336570  0.01384063  0.01164241  0.01464121
>   0.02220161  0.02299050  0.01933907  0.02432034
>   0.01165072  0.01206470  0.01014854  0.01276256
>   0.03667897  0.03798228  0.03194980  0.04017930
>   0.01866488  0.01932810  0.01625834  0.02044610
>   0.03100401  0.03210568  0.02700654  0.03396277
>   0.01626995  0.01684807  0.01417220  0.01782261
> 
> 
> TREE #  1:  (15, 43, (((((1, (((((10, (12, 6)), ((((11, 3), 13), 5),
> 8)), (7, 9)), (14, 4)), 2)), 44), ((19, 24), ((21, 22), (47, 48)))),
> ((((((((16, 18), 17), 54), 20), (((((((26, 32), 51), (33, 53)), 29),
> ((27, 34), ((28, 31), 30))), 50), 35)), ((((36, (39, 40)), (38, 41)),
> 37), ((42, (45, 46)), 52))), 49), 25)), 23));   MP score: 1153
> lnL(ntime:  0  np:  2):  -5160.995083      +0.000000
>  1.501953 0.053177
> 
> tree length = 11.795207
> 
> (15: 0.475705, 43: 0.645809, (((((1: 0.206782, (((((10: 0.069332, (12:
> 0.046579, 6: 0.054610): 0.024700): 0.084846, ((((11: 0.011270, 3: 0.006663): 0.009598, 13: 0.006096): 0.062727, 5: 0.096529): 0.014611, 8: 0.101040): 0.032842): 0.041200, (7: 0.057180, 9: 0.084448):
> 0.062359): 0.047267, (14: 0.103211, 4: 0.090685): 0.077973): 0.064394, 2: 0.181198): 0.243921): 0.270070, 44: 0.295151): 0.134002, ((19:
> 0.391130, 24: 0.180895): 0.089913, ((21: 0.075882, 22: 0.052405): 0.142810, (47: 0.360326, 48: 0.151256): 0.061099): 0.073893): 0.186753): 0.035225, ((((((((16: 0.021929, 18: 0.021001): 0.022963, 17: 0.036132): 0.034858, 54: 0.055961): 0.084331, 20: 0.138317):
> 0.140076, (((((((26: 0.148166, 32: 0.104747): 0.033408, 51: 0.164604): 0.029923, (33: 0.190488, 53: 0.174541): 0.030186): 0.019935, 29: 0.222916): 0.020484, ((27: 0.050592, 34: 0.061516): 0.092558, ((28: 0.074252, 31: 0.108120): 0.020924, 30: 0.061404): 0.114540): 0.109270): 0.097724, 50: 0.202292): 0.206280, 35: 0.296817): 0.083934): 0.055128, ((((36: 0.026483, (39: 0.027394, 40: 0.011871): 0.003734): 0.010522, (38: 0.015286, 41: 0.013052): 0.022506): 0.062251, 37: 0.074776): 0.179134, ((42: 0.088283, (45: 0.133738, 46: 0.088861): 0.020813): 0.076282, 52: 0.230076): 0.048498): 0.051390): 0.167347, 49: 0.395711): 0.061789, 25: 0.377871): 0.036416): 0.064064, 23: 0.514741): 0.157612);
> 
> (taxon15: 0.475705, taxon43: 0.645809, (((((taxon1: 0.206782,
> (((((taxon10: 0.069332, (taxon12: 0.046579, taxon6: 0.054610):
> 0.024700): 0.084846, ((((taxon11: 0.011270, taxon3: 0.006663): 0.009598, taxon13: 0.006096): 0.062727, taxon5: 0.096529): 0.014611, taxon8: 0.101040): 0.032842): 0.041200, (taxon7: 0.057180, taxon9:
> 0.084448): 0.062359): 0.047267, (taxon14: 0.103211, taxon4: 0.090685): 0.077973): 0.064394, taxon2: 0.181198): 0.243921): 0.270070, taxon44: 0.295151): 0.134002, ((taxon19: 0.391130, taxon24: 0.180895): 0.089913, ((taxon21: 0.075882, taxon22: 0.052405): 0.142810, (taxon47: 0.360326, taxon48: 0.151256): 0.061099): 0.073893): 0.186753): 0.035225, ((((((((taxon16: 0.021929, taxon18: 0.021001): 0.022963, taxon17: 0.036132): 0.034858, taxon54: 0.055961): 0.084331, taxon20:
> 0.138317): 0.140076, (((((((taxon26: 0.148166, taxon32: 0.104747): 0.033408, taxon51: 0.164604): 0.029923, (taxon33: 0.190488, taxon53: 0.174541): 0.030186): 0.019935, taxon29: 0.222916): 0.020484, ((taxon27: 0.050592, taxon34: 0.061516): 0.092558, ((taxon28:
> 0.074252, taxon31: 0.108120): 0.020924, taxon30: 0.061404): 0.114540): 0.109270): 0.097724, taxon50: 0.202292): 0.206280, taxon35: 0.296817): 0.083934): 0.055128, ((((taxon36: 0.026483, (taxon39: 0.027394, taxon40: 0.011871): 0.003734): 0.010522, (taxon38: 0.015286, taxon41:
> 0.013052): 0.022506): 0.062251, taxon37: 0.074776): 0.179134, ((taxon42: 0.088283, (taxon45: 0.133738, taxon46: 0.088861):
> 0.020813): 0.076282, taxon52: 0.230076): 0.048498): 0.051390): 0.167347, taxon49: 0.395711): 0.061789, taxon25: 0.377871): 0.036416): 0.064064, taxon23: 0.514741): 0.157612);
> 
> Detailed output identifying parameters
> 
> kappa (ts/tv) =  1.50195
> 
> omega (dN/dS) =  0.05318
> 
> dN & dS for each branch
> 
>  branch          t       N       S   dN/dS      dN      dS  N*dN  S*dS
> 
>   55..15     0.476   158.4    60.6  0.0532  0.0268  0.5034   4.2  30.5
>   55..43     0.646   158.4    60.6  0.0532  0.0363  0.6834   5.8  41.4
>   55..56     0.158   158.4    60.6  0.0532  0.0089  0.1668   1.4  10.1
>   56..57     0.064   158.4    60.6  0.0532  0.0036  0.0678   0.6   4.1
>   57..58     0.035   158.4    60.6  0.0532  0.0020  0.0373   0.3   2.3
>   58..59     0.134   158.4    60.6  0.0532  0.0075  0.1418   1.2   8.6
>   59..60     0.270   158.4    60.6  0.0532  0.0152  0.2858   2.4  17.3
>   60..1      0.207   158.4    60.6  0.0532  0.0116  0.2188   1.8  13.3
>   60..61     0.244   158.4    60.6  0.0532  0.0137  0.2581   2.2  15.6
>   61..62     0.064   158.4    60.6  0.0532  0.0036  0.0681   0.6   4.1
>   62..63     0.047   158.4    60.6  0.0532  0.0027  0.0500   0.4   3.0
>   63..64     0.041   158.4    60.6  0.0532  0.0023  0.0436   0.4   2.6
>   64..65     0.085   158.4    60.6  0.0532  0.0048  0.0898   0.8   5.4
>   65..10     0.069   158.4    60.6  0.0532  0.0039  0.0734   0.6   4.4
>   65..66     0.025   158.4    60.6  0.0532  0.0014  0.0261   0.2   1.6
>   66..12     0.047   158.4    60.6  0.0532  0.0026  0.0493   0.4   3.0
>   66..6      0.055   158.4    60.6  0.0532  0.0031  0.0578   0.5   3.5
>   64..67     0.033   158.4    60.6  0.0532  0.0018  0.0348   0.3   2.1
>   67..68     0.015   158.4    60.6  0.0532  0.0008  0.0155   0.1   0.9
>   68..69     0.063   158.4    60.6  0.0532  0.0035  0.0664   0.6   4.0
>   69..70     0.010   158.4    60.6  0.0532  0.0005  0.0102   0.1   0.6
>   70..11     0.011   158.4    60.6  0.0532  0.0006  0.0119   0.1   0.7
>   70..3      0.007   158.4    60.6  0.0532  0.0004  0.0071   0.1   0.4
>   69..13     0.006   158.4    60.6  0.0532  0.0003  0.0065   0.1   0.4
>   68..5      0.097   158.4    60.6  0.0532  0.0054  0.1021   0.9   6.2
>   67..8      0.101   158.4    60.6  0.0532  0.0057  0.1069   0.9   6.5
>   63..71     0.062   158.4    60.6  0.0532  0.0035  0.0660   0.6   4.0
>   71..7      0.057   158.4    60.6  0.0532  0.0032  0.0605   0.5   3.7
>   71..9      0.084   158.4    60.6  0.0532  0.0048  0.0894   0.8   5.4
>   62..72     0.078   158.4    60.6  0.0532  0.0044  0.0825   0.7   5.0
>   72..14     0.103   158.4    60.6  0.0532  0.0058  0.1092   0.9   6.6
>   72..4      0.091   158.4    60.6  0.0532  0.0051  0.0960   0.8   5.8
>   61..2      0.181   158.4    60.6  0.0532  0.0102  0.1917   1.6  11.6
>   59..44     0.295   158.4    60.6  0.0532  0.0166  0.3123   2.6  18.9
>   58..73     0.187   158.4    60.6  0.0532  0.0105  0.1976   1.7  12.0
>   73..74     0.090   158.4    60.6  0.0532  0.0051  0.0951   0.8   5.8
>   74..19     0.391   158.4    60.6  0.0532  0.0220  0.4139   3.5  25.1
>   74..24     0.181   158.4    60.6  0.0532  0.0102  0.1914   1.6  11.6
>   73..75     0.074   158.4    60.6  0.0532  0.0042  0.0782   0.7   4.7
>   75..76     0.143   158.4    60.6  0.0532  0.0080  0.1511   1.3   9.2
>   76..21     0.076   158.4    60.6  0.0532  0.0043  0.0803   0.7   4.9
>   76..22     0.052   158.4    60.6  0.0532  0.0029  0.0555   0.5   3.4
>   75..77     0.061   158.4    60.6  0.0532  0.0034  0.0647   0.5   3.9
>   77..47     0.360   158.4    60.6  0.0532  0.0203  0.3813   3.2  23.1
>   77..48     0.151   158.4    60.6  0.0532  0.0085  0.1601   1.3   9.7
>   57..78     0.036   158.4    60.6  0.0532  0.0020  0.0385   0.3   2.3
>   78..79     0.062   158.4    60.6  0.0532  0.0035  0.0654   0.6   4.0
>   79..80     0.167   158.4    60.6  0.0532  0.0094  0.1771   1.5  10.7
>   80..81     0.055   158.4    60.6  0.0532  0.0031  0.0583   0.5   3.5
>   81..82     0.140   158.4    60.6  0.0532  0.0079  0.1482   1.2   9.0
>   82..83     0.084   158.4    60.6  0.0532  0.0047  0.0892   0.8   5.4
>   83..84     0.035   158.4    60.6  0.0532  0.0020  0.0369   0.3   2.2
>   84..85     0.023   158.4    60.6  0.0532  0.0013  0.0243   0.2   1.5
>   85..16     0.022   158.4    60.6  0.0532  0.0012  0.0232   0.2   1.4
>   85..18     0.021   158.4    60.6  0.0532  0.0012  0.0222   0.2   1.3
>   84..17     0.036   158.4    60.6  0.0532  0.0020  0.0382   0.3   2.3
>   83..54     0.056   158.4    60.6  0.0532  0.0031  0.0592   0.5   3.6
>   82..20     0.138   158.4    60.6  0.0532  0.0078  0.1464   1.2   8.9
>   81..86     0.084   158.4    60.6  0.0532  0.0047  0.0888   0.7   5.4
>   86..87     0.206   158.4    60.6  0.0532  0.0116  0.2183   1.8  13.2
>   87..88     0.098   158.4    60.6  0.0532  0.0055  0.1034   0.9   6.3
>   88..89     0.020   158.4    60.6  0.0532  0.0012  0.0217   0.2   1.3
>   89..90     0.020   158.4    60.6  0.0532  0.0011  0.0211   0.2   1.3
>   90..91     0.030   158.4    60.6  0.0532  0.0017  0.0317   0.3   1.9
>   91..92     0.033   158.4    60.6  0.0532  0.0019  0.0354   0.3   2.1
>   92..26     0.148   158.4    60.6  0.0532  0.0083  0.1568   1.3   9.5
>   92..32     0.105   158.4    60.6  0.0532  0.0059  0.1108   0.9   6.7
>   91..51     0.165   158.4    60.6  0.0532  0.0093  0.1742   1.5  10.5
>   90..93     0.030   158.4    60.6  0.0532  0.0017  0.0319   0.3   1.9
>   93..33     0.190   158.4    60.6  0.0532  0.0107  0.2016   1.7  12.2
>   93..53     0.175   158.4    60.6  0.0532  0.0098  0.1847   1.6  11.2
>   89..29     0.223   158.4    60.6  0.0532  0.0125  0.2359   2.0  14.3
>   88..94     0.109   158.4    60.6  0.0532  0.0061  0.1156   1.0   7.0
>   94..95     0.093   158.4    60.6  0.0532  0.0052  0.0979   0.8   5.9
>   95..27     0.051   158.4    60.6  0.0532  0.0028  0.0535   0.5   3.2
>   95..34     0.062   158.4    60.6  0.0532  0.0035  0.0651   0.5   3.9
>   94..96     0.115   158.4    60.6  0.0532  0.0064  0.1212   1.0   7.3
>   96..97     0.021   158.4    60.6  0.0532  0.0012  0.0221   0.2   1.3
>   97..28     0.074   158.4    60.6  0.0532  0.0042  0.0786   0.7   4.8
>   97..31     0.108   158.4    60.6  0.0532  0.0061  0.1144   1.0   6.9
>   96..30     0.061   158.4    60.6  0.0532  0.0035  0.0650   0.5   3.9
>   87..50     0.202   158.4    60.6  0.0532  0.0114  0.2141   1.8  13.0
>   86..35     0.297   158.4    60.6  0.0532  0.0167  0.3141   2.6  19.0
>   80..98     0.051   158.4    60.6  0.0532  0.0029  0.0544   0.5   3.3
>   98..99     0.179   158.4    60.6  0.0532  0.0101  0.1895   1.6  11.5
>   99..100    0.062   158.4    60.6  0.0532  0.0035  0.0659   0.6   4.0
>  100..101    0.011   158.4    60.6  0.0532  0.0006  0.0111   0.1   0.7
>  101..36     0.026   158.4    60.6  0.0532  0.0015  0.0280   0.2   1.7
>  101..102    0.004   158.4    60.6  0.0532  0.0002  0.0040   0.0   0.2
>  102..39     0.027   158.4    60.6  0.0532  0.0015  0.0290   0.2   1.8
>  102..40     0.012   158.4    60.6  0.0532  0.0007  0.0126   0.1   0.8
>  100..103    0.023   158.4    60.6  0.0532  0.0013  0.0238   0.2   1.4
>  103..38     0.015   158.4    60.6  0.0532  0.0009  0.0162   0.1   1.0
>  103..41     0.013   158.4    60.6  0.0532  0.0007  0.0138   0.1   0.8
>   99..37     0.075   158.4    60.6  0.0532  0.0042  0.0791   0.7   4.8
>   98..104    0.048   158.4    60.6  0.0532  0.0027  0.0513   0.4   3.1
>  104..105    0.076   158.4    60.6  0.0532  0.0043  0.0807   0.7   4.9
>  105..42     0.088   158.4    60.6  0.0532  0.0050  0.0934   0.8   5.7
>  105..106    0.021   158.4    60.6  0.0532  0.0012  0.0220   0.2   1.3
>  106..45     0.134   158.4    60.6  0.0532  0.0075  0.1415   1.2   8.6
>  106..46     0.089   158.4    60.6  0.0532  0.0050  0.0940   0.8   5.7
>  104..52     0.230   158.4    60.6  0.0532  0.0129  0.2435   2.1  14.7
>   79..49     0.396   158.4    60.6  0.0532  0.0223  0.4187   3.5  25.4
>   78..25     0.378   158.4    60.6  0.0532  0.0213  0.3998   3.4  24.2
>   56..23     0.515   158.4    60.6  0.0532  0.0290  0.5447   4.6  33.0
> 
> tree length for dN:       0.6637 tree length for dS:      12.4810
> 
> 
> Time used:  0:04

答案1

任务规格:

(1) 从一个指定目录中保存的大量文件中读取并收集数据。每个文件名的前 11 个字符都是唯一的,名称的最后 7 个字符应为_M0_mlc.

(2) 文件是带有空格的纯文本,本质上是自由格式的。

(3) 要提取的数据由列出的特定文本标识。对于每个文件,仅使用第一个此类匹配。

lnL
kappa
omega
tree length for dN
tree length for dS

同一文本行上紧随其后的实数应为输出值。对于任何未找到此类特定文本或值的文件,应输出值 N/A。

(4) 输出应为问题中所示的单个列表,每个输入文件一行。

(5) 任何文件中的所有其他数据均应被忽略。

(6) [正如标题提到的 Excel 工作表]。应提供将数据输出为 CSV(逗号分隔值)文本文件的选项,适合导入到 Excel。

这是一个经过测试的脚本,可以根据上面的注释工作。它已经使用问题中发布的文件进行了测试,加上对数据的一些修改以测试某些条件(例如丢失数据)。

测试文件有166行,我制作了2600份用于性能测试。在我的笔记本电脑上,它在 1.25 秒内运行 2600 个 166 行的文件,大约每秒 300,000 行。您的 512KB 文件大约大 40 倍,因此它可能会在一分钟内处理完这批文件。

-c如果您希望CSV输出导入到 Excel,则需要一个参数。

Paul--) ./awkFastParse
DataSet      lnL                kappa (ts/tv)      omega (dN/dS)      tree length for dN tree length for dS
EOA100W06LO  -5160.995083       1.50195            0.05318            0.6637             12.4810           
EOA110W06LO  -5160.995083       1.50195            0.05318            0.6637             12.4810           
EOA120W06LO  N/A                N/A                N/A                N/A                N/A               
EOA130W06LO  -5160.995083       N/A                0.05318            0.6637             12.4810           
EOA140W06LO  -5160.995083       N/A                0.05318            0.6637             12.4810           
EOA150W06LO  -5160.995083       1.50195            0.05318            0.6637             12.4810           
EOA160W06LO  N/A                N/A                N/A                0.6637             12.4810           
EOA170W06LO  -5160.995083       1.50195            0.05318            0.6637             12.4810           
EOA180W06LO  -5160.995083       1.50195            0.05318            0.6637             12.4810           
Paul--) 
Paul--) ./awkFastParse -c
DataSet,lnL,kappa (ts/tv),omega (dN/dS),tree length for dN,tree length for dS
EOA100W06LO,-5160.995083,1.50195,0.05318,0.6637,12.4810
EOA110W06LO,-5160.995083,1.50195,0.05318,0.6637,12.4810
EOA120W06LO,N/A,N/A,N/A,N/A,N/A
EOA130W06LO,-5160.995083,N/A,0.05318,0.6637,12.4810
EOA140W06LO,-5160.995083,N/A,0.05318,0.6637,12.4810
EOA150W06LO,-5160.995083,1.50195,0.05318,0.6637,12.4810
EOA160W06LO,N/A,N/A,N/A,0.6637,12.4810
EOA170W06LO,-5160.995083,1.50195,0.05318,0.6637,12.4810
EOA180W06LO,-5160.995083,1.50195,0.05318,0.6637,12.4810
Paul--) 

该脚本当前从子目录“Data”获取文件,切断任何路径名和“_M0_mlc”文件名扩展名。这可能需要一些更改才能满足您的需求。如果您需要任何改进,请发布。

#! /bin/bash --

[[ "${1}" = "-c" ]] && { Csv=1; shift; }    #.. Option to set CSV output.

awkFastParse () {

    AWK='
BEGIN {
    txTitle = "DataSet,lnL,kappa (ts/tv),omega (dN/dS)," \
        "tree length for dN,tree length for dS";
    split (txTitle, Title, /,/);
    fmtCSV = "%s,%s,%s,%s,%s,%s\n";
    fmtCol = "%-12s %-18s %-18s %-18s %-18s %-18s\n";
    fmtRow = (Csv) ? fmtCSV : fmtCol;
    printf (fmtRow, Title[1], Title[2], Title[3], Title[4], Title[5], Title[6]);

    stderr = "cat 1>&2"; NUL = "\000"; NL = RS; RS = NUL;
    reAnyTag = "(lnL|kappa|omega|tree length for (dN|dS))";
    reValue = "[-+]?[0-9]+[.][0-9]+";
    Tag["lnL"]; Tag["kappa"]; Tag["omega"];
    Tag["tree length for dN"]; Tag["tree length for dS"];
}

function doFile (fn, Local, g, Set, fnr, txt, tag, j, cut, value) {

    RS = NL;
    while ((g = (getline txt < fn)) > 0) {
        ++fnr;
        #.. Optimisation -- skip any line that does not contain a Tag.
        if (txt !~ reAnyTag) continue;
        #.. Find all tags in this line.
        for (tag in Tag) {
            #.. Only take the first value in any file.
            if (tag in value) continue;
            #.. Locate the tag, or skip.
            if ((j = index (txt, tag)) == 0) continue;
            #.. Isolate the rest of the line.
            cut = substr (txt, j);
            #.. The value is the next real number.
            if (match (cut, reValue)) {
                value[tag] = substr (cut, RSTART, RLENGTH);
            }
        }
        #.. Optimisation -- if we have one of each value, skip the file.
        if (length (value) == length (Tag)) break;
    }
    if (g < 0) printf ("%s: %s\n", fn, ERRNO) | stderr;
    close (fn);
    RS = NUL;
    #.. Format and print the line for this file.
    Set = fn; sub (/^.*\//, "", Set); sub (/_M0_mlc$/, "", Set);
    for (tag in Tag) if (! (tag in value)) value[tag] = "N/A";
    printf (fmtRow, Set, value["lnL"], value["kappa"], value["omega"],
        value["tree length for dN"], value["tree length for dS"]);
}

{ doFile( $0); }
'
    awk -v Csv="${Csv}" -f <( printf '%s' "${AWK}" )
}


#.. Generate a list of file names with null terminations.

    find Data -name '*_M0_mlc' -print0 | sort -z | awkFastParse

相关内容