在块中插入缺失的数据

Question

感觉可以用更简单的方式来完成，但经过一个小时的绞尽脑汁后我能想到的最好的方法是这个Python脚本:

#! /usr/bin/env python3

import sys, os

class Block:
    """A block of records: two source values plus a list of mixtures.

    A source is a doubled character such as "AA" or "TT"; a mixture is a
    string of such characters such as "AT".  When a source is not supplied
    it is derived from the characters that occur in the mixtures.
    """

    # Class-level defaults; every instance overrides them in __init__.
    block_id = ''
    source1 = ''
    source2 = ''
    mixtures = []

    def __init__(self, block_id='', source1='', source2='', mixtures=None):
        """Store the block data and fill in whichever source is missing.

        Args:
            block_id: Identifier printed in front of every line of the block.
            source1: First source, or '' if it has to be derived.
            source2: Second source, or '' if it has to be derived.
            mixtures: List of mixture strings; defaults to a fresh empty list.

        Raises:
            KeyError: A source is missing and the mixtures contain no
                character left to derive it from.
        """
        self.block_id = block_id
        # A None default avoids sharing one list between all instances.
        self.mixtures = [] if mixtures is None else mixtures
        self.source1 = source1
        self.source2 = source2

        # Convert mixtures to a set of characters. For example,
        #     ''.join(["AT", "TT"])
        # creates the string "ATTT". set() then converts that string to
        # a set of characters {'A', 'T'}
        sources = set(''.join(self.mixtures))

        # If a source is empty, take a character that is not already used by
        # the other source. Since the set contains single characters, we
        # double it to get "AA", "TT", etc.
        if self.source1 == '':
            self.source1 = self._pick(sources - set(self.source2)) * 2
        # discard() rather than remove(): a known source1 need not occur in
        # any mixture (e.g. a block that has no mixture lines at all).
        sources.discard(self.source1[0])
        if self.source2 == '':
            self.source2 = self._pick(sources) * 2

    @staticmethod
    def _pick(candidates):
        """Return the smallest character of *candidates*.

        Taking the minimum instead of set.pop() makes the result independent
        of set iteration order, so repeated runs give the same output.
        """
        if not candidates:
            raise KeyError('no character left in the mixtures to fill a missing source')
        return min(candidates)

    def print(self):
        """Print the block: source1, source2, then one line per mixture."""
        print(self.block_id, "source1", self.source1)
        print(self.block_id, "source2", self.source2)
        for mix in self.mixtures:
            print(self.block_id, "mixture", mix)

# Read from standard input when no file names are given on the command
# line, otherwise from each named file in turn.
if len(sys.argv) == 1:
    files = [sys.stdin]
else:
    files = (open(f) for f in sys.argv[1:])

for f in files:
    try:
        # Read in all the lines, split into their space-separated fields
        # (block ID, record kind, value). Blank lines are skipped.
        data = [rawline.strip().split(' ') for rawline in f if rawline.strip()]
    finally:
        # Close the files we opened ourselves; leave stdin alone.
        if f is not sys.stdin:
            f.close()

    # Get the unique block IDs. This is a set, so the blocks are processed
    # in arbitrary order.
    blocks = set(line[0] for line in data)
    # For each block ID
    for b in blocks:
        # The corresponding mixtures
        mix = [line[2] for line in data if line[0] == b and "mixture" == line[1]]

        # If "source1 XX" is present, we will get the list ['XX'], and [] if
        # source1 is not present. ''.join() allows us to flatten ['XX']  to
        # just 'XX' (and turns [] into ''). Similarly for source2.
        source1 = ''.join(line[2] for line in data if line[0] == b and "source1" == line[1])
        source2 = ''.join(line[2] for line in data if line[0] == b and "source2" == line[1])

        # Create an object of the class defined above, and print it.
        # Initialization fills up the blank values.
        Block(b, source1, source2, mix).print()

即便如此，它的输出顺序仍然是随机的、无序的（例如 block3 的数据可能出现在 block1 之前）。

将其保存在脚本中（例如，insert.py）并运行：

python3 insert.py inputfile

我用 awk 重写了这个脚本：

#! /usr/bin/awk -f

# build(block, source1, source2, sources, mixtures)
#
# Print one complete block: its "source1" line, its "source2" line and
# then its "mixture" lines.  An empty source is first filled in with a
# doubled character taken from `sources` that differs from the other
# source.
#
#   block     block ID printed in front of every line
#   source1   first source, or "" if it has to be derived
#   source2   second source, or "" if it has to be derived
#   sources   array keyed by the single characters seen in the mixtures
#   mixtures  array mapping each mixture string to its number of occurrences
#
# char, m and i are local variables (extra parameters are awk's way of
# declaring locals), so the loops here do not clobber globals.
function build (block, source1, source2, sources, mixtures,    char, m, i)
{
    if (! source1)
    {
        for (char in sources)
        {
            # Skip the character that already makes up source2.
            if (source2 != char char)
            {
                source1 = char char
                delete sources[char]
                break
            }
        }
    }
    if (! source2)
    {
        for (char in sources)
        {
            # Skip the character that already makes up source1.
            if (source1 != char char)
            {
                source2 = char char
                delete sources[char]
                break
            }
        }
    }
    printf "%s %s %s\n", block, "source1", source1
    printf "%s %s %s\n", block, "source2", source2
    for (m in mixtures)
    {
        # mixtures[m] is a count, so a repeated mixture is printed once
        # per occurrence.
        for (i = 0; i < mixtures[m]; i++)
        {
            printf "%s %s %s\n", block, "mixture", m
        }
    }

}

# Main rule, run for every input line ($1 = block ID, $2 = record kind,
# $3 = value).  Records are accumulated per block; when the block ID
# changes, the finished block is printed and the accumulators are reset.
{
    if ($1 != prev)
    {
        # Nothing to print before the very first block has been read.
        if (prev in data)
            build(prev, source1, source2, sources, mixtures)

        prev = $1
        source1 = source2 = ""
        delete sources
        delete mixtures
    }

    data[$1]++

    if ($2 == "source1")
        source1 = $3
    else if ($2 == "source2")
        source2 = $3
    else if ($2 == "mixture")
    {
        # Count the mixture itself ...
        mixtures[$3]++
        # ... and record each of its characters as a possible source.
        split ($3, chars, "")
        for (i = 1; i <= length($3); i++)
            sources[chars[i]]++
    }
}

# Print the last block.  The guard keeps empty input from producing an
# empty "source1"/"source2" pair.
END { if (prev in data) build(prev, source1, source2, sources, mixtures) }

将其保存在脚本中（例如insert.awk），chmod +x然后运行它：

./insert.awk inputfile

现在它也应该能保留原有的顺序了。请注意，我使用了 delete，某些 awk 实现可能不支持它（但 GNU awk 和 mawk 应该都支持）。

Answer 1