比较两个文件的四列

Question 1

您可以用于sqlite加入您的数据：

$ sqlite3 <<EOT
.mode csv
CREATE TABLE file1 (A11,A12,A13,A14,A15,A16,A17,A18,A19,A110,A111);
CREATE TABLE file2 (B11,B12,B13,B14);
.import file1 file1
.import file2 file2
SELECT DISTINCT file1.*, ifnull(file2.B14,"NOACT") FROM file1 LEFT JOIN file2 ON abs(julianday(file1.A11) - julianday(file2.B11))*86400.0 < 1 AND A15 = B13;
EOT

这使：

19:15:30.047315,184.125.142.179,87492,174.145.246.178,87961,.,7648919765,145,767942442,896450764,1,NOACT
19:15:30.047578,184.125.142.185,87492,174.145.246.132,52364,.,7648919765,145,767942442,896450764,1,OK
19:15:31.047315,184.125.142.145,87492,174.145.246.158,52364,.,7648919765,145,767942442,896450764,1,NOACT
19:15:31.049687,184.125.142.145,87492,174.145.246.158,44444,.,7648919765,145,767942442,896450764,1,NOT
19:15:32.049687,184.125.142.145,87492,174.145.246.158,44454,.,7648919765,145,767942442,896450764,1,NOACT

Answer

您可以用于sqlite加入您的数据：

$ sqlite3 <<EOT
.mode csv
CREATE TABLE file1 (A11,A12,A13,A14,A15,A16,A17,A18,A19,A110,A111);
CREATE TABLE file2 (B11,B12,B13,B14);
.import file1 file1
.import file2 file2
SELECT DISTINCT file1.*, ifnull(file2.B14,"NOACT") FROM file1 LEFT JOIN file2 ON abs(julianday(file1.A11) - julianday(file2.B11))*86400.0 < 1 AND A15 = B13;
EOT

这使：

19:15:30.047315,184.125.142.179,87492,174.145.246.178,87961,.,7648919765,145,767942442,896450764,1,NOACT
19:15:30.047578,184.125.142.185,87492,174.145.246.132,52364,.,7648919765,145,767942442,896450764,1,OK
19:15:31.047315,184.125.142.145,87492,174.145.246.158,52364,.,7648919765,145,767942442,896450764,1,NOACT
19:15:31.049687,184.125.142.145,87492,174.145.246.158,44444,.,7648919765,145,767942442,896450764,1,NOT
19:15:32.049687,184.125.142.145,87492,174.145.246.158,44454,.,7648919765,145,767942442,896450764,1,NOACT

Question 2

以下脚本应该可以完成您在 python 中寻找的内容：

#!/usr/bin/env python3

import sys
import csv
import bisect
import re

timeRange = 1 # seconds

# Timestamps should match the following regex
timestampRegex = re.compile('[0-9]*:*[0-9]*:[0-9][0-9]\.*[0-9]*')

# Convert the time from a string 'hh:mm:ss' to a float
def timeToFloat(time):
    # We add hours and minutes in case they are missinge
    # We reverse the list
    # We only take the first three elements of the list anyway
    (s, m, h) = (['00', '00'] + time.split(':'))[::-1][0:3]
    result = float(h) * 3600 + float(m) * 60 + float(s)
    return result

if (len(sys.argv) != 4):
    print('Usage: {} <input file 1> <input file 2> <output file>'.format(sys.argv[0]))
    exit(1)

inputFileName1 = sys.argv[1]
inputFileName2 = sys.argv[2]
outputFileName = sys.argv[3]

# Each entry will be a tuple with the time as the first element
# and the row of file2 as the second element
file2Entries = []
with open(inputFileName2) as inputFile2:
    csvReader = csv.reader(inputFile2)
    for row in csvReader:
        if len(row) == 4:
            if not timestampRegex.match(row[0]):
                continue
            time = timeToFloat(row[0])
            file2Entries.append((time, row))

file1Entries = []
with open(inputFileName1) as inputFile1, open(outputFileName, 'w') as outputFile:
    csvReader = csv.reader(inputFile1)
    # For each row in file1 we look for a match among the entries of file2
    for row in csvReader:
        if len(row) == 11:
            if not timestampRegex.match(row[0]):
                # We can't possibly find a match for this line
                outputFile.write(','.join(row + ['NOACT']))
                outputFile.write('\n')
                continue
            time = timeToFloat(row[0])
            # Find the first and last entries of file2 within a range of 1 second
            a = bisect.bisect_right(file2Entries, (time - timeRange,))
            b = bisect.bisect_left(file2Entries, (time + timeRange,))
            # Loop on the possible matches (those within the given range)
            for entry in file2Entries[a:b]:
                if entry[1][2] == row[4]:
                    outputFile.write(','.join(row + [entry[1][3]]))
                    outputFile.write('\n')
                    break;
            else:
                # We haven't found a match in file2
                outputFile.write(','.join(row + ['NOACT']))
                outputFile.write('\n')

您可以将其另存为compare.py并运行为：

./compare.py input1.txt input2.txt output.txt

请注意，我并不完全确定提取的范围的正确性，您应该确保它正常工作。

更新：第二个输入文件中不包含有效时间戳的行现在将被忽略，而第一个输入文件中不包含有效时间戳的行现在将复制到输出文件，并在末尾附加 NOACT

Answer

以下脚本应该可以完成您在 python 中寻找的内容：

#!/usr/bin/env python3

import sys
import csv
import bisect
import re

timeRange = 1 # seconds

# Timestamps should match the following regex
timestampRegex = re.compile('[0-9]*:*[0-9]*:[0-9][0-9]\.*[0-9]*')

# Convert the time from a string 'hh:mm:ss' to a float
def timeToFloat(time):
    # We add hours and minutes in case they are missinge
    # We reverse the list
    # We only take the first three elements of the list anyway
    (s, m, h) = (['00', '00'] + time.split(':'))[::-1][0:3]
    result = float(h) * 3600 + float(m) * 60 + float(s)
    return result

if (len(sys.argv) != 4):
    print('Usage: {} <input file 1> <input file 2> <output file>'.format(sys.argv[0]))
    exit(1)

inputFileName1 = sys.argv[1]
inputFileName2 = sys.argv[2]
outputFileName = sys.argv[3]

# Each entry will be a tuple with the time as the first element
# and the row of file2 as the second element
file2Entries = []
with open(inputFileName2) as inputFile2:
    csvReader = csv.reader(inputFile2)
    for row in csvReader:
        if len(row) == 4:
            if not timestampRegex.match(row[0]):
                continue
            time = timeToFloat(row[0])
            file2Entries.append((time, row))

file1Entries = []
with open(inputFileName1) as inputFile1, open(outputFileName, 'w') as outputFile:
    csvReader = csv.reader(inputFile1)
    # For each row in file1 we look for a match among the entries of file2
    for row in csvReader:
        if len(row) == 11:
            if not timestampRegex.match(row[0]):
                # We can't possibly find a match for this line
                outputFile.write(','.join(row + ['NOACT']))
                outputFile.write('\n')
                continue
            time = timeToFloat(row[0])
            # Find the first and last entries of file2 within a range of 1 second
            a = bisect.bisect_right(file2Entries, (time - timeRange,))
            b = bisect.bisect_left(file2Entries, (time + timeRange,))
            # Loop on the possible matches (those within the given range)
            for entry in file2Entries[a:b]:
                if entry[1][2] == row[4]:
                    outputFile.write(','.join(row + [entry[1][3]]))
                    outputFile.write('\n')
                    break;
            else:
                # We haven't found a match in file2
                outputFile.write(','.join(row + ['NOACT']))
                outputFile.write('\n')

您可以将其另存为compare.py并运行为：

./compare.py input1.txt input2.txt output.txt

请注意，我并不完全确定提取的范围的正确性，您应该确保它正常工作。

更新：第二个输入文件中不包含有效时间戳的行现在将被忽略，而第一个输入文件中不包含有效时间戳的行现在将复制到输出文件，并在末尾附加 NOACT

Question 3

新编辑的脚本没问题，但在新行上停止，
这是错误

root@localhost:~/python# ./compare.py input1.txt input2.txt output.txt Traceback (most recent call last): File "./compare.py", line 46, in <module> for row in csvReader: File "/usr/lib/python3.2/codecs.py", line 300, in decode (result, consumed) = self._buffer_decode(data, self.errors, final) UnicodeDecodeError: 'utf-8' codec can't decode byte 0x83 in position 2099: invalid start byte``

我在 input1 中查找了下一行，它与其他行类似，所以我尝试解决该问题并发现了这一点：

https://stackoverflow.com/questions/9233027/unicodedecodeerror-charmap-codec-cant-decode-byte-x-in-position-y-character

我将脚本更改为

#!/usr/bin/env python3

    import sys
    import csv
    import bisect
    import re

timeRange = 1 # seconds

# Timestamps should match the following regex
timestampRegex = re.compile('[0-9]*:*[0-9]*:[0-9][0-9]\.*[0-9]*')

# Convert the time from a string 'hh:mm:ss' to a float
def timeToFloat(time):
# We add hours and minutes in case they are missinge
# We reverse the list
# We only take the first three elements of the list anyway
(s, m, h) = (['00', '00'] + time.split(':'))[::-1][0:3]
result = float(h) * 3600 + float(m) * 60 + float(s)
return result

if (len(sys.argv) != 4):
    print('Usage: {} <input file 1> <input file 2> <output file>'.format(sys.argv[0]))
exit(1)

inputFileName1 = sys.argv[1]
inputFileName2 = sys.argv[2]
outputFileName = sys.argv[3]

# Each entry will be a tuple with the time as the first element
# and the row of file2 as the second element
file2Entries = []
with open(inputFileName2, encoding="utf8") as inputFile2:
    csvReader = csv.reader(inputFile2)
    for row in csvReader:

        if len(row) == 4:
            if not timestampRegex.match(row[0]):
                continue
            time = timeToFloat(row[0])
            file2Entries.append((time, row))

file1Entries = []
with open(inputFileName1, encoding="utf8") as inputFile1, open(outputFileName, 'w', encoding="utf8") as outputFile:
csvReader = csv.reader(inputFile1)
# For each row in file1 we look for a match among the entries of file2
for row in csvReader:
        if len(row) == 11:
            if not timestampRegex.match(row[0]):
                # We can't possibly find a match for this line
                outputFile.write(','.join(row + ['NOACT']))
                outputFile.write('\n')
                continue
            time = timeToFloat(row[0])
            # Find the first and last entries of file2 within a range of 1 second
            a = bisect.bisect_right(file2Entries, (time - timeRange,))
            b = bisect.bisect_left(file2Entries, (time + timeRange,))
            # Loop on the possible matches (those within the given range)
            for entry in file2Entries[a:b]:
                if entry[1][2] == row[4]:
                    outputFile.write(','.join(row + [entry[1][3]]))
                    outputFile.write('\n')
                    break;
            else:
                # We haven't found a match in file2
                outputFile.write(','.join(row + ['NOACT']))
                outputFile.write('\n')

但仍然出现一些错误

Answer