如何解析文件

Question

下面这个 Python 脚本应该可以满足你的需求：

#!/usr/bin/env python
# -*- encoding: ascii -*-
"""parse.py

Parses a custom-format data-file.
Processes the file first and then prints the results.

The input consists of blocks delimited by "Start" and "End" lines
(case-insensitive).  Inside a block, non-blank rows come in consecutive
label/value pairs: a row of whitespace-separated labels followed by a row
with the same number of whitespace-separated values.

The output is a table with one column per label and one row per block;
a block that has no value for a label shows "NA" in that column.
"""

import collections
import sys


def parse_blocks(lines):
    """Collect the values of every label from an iterable of text lines.

    Args:
        lines: Any iterable of strings (an open text file works).

    Returns:
        A ``(labels, block)`` tuple.  ``labels`` maps each label, in order
        of first appearance, to a dict holding the widest cell seen for
        that column under the key ``"cols"`` and the value of each block
        under the block's zero-based integer index.  ``block`` is the
        number of completed blocks.

    Raises:
        Exception: With an "Invalid File Format: ..." message when a
            "Start" appears inside an open block, an "End" appears without
            a matching "Start", or a value row does not have the same
            number of columns as its label row.
    """
    # An ordered mapping keeps the output columns in first-seen order on
    # every Python version, not only where plain dicts preserve order.
    labels = collections.OrderedDict()

    # Rows of the block currently being read; in a well-formed file the
    # bottom entry is the "Start" line.
    stack = []

    # Number of completed blocks; also the index of the block being read.
    block = 0

    for raw in lines:
        line = raw.strip()
        keyword = line.lower()

        # The stack should be empty when we start a new block
        if keyword == "start":
            if stack:
                raise Exception("Invalid File Format: Bad Start")
            stack.append(line)

        # The bottom of the stack should be a "Start".  An empty stack
        # means there is no open block at all, which is the same error
        # (and must not surface as an IndexError).  Reaching the end of a
        # block empties the stack and increments the block counter.
        elif keyword == "end":
            if not stack or stack[0].lower() != "start":
                raise Exception("Invalid File Format: Bad End")
            block += 1
            stack = []

        # Other non-blank lines come in consecutive label/value pairs
        elif line:

            # An odd number of data rows above the bottom entry means the
            # previous row was a label row, so this one holds its values.
            if len(stack[1:]) % 2 == 1:
                _labels = stack[-1].split()
                _values = line.split()

                if len(_labels) != len(_values):
                    raise Exception(
                        "Invalid File Format: Label/Value Mismatch")

                stack.append(line)
                for label, value in zip(_labels, _values):
                    # New labels start with a column as wide as the label
                    entry = labels.setdefault(label, {"cols": len(label)})

                    # Record the value for the current block
                    entry[block] = value

                    # Track the widest cell so the output can be aligned
                    entry["cols"] = max(entry["cols"], len(value))

            # An even number of data rows means this is a label row
            else:
                stack.append(line)

    return labels, block


def format_table(labels, block):
    """Render the parsed data as a list of left-aligned text rows.

    Args:
        labels: The mapping returned by ``parse_blocks``.
        block: The number of completed blocks to render.

    Returns:
        A list of strings: the header row followed by one row per block.
        Every cell is padded to the widest entry of its column plus one
        space; missing values are shown as "NA".
    """
    cell = "{0: <{width}}"
    widths = [(label, labels[label]["cols"] + 1) for label in labels]

    table = ["".join(cell.format(label, width=width)
                     for label, width in widths)]
    for i in range(block):
        table.append("".join(
            cell.format(labels[label].get(i, "NA"), width=width)
            for label, width in widths))
    return table


def main(argv=None):
    """Parse the file named by the first argument and print the table.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 2 when no file name was given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write("usage: python parse.py FILE\n")
        return 2

    # The context manager closes the file even if parsing raises
    with open(argv[1], 'r') as handle:
        labels, block = parse_blocks(handle)

    for row in format_table(labels, block):
        print(row)
    return 0


if __name__ == "__main__":
    sys.exit(main())

你可以像这样运行它：

python parse.py file1.txt

它会对示例数据生成以下输出：

label1 label2 label3 label4 label5 label6 label7
value1 value2 value3 value4 value5 value6 value7
valueA valueB NA     valueD valueE valueF NA

Answer 1

下面这个 Python 脚本应该可以满足你的需求：

#!/usr/bin/env python
# -*- encoding: ascii -*-
"""parse.py

Parses a custom-format data-file.
Processes the file first and then prints the results.

The input consists of blocks delimited by "Start" and "End" lines
(case-insensitive).  Inside a block, non-blank rows come in consecutive
label/value pairs: a row of whitespace-separated labels followed by a row
with the same number of whitespace-separated values.

The output is a table with one column per label and one row per block;
a block that has no value for a label shows "NA" in that column.
"""

import collections
import sys


def parse_blocks(lines):
    """Collect the values of every label from an iterable of text lines.

    Args:
        lines: Any iterable of strings (an open text file works).

    Returns:
        A ``(labels, block)`` tuple.  ``labels`` maps each label, in order
        of first appearance, to a dict holding the widest cell seen for
        that column under the key ``"cols"`` and the value of each block
        under the block's zero-based integer index.  ``block`` is the
        number of completed blocks.

    Raises:
        Exception: With an "Invalid File Format: ..." message when a
            "Start" appears inside an open block, an "End" appears without
            a matching "Start", or a value row does not have the same
            number of columns as its label row.
    """
    # An ordered mapping keeps the output columns in first-seen order on
    # every Python version, not only where plain dicts preserve order.
    labels = collections.OrderedDict()

    # Rows of the block currently being read; in a well-formed file the
    # bottom entry is the "Start" line.
    stack = []

    # Number of completed blocks; also the index of the block being read.
    block = 0

    for raw in lines:
        line = raw.strip()
        keyword = line.lower()

        # The stack should be empty when we start a new block
        if keyword == "start":
            if stack:
                raise Exception("Invalid File Format: Bad Start")
            stack.append(line)

        # The bottom of the stack should be a "Start".  An empty stack
        # means there is no open block at all, which is the same error
        # (and must not surface as an IndexError).  Reaching the end of a
        # block empties the stack and increments the block counter.
        elif keyword == "end":
            if not stack or stack[0].lower() != "start":
                raise Exception("Invalid File Format: Bad End")
            block += 1
            stack = []

        # Other non-blank lines come in consecutive label/value pairs
        elif line:

            # An odd number of data rows above the bottom entry means the
            # previous row was a label row, so this one holds its values.
            if len(stack[1:]) % 2 == 1:
                _labels = stack[-1].split()
                _values = line.split()

                if len(_labels) != len(_values):
                    raise Exception(
                        "Invalid File Format: Label/Value Mismatch")

                stack.append(line)
                for label, value in zip(_labels, _values):
                    # New labels start with a column as wide as the label
                    entry = labels.setdefault(label, {"cols": len(label)})

                    # Record the value for the current block
                    entry[block] = value

                    # Track the widest cell so the output can be aligned
                    entry["cols"] = max(entry["cols"], len(value))

            # An even number of data rows means this is a label row
            else:
                stack.append(line)

    return labels, block


def format_table(labels, block):
    """Render the parsed data as a list of left-aligned text rows.

    Args:
        labels: The mapping returned by ``parse_blocks``.
        block: The number of completed blocks to render.

    Returns:
        A list of strings: the header row followed by one row per block.
        Every cell is padded to the widest entry of its column plus one
        space; missing values are shown as "NA".
    """
    cell = "{0: <{width}}"
    widths = [(label, labels[label]["cols"] + 1) for label in labels]

    table = ["".join(cell.format(label, width=width)
                     for label, width in widths)]
    for i in range(block):
        table.append("".join(
            cell.format(labels[label].get(i, "NA"), width=width)
            for label, width in widths))
    return table


def main(argv=None):
    """Parse the file named by the first argument and print the table.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 2 when no file name was given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write("usage: python parse.py FILE\n")
        return 2

    # The context manager closes the file even if parsing raises
    with open(argv[1], 'r') as handle:
        labels, block = parse_blocks(handle)

    for row in format_table(labels, block):
        print(row)
    return 0


if __name__ == "__main__":
    sys.exit(main())

你可以像这样运行它：

python parse.py file1.txt

它会对示例数据生成以下输出：

label1 label2 label3 label4 label5 label6 label7
value1 value2 value3 value4 value5 value6 value7
valueA valueB NA     valueD valueE valueF NA

如何解析文件

答案1

相关内容