如何从 grep 输出构建 xml 文档

如何从 grep 输出构建 xml 文档

我想将 find | grep 操作的结果转换为结构化 XML 文档,其中文件条目包含文件名、出现次数、行号和行内容。Linux 是否提供任何工具来格式化输出,还是我必须自己编写代码?

答案1

所以我用 Python 尝试了一下,我想我已经想出了一个简单的脚本来完成你想要的事情。这里是:

#!/usr/bin/env python2
# -*- coding: ascii -*-
"""pathlist2xml.py

Takes a list of file-system paths and
generates an XML representation of the
corresponding file-system hierarchy.
"""

import sys
from lxml.etree import Element, SubElement, fromstring, tostring, XMLParser
from xml.sax.saxutils import escape, unescape
from os.path import join, isdir
from posix import lstat
import fileinput

def _ascii_escape(text):
    """Return `text` with backslashes and unprintable or non-ASCII
    characters replaced by backslash escape sequences."""
    try:
        # Python 2 byte strings: dedicated codec
        return text.encode('string-escape')
    except LookupError:
        # Interpreters without the 'string-escape' codec (Python 3):
        # 'unicode_escape' is the closest equivalent for text strings
        return text.encode('unicode_escape').decode('ascii')


def insert_path(xmlroot, path):
    """Add the elements that represent `path` below `xmlroot`.

    `path` is split on '/' and walked one component at a time. A
    component that already has an element (matched on its `name`
    attribute) is reused; otherwise a `directory` or `file` element is
    created, depending on what `isdir` reports for the path so far, and
    it is given `name`, `path` and `inode` attributes.

    Attribute values are stored backslash-escaped but NOT XML-escaped:
    the XML library escapes markup characters itself on serialization,
    so escaping them here as well would emit `&amp;amp;` for `&`.

    Raises OSError if a component that needs a new element cannot be
    lstat()ed; in that case no element is added for that component.
    Returns `xmlroot` (for convenience - it is modified in place).
    """

    # Node cursor, starting at the root node
    xmlcursor = xmlroot

    # Relative path of the component currently being processed
    fullpath = ''

    for path_component in path.split('/'):

        fullpath = join(fullpath, path_component)
        name = _ascii_escape(path_component)

        # Compare attribute values directly instead of splicing the name
        # into an XPath expression: XPath string literals have no escape
        # mechanism, so a quote in a file name would break the query.
        existing = next(
            (child for child in xmlcursor if child.get('name') == name),
            None)

        # `is not None` on purpose: an element without children is falsy
        if existing is not None:
            xmlcursor = existing
            continue

        # Collect everything that touches the file system before the
        # tree is modified, so a failure leaves no half-built element
        tag = "directory" if isdir(fullpath) else "file"
        inode = str(lstat(fullpath).st_ino)

        xmlcursor = SubElement(xmlcursor, tag)
        xmlcursor.set('name', name)
        xmlcursor.set('path', _ascii_escape(fullpath))
        xmlcursor.set('inode', inode)

    return xmlroot

def paths_to_xml(pathlist):
    """Build an XML tree for an iterable of file-system paths.

    Each entry has its surrounding whitespace (such as the trailing
    newline of a line read from a file) and its leading/trailing
    slashes removed, and is then added below a fresh `root` element
    with `insert_path`. Entries that are empty after this clean-up
    (blank lines, a bare '/') are skipped: they contain no path
    component that could be looked up.

    Note that because the leading slash is removed, absolute paths are
    looked up relative to the current working directory.

    Returns the `root` element.
    """

    xmlroot = Element('root')

    for path in pathlist:
        cleaned = path.strip().strip('/')

        # Nothing left to insert for blank lines or '/'
        if not cleaned:
            continue

        insert_path(xmlroot, cleaned)

    return xmlroot

# Script entry point: the paths are read from the files named on the
# command line or from standard input
if __name__ == "__main__":

    # Build the tree from the incoming lines
    xmlroot = paths_to_xml(fileinput.input())

    # Serialize with indentation and write the document to stdout
    xml_text = tostring(xmlroot, pretty_print=True)
    print(xml_text)

这里有一个小示例会话,说明了它在实践中如何工作。首先我创建了一些目录和文件:

mkdir -p /tmp/xmltest
cd /tmp/xmltest
touch file1
touch file2
mkdir dir1
touch dir1/file3
touch dir1/file4
mkdir dir2
mkdir dir2/dir3
touch dir2/dir3/file5

使用 tree 命令查看,这个子层次结构如下所示:

.
├── dir1
│   ├── file3
│   └── file4
├── dir2
│   └── dir3
│       └── file5
├── file1
└── file2

以下是如何使用 find 的输出调用脚本的示例:

find . | pathlist2xml.py

这是生成的 XML 输出:

<root>
  <directory name="." path="." inode="3587802">
    <directory name="dir1" path="./dir1" inode="3587817">
      <file name="file3" path="./dir1/file3" inode="3587818"/>
      <file name="file4" path="./dir1/file4" inode="3587819"/>
    </directory>
    <directory name="dir2" path="./dir2" inode="3587820">
      <directory name="dir3" path="./dir2/dir3" inode="3587821">
        <file name="file5" path="./dir2/dir3/file5" inode="3587822"/>
      </directory>
    </directory>
    <file name="file1" path="./file1" inode="3587815"/>
    <file name="file2" path="./file2" inode="3587816"/>
  </directory>
</root>

这是将 find 与 grep 结合使用的第二个示例:

find . | grep dir2 | pathlist2xml.py

这是第二个示例的输出:

<root>
  <directory name="." path="." inode="3587802">
    <directory name="dir2" path="./dir2" inode="3587820">
      <directory name="dir3" path="./dir2/dir3" inode="3587821">
        <file name="file5" path="./dir2/dir3/file5" inode="3587822"/>
      </directory>
    </directory>
  </directory>
</root>

相关内容