如何批量处理包含表单的 Word 文件?

如何批量处理包含表单的 Word 文件?

我有一堆已填写好的 Word 表单,需要将其中的数据导入 Excel/CSV 或其他任何结构化格式。我在网上看到过一次处理单个文件的解决方案,但有没有批量处理的既定方法?

在编写 powershell 脚本之前我想问一下。

答案1

以下是我用 Python 给出的解决方案:

"""
Copyright 2009 Konrads Smelkovs <[email protected]>
UTF8Recoder and UnicodeWriter come from the Python docs (csv module examples)
"""

import sys,os,csv
import win32com.client
import pywintypes


import codecs, cStringIO

class UTF8Recoder:
    """
    Iterator that reads an encoded stream and re-encodes the input to UTF-8.

    f        -- byte stream to read from
    encoding -- name of the codec used to decode "f"

    Each iteration step returns the next item delivered by the codec's
    stream reader, encoded as a UTF-8 byte string.  Iteration stops when
    the underlying reader is exhausted.
    """
    def __init__(self, f, encoding):
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        return self

    def next(self):
        # The builtin next() works with both the Python 2 (.next) and the
        # Python 3 (.__next__) spelling of the reader's iterator protocol,
        # whereas calling self.reader.next() directly is Python 2 only.
        return next(self.reader).encode("utf-8")

    # Python 3 looks for __next__; keep "next" for Python 2 callers.
    __next__ = next

class UnicodeWriter:
    """
    CSV writer for rows of unicode strings.

    Rows are formatted by the csv module into an in-memory buffer as
    UTF-8, then transcoded to the requested encoding and written to the
    file-like object "f".  Extra keyword arguments are handed to
    csv.writer unchanged.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        self.stream = f
        # The csv writer never touches "f" directly; it fills this buffer
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        # Hand the csv module UTF-8 encoded cells
        utf8_cells = []
        for cell in row:
            utf8_cells.append(cell.encode("utf-8"))
        self.writer.writerow(utf8_cells)
        # Pull the formatted line back out of the buffer, decode it and
        # re-encode it for the target stream
        formatted = self.queue.getvalue().decode("utf-8")
        self.stream.write(self.encoder.encode(formatted))
        # Reset the buffer so the next row starts from scratch
        self.queue.truncate(0)

    def writerows(self, rows):
        for record in rows:
            self.writerow(record)

def main():
 if len(sys.argv)<3:
    print "Usage: %s <directory> <outfile.csv>" % sys.argv[0]
    print "Where <directory> - directory containing word docs with forms"
    print "and <outfile.csv> - file where to put results"
    sys.exit(-1)
 directory=os.path.abspath(sys.argv[1])
 wordapp = win32com.client.Dispatch("Word.Application")
 wordapp.Visible=0 # Hide word app
 results=[]
 for docfile in os.listdir(directory):
     thisdocresults=[]
     if docfile.endswith(".doc") or docfile.endswith(".docx"):
         print >> sys.stderr, "Processing %s" % docfile
         worddoc=wordapp.Documents.Open(os.path.join(directory,docfile))
         for i in range(1,worddoc.FormFields.Count+1):
            try:
                form=worddoc.FormFields.Item(i)
                name=form.Name
                value=form.Result
                thisdocresults.append((name,value))
                try:
                    print >>sys.stderr, "%s: %s" % (name,value)
                except UnicodeEncodeError,e:
                    print >>sys.stderr, "Error decoding charset,%s" % e
            except pywintypes.com_error,e:
                print >>sys.stderr, "Exception: %s" % str(e)
         results.append(thisdocresults)
         worddoc.Close()
 csvfile=file(sys.argv[2],"wb")
 csvwriter=UnicodeWriter(csvfile,quoting=csv.QUOTE_ALL)
 for docres in results:
     data=[]
     for (n,v) in docres:
         data.append(v)
     csvwriter.writerow(data)
 wordapp.Quit()

# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__=="__main__":
    main()

将其改写为 PowerShell 脚本很简单。如果有人确实需要,请给我发邮件。

相关内容