在 python 中写入文件时无法对 unicode 字符进行编码

在 python 中写入文件时无法对 unicode 字符进行编码

在这里,我使用 python 从本地存储的 html 文件中提取圣经经文,实际上这个 python 不是我写的,我从某个地方得到的,当我运行 python 代码来提取圣经经文时,我得到类似这种类型的东西,u'Joseph asopo atum aphan Jakob thek-longsi, alangli arjulo, \u201cLahei komat asomarlo?\u201d'它有很多行这种类型,在写入文件时无法正确编码 unicode 字符。在终端上运行时,它可以正确显示并正确编码 unicode 字符。如果有人可以纠正问题,这里是 python 代码,这里是本地存储的 html下载 zip 格式的 html 文件 这是Python文件。此Python代码正在读取文件夹下的html文件example并在文件夹内生成输出文件example/exampleoutputs

import os, sys
import numpy as np
import array as arr
import re
from lxml import etree
from lxml import html
import urllib2
from bs4 import BeautifulSoup
import csv
import codecs
import shutil
class BIBLE_CLASS:

    def __init__(self,path):
        self.bookslist=[]

    def readBookList(self,path):    # Read the data from file
        file=open(path,'r')
        for x in file:
        #   print(x)
            self.bookslist.append(x)
        #print(Bookslist)
        #for i in range(1,len(self.bookslist)):
        #   print(self.bookslist[i])

    def searchBook(self,book):  # Search the book ID in the books file
        #bookcode=0
        for i in range(1,len(self.bookslist)):
            st=str(self.bookslist[i])
            #st=st.upper()
            #skey=book.upper()
            #print(st,'',skey)
            if(st.find(book)!=-1):
                temp=self.bookslist[i]
                index1=temp.index(',')
                #print(temp[0:index1])  
                bookcode=temp[0:index1]
                #print(bookcode)
                return bookcode

    def writeStoryFile(self,path):
        # Writes data into story file
        filext1=['.html']
        print("writeStoryFile")
        dirs=os.listdir(path)
            pattern=re.compile("^v")
        # CREATE A FOLDER WITH PATH+OUTPUTS AS NAME
        outputfile=path+"outputs"
        outpath=os.path.join(path,outputfile)
        if os.path.exists(outpath):
            shutil.rmtree(outpath)
        else:
            os.mkdir(outpath)
        for d in dirs:  #For each directory extract stories from files
            print("Directory name:",d)
            btitle=d[d.rindex('_')+1:]
            print(btitle)
            print('Book code:',o1.searchBook(btitle))   # Book code is extracted
            fullpath=os.path.join(path,d)
            if os.path.isdir(fullpath):
                print("Converting folder",fullpath)
                files=os.listdir(fullpath)
            rno=0
            # GET THE BOOK CODE FROM THE FILE
            btitle=d[d.rindex('_')+1:]   
            #print(btitle)  
            #print('Book code:',o1.searchBook(btitle))
            bcode=o1.searchBook(btitle)
            print(bcode)

            # CREATE STORY LINE FILE FOR EACH FOLDER
            csvfile1=outpath+"/"+d+"story.csv"  # story line filename
            f1 = codecs.open(csvfile1, encoding='utf-8',mode='w')   # Creating story lines file
            csvfile2=outpath+"/"+d+"storyverses.csv"    # story line with verses files
            csvfile3=outpath+"/"+d+"veses.csv"   # only verses      
            f2 = codecs.open(csvfile2, encoding='utf-8',mode='w')
            f3=codecs.open(csvfile3,  encoding='utf-8',mode='w')    
            for f in files:
                fname,fext=os.path.splitext(f)
                rows=[]

                if fext in filext1:
                    print("=================Processing the file",f,"========================")
                    # Process the file
                    print "Stories in the chapter:" 
        #           fullpath=os.path.join(path,file)
                    wpath=os.path.join(fullpath,f)
                    text=open(wpath,"r")
        #           csvfile1=wpath.replace('.html','story.csv') # story line filename
                        html_doc=text.read()    
                        soup = BeautifulSoup(html_doc, 'html.parser',from_encoding="utf-8")
                        h3s = soup.find_all('h3')
                        storyverses=[]
                        storytitles=[]  

                        for h3 in h3s:
                        next_element = h3.find_next()
                        print next_element.text

                        if next_element.find('sup') is None:
                        print("Element is null")
                        required_element=next_element.find_next()
                        if required_element is None:
                            required_element=next_element.next_element.find_next()
                        if required_element.find('sup') is None:
                            break
                        required_element= required_element.find_next('sup')

                        superscript_number = str(required_element.find_next('sup').text)                    
                     #   print bcode,fname,superscript_number,h3.text                           
                        print "========",superscript_number
                        if len(superscript_number)>2:
                            storyverses.append(int(superscript_number[0:1]))
                            sn=superscript_number[0:1]  
                            print sn
                            st=int(bcode),int(fname),int(superscript_number[0:1]), str(h3.text)
                        else:
                            storyverses.append(int(superscript_number))
                            st=int(bcode),int(fname),int(superscript_number), (h3.text).encode('utf-8')

                        storytitles.append(h3.text) 
                        print st    
                        rows.append(st)
                        rno=rno+1   
                        else:

                        superscript_number = str(next_element.find('sup').text)                 
                        if len(superscript_number)>2:
                            storyverses.append(int(superscript_number[0:1]))
                            sn=superscript_number[0:1]  
                            print sn
                            st=int(bcode),int(fname),int(superscript_number[0:1]), str(h3.text)
                        else:
                            storyverses.append(int(superscript_number))
                            st=int(bcode),int(fname),int(superscript_number), str(h3.text)

                        storytitles.append(h3.text) 
                        print st    
                        rows.append(st)
                        rno=rno+1
                        #print storytitles[0]   
                    # Write story lines into file 
                #   f1 = codecs.open(csvfile1, encoding='utf-8',mode='w')   # Creating story lines file
                    for row in rows:               
                        f1.write(str(row))
                        f1.write("\n")

                #   csvfile2=wpath.replace('.html','verse.csv') # story line files
                #       csvfile3=wpath.replace('.html','veses.csv')     
#                       f2=open(csvfile2,"w")
                    verses=[]   
                    verseno=1   
                 #      f2 = codecs.open(csvfile2, encoding='utf-8',mode='w')
                 #      f3=codecs.open(csvfile3,  encoding='utf-8',mode='w')    
                    #f2=codecs.open(csvfile2,mode='w')
                    #f3=codecs.open(csvfile3,mode='w')
                        print "======================================================================================"  
                        k=0 
                        for y in soup.findAll('span',class_=[pattern,"heading"]):
                        sups=y('sup')
                        for z in sups:
                            z.decompose()
                        if verseno in storyverses:
                            st1=str(rows[storyverses.index(verseno)])
                            k=k+1
                            f2.write(st1)
                            f2.write("\n")
                            verses.append(st1)
                            print bcode,fname, verseno,'"'+y.text+'"'
                            st=int(bcode),int(fname), verseno,y.text
                            f2.write(str(st))
                            f2.write("\n")
                            f3.write(str(st))
                            f3.write("\n")
                            verses.append(st)
        #       print st
                        else:
                            print bcode,fname, verseno,'"'+y.text+'"'   
                            st=int(bcode),int(fname),verseno,y.text
                            f2.write(str(st))
                            f2.write("\n")
                            f3.write(str(st))
                            f3.write("\n")
                            verses.append(st)
                        verseno+=1
                #   f1.close()
                #   f2.close()
                #   f3.close()
    def writeVerses(self):  
        print("writeVerses")
        # Writes verses into verse file

    def writebothVerse(self):
        print("writeStoryFile")
        # Writes verse with story in between

# Main program
bookspath="BookCode/books.csv"
path='example'   # Change this to the actual path of the input data

o1=BIBLE_CLASS(path)
# Step 1: Read the book list from the file
o1.readBookList(bookspath)

# Step 2: Create story files
o1.writeStoryFile(path)
filext1=['.html']
dirs=os.listdir(path)

这是一些输出文件的屏幕截图 在此输入图像描述

相关内容