我想要获取一份 Microsoft Word 文档并生成一个电子表格,其中包含该文档中包含的所有单词以及每个单词出现的次数。
例如,
cat 23
said 15
jumped 12
dog 7
这是一个简单的问题,可以使用 Word 或 Excel 的内置函数和特性以简单、直接的方式完成吗?
如果没有,那么此功能是否在现成的工具中可用(在这种情况下,请告知我应该在 Software Recs 网站上查询什么),或者是否需要定制编程?
答案1
除了 VBA,还可以使用 OpenOffice 的 API 开发这样的应用程序来读取 Word 文档的内容;处理它并将结果导出为 CSV 文件以在电子表格应用程序中打开。
然而,如果你熟悉任何编程语言,这实际上只是几行代码。例如在 Python 中,你可以轻松地这样做:
这里我们定义一个简单的函数,它计算给定列表的单词数量
def countWords(a_list):
words = {}
for i in range(len(a_list)):
item = a_list[i]
count = a_list.count(item)
words[item] = count
return sorted(words.items(), key = lambda item: item[1], reverse=True)
剩下的就是对文档内容进行操作了。先粘贴一下:
content = """This is the content of the word document. Just copy paste it.
It can be very very very very long and it can contain punctuation
(they will be ignored) and numbers like 123 and 4567 (they will be counted)."""
在这里我们删除标点符号、EOL、括号等,然后为我们的函数生成一个单词列表:
import re
cleanContent = re.sub('[^a-zA-Z0-9]',' ', content)
wordList = cleanContent.lower().split()
然后我们运行我们的函数并将其结果(单词计数对)存储在另一个列表中并打印结果:
result = countWords(wordList)
for words in result:
print(words)
所以结果是:
('very', 4)
('and', 3)
('it', 3)
('be', 3)
('they', 2)
('will', 2)
('can', 2)
('the', 2)
('ignored', 1)
('just', 1)
('is', 1)
('numbers', 1)
('punctuation', 1)
('long', 1)
('content', 1)
('document', 1)
('123', 1)
('4567', 1)
('copy', 1)
('paste', 1)
('word', 1)
('like', 1)
('this', 1)
('of', 1)
('contain', 1)
('counted', 1)
如果需要,您可以使用搜索/替换删除括号和逗号。
您需要做的就是下载Python 3,安装它,打开 IDLE(附带 Python),替换你的 word 文档的内容并按照给定的顺序一次运行命令。
答案2
使用 VBA。宏(子程序)可以准确地执行您的请求这页:
Sub WordFrequency()
Const maxwords = 9000 'Maximum unique words allowed
Dim SingleWord As String 'Raw word pulled from doc
Dim Words(maxwords) As String 'Array to hold unique words
Dim Freq(maxwords) As Integer 'Frequency counter for unique words
Dim WordNum As Integer 'Number of unique words
Dim ByFreq As Boolean 'Flag for sorting order
Dim ttlwds As Long 'Total words in the document
Dim Excludes As String 'Words to be excluded
Dim Found As Boolean 'Temporary flag
Dim j, k, l, Temp As Integer 'Temporary variables
Dim ans As String 'How user wants to sort results
Dim tword As String '
' Set up excluded words
Excludes = "[the][a][of][is][to][for][by][be][and][are]"
' Find out how to sort
ByFreq = True
ans = InputBox("Sort by WORD or by FREQ?", "Sort order", "WORD")
If ans = "" Then End
If UCase(ans) = "WORD" Then
ByFreq = False
End If
Selection.HomeKey Unit:=wdStory
System.Cursor = wdCursorWait
WordNum = 0
ttlwds = ActiveDocument.Words.Count
' Control the repeat
For Each aword In ActiveDocument.Words
SingleWord = Trim(LCase(aword))
'Out of range?
If SingleWord < "a" Or SingleWord > "z" Then
SingleWord = ""
End If
'On exclude list?
If InStr(Excludes, "[" & SingleWord & "]") Then
SingleWord = ""
End If
If Len(SingleWord) > 0 Then
Found = False
For j = 1 To WordNum
If Words(j) = SingleWord Then
Freq(j) = Freq(j) + 1
Found = True
Exit For
End If
Next j
If Not Found Then
WordNum = WordNum + 1
Words(WordNum) = SingleWord
Freq(WordNum) = 1
End If
If WordNum > maxwords - 1 Then
j = MsgBox("Too many words.", vbOKOnly)
Exit For
End If
End If
ttlwds = ttlwds - 1
StatusBar = "Remaining: " & ttlwds & ", Unique: " & WordNum
Next aword
' Now sort it into word order
For j = 1 To WordNum - 1
k = j
For l = j + 1 To WordNum
If (Not ByFreq And Words(l) < Words(k)) _
Or (ByFreq And Freq(l) > Freq(k)) Then k = l
Next l
If k <> j Then
tword = Words(j)
Words(j) = Words(k)
Words(k) = tword
Temp = Freq(j)
Freq(j) = Freq(k)
Freq(k) = Temp
End If
StatusBar = "Sorting: " & WordNum - j
Next j
' Now write out the results
tmpName = ActiveDocument.AttachedTemplate.FullName
Documents.Add Template:=tmpName, NewTemplate:=False
Selection.ParagraphFormat.TabStops.ClearAll
With Selection
For j = 1 To WordNum
.TypeText Text:=Trim(Str(Freq(j))) _
& vbTab & Words(j) & vbCrLf
Next j
End With
System.Cursor = wdCursorNormal
j = MsgBox("There were " & Trim(Str(WordNum)) & _
" different words ", vbOKOnly, "Finished")
End Sub