它不一定是 pdf 文档;pdf 可以转换为的其他格式也可以。我并不反对自己制作软件,所以如果有任何现有的库可以做这样的事情或有功能强大的工具来做这件事,那就太好了。
我的想法是建立一个如下形式的字典(映射):
"highlight color" to "word" for multiple colors and words
答案1
最简单的方法是编写自定义字符串查找器,然后对其进行调整以突出显示字符串。
一个多类型文档可编程库是 MuPDF,它在 Windows 上有几个单独的 exe 来处理多种文档类型,尤其是 PDF。
因此,我们可以调用Mutool run find.js "filename.pdf" "string"
并调整它,用颜色突出显示每个找到的字符串。假设我们将此页面的问题作为 PDF 进行搜索,以查找“单词”数量和位置。
....mupdf\1.19.1\mutool.exe" run "find.js" "question.pdf" 1 "word string"
结果如下。注意:这里使用的是 1.19.1 版,它是在 32 位和 64 位 Windows 系统上最后一个使用此语法的版本(1.20+ 中有一处重大变更)。
warning: undefined link destination
Be aware these are from Top Left Top Down in default PDF units (normally 1/72")
Position 1: x = 291.997314453125 y = 90.68339538574219 W = 40.930938720703128 H = 24.240676879882814
Position 2: x = 437.68792724609377 y = 90.68339538574219 W = 40.930938720703128 H = 24.240676879882814
Position 3: x = 188.70480346679688 y = 273.9330749511719 W = 19.30328369140625 H = 8.774993896484375
Position 4: x = 333.4922180175781 y = 273.9330749511719 W = 19.303253173828126 H = 8.774993896484375
Position 5: x = 271.1244812011719 y = 424.8809509277344 W = 19.70745849609375 H = 11.67144775390625
Position 6: x = 349.66943359375 y = 437.03094482421877 W = 19.70745849609375 H = 11.67144775390625
Position 7: x = 168.11524963378907 y = 449.85595703125 W = 21.2091064453125 H = 11.67144775390625
Position 8: x = 243.36720275878907 y = 449.85595703125 W = 21.209121704101564 H = 11.67144775390625
Position 9: x = 407.9195861816406 y = 464.9580993652344 W = 19.303314208984376 H = 8.774993896484375
warning: ... repeated 2 times...
这是第一部分“find.js”的概要。我仅将其设置为按单个页面查找,因为这样可以轻松地在循环中以不同的方式重复使用该函数。较新的版本,例如当前 1.23.0(内置 tesseract 的 Windows zip)可以在以下位置找到https://mupdf.com/releases/index.html
// find.js -- print the position and size of every match of a search string
// on one page of a PDF.
// Usage: mutool run find.js "input.pdf" page# "word string"
// BEWARE: this script was written for the older version 1.19.1, where each
// search hit is a flat array of 8 numbers; see the later script for 1.20+.
// Kept in ES5 (var, string concatenation) because mutool run executes
// scripts with the MuJS interpreter.
if (scriptArgs.length !== 3) {
    print('usage: mutool run Find.js "input.pdf" page# "word string"');
    quit();
}
var doc = new Document(scriptArgs[0], "application/pdf");
// page# is 1-based on the command line. Reject anything that is not a page
// of this document instead of passing NaN or an out-of-range index on.
var pageCount = doc.countPages();
var pageNumber = parseInt(scriptArgs[1], 10);
// Global isNaN is safe here: parseInt always returns a number.
if (isNaN(pageNumber) || pageNumber < 1 || pageNumber > pageCount) {
    print('error: page# must be a whole number from 1 to ' + pageCount);
    quit();
}
var page = doc.loadPage(pageNumber - 1); // loadPage counts pages from 0
// Original author's note: 500 is max_hits and the search is case insensitive
// (NOTE(review): confirm against the MuPDF version in use).
var arr = page.search(scriptArgs[2], 500);
print('\tBe aware these are from Top Left Top Down in default PDF units (normally 1/72")');
var i;
for (i = 0; i < arr.length; i++) {
    var p = arr[i];
    // x, y come from the first pair of numbers; W and H are differences
    // against the second and third pairs of the 8-number hit.
    print('Position ' + (i + 1) + ':\tx = ' + p[0] + '\ty = ' + p[1] + '\tW = ' + (p[2] - p[0]) + '\tH = ' + (p[5] - p[1]));
}
因此,我们只需要把 find.js 升级为 highlights.js。大约一天之后,下面就是一个可以运行的概念验证,
我们可以在 for 循环中对所需的任意数量的页面运行它。
"...mupdf\1.19.1\mutool.exe" run "highlights.js" "question.pdf" 1 "word" "output1.pdf" 1.0 0.0 1.0
"...mupdf\1.19.1\mutool.exe" run "highlights.js" "output1.pdf" 1 "color" "output2.pdf" 1.0 0.5 0.0
highlights.js
// highlights.js -- add a coloured Highlight annotation over every match of a
// search string on one page of a PDF, then save the result as a new file.
// Usage: mutool run highlights.js "input.pdf" page# "word string" "output.pdf" R G B
// THIS VERSION WORKS WITH MuTools 1.20.0 (LAST 32 bit or newer 64 bit) not the
// older version 1.19.1 syntax, which is kept in comments for legacy users.
// Kept in ES5 (var, string concatenation) because mutool run executes
// scripts with the MuJS interpreter.
if (scriptArgs.length !== 7) {
    print('usage: mutool run Highlights.js "input.pdf" page# "word string" "output.pdf" R.# G.# B.# (e.g. orange = 1.0 0.5 0.0)');
    quit();
}
// Command-line arguments arrive as strings: convert the colour components to
// numbers once and refuse values outside 0.0 .. 1.0 before touching the file.
var rgb = [parseFloat(scriptArgs[4]), parseFloat(scriptArgs[5]), parseFloat(scriptArgs[6])];
var c;
for (c = 0; c < rgb.length; c++) {
    // Global isNaN is safe here: parseFloat always returns a number.
    if (isNaN(rgb[c]) || rgb[c] < 0 || rgb[c] > 1) {
        print('error: R G B must each be a number from 0.0 to 1.0 (e.g. orange = 1.0 0.5 0.0)');
        quit();
    }
}
var doc = new Document(scriptArgs[0], "application/pdf");
// page# is 1-based on the command line. Reject anything that is not a page
// of this document instead of passing NaN or an out-of-range index on.
var pageCount = doc.countPages();
var pageNumber = parseInt(scriptArgs[1], 10);
if (isNaN(pageNumber) || pageNumber < 1 || pageNumber > pageCount) {
    print('error: page# must be a whole number from 1 to ' + pageCount);
    quit();
}
var page = doc.loadPage(pageNumber - 1); // loadPage counts pages from 0
// Original author's note: 500 is max_hits and the search is case insensitive
// (NOTE(review): confirm against the MuPDF version in use).
var arr = page.search(scriptArgs[2], 500);
print('\tBe aware these are from Top Left Top Down in default PDF units (normally 1/72")');
var i;
for (i = 0; i < arr.length; i++) {
    var p = arr[i];
    //Version 1.19.1
    //print('Position ' + (i + 1) + ':\tx = ' + p[0] + '\ty = ' + p[1] + '\tW = ' + (p[2] - p[0]) + '\tH = ' + (p[5] - p[1]) );
    //Version 1.20.0
    print('Position ' + (i + 1) + ': Quad=' + p[0]);
    // One Highlight annotation per hit, all in the requested colour.
    var annot = page.createAnnotation("Highlight");
    annot.setColor(rgb);
    //Version 1.19.1
    //annot.setQuadPoints([[p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7]]]);
    //Version 1.20.0
    annot.setQuadPoints(p);
}
// Update the page once after all annotations are added, then write the file.
page.update();
doc.save(scriptArgs[3], "pretty,ascii,compress-images,compress-fonts");
答案2
一种方法是进入菜单并选择“查找”,它会突出显示搜索到的单词,您可以借此找到被突出显示的单词。
点击下面的下载按钮,下载 UPDF 应用程序。在您的设备上启动 UPDF,然后单击“打开文件”按钮导入 PDF 文件。在左侧,您可以单击“注释”,选择“突出显示”工具,选择颜色,然后用该工具拖过要突出显示的文本。一旦松开鼠标,文本就会被突出显示。