我正在尝试从外部 PDF 中获取链接及其目的地。使用的是 TL2018 中的 LuaTeX 1.07 版本。文档中提到了 getAnnots 和 getLinks,但在此版本中它们未定义。
<Object> = <Page>:getAnnots()
<Links> = <Page>:getLinks(<Catalog>)
错误:
attempt to call method 'getLinks' (a nil value)
attempt to call method 'getAnnots' (a nil value)
我可以使用 <PDFDoc>:getLinks() 来获取一个 Links 对象,但是它仅支持 getNumLinks 这一个方法。我检查了 <Catalog>:getDests(),但它返回的对象是 nil。
<Catalog>:findDest(<name>) 可以找到目的地,但我需要事先知道所有名称。有没有办法找出某个对象有哪些可用的方法?我该如何获取链接及其目的地?任何提示都会非常有帮助。
--- Exploratory dump of what the epdf bindings expose about links.
-- Prints the kind of the named destination "a1.label", the page count and,
-- for every page, the catalog's Dests object and the number of links.
-- @param pdf  file name handed to epdf.open
local function parse_links (pdf)
  local doc = epdf.open(pdf)
  -- Fail with a readable message instead of "attempt to index a nil value".
  if not doc then error(([[Unable to open "%s".]]):format(pdf)) end
  local cat = doc:getCatalog()
  local dest = cat:findDest("a1.label")
  print (dest:getKindName()) -- output "FitH"
  local pages_num = doc:getNumPages()
  print ('Pages: ' .. pages_num)
  for page_num = 1, pages_num do
    -- `page` is only needed by the two failing calls kept below for reference.
    local page = cat:getPage(page_num)
    -- local links = page:getLinks(cat) -- error "attempt to call method 'getLinks' (a nil value)"
    -- local annots = page:getAnnots() -- error "attempt to call method 'getAnnots' (a nil value)"
    local links = doc:getLinks(page_num)
    local dests = cat:getDests()
    print (dests, dests:isNull()) -- output "Object: 002DE700 true"
    -- Declared local: the original assignment created a global by accident.
    local num_links = links:getNumLinks()
    print ("num links:", num_links)
  end
end
输出:
FitH
Pages: 3
Object: 00B3DF20 true
num links: 2
Object: 00B3DF20 true
num links: 3
Object: 00B3DF20 true
num links: 3
答案1
要访问页面上的所有链接,您可以使用 getAnnotsObject,因为每个链接都是一个注释。未在文档中记录的 getAnnotsObject 与文档中记录的 getAnnots 类似,只是它确实存在。注意:它返回的是一个包含数组的 <Object>,而不是 Annots 类型的对象。
然后,您必须手动解析 Annots 以确定哪些注释是链接:只需查看注释字典的 Subtype 字段。如果您找到子类型为 Link 的注释字典,则可以尝试确定目的地。根据 PDF 规范,目的地要么在 Dest 中,要么在 A 中的动作字典里。A 也可以包含其他类型的动作,但我们忽略这些。此外,有些目的地可能是直接指定的、没有名称,这些目的地不容易包装成 LinkDest 对象:
--- Work out where a link annotation points to.
-- Reads /Dest first and falls back to /D of a /GoTo action stored under /A.
-- A string or name destination is resolved through the catalog; anything
-- else is handed back as the raw object that was found.
-- @param cat    catalog used for findDest
-- @param annot  annotation dictionary whose /Subtype is /Link
local function link_destination(cat, annot)
  local target = annot:lookup("Dest")
  if target:isNull() then
    local action = annot:lookup("A"):getDict()
    if action and action:lookup("S"):getName() == "GoTo" then
      target = action:lookup("D")
    end
  end
  if target:isString() or target:isName() then
    return cat:findDest(target:getString() or target:getName())
  end
  -- Here we hold a direct reference to the destination array.
  -- This is *NOT* a LinkDest object, but all the information could be
  -- extracted manually. This case will not appear with LaTeX/hyperref
  -- documents AFAICT, so it is passed through untouched.
  return target
end

--- Print the rectangle and destination of every link annotation in a PDF.
-- @param pdf  file name handed to epdf.open
local function parse_links (pdf)
  local doc = epdf.open(pdf)
  local cat = doc:getCatalog()
  for page_num = 1, doc:getNumPages() do
    print("Analysing page " .. page_num .. ".")
    -- getArray() is expected to yield nothing when the page has no /Annots array.
    local annots = cat:getPage(page_num):getAnnotsObject():getArray()
    if annots then
      for i = 1, annots:getLength() do
        local entry = annots:get(i)
        local annot = entry and entry:getDict()
        if annot and annot:lookup("Subtype"):getName() == "Link" then
          local dest = link_destination(cat, annot)
          local rect = annot:lookup("Rect"):getArray()
          -- Position on the page followed by the destination object.
          print(rect:get(1):getNum(), rect:get(2):getNum(), rect:get(3):getNum(), rect:get(4):getNum(), dest)
        end
      end
    end
  end
end
正如您所见,epdf 库用起来非常冗长,感觉好像缺少了一半的实现。此外,有些事情甚至可能根本无法实现:例如,您可以通过 getDests 访问 Dests 列表,但现代 PDF 文档将其目的地保存在名称树中,而不是在 Dests 中。似乎没有任何方法可以直接访问名称树。
对于当前版本的 LuaTeX,epdf 已被 pdfe 取代。该库在较低级别上工作:它对注释、链接或名称树一无所知,而是提供对原始 PDF 对象的访问,您必须自己解释它们。例如,可以使用 pdfe 列出所有链接及其目的地:
--- Look up a key in a PDF name tree.
-- A PDF name tree is a tree of arrays holding name/object pairs, where each
-- node records the range of names it is responsible for in /Limits.
-- For details, read the PDF specification.
-- @param tree  name tree node (root or kid); nil is treated as an empty tree
-- @param name  key to search for
-- @return the object stored under `name`; `false` if this node is
--   responsible for `name` but does not contain it; nothing if `name` lies
--   outside the node's /Limits
local function lookup_name(tree, name)
  if not tree then return false end
  local limits = tree.Limits
  if limits and (name < limits[1] or name > limits[2]) then
    return
  end
  -- Leaf node: flat array of alternating keys and values.
  local names = tree.Names
  if names then
    for i = 1, #names, 2 do
      if names[i] == name then return names[i + 1] end
    end
    return false
  end
  -- Intermediate node. A node carrying neither /Names nor /Kids is treated
  -- as empty instead of raising on `#nil`.
  local kids = tree.Kids
  if kids then
    for i = 1, #kids do
      local found = lookup_name(kids[i], name)
      -- nil means "outside this kid's limits": keep searching the siblings.
      if found ~= nil then
        return found
      end
    end
  end
  return false
end
-- Render one destination coordinate; a missing (null) entry means the viewer
-- keeps its current value.
local function dest_coord(v)
  if type(v) == "number" then return ("%f"):format(v) end
  return "unchanged"
end

-- Render the page part of a description, coping with an unresolved page.
local function dest_page(p)
  if p then return ("page %i"):format(p) end
  return "an unknown page"
end

-- How to print each destination kind. Every worker receives the page number
-- (or nil) and the destination array [page /Kind args...].
local dest_to_string_workers = {
  XYZ = function(p, d)
    return ("%s at position (%s,%s), zoom factor %s"):format(dest_page(p), dest_coord(d[3]), dest_coord(d[4]), d[5] or "none")
  end,
  Fit = function(p)
    return ("%s, fit page to window"):format(dest_page(p))
  end,
  FitH = function(p, d)
    return ("%s at top %s, fit width to window"):format(dest_page(p), dest_coord(d[3]))
  end,
  FitV = function(p, d)
    return ("%s at left %s, fit height to window"):format(dest_page(p), dest_coord(d[3]))
  end,
  FitR = function(p, d)
    return ("%s, fit rectangle (%s,%s,%s,%s) to window"):format(dest_page(p), dest_coord(d[3]), dest_coord(d[4]), dest_coord(d[5]), dest_coord(d[6]))
  end,
  FitB = function(p)
    return ("%s, fit bounding box to window"):format(dest_page(p))
  end,
  FitBH = function(p, d)
    return ("%s at top %s, fit bounding box width to window"):format(dest_page(p), dest_coord(d[3]))
  end,
  FitBV = function(p, d)
    return ("%s at left %s, fit bounding box height to window"):format(dest_page(p), dest_coord(d[3]))
  end,
}

--- Describe a destination in human-readable form.
-- @param d     document whose `Pages` list is searched for the target page
-- @param dest  destination array, or a dictionary holding that array under D;
--   nil/false (e.g. a failed name lookup) yields an empty string
-- @return description string; empty for unknown destination kinds
local function dest_to_string(d, dest)
  if not dest then return "" end
  if dest.D then dest = dest.D end
  -- Pages are matched by comparing their string representations, as in the
  -- original code.
  local target = tostring(dest[1])
  local pages = d.Pages
  local page_no
  for i = 1, #pages do
    if tostring(pages[i]) == target then
      page_no = i
      break
    end
  end
  local worker = dest_to_string_workers[dest[2]]
  if not worker then return "" end
  return worker(page_no, dest)
end
--- Print every link annotation of a PDF together with its destination.
-- Named destinations are resolved through the catalog's /Names /Dests name
-- tree, falling back to the catalog's legacy /Dests dictionary.
-- @param filename  path of the PDF, handed to pdfe.open
-- Raises an error if the file cannot be opened.
function show_links(filename)
  local d = pdfe.open(filename)
  if not d then error(([[Unable to open "%s".]]):format(filename)) end
  local catalog = d.Catalog
  -- Either source of named destinations may be absent from a document.
  local name_tree = catalog.Names and catalog.Names.Dests
  local dest_dict = catalog.Dests
  local pages = d.Pages
  for i = 1, #pages do
    local annots = pages[i].Annots
    if annots then -- We have annotations on this page
      for j = 1, #annots do
        local annot = annots[j]
        if annot.Subtype == "Link" then -- We found a link
          local dest = annot.Dest or (annot.A and annot.A.S == "GoTo" and annot.A.D)
          if dest then
            local prefix = "On page " .. i .. " at position (" .. annot.Rect[1] .. "," .. annot.Rect[2] .. ") we found a link with destination "
            if type(dest) == 'string' then
              local dest_name = dest
              dest = name_tree and lookup_name(name_tree, dest_name)
              if not dest and dest_dict then
                dest = dest_dict[dest_name]
              end
              if dest then
                print(prefix .. "\"" .. dest_name .. "\" on " .. dest_to_string(d, dest))
              else
                -- The name is not defined anywhere; report it instead of crashing.
                print(prefix .. "\"" .. dest_name .. "\" which could not be resolved")
              end
            else
              print(prefix .. "on " .. dest_to_string(d, dest))
            end
          else
            print[[This destination is not supported...]]
          end
        end
      end
    end
  end
end
pdfe 还允许遍历名称树,以列出所有目的地的名称:
--- Print every key stored in a PDF name tree, in tree order.
-- @param tree  name tree node (root or kid); nil is treated as an empty tree
local function show_name_list(tree)
  if not tree then return end
  -- Leaf node: keys sit at the odd positions of the /Names array.
  local names = tree.Names
  if names then
    for i = 1, #names, 2 do
      print(names[i])
    end
    return
  end
  -- Intermediate node; a node with neither /Names nor /Kids prints nothing
  -- instead of raising on `#nil`.
  local kids = tree.Kids
  if kids then
    for i = 1, #kids do
      show_name_list(kids[i])
    end
  end
end
--- Print the names of all destinations in a PDF's /Names /Dests name tree.
-- Prints nothing when the document has no such tree.
-- @param filename  path of the PDF, handed to pdfe.open
-- Raises an error if the file cannot be opened.
function show_dests(filename)
  local d = pdfe.open(filename)
  if not d then error(([[Unable to open "%s".]]):format(filename)) end
  -- /Names and /Names/Dests are both optional, so avoid indexing nil.
  local names = d.Catalog.Names
  local tree = names and names.Dests
  if tree then
    show_name_list(tree)
  end
end