hyperxmp + xelatex 错误：pdfkeywords 重复

Question 1

2011/06/12 v1.4 版本的 hyperxmp 软件包并未使用 UTF-8 来处理 XMP 数据，而是以 PDFDocEncoding 或 UTF-16BE 给出数据字节。因此，其 trimspaces 宏也会出错。在 XeTeX 下，BOM（字节顺序标记）也会被错误地写入……

由 \pdfstringdef（及其相关命令）生成的字符串（例如 \@pdfauthor）可能采用不同的编码，具体取决于选项设置和驱动程序。\ifHy@unicode 对编码检测没有帮助：即使设置了 pdfencoding=auto，字符串也可能采用 PDFDocEncoding！此外，XeTeX 还是一个例外，其字符串中可能含有大字符（字符代码 > 127）。与所有其他驱动程序不同，当应用程序想要写入任意二进制数据时，XeTeX 只提供了一个非常丑陋、缺乏规范且不完整的接口。

可以通过查找为 Unicode 字符串设置的 BOM 来检测 \pdfstringdef 所生成字符串的编码。此外（对于 XeTeX），还需要检查字符串中是否含有大字符。

下一步是写入 XMP 数据。hyperxmp 软件包使用 UTF-8，因此下面的补丁将字符串编码为 UTF-8，并进行 XML 转义。（直接使用原始的 PDFDocEncoding 或 Unicode 字符串是错误的，因为这两种编码都不同于 UTF-8。）

该补丁还修复了 XeTeX 和 LuaTeX 的 BOM 标记。

% Minimal test document for the hyperxmp patch that follows.
\documentclass{article}

% hyperref is loaded with pdfencoding=auto; the alternative option
% unicode is left commented out.
\usepackage[
  pdfencoding=auto
%  unicode,
]{hyperref}
% Test metadata: the author contains the XML-special characters < and >,
% the title uses \textalpha and \textomega, and the keywords are a
% comma-separated list.
\hypersetup{
    pdfauthor = {Matthias<abc>},
    pdftitle = {MWO\textalpha\textomega},
    pdfkeywords = {Test, MWO, This is a test, Lipsum}
}

% hyperxmp is the package whose internal macros are redefined below;
% ifxetex and ifluatex supply the \ifxetex and \ifluatex switches that
% the patch tests.
\usepackage{hyperxmp}
\usepackage{ltxcmds}[2010/04/26]
\usepackage{ifxetex}
\usepackage{ifluatex}

\makeatletter
% \ifhyxmp@unicodetex starts out false and is switched to true when the
% document is compiled with XeTeX or with LuaTeX.
\newif\ifhyxmp@unicodetex
\ifxetex
  \hyxmp@unicodetextrue
\fi
\ifluatex
  \hyxmp@unicodetextrue
\fi

% \hyxmp@xmlify{<text>}
% Replacement for hyperxmp's \hyxmp@xmlify.  <text> is unescaped with
% \EdefUnescapeString, converted according to whether it starts with a
% BOM (tested by \hyxmp@is@unicode), and the characters <, & and > are
% replaced by the XML entities &lt;, &amp; and &gt;.  The result is
% stored globally in \hyxmp@xmlified.
% Under XeTeX/LuaTeX (\ifhyxmp@unicodetex) the string is converted to
% UTF-32BE and rebuilt character by character; otherwise it is converted
% to UTF-8 and escaped on the level of its hex representation.
\renewcommand*{\hyxmp@xmlify}[1]{%
  \gdef\hyxmp@xmlified{}%
  % Escaped PDF string -> PDFDocEncoding/Unicode
  \EdefUnescapeString\hyxmp@text{#1}%
  \ifhyxmp@unicodetex
    % PDFDocEncoding/Unicode -> UTF-32BE
    \hyxmp@is@unicode\hyxmp@text{%
      \StringEncodingConvert
      \hyxmp@text\hyxmp@text{utf16be}{utf32be}%
    }{%
      % No BOM: XeTeX gets its own treatment because the string may
      % contain characters with codes > 127 (see \hyxmp@xetex@crap).
      \ifxetex
        \hyxmp@xetex@crap
      \else
        \StringEncodingConvert
        \hyxmp@text\hyxmp@text{pdfdoc}{utf32be}%
      \fi
    }%
    % UTF-32BE -> UTF-32BE as hex string
    \EdefEscapeHex\hyxmp@text{\hyxmp@text}%
    % UTF-32BE -> XML in ASCII
    % The \expandafter chain expands \hyxmp@text (the hex string) before
    % the \edef ends, so \hyxmp@text is reset to empty and the hex
    % digits become the arguments of \hyxmp@toxml@unicodetex, which
    % appends its output to \hyxmp@text.  The eight \relax tokens form
    % the final argument group that terminates its loop.
    \edef\hyxmp@text{%
      \expandafter
    }\expandafter\hyxmp@toxml@unicodetex\hyxmp@text
    \relax\relax\relax\relax\relax\relax\relax\relax
  \else
    % PDFDocEncoding/Unicode -> UTF-8
    \hyxmp@is@unicode\hyxmp@text{%
      \StringEncodingConvert
      \hyxmp@text\hyxmp@text{utf16be}{utf8}%
    }{%
      \StringEncodingConvert
      \hyxmp@text\hyxmp@text{pdfdoc}{utf8}%
    }%
    % UTF-8 -> UTF-8 as hex string
    \EdefEscapeHex\hyxmp@text{\hyxmp@text}%
    % UTF-8 as hex string -> XML in UTF-8 as hex string
    % (\hyxmp@toxml reads two hex digits at a time; the two \@empty
    % tokens end its loop)
    \edef\hyxmp@text{%
      \expandafter\hyxmp@toxml\hyxmp@text\@empty\@empty
    }%
    % XML in UTF-8 as hexstring -> XML in UTF-8
    \EdefUnescapeHex\hyxmp@text{\hyxmp@text}%
  \fi
  \global\let\hyxmp@xmlified\hyxmp@text
}
% \hyxmp@is@unicode<macro>{<true>}{<false>}
% Expands <macro> once and selects <true> if the result starts with the
% two characters with codes 254 and 255 (the BOM that marks a Unicode
% string), otherwise <false>.
% The characters < and > in the definitions below are placeholders:
% the \lccode settings together with \lowercase turn them into the
% characters with codes 254 and 255 before the definitions are made.
\begingroup
  \lccode`\<=254 %
  \lccode`\>=255 %
  \catcode254=12 %
  \catcode255=12 %
\lowercase{\endgroup
  \def\hyxmp@is@unicode#1{%
    % The appended pair guarantees that the delimited argument of
    % \hyxmp@@is@unicode always finds its delimiter.
    \expandafter\hyxmp@@is@unicode#1<>\@nil
  }%
  \def\hyxmp@@is@unicode#1<>#2\@nil{%
    % #1 is empty exactly when the string begins with the pair.
    \ifx\\#1\\%
      \expandafter\@firstoftwo
    \else
      \expandafter\@secondoftwo
    \fi
  }%
}
% \hyxmp@toxml@unicodetex<8 hex digits>...
% Loop over a hex string, eight hex digits (one UTF-32BE character) per
% step, appending the corresponding text to \hyxmp@text:
%   * code > 127: the character itself, generated by setting \uccode of
%     * to the code and applying \uppercase;
%   * <, & and >: the entities &lt;, &amp; and &gt;;
%   * space: \space;
%   * anything else: the character generated from the last two digits.
% The loop stops when the first argument is \relax; the caller supplies
% eight \relax tokens so that the last call still gets all arguments.
\def\hyxmp@toxml@unicodetex#1#2#3#4#5#6#7#8{%
  \ifx#1\relax
  \else
    \ifnum"#1#2#3#4#5#6#7#8>127 %
      \uccode`\*="#1#2#3#4#5#6#7#8\relax
      \uppercase{%
        \edef\hyxmp@text{\hyxmp@text *}%
      }%
    \else\ifnum"#7#8=`\< %
      \edef\hyxmp@text{\hyxmp@text &lt;}%
    \else\ifnum"#7#8=`\& %
      \edef\hyxmp@text{\hyxmp@text &amp;}%
    \else\ifnum"#7#8=`\> %
      \edef\hyxmp@text{\hyxmp@text &gt;}%
    \else\ifnum"#7#8=`\ %
      \edef\hyxmp@text{\hyxmp@text\space}%
    \else
      \uccode`\*="#7#8\relax
      \uppercase{%
        \edef\hyxmp@text{\hyxmp@text *}%
      }%
    \fi\fi\fi\fi\fi
    % Continue with the next eight digits once the outer \fi is closed.
    \expandafter\hyxmp@toxml@unicodetex
  \fi
}
% \hyxmp@skipzeros<tokens>
% Removes leading 0 characters: as long as the argument is the
% character 0 the macro calls itself on the next token.
% NOTE(review): the first token that is not 0 is consumed as well (it
% is used up by the \ifx test), and the macro is not referenced anywhere
% else in the visible code -- confirm whether it is still needed.
\def\hyxmp@skipzeros#1{%
  \ifx#10%
    \expandafter\hyxmp@skipzeros
  \fi
}
% \hyxmp@toxml<hex digit><hex digit>...\@empty\@empty
% Expandable loop over a hex string, two hex digits (one byte) per step.
% The bytes for &, < and > are replaced by the hex representation of
% the entities &amp;, &lt; and &gt;; every other byte is passed through
% unchanged.  The loop stops when the first argument is \@empty.
\def\hyxmp@toxml#1#2{%
  \ifx#1\@empty
  \else
    \ifnum"#1#2=`\& %
      26616D703B% &amp;
    \else\ifnum"#1#2=`\< %
      266C743B% &lt;
    \else\ifnum"#1#2=`\> %
      2667743B% &gt;
    \else
      #1#2%
    \fi\fi\fi
    % Continue with the next byte once the outer \fi is closed.
    \expandafter\hyxmp@toxml
  \fi
}

% In case of xetex the strings by \pdfstringdef can contain
% big chars, then the string is treated as Unicode.
%
% \hyxmp@xetex@crap
% Converts \hyxmp@text (a string without BOM) to UTF-32BE in place.
% The string is first copied to \hyxmp@try with its spaces turned into
% "other" characters (\hyxmp@SpaceOther).  \hyxmp@crap@test then sets
% \hyxmp@crap@result to Y if a character with code > 127 occurs.  In
% that case \hyxmp@text is rebuilt by \hyxmp@crap@convert from the
% character codes; otherwise the string is converted from
% PDFDocEncoding with \StringEncodingConvert.
% The helper \x passes a space token as #1 so that a space can be
% placed directly after the control word \hyxmp@text; it serves as the
% sentinel that \hyxmp@SpaceOther expects before \@nil.
\begingroup
\def\x#1{\endgroup
  \def\hyxmp@xetex@crap{%
    \edef\hyxmp@try{%
      \expandafter\hyxmp@SpaceOther\hyxmp@text#1\@nil
    }%
    \let\hyxmp@crap@result=N%
    \expandafter\hyxmp@crap@test\hyxmp@try\relax
    \ifx\hyxmp@crap@result Y%
      \let\hyxmp@text\@empty
      \expandafter\hyxmp@crap@convert\hyxmp@try\relax
    \else
      \StringEncodingConvert
      \hyxmp@text\hyxmp@text{pdfdoc}{utf32be}%
    \fi
  }%
}\x{ }
% \hyxmp@SpaceOther<text><space>\@nil
% Expandable: reproduces <text> with every space token replaced by a
% character with the code of a space and category code 12.  The caller
% has to append one space before \@nil; it acts as the sentinel for the
% space-delimited argument.
% The ~ in the definition is a placeholder: it gets category code 12
% and is mapped to the space character by \lccode and \lowercase.
\begingroup
  \catcode`\~=12 %
  \lccode`\~=`\ %
\lowercase{\endgroup
  \def\hyxmp@SpaceOther#1 #2\@nil{%
    #1%
    % #2 is empty when the space just found was the sentinel.
    \ifx\relax#2\relax
      \expandafter\@gobble
    \else
      ~%
      \expandafter\@firstofone
    \fi
    {\hyxmp@SpaceOther#2\@nil}%
  }%
}
% \hyxmp@crap@test<characters>\relax
% Scans the characters one by one up to \relax.  As soon as a character
% with a code > 127 is found, \hyxmp@crap@result is set to Y and the
% remaining input up to \relax is discarded via \hyxmp@skiptorelax.
% The caller is expected to initialise \hyxmp@crap@result beforehand.
\def\hyxmp@crap@test#1{%
  \ifx#1\relax
  \else
    \ifnum`#1>127 %
      \let\hyxmp@crap@result=Y%
      % The triple \expandafter closes both conditionals before the
      % next macro reads its argument.
      \expandafter\expandafter\expandafter\hyxmp@skiptorelax
    \else
      \expandafter\expandafter\expandafter\hyxmp@crap@test
    \fi
  \fi
}
% \hyxmp@skiptorelax<tokens>\relax: discards everything up to and
% including the next \relax.
\def\hyxmp@skiptorelax#1\relax{}
% \hyxmp@crap@convert<characters>\relax
% Loop over the characters up to \relax.  For each character four
% characters are appended to \hyxmp@text: the character code split into
% four base-256 digits, most significant first (the UTF-32BE layout the
% caller asks for).  A non-zero digit is generated by setting \lccode
% of ! to the value and applying \lowercase; for a zero digit
% \hyxmp@zero is appended instead (presumably because an \lccode of 0
% would leave the ! unchanged).
% \hyxmp@num holds the part of the character code that is not yet
% written; the division and remainder use \intcalcDiv and \intcalcMod.
\def\hyxmp@crap@convert#1{%
  \ifx#1\relax
  \else
    \edef\hyxmp@num{\number`#1}%
    % Digit 1 (most significant)
    \ifnum\hyxmp@num>"FFFFFF %
      \lccode`\!=\intcalcDiv{\hyxmp@num}{\number"1000000}\relax
      \lowercase{\edef\hyxmp@text{\hyxmp@text!}}%
      \edef\hyxmp@num{\intcalcMod{\hyxmp@num}{\number"1000000}}%
    \else
      \edef\hyxmp@text{\hyxmp@text\hyxmp@zero}%
    \fi
    % Digit 2
    \ifnum\hyxmp@num>"FFFF %
      \lccode`\!=\intcalcDiv{\hyxmp@num}{\number"10000}\relax
      \lowercase{\edef\hyxmp@text{\hyxmp@text!}}%
      \edef\hyxmp@num{\intcalcMod{\hyxmp@num}{\number"10000}}%
    \else
      \edef\hyxmp@text{\hyxmp@text\hyxmp@zero}%
    \fi
    % Digit 3
    \ifnum\hyxmp@num>"FF %
      \lccode`\!=\intcalcDiv{\hyxmp@num}{\number"100}\relax
      \lowercase{\edef\hyxmp@text{\hyxmp@text!}}%
      \edef\hyxmp@num{\intcalcMod{\hyxmp@num}{\number"100}}%
    \else
      \edef\hyxmp@text{\hyxmp@text\hyxmp@zero}%
    \fi
    % Digit 4 (least significant)
    \ifnum\hyxmp@num>0 %
      \lccode`\!=\hyxmp@num\relax
      \lowercase{\edef\hyxmp@text{\hyxmp@text!}}%
    \else
      \edef\hyxmp@text{\hyxmp@text\hyxmp@zero}%
    \fi
    % Continue with the next character once the outer \fi is closed.
    \expandafter\hyxmp@crap@convert
  \fi
}
% \hyxmp@zero expands to a single character with character code 0 and
% category code 12.  The category code change is kept local to the
% group; the definition itself is global.
\begingroup
  \catcode`\^^@=12\relax
  \gdef\hyxmp@zero{^^@}%
\endgroup

% Debugging aid: run \hyxmp@xetex@crap on a sample string with
% non-ASCII characters and trace the macro expansions in the log file.
% The check is guarded by \ifxetex because \hyxmp@xetex@crap is a
% XeTeX-only code path (\hyxmp@xmlify calls it only inside \ifxetex) and
% it processes the string one character token at a time.  On 8-bit
% engines the non-ASCII sample characters are multi-byte sequences, so
% running the check there is at best meaningless and may raise errors.
\ifxetex
  \def\hyxmp@text{aö€x}
  \tracingmacros=1
  \hyxmp@xetex@crap
  \tracingmacros=0
\fi

% \hyxmp@list@to@xml{<dc element>}{<rdf container>}{<list macro>}
% Replacement for hyperxmp's \hyxmp@list@to@xml.  Does nothing when #3
% is \@empty.  Otherwise a <dc:#1> element containing an <rdf:#2>
% container is added to the XML data via \hyxmp@add@to@xml.  #3 is
% first passed through \hyxmp@xmlify; the result \hyxmp@xmlified is
% handed to \hyxmp@commas@to@list, and each \@elt entry of the
% resulting \hyxmp@list is written as one <rdf:li> item.
% NOTE(review): \hyxmp@add@to@xml and \hyxmp@commas@to@list are not
% defined in this file; presumably they come from hyperxmp, and the
% underscores are its notation for indentation -- confirm there.
\renewcommand*{\hyxmp@list@to@xml}[3]{%
  \ifx#3\@empty
  \else
    \hyxmp@add@to@xml{%
_________<dc:#1>^^J%
____________<rdf:#2>^^J%
    }%
    \bgroup
      \hyxmp@xmlify{#3}%
      \hyxmp@commas@to@list\hyxmp@list{\hyxmp@xmlified}%
      \def\@elt##1{%
        \hyxmp@add@to@xml{%
_______________<rdf:li>##1</rdf:li>^^J%
        }%
      }%
      \hyxmp@list
    \egroup
    \hyxmp@add@to@xml{%
____________</rdf:#2>^^J%
_________</dc:#1>^^J%
    }%
  \fi
}

% \hyxmp@bom: the byte order mark that \hyxmp@construct@packet puts
% into the begin attribute of the xpacket header.
% Under XeTeX/LuaTeX (\ifhyxmp@unicodetex) it is the single character
% with code "FEFF, generated from ! via \lccode and \lowercase.
% Otherwise it consists of the three bytes EF BB BF, which are given
% category code 12 for the definition.  The category and lowercase code
% changes are local to the group; \hyxmp@bom is defined globally.
\begingroup
  \ifhyxmp@unicodetex
    \lccode`\!="FEFF %
    \lowercase{%
      \gdef\hyxmp@bom{!}
    }%
  \else
    \catcode`\^^ef=12
    \catcode`\^^bb=12
    \catcode`\^^bf=12
    \gdef\hyxmp@bom{^^ef^^bb^^bf}%
  \fi
\endgroup

% \hyxmp@construct@packet
% Rebuilds the XMP packet in \hyxmp@xml: clears \hyxmp@xml, adds the
% xpacket header (with \hyxmp@bom as the value of the begin attribute)
% and the opening x:xmpmeta and rdf:RDF tags, calls the schema macros
% for pdf, xmpRights, dc, photoshop and mm, and finally adds the
% closing tags, \hyxmp@padding and the xpacket end marker.
% NOTE(review): \hyxmp@add@to@xml, \hyxmp@hash, \hyxmp@padding and the
% \hyxmp@...@schema macros are not defined in this file; presumably
% they are provided by hyperxmp.
\def\hyxmp@construct@packet{%
  \gdef\hyxmp@xml{}%
  \hyxmp@add@to@xml{<?xpacket begin="\hyxmp@bom" %
id="W5M0MpCehiHzreSzNTczkc9d"?>^^J%
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="3.1-702">^^J%
___<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns\hyxmp@hash">^^J%
  }%
  \hyxmp@pdf@schema
  \hyxmp@xmpRights@schema
  \hyxmp@dc@schema
  \hyxmp@photoshop@schema
  \hyxmp@mm@schema
  \hyxmp@add@to@xml{%
___</rdf:RDF>^^J%
</x:xmpmeta>^^J%
\hyxmp@padding
<?xpacket end="w"?>^^J%
  }%
}

% \hyxmp@reencode{<text>}
% Redefined as a no-op: the argument is read and discarded.
\renewcommand*{\hyxmp@reencode}[1]{%
  % dummy only used for \@pdfmetalang in \begin{document}
}
% At \begin{document}, and only if hyperref is loaded, set up
% \@pdfmetalang: it becomes \hyxmp@x@default when \@pdflang is empty,
% otherwise the expansion of \@pdflang.  Afterwards \hyxmp@xmlify is
% run on it, which leaves its result in \hyxmp@xmlified.
% NOTE(review): \@pdfmetalang itself is not updated from
% \hyxmp@xmlified here -- confirm that this is intended.
\AtBeginDocument{%
  \@ifpackageloaded{hyperref}{%
    \ifx\@pdflang\@empty
      \let\@pdfmetalang=\hyxmp@x@default
    \else
      \edef\@pdfmetalang{\@pdflang}%
    \fi
    \hyxmp@xmlify\@pdfmetalang
  }{}%
}
% End of the patch: @ is no longer a letter from here on.
\makeatother

% Filler text for the body of the test document.
\usepackage{lipsum}

\begin{document}
\lipsum[1-3]
\end{document}

我没有测试所有可能的情况和驱动程序，因此可能存在一些未发现的问题。但也许补丁有助于改进软件包。

Answer

2011/06/12 v1.4 版本的 hyperxmp 软件包并未使用 UTF-8 来处理 XMP 数据，而是以 PDFDocEncoding 或 UTF-16BE 给出数据字节。因此，其 trimspaces 宏也会出错。在 XeTeX 下，BOM（字节顺序标记）也会被错误地写入……

由 \pdfstringdef（及其相关命令）生成的字符串（例如 \@pdfauthor）可能采用不同的编码，具体取决于选项设置和驱动程序。\ifHy@unicode 对编码检测没有帮助：即使设置了 pdfencoding=auto，字符串也可能采用 PDFDocEncoding！此外，XeTeX 还是一个例外，其字符串中可能含有大字符（字符代码 > 127）。与所有其他驱动程序不同，当应用程序想要写入任意二进制数据时，XeTeX 只提供了一个非常丑陋、缺乏规范且不完整的接口。