lstlisting 报错：String contains an invalid utf-8 sequence（字符串包含无效的 UTF-8 序列）

Question

lualatex 似乎不接受 U+FFFD，即使它已被正确编码为 UTF-8；这看起来是个错误。不过无论如何，您可以使用输入缓冲区回调（process_input_buffer）把它替换掉，下面的示例将其改为 `?`：

% Minimal example: replace every U+FFFD character in the input with "?"
% before TeX processes the line, using the process_input_buffer callback.
\documentclass{article}



\begin{document}

% zzz receives one input line (buff) and returns it with each U+FFFD
% (written here in TeX's ^^^^fffd notation) replaced by "?".
% The function is then registered on process_input_buffer under the name "zzz".
\directlua{
function zzz(buff)
 return string.gsub(buff,"^^^^fffd","?")
end
luatexbase.add_to_callback ( "process_input_buffer", zzz, "zzz" )
}

% Test line ending in the problematic character.
hmmm�

\end{document}

查看源代码就能清楚地看出原因：如果无法解码输入流，该函数返回 0xFFFD；而只要返回值是 0xFFFD，就会报告“无效序列”错误。这意味着，即使输入中的 U+FFFD 字符本身编码正确，也同样会触发这个错误。

unsigned str2uni(const unsigned char *k)
{
    register int ch;
    int val = 0xFFFD;
    const unsigned char *text = k;
    if ((ch = *text++) < 0x80) {
        val = (unsigned) ch;
    } else if (ch <= 0xbf) {    /* error */
    } else if (ch <= 0xdf) {
        if (*text >= 0x80 && *text < 0xc0)
            val = (unsigned) (((ch & 0x1f) << 6) | (*text++ & 0x3f));
    } else if (ch <= 0xef) {
        if (*text >= 0x80 && *text < 0xc0 && text[1] >= 0x80 && text[1] < 0xc0) {
            val = (unsigned)
                (((ch & 0xf) << 12) | ((text[0] & 0x3f) << 6) |
                 (text[1] & 0x3f));
        }
    } else if (ch <= 0xf7) {
        int w = (((ch & 0x7) << 2) | ((text[0] & 0x30) >> 4)) - 1, w2;
        w = (w << 6) | ((text[0] & 0xf) << 2) | ((text[1] & 0x30) >> 4);
        w2 = ((text[1] & 0xf) << 6) | (text[2] & 0x3f);
        val = (unsigned) (w * 0x400 + w2 + 0x10000);
        if (*text < 0x80 || text[1] < 0x80 || text[2] < 0x80 ||
            *text >= 0xc0 || text[1] >= 0xc0 || text[2] >= 0xc0)
            val = 0xFFFD;
    } else {
        /* the 5- and 6-byte UTF-8 sequences generate integers
           that are outside of the valid UCS range, and therefore
           unsupported
         */
    }

    if (val == 0xFFFD)
        utf_error();

    return (val);
}

Answer 1

lualatex 似乎不接受 U+FFFD，即使它已被正确编码为 UTF-8；这看起来是个错误。不过无论如何，您可以使用输入缓冲区回调（process_input_buffer）把它替换掉，下面的示例将其改为 `?`：

% Minimal example: replace every U+FFFD character in the input with "?"
% before TeX processes the line, using the process_input_buffer callback.
\documentclass{article}



\begin{document}

% zzz receives one input line (buff) and returns it with each U+FFFD
% (written here in TeX's ^^^^fffd notation) replaced by "?".
% The function is then registered on process_input_buffer under the name "zzz".
\directlua{
function zzz(buff)
 return string.gsub(buff,"^^^^fffd","?")
end
luatexbase.add_to_callback ( "process_input_buffer", zzz, "zzz" )
}

% Test line ending in the problematic character.
hmmm�

\end{document}

查看源代码就能清楚地看出原因：如果无法解码输入流，该函数返回 0xFFFD；而只要返回值是 0xFFFD，就会报告“无效序列”错误。这意味着，即使输入中的 U+FFFD 字符本身编码正确，也同样会触发这个错误。

unsigned str2uni(const unsigned char *k)
{
    register int ch;
    int val = 0xFFFD;
    const unsigned char *text = k;
    if ((ch = *text++) < 0x80) {
        val = (unsigned) ch;
    } else if (ch <= 0xbf) {    /* error */
    } else if (ch <= 0xdf) {
        if (*text >= 0x80 && *text < 0xc0)
            val = (unsigned) (((ch & 0x1f) << 6) | (*text++ & 0x3f));
    } else if (ch <= 0xef) {
        if (*text >= 0x80 && *text < 0xc0 && text[1] >= 0x80 && text[1] < 0xc0) {
            val = (unsigned)
                (((ch & 0xf) << 12) | ((text[0] & 0x3f) << 6) |
                 (text[1] & 0x3f));
        }
    } else if (ch <= 0xf7) {
        int w = (((ch & 0x7) << 2) | ((text[0] & 0x30) >> 4)) - 1, w2;
        w = (w << 6) | ((text[0] & 0xf) << 2) | ((text[1] & 0x30) >> 4);
        w2 = ((text[1] & 0xf) << 6) | (text[2] & 0x3f);
        val = (unsigned) (w * 0x400 + w2 + 0x10000);
        if (*text < 0x80 || text[1] < 0x80 || text[2] < 0x80 ||
            *text >= 0xc0 || text[1] >= 0xc0 || text[2] >= 0xc0)
            val = 0xFFFD;
    } else {
        /* the 5- and 6-byte UTF-8 sequences generate integers
           that are outside of the valid UCS range, and therefore
           unsupported
         */
    }

    if (val == 0xFFFD)
        utf_error();

    return (val);
}

lstlisting 报错：String contains an invalid utf-8 sequence（字符串包含无效的 UTF-8 序列）

答案1

相关内容