可展开宏：无需额外宏包即可提取 UTF-8/西里尔字母字符串的第一个字符

Question 1

在 pdflatex 中，UTF-8 编码的每个字节都是一个单独的记号（token），但您可以识别前导字节对应的记号，它会告诉您该字符共占多少个字节。此版本涵盖了单字节和双字节的情况。

\documentclass{article}

\usepackage[T2A]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[russian]{babel}

\makeatletter
% \firstof{<text>}: expandable; leaves the first character of <text> in the
% input stream and drops the rest.
% The leading token of <text> is expanded once before \checkfirst inspects
% it, so that a UTF-8 lead byte (an active character under inputenc) can
% expose its \UTFviii@two@octets marker.
% An empty <text> yields nothing. Without the guard, \@car would absorb the
% terminating \@nil as its first item and keep scanning for another one.
\newcommand{\firstof}[1]{%
  \if\relax\detokenize{#1}\relax
    \expandafter\@gobble
  \else
    \expandafter\@firstofone
  \fi
  {\expandafter\checkfirst#1\@nil}%
}
% \checkfirst<token>: dispatch on the first token of the input that
% \firstof has set up (terminated by \@nil).
% If that token is \UTFviii@two@octets, \gettwooctets collects the
% character; otherwise the token is put back and \@car keeps the first item.
\def\checkfirst#1{%
  \ifx\UTFviii@two@octets#1%
  % \expandafter removes \else...\fi first, so \gettwooctets reads the input directly.
  \expandafter\gettwooctets
  \else
  % The two \expandafter's close the conditional (\fi) before \@car runs on #1<rest>\@nil.
  \expandafter\@car\expandafter#1%
  \fi
}
% \gettwooctets<t1><t2><rest>\@nil: called after \checkfirst has consumed the
% \UTFviii@two@octets marker. #1 and #2 are the next two tokens (presumably
% the two bytes of the character); #3, everything up to \@nil, is dropped.
% The marker is put back in front of #1#2.
\def\gettwooctets#1#2#3\@nil{\UTFviii@two@octets#1#2}

\makeatother

\begin{document}

% The first token is an ordinary letter, so \checkfirst takes the \@car branch.
\firstof{Vladimir}

% The first token is expanded by \firstof before \checkfirst compares it
% with \UTFviii@two@octets (presumably a two-byte UTF-8 character here).
\firstof{Владимир}

\end{document}

如果您想要处理输入的其余部分，而不是丢弃第一个字母之后的所有内容，您可以做一些小改动，以便传入一个命令来作用于剩余的文本。如果您传入 \gobble，它会像以前一样只提取第一个字母。如果您传入 \firstofx\gobble，那么它还会提取剩余文本的第一个字母，这样您就会得到两个字母：

\documentclass{article}

\usepackage[T2A]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[russian]{babel}

\makeatletter
% \firstofx{<cmd>}{<text>}: expandable; emits the first character of <text>
% followed by <cmd>{<rest of text>}.
% The leading token of <text> is expanded once before \checkfirst inspects
% it; <cmd> is placed after the \@nil terminator so the helper that
% \checkfirst selects can pick it up as its last argument.
% An empty <text> gives <cmd>{}. Without the guard, the helper would take
% \@nil as the first character and run away looking for another \@nil.
\newcommand{\firstofx}[2]{%
  \if\relax\detokenize{#2}\relax
    \expandafter\@firstoftwo
  \else
    \expandafter\@secondoftwo
  \fi
  {#1{}}%
  {\expandafter\checkfirst#2\@nil{#1}}%
}
% \checkfirst<token>: dispatch on the first token of the input that
% \firstofx has set up (<text>\@nil{<cmd>}).
% If that token is \UTFviii@two@octets, \gettwooctetsx handles the input;
% otherwise the token is put back and \getasciix handles it.
\def\checkfirst#1{%
  \ifx\UTFviii@two@octets#1%
  % \expandafter removes \else...\fi first, so \gettwooctetsx reads the input directly.
  \expandafter\gettwooctetsx
  \else
  % The two \expandafter's close the conditional (\fi) before \getasciix runs on #1<rest>.
  \expandafter\getasciix\expandafter#1%
  \fi
}

% \getasciix<first><rest>\@nil{<cmd>}: #1 is the first item, #2 everything up
% to \@nil, #3 the continuation command. Emits #1 followed by <cmd>{<rest>}.
\def\getasciix#1#2\@nil#3{#1#3{#2}}

% \gettwooctetsx<t1><t2><rest>\@nil{<cmd>}: called after \checkfirst has
% consumed the \UTFviii@two@octets marker. #1 and #2 are the next two tokens
% (presumably the two bytes of the character), #3 the rest up to \@nil, #4
% the continuation command. Emits the marker with #1#2, then <cmd>{<rest>}.
\def\gettwooctetsx#1#2#3\@nil#4{\UTFviii@two@octets#1#2#4{#3}}

% \gobble{<arg>}: takes one argument and expands to nothing. Passed to
% \firstofx as the continuation that throws the remaining text away.
\newcommand{\gobble}[1]{}

\makeatother

\begin{document}

% Continuation \gobble: the text after the first character is discarded.
\firstofx\gobble{Vladimir}

% Continuation \firstofx\gobble: the remainder is processed once more,
% so two characters are emitted.
\firstofx{\firstofx\gobble}{Vladimir}

% Same two calls on input whose first token \checkfirst matches against
% \UTFviii@two@octets after one expansion.
\firstofx\gobble{Владимир}

\firstofx{\firstofx\gobble}{Владимир}


\end{document}

Answer

在 pdflatex 中，UTF-8 编码的每个字节都是一个单独的记号（token），但您可以识别前导字节对应的记号，它会告诉您该字符共占多少个字节。此版本涵盖了单字节和双字节的情况。

\documentclass{article}

\usepackage[T2A]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[russian]{babel}

\makeatletter
% \firstof{<text>}: expandable; leaves the first character of <text> in the
% input stream and drops the rest.
% The leading token of <text> is expanded once before \checkfirst inspects
% it, so that a UTF-8 lead byte (an active character under inputenc) can
% expose its \UTFviii@two@octets marker.
% An empty <text> yields nothing. Without the guard, \@car would absorb the
% terminating \@nil as its first item and keep scanning for another one.
\newcommand{\firstof}[1]{%
  \if\relax\detokenize{#1}\relax
    \expandafter\@gobble
  \else
    \expandafter\@firstofone
  \fi
  {\expandafter\checkfirst#1\@nil}%
}
% \checkfirst<token>: dispatch on the first token of the input that
% \firstof has set up (terminated by \@nil).
% If that token is \UTFviii@two@octets, \gettwooctets collects the
% character; otherwise the token is put back and \@car keeps the first item.
\def\checkfirst#1{%
  \ifx\UTFviii@two@octets#1%
  % \expandafter removes \else...\fi first, so \gettwooctets reads the input directly.
  \expandafter\gettwooctets
  \else
  % The two \expandafter's close the conditional (\fi) before \@car runs on #1<rest>\@nil.
  \expandafter\@car\expandafter#1%
  \fi
}
% \gettwooctets<t1><t2><rest>\@nil: called after \checkfirst has consumed the
% \UTFviii@two@octets marker. #1 and #2 are the next two tokens (presumably
% the two bytes of the character); #3, everything up to \@nil, is dropped.
% The marker is put back in front of #1#2.
\def\gettwooctets#1#2#3\@nil{\UTFviii@two@octets#1#2}

\makeatother

\begin{document}

% The first token is an ordinary letter, so \checkfirst takes the \@car branch.
\firstof{Vladimir}

% The first token is expanded by \firstof before \checkfirst compares it
% with \UTFviii@two@octets (presumably a two-byte UTF-8 character here).
\firstof{Владимир}

\end{document}

如果您想要处理输入的其余部分，而不是丢弃第一个字母之后的所有内容，您可以做一些小改动，以便传入一个命令来作用于剩余的文本。如果您传入 \gobble，它会像以前一样只提取第一个字母。如果您传入 \firstofx\gobble，那么它还会提取剩余文本的第一个字母，这样您就会得到两个字母：

\documentclass{article}

\usepackage[T2A]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[russian]{babel}

\makeatletter
% \firstofx{<cmd>}{<text>}: expandable; emits the first character of <text>
% followed by <cmd>{<rest of text>}.
% The leading token of <text> is expanded once before \checkfirst inspects
% it; <cmd> is placed after the \@nil terminator so the helper that
% \checkfirst selects can pick it up as its last argument.
% An empty <text> gives <cmd>{}. Without the guard, the helper would take
% \@nil as the first character and run away looking for another \@nil.
\newcommand{\firstofx}[2]{%
  \if\relax\detokenize{#2}\relax
    \expandafter\@firstoftwo
  \else
    \expandafter\@secondoftwo
  \fi
  {#1{}}%
  {\expandafter\checkfirst#2\@nil{#1}}%
}
% \checkfirst<token>: dispatch on the first token of the input that
% \firstofx has set up (<text>\@nil{<cmd>}).
% If that token is \UTFviii@two@octets, \gettwooctetsx handles the input;
% otherwise the token is put back and \getasciix handles it.
\def\checkfirst#1{%
  \ifx\UTFviii@two@octets#1%
  % \expandafter removes \else...\fi first, so \gettwooctetsx reads the input directly.
  \expandafter\gettwooctetsx
  \else
  % The two \expandafter's close the conditional (\fi) before \getasciix runs on #1<rest>.
  \expandafter\getasciix\expandafter#1%
  \fi
}

% \getasciix<first><rest>\@nil{<cmd>}: #1 is the first item, #2 everything up
% to \@nil, #3 the continuation command. Emits #1 followed by <cmd>{<rest>}.
\def\getasciix#1#2\@nil#3{#1#3{#2}}

% \gettwooctetsx<t1><t2><rest>\@nil{<cmd>}: called after \checkfirst has
% consumed the \UTFviii@two@octets marker. #1 and #2 are the next two tokens
% (presumably the two bytes of the character), #3 the rest up to \@nil, #4
% the continuation command. Emits the marker with #1#2, then <cmd>{<rest>}.
\def\gettwooctetsx#1#2#3\@nil#4{\UTFviii@two@octets#1#2#4{#3}}

% \gobble{<arg>}: takes one argument and expands to nothing. Passed to
% \firstofx as the continuation that throws the remaining text away.
\newcommand{\gobble}[1]{}

\makeatother

\begin{document}

% Continuation \gobble: the text after the first character is discarded.
\firstofx\gobble{Vladimir}

% Continuation \firstofx\gobble: the remainder is processed once more,
% so two characters are emitted.
\firstofx{\firstofx\gobble}{Vladimir}

% Same two calls on input whose first token \checkfirst matches against
% \UTFviii@two@octets after one expansion.
\firstofx\gobble{Владимир}

\firstofx{\firstofx\gobble}{Владимир}


\end{document}

Question 2

这应该可以工作。我定义了 \headof 和 \tailof（借用了这里和这里的代码），它们的作用正如其名。如果使用 \headof*（或 \tailof*），它会先展开其参数，因此您可以通过嵌套 \headof* 和 \tailof* 毫不费力地从序列中取出第 n 个字符（如果您愿意这样类比，这与 Lisp 中的 car 和 cdr 非常相似）。例如，可以用以下方法提取 Владимир 的第四个字符：

\headof*{\tailof*{\tailof*{\tailof{Владимир}}}}

简单得令人烦恼 :)

然后，您可以使用原语 \pdfstrcmp 或更高层的 \str_if_eq:eeTF 来比较字符串。\ifx 在这里不起作用，因为它只比较两个记号，而 л（例如）本身就是两个记号（当然，前提是您使用 pdfTeX）。

如果参数为空，结果也为空。如果头部是一组记号（位于 {...} 内），则该组会被当作一个整体返回，且不带外层花括号。

\documentclass{article}

\usepackage[T2A]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[russian]{babel}

\ExplSyntaxOn
% \headof[*]{<text>}: expandable document-level wrapper around
% \crosfield_text_head:n. With the star, <text> is e-expanded before it is
% handed over, which is what allows \headof* and \tailof* to be nested.
\NewExpandableDocumentCommand \headof { s +m }
  {
    \IfBooleanTF {#1}
      { \exp_args:Ne \crosfield_text_head:n }
      { \crosfield_text_head:n }
    {#2}
  }
% \tailof[*]{<text>}: expandable document-level wrapper around
% \crosfield_text_tail:n. With the star, <text> is e-expanded before it is
% handed over, which is what allows \headof* and \tailof* to be nested.
\NewExpandableDocumentCommand \tailof { s +m }
  {
    \IfBooleanTF {#1}
      { \exp_args:Ne \crosfield_text_tail:n }
      { \crosfield_text_tail:n }
    {#2}
  }
% \crosfield_text_head:n {<text>}: expandable; returns the first item of <text>.
%   - empty <text>  -> nothing
%   - N-type head   -> delegated to \__crosfield_get_head:Nw
%   - braced head   -> the group's contents, outer braces removed, unexpanded
%   - otherwise     -> a single space (the head is a space token)
\cs_new:Npn \crosfield_text_head:n #1
  {
    \tl_if_empty:nF {#1}
      {
        \tl_if_head_is_N_type:nTF {#1}
          { \__crosfield_get_head:Nw #1 \q_stop }
          {
            \tl_if_head_is_group:nTF {#1}
              { \exp_not:o { \use_i_delimit_by_q_stop:nw #1 \q_stop } }
              { ~ }
          }
      }
  }
% \crosfield_text_tail:n {<text>}: expandable; returns <text> without its first item.
%   - empty <text>  -> nothing
%   - N-type head   -> delegated to \__crosfield_get_tail:Nw
%   - braced head   -> the group is dropped with \use_none:n
%   - otherwise     -> the head is a space; starting an f-type expansion on
%                      <text> removes that space and stops there
\cs_new:Npn \crosfield_text_tail:n #1
  {
    \tl_if_empty:nF {#1}
      {
        \tl_if_head_is_N_type:nTF {#1}
          { \__crosfield_get_tail:Nw #1 \q_stop }
          {
            \tl_if_head_is_group:nTF {#1}
              { \exp_not:o { \use_none:n #1 } }
              { \exp_not:o { \exp:w \exp_end_continue_f:w #1 } }
          }
      }
  }
% Engine dispatch for the two splitters \__crosfield_get_head:Nw and
% \__crosfield_get_tail:Nw (both called as <text> \q_stop).
% On LuaTeX or XeTeX they are defined directly and the trailing \use_none:n
% discards the brace group that follows this conditional. On any other
% engine \use:n executes that group, which builds splitters that look at
% the \meaning of the first token to decide how many tokens form one
% character (presumably the inputenc UTF-8 byte sequences under pdfTeX).
\bool_lazy_or:nnTF
    { \sys_if_engine_luatex_p: }
    { \sys_if_engine_xetex_p: }
  {
    % Head: the first token, protected from expansion. Tail: everything after it.
    \cs_new:Npn \__crosfield_get_head:Nw #1 #2 \q_stop { \exp_not:N #1 }
    \cs_new:Npn \__crosfield_get_tail:Nw #1 #2 \q_stop { \exp_not:n {#2} }
    % Swallow the alternative implementation in the next brace group.
    \use_none:n
  }
  % @ is switched to a letter before \use:n reads the next brace group;
  % the \makeatother at the start of that group switches it back.
  { \makeatletter \use:n }
  {
    \makeatother
    % Both splitters share one worker; the selector \use_i:nn picks the
    % head and \use_ii:nn picks the tail of the final {head}{tail} pair.
    \cs_new:Npn \__crosfield_get_head:Nw
      { \__crosfield_head_tail:NNw \use_i:nn }
    \cs_new:Npn \__crosfield_get_tail:Nw
      { \__crosfield_head_tail:NNw \use_ii:nn }
    % #1 = selector, #2 = first token, #3 = rest of the text.
    % Expands to
    %   \__crosfield_head_tail:w <meaning of #2> UTFviii@one@octets<space>
    %     \q_stop {#2#3} #1
    % The appended string is a fallback: it guarantees that the delimited
    % match below succeeds and reports "one" when the meaning itself
    % contains no UTFviii@...@octets marker.
    \cs_new:Npn \__crosfield_head_tail:NNw #1 #2 #3 \q_stop
      {
        \use:e
          {
            \exp_not:N \__crosfield_head_tail:w
              \exp_not:o { \token_to_meaning:N #2 }
              \tl_to_str:n { UTFviii@ one @octets } ~
          }   \q_stop { #2 #3 } #1
      }
    % \__crosfield_head_tail:w is defined through \use:e so that its
    % delimiters are the string (\tl_to_str:n) forms, as found in \meaning
    % output. #2 is the word between "UTFviii@" and "@octets", #4 the
    % original text and #5 the selector.
    \use:e
      {
        \cs_new:Npn \exp_not:N \__crosfield_head_tail:w
          #1 \tl_to_str:n { UTFviii@ } #2 \tl_to_str:n { @octets } ~ #3
          \exp_not:N \q_stop #4 #5
      }
      {
        % Choose the splitter by the number of tokens that make up the
        % first character; the "true" code then supplies <text> \q_stop.
        \str_case:nnTF {#2}
          {
            { one   } { \__crosfield_head_or_tail:NNw #5 }
            { two   } { \__crosfield_head_or_tail:NNNw #5 }
            { three } { \__crosfield_head_or_tail:NNNNw #5 }
            { four  } { \__crosfield_head_or_tail:NNNNNw #5 }
          }
          { #4 \q_stop }
          % No case matched. \ERROR is not defined in this file, so this
          % presumably exists to raise an error deliberately.
          { \ERROR? }
      }
    % Splitters: #1 is the selector, the next 1 to 4 tokens form the head
    % and the remainder up to \q_stop is the tail. \exp_not:o expands the
    % selector once and protects the part it picks.
    \cs_new:Npn \__crosfield_head_or_tail:NNw #1 #2 #3 \q_stop
      { \exp_not:o { #1 {#2} {#3} } }
    \cs_new:Npn \__crosfield_head_or_tail:NNNw #1 #2#3 #4 \q_stop
      { \exp_not:o { #1 {#2#3} {#4} } }
    \cs_new:Npn \__crosfield_head_or_tail:NNNNw #1 #2#3#4 #5 \q_stop
      { \exp_not:o { #1 {#2#3#4} {#5} } }
    \cs_new:Npn \__crosfield_head_or_tail:NNNNNw #1 #2#3#4#5 #6 \q_stop
      { \exp_not:o { #1 {#2#3#4#5} {#6} } }
  }
% Document-level alias: \StrCompare{<a>}{<b>}{<true code>}{<false code>}
% is \str_if_eq:eeTF under a name usable outside \ExplSyntaxOn.
\cs_new_eq:NN \StrCompare \str_if_eq:eeTF
\ExplSyntaxOff

\begin{document}

% Compare the second character of the word (the head of its tail) with the
% string form of the expected letter. The branches typeset plain text:
% \true and \false are defined neither in this file nor by the LaTeX
% kernel, so using them stopped the run with "Undefined control sequence".
\StrCompare
  {\headof*{\tailof{Владимир}}}{\detokenize{л}}
  {true}{false}

\end{document}

Answer