搜索文本中最常出现的模式（用宏替换它们）

Question 1

编辑：我之前根据 Ryan Reich 的评论实现了某个版本的 Lempel-Ziv-Welch 算法。现在我已经彻底重写了代码，但不知道新的算法是否有名字。

应 David Carlisle 的要求，我把《圣诞节的十二天》（The Twelve Days of Christmas）的歌词输入到新代码中，得到了以下结果（适用于 plain TeX）：

\long\def\A#1#2{\def#1{#2}\A}\A\AG{s\par\AC\A\AB}\AH{\AE\W\X\V\T\U m\V\AD}\AA
{\C\D\E\F\G}\AB{\H\I\J\A\par\bigskip\B}\AC{\O\P\par\M\N\K\L nd}\AD{\R\S\par\Q
\A ring}\AE{\Y es danc\V}\AF{t\Z a leap\V}\B{\par On the }\C{ day of C}\D
{hristmas }\E{my true l}\F{ove gave }\G{to me\par}\H{a partrid}\I{ge in a p}\J
{ear tree.}\K{two turtl}\L{e doves\par a}\M{three fre}\N{nch hens\par}\O{four
call}\P{ing birds}\Q{five gold}\R{six geese}\S{ a laying}\T{seven swa}\U{ns a
swim}\V{ing\par}\W{eight mai}\X{ds a milk}\Y{nine ladi}\Z{en lords } \A{ }\B
first\AA\AB second\AA\K\L nd \AB third\AA\M\N\K\L nd \AB fourth\AA\AC\A\AB
fifth\AA\Q\A ring\AG sixth\AA\AD\AG seventh\AA\T\U m\V\AD\AG eighth\AA\W\X\V\T
\U m\V\AD\AG ninth\AA\AH\AG tenth\AA\AF\AH\AG eleventh\AA eleven pipers pip\V
\AF\AH\AG twelfth\AA twelve drummers drumm\V eleven pipers pip\V\AF\AH s\par
\AC\A\H\I\J\A\bye

其长度为 878 字节。作为对比，歌词相同的 xii.tex 长度为 767 字节。

\RequirePackage[enable-debug]{expl3}
\ExplSyntaxOn
% Variants of kernel functions.
\cs_generate_variant:Nn \tl_replace_all:Nnn { Nxx }
\cs_generate_variant:Nn \tl_replace_all:Nnn { NV }
\cs_generate_variant:Nn \tl_replace_all:Nnn { Nxn }
\cs_generate_variant:Nn \tl_replace_once:Nnn { Nxx }
\cs_generate_variant:Nn \tl_if_in:NnT { No }
\cs_generate_variant:Nn \tl_if_eq:NNTF { cc }
\cs_generate_variant:Nn \tl_if_eq:nnTF { xx }
\cs_generate_variant:Nn \tl_if_eq:nnT { xx }
\cs_generate_variant:Nn \str_count:N { c }
\cs_generate_variant:Nn \regex_match:nnF { nV }
\cs_generate_variant:Nn \str_set:Nn { NV }

% The prefix shared by all generated macro names, and its length.
\str_new:N \l__find_prefix_str
\int_new:N \l__find_prefix_int

% Token lists: the text being compressed, its state before the current
% pass, the chunk list, the collected macro definitions, the rebuilt
% text, the printable form, and the best chunk at the current position.
\tl_new:N \l__find_tl
\tl_new:N \l__find_previous_tl
\tl_new:N \l__find_chunks_tl
\tl_new:N \l__find_macros_tl
\tl_new:N \l__find_result_tl
\tl_new:N \l__find_display_tl
\tl_new:N \l__find_best_chunk_tl

% Sequences: one per common-prefix length from 1 to 9, plus a general one.
\int_step_inline:nn { 9 }
  { \seq_new:c { l__find_chunks_#1_seq } }
\seq_new:N \l__find_chunks_seq

% Integer scratch registers and counters.
\int_new:N \l__find_common_int
\int_new:N \l__find_nesting_int
\int_new:N \l__find_best_score_int
\int_new:N \l__find_macro_int
\int_new:N \l__find_length_int
\int_new:N \l__find_previous_length_int

% Progress indicator: emit one dot through the \message primitive.
\cs_new_protected_nopar:Nn \dot:
  { \tex_message:D { . } }

% \find_matches:nnN {<prefix>} {<tokens>} <tl var>
% Entry point.  Runs the compression passes on <tokens> and stores, in
% <tl var>, the preamble from \__find_preamble: followed by the resulting
% token list.
\cs_new_protected:Npn \find_matches:nnN #1#2#3
  {
    % '#1' is the prefix, '#2' is the token list to study.
    %
    \__find_set_prefix:n {#1}
    % A leading space and a '!' marker are prepended; the '!' is turned
    % into a braced space once all passes are done (see below).
    \tl_set:Nn \l__find_tl { ~! #2 }
    % Every space token becomes the control sequence \<prefix>A.
    \__find_escape_spaces:xN { \l__find_prefix_str A } \l__find_tl
    % Macro number 1 (letter A) is reserved; new macros start at B.
    \int_set:Nn \l__find_macro_int { 1 }
    % Report the length of the uncompressed text ...
    \__find_get_length:V \l__find_prefix_str
    \iow_term:x { \int_use:N \l__find_length_int }
    % ... then forget it, so that the first pass always counts as shorter.
    \int_set_eq:NN \l__find_length_int \c_max_int
    \__find_matches_aux:V \l__find_prefix_str
    \tl_replace_once:Nnn \l__find_tl { ! } { { ~ } }
    \tl_set:Nx #3
      {
        \__find_preamble:
        \exp_not:V \l__find_tl
      }
  }
% \__find_escape_spaces:nN {<csname>} <tl var>
% Replace each space token of <tl var> by the control sequence whose
% name is <csname>, then report progress.
\cs_new_protected:Nn \__find_escape_spaces:nN
  {
    \regex_replace_all:nnN { \cS\  } { \c{#1} } #2
    \dot:
  }
\cs_generate_variant:Nn \__find_escape_spaces:nN { x }
% Expandable helper, meant for x-type expansion.  It leaves the tokens
%   \long\def\<prefix>A<param 1><param 2>{\def<param 1>{<param 2>}\<prefix>A}\<prefix>A
% that is, a macro which turns a following "<macro> {<text>}" pair into
% a \def and then calls itself again, followed by a first call to it.
% The hashes are written quadrupled here.
% NOTE(review): the number of hashes that survive depends on how the
% result is expanded; \__find_get_length:n collapses doubled hashes in
% its string copy -- confirm before changing.
\cs_new_nopar:Npn \__find_preamble:
  {
    \exp_not:n { \long \def } \exp_not:c { \l__find_prefix_str A } ####1####2
      {
        \exp_not:N \def ####1{####2}
        \exp_not:c { \l__find_prefix_str A }
      }
    \exp_not:c { \l__find_prefix_str A }
  }
% \__find_matches_aux:n {<prefix>}
% One compression pass over \l__find_tl.  The pass is repeated
% recursively as long as the wrapped output gets strictly shorter;
% otherwise the state from before the pass is restored.
\cs_new_protected:Npn \__find_matches_aux:n #1
  {
    % Save the current state so that it can be restored below.
    \int_set_eq:NN \l__find_previous_length_int \l__find_length_int
    \tl_set_eq:NN \l__find_previous_tl \l__find_tl
    \__find_escape_tl:nN {#1} \l__find_tl
    \int_compare:nNnTF { \tl_count:N \l__find_tl } < 9
      {
        % Fewer than nine tokens: the nine-token window cannot run, so
        % give up and restore the unescaped list.  This must be
        % \tl_set_eq:NN (both arguments are variables); there is no
        % \tl_set_eq:Nn.
        \tl_set_eq:NN \l__find_tl \l__find_previous_tl
      }
      {
        \__find_set_chunks:
        \__find_sort_chunks:
        \__find_common:
        \__find_best_macros:
        \__find_undefine_tmp:
        \tl_set_eq:NN \l__find_tl \l__find_result_tl
        \__find_unescape_tl:nN {#1} \l__find_tl
        % Measure the new output and print its length.
        \__find_get_length:n {#1}
        \iow_term:x { \int_use:N \l__find_length_int }
        \int_compare:nNnTF \l__find_length_int < \l__find_previous_length_int
          { \__find_matches_aux:n {#1} }
          {
            % No gain: show the last printable form and roll back.
            \iow_term:n { }
            \iow_term:x { \l__find_display_tl }
            \iow_term:n { }
            \tl_set_eq:NN \l__find_tl \l__find_previous_tl
          }
      }
  }
\cs_generate_variant:Nn \__find_matches_aux:n { V }

% \__find_get_length:n {<prefix>}
% Build a printable string version of \l__find_tl in
% \l__find_display_tl, wrap it with \iow_wrap:nnnN, and let
% \__find_get_length_aux:n record the wrapped text and its length.
\cs_new_protected:Npn \__find_get_length:n #1
  {
    \tl_set_eq:NN \l__find_display_tl \l__find_tl
    % Spell out the marker "\<prefix>A !" as " \<prefix>A { }".
    \tl_replace_once:Nxx \l__find_display_tl
      { \exp_not:c { #1 A } ! }
      { ~ \exp_not:c { #1 A } {~} }
    % From here on the display copy is manipulated as a string.
    \str_set:NV \l__find_display_tl \l__find_display_tl
    % The string "\<prefix>A " is replaced by the (unexpanded) token
    % \c_space_tl; a space followed by that token is then replaced by a
    % space followed by an unexpanded \exp_not:c construction.  Both are
    % expanded by the \str_set:Nx step below.
    \tl_replace_all:Nxn \l__find_display_tl
      { \c_backslash_str #1 \token_to_str:N A ~ } \c_space_tl
    \tl_replace_all:Nnn \l__find_display_tl
      { ~ \c_space_tl } { ~ \exp_not:c { #1 A } }
    \dot:
    % Prepend the preamble and convert everything to a string again.
    \str_set:Nx \l__find_display_tl { \__find_preamble: \l__find_display_tl }
    % Collapse doubled hash characters into single ones.
    \tl_replace_all:Nxx \l__find_display_tl
      { \c_hash_str \c_hash_str } { \c_hash_str }
    \dot:
    % A control word followed by a space and a letter gets a second space;
    % the next step then turns the space right after every control word
    % into \__find_allow_break:.  Presumably this keeps a real space only
    % where a letter follows -- TODO confirm.
    \regex_replace_all:nnN
      { (\\[A-Za-z]+) \ ([A-Za-z]) }
      { \1 \ \ \2 }
      \l__find_display_tl
    \dot:
    \regex_replace_all:nnN
      { (\\[A-Za-z]+) \ }
      { \1 \c{__find_allow_break:} }
      \l__find_display_tl
    \dot:
    \iow_wrap:nnnN { \l__find_display_tl } { } { } \__find_get_length_aux:n
  }
\cs_generate_variant:Nn \__find_get_length:n { V }
% Callback for \iow_wrap:nnnN: keep the wrapped text #1 in
% \l__find_display_tl and store its character count in
% \l__find_length_int.
\cs_new_protected:Nn \__find_get_length_aux:n
  {
    \tl_set:Nn \l__find_display_tl {#1}
    \int_set:Nn \l__find_length_int { \str_count:n {#1} }
  }



% \__find_set_prefix:n {<prefix>}
% Store the fully expanded <prefix> as a string in \l__find_prefix_str
% and its length in \l__find_prefix_int, then raise the error
% find/invalid-prefix unless it consists of ASCII letters only (the
% empty prefix is allowed).
\cs_new_protected:Npn \__find_set_prefix:n #1
  {
    \str_set:Nx \l__find_prefix_str {#1}
    \int_set:Nn \l__find_prefix_int { \str_count:N \l__find_prefix_str }
    % Use an explicit letter class: \w would also accept digits and
    % underscores, which cannot be part of a control word under the
    % usual category codes, so the generated macro names would not be
    % read back correctly.
    \regex_match:nVF { \A[A-Za-z]*\Z } \l__find_prefix_str
      {
        \msg_error:nnx { find } { invalid-prefix }
          { \l__find_prefix_str }
      }
    \dot:
  }

% \__find_escape_tl:nN {<prefix>} <tl var>
% Replace the begin-group, end-group and macro-parameter tokens of
% <tl var> by control sequences built from <prefix>.
\cs_new_protected:Npn \__find_escape_tl:nN #1#2
  {
    % During the 'study' step, we manipulate the token list |#2|
    % with all begin-group and end-group tokens replaced by a
    % control sequence built from the prefix.  We must change both
    % begin-group and end-group tokens in one pass, to avoid getting an
    % unbalanced result.  Also replace macro parameters because they
    % cannot be used as delimiters for macros.  Spaces have been
    % turned into a control sequence earlier.  At this stage, every
    % token in |#2| can be grabbed as an N-type argument.
    %
    % First give every begin-group token the character '{' and every
    % end-group token the character '}'.
    \regex_replace_all:nnN { \cB. } { \cB\{ } #2
    \dot:
    \regex_replace_all:nnN { \cE. } { \cE\} } #2
    \dot:
    % Macro parameters become the control sequence <prefix>#.
    \regex_replace_all:nnN { \cP. } { \c{ #1 \# } } #2
    \dot:
    % Single pass over the remaining special tokens: each becomes the
    % control sequence named <prefix> followed by the matched character.
    \regex_replace_all:nnN { \c[BEP]. } { \c{ #1 \0 } } #2
    \dot:
  }




\cs_new_protected_nopar:Npn \__find_set_chunks:
  {
    % Build in \l__find_chunks_tl a list of braced items obtained by
    % sliding a window of nine tokens over \l__find_tl; the last eight
    % items get shorter and shorter.  So for instance
    % |ABCDEFGHIJKL| would lead to the following \(12\) items:
    % |ABCDEFGHI|, |BCDEFGHIJ|, |CDEFGHIJK|, |DEFGHIJKL|, |EFGHIJKL|,
    % |FGHIJKL|, |GHIJKL|, |HIJKL|, |IJKL|, |JKL|, |KL|, |L|.  The items
    % of this token list are built in an |x|-expanded loop.
    % No special case for short input is handled here: the loop grabs
    % nine tokens at a time, and \__find_matches_aux:n only reaches this
    % step when \l__find_tl has at least nine tokens.
    %
    \tl_set:Nx \l__find_chunks_tl
      {
        \exp_after:wN \__find_set_chunks_loop:NNNNNNNNN \l__find_tl
          \q_recursion_tail \q_recursion_stop
      }
  }
% Expandable loop body used by \__find_set_chunks:.  #1 to #9 are the
% nine tokens currently in the window.
\cs_new:Npn \__find_set_chunks_loop:NNNNNNNNN #1#2#3#4#5#6#7#8#9
  {
    % As usual in a TeX loop, first check for the end-loop marker (here,
    % \cs{q_recursion_tail}).  If it is reached, we fill in the last few
    % chunks (which become shorter and shorter as we go).  Otherwise,
    % add (to the token list we are building) an item containing \(9\)
    % tokens, and loop, dropping the first of the items.
    %
    \quark_if_recursion_tail_stop_do:Nn #9
      { \__find_set_chunks_end:NNNNNNNN #1 #2 #3 #4 #5 #6 #7 #8 }
    { \exp_not:n { #1 #2 #3 #4 #5 #6 #7 #8 #9 } }
    % Only eight tokens are passed on: the ninth is taken from the input
    % that follows.
    \__find_set_chunks_loop:NNNNNNNNN #2#3#4#5#6#7#8#9
  }
% Final step of the chunk loop: from the last eight tokens, leave
% (protected against further expansion) the eight suffixes of
% decreasing length, each as a braced item.
\cs_new:Nn \__find_set_chunks_end:NNNNNNNN
  {
    \exp_not:n
      {
        { #1 #2 #3 #4 #5 #6 #7 #8 } { #2 #3 #4 #5 #6 #7 #8 }
        { #3 #4 #5 #6 #7 #8 }       { #4 #5 #6 #7 #8 }
        { #5 #6 #7 #8 }             { #6 #7 #8 }
        { #7 #8 }                   { #8 }
      }
  }
% Sort the items of \l__find_chunks_tl in increasing order of their
% string representation, as compared by the \strcmp primitive.
\cs_new_protected:Nn \__find_sort_chunks:
  {
    \tl_sort:Nn \l__find_chunks_tl
      {
        \int_compare:nTF
          {
            \tex_strcmp:D { \exp_not:n {##1} } { \exp_not:n {##2} } > 0
          }
          { \sort_return_swapped: }
          { \sort_return_same: }
      }
  }

% Fill the sequences l__find_chunks_<n>_seq from the sorted chunk list,
% then, for n from 4 to 9, keep a counter token list per distinct chunk:
% it is created empty on the first occurrence and gets one more 'i' on
% each later occurrence.
\cs_new_protected:Nn \__find_common:
  {
    \int_step_inline:nn { 9 }
      { \seq_clear:c { l__find_chunks_##1_seq } }
    \exp_after:wN \__find_common_loop:nn \l__find_chunks_tl
      \q_recursion_tail \q_recursion_tail \q_recursion_stop
    \int_step_inline:nnn { 4 } { 9 }
      {
        \seq_map_inline:cn { l__find_chunks_##1_seq }
          {
            \tl_if_exist:cTF { l__find_chunk_ ' \tl_to_str:n {####1} ' _tl }
              {
                \tl_put_right:cn
                  { l__find_chunk_ ' \tl_to_str:n {####1} ' _tl } { i }
              }
              {
                \cs_set_eq:cN
                  { l__find_chunk_ ' \tl_to_str:n {####1} ' _tl } \c_empty_tl
              }
          }
      }
  }

% Walk through the sorted chunk list two neighbours at a time.  #1 and
% #2 are consecutive items; the loop stops when #2 is the end marker.
\cs_new_protected:Npn \__find_common_loop:nn #1#2
  {
    \quark_if_recursion_tail_stop:n {#2}
    % Count how many leading tokens #1 and #2 share ...
    \int_zero:N \l__find_common_int
    \__find_count_common_aux:nn {#1} {#2}
    % ... and dispatch to the function for that count.  The trailing X's
    % pad the input so that the dispatched function always finds enough
    % arguments before \q_stop.
    \use:c { __find_common_ \int_use:N \l__find_common_int :w }
      #1 X X X X X X X X X \q_stop
    % Continue with #2 as the new first item.
    \__find_common_loop:nn {#2}
  }
% \__find_count_common_aux:nn {<tokens 1>} {<tokens 2>}
% As long as both arguments are non-empty and start with the same item,
% increment \l__find_common_int and recurse on the two tails.
\cs_new_protected:Nn \__find_count_common_aux:nn
  {
    \tl_if_empty:nF {#1}
      {
        \tl_if_empty:nF {#2}
          {
            \tl_if_eq:xxT { \tl_head:n {#1} } { \tl_head:n {#2} }
              {
                \int_incr:N \l__find_common_int
                \__find_count_common_aux:xx
                  { \tl_tail:n {#1} }
                  { \tl_tail:n {#2} }
              }
          }
      }
  }
\cs_generate_variant:Nn \__find_count_common_aux:nn { xx }

% Dispatch targets of \__find_common_loop:nn, one per number <n> of
% shared leading tokens.  For <n> = 0 everything up to \q_stop is
% discarded.  For <n> from 1 to 9 the first <n> tokens are grabbed and
% handed, together with <n>, to \__find_common_auxii:nnw; what remains
% before \q_stop is left for that function to absorb.
\cs_new_eq:cN { __find_common_0:w } \use_none_delimit_by_q_stop:w
\cs_new_protected:cpn { __find_common_1:w } #1
  { \__find_common_auxii:nnw { 1 } {#1} }
\cs_new_protected:cpn { __find_common_2:w } #1#2
  { \__find_common_auxii:nnw { 2 } { #1 #2 } }
\cs_new_protected:cpn { __find_common_3:w } #1#2#3
  { \__find_common_auxii:nnw { 3 } { #1 #2 #3 } }
\cs_new_protected:cpn { __find_common_4:w } #1#2#3#4
  { \__find_common_auxii:nnw { 4 } { #1 #2 #3 #4 } }
\cs_new_protected:cpn { __find_common_5:w } #1#2#3#4#5
  {
    % Only this length also prints a progress dot.
    \dot:
    \__find_common_auxii:nnw { 5 } { #1 #2 #3 #4 #5 }
  }
\cs_new_protected:cpn { __find_common_6:w } #1#2#3#4#5#6
  { \__find_common_auxii:nnw { 6 } { #1 #2 #3 #4 #5 #6 } }
\cs_new_protected:cpn { __find_common_7:w } #1#2#3#4#5#6#7
  { \__find_common_auxii:nnw { 7 } { #1 #2 #3 #4 #5 #6 #7 } }
\cs_new_protected:cpn { __find_common_8:w } #1#2#3#4#5#6#7#8
  { \__find_common_auxii:nnw { 8 } { #1 #2 #3 #4 #5 #6 #7 #8 } }
\cs_new_protected:cpn { __find_common_9:w } #1#2#3#4#5#6#7#8#9
  { \__find_common_auxii:nnw { 9 } { #1 #2 #3 #4 #5 #6 #7 #8 #9 } }
% \__find_common_auxii:nnw {<n>} {<chunk>} <junk> \q_stop
% Store <chunk> in l__find_chunks_<n>_seq, but only if its escaped
% braces are balanced: an escaped closing brace never comes before its
% opening brace, and the nesting level is back to zero at the end.
% <junk> (the padding left by the caller) is discarded.
\cs_new_protected:Npn \__find_common_auxii:nnw #1#2#3 \q_stop
  {
    \int_zero:N \l__find_nesting_int
    \tl_map_inline:nn {#2}
      {
        \str_case_e:nn { \exp_not:n {##1} }
          {
            { \exp_not:c { \l__find_prefix_str \c_left_brace_str } }
              { \int_incr:N \l__find_nesting_int }
            { \exp_not:c { \l__find_prefix_str \c_right_brace_str } }
              {
                % A closing brace at level zero: reject the chunk.  Leave
                % the mapping through \tl_map_break:n so that the map's
                % own clean-up code runs before the rest of this
                % function is skipped.
                \int_compare:nNnF \l__find_nesting_int > 0
                  { \tl_map_break:n { \use_none_delimit_by_q_stop:w } }
                \int_decr:N \l__find_nesting_int
              }
          }
      }
    % Unclosed opening braces: reject the chunk as well.
    \int_compare:nNnF \l__find_nesting_int = 0
      { \use_none_delimit_by_q_stop:w }
    \seq_put_right:cn { l__find_chunks_#1_seq } {#2}
    % The final \q_stop is the target of the early exits above; it is
    % removed here on the normal path.
    \use_none_delimit_by_q_stop:w
    \q_stop
  }

% Rebuild the text in \l__find_result_tl while collecting new macro
% definitions in \l__find_macros_tl, then put the definitions in front
% of the rebuilt text.
\cs_new_protected_nopar:Nn \__find_best_macros:
  {
    \tl_clear:N \l__find_result_tl
    \tl_clear:N \l__find_macros_tl
    \__find_best_macros_aux:
    \tl_put_left:NV \l__find_result_tl \l__find_macros_tl
  }
% Process the position at the front of \l__find_tl, then repeat until
% \l__find_tl is empty.  The nine \q_nil pad the input so that the
% nine-argument auxiliary always finds enough tokens; \q_stop ends what
% that auxiliary discards.
\cs_new_protected:Npn \__find_best_macros_aux:
  {
    \exp_after:wN \__find_best_macros_auxii:NNNNNNNNN \l__find_tl
      \q_nil \q_nil \q_nil \q_nil \q_nil \q_nil \q_nil \q_nil \q_nil \q_stop
    \tl_if_empty:NF \l__find_tl { \__find_best_macros_aux: }
  }
% #1 to #9 are the next nine tokens of \l__find_tl (possibly padded with
% \q_nil).  Pick the best-scoring prefix of these tokens, turn it into a
% new macro if its score is positive, then move one token from
% \l__find_tl to \l__find_result_tl.
\cs_new_protected:Npn \__find_best_macros_auxii:NNNNNNNNN #1#2#3#4#5#6#7#8#9
  {
    \int_zero:N \l__find_best_score_int
    \tl_clear:N \l__find_best_chunk_tl
    % Try the prefixes of length 1 to 9; only a score strictly greater
    % than the best so far (initially 0) is accepted.
    \tl_map_inline:nn
      {
        {#1} {#1#2} {#1#2#3} {#1#2#3#4} {#1#2#3#4#5} {#1#2#3#4#5#6}
        {#1#2#3#4#5#6#7} {#1#2#3#4#5#6#7#8} {#1#2#3#4#5#6#7#8#9}
      }
      {
        \int_compare:nNnT
          { \__find_score:n {##1} } > \l__find_best_score_int
          {
            \tl_set:Nn \l__find_best_chunk_tl {##1}
            \int_set:Nn \l__find_best_score_int
              { \__find_score:n {##1} }
          }
      }
    \tl_if_empty:NF \l__find_best_chunk_tl
      {
        % Allocate the next macro name, <prefix> followed by the macro
        % number written in upper-case letters, and record
        % "<macro> {<chunk>}" among the definitions.
        \int_incr:N \l__find_macro_int
        \tl_put_right:Nx \l__find_macros_tl
          {
            \exp_not:c
              { \l__find_prefix_str \int_to_Alph:n { \l__find_macro_int } }
            { \exp_not:V \l__find_best_chunk_tl }
          }
        % Replace every occurrence of the chunk in the remaining text by
        % the new macro.
        \use:x
          {
            \exp_not:n { \tl_replace_all:NVn \l__find_tl \l__find_best_chunk_tl }
              { \exp_not:c { \l__find_prefix_str \int_to_Alph:n { \l__find_macro_int } } }
          }
        \dot:
      }
    % Move the first token of the (possibly updated) text to the result.
    \tl_put_right:Nx \l__find_result_tl { \tl_head:N \l__find_tl }
    \tl_set:Nx \l__find_tl { \tl_tail:N \l__find_tl }
    % Discard the tokens still pending up to \q_stop.
    \use_none_delimit_by_q_stop:w
  }
% \__find_score:n {<chunk>}
% Expands to an integer score for replacing <chunk> by a macro.
\cs_new:Npn \__find_score:n #1
  {
    % Turning #1 into a length (p+2) macro
    % (e.g. |\<prefix>A|) saves this number of chars.
    % If no counter token list exists for the chunk, the score is -1.
    % Otherwise it is
    %   <item count of the counter> * (<string length of #1> - 3 - p)
    %   - 2 * p - 9,
    % where p is \l__find_prefix_int.  The caller only accepts chunks
    % whose score is strictly positive.
    \cs_if_exist:cTF { l__find_chunk_ ' \tl_to_str:n {#1} ' _tl }
      {
        \int_eval:n
          {
            \tl_count:c { l__find_chunk_ ' \tl_to_str:n {#1} ' _tl }
            * ( \str_count:n {#1} - 3 - \l__find_prefix_int )
            - 2 * \l__find_prefix_int - 9
          }
      }
      { -1 }
  }

% Undefine the per-chunk counter token lists for every chunk stored in
% the sequences l__find_chunks_4_seq to l__find_chunks_9_seq.
\cs_new_protected:Nn \__find_undefine_tmp:
  {
    \int_step_inline:nnn { 4 } { 9 }
      {
        \seq_map_inline:cn { l__find_chunks_##1_seq }
          {
            \cs_undefine:c
              { l__find_chunk_ ' \tl_to_str:n {####1} ' _tl }
          }
      }
  }

% \__find_unescape_tl:nN {<prefix>} <tl var>
% Turn the control sequences <prefix>{, <prefix>} and <prefix># in
% <tl var> back into begin-group, end-group and macro-parameter tokens.
\cs_new_protected:Npn \__find_unescape_tl:nN #1#2
  {
    % Each escaped brace is first replaced by a balanced pair: the real
    % brace together with a dummy '?' of the opposite group type.
    \regex_replace_all:nnN { \c{#1\{} } { \cB\{ \cE\? } #2
    \dot:
    \regex_replace_all:nnN { \c{#1\}} } { \cB\? \cE\} } #2
    \dot:
    % All dummy '?' tokens are then removed in one pass.
    \regex_replace_all:nnN { \c[BE]\? } { } #2
    \dot:
    \regex_replace_all:nnN { \c{#1\#} } { \cP\# } #2
    \dot:
  }

% \__find_allow_break: stands for \iow_allow_break: when the kernel
% provides that function, and for a plain space otherwise.
\cs_if_free:NTF \iow_allow_break:
  { \cs_new:Npn \__find_allow_break: { ~ } }
  { \cs_new:Npn \__find_allow_break: { \iow_allow_break: } }

% Error raised by \__find_set_prefix:n; #1 is the offending prefix.
\msg_new:nnn { find } { invalid-prefix }
  { The~prefix~used,~'#1',~must~be~made~of~letters. }

% \FindMatches {<prefix>} {<text>}
% User-level command: compress <text> with \find_matches:nnN, storing
% the result in \l_tmpa_tl, then insert that result into the input.
\cs_set_protected:Npn \FindMatches #1#2
  {
    \find_matches:nnN {#1} {#2} \l_tmpa_tl
    \tl_use:N \l_tmpa_tl
  }
\ExplSyntaxOff

% Set up some basic things to behave a bit like plain TeX:
% no paragraph indentation, and \bye ends the document.
%
\documentclass{article}
\begin{document}
\setlength{\parindent}{0pt}
\def\bye{\end{document}}
%
% End of setup.

\FindMatches{}{% Let's go!

On the first day of Christmas my true love gave to me\par
a partridge in a pear tree.

\bigskip

On the second day of Christmas my true love gave to me\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the third day of Christmas my true love gave to me\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the fourth day of Christmas my true love gave to me\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the fifth day of Christmas my true love gave to me\par
five gold rings\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the sixth day of Christmas my true love gave to me\par
six geese a laying\par
five gold rings\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the seventh day of Christmas my true love gave to me\par
seven swans a swimming\par
six geese a laying\par
five gold rings\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the eighth day of Christmas my true love gave to me\par
eight maids a milking\par
seven swans a swimming\par
six geese a laying\par
five gold rings\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the ninth day of Christmas my true love gave to me\par
nine ladies dancing\par
eight maids a milking\par
seven swans a swimming\par
six geese a laying\par
five gold rings\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the tenth day of Christmas my true love gave to me\par
ten lords a leaping\par
nine ladies dancing\par
eight maids a milking\par
seven swans a swimming\par
six geese a laying\par
five gold rings\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the eleventh day of Christmas my true love gave to me\par
eleven pipers piping\par
ten lords a leaping\par
nine ladies dancing\par
eight maids a milking\par
seven swans a swimming\par
six geese a laying\par
five gold rings\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the twelfth day of Christmas my true love gave to me\par
twelve drummers drumming\par
eleven pipers piping\par
ten lords a leaping\par
nine ladies dancing\par
eight maids a milking\par
seven swans a swimming\par
six geese a laying\par
five gold rings\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.
\bye
}

Answer

编辑：我之前根据 Ryan Reich 的评论实现了某个版本的 Lempel-Ziv-Welch 算法。现在我已经彻底重写了代码，但不知道新的算法是否有名字。

应 David Carlisle 的要求，我把《圣诞节的十二天》（The Twelve Days of Christmas）的歌词输入到新代码中，得到了以下结果（适用于 plain TeX）：

\long\def\A#1#2{\def#1{#2}\A}\A\AG{s\par\AC\A\AB}\AH{\AE\W\X\V\T\U m\V\AD}\AA
{\C\D\E\F\G}\AB{\H\I\J\A\par\bigskip\B}\AC{\O\P\par\M\N\K\L nd}\AD{\R\S\par\Q
\A ring}\AE{\Y es danc\V}\AF{t\Z a leap\V}\B{\par On the }\C{ day of C}\D
{hristmas }\E{my true l}\F{ove gave }\G{to me\par}\H{a partrid}\I{ge in a p}\J
{ear tree.}\K{two turtl}\L{e doves\par a}\M{three fre}\N{nch hens\par}\O{four
call}\P{ing birds}\Q{five gold}\R{six geese}\S{ a laying}\T{seven swa}\U{ns a
swim}\V{ing\par}\W{eight mai}\X{ds a milk}\Y{nine ladi}\Z{en lords } \A{ }\B
first\AA\AB second\AA\K\L nd \AB third\AA\M\N\K\L nd \AB fourth\AA\AC\A\AB
fifth\AA\Q\A ring\AG sixth\AA\AD\AG seventh\AA\T\U m\V\AD\AG eighth\AA\W\X\V\T
\U m\V\AD\AG ninth\AA\AH\AG tenth\AA\AF\AH\AG eleventh\AA eleven pipers pip\V
\AF\AH\AG twelfth\AA twelve drummers drumm\V eleven pipers pip\V\AF\AH s\par
\AC\A\H\I\J\A\bye

其长度为 878 字节。作为对比，歌词相同的 xii.tex 长度为 767 字节。

\RequirePackage[enable-debug]{expl3}
\ExplSyntaxOn
% Variants of kernel functions.
\cs_generate_variant:Nn \tl_replace_all:Nnn { Nxx }
\cs_generate_variant:Nn \tl_replace_all:Nnn { NV }
\cs_generate_variant:Nn \tl_replace_all:Nnn { Nxn }
\cs_generate_variant:Nn \tl_replace_once:Nnn { Nxx }
\cs_generate_variant:Nn \tl_if_in:NnT { No }
\cs_generate_variant:Nn \tl_if_eq:NNTF { cc }
\cs_generate_variant:Nn \tl_if_eq:nnTF { xx }
\cs_generate_variant:Nn \tl_if_eq:nnT { xx }
\cs_generate_variant:Nn \str_count:N { c }
\cs_generate_variant:Nn \regex_match:nnF { nV }
\cs_generate_variant:Nn \str_set:Nn { NV }

% The prefix shared by all generated macro names, and its length.
\str_new:N \l__find_prefix_str
\int_new:N \l__find_prefix_int

% Token lists: the text being compressed, its state before the current
% pass, the chunk list, the collected macro definitions, the rebuilt
% text, the printable form, and the best chunk at the current position.
\tl_new:N \l__find_tl
\tl_new:N \l__find_previous_tl
\tl_new:N \l__find_chunks_tl
\tl_new:N \l__find_macros_tl
\tl_new:N \l__find_result_tl
\tl_new:N \l__find_display_tl
\tl_new:N \l__find_best_chunk_tl

% Sequences: one per common-prefix length from 1 to 9, plus a general one.
\int_step_inline:nn { 9 }
  { \seq_new:c { l__find_chunks_#1_seq } }
\seq_new:N \l__find_chunks_seq

% Integer scratch registers and counters.
\int_new:N \l__find_common_int
\int_new:N \l__find_nesting_int
\int_new:N \l__find_best_score_int
\int_new:N \l__find_macro_int
\int_new:N \l__find_length_int
\int_new:N \l__find_previous_length_int

% Progress indicator: emit one dot through the \message primitive.
\cs_new_protected_nopar:Nn \dot:
  { \tex_message:D { . } }

% \find_matches:nnN {<prefix>} {<tokens>} <tl var>
% Entry point.  Runs the compression passes on <tokens> and stores, in
% <tl var>, the preamble from \__find_preamble: followed by the resulting
% token list.
\cs_new_protected:Npn \find_matches:nnN #1#2#3
  {
    % '#1' is the prefix, '#2' is the token list to study.
    %
    \__find_set_prefix:n {#1}
    % A leading space and a '!' marker are prepended; the '!' is turned
    % into a braced space once all passes are done (see below).
    \tl_set:Nn \l__find_tl { ~! #2 }
    % Every space token becomes the control sequence \<prefix>A.
    \__find_escape_spaces:xN { \l__find_prefix_str A } \l__find_tl
    % Macro number 1 (letter A) is reserved; new macros start at B.
    \int_set:Nn \l__find_macro_int { 1 }
    % Report the length of the uncompressed text ...
    \__find_get_length:V \l__find_prefix_str
    \iow_term:x { \int_use:N \l__find_length_int }
    % ... then forget it, so that the first pass always counts as shorter.
    \int_set_eq:NN \l__find_length_int \c_max_int
    \__find_matches_aux:V \l__find_prefix_str
    \tl_replace_once:Nnn \l__find_tl { ! } { { ~ } }
    \tl_set:Nx #3
      {
        \__find_preamble:
        \exp_not:V \l__find_tl
      }
  }
% \__find_escape_spaces:nN {<csname>} <tl var>
% Replace each space token of <tl var> by the control sequence whose
% name is <csname>, then report progress.
\cs_new_protected:Nn \__find_escape_spaces:nN
  {
    \regex_replace_all:nnN { \cS\  } { \c{#1} } #2
    \dot:
  }
\cs_generate_variant:Nn \__find_escape_spaces:nN { x }
% Expandable helper, meant for x-type expansion.  It leaves the tokens
%   \long\def\<prefix>A<param 1><param 2>{\def<param 1>{<param 2>}\<prefix>A}\<prefix>A
% that is, a macro which turns a following "<macro> {<text>}" pair into
% a \def and then calls itself again, followed by a first call to it.
% The hashes are written quadrupled here.
% NOTE(review): the number of hashes that survive depends on how the
% result is expanded; \__find_get_length:n collapses doubled hashes in
% its string copy -- confirm before changing.
\cs_new_nopar:Npn \__find_preamble:
  {
    \exp_not:n { \long \def } \exp_not:c { \l__find_prefix_str A } ####1####2
      {
        \exp_not:N \def ####1{####2}
        \exp_not:c { \l__find_prefix_str A }
      }
    \exp_not:c { \l__find_prefix_str A }
  }
% \__find_matches_aux:n {<prefix>}
% One compression pass over \l__find_tl.  The pass is repeated
% recursively as long as the wrapped output gets strictly shorter;
% otherwise the state from before the pass is restored.
\cs_new_protected:Npn \__find_matches_aux:n #1
  {
    % Save the current state so that it can be restored below.
    \int_set_eq:NN \l__find_previous_length_int \l__find_length_int
    \tl_set_eq:NN \l__find_previous_tl \l__find_tl
    \__find_escape_tl:nN {#1} \l__find_tl
    \int_compare:nNnTF { \tl_count:N \l__find_tl } < 9
      {
        % Fewer than nine tokens: the nine-token window cannot run, so
        % give up and restore the unescaped list.  This must be
        % \tl_set_eq:NN (both arguments are variables); there is no
        % \tl_set_eq:Nn.
        \tl_set_eq:NN \l__find_tl \l__find_previous_tl
      }
      {
        \__find_set_chunks:
        \__find_sort_chunks:
        \__find_common:
        \__find_best_macros:
        \__find_undefine_tmp:
        \tl_set_eq:NN \l__find_tl \l__find_result_tl
        \__find_unescape_tl:nN {#1} \l__find_tl
        % Measure the new output and print its length.
        \__find_get_length:n {#1}
        \iow_term:x { \int_use:N \l__find_length_int }
        \int_compare:nNnTF \l__find_length_int < \l__find_previous_length_int
          { \__find_matches_aux:n {#1} }
          {
            % No gain: show the last printable form and roll back.
            \iow_term:n { }
            \iow_term:x { \l__find_display_tl }
            \iow_term:n { }
            \tl_set_eq:NN \l__find_tl \l__find_previous_tl
          }
      }
  }
\cs_generate_variant:Nn \__find_matches_aux:n { V }

% \__find_get_length:n {<prefix>}
% Build a printable string version of \l__find_tl in
% \l__find_display_tl, wrap it with \iow_wrap:nnnN, and let
% \__find_get_length_aux:n record the wrapped text and its length.
\cs_new_protected:Npn \__find_get_length:n #1
  {
    \tl_set_eq:NN \l__find_display_tl \l__find_tl
    % Spell out the marker "\<prefix>A !" as " \<prefix>A { }".
    \tl_replace_once:Nxx \l__find_display_tl
      { \exp_not:c { #1 A } ! }
      { ~ \exp_not:c { #1 A } {~} }
    % From here on the display copy is manipulated as a string.
    \str_set:NV \l__find_display_tl \l__find_display_tl
    % The string "\<prefix>A " is replaced by the (unexpanded) token
    % \c_space_tl; a space followed by that token is then replaced by a
    % space followed by an unexpanded \exp_not:c construction.  Both are
    % expanded by the \str_set:Nx step below.
    \tl_replace_all:Nxn \l__find_display_tl
      { \c_backslash_str #1 \token_to_str:N A ~ } \c_space_tl
    \tl_replace_all:Nnn \l__find_display_tl
      { ~ \c_space_tl } { ~ \exp_not:c { #1 A } }
    \dot:
    % Prepend the preamble and convert everything to a string again.
    \str_set:Nx \l__find_display_tl { \__find_preamble: \l__find_display_tl }
    % Collapse doubled hash characters into single ones.
    \tl_replace_all:Nxx \l__find_display_tl
      { \c_hash_str \c_hash_str } { \c_hash_str }
    \dot:
    % A control word followed by a space and a letter gets a second space;
    % the next step then turns the space right after every control word
    % into \__find_allow_break:.  Presumably this keeps a real space only
    % where a letter follows -- TODO confirm.
    \regex_replace_all:nnN
      { (\\[A-Za-z]+) \ ([A-Za-z]) }
      { \1 \ \ \2 }
      \l__find_display_tl
    \dot:
    \regex_replace_all:nnN
      { (\\[A-Za-z]+) \ }
      { \1 \c{__find_allow_break:} }
      \l__find_display_tl
    \dot:
    \iow_wrap:nnnN { \l__find_display_tl } { } { } \__find_get_length_aux:n
  }
\cs_generate_variant:Nn \__find_get_length:n { V }
% Callback for \iow_wrap:nnnN: keep the wrapped text #1 in
% \l__find_display_tl and store its character count in
% \l__find_length_int.
\cs_new_protected:Nn \__find_get_length_aux:n
  {
    \tl_set:Nn \l__find_display_tl {#1}
    \int_set:Nn \l__find_length_int { \str_count:n {#1} }
  }



% \__find_set_prefix:n {<prefix>}
% Store the fully expanded <prefix> as a string in \l__find_prefix_str
% and its length in \l__find_prefix_int, then raise the error
% find/invalid-prefix unless it consists of ASCII letters only (the
% empty prefix is allowed).
\cs_new_protected:Npn \__find_set_prefix:n #1
  {
    \str_set:Nx \l__find_prefix_str {#1}
    \int_set:Nn \l__find_prefix_int { \str_count:N \l__find_prefix_str }
    % Use an explicit letter class: \w would also accept digits and
    % underscores, which cannot be part of a control word under the
    % usual category codes, so the generated macro names would not be
    % read back correctly.
    \regex_match:nVF { \A[A-Za-z]*\Z } \l__find_prefix_str
      {
        \msg_error:nnx { find } { invalid-prefix }
          { \l__find_prefix_str }
      }
    \dot:
  }

% \__find_escape_tl:nN {<prefix>} <tl var>
% Replace the begin-group, end-group and macro-parameter tokens of
% <tl var> by control sequences built from <prefix>.
\cs_new_protected:Npn \__find_escape_tl:nN #1#2
  {
    % During the 'study' step, we manipulate the token list |#2|
    % with all begin-group and end-group tokens replaced by a
    % control sequence built from the prefix.  We must change both
    % begin-group and end-group tokens in one pass, to avoid getting an
    % unbalanced result.  Also replace macro parameters because they
    % cannot be used as delimiters for macros.  Spaces have been
    % turned into a control sequence earlier.  At this stage, every
    % token in |#2| can be grabbed as an N-type argument.
    %
    % First give every begin-group token the character '{' and every
    % end-group token the character '}'.
    \regex_replace_all:nnN { \cB. } { \cB\{ } #2
    \dot:
    \regex_replace_all:nnN { \cE. } { \cE\} } #2
    \dot:
    % Macro parameters become the control sequence <prefix>#.
    \regex_replace_all:nnN { \cP. } { \c{ #1 \# } } #2
    \dot:
    % Single pass over the remaining special tokens: each becomes the
    % control sequence named <prefix> followed by the matched character.
    \regex_replace_all:nnN { \c[BEP]. } { \c{ #1 \0 } } #2
    \dot:
  }




\cs_new_protected_nopar:Npn \__find_set_chunks:
  {
    % Build in \l__find_chunks_tl a list of braced items obtained by
    % sliding a window of nine tokens over \l__find_tl; the last eight
    % items get shorter and shorter.  So for instance
    % |ABCDEFGHIJKL| would lead to the following \(12\) items:
    % |ABCDEFGHI|, |BCDEFGHIJ|, |CDEFGHIJK|, |DEFGHIJKL|, |EFGHIJKL|,
    % |FGHIJKL|, |GHIJKL|, |HIJKL|, |IJKL|, |JKL|, |KL|, |L|.  The items
    % of this token list are built in an |x|-expanded loop.
    % No special case for short input is handled here: the loop grabs
    % nine tokens at a time, and \__find_matches_aux:n only reaches this
    % step when \l__find_tl has at least nine tokens.
    %
    \tl_set:Nx \l__find_chunks_tl
      {
        \exp_after:wN \__find_set_chunks_loop:NNNNNNNNN \l__find_tl
          \q_recursion_tail \q_recursion_stop
      }
  }
% Expandable loop body used by \__find_set_chunks:.  #1 to #9 are the
% nine tokens currently in the window.
\cs_new:Npn \__find_set_chunks_loop:NNNNNNNNN #1#2#3#4#5#6#7#8#9
  {
    % As usual in a TeX loop, first check for the end-loop marker (here,
    % \cs{q_recursion_tail}).  If it is reached, we fill in the last few
    % chunks (which become shorter and shorter as we go).  Otherwise,
    % add (to the token list we are building) an item containing \(9\)
    % tokens, and loop, dropping the first of the items.
    %
    \quark_if_recursion_tail_stop_do:Nn #9
      { \__find_set_chunks_end:NNNNNNNN #1 #2 #3 #4 #5 #6 #7 #8 }
    { \exp_not:n { #1 #2 #3 #4 #5 #6 #7 #8 #9 } }
    % Only eight tokens are passed on: the ninth is taken from the input
    % that follows.
    \__find_set_chunks_loop:NNNNNNNNN #2#3#4#5#6#7#8#9
  }
% Final step of the chunk loop: from the last eight tokens, leave
% (protected against further expansion) the eight suffixes of
% decreasing length, each as a braced item.
\cs_new:Nn \__find_set_chunks_end:NNNNNNNN
  {
    \exp_not:n
      {
        { #1 #2 #3 #4 #5 #6 #7 #8 } { #2 #3 #4 #5 #6 #7 #8 }
        { #3 #4 #5 #6 #7 #8 }       { #4 #5 #6 #7 #8 }
        { #5 #6 #7 #8 }             { #6 #7 #8 }
        { #7 #8 }                   { #8 }
      }
  }
% Sort the items of \l__find_chunks_tl in increasing order of their
% string representation, as compared by the \strcmp primitive.
\cs_new_protected:Nn \__find_sort_chunks:
  {
    \tl_sort:Nn \l__find_chunks_tl
      {
        \int_compare:nTF
          {
            \tex_strcmp:D { \exp_not:n {##1} } { \exp_not:n {##2} } > 0
          }
          { \sort_return_swapped: }
          { \sort_return_same: }
      }
  }

% Fill the sequences l__find_chunks_<n>_seq from the sorted chunk list,
% then, for n from 4 to 9, keep a counter token list per distinct chunk:
% it is created empty on the first occurrence and gets one more 'i' on
% each later occurrence.
\cs_new_protected:Nn \__find_common:
  {
    \int_step_inline:nn { 9 }
      { \seq_clear:c { l__find_chunks_##1_seq } }
    \exp_after:wN \__find_common_loop:nn \l__find_chunks_tl
      \q_recursion_tail \q_recursion_tail \q_recursion_stop
    \int_step_inline:nnn { 4 } { 9 }
      {
        \seq_map_inline:cn { l__find_chunks_##1_seq }
          {
            \tl_if_exist:cTF { l__find_chunk_ ' \tl_to_str:n {####1} ' _tl }
              {
                \tl_put_right:cn
                  { l__find_chunk_ ' \tl_to_str:n {####1} ' _tl } { i }
              }
              {
                \cs_set_eq:cN
                  { l__find_chunk_ ' \tl_to_str:n {####1} ' _tl } \c_empty_tl
              }
          }
      }
  }

% Walk through the sorted chunk list two neighbours at a time.  #1 and
% #2 are consecutive items; the loop stops when #2 is the end marker.
\cs_new_protected:Npn \__find_common_loop:nn #1#2
  {
    \quark_if_recursion_tail_stop:n {#2}
    % Count how many leading tokens #1 and #2 share ...
    \int_zero:N \l__find_common_int
    \__find_count_common_aux:nn {#1} {#2}
    % ... and dispatch to the function for that count.  The trailing X's
    % pad the input so that the dispatched function always finds enough
    % arguments before \q_stop.
    \use:c { __find_common_ \int_use:N \l__find_common_int :w }
      #1 X X X X X X X X X \q_stop
    % Continue with #2 as the new first item.
    \__find_common_loop:nn {#2}
  }
% \__find_count_common_aux:nn {<tokens 1>} {<tokens 2>}
% As long as both arguments are non-empty and start with the same item,
% increment \l__find_common_int and recurse on the two tails.
\cs_new_protected:Nn \__find_count_common_aux:nn
  {
    \tl_if_empty:nF {#1}
      {
        \tl_if_empty:nF {#2}
          {
            \tl_if_eq:xxT { \tl_head:n {#1} } { \tl_head:n {#2} }
              {
                \int_incr:N \l__find_common_int
                \__find_count_common_aux:xx
                  { \tl_tail:n {#1} }
                  { \tl_tail:n {#2} }
              }
          }
      }
  }
\cs_generate_variant:Nn \__find_count_common_aux:nn { xx }

% Dispatch targets of \__find_common_loop:nn, one per number <n> of
% shared leading tokens.  For <n> = 0 everything up to \q_stop is
% discarded.  For <n> from 1 to 9 the first <n> tokens are grabbed and
% handed, together with <n>, to \__find_common_auxii:nnw; what remains
% before \q_stop is left for that function to absorb.
\cs_new_eq:cN { __find_common_0:w } \use_none_delimit_by_q_stop:w
\cs_new_protected:cpn { __find_common_1:w } #1
  { \__find_common_auxii:nnw { 1 } {#1} }
\cs_new_protected:cpn { __find_common_2:w } #1#2
  { \__find_common_auxii:nnw { 2 } { #1 #2 } }
\cs_new_protected:cpn { __find_common_3:w } #1#2#3
  { \__find_common_auxii:nnw { 3 } { #1 #2 #3 } }
\cs_new_protected:cpn { __find_common_4:w } #1#2#3#4
  { \__find_common_auxii:nnw { 4 } { #1 #2 #3 #4 } }
\cs_new_protected:cpn { __find_common_5:w } #1#2#3#4#5
  {
    % Only this length also prints a progress dot.
    \dot:
    \__find_common_auxii:nnw { 5 } { #1 #2 #3 #4 #5 }
  }
\cs_new_protected:cpn { __find_common_6:w } #1#2#3#4#5#6
  { \__find_common_auxii:nnw { 6 } { #1 #2 #3 #4 #5 #6 } }
\cs_new_protected:cpn { __find_common_7:w } #1#2#3#4#5#6#7
  { \__find_common_auxii:nnw { 7 } { #1 #2 #3 #4 #5 #6 #7 } }
\cs_new_protected:cpn { __find_common_8:w } #1#2#3#4#5#6#7#8
  { \__find_common_auxii:nnw { 8 } { #1 #2 #3 #4 #5 #6 #7 #8 } }
\cs_new_protected:cpn { __find_common_9:w } #1#2#3#4#5#6#7#8#9
  { \__find_common_auxii:nnw { 9 } { #1 #2 #3 #4 #5 #6 #7 #8 #9 } }
% \__find_common_auxii:nnw {<n>} {<chunk>} <junk> \q_stop
% Store <chunk> in l__find_chunks_<n>_seq, but only if its escaped
% braces are balanced: an escaped closing brace never comes before its
% opening brace, and the nesting level is back to zero at the end.
% <junk> (the padding left by the caller) is discarded.
\cs_new_protected:Npn \__find_common_auxii:nnw #1#2#3 \q_stop
  {
    \int_zero:N \l__find_nesting_int
    \tl_map_inline:nn {#2}
      {
        \str_case_e:nn { \exp_not:n {##1} }
          {
            { \exp_not:c { \l__find_prefix_str \c_left_brace_str } }
              { \int_incr:N \l__find_nesting_int }
            { \exp_not:c { \l__find_prefix_str \c_right_brace_str } }
              {
                % A closing brace at level zero: reject the chunk.  Leave
                % the mapping through \tl_map_break:n so that the map's
                % own clean-up code runs before the rest of this
                % function is skipped.
                \int_compare:nNnF \l__find_nesting_int > 0
                  { \tl_map_break:n { \use_none_delimit_by_q_stop:w } }
                \int_decr:N \l__find_nesting_int
              }
          }
      }
    % Unclosed opening braces: reject the chunk as well.
    \int_compare:nNnF \l__find_nesting_int = 0
      { \use_none_delimit_by_q_stop:w }
    \seq_put_right:cn { l__find_chunks_#1_seq } {#2}
    % The final \q_stop is the target of the early exits above; it is
    % removed here on the normal path.
    \use_none_delimit_by_q_stop:w
    \q_stop
  }

% Rebuild the text in \l__find_result_tl while collecting new macro
% definitions in \l__find_macros_tl, then put the definitions in front
% of the rebuilt text.
\cs_new_protected_nopar:Nn \__find_best_macros:
  {
    \tl_clear:N \l__find_result_tl
    \tl_clear:N \l__find_macros_tl
    \__find_best_macros_aux:
    \tl_put_left:NV \l__find_result_tl \l__find_macros_tl
  }
% Process the position at the front of \l__find_tl, then repeat until
% \l__find_tl is empty.  The nine \q_nil pad the input so that the
% nine-argument auxiliary always finds enough tokens; \q_stop ends what
% that auxiliary discards.
\cs_new_protected:Npn \__find_best_macros_aux:
  {
    \exp_after:wN \__find_best_macros_auxii:NNNNNNNNN \l__find_tl
      \q_nil \q_nil \q_nil \q_nil \q_nil \q_nil \q_nil \q_nil \q_nil \q_stop
    \tl_if_empty:NF \l__find_tl { \__find_best_macros_aux: }
  }
% #1 to #9 are the next nine tokens of \l__find_tl (possibly padded with
% \q_nil).  Pick the best-scoring prefix of these tokens, turn it into a
% new macro if its score is positive, then move one token from
% \l__find_tl to \l__find_result_tl.
\cs_new_protected:Npn \__find_best_macros_auxii:NNNNNNNNN #1#2#3#4#5#6#7#8#9
  {
    \int_zero:N \l__find_best_score_int
    \tl_clear:N \l__find_best_chunk_tl
    % Try the prefixes of length 1 to 9; only a score strictly greater
    % than the best so far (initially 0) is accepted.
    \tl_map_inline:nn
      {
        {#1} {#1#2} {#1#2#3} {#1#2#3#4} {#1#2#3#4#5} {#1#2#3#4#5#6}
        {#1#2#3#4#5#6#7} {#1#2#3#4#5#6#7#8} {#1#2#3#4#5#6#7#8#9}
      }
      {
        \int_compare:nNnT
          { \__find_score:n {##1} } > \l__find_best_score_int
          {
            \tl_set:Nn \l__find_best_chunk_tl {##1}
            \int_set:Nn \l__find_best_score_int
              { \__find_score:n {##1} }
          }
      }
    \tl_if_empty:NF \l__find_best_chunk_tl
      {
        % Allocate the next macro name, <prefix> followed by the macro
        % number written in upper-case letters, and record
        % "<macro> {<chunk>}" among the definitions.
        \int_incr:N \l__find_macro_int
        \tl_put_right:Nx \l__find_macros_tl
          {
            \exp_not:c
              { \l__find_prefix_str \int_to_Alph:n { \l__find_macro_int } }
            { \exp_not:V \l__find_best_chunk_tl }
          }
        % Replace every occurrence of the chunk in the remaining text by
        % the new macro.
        \use:x
          {
            \exp_not:n { \tl_replace_all:NVn \l__find_tl \l__find_best_chunk_tl }
              { \exp_not:c { \l__find_prefix_str \int_to_Alph:n { \l__find_macro_int } } }
          }
        \dot:
      }
    % Move the first token of the (possibly updated) text to the result.
    \tl_put_right:Nx \l__find_result_tl { \tl_head:N \l__find_tl }
    \tl_set:Nx \l__find_tl { \tl_tail:N \l__find_tl }
    % Discard the tokens still pending up to \q_stop.
    \use_none_delimit_by_q_stop:w
  }
% Expandable.  Leaves the score of the chunk #1 in the input stream:
%   - if no variable named l__find_chunk_'<#1 as a string>'_tl exists: -1;
%   - otherwise:  <item count of that variable>
%                   * ( <string length of #1> - 3 - \l__find_prefix_int )
%                 - 2 * \l__find_prefix_int - 9.
% NOTE(review): \l__find_prefix_int is presumably the prefix length p and the
% item count the number of occurrences of the chunk -- both are set outside
% this block, confirm there.
\cs_new:Npn \__find_score:n #1
  {
    % Turning #1 into a length (p+2) macro
    % (e.g. |\<prefix>A|) saves this number of chars.
    % Good if non-negative (the caller in
    % \__find_best_macros_auxii:NNNNNNNNN only accepts scores > 0).
    \cs_if_exist:cTF { l__find_chunk_ ' \tl_to_str:n {#1} ' _tl }
      {
        \int_eval:n
          {
            \tl_count:c { l__find_chunk_ ' \tl_to_str:n {#1} ' _tl }
            * ( \str_count:n {#1} - 3 - \l__find_prefix_int )
            - 2 * \l__find_prefix_int - 9
          }
      }
      { -1 }
  }

% Undefine every temporary variable l__find_chunk_'<chunk>'_tl whose <chunk>
% is stored in one of the sequences l__find_chunks_4_seq through
% l__find_chunks_9_seq.
\cs_new_protected:Npn \__find_undefine_tmp:
  { \int_step_function:nnnN { 4 } { 1 } { 9 } \__find_undefine_tmp_seq:n }
% #1: sequence index (4 to 9); visits every item of l__find_chunks_#1_seq.
\cs_new_protected:Npn \__find_undefine_tmp_seq:n #1
  { \seq_map_function:cN { l__find_chunks_#1_seq } \__find_undefine_tmp_chunk:n }
% #1: one chunk; undefines the control sequence named after its string form.
\cs_new_protected:Npn \__find_undefine_tmp_chunk:n #1
  { \cs_undefine:c { l__find_chunk_ ' \tl_to_str:n {#1} ' _tl } }

% Turn escaped braces and hashes in a token list back into real characters.
%   #1: the prefix, inserted verbatim into the regular expressions;
%   #2: the token list variable, modified in place.
% Control sequences named <prefix>{, <prefix>} and <prefix># are replaced by
% a begin-group {, an end-group } and a parameter-category #, respectively.
% NOTE(review): each brace is first inserted together with a dummy ? partner
% of the opposite group category, and the dummies are removed in a third
% pass -- presumably so that every single replacement is brace-balanced, as
% l3regex expects; confirm against the l3regex documentation.
% NOTE(review): \dot: is defined outside this block; presumably a progress
% indicator.
\cs_new_protected:Npn \__find_unescape_tl:nN #1#2
  {
    % \<prefix>{  ->  {  followed by a dummy end-group ?
    \regex_replace_all:nnN { \c{#1\{} } { \cB\{ \cE\? } #2
    \dot:
    % \<prefix>}  ->  dummy begin-group ?  followed by }
    \regex_replace_all:nnN { \c{#1\}} } { \cB\? \cE\} } #2
    \dot:
    % Drop all dummy ? characters of begin-group or end-group category.
    \regex_replace_all:nnN { \c[BE]\? } { } #2
    \dot:
    % \<prefix>#  ->  # with the parameter category code
    \regex_replace_all:nnN { \c{#1\#} } { \cP\# } #2
    \dot:
  }

% \__find_allow_break: expands to \iow_allow_break: when the kernel provides
% that function at definition time, and to a single space otherwise.
\cs_new:Npx \__find_allow_break:
  { \cs_if_exist:NTF \iow_allow_break: { \exp_not:N \iow_allow_break: } { ~ } }

% Message "invalid-prefix" of module "find"; #1 is the rejected prefix.
% (The code that issues this message is outside this block.)
\msg_new:nnn { find } { invalid-prefix }
  { The~prefix~used,~'#1',~must~be~made~of~letters. }

% User-level command \FindMatches{<arg 1>}{<arg 2>}.
% Passes both arguments to \find_matches:nnN with \l_tmpa_tl as the third
% argument, then inserts the contents of \l_tmpa_tl into the input stream.
% NOTE(review): #1 is presumably the macro-name prefix and #2 the text to
% process, with the result stored in \l_tmpa_tl -- \find_matches:nnN is
% defined outside this block, confirm there.
\cs_set_protected:Npn \FindMatches #1#2
  {
    \find_matches:nnN {#1} {#2} \l_tmpa_tl
    \tl_use:N \l_tmpa_tl
  }
\ExplSyntaxOff

% Minimal LaTeX scaffolding so that the text below can be written as if it
% were plain TeX: paragraphs are not indented and \bye ends the document.
%
\documentclass{article}
\begin{document}
\newcommand*{\bye}{\end{document}}
\setlength{\parindent}{0pt}
%
% End of setup.

% Process the complete lyrics with \FindMatches, leaving the first argument
% (presumably the macro-name prefix) empty.  The final \bye is part of the
% processed text; it is defined above as \end{document}.
\FindMatches{}{% Let's go!

On the first day of Christmas my true love gave to me\par
a partridge in a pear tree.

\bigskip

On the second day of Christmas my true love gave to me\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the third day of Christmas my true love gave to me\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the fourth day of Christmas my true love gave to me\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the fifth day of Christmas my true love gave to me\par
five gold rings\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the sixth day of Christmas my true love gave to me\par
six geese a laying\par
five gold rings\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the seventh day of Christmas my true love gave to me\par
seven swans a swimming\par
six geese a laying\par
five gold rings\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the eighth day of Christmas my true love gave to me\par
eight maids a milking\par
seven swans a swimming\par
six geese a laying\par
five gold rings\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the ninth day of Christmas my true love gave to me\par
nine ladies dancing\par
eight maids a milking\par
seven swans a swimming\par
six geese a laying\par
five gold rings\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the tenth day of Christmas my true love gave to me\par
ten lords a leaping\par
nine ladies dancing\par
eight maids a milking\par
seven swans a swimming\par
six geese a laying\par
five gold rings\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the eleventh day of Christmas my true love gave to me\par
eleven pipers piping\par
ten lords a leaping\par
nine ladies dancing\par
eight maids a milking\par
seven swans a swimming\par
six geese a laying\par
five gold rings\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.

\bigskip

On the twelfth day of Christmas my true love gave to me\par
twelve drummers drumming\par
eleven pipers piping\par
ten lords a leaping\par
nine ladies dancing\par
eight maids a milking\par
seven swans a swimming\par
six geese a laying\par
five gold rings\par
four calling birds\par
three french hens\par
two turtle doves\par
and a partridge in a pear tree.
\bye
}

Question 2

我提供的只是一个彻头彻尾的拼凑方案，借用了我的 titlecaps 包中的一些代码。因此它的形式并不理想，但或许可以作为进一步改进的起点。好处是，在目标文本中搜索时，标点符号会被过滤掉。

当把搜索字符串和目标文本（不超过一个段落）传递给 \seekphrase 时，它会输出搜索短语中每个单词在目标文本中出现的位置。我将其输出为“目标位置:搜索词索引”这样的数对。为了便于理解，我在搜索词索引=1 的匹配之前输出一个 [，在搜索词索引=n 的匹配之后输出一个 ]。因此，要算作匹配了整个“短语”，例如在 3 个词的搜索中，输出必须包含类似 [214:1 215:2 216:3] 的片段。

显然这还可以进一步完善，但那需要的时间超出了我目前所能投入的。

\documentclass{article}
\usepackage{titlecaps}
\usepackage{lipsum}
\makeatletter
% Redefinition of \seek@lcwords (no arguments).
% Outer loop: word@count runs from 1 to \narg over the macros \arg<roman>.
% Inner loop: lcword@index runs from 1 to lc@words over \lcword<roman>.
% When the two current words compare equal, \found@word<roman of word@count>
% is set to T and "<word@count>:<lcword@index>" is typeset, preceded by "["
% if lcword@index is 1 and followed by "]" if lcword@index equals lc@words
% (otherwise by a space); lcword@index is then set to lc@words, which ends
% the inner loop.
% NOTE(review): \arg<roman>/\narg presumably hold the words of the target
% text (set by \getargsC) and \lcword<roman>/lc@words the words of the search
% phrase (set by \Addlcwords); \kill@punct/\restore@punct presumably disable
% and restore punctuation -- all defined outside this block, confirm in
% titlecaps.
\renewcommand\seek@lcwords{%
\kill@punct%
  \setcounter{word@count}{0}%
  \whiledo{\value{word@count} < \narg}{%
    \addtocounter{word@count}{1}%
\protected@edef\current@word{\csname arg\roman{word@count}\endcsname}%
    \def\found@word{F}%
    \setcounter{lcword@index}{0}%
    \expandafter\def\csname%
            found@word\roman{word@count}\endcsname{F}%
    \whiledo{\value{lcword@index} < \value{lc@words}}{%
      \addtocounter{lcword@index}{1}%
      \protected@edef\current@lcword{%
            \csname lcword\roman{lcword@index}\endcsname}%
%% THE FOLLOWING THREE LINES ARE FROM DAVID CARLISLE
%% \tmp first expands to \scantokens{\def\tmp{\ifthenelse{\equal{..}{..}}}},
%% so the first \tmp re-reads both words and redefines \tmp; \unskip removes
%% a space left behind in horizontal mode; the second \tmp is the
%% \ifthenelse test, which takes the two brace groups below as its branches.
  \protected@edef\tmp{\noexpand\scantokens{\def\noexpand\tmp%
   {\noexpand\ifthenelse{\noexpand\equal{\current@word}{\current@lcword}}}}}%
  \tmp\ifhmode\unskip\fi\tmp
%%
      {\expandafter\def\csname%
            found@word\roman{word@count}\endcsname{T}%
\ifthenelse{\equal{\value{lcword@index}}{1}}{[}{}%
\arabic{word@count}:\arabic{lcword@index}%
\ifthenelse{\equal{\value{lcword@index}}{\value{lc@words}}}{]}{ }%
       \setcounter{lcword@index}{\value{lc@words}}}%
      {}%
    }%
  }%
\restore@punct%
}
% Make \get@argsC available under the @-free name \getargsC.
\let\getargsC\get@argsC

% \seekphrase{<search phrase>}{<target text>}
% Typesets the heading "Seeking phrase ``#1'':" and a line break, passes #1
% to \Addlcwords and #2 to \getargsC, runs \seek@lcwords to typeset the
% matches, then calls \Resetlcwords and ends the paragraph.
% NOTE(review): \Addlcwords, \Resetlcwords, \get@argsC and \redefine@tertius
% are not defined in this file; presumably they come from titlecaps.
\newcommand\seekphrase[2]{%
  Seeking phrase ``#1'':\\%
  \Addlcwords{#1}%
  \redefine@tertius%
  \getargsC{#2}%
  \seek@lcwords%
  \Resetlcwords%
  \par%
}
\makeatother

\begin{document}

% Typeset paragraph 1 of lipsum twice.
\lipsum[1]
\lipsum[1]

% \x is the target text handed to \seekphrase below: one paragraph made of
% the same Lorem ipsum text written out twice (apparently the text of
% \lipsum[1]).
\def\x{%
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Ut
purus elit, vestibulum ut, placerat ac, adipiscing vitae, felis.
Curabitur dictum gravida mauris. Nam arcu libero, nonummy eget,
consectetuer id, vulputate a, magna.  Donec vehicula augue eu neque.
Pellentesque habitant morbi tristique senectus et netus et malesuada
fames ac turpis egestas. Mauris ut leo. Cras viverra metus rhoncus sem.
Nulla et lectus vestibulum urna fringilla ultrices.  Phasellus eu tellus
sit amet tortor gravida placerat. Integer sapien est, iaculis in,
pretium quis, viverra ac, nunc. Praesent eget sem vel leo ultrices
bibendum.  Aenean faucibus.  Morbi dolor nulla, malesuada eu, pulvinar
at, mollis ac, nulla. Curabitur auctor semper nulla. Donec varius orci
eget risus. Duis nibh mi, congue eu, accumsan eleifend, sagittis quis,
diam. Duis eget orci sit amet orci dignissim rutrum.
%
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Ut
purus elit, vestibulum ut, placerat ac, adipiscing vitae, felis.
Curabitur dictum gravida mauris. Nam arcu libero, nonummy eget,
consectetuer id, vulputate a, magna.  Donec vehicula augue eu neque.
Pellentesque habitant morbi tristique senectus et netus et malesuada
fames ac turpis egestas. Mauris ut leo. Cras viverra metus rhoncus sem.
Nulla et lectus vestibulum urna fringilla ultrices.  Phasellus eu tellus
sit amet tortor gravida placerat. Integer sapien est, iaculis in,
pretium quis, viverra ac, nunc. Praesent eget sem vel leo ultrices
bibendum.  Aenean faucibus.  Morbi dolor nulla, malesuada eu, pulvinar
at, mollis ac, nulla. Curabitur auctor semper nulla. Donec varius orci
eget risus. Duis nibh mi, congue eu, accumsan eleifend, sagittis quis,
diam. Duis eget orci sit amet orci dignissim rutrum.
}

% Example searches in \x: one one-word, two two-word and one three-word
% phrase.
\seekphrase{et}{\x}

\seekphrase{et netus}{\x}

\seekphrase{bibendum Aenean}{\x}

\seekphrase{eget sem vel}{\x}

\end{document}

在此处输入图片描述

Answer

我提供的只是一个彻头彻尾的拼凑方案，借用了我的 titlecaps 包中的一些代码。因此它的形式并不理想，但或许可以作为进一步改进的起点。好处是，在目标文本中搜索时，标点符号会被过滤掉。

当把搜索字符串和目标文本（不超过一个段落）传递给 \seekphrase 时，它会输出搜索短语中每个单词在目标文本中出现的位置。我将其输出为“目标位置:搜索词索引”这样的数对。为了便于理解，我在搜索词索引=1 的匹配之前输出一个 [，在搜索词索引=n 的匹配之后输出一个 ]。因此，要算作匹配了整个“短语”，例如在 3 个词的搜索中，输出必须包含类似 [214:1 215:2 216:3] 的片段。

显然这还可以进一步完善，但那需要的时间超出了我目前所能投入的。

\documentclass{article}
\usepackage{titlecaps}
\usepackage{lipsum}
\makeatletter
% Redefinition of \seek@lcwords (no arguments).
% Outer loop: word@count runs from 1 to \narg over the macros \arg<roman>.
% Inner loop: lcword@index runs from 1 to lc@words over \lcword<roman>.
% When the two current words compare equal, \found@word<roman of word@count>
% is set to T and "<word@count>:<lcword@index>" is typeset, preceded by "["
% if lcword@index is 1 and followed by "]" if lcword@index equals lc@words
% (otherwise by a space); lcword@index is then set to lc@words, which ends
% the inner loop.
% NOTE(review): \arg<roman>/\narg presumably hold the words of the target
% text (set by \getargsC) and \lcword<roman>/lc@words the words of the search
% phrase (set by \Addlcwords); \kill@punct/\restore@punct presumably disable
% and restore punctuation -- all defined outside this block, confirm in
% titlecaps.
\renewcommand\seek@lcwords{%
\kill@punct%
  \setcounter{word@count}{0}%
  \whiledo{\value{word@count} < \narg}{%
    \addtocounter{word@count}{1}%
\protected@edef\current@word{\csname arg\roman{word@count}\endcsname}%
    \def\found@word{F}%
    \setcounter{lcword@index}{0}%
    \expandafter\def\csname%
            found@word\roman{word@count}\endcsname{F}%
    \whiledo{\value{lcword@index} < \value{lc@words}}{%
      \addtocounter{lcword@index}{1}%
      \protected@edef\current@lcword{%
            \csname lcword\roman{lcword@index}\endcsname}%
%% THE FOLLOWING THREE LINES ARE FROM DAVID CARLISLE
%% \tmp first expands to \scantokens{\def\tmp{\ifthenelse{\equal{..}{..}}}},
%% so the first \tmp re-reads both words and redefines \tmp; \unskip removes
%% a space left behind in horizontal mode; the second \tmp is the
%% \ifthenelse test, which takes the two brace groups below as its branches.
  \protected@edef\tmp{\noexpand\scantokens{\def\noexpand\tmp%
   {\noexpand\ifthenelse{\noexpand\equal{\current@word}{\current@lcword}}}}}%
  \tmp\ifhmode\unskip\fi\tmp
%%
      {\expandafter\def\csname%
            found@word\roman{word@count}\endcsname{T}%
\ifthenelse{\equal{\value{lcword@index}}{1}}{[}{}%
\arabic{word@count}:\arabic{lcword@index}%
\ifthenelse{\equal{\value{lcword@index}}{\value{lc@words}}}{]}{ }%
       \setcounter{lcword@index}{\value{lc@words}}}%
      {}%
    }%
  }%
\restore@punct%
}
% Make \get@argsC available under the @-free name \getargsC.
\let\getargsC\get@argsC

% \seekphrase{<search phrase>}{<target text>}
% Typesets the heading "Seeking phrase ``#1'':" and a line break, passes #1
% to \Addlcwords and #2 to \getargsC, runs \seek@lcwords to typeset the
% matches, then calls \Resetlcwords and ends the paragraph.
% NOTE(review): \Addlcwords, \Resetlcwords, \get@argsC and \redefine@tertius
% are not defined in this file; presumably they come from titlecaps.
\newcommand\seekphrase[2]{%
  Seeking phrase ``#1'':\\%
  \Addlcwords{#1}%
  \redefine@tertius%
  \getargsC{#2}%
  \seek@lcwords%
  \Resetlcwords%
  \par%
}
\makeatother

\begin{document}

% Typeset paragraph 1 of lipsum twice.
\lipsum[1]
\lipsum[1]

% \x is the target text handed to \seekphrase below: one paragraph made of
% the same Lorem ipsum text written out twice (apparently the text of
% \lipsum[1]).
\def\x{%
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Ut
purus elit, vestibulum ut, placerat ac, adipiscing vitae, felis.
Curabitur dictum gravida mauris. Nam arcu libero, nonummy eget,
consectetuer id, vulputate a, magna.  Donec vehicula augue eu neque.
Pellentesque habitant morbi tristique senectus et netus et malesuada
fames ac turpis egestas. Mauris ut leo. Cras viverra metus rhoncus sem.
Nulla et lectus vestibulum urna fringilla ultrices.  Phasellus eu tellus
sit amet tortor gravida placerat. Integer sapien est, iaculis in,
pretium quis, viverra ac, nunc. Praesent eget sem vel leo ultrices
bibendum.  Aenean faucibus.  Morbi dolor nulla, malesuada eu, pulvinar
at, mollis ac, nulla. Curabitur auctor semper nulla. Donec varius orci
eget risus. Duis nibh mi, congue eu, accumsan eleifend, sagittis quis,
diam. Duis eget orci sit amet orci dignissim rutrum.
%
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Ut
purus elit, vestibulum ut, placerat ac, adipiscing vitae, felis.
Curabitur dictum gravida mauris. Nam arcu libero, nonummy eget,
consectetuer id, vulputate a, magna.  Donec vehicula augue eu neque.
Pellentesque habitant morbi tristique senectus et netus et malesuada
fames ac turpis egestas. Mauris ut leo. Cras viverra metus rhoncus sem.
Nulla et lectus vestibulum urna fringilla ultrices.  Phasellus eu tellus
sit amet tortor gravida placerat. Integer sapien est, iaculis in,
pretium quis, viverra ac, nunc. Praesent eget sem vel leo ultrices
bibendum.  Aenean faucibus.  Morbi dolor nulla, malesuada eu, pulvinar
at, mollis ac, nulla. Curabitur auctor semper nulla. Donec varius orci
eget risus. Duis nibh mi, congue eu, accumsan eleifend, sagittis quis,
diam. Duis eget orci sit amet orci dignissim rutrum.
}

% Example searches in \x: one one-word, two two-word and one three-word
% phrase.
\seekphrase{et}{\x}

\seekphrase{et netus}{\x}

\seekphrase{bibendum Aenean}{\x}

\seekphrase{eget sem vel}{\x}

\end{document}

在此处输入图片描述

搜索文本中最常出现的模式（用宏替换它们）

答案1

答案2

相关内容