支持将标点符号放入宏中进行字符计数

2024-5-21 • tag-icon

关于这个宏的问题：

通过在最终字符计数中添加一些标点符号（例如，.;:?!'()），是否可以提高此宏的精度？

\documentclass{scrreprt}

\usepackage{classicthesis}

        \RequirePackage{xparse}
        \RequirePackage{siunitx}
        \RequirePackage{environ}
        \RequirePackage{booktabs}

\ExplSyntaxOn

% ---- Variable declarations ------------------------------------------------
% Flag set to true at the end of the first processed text block.
\bool_new:N \g_has_run_bool
% Text of the block currently being analysed and its item count.
\tl_new:N \l_aw_text_tl
\int_new:N \l_aw_tot_int
% Running totals: all counted characters (spaces included) and spaces alone.
\int_new:N \g_aw_tot_alph_int
\int_new:N \g_space_int
% Ratio and width of the space. They are declared here under the names (and
% with the fp type) that the code below actually assigns to; the earlier
% declarations \g_rat_space_int / \g_wid_space_int were never used.
\fp_new:N \g_rat_space_fp
\fp_new:N \g_wid_space_fp
\fp_new:N \g_aw_avg_width_fp
\dim_new:N \myalphabetwidth
\dim_new:N \mytextwidth
\makeatletter
% Read the value of \mytextwidth written to the .aux file by a previous run.
% On the very first run there is no .aux file yet, so only read it if present
% instead of stopping with a "file not found" error.
\InputIfFileExists{\jobname.aux}{}{}
% The characters whose frequencies are counted.
\tl_const:Nn \c_aw_the_alphabet_tl {abcdefghijklmnopqrstuvwxyz}

% Document-level opening marker: it hands everything up to \avgwidthend over
% to \aw_avg_width:w. It can be renamed, or replaced by the environment below.
\NewDocumentCommand \avgwidthstart { } { \aw_avg_width:w }

% Closing marker. Its definition is empty; it serves as the delimiter of the
% argument grabbed by \aw_avg_width:w.
\NewDocumentCommand \avgwidthend { } { }

% Environment version of the same interface (calling it just "text" would
% probably be a bad idea). \BODY is expanded once before \avgwidthstart runs,
% so that the collected environment body sits in front of \avgwidthend.
\NewEnviron {avgtext}
  { \exp_after:wN \avgwidthstart \BODY \avgwidthend }

% \aw_avg_width:w <text> \avgwidthend
%
% Analyses <text>: counts every character of \c_aw_the_alphabet_tl and the
% spaces, updates the global frequency ratios and the weighted average
% character width, sets \myalphabetwidth and \mytextwidth, writes
% \mytextwidth to the main .aux file and finally typesets <text>.
% Protected because it performs assignments and must never be expanded
% inside an x/e-type argument.
\cs_new_protected:Npn \aw_avg_width:w #1 \avgwidthend
  {
    % if first run, then generate the per-character variables to be used
    \bool_if:NF \g_has_run_bool
      {
        \tl_map_inline:Nn \c_aw_the_alphabet_tl
          {
            \int_new:c {g_##1_int}
            \fp_new:c {g_rat_##1_fp}
            \fp_new:c {g_wid_##1_fp}
          }
      }
    \tl_set:Nn \l_aw_text_tl {#1}

    % this can be used rather than the preceding line to take capital
    % letters into account, but is Slooooooow
    %\tl_set:Nx \l_aw_text_tl {\tl_expandable_lowercase:n {#1}}

    \int_set:Nn \l_aw_tot_int {\tl_count:N \l_aw_text_tl}

    % \l_tmpa_fp is a shared scratch variable used below as the accumulator
    % for the weighted average width; make sure it starts from zero even if
    % other code has touched it in the meantime.
    \fp_zero:N \l_tmpa_fp
    \tl_map_function:NN \c_aw_the_alphabet_tl \aw_get_counts:n
    \deal_with_spaces:n {#1}
    \tl_map_function:NN \c_aw_the_alphabet_tl \aw_calc_ratios:n
    \tl_map_function:NN \c_aw_the_alphabet_tl \aw_calc_avg_width:n
    \fp_gset_eq:NN \g_aw_avg_width_fp \l_tmpa_fp
    \fp_zero:N \l_tmpa_fp

    % the dimension \myalphabetwidth gives the width of the alphabet based on
    % your character freq, can be accessed by \the\myalphabetwidth.
    % The factor is the number of characters in \c_aw_the_alphabet_tl (26 for
    % the default list), so it stays correct when characters are added there.
    \dim_gset:Nn \myalphabetwidth
      { \fp_to_dim:n { \tl_count:N \c_aw_the_alphabet_tl * \g_aw_avg_width_fp } }

    % the dimension \mytextwidth gives the recommended \textwidth based on
    % 66 chars per line. can be accessed by \the\mytextwidth
    \dim_gset:Nn \mytextwidth { \fp_to_dim:n { 66 * \g_aw_avg_width_fp } }
    % store the value in the .aux file so that it is available on the next run
    \protected@write\@mainaux{}{\mytextwidth=\the\mytextwidth}
    \bool_gset_true:N \g_has_run_bool

    % and lastly print the content
    #1
  }

\makeatother

% \aw_get_counts:n {<char>}
%
% Counts the occurrences of <char> in \l_aw_text_tl by removing them from a
% copy and comparing item counts, then adds the result to the global counter
% of that character and to the global total. Expects \l_aw_tot_int to hold
% the item count of \l_aw_text_tl. Protected because it performs assignments.
\cs_new_protected:Npn \aw_get_counts:n #1
  {
    % make a temporary token list from the document body
    \tl_set_eq:NN \l_tmpb_tl \l_aw_text_tl
    % remove all occurrences of the character
    \tl_remove_all:Nn \l_tmpb_tl {#1}
    % occurrences in the current block = items before - items after removal
    \int_set:Nn \l_tmpa_int { \l_aw_tot_int - \tl_count:N \l_tmpb_tl }
    % add the occurrences to the counter of that character
    \int_gadd:cn {g_#1_int} { \l_tmpa_int }
    % add this to the total
    \int_gadd:Nn \g_aw_tot_alph_int { \l_tmpa_int }
  }

% \deal_with_spaces:n {<text>}
%
% Counts the spaces in <text>, adds them to the global space counter and to
% the global character total, updates the global space ratio and space width,
% and adds the weighted space width to the accumulator \l_tmpa_fp.
% Protected because it performs assignments.
\cs_new_protected:Npn \deal_with_spaces:n #1
  {
    \tl_set:Nn \l_tmpa_tl {#1}
    % rescan body with spaces as characters
    \tl_set_rescan:Nnn \l_tmpb_tl {\char_set_catcode_letter:N \ }{#1}
    % find number of new characters introduced.  add to number of spaces and alph chars
    \int_set:Nn \l_tmpa_int {\tl_count:N \l_tmpb_tl -\tl_count:N \l_tmpa_tl}
    \int_gadd:Nn \g_space_int {\l_tmpa_int}
    \int_gadd:Nn \g_aw_tot_alph_int {\l_tmpa_int}
    % since this comes after the rest of chars are dealt with, tot_alph is final total.
    % The ratio is a global variable read later by the table code, so it is
    % assigned globally; a local assignment would be lost at the end of the
    % group when this runs inside the avgtext environment.
    \fp_gset:Nn \g_rat_space_fp {\g_space_int/\g_aw_tot_alph_int}
    % get width of space and use it.  obviously space is stretchable, so i'll assume
    % that the expansions and contractions cancel one another over large text.  is this
    % a terrible assumption???
    \hbox_set:Nn \l_tmpa_box {\ }
    \fp_gset:Nn \g_wid_space_fp {\dim_to_fp:n {\box_wd:N \l_tmpa_box}}
    \fp_add:Nn \l_tmpa_fp {\g_wid_space_fp*\g_rat_space_fp}
  }

% \aw_calc_ratios:n {<char>}
%
% Stores in g_rat_<char>_fp the number of occurrences of <char> divided by
% \g_aw_tot_alph_int, the running total of counted characters.
% Protected because it performs an assignment.
\cs_new_protected:Npn \aw_calc_ratios:n #1
  {
    % divide number of occurrences of char by the total of counted chars
    \fp_gset:cn {g_rat_#1_fp}{{\int_use:c {g_#1_int}}/\g_aw_tot_alph_int}
  }

% \aw_calc_avg_width:n {<char>}
%
% Adds the contribution of <char> (its width times its frequency ratio) to
% the accumulator \l_tmpa_fp. The width of <char> is measured and stored in
% g_wid_<char>_fp only while \g_has_run_bool is still false, i.e. on the
% first run. Protected because it performs assignments.
\cs_new_protected:Npn \aw_calc_avg_width:n #1
  {
    % only need to find char widths once
    \bool_if:NF \g_has_run_bool
      {
        % find width of char box
        \hbox_set:Nn \l_tmpa_box {#1}
        \fp_gset:cn {g_wid_#1_fp}{\dim_to_fp:n {\box_wd:N \l_tmpa_box}}
      }
    % multiply it by char frequency and add to avg width
    \fp_add:Nn \l_tmpa_fp {{\fp_use:c {g_wid_#1_fp}}*{\fp_use:c {g_rat_#1_fp}}}
  }
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Optional extras: everything from here on only serves the statistics table.
% Delete it, together with the \showtable command in the document, if the
% table is not wanted.

% Storage for the generated table rows and for the two sequences that are
% walked in parallel when the rows are built.
\tl_new:N \l_aw_tab_rows_tl
\seq_new:N \g_aw_the_alphabet_seq
\seq_new:N \g_aw_theor_rats_seq

% Width of the lowercase alphabet set in the normal font.
\newlength{\alphabet}
\settowidth{\alphabet}{\normalfont abcdefghijklmnopqrstuvwxyz}

% Theoretical frequency ratios, one per entry; they are paired in order with
% the characters of the alphabet when the table rows are generated.
\seq_gset_from_clist:Nn \g_aw_theor_rats_seq
  {
    0.0651738,0.0124248,0.0217339,0.0349835,0.1041442,0.0197881,0.0158610,
    0.0492888,0.0558094,0.0009033,0.0050529,0.0331490,0.0202124,0.0564513,
    0.0596302,0.0137645,0.0008606,0.0497563,0.0515760,0.0729357,0.0225134,
    0.0082903,0.0171272,0.0013692,0.0145984,0.0007836,0.1918182
  }

% \showtable: document-level command that starts a new page and prints the
% statistics tables.
\NewDocumentCommand {\showtable}{}
    {
      \clearpage
      \aw_make_table:
    }

\cs_generate_variant:Nn \seq_set_split:Nnn {NnV}
\cs_generate_variant:Nn \seq_gset_split:Nnn {NnV}

% \aw_make_table:
%
% Typesets a summary table (text width, average character width, alphabet
% widths), the total character count with the "total line type" formula, and
% a table comparing actual with theoretical frequencies.
% Protected because it performs assignments.
\cs_new_protected:Npn \aw_make_table:
    {
      \thispagestyle{empty}
      % split the alphabet into single characters; the sequence is a global
      % variable, hence the global assignment
      \seq_gset_split:NnV \g_aw_the_alphabet_seq {} \c_aw_the_alphabet_tl
      % start from an empty row list so that calling \showtable more than once
      % does not repeat the rows of the previous call
      \tl_clear:N \l_aw_tab_rows_tl
      %takes corresponding letter/theoretical ratio pairs from sequences and applies function
      \seq_mapthread_function:NNN \g_aw_the_alphabet_seq \g_aw_theor_rats_seq \aw_generate_row:nn
      \begin{table}[h]
      \centering
      \sisetup{round-mode = places,round-precision = 5,output-decimal-marker={,},table-format = 3.5}
      \begin{tabular}{llll}
        \toprule
        {Average\,\texttt{\textbackslash textwidth}}&{Average\,character\,width}&{Average\,alphabet\,width}&{Alphabet\,width}\\
        \midrule
        \the\mytextwidth&\fp_eval:n {round(\g_aw_avg_width_fp,5)}pt&\the\myalphabetwidth&\the\alphabet\\
        \bottomrule
      \end{tabular}\par
      \end{table}
      \vfil
      \begin{center}
        Total\,characters\,=\,\fp_eval:n {\g_aw_tot_alph_int}
      \end{center}
      \[%
        \mathrm{Total\,line\,type}=\frac{\fp_eval:n {\g_aw_tot_alph_int}\cdot \fp_eval:n {round(\g_aw_avg_width_fp,5)}\mathrm{pt}}{\fp_eval:n {\g_aw_tot_alph_int*{round(\g_aw_avg_width_fp,5)}/({\g_aw_tot_alph_int}/66)}\mathrm{pt}}=\fp_eval:n {\g_aw_tot_alph_int/66}
      \]
      \vfil
      \begin{table}[h]
      \centering
      \sisetup{round-mode = places,round-precision = 5,output-decimal-marker={,},table-format = 3.5}
      \begin{tabular}{cSSS}
        \toprule
        {Letter}&{Actual}&{Theoretical}&{Difference}\\
        \midrule
        spaces&\fp_eval:n {\g_rat_space_fp*100}\%&19.18182\%&\fp_eval:n {{\g_rat_space_fp*100-19.18182}}\%\\
        \tl_use:N \l_aw_tab_rows_tl
        \bottomrule
      \end{tabular}\par
      \end{table}
    }

% \aw_generate_row:nn {<char>} {<theoretical ratio>}
%
% Appends one table row to \l_aw_tab_rows_tl with four cells: the character,
% its actual ratio in percent, the theoretical ratio in percent and the
% difference between the two. The cells built from g_rat_<char>_fp are
% expanded now (x-type), the purely theoretical cell is evaluated when the
% row is typeset. Protected because it performs assignments.
\cs_new_protected:Npn \aw_generate_row:nn #1#2
    {
      % cell 1: the character itself
      \tl_put_right:Nn \l_aw_tab_rows_tl {#1&}
      % cell 2: actual frequency in percent
      \tl_put_right:Nx \l_aw_tab_rows_tl {\fp_eval:n {100*{\fp_use:c {g_rat_#1_fp}}}\%&}
      % cell 3: theoretical frequency in percent
      \tl_put_right:Nn \l_aw_tab_rows_tl {\fp_eval:n{100*{#2}}\%&}
      % cell 4: actual minus theoretical, in percentage points
      \tl_put_right:Nx \l_aw_tab_rows_tl {\fp_eval:n {{\fp_use:c {g_rat_#1_fp}*100-\fp_eval:n {#2}*100}}\%}
      % end of the row
      \tl_put_right:Nn \l_aw_tab_rows_tl {\\}
    }

\ExplSyntaxOff

\begin{document}

% Sample text: everything between \avgwidthstart and \avgwidthend is analysed
% (character and space frequencies, average width) and then typeset.
\avgwidthstart
lorem ipsum dolor sit amet consectetur adipiscing elit aenean faucibus luctus diam id convallis mauris faucibus ut aenean accumsan dignissim posuere praesent et diam nec est lobortis faucibus fusce sit amet placerat velit curabitur tortor velit imperdiet imperdiet condimentum in blandit ac augue in vulputate volutpat ligula malesuada porta mi eleifend at praesent ut augue vel nulla molestie ornare ac a sem donec luctus volutpat ipsum sed consectetur maecenas at lacus a tortor congue blandit vestibulum elementum risus dapibus sem blandit adipiscing pellentesque vulputate ullamcorper vulputate in suscipit facilisis libero maecenas massa sapien 
\avgwidthend

% Start a new page and print the statistics tables for the text analysed above.
\showtable{}

\end{document}

我已尝试以下方法但是...

                                                              %ADD OTHER SYMBOLS WITHOUT AN ORDER
\tl_const:Nn \c_aw_the_alphabet_tl {abcdefghijklmnopqrstuvwxyz,.;:?!'}

                                                         %CHANGE 26 INTO 33 (TOTAL OF CHARACTERS)
\dim_gset:Nn \myalphabetwidth {\fp_to_dim:n {\fp_eval:n {33*\g_aw_avg_width_fp}}}


% ANALYZING THESE CHARACTERS I GET 32, NOT 33. WHY?
% ALSO, THE TABLE BELOW DOESN'T SHOW THE FREQUENCIES OF THE ADDED SYMBOLS. WHY?
% IF IT IS BECAUSE OF THE CORRESPONDING THEORETICAL VALUES, THAT DOES NOT MATTER:
% THE THEORETICAL VALUES CAN STOP AT THE LAST CHARACTER THAT HAS A CORRESPONDING VALUE
% AND THEN LEAVE A BLANK, OR ELSE ALL THE THEORETICAL VALUES CAN BE REMOVED


\avgwidthstart
abcdefghijklmnopqrstuvwxyz,.;:?!'
\avgwidthend

答案1

该列表解析不正确，因为冒号（:）在宏定义中和在正文中的类别代码不同。例如，可以改用

\tl_const:Nx \c_aw_the_alphabet_tl {abcdefghijklmnopqrstuvwxyz,.;?!' \token_to_str:N :}

这样标记列表中的冒号将具有类别代码 12。

这是因为在expl3编程环境中，冒号的类别代码为 11；

\token_to_str:N :

展开后，我们就强制冒号取类别代码 12，这就是这里使用 \tl_const:Nx 的原因。

然后你的例子给出

总字符数 = 33

关于这个宏的问题：

答案1

相关内容