测量字母宽度的缺陷

测量字母宽度的缺陷

问题:

为什么在下面的例子中(使用 10pt 的 Computer Modern 字体)我得到 \alphabet=342.93138pt 和 \myalphabetwidth=342.6536pt?为什么这两个测量值之间存在这种差异?哪一个更正确?

\documentclass{book}

\RequirePackage{xparse}
\RequirePackage{siunitx}
\RequirePackage{environ}
\RequirePackage{booktabs}

% **********************************************************
\ExplSyntaxOn
% **********************************************************

% ---- Variable declarations for the average-character-width code ----
% Becomes true after the first measured block; while false, the
% per-character variables are created and the character widths measured.
\bool_new:N \g_has_run_bool
% Text of the block that is currently being analysed.
\tl_new:N \l_aw_text_tl
% Number of items in the current block.
\int_new:N \l_aw_tot_int
% Running total of counted characters (alphabet characters plus spaces).
\int_new:N \g_aw_tot_alph_int
% Running total of spaces.
\int_new:N \g_space_int
% Width of an interword space and the share of spaces among the counted
% characters.  The code below assigns floating point values to exactly
% these two names, so they are declared here as fp variables (they used to
% be declared under the unused names \g_wid_space_int / \g_rat_space_int).
\fp_new:N \g_wid_space_fp
\fp_new:N \g_rat_space_fp
% Frequency-weighted average character width, in pt.
\fp_new:N \g_aw_avg_width_fp
\dim_new:N \myalphabetwidth
\dim_new:N \mytextwidth
% Read the .aux file left by the previous run.  The file does not exist on
% the very first run, hence the guard, and .aux files contain "@" in
% control sequence names, hence \makeatletter (\ExplSyntaxOn does not make
% "@" a letter).
\makeatletter
\IfFileExists{\jobname.aux}{\input{\jobname.aux}}{}
\makeatother
% The characters that are counted and measured.  Spaces in the source are
% ignored under \ExplSyntaxOn, and ":" is a letter there, so the colon is
% added through \token_to_str:N.
\tl_const:Nx \c_aw_the_alphabet_tl {abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,.;?()!' \token_to_str:N :}

% Opening marker of a measured block.  It hands over to \aw_avg_width:w,
% which reads everything up to \avgwidthend.  (This can be turned into an
% environment or renamed as needed.)
\NewDocumentCommand \avgwidthstart { }
  { \aw_avg_width:w }

% Closing marker of a measured block.  It does nothing by itself; it is the
% delimiter that \aw_avg_width:w scans for.
\NewDocumentCommand \avgwidthend { } { }

% Environment form of \avgwidthstart ... \avgwidthend.  (Calling it simply
% "text" would probably be a bad idea.)
\NewEnviron { textcount }
  {
    % Expand \BODY once before \avgwidthstart runs, so that the collected
    % environment body itself sits between the two markers.
    \exp_after:wN \avgwidthstart \BODY \avgwidthend
  }

% \aw_avg_width:w <text> \avgwidthend
%
% Worker behind \avgwidthstart.  Takes everything up to \avgwidthend as #1,
% adds its character counts to the global statistics, recomputes the
% frequency-weighted average character width and finally typesets #1.
%
% Set globally on every call:
%   \g_aw_avg_width_fp  average character width (pt)
%   \myalphabetwidth    average width times the number of characters in
%                       \c_aw_the_alphabet_tl
%   \mytextwidth        average width times 66 (characters per line)
% \mytextwidth is also written to the main .aux file.
%
% \l_tmpa_fp is used as the accumulator for the weighted sum; it is expected
% to be zero on entry and is reset to zero before returning.
%
% The definition is wrapped in \makeatletter ... \makeatother because it
% uses \protected@write and \@mainaux, and \ExplSyntaxOn does not make "@"
% a letter.  The function is protected because it performs assignments.
\makeatletter
\cs_new_protected:Npn \aw_avg_width:w #1 \avgwidthend
  {
    % first call only: create the counter, ratio and width variables of
    % every character in the alphabet
    \bool_if:NF \g_has_run_bool
      {
        \tl_map_inline:Nn \c_aw_the_alphabet_tl
        {
          \int_new:c {g_##1_int}
          \fp_new:c {g_rat_##1_fp}
          \fp_new:c {g_wid_##1_fp}
        }
      }
    \tl_set:Nn \l_aw_text_tl {#1}

    % this can be used rather than the preceding line to fold capital
    % letters into lowercase ones, but it is slow
    % NOTE(review): \tl_expandable_lowercase:n may no longer exist in
    % current l3kernel releases -- confirm before enabling.
    %\tl_set:Nx \l_aw_text_tl {\tl_expandable_lowercase:n {#1}}

    \int_set:Nn \l_aw_tot_int {\tl_count:N \l_aw_text_tl}
    % count the characters, then the spaces, then turn the counts into
    % ratios and accumulate width * ratio in \l_tmpa_fp
    \tl_map_function:NN \c_aw_the_alphabet_tl \aw_get_counts:n
    \deal_with_spaces:n {#1}
    \tl_map_function:NN \c_aw_the_alphabet_tl \aw_calc_ratios:n
    \tl_map_function:NN \c_aw_the_alphabet_tl \aw_calc_avg_width:n
    \fp_gset_eq:NN \g_aw_avg_width_fp \l_tmpa_fp
    \fp_zero:N \l_tmpa_fp

    % \myalphabetwidth: width of the alphabet based on the measured
    % character frequencies; access it with \the\myalphabetwidth.  The
    % factor is the number of characters in the alphabet (61 for the
    % alphabet defined in this file), taken from the token list itself so
    % that the two cannot drift apart.
    \dim_gset:Nn \myalphabetwidth
      {\fp_to_dim:n {\tl_count:N \c_aw_the_alphabet_tl * \g_aw_avg_width_fp}}

    % \mytextwidth: recommended \textwidth for 66 characters per line;
    % access it with \the\mytextwidth.
    \dim_gset:Nn \mytextwidth {\fp_to_dim:n {\fp_eval:n {66*\g_aw_avg_width_fp}}}
    \protected@write\@mainaux{}{\mytextwidth=\the\mytextwidth}
    \bool_gset_true:N \g_has_run_bool

    % and lastly print the content
    #1
  }
\makeatother

% \aw_get_counts:n {<char>}
%
% Counts how often <char> occurs in \l_aw_text_tl and adds that number both
% to the character's own counter (g_<char>_int) and to the running total
% \g_aw_tot_alph_int.  The count is obtained as the difference between the
% item count of the full text (\l_aw_tot_int) and the item count after all
% occurrences of <char> have been removed.
%
% Uses \l_tmpb_tl and \l_tmpa_int as scratch variables.  Protected, because
% it performs assignments.
\cs_new_protected:Npn \aw_get_counts:n #1
  {
    % work on a copy of the text and strip every occurrence of the character
    \tl_set_eq:NN \l_tmpb_tl \l_aw_text_tl
    \tl_remove_all:Nn \l_tmpb_tl {#1}
    % occurrences = items before removal - items after removal
    % (\int_set:Nn already evaluates its argument as an integer expression)
    \int_set:Nn \l_tmpa_int {\l_aw_tot_int -\tl_count:N \l_tmpb_tl}
    % add the occurrences to the counter of this character ...
    \int_gadd:cn {g_#1_int} {\l_tmpa_int}
    % ... and to the total of counted characters
    \int_gadd:Nn \g_aw_tot_alph_int {\l_tmpa_int}
  }

% \deal_with_spaces:n {<text>}
%
% Counts the spaces in <text>, adds them to \g_space_int and to the running
% total \g_aw_tot_alph_int, computes the share of spaces
% (\g_rat_space_fp), measures the width of an interword space
% (\g_wid_space_fp) and adds width * share to the accumulator \l_tmpa_fp.
%
% Must run after the alphabet characters have been counted, because the
% share of spaces is computed against the final total.
%
% \g_rat_space_fp is set globally (it is a global variable, and a local
% assignment would be lost when the text is measured inside a group such as
% the textcount environment).  Protected, because it performs assignments.
\cs_new_protected:Npn \deal_with_spaces:n #1
  {
    \tl_set:Nn \l_tmpa_tl {#1}
    % rescan the text with the space character as a letter, so that every
    % space becomes an item of its own
    \tl_set_rescan:Nnn \l_tmpb_tl {\char_set_catcode_letter:N \ }{#1}
    % the number of additional items is the number of spaces; add it to the
    % space counter and to the total of counted characters
    \int_set:Nn \l_tmpa_int {\tl_count:N \l_tmpb_tl -\tl_count:N \l_tmpa_tl}
    \int_gadd:Nn \g_space_int {\l_tmpa_int}
    \int_gadd:Nn \g_aw_tot_alph_int {\l_tmpa_int}
    % since this comes after the other characters, the total is now final
    \fp_gset:Nn \g_rat_space_fp {\g_space_int/\g_aw_tot_alph_int}
    % Width of a space.  A space is stretchable; the natural width is used
    % on the assumption that stretching and shrinking cancel each other
    % over a large text (the original author was unsure whether that
    % assumption is reasonable).
    \hbox_set:Nn \l_tmpa_box {\ }
    \fp_gset:Nn \g_wid_space_fp {\dim_to_fp:n {\box_wd:N \l_tmpa_box}}
    \fp_add:Nn \l_tmpa_fp {\g_wid_space_fp*\g_rat_space_fp}
  }

% \aw_calc_ratios:n {<char>}
%
% Stores in g_rat_<char>_fp the relative frequency of <char>: its counter
% g_<char>_int divided by the total of counted characters
% \g_aw_tot_alph_int.  Protected, because it performs an assignment.
\cs_new_protected:Npn \aw_calc_ratios:n #1
  {
    % divide number of occurrences of char by total of counted chars
    \fp_gset:cn {g_rat_#1_fp}{{\int_use:c {g_#1_int}}/\g_aw_tot_alph_int}
  }

% \aw_calc_avg_width:n {<char>}
%
% Adds the contribution of <char> (its width times its relative frequency)
% to the accumulator \l_tmpa_fp.  On the first run the width is measured
% and stored in g_wid_<char>_fp; later runs reuse the stored value.
%
% Each character is measured alone in its own box, in the font that is
% current at that moment, so kerns that the font would insert between
% adjacent characters are not part of the stored widths.
%
% Protected, because it performs assignments.
\cs_new_protected:Npn \aw_calc_avg_width:n #1
  {
    % character widths only need to be measured once
    \bool_if:NF \g_has_run_bool
      {
        % width of a box that holds just this character
        \hbox_set:Nn \l_tmpa_box {#1}
        \fp_gset:cn {g_wid_#1_fp}{\dim_to_fp:n {\box_wd:N \l_tmpa_box}}
      }
    % weight the width by the character frequency and add it to the sum
    \fp_add:Nn \l_tmpa_fp {{\fp_use:c {g_wid_#1_fp}}*{\fp_use:c {g_rat_#1_fp}}}
  }
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Optional extras, just for fun.  Remove this part and the \showtable
% command from the document if they are not wanted.

% Storage used by the table code: the accumulated table rows, the reference
% ("theoretical") frequencies and the alphabet split into single items.
\tl_new:N \l_aw_tab_rows_tl
\seq_new:N \g_aw_theor_rats_seq
\seq_new:N \g_aw_the_alphabet_seq

% \alphabet: width of the whole alphabet string typeset as one piece in
% \normalfont.
\newlength \alphabet
\settowidth \alphabet
  {\normalfont abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,.;?()!':}

% Reference frequencies, 27 values.  They are paired in order with the
% items of the alphabet when the table is built; the last value matches the
% 19.18182 % that the table prints for spaces.
\seq_gset_split:Nnn \g_aw_theor_rats_seq {,}
  {
    0.0651738,0.0124248,0.0217339,0.0349835,0.1041442,0.0197881,
    0.0158610,0.0492888,0.0558094,0.0009033,0.0050529,0.0331490,
    0.0202124,0.0564513,0.0596302,0.0137645,0.0008606,0.0497563,
    0.0515760,0.0729357,0.0225134,0.0082903,0.0171272,0.0013692,
    0.0145984,0.0007836,0.1918182
  }

% \showtable: starts a new page and prints the statistics tables built by
% \aw_make_table:.
\NewDocumentCommand \showtable { }
  {
    \clearpage
    \aw_make_table:
  }

% V-type variants for splitting the alphabet token list variable.  The
% local variant is kept for code that may still rely on it.
\cs_generate_variant:Nn \seq_set_split:Nnn {NnV}
\cs_generate_variant:Nn \seq_gset_split:Nnn {NnV}

% \aw_make_table:
%
% Typesets, on a page with empty page style,
%   1. a summary table: \mytextwidth, the average character width,
%      \myalphabetwidth and \alphabet,
%   2. the total number of counted characters and the resulting number of
%      66-character lines,
%   3. a table with the measured frequency, the reference frequency and
%      their difference for spaces and for each alphabet item that has a
%      reference value.
%
% The row token list is cleared first, so calling \showtable more than once
% does not repeat the rows, and the global alphabet sequence is set
% globally.  Protected, because it performs assignments and typesets.
%
% NOTE(review): \seq_mapthread_function:NNN stops at the end of the shorter
% sequence.  The alphabet has more items than there are reference values,
% and the last reference value is the one for spaces, so the row after "z"
% is paired with the space frequency -- confirm whether that is intended.
\cs_new_protected:Npn \aw_make_table:
    {
      \thispagestyle{empty}
      % an empty delimiter splits the alphabet into its individual items
      \seq_gset_split:NnV \g_aw_the_alphabet_seq {} \c_aw_the_alphabet_tl
      % start from an empty set of rows on every call
      \tl_clear:N \l_aw_tab_rows_tl
      % take corresponding letter / reference ratio pairs from the two
      % sequences and build one table row for each pair
      \seq_mapthread_function:NNN \g_aw_the_alphabet_seq \g_aw_theor_rats_seq \aw_generate_row:nn
      \begin{table}[h]
      \centering
      \sisetup{round-mode = places,round-precision = 5,output-decimal-marker={,},table-format = 3.5}
      \begin{tabular}{llll}
        \toprule
        {Average\,\texttt{\textbackslash textwidth}}&{Average\,character\,width}&{Average\,alphabet\,width}&{Alphabet\,width}\\
        \midrule
        \the\mytextwidth&\fp_eval:n {round(\g_aw_avg_width_fp,5)}pt&\the\myalphabetwidth&\the\alphabet\\
        \bottomrule
      \end{tabular}\par
      \end{table}
      \vfil
      \begin{center}
        Total\,characters\,=\,\fp_eval:n {\g_aw_tot_alph_int}
      \end{center}
      \[%
        \mathrm{Total\,line\,type}=\frac{\fp_eval:n {\g_aw_tot_alph_int}\cdot \fp_eval:n {round(\g_aw_avg_width_fp,5)}\mathrm{pt}}{\fp_eval:n {\g_aw_tot_alph_int*{round(\g_aw_avg_width_fp,5)}/({\g_aw_tot_alph_int}/66)}\mathrm{pt}}=\fp_eval:n {\g_aw_tot_alph_int/66}
      \]
      \vfil
      \begin{table}[h]
      \centering
      \sisetup{round-mode = places,round-precision = 5,output-decimal-marker={,},table-format = 3.5}
      \begin{tabular}{cSSS}
        \toprule
        {Letter}&{Actual}&{Theoretical}&{Difference}\\
        \midrule
        spaces&\fp_eval:n {\g_rat_space_fp*100}\%&19.18182\%&\fp_eval:n {{\g_rat_space_fp*100-19.18182}}\%\\
        \tl_use:N \l_aw_tab_rows_tl
        \bottomrule
      \end{tabular}\par
      \end{table}
    }

% \aw_generate_row:nn {<char>} {<reference ratio>}
%
% Appends one row of the frequency table to \l_aw_tab_rows_tl:
%   <char> & measured % & reference % & difference %  \\
% The measured percentage and the difference are evaluated now (x-type
% expansion); the reference percentage is stored unexpanded and evaluated
% when the table is typeset.
%
% Protected, because it performs assignments.
\cs_new_protected:Npn \aw_generate_row:nn #1#2
    {
      % column 1: the character itself
      \tl_put_right:Nn \l_aw_tab_rows_tl {#1&}
      % column 2: measured frequency in percent
      \tl_put_right:Nx \l_aw_tab_rows_tl {\fp_eval:n {100*{\fp_use:c {g_rat_#1_fp}}}\%&}
      % column 3: reference frequency in percent
      \tl_put_right:Nn \l_aw_tab_rows_tl {\fp_eval:n{100*{#2}}\%&}
      % column 4: measured minus reference, in percentage points
      \tl_put_right:Nx \l_aw_tab_rows_tl {\fp_eval:n {{\fp_use:c {g_rat_#1_fp}*100-\fp_eval:n {#2}*100}}\%}
      % end of the table row
      \tl_put_right:Nn \l_aw_tab_rows_tl {\\}
    }

% **********************************************************
\ExplSyntaxOff
% **********************************************************


\begin{document}

% Test case: feed the macro the same string that \alphabet was measured
% with, so that the two widths printed below can be compared directly.
\avgwidthstart
abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,.;?()!':%
\avgwidthend

% Width computed by the macro from the individual character widths.
\the\myalphabetwidth

% Width of the whole string measured in one piece with \settowidth.
\the\alphabet

\end{document}

更新:

我并不是想得到某个特定字符串的总宽度,我的示例只是测试宏的一种方法。我比较同一字符串的两个测量值,是为了检查宏的表现:这个宏统计文档(普通正文)中各字符的出现频率,然后求出平均字符宽度,并用它来设置 \textwidth。

因为您曾说过:“我不明白这个问题。”我尝试重新解释一下:

我的问题是:

有没有办法在宏中规定:如果遇到某些字符组合,就把这些字符之间的字距(kerning)也加到总宽度中?字距的数值取自所用字体的度量信息。例如,您向我展示的第一个 \hbox(即 \alphabet)已经把字母间的字距计算在内;有没有办法让宏的最终结果也把它计算进去?

答案1

您的代码不太清晰,但您得到的两个值与下面这两个盒子的宽度一致:

\documentclass{article}

% Write the contents of every shipped-out box to the log file.
\showoutput


\begin{document}

% The string as one run of characters: the font may insert kerns between
% adjacent characters.
\hbox{abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,.;?()!':}

% The same characters, each in its own brace group: the width is the plain
% sum of the individual character widths.
\hbox{{a}{b}{c}{d}{e}{f}{g}{h}{i}{j}{k}{l}{m}{n}{o}{p}{q}{r}{s}{t}{u}{v}{w}{x}{y}{z}{A}{B}{C}{D}{E}{F}{G}{H}{I}{J}{K}{L}{M}{N}{O}{P}{Q}{R}{S}{T}{U}{V}{W}{X}{Y}{Z}{,}{.}{;}{?}{(}{)}{!}{'}{:}}

\end{document}

这两个盒子的尺寸分别为:

...\hbox(7.5+2.5)x342.93138
...\hbox(7.5+2.5)x342.6536

请注意,第一个有几个字母间的字距。


因此,如果使用 cmr10 设置,则值 342.93138pt 是该字符串的宽度,而 342.6536pt 是第二个表达式的宽度,或者等效于各个字符宽度的总和。日志文件会显示差异来自何处,例如第一个框的日志显示

....\OT1/cmr/m/n/10 b
....\kern0.27779
....\OT1/cmr/m/n/10 c

这意味着 cmr10 的字体度量规定:如果 b 和 c 相邻,则应在它们之间添加一个宽度为 0.27779pt 的字距,类似地

....\OT1/cmr/m/n/10 p
....\kern0.27779
....\OT1/cmr/m/n/10 q

....\OT1/cmr/m/n/10 F
....\kern-0.27779
....\OT1/cmr/m/n/10 G

这三处恰好就是该字符串中全部的字距调整:两个正调整和一个负调整,合计净增 0.27779pt,正好对应两个宽度之间的差值。

哪个是正确的取决于你想回答哪个问题。不清楚你为什么要问这两个问题。如果宽度是为了确定页面几何形状,那么更有趣的平均值将是典型自然语言文本的平均宽度。

相关内容