问题:
为什么在下面的例子中我得到(CM 为 10pt)\alphabet=342.93138pt
以及
\myalphabetwidth=342.6536pt
为什么这两个度量之间存在这种差异?哪一个更正确?
\documentclass{book}
\RequirePackage{xparse}
\RequirePackage{siunitx}
\RequirePackage{environ}
\RequirePackage{booktabs}
% **********************************************************
\ExplSyntaxOn
% **********************************************************
% Variables shared by the character-frequency macros defined below.
\bool_new:N \g_has_run_bool      % true once the first text block has been processed
\tl_new:N \l_aw_text_tl          % text block currently being analysed
\int_new:N \l_aw_tot_int         % number of items in the current text block
\int_new:N \g_aw_tot_alph_int    % running total of counted characters, spaces included
\int_new:N \g_space_int          % running total of spaces
% The two space-related values hold floating point numbers and are used by
% \deal_with_spaces:n under exactly these names, so declare them as fp
% variables with an _fp suffix.
\fp_new:N \g_rat_space_fp        % spaces / counted characters
\fp_new:N \g_wid_space_fp        % width of the box holding one control space, in pt
\fp_new:N \g_aw_avg_width_fp     % frequency-weighted average character width, in pt
\dim_new:N \myalphabetwidth
\dim_new:N \mytextwidth
% Pick up the value of \mytextwidth recorded in the .aux file by the previous
% run.  On the very first run there is no .aux file yet, so only read it when
% it exists.  .aux files contain LaTeX internals with @ in their names, hence
% @ is made a letter while the file is read and restored afterwards.
\makeatletter
\InputIfFileExists{\jobname.aux}{}{}
\makeatother
% The characters that are counted.  Spaces are ignored in expl3 syntax, and
% \token_to_str:N turns the colon (a letter here) into an ordinary character.
\tl_const:Nx \c_aw_the_alphabet_tl {abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,.;?()!' \token_to_str:N :}
% User-level start marker: passes control to \aw_avg_width:w, which reads
% everything up to \avgwidthend.  It can be renamed or turned into an
% environment if preferred.
\NewDocumentCommand \avgwidthstart { } { \aw_avg_width:w }
% End marker: does nothing by itself; it is the delimiter that
% \aw_avg_width:w looks for.
\NewDocumentCommand \avgwidthend { } { }
% Environment form of the same interface.  environ stores the environment
% body in \BODY; it is expanded once before \avgwidthstart runs, so that the
% macro sees the actual text.  (Calling the environment just "text" would
% probably be a bad idea.)
\NewEnviron{textcount}
  { \exp_after:wN \avgwidthstart \BODY \avgwidthend }
% \aw_avg_width:w <text> \avgwidthend
% Counts how often each character of \c_aw_the_alphabet_tl (and the space)
% occurs in <text>, updates the global frequency ratios and the
% frequency-weighted average character width, derives \myalphabetwidth and
% \mytextwidth from it, records \mytextwidth in the main .aux file and
% finally typesets <text>.  The counts accumulate over successive calls.
% Protected because it performs assignments and is not expandable.
\cs_new_protected:Npn \aw_avg_width:w #1 \avgwidthend
  {
    % first call only: create, for every character of the alphabet, a counter
    % and two fp variables (frequency ratio and glyph width)
    \bool_if:NF \g_has_run_bool
      {
        \tl_map_inline:Nn \c_aw_the_alphabet_tl
          {
            \int_new:c {g_##1_int}
            \fp_new:c {g_rat_##1_fp}
            \fp_new:c {g_wid_##1_fp}
          }
      }
    % \l_tmpa_fp accumulates the weighted widths below; start from zero
    % instead of relying on whatever the scratch variable currently holds
    \fp_zero:N \l_tmpa_fp
    \tl_set:Nn \l_aw_text_tl {#1}
    % this can be used rather than the preceding line to take capital
    % letters into account, but is slow
    %\tl_set:Nx \l_aw_text_tl {\tl_expandable_lowercase:n {#1}}
    \int_set:Nn \l_aw_tot_int { \tl_count:N \l_aw_text_tl }
    % per-character counts, then spaces, then ratios, then weighted widths;
    % the order matters: the ratios need the final total, which is only known
    % after the spaces have been counted
    \tl_map_function:NN \c_aw_the_alphabet_tl \aw_get_counts:n
    \deal_with_spaces:n {#1}
    \tl_map_function:NN \c_aw_the_alphabet_tl \aw_calc_ratios:n
    \tl_map_function:NN \c_aw_the_alphabet_tl \aw_calc_avg_width:n
    \fp_gset_eq:NN \g_aw_avg_width_fp \l_tmpa_fp
    \fp_zero:N \l_tmpa_fp
    % \myalphabetwidth: width of the alphabet based on the measured character
    % frequencies, i.e. (number of alphabet characters) x (average width).
    % The factor is taken from the alphabet itself rather than hard-coded.
    % Access it with \the\myalphabetwidth.
    \dim_gset:Nn \myalphabetwidth
      { \fp_to_dim:n { \tl_count:N \c_aw_the_alphabet_tl * \g_aw_avg_width_fp } }
    % \mytextwidth: recommended \textwidth based on 66 characters per line.
    % Access it with \the\mytextwidth.
    \dim_gset:Nn \mytextwidth
      { \fp_to_dim:n { 66 * \g_aw_avg_width_fp } }
    % Record the value in the main .aux file.  @ is not a letter where this
    % definition is read, so \protected@write and \@mainaux are built by name.
    \exp_args:cc {protected@write} {@mainaux} { }
      { \mytextwidth = \the \mytextwidth }
    \bool_gset_true:N \g_has_run_bool
    % and lastly print the content
    #1
  }
% \aw_get_counts:n {<char>}
% Counts the occurrences of <char> in \l_aw_text_tl and adds that number to
% the per-character counter g_<char>_int and to \g_aw_tot_alph_int.
% Expects \l_aw_tot_int to hold the item count of \l_aw_text_tl.
% Protected because it performs assignments.
\cs_new_protected:Npn \aw_get_counts:n #1
  {
    % work on a copy of the text from which every <char> is removed
    \tl_set_eq:NN \l_tmpb_tl \l_aw_text_tl
    \tl_remove_all:Nn \l_tmpb_tl {#1}
    % occurrences = items before removal - items after removal
    \int_set:Nn \l_tmpa_int { \l_aw_tot_int - \tl_count:N \l_tmpb_tl }
    % add the occurrences of this block to the character's own counter ...
    \int_gadd:cn {g_#1_int} { \l_tmpa_int }
    % ... and to the overall total
    \int_gadd:Nn \g_aw_tot_alph_int { \l_tmpa_int }
  }
% \deal_with_spaces:n {<text>}
% Counts the spaces in <text>, adds them to \g_space_int and to
% \g_aw_tot_alph_int, updates the space ratio \g_rat_space_fp and the space
% width \g_wid_space_fp, and adds the weighted space width to \l_tmpa_fp.
% Protected because it performs assignments.
\cs_new_protected:Npn \deal_with_spaces:n #1
  {
    \tl_set:Nn \l_tmpa_tl {#1}
    % rescan the text with spaces as letters, so that they count as items
    \tl_set_rescan:Nnn \l_tmpb_tl {\char_set_catcode_letter:N \ }{#1}
    % the extra items introduced by the rescan are the spaces
    \int_set:Nn \l_tmpa_int {\tl_count:N \l_tmpb_tl -\tl_count:N \l_tmpa_tl}
    \int_gadd:Nn \g_space_int { \l_tmpa_int }
    \int_gadd:Nn \g_aw_tot_alph_int { \l_tmpa_int }
    % this runs after all other characters have been counted, so the total is
    % final.  The ratio is a global variable and is read later by the table
    % code, so it must be set globally; a local assignment would be undone at
    % the end of the textcount environment's group.
    \fp_gset:Nn \g_rat_space_fp { \g_space_int / \g_aw_tot_alph_int }
    % width of one control space.  An interword space is stretchable; the
    % assumption is that stretch and shrink cancel out over a large text.
    \hbox_set:Nn \l_tmpa_box {\ }
    \fp_gset:Nn \g_wid_space_fp { \dim_to_fp:n { \box_wd:N \l_tmpa_box } }
    \fp_add:Nn \l_tmpa_fp { \g_wid_space_fp * \g_rat_space_fp }
  }
% \aw_calc_ratios:n {<char>}
% Stores in g_rat_<char>_fp the number of occurrences of <char> divided by
% the total number of counted characters.
% Protected because it performs an assignment.
\cs_new_protected:Npn \aw_calc_ratios:n #1
  {
    \fp_gset:cn {g_rat_#1_fp}
      { \int_use:c {g_#1_int} / \g_aw_tot_alph_int }
  }
% \aw_calc_avg_width:n {<char>}
% Adds (width of <char>) x (frequency ratio of <char>) to \l_tmpa_fp.
% Protected because it performs assignments.
\cs_new_protected:Npn \aw_calc_avg_width:n #1
  {
    % the character widths only need to be measured once, on the first run
    \bool_if:NF \g_has_run_bool
      {
        % typeset the character alone in a box and store the box width
        \hbox_set:Nn \l_tmpa_box {#1}
        \fp_gset:cn {g_wid_#1_fp} { \dim_to_fp:n { \box_wd:N \l_tmpa_box } }
      }
    % weight the width by the character frequency and add it to the average
    \fp_add:Nn \l_tmpa_fp
      { \fp_use:c {g_wid_#1_fp} * \fp_use:c {g_rat_#1_fp} }
  }
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% This part is just for fun. Delete it and the showtable command from the document if
% it isn't wanted
% \alphabet: width of the whole alphabet string set as one piece of text
\newlength{\alphabet}%
\settowidth{\alphabet}{\normalfont abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,.;?()!':}%
\tl_new:N \l_aw_tab_rows_tl
\seq_new:N \g_aw_theor_rats_seq
\seq_new:N \g_aw_the_alphabet_seq
% Theoretical frequency ratios, one per lowercase letter, in the order a..z
% (26 entries).  They are paired item by item with the alphabet when the table
% rows are built, and the pairing stops at the end of the shorter sequence.
% The space frequency (0.1918182) is deliberately not part of this list: it
% would be paired with the 27th alphabet character, "A".  The table prints the
% space row separately.
\seq_gset_split:Nnn \g_aw_theor_rats_seq {,}
{0.0651738,0.0124248,0.0217339,0.0349835,0.1041442,0.0197881,0.0158610,0.0492888,0.0558094,0.0009033,0.0050529,0.0331490,0.0202124,0.0564513,0.0596302,0.0137645,0.0008606,0.0497563,0.0515760,0.0729357,0.0225134,0.0082903,0.0171272,0.0013692,0.0145984,0.0007836}
% \showtable: user command that starts a new page and prints the statistics
% tables produced by \aw_make_table:.
\NewDocumentCommand \showtable { }
  {
    \clearpage
    \aw_make_table:
  }
\cs_generate_variant:Nn \seq_set_split:Nnn {NnV}
\cs_generate_variant:Nn \seq_gset_split:Nnn {NnV}
% \aw_make_table:
% Typesets, on a page without header or footer, a summary table (recommended
% text width, average character width, the two alphabet widths), the total
% number of counted characters, and a table comparing the measured character
% frequencies with the theoretical ones.
% Protected because it performs assignments and typesets material.
\cs_new_protected:Npn \aw_make_table:
  {
    \thispagestyle{empty}
    % split the alphabet into one sequence item per character; the sequence
    % is a global variable, so it is set globally
    \seq_gset_split:NnV \g_aw_the_alphabet_seq {} \c_aw_the_alphabet_tl
    % start from an empty row list, so that calling \showtable again does not
    % repeat the rows built by an earlier call
    \tl_clear:N \l_aw_tab_rows_tl
    % take corresponding letter / theoretical ratio pairs from the two
    % sequences and build one table row from each pair
    \seq_mapthread_function:NNN \g_aw_the_alphabet_seq \g_aw_theor_rats_seq \aw_generate_row:nn
    \begin{table}[h]
    \centering
    \sisetup{round-mode = places,round-precision = 5,output-decimal-marker={,},table-format = 3.5}
    \begin{tabular}{llll}
    \toprule
    {Average\,\texttt{\textbackslash textwidth}}&{Average\,character\,width}&{Average\,alphabet\,width}&{Alphabet\,width}\\
    \midrule
    \the\mytextwidth&\fp_eval:n {round(\g_aw_avg_width_fp,5)}pt&\the\myalphabetwidth&\the\alphabet\\
    \bottomrule
    \end{tabular}\par
    \end{table}
    \vfil
    \begin{center}
    Total\,characters\,=\,\fp_eval:n {\g_aw_tot_alph_int}
    \end{center}
    % (characters x average width) / (line width for 66 characters per line)
    \[%
    \mathrm{Total\,line\,type}=\frac{\fp_eval:n {\g_aw_tot_alph_int}\cdot \fp_eval:n {round(\g_aw_avg_width_fp,5)}\mathrm{pt}}{\fp_eval:n {\g_aw_tot_alph_int*round(\g_aw_avg_width_fp,5)/(\g_aw_tot_alph_int/66)}\mathrm{pt}}=\fp_eval:n {\g_aw_tot_alph_int/66}
    \]
    \vfil
    \begin{table}[h]
    \centering
    \sisetup{round-mode = places,round-precision = 5,output-decimal-marker={,},table-format = 3.5}
    \begin{tabular}{cSSS}
    \toprule
    {Letter}&{Actual}&{Theoretical}&{Difference}\\
    \midrule
    % the space row is written out by hand; the letter rows come from
    % \l_aw_tab_rows_tl
    spaces&\fp_eval:n {\g_rat_space_fp*100}\%&19.18182\%&\fp_eval:n {\g_rat_space_fp*100-19.18182}\%\\
    \tl_use:N \l_aw_tab_rows_tl
    \bottomrule
    \end{tabular}\par
    \end{table}
  }
% \aw_generate_row:nn {<char>} {<theoretical ratio>}
% Appends one table row to \l_aw_tab_rows_tl with four cells: the character,
% its measured frequency in percent, the theoretical frequency in percent,
% and the difference of the two.
% Protected because it performs assignments.
\cs_new_protected:Npn \aw_generate_row:nn #1#2
  {
    % cell 1: the character itself
    \tl_put_right:Nn \l_aw_tab_rows_tl {#1&}
    % cell 2: measured frequency, evaluated now
    \tl_put_right:Nx \l_aw_tab_rows_tl
      { \fp_eval:n { 100 * \fp_use:c {g_rat_#1_fp} } \% & }
    % cell 3: theoretical frequency, stored unevaluated and computed when the
    % table is typeset
    \tl_put_right:Nn \l_aw_tab_rows_tl { \fp_eval:n { 100 * #2 } \% & }
    % cell 4: measured minus theoretical, in percentage points
    \tl_put_right:Nx \l_aw_tab_rows_tl
      { \fp_eval:n { \fp_use:c {g_rat_#1_fp} * 100 - #2 * 100 } \% }
    \tl_put_right:Nn \l_aw_tab_rows_tl { \\ }
  }
% **********************************************************
\ExplSyntaxOff
% **********************************************************
\begin{document}
% Test input: each character of the counted alphabet exactly once, with no
% spaces (the trailing % suppresses the end-of-line space), so that the
% macro's result can be compared with the width of the same string.
\avgwidthstart
abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,.;?()!':%
\avgwidthend
% Print the two values being compared: \myalphabetwidth is computed by the
% macro from the individual character widths, \alphabet was measured with
% \settowidth on the complete string.
\the\myalphabetwidth
\the\alphabet
\end{document}
更新:
我不想要某个特定字符串的总宽度。我的示例只是测试宏的一种方法。我尝试比较同一字符串的两个度量来检查宏的性能,即计算文档中的字符频率(在文档的普通文本上),然后获取平均字母宽度并使用它来设置\textwidth
。
因为您曾说过:“我不明白这个问题。”我尝试重新解释一下:
我的问题是:
有没有办法让宏在累加字符宽度时,把相邻字符之间的字距(kerning)也计算进去?字距的值取自所用字体的度量信息。例如,您展示的第一个 \hbox(即 \alphabet)的宽度包含了字母之间的字距,那么有没有办法让宏的最终结果也包含这些字距?
答案1
您的代码不太清楚,但您获得的值与两个框一致
\documentclass{article}
% \showoutput writes the contents of the shipped-out boxes to the log file,
% which is where the box sizes and kerns quoted below come from
\showoutput
\begin{document}
% first box: the string set as one run of text, so the kerns defined by the
% font between adjacent characters are included
\hbox{abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,.;?()!':}
% second box: every character in its own brace group; according to the log
% sizes quoted below its width equals the sum of the individual character
% widths
\hbox{{a}{b}{c}{d}{e}{f}{g}{h}{i}{j}{k}{l}{m}{n}{o}{p}{q}{r}{s}{t}{u}{v}{w}{x}{y}{z}{A}{B}{C}{D}{E}{F}{G}{H}{I}{J}{K}{L}{M}{N}{O}{P}{Q}{R}{S}{T}{U}{V}{W}{X}{Y}{Z}{,}{.}{;}{?}{(}{)}{!}{'}{:}}
\end{document}
日志中这两个盒子的尺寸分别为:
...\hbox(7.5+2.5)x342.93138
...\hbox(7.5+2.5)x342.6536
请注意,第一个有几个字母间的字距。
因此,如果使用 cmr10 设置,则值 342.93138pt 是该字符串的宽度,而 342.6536pt 是第二个表达式的宽度,或者等效于各个字符宽度的总和。日志文件会显示差异来自何处,例如第一个框的日志显示
....\OT1/cmr/m/n/10 b
....\kern0.27779
....\OT1/cmr/m/n/10 c
这意味着 cmr10 字体度量规定,如果 b 和 c 相邻,则应在它们之间添加宽度为 0.27779pt 的字距,类似地
....\OT1/cmr/m/n/10 p
....\kern0.27779
....\OT1/cmr/m/n/10 q
和
....\OT1/cmr/m/n/10 F
....\kern-0.27779
....\OT1/cmr/m/n/10 G
这恰好是全套,因此有两个正调整和一个负调整,总计 0.27779pt。
哪个是正确的取决于你想回答哪个问题。不清楚你为什么要问这两个问题。如果宽度是为了确定页面几何形状,那么更有趣的平均值将是典型自然语言文本的平均宽度。