假设我有一台连接到一些实验仪器的 PC。C: 是装有 Windows 7 及所有程序的系统分区,D: 是保存所有采集数据的存储分区。有没有一种快速简便的方法,可以为 D: 上的所有文件生成一个文件大小的粗略直方图?如果可能的话,我想避免针对不同的文件大小进行数十次搜索、再手动生成直方图。如果有一个简单的工具就会非常方便。
我找到了一个工具,叫做目录统计,这对于可视化所有文件很有用,但我找不到任何功能来获取适当的直方图。
答案1
可以考虑使用以下 Python 脚本。我在 Windows 和 Linux 上使用 Python 3.8 和 matplotlib 3.5 运行,均可正常工作。
from pathlib import Path
import numpy as np
from matplotlib import pyplot as plt
def file_size_hist(folder: Path, size_min, size_stop, size_step):
    """Count the regular files below *folder* in geometrically growing size bins.

    The bin edges are ``size_min * size_step**k`` for ``k = 0, 1, ...`` up to
    the first edge that is strictly greater than ``size_stop``.

    * Bin 0 holds files smaller than ``size_min``.
    * Bin ``i`` (``i >= 1``) holds files with
      ``hist_x[i-1] <= size < hist_x[i]``.
    * The last bin additionally absorbs every file at or above its lower
      edge ``hist_x[-2]`` (equal to ``size_stop`` when ``size_stop`` is an
      exact edge), so its upper edge ``hist_x[-1]`` is only nominal.

    Args:
        folder: Directory that is searched recursively.
        size_min: Smallest bin edge in bytes; must be positive.
        size_stop: Size in bytes from which files go to the last bin; must
            not be smaller than ``size_min``.
        size_step: Multiplicative factor between consecutive edges; must be
            greater than 1.

    Returns:
        A tuple ``(hist_x, hist_y)`` of two ``uint64`` arrays of equal length:
        ``hist_x[i]`` is the upper edge of bin ``i`` and ``hist_y[i]`` is the
        number of files counted in that bin.

    Raises:
        ValueError: If the size parameters cannot describe a growing sequence
            of edges.
    """
    if size_min <= 0 or size_step <= 1 or size_stop < size_min:
        raise ValueError(
            'expected size_min > 0, size_step > 1 and size_stop >= size_min'
        )

    # Build the edges by repeated multiplication instead of a floating-point
    # logspace: for integer arguments the edges are exact, and the very same
    # values are used for binning below, so the returned edges always agree
    # with the counts.
    edges = []
    edge = size_min
    while edge <= size_stop:
        edges.append(edge)
        edge *= size_step
    edges.append(edge)  # nominal upper edge of the overflow bin

    hist_x = np.array(edges, dtype=np.uint64)
    hist_y = np.zeros(hist_x.size, dtype=np.uint64)
    last_bin = hist_y.size - 1

    for entry in folder.rglob('*'):
        try:
            if not entry.is_file():
                continue
            file_size_bytes = entry.stat().st_size
        except OSError:
            # An entry that cannot be inspected (e.g. access denied, or it
            # disappeared during the scan) is skipped so that one bad file
            # does not abort the whole scan.
            continue

        # Number of edges <= size is exactly the bin index:
        #   size < hist_x[0]             -> 0
        #   hist_x[0] <= size < hist_x[1] -> 1, and so on.
        bin_index = int(np.searchsorted(hist_x,
                                        np.uint64(file_size_bytes),
                                        side='right'))
        # Last bin includes all files at or beyond the nominal upper edge.
        hist_y[min(bin_index, last_bin)] += 1

    return hist_x, hist_y
if __name__ == "__main__":
    # Scan one folder, print two summary counts and plot the file-size
    # histogram on log-log axes.
    size_min_bytes = 2**10  # files with size < size_min_bytes are counted in the first bar
    size_stop_bytes = 2**32  # files with size >= size_stop_bytes are counted in the last bar
    size_step_coeff = 2  # e.g. `[1024, 2048, 4096, ...]` bytes
    # NOTE: on Windows a bare drive letter is drive-relative (the current
    # directory on that drive); use e.g. "F:/" to force the drive root.
    target_folder = r"F:"

    hist_x, hist_y = file_size_hist(Path(target_folder), size_min_bytes, size_stop_bytes, size_step_coeff)
    print('Found {} files total'.format(np.sum(hist_y)))
    # hist_x[i] is the *upper* edge of bin i, so the bin [32 KB, 64 KB) has
    # hist_x == 64 KB and must be included with "<=".
    print('Found {} files with size below 64 KB'.format(np.sum(hist_y * (hist_x <= 64*2**10))))

    # Bar i has to span [hist_x[i] / step, hist_x[i]).  Feeding hist_x itself
    # as both samples and bin edges would draw every bar one bin too far to
    # the right and merge the last two counts into a single bar.
    # The first bar stands for [0, size_min_bytes), which a log axis cannot
    # show from 0, and the last bar stands for [size_stop_bytes, inf).
    left_edges = hist_x / size_step_coeff
    bin_edges = np.append(left_edges, hist_x[-1])

    fig, ax = plt.subplots(1, 1)
    ax.hist(left_edges, weights=hist_y, bins=bin_edges, label='file sizes')
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('File size in bytes')
    ax.set_ylabel('File count')
    # plt.show() blocks until the window is closed; fig.show() returns
    # immediately, so a plain script would exit before the plot is visible.
    plt.show()

    # Bin layout for the parameters above (hist_x is the upper edge of a bin):
    #   | bin  | min             | max               | hist_x                         |
    # --+------+-----------------+-------------------+--------------------------------+
    #   | 0    | 0               | 1023              | 1024*2**0                      |
    #   | 1    | 1024*2**0=1024  | 2047              | 1024*2**1                      |
    #   | 2    | 1024*2**1=2048  | 4095              | 1024*2**2                      |
    #   | 22   | 1024*2**21      | 1024*2**22-1      | 1024*2**22 == size_stop_bytes  |
    #   | 23   | 1024*2**22      | inf               | 1024*2**23                     |
答案2
SpaceSniffer 可能是你想要的