I have the following Python script, which I sometimes use to mirror large directories (the main sync logic is implemented in sync_directory()). It also tries to mirror other metadata, such as file creation/modification/access times:
# Standard library full imports
import hashlib
import shutil
import stat
import sys
import os
# Related third-party full imports
import docopt
# Standard library partial imports
from pathlib import Path
"=================================================================================================="
"Constants"
# Command line interface for docopt
USAGE_MSG = """
MySync
Precisely mirror <src_dir> in <dst_dir>.
Usage:
mysync.py [options] <src_dir> <dst_dir>
mysync.py -h | --help
Options:
-d --show-deletions Show any deletions that occur within <dst_dir>.
-h --help Show this screen.
"""
"=================================================================================================="
"Helper Functions - Rmtree On-Error"
def rmtree_onerror(func, path, excinfo):
"""Handle instances of paths not being properly deleted.
Source: https://stackoverflow.com/a/59507412
"""
os.chmod(path, stat.S_IWRITE)
if os.path.isdir(path):
os.rmdir(path)
else:
os.remove(path)
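# NOTE: the onerror parameter of shutil.rmtree() is deprecated as of
# Python 3.12 in favor of onexc; on Python 3.11 (used here) onerror is correct.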
"=================================================================================================="
"Helper Functions - Set Creation Time"
"""
Source for this section:
https://github.com/Delgan/win32-setctime/blob/master/win32_setctime.py
"""
try:
from ctypes import byref, get_last_error, wintypes, FormatError, WinDLL, WinError
kernel32 = WinDLL("kernel32", use_last_error=True)
CreateFileW = kernel32.CreateFileW
SetFileTime = kernel32.SetFileTime
CloseHandle = kernel32.CloseHandle
CreateFileW.argtypes = (
wintypes.LPWSTR,
wintypes.DWORD,
wintypes.DWORD,
wintypes.LPVOID,
wintypes.DWORD,
wintypes.DWORD,
wintypes.HANDLE,
)
CreateFileW.restype = wintypes.HANDLE
SetFileTime.argtypes = (
wintypes.HANDLE,
wintypes.PFILETIME,
wintypes.PFILETIME,
wintypes.PFILETIME,
)
SetFileTime.restype = wintypes.BOOL
CloseHandle.argtypes = (wintypes.HANDLE,)
CloseHandle.restype = wintypes.BOOL
except (ImportError, AttributeError, OSError, ValueError):
SUPPORTED = False
else:
SUPPORTED = os.name == "nt"
def setctime(filepath, timestamp, *, follow_symlinks=True):
"""Set the "ctime" (creation time) attribute of a file given an unix timestamp (Windows only)."""
if not SUPPORTED:
raise OSError("This function is only available for the Windows platform.")
filepath = os.path.normpath(os.path.abspath(str(filepath)))
timestamp = int(timestamp * 10000000) + 116444736000000000
if not 0 < timestamp < (1 << 64):
raise ValueError("The system value of the timestamp exceeds u64 size: %d" % timestamp)
atime = wintypes.FILETIME(0xFFFFFFFF, 0xFFFFFFFF)
mtime = wintypes.FILETIME(0xFFFFFFFF, 0xFFFFFFFF)
ctime = wintypes.FILETIME(timestamp & 0xFFFFFFFF, timestamp >> 32)
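    # 128 = FILE_ATTRIBUTE_NORMAL; 0x02000000 = FILE_FLAG_BACKUP_SEMANTICS
    # (0x00200000 below is FILE_FLAG_OPEN_REPARSE_POINT, for symlinks)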
flags = 128 | 0x02000000
if not follow_symlinks:
flags |= 0x00200000
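    # In CreateFileW below: 256 = FILE_WRITE_ATTRIBUTES (desired access),
    # 3 = OPEN_EXISTING (creation disposition)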
handle = wintypes.HANDLE(CreateFileW(filepath, 256, 0, None, 3, flags, None))
if handle.value == wintypes.HANDLE(-1).value:
raise WinError(get_last_error())
if not wintypes.BOOL(SetFileTime(handle, byref(ctime), byref(atime), byref(mtime))):
raise WinError(get_last_error())
if not wintypes.BOOL(CloseHandle(handle)):
raise WinError(get_last_error())
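# Usage sketch (mirroring how sync_directory() calls it below):
#   setctime(dst_path, os.path.getctime(src_path))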
"=================================================================================================="
"Helper Functions - Sync Directory"
def sha256_file_hash(file):
"""Get SHA-256 checksum of a file"""
    # Read the file in 64 MiB chunks to bound memory usage
    BUFFER_SIZE = 64 * 1024 * 1024
    hash_obj = hashlib.sha256()
    with open(file, 'rb') as file_obj:
        while data := file_obj.read(BUFFER_SIZE):
            hash_obj.update(data)
    return hash_obj.hexdigest()
def sync_directory(src_dir, dst_dir, show_deletions=False):
"""Backup dir by only copying/deleting files as needed (similar to rsync)"""
src_dir = str(src_dir)
dst_dir = str(dst_dir)
if not os.path.exists(dst_dir):
os.mkdir(dst_dir)
# List of names found ONLY in dst_dir (will be trimmed down)
dst_dir_only_names = set(os.listdir(dst_dir))
try:
src_dir_names = os.listdir(src_dir)
except PermissionError as exc:
print(f'INFO: os.listdir() raised error trying to access "{str(src_dir)}"')
print(f"=> {exc}")
src_dir_names = []
for src_name in src_dir_names:
# Already found in src_dir, so not unique to dst_dir
if src_name in dst_dir_only_names:
dst_dir_only_names.remove(src_name)
src_path = os.path.join(src_dir, src_name)
dst_path = os.path.join(dst_dir, src_name)
# Base case: src_path is for a file
if os.path.isfile(src_path):
if not (
os.path.exists(dst_path) and
os.path.isfile(dst_path) and
os.path.getsize(dst_path) == os.path.getsize(src_path) and
os.path.getmtime(dst_path) == os.path.getmtime(src_path)
):
# Try to copy file, as well as metadata (ctime, mtime, atime)
try:
shutil.copy2(src_path, dst_path)
# May occasionally generate OSError (or subclass thereof)
# But these will be considered "spurious" if all content was copied
except OSError as exc:
print(f'INFO: shutil.copy2() raised error copying "{str(src_path)}"')
print(f"=> {exc}")
# Can continue normal operation as long as both files have same contents
# Either shutil.copy2() partially succeeded, or both files already same
if (
os.path.exists(dst_path) and
os.path.isfile(dst_path) and
os.path.getsize(dst_path) == os.path.getsize(src_path) and
sha256_file_hash(src_path) == sha256_file_hash(dst_path) ):
print("=> But files have matching content")
# If contents not same, copying operation definitely failed
else:
# Can still continue trying to copy other files
# No need to re-raise, just print for user to see
print(f'=> Couldn\'t copy file: "{src_path}"')
continue #to next src_path
# Make sure dst_path's ctime same as src_path
# setctime() might be expensive, so check if needed first
src_ctime = os.path.getctime(src_path)
dst_ctime = os.path.getctime(dst_path)
if src_ctime != dst_ctime:
setctime(dst_path, src_ctime)
# Make sure dst_path's atime/mtime same as src_path
os.utime(dst_path, (
os.path.getatime(src_path),
os.path.getmtime(src_path)))
# Let user know if any file times still don't match (highly unusual)
for attr_name in ("ctime", "mtime", "atime"):
getxtime = getattr(os.path, f"get{attr_name}")
if getxtime(src_path) != getxtime(dst_path):
print(f"INFO: {attr_name} mismatch: {src_path!r}")
# Use recursion to sync sub-directories
elif os.path.isdir(src_path):
sync_directory(src_path, dst_path, show_deletions=show_deletions)
# Remove all paths that were only found in dst
for dst_name in dst_dir_only_names:
dst_path = os.path.join(dst_dir, dst_name)
if show_deletions:
print(f'Deleting "{dst_path}"...')
if os.path.isfile(dst_path):
os.remove(dst_path)
elif os.path.isdir(dst_path):
shutil.rmtree(dst_path, onerror=rmtree_onerror)
# Can only reliably set atime/mtime after modifying dir itself
os.utime(dst_dir, (os.path.getatime(src_dir), os.path.getmtime(src_dir)))
"=================================================================================================="
"Main"
def main():
# Set up interface for docopt
args = docopt.docopt(USAGE_MSG)
src_dir = Path(args["<src_dir>"]).absolute()
dst_dir = Path(args["<dst_dir>"]).absolute()
show_deletions = args["--show-deletions"]
sync_directory(src_dir, dst_dir, show_deletions=show_deletions)
if __name__ == "__main__":
main()
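For reference, a typical invocation follows the USAGE_MSG docopt string above (the paths are the same placeholders used in the rsync example below, not real ones):
$ python mysync.py --show-deletions "C:/path/to/Documents" "E:/path/to/DocumentsBackup"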
The reason I wrote this script is that when I tried to do the equivalent with rsync, it was much slower. This was rsync used in MSYS2 (mingw64), but I noticed similar results with rsync in WSL2 (so it seems related to how rsync itself works).
I'm currently running:
- Windows 10
- Python 3.11.4
- rsync version 3.2.7 (in MSYS2)
Example:
I mirror a 27 GB Documents directory (~145,000 files, 17,000 folders) to an external hard drive. As a baseline test, the source and the mirror had already been synced beforehand (though I got similar results when a few files had changed).
With rsync, it appeared to get stuck at around the 6-minute mark, and I eventually stopped it manually at 33 minutes. I suspect it would have finished eventually (I've used the same invocation for other sync jobs before), but I don't know how long it would have taken:
$ C:\msys64\usr\bin\rsync.exe ^
--archive ^
--delete ^
--whole-file ^
--partial ^
--times ^
--crtimes ^
--atimes ^
--verbose ^
--progress ^
--info=progress2,name0 ^
"C:/path/to/Documents" "E:/path/to/DocumentsBackup"
sending incremental file list
...
4,147,432 0% 10.80kB/s 0:06:11 (xfr#6, ir-chk=1001/54912)
rsync error: received SIGINT, SIGTERM, or SIGHUP (code 20) at rsync.c(713) [sender=3.2.7]
For the same sync task, my Python script finished in 88 seconds.
I have two questions:
1. Why is rsync so much slower than my Python script (i.e., what are they doing differently)?
2. How can I get both the functionality of rsync (or cp, or a similar tool) and the performance of my Python script?
Important: I would strongly prefer the chosen utility/approach to also work on Linux, since I'm currently trying to migrate away from Windows.
Thanks in advance.
Update #1: I'm aware that updating crtimes is not easy on Linux filesystems. So any solution that at least preserves access/modification times is fine.
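For what it's worth, preserving access/modification times portably needs only the standard library, which the script above already relies on. A minimal sketch (the paths are hypothetical placeholders):
import os
import shutil

src = "/media/user/hdd/Documents/notes.txt"   # hypothetical paths
dst = os.path.expanduser("~/Documents/notes.txt")

shutil.copy2(src, dst)  # copies contents plus mtime/atime (and other stat info)
# Or, if the file was copied by other means, set atime/mtime explicitly:
os.utime(dst, (os.path.getatime(src), os.path.getmtime(src)))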
Update #2: Well, this was unexpected.
I decided to test both the rsync and mysync.py approaches on my Linux Mint dual boot. I already had a slightly older copy of the Documents folder on the Linux system (since I'm migrating), so I tried syncing from the external hard drive to the Documents directory in Linux (some files had changed in the meantime). The only change I made to the rsync invocation was dropping the --crtimes flag (my Linux Mint rsync doesn't seem to support it):
$ time rsync --archive --delete --whole-file --partial --times --atimes --verbose --progress --info=progress2,name0 /media/user/hdd/Documents/ ~/Documents/
sending incremental file list
...
3,061,695,271 10% 10.87MB/s 0:04:28 (xfr#81200, to-chk=0/162233)
sent 3,070,547,352 bytes received 1,777,891 bytes 11,357,949.14 bytes/sec
total size is 28,796,538,110 speedup is 9.37
real 4m30.102s
user 0m12.905s
sys 0m48.600s
Running rsync on a real Linux system, it took 4 minutes 30 seconds the first time. Since the directories were now in sync, I ran it again to see whether a second run would be faster or slower:
$ time rsync --archive --delete --whole-file --partial --times --atimes --verbose --progress --info=progress2,name0 /media/user/hdd/Documents/ ~/Documents/
sending incremental file list
0 0% 0.00kB/s 0:00:00 (xfr#0, to-chk=0/162233)
sent 4,544,474 bytes received 20,350 bytes 212,317.40 bytes/sec
total size is 28,796,538,110 speedup is 6,308.36
real 0m21.466s
user 0m1.439s
sys 0m6.301s
This time it took only 21 seconds. Finally, for comparison, I tried mysync.py (specifically, an adapted version without any of the setctime() functionality; it appears as jsync.py below):
$ time python jsync.py /media/user/hdd/Documents/ ~/Documents/
real 2m24.725s
user 0m5.298s
sys 0m35.002s
It took 2 minutes 24 seconds (this was actually the second run; somehow the first run seemed to stall, so I manually stopped it at the 3-minute mark).
Conclusion: rsync is much slower than mysync.py on Windows + MSYS2, but this is almost exactly reversed on a real Linux system.
The good news for me is that I can keep using mysync.py for now, and then simply switch to rsync once I've migrated to Linux. However, I'm leaving this question open in case others still need rsync to work for them in MSYS2. I'd also still like to know what causes such a huge performance difference on Windows 10.
Answer 1
One possible explanation is that a first run of either the script or rsync loads the metadata of both directory trees into memory (the OS file cache), making subsequent runs faster.
This theory only holds if the directories contain a large number of files.
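One quick way to check this theory would be to time two consecutive metadata-only scans of the same tree: if the second pass is much faster, warm-cache effects are in play. A minimal sketch (standard library only; the path is a hypothetical placeholder):
import os
import time

def scan_metadata(root):
    """Walk the tree, stat-ing every file (metadata only, no contents)."""
    count = 0
    for dirpath, _dirnames, filenames in os.walk(root):
        for name in filenames:
            try:
                os.stat(os.path.join(dirpath, name))
                count += 1
            except OSError:
                pass  # ignore files that vanish or deny access mid-scan
    return count

for attempt in (1, 2):
    start = time.perf_counter()
    n = scan_metadata("C:/path/to/Documents")  # hypothetical path
    print(f"pass {attempt}: stat'd {n} files in {time.perf_counter() - start:.1f} s")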