以下 for 循环并行运行数千个作业
# Source extract that every polygon clip reads from.
OSMSOURCE=europe-latest.o5m
# Visit every entry whose name matches URBAN_<uppercase letter>...<digit>.
for SHAPEFILE in URBAN_[A-Z]*[0-9] ;do
cd $SHAPEFILE
for POLYGON in *.poly ;do
# The trailing & backgrounds each osmconvert, so one job is started per
# .poly file with no upper bound on how many run at once.
# Output name: <source name with - replaced by _>_<polygon name>.o5m
osmconvert --drop-version $OSMSOURCE -B=$POLYGON --out-o5m > $(basename $OSMSOURCE .o5m |tr "-" "_")_$(basename $POLYGON .poly).o5m &
done
cd ..
done
我想了解如何用 GNU Parallel 来执行这些作业，以及它是否值得使用。
答案1
GNU Parallel 也能完成同样的工作，而且很容易使用。它的优点是会考虑机器的 CPU 核心数量，默认情况下同时运行的作业数不会超过该数量 (*)。
你的程序则没有这样的限制。如果您有数百个 .poly 文件，就会同时产生数百个 osmconvert 作业；这在最好的情况下也不是最优的，在最坏的情况下可能会导致系统瘫痪（这取决于您的资源）。
你的程序会是这样的(未经测试):
OSMSOURCE=europe-latest.o5m
# Output prefix: source name without .o5m, dashes turned into underscores.
OSMBASENAME="$(echo "${OSMSOURCE%.o5m}" | tr - _)"

# Print one shell command per .poly file and let parallel schedule them.
# The generator never changes directory itself, so a bad entry cannot leave
# the loop stranded in the wrong working directory.
for SHAPEFILE in URBAN_[A-Z]*[0-9]; do
  # Skip non-directories, including the literal pattern when nothing matches.
  [ -d "$SHAPEFILE" ] || continue
  for POLYGON in "$SHAPEFILE"/*.poly; do
    # An unmatched glob stays literal; do not emit a job for it.
    [ -e "$POLYGON" ] || continue
    POLYGON="${POLYGON##*/}"
    # %q shell-quotes every name, so spaces and quote characters survive.
    # "&&" keeps osmconvert from running when the job's cd fails.
    printf 'cd %q && osmconvert --drop-version %q -B=%q --out-o5m > %q\n' \
      "$SHAPEFILE" "$OSMSOURCE" "$POLYGON" \
      "${OSMBASENAME}_${POLYGON%.poly}.o5m"
  done
done | parallel # You may want to add a -j option
(*) 您可以为其指定自己的阈值。您可能想保留一些备用 CPU 核心用于其他用途。另一方面,如果 I/O 是瓶颈,您可能需要提供比默认值更高的数字。
答案2
你也许可以这样做:
OSMSOURCE=europe-latest.o5m
export OSMSOURCE

#######################################
# Clip the OSM source extract with one polygon file.
# Globals:
#   OSMSOURCE (read) - path of the .o5m source; it is used after the cd,
#                      so a relative path is resolved inside the directory $1.
# Arguments:
#   $1 - directory that contains the polygon file
#   $2 - name of the .poly file inside that directory
# Outputs:
#   Writes <source name, - replaced by _>_<polygon name>.o5m into $1.
# Returns:
#   Non-zero if the directory cannot be entered, else osmconvert's status.
#######################################
doit() {
  local dir=$1 polygon=$2
  local src_name poly_name

  # Stop here rather than writing output into the wrong directory.
  cd "$dir" || return

  src_name=${OSMSOURCE##*/}
  src_name=${src_name%.o5m}
  poly_name=${polygon##*/}
  poly_name=${poly_name%.poly}

  # Everything is quoted so names containing whitespace stay intact.
  osmconvert --drop-version "$OSMSOURCE" -B="$polygon" --out-o5m \
    > "${src_name//-/_}_${poly_name}.o5m"
}
# Export the function so the shells started by parallel can call it.
export -f doit
您现在可以手动测试这是否有效:
# Manual check: first argument is the directory, second the .poly file in it.
doit URBAN_dir file_in_URBAN_dir.poly
当它起作用时:
# One doit call per matched .poly path: {//} passes the path's directory part
# and {/} its file name, matching doit's two arguments.
parallel doit {//} {/} ::: URBAN_[A-Z]*[0-9]/*.poly
如果报错 command too long，请尝试：
# Quote the -name pattern so that find, not the calling shell, expands it;
# unquoted, it would be replaced by any .poly files in the current directory.
find URBAN_[A-Z]*[0-9] -name '*.poly' | parallel doit {//} {/}
或者:
# The dot before "poly" is escaped so only a real ".poly" suffix matches
# (an unescaped dot would also accept names such as "xpoly").
find . | grep -E 'URBAN_[A-Z].*[0-9]/.*\.poly$' | parallel doit {//} {/}
花一个小时通读 man parallel_tutorial。你的命令行会感谢你的。
答案3
根据可用答案和一小组数据对转换进行计时测试。
测试数据集
ESRI Shapefile 目录
URAU_RG_100K_2011_2014_AT001L2
URAU_RG_100K_2011_2014_AT002L2
URAU_RG_100K_2011_2014_AT003L2
URAU_RG_100K_2011_2014_AT004L2
URAU_RG_100K_2011_2014_AT005L2
URAU_RG_100K_2011_2014_AT006L1
URAU_RG_100K_2011_2014_UK546L0
其中包括以下 9 个 .poly 文件
URAU_RG_100K_2011_2014_AT001L2/URAU_RG_100K_2011_2014_AT001L2_0.poly
URAU_RG_100K_2011_2014_AT002L2/URAU_RG_100K_2011_2014_AT002L2_0.poly
URAU_RG_100K_2011_2014_AT003L2/URAU_RG_100K_2011_2014_AT003L2_0.poly
URAU_RG_100K_2011_2014_AT004L2/URAU_RG_100K_2011_2014_AT004L2_0.poly
URAU_RG_100K_2011_2014_AT005L2/URAU_RG_100K_2011_2014_AT005L2_0.poly
URAU_RG_100K_2011_2014_AT006L1/URAU_RG_100K_2011_2014_AT006L1_0.poly
URAU_RG_100K_2011_2014_UK546L0/URAU_RG_100K_2011_2014_UK546L0_0.poly
URAU_RG_100K_2011_2014_UK546L0/URAU_RG_100K_2011_2014_UK546L0_1.poly
URAU_RG_100K_2011_2014_UK546L0/URAU_RG_100K_2011_2014_UK546L0_2.poly
将同时启动 9 个作业
Fork（用 & 派生后台进程）
# Fork variant: cd into each matching directory and start one timed
# osmconvert per .poly file in the background (&), with no job limit.
for SHAPEFILE in URAU_RG_100K_2011_2014_[A-Z]*[0-9]/ ;do cd $SHAPEFILE && for POLYGON in *.poly ;do time osmconvert --drop-version $OSMSOURCE -B=$POLYGON --out-o5m > $(basename $OSMSOURCE .o5m |tr "-" "_")_$(basename $POLYGON .poly).o5m & done && cd .. ;done
real 6m0.951s user 5m36.869s sys 0m20.298s real 6m23.591s user 5m43.808s sys 0m20.336s real 6m24.066s user 5m44.619s sys 0m19.936s real 6m24.129s user 5m45.239s sys 0m19.378s real 6m29.208s user 5m43.094s sys 0m19.314s real 6m30.974s user 5m44.318s sys 0m19.870s real 6m33.625s user 5m45.233s sys 0m19.658s real 6m33.731s user 5m45.712s sys 0m20.001s real 6m41.014s user 6m15.112s sys 0m19.571s
GNU Parallel
# GNU parallel variant: print one "cd DIR; time osmconvert ..." command line
# per .poly file and pipe the list to parallel -j 10. The output file name is
# computed here, while the list is generated, not inside the job.
for SHAPEFILE in URAU_RG_100K_2011_2014_[A-Z]*[0-9]*/ ;do cd "$SHAPEFILE"; for POLYGON in *.poly ;do echo "cd '$SHAPEFILE'; time osmconvert --drop-version '$OSMSOURCE' -B='$POLYGON' --out-o5m > $(basename $OSMSOURCE .o5m |tr "-" "_")_$(basename $POLYGON .poly).o5m"; done; cd ..; done |parallel -j 10
real 6m19.005s user 5m42.739s sys 0m18.798s real 6m26.939s user 5m44.689s sys 0m19.257s real 6m27.152s user 5m44.597s sys 0m19.644s real 6m28.821s user 5m41.650s sys 0m18.283s real 6m38.174s user 5m44.367s sys 0m19.564s real 6m40.277s user 5m45.000s sys 0m19.650s real 6m39.940s user 5m45.421s sys 0m19.208s real 6m40.285s user 5m45.443s sys 0m19.393s real 6m40.428s user 5m48.828s sys 0m18.871s
使用绝对路径和文件名
# Absolute-path variant: no cd. readlink -f resolves each polygon file, and
# the output is written into that file's directory. Unlike the other variants,
# dashes in the source name are not replaced by underscores here (no tr).
for SHAPEFILE in URAU_RG_100K_2011_2014_[A-Z]*[0-9]*/ ;do for POLYGON in $SHAPEFILE*.poly ;do echo "time osmconvert --drop-version --out-o5m $OSMSOURCE -B=$(readlink -f $POLYGON) > $(dirname `readlink -f $POLYGON`)/$(basename ${OSMSOURCE%.o5m})_$(basename ${POLYGON%.poly}).o5m" ;done ;done |parallel -j 10
real 6m6.919s user 5m39.203s sys 0m19.659s real 6m23.779s user 5m43.835s sys 0m19.225s real 6m26.033s user 5m45.370s sys 0m19.235s real 6m26.871s user 5m46.124s sys 0m19.780s real 6m33.355s user 5m41.902s sys 0m18.556s real 6m34.368s user 5m42.973s sys 0m19.156s real 6m37.063s user 5m46.169s sys 0m19.669s real 6m37.363s user 5m46.846s sys 0m19.194s real 6m37.428s user 5m49.679s sys 0m19.674s
定义 shell 函数
# Absolute path, so the source is still found after the function's cd.
OSMSOURCE=$(realpath europe-latest.o5m)
export OSMSOURCE

#######################################
# Clip the OSM source extract with one polygon file.
# Globals:
#   OSMSOURCE (read) - path of the .o5m source extract
# Arguments:
#   $1 - directory that contains the polygon file
#   $2 - name of the .poly file inside that directory
# Outputs:
#   Writes <source name, - replaced by _>_<polygon name>.o5m into $1.
# Returns:
#   Non-zero if the directory cannot be entered, else osmconvert's status.
#######################################
osmpolyclip() {
  local dir=$1 polygon=$2
  local src_name poly_name

  # Stop here rather than writing output into the wrong directory.
  cd "$dir" || return

  src_name=${OSMSOURCE##*/}
  src_name=${src_name%.o5m}
  poly_name=${polygon##*/}
  poly_name=${poly_name%.poly}

  # Everything is quoted so names containing whitespace stay intact.
  osmconvert --drop-version "$OSMSOURCE" -B="$polygon" --out-o5m \
    > "${src_name//-/_}_${poly_name}.o5m"
}
# Export the function so the shells started by parallel can call it.
export -f osmpolyclip
和
# Time one osmpolyclip call per matched .poly path: {//} passes the path's
# directory part and {/} its file name, matching the function's two arguments.
parallel time osmpolyclip {//} {/} ::: URAU_[A-Z]*[0-9]/*.poly
real 6m19.749s user 5m41.909s sys 0m19.162s real 6m23.559s user 5m43.881s sys 0m18.748s real 6m34.149s user 5m41.859s sys 0m18.895s real 6m38.776s user 5m44.614s sys 0m18.767s real 6m38.817s user 5m44.420s sys 0m19.038s real 6m39.421s user 5m46.060s sys 0m18.819s real 6m39.889s user 5m45.917s sys 0m19.541s real 6m39.956s user 5m48.368s sys 0m19.742s real 6m51.397s user 6m26.095s sys 0m19.306s
然而,与要转换的数千个多边形文件相比,这只是一个最小的测试集。最重要的是,这些文件驻留在远程目录中,这实际上会减慢读写速度。
答案4
80 核系统中的时序
使用绝对路径和文件名,
# Absolute-path variant: no cd. readlink -f resolves each polygon file, and
# the output is written into that file's directory; the generated command
# lines are piped to parallel -j 10.
for SHAPEFILE in URAU_RG_100K_2011_2014_[A-Z]*[0-9]*/ ;do for POLYGON in $SHAPEFILE*.poly ;do echo "time osmconvert --drop-version --out-o5m $OSMSOURCE -B=$(readlink -f $POLYGON) > $(dirname `readlink -f $POLYGON`)/$(basename ${OSMSOURCE%.o5m})_$(basename ${POLYGON%.poly}).o5m" ;done ;done |parallel -j 10
使用 GNU Parallel 进行 3559 次转换的时间(以秒为单位)为
timings.real timings.user timings.sys
Min. :387.4 Min. :367.5 Min. :17.95
1st Qu.:636.8 1st Qu.:616.3 1st Qu.:19.35
Median :639.4 Median :618.5 Median :19.74
Mean :637.6 Mean :616.8 Mean :19.81
3rd Qu.:642.5 3rd Qu.:621.5 3rd Qu.:20.15
Max. :709.3 Max. :689.9 Max. :27.34
系统详情
这些进程运行在 Docker 容器中的 Ubuntu 16.04.2 LTS 内(请参阅https://hub.docker.com/r/_/ubuntu/),而主机是 CentOS 机器(Linux 3.10.0-514.26.2.el7.x86_64 #1 SMP)。总内存为 1056760752 KB,处理器为 Intel(R) Xeon(R) CPU E7-4820 v3 @ 1.90GHz,处理器数量为 80。