我有一个脚本,它通过 LSF 调度程序在集群上运行模拟计算。在我的设置中,“main.bash”会提交 LSF 作业脚本“simulator.bash”。如果作业数组中的某个线程/索引因某种原因失败,则通过 brequeue 将作业重新排队,直到所有线程都成功完成任务。
在 slurm 这样的调度程序中,我们有 scontrol,可用于修改已提交/已重新排队/已完成作业的 RUNTIMELIMIT、核心数、节点数等属性。
我的问题是:LSF 中是否存在一种机制,可以让我们修改已完成/已结束的作业并将其重新排队?
main.bash 的内容
#!/usr/bin/env bash
#######################################
# Report whether any element of an LSF job is in the EXIT state.
# Arguments:
#   $1 - LSF job ID (for a job array, bjobs lists one row per element)
# Outputs:
#   stdout - exactly one line: "1" if any row's status column contains
#            EXIT, otherwise "0"
#   stderr - a per-element bjobs report for the operator
#######################################
collect_job_status() {
  local jid=$1
  local status

  # The human-readable report goes to stderr so that callers capturing
  # stdout with $(...) receive only the 0/1 flag and can use it directly
  # in a numeric comparison.
  bjobs -noheader -o 'jobid:-10 jobindex:-10 exit_code:-10 stat:-5 start_time:-15 finish_time:-15 runtimelimit:-15 resume_reason:15 exit_reason:20' "$jid" >&2

  # Third space-separated column of each row (STAT in the default bjobs
  # layout); for a job array this yields one status per line.
  status=$(bjobs -noheader -w -a "$jid" | tr -s ' ' | cut -d ' ' -f3)

  if [[ $status == *EXIT* ]]; then
    echo 1
  else
    echo 0
  fi
}
#######################################
# Block until an LSF job has ended.
# Submits a throw-away job whose dependency is "ended(<jobid>)" and uses
# bsub -K so the call does not return until that throw-away job finishes.
# Arguments:
#   $1 - LSF job ID to wait for
# Outputs:
#   stdout - a progress message plus whatever bsub prints
#######################################
_wait_until_ended() {
  local jid=$1

  echo "Submitting dummy job for waiting."
  # NOTE(review): bsub's -N is normally a flag without an argument; "-N 1"
  # may have been intended as "-n 1" - confirm against the site's LSF docs.
  bsub -K -P myproject -w "ended($jid)" -W '00:10' \
    -J "DUMM_JOB_$((RANDOM % 1000))" -N 1 "sleep 5; exit 0;"
}

#######################################
# Submit ./simulator.bash to LSF, wait for it to end, and keep requeueing
# the exited elements until collect_job_status reports no failure.
# Globals:
#   none written; calls collect_job_status, bsub, bjobs and brequeue
# Outputs:
#   stdout - progress messages
#   stderr - an error if no job ID could be parsed from the bsub output
# Returns:
#   1 if the submission did not yield a numeric job ID, otherwise the
#   status of the final echo
#######################################
main() {
  local output jid jname current_run_time
  local -r separator="-----------------------------------------------------------------------------------------------"

  output=$(bsub <./simulator.bash)
  # The job ID is the text between the first '<' and the following '>'
  # in bsub's confirmation message.
  jid=${output#*<}
  jid=${jid%%>*}
  if [[ ! $jid =~ ^[0-9]+$ ]]; then
    echo "Could not determine the job ID from bsub output: $output" >&2
    return 1
  fi
  echo "Submitted JobId = $jid"

  _wait_until_ended "$jid"

  # Job name taken from the row with the highest job index.
  jname=$(bjobs -noheader -o 'job_name jobindex' "$jid" | tr -s ' ' | sort -n -k2 | tail -1 | cut -d ' ' -f1)
  echo "Job NAME :: $jname"
  echo "JOB EXIT STATUS for $jid :: $(collect_job_status "$jid")"

  # The 0/1 flag is the last line collect_job_status writes to stdout;
  # taking only that line keeps the numeric test valid even when report
  # lines precede it.
  while [[ "$(collect_job_status "$jid" | tail -n 1)" -ne 0 ]]; do
    echo "$separator"
    echo -e "Job $jname $jid Failed.\nREQUEUING JOB..."
    # Query the job that was actually submitted (previously a fixed,
    # unrelated job ID was used here).
    # NOTE(review): the value is not used yet, and field 13 assumes a fixed
    # number of space-separated tokens per row - confirm before relying on it.
    current_run_time=$(bjobs -noheader -o 'jobid:-10 jobindex:-10 exit_code:-10 stat:-5 start_time:-15 finish_time:-15 runtimelimit:-15 resume_reason:15 exit_reason:20' "$jid" | tr -s ' ' | cut -d ' ' -f13 | sort -u -rn)
    # -e requeues only the elements that ended in the EXIT state.
    brequeue -J "$jname" -e "$jid"
    _wait_until_ended "$jid"
  done
  echo "$separator"
}
# Entry point: run the submit / wait / requeue workflow.
main "$@"
simulator.bash 的内容
#!/usr/bin/env bash
#BSUB -W 00:10
#BSUB -P myproject
#BSUB -n 1
#BSUB -o test_reque_%J_%I.out
#BSUB -e test_reque_%J_%I.err
#BSUB -cwd /opt
#BSUB -J TESTING_REQUE[1-10]
#
# Test job-array script for exercising requeue handling.
# The #BSUB lines above carry the submission options (the script is fed to
# bsub on stdin by main.bash). Each array element either fails on purpose
# with exit code 127 (roughly half of the time) or prints the model folder
# and simulation file that belong to its index, sleeps, and exits 0.
# Reads LSB_JOBINDEX, the 1-based index of the current array element.

export LSB_JOB_REPORT_MAIL=N

# Per-element inputs; element N of the array job uses entry N-1.
MODEL_DIR_FOLDER=(
  /path/to/model/1
  /path/to/model/2
  /path/to/model/3
  /path/to/model/4
  /path/to/model/5
  /path/to/model/6
  /path/to/model/7
  /path/to/model/8
  /path/to/model/9
  /path/to/model/10
)
SIM_FILE_NAME=(
  sim_file_1
  sim_file_2
  sim_file_3
  sim_file_4
  sim_file_5
  sim_file_6
  sim_file_7
  sim_file_8
  sim_file_9
  sim_file_10
)

# Simulated flakiness: an even random draw makes this element fail.
if ((RANDOM % 2 == 0)); then
  printf '%s\n' "FAILING THIS JOB $LSB_JOBINDEX" >&2
  exit 127
fi

# Convert the 1-based array-job index into a 0-based bash array subscript.
slot=$((LSB_JOBINDEX - 1))
printf '%s\n' "JOB@@_OUTPUT $LSB_JOBINDEX :: MODEL_DIR_FOLDER :: ${MODEL_DIR_FOLDER[slot]}"
printf '%s\n' "JOB@@_OUTPUT $LSB_JOBINDEX :: SIM_FILE_NAME :: ${SIM_FILE_NAME[slot]}"

sleep 10
exit 0