LSF 使用修改后的提交选项重新排队作业

LSF 使用修改后的提交选项重新排队作业

我有一个脚本,它通过 LSF 调度程序在集群上运行模拟计算。在我的设置中,“main.bash”会执行 LSF 作业脚本“simulator.bash”。如果某个线程/索引因某种原因失败,则该作业会通过 brequeue 重新排队,直到所有线程都成功完成任务。

对于像 Slurm 这样的调度程序,我们有 scontrol,它可用于修改 RUNTIMELIMIT、核心数、节点数等属性,以便提交/重新排队一个已完成的作业。

我的问题是:LSF 中是否存在一种机制,让我们可以修改已完成(done/finished)的作业并将其重新排队?

main.bash 的内容

#!/usr/bin/env bash

#######################################
# Report whether any element of an LSF job is in the EXIT state.
# Arguments: $1 - LSF job ID to inspect
# Outputs:   stdout - exactly one line: 1 if any STAT column contains EXIT,
#                     0 otherwise
#            stderr - a per-element bjobs report for the operator
# Returns:   status of the final echo (0 in practice)
#######################################
collect_job_status() {
    local jid=$1
    local statuses

    # The detailed report goes to stderr so that callers capturing stdout with
    # $(...) receive only the 0/1 flag and can use it in a numeric test.
    bjobs -noheader -o 'jobid:-10 jobindex:-10 exit_code:-10 stat:-5 start_time:-15 finish_time:-15 runtimelimit:-15 resume_reason:15 exit_reason:20' "$jid" >&2

    # Third whitespace-separated column of the wide listing is read as the
    # job state; one line per job / array element.
    statuses=$(bjobs -noheader -w -a "$jid" | awk '{print $3}')

    if [[ $statuses == *EXIT* ]]; then
        echo 1
    else
        echo 0
    fi
}

#######################################
# Block until the given job has ended by synchronously submitting a
# throw-away job whose dependency is "ended(<jid>)".
# Arguments: $1 - LSF job ID to wait for
# Outputs:   progress message plus whatever bsub prints
# Returns:   exit status of bsub
#######################################
_wait_for_job_end() {
    local jid=$1

    echo "Submitting dummy job for waiting."
    # NOTE(review): "-N 1" is kept exactly as in the original command line;
    # confirm whether "-n 1" (slot count) was intended.
    bsub -K -P myproject -w "ended($jid)" -W '00:10' -J "DUMM_JOB_$((RANDOM % 1000))" -N 1 "sleep 5; exit 0;"
}

#######################################
# Submit ./simulator.bash through bsub, wait for it to end, and keep
# requeueing its EXIT-ed elements until collect_job_status reports 0.
# Arguments: none
# Outputs:   progress messages on stdout, diagnostics on stderr
# Returns:   0 once no element is reported as failed,
#            1 if no numeric job ID could be read from the bsub output
#######################################
main() {
    local output jid jname failed current_run_time

    output=$(bsub <./simulator.bash)
    # The job ID is expected between the first '<' and the following '>'.
    jid=${output#*<}
    jid=${jid%%>*}
    if [[ ! $jid =~ ^[0-9]+$ ]]; then
        echo "Could not read a job ID from bsub output: $output" >&2
        return 1
    fi
    echo "Submitted JobId = $jid"

    _wait_for_job_end "$jid"

    # Name column of the row with the highest job index.
    jname=$(bjobs -noheader -o 'job_name jobindex' "$jid" | tr -s ' ' | sort -n -k2 | tail -1 | cut -d ' ' -f1)
    echo "Job NAME :: $jname"

    echo "JOB EXIT STATUS for $jid :: $(collect_job_status "$jid")"
    while true; do
        # Only the last stdout line is the 0/1 flag; anything printed before
        # it is report text and must not reach the numeric comparison.
        failed=$(collect_job_status "$jid" | tail -n 1)
        [[ $failed -ne 0 ]] || break

        echo "-----------------------------------------------------------------------------------------------"
        echo -e "Job $jname $jid Failed.\nREQUEUING JOB..."

        # Query the job being handled rather than a fixed job ID.
        # NOTE(review): the value is not used further in this function, and
        # field 13 depends on how the date columns split on spaces - verify.
        current_run_time=$(bjobs -noheader -o 'jobid:-10 jobindex:-10 exit_code:-10 stat:-5 start_time:-15 finish_time:-15 runtimelimit:-15 resume_reason:15 exit_reason:20' "$jid" | tr -s ' ' | cut -d ' ' -f13 | sort -u -rn)

        brequeue -J "$jname" -e "$jid"
        _wait_for_job_end "$jid"
    done
    echo "-----------------------------------------------------------------------------------------------"
}

# Script entry point: run the submit / wait / requeue workflow in main().
main

simulator.bash 的内容

#!/usr/bin/env bash

# Embedded LSF submission options. main.bash feeds this file to bsub on
# stdin, so the directive lines below stand in for command-line flags and
# must stay byte-exact. Meanings per the bsub reference (confirm for the
# installed LSF version):
#   -W    run-time limit
#   -P    project name
#   -n    number of slots/tasks
#   -o/-e stdout / stderr files (%J and %I presumably expand to job ID and
#         array index)
#   -cwd  working directory for the job
#   -J    job name; the [1-10] suffix requests array indices 1 through 10
#BSUB -W 00:10
#BSUB -P myproject
#BSUB -n 1
#BSUB -o test_reque_%J_%I.out
#BSUB -e test_reque_%J_%I.err
#BSUB -cwd /opt
#BSUB -J TESTING_REQUE[1-10]

# NOTE(review): presumably turns off the LSF job report e-mail - confirm.
export LSB_JOB_REPORT_MAIL=N

# Model directory per array element. Slots are 0-based; the script looks
# them up with LSB_JOBINDEX - 1.
declare -a MODEL_DIR_FOLDER=(
    [0]=/path/to/model/1
    [1]=/path/to/model/2
    [2]=/path/to/model/3
    [3]=/path/to/model/4
    [4]=/path/to/model/5
    [5]=/path/to/model/6
    [6]=/path/to/model/7
    [7]=/path/to/model/8
    [8]=/path/to/model/9
    [9]=/path/to/model/10
)

# Simulation file name per array element, indexed the same way as
# MODEL_DIR_FOLDER.
declare -a SIM_FILE_NAME=(
    [0]=sim_file_1
    [1]=sim_file_2
    [2]=sim_file_3
    [3]=sim_file_4
    [4]=sim_file_5
    [5]=sim_file_6
    [6]=sim_file_7
    [7]=sim_file_8
    [8]=sim_file_9
    [9]=sim_file_10
)

# Simulated flakiness: when $RANDOM is even, report on stderr and abort
# with status 127.
if (( RANDOM % 2 == 0 )); then
    printf '%s\n' "FAILING THIS JOB $LSB_JOBINDEX" >&2
    exit 127
fi

# The arrays are 0-based, so LSB_JOBINDEX is shifted down by one for lookup.
printf '%s\n' "JOB@@_OUTPUT $LSB_JOBINDEX :: MODEL_DIR_FOLDER :: ${MODEL_DIR_FOLDER[LSB_JOBINDEX - 1]}"
printf '%s\n' "JOB@@_OUTPUT $LSB_JOBINDEX :: SIM_FILE_NAME    :: ${SIM_FILE_NAME[LSB_JOBINDEX - 1]}"

# Stand-in for the real workload.
sleep 10
exit 0

相关内容