From 4830205a97b61297e868a3eb6c5982b248559ae3 Mon Sep 17 00:00:00 2001 From: Dusan Jovic Date: Fri, 26 Jul 2024 22:55:31 +0000 Subject: [PATCH 01/15] Specify the squeue format explicitly --- tests/rt_utils.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/rt_utils.sh b/tests/rt_utils.sh index a8aba0860d..545fca302c 100755 --- a/tests/rt_utils.sh +++ b/tests/rt_utils.sh @@ -189,7 +189,7 @@ submit_and_wait() { set -e ;; slurm) - job_info=$( squeue -u "${USER}" -j "${jobid}" ) + job_info=$( squeue -u "${USER}" -j "${jobid}" -o '%i %T' ) ;; *) ;; @@ -205,7 +205,7 @@ submit_and_wait() { # Getting the status letter from scheduler info status=$( grep "${jobid}" <<< "${job_info}" ) - status=$( awk '{print $5}' <<< "${status}" ) + status=$( awk '{print $2}' <<< "${status}" ) case ${status} in #waiting cases From fb56194b73e9423bd8537859b38a1f1df541b317 Mon Sep 17 00:00:00 2001 From: Dusan Jovic Date: Fri, 26 Jul 2024 23:43:01 +0000 Subject: [PATCH 02/15] Add ${ECF_TRYNO} suffix to compile and run log files, if ecflow is used --- tests/rt_utils.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/rt_utils.sh b/tests/rt_utils.sh index 545fca302c..ee4bc535b5 100755 --- a/tests/rt_utils.sh +++ b/tests/rt_utils.sh @@ -580,7 +580,7 @@ ecflow_create_compile_task() { cat << EOF > "${ECFLOW_RUN}/${ECFLOW_SUITE}/compile_${COMPILE_ID}.ecf" %include -${PATHRT}/run_compile.sh "${PATHRT}" "${RUNDIR_ROOT}" "${MAKE_OPT}" "${COMPILE_ID}" > "${LOG_DIR}/compile_${COMPILE_ID}.log" 2>&1 & +${PATHRT}/run_compile.sh "${PATHRT}" "${RUNDIR_ROOT}" "${MAKE_OPT}" "${COMPILE_ID}" > "${LOG_DIR}/compile_${COMPILE_ID}.log.\${ECF_TRYNO}" 2>&1 & %include EOF { @@ -596,7 +596,7 @@ ecflow_create_run_task() { echo "rt_utils.sh: ${TEST_ID}: Creating ECFLOW run task" cat << EOF > "${ECFLOW_RUN}/${ECFLOW_SUITE}/${TEST_ID}${RT_SUFFIX}.ecf" %include -${PATHRT}/run_test.sh "${PATHRT}" "${RUNDIR_ROOT}" "${TEST_NAME}" "${TEST_ID}" "${COMPILE_ID}" > 
"${LOG_DIR}/run_${TEST_ID}${RT_SUFFIX}.log" 2>&1 & +${PATHRT}/run_test.sh "${PATHRT}" "${RUNDIR_ROOT}" "${TEST_NAME}" "${TEST_ID}" "${COMPILE_ID}" > "${LOG_DIR}/run_${TEST_ID}${RT_SUFFIX}.log.\${ECF_TRYNO}" 2>&1 & %include EOF { From 47e956080cea341bc60e6965b7b321f59a1e331a Mon Sep 17 00:00:00 2001 From: Dusan Jovic Date: Mon, 29 Jul 2024 14:43:49 +0000 Subject: [PATCH 03/15] Check the job exit status in rt_utils.sh --- tests/rt_utils.sh | 49 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/tests/rt_utils.sh b/tests/rt_utils.sh index ee4bc535b5..1dd7a1848f 100755 --- a/tests/rt_utils.sh +++ b/tests/rt_utils.sh @@ -187,26 +187,36 @@ submit_and_wait() { set +e job_info=$( qstat "${jobid}" ) set -e + if grep -q "${jobid}" <<< "${job_info}"; then + job_running=true + # Getting the status letter from scheduler info + status=$( grep "${jobid}" <<< "${job_info}" ) + status=$( awk '{print $5}' <<< "${status}" ) + else + job_running=false + status='COMPLETED' + exit_status=$( qstat ${jobid} -x -f | grep Exit_status | awk '{print $3}') + if [[ $exit_status != 0 ]]; then + status='FAILED' + fi + fi ;; slurm) job_info=$( squeue -u "${USER}" -j "${jobid}" -o '%i %T' ) + if grep -q "${jobid}" <<< "${job_info}"; then + job_running=true + else + job_running=false + job_info=$( sacct -n -j ${jobid} --format=JobID,state%20,Jobname%64 | grep "^${jobid}" | grep ${JBNME} ) + fi + # Getting the status letter from scheduler info + status=$( grep "${jobid}" <<< "${job_info}" ) + status=$( awk '{print $2}' <<< "${status}" ) ;; *) ;; esac - - if grep -q "${jobid}" <<< "${job_info}"; then - job_running=true - else - job_running=false - continue - fi - - # Getting the status letter from scheduler info - status=$( grep "${jobid}" <<< "${job_info}" ) - status=$( awk '{print $2}' <<< "${status}" ) - case ${status} in #waiting cases #pbs: Q @@ -229,14 +239,15 @@ submit_and_wait() { #fail/completed cases #slurm: F/FAILED TO/TIMEOUT 
CA/CANCELLED F|TO|CA|FAILED|TIMEOUT|CANCELLED) - echo "rt_utils.sh: !!!!!!!!!!JOB TERMINATED!!!!!!!!!!" + echo "rt_utils.sh: !!!!!!!!!!JOB TERMINATED!!!!!!!!!! status=${status}" job_running=false #Trip the loop to end with these status flags interrupt_job exit 1 ;; #completed - #pbs only: C-Complete E-Exiting - C|E) + #pbs: C-Complete E-Exiting + #slurm: CD/COMPLETED + C|E|CD|COMPLETED) status_label='Completed' ;; *) @@ -580,6 +591,10 @@ ecflow_create_compile_task() { cat << EOF > "${ECFLOW_RUN}/${ECFLOW_SUITE}/compile_${COMPILE_ID}.ecf" %include +( +cd "${LOG_DIR}" +ln -sf "compile_${COMPILE_ID}.log.\${ECF_TRYNO}" "compile_${COMPILE_ID}.log" +) ${PATHRT}/run_compile.sh "${PATHRT}" "${RUNDIR_ROOT}" "${MAKE_OPT}" "${COMPILE_ID}" > "${LOG_DIR}/compile_${COMPILE_ID}.log.\${ECF_TRYNO}" 2>&1 & %include EOF @@ -596,6 +611,10 @@ ecflow_create_run_task() { echo "rt_utils.sh: ${TEST_ID}: Creating ECFLOW run task" cat << EOF > "${ECFLOW_RUN}/${ECFLOW_SUITE}/${TEST_ID}${RT_SUFFIX}.ecf" %include +( +cd "${LOG_DIR}" +ln -sf "run_${TEST_ID}${RT_SUFFIX}.log.\${ECF_TRYNO}" "${LOG_DIR}/run_${TEST_ID}${RT_SUFFIX}.log" +) ${PATHRT}/run_test.sh "${PATHRT}" "${RUNDIR_ROOT}" "${TEST_NAME}" "${TEST_ID}" "${COMPILE_ID}" > "${LOG_DIR}/run_${TEST_ID}${RT_SUFFIX}.log.\${ECF_TRYNO}" 2>&1 & %include EOF From fbd9014cd6d8e0d565d4cee8609629d487243db6 Mon Sep 17 00:00:00 2001 From: Dusan Jovic Date: Mon, 29 Jul 2024 15:18:17 +0000 Subject: [PATCH 04/15] Fix shellcheck warnings --- tests/rt_utils.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/rt_utils.sh b/tests/rt_utils.sh index 1dd7a1848f..987426fba1 100755 --- a/tests/rt_utils.sh +++ b/tests/rt_utils.sh @@ -195,8 +195,10 @@ submit_and_wait() { else job_running=false status='COMPLETED' - exit_status=$( qstat ${jobid} -x -f | grep Exit_status | awk '{print $3}') - if [[ $exit_status != 0 ]]; then + set +e + exit_status=$( qstat "${jobid}" -x -f | grep Exit_status | awk '{print $3}') + set -e + if [[ 
${exit_status} != 0 ]]; then status='FAILED' fi fi @@ -207,7 +209,7 @@ submit_and_wait() { job_running=true else job_running=false - job_info=$( sacct -n -j ${jobid} --format=JobID,state%20,Jobname%64 | grep "^${jobid}" | grep ${JBNME} ) + job_info=$( sacct -n -j "${jobid}" --format=JobID,state%20,Jobname%64 | grep "^${jobid}" | grep ${JBNME} ) fi # Getting the status letter from scheduler info status=$( grep "${jobid}" <<< "${job_info}" ) From 306387ad49c202d2697474f89accaa8a0cb3711a Mon Sep 17 00:00:00 2001 From: Dusan Jovic Date: Mon, 29 Jul 2024 15:23:09 +0000 Subject: [PATCH 05/15] Fix shellcheck warnings --- tests/rt_utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/rt_utils.sh b/tests/rt_utils.sh index 987426fba1..d7305542bd 100755 --- a/tests/rt_utils.sh +++ b/tests/rt_utils.sh @@ -209,7 +209,7 @@ submit_and_wait() { job_running=true else job_running=false - job_info=$( sacct -n -j "${jobid}" --format=JobID,state%20,Jobname%64 | grep "^${jobid}" | grep ${JBNME} ) + job_info=$( sacct -n -j "${jobid}" --format=JobID,state%20,Jobname%64 | grep "^${jobid}" | grep "${JBNME}" ) fi # Getting the status letter from scheduler info status=$( grep "${jobid}" <<< "${job_info}" ) From 3b496f480f59075a6719d3899b218278e2686912 Mon Sep 17 00:00:00 2001 From: Dusan Jovic Date: Mon, 29 Jul 2024 18:33:15 +0000 Subject: [PATCH 06/15] Add ecflow_client label update --- tests/rt_utils.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/rt_utils.sh b/tests/rt_utils.sh index d7305542bd..8f7efbdf18 100755 --- a/tests/rt_utils.sh +++ b/tests/rt_utils.sh @@ -151,6 +151,7 @@ submit_and_wait() { local count=0 local job_running='' echo "rt_utils.sh: Job is waiting to enter the queue..." 
+ [[ ${ECFLOW:-false} == true ]] && ecflow_client --label=job_status "Waiting to enter the queue" until [[ ${job_running} == 'true' ]] do case ${SCHEDULER} in @@ -177,6 +178,10 @@ submit_and_wait() { if [[ ${count} -eq 13 ]]; then echo "No job in queue after one minute, exiting..."; exit 2; fi done echo "rt_utils.sh Job (${jobid}) is now in the queue." + if [[ ${ECFLOW:-false} == true ]]; then + ecflow_client --label=job_id "${jobid}" + ecflow_client --label=job_status "Submitted" + fi # wait for the job to finish and compare results local n=1 @@ -243,6 +248,7 @@ submit_and_wait() { F|TO|CA|FAILED|TIMEOUT|CANCELLED) echo "rt_utils.sh: !!!!!!!!!!JOB TERMINATED!!!!!!!!!! status=${status}" job_running=false #Trip the loop to end with these status flags + [[ ${ECFLOW:-false} == true ]] && ecflow_client --label=job_status "Failed" interrupt_job exit 1 ;; @@ -260,6 +266,7 @@ submit_and_wait() { esac echo "${n} min. ${SCHEDULER^} Job ${jobid} Status: ${status_label} (${status})" + [[ ${ECFLOW:-false} == true ]] && ecflow_client --label=job_status "${status_label}" (( n=n+1 )) sleep 60 & wait $! 
From f93e983c3664cb9d64945004455e2b8a87e21b88 Mon Sep 17 00:00:00 2001 From: Dusan Jovic Date: Mon, 29 Jul 2024 20:42:22 +0000 Subject: [PATCH 07/15] Support COMPLETING Slurm job status --- tests/rt_utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/rt_utils.sh b/tests/rt_utils.sh index 8f7efbdf18..e63c8098a9 100755 --- a/tests/rt_utils.sh +++ b/tests/rt_utils.sh @@ -234,7 +234,7 @@ submit_and_wait() { #running cases #pbs: R #slurm: (old: R, new: RUNNING) - R|RUNNING) + R|RUNNING|COMPLETING) status_label='Job running' ;; #held cases From 9690557e28a4f5dfb1d46a1ccbade53f2077f6e0 Mon Sep 17 00:00:00 2001 From: Dusan Jovic Date: Fri, 2 Aug 2024 19:36:41 +0000 Subject: [PATCH 08/15] Move code from check_results function into run_test.sh --- tests/rt_utils.sh | 134 ----------------------------------------- tests/run_test.sh | 150 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 134 insertions(+), 150 deletions(-) diff --git a/tests/rt_utils.sh b/tests/rt_utils.sh index b4c487768d..b749c26bc1 100755 --- a/tests/rt_utils.sh +++ b/tests/rt_utils.sh @@ -273,140 +273,6 @@ submit_and_wait() { done } -check_results() { - echo "rt_utils.sh: Checking results of the regression test: ${TEST_ID}" - - ROCOTO=${ROCOTO:-false} - ECFLOW=${ECFLOW:-false} - - local test_status='PASS' - - # Give one minute for data to show up on file system - #sleep 60 - - { - echo - echo "baseline dir = ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}" - echo "working dir = ${RUNDIR}" - echo "Checking test ${TEST_ID} results ...." - } > "${RT_LOG}" - echo - echo "baseline dir = ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}" - echo "working dir = ${RUNDIR}" - echo "Checking test ${TEST_ID} results ...." - - if [[ ${CREATE_BASELINE} = false ]]; then - # - # --- regression test comparison - # - for i in ${LIST_FILES} ; do - printf %s " Comparing ${i} ....." >> "${RT_LOG}" - printf %s " Comparing ${i} ....." - - if [[ ! 
-f ${RUNDIR}/${i} ]] ; then - - echo ".......MISSING file" >> "${RT_LOG}" - echo ".......MISSING file" - test_status='FAIL' - - elif [[ ! -f ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i} ]] ; then - - echo ".......MISSING baseline" >> "${RT_LOG}" - echo ".......MISSING baseline" - test_status='FAIL' - - else - if [[ ${i##*.} == nc* ]] ; then - if [[ " orion hercules hera wcoss2 acorn derecho gaea jet s4 noaacloud " =~ ${MACHINE_ID} ]]; then - printf "USING NCCMP.." >> "${RT_LOG}" - printf "USING NCCMP.." - if [[ ${CMP_DATAONLY} == false ]]; then - nccmp -d -S -q -f -g -B --Attribute=checksum --warn=format "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" > "${i}_nccmp.log" 2>&1 && d=$? || d=$? - else - nccmp -d -S -q -f -B --Attribute=checksum --warn=format "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" > "${i}_nccmp.log" 2>&1 && d=$? || d=$? - fi - if [[ ${d} -ne 0 && ${d} -ne 1 ]]; then - printf "....ERROR" >> "${RT_LOG}" - printf "....ERROR" - test_status='FAIL' - fi - fi - else - printf "USING CMP.." >> "${RT_LOG}" - printf "USING CMP.." - cmp "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" >/dev/null 2>&1 && d=$? || d=$? - if [[ ${d} -eq 2 ]]; then - printf "....ERROR" >> "${RT_LOG}" - printf "....ERROR" - test_status='FAIL' - fi - - fi - - if [[ ${d} -ne 0 ]]; then - echo "....NOT IDENTICAL" >> "${RT_LOG}" - echo "....NOT IDENTICAL" - test_status='FAIL' - else - echo "....OK" >> "${RT_LOG}" - echo "....OK" - fi - - fi - - done - - else - # - # --- create baselines - # - echo;echo "Moving baseline ${TEST_ID} files ...." - echo;echo "Moving baseline ${TEST_ID} files ...." >> "${RT_LOG}" - - for i in ${LIST_FILES} ; do - printf %s " Moving ${i} ....." - printf %s " Moving ${i} ....." 
>> "${RT_LOG}" - if [[ -f ${RUNDIR}/${i} ]] ; then - mkdir -p "${NEW_BASELINE}/${CNTL_DIR}_${RT_COMPILER}/$(dirname "${i}")" - cp "${RUNDIR}/${i}" "${NEW_BASELINE}/${CNTL_DIR}_${RT_COMPILER}/${i}" - echo "....OK" >> "${RT_LOG}" - echo "....OK" - else - echo "....NOT OK. Missing ${RUNDIR}/${i}" >> "${RT_LOG}" - echo "....NOT OK. Missing ${RUNDIR}/${i}" - test_status='FAIL' - fi - done - - fi - - { - echo - grep "The total amount of wall time" "${RUNDIR}/out" - grep "The maximum resident set size" "${RUNDIR}/out" - echo - } >> "${RT_LOG}" - - TRIES='' - if [[ ${ECFLOW} == true ]]; then - if [[ ${ECF_TRYNO} -gt 1 ]]; then - TRIES=" Tries: ${ECF_TRYNO}" - fi - fi - echo "Test ${TEST_ID} ${test_status}${TRIES}" >> "${RT_LOG}" - echo >> "${RT_LOG}" - echo "Test ${TEST_ID} ${test_status}${TRIES}" - echo - - if [[ ${test_status} = 'FAIL' ]]; then - echo "${TEST_ID} failed in check_result" >> "${PATHRT}/fail_test_${TEST_ID}" - return 1 - else - return 0 - fi -} - - kill_job() { echo "rt_utils.sh: Killing job: ${jobid} on ${SCHEDULER}..." 
[[ -z $1 ]] && exit 1 diff --git a/tests/run_test.sh b/tests/run_test.sh index ace4fd0cf1..cfce002bdc 100755 --- a/tests/run_test.sh +++ b/tests/run_test.sh @@ -17,12 +17,16 @@ cleanup() { write_fail_test() { echo "${TEST_ID} failed in run_test" >> "${PATHRT}/fail_test_${TEST_ID}" - exit 1 -} - -remove_fail_test() { - echo "Removing test failure flag file for ${TEST_ID}" - rm -f "${PATHRT}/fail_test_${TEST_ID}" + if [[ ${ROCOTO:-false} == true ]] || [[ ${ECFLOW:-false} == true ]]; then + # if this script has been submitted by a workflow return non-zero exit status + # so that workflow can resubmit it + exit 1 + else + # if this script has been executed interactively, return zero exit status + # so that rt.sh can continue running, and hope that rt.sh's generate_log + # will catch failed tests + exit 0 + fi } if [[ $# != 5 ]]; then @@ -53,7 +57,7 @@ source default_vars.sh [[ -e ${RUNDIR_ROOT}/run_test_${TEST_ID}.env ]] && source "${RUNDIR_ROOT}/run_test_${TEST_ID}.env" source "tests/${TEST_NAME}" -remove_fail_test +rm -f "${PATHRT}/fail_test_${TEST_ID}" # Save original CNTL_DIR name as INPUT_DIR for regression # tests that try to copy input data from CNTL_DIR @@ -396,11 +400,129 @@ else fi skip_check_results=${skip_check_results:-false} -results_okay=YES -if [[ ${skip_check_results} = false ]]; then - if ( ! check_results ) ; then - results_okay=NO +if [[ ${skip_check_results} == false ]]; then + + test_status='PASS' + + { + echo + echo "baseline dir = ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}" + echo "working dir = ${RUNDIR}" + echo "Checking test ${TEST_ID} results ...." + } > "${RT_LOG}" + echo + echo "baseline dir = ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}" + echo "working dir = ${RUNDIR}" + echo "Checking test ${TEST_ID} results ...." + + if [[ ${CREATE_BASELINE} = false ]]; then + # + # --- regression test comparison + # + for i in ${LIST_FILES} ; do + printf %s " Comparing ${i} ....." >> "${RT_LOG}" + printf %s " Comparing ${i} ....." + + if [[ ! 
-f ${RUNDIR}/${i} ]] ; then + + echo ".......MISSING file" >> "${RT_LOG}" + echo ".......MISSING file" + test_status='FAIL' + + elif [[ ! -f ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i} ]] ; then + + echo ".......MISSING baseline" >> "${RT_LOG}" + echo ".......MISSING baseline" + test_status='FAIL' + + else + if [[ ${i##*.} == nc* ]] ; then + if [[ " orion hercules hera wcoss2 acorn derecho gaea jet s4 noaacloud " =~ ${MACHINE_ID} ]]; then + printf "USING NCCMP.." >> "${RT_LOG}" + printf "USING NCCMP.." + if [[ ${CMP_DATAONLY} == false ]]; then + nccmp -d -S -q -f -g -B --Attribute=checksum --warn=format "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" > "${i}_nccmp.log" 2>&1 && d=$? || d=$? + else + nccmp -d -S -q -f -B --Attribute=checksum --warn=format "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" > "${i}_nccmp.log" 2>&1 && d=$? || d=$? + fi + if [[ ${d} -ne 0 && ${d} -ne 1 ]]; then + printf "....ERROR" >> "${RT_LOG}" + printf "....ERROR" + test_status='FAIL' + fi + fi + else + printf "USING CMP.." >> "${RT_LOG}" + printf "USING CMP.." + cmp "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" >/dev/null 2>&1 && d=$? || d=$? + if [[ ${d} -eq 2 ]]; then + printf "....ERROR" >> "${RT_LOG}" + printf "....ERROR" + test_status='FAIL' + fi + + fi + + if [[ ${d} -ne 0 ]]; then + echo "....NOT IDENTICAL" >> "${RT_LOG}" + echo "....NOT IDENTICAL" + test_status='FAIL' + else + echo "....OK" >> "${RT_LOG}" + echo "....OK" + fi + + fi + + done + + else + # + # --- create baselines + # + echo;echo "Moving baseline ${TEST_ID} files ...." + echo;echo "Moving baseline ${TEST_ID} files ...." >> "${RT_LOG}" + + for i in ${LIST_FILES} ; do + printf %s " Moving ${i} ....." + printf %s " Moving ${i} ....." 
>> "${RT_LOG}" + if [[ -f ${RUNDIR}/${i} ]] ; then + mkdir -p "${NEW_BASELINE}/${CNTL_DIR}_${RT_COMPILER}/$(dirname "${i}")" + cp "${RUNDIR}/${i}" "${NEW_BASELINE}/${CNTL_DIR}_${RT_COMPILER}/${i}" + # cp "${RUNDIR}/${i}" "${NEW_BASELINE}/${CNTL_DIR}_${RT_COMPILER}_doesntexist/${i}" + echo "....OK" >> "${RT_LOG}" + echo "....OK" + else + echo "....NOT OK. Missing ${RUNDIR}/${i}" >> "${RT_LOG}" + echo "....NOT OK. Missing ${RUNDIR}/${i}" + test_status='FAIL' + fi + done + + fi + + { + echo + grep "The total amount of wall time" "${RUNDIR}/out" + grep "The maximum resident set size" "${RUNDIR}/out" + echo + } >> "${RT_LOG}" + + TRIES='' + if [[ ${ECFLOW} == true ]]; then + if [[ ${ECF_TRYNO} -gt 1 ]]; then + TRIES=" Tries: ${ECF_TRYNO}" + fi + fi + echo "Test ${TEST_ID} ${test_status}${TRIES}" >> "${RT_LOG}" + echo >> "${RT_LOG}" + echo "Test ${TEST_ID} ${test_status}${TRIES}" + echo + + if [[ ${test_status} = 'FAIL' ]]; then + echo "${TEST_ID} failed in check_result" >> "${PATHRT}/fail_test_${TEST_ID}" fi + else { echo @@ -408,7 +530,7 @@ else grep "The maximum resident set size" "${RUNDIR}/out" echo echo "Test ${TEST_ID} RUN_SUCCESS" - echo;echo;echo + echo;echo;echo } >> "${RT_LOG}" fi @@ -416,10 +538,6 @@ if [[ ${SCHEDULER} != 'none' ]]; then cat "${RUNDIR}/job_timestamp.txt" >> "${LOG_DIR}/${JBNME}_timestamp.txt" fi -if [[ ${results_okay} == YES ]]; then - remove_fail_test -fi - ################################################################################ # End test ################################################################################ From 107001dd0d10beba2d7646d7180f280d0675be7f Mon Sep 17 00:00:00 2001 From: Dusan Jovic Date: Fri, 2 Aug 2024 19:42:17 +0000 Subject: [PATCH 09/15] Fix shellcheck warning In /github/workspace/tests/rt_utils.sh line 130: local test_status='PASS' ^---------^ SC2034 (warning): test_status appears unused. Verify use (or export if used externally). 
--- tests/rt_utils.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/rt_utils.sh b/tests/rt_utils.sh index b749c26bc1..5290cfb416 100755 --- a/tests/rt_utils.sh +++ b/tests/rt_utils.sh @@ -127,7 +127,6 @@ submit_and_wait() { ROCOTO=${ROCOTO:-false} ECFLOW=${ECFLOW:-false} - local test_status='PASS' case ${SCHEDULER} in pbs) qsubout=$( qsub "${job_card}" ) From 72a38d3fcf59256eda7f6144b0b97f8596978192 Mon Sep 17 00:00:00 2001 From: Dusan Jovic Date: Sat, 3 Aug 2024 00:26:53 +0000 Subject: [PATCH 10/15] Add timeout test --- tests/error-test.conf | 3 +++ tests/rt.sh | 1 + tests/run_compile.sh | 11 ++++++++++- tests/tests/control_c48.v2.sfc_timeout | 4 ++++ 4 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 tests/tests/control_c48.v2.sfc_timeout diff --git a/tests/error-test.conf b/tests/error-test.conf index 2382c59a9e..3e931e67dc 100644 --- a/tests/error-test.conf +++ b/tests/error-test.conf @@ -11,6 +11,9 @@ COMPILE | atm_dyn32 | intel | -DAPP=ATM -DCCPP_SUITES=FV3_GFS_v16,FV3_GFS_v16_fl # This should succeed RUN | control_c48.v2.sfc | | baseline | +# This should fail due to wall clock timeout +RUN | control_c48.v2.sfc_timeout | | baseline | + # These tests should always fail, and prevent the workflow from completing. 
RUN | fail_to_copy | | baseline | RUN | fail_to_run | | baseline | diff --git a/tests/rt.sh b/tests/rt.sh index 8d2ed7a467..b79d2a6dff 100755 --- a/tests/rt.sh +++ b/tests/rt.sh @@ -1041,6 +1041,7 @@ if [[ ${skip_check_results} == true ]]; then else REGRESSIONTEST_LOG=${PATHRT}/logs/RegressionTests_${MACHINE_ID}.log fi +rm -f ${REGRESSIONTEST_LOG} TEST_START_TIME="$(date '+%Y%m%d %T')" export TEST_START_TIME diff --git a/tests/run_compile.sh b/tests/run_compile.sh index 1685f89653..6eeb72b13e 100755 --- a/tests/run_compile.sh +++ b/tests/run_compile.sh @@ -17,7 +17,16 @@ cleanup() { write_fail_test() { echo "${JBNME} failed in run_compile" >> "${PATHRT}/fail_${JBNME}" - exit 1 + if [[ ${ROCOTO:-false} == true ]] || [[ ${ECFLOW:-false} == true ]]; then + # if this script has been submitted by a workflow return non-zero exit status + # so that workflow can resubmit it + exit 1 + else + # if this script has been executed interactively, return zero exit status + # so that rt.sh can continue running, and hope that rt.sh's generate_log + # will catch failed tests + exit 0 + fi } remove_fail_test() { diff --git a/tests/tests/control_c48.v2.sfc_timeout b/tests/tests/control_c48.v2.sfc_timeout new file mode 100644 index 0000000000..ec78b47892 --- /dev/null +++ b/tests/tests/control_c48.v2.sfc_timeout @@ -0,0 +1,4 @@ +source tests/control_c48.v2.sfc + +# Intentionally make this test fail due to wall clock timeout. 
Used by error-test.conf +export WLCLK=2 From 5a0de70e7a45e02ef149ab3b7c63a7379cf8a6f4 Mon Sep 17 00:00:00 2001 From: Dusan Jovic Date: Mon, 5 Aug 2024 13:41:22 +0000 Subject: [PATCH 11/15] Fix shellcheck warning --- tests/rt.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/rt.sh b/tests/rt.sh index b79d2a6dff..ac19077a6c 100755 --- a/tests/rt.sh +++ b/tests/rt.sh @@ -1041,7 +1041,7 @@ if [[ ${skip_check_results} == true ]]; then else REGRESSIONTEST_LOG=${PATHRT}/logs/RegressionTests_${MACHINE_ID}.log fi -rm -f ${REGRESSIONTEST_LOG} +rm -f "${REGRESSIONTEST_LOG}" TEST_START_TIME="$(date '+%Y%m%d %T')" export TEST_START_TIME From 3ff2c92ef43e0924108a9ea9db593f1ffef14884 Mon Sep 17 00:00:00 2001 From: Dusan Jovic Date: Tue, 6 Aug 2024 17:36:04 +0000 Subject: [PATCH 12/15] Remove setting ecflow labels from submit_and_wait. Let's not complicate things --- tests/rt_utils.sh | 14 -------------- tests/run_test.sh | 12 +++--------- 2 files changed, 3 insertions(+), 23 deletions(-) diff --git a/tests/rt_utils.sh b/tests/rt_utils.sh index 5290cfb416..6a5e094158 100755 --- a/tests/rt_utils.sh +++ b/tests/rt_utils.sh @@ -124,9 +124,6 @@ submit_and_wait() { local -r job_card=$1 - ROCOTO=${ROCOTO:-false} - ECFLOW=${ECFLOW:-false} - case ${SCHEDULER} in pbs) qsubout=$( qsub "${job_card}" ) @@ -150,7 +147,6 @@ submit_and_wait() { local count=0 local job_running='' echo "rt_utils.sh: Job is waiting to enter the queue..." - [[ ${ECFLOW:-false} == true ]] && ecflow_client --label=job_status "Waiting to enter the queue" until [[ ${job_running} == 'true' ]] do case ${SCHEDULER} in @@ -177,10 +173,6 @@ submit_and_wait() { if [[ ${count} -eq 13 ]]; then echo "No job in queue after one minute, exiting..."; exit 2; fi done echo "rt_utils.sh Job (${jobid}) is now in the queue." 
- if [[ ${ECFLOW:-false} == true ]]; then - ecflow_client --label=job_id "${jobid}" - ecflow_client --label=job_status "Submitted" - fi # wait for the job to finish and compare results local n=1 @@ -247,7 +239,6 @@ submit_and_wait() { F|TO|CA|FAILED|TIMEOUT|CANCELLED) echo "rt_utils.sh: !!!!!!!!!!JOB TERMINATED!!!!!!!!!! status=${status}" job_running=false #Trip the loop to end with these status flags - [[ ${ECFLOW:-false} == true ]] && ecflow_client --label=job_status "Failed" interrupt_job exit 1 ;; @@ -265,7 +256,6 @@ submit_and_wait() { esac echo "${n} min. ${SCHEDULER^} Job ${jobid} Status: ${status_label} (${status})" - [[ ${ECFLOW:-false} == true ]] && ecflow_client --label=job_status "${status_label}" (( n=n+1 )) sleep 60 & wait $! @@ -475,8 +465,6 @@ EOF { echo " task compile_${COMPILE_ID}" echo " label build_options '${MAKE_OPT}'" - echo " label job_id ''" - echo " label job_status ''" echo " inlimit max_builds" } >> "${ECFLOW_RUN}/${ECFLOW_SUITE}.def" } @@ -494,8 +482,6 @@ ${PATHRT}/run_test.sh "${PATHRT}" "${RUNDIR_ROOT}" "${TEST_NAME}" "${TEST_ID}" " EOF { echo " task ${TEST_ID}${RT_SUFFIX}" - echo " label job_id ''" - echo " label job_status ''" echo " inlimit max_jobs" } >> "${ECFLOW_RUN}/${ECFLOW_SUITE}.def" if [[ ${DEP_RUN} != '' ]]; then diff --git a/tests/run_test.sh b/tests/run_test.sh index cfce002bdc..553b0a3d37 100755 --- a/tests/run_test.sh +++ b/tests/run_test.sh @@ -508,15 +508,9 @@ if [[ ${skip_check_results} == false ]]; then echo } >> "${RT_LOG}" - TRIES='' - if [[ ${ECFLOW} == true ]]; then - if [[ ${ECF_TRYNO} -gt 1 ]]; then - TRIES=" Tries: ${ECF_TRYNO}" - fi - fi - echo "Test ${TEST_ID} ${test_status}${TRIES}" >> "${RT_LOG}" - echo >> "${RT_LOG}" - echo "Test ${TEST_ID} ${test_status}${TRIES}" + echo "Test ${TEST_ID} ${test_status}" >> "${RT_LOG}" + echo >> "${RT_LOG}" + echo "Test ${TEST_ID} ${test_status}" echo if [[ ${test_status} = 'FAIL' ]]; then From a0bd6a164f749337e83d7135d3c831b541812620 Mon Sep 17 00:00:00 2001 From: Dusan 
Jovic Date: Tue, 6 Aug 2024 14:23:37 -0500 Subject: [PATCH 13/15] Shellcheck warning fix --- tests/run_test.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/run_test.sh b/tests/run_test.sh index 553b0a3d37..3044bfecc6 100755 --- a/tests/run_test.sh +++ b/tests/run_test.sh @@ -489,7 +489,6 @@ if [[ ${skip_check_results} == false ]]; then if [[ -f ${RUNDIR}/${i} ]] ; then mkdir -p "${NEW_BASELINE}/${CNTL_DIR}_${RT_COMPILER}/$(dirname "${i}")" cp "${RUNDIR}/${i}" "${NEW_BASELINE}/${CNTL_DIR}_${RT_COMPILER}/${i}" - # cp "${RUNDIR}/${i}" "${NEW_BASELINE}/${CNTL_DIR}_${RT_COMPILER}_doesntexist/${i}" echo "....OK" >> "${RT_LOG}" echo "....OK" else @@ -506,10 +505,10 @@ if [[ ${skip_check_results} == false ]]; then grep "The total amount of wall time" "${RUNDIR}/out" grep "The maximum resident set size" "${RUNDIR}/out" echo + echo "Test ${TEST_ID} ${test_status}" + echo } >> "${RT_LOG}" - echo "Test ${TEST_ID} ${test_status}" >> "${RT_LOG}" - echo >> "${RT_LOG}" echo "Test ${TEST_ID} ${test_status}" echo From a0ca81928766d106ef3ca9d99756228cfe82cbfc Mon Sep 17 00:00:00 2001 From: Dusan Jovic Date: Tue, 13 Aug 2024 15:37:51 +0000 Subject: [PATCH 14/15] Use realpath instead of readlink in build.sh and tests/compile.sh --- build.sh | 17 +++++------------ tests/compile.sh | 10 ++-------- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/build.sh b/build.sh index c1cfe43d8d..3f82a882da 100755 --- a/build.sh +++ b/build.sh @@ -1,17 +1,10 @@ #!/bin/bash set -eu -uname_s=$(uname -s) -if [[ ${uname_s} == Darwin ]]; then - UFS_MODEL_DIR=$(greadlink -f -n "${BASH_SOURCE[0]}") - UFS_MODEL_DIR=$(dirname "${UFS_MODEL_DIR}") - UFS_MODEL_DIR=$(cd "${UFS_MODEL_DIR}" && pwd -P) -else - UFS_MODEL_DIR=$(readlink -f -n "${BASH_SOURCE[0]}") - UFS_MODEL_DIR=$(dirname "${UFS_MODEL_DIR}") - UFS_MODEL_DIR=$(cd "${UFS_MODEL_DIR}" && pwd -P) -fi -echo "UFS MODEL DIR: ${UFS_MODEL_DIR}" + +SCRIPT_REALPATH=$(realpath "${BASH_SOURCE[0]}") 
+UFS_MODEL_DIR=$(dirname "${SCRIPT_REALPATH}") readonly UFS_MODEL_DIR +echo "UFS MODEL DIR: ${UFS_MODEL_DIR}" export CC=${CC:-mpicc} export CXX=${CXX:-mpicxx} @@ -26,4 +19,4 @@ for i in ${CMAKE_FLAGS}; do ARR_CMAKE_FLAGS+=("${i}") ; done cmake "${UFS_MODEL_DIR}" "${ARR_CMAKE_FLAGS[@]}" # Turn off OpenMP threading for parallel builds # to avoid exhausting the number of user processes -OMP_NUM_THREADS=1 make -j "${BUILD_JOBS:-4}" "VERBOSE=${BUILD_VERBOSE:-}" \ No newline at end of file +OMP_NUM_THREADS=1 make -j "${BUILD_JOBS:-4}" "VERBOSE=${BUILD_VERBOSE:-}" diff --git a/tests/compile.sh b/tests/compile.sh index 458d985a88..8ab0f60b82 100755 --- a/tests/compile.sh +++ b/tests/compile.sh @@ -12,14 +12,8 @@ function trim { SECONDS=0 -uname_s=$(uname -s) -if [[ ${uname_s} == Darwin ]]; then - greadlnk=$(greadlink -f -n "${BASH_SOURCE[0]}" ) - MYDIR=$(cd "$(dirname "${greadlnk}" )" && pwd -P) -else - readlnk=$(readlink -f -n "${BASH_SOURCE[0]}" ) - MYDIR=$(cd "$(dirname "${readlnk}" )" && pwd -P) -fi +SCRIPT_REALPATH=$(realpath "${BASH_SOURCE[0]}") +MYDIR=$(dirname "${SCRIPT_REALPATH}") readonly MYDIR # ---------------------------------------------------------------------- From 84ab2996664d1368358605c5aa6152a51faab5de Mon Sep 17 00:00:00 2001 From: Dusan Jovic Date: Mon, 19 Aug 2024 09:45:27 -0500 Subject: [PATCH 15/15] Update run_test.sh. Call write_fail_test if test fails in check_results --- tests/run_test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/run_test.sh b/tests/run_test.sh index 3044bfecc6..88e10210d6 100755 --- a/tests/run_test.sh +++ b/tests/run_test.sh @@ -514,6 +514,7 @@ if [[ ${skip_check_results} == false ]]; then if [[ ${test_status} = 'FAIL' ]]; then echo "${TEST_ID} failed in check_result" >> "${PATHRT}/fail_test_${TEST_ID}" + write_fail_test fi else