From a97da52df492d3db97346744fc457b1398aa8035 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 17 Apr 2017 13:13:56 -0600 Subject: [PATCH 1/3] Fix NODEFAIL test on cheyenne. When the switch is made from running a real exe to a script, an env var must be set in order for MPI to be able to run the script on cheyenne. --- scripts/lib/CIME/SystemTests/nodefail.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/lib/CIME/SystemTests/nodefail.py b/scripts/lib/CIME/SystemTests/nodefail.py index 2f0436a870e..9101a557997 100644 --- a/scripts/lib/CIME/SystemTests/nodefail.py +++ b/scripts/lib/CIME/SystemTests/nodefail.py @@ -56,8 +56,16 @@ def _restart_fake_phase(self): env_mach_specific.set_value("run_exe", fake_exe_file) self._case.flush(flushall=True) + # This flag is needed by mpt to run a script under mpiexec + mpilib = self._case.get_value("MPILIB") + if mpilib == "mpt": + os.environ["MPI_SHEPHERD"] = "true" + self.run_indv(suffix=None) + if mpilib == "mpt": + del os.environ["MPI_SHEPHERD"] + env_mach_specific = self._case.get_env("mach_specific") env_mach_specific.set_value("run_exe", prev_run_exe) self._case.flush(flushall=True) From f4a2804c79231bf66d07584fb4887c4f45710689 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Tue, 18 Apr 2017 11:17:54 -0600 Subject: [PATCH 2/3] fixes for test to work on cheyenne --- config/cesm/machines/config_machines.xml | 4 +++- config/config_tests.xml | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/config/cesm/machines/config_machines.xml b/config/cesm/machines/config_machines.xml index f07a9ee7942..d0b61ed4bcc 100644 --- a/config/cesm/machines/config_machines.xml +++ b/config/cesm/machines/config_machines.xml @@ -219,8 +219,10 @@ mpiexec_mpt + -np $TOTALPES -p "%g:" - omplace + + omplace diff --git a/config/config_tests.xml b/config/config_tests.xml index 7532b250903..ab0dadea2ce 100644 --- a/config/config_tests.xml +++ b/config/config_tests.xml @@ -366,7 +366,8 @@ LII CLM initial 
condition interpolation test For testing infra only. Tests restart upon detected node failure 1 - ndays + nsteps + $ATM_NCPL 11 $STOP_N / 2 + 1 $STOP_OPTION From cedf8f050ea0b109ff068b52e1c0fb5c9015db5e Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 18 Apr 2017 11:36:26 -0600 Subject: [PATCH 3/3] Add better test documentation --- config/config_tests.xml | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/config/config_tests.xml b/config/config_tests.xml index ab0dadea2ce..5a3c8d1133d 100644 --- a/config/config_tests.xml +++ b/config/config_tests.xml @@ -166,6 +166,45 @@ PRE pause-resume test: by default a BFB test of pause-resume cycling LII CLM initial condition interpolation test +====================================================================== + Infrastructural tests for CIME. These are used by scripts_regression_tests. + Users won't generally run these. +====================================================================== + + +TESTBUILDFAIL Insta-fail build step. Used to confirm that failed + builds are caught and reported correctly. + +TESTBUILDFAILEXC Insta-fail build step by failing to init. Used to test + correct behavior when exceptions are generated. + +TESTRUNFAIL Insta-fail run step. Used to confirm that model run + failures are caught and reported correctly. + +TESTRUNFAILEXC Insta-fail run step via exception. Used to test correct + behavior when exceptions are generated. + +TESTRUNPASS Insta-pass run step. Used to test that runs that work + are reported correctly. + +TESTMEMLEAKFAIL Insta-fail memleak step. Used to test that memleaks are + detected and reported correctly. + +TESTMEMLEAKPASS Insta-pass memleak step. Used to test that non-memleaks are + reported correctly. + +TESTRUNDIFF Produces a canned hist file. Env var TESTRUNDIFF_ALTERNATE can + be used to cause a DIFF. Used to check that baseline diffs are + detected and reported correctly. 
+ +TESTTESTDIFF Simulates internal test diff (non baseline). Used to check that + internal comparison failures are detected and reported correctly. + +TESTRUNSLOWPASS After 5 minutes of sleep, pass run step. Used to test timeouts + and kills. + +NODEFAIL Tests restart upon detected node failure. Generates fake failures, + the number of which is controlled by NODEFAIL_NUM_FAILS. -->