Skip to content

Commit

Permalink
.testing: Add WORK_SPACE cpp macro to control scratch space
Browse files Browse the repository at this point in the history
- Added make variable "WORK_SPACE" (called a CPP macro in the commit title),
  which defaults to ".". This controls where the work/ and results/
  directories used by the target "test" in .testing/Makefile are located.
- Use WORK_SPACE=/lustre/f2/scratch/$USER/runner/$CI_RUNNER_ID in the job
  script so that if the runner is later moved to a read-only-from-compute
  disk the pipeline still works.
  • Loading branch information
adcroft committed Jan 13, 2023
1 parent 0f5e6ca commit 91f3288
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 63 deletions.
8 changes: 4 additions & 4 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,9 @@ actions:gnu:
- cd .testing
- module unload PrgEnv-pgi PrgEnv-intel PrgEnv-gnu darshan ; module load PrgEnv-gnu ; module unload netcdf gcc ; module load gcc/7.3.0 cray-hdf5 cray-netcdf
- make -s -j
- make preproc-compile -s -j
- MPIRUN= make preproc -s -j
- echo -e "\e[0Ksection_end:`date +%s`:compile\r\e[0K"
- (echo '#!/bin/bash';echo 'make MPIRUN="srun -mblock --exclusive" test -s -j') > job.sh
- (echo '#!/bin/bash';echo 'make MPIRUN="srun -mblock --exclusive" WORK_SPACE=/lustre/f2/scratch/$USER/runner/$CI_RUNNER_ID test -s -j') > job.sh
- sbatch --clusters=c3,c4 --nodes=5 --time=0:05:00 --account=gfdl_o --qos=debug --job-name=MOM6.gnu.testing --output=log.$CI_JOB_ID --wait job.sh || ( cat log.$CI_JOB_ID ; exit 911 )
# The batch job writes results/ under WORK_SPACE, so the summary must be
# pointed at the same location; otherwise it inspects ./results instead.
- make WORK_SPACE=/lustre/f2/scratch/$USER/runner/$CI_RUNNER_ID test.summary

Expand All @@ -202,9 +202,9 @@ actions:intel:
- cd .testing
- module unload PrgEnv-pgi PrgEnv-intel PrgEnv-gnu darshan; module load PrgEnv-intel; module unload netcdf intel; module load intel/18.0.6.288 cray-hdf5 cray-netcdf
- make -s -j
- make preproc-compile -s -j
- MPIRUN= make preproc -s -j
- echo -e "\e[0Ksection_end:`date +%s`:compile\r\e[0K"
- (echo '#!/bin/bash';echo 'make MPIRUN="srun -mblock --exclusive" test -s -j') > job.sh
- (echo '#!/bin/bash';echo 'make MPIRUN="srun -mblock --exclusive" WORK_SPACE=/lustre/f2/scratch/$USER/runner/$CI_RUNNER_ID test -s -j') > job.sh
- sbatch --clusters=c3,c4 --nodes=5 --time=0:05:00 --account=gfdl_o --qos=debug --job-name=MOM6.intel.testing --output=log.$CI_JOB_ID --wait job.sh || ( cat log.$CI_JOB_ID ; exit 911 )
# The batch job writes results/ under WORK_SPACE, so the summary must be
# pointed at the same location; otherwise it inspects ./results instead.
- make WORK_SPACE=/lustre/f2/scratch/$USER/runner/$CI_RUNNER_ID test.summary

Expand Down
124 changes: 65 additions & 59 deletions .testing/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@
# MOM_TARGET_LOCAL_BRANCH Target branch name
# (NOTE: These would typically be configured by a CI.)
#
# Paths for stages:
# WORK_SPACE Location to place work/ and results/ directories (i.e. where to run the model)
#
#----

# TODO: POSIX shell compatibility
Expand Down Expand Up @@ -129,6 +132,8 @@ CONFIGS ?= $(wildcard tc*)
TESTS ?= grid layout rotate restart openmp nan $(foreach d,$(DIMS),dim.$(d))
DIMS ?= t l h z q r

# Default is to place work/ and results/ in current directory.
# `?=` assigns only when WORK_SPACE is not already set, so a value supplied on
# the make command line (e.g. `make WORK_SPACE=/some/scratch test`) or through
# the environment takes precedence over this default.
WORK_SPACE ?= .

#---
# Test configuration
Expand Down Expand Up @@ -408,11 +413,11 @@ endef
$(foreach d,$(DIMS),$(eval $(call TEST_DIM_RULE,$(d))))

# Convenience targets: each one depends on the ocean.stats file of every
# configuration in CONFIGS for a single run type.  run.asymmetric leaves out
# tc3 via filter-out.
# Fixes: the phony list now names run.nan (it listed the non-existent
# run.nans), and the stray "$(CONFIGS)," inside the run.asymmetric foreach
# text is removed; it expanded to bogus prerequisites on every iteration.
.PHONY: run.symmetric run.asymmetric run.nan run.openmp run.cov
run.symmetric: $(foreach c,$(CONFIGS),work/$(c)/symmetric/ocean.stats)
run.asymmetric: $(foreach c,$(filter-out tc3,$(CONFIGS)),work/$(c)/asymmetric/ocean.stats)
run.nan: $(foreach c,$(CONFIGS),work/$(c)/nan/ocean.stats)
run.openmp: $(foreach c,$(CONFIGS),work/$(c)/openmp/ocean.stats)
run.cov: $(foreach c,$(CONFIGS),work/$(c)/cov/ocean.stats)
run.symmetric: $(foreach c,$(CONFIGS),$(WORK_SPACE)/work/$(c)/symmetric/ocean.stats)
run.asymmetric: $(foreach c,$(filter-out tc3,$(CONFIGS)),$(WORK_SPACE)/work/$(c)/asymmetric/ocean.stats)
run.nan: $(foreach c,$(CONFIGS),$(WORK_SPACE)/work/$(c)/nan/ocean.stats)
run.openmp: $(foreach c,$(CONFIGS),$(WORK_SPACE)/work/$(c)/openmp/ocean.stats)
run.cov: $(foreach c,$(CONFIGS),$(WORK_SPACE)/work/$(c)/cov/ocean.stats)

# Configuration test rules
# $(1): Configuration name (tc1, tc2, &c.)
Expand Down Expand Up @@ -444,21 +449,21 @@ FAIL = ${RED}FAIL${RESET}
# $(2): Test type (grid, layout, &c.)
# $(3): Comparison targets (symmetric asymmetric, symmetric layout, &c.)
define CMP_RULE
.PRECIOUS: $(foreach b,$(3),work/$(1)/$(b)/ocean.stats)
$(1).$(2): $(foreach b,$(3),work/$(1)/$(b)/ocean.stats)
@test "$$(shell ls -A results/$(1) 2>/dev/null)" || rm -rf results/$(1)
.PRECIOUS: $(foreach b,$(3),$(WORK_SPACE)/work/$(1)/$(b)/ocean.stats)
$(1).$(2): $(foreach b,$(3),$(WORK_SPACE)/work/$(1)/$(b)/ocean.stats)
@test "$$(shell ls -A $(WORK_SPACE)/results/$(1) 2>/dev/null)" || rm -rf $(WORK_SPACE)/results/$(1)
@cmp $$^ || !( \
mkdir -p results/$(1); \
(diff $$^ | tee results/$(1)/ocean.stats.$(2).diff | head -n 20) ; \
mkdir -p $(WORK_SPACE)/results/$(1); \
(diff $$^ | tee $(WORK_SPACE)/results/$(1)/ocean.stats.$(2).diff | head -n 20) ; \
echo -e "$(FAIL): Solutions $(1).$(2) have changed." \
)
@echo -e "$(PASS): Solutions $(1).$(2) agree."

.PRECIOUS: $(foreach b,$(3),work/$(1)/$(b)/chksum_diag)
$(1).$(2).diag: $(foreach b,$(3),work/$(1)/$(b)/chksum_diag)
.PRECIOUS: $(foreach b,$(3),$(WORK_SPACE)/work/$(1)/$(b)/chksum_diag)
$(1).$(2).diag: $(foreach b,$(3),$(WORK_SPACE)/work/$(1)/$(b)/chksum_diag)
@cmp $$^ || !( \
mkdir -p results/$(1); \
(diff $$^ | tee results/$(1)/chksum_diag.$(2).diff | head -n 20) ; \
mkdir -p $(WORK_SPACE)/results/$(1); \
(diff $$^ | tee $(WORK_SPACE)/results/$(1)/chksum_diag.$(2).diff | head -n 20) ; \
echo -e "$(FAIL): Diagnostics $(1).$(2).diag have changed." \
)
@echo -e "$(PASS): Diagnostics $(1).$(2).diag agree."
Expand All @@ -478,36 +483,37 @@ $(foreach c,$(CONFIGS),$(eval $(call CONFIG_DIM_RULE,$(c))))

# Custom comparison rules


# Restart tests only compare the final stat record
#
# %.restart compares the symmetric and restart runs of configuration `%`:
#   1. If the results directory for the configuration lists as empty (or does
#      not exist), it is removed.
#   2. Each ocean.stats prerequisite is reduced to its last line, with runs of
#      spaces squeezed and the first two space-separated fields dropped; the
#      reduced records are handed to `cmp` through `<(...)` process
#      substitution.
#   3. On a mismatch the diff of the complete files is saved under the results
#      directory, its first 20 lines are printed, and a FAIL message is shown.
#      The `!` in front of the subshell inverts its status, so a successful
#      report makes the recipe line fail.
# NOTE(review): the saved diff is named chksum_diag.restart.diff although it
# is built from ocean.stats files -- confirm this name is intended.
.PRECIOUS: $(foreach b,symmetric restart target,work/%/$(b)/ocean.stats)
%.restart: $(foreach b,symmetric restart,work/%/$(b)/ocean.stats)
@test "$(shell ls -A results/$* 2>/dev/null)" || rm -rf results/$*
.PRECIOUS: $(foreach b,symmetric restart target,$(WORK_SPACE)/work/%/$(b)/ocean.stats)
%.restart: $(foreach b,symmetric restart,$(WORK_SPACE)/work/%/$(b)/ocean.stats)
@test "$(shell ls -A $(WORK_SPACE)/results/$* 2>/dev/null)" || rm -rf $(WORK_SPACE)/results/$*
@cmp $(foreach f,$^,<(tr -s ' ' < $(f) | cut -d ' ' -f3- | tail -n 1)) \
|| !( \
mkdir -p results/$*; \
(diff $^ | tee results/$*/chksum_diag.restart.diff | head -n 20) ; \
mkdir -p $(WORK_SPACE)/results/$*; \
(diff $^ | tee $(WORK_SPACE)/results/$*/chksum_diag.restart.diff | head -n 20) ; \
echo -e "$(FAIL): Solutions $*.restart have changed." \
)
@echo -e "$(PASS): Solutions $*.restart agree."

# TODO: chksum_diag parsing of restart files

# stats rule is unchanged, but we cannot use CMP_RULE to generate it.
#
# %.regression compares the ocean.stats files of the symmetric and target
# runs of configuration `%` byte-for-byte with `cmp`:
#   - the results directory for the configuration is removed first when it
#     lists as empty (or does not exist);
#   - on a mismatch the diff is saved as ocean.stats.regression.diff under the
#     results directory, its first 20 lines are printed, and a FAIL message is
#     shown; the `!` in front of the subshell inverts its status, so a
#     successful report makes the recipe line fail;
#   - otherwise a PASS message is printed.
%.regression: $(foreach b,symmetric target,work/%/$(b)/ocean.stats)
@test "$(shell ls -A results/$* 2>/dev/null)" || rm -rf results/$*
%.regression: $(foreach b,symmetric target,$(WORK_SPACE)/work/%/$(b)/ocean.stats)
@test "$(shell ls -A $(WORK_SPACE)/results/$* 2>/dev/null)" || rm -rf $(WORK_SPACE)/results/$*
@cmp $^ || !( \
mkdir -p results/$*; \
(diff $^ | tee results/$*/ocean.stats.regression.diff | head -n 20) ; \
mkdir -p $(WORK_SPACE)/results/$*; \
(diff $^ | tee $(WORK_SPACE)/results/$*/ocean.stats.regression.diff | head -n 20) ; \
echo -e "$(FAIL): Solutions $*.regression have changed." \
)
@echo -e "$(PASS): Solutions $*.regression agree."

# Regression testing only checks for changes in existing diagnostics
%.regression.diag: $(foreach b,symmetric target,work/%/$(b)/chksum_diag)
%.regression.diag: $(foreach b,symmetric target,$(WORK_SPACE)/work/%/$(b)/chksum_diag)
@! diff $^ | grep "^[<>]" | grep "^>" > /dev/null \
|| ! (\
mkdir -p results/$*; \
(diff $^ | tee results/$*/chksum_diag.regression.diff | head -n 20) ; \
mkdir -p $(WORK_SPACE)/results/$*; \
(diff $^ | tee $(WORK_SPACE)/results/$*/chksum_diag.regression.diff | head -n 20) ; \
echo -e "$(FAIL): Diagnostics $*.regression.diag have changed." \
)
@cmp $^ || ( \
Expand Down Expand Up @@ -536,7 +542,7 @@ tc4/configure: tc4/configure.ac
#---
# Test run output files

# Rule to build work/<tc>/{ocean.stats,chksum_diag}.<tag>
# Rule to build $(WORK_SPACE)/work/<tc>/{ocean.stats,chksum_diag}.<tag>
# $(1): Test configuration name <tag>
# $(2): Executable type
# $(3): Enable coverage flag
Expand All @@ -545,15 +551,15 @@ tc4/configure: tc4/configure.ac
# $(6): Number of MPI ranks

define STAT_RULE
work/%/$(1)/ocean.stats work/%/$(1)/chksum_diag: build/$(2)/MOM6 | preproc
$(WORK_SPACE)/work/%/$(1)/ocean.stats $(WORK_SPACE)/work/%/$(1)/chksum_diag: build/$(2)/MOM6 | preproc
@echo "Running test $$*.$(1)..."
mkdir -p $$(@D)
cp -RL $$*/* $$(@D)
mkdir -p $$(@D)/RESTART
echo -e "$(4)" > $$(@D)/MOM_override
rm -f results/$$*/std.$(1).{out,err}
rm -f $(WORK_SPACE)/results/$$*/std.$(1).{out,err}
cd $$(@D) \
&& $(TIME) $(5) $(MPIRUN) -n $(6) ../../../$$< 2> std.err > std.out \
&& $(TIME) $(5) $(MPIRUN) -n $(6) $(abspath $$<) 2> std.err > std.out \
|| !( \
mkdir -p ../../../results/$$*/ ; \
cat std.out | tee ../../../results/$$*/std.$(1).out | tail -n 20 ; \
Expand All @@ -563,7 +569,7 @@ work/%/$(1)/ocean.stats work/%/$(1)/chksum_diag: build/$(2)/MOM6 | preproc
)
@echo -e "$(DONE): $$*.$(1); no runtime errors."
if [ $(3) ]; then \
mkdir -p results/$$* ; \
mkdir -p $(WORK_SPACE)/results/$$* ; \
cd build/$(2) ; \
gcov -b *.gcda > gcov.$$*.$(1).out ; \
find -name "*.gcov" -exec sed -i -r 's/^( *[0-9]*)\*:/ \1:/g' {} \; ; \
Expand Down Expand Up @@ -614,7 +620,7 @@ $(eval $(call STAT_RULE,cov,cov,true,,,1))
# 2. Convert DAYMAX from TIMEUNIT to seconds
# 3. Apply seconds to `ocean_solo_nml` inside input.nml.
# NOTE: Assumes that runtime set by DAYMAX, will fail if set by input.nml
work/%/restart/ocean.stats: build/symmetric/MOM6 | preproc
$(WORK_SPACE)/work/%/restart/ocean.stats: build/symmetric/MOM6 | preproc
rm -rf $(@D)
mkdir -p $(@D)
cp -RL $*/* $(@D)
Expand All @@ -628,9 +634,9 @@ work/%/restart/ocean.stats: build/symmetric/MOM6 | preproc
&& halfperiod=$$(awk -v t=$${daymax} -v dt=$${timeunit} 'BEGIN {printf "%.f", 0.5*t*dt}') \
&& printf "\n&ocean_solo_nml\n seconds = $${halfperiod}\n/\n" >> input.nml
# Remove any previous archived output
rm -f results/$*/std.restart{1,2}.{out,err}
rm -f $(WORK_SPACE)/results/$*/std.restart{1,2}.{out,err}
# Run the first half-period
cd $(@D) && $(TIME) $(MPIRUN) -n 1 ../../../$< 2> std1.err > std1.out \
cd $(@D) && $(TIME) $(MPIRUN) -n 1 $(abspath $<) 2> std1.err > std1.out \
|| !( \
cat std1.out | tee ../../../results/$*/std.restart1.out | tail -n 20 ; \
cat std1.err | tee ../../../results/$*/std.restart1.err | tail -n 20 ; \
Expand All @@ -641,7 +647,7 @@ work/%/restart/ocean.stats: build/symmetric/MOM6 | preproc
mkdir $(@D)/RESTART
cd $(@D) && sed -i -e "s/input_filename *= *'n'/input_filename = 'r'/g" input.nml
# Run the second half-period
cd $(@D) && $(TIME) $(MPIRUN) -n 1 ../../../$< 2> std2.err > std2.out \
cd $(@D) && $(TIME) $(MPIRUN) -n 1 $(abspath $<) 2> std2.err > std2.out \
|| !( \
cat std2.out | tee ../../../results/$*/std.restart2.out | tail -n 20 ; \
cat std2.err | tee ../../../results/$*/std.restart2.err | tail -n 20 ; \
Expand All @@ -654,20 +660,20 @@ work/%/restart/ocean.stats: build/symmetric/MOM6 | preproc
# Not a true rule; only call this after `make test` to summarize test results.
#
# Lists, per category, the "<config> : <test>" pairs found under the results
# directory: runs that left std.*.err files, ocean.stats diffs, and
# chksum_diag diffs.  When anything is present the recipe ends in `false`.
#
# The awk scripts take the file name and the configuration directory from the
# END of each path (a[n] and a[n-1]) instead of fixed positions, so the
# report is correct whatever number of directory components the path has in
# front of results/ (".", an absolute scratch path, or none at all).
.PHONY: test.summary
test.summary:
@if ls results/*/* &> /dev/null; then \
if ls results/*/std.*.err &> /dev/null; then \
@if ls $(WORK_SPACE)/results/*/* &> /dev/null; then \
if ls $(WORK_SPACE)/results/*/std.*.err &> /dev/null; then \
echo "The following tests failed to complete:" ; \
ls results/*/std.*.out \
ls $(WORK_SPACE)/results/*/std.*.out \
| awk '{n=split($$0,a,"/"); split(a[n],t,"."); v=t[2]; if(length(t)>3) v=v"."t[3]; print a[n-1],":",v}'; \
fi; \
if ls results/*/ocean.stats.*.diff &> /dev/null; then \
if ls $(WORK_SPACE)/results/*/ocean.stats.*.diff &> /dev/null; then \
echo "The following tests report solution regressions:" ; \
ls results/*/ocean.stats.*.diff \
ls $(WORK_SPACE)/results/*/ocean.stats.*.diff \
| awk '{n=split($$0,a,"/"); split(a[n],t,"."); v=t[3]; if(length(t)>4) v=v"."t[4]; print a[n-1],":",v}'; \
fi; \
if ls results/*/chksum_diag.*.diff &> /dev/null; then \
if ls $(WORK_SPACE)/results/*/chksum_diag.*.diff &> /dev/null; then \
echo "The following tests report diagnostic regressions:" ; \
ls results/*/chksum_diag.*.diff \
ls $(WORK_SPACE)/results/*/chksum_diag.*.diff \
| awk '{n=split($$0,a,"/"); split(a[n],t,"."); v=t[2]; if(length(t)>3) v=v"."t[3]; print a[n-1],":",v}'; \
fi; \
false ; \
Expand All @@ -683,28 +689,28 @@ test.summary:
.PHONY: run.cov.unit
run.cov.unit: build/unit/MOM_file_parser_tests.F90.gcov

# Run the unit-test executable in a fresh work directory, first on one rank
# and then on two.
#   - When REPORT_COVERAGE is non-empty, existing *.gcda files under
#     build/unit are deleted beforehand.  The -name pattern is quoted so the
#     shell cannot expand it against files in the current directory before
#     find sees it.
#   - The work directory is removed and recreated for every run.
#   - If a run fails, the last 100 lines of its stdout and stderr are printed;
#     the `!` in front of the subshell inverts its status so the recipe line
#     still fails.
work/unit/std.out: build/unit/MOM_unit_tests
$(WORK_SPACE)/work/unit/std.out: build/unit/MOM_unit_tests
if [ $(REPORT_COVERAGE) ]; then \
find build/unit -name '*.gcda' -exec rm -f '{}' \; ; \
fi
rm -rf $(@D)
mkdir -p $(@D)
cd $(@D) \
&& $(TIME) $(MPIRUN) -n 1 ../../$< 2> std.err > std.out \
&& $(TIME) $(MPIRUN) -n 1 $(abspath $<) 2> std.err > std.out \
|| !( \
cat std.out | tail -n 100 ; \
cat std.err | tail -n 100 ; \
)
cd $(@D) \
&& $(TIME) $(MPIRUN) -n 2 ../../$< 2> p2.std.err > p2.std.out \
&& $(TIME) $(MPIRUN) -n 2 $(abspath $<) 2> p2.std.err > p2.std.out \
|| !( \
cat p2.std.out | tail -n 100 ; \
cat p2.std.err | tail -n 100 ; \
)

# NOTE: .gcov actually depends on .gcda, but .gcda is produced with std.out
# TODO: Replace work/unit/std.out with *.gcda?
build/unit/MOM_file_parser_tests.F90.gcov: work/unit/std.out
# TODO: Replace $(WORK_SPACE)/work/unit/std.out with *.gcda?
build/unit/MOM_file_parser_tests.F90.gcov: $(WORK_SPACE)/work/unit/std.out
cd $(@D) \
&& gcov -b *.gcda > gcov.unit.out
find $(@D) -name "*.gcov" -exec sed -i -r 's/^( *[0-9]*)\*:/ \1:/g' {} \;
Expand All @@ -731,22 +737,22 @@ PCONFIGS = p0
profile: $(foreach p,$(PCONFIGS), prof.$(p))

# Profiling rules for configuration p0.
#   prof.p0        passes the clocks.json files of the opt and opt_target runs
#                  to tools/compare_clocks.py.
#   clocks.json    is written from the stdout of tools/parse_fms_clocks.py,
#                  which is given the run directory (-d) and the run's std.out.
#   std.out        the two prerequisite-only rules attach the executable of
#                  each build; the pattern rule supplies the recipe: copy
#                  p0/* into the run directory (following symlinks), create
#                  RESTART/, write MOM_override from `echo -e ""`, then run
#                  the executable on one rank from inside the run directory.
# NOTE(review): the pattern rule lists no prerequisites of its own, so `$<`
# presumably resolves to the executable attached by the prerequisite-only
# rules -- confirm.
.PHONY: prof.p0
prof.p0: work/p0/opt/clocks.json work/p0/opt_target/clocks.json
prof.p0: $(WORK_SPACE)/work/p0/opt/clocks.json $(WORK_SPACE)/work/p0/opt_target/clocks.json
python tools/compare_clocks.py $^

work/p0/%/clocks.json: work/p0/%/std.out
$(WORK_SPACE)/work/p0/%/clocks.json: $(WORK_SPACE)/work/p0/%/std.out
python tools/parse_fms_clocks.py -d $(@D) $^ > $@

work/p0/opt/std.out: build/opt/MOM6
work/p0/opt_target/std.out: build/opt_target/MOM6
$(WORK_SPACE)/work/p0/opt/std.out: build/opt/MOM6
$(WORK_SPACE)/work/p0/opt_target/std.out: build/opt_target/MOM6

work/p0/%/std.out:
$(WORK_SPACE)/work/p0/%/std.out:
mkdir -p $(@D)
cp -RL p0/* $(@D)
mkdir -p $(@D)/RESTART
echo -e "" > $(@D)/MOM_override
cd $(@D) \
&& $(MPIRUN) -n 1 ../../../$< 2> std.err > std.out
&& $(MPIRUN) -n 1 $(abspath $<) 2> std.err > std.out


#---
Expand All @@ -759,16 +765,16 @@ PERF_EVENTS ?=
perf: $(foreach p,$(PCONFIGS), perf.$(p))

# perf.p0 passes the profile.json files of the opt and opt_target runs of
# configuration p0 to tools/compare_perf.py.
# The phony declaration now names perf.p0; it previously repeated prof.p0,
# which left this target undeclared.
.PHONY: perf.p0
perf.p0: work/p0/opt/profile.json work/p0/opt_target/profile.json
perf.p0: $(WORK_SPACE)/work/p0/opt/profile.json $(WORK_SPACE)/work/p0/opt_target/profile.json
python tools/compare_perf.py $^

work/p0/%/profile.json: work/p0/%/perf.data
$(WORK_SPACE)/work/p0/%/profile.json: $(WORK_SPACE)/work/p0/%/perf.data
python tools/parse_perf.py -f $< > $@

work/p0/opt/perf.data: build/opt/MOM6
work/p0/opt_target/perf.data: build/opt_target/MOM6
$(WORK_SPACE)/work/p0/opt/perf.data: build/opt/MOM6
$(WORK_SPACE)/work/p0/opt_target/perf.data: build/opt_target/MOM6

work/p0/%/perf.data:
$(WORK_SPACE)/work/p0/%/perf.data:
mkdir -p $(@D)
cp -RL p0/* $(@D)
mkdir -p $(@D)/RESTART
Expand Down Expand Up @@ -799,7 +805,7 @@ clean.build:
# Remove all test output: the work/ and results/ trees.
# Two guards run before anything is deleted:
#   - the recipe only proceeds from a directory named .testing;
#   - WORK_SPACE must be non-empty, since an empty value would turn the paths
#     below into /work and /results.
.PHONY: clean.stats
clean.stats:
@[ $$(basename $$(pwd)) = .testing ]
@[ -n "$(strip $(WORK_SPACE))" ]
rm -rf work results
rm -rf $(WORK_SPACE)/work $(WORK_SPACE)/results


.PHONY: clean.preproc
Expand Down

0 comments on commit 91f3288

Please sign in to comment.