From 0fcab574d287c35151f2f431a09f451c16b9d7ae Mon Sep 17 00:00:00 2001 From: Alistair Adcroft Date: Fri, 17 May 2024 15:21:14 -0400 Subject: [PATCH] Undoes workaround in tools/MRS for Gaea f2 issues - At one point in the life of Gaea, we had so many file system and slurm issues that we had to re-try everything a few times and then only on the last try, detect any failures. - Allowing multiple fails let a bug through with the nVidia executables where it would encounter a sig11 only after finishing writing the output. A second attempt would not rerun the model since it had written the files. - This commit undoes that workaround / loophole. --- tools/MRS/Makefile.restart | 6 ------ tools/MRS/Makefile.run | 11 +++-------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/tools/MRS/Makefile.restart b/tools/MRS/Makefile.restart index 7528d66531..6c08ab0d1c 100644 --- a/tools/MRS/Makefile.restart +++ b/tools/MRS/Makefile.restart @@ -94,17 +94,11 @@ $(foreach s,01 12 02,MOM6-examples/ocean_only/circle_obcs/$(s).ignore/ocean.stat # The above target has been temporarily split as a workaround to the breakage of srun deferred-behavior at ORNL $(1)_ocean_only_by_dep: $(call stats-files,ocean_only,$(1),$(RESTART_STAGE).ignore) $(1)_ocean_only: - -$$(MAKE) -f $(firstword $(MAKEFILE_LIST)) $(1)_ocean_only_by_dep - -$$(MAKE) -f $(firstword $(MAKEFILE_LIST)) $(1)_ocean_only_by_dep - -$$(MAKE) -f $(firstword $(MAKEFILE_LIST)) $(1)_ocean_only_by_dep $$(MAKE) -f $(firstword $(MAKEFILE_LIST)) $(1)_ocean_only_by_dep #$(1)_ice_ocean_SIS2: $(call stats-files,ice_ocean_SIS2,$(1),$(RESTART_STAGE).ignore) # The above target has been temporarily split as a workaround to the breakage of srun deferred-behavior at ORNL $(1)_ice_ocean_SIS2_by_dep: $(call stats-files,ice_ocean_SIS2,$(1),$(RESTART_STAGE).ignore) $(1)_ice_ocean_SIS2: - -$$(MAKE) -f $(firstword $(MAKEFILE_LIST)) $(1)_ice_ocean_SIS2_by_dep - -$$(MAKE) -f $(firstword $(MAKEFILE_LIST)) $(1)_ice_ocean_SIS2_by_dep - -$$(MAKE) 
-f $(firstword $(MAKEFILE_LIST)) $(1)_ice_ocean_SIS2_by_dep $$(MAKE) -f $(firstword $(MAKEFILE_LIST)) $(1)_ice_ocean_SIS2_by_dep $(1)_land_ice_ocean_LM3_SIS2: $(call stats-files,land_ice_ocean_LM3_SIS2,$(1),$(RESTART_STAGE).ignore) $(1)_coupled_AM2_LM3_SIS: $(call stats-files,coupled_AM2_LM3_SIS/,$(1),$(RESTART_STAGE).ignore) diff --git a/tools/MRS/Makefile.run b/tools/MRS/Makefile.run index 878307e431..4750b2acc4 100644 --- a/tools/MRS/Makefile.run +++ b/tools/MRS/Makefile.run @@ -44,7 +44,7 @@ ifeq ($(MEMORY),static) define run-static-model $(CONFIGS)/%/ocean.stats.$(1): $(CONFIGS)/%/input.nml $(CONFIGS)/%/MOM_input $(CONFIGS)/%/MOM_override echo $(BUILD)/$(1)/$$(MODE)/static/$$*/MOM6"("$$(STATIC_NPES)")" "=>" $$@ - cd $$(@D) && rm -rf Depth_list.nc CPU_stats.$(1) time_stamp.out $$(@F) RESTART FAIL U_velocity_truncations V_velocity_truncations && mkdir RESTART + cd $$(@D) && rm -rf Depth_list.nc CPU_stats.$(1) time_stamp.out exitcode $$(@F) RESTART FAIL U_velocity_truncations V_velocity_truncations seaice.stats.$(1) && mkdir RESTART cd $$(@D) && tic=$$$$(date +%s) && \ (OMP_NUM_THREADS=1 KMP_STACKSIZE=512m NC_BLKSZ=1M time $(MPIRUN) -n $$(STATIC_NPES) $$(call rel_path,$$(@D))$(BUILD)/$(1)/$$(MODE)/static/$$*/MOM6 > log.$(1).out || touch FAIL;) \ 2>&1 | egrep -v 'ing coupler_init| initializ|ing |CHECKSUM::|^ *$$$$' | sed 's,^,$$@: ,' ; toc=$$$$(date +%s) ; echo $$$$(($$$$toc-$$$$tic)) > walltime.$(1).out @@ -64,7 +64,7 @@ ifeq ($(LAYOUT),alt) define run-dynamic-model $(CONFIGS)/$(2)/%/ocean.stats.$(1): $(BUILD)/$(1)/$$(MODE)/$$(MEMORY)/$(2)/MOM6 $(CONFIGS)/$(2)/%/input.nml $(CONFIGS)/$(2)/%/MOM_input $(CONFIGS)/$(2)/%/MOM_override echo $(BUILD)/$(1)/$$(MODE)/$$(MEMORY)/$(2)/MOM6"("$$(ALT_NPES)")" "=>" $$@ - cd $$(@D) && rm -rf Depth_list.nc CPU_stats.$(1) time_stamp.out $$(@F) RESTART FAIL U_velocity_truncations V_velocity_truncations && mkdir RESTART + cd $$(@D) && rm -rf Depth_list.nc CPU_stats.$(1) time_stamp.out exitcode $$(@F) RESTART FAIL 
U_velocity_truncations V_velocity_truncations seaice.stats.$(1) && mkdir RESTART cd $$(@D) && tic=$$$$(date +%s) && \ (OMP_NUM_THREADS=1 KMP_STACKSIZE=512m NC_BLKSZ=1M time $(MPIRUN) -n $$(ALT_NPES) $$(call rel_path,$$(@D))$(BUILD)/$(1)/$$(MODE)/$$(MEMORY)/$(2)/MOM6 > log.$(1).out || touch FAIL;) \ 2>&1 | egrep -v 'ing coupler_init| initializ|ing |CHECKSUM::|^ *$$$$' | sed 's,^,$$@: ,' ; toc=$$$$(date +%s) ; echo $$$$(($$$$toc-$$$$tic)) > walltime.$(1).out @@ -80,7 +80,7 @@ else define run-dynamic-model $(CONFIGS)/$(2)/%/ocean.stats.$(1): $(BUILD)/$(1)/$$(MODE)/$$(MEMORY)/$(2)/MOM6 $(CONFIGS)/$(2)/%/input.nml $(CONFIGS)/$(2)/%/MOM_input $(CONFIGS)/$(2)/%/MOM_override echo $(BUILD)/$(1)/$$(MODE)/$$(MEMORY)/$(2)/MOM6"("$$(NPES)")" "=>" $$@ - cd $$(@D) && rm -rf Depth_list.nc CPU_stats.$(1) time_stamp.out $$(@F) RESTART FAIL U_velocity_truncations V_velocity_truncations && mkdir RESTART + cd $$(@D) && rm -rf Depth_list.nc CPU_stats.$(1) time_stamp.out exitcode $$(@F) RESTART FAIL U_velocity_truncations V_velocity_truncations seaice.stats.$(1) && mkdir RESTART cd $$(@D) && tic=$$$$(date +%s) && \ (OMP_NUM_THREADS=1 KMP_STACKSIZE=512m NC_BLKSZ=1M time $(MPIRUN) -n $$(NPES) $$(call rel_path,$$(@D))$(BUILD)/$(1)/$$(MODE)/$$(MEMORY)/$(2)/MOM6 > log.$(1).out || touch FAIL;) \ 2>&1 | egrep -v 'ing coupler_init| initializ|ing |CHECKSUM::|^ *$$$$' | sed 's,^,$$@: ,' ; toc=$$$$(date +%s) ; echo $$$$(($$$$toc-$$$$tic)) > walltime.$(1).out @@ -105,11 +105,6 @@ $(1)_coupled_AM2_LM3_SIS2: $(call stats-files,coupled_AM2_LM3_SIS2,$(1)) # The above target has been temporarily split as a workaround to the breakage of srun deferred-behavior at ORNL $(1)_all_by_dep: $(call stats-files,/,$(1)) $(1)_all: - -$$(MAKE) -f $(firstword $(MAKEFILE_LIST)) $(1)_all_by_dep - -$$(MAKE) -f $(firstword $(MAKEFILE_LIST)) $(1)_all_by_dep - -$$(MAKE) -f $(firstword $(MAKEFILE_LIST)) $(1)_all_by_dep - -$$(MAKE) -f $(firstword $(MAKEFILE_LIST)) $(1)_all_by_dep - -$$(MAKE) -f $(firstword $(MAKEFILE_LIST)) 
$(1)_all_by_dep $$(MAKE) -f $(firstword $(MAKEFILE_LIST)) $(1)_all_by_dep endef $(foreach c,$(COMPILERS),$(eval $(call generate-targets,$(c))))