Prognostic run: restore timings and intermediate restarts (#1994)

brianhenn · web-flow · commit 8b937e1dd3e7 · 2022-08-11T02:57:53.000+02:00
The prognostic run currently cannot log Python wrapper timings or write out intermediate restarts, although it could do both in the past. This PR restores both capabilities and cleans up the timing log so that it is emitted as JSON.

Significant internal changes:
- Python wrapper timings are logged as JSON and printed to the screen
- Intermediate restarts now written out if specified in namelist
diff --git a/workflows/prognostic_c48_run/docs/development.rst b/workflows/prognostic_c48_run/docs/development.rst
@@ -3,35 +3,26 @@
 Developer's Guide
 -----------------
 
-The prognostic run is developed via docker and docker-compose. This
-environment is based off the `prognostic_run` docker image, but has
-bind-mounts to the packages in "/external" of this repository and this
-directory, which allows locally developing this workflow and its
-dependencies.
+The prognostic run is developed via docker. This environment is based off the
+`prognostic_run` docker image, but has bind-mounts to the packages in "/external"
+of this repository and this directory, which allows locally developing this workflow
+and its dependencies.
 
 It is usually fastest to use the latest docker image from Google Container
 Repository. Pull the image::
 
-    docker pull us.gcr.io/vcm-ml/prognostic_run:latest
+    make pull_image_prognostic_run
 
 .. note::
 
     If you run into problems, it would be best to rebuild the docker image from scratch::
 
-        docker-compose build fv3
+        make build_image_prognostic_run
 
 Enter a bash shell in the image::
 
-    docker-compose run fv3net bash
+    make enter_prognostic_run
 
-.. note ::
-
-    This docker-compose will propagate key-based authentication to Google
-    Cloud Platform into the docker image. It expects that environmental variable
-    ``GOOGLE_APPLICATION_CREDENTIALS`` points to a json key. See Google's
-    `documentation <https://cloud.google.com/iam/docs/creating-managing-service-account-keys>`_
-    on how to generate one.
-
-Run the tests::
+Then run the tests::
 
     pytest
diff --git a/workflows/prognostic_c48_run/runtime/loop.py b/workflows/prognostic_c48_run/runtime/loop.py
@@ -338,10 +338,6 @@ def _open_model(self, ml_config: MachineLearningConfig, step: str):
     def time(self) -> cftime.DatetimeJulian:
         return self._state.time
 
-    def cleanup(self):
-        self._print_global_timings()
-        self._fv3gfs.cleanup()
-
     def _step_dynamics(self) -> Diagnostics:
         self._log_debug(f"Dynamics Step")
         self._fv3gfs.step_dynamics()
@@ -378,28 +374,34 @@ def _apply_physics(self) -> Diagnostics:
             "total_precip_after_physics": self._state[TOTAL_PRECIP],
         }
 
-    def _print_timing(self, name, min_val, max_val, mean_val):
-        self._print(f"{name:<30}{min_val:15.4f}{max_val:15.4f}{mean_val:15.4f}")
-
-    def _print_global_timings(self, root=0):
-        is_root = self.rank == root
-        recvbuf = np.array(0.0)
-        reduced = {}
+    def _print_timings(self, reduced):
         self._print("-----------------------------------------------------------------")
         self._print("         Reporting clock statistics from python                  ")
         self._print("-----------------------------------------------------------------")
         self._print(f"{' ':<30}{'min (s)':>15}{'max (s)':>15}{'mean (s)':>15}")
+        for name, timing in reduced.items():
+            self._print(
+                f"{name:<30}{timing['min']:15.4f}"
+                f"{timing['max']:15.4f}{timing['mean']:15.4f}"
+            )
+
+    def log_global_timings(self, root=0):
+        is_root = self.rank == root
+        recvbuf = np.array(0.0)
+        reduced = {}
         for name, value in self._timer.times.items():
             reduced[name] = {}
             for label, op in [("min", MPI.MIN), ("max", MPI.MAX), ("mean", MPI.SUM)]:
                 self.comm.Reduce(np.array(value), recvbuf, op=op)
                 if is_root and label == "mean":
                     recvbuf /= self.comm.Get_size()
                 reduced[name][label] = recvbuf.copy().item()
-            self._print_timing(
-                name, reduced[name]["min"], reduced[name]["max"], reduced[name]["mean"]
-            )
-        self._log_info(f"python_timing:{json.dumps(reduced)}")
+        self._print_timings(reduced)
+        log_out = {
+            "steps": reduced,
+            "units": "[s], cumulative and reduced across ranks",
+        }
+        self._log_info(json.dumps({"python_timing": log_out}))
 
     def _step_prephysics(self) -> Diagnostics:
 
@@ -529,6 +531,11 @@ def _apply_postphysics_to_dycore_state(self) -> Diagnostics:
         )
         return diagnostics
 
+    def _intermediate_restarts(self) -> Diagnostics:
+        self._log_info("Saving intermediate restarts if enabled.")
+        self._fv3gfs.save_intermediate_restart_if_enabled()
+        return {}
+
     def __iter__(
         self,
     ) -> Iterator[Tuple[cftime.DatetimeJulian, Dict[str, xr.DataArray]]]:
@@ -551,6 +558,7 @@ def __iter__(
                 ),
                 self._compute_postphysics,
                 self.monitor("python", self._apply_postphysics_to_dycore_state),
+                self._intermediate_restarts,
             ]:
                 with self._timer.clock(substep.__name__):
                     diagnostics.update(substep())
diff --git a/workflows/prognostic_c48_run/runtime/main.py b/workflows/prognostic_c48_run/runtime/main.py
@@ -71,6 +71,8 @@ def main():
     for diag_file in diag_files:
         diag_file.flush()
 
+    loop.log_global_timings()
+
 
 if __name__ == "__main__":