Skip to content

Commit 8b937e1

Browse files
authored
Prognostic run: restore timings and intermediate restarts (#1994)
The prognostic run currently cannot log Python wrapper timings or write out intermediate restarts, although it could do both in the past. This PR restores both capabilities and cleans up the timing log so that it is emitted as JSON. Significant internal changes: - Python wrapper timings are logged as JSON and printed to the screen - Intermediate restarts are now written out if specified in the namelist
1 parent 713ed38 commit 8b937e1

File tree

3 files changed

+33
-32
lines changed

3 files changed

+33
-32
lines changed

workflows/prognostic_c48_run/docs/development.rst

+8-17
Original file line numberDiff line numberDiff line change
@@ -3,35 +3,26 @@
33
Developer's Guide
44
-----------------
55

6-
The prognostic run is developed via docker and docker-compose. This
7-
environment is based off the `prognostic_run` docker image, but has
8-
bind-mounts to the packages in "/external" of this repository and this
9-
directory, which allows locally developing this workflow and its
10-
dependencies.
6+
The prognostic run is developed via docker. This environment is based off the
7+
`prognostic_run` docker image, but has bind-mounts to the packages in "/external"
8+
of this repository and this directory, which allows locally developing this workflow
9+
and its dependencies.
1110

1211
It is usually fastest to use the latest docker image from Google Container
1312
Registry. Pull the image::
1413

15-
docker pull us.gcr.io/vcm-ml/prognostic_run:latest
14+
make pull_image_prognostic_run
1615

1716
.. note::
1817

1918
If you run into problems, it would be best to rebuild the docker image from scratch::
2019

21-
docker-compose build fv3
20+
make build_image_prognostic_run
2221

2322
Enter a bash shell in the image::
2423

25-
docker-compose run fv3net bash
24+
make enter_prognostic_run
2625

27-
.. note ::
28-
29-
This docker-compose will propagate key-based authentication to Google
30-
Cloud Platform into the docker image. It expects that environmental variable
31-
``GOOGLE_APPLICATION_CREDENTIALS`` points to a json key. See Google's
32-
`documentation <https://cloud.google.com/iam/docs/creating-managing-service-account-keys>`_
33-
on how to generate one.
34-
35-
Run the tests::
26+
Then run the tests::
3627

3728
pytest

workflows/prognostic_c48_run/runtime/loop.py

+23-15
Original file line numberDiff line numberDiff line change
@@ -338,10 +338,6 @@ def _open_model(self, ml_config: MachineLearningConfig, step: str):
338338
def time(self) -> cftime.DatetimeJulian:
339339
return self._state.time
340340

341-
def cleanup(self):
342-
self._print_global_timings()
343-
self._fv3gfs.cleanup()
344-
345341
def _step_dynamics(self) -> Diagnostics:
346342
self._log_debug(f"Dynamics Step")
347343
self._fv3gfs.step_dynamics()
@@ -378,28 +374,34 @@ def _apply_physics(self) -> Diagnostics:
378374
"total_precip_after_physics": self._state[TOTAL_PRECIP],
379375
}
380376

381-
def _print_timing(self, name, min_val, max_val, mean_val):
382-
self._print(f"{name:<30}{min_val:15.4f}{max_val:15.4f}{mean_val:15.4f}")
383-
384-
def _print_global_timings(self, root=0):
385-
is_root = self.rank == root
386-
recvbuf = np.array(0.0)
387-
reduced = {}
377+
def _print_timings(self, reduced):
388378
self._print("-----------------------------------------------------------------")
389379
self._print(" Reporting clock statistics from python ")
390380
self._print("-----------------------------------------------------------------")
391381
self._print(f"{' ':<30}{'min (s)':>15}{'max (s)':>15}{'mean (s)':>15}")
382+
for name, timing in reduced.items():
383+
self._print(
384+
f"{name:<30}{timing['min']:15.4f}"
385+
f"{timing['max']:15.4f}{timing['mean']:15.4f}"
386+
)
387+
388+
def log_global_timings(self, root=0):
389+
is_root = self.rank == root
390+
recvbuf = np.array(0.0)
391+
reduced = {}
392392
for name, value in self._timer.times.items():
393393
reduced[name] = {}
394394
for label, op in [("min", MPI.MIN), ("max", MPI.MAX), ("mean", MPI.SUM)]:
395395
self.comm.Reduce(np.array(value), recvbuf, op=op)
396396
if is_root and label == "mean":
397397
recvbuf /= self.comm.Get_size()
398398
reduced[name][label] = recvbuf.copy().item()
399-
self._print_timing(
400-
name, reduced[name]["min"], reduced[name]["max"], reduced[name]["mean"]
401-
)
402-
self._log_info(f"python_timing:{json.dumps(reduced)}")
399+
self._print_timings(reduced)
400+
log_out = {
401+
"steps": reduced,
402+
"units": "[s], cumulative and reduced across ranks",
403+
}
404+
self._log_info(json.dumps({"python_timing": log_out}))
403405

404406
def _step_prephysics(self) -> Diagnostics:
405407

@@ -529,6 +531,11 @@ def _apply_postphysics_to_dycore_state(self) -> Diagnostics:
529531
)
530532
return diagnostics
531533

534+
def _intermediate_restarts(self) -> Diagnostics:
535+
self._log_info("Saving intermediate restarts if enabled.")
536+
self._fv3gfs.save_intermediate_restart_if_enabled()
537+
return {}
538+
532539
def __iter__(
533540
self,
534541
) -> Iterator[Tuple[cftime.DatetimeJulian, Dict[str, xr.DataArray]]]:
@@ -551,6 +558,7 @@ def __iter__(
551558
),
552559
self._compute_postphysics,
553560
self.monitor("python", self._apply_postphysics_to_dycore_state),
561+
self._intermediate_restarts,
554562
]:
555563
with self._timer.clock(substep.__name__):
556564
diagnostics.update(substep())

workflows/prognostic_c48_run/runtime/main.py

+2
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ def main():
7171
for diag_file in diag_files:
7272
diag_file.flush()
7373

74+
loop.log_global_timings()
75+
7476

7577
if __name__ == "__main__":
7678

0 commit comments

Comments
 (0)