diff --git a/doc/changelog.md b/doc/changelog.md index 433d542ce..ac3dc3a79 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -23,6 +23,11 @@ Description Detailed Notes +- Enable control over monitoring of Models launched with `experiment.start()` by + adding an optional boolean argument determining whether to monitor the particular + model or not. The argument is set to True by default, so no changes are needed for + the default behavior of monitoring all Models launched. + ([SmartSim-PR788](https://github.com/CrayLabs/SmartSim/pull/788)) - Copyright headers have been updated from "2021-2024" to "2021-2025" across 271 files including Python source files, configuration files, documentation, tests, Docker files, shell scripts, and other supporting files to reflect the new year. diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py index 6d145a198..966b2bbe9 100644 --- a/smartsim/_core/_cli/validate.py +++ b/smartsim/_core/_cli/validate.py @@ -171,7 +171,7 @@ def test_install( @contextlib.contextmanager def _env_vars_set_to( - evars: t.Mapping[str, t.Optional[str]] + evars: t.Mapping[str, t.Optional[str]], ) -> t.Generator[None, None, None]: envvars = tuple((var, os.environ.pop(var, None), val) for var, val in evars.items()) for var, _, tmpval in envvars: diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 15a5d7e27..3e84a5c59 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -115,6 +115,7 @@ def start( manifest: Manifest, block: bool = True, kill_on_interrupt: bool = True, + monitor: bool = True, ) -> None: """Start the passed SmartSim entities @@ -134,7 +135,7 @@ def start( SignalInterceptionStack.get(signal.SIGINT).push_unique( self._jobs.signal_interrupt ) - launched = self._launch(exp_name, exp_path, manifest) + launched = self._launch(exp_name, exp_path, manifest, monitor) # start the job manager thread if not already started if not 
self._jobs.actively_monitoring: @@ -172,7 +173,7 @@ def poll( :param kill_on_interrupt: flag for killing jobs when SIGINT is received """ self._jobs.kill_on_interrupt = kill_on_interrupt - to_monitor = self._jobs.jobs + to_monitor = self._jobs.monitor_jobs while len(to_monitor) > 0: time.sleep(interval) @@ -388,7 +389,7 @@ def symlink_output_files( ) def _launch( - self, exp_name: str, exp_path: str, manifest: Manifest + self, exp_name: str, exp_path: str, manifest: Manifest, monitor: bool = True ) -> LaunchedManifest[t.Tuple[str, Step]]: """Main launching function of the controller @@ -398,6 +399,7 @@ def _launch( :param exp_name: The name of the launching experiment :param exp_path: path to location of ``Experiment`` directory if generated :param manifest: Manifest of deployables to launch + :param monitor: boolean to signal whether to monitor deployables """ manifest_builder = LaunchedManifestBuilder[t.Tuple[str, Step]]( @@ -479,7 +481,7 @@ def _launch( # launch and symlink steps for step, entity in steps: - self._launch_step(step, entity) + self._launch_step(step, entity, monitor) self.symlink_output_files(step, entity) # symlink substeps to maintain directory structure @@ -570,11 +572,13 @@ def _launch_step( self, job_step: Step, entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]], + monitor: bool = True, ) -> None: """Use the launcher to launch a job step :param job_step: a job step instance :param entity: entity instance + :param monitor: boolean determining whether to monitor job :raises SmartSimError: if launch fails """ # attempt to retrieve entity name in JobManager.completed @@ -622,7 +626,7 @@ def _launch_step( self._jobs.restart_job(job_step.name, job_id, entity.name, is_task) else: logger.debug(f"Launching {entity.name}") - self._jobs.add_job(job_step.name, job_id, entity, is_task) + self._jobs.add_job(job_step.name, job_id, entity, is_task, monitor) def _create_batch_job_step( self, diff --git a/smartsim/_core/control/jobmanager.py 
b/smartsim/_core/control/jobmanager.py index b692edb8b..4f5cc0466 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -66,6 +66,7 @@ def __init__(self, lock: RLock, launcher: t.Optional[Launcher] = None) -> None: # active jobs self.jobs: t.Dict[str, Job] = {} + self.monitor_jobs: t.Dict[str, Job] = {} self.db_jobs: t.Dict[str, Job] = {} # completed jobs @@ -133,6 +134,8 @@ def move_to_completed(self, job: Job) -> None: del self.db_jobs[job.ename] elif job.ename in self.jobs: del self.jobs[job.ename] + if job.ename in self.monitor_jobs: + del self.monitor_jobs[job.ename] def __getitem__(self, entity_name: str) -> Job: """Return the job associated with the name of the entity @@ -166,12 +169,14 @@ def add_job( job_id: t.Optional[str], entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity], JobEntity], is_task: bool = True, + monitor: bool = True, ) -> None: """Add a job to the job manager which holds specific jobs by type. :param job_name: name of the job step :param job_id: job step id created by launcher :param entity: entity that was launched on job step + :param monitor: boolean to monitor job :param is_task: process monitored by TaskManager (True) or the WLM (True) """ launcher = str(self._launcher) @@ -183,6 +188,8 @@ def add_job( self.db_jobs[entity.name] = job else: self.jobs[entity.name] = job + if monitor: + self.monitor_jobs[entity.name] = job def is_finished(self, entity: SmartSimEntity) -> bool: """Detect if a job has completed diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index f603f218e..f1dfee90b 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -310,7 +310,7 @@ def finalize(self) -> LaunchedManifest[_T]: def _format_exp_telemetry_path( - exp_path: t.Union[str, "os.PathLike[str]"] + exp_path: t.Union[str, "os.PathLike[str]"], ) -> pathlib.Path: return pathlib.Path(exp_path, CONFIG.telemetry_subdir) diff --git 
a/smartsim/_core/entrypoints/telemetrymonitor.py b/smartsim/_core/entrypoints/telemetrymonitor.py index dc61858e3..b094bd533 100644 --- a/smartsim/_core/entrypoints/telemetrymonitor.py +++ b/smartsim/_core/entrypoints/telemetrymonitor.py @@ -49,7 +49,7 @@ def register_signal_handlers( - handle_signal: t.Callable[[int, t.Optional[FrameType]], None] + handle_signal: t.Callable[[int, t.Optional[FrameType]], None], ) -> None: """Register a signal handling function for all termination events diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 401187b02..d11e679db 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -189,6 +189,7 @@ def start( block: bool = True, summary: bool = False, kill_on_interrupt: bool = True, + monitor: bool = True, ) -> None: """Start passed instances using Experiment launcher @@ -229,11 +230,16 @@ def start( that all jobs launched by this experiment will be killed, and the zombie processes will need to be manually killed. + If `monitor=True`, all the jobs being started will be monitored + by the Controller. If `monitor=False`, the jobs will not be + monitored, meaning that their status will not be reported. + :param block: block execution until all non-database jobs are finished :param summary: print a launch summary prior to launch :param kill_on_interrupt: flag for killing jobs when ^C (SIGINT) signal is received. 
+ :param monitor: monitor the jobs being started """ start_manifest = Manifest(*args) self._create_entity_dir(start_manifest) @@ -246,6 +252,7 @@ def start( manifest=start_manifest, block=block, kill_on_interrupt=kill_on_interrupt, + monitor=monitor, ) except SmartSimError as e: logger.error(e) diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py index 1d6e9bedf..6aa8df06a 100644 --- a/smartsim/settings/palsSettings.py +++ b/smartsim/settings/palsSettings.py @@ -158,6 +158,16 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: ) self.run_args["transfer"] = None + def set_launcher_args( + self, arguments: t.Dict[str, t.Union[int, str, float, None]] + ) -> None: + """Set any other task launcher argument + + :param arguments: dictionary with string name and value + """ + for name, value in arguments.items(): + self.run_args[name] = value + def set_walltime(self, walltime: str) -> None: """Set the maximum number of seconds that a job will run diff --git a/tests/test_model.py b/tests/test_model.py index fe4a482b3..33cd537b8 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -94,12 +94,18 @@ def _monkeypatch_exp_controller(exp): entity_steps = [] def start_wo_job_manager( - self, exp_name, exp_path, manifest, block=True, kill_on_interrupt=True + self, + exp_name, + exp_path, + manifest, + block=True, + kill_on_interrupt=True, + monitor=True, ): self._launch(exp_name, exp_path, manifest) return LaunchedManifestBuilder("name", "path", "launcher").finalize() - def launch_step_nop(self, step, entity): + def launch_step_nop(self, step, entity, monitor): entity_steps.append((step, entity)) monkeypatch.setattr( diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py index dc297ccde..4e7d3eb1a 100644 --- a/tests/test_pals_settings.py +++ b/tests/test_pals_settings.py @@ -67,6 +67,12 @@ def turn_off_telemetry_indirect(monkeypatch): # func(None) +def test_set_launcher_args(): + settings = 
PalsMpiexecSettings(default_exe, **default_kwargs) + settings.set_launcher_args({"mem-bind": "none", "line-buffer": ""}) + assert settings.format_run_args() == ["--mem-bind", "none", "--line-buffer"] + + def test_affinity_script(): settings = PalsMpiexecSettings(default_exe, **default_kwargs) settings.set_gpu_affinity_script("/path/to/set_affinity_gpu.sh", 1, 2)