From 4b0e884ea246b0fb4261b9fa9e2cb486cd011034 Mon Sep 17 00:00:00 2001 From: balin Date: Fri, 28 Oct 2022 14:05:03 +0000 Subject: [PATCH 01/25] First round of changes for Polaris. Needs debugging and testing. --- smartsim/_core/launcher/pbs/pbsLauncher.py | 1 + smartsim/_core/launcher/step/mpiexecStep.py | 144 ++++++++++++++++++++ smartsim/database/orchestrator.py | 2 +- smartsim/settings/mpirunSettings.py | 21 ++- 4 files changed, 162 insertions(+), 6 deletions(-) create mode 100644 smartsim/_core/launcher/step/mpiexecStep.py diff --git a/smartsim/_core/launcher/pbs/pbsLauncher.py b/smartsim/_core/launcher/pbs/pbsLauncher.py index 8ee6d1d644..10a60f2f97 100644 --- a/smartsim/_core/launcher/pbs/pbsLauncher.py +++ b/smartsim/_core/launcher/pbs/pbsLauncher.py @@ -59,6 +59,7 @@ class PBSLauncher(WLMLauncher): QsubBatchSettings: QsubBatchStep, MpirunSettings: MpirunStep, RunSettings: LocalStep, + MpiexecSettings: MpiexecStep } def run(self, step): diff --git a/smartsim/_core/launcher/step/mpiexecStep.py b/smartsim/_core/launcher/step/mpiexecStep.py new file mode 100644 index 0000000000..9e6644c9bc --- /dev/null +++ b/smartsim/_core/launcher/step/mpiexecStep.py @@ -0,0 +1,144 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2022, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import shutil +from shlex import split as sh_split + +from ....error import AllocationError +from ....log import get_logger +from .step import Step + +logger = get_logger(__name__) + + +class MpiexecStep(Step): + def __init__(self, name, cwd, run_settings): + """Initialize a OpenMPI mpiexec job step + + :param name: name of the entity to be launched + :type name: str + :param cwd: path to launch dir + :type cwd: str + :param run_settings: run settings for entity + :type run_settings: RunSettings + """ + super().__init__(name, cwd) + self.run_settings = run_settings + self.alloc = None + if not self.run_settings.in_batch: + self._set_alloc() + + def get_launch_cmd(self): + """Get the command to launch this step + + :return: launch command + :rtype: list[str] + """ + mpiexec = self.run_settings.run_command + mpiexec_cmd = ["mpiexec", "--wdir", self.cwd] + # add env vars to mpiexec command + mpiexec_cmd.extend(self.run_settings.format_env_vars()) + + # add mpiexec settings to command + mpiexec_cmd.extend(self.run_settings.format_run_args()) + + if self.run_settings.colocated_db_settings: + # disable cpu binding as the entrypoint will set that + # for the application and database 
process now + mpiexec_cmd.extend(["--cpu-bind", "none"]) + + # Replace the command with the entrypoint wrapper script + bash = shutil.which("bash") + launch_script_path = self.get_colocated_launch_script() + mpiexec_cmd += [bash, launch_script_path] + + mpiexec_cmd += self._build_exe() + + # if its in a batch, redirect stdout to + # file in the cwd. + if self.run_settings.in_batch: + output = self.get_step_file(ending=".out") + mpiexec_cmd += [">", output] + return mpiexec_cmd + + def _set_alloc(self): + """Set the id of the allocation + + :raises AllocationError: allocation not listed or found + """ + if "PBS_JOBID" in os.environ: # cov-pbs + self.alloc = os.environ["PBS_JOBID"] + logger.debug( + f"Running on PBS allocation {self.alloc} gleaned from user environment" + ) + elif "COBALT_JOBID" in os.environ: # cov-cobalt + self.alloc = os.environ["COBALT_JOBID"] + logger.debug( + f"Running on Cobalt allocation {self.alloc} gleaned from user environment" + ) + elif "SLURM_JOBID" in os.environ: # cov-slurm + self.alloc = os.environ["SLURM_JOBID"] + logger.debug( + f"Running on Slurm allocation {self.alloc} gleaned from user environment" + ) + elif "LSB_JOBID" in os.environ: # cov-lsf + self.alloc = os.environ["LSB_JOBID"] + logger.debug( + f"Running on Slurm allocation {self.alloc} gleaned from user environment" + ) + else: + raise AllocationError( + "No allocation specified or found and not running in batch" + ) + + def _build_exe(self): + """Build the executable for this step + + :return: executable list + :rtype: list[str] + """ + if self.run_settings.mpmd: + return self._make_mpmd() + else: + exe = self.run_settings.exe + args = self.run_settings.exe_args + return exe + args + + def _make_mpmd(self): + """Build mpiexec (MPMD) executable""" + exe = self.run_settings.exe + args = self.run_settings.exe_args + cmd = exe + args + for mpmd in self.run_settings.mpmd: + cmd += [" : "] + cmd += mpmd.format_run_args() + cmd += mpmd.format_env_vars() + cmd += mpmd.exe + cmd 
+= mpmd.exe_args + + cmd = sh_split(" ".join(cmd)) + return cmd diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index 87b31a5eab..c7a5897898 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -102,7 +102,7 @@ def __init__( by_launcher = { "slurm": ["srun", "mpirun"], - "pbs": ["aprun", "mpirun"], + "pbs": ["aprun", "mpirun","mpiexec"], "cobalt": ["aprun", "mpirun"], "lsf": ["jsrun"], "local": [None], diff --git a/smartsim/settings/mpirunSettings.py b/smartsim/settings/mpirunSettings.py index 616a5685ab..175b1f72d2 100644 --- a/smartsim/settings/mpirunSettings.py +++ b/smartsim/settings/mpirunSettings.py @@ -117,17 +117,24 @@ def set_tasks_per_node(self, tasks_per_node): :param tasks_per_node: number of tasks to launch per node :type tasks_per_node: int """ - self.run_args["npernode"] = int(tasks_per_node) + if (self.run_command=="mpirun"): + self.run_args["npernode"] = int(tasks_per_node) + elif (self.run_command=="mpiexec"): + self.run_args["ppn"] = int(tasks_per_node) def set_tasks(self, tasks): """Set the number of tasks for this job - This sets ``--n`` + This sets ``--n`` for mpirun + and "--np" for mpiexec :param tasks: number of tasks :type tasks: int """ - self.run_args["n"] = int(tasks) + if (self.run_command=="mpirun"): + self.run_args["n"] = int(tasks) + elif (self.run_command=="mpiexec"): + self.run_args["np"] = int(tasks) def set_hostlist(self, host_list): """Set the hostlist for the ``mpirun`` command @@ -235,13 +242,17 @@ def format_env_vars(self): :rtype: list[str] """ formatted = [] + if (self.run_command=="mpirun"): + env_string = "-x" + elif (self.run_command=="mpiexec"): + env_string = "--env" if self.env_vars: for name, value in self.env_vars.items(): if value: - formatted += ["-x", "=".join((name, str(value)))] + formatted += [env_string, "=".join((name, str(value)))] else: - formatted += ["-x", name] + formatted += [env_string, name] return formatted From 
012ecf04f1e0432677f04273db0151bad4bf6934 Mon Sep 17 00:00:00 2001 From: balin Date: Fri, 28 Oct 2022 15:07:39 +0000 Subject: [PATCH 02/25] Fixed bugs. Clustered and co-located DB tests run on Polaris. --- smartsim/_core/launcher/pbs/pbsLauncher.py | 2 +- smartsim/_core/launcher/step/__init__.py | 1 + smartsim/settings/mpirunSettings.py | 12 ++++++------ 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/smartsim/_core/launcher/pbs/pbsLauncher.py b/smartsim/_core/launcher/pbs/pbsLauncher.py index 10a60f2f97..a72e15fd4e 100644 --- a/smartsim/_core/launcher/pbs/pbsLauncher.py +++ b/smartsim/_core/launcher/pbs/pbsLauncher.py @@ -32,7 +32,7 @@ from ....status import STATUS_CANCELLED, STATUS_COMPLETED from ...config import CONFIG from ..launcher import WLMLauncher -from ..step import AprunStep, LocalStep, MpirunStep, QsubBatchStep +from ..step import AprunStep, LocalStep, MpirunStep, QsubBatchStep, MpiexecStep from ..stepInfo import PBSStepInfo from .pbsCommands import qdel, qstat from .pbsParser import parse_qstat_jobid, parse_step_id_from_qstat diff --git a/smartsim/_core/launcher/step/__init__.py b/smartsim/_core/launcher/step/__init__.py index 2813e36c1e..ada146e9f6 100644 --- a/smartsim/_core/launcher/step/__init__.py +++ b/smartsim/_core/launcher/step/__init__.py @@ -3,5 +3,6 @@ from .localStep import LocalStep from .lsfStep import BsubBatchStep, JsrunStep from .mpirunStep import MpirunStep +from .mpiexecStep import MpiexecStep from .pbsStep import QsubBatchStep from .slurmStep import SbatchStep, SrunStep diff --git a/smartsim/settings/mpirunSettings.py b/smartsim/settings/mpirunSettings.py index 175b1f72d2..560a9a2336 100644 --- a/smartsim/settings/mpirunSettings.py +++ b/smartsim/settings/mpirunSettings.py @@ -117,9 +117,9 @@ def set_tasks_per_node(self, tasks_per_node): :param tasks_per_node: number of tasks to launch per node :type tasks_per_node: int """ - if (self.run_command=="mpirun"): + if "mpirun" in self.run_command: self.run_args["npernode"] 
= int(tasks_per_node) - elif (self.run_command=="mpiexec"): + elif "mpiexec" in self.run_command: self.run_args["ppn"] = int(tasks_per_node) def set_tasks(self, tasks): @@ -131,9 +131,9 @@ def set_tasks(self, tasks): :param tasks: number of tasks :type tasks: int """ - if (self.run_command=="mpirun"): + if "mpirun" in self.run_command: self.run_args["n"] = int(tasks) - elif (self.run_command=="mpiexec"): + elif "mpiexec" in self.run_command: self.run_args["np"] = int(tasks) def set_hostlist(self, host_list): @@ -242,9 +242,9 @@ def format_env_vars(self): :rtype: list[str] """ formatted = [] - if (self.run_command=="mpirun"): + if "mpirun" in self.run_command: env_string = "-x" - elif (self.run_command=="mpiexec"): + elif "mpiexec" in self.run_command: env_string = "--env" if self.env_vars: From b0ecaca05a87f3be4143d71e619e56d987c92ef3 Mon Sep 17 00:00:00 2001 From: balin Date: Wed, 2 Nov 2022 17:09:19 +0000 Subject: [PATCH 03/25] Added cpu binding options with mpiexec --- smartsim/_core/launcher/step/mpiexecStep.py | 2 +- smartsim/settings/mpirunSettings.py | 22 +++++++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/launcher/step/mpiexecStep.py b/smartsim/_core/launcher/step/mpiexecStep.py index 9e6644c9bc..ae058e0270 100644 --- a/smartsim/_core/launcher/step/mpiexecStep.py +++ b/smartsim/_core/launcher/step/mpiexecStep.py @@ -69,7 +69,7 @@ def get_launch_cmd(self): if self.run_settings.colocated_db_settings: # disable cpu binding as the entrypoint will set that # for the application and database process now - mpiexec_cmd.extend(["--cpu-bind", "none"]) + # mpiexec_cmd.extend(["--cpu-bind", "none"]) # Replace the command with the entrypoint wrapper script bash = shutil.which("bash") diff --git a/smartsim/settings/mpirunSettings.py b/smartsim/settings/mpirunSettings.py index 560a9a2336..5efdef9cfe 100644 --- a/smartsim/settings/mpirunSettings.py +++ b/smartsim/settings/mpirunSettings.py @@ -101,7 +101,8 @@ def 
set_task_map(self, task_mapping): def set_cpus_per_task(self, cpus_per_task): """Set the number of tasks for this job - This sets ``--cpus-per-proc`` + This sets ``--cpus-per-proc`` for mpirun + end ``--depth`` for mpiexec note: this option has been deprecated in openMPI 4.0+ and will soon be replaced. @@ -109,7 +110,24 @@ def set_cpus_per_task(self, cpus_per_task): :param cpus_per_task: number of tasks :type cpus_per_task: int """ - self.run_args["cpus-per-proc"] = int(cpus_per_task) + if "mpirun" in self.run_command: + self.run_args["cpus-per-proc"] = int(cpus_per_task) + elif "mpiexec" in self.run_command: + self.run_args["depth"] = int(cpus_per_task) + + def set_cpu_binding_type(self, bind_type): + """Specifies the cores to which MPI processes are bound + + This sets ``--bind-to`` for mpirun + and ``--cpu-bind`` for mpiexec + + :param bind_type: binding type + :type bind_type: str + """ + if "mpirun" in self.run_command: + self.run_args["bind-to"] = str(bind_type) + elif "mpiexec" in self.run_command: + self.run_args["cpu-bind"] = str(bind_type) def set_tasks_per_node(self, tasks_per_node): """Set the number of tasks per node From 5d2f82d3688b711c134b1120c5a4734906aa0ef5 Mon Sep 17 00:00:00 2001 From: balin Date: Wed, 21 Dec 2022 18:07:21 +0000 Subject: [PATCH 04/25] Correction to flag setting number of tasks for PalsMpiexecSettings --- smartsim/settings/palsSettings.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py index 0bb17e6434..8282ad1077 100644 --- a/smartsim/settings/palsSettings.py +++ b/smartsim/settings/palsSettings.py @@ -127,6 +127,14 @@ def set_cpu_binding_type(self, bind_type): """ self.run_args["cpu-bind"] = str(bind_type) + def set_tasks(self, tasks): + """Set the number of tasks + + :param tasks: number of total tasks to launch + :type tasks: int + """ + self.run_args["np"] = int(tasks) + def set_tasks_per_node(self, tasks_per_node): """Set the number of tasks per 
node From dd67114b7a63438ae066e6c124bf05b43527a488 Mon Sep 17 00:00:00 2001 From: balin Date: Wed, 21 Dec 2022 20:34:56 +0000 Subject: [PATCH 05/25] Removed mpiexecStep.py, no longer needed after merge with upstream SmartSim develop --- smartsim/_core/launcher/step/mpiexecStep.py | 144 -------------------- 1 file changed, 144 deletions(-) delete mode 100644 smartsim/_core/launcher/step/mpiexecStep.py diff --git a/smartsim/_core/launcher/step/mpiexecStep.py b/smartsim/_core/launcher/step/mpiexecStep.py deleted file mode 100644 index ae058e0270..0000000000 --- a/smartsim/_core/launcher/step/mpiexecStep.py +++ /dev/null @@ -1,144 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2022, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import os -import shutil -from shlex import split as sh_split - -from ....error import AllocationError -from ....log import get_logger -from .step import Step - -logger = get_logger(__name__) - - -class MpiexecStep(Step): - def __init__(self, name, cwd, run_settings): - """Initialize a OpenMPI mpiexec job step - - :param name: name of the entity to be launched - :type name: str - :param cwd: path to launch dir - :type cwd: str - :param run_settings: run settings for entity - :type run_settings: RunSettings - """ - super().__init__(name, cwd) - self.run_settings = run_settings - self.alloc = None - if not self.run_settings.in_batch: - self._set_alloc() - - def get_launch_cmd(self): - """Get the command to launch this step - - :return: launch command - :rtype: list[str] - """ - mpiexec = self.run_settings.run_command - mpiexec_cmd = ["mpiexec", "--wdir", self.cwd] - # add env vars to mpiexec command - mpiexec_cmd.extend(self.run_settings.format_env_vars()) - - # add mpiexec settings to command - mpiexec_cmd.extend(self.run_settings.format_run_args()) - - if self.run_settings.colocated_db_settings: - # disable cpu binding as the entrypoint will set that - # for the application and database process now - # mpiexec_cmd.extend(["--cpu-bind", "none"]) - - # Replace the command with the entrypoint wrapper script - bash = shutil.which("bash") - launch_script_path = self.get_colocated_launch_script() - mpiexec_cmd += [bash, launch_script_path] - - 
mpiexec_cmd += self._build_exe() - - # if its in a batch, redirect stdout to - # file in the cwd. - if self.run_settings.in_batch: - output = self.get_step_file(ending=".out") - mpiexec_cmd += [">", output] - return mpiexec_cmd - - def _set_alloc(self): - """Set the id of the allocation - - :raises AllocationError: allocation not listed or found - """ - if "PBS_JOBID" in os.environ: # cov-pbs - self.alloc = os.environ["PBS_JOBID"] - logger.debug( - f"Running on PBS allocation {self.alloc} gleaned from user environment" - ) - elif "COBALT_JOBID" in os.environ: # cov-cobalt - self.alloc = os.environ["COBALT_JOBID"] - logger.debug( - f"Running on Cobalt allocation {self.alloc} gleaned from user environment" - ) - elif "SLURM_JOBID" in os.environ: # cov-slurm - self.alloc = os.environ["SLURM_JOBID"] - logger.debug( - f"Running on Slurm allocation {self.alloc} gleaned from user environment" - ) - elif "LSB_JOBID" in os.environ: # cov-lsf - self.alloc = os.environ["LSB_JOBID"] - logger.debug( - f"Running on Slurm allocation {self.alloc} gleaned from user environment" - ) - else: - raise AllocationError( - "No allocation specified or found and not running in batch" - ) - - def _build_exe(self): - """Build the executable for this step - - :return: executable list - :rtype: list[str] - """ - if self.run_settings.mpmd: - return self._make_mpmd() - else: - exe = self.run_settings.exe - args = self.run_settings.exe_args - return exe + args - - def _make_mpmd(self): - """Build mpiexec (MPMD) executable""" - exe = self.run_settings.exe - args = self.run_settings.exe_args - cmd = exe + args - for mpmd in self.run_settings.mpmd: - cmd += [" : "] - cmd += mpmd.format_run_args() - cmd += mpmd.format_env_vars() - cmd += mpmd.exe - cmd += mpmd.exe_args - - cmd = sh_split(" ".join(cmd)) - return cmd From 52ff300bab1e8930fee16f56669d564c227ab622 Mon Sep 17 00:00:00 2001 From: balin Date: Wed, 3 May 2023 15:09:03 +0000 Subject: [PATCH 06/25] Added option to specify affinity script to 
PALS mpiexec settings. Needed when sharing GPU between different applications running with the co-located database deployment. --- smartsim/settings/mpiSettings.py | 1 + smartsim/settings/palsSettings.py | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/smartsim/settings/mpiSettings.py b/smartsim/settings/mpiSettings.py index 41b6b854c0..398960a819 100644 --- a/smartsim/settings/mpiSettings.py +++ b/smartsim/settings/mpiSettings.py @@ -78,6 +78,7 @@ def __init__( **kwargs, ) self.mpmd = [] + self.affinity_script = None if not shutil.which(self._run_command): msg = ( diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py index c556cec625..ff60adf5c0 100644 --- a/smartsim/settings/palsSettings.py +++ b/smartsim/settings/palsSettings.py @@ -185,6 +185,14 @@ def set_walltime(self, walltime): """ logger.warning("set_walltime not supported under PALS") + def set_gpu_affinity_script(self, affinity): + """ Set the GPU affinity through a bash script + + :param affinity: path to the affinity script + :type affinity: str + """ + self.affinity_script = str(affinity) + def format_run_args(self): """Return a list of MPI-standard formatted run arguments @@ -202,6 +210,10 @@ def format_run_args(self): args += [prefix + opt] else: args += [prefix + opt, str(value)] + + if self.affinity_script: + args += [self.affinity_script] + return args def format_env_vars(self): From f024ec4683c85b6874bb262b4ab6979b975d2d69 Mon Sep 17 00:00:00 2001 From: balin Date: Wed, 30 Aug 2023 20:17:42 +0000 Subject: [PATCH 07/25] Modified affinity script setting to include optional arguments --- smartsim/settings/palsSettings.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py index ff60adf5c0..3fb989d7cf 100644 --- a/smartsim/settings/palsSettings.py +++ b/smartsim/settings/palsSettings.py @@ -185,13 +185,15 @@ def set_walltime(self, walltime): """ 
logger.warning("set_walltime not supported under PALS") - def set_gpu_affinity_script(self, affinity): + def set_gpu_affinity_script(self, affinity, *args): """ Set the GPU affinity through a bash script :param affinity: path to the affinity script :type affinity: str """ - self.affinity_script = str(affinity) + self.affinity_script = [str(affinity)] + for arg in args: + self.affinity_script.append(str(arg)) def format_run_args(self): """Return a list of MPI-standard formatted run arguments @@ -211,9 +213,10 @@ def format_run_args(self): else: args += [prefix + opt, str(value)] - if self.affinity_script: - args += [self.affinity_script] - + if self.affinity_script is not None: + #args += [self.affinity_script,str(1),str(2)] + args += self.affinity_script + return args def format_env_vars(self): From f0fcf5c6f1f3122f1085997547f5db7dfd3b30e3 Mon Sep 17 00:00:00 2001 From: balin Date: Mon, 16 Oct 2023 15:50:36 +0000 Subject: [PATCH 08/25] Updated affinity script changes to have type defs and hints --- smartsim/settings/mpiSettings.py | 2 +- smartsim/settings/palsSettings.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/smartsim/settings/mpiSettings.py b/smartsim/settings/mpiSettings.py index a1d3181410..5b6b520e3d 100644 --- a/smartsim/settings/mpiSettings.py +++ b/smartsim/settings/mpiSettings.py @@ -81,7 +81,7 @@ def __init__( **kwargs, ) self.mpmd: t.List[RunSettings] = [] - self.affinity_script = None + self.affinity_script: t.List[str] = [] if not shutil.which(self._run_command): msg = ( diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py index c93d809cf6..1b10559760 100644 --- a/smartsim/settings/palsSettings.py +++ b/smartsim/settings/palsSettings.py @@ -182,15 +182,15 @@ def set_walltime(self, walltime: str) -> None: """ logger.warning("set_walltime not supported under PALS") - def set_gpu_affinity_script(self, affinity, *args): + def set_gpu_affinity_script(self, affinity: str, *args: t.Any) -> 
None: """ Set the GPU affinity through a bash script :param affinity: path to the affinity script :type affinity: str """ - self.affinity_script = [str(affinity)] + self.affinity_script.append(str(affinity)) for arg in args: - self.affinity_script.append(str(arg)) + self.affinity_script.append(str(arg)) def format_run_args(self) -> t.List[str]: """Return a list of MPI-standard formatted run arguments @@ -210,8 +210,7 @@ def format_run_args(self) -> t.List[str]: else: args += [prefix + opt, str(value)] - if self.affinity_script is not None: - #args += [self.affinity_script,str(1),str(2)] + if self.affinity_script: args += self.affinity_script return args From 315009d6def4aff38aaf2a49f472955be57016e5 Mon Sep 17 00:00:00 2001 From: balin Date: Wed, 18 Oct 2023 16:00:18 +0000 Subject: [PATCH 09/25] Added test for Pals affinity script option --- tests/test_pals_settings.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py index 7bc3a65206..4c837149dd 100644 --- a/tests/test_pals_settings.py +++ b/tests/test_pals_settings.py @@ -54,6 +54,11 @@ # with pytest.raises(SSUnsupportedError): # func(None) +def test_affinity_script(): + settings = PalsMpiexecSettings(default_exe, **default_kwargs) + settings.set_gpu_affinity_script("/path/to/set_affinity_gpu.sh", 1, 2) + assert settings.format_run_args() == ["/path/to/set_affinity_gpu.sh", "1", "2"] + def test_cpu_binding_type(): settings = PalsMpiexecSettings(default_exe, **default_kwargs) From 8df7eadf485a50bbfef9bd4fb0a5f9f50faaa910 Mon Sep 17 00:00:00 2001 From: Riccardo Balin Date: Wed, 21 Feb 2024 15:33:06 +0000 Subject: [PATCH 10/25] Modified buildenv.py to take my fork of RedisAI which updates to C++ std17 standard for Torch --- smartsim/_core/_install/buildenv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py index c100ac80e6..ea43ac41eb 100644 --- 
a/smartsim/_core/_install/buildenv.py +++ b/smartsim/_core/_install/buildenv.py @@ -282,9 +282,9 @@ class Versioner: # RedisAI REDISAI = RedisAIVersion(get_env("SMARTSIM_REDISAI", "1.2.7")) REDISAI_URL = get_env( - "SMARTSIM_REDISAI_URL", "https://github.com/RedisAI/RedisAI.git/" + "SMARTSIM_REDISAI_URL", "https://github.com/rickybalin/RedisAI.git/" ) - REDISAI_BRANCH = get_env("SMARTSIM_REDISAI_BRANCH", f"v{REDISAI}") + REDISAI_BRANCH = get_env("SMARTSIM_REDISAI_BRANCH", f"v{REDISAI}.rb") # ML/DL (based on RedisAI version defaults) # torch can be set by the user because we download that for them From 6e56a70175bbbe82ed4f7ad676a22f004adb69de Mon Sep 17 00:00:00 2001 From: rickybalin Date: Mon, 10 Jun 2024 14:35:02 -0400 Subject: [PATCH 11/25] Synced with SmartSim develop branch --- smartsim/_core/_install/buildenv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py index edb1ff116e..847dac30c4 100644 --- a/smartsim/_core/_install/buildenv.py +++ b/smartsim/_core/_install/buildenv.py @@ -263,9 +263,9 @@ class Versioner: # RedisAI REDISAI = RedisAIVersion(get_env("SMARTSIM_REDISAI", "1.2.7")) REDISAI_URL = get_env( - "SMARTSIM_REDISAI_URL", "https://github.com/RedisAI/RedisAI.git/" + "SMARTSIM_REDISAI_URL", "https://github.com/rickybalin/RedisAI.git/" ) - REDISAI_BRANCH = get_env("SMARTSIM_REDISAI_BRANCH", f"v{REDISAI}") + REDISAI_BRANCH = get_env("SMARTSIM_REDISAI_BRANCH", f"v{REDISAI}.rb") # ML/DL (based on RedisAI version defaults) # torch can be set by the user because we download that for them From 6217a26a924118b92709619e374b69b5e3003a07 Mon Sep 17 00:00:00 2001 From: Riccardo Balin Date: Mon, 21 Oct 2024 21:01:26 +0000 Subject: [PATCH 12/25] Added feature to pals settings to add any mpiexec argument --- smartsim/settings/palsSettings.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py 
index 4100e8efeb..3657903668 100644 --- a/smartsim/settings/palsSettings.py +++ b/smartsim/settings/palsSettings.py @@ -158,6 +158,14 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: ) self.run_args["transfer"] = None + def set_launcher_args(self, arguments: dict) -> None: + """Set any other task launcher argument + + :param arguments: dictionary with string name and value + """ + for name, value in arguments.items(): + self.run_args[name] = value + def set_walltime(self, walltime: str) -> None: """Set the maximum number of seconds that a job will run From 69d2ef288f1d4cf4c6a532726a4ee9e5f90b18ac Mon Sep 17 00:00:00 2001 From: Riccardo Balin Date: Fri, 28 Feb 2025 22:04:32 +0000 Subject: [PATCH 13/25] Add a monitor flag to experiment start so can select which jobs to monitor and which to not --- smartsim/_core/control/controller.py | 12 +++++++----- smartsim/_core/control/jobmanager.py | 6 ++++++ smartsim/experiment.py | 2 ++ 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 43a2185455..d163851b93 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -115,6 +115,7 @@ def start( manifest: Manifest, block: bool = True, kill_on_interrupt: bool = True, + monitor: bool = True, ) -> None: """Start the passed SmartSim entities @@ -134,7 +135,7 @@ def start( SignalInterceptionStack.get(signal.SIGINT).push_unique( self._jobs.signal_interrupt ) - launched = self._launch(exp_name, exp_path, manifest) + launched = self._launch(exp_name, exp_path, manifest, monitor) # start the job manager thread if not already started if not self._jobs.actively_monitoring: @@ -172,7 +173,7 @@ def poll( :param kill_on_interrupt: flag for killing jobs when SIGINT is received """ self._jobs.kill_on_interrupt = kill_on_interrupt - to_monitor = self._jobs.jobs + to_monitor = self._jobs.monitor_jobs while len(to_monitor) > 0: 
time.sleep(interval) @@ -388,7 +389,7 @@ def symlink_output_files( ) def _launch( - self, exp_name: str, exp_path: str, manifest: Manifest + self, exp_name: str, exp_path: str, manifest: Manifest, monitor: bool = True ) -> LaunchedManifest[t.Tuple[str, Step]]: """Main launching function of the controller @@ -479,7 +480,7 @@ def _launch( # launch and symlink steps for step, entity in steps: - self._launch_step(step, entity) + self._launch_step(step, entity, monitor) self.symlink_output_files(step, entity) # symlink substeps to maintain directory structure @@ -570,6 +571,7 @@ def _launch_step( self, job_step: Step, entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]], + monitor: bool = True, ) -> None: """Use the launcher to launch a job step @@ -622,7 +624,7 @@ def _launch_step( self._jobs.restart_job(job_step.name, job_id, entity.name, is_task) else: logger.debug(f"Launching {entity.name}") - self._jobs.add_job(job_step.name, job_id, entity, is_task) + self._jobs.add_job(job_step.name, job_id, entity, is_task, monitor) def _create_batch_job_step( self, diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index 1bc24cf9af..67f52466bf 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -66,6 +66,7 @@ def __init__(self, lock: RLock, launcher: t.Optional[Launcher] = None) -> None: # active jobs self.jobs: t.Dict[str, Job] = {} + self.monitor_jobs: t.Dict[str, Job] = {} self.db_jobs: t.Dict[str, Job] = {} # completed jobs @@ -133,6 +134,8 @@ def move_to_completed(self, job: Job) -> None: del self.db_jobs[job.ename] elif job.ename in self.jobs: del self.jobs[job.ename] + if job.ename in self.monitor_jobs: + del self.monitor_jobs[job.ename] def __getitem__(self, entity_name: str) -> Job: """Return the job associated with the name of the entity @@ -166,6 +169,7 @@ def add_job( job_id: t.Optional[str], entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity], JobEntity], 
is_task: bool = True, + monitor: bool = True, ) -> None: """Add a job to the job manager which holds specific jobs by type. @@ -183,6 +187,8 @@ def add_job( self.db_jobs[entity.name] = job else: self.jobs[entity.name] = job + if monitor: + self.monitor_jobs[entity.name] = job def is_finished(self, entity: SmartSimEntity) -> bool: """Detect if a job has completed diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 6b9d6a1fb6..ba3e3997b9 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -189,6 +189,7 @@ def start( block: bool = True, summary: bool = False, kill_on_interrupt: bool = True, + monitor: bool = True, ) -> None: """Start passed instances using Experiment launcher @@ -246,6 +247,7 @@ def start( manifest=start_manifest, block=block, kill_on_interrupt=kill_on_interrupt, + monitor=monitor, ) except SmartSimError as e: logger.error(e) From d981269ef1f5ac8e7da87e00199a40922364b98a Mon Sep 17 00:00:00 2001 From: Riccardo Balin Date: Wed, 2 Jul 2025 21:13:27 +0000 Subject: [PATCH 14/25] Clean up --- smartsim/_core/_install/buildenv.py | 4 ++-- tests/test_pals_settings.py | 6 ------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py index 847dac30c4..edb1ff116e 100644 --- a/smartsim/_core/_install/buildenv.py +++ b/smartsim/_core/_install/buildenv.py @@ -263,9 +263,9 @@ class Versioner: # RedisAI REDISAI = RedisAIVersion(get_env("SMARTSIM_REDISAI", "1.2.7")) REDISAI_URL = get_env( - "SMARTSIM_REDISAI_URL", "https://github.com/rickybalin/RedisAI.git/" + "SMARTSIM_REDISAI_URL", "https://github.com/RedisAI/RedisAI.git/" ) - REDISAI_BRANCH = get_env("SMARTSIM_REDISAI_BRANCH", f"v{REDISAI}.rb") + REDISAI_BRANCH = get_env("SMARTSIM_REDISAI_BRANCH", f"v{REDISAI}") # ML/DL (based on RedisAI version defaults) # torch can be set by the user because we download that for them diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py index 
81de181062..6ec4e8156e 100644 --- a/tests/test_pals_settings.py +++ b/tests/test_pals_settings.py @@ -72,12 +72,6 @@ def test_affinity_script(): assert settings.format_run_args() == ["/path/to/set_affinity_gpu.sh", "1", "2"] -def test_affinity_script(): - settings = PalsMpiexecSettings(default_exe, **default_kwargs) - settings.set_gpu_affinity_script("/path/to/set_affinity_gpu.sh", 1, 2) - assert settings.format_run_args() == ["/path/to/set_affinity_gpu.sh", "1", "2"] - - def test_cpu_binding_type(): settings = PalsMpiexecSettings(default_exe, **default_kwargs) settings.set_cpu_binding_type("numa") From f4770301e5a965e4fae736f81173f402b6731d85 Mon Sep 17 00:00:00 2001 From: Riccardo Balin Date: Wed, 2 Jul 2025 21:44:05 +0000 Subject: [PATCH 15/25] Update docstrings and add test for set_launcher_args() in PALS settings --- smartsim/_core/control/controller.py | 2 ++ smartsim/_core/control/jobmanager.py | 1 + tests/test_pals_settings.py | 9 +++++++++ 3 files changed, 12 insertions(+) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index d163851b93..a22357b585 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -399,6 +399,7 @@ def _launch( :param exp_name: The name of the launching experiment :param exp_path: path to location of ``Experiment`` directory if generated :param manifest: Manifest of deployables to launch + :param monitor: boolean to signal whether to monitor deployables """ manifest_builder = LaunchedManifestBuilder[t.Tuple[str, Step]]( @@ -577,6 +578,7 @@ def _launch_step( :param job_step: a job step instance :param entity: entity instance + :param monitor: boolean determining whether to monitor job :raises SmartSimError: if launch fails """ # attempt to retrieve entity name in JobManager.completed diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index 67f52466bf..01c805433f 100644 --- a/smartsim/_core/control/jobmanager.py +++ 
b/smartsim/_core/control/jobmanager.py @@ -176,6 +176,7 @@ def add_job( :param job_name: name of the job step :param job_id: job step id created by launcher :param entity: entity that was launched on job step + :param monitor: boolean to monitor job :param is_task: process monitored by TaskManager (True) or the WLM (True) """ launcher = str(self._launcher) diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py index 6ec4e8156e..91cac1614d 100644 --- a/tests/test_pals_settings.py +++ b/tests/test_pals_settings.py @@ -66,6 +66,15 @@ def turn_off_telemetry_indirect(monkeypatch): # with pytest.raises(SSUnsupportedError): # func(None) + +def test_set_launcher_args(): + settings = PalsMpiexecSettings(default_exe, **default_kwargs) + settings.set_launcher_args( + {"mem-bind": "none", "line-buffer": ""} + ) + assert settings.format_run_args() == ["--mem-bind", "none", "--line-buffer"] + + def test_affinity_script(): settings = PalsMpiexecSettings(default_exe, **default_kwargs) settings.set_gpu_affinity_script("/path/to/set_affinity_gpu.sh", 1, 2) From 6327338d7172ef575fba5990c79ccc050ce3605c Mon Sep 17 00:00:00 2001 From: Riccardo Balin Date: Wed, 2 Jul 2025 22:03:59 +0000 Subject: [PATCH 16/25] Fix type --- smartsim/settings/palsSettings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py index 3657903668..4889926c13 100644 --- a/smartsim/settings/palsSettings.py +++ b/smartsim/settings/palsSettings.py @@ -158,7 +158,7 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: ) self.run_args["transfer"] = None - def set_launcher_args(self, arguments: dict) -> None: + def set_launcher_args(self, arguments: t.Dict[str, t.Union[int, str, float, None]]]) -> None: """Set any other task launcher argument :param arguments: dictionary with string name and value From 7426dbc99b339c91fd304e3c5de3ef4e7951b3e4 Mon Sep 17 00:00:00 2001 From: Riccardo Balin Date: Wed, 2 
Jul 2025 22:30:42 +0000 Subject: [PATCH 17/25] Fix typo --- smartsim/settings/palsSettings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py index 4889926c13..67c3c1c8a2 100644 --- a/smartsim/settings/palsSettings.py +++ b/smartsim/settings/palsSettings.py @@ -158,7 +158,7 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: ) self.run_args["transfer"] = None - def set_launcher_args(self, arguments: t.Dict[str, t.Union[int, str, float, None]]]) -> None: + def set_launcher_args(self, arguments: t.Dict[str, t.Union[int, str, float, None]]) -> None: """Set any other task launcher argument :param arguments: dictionary with string name and value From 058d0aad267be4a683f88b740cc4f36f91c2c24c Mon Sep 17 00:00:00 2001 From: Riccardo Balin Date: Wed, 2 Jul 2025 22:38:59 +0000 Subject: [PATCH 18/25] Fix line length error --- smartsim/settings/palsSettings.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py index 67c3c1c8a2..f7ab90bd3c 100644 --- a/smartsim/settings/palsSettings.py +++ b/smartsim/settings/palsSettings.py @@ -158,7 +158,10 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: ) self.run_args["transfer"] = None - def set_launcher_args(self, arguments: t.Dict[str, t.Union[int, str, float, None]]) -> None: + def set_launcher_args( + self, + arguments: t.Dict[str, t.Union[int, str, float, None]] + ) -> None: """Set any other task launcher argument :param arguments: dictionary with string name and value From 93d79d90b598369e17b93cb5a41f33d6c9b3eaea Mon Sep 17 00:00:00 2001 From: Riccardo Balin Date: Wed, 2 Jul 2025 22:53:33 +0000 Subject: [PATCH 19/25] Formatting changes from make style --- smartsim/_core/_cli/validate.py | 2 +- smartsim/_core/control/manifest.py | 2 +- smartsim/_core/control/previewrenderer.py | 6 ++++-- 
smartsim/_core/entrypoints/dragon.py | 6 ++++-- smartsim/_core/entrypoints/redis.py | 6 ++++-- smartsim/_core/entrypoints/telemetrymonitor.py | 2 +- smartsim/settings/palsSettings.py | 3 +-- tests/install/test_builder.py | 6 ++++-- tests/test_dbnode.py | 8 ++++++-- tests/test_dragon_backend.py | 6 ++++-- tests/test_pals_settings.py | 4 +--- 11 files changed, 31 insertions(+), 20 deletions(-) diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py index 96d46d6ee0..96e995a132 100644 --- a/smartsim/_core/_cli/validate.py +++ b/smartsim/_core/_cli/validate.py @@ -172,7 +172,7 @@ def test_install( @contextlib.contextmanager def _env_vars_set_to( - evars: t.Mapping[str, t.Optional[str]] + evars: t.Mapping[str, t.Optional[str]], ) -> t.Generator[None, None, None]: envvars = tuple((var, os.environ.pop(var, None), val) for var, val in evars.items()) for var, _, tmpval in envvars: diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index fd5770f187..6543485ea3 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -310,7 +310,7 @@ def finalize(self) -> LaunchedManifest[_T]: def _format_exp_telemetry_path( - exp_path: t.Union[str, "os.PathLike[str]"] + exp_path: t.Union[str, "os.PathLike[str]"], ) -> pathlib.Path: return pathlib.Path(exp_path, CONFIG.telemetry_subdir) diff --git a/smartsim/_core/control/previewrenderer.py b/smartsim/_core/control/previewrenderer.py index 857a703973..85bd032be5 100644 --- a/smartsim/_core/control/previewrenderer.py +++ b/smartsim/_core/control/previewrenderer.py @@ -188,5 +188,7 @@ def _check_output_format(output_format: Format) -> None: Check that a valid file output format is given. 
""" if not output_format == Format.PLAINTEXT: - raise PreviewFormatError(f"The only valid output format currently available \ -is {Format.PLAINTEXT.value}") + raise PreviewFormatError( + f"The only valid output format currently available \ +is {Format.PLAINTEXT.value}" + ) diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index 92ebd735fb..e7c3c1a513 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -87,14 +87,16 @@ def print_summary(network_interface: str, ip_address: str) -> None: log_path = get_log_path() with open(log_path, "w", encoding="utf-8") as dragon_config_log: dragon_config_log.write( - textwrap.dedent(f"""\ + textwrap.dedent( + f"""\ -------- Dragon Configuration -------- IPADDRESS: {ip_address} NETWORK: {network_interface} HOSTNAME: {socket.gethostname()} DRAGON_SERVER_CONFIG: {json.dumps(zmq_config)} -------------------------------------- - """), + """ + ), ) diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py index c4d8cbbd63..d1566f59ec 100644 --- a/smartsim/_core/entrypoints/redis.py +++ b/smartsim/_core/entrypoints/redis.py @@ -78,7 +78,8 @@ def print_summary( cmd: t.List[str], network_interface: str, shard_data: LaunchedShardData ) -> None: print( - textwrap.dedent(f"""\ + textwrap.dedent( + f"""\ ----------- Running Command ---------- COMMAND: {' '.join(cmd)} IPADDRESS: {shard_data.hostname} @@ -88,7 +89,8 @@ def print_summary( --------------- Output --------------- - """), + """ + ), flush=True, ) diff --git a/smartsim/_core/entrypoints/telemetrymonitor.py b/smartsim/_core/entrypoints/telemetrymonitor.py index 5ed1a0c91a..105cc1cd6f 100644 --- a/smartsim/_core/entrypoints/telemetrymonitor.py +++ b/smartsim/_core/entrypoints/telemetrymonitor.py @@ -49,7 +49,7 @@ def register_signal_handlers( - handle_signal: t.Callable[[int, t.Optional[FrameType]], None] + handle_signal: t.Callable[[int, t.Optional[FrameType]], None], ) -> 
None: """Register a signal handling function for all termination events diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py index f7ab90bd3c..a2db531c95 100644 --- a/smartsim/settings/palsSettings.py +++ b/smartsim/settings/palsSettings.py @@ -159,8 +159,7 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: self.run_args["transfer"] = None def set_launcher_args( - self, - arguments: t.Dict[str, t.Union[int, str, float, None]] + self, arguments: t.Dict[str, t.Union[int, str, float, None]] ) -> None: """Set any other task launcher argument diff --git a/tests/install/test_builder.py b/tests/install/test_builder.py index feaf7e54fe..1cbbd320b6 100644 --- a/tests/install/test_builder.py +++ b/tests/install/test_builder.py @@ -373,13 +373,15 @@ def test_git_commands_are_configered_correctly_for_platforms(plat, cmd, expected def test_modify_source_files(p_test_dir): def make_text_blurb(food): - return textwrap.dedent(f"""\ + return textwrap.dedent( + f"""\ My favorite food is {food} {food} is an important part of a healthy breakfast {food} {food} {food} {food} This line should be unchanged! --> {food} <-- - """) + """ + ) original_word = "SPAM" mutated_word = "EGGS" diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py index 04845344cb..f49f7c638e 100644 --- a/tests/test_dbnode.py +++ b/tests/test_dbnode.py @@ -83,7 +83,9 @@ def test_launched_shard_info_can_be_serialized(): @pytest.mark.parametrize("limit", [None, 1]) def test_db_node_can_parse_launched_shard_info(limit): rand_shards = [_random_shard_info() for _ in range(3)] - with io.StringIO(textwrap.dedent("""\ + with io.StringIO( + textwrap.dedent( + """\ This is some file like str -------------------------- @@ -98,7 +100,9 @@ def test_db_node_can_parse_launched_shard_info(limit): SMARTSIM_ORC_SHARD_INFO: {} All other lines should be ignored. 
- """).format(*(json.dumps(s.to_dict()) for s in rand_shards))) as stream: + """ + ).format(*(json.dumps(s.to_dict()) for s in rand_shards)) + ) as stream: parsed_shards = DBNode._parse_launched_shard_info_from_iterable(stream, limit) if limit is not None: rand_shards = rand_shards[:limit] diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py index a510f660a5..1868915ccf 100644 --- a/tests/test_dragon_backend.py +++ b/tests/test_dragon_backend.py @@ -435,7 +435,8 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None: set_mock_group_infos(monkeypatch, dragon_backend) hosts = dragon_backend.hosts - expected_message = textwrap.dedent(f"""\ + expected_message = textwrap.dedent( + f"""\ Dragon server backend update | Host | Status | |---------|----------| @@ -448,6 +449,7 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None: | del999-2 | Cancelled | {hosts[1]} | -9 | 1 | | c101vz-3 | Completed | {hosts[1]},{hosts[2]} | 0 | 2 | | 0ghjk1-4 | Failed | {hosts[2]} | -1 | 1 | - | ljace0-5 | NeverStarted | | | 0 |""") + | ljace0-5 | NeverStarted | | | 0 |""" + ) assert dragon_backend.status_message == expected_message diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py index 91cac1614d..a9f8dda5ed 100644 --- a/tests/test_pals_settings.py +++ b/tests/test_pals_settings.py @@ -69,9 +69,7 @@ def turn_off_telemetry_indirect(monkeypatch): def test_set_launcher_args(): settings = PalsMpiexecSettings(default_exe, **default_kwargs) - settings.set_launcher_args( - {"mem-bind": "none", "line-buffer": ""} - ) + settings.set_launcher_args({"mem-bind": "none", "line-buffer": ""}) assert settings.format_run_args() == ["--mem-bind", "none", "--line-buffer"] From 683d733215493fe3e6f43cae2dd5244b4f729d96 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 4 Jul 2025 16:25:33 +0200 Subject: [PATCH 20/25] Make style --- smartsim/_core/control/previewrenderer.py | 6 ++---- smartsim/_core/entrypoints/dragon.py | 6 ++---- 
smartsim/_core/entrypoints/redis.py | 6 ++---- tests/install/test_builder.py | 6 ++---- tests/test_dbnode.py | 8 ++------ tests/test_dragon_backend.py | 6 ++---- 6 files changed, 12 insertions(+), 26 deletions(-) diff --git a/smartsim/_core/control/previewrenderer.py b/smartsim/_core/control/previewrenderer.py index 85bd032be5..857a703973 100644 --- a/smartsim/_core/control/previewrenderer.py +++ b/smartsim/_core/control/previewrenderer.py @@ -188,7 +188,5 @@ def _check_output_format(output_format: Format) -> None: Check that a valid file output format is given. """ if not output_format == Format.PLAINTEXT: - raise PreviewFormatError( - f"The only valid output format currently available \ -is {Format.PLAINTEXT.value}" - ) + raise PreviewFormatError(f"The only valid output format currently available \ +is {Format.PLAINTEXT.value}") diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index e7c3c1a513..92ebd735fb 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -87,16 +87,14 @@ def print_summary(network_interface: str, ip_address: str) -> None: log_path = get_log_path() with open(log_path, "w", encoding="utf-8") as dragon_config_log: dragon_config_log.write( - textwrap.dedent( - f"""\ + textwrap.dedent(f"""\ -------- Dragon Configuration -------- IPADDRESS: {ip_address} NETWORK: {network_interface} HOSTNAME: {socket.gethostname()} DRAGON_SERVER_CONFIG: {json.dumps(zmq_config)} -------------------------------------- - """ - ), + """), ) diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py index d1566f59ec..c4d8cbbd63 100644 --- a/smartsim/_core/entrypoints/redis.py +++ b/smartsim/_core/entrypoints/redis.py @@ -78,8 +78,7 @@ def print_summary( cmd: t.List[str], network_interface: str, shard_data: LaunchedShardData ) -> None: print( - textwrap.dedent( - f"""\ + textwrap.dedent(f"""\ ----------- Running Command ---------- COMMAND: {' '.join(cmd)} 
IPADDRESS: {shard_data.hostname} @@ -89,8 +88,7 @@ def print_summary( --------------- Output --------------- - """ - ), + """), flush=True, ) diff --git a/tests/install/test_builder.py b/tests/install/test_builder.py index 1cbbd320b6..feaf7e54fe 100644 --- a/tests/install/test_builder.py +++ b/tests/install/test_builder.py @@ -373,15 +373,13 @@ def test_git_commands_are_configered_correctly_for_platforms(plat, cmd, expected def test_modify_source_files(p_test_dir): def make_text_blurb(food): - return textwrap.dedent( - f"""\ + return textwrap.dedent(f"""\ My favorite food is {food} {food} is an important part of a healthy breakfast {food} {food} {food} {food} This line should be unchanged! --> {food} <-- - """ - ) + """) original_word = "SPAM" mutated_word = "EGGS" diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py index f49f7c638e..04845344cb 100644 --- a/tests/test_dbnode.py +++ b/tests/test_dbnode.py @@ -83,9 +83,7 @@ def test_launched_shard_info_can_be_serialized(): @pytest.mark.parametrize("limit", [None, 1]) def test_db_node_can_parse_launched_shard_info(limit): rand_shards = [_random_shard_info() for _ in range(3)] - with io.StringIO( - textwrap.dedent( - """\ + with io.StringIO(textwrap.dedent("""\ This is some file like str -------------------------- @@ -100,9 +98,7 @@ def test_db_node_can_parse_launched_shard_info(limit): SMARTSIM_ORC_SHARD_INFO: {} All other lines should be ignored. 
- """ - ).format(*(json.dumps(s.to_dict()) for s in rand_shards)) - ) as stream: + """).format(*(json.dumps(s.to_dict()) for s in rand_shards))) as stream: parsed_shards = DBNode._parse_launched_shard_info_from_iterable(stream, limit) if limit is not None: rand_shards = rand_shards[:limit] diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py index 1868915ccf..a510f660a5 100644 --- a/tests/test_dragon_backend.py +++ b/tests/test_dragon_backend.py @@ -435,8 +435,7 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None: set_mock_group_infos(monkeypatch, dragon_backend) hosts = dragon_backend.hosts - expected_message = textwrap.dedent( - f"""\ + expected_message = textwrap.dedent(f"""\ Dragon server backend update | Host | Status | |---------|----------| @@ -449,7 +448,6 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None: | del999-2 | Cancelled | {hosts[1]} | -9 | 1 | | c101vz-3 | Completed | {hosts[1]},{hosts[2]} | 0 | 2 | | 0ghjk1-4 | Failed | {hosts[2]} | -1 | 1 | - | ljace0-5 | NeverStarted | | | 0 |""" - ) + | ljace0-5 | NeverStarted | | | 0 |""") assert dragon_backend.status_message == expected_message From 55a02c048f29c96a28f72c99646a69f5b525b26a Mon Sep 17 00:00:00 2001 From: balin Date: Thu, 9 Oct 2025 16:57:23 +0000 Subject: [PATCH 21/25] Update changelog.md --- doc/changelog.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/changelog.md b/doc/changelog.md index 433d542cee..ac3dc3a79c 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -23,6 +23,11 @@ Description Detailed Notes +- Enable control over monitoring of Models launched with `experiment.start()` by + adding an optional boolean argument determining whether to monitor the particular + model or not. The argument is set to True by default, so no changes are needed for + the default behavior of monitoring all Models launched. 
+ ([SmartSim-PR788](https://github.com/CrayLabs/SmartSim/pull/788)) - Copyright headers have been updated from "2021-2024" to "2021-2025" across 271 files including Python source files, configuration files, documentation, tests, Docker files, shell scripts, and other supporting files to reflect the new year. From 04a78c685b09c5213e953ebd43a6b768bf143915 Mon Sep 17 00:00:00 2001 From: balin Date: Thu, 9 Oct 2025 17:29:44 +0000 Subject: [PATCH 22/25] Add the new monitor parameter to the docstring of experiment.start() --- smartsim/experiment.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 099c37bc92..37580ab6a1 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -230,11 +230,16 @@ def start( that all jobs launched by this experiment will be killed, and the zombie processes will need to be manually killed. + If `monitor=True`, all the jobs being started will be monitored + by the Controller. If `monitor=False`, the jobs will not be + monitored, meaning that their status will not be reported. + :param block: block execution until all non-database jobs are finished :param summary: print a launch summary prior to launch :param kill_on_interrupt: flag for killing jobs when ^C (SIGINT) signal is received. + :param monitor: monitor the jobs being started """ start_manifest = Manifest(*args) self._create_entity_dir(start_manifest) From 292a529fd606d9586104937c189937bc0ff349b4 Mon Sep 17 00:00:00 2001 From: balin Date: Thu, 9 Oct 2025 17:43:28 +0000 Subject: [PATCH 23/25] Fix format --- smartsim/experiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 37580ab6a1..d11e679dbe 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -231,7 +231,7 @@ def start( zombie processes will need to be manually killed. If `monitor=True`, all the jobs being started will be monitored - by the Controller.
If `monitor=False`, the jobs will not be + by the Controller. If `monitor=False`, the jobs will not be monitored, meaning that their status will not be reported. :param block: block execution until all non-database From 9826d08d8c853f70050307751b898554f6b68869 Mon Sep 17 00:00:00 2001 From: balin Date: Thu, 9 Oct 2025 18:11:22 +0000 Subject: [PATCH 24/25] Add monitor argument to start_wo_job_manager() --- tests/test_model.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_model.py b/tests/test_model.py index fe4a482b35..bb55365e68 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -94,7 +94,13 @@ def _monkeypatch_exp_controller(exp): entity_steps = [] def start_wo_job_manager( - self, exp_name, exp_path, manifest, block=True, kill_on_interrupt=True + self, + exp_name, + exp_path, + manifest, + block=True, + kill_on_interrupt=True, + monitor=True, ): self._launch(exp_name, exp_path, manifest) return LaunchedManifestBuilder("name", "path", "launcher").finalize() From c2c645bbfb94757b606e5bcb2e20dd3ab8554e2a Mon Sep 17 00:00:00 2001 From: balin Date: Thu, 9 Oct 2025 18:32:00 +0000 Subject: [PATCH 25/25] Add monitor argument to launch_step_nop --- tests/test_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_model.py b/tests/test_model.py index bb55365e68..33cd537b86 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -105,7 +105,7 @@ def start_wo_job_manager( self._launch(exp_name, exp_path, manifest) return LaunchedManifestBuilder("name", "path", "launcher").finalize() - def launch_step_nop(self, step, entity): + def launch_step_nop(self, step, entity, monitor): entity_steps.append((step, entity)) monkeypatch.setattr(