From 4b0e884ea246b0fb4261b9fa9e2cb486cd011034 Mon Sep 17 00:00:00 2001
From: balin <riccardo.balin@colorado.edu>
Date: Fri, 28 Oct 2022 14:05:03 +0000
Subject: [PATCH 01/25] First round of changes for Polaris. Needs debugging and
 testing.

---
 smartsim/_core/launcher/pbs/pbsLauncher.py  |   1 +
 smartsim/_core/launcher/step/mpiexecStep.py | 144 ++++++++++++++++++++
 smartsim/database/orchestrator.py           |   2 +-
 smartsim/settings/mpirunSettings.py         |  21 ++-
 4 files changed, 162 insertions(+), 6 deletions(-)
 create mode 100644 smartsim/_core/launcher/step/mpiexecStep.py

diff --git a/smartsim/_core/launcher/pbs/pbsLauncher.py b/smartsim/_core/launcher/pbs/pbsLauncher.py
index 8ee6d1d644..10a60f2f97 100644
--- a/smartsim/_core/launcher/pbs/pbsLauncher.py
+++ b/smartsim/_core/launcher/pbs/pbsLauncher.py
@@ -59,6 +59,7 @@ class PBSLauncher(WLMLauncher):
         QsubBatchSettings: QsubBatchStep,
         MpirunSettings: MpirunStep,
         RunSettings: LocalStep,
+        MpiexecSettings: MpiexecStep
     }
 
     def run(self, step):
diff --git a/smartsim/_core/launcher/step/mpiexecStep.py b/smartsim/_core/launcher/step/mpiexecStep.py
new file mode 100644
index 0000000000..9e6644c9bc
--- /dev/null
+++ b/smartsim/_core/launcher/step/mpiexecStep.py
@@ -0,0 +1,144 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2022, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import shutil
+from shlex import split as sh_split
+
+from ....error import AllocationError
+from ....log import get_logger
+from .step import Step
+
+logger = get_logger(__name__)
+
+
+class MpiexecStep(Step):
+    def __init__(self, name, cwd, run_settings):
+        """Initialize a OpenMPI mpiexec job step
+
+        :param name: name of the entity to be launched
+        :type name: str
+        :param cwd: path to launch dir
+        :type cwd: str
+        :param run_settings: run settings for entity
+        :type run_settings: RunSettings
+        """
+        super().__init__(name, cwd)
+        self.run_settings = run_settings
+        self.alloc = None
+        if not self.run_settings.in_batch:
+            self._set_alloc()
+
+    def get_launch_cmd(self):
+        """Get the command to launch this step
+
+        :return: launch command
+        :rtype: list[str]
+        """
+        mpiexec = self.run_settings.run_command
+        mpiexec_cmd = ["mpiexec", "--wdir", self.cwd]
+        # add env vars to mpiexec command
+        mpiexec_cmd.extend(self.run_settings.format_env_vars())
+
+        # add mpiexec settings to command
+        mpiexec_cmd.extend(self.run_settings.format_run_args())
+
+        if self.run_settings.colocated_db_settings:
+            # disable cpu binding as the entrypoint will set that
+            # for the application and database process now
+            mpiexec_cmd.extend(["--cpu-bind", "none"])
+
+            # Replace the command with the entrypoint wrapper script
+            bash = shutil.which("bash")
+            launch_script_path = self.get_colocated_launch_script()
+            mpiexec_cmd += [bash, launch_script_path]
+
+        mpiexec_cmd += self._build_exe()
+
+        # if its in a batch, redirect stdout to
+        # file in the cwd.
+        if self.run_settings.in_batch:
+            output = self.get_step_file(ending=".out")
+            mpiexec_cmd += [">", output]
+        return mpiexec_cmd
+
+    def _set_alloc(self):
+        """Set the id of the allocation
+
+        :raises AllocationError: allocation not listed or found
+        """
+        if "PBS_JOBID" in os.environ:  # cov-pbs
+            self.alloc = os.environ["PBS_JOBID"]
+            logger.debug(
+                f"Running on PBS allocation {self.alloc} gleaned from user environment"
+            )
+        elif "COBALT_JOBID" in os.environ:  # cov-cobalt
+            self.alloc = os.environ["COBALT_JOBID"]
+            logger.debug(
+                f"Running on Cobalt allocation {self.alloc} gleaned from user environment"
+            )
+        elif "SLURM_JOBID" in os.environ:  # cov-slurm
+            self.alloc = os.environ["SLURM_JOBID"]
+            logger.debug(
+                f"Running on Slurm allocation {self.alloc} gleaned from user environment"
+            )
+        elif "LSB_JOBID" in os.environ:  # cov-lsf
+            self.alloc = os.environ["LSB_JOBID"]
+            logger.debug(
+                f"Running on Slurm allocation {self.alloc} gleaned from user environment"
+            )
+        else:
+            raise AllocationError(
+                "No allocation specified or found and not running in batch"
+            )
+
+    def _build_exe(self):
+        """Build the executable for this step
+
+        :return: executable list
+        :rtype: list[str]
+        """
+        if self.run_settings.mpmd:
+            return self._make_mpmd()
+        else:
+            exe = self.run_settings.exe
+            args = self.run_settings.exe_args
+            return exe + args
+
+    def _make_mpmd(self):
+        """Build mpiexec (MPMD) executable"""
+        exe = self.run_settings.exe
+        args = self.run_settings.exe_args
+        cmd = exe + args
+        for mpmd in self.run_settings.mpmd:
+            cmd += [" : "]
+            cmd += mpmd.format_run_args()
+            cmd += mpmd.format_env_vars()
+            cmd += mpmd.exe
+            cmd += mpmd.exe_args
+
+        cmd = sh_split(" ".join(cmd))
+        return cmd
diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py
index 87b31a5eab..c7a5897898 100644
--- a/smartsim/database/orchestrator.py
+++ b/smartsim/database/orchestrator.py
@@ -102,7 +102,7 @@ def __init__(
 
         by_launcher = {
             "slurm": ["srun", "mpirun"],
-            "pbs": ["aprun", "mpirun"],
+            "pbs": ["aprun", "mpirun","mpiexec"],
             "cobalt": ["aprun", "mpirun"],
             "lsf": ["jsrun"],
             "local": [None],
diff --git a/smartsim/settings/mpirunSettings.py b/smartsim/settings/mpirunSettings.py
index 616a5685ab..175b1f72d2 100644
--- a/smartsim/settings/mpirunSettings.py
+++ b/smartsim/settings/mpirunSettings.py
@@ -117,17 +117,24 @@ def set_tasks_per_node(self, tasks_per_node):
         :param tasks_per_node: number of tasks to launch per node
         :type tasks_per_node: int
         """
-        self.run_args["npernode"] = int(tasks_per_node)
+        if (self.run_command=="mpirun"):
+            self.run_args["npernode"] = int(tasks_per_node)
+        elif (self.run_command=="mpiexec"):
+            self.run_args["ppn"] = int(tasks_per_node)
 
     def set_tasks(self, tasks):
         """Set the number of tasks for this job
 
-        This sets ``--n``
+        This sets ``--n`` for mpirun
+        and "--np" for mpiexec
 
         :param tasks: number of tasks
         :type tasks: int
         """
-        self.run_args["n"] = int(tasks)
+        if (self.run_command=="mpirun"):
+            self.run_args["n"] = int(tasks)
+        elif (self.run_command=="mpiexec"):
+            self.run_args["np"] = int(tasks)
 
     def set_hostlist(self, host_list):
         """Set the hostlist for the ``mpirun`` command
@@ -235,13 +242,17 @@ def format_env_vars(self):
         :rtype: list[str]
         """
         formatted = []
+        if (self.run_command=="mpirun"):
+           env_string = "-x"
+        elif (self.run_command=="mpiexec"):
+           env_string = "--env"
 
         if self.env_vars:
             for name, value in self.env_vars.items():
                 if value:
-                    formatted += ["-x", "=".join((name, str(value)))]
+                    formatted += [env_string, "=".join((name, str(value)))]
                 else:
-                    formatted += ["-x", name]
+                    formatted += [env_string, name]
         return formatted
 
 

From 012ecf04f1e0432677f04273db0151bad4bf6934 Mon Sep 17 00:00:00 2001
From: balin <riccardo.balin@colorado.edu>
Date: Fri, 28 Oct 2022 15:07:39 +0000
Subject: [PATCH 02/25] Fixed bugs. Clustered and co-located DB tests run on
 Polaris.

---
 smartsim/_core/launcher/pbs/pbsLauncher.py |  2 +-
 smartsim/_core/launcher/step/__init__.py   |  1 +
 smartsim/settings/mpirunSettings.py        | 12 ++++++------
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/smartsim/_core/launcher/pbs/pbsLauncher.py b/smartsim/_core/launcher/pbs/pbsLauncher.py
index 10a60f2f97..a72e15fd4e 100644
--- a/smartsim/_core/launcher/pbs/pbsLauncher.py
+++ b/smartsim/_core/launcher/pbs/pbsLauncher.py
@@ -32,7 +32,7 @@
 from ....status import STATUS_CANCELLED, STATUS_COMPLETED
 from ...config import CONFIG
 from ..launcher import WLMLauncher
-from ..step import AprunStep, LocalStep, MpirunStep, QsubBatchStep
+from ..step import AprunStep, LocalStep, MpirunStep, QsubBatchStep, MpiexecStep
 from ..stepInfo import PBSStepInfo
 from .pbsCommands import qdel, qstat
 from .pbsParser import parse_qstat_jobid, parse_step_id_from_qstat
diff --git a/smartsim/_core/launcher/step/__init__.py b/smartsim/_core/launcher/step/__init__.py
index 2813e36c1e..ada146e9f6 100644
--- a/smartsim/_core/launcher/step/__init__.py
+++ b/smartsim/_core/launcher/step/__init__.py
@@ -3,5 +3,6 @@
 from .localStep import LocalStep
 from .lsfStep import BsubBatchStep, JsrunStep
 from .mpirunStep import MpirunStep
+from .mpiexecStep import MpiexecStep
 from .pbsStep import QsubBatchStep
 from .slurmStep import SbatchStep, SrunStep
diff --git a/smartsim/settings/mpirunSettings.py b/smartsim/settings/mpirunSettings.py
index 175b1f72d2..560a9a2336 100644
--- a/smartsim/settings/mpirunSettings.py
+++ b/smartsim/settings/mpirunSettings.py
@@ -117,9 +117,9 @@ def set_tasks_per_node(self, tasks_per_node):
         :param tasks_per_node: number of tasks to launch per node
         :type tasks_per_node: int
         """
-        if (self.run_command=="mpirun"):
+        if "mpirun" in self.run_command:
             self.run_args["npernode"] = int(tasks_per_node)
-        elif (self.run_command=="mpiexec"):
+        elif "mpiexec" in self.run_command:
             self.run_args["ppn"] = int(tasks_per_node)
 
     def set_tasks(self, tasks):
@@ -131,9 +131,9 @@ def set_tasks(self, tasks):
         :param tasks: number of tasks
         :type tasks: int
         """
-        if (self.run_command=="mpirun"):
+        if "mpirun" in self.run_command:
             self.run_args["n"] = int(tasks)
-        elif (self.run_command=="mpiexec"):
+        elif "mpiexec" in self.run_command:
             self.run_args["np"] = int(tasks)
 
     def set_hostlist(self, host_list):
@@ -242,9 +242,9 @@ def format_env_vars(self):
         :rtype: list[str]
         """
         formatted = []
-        if (self.run_command=="mpirun"):
+        if "mpirun" in self.run_command:
            env_string = "-x"
-        elif (self.run_command=="mpiexec"):
+        elif "mpiexec" in self.run_command:
            env_string = "--env"
 
         if self.env_vars:

From b0ecaca05a87f3be4143d71e619e56d987c92ef3 Mon Sep 17 00:00:00 2001
From: balin <riccardo.balin@colorado.edu>
Date: Wed, 2 Nov 2022 17:09:19 +0000
Subject: [PATCH 03/25] Added cpu binding options with mpiexec

---
 smartsim/_core/launcher/step/mpiexecStep.py |  2 +-
 smartsim/settings/mpirunSettings.py         | 22 +++++++++++++++++++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/smartsim/_core/launcher/step/mpiexecStep.py b/smartsim/_core/launcher/step/mpiexecStep.py
index 9e6644c9bc..ae058e0270 100644
--- a/smartsim/_core/launcher/step/mpiexecStep.py
+++ b/smartsim/_core/launcher/step/mpiexecStep.py
@@ -69,7 +69,7 @@ def get_launch_cmd(self):
         if self.run_settings.colocated_db_settings:
             # disable cpu binding as the entrypoint will set that
             # for the application and database process now
-            mpiexec_cmd.extend(["--cpu-bind", "none"])
+            # mpiexec_cmd.extend(["--cpu-bind", "none"])
 
             # Replace the command with the entrypoint wrapper script
             bash = shutil.which("bash")
diff --git a/smartsim/settings/mpirunSettings.py b/smartsim/settings/mpirunSettings.py
index 560a9a2336..5efdef9cfe 100644
--- a/smartsim/settings/mpirunSettings.py
+++ b/smartsim/settings/mpirunSettings.py
@@ -101,7 +101,8 @@ def set_task_map(self, task_mapping):
     def set_cpus_per_task(self, cpus_per_task):
         """Set the number of tasks for this job
 
-        This sets ``--cpus-per-proc``
+        This sets ``--cpus-per-proc`` for mpirun
+        end ``--depth`` for mpiexec
 
         note: this option has been deprecated in openMPI 4.0+
         and will soon be replaced.
@@ -109,7 +110,24 @@ def set_cpus_per_task(self, cpus_per_task):
         :param cpus_per_task: number of tasks
         :type cpus_per_task: int
         """
-        self.run_args["cpus-per-proc"] = int(cpus_per_task)
+        if "mpirun" in self.run_command:
+            self.run_args["cpus-per-proc"] = int(cpus_per_task)
+        elif "mpiexec" in self.run_command:
+            self.run_args["depth"] = int(cpus_per_task)
+
+    def set_cpu_binding_type(self, bind_type):
+        """Specifies the cores to which MPI processes are bound
+
+        This sets ``--bind-to`` for mpirun
+        and ``--cpu-bind`` for mpiexec
+
+        :param bind_type: binding type 
+        :type bind_type: str
+        """
+        if "mpirun" in self.run_command:
+            self.run_args["bind-to"] = str(bind_type)
+        elif "mpiexec" in self.run_command:
+            self.run_args["cpu-bind"] = str(bind_type)
 
     def set_tasks_per_node(self, tasks_per_node):
         """Set the number of tasks per node

From 5d2f82d3688b711c134b1120c5a4734906aa0ef5 Mon Sep 17 00:00:00 2001
From: balin <riccardo.balin@colorado.edu>
Date: Wed, 21 Dec 2022 18:07:21 +0000
Subject: [PATCH 04/25] Correction to flag setting number of tasks for
 PalsMpiexecSettings

---
 smartsim/settings/palsSettings.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py
index 0bb17e6434..8282ad1077 100644
--- a/smartsim/settings/palsSettings.py
+++ b/smartsim/settings/palsSettings.py
@@ -127,6 +127,14 @@ def set_cpu_binding_type(self, bind_type):
         """
         self.run_args["cpu-bind"] = str(bind_type)
 
+    def set_tasks(self, tasks):
+        """Set the number of tasks
+
+        :param tasks: number of total tasks to launch
+        :type tasks: int
+        """
+        self.run_args["np"] = int(tasks)
+
     def set_tasks_per_node(self, tasks_per_node):
         """Set the number of tasks per node
 

From dd67114b7a63438ae066e6c124bf05b43527a488 Mon Sep 17 00:00:00 2001
From: balin <riccardo.balin@colorado.edu>
Date: Wed, 21 Dec 2022 20:34:56 +0000
Subject: [PATCH 05/25] Removed mpiexecStep.py, no longer needed after merge
 with upstream SmartSim develop

---
 smartsim/_core/launcher/step/mpiexecStep.py | 144 --------------------
 1 file changed, 144 deletions(-)
 delete mode 100644 smartsim/_core/launcher/step/mpiexecStep.py

diff --git a/smartsim/_core/launcher/step/mpiexecStep.py b/smartsim/_core/launcher/step/mpiexecStep.py
deleted file mode 100644
index ae058e0270..0000000000
--- a/smartsim/_core/launcher/step/mpiexecStep.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# BSD 2-Clause License
-#
-# Copyright (c) 2021-2022, Hewlett Packard Enterprise
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import os
-import shutil
-from shlex import split as sh_split
-
-from ....error import AllocationError
-from ....log import get_logger
-from .step import Step
-
-logger = get_logger(__name__)
-
-
-class MpiexecStep(Step):
-    def __init__(self, name, cwd, run_settings):
-        """Initialize a OpenMPI mpiexec job step
-
-        :param name: name of the entity to be launched
-        :type name: str
-        :param cwd: path to launch dir
-        :type cwd: str
-        :param run_settings: run settings for entity
-        :type run_settings: RunSettings
-        """
-        super().__init__(name, cwd)
-        self.run_settings = run_settings
-        self.alloc = None
-        if not self.run_settings.in_batch:
-            self._set_alloc()
-
-    def get_launch_cmd(self):
-        """Get the command to launch this step
-
-        :return: launch command
-        :rtype: list[str]
-        """
-        mpiexec = self.run_settings.run_command
-        mpiexec_cmd = ["mpiexec", "--wdir", self.cwd]
-        # add env vars to mpiexec command
-        mpiexec_cmd.extend(self.run_settings.format_env_vars())
-
-        # add mpiexec settings to command
-        mpiexec_cmd.extend(self.run_settings.format_run_args())
-
-        if self.run_settings.colocated_db_settings:
-            # disable cpu binding as the entrypoint will set that
-            # for the application and database process now
-            # mpiexec_cmd.extend(["--cpu-bind", "none"])
-
-            # Replace the command with the entrypoint wrapper script
-            bash = shutil.which("bash")
-            launch_script_path = self.get_colocated_launch_script()
-            mpiexec_cmd += [bash, launch_script_path]
-
-        mpiexec_cmd += self._build_exe()
-
-        # if its in a batch, redirect stdout to
-        # file in the cwd.
-        if self.run_settings.in_batch:
-            output = self.get_step_file(ending=".out")
-            mpiexec_cmd += [">", output]
-        return mpiexec_cmd
-
-    def _set_alloc(self):
-        """Set the id of the allocation
-
-        :raises AllocationError: allocation not listed or found
-        """
-        if "PBS_JOBID" in os.environ:  # cov-pbs
-            self.alloc = os.environ["PBS_JOBID"]
-            logger.debug(
-                f"Running on PBS allocation {self.alloc} gleaned from user environment"
-            )
-        elif "COBALT_JOBID" in os.environ:  # cov-cobalt
-            self.alloc = os.environ["COBALT_JOBID"]
-            logger.debug(
-                f"Running on Cobalt allocation {self.alloc} gleaned from user environment"
-            )
-        elif "SLURM_JOBID" in os.environ:  # cov-slurm
-            self.alloc = os.environ["SLURM_JOBID"]
-            logger.debug(
-                f"Running on Slurm allocation {self.alloc} gleaned from user environment"
-            )
-        elif "LSB_JOBID" in os.environ:  # cov-lsf
-            self.alloc = os.environ["LSB_JOBID"]
-            logger.debug(
-                f"Running on Slurm allocation {self.alloc} gleaned from user environment"
-            )
-        else:
-            raise AllocationError(
-                "No allocation specified or found and not running in batch"
-            )
-
-    def _build_exe(self):
-        """Build the executable for this step
-
-        :return: executable list
-        :rtype: list[str]
-        """
-        if self.run_settings.mpmd:
-            return self._make_mpmd()
-        else:
-            exe = self.run_settings.exe
-            args = self.run_settings.exe_args
-            return exe + args
-
-    def _make_mpmd(self):
-        """Build mpiexec (MPMD) executable"""
-        exe = self.run_settings.exe
-        args = self.run_settings.exe_args
-        cmd = exe + args
-        for mpmd in self.run_settings.mpmd:
-            cmd += [" : "]
-            cmd += mpmd.format_run_args()
-            cmd += mpmd.format_env_vars()
-            cmd += mpmd.exe
-            cmd += mpmd.exe_args
-
-        cmd = sh_split(" ".join(cmd))
-        return cmd

From 52ff300bab1e8930fee16f56669d564c227ab622 Mon Sep 17 00:00:00 2001
From: balin <riccardo.balin@colorado.edu>
Date: Wed, 3 May 2023 15:09:03 +0000
Subject: [PATCH 06/25] Added option to specify affinity script to PALS mpiexec
 settings. Needed when sharing GPU between different applications running with
 the co-located database deployment.

---
 smartsim/settings/mpiSettings.py  |  1 +
 smartsim/settings/palsSettings.py | 12 ++++++++++++
 2 files changed, 13 insertions(+)

diff --git a/smartsim/settings/mpiSettings.py b/smartsim/settings/mpiSettings.py
index 41b6b854c0..398960a819 100644
--- a/smartsim/settings/mpiSettings.py
+++ b/smartsim/settings/mpiSettings.py
@@ -78,6 +78,7 @@ def __init__(
             **kwargs,
         )
         self.mpmd = []
+        self.affinity_script = None
 
         if not shutil.which(self._run_command):
             msg = (
diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py
index c556cec625..ff60adf5c0 100644
--- a/smartsim/settings/palsSettings.py
+++ b/smartsim/settings/palsSettings.py
@@ -185,6 +185,14 @@ def set_walltime(self, walltime):
         """
         logger.warning("set_walltime not supported under PALS")
 
+    def set_gpu_affinity_script(self, affinity):
+        """ Set the GPU affinity through a bash script
+
+        :param affinity: path to the affinity script
+        :type affinity: str
+        """
+        self.affinity_script = str(affinity) 
+
     def format_run_args(self):
         """Return a list of MPI-standard formatted run arguments
 
@@ -202,6 +210,10 @@ def format_run_args(self):
                     args += [prefix + opt]
                 else:
                     args += [prefix + opt, str(value)]
+
+        if self.affinity_script:
+            args += [self.affinity_script]
+   
         return args
 
     def format_env_vars(self):

From f024ec4683c85b6874bb262b4ab6979b975d2d69 Mon Sep 17 00:00:00 2001
From: balin <riccardo.balin@colorado.edu>
Date: Wed, 30 Aug 2023 20:17:42 +0000
Subject: [PATCH 07/25] Modified affinity script setting to include optional
 arguments

---
 smartsim/settings/palsSettings.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py
index ff60adf5c0..3fb989d7cf 100644
--- a/smartsim/settings/palsSettings.py
+++ b/smartsim/settings/palsSettings.py
@@ -185,13 +185,15 @@ def set_walltime(self, walltime):
         """
         logger.warning("set_walltime not supported under PALS")
 
-    def set_gpu_affinity_script(self, affinity):
+    def set_gpu_affinity_script(self, affinity, *args):
         """ Set the GPU affinity through a bash script
 
         :param affinity: path to the affinity script
         :type affinity: str
         """
-        self.affinity_script = str(affinity) 
+        self.affinity_script = [str(affinity)]
+        for arg in args:
+            self.affinity_script.append(str(arg)) 
 
     def format_run_args(self):
         """Return a list of MPI-standard formatted run arguments
@@ -211,9 +213,10 @@ def format_run_args(self):
                 else:
                     args += [prefix + opt, str(value)]
 
-        if self.affinity_script:
-            args += [self.affinity_script]
-   
+        if self.affinity_script is not None:
+            #args += [self.affinity_script,str(1),str(2)]
+            args += self.affinity_script
+
         return args
 
     def format_env_vars(self):

From f0fcf5c6f1f3122f1085997547f5db7dfd3b30e3 Mon Sep 17 00:00:00 2001
From: balin <riccardo.balin@colorado.edu>
Date: Mon, 16 Oct 2023 15:50:36 +0000
Subject: [PATCH 08/25] Updated affinity script changes to have type defs and
 hints

---
 smartsim/settings/mpiSettings.py  | 2 +-
 smartsim/settings/palsSettings.py | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/smartsim/settings/mpiSettings.py b/smartsim/settings/mpiSettings.py
index a1d3181410..5b6b520e3d 100644
--- a/smartsim/settings/mpiSettings.py
+++ b/smartsim/settings/mpiSettings.py
@@ -81,7 +81,7 @@ def __init__(
             **kwargs,
         )
         self.mpmd: t.List[RunSettings] = []
-        self.affinity_script = None
+        self.affinity_script: t.List[str] = []
 
         if not shutil.which(self._run_command):
             msg = (
diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py
index c93d809cf6..1b10559760 100644
--- a/smartsim/settings/palsSettings.py
+++ b/smartsim/settings/palsSettings.py
@@ -182,15 +182,15 @@ def set_walltime(self, walltime: str) -> None:
         """
         logger.warning("set_walltime not supported under PALS")
 
-    def set_gpu_affinity_script(self, affinity, *args):
+    def set_gpu_affinity_script(self, affinity: str, *args: t.Any) -> None:
         """ Set the GPU affinity through a bash script
 
         :param affinity: path to the affinity script
         :type affinity: str
         """
-        self.affinity_script = [str(affinity)]
+        self.affinity_script.append(str(affinity))
         for arg in args:
-            self.affinity_script.append(str(arg)) 
+            self.affinity_script.append(str(arg))
 
     def format_run_args(self) -> t.List[str]:
         """Return a list of MPI-standard formatted run arguments
@@ -210,8 +210,7 @@ def format_run_args(self) -> t.List[str]:
                 else:
                     args += [prefix + opt, str(value)]
 
-        if self.affinity_script is not None:
-            #args += [self.affinity_script,str(1),str(2)]
+        if self.affinity_script:
             args += self.affinity_script
 
         return args

From 315009d6def4aff38aaf2a49f472955be57016e5 Mon Sep 17 00:00:00 2001
From: balin <riccardo.balin@colorado.edu>
Date: Wed, 18 Oct 2023 16:00:18 +0000
Subject: [PATCH 09/25] Added test for Pals affinity script option

---
 tests/test_pals_settings.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py
index 7bc3a65206..4c837149dd 100644
--- a/tests/test_pals_settings.py
+++ b/tests/test_pals_settings.py
@@ -54,6 +54,11 @@
 #    with pytest.raises(SSUnsupportedError):
 #        func(None)
 
+def test_affinity_script():
+    settings = PalsMpiexecSettings(default_exe, **default_kwargs)
+    settings.set_gpu_affinity_script("/path/to/set_affinity_gpu.sh", 1, 2)
+    assert settings.format_run_args() == ["/path/to/set_affinity_gpu.sh", "1", "2"]
+
 
 def test_cpu_binding_type():
     settings = PalsMpiexecSettings(default_exe, **default_kwargs)

From 8df7eadf485a50bbfef9bd4fb0a5f9f50faaa910 Mon Sep 17 00:00:00 2001
From: Riccardo Balin <balin@x4420c5s1b0n0.hostmgmt.cm.aurora.alcf.anl.gov>
Date: Wed, 21 Feb 2024 15:33:06 +0000
Subject: [PATCH 10/25] Modified buildenv.py to use my fork of RedisAI, which
 updates to the C++17 standard for Torch

---
 smartsim/_core/_install/buildenv.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py
index c100ac80e6..ea43ac41eb 100644
--- a/smartsim/_core/_install/buildenv.py
+++ b/smartsim/_core/_install/buildenv.py
@@ -282,9 +282,9 @@ class Versioner:
     # RedisAI
     REDISAI = RedisAIVersion(get_env("SMARTSIM_REDISAI", "1.2.7"))
     REDISAI_URL = get_env(
-        "SMARTSIM_REDISAI_URL", "https://github.com/RedisAI/RedisAI.git/"
+        "SMARTSIM_REDISAI_URL", "https://github.com/rickybalin/RedisAI.git/"
     )
-    REDISAI_BRANCH = get_env("SMARTSIM_REDISAI_BRANCH", f"v{REDISAI}")
+    REDISAI_BRANCH = get_env("SMARTSIM_REDISAI_BRANCH", f"v{REDISAI}.rb")
 
     # ML/DL (based on RedisAI version defaults)
     # torch can be set by the user because we download that for them

From 6e56a70175bbbe82ed4f7ad676a22f004adb69de Mon Sep 17 00:00:00 2001
From: rickybalin <riccardo.balin@colorado.edu>
Date: Mon, 10 Jun 2024 14:35:02 -0400
Subject: [PATCH 11/25] Synced with SmartSim develop branch

---
 smartsim/_core/_install/buildenv.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py
index edb1ff116e..847dac30c4 100644
--- a/smartsim/_core/_install/buildenv.py
+++ b/smartsim/_core/_install/buildenv.py
@@ -263,9 +263,9 @@ class Versioner:
     # RedisAI
     REDISAI = RedisAIVersion(get_env("SMARTSIM_REDISAI", "1.2.7"))
     REDISAI_URL = get_env(
-        "SMARTSIM_REDISAI_URL", "https://github.com/RedisAI/RedisAI.git/"
+        "SMARTSIM_REDISAI_URL", "https://github.com/rickybalin/RedisAI.git/"
     )
-    REDISAI_BRANCH = get_env("SMARTSIM_REDISAI_BRANCH", f"v{REDISAI}")
+    REDISAI_BRANCH = get_env("SMARTSIM_REDISAI_BRANCH", f"v{REDISAI}.rb")
 
     # ML/DL (based on RedisAI version defaults)
     # torch can be set by the user because we download that for them

From 6217a26a924118b92709619e374b69b5e3003a07 Mon Sep 17 00:00:00 2001
From: Riccardo Balin <balin@aurora-uan-0011.hostmgmt.cm.aurora.alcf.anl.gov>
Date: Mon, 21 Oct 2024 21:01:26 +0000
Subject: [PATCH 12/25] Added feature to pals settings to add any mpiexec
 argument

---
 smartsim/settings/palsSettings.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py
index 4100e8efeb..3657903668 100644
--- a/smartsim/settings/palsSettings.py
+++ b/smartsim/settings/palsSettings.py
@@ -158,6 +158,14 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None:
             )
         self.run_args["transfer"] = None
 
+    def set_launcher_args(self, arguments: dict) -> None:
+        """Set any other task launcher argument
+
+        :param arguments: dictionary with string name and value
+        """
+        for name, value in arguments.items():
+            self.run_args[name] = value
+
     def set_walltime(self, walltime: str) -> None:
         """Set the maximum number of seconds that a job will run
 

From 69d2ef288f1d4cf4c6a532726a4ee9e5f90b18ac Mon Sep 17 00:00:00 2001
From: Riccardo Balin <balin@aurora-uan-0009.hostmgmt.cm.aurora.alcf.anl.gov>
Date: Fri, 28 Feb 2025 22:04:32 +0000
Subject: [PATCH 13/25] Add a monitor flag to experiment start to select
 which jobs to monitor and which not to

---
 smartsim/_core/control/controller.py | 12 +++++++-----
 smartsim/_core/control/jobmanager.py |  6 ++++++
 smartsim/experiment.py               |  2 ++
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py
index 43a2185455..d163851b93 100644
--- a/smartsim/_core/control/controller.py
+++ b/smartsim/_core/control/controller.py
@@ -115,6 +115,7 @@ def start(
         manifest: Manifest,
         block: bool = True,
         kill_on_interrupt: bool = True,
+        monitor: bool = True,
     ) -> None:
         """Start the passed SmartSim entities
 
@@ -134,7 +135,7 @@ def start(
         SignalInterceptionStack.get(signal.SIGINT).push_unique(
             self._jobs.signal_interrupt
         )
-        launched = self._launch(exp_name, exp_path, manifest)
+        launched = self._launch(exp_name, exp_path, manifest, monitor)
 
         # start the job manager thread if not already started
         if not self._jobs.actively_monitoring:
@@ -172,7 +173,7 @@ def poll(
         :param kill_on_interrupt: flag for killing jobs when SIGINT is received
         """
         self._jobs.kill_on_interrupt = kill_on_interrupt
-        to_monitor = self._jobs.jobs
+        to_monitor = self._jobs.monitor_jobs
         while len(to_monitor) > 0:
             time.sleep(interval)
 
@@ -388,7 +389,7 @@ def symlink_output_files(
             )
 
     def _launch(
-        self, exp_name: str, exp_path: str, manifest: Manifest
+        self, exp_name: str, exp_path: str, manifest: Manifest, monitor: bool = True
     ) -> LaunchedManifest[t.Tuple[str, Step]]:
         """Main launching function of the controller
 
@@ -479,7 +480,7 @@ def _launch(
 
         # launch and symlink steps
         for step, entity in steps:
-            self._launch_step(step, entity)
+            self._launch_step(step, entity, monitor)
             self.symlink_output_files(step, entity)
 
         # symlink substeps to maintain directory structure
@@ -570,6 +571,7 @@ def _launch_step(
         self,
         job_step: Step,
         entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]],
+        monitor: bool = True,
     ) -> None:
         """Use the launcher to launch a job step
 
@@ -622,7 +624,7 @@ def _launch_step(
             self._jobs.restart_job(job_step.name, job_id, entity.name, is_task)
         else:
             logger.debug(f"Launching {entity.name}")
-            self._jobs.add_job(job_step.name, job_id, entity, is_task)
+            self._jobs.add_job(job_step.name, job_id, entity, is_task, monitor)
 
     def _create_batch_job_step(
         self,
diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py
index 1bc24cf9af..67f52466bf 100644
--- a/smartsim/_core/control/jobmanager.py
+++ b/smartsim/_core/control/jobmanager.py
@@ -66,6 +66,7 @@ def __init__(self, lock: RLock, launcher: t.Optional[Launcher] = None) -> None:
 
         # active jobs
         self.jobs: t.Dict[str, Job] = {}
+        self.monitor_jobs: t.Dict[str, Job] = {}
         self.db_jobs: t.Dict[str, Job] = {}
 
         # completed jobs
@@ -133,6 +134,8 @@ def move_to_completed(self, job: Job) -> None:
                 del self.db_jobs[job.ename]
             elif job.ename in self.jobs:
                 del self.jobs[job.ename]
+                if job.ename in self.monitor_jobs:
+                    del self.monitor_jobs[job.ename]
 
     def __getitem__(self, entity_name: str) -> Job:
         """Return the job associated with the name of the entity
@@ -166,6 +169,7 @@ def add_job(
         job_id: t.Optional[str],
         entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity], JobEntity],
         is_task: bool = True,
+        monitor: bool = True,
     ) -> None:
         """Add a job to the job manager which holds specific jobs by type.
 
@@ -183,6 +187,8 @@ def add_job(
             self.db_jobs[entity.name] = job
         else:
             self.jobs[entity.name] = job
+            if monitor:
+                self.monitor_jobs[entity.name] = job
 
     def is_finished(self, entity: SmartSimEntity) -> bool:
         """Detect if a job has completed
diff --git a/smartsim/experiment.py b/smartsim/experiment.py
index 6b9d6a1fb6..ba3e3997b9 100644
--- a/smartsim/experiment.py
+++ b/smartsim/experiment.py
@@ -189,6 +189,7 @@ def start(
         block: bool = True,
         summary: bool = False,
         kill_on_interrupt: bool = True,
+        monitor: bool = True,
     ) -> None:
         """Start passed instances using Experiment launcher
 
@@ -246,6 +247,7 @@ def start(
                 manifest=start_manifest,
                 block=block,
                 kill_on_interrupt=kill_on_interrupt,
+                monitor=monitor,
             )
         except SmartSimError as e:
             logger.error(e)

From d981269ef1f5ac8e7da87e00199a40922364b98a Mon Sep 17 00:00:00 2001
From: Riccardo Balin <balin@aurora-uan-0011.hostmgmt.cm.aurora.alcf.anl.gov>
Date: Wed, 2 Jul 2025 21:13:27 +0000
Subject: [PATCH 14/25] Clean up

---
 smartsim/_core/_install/buildenv.py | 4 ++--
 tests/test_pals_settings.py         | 6 ------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py
index 847dac30c4..edb1ff116e 100644
--- a/smartsim/_core/_install/buildenv.py
+++ b/smartsim/_core/_install/buildenv.py
@@ -263,9 +263,9 @@ class Versioner:
     # RedisAI
     REDISAI = RedisAIVersion(get_env("SMARTSIM_REDISAI", "1.2.7"))
     REDISAI_URL = get_env(
-        "SMARTSIM_REDISAI_URL", "https://github.com/rickybalin/RedisAI.git/"
+        "SMARTSIM_REDISAI_URL", "https://github.com/RedisAI/RedisAI.git/"
     )
-    REDISAI_BRANCH = get_env("SMARTSIM_REDISAI_BRANCH", f"v{REDISAI}.rb")
+    REDISAI_BRANCH = get_env("SMARTSIM_REDISAI_BRANCH", f"v{REDISAI}")
 
     # ML/DL (based on RedisAI version defaults)
     # torch can be set by the user because we download that for them
diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py
index 81de181062..6ec4e8156e 100644
--- a/tests/test_pals_settings.py
+++ b/tests/test_pals_settings.py
@@ -72,12 +72,6 @@ def test_affinity_script():
     assert settings.format_run_args() == ["/path/to/set_affinity_gpu.sh", "1", "2"]
 
 
-def test_affinity_script():
-    settings = PalsMpiexecSettings(default_exe, **default_kwargs)
-    settings.set_gpu_affinity_script("/path/to/set_affinity_gpu.sh", 1, 2)
-    assert settings.format_run_args() == ["/path/to/set_affinity_gpu.sh", "1", "2"]
-
-
 def test_cpu_binding_type():
     settings = PalsMpiexecSettings(default_exe, **default_kwargs)
     settings.set_cpu_binding_type("numa")

From f4770301e5a965e4fae736f81173f402b6731d85 Mon Sep 17 00:00:00 2001
From: Riccardo Balin <balin@aurora-uan-0011.hostmgmt.cm.aurora.alcf.anl.gov>
Date: Wed, 2 Jul 2025 21:44:05 +0000
Subject: [PATCH 15/25] Update docstrings and add test for set_launcher_args()
 in PALS settings

---
 smartsim/_core/control/controller.py | 2 ++
 smartsim/_core/control/jobmanager.py | 1 +
 tests/test_pals_settings.py          | 9 +++++++++
 3 files changed, 12 insertions(+)

diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py
index d163851b93..a22357b585 100644
--- a/smartsim/_core/control/controller.py
+++ b/smartsim/_core/control/controller.py
@@ -399,6 +399,7 @@ def _launch(
         :param exp_name: The name of the launching experiment
         :param exp_path: path to location of ``Experiment`` directory if generated
         :param manifest: Manifest of deployables to launch
+        :param monitor: boolean to signal whether to monitor deployables
         """
 
         manifest_builder = LaunchedManifestBuilder[t.Tuple[str, Step]](
@@ -577,6 +578,7 @@ def _launch_step(
 
         :param job_step: a job step instance
         :param entity: entity instance
+        :param monitor: boolean determining whether to monitor job
         :raises SmartSimError: if launch fails
         """
         # attempt to retrieve entity name in JobManager.completed
diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py
index 67f52466bf..01c805433f 100644
--- a/smartsim/_core/control/jobmanager.py
+++ b/smartsim/_core/control/jobmanager.py
@@ -176,6 +176,7 @@ def add_job(
         :param job_name: name of the job step
         :param job_id: job step id created by launcher
         :param entity: entity that was launched on job step
+        :param monitor: if True, also track the job in ``monitor_jobs``
         :param is_task: process monitored by TaskManager (True) or the WLM (True)
         """
         launcher = str(self._launcher)
diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py
index 6ec4e8156e..91cac1614d 100644
--- a/tests/test_pals_settings.py
+++ b/tests/test_pals_settings.py
@@ -66,6 +66,15 @@ def turn_off_telemetry_indirect(monkeypatch):
 #    with pytest.raises(SSUnsupportedError):
 #        func(None)
 
+
+def test_set_launcher_args():
+    settings = PalsMpiexecSettings(default_exe, **default_kwargs)
+    settings.set_launcher_args(
+        {"mem-bind": "none", "line-buffer": ""}
+    )
+    assert settings.format_run_args() == ["--mem-bind", "none", "--line-buffer"]
+
+
 def test_affinity_script():
     settings = PalsMpiexecSettings(default_exe, **default_kwargs)
     settings.set_gpu_affinity_script("/path/to/set_affinity_gpu.sh", 1, 2)

From 6327338d7172ef575fba5990c79ccc050ce3605c Mon Sep 17 00:00:00 2001
From: Riccardo Balin <balin@aurora-uan-0011.hostmgmt.cm.aurora.alcf.anl.gov>
Date: Wed, 2 Jul 2025 22:03:59 +0000
Subject: [PATCH 16/25] Fix type

---
 smartsim/settings/palsSettings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py
index 3657903668..4889926c13 100644
--- a/smartsim/settings/palsSettings.py
+++ b/smartsim/settings/palsSettings.py
@@ -158,7 +158,7 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None:
             )
         self.run_args["transfer"] = None
 
-    def set_launcher_args(self, arguments: dict) -> None:
+    def set_launcher_args(self, arguments: t.Dict[str, t.Union[int, str, float, None]]]) -> None:
         """Set any other task launcher argument
 
         :param arguments: dictionary with string name and value

From 7426dbc99b339c91fd304e3c5de3ef4e7951b3e4 Mon Sep 17 00:00:00 2001
From: Riccardo Balin <balin@aurora-uan-0011.hostmgmt.cm.aurora.alcf.anl.gov>
Date: Wed, 2 Jul 2025 22:30:42 +0000
Subject: [PATCH 17/25] Fix typo

---
 smartsim/settings/palsSettings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py
index 4889926c13..67c3c1c8a2 100644
--- a/smartsim/settings/palsSettings.py
+++ b/smartsim/settings/palsSettings.py
@@ -158,7 +158,7 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None:
             )
         self.run_args["transfer"] = None
 
-    def set_launcher_args(self, arguments: t.Dict[str, t.Union[int, str, float, None]]]) -> None:
+    def set_launcher_args(self, arguments: t.Dict[str, t.Union[int, str, float, None]]) -> None:
         """Set any other task launcher argument
 
         :param arguments: dictionary with string name and value

From 058d0aad267be4a683f88b740cc4f36f91c2c24c Mon Sep 17 00:00:00 2001
From: Riccardo Balin <balin@aurora-uan-0011.hostmgmt.cm.aurora.alcf.anl.gov>
Date: Wed, 2 Jul 2025 22:38:59 +0000
Subject: [PATCH 18/25] Fix line length error

---
 smartsim/settings/palsSettings.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py
index 67c3c1c8a2..f7ab90bd3c 100644
--- a/smartsim/settings/palsSettings.py
+++ b/smartsim/settings/palsSettings.py
@@ -158,7 +158,10 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None:
             )
         self.run_args["transfer"] = None
 
-    def set_launcher_args(self, arguments: t.Dict[str, t.Union[int, str, float, None]]) -> None:
+    def set_launcher_args(
+        self, 
+        arguments: t.Dict[str, t.Union[int, str, float, None]]
+    ) -> None:
         """Set any other task launcher argument
 
         :param arguments: dictionary with string name and value

From 93d79d90b598369e17b93cb5a41f33d6c9b3eaea Mon Sep 17 00:00:00 2001
From: Riccardo Balin <balin@aurora-uan-0011.hostmgmt.cm.aurora.alcf.anl.gov>
Date: Wed, 2 Jul 2025 22:53:33 +0000
Subject: [PATCH 19/25] Formatting changes from make style

---
 smartsim/_core/_cli/validate.py                | 2 +-
 smartsim/_core/control/manifest.py             | 2 +-
 smartsim/_core/control/previewrenderer.py      | 6 ++++--
 smartsim/_core/entrypoints/dragon.py           | 6 ++++--
 smartsim/_core/entrypoints/redis.py            | 6 ++++--
 smartsim/_core/entrypoints/telemetrymonitor.py | 2 +-
 smartsim/settings/palsSettings.py              | 3 +--
 tests/install/test_builder.py                  | 6 ++++--
 tests/test_dbnode.py                           | 8 ++++++--
 tests/test_dragon_backend.py                   | 6 ++++--
 tests/test_pals_settings.py                    | 4 +---
 11 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py
index 96d46d6ee0..96e995a132 100644
--- a/smartsim/_core/_cli/validate.py
+++ b/smartsim/_core/_cli/validate.py
@@ -172,7 +172,7 @@ def test_install(
 
 @contextlib.contextmanager
 def _env_vars_set_to(
-    evars: t.Mapping[str, t.Optional[str]]
+    evars: t.Mapping[str, t.Optional[str]],
 ) -> t.Generator[None, None, None]:
     envvars = tuple((var, os.environ.pop(var, None), val) for var, val in evars.items())
     for var, _, tmpval in envvars:
diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py
index fd5770f187..6543485ea3 100644
--- a/smartsim/_core/control/manifest.py
+++ b/smartsim/_core/control/manifest.py
@@ -310,7 +310,7 @@ def finalize(self) -> LaunchedManifest[_T]:
 
 
 def _format_exp_telemetry_path(
-    exp_path: t.Union[str, "os.PathLike[str]"]
+    exp_path: t.Union[str, "os.PathLike[str]"],
 ) -> pathlib.Path:
     return pathlib.Path(exp_path, CONFIG.telemetry_subdir)
 
diff --git a/smartsim/_core/control/previewrenderer.py b/smartsim/_core/control/previewrenderer.py
index 857a703973..85bd032be5 100644
--- a/smartsim/_core/control/previewrenderer.py
+++ b/smartsim/_core/control/previewrenderer.py
@@ -188,5 +188,7 @@ def _check_output_format(output_format: Format) -> None:
     Check that a valid file output format is given.
     """
     if not output_format == Format.PLAINTEXT:
-        raise PreviewFormatError(f"The only valid output format currently available \
-is {Format.PLAINTEXT.value}")
+        raise PreviewFormatError(
+            f"The only valid output format currently available \
+is {Format.PLAINTEXT.value}"
+        )
diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py
index 92ebd735fb..e7c3c1a513 100644
--- a/smartsim/_core/entrypoints/dragon.py
+++ b/smartsim/_core/entrypoints/dragon.py
@@ -87,14 +87,16 @@ def print_summary(network_interface: str, ip_address: str) -> None:
     log_path = get_log_path()
     with open(log_path, "w", encoding="utf-8") as dragon_config_log:
         dragon_config_log.write(
-            textwrap.dedent(f"""\
+            textwrap.dedent(
+                f"""\
                 -------- Dragon Configuration --------
                 IPADDRESS: {ip_address}
                 NETWORK: {network_interface}
                 HOSTNAME: {socket.gethostname()}
                 DRAGON_SERVER_CONFIG: {json.dumps(zmq_config)}
                 --------------------------------------
-                """),
+                """
+            ),
         )
 
 
diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py
index c4d8cbbd63..d1566f59ec 100644
--- a/smartsim/_core/entrypoints/redis.py
+++ b/smartsim/_core/entrypoints/redis.py
@@ -78,7 +78,8 @@ def print_summary(
     cmd: t.List[str], network_interface: str, shard_data: LaunchedShardData
 ) -> None:
     print(
-        textwrap.dedent(f"""\
+        textwrap.dedent(
+            f"""\
             ----------- Running Command ----------
             COMMAND: {' '.join(cmd)}
             IPADDRESS: {shard_data.hostname}
@@ -88,7 +89,8 @@ def print_summary(
 
             --------------- Output ---------------
 
-            """),
+            """
+        ),
         flush=True,
     )
 
diff --git a/smartsim/_core/entrypoints/telemetrymonitor.py b/smartsim/_core/entrypoints/telemetrymonitor.py
index 5ed1a0c91a..105cc1cd6f 100644
--- a/smartsim/_core/entrypoints/telemetrymonitor.py
+++ b/smartsim/_core/entrypoints/telemetrymonitor.py
@@ -49,7 +49,7 @@
 
 
 def register_signal_handlers(
-    handle_signal: t.Callable[[int, t.Optional[FrameType]], None]
+    handle_signal: t.Callable[[int, t.Optional[FrameType]], None],
 ) -> None:
     """Register a signal handling function for all termination events
 
diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py
index f7ab90bd3c..a2db531c95 100644
--- a/smartsim/settings/palsSettings.py
+++ b/smartsim/settings/palsSettings.py
@@ -159,8 +159,7 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None:
         self.run_args["transfer"] = None
 
     def set_launcher_args(
-        self, 
-        arguments: t.Dict[str, t.Union[int, str, float, None]]
+        self, arguments: t.Dict[str, t.Union[int, str, float, None]]
     ) -> None:
         """Set any other task launcher argument
 
diff --git a/tests/install/test_builder.py b/tests/install/test_builder.py
index feaf7e54fe..1cbbd320b6 100644
--- a/tests/install/test_builder.py
+++ b/tests/install/test_builder.py
@@ -373,13 +373,15 @@ def test_git_commands_are_configered_correctly_for_platforms(plat, cmd, expected
 
 def test_modify_source_files(p_test_dir):
     def make_text_blurb(food):
-        return textwrap.dedent(f"""\
+        return textwrap.dedent(
+            f"""\
             My favorite food is {food}
             {food} is an important part of a healthy breakfast
             {food} {food} {food} {food}
             This line should be unchanged!
             --> {food} <--
-            """)
+            """
+        )
 
     original_word = "SPAM"
     mutated_word = "EGGS"
diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py
index 04845344cb..f49f7c638e 100644
--- a/tests/test_dbnode.py
+++ b/tests/test_dbnode.py
@@ -83,7 +83,9 @@ def test_launched_shard_info_can_be_serialized():
 @pytest.mark.parametrize("limit", [None, 1])
 def test_db_node_can_parse_launched_shard_info(limit):
     rand_shards = [_random_shard_info() for _ in range(3)]
-    with io.StringIO(textwrap.dedent("""\
+    with io.StringIO(
+        textwrap.dedent(
+            """\
             This is some file like str
             --------------------------
 
@@ -98,7 +100,9 @@ def test_db_node_can_parse_launched_shard_info(limit):
             SMARTSIM_ORC_SHARD_INFO: {}
 
             All other lines should be ignored.
-            """).format(*(json.dumps(s.to_dict()) for s in rand_shards))) as stream:
+            """
+        ).format(*(json.dumps(s.to_dict()) for s in rand_shards))
+    ) as stream:
         parsed_shards = DBNode._parse_launched_shard_info_from_iterable(stream, limit)
     if limit is not None:
         rand_shards = rand_shards[:limit]
diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py
index a510f660a5..1868915ccf 100644
--- a/tests/test_dragon_backend.py
+++ b/tests/test_dragon_backend.py
@@ -435,7 +435,8 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None:
     set_mock_group_infos(monkeypatch, dragon_backend)
     hosts = dragon_backend.hosts
 
-    expected_message = textwrap.dedent(f"""\
+    expected_message = textwrap.dedent(
+        f"""\
         Dragon server backend update
         | Host    |  Status  |
         |---------|----------|
@@ -448,6 +449,7 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None:
         | del999-2 | Cancelled    | {hosts[1]}         |       -9       |      1      |
         | c101vz-3 | Completed    | {hosts[1]},{hosts[2]} |       0        |      2      |
         | 0ghjk1-4 | Failed       | {hosts[2]}         |       -1       |      1      |
-        | ljace0-5 | NeverStarted |                 |                |      0      |""")
+        | ljace0-5 | NeverStarted |                 |                |      0      |"""
+    )
 
     assert dragon_backend.status_message == expected_message
diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py
index 91cac1614d..a9f8dda5ed 100644
--- a/tests/test_pals_settings.py
+++ b/tests/test_pals_settings.py
@@ -69,9 +69,7 @@ def turn_off_telemetry_indirect(monkeypatch):
 
 def test_set_launcher_args():
     settings = PalsMpiexecSettings(default_exe, **default_kwargs)
-    settings.set_launcher_args(
-        {"mem-bind": "none", "line-buffer": ""}
-    )
+    settings.set_launcher_args({"mem-bind": "none", "line-buffer": ""})
     assert settings.format_run_args() == ["--mem-bind", "none", "--line-buffer"]
 
 

From 683d733215493fe3e6f43cae2dd5244b4f729d96 Mon Sep 17 00:00:00 2001
From: Al Rigazzi <al.rigazzi@hpe.com>
Date: Fri, 4 Jul 2025 16:25:33 +0200
Subject: [PATCH 20/25] Make style

---
 smartsim/_core/control/previewrenderer.py | 6 ++----
 smartsim/_core/entrypoints/dragon.py      | 6 ++----
 smartsim/_core/entrypoints/redis.py       | 6 ++----
 tests/install/test_builder.py             | 6 ++----
 tests/test_dbnode.py                      | 8 ++------
 tests/test_dragon_backend.py              | 6 ++----
 6 files changed, 12 insertions(+), 26 deletions(-)

diff --git a/smartsim/_core/control/previewrenderer.py b/smartsim/_core/control/previewrenderer.py
index 85bd032be5..857a703973 100644
--- a/smartsim/_core/control/previewrenderer.py
+++ b/smartsim/_core/control/previewrenderer.py
@@ -188,7 +188,5 @@ def _check_output_format(output_format: Format) -> None:
     Check that a valid file output format is given.
     """
     if not output_format == Format.PLAINTEXT:
-        raise PreviewFormatError(
-            f"The only valid output format currently available \
-is {Format.PLAINTEXT.value}"
-        )
+        raise PreviewFormatError(f"The only valid output format currently available \
+is {Format.PLAINTEXT.value}")
diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py
index e7c3c1a513..92ebd735fb 100644
--- a/smartsim/_core/entrypoints/dragon.py
+++ b/smartsim/_core/entrypoints/dragon.py
@@ -87,16 +87,14 @@ def print_summary(network_interface: str, ip_address: str) -> None:
     log_path = get_log_path()
     with open(log_path, "w", encoding="utf-8") as dragon_config_log:
         dragon_config_log.write(
-            textwrap.dedent(
-                f"""\
+            textwrap.dedent(f"""\
                 -------- Dragon Configuration --------
                 IPADDRESS: {ip_address}
                 NETWORK: {network_interface}
                 HOSTNAME: {socket.gethostname()}
                 DRAGON_SERVER_CONFIG: {json.dumps(zmq_config)}
                 --------------------------------------
-                """
-            ),
+                """),
         )
 
 
diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py
index d1566f59ec..c4d8cbbd63 100644
--- a/smartsim/_core/entrypoints/redis.py
+++ b/smartsim/_core/entrypoints/redis.py
@@ -78,8 +78,7 @@ def print_summary(
     cmd: t.List[str], network_interface: str, shard_data: LaunchedShardData
 ) -> None:
     print(
-        textwrap.dedent(
-            f"""\
+        textwrap.dedent(f"""\
             ----------- Running Command ----------
             COMMAND: {' '.join(cmd)}
             IPADDRESS: {shard_data.hostname}
@@ -89,8 +88,7 @@ def print_summary(
 
             --------------- Output ---------------
 
-            """
-        ),
+            """),
         flush=True,
     )
 
diff --git a/tests/install/test_builder.py b/tests/install/test_builder.py
index 1cbbd320b6..feaf7e54fe 100644
--- a/tests/install/test_builder.py
+++ b/tests/install/test_builder.py
@@ -373,15 +373,13 @@ def test_git_commands_are_configered_correctly_for_platforms(plat, cmd, expected
 
 def test_modify_source_files(p_test_dir):
     def make_text_blurb(food):
-        return textwrap.dedent(
-            f"""\
+        return textwrap.dedent(f"""\
             My favorite food is {food}
             {food} is an important part of a healthy breakfast
             {food} {food} {food} {food}
             This line should be unchanged!
             --> {food} <--
-            """
-        )
+            """)
 
     original_word = "SPAM"
     mutated_word = "EGGS"
diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py
index f49f7c638e..04845344cb 100644
--- a/tests/test_dbnode.py
+++ b/tests/test_dbnode.py
@@ -83,9 +83,7 @@ def test_launched_shard_info_can_be_serialized():
 @pytest.mark.parametrize("limit", [None, 1])
 def test_db_node_can_parse_launched_shard_info(limit):
     rand_shards = [_random_shard_info() for _ in range(3)]
-    with io.StringIO(
-        textwrap.dedent(
-            """\
+    with io.StringIO(textwrap.dedent("""\
             This is some file like str
             --------------------------
 
@@ -100,9 +98,7 @@ def test_db_node_can_parse_launched_shard_info(limit):
             SMARTSIM_ORC_SHARD_INFO: {}
 
             All other lines should be ignored.
-            """
-        ).format(*(json.dumps(s.to_dict()) for s in rand_shards))
-    ) as stream:
+            """).format(*(json.dumps(s.to_dict()) for s in rand_shards))) as stream:
         parsed_shards = DBNode._parse_launched_shard_info_from_iterable(stream, limit)
     if limit is not None:
         rand_shards = rand_shards[:limit]
diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py
index 1868915ccf..a510f660a5 100644
--- a/tests/test_dragon_backend.py
+++ b/tests/test_dragon_backend.py
@@ -435,8 +435,7 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None:
     set_mock_group_infos(monkeypatch, dragon_backend)
     hosts = dragon_backend.hosts
 
-    expected_message = textwrap.dedent(
-        f"""\
+    expected_message = textwrap.dedent(f"""\
         Dragon server backend update
         | Host    |  Status  |
         |---------|----------|
@@ -449,7 +448,6 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None:
         | del999-2 | Cancelled    | {hosts[1]}         |       -9       |      1      |
         | c101vz-3 | Completed    | {hosts[1]},{hosts[2]} |       0        |      2      |
         | 0ghjk1-4 | Failed       | {hosts[2]}         |       -1       |      1      |
-        | ljace0-5 | NeverStarted |                 |                |      0      |"""
-    )
+        | ljace0-5 | NeverStarted |                 |                |      0      |""")
 
     assert dragon_backend.status_message == expected_message

From 55a02c048f29c96a28f72c99646a69f5b525b26a Mon Sep 17 00:00:00 2001
From: balin <riccardo.balin@colorado.edu>
Date: Thu, 9 Oct 2025 16:57:23 +0000
Subject: [PATCH 21/25] Update changelog.md

---
 doc/changelog.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/doc/changelog.md b/doc/changelog.md
index 433d542cee..ac3dc3a79c 100644
--- a/doc/changelog.md
+++ b/doc/changelog.md
@@ -23,6 +23,11 @@ Description
 
 Detailed Notes
 
+- Enable control over monitoring of Models launched with `experiment.start()` by
+  adding an optional boolean `monitor` argument that determines whether the launched
+  Models are monitored. The argument defaults to True, so no changes are needed to
+  keep the default behavior of monitoring all launched Models.
+  ([SmartSim-PR788](https://github.com/CrayLabs/SmartSim/pull/788))
 - Copyright headers have been updated from "2021-2024" to "2021-2025" across 271 files
   including Python source files, configuration files, documentation, tests, Docker files,
   shell scripts, and other supporting files to reflect the new year.

From 04a78c685b09c5213e953ebd43a6b768bf143915 Mon Sep 17 00:00:00 2001
From: balin <riccardo.balin@colorado.edu>
Date: Thu, 9 Oct 2025 17:29:44 +0000
Subject: [PATCH 22/25] Add the new monitor parameter to the docstring of
 experiment.start()

---
 smartsim/experiment.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/smartsim/experiment.py b/smartsim/experiment.py
index 099c37bc92..37580ab6a1 100644
--- a/smartsim/experiment.py
+++ b/smartsim/experiment.py
@@ -230,11 +230,16 @@ def start(
         that all jobs launched by this experiment will be killed, and the
         zombie processes will need to be manually killed.
 
+        If `monitor=True`, all the jobs being started will be monitored
+        by the Controller. If `monitor=True`, the jobs will not be 
+        monitored, meaning that their status will not be reported.
+
         :param block: block execution until all non-database
                        jobs are finished
         :param summary: print a launch summary prior to launch
         :param kill_on_interrupt: flag for killing jobs when ^C (SIGINT)
                                   signal is received.
+        :param monitor: monitor the jobs being started
         """
         start_manifest = Manifest(*args)
         self._create_entity_dir(start_manifest)

From 292a529fd606d9586104937c189937bc0ff349b4 Mon Sep 17 00:00:00 2001
From: balin <riccardo.balin@colorado.edu>
Date: Thu, 9 Oct 2025 17:43:28 +0000
Subject: [PATCH 23/25] Fix format

---
 smartsim/experiment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/smartsim/experiment.py b/smartsim/experiment.py
index 37580ab6a1..d11e679dbe 100644
--- a/smartsim/experiment.py
+++ b/smartsim/experiment.py
@@ -231,7 +231,7 @@ def start(
         zombie processes will need to be manually killed.
 
         If `monitor=True`, all the jobs being started will be monitored
-        by the Controller. If `monitor=True`, the jobs will not be 
+        by the Controller. If `monitor=False`, the jobs will not be
         monitored, meaning that their status will not be reported.
 
         :param block: block execution until all non-database

From 9826d08d8c853f70050307751b898554f6b68869 Mon Sep 17 00:00:00 2001
From: balin <riccardo.balin@colorado.edu>
Date: Thu, 9 Oct 2025 18:11:22 +0000
Subject: [PATCH 24/25] Add monitor argument to start_wo_job_manager()

---
 tests/test_model.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/test_model.py b/tests/test_model.py
index fe4a482b35..bb55365e68 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -94,7 +94,13 @@ def _monkeypatch_exp_controller(exp):
         entity_steps = []
 
         def start_wo_job_manager(
-            self, exp_name, exp_path, manifest, block=True, kill_on_interrupt=True
+            self,
+            exp_name,
+            exp_path,
+            manifest,
+            block=True,
+            kill_on_interrupt=True,
+            monitor=True,
         ):
             self._launch(exp_name, exp_path, manifest)
             return LaunchedManifestBuilder("name", "path", "launcher").finalize()

From c2c645bbfb94757b606e5bcb2e20dd3ab8554e2a Mon Sep 17 00:00:00 2001
From: balin <riccardo.balin@colorado.edu>
Date: Thu, 9 Oct 2025 18:32:00 +0000
Subject: [PATCH 25/25] Add monitor argument to launch_step_nop

---
 tests/test_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_model.py b/tests/test_model.py
index bb55365e68..33cd537b86 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -105,7 +105,7 @@ def start_wo_job_manager(
             self._launch(exp_name, exp_path, manifest)
             return LaunchedManifestBuilder("name", "path", "launcher").finalize()
 
-        def launch_step_nop(self, step, entity):
+        def launch_step_nop(self, step, entity, monitor):
             entity_steps.append((step, entity))
 
         monkeypatch.setattr(