🐛 Fix addition of process to reconnected run

kzscisoft · kzscisoft · commit b5bcaf1fe38c · 2025-05-24T18:58:46.000+01:00
Uses the underlying low-level API run object to retrieve the name and ID on reconnect. Also ensures the run is "running" when attempting to log system metrics or save files.
diff --git a/simvue/executor.py b/simvue/executor.py
@@ -16,6 +16,7 @@
 import multiprocessing
 import threading
 import os
+import shutil
 import psutil
 import subprocess
 import contextlib
@@ -205,6 +206,19 @@ def callback_function(status_code: int, std_out: str, std_err: str) -> None:
                 "due to function pickling restrictions"
             )
 
+        # To check that the executable provided by the user exists, search the PATH taken
+        # from the current environment overridden by any user-supplied 'env' entries
+        _session_path: str | None = (os.environ.copy() | (env or {})).get("PATH", None)
+
+        if (
+            executable
+            and not pathlib.Path(executable).exists()
+            and not shutil.which(executable, path=_session_path)
+        ):
+            raise FileNotFoundError(
+                f"Executable '{executable}' does not exist, please check the path/environment."
+            )
+
         if script:
             self._runner.save_file(file_path=script, category="code")
 
@@ -377,7 +391,7 @@ def _update_alerts(self) -> None:
             # This is so that if a process incorrectly reports its return code,
             # the user can manually set the correct status depending on logs etc.
             _alert = UserAlert(identifier=self._alert_ids[proc_id])
-            _is_set = _alert.get_status(run_id=self._runner._id)
+            _is_set = _alert.get_status(run_id=self._runner.id)
 
             if process.returncode != 0:
                 # If the process fails then purge the dispatcher event queue
@@ -404,6 +418,10 @@ def _update_alerts(self) -> None:
 
     def _save_output(self) -> None:
         """Save the output to Simvue"""
+        if self._runner.status != "running":
+            logger.debug("Run is not active, skipping output save.")
+            return
+
         for proc_id in self._processes.keys():
             # Only save the file if the contents are not empty
             if self.std_err(proc_id):
diff --git a/simvue/run.py b/simvue/run.py
@@ -147,7 +147,6 @@ def __init__(
         ```
         """
         self._uuid: str = f"{uuid.uuid4()}"
-        self._name: str | None = None
 
         # monitor duration with respect to retention period
         self._timer: float = 0
@@ -161,7 +160,6 @@ def __init__(
         self._executor = Executor(self)
         self._dispatcher: DispatcherBaseClass | None = None
 
-        self._id: str | None = None
         self._folder: Folder | None = None
         self._term_color: bool = True
         self._suppress_errors: bool = False
@@ -260,7 +258,7 @@ def __exit__(
     ) -> None:
         logger.debug(
             "Automatically closing run '%s' in status %s",
-            self._id if self._user_config.run.mode == "online" else "unregistered",
+            self.id if self._user_config.run.mode == "online" else "unregistered",
             self._status,
         )
 
@@ -365,24 +363,25 @@ def _get_internal_metrics(
         # Set join on fail to false as if an error is thrown
         # join would be called on this thread and a thread cannot
         # join itself!
-        self._add_metrics_to_dispatch(
-            _current_system_measure.to_dict(),
-            join_on_fail=False,
-            step=system_metrics_step,
-        )
+        if self.status == "running":
+            self._add_metrics_to_dispatch(
+                _current_system_measure.to_dict(),
+                join_on_fail=False,
+                step=system_metrics_step,
+            )
 
         # For the first emissions metrics reading, the time interval to use
         # Is the time since the run started, otherwise just use the time between readings
         if self._emissions_monitor:
             _estimated = self._emissions_monitor.estimate_co2_emissions(
-                process_id=f"{self._name}",
+                process_id=f"{self._sv_obj.name}",
                 cpu_percent=_current_system_measure.cpu_percent,
                 measure_interval=(time.time() - self._start_time)
                 if system_metrics_step == 0
                 else self._system_metrics_interval,
                 gpu_percent=_current_system_measure.gpu_percent,
             )
-            if _estimated:
+            if _estimated and self.status == "running":
                 self._add_metrics_to_dispatch(
                     self._emissions_monitor.simvue_metrics(),
                     join_on_fail=False,
@@ -395,7 +394,7 @@ def _create_heartbeat_callback(
         """Defines the callback executed at the heartbeat interval for the Run."""
         if (
             self._user_config.run.mode == "online"
-            and (not self._user_config.server.url or not self._id)
+            and (not self._user_config.server.url or not self.id)
         ) or not self._heartbeat_termination_trigger:
             raise RuntimeError("Could not commence heartbeat, run not initialised")
 
@@ -460,7 +459,7 @@ def _create_dispatch_callback(
         executed on metrics and events objects held in a buffer.
         """
 
-        if self._user_config.run.mode == "online" and not self._id:
+        if self._user_config.run.mode == "online" and not self.id:
             raise RuntimeError("Expected identifier for run")
 
         if (
@@ -591,7 +590,6 @@ def _error(self, message: str, join_threads: bool = True) -> None:
         # Simvue support now terminated as the instance of Run has entered
         # the dormant state due to exception throw so set listing to be 'lost'
         if self._status == "running" and self._sv_obj:
-            self._sv_obj.name = self._name
             self._sv_obj.status = "lost"
             self._sv_obj.commit()
 
@@ -702,8 +700,6 @@ def init(
         elif not name and self._user_config.run.mode == "offline":
             name = randomname.get_name()
 
-        self._name = name
-
         self._status = "running" if running else "created"
 
         # Parse the time to live/retention time if specified
@@ -751,28 +747,20 @@ def init(
         self._data = self._sv_obj._staging
         self._sv_obj.commit()
 
-        if self._user_config.run.mode == "online":
-            name = self._sv_obj.name
-
-        self._id = self._sv_obj.id
-
-        if not name:
+        if not self.name:
             return False
 
-        elif name is not True:
-            self._name = name
-
         if self._status == "running":
             self._start()
 
         if self._user_config.run.mode == "online":
             click.secho(
-                f"[simvue] Run {self._name} created",
+                f"[simvue] Run {self._sv_obj.name} created",
                 bold=self._term_color,
                 fg="green" if self._term_color else None,
             )
             click.secho(
-                f"[simvue] Monitor in the UI at {self._user_config.server.url.rsplit('/api', 1)[0]}/dashboard/runs/run/{self._id}",
+                f"[simvue] Monitor in the UI at {self._user_config.server.url.rsplit('/api', 1)[0]}/dashboard/runs/run/{self.id}",
                 bold=self._term_color,
                 fg="green" if self._term_color else None,
             )
@@ -952,7 +940,23 @@ def executor(self) -> Executor:
     @property
     def name(self) -> str | None:
         """Return the name of the run"""
-        return self._name
+        if not self._sv_obj:
+            raise RuntimeError("Run has not been initialised")
+        return self._sv_obj.name
+
+    @property
+    def status(
+        self,
+    ) -> (
+        typing.Literal[
+            "created", "running", "completed", "failed", "terminated", "lost"
+        ]
+        | None
+    ):
+        """Return the status of the run"""
+        if not self._sv_obj:
+            raise RuntimeError("Run has not been initialised")
+        return self._sv_obj.status
 
     @property
     def uid(self) -> str:
@@ -962,7 +966,9 @@ def uid(self) -> str:
     @property
     def id(self) -> str | None:
         """Return the unique id of the run"""
-        return self._id
+        if not self._sv_obj:
+            raise RuntimeError("Run has not been initialised")
+        return self._sv_obj.id
 
     @skip_if_failed("_aborted", "_suppress_errors", False)
     @pydantic.validate_call
@@ -981,8 +987,7 @@ def reconnect(self, run_id: str) -> bool:
         """
         self._status = "running"
 
-        self._id = run_id
-        self._sv_obj = RunObject(identifier=self._id, _read_only=False)
+        self._sv_obj = RunObject(identifier=run_id, _read_only=False)
         self._sv_obj.system = get_system()
         self._start()
 
@@ -1612,7 +1617,7 @@ def _tidy_run(self) -> None:
             and self._status != "created"
         ):
             self._user_config.offline.cache.joinpath(
-                "runs", f"{self._id}.closed"
+                "runs", f"{self.id}.closed"
             ).touch()
 
         if _non_zero := self.executor.exit_status:
@@ -2086,7 +2091,7 @@ def log_alert(
             )
             return False
         _alert.read_only(False)
-        _alert.set_status(run_id=self._id, status=state)
+        _alert.set_status(run_id=self.id, status=state)
         _alert.commit()
 
         return True
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -274,8 +274,8 @@ def setup_test_run(run: sv_run.Run, create_objects: bool, request: pytest.Fixtur
     if create_objects:
         TEST_DATA["metrics"] = ("metric_counter", "metric_val")
 
-    TEST_DATA["run_id"] = run._id
-    TEST_DATA["run_name"] = run._name
+    TEST_DATA["run_id"] = run.id
+    TEST_DATA["run_name"] = run.name
     TEST_DATA["url"] = run._user_config.server.url
     TEST_DATA["headers"] = run._headers
     TEST_DATA["pid"] = run._pid
diff --git a/tests/functional/test_run_class.py b/tests/functional/test_run_class.py
@@ -672,7 +672,7 @@ def test_set_folder_details(request: pytest.FixtureRequest) -> None:
     ids=[f"scenario_{i}" for i in range(1, 6)],
 )
 def test_save_file_online(
-    create_plain_run: typing.Tuple[sv_run.Run, dict],
+    create_plain_run: tuple[sv_run.Run, dict],
     valid_mimetype: bool,
     preserve_path: bool,
     name: str | None,
@@ -741,7 +741,7 @@ def test_save_file_online(
     ids=[f"scenario_{i}" for i in range(1, 6)],
 )
 def test_save_file_offline(
-    create_plain_run_offline: typing.Tuple[sv_run.Run, dict],
+    create_plain_run_offline: tuple[sv_run.Run, dict],
     preserve_path: bool,
     name: str | None,
     allow_pickle: bool,
@@ -796,7 +796,7 @@ def test_save_file_offline(
 
 @pytest.mark.run
 def test_update_tags_running(
-    create_plain_run: typing.Tuple[sv_run.Run, dict],
+    create_plain_run: tuple[sv_run.Run, dict],
     request: pytest.FixtureRequest,
 ) -> None:
     simvue_run, _ = create_plain_run
@@ -822,7 +822,7 @@ def test_update_tags_running(
 
 @pytest.mark.run
 def test_update_tags_created(
-    create_pending_run: typing.Tuple[sv_run.Run, dict],
+    create_pending_run: tuple[sv_run.Run, dict],
     request: pytest.FixtureRequest,
 ) -> None:
     simvue_run, _ = create_pending_run
@@ -849,7 +849,7 @@ def test_update_tags_created(
 @pytest.mark.offline
 @pytest.mark.run
 def test_update_tags_offline(
-    create_plain_run_offline: typing.Tuple[sv_run.Run, dict],
+    create_plain_run_offline: tuple[sv_run.Run, dict],
 ) -> None:
     simvue_run, _ = create_plain_run_offline
     run_name = simvue_run._name
@@ -877,7 +877,7 @@ def test_update_tags_offline(
 @pytest.mark.run
 @pytest.mark.parametrize("object_type", ("DataFrame", "ndarray"))
 def test_save_object(
-    create_plain_run: typing.Tuple[sv_run.Run, dict], object_type: str
+    create_plain_run: tuple[sv_run.Run, dict], object_type: str
 ) -> None:
     simvue_run, _ = create_plain_run
 
@@ -1079,7 +1079,7 @@ def abort_callback(abort_run=trigger) -> None:
 
 @pytest.mark.run
 def test_abort_on_alert_python(
-    speedy_heartbeat, create_plain_run: typing.Tuple[sv_run.Run, dict], mocker: pytest_mock.MockerFixture
+    speedy_heartbeat, create_plain_run: tuple[sv_run.Run, dict], mocker: pytest_mock.MockerFixture
 ) -> None:
     timeout: int = 20
     interval: int = 0
@@ -1092,7 +1092,7 @@ def test_abort_on_alert_python(
 
 @pytest.mark.run
 def test_abort_on_alert_raise(
-    create_plain_run: typing.Tuple[sv_run.Run, dict]
+    create_plain_run: tuple[sv_run.Run, dict]
 ) -> None:
 
     run, _ = create_plain_run
@@ -1117,7 +1117,7 @@ def test_abort_on_alert_raise(
 
 
 @pytest.mark.run
-def test_kill_all_processes(create_plain_run: typing.Tuple[sv_run.Run, dict]) -> None:
+def test_kill_all_processes(create_plain_run: tuple[sv_run.Run, dict]) -> None:
     run, _ = create_plain_run
     run.config(system_metrics_interval=1)
     run.add_process(identifier="forever_long_1", executable="bash", c="sleep 10000")
@@ -1148,7 +1148,7 @@ def test_run_created_with_no_timeout() -> None:
 
 @pytest.mark.parametrize("mode", ("online", "offline"), ids=("online", "offline"))
 @pytest.mark.run
-def test_reconnect(mode, monkeypatch: pytest.MonkeyPatch) -> None:
+def test_reconnect_functionality(mode, monkeypatch: pytest.MonkeyPatch) -> None:
     temp_d: tempfile.TemporaryDirectory | None = None
 
     if mode == "offline":
@@ -1188,3 +1188,16 @@ def test_reconnect(mode, monkeypatch: pytest.MonkeyPatch) -> None:
     if temp_d:
         temp_d.cleanup()
 
+
+def test_reconnect_with_process(create_plain_run: tuple[sv_run.Run, dict]) -> None:
+    """Check a process can be added to a run after reconnecting to it by ID."""
+    run, _ = create_plain_run
+    run.init(name="test_reconnect_with_process", folder="/simvue_unit_testing", retention_period="2 minutes", running=False)
+    run.close()
+
+    with sv_run.Run() as new_run:
+        new_run.reconnect(run.id)
+        # Attach the process to the reconnected run, not the closed original
+        new_run.add_process(
+            identifier="test_process", executable="bash", c="echo 'Hello World!'"
+        )