@@ -297,16 +297,11 @@ def processes(self) -> list[psutil.Process]:
297
297
return process_list
298
298
299
299
process_list += [self ._parent_process ]
300
-
301
- # Attach child processes relating to the process set by set_pid
302
- with contextlib .suppress (psutil .NoSuchProcess , psutil .ZombieProcess ):
303
- for child in self ._parent_process .children (recursive = True ):
304
- if child not in process_list :
305
- process_list .append (child )
300
+ process_list += self ._child_processes
306
301
307
302
return list (set (process_list ))
308
303
309
- def _get_sysinfo (self ) -> dict [str , typing .Any ]:
304
+ def _get_sysinfo (self , interval : float | None = None ) -> dict [str , typing .Any ]:
310
305
"""Retrieve system administration
311
306
312
307
Parameters
@@ -320,7 +315,7 @@ def _get_sysinfo(self) -> dict[str, typing.Any]:
320
315
retrieved system specifications
321
316
"""
322
317
processes = self .processes
323
- cpu = get_process_cpu (processes , interval = 0.1 )
318
+ cpu = get_process_cpu (processes , interval = interval )
324
319
memory = get_process_memory (processes )
325
320
gpu = get_gpu_metrics (processes )
326
321
data : dict [str , typing .Any ] = {}
@@ -359,7 +354,9 @@ def _heartbeat(
359
354
last_res_metric_call = time .time ()
360
355
361
356
if self ._resources_metrics_interval :
362
- self ._add_metrics_to_dispatch (self ._get_sysinfo (), join_on_fail = False )
357
+ self ._add_metrics_to_dispatch (
358
+ self ._get_sysinfo (interval = 1 ), join_on_fail = False
359
+ )
363
360
364
361
while not heartbeat_trigger .is_set ():
365
362
time .sleep (0.1 )
@@ -490,6 +487,9 @@ def _start(self, reconnect: bool = False) -> bool:
490
487
self ._pid = os .getpid ()
491
488
492
489
self ._parent_process = psutil .Process (self ._pid ) if self ._pid else None
490
+ self ._child_processes = (
491
+ self ._get_child_processes () if self ._parent_process else None
492
+ )
493
493
494
494
self ._shutdown_event = threading .Event ()
495
495
self ._heartbeat_termination_trigger = threading .Event ()
@@ -904,6 +904,16 @@ def kill_all_processes(self) -> None:
904
904
)
905
905
self ._executor .kill_all ()
906
906
907
def _get_child_processes(self) -> list[psutil.Process]:
    """Collect the descendants of the currently tracked parent process.

    The parent is whatever ``self._parent_process`` currently refers to
    (assigned during start-up and again by ``set_pid``).

    Returns
    -------
    list[psutil.Process]
        every descendant (children, grandchildren, ...) of the parent
        process, without duplicates and in the order psutil reported
        them. Empty if no parent process is being tracked, or if the
        parent has already exited / become a zombie.
    """
    # No parent has been attached yet, so there is nothing to inspect.
    if self._parent_process is None:
        return []

    # Only the children() call itself can fail here, so keep the guarded
    # region down to that single line. A vanished or zombie parent is
    # treated as "no children" rather than as an error.
    try:
        _children = self._parent_process.children(recursive=True)
    except (psutil.NoSuchProcess, psutil.ZombieProcess):
        return []

    # One order-preserving de-duplication pass; this replaces the former
    # linear membership scan followed by a second list(set(...)) pass.
    return list(dict.fromkeys(_children))
+
907
917
@property
908
918
def executor (self ) -> Executor :
909
919
"""Return the executor for this run"""
@@ -959,6 +969,13 @@ def set_pid(self, pid: int) -> None:
959
969
"""
960
970
self ._pid = pid
961
971
self ._parent_process = psutil .Process (self ._pid )
972
+ self ._child_processes = self ._get_child_processes ()
973
+ # Get CPU usage stats for each of those new processes, so that next time it's measured by the heartbeat the value is accurate
974
+ [
975
+ _process .cpu_percent ()
976
+ for _process in self ._child_processes + [self ._parent_process ]
977
+ ]
978
+ time .sleep (0.1 )
962
979
963
980
@skip_if_failed ("_aborted" , "_suppress_errors" , False )
964
981
@pydantic .validate_call
@@ -1962,7 +1979,7 @@ def log_alert(
1962
1979
self ._error ("Please specify alert to update either by ID or by name." )
1963
1980
return False
1964
1981
1965
- if self ._user_config .run .mode == "offline" :
1982
+ if name and self ._user_config .run .mode == "offline" :
1966
1983
self ._error (
1967
1984
"Cannot retrieve alerts based on names in offline mode - please use IDs instead."
1968
1985
)
0 commit comments