@@ -101,6 +101,7 @@ def __init__(self, mode: typing.Literal["online", "offline"] = "online") -> None
101
101
self ._simvue : typing .Optional [SimvueBaseClass ] = None
102
102
self ._pid : typing .Optional [int ] = 0
103
103
self ._shutdown_event : typing .Optional [threading .Event ] = None
104
+ self ._configuration_lock = threading .Lock ()
104
105
self ._heartbeat_termination_trigger : typing .Optional [threading .Event ] = None
105
106
self ._storage_id : typing .Optional [str ] = None
106
107
self ._heartbeat_thread : typing .Optional [threading .Thread ] = None
@@ -218,27 +219,27 @@ def _create_heartbeat_callback(
218
219
raise RuntimeError ("Could not commence heartbeat, run not initialised" )
219
220
220
221
def _heartbeat (
221
- url : typing .Optional [str ] = self ._url ,
222
- headers : dict [str , str ] = self ._headers ,
223
- run_id : typing .Optional [str ] = self ._id ,
224
- online : bool = self ._mode == "online" ,
225
222
heartbeat_trigger : threading .Event = self ._heartbeat_termination_trigger ,
226
223
) -> None :
227
224
last_heartbeat = time .time ()
228
225
last_res_metric_call = time .time ()
229
226
230
- self ._add_metrics_to_dispatch (self ._get_sysinfo ())
231
-
232
227
while not heartbeat_trigger .is_set ():
233
228
time .sleep (0.1 )
234
229
235
- if (
236
- self ._resources_metrics_interval
237
- and (res_time := time .time ()) - last_res_metric_call
238
- > self ._resources_metrics_interval
239
- ):
240
- self ._add_metrics_to_dispatch (self ._get_sysinfo ())
241
- last_res_metric_call = res_time
230
+ with self ._configuration_lock :
231
+ if (
232
+ self ._resources_metrics_interval
233
+ and (res_time := time .time ()) - last_res_metric_call
234
+ > self ._resources_metrics_interval
235
+ ):
236
+ # Set join on fail to false as if an error is thrown
237
+ # join would be called on this thread and a thread cannot
238
+ # join itself!
239
+ self ._add_metrics_to_dispatch (
240
+ self ._get_sysinfo (), join_on_fail = False
241
+ )
242
+ last_res_metric_call = res_time
242
243
243
244
if time .time () - last_heartbeat < HEARTBEAT_INTERVAL :
244
245
continue
@@ -377,20 +378,23 @@ def _start(self, reconnect: bool = False) -> bool:
377
378
self ._error (e .args [0 ])
378
379
return False
379
380
381
+ self ._active = True
382
+
380
383
self ._dispatcher .start ()
381
384
self ._heartbeat_thread .start ()
382
385
383
- self ._active = True
384
-
385
386
return True
386
387
387
- def _error (self , message : str ) -> None :
388
+ def _error (self , message : str , join_threads : bool = True ) -> None :
388
389
"""Raise an exception if necessary and log error
389
390
390
391
Parameters
391
392
----------
392
393
message : str
393
394
message to display in exception or logger message
395
+ join_threads : bool
396
+ whether to join the threads on failure. This option exists to
397
+ prevent join being called in nested thread calls to this function.
394
398
395
399
Raises
396
400
------
@@ -400,7 +404,8 @@ def _error(self, message: str) -> None:
400
404
# Stop heartbeat
401
405
if self ._heartbeat_termination_trigger and self ._heartbeat_thread :
402
406
self ._heartbeat_termination_trigger .set ()
403
- self ._heartbeat_thread .join ()
407
+ if join_threads :
408
+ self ._heartbeat_thread .join ()
404
409
405
410
# Finish stopping all threads
406
411
if self ._shutdown_event :
@@ -409,7 +414,8 @@ def _error(self, message: str) -> None:
409
414
# Purge the queue as we can no longer send metrics
410
415
if self ._dispatcher and self ._dispatcher .is_alive ():
411
416
self ._dispatcher .purge ()
412
- self ._dispatcher .join ()
417
+ if join_threads :
418
+ self ._dispatcher .join ()
413
419
414
420
if not self ._suppress_errors :
415
421
raise RuntimeError (message )
@@ -793,27 +799,28 @@ def config(
793
799
_description_
794
800
"""
795
801
796
- if suppress_errors is not None :
797
- self ._suppress_errors = suppress_errors
802
+ with self ._configuration_lock :
803
+ if suppress_errors is not None :
804
+ self ._suppress_errors = suppress_errors
798
805
799
- if queue_blocking is not None :
800
- self ._queue_blocking = queue_blocking
806
+ if queue_blocking is not None :
807
+ self ._queue_blocking = queue_blocking
801
808
802
- if resources_metrics_interval and disable_resources_metrics :
803
- self ._error (
804
- "Setting of resource metric interval and disabling resource metrics is ambiguous"
805
- )
806
- return False
809
+ if resources_metrics_interval and disable_resources_metrics :
810
+ self ._error (
811
+ "Setting of resource metric interval and disabling resource metrics is ambiguous"
812
+ )
813
+ return False
807
814
808
- if disable_resources_metrics :
809
- self ._pid = None
810
- self ._resources_metrics_interval = None
815
+ if disable_resources_metrics :
816
+ self ._pid = None
817
+ self ._resources_metrics_interval = None
811
818
812
- if resources_metrics_interval :
813
- self ._resources_metrics_interval = resources_metrics_interval
819
+ if resources_metrics_interval :
820
+ self ._resources_metrics_interval = resources_metrics_interval
814
821
815
- if storage_id :
816
- self ._storage_id = storage_id
822
+ if storage_id :
823
+ self ._storage_id = storage_id
817
824
818
825
return True
819
826
@@ -900,6 +907,7 @@ def _add_metrics_to_dispatch(
900
907
step : typing .Optional [int ] = None ,
901
908
time : typing .Optional [int ] = None ,
902
909
timestamp : typing .Optional [str ] = None ,
910
+ join_on_fail : bool = True ,
903
911
) -> bool :
904
912
if self ._mode == "disabled" :
905
913
return True
@@ -909,19 +917,21 @@ def _add_metrics_to_dispatch(
909
917
return True
910
918
911
919
if not self ._simvue or not self ._dispatcher :
912
- self ._error ("Cannot log metrics, run not initialised" )
920
+ self ._error ("Cannot log metrics, run not initialised" , join_on_fail )
913
921
return False
914
922
915
923
if not self ._active :
916
- self ._error ("Run is not active" )
924
+ self ._error ("Run is not active" , join_on_fail )
917
925
return False
918
926
919
927
if self ._status != "running" :
920
- self ._error ("Cannot log metrics when not in the running state" )
928
+ self ._error (
929
+ "Cannot log metrics when not in the running state" , join_on_fail
930
+ )
921
931
return False
922
932
923
933
if timestamp and not validate_timestamp (timestamp ):
924
- self ._error ("Invalid timestamp format" )
934
+ self ._error ("Invalid timestamp format" , join_on_fail )
925
935
return False
926
936
927
937
_data : dict [str , typing .Any ] = {
0 commit comments