Skip to content

Commit 472108b

Browse files
authored
Merge pull request #76 from simvue-io/debugging
Add debug logging
2 parents a0a3846 + aa12773 commit 472108b

File tree

4 files changed

+65
-12
lines changed

4 files changed

+65
-12
lines changed

simvue/metrics.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1+
import logging
12
import time
23
import psutil
34
from .pynvml import *
45

6+
logger = logging.getLogger(__name__)
7+
58
def get_process_memory(processes):
69
"""
710
Get the resident set size
@@ -10,8 +13,8 @@ def get_process_memory(processes):
1013
for process in processes:
1114
try:
1215
rss += process.memory_info().rss/1024/1024
13-
except:
14-
pass
16+
except Exception as err:
17+
logger.error(str(err))
1518

1619
return rss
1720

@@ -23,8 +26,8 @@ def get_process_cpu(processes):
2326
for process in processes:
2427
try:
2528
cpu_percent += process.cpu_percent()
26-
except:
27-
pass
29+
except Exception as err:
30+
logger.error(str(err))
2831

2932
return cpu_percent
3033

@@ -62,7 +65,7 @@ def get_gpu_metrics(processes):
6265
gpu_metrics[f"resources/gpu.memory.percent.{i}"] = memory_percent
6366

6467
nvmlShutdown()
65-
except:
66-
pass
68+
except Exception as err:
69+
logger.error(str(err))
6770

6871
return gpu_metrics

simvue/remote.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,16 @@ def create_run(self, data):
3535
"""
3636
Create a run
3737
"""
38+
logger.debug('Creating run with data: "%s"', data)
39+
3840
try:
3941
response = post(f"{self._url}/api/runs", self._headers, data)
4042
except Exception as err:
4143
self._error(f"Exception creating run: {str(err)}")
4244
return False
4345

46+
logger.debug('Got status code %d when creating run, with response: "%s"', response.status_code, response.text)
47+
4448
if response.status_code == 409:
4549
self._error(f"Duplicate run, name {data['name']} already exists")
4650
elif response.status_code != 200:
@@ -59,12 +63,16 @@ def update(self, data, run=None):
5963
if run is not None:
6064
data['name'] = run
6165

66+
logger.debug('Updating run with data: "%s"', data)
67+
6268
try:
6369
response = put(f"{self._url}/api/runs", self._headers, data)
6470
except Exception as err:
6571
self._error(f"Exception updating run: {str(err)}")
6672
return False
6773

74+
logger.debug('Got status code %d when updating run, with response: "%s"', response.status_code, response.text)
75+
6876
if response.status_code == 200:
6977
return True
7078

@@ -78,12 +86,16 @@ def set_folder_details(self, data, run=None):
7886
if run is not None:
7987
data['run'] = run
8088

89+
logger.debug('Setting folder details with data: "%s"', data)
90+
8191
try:
8292
response = put(f"{self._url}/api/folders", self._headers, data)
8393
except Exception as err:
8494
self._error(f"Exception setting folder details: {err}")
8595
return False
8696

97+
logger.debug('Got status code %d when setting folder details, with response: "%s"', response.status_code, response.text)
98+
8799
if response.status_code == 200:
88100
return True
89101

@@ -97,13 +109,17 @@ def save_file(self, data, run=None):
97109
if run is not None:
98110
data['run'] = run
99111

112+
logger.debug('Getting presigned URL for saving artifact, with data: "%s"', data)
113+
100114
# Get presigned URL
101115
try:
102116
response = post(f"{self._url}/api/data", self._headers, prepare_for_api(data))
103117
except Exception as err:
104118
self._error(f"Got exception when preparing to upload file {data['name']} to object storage: {str(err)}")
105119
return False
106120

121+
logger.debug('Got status code %d when getting presigned URL, with response: "%s"', response.status_code, response.text)
122+
107123
if response.status_code == 409:
108124
return True
109125

@@ -116,6 +132,9 @@ def save_file(self, data, run=None):
116132
if 'pickled' in data and 'pickledFile' not in data:
117133
try:
118134
response = put(url, {}, data['pickled'], is_json=False, timeout=UPLOAD_TIMEOUT)
135+
136+
logger.debug('Got status code %d when uploading artifact', response.status_code)
137+
119138
if response.status_code != 200:
120139
self._error(f"Got status code {response.status_code} when uploading object {data['name']} to object storage")
121140
return None
@@ -131,6 +150,9 @@ def save_file(self, data, run=None):
131150
try:
132151
with open(use_filename, 'rb') as fh:
133152
response = put(url, {}, fh, is_json=False, timeout=UPLOAD_TIMEOUT)
153+
154+
logger.debug('Got status code %d when uploading artifact', response.status_code)
155+
134156
if response.status_code != 200:
135157
self._error(f"Got status code {response.status_code} when uploading file {data['name']} to object storage")
136158
return None
@@ -147,12 +169,16 @@ def add_alert(self, data, run=None):
147169
if run is not None:
148170
data['run'] = run
149171

172+
logger.debug('Adding alert with data: "%s"', data)
173+
150174
try:
151175
response = post(f"{self._url}/api/alerts", self._headers, data)
152176
except Exception as err:
153177
self._error(f"Got exception when creating an alert: {str(err)}")
154178
return False
155179

180+
logger.debug('Got status code %d when adding alert, with response: "%s"', response.status_code, response.text)
181+
156182
if response.status_code in (200, 409):
157183
return True
158184

@@ -163,12 +189,16 @@ def send_metrics(self, data):
163189
"""
164190
Send metrics
165191
"""
192+
logger.debug('Sending metrics')
193+
166194
try:
167195
response = post(f"{self._url}/api/metrics", self._headers_mp, data, is_json=False)
168196
except Exception as err:
169197
self._error(f"Exception sending metrics: {str(err)}")
170198
return False
171199

200+
logger.debug('Got status code %d when sending metrics', response.status_code)
201+
172202
if response.status_code == 200:
173203
return True
174204

@@ -179,12 +209,16 @@ def send_event(self, data):
179209
"""
180210
Send events
181211
"""
212+
logger.debug('Sending events')
213+
182214
try:
183215
response = post(f"{self._url}/api/events", self._headers_mp, data, is_json=False)
184216
except Exception as err:
185217
self._error(f"Exception sending event: {str(err)}")
186218
return False
187219

220+
logger.debug('Got status code %d when sending events', response.status_code)
221+
188222
if response.status_code == 200:
189223
return True
190224

@@ -195,12 +229,16 @@ def send_heartbeat(self):
195229
"""
196230
Send heartbeat
197231
"""
232+
logger.debug('Sending heartbeat')
233+
198234
try:
199235
response = put(f"{self._url}/api/runs/heartbeat", self._headers, {'name': self._name})
200236
except Exception as err:
201237
self._error(f"Exception sending heartbeat: {str(err)}")
202238
return False
203239

240+
logger.debug('Got status code %d when sending heartbeat', response.status_code)
241+
204242
if response.status_code == 200:
205243
return True
206244

simvue/run.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,8 @@ def __enter__(self):
162162
return self
163163

164164
def __exit__(self, type, value, traceback):
165+
logger.debug('Automatically closing run %s in status %s', self._name, self._status)
166+
165167
if self._name and self._status == 'running':
166168
if self._shutdown_event is not None:
167169
self._shutdown_event.set()
@@ -191,6 +193,8 @@ def _start(self, reconnect=False):
191193
if self._mode == 'disabled':
192194
return True
193195

196+
logger.debug('Starting run')
197+
194198
self._check_token()
195199

196200
data = {'name': self._name, 'status': self._status}
@@ -440,8 +444,8 @@ def log_event(self, message, timestamp=None):
440444

441445
try:
442446
self._events_queue.put(data, block=self._queue_blocking)
443-
except:
444-
pass
447+
except Exception as err:
448+
logger.error(str(err))
445449

446450
return True
447451

@@ -491,8 +495,8 @@ def log_metrics(self, metrics, step=None, time=None, timestamp=None):
491495

492496
try:
493497
self._metrics_queue.put(data, block=self._queue_blocking)
494-
except:
495-
pass
498+
except Exception as err:
499+
logger.error(str(err))
496500

497501
return True
498502

simvue/worker.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import datetime
22
import json
3+
import logging
34
import os
45
import psutil
56
import sys
@@ -12,6 +13,8 @@
1213
from .metrics import get_process_memory, get_process_cpu, get_gpu_metrics
1314
from .utilities import get_offline_directory, create_file
1415

16+
logger = logging.getLogger(__name__)
17+
1518
HEARTBEAT_INTERVAL = 60
1619
POLLING_INTERVAL = 20
1720
MAX_BUFFER_SEND = 5000
@@ -49,6 +52,7 @@ def __init__(self, metrics_queue, events_queue, shutdown_event, uuid, run_name,
4952
self._pid = pid
5053
if pid:
5154
self._processes = update_processes(psutil.Process(pid), [])
55+
logger.debug('Worker thread started')
5256

5357
def heartbeat(self):
5458
"""
@@ -68,8 +72,11 @@ def post(self, endpoint, data):
6872
else:
6973
unique_id = time.time()
7074
filename = f"{self._directory}/{endpoint}-{unique_id}"
71-
with open(filename, 'w') as fh:
72-
json.dump(data, fh)
75+
try:
76+
with open(filename, 'w') as fh:
77+
json.dump(data, fh)
78+
except Exception as err:
79+
logger.error('Got exception writing offline update for %s: %s', endpoint, str(err))
7380

7481
def run(self):
7582
"""
@@ -147,6 +154,7 @@ def run(self):
147154

148155
if self._shutdown_event.is_set() or not self._parent_thread.is_alive():
149156
if self._metrics_queue.empty() and self._events_queue.empty():
157+
logger.debug('Ending worker thread')
150158
sys.exit(0)
151159
else:
152160
counter = 0

0 commit comments

Comments
 (0)