Skip to content

Commit ea5ff03

Browse files
authored
Log panic file contents on worker failure (#294)
1 parent 312f23b commit ea5ff03

File tree

3 files changed

+133
-3
lines changed

3 files changed

+133
-3
lines changed

src/ansys/hps/data_transfer/client/binary.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,8 @@ class BinaryConfig:
147147
Whether to ignore SSL certificate verification.
148148
debug: bool, default: False
149149
Whether to enable debug logging.
150+
max_restarts: int, default: 5
151+
Maximum number of times to restart the worker if it crashes.
150152
"""
151153

152154
def __init__(
@@ -169,6 +171,7 @@ def __init__(
169171
debug: bool = False,
170172
auth_type: str = None,
171173
env: dict | None = None,
174+
max_restarts: int = 5,
172175
):
173176
"""Initialize the BinaryConfig class object."""
174177
self.data_transfer_url = data_transfer_url
@@ -191,6 +194,7 @@ def __init__(
191194
self._env = env or {}
192195
self.insecure = insecure
193196
self.auth_type = auth_type
197+
self.max_restarts = max_restarts
194198

195199
self._on_token_update = None
196200
self._on_process_died = None
@@ -388,6 +392,7 @@ def _log_output(self):
388392
# log.debug("Worker log output stopped")
389393

390394
def _monitor(self):
395+
restart_count = 0 # Initialize a counter for restarts
391396
while not self._stop.is_set():
392397
if self._process is None:
393398
self._prepare()
@@ -414,18 +419,23 @@ def _monitor(self):
414419
else:
415420
ret_code = self._process.poll()
416421
if ret_code is not None and ret_code != 0:
422+
restart_count += 1 # Increment the restart counter
423+
if restart_count > self.config.max_restarts:
424+
log.error(f"Worker exceeded maximum restart attempts ({self.config.max_restarts}). Stopping...")
425+
break # Exit the loop after exceeding the restart limit
426+
417427
log.warning(f"Worker exited with code {ret_code}, restarting ...")
418428
self._process = None
419429
self._prepared.clear()
420430
if self.config._on_process_died is not None:
421431
self.config._on_process_died(ret_code)
422432
time.sleep(1.0)
423433
continue
424-
# elif self._config.debug:
425-
# log.debug(f"Worker running ...")
434+
# Reset restart_count if the worker is running successfully
435+
restart_count = 0
426436

427437
time.sleep(self._config.monitor_interval)
428-
# log.debug("Worker monitor stopped")
438+
log.debug("Worker monitor stopped")
429439

430440
def _prepare(self):
431441
if self._config._selected_port is None:

src/ansys/hps/data_transfer/client/client.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,7 @@ def __init__(
220220

221221
self._session = None
222222
self.binary = None
223+
self.panic_file = None
223224

224225
self._features = None
225226
self._api_key = None
@@ -538,6 +539,37 @@ def _adjust_config(self):
538539
}
539540
self._bin_config.env.update({k: v for k, v in env.items() if k not in os.environ})
540541

542+
def _fetch_panic_file(self, resp):
543+
"""Extract and log the panic file location from the response."""
544+
if resp.status_code == 200:
545+
self.panic_file = resp.json().get("debug", {}).get("panic_file", None)
546+
log.debug(f"Worker panic file: {self.panic_file}")
547+
548+
def _panic_file_contents(self):
549+
"""Read and log the contents of the panic file if it exists."""
550+
# if the file exists and the size of the file is > 0,
551+
# read and log its content
552+
if self.panic_file and os.path.exists(self.panic_file):
553+
try:
554+
if os.path.getsize(self.panic_file) > 0:
555+
with open(self.panic_file) as f:
556+
# Read the file line by line
557+
lines = f.readlines()
558+
message = []
559+
for line in lines:
560+
# Check for empty lines to split the message
561+
if line.strip() == "":
562+
log.error(f"Worker panic file content:\n{''.join(message)}")
563+
message = [] # Reset the message buffer
564+
else:
565+
message.append(line)
566+
567+
# Log any remaining content after the last empty line
568+
if message:
569+
log.error(f"Worker panic file content:\n{''.join(message)}")
570+
except Exception as panic_ex:
571+
log.debug(f"Failed to read panic file: {panic_ex}")
572+
541573

542574
class AsyncClient(ClientBase):
543575
"""Provides an async interface to the Python client to the HPS data transfer APIs."""
@@ -568,6 +600,9 @@ def __setstate__(self, state):
568600
async def start(self):
569601
"""Start the async binary worker."""
570602
super().start()
603+
# grab location of panic file
604+
resp = await self.session.get("/")
605+
self._fetch_panic_file(resp)
571606
self._monitor_task = asyncio.create_task(self._monitor())
572607

573608
async def stop(self, wait=5.0):
@@ -632,6 +667,8 @@ async def _monitor(self):
632667
if self.binary_config.debug:
633668
log.debug("URL: %s", self.base_api_url)
634669
log.debug(traceback.format_exc())
670+
# Before marking it as failed check if there is a panic file
671+
self._panic_file_contents()
635672
self._monitor_state.mark_failed(exc=ex, binary=self.binary)
636673
continue
637674

@@ -675,6 +712,9 @@ def start(self):
675712
self._monitor_thread = threading.Thread(
676713
target=self._monitor, args=(), daemon=True, name="worker_status_monitor"
677714
)
715+
# grab location of panic file
716+
resp = self.session.get("/")
717+
self._fetch_panic_file(resp)
678718
self._monitor_thread.start()
679719

680720
def stop(self, wait=5.0):
@@ -733,6 +773,8 @@ def _monitor(self):
733773
if self.binary_config.debug:
734774
log.debug("URL: %s", self.base_api_url)
735775
log.debug(traceback.format_exc())
776+
# Before marking it as failed check if there is a panic file
777+
self._panic_file_contents()
736778
self._monitor_state.mark_failed(exc=ex, binary=self.binary)
737779
continue
738780

tests/test_client.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Copyright (C) 2024 - 2025 ANSYS, Inc. and/or its affiliates.
2+
# SPDX-License-Identifier: MIT
3+
#
4+
#
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
#
12+
# The above copyright notice and this permission notice shall be included in all
13+
# copies or substantial portions of the Software.
14+
#
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
23+
"""This module contains tests for verifying the functionality of the Client class"""
24+
25+
import unittest
26+
from unittest.mock import MagicMock, mock_open, patch
27+
28+
from ansys.hps.data_transfer.client.client import ClientBase
29+
30+
31+
class TestClientBase(unittest.TestCase):
32+
"""Test suite for the ClientBase class."""
33+
34+
def setUp(self):
35+
"""Set up the ClientBase instance for testing."""
36+
self.client = ClientBase()
37+
self.client.panic_file = None
38+
39+
@patch("ansys.hps.data_transfer.client.client.log")
40+
def test_fetch_panic_file(self, mock_log):
41+
"""Test the _fetch_panic_file method."""
42+
# Mock the response object
43+
mock_resp = MagicMock()
44+
mock_resp.status_code = 200
45+
mock_resp.json.return_value = {"debug": {"panic_file": "/path/to/panic_file.log"}}
46+
47+
# Call the method
48+
self.client._fetch_panic_file(mock_resp)
49+
50+
# Assertions
51+
assert self.client.panic_file == "/path/to/panic_file.log"
52+
mock_log.debug.assert_called_with("Worker panic file: /path/to/panic_file.log")
53+
54+
@patch("os.path.exists", return_value=True)
55+
@patch("os.path.getsize", return_value=100)
56+
@patch(
57+
"builtins.open",
58+
new_callable=mock_open,
59+
read_data="Error: Something went wrong\n\nDetails: Invalid configuration\n\n",
60+
)
61+
@patch("ansys.hps.data_transfer.client.client.log")
62+
def test_panic_file_contents(self, mock_log, mock_open_file, mock_getsize, mock_exists):
63+
"""Test the _panic_file_contents method."""
64+
# Set the panic file path
65+
self.client.panic_file = "/path/to/panic_file.log"
66+
67+
# Call the method
68+
self.client._panic_file_contents()
69+
70+
# Assertions
71+
mock_exists.assert_called_once_with("/path/to/panic_file.log")
72+
mock_getsize.assert_called_once_with("/path/to/panic_file.log")
73+
mock_log.error.assert_any_call("Worker panic file content:\nError: Something went wrong\n")
74+
mock_log.error.assert_any_call("Worker panic file content:\nDetails: Invalid configuration\n")
75+
76+
77+
if __name__ == "__main__":
78+
unittest.main()

0 commit comments

Comments
 (0)