Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ class FaultStorage {
/// @param description Human-readable description (only used for FAILED events)
/// @param source_id Reporting source identifier
/// @param timestamp Current time for tracking
/// @return true if this created a new fault entry, false if existing fault was updated
/// @return true if this is a new occurrence (new fault or reactivated CLEARED fault),
/// false if existing active fault was updated
virtual bool report_fault_event(const std::string & fault_code, uint8_t event_type, uint8_t severity,
const std::string & description, const std::string & source_id,
const rclcpp::Time & timestamp) = 0;
Expand Down
33 changes: 27 additions & 6 deletions src/ros2_medkit_fault_manager/src/fault_storage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,7 @@ DebounceConfig InMemoryFaultStorage::get_debounce_config() const {
}

void InMemoryFaultStorage::update_status(FaultState & state) {
// Don't update CLEARED faults
if (state.status == ros2_medkit_msgs::msg::Fault::STATUS_CLEARED) {
return;
}
// Note: CLEARED faults are handled in report_fault_event() before this is called

if (state.debounce_counter <= config_.confirmation_threshold) {
state.status = ros2_medkit_msgs::msg::Fault::STATUS_CONFIRMED;
Expand Down Expand Up @@ -107,9 +104,33 @@ bool InMemoryFaultStorage::report_fault_event(const std::string & fault_code, ui
// Existing fault - update
auto & state = it->second;

// Don't update CLEARED faults
// CLEARED faults can be reactivated by FAILED events
if (state.status == ros2_medkit_msgs::msg::Fault::STATUS_CLEARED) {
return false;
if (!is_failed) {
// PASSED events for CLEARED faults are ignored
return false;
}
// FAILED event reactivates the fault - reset debounce counter
state.debounce_counter = -1;
state.last_failed_time = timestamp;
state.last_occurred = timestamp;
state.reporting_sources.insert(source_id);
if (state.occurrence_count < std::numeric_limits<uint32_t>::max()) {
++state.occurrence_count;
}
if (severity > state.severity) {
state.severity = severity;
}
if (!description.empty()) {
state.description = description;
}
// Check for immediate CRITICAL confirmation
if (config_.critical_immediate_confirm && severity == ros2_medkit_msgs::msg::Fault::SEVERITY_CRITICAL) {
state.status = ros2_medkit_msgs::msg::Fault::STATUS_CONFIRMED;
} else {
update_status(state);
}
return true; // Reactivation treated as new occurrence for event publishing
}

state.last_occurred = timestamp;
Expand Down
14 changes: 11 additions & 3 deletions src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,9 +365,17 @@ bool SqliteFaultStorage::report_fault_event(const std::string & fault_code, uint
std::string current_status = check_stmt.column_text(3);
int32_t debounce_counter = static_cast<int32_t>(check_stmt.column_int(4));

// Don't update CLEARED faults
// CLEARED faults can be reactivated by FAILED events
bool is_reactivation = false;
if (current_status == ros2_medkit_msgs::msg::Fault::STATUS_CLEARED) {
return false;
if (!is_failed) {
// PASSED events for CLEARED faults are ignored
return false;
}
// FAILED event reactivates - reset debounce counter to 0 so FAILED branch
// decrements it to -1, then reuse the existing FAILED logic below
debounce_counter = 0;
is_reactivation = true;
}

if (is_failed) {
Expand Down Expand Up @@ -468,7 +476,7 @@ bool SqliteFaultStorage::report_fault_event(const std::string & fault_code, uint
}
}

return false; // Existing fault updated
return is_reactivation; // Reactivation treated as new occurrence for event publishing
}

// New fault - only create for FAILED events
Expand Down
79 changes: 75 additions & 4 deletions src/ros2_medkit_fault_manager/test/test_fault_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -343,9 +343,66 @@ TEST_F(FaultStorageTest, CriticalBypassCanBeDisabled) {
EXPECT_EQ(fault->status, Fault::STATUS_PREFAILED);
}

TEST_F(FaultStorageTest, ClearedFaultNotReconfirmed) {
TEST_F(FaultStorageTest, ClearedFaultCanBeReactivated) {
rclcpp::Clock clock;

// Report to confirm (with default threshold=-1, single report confirms)
bool is_new = storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR,
"Initial", "/node1", clock.now());
EXPECT_TRUE(is_new);

auto fault = storage_.get_fault("FAULT_1");
ASSERT_TRUE(fault.has_value());
EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED);
EXPECT_EQ(fault->occurrence_count, 1u);

// Clear the fault
storage_.clear_fault("FAULT_1");
fault = storage_.get_fault("FAULT_1");
ASSERT_TRUE(fault.has_value());
EXPECT_EQ(fault->status, Fault::STATUS_CLEARED);

// Report again - should reactivate
is_new = storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR,
"Reactivated", "/node2", clock.now());
EXPECT_TRUE(is_new); // Should return true like a new fault

fault = storage_.get_fault("FAULT_1");
ASSERT_TRUE(fault.has_value());
EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED); // Should be reconfirmed
EXPECT_EQ(fault->occurrence_count, 2u); // Should increment
EXPECT_EQ(fault->reporting_sources.size(), 2u); // Both sources
EXPECT_EQ(fault->description, "Reactivated"); // Updated description
}

TEST_F(FaultStorageTest, PassedEventForClearedFaultIgnored) {
rclcpp::Clock clock;

// Report and confirm
storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node1",
clock.now());

// Clear the fault
storage_.clear_fault("FAULT_1");

// PASSED event should be ignored for CLEARED fault
bool result =
storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_PASSED, 0, "", "/node1", clock.now());
EXPECT_FALSE(result);

auto fault = storage_.get_fault("FAULT_1");
ASSERT_TRUE(fault.has_value());
EXPECT_EQ(fault->status, Fault::STATUS_CLEARED); // Should stay cleared
}

TEST_F(FaultStorageTest, ClearedFaultReactivationRestartsDebounce) {
rclcpp::Clock clock;

// Set threshold to -3 to test debounce behavior
DebounceConfig config;
config.confirmation_threshold = -3;
storage_.set_debounce_config(config);

// Report 3 times to confirm
storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node1",
clock.now());
Expand All @@ -354,16 +411,30 @@ TEST_F(FaultStorageTest, ClearedFaultNotReconfirmed) {
storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node3",
clock.now());

auto fault = storage_.get_fault("FAULT_1");
ASSERT_TRUE(fault.has_value());
EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED);

// Clear the fault
storage_.clear_fault("FAULT_1");

// Report again
// Reactivate - should start in PREFAILED with counter=-1
storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node4",
clock.now());

auto fault = storage_.get_fault("FAULT_1");
fault = storage_.get_fault("FAULT_1");
ASSERT_TRUE(fault.has_value());
EXPECT_EQ(fault->status, Fault::STATUS_CLEARED); // Should stay cleared
EXPECT_EQ(fault->status, Fault::STATUS_PREFAILED); // Not yet confirmed, needs 2 more FAILED

// Report 2 more times to re-confirm
storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node5",
clock.now());
storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node6",
clock.now());

fault = storage_.get_fault("FAULT_1");
ASSERT_TRUE(fault.has_value());
EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED); // Now confirmed
}

TEST_F(FaultStorageTest, PassedEventIncrementsCounter) {
Expand Down
42 changes: 42 additions & 0 deletions src/ros2_medkit_fault_manager/test/test_integration.test.py
Original file line number Diff line number Diff line change
Expand Up @@ -914,6 +914,48 @@ def test_25_correlation_include_muted_false_excludes_details(self):

print('Default query excludes muted/cluster details as expected')

def test_26_cleared_fault_can_be_reactivated(self):
"""Test that a CLEARED fault can be reactivated by new FAILED events."""
fault_code = 'TEST_REACTIVATION'

# Report and confirm fault
request = ReportFault.Request()
request.fault_code = fault_code
request.event_type = ReportFault.Request.EVENT_FAILED
request.severity = Fault.SEVERITY_CRITICAL
request.description = 'Initial fault'
request.source_id = '/node1'
response = self._call_service(self.report_fault_client, request)
self.assertTrue(response.accepted)

# Clear the fault
clear_request = ClearFault.Request()
clear_request.fault_code = fault_code
clear_response = self._call_service(self.clear_fault_client, clear_request)
self.assertTrue(clear_response.success)

# Report again - should reactivate
request = ReportFault.Request()
request.fault_code = fault_code
request.event_type = ReportFault.Request.EVENT_FAILED
request.severity = Fault.SEVERITY_CRITICAL
request.description = 'Reactivated'
request.source_id = '/node2'
response = self._call_service(self.report_fault_client, request)
self.assertTrue(response.accepted)

# Verify fault is CONFIRMED again
get_request = GetFaults.Request()
get_request.statuses = [Fault.STATUS_CONFIRMED]
get_response = self._call_service(self.get_faults_client, get_request)

fault = next((f for f in get_response.faults if f.fault_code == fault_code), None)
self.assertIsNotNone(fault)
self.assertEqual(fault.status, Fault.STATUS_CONFIRMED)
self.assertEqual(fault.occurrence_count, 2)
self.assertEqual(len(fault.reporting_sources), 2)
print(f'Cleared fault reactivated: occurrence_count={fault.occurrence_count}')


@launch_testing.post_shutdown_test()
class TestFaultManagerShutdown(unittest.TestCase):
Expand Down
79 changes: 75 additions & 4 deletions src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,9 +390,66 @@ TEST_F(SqliteFaultStorageTest, CriticalSeverityBypassesDebounce) {
EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED);
}

TEST_F(SqliteFaultStorageTest, ClearedFaultNotReconfirmed) {
TEST_F(SqliteFaultStorageTest, ClearedFaultCanBeReactivated) {
rclcpp::Clock clock;

// Report to confirm (with default threshold=-1, single report confirms)
bool is_new = storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR,
"Initial", "/node1", clock.now());
EXPECT_TRUE(is_new);

auto fault = storage_->get_fault("FAULT_1");
ASSERT_TRUE(fault.has_value());
EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED);
EXPECT_EQ(fault->occurrence_count, 1u);

// Clear the fault
storage_->clear_fault("FAULT_1");
fault = storage_->get_fault("FAULT_1");
ASSERT_TRUE(fault.has_value());
EXPECT_EQ(fault->status, Fault::STATUS_CLEARED);

// Report again - should reactivate
is_new = storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR,
"Reactivated", "/node2", clock.now());
EXPECT_TRUE(is_new); // Should return true like a new fault

fault = storage_->get_fault("FAULT_1");
ASSERT_TRUE(fault.has_value());
EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED); // Should be reconfirmed
EXPECT_EQ(fault->occurrence_count, 2u); // Should increment
EXPECT_EQ(fault->reporting_sources.size(), 2u); // Both sources
EXPECT_EQ(fault->description, "Reactivated"); // Updated description
}

TEST_F(SqliteFaultStorageTest, PassedEventForClearedFaultIgnored) {
rclcpp::Clock clock;

// Report and confirm
storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node1",
clock.now());

// Clear the fault
storage_->clear_fault("FAULT_1");

// PASSED event should be ignored for CLEARED fault
bool result =
storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_PASSED, 0, "", "/node1", clock.now());
EXPECT_FALSE(result);

auto fault = storage_->get_fault("FAULT_1");
ASSERT_TRUE(fault.has_value());
EXPECT_EQ(fault->status, Fault::STATUS_CLEARED); // Should stay cleared
}

TEST_F(SqliteFaultStorageTest, ClearedFaultReactivationRestartsDebounce) {
rclcpp::Clock clock;

// Set threshold to -3 to test debounce behavior
DebounceConfig config;
config.confirmation_threshold = -3;
storage_->set_debounce_config(config);

// Report 3 times to confirm
storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node1",
clock.now());
Expand All @@ -401,16 +458,30 @@ TEST_F(SqliteFaultStorageTest, ClearedFaultNotReconfirmed) {
storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node3",
clock.now());

auto fault = storage_->get_fault("FAULT_1");
ASSERT_TRUE(fault.has_value());
EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED);

// Clear the fault
storage_->clear_fault("FAULT_1");

// Report again
// Reactivate - should start in PREFAILED with counter=-1
storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node4",
clock.now());

auto fault = storage_->get_fault("FAULT_1");
fault = storage_->get_fault("FAULT_1");
ASSERT_TRUE(fault.has_value());
EXPECT_EQ(fault->status, Fault::STATUS_CLEARED); // Should stay cleared
EXPECT_EQ(fault->status, Fault::STATUS_PREFAILED); // Not yet confirmed, needs 2 more FAILED

// Report 2 more times to re-confirm
storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node5",
clock.now());
storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node6",
clock.now());

fault = storage_->get_fault("FAULT_1");
ASSERT_TRUE(fault.has_value());
EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED); // Now confirmed
}

TEST_F(SqliteFaultStorageTest, ConfirmationPersistsAfterReopen) {
Expand Down