diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp index 4fcb572..37646f5 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp @@ -110,7 +110,8 @@ class FaultStorage { /// @param description Human-readable description (only used for FAILED events) /// @param source_id Reporting source identifier /// @param timestamp Current time for tracking - /// @return true if this created a new fault entry, false if existing fault was updated + /// @return true if this is a new occurrence (new fault or reactivated CLEARED fault), + /// false if existing active fault was updated virtual bool report_fault_event(const std::string & fault_code, uint8_t event_type, uint8_t severity, const std::string & description, const std::string & source_id, const rclcpp::Time & timestamp) = 0; diff --git a/src/ros2_medkit_fault_manager/src/fault_storage.cpp b/src/ros2_medkit_fault_manager/src/fault_storage.cpp index 7eb1861..dce066b 100644 --- a/src/ros2_medkit_fault_manager/src/fault_storage.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_storage.cpp @@ -49,10 +49,7 @@ DebounceConfig InMemoryFaultStorage::get_debounce_config() const { } void InMemoryFaultStorage::update_status(FaultState & state) { - // Don't update CLEARED faults - if (state.status == ros2_medkit_msgs::msg::Fault::STATUS_CLEARED) { - return; - } + // Note: CLEARED faults are handled in report_fault_event() before this is called if (state.debounce_counter <= config_.confirmation_threshold) { state.status = ros2_medkit_msgs::msg::Fault::STATUS_CONFIRMED; @@ -107,9 +104,33 @@ bool InMemoryFaultStorage::report_fault_event(const std::string & fault_code, ui // Existing fault - update auto & state = it->second; - // Don't update CLEARED faults + // CLEARED faults can be reactivated by FAILED events if (state.status == ros2_medkit_msgs::msg::Fault::STATUS_CLEARED) { - return false; + if (!is_failed) { + // PASSED events for CLEARED faults are ignored + return false; + } + // FAILED event reactivates the fault - reset debounce counter + state.debounce_counter = -1; + state.last_failed_time = timestamp; + state.last_occurred = timestamp; + state.reporting_sources.insert(source_id); + if (state.occurrence_count < std::numeric_limits::max()) { + ++state.occurrence_count; + } + if (severity > state.severity) { + state.severity = severity; + } + if (!description.empty()) { + state.description = description; + } + // Check for immediate CRITICAL confirmation + if (config_.critical_immediate_confirm && severity == ros2_medkit_msgs::msg::Fault::SEVERITY_CRITICAL) { + state.status = ros2_medkit_msgs::msg::Fault::STATUS_CONFIRMED; + } else { + update_status(state); + } + return true; // Reactivation treated as new occurrence for event publishing } state.last_occurred = timestamp; diff --git a/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp b/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp index 102f82a..250e96c 100644 --- a/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp +++ b/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp @@ -365,9 +365,17 @@ bool SqliteFaultStorage::report_fault_event(const std::string & fault_code, uint std::string current_status = check_stmt.column_text(3); int32_t debounce_counter = static_cast(check_stmt.column_int(4)); - // Don't update CLEARED faults + // CLEARED faults can be reactivated by FAILED events + bool is_reactivation = false; if (current_status == ros2_medkit_msgs::msg::Fault::STATUS_CLEARED) { - return false; + if (!is_failed) { + // PASSED events for CLEARED faults are ignored + return false; + } + // FAILED event reactivates - reset debounce counter to 0 so FAILED branch + // decrements it to -1, then reuse the existing FAILED logic below + debounce_counter = 0; + is_reactivation = true; } if (is_failed) { @@ -468,7 +476,7 @@ bool SqliteFaultStorage::report_fault_event(const std::string & fault_code, uint } } - return false; // Existing fault updated + return is_reactivation; // Reactivation treated as new occurrence for event publishing } // New fault - only create for FAILED events diff --git a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp index ebc63fe..22714a4 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp @@ -343,9 +343,66 @@ TEST_F(FaultStorageTest, CriticalBypassCanBeDisabled) { EXPECT_EQ(fault->status, Fault::STATUS_PREFAILED); } -TEST_F(FaultStorageTest, ClearedFaultNotReconfirmed) { +TEST_F(FaultStorageTest, ClearedFaultCanBeReactivated) { rclcpp::Clock clock; + // Report to confirm (with default threshold=-1, single report confirms) + bool is_new = storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, + "Initial", "/node1", clock.now()); + EXPECT_TRUE(is_new); + + auto fault = storage_.get_fault("FAULT_1"); + ASSERT_TRUE(fault.has_value()); + EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED); + EXPECT_EQ(fault->occurrence_count, 1u); + + // Clear the fault + storage_.clear_fault("FAULT_1"); + fault = storage_.get_fault("FAULT_1"); + ASSERT_TRUE(fault.has_value()); + EXPECT_EQ(fault->status, Fault::STATUS_CLEARED); + + // Report again - should reactivate + is_new = storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, + "Reactivated", "/node2", clock.now()); + EXPECT_TRUE(is_new); // Should return true like a new fault + + fault = storage_.get_fault("FAULT_1"); + ASSERT_TRUE(fault.has_value()); + EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED); // Should be reconfirmed + EXPECT_EQ(fault->occurrence_count, 2u); // Should increment + EXPECT_EQ(fault->reporting_sources.size(), 2u); // Both sources + EXPECT_EQ(fault->description, "Reactivated"); // Updated description +} + +TEST_F(FaultStorageTest, PassedEventForClearedFaultIgnored) { + rclcpp::Clock clock; + + // Report and confirm + storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node1", + clock.now()); + + // Clear the fault + storage_.clear_fault("FAULT_1"); + + // PASSED event should be ignored for CLEARED fault + bool result = + storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_PASSED, 0, "", "/node1", clock.now()); + EXPECT_FALSE(result); + + auto fault = storage_.get_fault("FAULT_1"); + ASSERT_TRUE(fault.has_value()); + EXPECT_EQ(fault->status, Fault::STATUS_CLEARED); // Should stay cleared +} + +TEST_F(FaultStorageTest, ClearedFaultReactivationRestartsDebounce) { + rclcpp::Clock clock; + + // Set threshold to -3 to test debounce behavior + DebounceConfig config; + config.confirmation_threshold = -3; + storage_.set_debounce_config(config); + // Report 3 times to confirm storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node1", clock.now()); @@ -354,16 +411,30 @@ TEST_F(FaultStorageTest, ClearedFaultNotReconfirmed) { storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node3", clock.now()); + auto fault = storage_.get_fault("FAULT_1"); + ASSERT_TRUE(fault.has_value()); + EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED); + // Clear the fault storage_.clear_fault("FAULT_1"); - // Report again + // Reactivate - should start in PREFAILED with counter=-1 storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node4", clock.now()); - auto fault = storage_.get_fault("FAULT_1"); + fault = storage_.get_fault("FAULT_1"); ASSERT_TRUE(fault.has_value()); - EXPECT_EQ(fault->status, Fault::STATUS_CLEARED); // Should stay cleared + EXPECT_EQ(fault->status, Fault::STATUS_PREFAILED); // Not yet confirmed, needs 2 more FAILED + + // Report 2 more times to re-confirm + storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node5", + clock.now()); + storage_.report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node6", + clock.now()); + + fault = storage_.get_fault("FAULT_1"); + ASSERT_TRUE(fault.has_value()); + EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED); // Now confirmed } TEST_F(FaultStorageTest, PassedEventIncrementsCounter) { diff --git a/src/ros2_medkit_fault_manager/test/test_integration.test.py b/src/ros2_medkit_fault_manager/test/test_integration.test.py index 9b6d5b8..cad919f 100644 --- a/src/ros2_medkit_fault_manager/test/test_integration.test.py +++ b/src/ros2_medkit_fault_manager/test/test_integration.test.py @@ -914,6 +914,48 @@ def test_25_correlation_include_muted_false_excludes_details(self): print('Default query excludes muted/cluster details as expected') + def test_26_cleared_fault_can_be_reactivated(self): + """Test that a CLEARED fault can be reactivated by new FAILED events.""" + fault_code = 'TEST_REACTIVATION' + + # Report and confirm fault + request = ReportFault.Request() + request.fault_code = fault_code + request.event_type = ReportFault.Request.EVENT_FAILED + request.severity = Fault.SEVERITY_CRITICAL + request.description = 'Initial fault' + request.source_id = '/node1' + response = self._call_service(self.report_fault_client, request) + self.assertTrue(response.accepted) + + # Clear the fault + clear_request = ClearFault.Request() + clear_request.fault_code = fault_code + clear_response = self._call_service(self.clear_fault_client, clear_request) + self.assertTrue(clear_response.success) + + # Report again - should reactivate + request = ReportFault.Request() + request.fault_code = fault_code + request.event_type = ReportFault.Request.EVENT_FAILED + request.severity = Fault.SEVERITY_CRITICAL + request.description = 'Reactivated' + request.source_id = '/node2' + response = self._call_service(self.report_fault_client, request) + self.assertTrue(response.accepted) + + # Verify fault is CONFIRMED again + get_request = GetFaults.Request() + get_request.statuses = [Fault.STATUS_CONFIRMED] + get_response = self._call_service(self.get_faults_client, get_request) + + fault = next((f for f in get_response.faults if f.fault_code == fault_code), None) + self.assertIsNotNone(fault) + self.assertEqual(fault.status, Fault.STATUS_CONFIRMED) + self.assertEqual(fault.occurrence_count, 2) + self.assertEqual(len(fault.reporting_sources), 2) + print(f'Cleared fault reactivated: occurrence_count={fault.occurrence_count}') + @launch_testing.post_shutdown_test() class TestFaultManagerShutdown(unittest.TestCase): diff --git a/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp b/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp index 6db1c4f..2e93a60 100644 --- a/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp +++ b/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp @@ -390,9 +390,66 @@ TEST_F(SqliteFaultStorageTest, CriticalSeverityBypassesDebounce) { EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED); } -TEST_F(SqliteFaultStorageTest, ClearedFaultNotReconfirmed) { +TEST_F(SqliteFaultStorageTest, ClearedFaultCanBeReactivated) { rclcpp::Clock clock; + // Report to confirm (with default threshold=-1, single report confirms) + bool is_new = storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, + "Initial", "/node1", clock.now()); + EXPECT_TRUE(is_new); + + auto fault = storage_->get_fault("FAULT_1"); + ASSERT_TRUE(fault.has_value()); + EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED); + EXPECT_EQ(fault->occurrence_count, 1u); + + // Clear the fault + storage_->clear_fault("FAULT_1"); + fault = storage_->get_fault("FAULT_1"); + ASSERT_TRUE(fault.has_value()); + EXPECT_EQ(fault->status, Fault::STATUS_CLEARED); + + // Report again - should reactivate + is_new = storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, + "Reactivated", "/node2", clock.now()); + EXPECT_TRUE(is_new); // Should return true like a new fault + + fault = storage_->get_fault("FAULT_1"); + ASSERT_TRUE(fault.has_value()); + EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED); // Should be reconfirmed + EXPECT_EQ(fault->occurrence_count, 2u); // Should increment + EXPECT_EQ(fault->reporting_sources.size(), 2u); // Both sources + EXPECT_EQ(fault->description, "Reactivated"); // Updated description +} + +TEST_F(SqliteFaultStorageTest, PassedEventForClearedFaultIgnored) { + rclcpp::Clock clock; + + // Report and confirm + storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node1", + clock.now()); + + // Clear the fault + storage_->clear_fault("FAULT_1"); + + // PASSED event should be ignored for CLEARED fault + bool result = + storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_PASSED, 0, "", "/node1", clock.now()); + EXPECT_FALSE(result); + + auto fault = storage_->get_fault("FAULT_1"); + ASSERT_TRUE(fault.has_value()); + EXPECT_EQ(fault->status, Fault::STATUS_CLEARED); // Should stay cleared +} + +TEST_F(SqliteFaultStorageTest, ClearedFaultReactivationRestartsDebounce) { + rclcpp::Clock clock; + + // Set threshold to -3 to test debounce behavior + DebounceConfig config; + config.confirmation_threshold = -3; + storage_->set_debounce_config(config); + // Report 3 times to confirm storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node1", clock.now()); @@ -401,16 +458,30 @@ TEST_F(SqliteFaultStorageTest, ClearedFaultNotReconfirmed) { storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node3", clock.now()); + auto fault = storage_->get_fault("FAULT_1"); + ASSERT_TRUE(fault.has_value()); + EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED); + // Clear the fault storage_->clear_fault("FAULT_1"); - // Report again + // Reactivate - should start in PREFAILED with counter=-1 storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node4", clock.now()); - auto fault = storage_->get_fault("FAULT_1"); + fault = storage_->get_fault("FAULT_1"); ASSERT_TRUE(fault.has_value()); - EXPECT_EQ(fault->status, Fault::STATUS_CLEARED); // Should stay cleared + EXPECT_EQ(fault->status, Fault::STATUS_PREFAILED); // Not yet confirmed, needs 2 more FAILED + + // Report 2 more times to re-confirm + storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node5", + clock.now()); + storage_->report_fault_event("FAULT_1", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "Test", "/node6", + clock.now()); + + fault = storage_->get_fault("FAULT_1"); + ASSERT_TRUE(fault.has_value()); + EXPECT_EQ(fault->status, Fault::STATUS_CONFIRMED); // Now confirmed } TEST_F(SqliteFaultStorageTest, ConfirmationPersistsAfterReopen) {