diff --git a/clusterman/draining/queue.py b/clusterman/draining/queue.py
index 69e4369d3..fce4f57fe 100644
--- a/clusterman/draining/queue.py
+++ b/clusterman/draining/queue.py
@@ -395,7 +395,12 @@ def terminate_host(host: Host) -> None:
     logger.info(f'Terminating: {host.instance_id}')
     resource_group_class = RESOURCE_GROUPS[host.sender]
     resource_group = resource_group_class(host.group_id)
-    resource_group.terminate_instances_by_id([host.instance_id])
+
+    try:
+        resource_group.terminate_instances_by_id([host.instance_id])
+    except Exception:
+        # Best-effort: do not propagate the failure, but keep the traceback in the log.
+        logger.exception(f'Failed to terminate host: {host.hostname}, possibly already terminated.')
 
 
 def main(args: argparse.Namespace) -> None:
diff --git a/tests/draining/queue_test.py b/tests/draining/queue_test.py
index f5ee3e380..0e57258df 100644
--- a/tests/draining/queue_test.py
+++ b/tests/draining/queue_test.py
@@ -518,6 +518,22 @@ def test_terminate_host():
         mock_sfr.return_value.terminate_instances_by_id.assert_called_with(['i123'])
 
 
+def test_terminate_host_failure_no_crash():
+    mock_host = mock.Mock(instance_id='i123', sender='sfr', group_id='sfr123')
+
+    mock_sfr = mock.Mock()
+    # terminate_host() instantiates the resource group class, so the failure has to be
+    # raised by the instance (return_value); on the class mock it would never fire.
+    mock_sfr.return_value.terminate_instances_by_id.side_effect = Exception()
+
+    with mock.patch.dict(
+        'clusterman.draining.queue.RESOURCE_GROUPS', {'sfr': mock_sfr}, clear=True
+    ):
+        terminate_host(mock_host)
+        mock_sfr.assert_called_with('sfr123')
+        mock_sfr.return_value.terminate_instances_by_id.assert_called_with(['i123'])
+
+
 def test_host_from_instance_id():
     with mock.patch(
         'clusterman.draining.queue.ec2_describe_instances', autospec=True,