Skip to content

Commit 5041cef

Browse files
authored
feat(crons): Fan out check_missing task to each monitor_environment (#55924)
Currently this task has a time-limit of 15s, this can be very sensitive to database issues where things "slow down" and the task hits it's time-limit. This task currently works by finding all monitors that are past their expected check-in time and updates and marks the monitor as having missed a check-in. We can improve performance here by fanning out tasks for each monitor that it needs to mark as missed.
1 parent 9b865e3 commit 5041cef

File tree

2 files changed

+78
-42
lines changed

2 files changed

+78
-42
lines changed

src/sentry/monitors/tasks.py

Lines changed: 37 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -205,39 +205,45 @@ def check_missing(current_datetime: datetime):
205205
)
206206
metrics.gauge("sentry.monitors.tasks.check_missing.count", qs.count(), sample_rate=1.0)
207207
for monitor_environment in qs:
208-
try:
209-
logger.info(
210-
"monitor.missed-checkin", extra={"monitor_environment_id": monitor_environment.id}
211-
)
208+
check_missing_environment.delay(monitor_environment.id)
212209

213-
monitor = monitor_environment.monitor
214-
expected_time = monitor_environment.next_checkin
215210

216-
# add missed checkin.
217-
#
218-
# XXX(epurkhiser): The date_added is backdated so that this missed
219-
# check-in correctly reflects the time of when the checkin SHOULD
220-
# have happened. It is the same as the expected_time.
221-
MonitorCheckIn.objects.create(
222-
project_id=monitor_environment.monitor.project_id,
223-
monitor=monitor_environment.monitor,
224-
monitor_environment=monitor_environment,
225-
status=CheckInStatus.MISSED,
226-
date_added=expected_time,
227-
expected_time=expected_time,
228-
monitor_config=monitor.get_validated_config(),
229-
)
230-
mark_failed(
231-
monitor_environment,
232-
reason=MonitorFailure.MISSED_CHECKIN,
233-
occurrence_context={
234-
"expected_time": expected_time.strftime(SUBTITLE_DATETIME_FORMAT)
235-
if expected_time
236-
else expected_time
237-
},
238-
)
239-
except Exception:
240-
logger.exception("Exception in check_monitors - mark missed", exc_info=True)
211+
@instrumented_task(
212+
name="sentry.monitors.tasks.check_missing_environment",
213+
max_retries=0,
214+
)
215+
def check_missing_environment(monitor_environment_id: int):
216+
logger.info("monitor.missed-checkin", extra={"monitor_environment_id": monitor_environment_id})
217+
218+
monitor_environment = MonitorEnvironment.objects.select_related("monitor").get(
219+
id=monitor_environment_id
220+
)
221+
monitor = monitor_environment.monitor
222+
expected_time = monitor_environment.next_checkin
223+
224+
# add missed checkin.
225+
#
226+
# XXX(epurkhiser): The date_added is backdated so that this missed
227+
# check-in correctly reflects the time of when the checkin SHOULD
228+
# have happened. It is the same as the expected_time.
229+
MonitorCheckIn.objects.create(
230+
project_id=monitor_environment.monitor.project_id,
231+
monitor=monitor_environment.monitor,
232+
monitor_environment=monitor_environment,
233+
status=CheckInStatus.MISSED,
234+
date_added=expected_time,
235+
expected_time=expected_time,
236+
monitor_config=monitor.get_validated_config(),
237+
)
238+
mark_failed(
239+
monitor_environment,
240+
reason=MonitorFailure.MISSED_CHECKIN,
241+
occurrence_context={
242+
"expected_time": expected_time.strftime(SUBTITLE_DATETIME_FORMAT)
243+
if expected_time
244+
else expected_time
245+
},
246+
)
241247

242248

243249
@instrumented_task(

tests/sentry/monitors/test_tasks.py

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from unittest import mock
33

44
import msgpack
5+
import pytest
56
from arroyo.backends.kafka import KafkaPayload
67
from django.test import override_settings
78
from django.utils import timezone
@@ -18,6 +19,7 @@
1819
)
1920
from sentry.monitors.tasks import (
2021
check_missing,
22+
check_missing_environment,
2123
check_timeout,
2224
clock_pulse,
2325
try_monitor_tasks_trigger,
@@ -48,7 +50,8 @@ def make_ref_time():
4850

4951

5052
class MonitorTaskCheckMissingTest(TestCase):
51-
def test_missing_checkin(self):
53+
@mock.patch("sentry.monitors.tasks.check_missing_environment")
54+
def test_missing_checkin(self, check_missing_environment_mock):
5255
org = self.create_organization()
5356
project = self.create_project(organization=org)
5457

@@ -78,6 +81,14 @@ def test_missing_checkin(self):
7881

7982
check_missing(task_run_ts)
8083

84+
# assert that task is called for the specific environment
85+
assert check_missing_environment_mock.delay.call_count == 1
86+
assert check_missing_environment_mock.delay.mock_calls[0] == mock.call(
87+
monitor_environment.id
88+
)
89+
90+
check_missing_environment(monitor_environment.id)
91+
8192
# Monitor status is updated
8293
monitor_environment = MonitorEnvironment.objects.get(
8394
id=monitor_environment.id, status=MonitorStatus.MISSED_CHECKIN
@@ -98,7 +109,8 @@ def test_missing_checkin(self):
98109
).replace(second=0, microsecond=0)
99110
assert missed_checkin.monitor_config == monitor.config
100111

101-
def test_missing_checkin_with_margin(self):
112+
@mock.patch("sentry.monitors.tasks.check_missing_environment")
113+
def test_missing_checkin_with_margin(self, check_missing_environment_mock):
102114
org = self.create_organization()
103115
project = self.create_project(organization=org)
104116

@@ -132,6 +144,9 @@ def test_missing_checkin_with_margin(self):
132144
# No missed check-in generated as we're still within the check-in margin
133145
check_missing(task_run_ts)
134146

147+
# assert that task is not called for the specific environment
148+
assert check_missing_environment_mock.delay.call_count == 0
149+
135150
assert not MonitorEnvironment.objects.filter(
136151
id=monitor_environment.id, status=MonitorStatus.MISSED_CHECKIN
137152
).exists()
@@ -142,6 +157,14 @@ def test_missing_checkin_with_margin(self):
142157
# Missed check-in generated as clock now exceeds expected time plus margin
143158
check_missing(task_run_ts + timedelta(minutes=4))
144159

160+
# assert that task is called for the specific environment
161+
assert check_missing_environment_mock.delay.call_count == 1
162+
assert check_missing_environment_mock.delay.mock_calls[0] == mock.call(
163+
monitor_environment.id
164+
)
165+
166+
check_missing_environment(monitor_environment.id)
167+
145168
monitor_environment = MonitorEnvironment.objects.get(
146169
id=monitor_environment.id, status=MonitorStatus.MISSED_CHECKIN
147170
)
@@ -157,7 +180,7 @@ def test_missing_checkin_with_margin(self):
157180
)
158181

159182
# Missed checkins are back-dated to when the checkin was expected to
160-
# happpen. In this case the expected_time is equal to the date_added.
183+
# happen. In this case the expected_time is equal to the date_added.
161184
assert missed_check.date_added == (
162185
monitor_environment.last_checkin + timedelta(minutes=10)
163186
).replace(second=0, microsecond=0)
@@ -269,8 +292,8 @@ def test_not_missing_checkin(self):
269292
monitor_environment=monitor_environment.id, status=CheckInStatus.MISSED
270293
).exists()
271294

272-
@mock.patch("sentry.monitors.tasks.logger")
273-
def test_missed_exception_handling(self, logger):
295+
@mock.patch("sentry.monitors.tasks.check_missing_environment")
296+
def test_missed_exception_handling(self, check_missing_environment_mock):
274297
org = self.create_organization()
275298
project = self.create_project(organization=org)
276299

@@ -287,7 +310,7 @@ def test_missed_exception_handling(self, logger):
287310
"schedule": [-2, "minute"],
288311
},
289312
)
290-
MonitorEnvironment.objects.create(
313+
failing_monitor_environment = MonitorEnvironment.objects.create(
291314
monitor=exception_monitor,
292315
environment=self.environment,
293316
next_checkin=ts - timedelta(minutes=1),
@@ -301,7 +324,7 @@ def test_missed_exception_handling(self, logger):
301324
type=MonitorType.CRON_JOB,
302325
config={"schedule": "* * * * *"},
303326
)
304-
monitor_environment = MonitorEnvironment.objects.create(
327+
successful_monitor_environment = MonitorEnvironment.objects.create(
305328
monitor=monitor,
306329
environment=self.environment,
307330
next_checkin=ts - timedelta(minutes=1),
@@ -311,15 +334,22 @@ def test_missed_exception_handling(self, logger):
311334

312335
check_missing(task_run_ts)
313336

314-
# Logged the exception
315-
assert logger.exception.call_count == 1
337+
# assert that task is called for the specific environments
338+
assert check_missing_environment_mock.delay.call_count == 2
339+
340+
# assert failing monitor raises an error
341+
with pytest.raises(ValueError):
342+
check_missing_environment(failing_monitor_environment.id)
343+
344+
# assert regular monitor works
345+
check_missing_environment(successful_monitor_environment.id)
316346

317347
# We still marked a monitor as missed
318348
assert MonitorEnvironment.objects.filter(
319-
id=monitor_environment.id, status=MonitorStatus.MISSED_CHECKIN
349+
id=successful_monitor_environment.id, status=MonitorStatus.MISSED_CHECKIN
320350
).exists()
321351
assert MonitorCheckIn.objects.filter(
322-
monitor_environment=monitor_environment.id, status=CheckInStatus.MISSED
352+
monitor_environment=successful_monitor_environment.id, status=CheckInStatus.MISSED
323353
).exists()
324354

325355

0 commit comments

Comments
 (0)