From 087e7c3d1cd437f9776e01d35b63ccbd71a76d10 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Wed, 16 Jun 2021 17:36:35 +0200 Subject: [PATCH 1/4] Builds: check celery and redis before marking a build as stale Previously, we were checking only if the build has been running for a long-long time (2hs) before marking it as stale. This new approach tries to reduce this time to 15m and adds extra checks on top of that: - the build has been running for more than 15 minutes - the build is not being executed by Celery (celery.inspect.active) - the build is not queued in Redis queues When all of these are `True` we mark the build as stale. --- readthedocs/projects/tasks/utils.py | 142 ++++++++++++++++++++++------ readthedocs/settings/base.py | 4 +- 2 files changed, 115 insertions(+), 31 deletions(-) diff --git a/readthedocs/projects/tasks/utils.py b/readthedocs/projects/tasks/utils.py index 202a4f55c30..60b7999a188 100644 --- a/readthedocs/projects/tasks/utils.py +++ b/readthedocs/projects/tasks/utils.py @@ -1,11 +1,11 @@ +import base64 import datetime import os import shutil - -from celery.worker.request import Request +import redis import structlog - +from celery.worker.request import Request from django.db.models import Q from django.utils import timezone from django.utils.translation import gettext_lazy as _ @@ -78,42 +78,126 @@ def clean_project_resources(project, version=None): ) -@app.task() +@app.task(queue="web") def finish_inactive_builds(): """ Finish inactive builds. - A build is consider inactive if it's not in ``FINISHED`` state and it has been - "running" for more time that the allowed one (``Project.container_time_limit`` - or ``DOCKER_LIMITS['time']`` plus a 20% of it). + A build is consider inactive if all the followings are true: + + - it's not in ``FINISHED`` state + - it was created +15 minutes ago + - there is not task queued for it on redis + - celery is not currently executing it - These inactive builds will be marked as ``success`` and ``FINISHED`` with an - ``error`` to be communicated to the user. + These inactive builds will be marked as ``success=False`` and ``FINISHED`` with an + ``error`` to communicate this problem to the user. """ - # TODO similar to the celery task time limit, we can't infer this from - # Docker settings anymore, because Docker settings are determined on the - # build servers dynamically. - # time_limit = int(DOCKER_LIMITS['time'] * 1.2) - # Set time as maximum celery task time limit + 5m - time_limit = 7200 + 300 + time_limit = 15 * 60 # 15 minutes delta = datetime.timedelta(seconds=time_limit) query = ( ~Q(state=BUILD_STATE_FINISHED) & Q(date__lte=timezone.now() - delta) ) - builds_finished = 0 + stale_build_pks = [] builds = Build.objects.filter(query)[:50] - for build in builds: - - if build.project.container_time_limit: - custom_delta = datetime.timedelta( - seconds=int(build.project.container_time_limit), - ) - if build.date + custom_delta > timezone.now(): - # Do not mark as FINISHED builds with a custom time limit that wasn't - # expired yet (they are still building the project version) - continue + redis_client = redis.Redis.from_url(settings.BROKER_URL) + log.info("Builds not yet finished to check for stale.", count=builds.count()) + for build in builds: + build_stale = True + + # 1. check if it's being executed by celery + # + # Ask Celery for all the tasks their workers are running and filter + # them by `update_docs_task` only. Over those tasks we check if the + # argument `build_pk` is the same than the build object we are + # checking. In case it matches, we mark the build as NOT stale. + tasks_running = app.control.inspect().active().items() + log.debug( + "Celery active tasks running.", + tasks=tasks_running, + ) + for queue, tasks in tasks_running: + for task in tasks: + try: + if ( + task.get("name") + != "readthedocs.projects.tasks.builds.update_docs_task" + ): + log.debug( + "Skipping not build task. task=%s", + task.get("name"), + ) + continue + + task_build_pk = task.get("kwargs", {}).get("build_id") + log.info( + "Task running.", + task=task.get("name"), + build_id=task_build_pk, + ) + + if task_build_pk == build.pk: + # The build is not stale, it's being executed + log.info( + "Build not stale. Found as an active task on Celery.", + build_id=build.pk, + ) + build_stale = False + break + except Exception: + log.exception( + "There was an error accesssing the Celery task inspect." + ) + + if not build_stale: + # Continue with the following build that matches the filter + break + + # 2. check if it's queued on redis + # + # Ask Redis for all the queues starting with `build*` (build:default, + # build:large). Then, we check if `build_id` queued matches with the + # build object we are checking. In that case we mark the build as NOT + # stale. + for queue in redis_client.keys("build*"): + log.debug("Redis queue with queued tasks.", queue=queue) + for task in redis_client.lrange(queue, 0, -1): + try: + task = json.loads(task) + body = json.loads(base64.b64decode(task["body"])) + # ``body`` is a 3-element list + # [ + # [int], + # {'record': bool, 'force': bool, 'commit': None, 'build_id': int}, + # {'callbacks': None, 'errbacks': None, 'chain': None, 'chord': None}, + # ] + log.debug("Redis queue task.", queue=queue, body=body) + + build_pk = body[1]["build_pk"] + if build_pk == build.pk: + # The build is not stale, it's queued to be executed + log.info( + "Build not stale. Found as a queued task in Redis.", + build_id=build.pk, + ) + build_stale = False + break + except Exception: + log.exception( + "There was an error accesssing the Celery task from Redis." + ) + + if not build_stale: + # Continue with the following build that matches the filter + break + + if build_stale: + stale_build_pks.append(build.pk) + + for build_pk in stale_build_pks: + build = Build.objects.get(pk=build_pk) build.success = False build.state = BUILD_STATE_FINISHED build.error = _( @@ -122,11 +206,11 @@ def finish_inactive_builds(): 'request with and reference this build id ({}).'.format(build.pk), ) build.save() - builds_finished += 1 - log.info( + log.warning( 'Builds marked as "Terminated due inactivity".', - count=builds_finished, + count=len(stale_build_pks), + build_ids=stale_build_pks, ) diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py index 738e916a446..6acb6383cdf 100644 --- a/readthedocs/settings/base.py +++ b/readthedocs/settings/base.py @@ -394,9 +394,9 @@ def TEMPLATES(self): CELERY_DEFAULT_QUEUE = 'celery' CELERYBEAT_SCHEDULE = { - 'quarter-finish-inactive-builds': { + 'every-5-minutes-finish-inactive-builds': { 'task': 'readthedocs.projects.tasks.utils.finish_inactive_builds', - 'schedule': crontab(minute='*/15'), + 'schedule': crontab(minute='*/5'), 'options': {'queue': 'web'}, }, 'every-three-hour-clear-persistent-messages': { From 85f529add38b3e33a8c330c5907c4369fc3e2e84 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Wed, 6 Oct 2021 11:13:03 +0200 Subject: [PATCH 2/4] Delete non-working test --- readthedocs/rtd_tests/tests/test_project.py | 63 --------------------- 1 file changed, 63 deletions(-) diff --git a/readthedocs/rtd_tests/tests/test_project.py b/readthedocs/rtd_tests/tests/test_project.py index 2903a82a191..d76a1c0d39c 100644 --- a/readthedocs/rtd_tests/tests/test_project.py +++ b/readthedocs/rtd_tests/tests/test_project.py @@ -549,66 +549,3 @@ def test_user_can_change_project_with_same_lang(self): ) self.assertEqual(resp.status_code, 200) self.assertNotContains(resp, 'There is already a') - - -class TestFinishInactiveBuildsTask(TestCase): - fixtures = ['eric', 'test_data'] - - def setUp(self): - self.client.login(username='eric', password='test') - self.pip = Project.objects.get(slug='pip') - - self.taggit = Project.objects.get(slug='taggit') - self.taggit.container_time_limit = 7200 # 2 hours - self.taggit.save() - - # Build just started with the default time - self.build_1 = Build.objects.create( - project=self.pip, - version=self.pip.get_stable_version(), - state=BUILD_STATE_CLONING, - ) - - # Build started an hour ago with default time - self.build_2 = Build.objects.create( - project=self.pip, - version=self.pip.get_stable_version(), - state=BUILD_STATE_TRIGGERED, - ) - self.build_2.date = ( - timezone.now() - datetime.timedelta(hours=1) - ) - self.build_2.save() - - # Build started an hour ago with custom time (2 hours) - self.build_3 = Build.objects.create( - project=self.taggit, - version=self.taggit.get_stable_version(), - state=BUILD_STATE_TRIGGERED, - ) - self.build_3.date = ( - timezone.now() - datetime.timedelta(hours=1) - ) - self.build_3.save() - - @pytest.mark.xfail(reason='Fails while we work out Docker time limits', strict=True) - def test_finish_inactive_builds_task(self): - finish_inactive_builds() - - # Legitimate build (just started) not finished - self.build_1.refresh_from_db() - self.assertTrue(self.build_1.success) - self.assertEqual(self.build_1.error, '') - self.assertEqual(self.build_1.state, BUILD_STATE_CLONING) - - # Build with default time finished - self.build_2.refresh_from_db() - self.assertFalse(self.build_2.success) - self.assertNotEqual(self.build_2.error, '') - self.assertEqual(self.build_2.state, BUILD_STATE_FINISHED) - - # Build with custom time not finished - self.build_3.refresh_from_db() - self.assertTrue(self.build_3.success) - self.assertEqual(self.build_3.error, '') - self.assertEqual(self.build_3.state, BUILD_STATE_TRIGGERED) From fefbe62233eaf1e6cd8f342f81230b649779986c Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Thu, 3 Mar 2022 17:08:19 +0100 Subject: [PATCH 3/4] Minor fixes --- readthedocs/projects/tasks/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/readthedocs/projects/tasks/utils.py b/readthedocs/projects/tasks/utils.py index 60b7999a188..e75f6f99d63 100644 --- a/readthedocs/projects/tasks/utils.py +++ b/readthedocs/projects/tasks/utils.py @@ -6,6 +6,7 @@ import redis import structlog from celery.worker.request import Request +from django.conf import settings from django.db.models import Q from django.utils import timezone from django.utils.translation import gettext_lazy as _ @@ -175,7 +176,7 @@ def finish_inactive_builds(): # ] log.debug("Redis queue task.", queue=queue, body=body) - build_pk = body[1]["build_pk"] + build_pk = body[1]["build_id"] if build_pk == build.pk: # The build is not stale, it's queued to be executed log.info( From 97aee88328fc6347997394852031ca901daeb7c0 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Thu, 3 Mar 2022 17:14:45 +0100 Subject: [PATCH 4/4] Lint --- readthedocs/projects/tasks/utils.py | 1 + readthedocs/rtd_tests/tests/test_project.py | 6 +----- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/readthedocs/projects/tasks/utils.py b/readthedocs/projects/tasks/utils.py index e75f6f99d63..ada34366190 100644 --- a/readthedocs/projects/tasks/utils.py +++ b/readthedocs/projects/tasks/utils.py @@ -1,5 +1,6 @@ import base64 import datetime +import json import os import shutil diff --git a/readthedocs/rtd_tests/tests/test_project.py b/readthedocs/rtd_tests/tests/test_project.py index d76a1c0d39c..0a9c83c4306 100644 --- a/readthedocs/rtd_tests/tests/test_project.py +++ b/readthedocs/rtd_tests/tests/test_project.py @@ -1,13 +1,10 @@ -import datetime import json from unittest import mock from unittest.mock import patch -import pytest from django.contrib.auth.models import User from django.forms.models import model_to_dict from django.test import TestCase -from django.utils import timezone from django_dynamic_fixture import get from rest_framework.reverse import reverse @@ -19,12 +16,11 @@ LATEST, TAG, ) -from readthedocs.builds.models import Build, Version +from readthedocs.builds.models import Version from readthedocs.oauth.services import GitHubService, GitLabService from readthedocs.projects.constants import GITHUB_BRAND, GITLAB_BRAND from readthedocs.projects.exceptions import ProjectConfigurationError from readthedocs.projects.models import Project -from readthedocs.projects.tasks.utils import finish_inactive_builds from readthedocs.rtd_tests.mocks.paths import fake_paths_by_regex