From c4f02c536baadb23a6f158be1a10ad41718888c6 Mon Sep 17 00:00:00 2001 From: theosanderson-agent Date: Wed, 17 Jun 2026 14:43:22 +0100 Subject: [PATCH 01/12] feat(backend): run scheduled tasks on a single replica with ShedLock Introduce ShedLock so that the backend's scheduled tasks execute on only one replica at a time, instead of once per replica. This addresses #6704: previously the effective frequency of every @Scheduled task scaled with the number of backend replicas. ShedLock coordinates via a shared `shedlock` table in the existing Postgres database (added in migration V1.31). Before a task runs, the replica tries to acquire a row-level lock keyed by the task name; only the replica that wins runs the task, the others skip that tick. Changes: - Add shedlock-spring and shedlock-provider-jdbc-template (7.7.0) deps. - Register a JdbcTemplateLockProvider backed by the existing DataSource, using `usingDbTime()` so lock timing relies on the database clock and is unaffected by clock drift between replicas. - Enable locking with @EnableSchedulerLock (defaultLockAtMostFor=PT30M as a safety net if a replica dies mid-task). - Annotate all four scheduled tasks with @SchedulerLock, each with a lockAtMostFor sized above its expected runtime. - Add Flyway migration V1.31 creating the `shedlock` table. Co-Authored-By: Claude Opus 4.8 (1M context) --- backend/build.gradle | 5 +++++ backend/gradle.lockfile | 4 ++++ .../backend/config/BackendSpringConfig.kt | 20 +++++++++++++++++++ .../SeqSetCrossRefCitationsTask.kt | 2 ++ .../service/submission/CleanUpAuxTableTask.kt | 2 ++ .../CleanUpStaleSequencesInProcessingTask.kt | 2 ++ .../UseNewerProcessingPipelineVersionTask.kt | 2 ++ .../migration/V1.31__add_shedlock_table.sql | 11 ++++++++++ 8 files changed, 48 insertions(+) create mode 100644 backend/src/main/resources/db/migration/V1.31__add_shedlock_table.sql diff --git a/backend/build.gradle b/backend/build.gradle index 5e4734bd06..00fcf2f55b 100644 --- a/backend/build.gradle +++ b/backend/build.gradle @@ -48,6 +48,11 @@ dependencies { implementation "org.apache.commons:commons-csv:1.14.1" implementation "org.springdoc:springdoc-openapi-starter-webmvc-ui:2.8.14" implementation "org.flywaydb:flyway-database-postgresql" // Version managed by Spring Boot to ensure compatibility + + // ShedLock ensures scheduled tasks run on only one backend replica at a time (distributed lock held in Postgres) + implementation "net.javacrumbs.shedlock:shedlock-spring:7.7.0" + implementation "net.javacrumbs.shedlock:shedlock-provider-jdbc-template:7.7.0" + implementation platform("org.jetbrains.exposed:exposed-bom:0.61.0") implementation "org.jetbrains.exposed:exposed-spring-boot-starter" implementation "org.jetbrains.exposed:exposed-jdbc" diff --git a/backend/gradle.lockfile b/backend/gradle.lockfile index 8837136876..3b3dd1c0f1 100644 --- a/backend/gradle.lockfile +++ b/backend/gradle.lockfile @@ -119,6 +119,10 @@ junit:junit:4.13.2=testRuntimeClasspath net.bytebuddy:byte-buddy-agent:1.17.8=testCompileClasspath,testRuntimeClasspath net.bytebuddy:byte-buddy:1.17.8=testCompileClasspath,testRuntimeClasspath net.java.dev.jna:jna:5.18.1=testCompileClasspath,testRuntimeClasspath +net.javacrumbs.shedlock:shedlock-core:7.7.0=compileClasspath,productionRuntimeClasspath,runtimeClasspath,testCompileClasspath,testRuntimeClasspath +net.javacrumbs.shedlock:shedlock-provider-jdbc-template:7.7.0=compileClasspath,productionRuntimeClasspath,runtimeClasspath,testCompileClasspath,testRuntimeClasspath +net.javacrumbs.shedlock:shedlock-spring:7.7.0=compileClasspath,productionRuntimeClasspath,runtimeClasspath,testCompileClasspath,testRuntimeClasspath +net.javacrumbs.shedlock:shedlock-sql-support:7.7.0=compileClasspath,productionRuntimeClasspath,runtimeClasspath,testCompileClasspath,testRuntimeClasspath net.minidev:accessors-smart:2.5.2=testCompileClasspath,testRuntimeClasspath net.minidev:json-smart:2.5.2=testCompileClasspath,testRuntimeClasspath org.apache.commons:commons-compress:1.28.0=compileClasspath,productionRuntimeClasspath,runtimeClasspath,testCompileClasspath,testRuntimeClasspath diff --git a/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt b/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt index e9ab9640a1..b295dcd260 100644 --- a/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt +++ b/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt @@ -6,6 +6,9 @@ import com.fasterxml.jackson.module.kotlin.readValue import io.swagger.v3.oas.models.headers.Header import io.swagger.v3.oas.models.media.StringSchema import io.swagger.v3.oas.models.parameters.HeaderParameter +import net.javacrumbs.shedlock.core.LockProvider +import net.javacrumbs.shedlock.provider.jdbctemplate.JdbcTemplateLockProvider +import net.javacrumbs.shedlock.spring.annotation.EnableSchedulerLock import org.flywaydb.core.Flyway import org.jetbrains.exposed.spring.autoconfigure.ExposedAutoConfiguration import org.jetbrains.exposed.sql.Database @@ -26,6 +29,7 @@ import org.springframework.boot.context.properties.ConfigurationPropertiesScan import org.springframework.context.annotation.Bean import org.springframework.context.annotation.Configuration import org.springframework.context.annotation.Profile +import org.springframework.jdbc.core.JdbcTemplate import org.springframework.scheduling.annotation.EnableScheduling import org.springframework.stereotype.Component import org.springframework.web.filter.CommonsRequestLoggingFilter @@ -58,6 +62,10 @@ private val logger = mu.KotlinLogging.logger {} @Configuration @EnableScheduling +// Ensures scheduled tasks run on only one replica at a time. `defaultLockAtMostFor` is a safety +// net: if a replica dies while holding a lock, the lock is released after this duration so another +// replica can take over. Individual tasks override it via `@SchedulerLock(lockAtMostForString = ...)`. +@EnableSchedulerLock(defaultLockAtMostFor = "PT30M") @ImportAutoConfiguration( value = [ExposedAutoConfiguration::class], exclude = [DataSourceTransactionManagerAutoConfiguration::class], @@ -65,6 +73,18 @@ private val logger = mu.KotlinLogging.logger {} @ConfigurationPropertiesScan("org.loculus.backend") class BackendSpringConfig { + /** + * Backs ShedLock with the existing Postgres datasource. `usingDbTime()` makes ShedLock use the + * database clock for lock timing, which avoids problems caused by clock drift between replicas. + */ + @Bean + fun lockProvider(dataSource: DataSource): LockProvider = JdbcTemplateLockProvider( + JdbcTemplateLockProvider.Configuration.builder() + .withJdbcTemplate(JdbcTemplate(dataSource)) + .usingDbTime() + .build(), + ) + @Bean fun logFilter(): CommonsRequestLoggingFilter { val filter = CommonsRequestLoggingFilter() diff --git a/backend/src/main/kotlin/org/loculus/backend/service/seqsetcitations/SeqSetCrossRefCitationsTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/seqsetcitations/SeqSetCrossRefCitationsTask.kt index 05dae9a7eb..c7d1d709c2 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/seqsetcitations/SeqSetCrossRefCitationsTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/seqsetcitations/SeqSetCrossRefCitationsTask.kt @@ -1,5 +1,6 @@ package org.loculus.backend.service.seqsetcitations +import net.javacrumbs.shedlock.spring.annotation.SchedulerLock import org.loculus.backend.api.SeqSetCitationSource import org.loculus.backend.config.BackendSpringProperty import org.loculus.backend.config.ENABLE_SEQSETS_TRUE_VALUE @@ -48,6 +49,7 @@ class SeqSetCrossRefCitationsTask( fixedDelay = 360, timeUnit = java.util.concurrent.TimeUnit.MINUTES, ) + @SchedulerLock(name = "seqSetCrossRefCitations", lockAtMostFor = "PT1H") fun task() { log.info { "Updating SeqSet CrossRef citations..." } diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpAuxTableTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpAuxTableTask.kt index 2f0b4e6a1c..8136a3c56f 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpAuxTableTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpAuxTableTask.kt @@ -3,6 +3,7 @@ package org.loculus.backend.service.maintenance import kotlinx.datetime.DateTimeUnit import kotlinx.datetime.minus import kotlinx.datetime.toLocalDateTime +import net.javacrumbs.shedlock.spring.annotation.SchedulerLock import org.loculus.backend.log.AuditLogger import org.loculus.backend.service.submission.UploadDatabaseService import org.loculus.backend.utils.DateProvider @@ -22,6 +23,7 @@ class CleanUpAuxTableTask( * Runs every hour and deletes auxTable entries older than 24 hours. */ @Scheduled(fixedDelay = 1, timeUnit = java.util.concurrent.TimeUnit.HOURS) + @SchedulerLock(name = "cleanUpAuxTable", lockAtMostFor = "PT15M") fun task() { val hourCutoff = 24L val now = dateProvider.getCurrentInstant() diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt index 32e8462f2f..2bd83a9257 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt @@ -1,5 +1,6 @@ package org.loculus.backend.service.submission +import net.javacrumbs.shedlock.spring.annotation.SchedulerLock import org.loculus.backend.config.BackendSpringProperty import org.springframework.beans.factory.annotation.Value import org.springframework.scheduling.annotation.Scheduled @@ -14,6 +15,7 @@ class CleanUpStaleSequencesInProcessingTask( @Value("\${${BackendSpringProperty.STALE_AFTER_SECONDS}}") private val timeToStaleInSeconds: Long, ) { @Scheduled(fixedRateString = "\${${BackendSpringProperty.CLEAN_UP_RUN_EVERY_SECONDS}}", timeUnit = TimeUnit.SECONDS) + @SchedulerLock(name = "cleanUpStaleSequencesInProcessing", lockAtMostFor = "PT5M") fun task() { log.info { "Cleaning up stale sequences in processing, timeToStaleInSeconds: $timeToStaleInSeconds" } submissionDatabaseService.cleanUpStaleSequencesInProcessing(timeToStaleInSeconds) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt index fa8f0036a7..351e461e4a 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt @@ -1,5 +1,6 @@ package org.loculus.backend.service.submission +import net.javacrumbs.shedlock.spring.annotation.SchedulerLock import org.loculus.backend.config.BackendSpringProperty import org.springframework.scheduling.annotation.Scheduled import org.springframework.stereotype.Component @@ -18,6 +19,7 @@ class UseNewerProcessingPipelineVersionTask(private val submissionDatabaseServic fixedDelayString = "\${${BackendSpringProperty.PIPELINE_VERSION_UPGRADE_CHECK_INTERVAL_SECONDS}}", timeUnit = TimeUnit.SECONDS, ) + @SchedulerLock(name = "useNewerProcessingPipelineVersion", lockAtMostFor = "PT10M") fun task() { log.info { "Checking for newer preprocessing pipeline versions" } val newVersions = submissionDatabaseService.useNewerProcessingPipelineIfPossible() diff --git a/backend/src/main/resources/db/migration/V1.31__add_shedlock_table.sql b/backend/src/main/resources/db/migration/V1.31__add_shedlock_table.sql new file mode 100644 index 0000000000..669017ab81 --- /dev/null +++ b/backend/src/main/resources/db/migration/V1.31__add_shedlock_table.sql @@ -0,0 +1,11 @@ +-- Table used by ShedLock to coordinate scheduled tasks across backend replicas. +-- Only one replica can hold the lock for a given task name at a time, so scheduled +-- tasks run once per interval regardless of how many replicas are deployed. +-- Schema as required by net.javacrumbs.shedlock:shedlock-provider-jdbc-template. +CREATE TABLE shedlock ( + name VARCHAR(64) NOT NULL, + lock_until TIMESTAMP NOT NULL, + locked_at TIMESTAMP NOT NULL, + locked_by VARCHAR(255) NOT NULL, + PRIMARY KEY (name) +); From 9a2016135465825d0cc65615e7f11508ffee25f0 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Wed, 17 Jun 2026 13:55:04 +0000 Subject: [PATCH 02/12] Update schema documentation based on migration changes --- backend/docs/db/schema.sql | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/backend/docs/db/schema.sql b/backend/docs/db/schema.sql index f2b4eb3909..b6c480666c 100644 --- a/backend/docs/db/schema.sql +++ b/backend/docs/db/schema.sql @@ -661,6 +661,20 @@ CREATE TABLE public.sequence_upload_aux_table ( ALTER TABLE public.sequence_upload_aux_table OWNER TO postgres; +-- +-- Name: shedlock; Type: TABLE; Schema: public; Owner: postgres +-- + +CREATE TABLE public.shedlock ( + name character varying(64) NOT NULL, + lock_until timestamp without time zone NOT NULL, + locked_at timestamp without time zone NOT NULL, + locked_by character varying(255) NOT NULL +); + + +ALTER TABLE public.shedlock OWNER TO postgres; + -- -- Name: table_update_tracker; Type: TABLE; Schema: public; Owner: postgres -- @@ -904,6 +918,14 @@ ALTER TABLE ONLY public.sequence_upload_aux_table ADD CONSTRAINT sequence_upload_aux_table_pkey PRIMARY KEY (upload_id, fasta_id); +-- +-- Name: shedlock shedlock_pkey; Type: CONSTRAINT; Schema: public; Owner: postgres +-- + +ALTER TABLE ONLY public.shedlock + ADD CONSTRAINT shedlock_pkey PRIMARY KEY (name); + + -- -- Name: table_update_tracker table_update_tracker_unique; Type: CONSTRAINT; Schema: public; Owner: postgres -- From a52f401113e983c35441f2864d0a35f89123a005 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Wed, 17 Jun 2026 15:56:39 +0200 Subject: [PATCH 03/12] Your commit message From 0169f4e3993f2330eb3ea66c4028f0d935a301f4 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 17 Jun 2026 16:14:02 +0200 Subject: [PATCH 04/12] Update backend/src/main/resources/db/migration/V1.31__add_shedlock_table.sql Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com> --- .../main/resources/db/migration/V1.31__add_shedlock_table.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/src/main/resources/db/migration/V1.31__add_shedlock_table.sql b/backend/src/main/resources/db/migration/V1.31__add_shedlock_table.sql index 669017ab81..60413ddeeb 100644 --- a/backend/src/main/resources/db/migration/V1.31__add_shedlock_table.sql +++ b/backend/src/main/resources/db/migration/V1.31__add_shedlock_table.sql @@ -4,8 +4,8 @@ -- Schema as required by net.javacrumbs.shedlock:shedlock-provider-jdbc-template. CREATE TABLE shedlock ( name VARCHAR(64) NOT NULL, - lock_until TIMESTAMP NOT NULL, - locked_at TIMESTAMP NOT NULL, + lock_until TIMESTAMP WITH TIME ZONE NOT NULL, + locked_at TIMESTAMP WITH TIME ZONE NOT NULL, locked_by VARCHAR(255) NOT NULL, PRIMARY KEY (name) ); From 0bc9cb8512f4c9c2f793aff2835dfc24224ebcd2 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Wed, 17 Jun 2026 14:15:00 +0000 Subject: [PATCH 05/12] Update schema documentation based on migration changes --- backend/docs/db/schema.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/docs/db/schema.sql b/backend/docs/db/schema.sql index b6c480666c..f4030d25c5 100644 --- a/backend/docs/db/schema.sql +++ b/backend/docs/db/schema.sql @@ -667,8 +667,8 @@ ALTER TABLE public.sequence_upload_aux_table OWNER TO postgres; CREATE TABLE public.shedlock ( name character varying(64) NOT NULL, - lock_until timestamp without time zone NOT NULL, - locked_at timestamp without time zone NOT NULL, + lock_until timestamp with time zone NOT NULL, + locked_at timestamp with time zone NOT NULL, locked_by character varying(255) NOT NULL ); From 16cf0253ffcf13c39359a1c6d8e2998bb9699230 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 17 Jun 2026 16:20:59 +0200 Subject: [PATCH 06/12] Apply suggestion from @claude[bot] Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com> --- .../service/submission/UseNewerProcessingPipelineVersionTask.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt index 351e461e4a..665ecb62cd 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt @@ -19,7 +19,7 @@ class UseNewerProcessingPipelineVersionTask(private val submissionDatabaseServic fixedDelayString = "\${${BackendSpringProperty.PIPELINE_VERSION_UPGRADE_CHECK_INTERVAL_SECONDS}}", timeUnit = TimeUnit.SECONDS, ) - @SchedulerLock(name = "useNewerProcessingPipelineVersion", lockAtMostFor = "PT10M") + @SchedulerLock(name = "useNewerProcessingPipelineVersion", lockAtMostFor = "PT1M") fun task() { log.info { "Checking for newer preprocessing pipeline versions" } val newVersions = submissionDatabaseService.useNewerProcessingPipelineIfPossible() From 3898c3cadf2327f576f949b2509e304791d3b37a Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Wed, 17 Jun 2026 16:41:25 +0200 Subject: [PATCH 07/12] feat: add a little test --- .../submission/ShedLockIntegrationTest.kt | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockIntegrationTest.kt diff --git a/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockIntegrationTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockIntegrationTest.kt new file mode 100644 index 0000000000..26723ea108 --- /dev/null +++ b/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockIntegrationTest.kt @@ -0,0 +1,32 @@ +package org.loculus.backend.service.submission + +import org.hamcrest.CoreMatchers.`is` +import org.hamcrest.MatcherAssert.assertThat +import org.junit.jupiter.api.Test +import org.loculus.backend.config.BackendSpringProperty +import org.loculus.backend.controller.EndpointTest +import org.springframework.beans.factory.annotation.Autowired +import org.springframework.jdbc.core.JdbcTemplate + +@EndpointTest( + properties = [ + "${BackendSpringProperty.STALE_AFTER_SECONDS}=0", + "${BackendSpringProperty.CLEAN_UP_RUN_EVERY_SECONDS}=3600", + ], +) +class ShedLockIntegrationTest( + @Autowired val jdbcTemplate: JdbcTemplate, + @Autowired val cleanUpStaleSequencesInProcessingTask: CleanUpStaleSequencesInProcessingTask, +) { + @Test + fun `WHEN task is called through the Spring proxy THEN shedlock row is created`() { + cleanUpStaleSequencesInProcessingTask.task() + + val count = jdbcTemplate.queryForObject( + "SELECT COUNT(*) FROM shedlock WHERE name = ?", + Int::class.java, + "cleanUpStaleSequencesInProcessing", + ) + assertThat(count, `is`(1)) + } +} From c8e1f7b53b8008dd22181c0c6e3827fb4b125d5e Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Wed, 17 Jun 2026 16:47:24 +0200 Subject: [PATCH 08/12] more tests --- .../submission/ShedLockIntegrationTest.kt | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockIntegrationTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockIntegrationTest.kt index 26723ea108..4e8d46150d 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockIntegrationTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockIntegrationTest.kt @@ -2,11 +2,13 @@ package org.loculus.backend.service.submission import org.hamcrest.CoreMatchers.`is` import org.hamcrest.MatcherAssert.assertThat +import org.hamcrest.Matchers.greaterThan import org.junit.jupiter.api.Test import org.loculus.backend.config.BackendSpringProperty import org.loculus.backend.controller.EndpointTest import org.springframework.beans.factory.annotation.Autowired import org.springframework.jdbc.core.JdbcTemplate +import java.sql.Timestamp @EndpointTest( properties = [ @@ -29,4 +31,32 @@ class ShedLockIntegrationTest( ) assertThat(count, `is`(1)) } + + @Test + fun `WHEN task completes the lock is released and task can run again`() { + cleanUpStaleSequencesInProcessingTask.task() + + val firstLockedAt = jdbcTemplate.queryForObject( + "SELECT locked_at FROM shedlock WHERE name = ?", + Timestamp::class.java, + "cleanUpStaleSequencesInProcessing", + )!! + + // Ensure DB clock advances before the second acquisition + Thread.sleep(10) + + cleanUpStaleSequencesInProcessingTask.task() + + val secondLockedAt = jdbcTemplate.queryForObject( + "SELECT locked_at FROM shedlock WHERE name = ?", + Timestamp::class.java, + "cleanUpStaleSequencesInProcessing", + )!! + + assertThat( + "second run should have re-acquired the lock (locked_at should advance)", + secondLockedAt, + greaterThan(firstLockedAt), + ) + } } From ca6520502ead677a44095793347379ea42250400 Mon Sep 17 00:00:00 2001 From: Theo Sanderson Date: Thu, 18 Jun 2026 16:32:39 +0100 Subject: [PATCH 09/12] feat(backend): enforce single run per interval with ShedLock lockAtLeastFor The previous ShedLock setup only set lockAtMostFor, which prevents *simultaneous* runs but not frequency-scaling with replica count: because the lock is released as soon as the task finishes and replicas fire out of sync, the task could still run up to N times per interval with N replicas. Add a configurable lockAtLeastFor to every scheduled task so the lock is held for at least the desired interval, which is what actually guarantees "run once per interval regardless of replica count". The schedulers now poll frequently (ShedLock skips rather than queues when a lock is held), and lockAtLeastFor governs the real cadence. For the GC-style aux cleanup, lockAtMostFor is set above lockAtLeastFor so a long run keeps the lock (avoiding parallelism) while still releasing if a replica dies mid-task. Lock minimums are overridable via loculus.locks..atLeast and are set to PT0S in tests (ShedLock's in-memory LockRecordRegistry is incompatible with truncating the shedlock table between tests). ShedLockIntegrationTest now verifies lockAtLeastFor directly through the LockProvider. Co-Authored-By: Claude Opus 4.8 --- .../SeqSetCrossRefCitationsTask.kt | 12 +++-- .../service/submission/CleanUpAuxTableTask.kt | 16 +++++-- .../CleanUpStaleSequencesInProcessingTask.kt | 8 +++- .../UseNewerProcessingPipelineVersionTask.kt | 8 +++- .../src/main/resources/application.properties | 9 ++++ .../submission/ShedLockIntegrationTest.kt | 48 ++++++++++--------- .../src/test/resources/application.properties | 8 ++++ 7 files changed, 78 insertions(+), 31 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/seqsetcitations/SeqSetCrossRefCitationsTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/seqsetcitations/SeqSetCrossRefCitationsTask.kt index c7d1d709c2..3b7e3ff4b4 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/seqsetcitations/SeqSetCrossRefCitationsTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/seqsetcitations/SeqSetCrossRefCitationsTask.kt @@ -37,7 +37,7 @@ class SeqSetCrossRefCitationsTask( private val seqSetCitationsDatabaseService: SeqSetCitationsDatabaseService, ) { /** - * Runs every six hours, with an initial delay of one minute. + * Runs at most once every six hours (enforced by the ShedLock `lockAtLeastFor`), with an initial delay of one minute. * * The task checks that the CrossRef service is active and a DOI prefix is configured for the Loculus instance. * If configured, it retrieves all CrossRef forward links (citations) which begin with the instance's DOI prefix. @@ -46,10 +46,16 @@ class SeqSetCrossRefCitationsTask( */ @Scheduled( initialDelay = 1, - fixedDelay = 360, + fixedDelay = 5, timeUnit = java.util.concurrent.TimeUnit.MINUTES, ) - @SchedulerLock(name = "seqSetCrossRefCitations", lockAtMostFor = "PT1H") + // The scheduler polls every 5 minutes, but `lockAtLeastFor` holds the lock for the full interval so + // the CrossRef service is queried at most once per interval, regardless of how many replicas run. + @SchedulerLock( + name = "seqSetCrossRefCitations", + lockAtLeastFor = "\${loculus.locks.seqSetCrossRefCitations.atLeast:PT6H}", + lockAtMostFor = "PT6H", + ) fun task() { log.info { "Updating SeqSet CrossRef citations..." } diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpAuxTableTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpAuxTableTask.kt index 8136a3c56f..b59b4320a7 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpAuxTableTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpAuxTableTask.kt @@ -20,10 +20,20 @@ class CleanUpAuxTableTask( ) { /** - * Runs every hour and deletes auxTable entries older than 24 hours. + * Deletes auxTable entries older than 24 hours. + * + * The scheduler polls frequently (every 5 minutes), but `lockAtLeastFor` keeps the ShedLock lock + * held for at least the configured interval, so across replicas the task effectively runs once + * per `lockAtLeastFor` regardless of replica count. `lockAtMostFor` is deliberately larger than + * `lockAtLeastFor` so an unusually long run keeps the lock (no parallel run) while still releasing + * within bounds if a replica dies mid-task. */ - @Scheduled(fixedDelay = 1, timeUnit = java.util.concurrent.TimeUnit.HOURS) - @SchedulerLock(name = "cleanUpAuxTable", lockAtMostFor = "PT15M") + @Scheduled(fixedDelay = 5, timeUnit = java.util.concurrent.TimeUnit.MINUTES) + @SchedulerLock( + name = "cleanUpAuxTable", + lockAtLeastFor = "\${loculus.locks.cleanUpAuxTable.atLeast:PT1H}", + lockAtMostFor = "PT6H", + ) fun task() { val hourCutoff = 24L val now = dateProvider.getCurrentInstant() diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt index 2bd83a9257..8429835a17 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt @@ -15,7 +15,13 @@ class CleanUpStaleSequencesInProcessingTask( @Value("\${${BackendSpringProperty.STALE_AFTER_SECONDS}}") private val timeToStaleInSeconds: Long, ) { @Scheduled(fixedRateString = "\${${BackendSpringProperty.CLEAN_UP_RUN_EVERY_SECONDS}}", timeUnit = TimeUnit.SECONDS) - @SchedulerLock(name = "cleanUpStaleSequencesInProcessing", lockAtMostFor = "PT5M") + @SchedulerLock( + name = "cleanUpStaleSequencesInProcessing", + // `lockAtLeastFor` enforces the effective run interval across replicas (default matches the + // standard run-every interval); the lock is held this long even though the scheduler polls. + lockAtLeastFor = "\${loculus.locks.cleanUpStaleSequencesInProcessing.atLeast:PT1M}", + lockAtMostFor = "PT5M", + ) fun task() { log.info { "Cleaning up stale sequences in processing, timeToStaleInSeconds: $timeToStaleInSeconds" } submissionDatabaseService.cleanUpStaleSequencesInProcessing(timeToStaleInSeconds) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt index 665ecb62cd..9a7bb40510 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt @@ -19,7 +19,13 @@ class UseNewerProcessingPipelineVersionTask(private val submissionDatabaseServic fixedDelayString = "\${${BackendSpringProperty.PIPELINE_VERSION_UPGRADE_CHECK_INTERVAL_SECONDS}}", timeUnit = TimeUnit.SECONDS, ) - @SchedulerLock(name = "useNewerProcessingPipelineVersion", lockAtMostFor = "PT1M") + @SchedulerLock( + name = "useNewerProcessingPipelineVersion", + // `lockAtLeastFor` enforces the effective check interval across replicas (default matches the + // standard check interval); the lock is held this long even though the scheduler polls. + lockAtLeastFor = "\${loculus.locks.useNewerProcessingPipelineVersion.atLeast:PT10S}", + lockAtMostFor = "PT1M", + ) fun task() { log.info { "Checking for newer preprocessing pipeline versions" } val newVersions = submissionDatabaseService.useNewerProcessingPipelineIfPossible() diff --git a/backend/src/main/resources/application.properties b/backend/src/main/resources/application.properties index 1ce6999fef..9227661694 100644 --- a/backend/src/main/resources/application.properties +++ b/backend/src/main/resources/application.properties @@ -29,6 +29,15 @@ management.health.readinessState.enabled=true loculus.cleanup.task.reset-stale-in-processing-after-seconds=60 loculus.cleanup.task.run-every-seconds=60 loculus.pipeline-version-upgrade-check.interval-seconds=10 + +# ShedLock minimum lock durations (ISO-8601): each scheduled task holds its lock for at least this +# long, which is what makes the task run once per interval regardless of the number of replicas. +# The production defaults live on each @SchedulerLock annotation (lockAtLeastFor); set the values +# below only to override them. Keep each value <= the task's lockAtMostFor. +# loculus.locks.cleanUpStaleSequencesInProcessing.atLeast=PT1M +# loculus.locks.useNewerProcessingPipelineVersion.atLeast=PT10S +# loculus.locks.cleanUpAuxTable.atLeast=PT1H +# loculus.locks.seqSetCrossRefCitations.atLeast=PT6H loculus.stream.batch-size=1000 loculus.debug-mode=false diff --git a/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockIntegrationTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockIntegrationTest.kt index 4e8d46150d..0841f8cef6 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockIntegrationTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockIntegrationTest.kt @@ -1,14 +1,16 @@ package org.loculus.backend.service.submission +import net.javacrumbs.shedlock.core.LockConfiguration +import net.javacrumbs.shedlock.core.LockProvider import org.hamcrest.CoreMatchers.`is` import org.hamcrest.MatcherAssert.assertThat -import org.hamcrest.Matchers.greaterThan import org.junit.jupiter.api.Test import org.loculus.backend.config.BackendSpringProperty import org.loculus.backend.controller.EndpointTest import org.springframework.beans.factory.annotation.Autowired import org.springframework.jdbc.core.JdbcTemplate -import java.sql.Timestamp +import java.time.Duration +import java.time.Instant @EndpointTest( properties = [ @@ -18,10 +20,11 @@ import java.sql.Timestamp ) class ShedLockIntegrationTest( @Autowired val jdbcTemplate: JdbcTemplate, + @Autowired val lockProvider: LockProvider, @Autowired val cleanUpStaleSequencesInProcessingTask: CleanUpStaleSequencesInProcessingTask, ) { @Test - fun `WHEN task is called through the Spring proxy THEN shedlock row is created`() { + fun `WHEN a scheduled task is invoked through the Spring proxy THEN a shedlock row is created`() { cleanUpStaleSequencesInProcessingTask.task() val count = jdbcTemplate.queryForObject( @@ -33,30 +36,29 @@ class ShedLockIntegrationTest( } @Test - fun `WHEN task completes the lock is released and task can run again`() { - cleanUpStaleSequencesInProcessingTask.task() - - val firstLockedAt = jdbcTemplate.queryForObject( - "SELECT locked_at FROM shedlock WHERE name = ?", - Timestamp::class.java, - "cleanUpStaleSequencesInProcessing", - )!! - - // Ensure DB clock advances before the second acquisition - Thread.sleep(10) + fun `WHEN a lock is released within lockAtLeastFor THEN it cannot be re-acquired yet`() { + // A unique lock name that no scheduled task uses, so the result is not affected by the + // background scheduler. lockAtLeastFor is what prevents the task from running more often + // than the configured interval (regardless of replica count), even after an early release. + val lockName = "shedLockIntegrationTestLock" + val lockAtMostFor = Duration.ofMinutes(5) + val lockAtLeastFor = Duration.ofMinutes(1) - cleanUpStaleSequencesInProcessingTask.task() + val firstLock = lockProvider.lock( + LockConfiguration(Instant.now(), lockName, lockAtMostFor, lockAtLeastFor), + ) + assertThat("the lock should be acquired when free", firstLock.isPresent, `is`(true)) - val secondLockedAt = jdbcTemplate.queryForObject( - "SELECT locked_at FROM shedlock WHERE name = ?", - Timestamp::class.java, - "cleanUpStaleSequencesInProcessing", - )!! + // Release immediately - because lockAtLeastFor has not elapsed, the lock stays held. + firstLock.get().unlock() + val secondLock = lockProvider.lock( + LockConfiguration(Instant.now(), lockName, lockAtMostFor, lockAtLeastFor), + ) assertThat( - "second run should have re-acquired the lock (locked_at should advance)", - secondLockedAt, - greaterThan(firstLockedAt), + "re-acquisition within lockAtLeastFor should be refused", + secondLock.isPresent, + `is`(false), ) } } diff --git a/backend/src/test/resources/application.properties b/backend/src/test/resources/application.properties index 2eae6c6bda..b7348b665e 100644 --- a/backend/src/test/resources/application.properties +++ b/backend/src/test/resources/application.properties @@ -19,3 +19,11 @@ keycloak.client=dummy-cli keycloak.url=dummy:420 spring.security.oauth2.resourceserver.jwt.jwk-set-uri=http://some.value + +# Disable the ShedLock minimum lock duration in tests so scheduled tasks can be invoked +# repeatedly within a single test (and across tests sharing the test-container database) +# without being skipped. ShedLockIntegrationTest exercises lockAtLeastFor directly via LockProvider. +loculus.locks.cleanUpStaleSequencesInProcessing.atLeast=PT0S +loculus.locks.useNewerProcessingPipelineVersion.atLeast=PT0S +loculus.locks.cleanUpAuxTable.atLeast=PT0S +loculus.locks.seqSetCrossRefCitations.atLeast=PT0S From d44579d1042867e8f4151be8b623045ab1d42d52 Mon Sep 17 00:00:00 2001 From: Theo Sanderson Date: Fri, 19 Jun 2026 10:27:10 +0100 Subject: [PATCH 10/12] fix(backend): derive pipeline lock floor from the configured interval Addresses Codex review: useNewerProcessingPipelineVersion hardcoded a lockAtLeastFor of PT10S, which silently overrode operator-configured pipelineVersionUpgradeCheckIntervalSeconds values below 10s (schema allows down to 1s). lockAtLeastFor now defaults to the configured interval via a nested placeholder, so the interval is honored rather than capped. The Helm chart passes both bounds derived from the same value (lockAtLeastFor = interval, lockAtMostFor = 5x interval) so the at-most ceiling always stays above the floor for any configured interval. cleanUpStaleSequencesInProcessing likewise defaults its floor to run-every-seconds. ShedLock does not evaluate SpEL in the duration attributes (only in name), so the 5x is computed in the chart rather than the annotation. Added ShedLockPropertyResolutionTest to cover the production default path, which the DB-backed test can't (it always overrides atLeast to PT0S). Co-Authored-By: Claude Opus 4.8 --- .../CleanUpStaleSequencesInProcessingTask.kt | 9 ++-- .../UseNewerProcessingPipelineVersionTask.kt | 12 +++-- .../src/main/resources/application.properties | 9 ++-- .../ShedLockPropertyResolutionTest.kt | 48 +++++++++++++++++++ .../loculus/templates/loculus-backend.yaml | 4 ++ 5 files changed, 71 insertions(+), 11 deletions(-) create mode 100644 backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockPropertyResolutionTest.kt diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt index 8429835a17..f4f57f1da6 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt @@ -17,9 +17,12 @@ class CleanUpStaleSequencesInProcessingTask( @Scheduled(fixedRateString = "\${${BackendSpringProperty.CLEAN_UP_RUN_EVERY_SECONDS}}", timeUnit = TimeUnit.SECONDS) @SchedulerLock( name = "cleanUpStaleSequencesInProcessing", - // `lockAtLeastFor` enforces the effective run interval across replicas (default matches the - // standard run-every interval); the lock is held this long even though the scheduler polls. - lockAtLeastFor = "\${loculus.locks.cleanUpStaleSequencesInProcessing.atLeast:PT1M}", + // `lockAtLeastFor` enforces the effective run interval across replicas; it defaults to the + // configured run-every interval so that value is honored rather than silently overridden. + // `lockAtMostFor` (PT5M) is the crash-recovery ceiling. Overridable via `loculus.locks.*` + // (tests set `atLeast` to PT0S). + lockAtLeastFor = "\${loculus.locks.cleanUpStaleSequencesInProcessing.atLeast:" + + "PT\${${BackendSpringProperty.CLEAN_UP_RUN_EVERY_SECONDS}}S}", lockAtMostFor = "PT5M", ) fun task() { diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt index 9a7bb40510..bd434407e9 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt @@ -21,10 +21,14 @@ class UseNewerProcessingPipelineVersionTask(private val submissionDatabaseServic ) @SchedulerLock( name = "useNewerProcessingPipelineVersion", - // `lockAtLeastFor` enforces the effective check interval across replicas (default matches the - // standard check interval); the lock is held this long even though the scheduler polls. - lockAtLeastFor = "\${loculus.locks.useNewerProcessingPipelineVersion.atLeast:PT10S}", - lockAtMostFor = "PT1M", + // `lockAtLeastFor` enforces the effective check interval across replicas. It defaults to the + // configured check interval, so operators who tune `interval-seconds` (down to 1s) are honored + // rather than silently overridden. The Helm chart sets `lockAtMostFor` to 5x the interval; the + // PT1M fallback here covers non-Helm runs. Both are overridable via the `loculus.locks.*` keys + // (tests set `atLeast` to PT0S). + lockAtLeastFor = "\${loculus.locks.useNewerProcessingPipelineVersion.atLeast:" + + "PT\${${BackendSpringProperty.PIPELINE_VERSION_UPGRADE_CHECK_INTERVAL_SECONDS}}S}", + lockAtMostFor = "\${loculus.locks.useNewerProcessingPipelineVersion.atMost:PT1M}", ) fun task() { log.info { "Checking for newer preprocessing pipeline versions" } diff --git a/backend/src/main/resources/application.properties b/backend/src/main/resources/application.properties index 9227661694..ae6a58266b 100644 --- a/backend/src/main/resources/application.properties +++ b/backend/src/main/resources/application.properties @@ -32,10 +32,11 @@ loculus.pipeline-version-upgrade-check.interval-seconds=10 # ShedLock minimum lock durations (ISO-8601): each scheduled task holds its lock for at least this # long, which is what makes the task run once per interval regardless of the number of replicas. -# The production defaults live on each @SchedulerLock annotation (lockAtLeastFor); set the values -# below only to override them. Keep each value <= the task's lockAtMostFor. -# loculus.locks.cleanUpStaleSequencesInProcessing.atLeast=PT1M -# loculus.locks.useNewerProcessingPipelineVersion.atLeast=PT10S +# Defaults live on each @SchedulerLock annotation (lockAtLeastFor); the two interval-driven tasks +# default to their configured interval so it is honored rather than overridden. Set the values below +# only to override, and keep each <= the task's lockAtMostFor. +# loculus.locks.cleanUpStaleSequencesInProcessing.atLeast (defaults to run-every-seconds) +# loculus.locks.useNewerProcessingPipelineVersion.atLeast (defaults to interval-seconds; Helm also sets .atMost to 5x) # loculus.locks.cleanUpAuxTable.atLeast=PT1H # loculus.locks.seqSetCrossRefCitations.atLeast=PT6H loculus.stream.batch-size=1000 diff --git a/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockPropertyResolutionTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockPropertyResolutionTest.kt new file mode 100644 index 0000000000..1384868674 --- /dev/null +++ b/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockPropertyResolutionTest.kt @@ -0,0 +1,48 @@ +package org.loculus.backend.service.submission + +import org.hamcrest.CoreMatchers.`is` +import org.hamcrest.MatcherAssert.assertThat +import org.junit.jupiter.api.Test +import org.loculus.backend.config.BackendSpringProperty +import org.springframework.core.env.MapPropertySource +import org.springframework.core.env.StandardEnvironment + +/** + * Verifies the placeholder strings compiled into the `@SchedulerLock` annotations resolve as + * intended. The DB-backed [ShedLockIntegrationTest] always runs with `atLeast` overridden to PT0S, + * so it cannot exercise the production default; this lightweight test covers that path. + */ +class ShedLockPropertyResolutionTest { + + // Must mirror the lockAtLeastFor string in UseNewerProcessingPipelineVersionTask exactly. + private val pipelineLockAtLeast = + "\${loculus.locks.useNewerProcessingPipelineVersion.atLeast:" + + "PT\${${BackendSpringProperty.PIPELINE_VERSION_UPGRADE_CHECK_INTERVAL_SECONDS}}S}" + + private fun resolve(expression: String, props: Map): String { + val env = StandardEnvironment() + env.propertySources.addFirst(MapPropertySource("test", props)) + return env.resolvePlaceholders(expression) + } + + @Test + fun `WHEN no lock override is set THEN lockAtLeastFor falls back to the configured interval`() { + val resolved = resolve( + pipelineLockAtLeast, + mapOf(BackendSpringProperty.PIPELINE_VERSION_UPGRADE_CHECK_INTERVAL_SECONDS to "3"), + ) + assertThat(resolved, `is`("PT3S")) + } + + @Test + fun `WHEN a lock override is set THEN it takes precedence over the interval default`() { + val resolved = resolve( + pipelineLockAtLeast, + mapOf( + BackendSpringProperty.PIPELINE_VERSION_UPGRADE_CHECK_INTERVAL_SECONDS to "3", + "loculus.locks.useNewerProcessingPipelineVersion.atLeast" to "PT0S", + ), + ) + assertThat(resolved, `is`("PT0S")) + } +} diff --git a/kubernetes/loculus/templates/loculus-backend.yaml b/kubernetes/loculus/templates/loculus-backend.yaml index 1a0db9b700..4588253983 100644 --- a/kubernetes/loculus/templates/loculus-backend.yaml +++ b/kubernetes/loculus/templates/loculus-backend.yaml @@ -71,6 +71,10 @@ spec: - "--spring.security.oauth2.resourceserver.jwt.jwk-set-uri=http://loculus-keycloak-service:8083/realms/loculus/protocol/openid-connect/certs" - "--loculus.cleanup.task.reset-stale-in-processing-after-seconds={{- .Values.preprocessingTimeout | default 120 }}" - "--loculus.pipeline-version-upgrade-check.interval-seconds={{- .Values.pipelineVersionUpgradeCheckIntervalSeconds | default 10 }}" + # ShedLock lock bounds for the pipeline-version-upgrade check, derived from the configured + # interval so the lock never overrides it: hold for the interval, expire after 5x it. + - "--loculus.locks.useNewerProcessingPipelineVersion.atLeast=PT{{ .Values.pipelineVersionUpgradeCheckIntervalSeconds | default 10 | int }}S" + - "--loculus.locks.useNewerProcessingPipelineVersion.atMost=PT{{ mul (.Values.pipelineVersionUpgradeCheckIntervalSeconds | default 10 | int) 5 }}S" - "--loculus.s3.enabled=$(S3_ENABLED)" {{- if $.Values.s3.enabled }} - "--loculus.s3.bucket.endpoint=$(S3_BUCKET_ENDPOINT)" From 1507c1b6cf554df6744403384778a62e9ba61863 Mon Sep 17 00:00:00 2001 From: Theo Sanderson Date: Fri, 19 Jun 2026 10:43:40 +0100 Subject: [PATCH 11/12] fix(backend): schedule stale-cleanup with fixedDelay to avoid lock-expiry boundary cleanUpStaleSequencesInProcessing used fixedRate with lockAtLeastFor equal to the run-every interval. fixedRate fires on a wall-clock grid every R while the lock expires at acquire+R, so the next tick lands exactly on the expiry boundary and is skipped or not depending on sub-millisecond clock jitter, making the effective cadence drift toward ~2x. Switching to fixedDelay schedules from completion, so each poll fires after lockAtLeastFor has already elapsed since acquisition and reliably re-acquires, while still holding the lock for the full interval (at-most-once per interval across replicas preserved). The other tasks are unaffected: aux/crossref poll far more often than their lock, and pipeline is already fixedDelay. Co-Authored-By: Claude Opus 4.8 --- .../submission/CleanUpStaleSequencesInProcessingTask.kt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt index f4f57f1da6..f14d96deda 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt @@ -14,7 +14,13 @@ class CleanUpStaleSequencesInProcessingTask( private val submissionDatabaseService: SubmissionDatabaseService, @Value("\${${BackendSpringProperty.STALE_AFTER_SECONDS}}") private val timeToStaleInSeconds: Long, ) { - @Scheduled(fixedRateString = "\${${BackendSpringProperty.CLEAN_UP_RUN_EVERY_SECONDS}}", timeUnit = TimeUnit.SECONDS) + // `fixedDelay` (not `fixedRate`): schedules from completion, so the next poll always fires after + // `lockAtLeastFor` (= run-every) has elapsed since acquisition. With `fixedRate` the poll grid would + // sit exactly on the lock-expiry boundary and skip ticks unpredictably due to clock jitter. + @Scheduled( + fixedDelayString = "\${${BackendSpringProperty.CLEAN_UP_RUN_EVERY_SECONDS}}", + timeUnit = TimeUnit.SECONDS, + ) @SchedulerLock( name = "cleanUpStaleSequencesInProcessing", // `lockAtLeastFor` enforces the effective run interval across replicas; it defaults to the From f78613c535ebea1985e61a0ee26c7920e2b14528 Mon Sep 17 00:00:00 2001 From: Theo Sanderson Date: Fri, 19 Jun 2026 11:32:34 +0100 Subject: [PATCH 12/12] docs(backend): trim verbose ShedLock comments Remove restatements of the lock mechanism that the annotation values already convey, keeping only the non-obvious rationale (fixedDelay choice, the unique test lock name, and why the property-resolution test exists). Co-Authored-By: Claude Opus 4.8 --- .../seqsetcitations/SeqSetCrossRefCitationsTask.kt | 2 -- .../backend/service/submission/CleanUpAuxTableTask.kt | 10 +++------- .../CleanUpStaleSequencesInProcessingTask.kt | 10 +++------- .../UseNewerProcessingPipelineVersionTask.kt | 8 +++----- .../service/submission/ShedLockIntegrationTest.kt | 4 +--- 5 files changed, 10 insertions(+), 24 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/seqsetcitations/SeqSetCrossRefCitationsTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/seqsetcitations/SeqSetCrossRefCitationsTask.kt index 3b7e3ff4b4..8690aaed19 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/seqsetcitations/SeqSetCrossRefCitationsTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/seqsetcitations/SeqSetCrossRefCitationsTask.kt @@ -49,8 +49,6 @@ class SeqSetCrossRefCitationsTask( fixedDelay = 5, timeUnit = java.util.concurrent.TimeUnit.MINUTES, ) - // The scheduler polls every 5 minutes, but `lockAtLeastFor` holds the lock for the full interval so - // the CrossRef service is queried at most once per interval, regardless of how many replicas run. @SchedulerLock( name = "seqSetCrossRefCitations", lockAtLeastFor = "\${loculus.locks.seqSetCrossRefCitations.atLeast:PT6H}", diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpAuxTableTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpAuxTableTask.kt index b59b4320a7..628eeeea65 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpAuxTableTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpAuxTableTask.kt @@ -20,13 +20,9 @@ class CleanUpAuxTableTask( ) { /** - * Deletes auxTable entries older than 24 hours. - * - * The scheduler polls frequently (every 5 minutes), but `lockAtLeastFor` keeps the ShedLock lock - * held for at least the configured interval, so across replicas the task effectively runs once - * per `lockAtLeastFor` regardless of replica count. `lockAtMostFor` is deliberately larger than - * `lockAtLeastFor` so an unusually long run keeps the lock (no parallel run) while still releasing - * within bounds if a replica dies mid-task. + * Deletes auxTable entries older than 24 hours. Effectively runs once per `lockAtLeastFor` (1h) + * regardless of replica count; `lockAtMostFor` is larger so an occasional long run keeps the lock + * rather than allowing a parallel run, while still releasing if a replica dies mid-task. */ @Scheduled(fixedDelay = 5, timeUnit = java.util.concurrent.TimeUnit.MINUTES) @SchedulerLock( diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt index f14d96deda..56772019aa 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/CleanUpStaleSequencesInProcessingTask.kt @@ -14,19 +14,15 @@ class CleanUpStaleSequencesInProcessingTask( private val submissionDatabaseService: SubmissionDatabaseService, @Value("\${${BackendSpringProperty.STALE_AFTER_SECONDS}}") private val timeToStaleInSeconds: Long, ) { - // `fixedDelay` (not `fixedRate`): schedules from completion, so the next poll always fires after - // `lockAtLeastFor` (= run-every) has elapsed since acquisition. With `fixedRate` the poll grid would - // sit exactly on the lock-expiry boundary and skip ticks unpredictably due to clock jitter. + // `fixedDelay`, not `fixedRate`: scheduling from completion keeps each poll past the lock expiry. + // With `fixedRate` the poll grid would sit on the `lockAtLeastFor` boundary and skip ticks on jitter. @Scheduled( fixedDelayString = "\${${BackendSpringProperty.CLEAN_UP_RUN_EVERY_SECONDS}}", timeUnit = TimeUnit.SECONDS, ) @SchedulerLock( name = "cleanUpStaleSequencesInProcessing", - // `lockAtLeastFor` enforces the effective run interval across replicas; it defaults to the - // configured run-every interval so that value is honored rather than silently overridden. - // `lockAtMostFor` (PT5M) is the crash-recovery ceiling. Overridable via `loculus.locks.*` - // (tests set `atLeast` to PT0S). + // Defaults to the configured run-every interval so it is honored, not overridden (tests use PT0S). lockAtLeastFor = "\${loculus.locks.cleanUpStaleSequencesInProcessing.atLeast:" + "PT\${${BackendSpringProperty.CLEAN_UP_RUN_EVERY_SECONDS}}S}", lockAtMostFor = "PT5M", diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt index bd434407e9..f2f0f86d1b 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/UseNewerProcessingPipelineVersionTask.kt @@ -21,11 +21,9 @@ class UseNewerProcessingPipelineVersionTask(private val submissionDatabaseServic ) @SchedulerLock( name = "useNewerProcessingPipelineVersion", - // `lockAtLeastFor` enforces the effective check interval across replicas. It defaults to the - // configured check interval, so operators who tune `interval-seconds` (down to 1s) are honored - // rather than silently overridden. The Helm chart sets `lockAtMostFor` to 5x the interval; the - // PT1M fallback here covers non-Helm runs. Both are overridable via the `loculus.locks.*` keys - // (tests set `atLeast` to PT0S). + // Defaults to the configured check interval so an operator who tunes `interval-seconds` is + // honored, not overridden. The Helm chart sets `atMost` to 5x the interval; PT1M is the + // non-Helm fallback. Both overridable via `loculus.locks.*` (tests use PT0S). lockAtLeastFor = "\${loculus.locks.useNewerProcessingPipelineVersion.atLeast:" + "PT\${${BackendSpringProperty.PIPELINE_VERSION_UPGRADE_CHECK_INTERVAL_SECONDS}}S}", lockAtMostFor = "\${loculus.locks.useNewerProcessingPipelineVersion.atMost:PT1M}", diff --git a/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockIntegrationTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockIntegrationTest.kt index 0841f8cef6..aec8bc031c 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockIntegrationTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/submission/ShedLockIntegrationTest.kt @@ -37,9 +37,7 @@ class ShedLockIntegrationTest( @Test fun `WHEN a lock is released within lockAtLeastFor THEN it cannot be re-acquired yet`() { - // A unique lock name that no scheduled task uses, so the result is not affected by the - // background scheduler. lockAtLeastFor is what prevents the task from running more often - // than the configured interval (regardless of replica count), even after an early release. + // A unique lock name no scheduled task uses, so the background scheduler can't interfere. val lockName = "shedLockIntegrationTestLock" val lockAtMostFor = Duration.ofMinutes(5) val lockAtLeastFor = Duration.ofMinutes(1)