From 884c04e36e688fdc463e8a90a0e2e83d0be2b12d Mon Sep 17 00:00:00 2001 From: maverbiest Date: Wed, 20 May 2026 16:45:33 +0200 Subject: [PATCH 01/40] Initial S3 garbage collection implementation --- .../backend/config/BackendSpringConfig.kt | 1 + .../service/files/FilesDatabaseService.kt | 44 ++++++++++++++++ .../backend/service/files/S3Service.kt | 13 +++++ .../submission/S3GarbageCollectionTask.kt | 52 +++++++++++++++++++ .../src/main/resources/application.properties | 1 + 5 files changed, 111 insertions(+) create mode 100644 backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt diff --git a/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt b/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt index e9ab9640a1..e3cbafb10b 100644 --- a/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt +++ b/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt @@ -43,6 +43,7 @@ object BackendSpringProperty { const val ENABLE_SEQSETS = "loculus.enable-seqsets" const val S3_ENABLED = "loculus.s3.enabled" + const val S3_MAX_ORPHAN_AGE_DAYS = "loculus.s3.orphan-file-max-age-days" const val S3_BUCKET_ENDPOINT = "loculus.s3.bucket.endpoint" const val S3_BUCKET_INTERNAL_ENDPOINT = "loculus.s3.bucket.internal-endpoint" const val S3_BUCKET_BUCKET = "loculus.s3.bucket.bucket" diff --git a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt index dfc6171106..f82574756d 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt @@ -1,8 +1,13 @@ package org.loculus.backend.service.files +import kotlinx.datetime.LocalDateTime import org.jetbrains.exposed.sql.and +import org.jetbrains.exposed.sql.deleteWhere import org.jetbrains.exposed.sql.insert +import org.jetbrains.exposed.sql.kotlin.datetime.KotlinLocalDateTimeColumnType import org.jetbrains.exposed.sql.not +import org.jetbrains.exposed.sql.statements.StatementType +import org.jetbrains.exposed.sql.transactions.transaction import org.jetbrains.exposed.sql.update import org.loculus.backend.utils.DatabaseConstants import org.loculus.backend.utils.DateProvider @@ -26,6 +31,10 @@ class FilesDatabaseService(private val dateProvider: DateProvider) { } } + fun deleteFileEntry(fileId: FiledId) { + FilesTable.deleteWhere { FilesTable.idColumn eq fileId } + } + fun getGroupIds(fileIds: Set): Map = fileIds.chunkedForDatabase({ chunk -> FilesTable.select(FilesTable.idColumn, FilesTable.groupIdColumn) .where { FilesTable.idColumn inList chunk } @@ -57,6 +66,41 @@ class FilesDatabaseService(private val dateProvider: DateProvider) { chunk.filterNot { it in existingIds } }, 1).toSet() + fun getOrphanedFileIds(threshold: LocalDateTime): Set { + val sql = """ + WITH referenced AS ( + SELECT (fil->>'fileId')::uuid AS file_id + FROM sequence_entries, + LATERAL jsonb_each(COALESCE(original_data->'files','{}'::jsonb)) AS cat(k,v), + LATERAL jsonb_array_elements(cat.v) AS fil + UNION + SELECT (fil->>'fileId')::uuid + FROM sequence_entries, + LATERAL jsonb_each(COALESCE(unprocessed_data->'files','{}'::jsonb)) AS cat(k,v), + LATERAL jsonb_array_elements(cat.v) AS fil + UNION + SELECT (fil->>'fileId')::uuid + FROM sequence_entries_preprocessed_data, + LATERAL jsonb_each(COALESCE(processed_data->'files','{}'::jsonb)) AS cat(k,v), + LATERAL jsonb_array_elements(cat.v) AS fil + ) + SELECT f.id FROM files f + LEFT JOIN referenced r ON r.file_id = f.id + WHERE r.file_id IS NULL + AND f.upload_requested_at < ?; + """.trimIndent() + return transaction { + exec(sql, listOf(KotlinLocalDateTimeColumnType() to threshold), + explicitStatementType = StatementType.SELECT) { rs -> + val ids = mutableSetOf() + while (rs.next()) { + ids += rs.getObject("id", UUID::class.java) + } + ids + } ?: emptySet() + } + } + /** * Return the subset of file IDs for which the file size hasn't been checked yet or * no file has been uploaded yet (and therefore there's no file size). diff --git a/backend/src/main/kotlin/org/loculus/backend/service/files/S3Service.kt b/backend/src/main/kotlin/org/loculus/backend/service/files/S3Service.kt index a03e5f6f85..6e50b6148b 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/files/S3Service.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/files/S3Service.kt @@ -14,6 +14,7 @@ import software.amazon.awssdk.services.s3.model.CompleteMultipartUploadRequest import software.amazon.awssdk.services.s3.model.CompletedMultipartUpload import software.amazon.awssdk.services.s3.model.CompletedPart import software.amazon.awssdk.services.s3.model.CreateMultipartUploadRequest +import software.amazon.awssdk.services.s3.model.DeleteObjectRequest import software.amazon.awssdk.services.s3.model.GetObjectRequest import software.amazon.awssdk.services.s3.model.HeadObjectRequest import software.amazon.awssdk.services.s3.model.PutObjectRequest @@ -176,6 +177,18 @@ class S3Service(private val s3Config: S3Config) { Unit } + fun deleteFile(fileId: FileId) = s3ErrorMapping { + val config = getS3BucketConfig() + s3Client.deleteObject( + DeleteObjectRequest + .builder() + .bucket(config.bucket) + .key(getFileIdPath(fileId)) + .build() + ) + Unit + } + private fun assertIsEnabled() { if (!s3Config.enabled) { throw IllegalStateException("S3 is not enabled") diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt new file mode 100644 index 0000000000..d689443bb3 --- /dev/null +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -0,0 +1,52 @@ +package org.loculus.backend.service.submission + +import kotlinx.datetime.DateTimeUnit +import kotlinx.datetime.minus +import kotlinx.datetime.toLocalDateTime +import org.loculus.backend.config.BackendSpringProperty +import org.loculus.backend.log.AuditLogger +import org.loculus.backend.service.files.FilesDatabaseService +import org.loculus.backend.service.files.S3Service +import org.loculus.backend.utils.DateProvider +import org.springframework.beans.factory.annotation.Value +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty +import org.springframework.scheduling.annotation.Scheduled +import org.springframework.stereotype.Component +import java.util.concurrent.TimeUnit + +private val log = mu.KotlinLogging.logger {} + +@Component +@ConditionalOnProperty("loculus.s3.enabled", havingValue = "true") +class S3GarbageCollectionTask( + private val filesDatabaseService: FilesDatabaseService, + private val s3Service: S3Service, + private val dateProvider: DateProvider, + private val auditLogger: AuditLogger, + @Value("\${${BackendSpringProperty.S3_MAX_ORPHAN_AGE_DAYS}}") private val maxOrphanAge: Int, +) { + + /** + * Runs once daily and deletes S3 objects older than `loculus.s3.orphan-file-max-age-days` + * and are not referenced anywhere in original_data, unprocessed_data, or processed_data + */ + @Scheduled(fixedDelay = 1, timeUnit = TimeUnit.DAYS) + fun task() { + val threshold = dateProvider.getCurrentInstant() + .minus(maxOrphanAge, DateTimeUnit.DAY, DateProvider.timeZone) + .toLocalDateTime(DateProvider.timeZone) + val orphans = filesDatabaseService.getOrphanedFileIds(threshold) + orphans.forEach { fileId -> + s3Service.deleteFile(fileId) + filesDatabaseService.deleteFileEntry(fileId) + } + + if (orphans.isNotEmpty()) { + log.info { "Deleted ${orphans.size} orphans that were not referenced by a submission after $maxOrphanAge days" } + auditLogger + .log( + "CLEANUP", "Deleted ${orphans.size} orphans that were not referenced by a submission after $maxOrphanAge days" + ) + } + } +} diff --git a/backend/src/main/resources/application.properties b/backend/src/main/resources/application.properties index 1ce6999fef..b41f31ead6 100644 --- a/backend/src/main/resources/application.properties +++ b/backend/src/main/resources/application.properties @@ -33,6 +33,7 @@ loculus.stream.batch-size=1000 loculus.debug-mode=false loculus.s3.enabled=false +loculus.s3.orphan-file-max-age-days=7 loculus.s3.bucket.endpoint= loculus.s3.bucket.bucket= loculus.s3.bucket.region= From e107ba5a09ea75c8c8991fbb30999fb2bc50985f Mon Sep 17 00:00:00 2001 From: maverbiest Date: Sun, 31 May 2026 16:05:05 +0200 Subject: [PATCH 02/40] Update type --- .../org/loculus/backend/service/files/FilesDatabaseService.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt index f82574756d..1c6c06eb85 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt @@ -31,7 +31,7 @@ class FilesDatabaseService(private val dateProvider: DateProvider) { } } - fun deleteFileEntry(fileId: FiledId) { + fun deleteFileEntry(fileId: UUID) { FilesTable.deleteWhere { FilesTable.idColumn eq fileId } } From 0443280e490120a933491fff15738c087e4e1dc0 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Mon, 1 Jun 2026 11:18:45 +0200 Subject: [PATCH 03/40] Update query --- .../loculus/backend/service/files/FilesDatabaseService.kt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt index 1c6c06eb85..562289e3aa 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt @@ -68,12 +68,10 @@ class FilesDatabaseService(private val dateProvider: DateProvider) { fun getOrphanedFileIds(threshold: LocalDateTime): Set { val sql = """ + -- check for files for which an upload was requested > threshold days ago + -- but are not referenced by a submission. For this, check the unprocessed_data + -- and processed_data jsonb objects, but not original_data WITH referenced AS ( - SELECT (fil->>'fileId')::uuid AS file_id - FROM sequence_entries, - LATERAL jsonb_each(COALESCE(original_data->'files','{}'::jsonb)) AS cat(k,v), - LATERAL jsonb_array_elements(cat.v) AS fil - UNION SELECT (fil->>'fileId')::uuid FROM sequence_entries, LATERAL jsonb_each(COALESCE(unprocessed_data->'files','{}'::jsonb)) AS cat(k,v), From af95fb79496360cea741a56a37054013172f37e2 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Mon, 1 Jun 2026 15:25:56 +0200 Subject: [PATCH 04/40] Fix query --- .../org/loculus/backend/service/files/FilesDatabaseService.kt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt index 562289e3aa..fc1dd801a1 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt @@ -72,12 +72,12 @@ class FilesDatabaseService(private val dateProvider: DateProvider) { -- but are not referenced by a submission. For this, check the unprocessed_data -- and processed_data jsonb objects, but not original_data WITH referenced AS ( - SELECT (fil->>'fileId')::uuid + SELECT (fil->>'fileId')::uuid as file_id FROM sequence_entries, LATERAL jsonb_each(COALESCE(unprocessed_data->'files','{}'::jsonb)) AS cat(k,v), LATERAL jsonb_array_elements(cat.v) AS fil UNION - SELECT (fil->>'fileId')::uuid + SELECT (fil->>'fileId')::uuid as file_id FROM sequence_entries_preprocessed_data, LATERAL jsonb_each(COALESCE(processed_data->'files','{}'::jsonb)) AS cat(k,v), LATERAL jsonb_array_elements(cat.v) AS fil From 6f3d14681b5f7d7216cbbf726ce380459c3ca68f Mon Sep 17 00:00:00 2001 From: maverbiest Date: Tue, 2 Jun 2026 12:55:01 +0200 Subject: [PATCH 05/40] Update SQL to also GC files generated by old pipeline versions --- .../service/files/FilesDatabaseService.kt | 36 ++++++++++++------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt index fc1dd801a1..3288281a26 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt @@ -67,25 +67,35 @@ class FilesDatabaseService(private val dateProvider: DateProvider) { }, 1).toSet() fun getOrphanedFileIds(threshold: LocalDateTime): Set { - val sql = """ + val sql = """ -- check for files for which an upload was requested > threshold days ago -- but are not referenced by a submission. For this, check the unprocessed_data -- and processed_data jsonb objects, but not original_data WITH referenced AS ( - SELECT (fil->>'fileId')::uuid as file_id - FROM sequence_entries, - LATERAL jsonb_each(COALESCE(unprocessed_data->'files','{}'::jsonb)) AS cat(k,v), - LATERAL jsonb_array_elements(cat.v) AS fil - UNION - SELECT (fil->>'fileId')::uuid as file_id - FROM sequence_entries_preprocessed_data, - LATERAL jsonb_each(COALESCE(processed_data->'files','{}'::jsonb)) AS cat(k,v), - LATERAL jsonb_array_elements(cat.v) AS fil + SELECT (fil->>'fileId')::uuid AS file_id + -- files uploaded by users and referenced in submissions + FROM sequence_entries, + LATERAL jsonb_each(COALESCE(unprocessed_data->'files','{}'::jsonb)) AS cat(k,v), + LATERAL jsonb_array_elements(cat.v) AS fil + UNION + -- files produced by preprocessing. For these, we keep only files referenced + -- by the current pipeline version or newer. + -- (newer than current versions will only exist during rollouts) + SELECT (fil->>'fileId')::uuid AS file_id + FROM sequence_entries_preprocessed_data sepd + JOIN sequence_entries se + ON se.accession = sepd.accession + AND se.version = sepd.version + JOIN current_processing_pipeline cpp + ON cpp.organism = se.organism + AND sepd.pipeline_version >= cpp.version, + LATERAL jsonb_each(COALESCE(sepd.processed_data->'files','{}'::jsonb)) AS cat(k,v), + LATERAL jsonb_array_elements(cat.v) AS fil ) SELECT f.id FROM files f - LEFT JOIN referenced r ON r.file_id = f.id - WHERE r.file_id IS NULL - AND f.upload_requested_at < ?; + LEFT JOIN referenced r ON r.file_id = f.id + WHERE r.file_id IS NULL + AND f.upload_requested_at < ?; """.trimIndent() return transaction { exec(sql, listOf(KotlinLocalDateTimeColumnType() to threshold), From a3aa7f079cb8b00ee872abeb4b99eb2bdcaeafe1 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Tue, 2 Jun 2026 13:48:45 +0200 Subject: [PATCH 06/40] update comment --- .../backend/service/files/FilesDatabaseService.kt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt index 3288281a26..6bd3995503 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt @@ -72,14 +72,15 @@ class FilesDatabaseService(private val dateProvider: DateProvider) { -- but are not referenced by a submission. For this, check the unprocessed_data -- and processed_data jsonb objects, but not original_data WITH referenced AS ( - SELECT (fil->>'fileId')::uuid AS file_id - -- files uploaded by users and referenced in submissions + -- fetch ids for files uploaded by users and referenced in submissions + SELECT (fil->>'fileId')::uuid AS file_id FROM sequence_entries, LATERAL jsonb_each(COALESCE(unprocessed_data->'files','{}'::jsonb)) AS cat(k,v), LATERAL jsonb_array_elements(cat.v) AS fil UNION - -- files produced by preprocessing. For these, we keep only files referenced - -- by the current pipeline version or newer. + -- fetch ids for files produced by preprocessing. + -- For these, we consider only files referenced by the current pipeline + -- version or newer. -- (newer than current versions will only exist during rollouts) SELECT (fil->>'fileId')::uuid AS file_id FROM sequence_entries_preprocessed_data sepd From aed1fb610e0d25e95775779d93798da7edfea1cc Mon Sep 17 00:00:00 2001 From: maverbiest Date: Tue, 2 Jun 2026 14:56:36 +0200 Subject: [PATCH 07/40] Linting, formatting --- .../loculus/backend/service/files/FilesDatabaseService.kt | 7 +++++-- .../kotlin/org/loculus/backend/service/files/S3Service.kt | 2 +- .../backend/service/submission/S3GarbageCollectionTask.kt | 7 +++++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt index 6bd3995503..bfa94dc8e2 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt @@ -99,8 +99,11 @@ class FilesDatabaseService(private val dateProvider: DateProvider) { AND f.upload_requested_at < ?; """.trimIndent() return transaction { - exec(sql, listOf(KotlinLocalDateTimeColumnType() to threshold), - explicitStatementType = StatementType.SELECT) { rs -> + exec( + sql, + listOf(KotlinLocalDateTimeColumnType() to threshold), + explicitStatementType = StatementType.SELECT, + ) { rs -> val ids = mutableSetOf() while (rs.next()) { ids += rs.getObject("id", UUID::class.java) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/files/S3Service.kt b/backend/src/main/kotlin/org/loculus/backend/service/files/S3Service.kt index 6e50b6148b..7c966393f3 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/files/S3Service.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/files/S3Service.kt @@ -184,7 +184,7 @@ class S3Service(private val s3Config: S3Config) { .builder() .bucket(config.bucket) .key(getFileIdPath(fileId)) - .build() + .build(), ) Unit } diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index d689443bb3..e92d9c1399 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -42,10 +42,13 @@ class S3GarbageCollectionTask( } if (orphans.isNotEmpty()) { - log.info { "Deleted ${orphans.size} orphans that were not referenced by a submission after $maxOrphanAge days" } + log.info { + "Deleted ${orphans.size} orphans that were not referenced by a submission after $maxOrphanAge days" + } auditLogger .log( - "CLEANUP", "Deleted ${orphans.size} orphans that were not referenced by a submission after $maxOrphanAge days" + "CLEANUP", + "Deleted ${orphans.size} orphans that were not referenced by a submission after $maxOrphanAge days", ) } } From a6500b8fce915d88aaa726bf4b0922089b138218 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Tue, 2 Jun 2026 15:08:37 +0200 Subject: [PATCH 08/40] fix imports --- .../org/loculus/backend/service/files/FilesDatabaseService.kt | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt index bfa94dc8e2..fff1ad17e7 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt @@ -1,6 +1,7 @@ package org.loculus.backend.service.files import kotlinx.datetime.LocalDateTime +import org.jetbrains.exposed.sql.SqlExpressionBuilder.eq import org.jetbrains.exposed.sql.and import org.jetbrains.exposed.sql.deleteWhere import org.jetbrains.exposed.sql.insert From 683c5b06e34b1b54b4c4e217ae711283930ec9f9 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Fri, 5 Jun 2026 18:21:11 +0200 Subject: [PATCH 09/40] Fix query --- .../org/loculus/backend/service/files/FilesDatabaseService.kt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt index fff1ad17e7..9c0700a15d 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt @@ -76,7 +76,7 @@ class FilesDatabaseService(private val dateProvider: DateProvider) { -- fetch ids for files uploaded by users and referenced in submissions SELECT (fil->>'fileId')::uuid AS file_id FROM sequence_entries, - LATERAL jsonb_each(COALESCE(unprocessed_data->'files','{}'::jsonb)) AS cat(k,v), + LATERAL jsonb_each(COALESCE(NULLIF(unprocessed_data->'files', 'null'::jsonb),'{}'::jsonb)) AS cat(k,v), LATERAL jsonb_array_elements(cat.v) AS fil UNION -- fetch ids for files produced by preprocessing. @@ -91,7 +91,7 @@ class FilesDatabaseService(private val dateProvider: DateProvider) { JOIN current_processing_pipeline cpp ON cpp.organism = se.organism AND sepd.pipeline_version >= cpp.version, - LATERAL jsonb_each(COALESCE(sepd.processed_data->'files','{}'::jsonb)) AS cat(k,v), + LATERAL jsonb_each(COALESCE(NULLIF(sepd.processed_data->'files', 'null'::jsonb),'{}'::jsonb)) AS cat(k,v), LATERAL jsonb_array_elements(cat.v) AS fil ) SELECT f.id FROM files f From 71ca76414eb559642ff162dfe9559867d7eb4941 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Fri, 5 Jun 2026 18:21:27 +0200 Subject: [PATCH 10/40] Claude: add tests --- .../service/files/GetOrphanedFileIdsTest.kt | 220 ++++++++++++++++++ .../submission/S3GarbageCollectionTaskTest.kt | 115 +++++++++ 2 files changed, 335 insertions(+) create mode 100644 backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt create mode 100644 backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt diff --git a/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt new file mode 100644 index 0000000000..cd9aa7130b --- /dev/null +++ b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt @@ -0,0 +1,220 @@ +package org.loculus.backend.service.files + +import kotlinx.datetime.DateTimeUnit +import kotlinx.datetime.LocalDateTime +import kotlinx.datetime.minus +import kotlinx.datetime.toLocalDateTime +import org.hamcrest.MatcherAssert.assertThat +import org.hamcrest.Matchers.containsInAnyOrder +import org.hamcrest.Matchers.`is` +import org.jetbrains.exposed.sql.SqlExpressionBuilder.eq +import org.jetbrains.exposed.sql.insert +import org.jetbrains.exposed.sql.transactions.transaction +import org.jetbrains.exposed.sql.update +import org.junit.jupiter.api.BeforeEach +import org.junit.jupiter.api.Test +import org.loculus.backend.api.FileIdAndName +import org.loculus.backend.api.OriginalData +import org.loculus.backend.api.ProcessedData +import org.loculus.backend.config.BackendSpringProperty +import org.loculus.backend.controller.DEFAULT_GROUP +import org.loculus.backend.controller.DEFAULT_ORGANISM +import org.loculus.backend.controller.EndpointTest +import org.loculus.backend.controller.groupmanagement.GroupManagementControllerClient +import org.loculus.backend.controller.groupmanagement.andGetGroupId +import org.loculus.backend.controller.jwtForDefaultUser +import org.loculus.backend.service.submission.CompressedSequence +import org.loculus.backend.service.submission.SequenceEntriesPreprocessedDataTable +import org.loculus.backend.service.submission.SequenceEntriesTable +import org.loculus.backend.service.submission.dbtables.CurrentProcessingPipelineTable +import org.loculus.backend.utils.DateProvider +import org.springframework.beans.factory.annotation.Autowired +import java.util.UUID + +/** + * Tests for the raw SQL in [FilesDatabaseService.getOrphanedFileIds]. + * + * This is where all the correctness risk of the garbage collection lives, so the cases are + * exercised directly against the database without involving S3. The `threshold` is passed + * explicitly, which lets us isolate the *reference* logic (these tests use a far-back threshold + * so every file is age-eligible) from the *age gate* (tested separately). + * + * The pipeline-version-upgrade task is effectively disabled via a huge check interval so it can't + * concurrently bump the current pipeline version or delete preprocessed rows mid-test. + */ +@EndpointTest( + properties = [ + "${BackendSpringProperty.PIPELINE_VERSION_UPGRADE_CHECK_INTERVAL_SECONDS}=999999", + ], +) +class GetOrphanedFileIdsTest( + @Autowired val filesDatabaseService: FilesDatabaseService, + @Autowired val groupManagementClient: GroupManagementControllerClient, + @Autowired val dateProvider: DateProvider, +) { + private var groupId = 0 + + @BeforeEach + fun createGroup() { + groupId = groupManagementClient + .createNewGroup(group = DEFAULT_GROUP, jwt = jwtForDefaultUser) + .andGetGroupId() + } + + @Test + fun `GIVEN a file only referenced in original_data THEN it is orphaned, but unprocessed_data is protected`() { + // A user edited a submission, replacing `editedAway` with `currentFile`. + // After the edit, only `unprocessed_data` is updated, so `editedAway` lingers in + // `original_data` (which the query intentionally ignores) and should be reclaimed. + val editedAway = UUID.randomUUID() + val currentFile = UUID.randomUUID() + listOf(editedAway, currentFile).forEach { insertFile(it, requestedAt = daysAgo(10)) } + insertSequenceEntry( + accession = "A1", + version = 1, + original = filesReferencing(editedAway), + unprocessed = filesReferencing(currentFile), + ) + + val orphans = filesDatabaseService.getOrphanedFileIds(daysAgo(5)) + + assertThat(orphans, containsInAnyOrder(editedAway)) + } + + @Test + fun `GIVEN processed_data files THEN only those from a pipeline version older than current are orphaned`() { + setCurrentPipelineVersion(DEFAULT_ORGANISM, 2) + val fileFromOldVersion = UUID.randomUUID() // pipeline v1 (< current) -> orphaned + val fileFromCurrentVersion = UUID.randomUUID() // pipeline v2 (== current) -> protected + val fileFromNewerVersion = UUID.randomUUID() // pipeline v3 (> current, rollout) -> protected + listOf(fileFromOldVersion, fileFromCurrentVersion, fileFromNewerVersion) + .forEach { insertFile(it, requestedAt = daysAgo(10)) } + + // The sequence entry itself references no files, so only the processed_data references matter. + insertSequenceEntry(accession = "A1", version = 1, original = null, unprocessed = filesReferencing(null)) + insertPreprocessed( + accession = "A1", + version = 1, + pipelineVersion = 1, + processed = processedReferencing(fileFromOldVersion), + ) + insertPreprocessed( + accession = "A1", + version = 1, + pipelineVersion = 2, + processed = processedReferencing(fileFromCurrentVersion), + ) + insertPreprocessed( + accession = "A1", + version = 1, + pipelineVersion = 3, + processed = processedReferencing(fileFromNewerVersion), + ) + + val orphans = filesDatabaseService.getOrphanedFileIds(daysAgo(5)) + + assertThat(orphans, containsInAnyOrder(fileFromOldVersion)) + } + + @Test + fun `GIVEN unreferenced files THEN only those whose upload was requested before the threshold are orphaned`() { + val old = UUID.randomUUID() + val recent = UUID.randomUUID() + insertFile(old, requestedAt = daysAgo(10)) + insertFile(recent, requestedAt = daysAgo(1)) + + val orphans = filesDatabaseService.getOrphanedFileIds(daysAgo(5)) + + assertThat(orphans, containsInAnyOrder(old)) + } + + @Test + fun `GIVEN a file referenced by an old, superseded version THEN it is still protected`() { + // Branch 1 of the query has no version filter: a file referenced by ANY version's + // unprocessed_data must survive, even if that version is no longer the latest. + val fileInOldVersion = UUID.randomUUID() + insertFile(fileInOldVersion, requestedAt = daysAgo(10)) + insertSequenceEntry( + accession = "A1", + version = 1, + original = null, + unprocessed = filesReferencing(fileInOldVersion), + ) + insertSequenceEntry(accession = "A1", version = 2, original = null, unprocessed = filesReferencing(null)) + + val orphans = filesDatabaseService.getOrphanedFileIds(daysAgo(5)) + + assertThat(orphans.isEmpty(), `is`(true)) + } + + private fun daysAgo(days: Long): LocalDateTime = dateProvider.getCurrentInstant() + .minus(days, DateTimeUnit.DAY, DateProvider.timeZone) + .toLocalDateTime(DateProvider.timeZone) + + private fun insertFile(id: UUID, requestedAt: LocalDateTime) = transaction { + FilesTable.insert { + it[idColumn] = id + it[uploadRequestedAtColumn] = requestedAt + it[uploaderColumn] = "testuser" + it[groupIdColumn] = groupId + it[multipartCompleted] = true + } + } + + private fun insertSequenceEntry( + accession: String, + version: Long, + original: OriginalData?, + unprocessed: OriginalData?, + ) = transaction { + SequenceEntriesTable.insert { + it[accessionColumn] = accession + it[versionColumn] = version + it[organismColumn] = DEFAULT_ORGANISM + it[submissionIdColumn] = "submission-$accession-$version" + it[submitterColumn] = "testuser" + it[groupIdColumn] = groupId + it[submittedAtTimestampColumn] = dateProvider.getCurrentDateTime() + it[originalDataColumn] = original + it[unprocessedDataColumn] = unprocessed + } + } + + private fun insertPreprocessed( + accession: String, + version: Long, + pipelineVersion: Long, + processed: ProcessedData, + ) = transaction { + SequenceEntriesPreprocessedDataTable.insert { + it[accessionColumn] = accession + it[versionColumn] = version + it[pipelineVersionColumn] = pipelineVersion + it[processedDataColumn] = processed + it[processingStatusColumn] = "PROCESSED" + it[startedProcessingAtColumn] = dateProvider.getCurrentDateTime() + } + } + + private fun setCurrentPipelineVersion(organism: String, version: Long) = transaction { + CurrentProcessingPipelineTable.update({ CurrentProcessingPipelineTable.organismColumn eq organism }) { + it[versionColumn] = version + } + } + + private fun filesReferencing(fileId: UUID?): OriginalData = OriginalData( + metadata = emptyMap(), + unalignedNucleotideSequences = emptyMap(), + files = fileId?.let { mapOf("rawReads" to listOf(FileIdAndName(it, "raw.txt"))) }, + ) + + private fun processedReferencing(fileId: UUID): ProcessedData = ProcessedData( + metadata = emptyMap(), + unalignedNucleotideSequences = emptyMap(), + alignedNucleotideSequences = emptyMap(), + nucleotideInsertions = emptyMap(), + alignedAminoAcidSequences = emptyMap(), + aminoAcidInsertions = emptyMap(), + files = mapOf("processedOutput" to listOf(FileIdAndName(fileId, "out.txt"))), + ) +} diff --git a/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt new file mode 100644 index 0000000000..1d3d210f7e --- /dev/null +++ b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt @@ -0,0 +1,115 @@ +package org.loculus.backend.service.submission + +import com.ninjasquad.springmockk.MockkBean +import io.mockk.verify +import kotlinx.datetime.DateTimeUnit +import kotlinx.datetime.minus +import kotlinx.datetime.toLocalDateTime +import org.hamcrest.MatcherAssert.assertThat +import org.hamcrest.Matchers.containsInAnyOrder +import org.jetbrains.exposed.sql.insert +import org.jetbrains.exposed.sql.transactions.transaction +import org.junit.jupiter.api.BeforeEach +import org.junit.jupiter.api.Test +import org.loculus.backend.api.FileIdAndName +import org.loculus.backend.api.OriginalData +import org.loculus.backend.config.BackendSpringProperty +import org.loculus.backend.controller.DEFAULT_GROUP +import org.loculus.backend.controller.DEFAULT_ORGANISM +import org.loculus.backend.controller.EndpointTest +import org.loculus.backend.controller.groupmanagement.GroupManagementControllerClient +import org.loculus.backend.controller.groupmanagement.andGetGroupId +import org.loculus.backend.controller.jwtForDefaultUser +import org.loculus.backend.service.files.FilesDatabaseService +import org.loculus.backend.service.files.FilesTable +import org.loculus.backend.service.files.S3Service +import org.loculus.backend.utils.DateProvider +import org.springframework.beans.factory.annotation.Autowired +import java.util.UUID + +/** + * Tests the orchestration of [S3GarbageCollectionTask]: which files get deleted from S3 and from + * the files table. S3 itself is mocked here (the interesting behaviour is *selection*, not the + * actual S3 I/O), which also lets us inject faults later if we want to test error handling. + * + * `orphan-file-max-age-days=0` makes the threshold "now", so any file whose upload was requested + * in the past is age-eligible; the referenced file is protected by the reference check regardless. + * + * Assertions are written to be robust against the task's own @Scheduled trigger possibly firing in + * the background: deletion of the orphan is verified as "at least once", protection of the + * referenced file as "never", and the final DB state is checked (which is idempotent under reruns). + */ +@EndpointTest( + properties = [ + "${BackendSpringProperty.S3_ENABLED}=true", + "${BackendSpringProperty.S3_MAX_ORPHAN_AGE_DAYS}=0", + ], +) +class S3GarbageCollectionTaskTest( + @Autowired val s3GarbageCollectionTask: S3GarbageCollectionTask, + @Autowired val filesDatabaseService: FilesDatabaseService, + @Autowired val groupManagementClient: GroupManagementControllerClient, + @Autowired val dateProvider: DateProvider, +) { + @MockkBean(relaxed = true) + lateinit var s3Service: S3Service + + private var groupId = 0 + + @BeforeEach + fun createGroup() { + groupId = groupManagementClient + .createNewGroup(group = DEFAULT_GROUP, jwt = jwtForDefaultUser) + .andGetGroupId() + } + + @Suppress("ktlint:standard:max-line-length") + @Test + fun `GIVEN an orphan and a referenced file WHEN the task runs THEN only the orphan is deleted from S3 and the DB`() { + val orphan = UUID.randomUUID() + val referenced = UUID.randomUUID() + listOf(orphan, referenced).forEach { insertFile(it) } + + // `referenced` is referenced by a submission's unprocessed_data, so it must be protected. + transaction { + SequenceEntriesTable.insert { + it[accessionColumn] = "A1" + it[versionColumn] = 1 + it[organismColumn] = DEFAULT_ORGANISM + it[submissionIdColumn] = "submission-A1" + it[submitterColumn] = "testuser" + it[groupIdColumn] = groupId + it[submittedAtTimestampColumn] = dateProvider.getCurrentDateTime() + it[unprocessedDataColumn] = OriginalData( + metadata = emptyMap(), + unalignedNucleotideSequences = emptyMap(), + files = mapOf("rawReads" to listOf(FileIdAndName(referenced, "raw.txt"))), + ) + } + } + + s3GarbageCollectionTask.task() + + verify { s3Service.deleteFile(orphan) } + verify(exactly = 0) { s3Service.deleteFile(referenced) } + assertThat( + filesDatabaseService.getNonExistentFileIds(setOf(orphan, referenced)), + containsInAnyOrder(orphan), + ) + } + + private fun insertFile(id: UUID) { + val oneDayAgo = dateProvider.getCurrentInstant() + .minus(1, DateTimeUnit.DAY, DateProvider.timeZone) + .toLocalDateTime(DateProvider.timeZone) + transaction { + FilesTable.insert { + it[idColumn] = id + it[uploadRequestedAtColumn] = oneDayAgo + it[uploaderColumn] = "testuser" + it[groupIdColumn] = groupId + it[multipartCompleted] = true + } + } + } +} From 4797c66a44e73622640e8652018f782a1919e32c Mon Sep 17 00:00:00 2001 From: maverbiest Date: Mon, 8 Jun 2026 11:48:07 +0200 Subject: [PATCH 11/40] Clean up GetOrphanedFileIdsTest.kt --- .../service/files/GetOrphanedFileIdsTest.kt | 113 ++++++++---------- 1 file changed, 51 insertions(+), 62 deletions(-) diff --git a/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt index cd9aa7130b..9e7d143d32 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt @@ -5,9 +5,7 @@ import kotlinx.datetime.LocalDateTime import kotlinx.datetime.minus import kotlinx.datetime.toLocalDateTime import org.hamcrest.MatcherAssert.assertThat -import org.hamcrest.Matchers.containsInAnyOrder import org.hamcrest.Matchers.`is` -import org.jetbrains.exposed.sql.SqlExpressionBuilder.eq import org.jetbrains.exposed.sql.insert import org.jetbrains.exposed.sql.transactions.transaction import org.jetbrains.exposed.sql.update @@ -32,15 +30,7 @@ import org.springframework.beans.factory.annotation.Autowired import java.util.UUID /** - * Tests for the raw SQL in [FilesDatabaseService.getOrphanedFileIds]. - * - * This is where all the correctness risk of the garbage collection lives, so the cases are - * exercised directly against the database without involving S3. The `threshold` is passed - * explicitly, which lets us isolate the *reference* logic (these tests use a far-back threshold - * so every file is age-eligible) from the *age gate* (tested separately). - * - * The pipeline-version-upgrade task is effectively disabled via a huge check interval so it can't - * concurrently bump the current pipeline version or delete preprocessed rows mid-test. + * Testing of orphan file detection logic in [FilesDatabaseService.getOrphanedFileIds]. */ @EndpointTest( properties = [ @@ -61,86 +51,91 @@ class GetOrphanedFileIdsTest( .andGetGroupId() } + @Test + fun `GIVEN unreferenced files THEN only those whose upload was requested before the threshold are orphaned`() { + val old = UUID.randomUUID() + val recent = UUID.randomUUID() + insertFile(old, requestedAt = daysAgo(10)) + insertFile(recent, requestedAt = daysAgo(1)) + + val orphans = filesDatabaseService.getOrphanedFileIds(daysAgo(5)) + + assertThat(orphans, `is`(setOf(old))) + } + @Test fun `GIVEN a file only referenced in original_data THEN it is orphaned, but unprocessed_data is protected`() { - // A user edited a submission, replacing `editedAway` with `currentFile`. - // After the edit, only `unprocessed_data` is updated, so `editedAway` lingers in - // `original_data` (which the query intentionally ignores) and should be reclaimed. + // Simulate a case where a user edited a submission, replacing `editedAway` with `currentFile`. val editedAway = UUID.randomUUID() val currentFile = UUID.randomUUID() listOf(editedAway, currentFile).forEach { insertFile(it, requestedAt = daysAgo(10)) } insertSequenceEntry( - accession = "A1", - version = 1, - original = filesReferencing(editedAway), - unprocessed = filesReferencing(currentFile), + accession = "A", + version = 2, + original = makeUnprocessedData(editedAway), + unprocessed = makeUnprocessedData(currentFile), ) val orphans = filesDatabaseService.getOrphanedFileIds(daysAgo(5)) - assertThat(orphans, containsInAnyOrder(editedAway)) + assertThat(orphans, `is`(setOf(editedAway))) } @Test fun `GIVEN processed_data files THEN only those from a pipeline version older than current are orphaned`() { - setCurrentPipelineVersion(DEFAULT_ORGANISM, 2) - val fileFromOldVersion = UUID.randomUUID() // pipeline v1 (< current) -> orphaned - val fileFromCurrentVersion = UUID.randomUUID() // pipeline v2 (== current) -> protected - val fileFromNewerVersion = UUID.randomUUID() // pipeline v3 (> current, rollout) -> protected - listOf(fileFromOldVersion, fileFromCurrentVersion, fileFromNewerVersion) + transaction { + CurrentProcessingPipelineTable.update( + { CurrentProcessingPipelineTable.organismColumn eq DEFAULT_ORGANISM } + ) { + it[versionColumn] = 2 + } + } + + val fileFromOldPipeline = UUID.randomUUID() // pipeline version 1 (< current) -> orphaned + val fileFromCurrentPipeline = UUID.randomUUID() // pipeline version 2 (current) -> protected + val fileFromNewerPipeline = UUID.randomUUID() // pipeline version 3 (> current) -> protected + listOf(fileFromOldPipeline, fileFromCurrentPipeline, fileFromNewerPipeline) .forEach { insertFile(it, requestedAt = daysAgo(10)) } // The sequence entry itself references no files, so only the processed_data references matter. - insertSequenceEntry(accession = "A1", version = 1, original = null, unprocessed = filesReferencing(null)) - insertPreprocessed( - accession = "A1", + insertSequenceEntry(accession = "A", version = 1, original = null, unprocessed = makeUnprocessedData(null)) + insertPreprocessedData( + accession = "A", version = 1, pipelineVersion = 1, - processed = processedReferencing(fileFromOldVersion), + processed = makeProcessedData(fileFromOldPipeline), ) - insertPreprocessed( - accession = "A1", + insertPreprocessedData( + accession = "A", version = 1, pipelineVersion = 2, - processed = processedReferencing(fileFromCurrentVersion), + processed = makeProcessedData(fileFromCurrentPipeline), ) - insertPreprocessed( - accession = "A1", + insertPreprocessedData( + accession = "A", version = 1, pipelineVersion = 3, - processed = processedReferencing(fileFromNewerVersion), + processed = makeProcessedData(fileFromNewerPipeline), ) val orphans = filesDatabaseService.getOrphanedFileIds(daysAgo(5)) - assertThat(orphans, containsInAnyOrder(fileFromOldVersion)) + assertThat(orphans, `is`(setOf(fileFromOldPipeline))) } @Test - fun `GIVEN unreferenced files THEN only those whose upload was requested before the threshold are orphaned`() { - val old = UUID.randomUUID() - val recent = UUID.randomUUID() - insertFile(old, requestedAt = daysAgo(10)) - insertFile(recent, requestedAt = daysAgo(1)) - - val orphans = filesDatabaseService.getOrphanedFileIds(daysAgo(5)) - - assertThat(orphans, containsInAnyOrder(old)) - } - - @Test - fun `GIVEN a file referenced by an old, superseded version THEN it is still protected`() { + fun `GIVEN a file referenced only by old version THEN it is still protected`() { // Branch 1 of the query has no version filter: a file referenced by ANY version's // unprocessed_data must survive, even if that version is no longer the latest. val fileInOldVersion = UUID.randomUUID() insertFile(fileInOldVersion, requestedAt = daysAgo(10)) insertSequenceEntry( - accession = "A1", + accession = "A", version = 1, original = null, - unprocessed = filesReferencing(fileInOldVersion), + unprocessed = makeUnprocessedData(fileInOldVersion), ) - insertSequenceEntry(accession = "A1", version = 2, original = null, unprocessed = filesReferencing(null)) + insertSequenceEntry(accession = "A", version = 2, original = null, unprocessed = makeUnprocessedData(null)) val orphans = filesDatabaseService.getOrphanedFileIds(daysAgo(5)) @@ -180,7 +175,7 @@ class GetOrphanedFileIdsTest( } } - private fun insertPreprocessed( + private fun insertPreprocessedData( accession: String, version: Long, pipelineVersion: Long, @@ -196,25 +191,19 @@ class GetOrphanedFileIdsTest( } } - private fun setCurrentPipelineVersion(organism: String, version: Long) = transaction { - CurrentProcessingPipelineTable.update({ CurrentProcessingPipelineTable.organismColumn eq organism }) { - it[versionColumn] = version - } - } - - private fun filesReferencing(fileId: UUID?): OriginalData = OriginalData( + private fun makeUnprocessedData(fileId: UUID?): OriginalData = OriginalData( metadata = emptyMap(), unalignedNucleotideSequences = emptyMap(), - files = fileId?.let { mapOf("rawReads" to listOf(FileIdAndName(it, "raw.txt"))) }, + files = fileId?.let { mapOf("rawReads" to listOf(FileIdAndName(it, "raw.fastq"))) }, ) - private fun processedReferencing(fileId: UUID): ProcessedData = ProcessedData( + private fun makeProcessedData(fileId: UUID): ProcessedData = ProcessedData( metadata = emptyMap(), unalignedNucleotideSequences = emptyMap(), alignedNucleotideSequences = emptyMap(), nucleotideInsertions = emptyMap(), alignedAminoAcidSequences = emptyMap(), aminoAcidInsertions = emptyMap(), - files = mapOf("processedOutput" to listOf(FileIdAndName(fileId, "out.txt"))), + files = mapOf("processedOutput" to listOf(FileIdAndName(fileId, "aligned.bam"))), ) } From 702ccb350ad4decdb3adcca86921bf49f7fa6ceb Mon Sep 17 00:00:00 2001 From: maverbiest Date: Mon, 8 Jun 2026 12:14:28 +0200 Subject: [PATCH 12/40] Add test helper file, small typing fix --- .../submission/S3GarbageCollectionTask.kt | 3 +- .../loculus/backend/service/TestHelpers.kt | 26 +++++++++++++++ .../service/files/GetOrphanedFileIdsTest.kt | 32 +++++-------------- .../submission/S3GarbageCollectionTaskTest.kt | 23 ++----------- 4 files changed, 39 insertions(+), 45 deletions(-) create mode 100644 backend/src/test/kotlin/org/loculus/backend/service/TestHelpers.kt diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index e92d9c1399..aff43cb1f8 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -12,6 +12,7 @@ import org.springframework.beans.factory.annotation.Value import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty import org.springframework.scheduling.annotation.Scheduled import org.springframework.stereotype.Component +import java.util.UUID import java.util.concurrent.TimeUnit private val log = mu.KotlinLogging.logger {} @@ -36,7 +37,7 @@ class S3GarbageCollectionTask( .minus(maxOrphanAge, DateTimeUnit.DAY, DateProvider.timeZone) .toLocalDateTime(DateProvider.timeZone) val orphans = filesDatabaseService.getOrphanedFileIds(threshold) - orphans.forEach { fileId -> + orphans.forEach { fileId: UUID -> s3Service.deleteFile(fileId) filesDatabaseService.deleteFileEntry(fileId) } diff --git a/backend/src/test/kotlin/org/loculus/backend/service/TestHelpers.kt b/backend/src/test/kotlin/org/loculus/backend/service/TestHelpers.kt new file mode 100644 index 0000000000..42282ab240 --- /dev/null +++ b/backend/src/test/kotlin/org/loculus/backend/service/TestHelpers.kt @@ -0,0 +1,26 @@ +package org.loculus.backend.service + +import kotlinx.datetime.DateTimeUnit +import kotlinx.datetime.LocalDateTime +import kotlinx.datetime.minus +import kotlinx.datetime.toLocalDateTime +import org.jetbrains.exposed.sql.insert +import org.jetbrains.exposed.sql.transactions.transaction +import org.loculus.backend.service.files.FilesTable +import org.loculus.backend.utils.DateProvider +import java.util.UUID +import kotlin.time.Clock + +fun insertFile(id: UUID, groupId: Int, requestedAt: LocalDateTime, uploader: String = "testuser") = transaction { + FilesTable.insert { + it[idColumn] = id + it[uploadRequestedAtColumn] = requestedAt + it[uploaderColumn] = uploader + it[groupIdColumn] = groupId + it[multipartCompleted] = true + } +} + +fun daysAgo(days: Long): LocalDateTime = Clock.System.now() + .minus(days, DateTimeUnit.DAY, DateProvider.timeZone) + .toLocalDateTime(DateProvider.timeZone) diff --git a/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt index 9e7d143d32..672b2dcca0 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt @@ -1,9 +1,5 @@ package org.loculus.backend.service.files -import kotlinx.datetime.DateTimeUnit -import kotlinx.datetime.LocalDateTime -import kotlinx.datetime.minus -import kotlinx.datetime.toLocalDateTime import org.hamcrest.MatcherAssert.assertThat import org.hamcrest.Matchers.`is` import org.jetbrains.exposed.sql.insert @@ -21,6 +17,8 @@ import org.loculus.backend.controller.EndpointTest import org.loculus.backend.controller.groupmanagement.GroupManagementControllerClient import org.loculus.backend.controller.groupmanagement.andGetGroupId import org.loculus.backend.controller.jwtForDefaultUser +import org.loculus.backend.service.daysAgo +import org.loculus.backend.service.insertFile import org.loculus.backend.service.submission.CompressedSequence import org.loculus.backend.service.submission.SequenceEntriesPreprocessedDataTable import org.loculus.backend.service.submission.SequenceEntriesTable @@ -55,8 +53,8 @@ class GetOrphanedFileIdsTest( fun `GIVEN unreferenced files THEN only those whose upload was requested before the threshold are orphaned`() { val old = UUID.randomUUID() val recent = UUID.randomUUID() - insertFile(old, requestedAt = daysAgo(10)) - insertFile(recent, requestedAt = daysAgo(1)) + insertFile(old, groupId, daysAgo(10)) + insertFile(recent, groupId, daysAgo(1)) val orphans = filesDatabaseService.getOrphanedFileIds(daysAgo(5)) @@ -68,7 +66,7 @@ class GetOrphanedFileIdsTest( // Simulate a case where a user edited a submission, replacing `editedAway` with `currentFile`. val editedAway = UUID.randomUUID() val currentFile = UUID.randomUUID() - listOf(editedAway, currentFile).forEach { insertFile(it, requestedAt = daysAgo(10)) } + listOf(editedAway, currentFile).forEach { insertFile(it, groupId, daysAgo(10)) } insertSequenceEntry( accession = "A", version = 2, @@ -85,7 +83,7 @@ class GetOrphanedFileIdsTest( fun `GIVEN processed_data files THEN only those from a pipeline version older than current are orphaned`() { transaction { CurrentProcessingPipelineTable.update( - { CurrentProcessingPipelineTable.organismColumn eq DEFAULT_ORGANISM } + { CurrentProcessingPipelineTable.organismColumn eq DEFAULT_ORGANISM }, ) { it[versionColumn] = 2 } @@ -95,7 +93,7 @@ class GetOrphanedFileIdsTest( val fileFromCurrentPipeline = UUID.randomUUID() // pipeline version 2 (current) -> protected val fileFromNewerPipeline = UUID.randomUUID() // pipeline version 3 (> current) -> protected listOf(fileFromOldPipeline, fileFromCurrentPipeline, fileFromNewerPipeline) - .forEach { insertFile(it, requestedAt = daysAgo(10)) } + .forEach { insertFile(it, groupId, daysAgo(10)) } // The sequence entry itself references no files, so only the processed_data references matter. insertSequenceEntry(accession = "A", version = 1, original = null, unprocessed = makeUnprocessedData(null)) @@ -128,7 +126,7 @@ class GetOrphanedFileIdsTest( // Branch 1 of the query has no version filter: a file referenced by ANY version's // unprocessed_data must survive, even if that version is no longer the latest. val fileInOldVersion = UUID.randomUUID() - insertFile(fileInOldVersion, requestedAt = daysAgo(10)) + insertFile(fileInOldVersion, groupId, daysAgo(10)) insertSequenceEntry( accession = "A", version = 1, @@ -142,20 +140,6 @@ class GetOrphanedFileIdsTest( assertThat(orphans.isEmpty(), `is`(true)) } - private fun daysAgo(days: Long): LocalDateTime = dateProvider.getCurrentInstant() - .minus(days, DateTimeUnit.DAY, DateProvider.timeZone) - .toLocalDateTime(DateProvider.timeZone) - - private fun insertFile(id: UUID, requestedAt: LocalDateTime) = transaction { - FilesTable.insert { - it[idColumn] = id - it[uploadRequestedAtColumn] = requestedAt - it[uploaderColumn] = "testuser" - it[groupIdColumn] = groupId - it[multipartCompleted] = true - } - } - private fun insertSequenceEntry( accession: String, version: Long, diff --git a/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt index 1d3d210f7e..3d8c07c8e6 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt @@ -2,9 +2,6 @@ package org.loculus.backend.service.submission import com.ninjasquad.springmockk.MockkBean import io.mockk.verify -import kotlinx.datetime.DateTimeUnit -import kotlinx.datetime.minus -import kotlinx.datetime.toLocalDateTime import org.hamcrest.MatcherAssert.assertThat import org.hamcrest.Matchers.containsInAnyOrder import org.jetbrains.exposed.sql.insert @@ -21,8 +18,9 @@ import org.loculus.backend.controller.groupmanagement.GroupManagementControllerC import org.loculus.backend.controller.groupmanagement.andGetGroupId import org.loculus.backend.controller.jwtForDefaultUser import org.loculus.backend.service.files.FilesDatabaseService -import org.loculus.backend.service.files.FilesTable import org.loculus.backend.service.files.S3Service +import org.loculus.backend.service.daysAgo +import org.loculus.backend.service.insertFile import org.loculus.backend.utils.DateProvider import org.springframework.beans.factory.annotation.Autowired import java.util.UUID @@ -68,7 +66,7 @@ class S3GarbageCollectionTaskTest( fun `GIVEN an orphan and a referenced file WHEN the task runs THEN only the orphan is deleted from S3 and the DB`() { val orphan = UUID.randomUUID() val referenced = UUID.randomUUID() - listOf(orphan, referenced).forEach { insertFile(it) } + listOf(orphan, referenced).forEach { insertFile(it, groupId, daysAgo(1)) } // `referenced` is referenced by a submission's unprocessed_data, so it must be protected. transaction { @@ -97,19 +95,4 @@ class S3GarbageCollectionTaskTest( containsInAnyOrder(orphan), ) } - - private fun insertFile(id: UUID) { - val oneDayAgo = dateProvider.getCurrentInstant() - .minus(1, DateTimeUnit.DAY, DateProvider.timeZone) - .toLocalDateTime(DateProvider.timeZone) - transaction { - FilesTable.insert { - it[idColumn] = id - it[uploadRequestedAtColumn] = oneDayAgo - it[uploaderColumn] = "testuser" - it[groupIdColumn] = groupId - it[multipartCompleted] = true - } - } - } } From 0ae53a4c2b7ee8aa6c730c43369f80bf31b11f62 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Mon, 8 Jun 2026 13:16:54 +0200 Subject: [PATCH 13/40] Fix import order --- .../backend/service/submission/S3GarbageCollectionTaskTest.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt index 3d8c07c8e6..1b8b7432b8 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt @@ -17,9 +17,9 @@ import org.loculus.backend.controller.EndpointTest import org.loculus.backend.controller.groupmanagement.GroupManagementControllerClient import org.loculus.backend.controller.groupmanagement.andGetGroupId import org.loculus.backend.controller.jwtForDefaultUser +import org.loculus.backend.service.daysAgo import org.loculus.backend.service.files.FilesDatabaseService import org.loculus.backend.service.files.S3Service -import org.loculus.backend.service.daysAgo import org.loculus.backend.service.insertFile import org.loculus.backend.utils.DateProvider import org.springframework.beans.factory.annotation.Autowired From 0cf9e122ee0bdf099c2af7b814885dd43fcc2a0f Mon Sep 17 00:00:00 2001 From: maverbiest Date: Mon, 8 Jun 2026 13:37:14 +0200 Subject: [PATCH 14/40] Clean up S3GarbageCollectionTaskTest.kt --- .../submission/S3GarbageCollectionTaskTest.kt | 31 ++++--------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt index 1b8b7432b8..322498cf42 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt @@ -3,10 +3,9 @@ package org.loculus.backend.service.submission import com.ninjasquad.springmockk.MockkBean import io.mockk.verify import org.hamcrest.MatcherAssert.assertThat -import org.hamcrest.Matchers.containsInAnyOrder +import org.hamcrest.Matchers.`is` import org.jetbrains.exposed.sql.insert import org.jetbrains.exposed.sql.transactions.transaction -import org.junit.jupiter.api.BeforeEach import org.junit.jupiter.api.Test import org.loculus.backend.api.FileIdAndName import org.loculus.backend.api.OriginalData @@ -25,18 +24,6 @@ import org.loculus.backend.utils.DateProvider import org.springframework.beans.factory.annotation.Autowired import java.util.UUID -/** - * Tests the orchestration of [S3GarbageCollectionTask]: which files get deleted from S3 and from - * the files table. S3 itself is mocked here (the interesting behaviour is *selection*, not the - * actual S3 I/O), which also lets us inject faults later if we want to test error handling. - * - * `orphan-file-max-age-days=0` makes the threshold "now", so any file whose upload was requested - * in the past is age-eligible; the referenced file is protected by the reference check regardless. - * - * Assertions are written to be robust against the task's own @Scheduled trigger possibly firing in - * the background: deletion of the orphan is verified as "at least once", protection of the - * referenced file as "never", and the final DB state is checked (which is idempotent under reruns). - */ @EndpointTest( properties = [ "${BackendSpringProperty.S3_ENABLED}=true", @@ -52,18 +39,12 @@ class S3GarbageCollectionTaskTest( @MockkBean(relaxed = true) lateinit var s3Service: S3Service - private var groupId = 0 - - @BeforeEach - fun createGroup() { - groupId = groupManagementClient - .createNewGroup(group = DEFAULT_GROUP, jwt = jwtForDefaultUser) - .andGetGroupId() - } - @Suppress("ktlint:standard:max-line-length") @Test fun `GIVEN an orphan and a referenced file WHEN the task runs THEN only the orphan is deleted from S3 and the DB`() { + val groupId = groupManagementClient + .createNewGroup(group = DEFAULT_GROUP, jwt = jwtForDefaultUser) + .andGetGroupId() val orphan = UUID.randomUUID() val referenced = UUID.randomUUID() listOf(orphan, referenced).forEach { insertFile(it, groupId, daysAgo(1)) } @@ -81,7 +62,7 @@ class S3GarbageCollectionTaskTest( it[unprocessedDataColumn] = OriginalData( metadata = emptyMap(), unalignedNucleotideSequences = emptyMap(), - files = mapOf("rawReads" to listOf(FileIdAndName(referenced, "raw.txt"))), + files = mapOf("rawReads" to listOf(FileIdAndName(referenced, "raw.fastq"))), ) } } @@ -92,7 +73,7 @@ class S3GarbageCollectionTaskTest( verify(exactly = 0) { s3Service.deleteFile(referenced) } assertThat( filesDatabaseService.getNonExistentFileIds(setOf(orphan, referenced)), - containsInAnyOrder(orphan), + `is`(setOf(orphan)), ) } } From cbf53be5c55e15f8759b81dd61ba8de65d2b3b27 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Mon, 8 Jun 2026 13:46:49 +0200 Subject: [PATCH 15/40] Comment --- .../org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt index 672b2dcca0..75e2ec436b 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt @@ -32,6 +32,7 @@ import java.util.UUID */ @EndpointTest( properties = [ + // set to high value to prevent tests from triggering pipeline version upgrade task "${BackendSpringProperty.PIPELINE_VERSION_UPGRADE_CHECK_INTERVAL_SECONDS}=999999", ], ) From 7fdcd152af3d935a798b3d0802f15edb80f9aa9c Mon Sep 17 00:00:00 2001 From: maverbiest Date: Mon, 8 Jun 2026 14:31:02 +0200 Subject: [PATCH 16/40] Add initialDelay to task, don't abort batch on first failure --- .../submission/S3GarbageCollectionTask.kt | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index aff43cb1f8..f5db486959 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -28,18 +28,24 @@ class S3GarbageCollectionTask( ) { /** - * Runs once daily and deletes S3 objects older than `loculus.s3.orphan-file-max-age-days` - * and are not referenced anywhere in original_data, unprocessed_data, or processed_data + * Runs once daily (with an initial delay of 15 minutes) and deletes S3 objects older than + * `loculus.s3.orphan-file-max-age-days` that are not referenced in unprocessed_data or processed_data */ - @Scheduled(fixedDelay = 1, timeUnit = TimeUnit.DAYS) + @Scheduled(initialDelay = 10, fixedDelay = 60 * 24, timeUnit = TimeUnit.MINUTES) fun task() { val threshold = dateProvider.getCurrentInstant() .minus(maxOrphanAge, DateTimeUnit.DAY, DateProvider.timeZone) .toLocalDateTime(DateProvider.timeZone) val orphans = filesDatabaseService.getOrphanedFileIds(threshold) + var deleteFailures: Int = 0 orphans.forEach { fileId: UUID -> - s3Service.deleteFile(fileId) - filesDatabaseService.deleteFileEntry(fileId) + try { + s3Service.deleteFile(fileId) + filesDatabaseService.deleteFileEntry(fileId) + } catch (e: Exception) { + log.warn("Failed to delete file entry for $fileId", e) + deleteFailures++ + } } if (orphans.isNotEmpty()) { @@ -51,6 +57,12 @@ class S3GarbageCollectionTask( "CLEANUP", "Deleted ${orphans.size} orphans that were not referenced by a submission after $maxOrphanAge days", ) + + if (deleteFailures > 0) { + log.warn { + "Unsuccessfully attempted to delete $deleteFailures orphan files" + } + } } } } From 9f4e8f886f164094008c996eed13b9a4640152d9 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Mon, 8 Jun 2026 14:41:30 +0200 Subject: [PATCH 17/40] Fix --- .../service/submission/S3GarbageCollectionTask.kt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index f5db486959..bdabb3cacf 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -31,19 +31,19 @@ class S3GarbageCollectionTask( * Runs once daily (with an initial delay of 15 minutes) and deletes S3 objects older than * `loculus.s3.orphan-file-max-age-days` that are not referenced in unprocessed_data or processed_data */ - @Scheduled(initialDelay = 10, fixedDelay = 60 * 24, timeUnit = TimeUnit.MINUTES) + @Scheduled(initialDelay = 15, fixedDelay = 60 * 24, timeUnit = TimeUnit.MINUTES) fun task() { val threshold = dateProvider.getCurrentInstant() .minus(maxOrphanAge, DateTimeUnit.DAY, DateProvider.timeZone) .toLocalDateTime(DateProvider.timeZone) val orphans = filesDatabaseService.getOrphanedFileIds(threshold) - var deleteFailures: Int = 0 + var deleteFailures = 0 orphans.forEach { fileId: UUID -> try { s3Service.deleteFile(fileId) filesDatabaseService.deleteFileEntry(fileId) } catch (e: Exception) { - log.warn("Failed to delete file entry for $fileId", e) + log.warn("Failed to delete $fileId", e) deleteFailures++ } } @@ -55,7 +55,8 @@ class S3GarbageCollectionTask( auditLogger .log( "CLEANUP", - "Deleted ${orphans.size} orphans that were not referenced by a submission after $maxOrphanAge days", + "Deleted ${orphans.size - deleteFailures} orphans that were not referenced by a" + + "submission after $maxOrphanAge days", ) if (deleteFailures > 0) { From 5b89ed1685ef680bf04a5df38b3b7150ddb68793 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Mon, 8 Jun 2026 15:05:05 +0200 Subject: [PATCH 18/40] Fix indent --- .../backend/service/submission/S3GarbageCollectionTask.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index bdabb3cacf..fea2416bc1 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -56,7 +56,7 @@ class S3GarbageCollectionTask( .log( "CLEANUP", "Deleted ${orphans.size - deleteFailures} orphans that were not referenced by a" + - "submission after $maxOrphanAge days", + "submission after $maxOrphanAge days", ) if (deleteFailures > 0) { From e97e00a58b1fa8949e0909f208aeff83beeb765e Mon Sep 17 00:00:00 2001 From: maverbiest Date: Mon, 8 Jun 2026 15:25:39 +0200 Subject: [PATCH 19/40] Fix log --- .../backend/service/submission/S3GarbageCollectionTask.kt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index fea2416bc1..1a78cb3ff4 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -50,7 +50,8 @@ class S3GarbageCollectionTask( if (orphans.isNotEmpty()) { log.info { - "Deleted ${orphans.size} orphans that were not referenced by a submission after $maxOrphanAge days" + "Deleted ${orphans.size - deleteFailures} orphans that were not referenced by a submission after" + + "$maxOrphanAge days" } auditLogger .log( From d77f9234594b6045ff62f7b00519affe35b4ff13 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Mon, 8 Jun 2026 15:28:33 +0200 Subject: [PATCH 20/40] Add spaces --- .../backend/service/submission/S3GarbageCollectionTask.kt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index 1a78cb3ff4..d50e3ef0d3 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -50,13 +50,13 @@ class S3GarbageCollectionTask( if (orphans.isNotEmpty()) { log.info { - "Deleted ${orphans.size - deleteFailures} orphans that were not referenced by a submission after" + + "Deleted ${orphans.size - deleteFailures} orphans that were not referenced by a submission after " + "$maxOrphanAge days" } auditLogger .log( "CLEANUP", - "Deleted ${orphans.size - deleteFailures} orphans that were not referenced by a" + + "Deleted ${orphans.size - deleteFailures} orphans that were not referenced by a " + "submission after $maxOrphanAge days", ) From 26968a26a958377fba84df6f7649ff8a191d9992 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Mon, 8 Jun 2026 16:50:13 +0200 Subject: [PATCH 21/40] Add index to files.upload_requested_at --- .../db/migration/V1.29__add_files_upload_requested_at_index.sql | 1 + 1 file changed, 1 insertion(+) create mode 100644 backend/src/main/resources/db/migration/V1.29__add_files_upload_requested_at_index.sql diff --git a/backend/src/main/resources/db/migration/V1.29__add_files_upload_requested_at_index.sql b/backend/src/main/resources/db/migration/V1.29__add_files_upload_requested_at_index.sql new file mode 100644 index 0000000000..a7e75ce47c --- /dev/null +++ b/backend/src/main/resources/db/migration/V1.29__add_files_upload_requested_at_index.sql @@ -0,0 +1 @@ +CREATE INDEX IF NOT EXISTS files_upload_requested_at_idx ON files (upload_requested_at); From f45252dcd363dea15c51062863190d782026dd82 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Tue, 9 Jun 2026 07:42:53 +0000 Subject: [PATCH 22/40] Update schema documentation based on migration changes --- backend/docs/db/schema.sql | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/backend/docs/db/schema.sql b/backend/docs/db/schema.sql index f2b4eb3909..9e39e44f3b 100644 --- a/backend/docs/db/schema.sql +++ b/backend/docs/db/schema.sql @@ -935,6 +935,13 @@ ALTER TABLE ONLY public.user_groups_table CREATE INDEX data_use_terms_table_accession_idx ON public.data_use_terms_table USING btree (accession); +-- +-- Name: files_upload_requested_at_idx; Type: INDEX; Schema: public; Owner: postgres +-- + +CREATE INDEX files_upload_requested_at_idx ON public.files USING btree (upload_requested_at); + + -- -- Name: flyway_schema_history_s_idx; Type: INDEX; Schema: public; Owner: postgres -- From dd93d92e86bd0c7f0392d45598dbc0c8d4f4b87c Mon Sep 17 00:00:00 2001 From: maverbiest Date: Tue, 9 Jun 2026 13:42:53 +0200 Subject: [PATCH 23/40] TO REVERT: configure garbage collection task for interactive testing --- .../backend/service/submission/S3GarbageCollectionTask.kt | 2 +- backend/src/main/resources/application.properties | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index d50e3ef0d3..6fd9cae215 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -31,7 +31,7 @@ class S3GarbageCollectionTask( * Runs once daily (with an initial delay of 15 minutes) and deletes S3 objects older than * `loculus.s3.orphan-file-max-age-days` that are not referenced in unprocessed_data or processed_data */ - @Scheduled(initialDelay = 15, fixedDelay = 60 * 24, timeUnit = TimeUnit.MINUTES) + @Scheduled(initialDelay = 5, fixedDelay = 5, timeUnit = TimeUnit.MINUTES) fun task() { val threshold = dateProvider.getCurrentInstant() .minus(maxOrphanAge, DateTimeUnit.DAY, DateProvider.timeZone) diff --git a/backend/src/main/resources/application.properties b/backend/src/main/resources/application.properties index b41f31ead6..8b0871585b 100644 --- a/backend/src/main/resources/application.properties +++ b/backend/src/main/resources/application.properties @@ -33,7 +33,7 @@ loculus.stream.batch-size=1000 loculus.debug-mode=false loculus.s3.enabled=false -loculus.s3.orphan-file-max-age-days=7 +loculus.s3.orphan-file-max-age-days=0 loculus.s3.bucket.endpoint= loculus.s3.bucket.bucket= loculus.s3.bucket.region= From 9adaa8eb278721c7851445681b49e27537501fe9 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Tue, 9 Jun 2026 14:57:49 +0200 Subject: [PATCH 24/40] TO REVERT: set initial delay to 90 minutes to wait out initial preprocessing --- .../backend/service/submission/S3GarbageCollectionTask.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index 6fd9cae215..5e4aea1ee0 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -31,7 +31,7 @@ class S3GarbageCollectionTask( * Runs once daily (with an initial delay of 15 minutes) and deletes S3 objects older than * `loculus.s3.orphan-file-max-age-days` that are not referenced in unprocessed_data or processed_data */ - @Scheduled(initialDelay = 5, fixedDelay = 5, timeUnit = TimeUnit.MINUTES) + @Scheduled(initialDelay = 90, fixedDelay = 5, timeUnit = TimeUnit.MINUTES) fun task() { val threshold = dateProvider.getCurrentInstant() .minus(maxOrphanAge, DateTimeUnit.DAY, DateProvider.timeZone) From fc3f17f1f3b9bffe86554ab1a5b0b7ec9c0b6128 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Tue, 9 Jun 2026 14:58:35 +0200 Subject: [PATCH 25/40] log on each GC run --- .../backend/service/submission/S3GarbageCollectionTask.kt | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index 5e4aea1ee0..d7b032d3bc 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -33,6 +33,7 @@ class S3GarbageCollectionTask( */ @Scheduled(initialDelay = 90, fixedDelay = 5, timeUnit = TimeUnit.MINUTES) fun task() { + log.info { "Running S3 garbage collection" } val threshold = dateProvider.getCurrentInstant() .minus(maxOrphanAge, DateTimeUnit.DAY, DateProvider.timeZone) .toLocalDateTime(DateProvider.timeZone) From febad3f27483e9c639f591d7112155e99979cbb9 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Wed, 10 Jun 2026 12:09:10 +0200 Subject: [PATCH 26/40] Improve logs, enforce maxOrphanAge of at least 1 --- .../submission/S3GarbageCollectionTask.kt | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index d7b032d3bc..3f9ec3a30d 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -14,6 +14,7 @@ import org.springframework.scheduling.annotation.Scheduled import org.springframework.stereotype.Component import java.util.UUID import java.util.concurrent.TimeUnit +import kotlin.math.max private val log = mu.KotlinLogging.logger {} @@ -33,11 +34,16 @@ class S3GarbageCollectionTask( */ @Scheduled(initialDelay = 90, fixedDelay = 5, timeUnit = TimeUnit.MINUTES) fun task() { - log.info { "Running S3 garbage collection" } + log.info { "Running S3 garbage collection task" } + + // `maxOrphanAge` must be at least 1 or files produced by preprocessing will be + // garbage collected mid-run before they're attached to sequence entries + val maxOrphanAge = max(maxOrphanAge, 1) val threshold = dateProvider.getCurrentInstant() .minus(maxOrphanAge, DateTimeUnit.DAY, DateProvider.timeZone) .toLocalDateTime(DateProvider.timeZone) val orphans = filesDatabaseService.getOrphanedFileIds(threshold) + var deleteFailures = 0 orphans.forEach { fileId: UUID -> try { @@ -51,21 +57,23 @@ class S3GarbageCollectionTask( if (orphans.isNotEmpty()) { log.info { - "Deleted ${orphans.size - deleteFailures} orphans that were not referenced by a submission after " + - "$maxOrphanAge days" + "S3 garbage collection task deleted ${orphans.size - deleteFailures} orphan(s) not referenced by a " + + "submission after $maxOrphanAge days" } auditLogger .log( "CLEANUP", - "Deleted ${orphans.size - deleteFailures} orphans that were not referenced by a " + - "submission after $maxOrphanAge days", + "S3 garbage collection task deleted ${orphans.size - deleteFailures} orphan(s) " + + "not referenced by a submission after $maxOrphanAge days", ) if (deleteFailures > 0) { log.warn { - "Unsuccessfully attempted to delete $deleteFailures orphan files" + "S3 garbage collection task unsuccessfully attempted to delete $deleteFailures orphan file(s)" } } + } else { + log.info { "S3 garbage collection task identified no orphan files on S3" } } } } From 3cd7cc467a8070be16b9d806df65b332d991c06c Mon Sep 17 00:00:00 2001 From: maverbiest Date: Wed, 10 Jun 2026 13:40:16 +0200 Subject: [PATCH 27/40] Update tests and logs to account for clamping of maxOrphanAge --- .../backend/service/submission/S3GarbageCollectionTask.kt | 6 +++--- .../service/submission/S3GarbageCollectionTaskTest.kt | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index 3f9ec3a30d..3745745a60 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -34,11 +34,11 @@ class S3GarbageCollectionTask( */ @Scheduled(initialDelay = 90, fixedDelay = 5, timeUnit = TimeUnit.MINUTES) fun task() { - log.info { "Running S3 garbage collection task" } - // `maxOrphanAge` must be at least 1 or files produced by preprocessing will be - // garbage collected mid-run before they're attached to sequence entries + // garbage collected before they're attached to sequence entries val maxOrphanAge = max(maxOrphanAge, 1) + log.info { "Running S3 garbage collection task to clean up orphan files at least $maxOrphanAge days old" } + val threshold = dateProvider.getCurrentInstant() .minus(maxOrphanAge, DateTimeUnit.DAY, DateProvider.timeZone) .toLocalDateTime(DateProvider.timeZone) diff --git a/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt index 322498cf42..694c8b7fb7 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt @@ -27,7 +27,7 @@ import java.util.UUID @EndpointTest( properties = [ "${BackendSpringProperty.S3_ENABLED}=true", - "${BackendSpringProperty.S3_MAX_ORPHAN_AGE_DAYS}=0", + "${BackendSpringProperty.S3_MAX_ORPHAN_AGE_DAYS}=1", ], ) class S3GarbageCollectionTaskTest( @@ -47,7 +47,7 @@ class S3GarbageCollectionTaskTest( .andGetGroupId() val orphan = UUID.randomUUID() val referenced = UUID.randomUUID() - listOf(orphan, referenced).forEach { insertFile(it, groupId, daysAgo(1)) } + listOf(orphan, referenced).forEach { insertFile(it, groupId, daysAgo(2)) } // `referenced` is referenced by a submission's unprocessed_data, so it must be protected. transaction { From 27138b75ce52628342c3d907928a7de9bbb423bf Mon Sep 17 00:00:00 2001 From: maverbiest Date: Wed, 10 Jun 2026 13:40:59 +0200 Subject: [PATCH 28/40] Revert "TO REVERT: set initial delay to 90 minutes to wait out initial preprocessing" This reverts commit 76606152fe2eb7c81161e0ac6d1da94001f37b65. --- .../backend/service/submission/S3GarbageCollectionTask.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index 3745745a60..046b62d6de 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -32,7 +32,7 @@ class S3GarbageCollectionTask( * Runs once daily (with an initial delay of 15 minutes) and deletes S3 objects older than * `loculus.s3.orphan-file-max-age-days` that are not referenced in unprocessed_data or processed_data */ - @Scheduled(initialDelay = 90, fixedDelay = 5, timeUnit = TimeUnit.MINUTES) + @Scheduled(initialDelay = 5, fixedDelay = 5, timeUnit = TimeUnit.MINUTES) fun task() { // `maxOrphanAge` must be at least 1 or files produced by preprocessing will be // garbage collected before they're attached to sequence entries From 2441fa288679ba17f82e77dbf1adba1ee5fa1fc8 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Wed, 10 Jun 2026 13:41:18 +0200 Subject: [PATCH 29/40] Revert "TO REVERT: configure garbage collection task for interactive testing" This reverts commit b4b7b340a0f03b498d539995316283eb631229db. --- .../backend/service/submission/S3GarbageCollectionTask.kt | 2 +- backend/src/main/resources/application.properties | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index 046b62d6de..ca25090eed 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -32,7 +32,7 @@ class S3GarbageCollectionTask( * Runs once daily (with an initial delay of 15 minutes) and deletes S3 objects older than * `loculus.s3.orphan-file-max-age-days` that are not referenced in unprocessed_data or processed_data */ - @Scheduled(initialDelay = 5, fixedDelay = 5, timeUnit = TimeUnit.MINUTES) + @Scheduled(initialDelay = 15, fixedDelay = 60 * 24, timeUnit = TimeUnit.MINUTES) fun task() { // `maxOrphanAge` must be at least 1 or files produced by preprocessing will be // garbage collected before they're attached to sequence entries diff --git a/backend/src/main/resources/application.properties b/backend/src/main/resources/application.properties index 8b0871585b..b41f31ead6 100644 --- a/backend/src/main/resources/application.properties +++ b/backend/src/main/resources/application.properties @@ -33,7 +33,7 @@ loculus.stream.batch-size=1000 loculus.debug-mode=false loculus.s3.enabled=false -loculus.s3.orphan-file-max-age-days=0 +loculus.s3.orphan-file-max-age-days=7 loculus.s3.bucket.endpoint= loculus.s3.bucket.bucket= loculus.s3.bucket.region= From ff618b50b87f80e49f3238170309ea00edddc31b Mon Sep 17 00:00:00 2001 From: maverbiest Date: Wed, 10 Jun 2026 14:02:45 +0200 Subject: [PATCH 30/40] Make maxOrphanAgeDays configurable in values.yaml --- .../org/loculus/backend/config/BackendSpringConfig.kt | 2 +- .../backend/service/submission/S3GarbageCollectionTask.kt | 2 +- backend/src/main/resources/application.properties | 2 +- kubernetes/loculus/templates/loculus-backend.yaml | 5 ++++- kubernetes/loculus/values.schema.json | 6 ++++++ kubernetes/loculus/values.yaml | 1 + 6 files changed, 14 insertions(+), 4 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt b/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt index e3cbafb10b..f2e4645739 100644 --- a/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt +++ b/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt @@ -43,7 +43,7 @@ object BackendSpringProperty { const val ENABLE_SEQSETS = "loculus.enable-seqsets" const val S3_ENABLED = "loculus.s3.enabled" - const val S3_MAX_ORPHAN_AGE_DAYS = "loculus.s3.orphan-file-max-age-days" + const val S3_MAX_ORPHAN_AGE_DAYS = "loculus.s3.max-orphan-age-days" const val S3_BUCKET_ENDPOINT = "loculus.s3.bucket.endpoint" const val S3_BUCKET_INTERNAL_ENDPOINT = "loculus.s3.bucket.internal-endpoint" const val S3_BUCKET_BUCKET = "loculus.s3.bucket.bucket" diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index ca25090eed..9badef65a3 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -30,7 +30,7 @@ class S3GarbageCollectionTask( /** * Runs once daily (with an initial delay of 15 minutes) and deletes S3 objects older than - * `loculus.s3.orphan-file-max-age-days` that are not referenced in unprocessed_data or processed_data + * `loculus.s3.max-orphan-age-days` that are not referenced in unprocessed_data or processed_data */ @Scheduled(initialDelay = 15, fixedDelay = 60 * 24, timeUnit = TimeUnit.MINUTES) fun task() { diff --git a/backend/src/main/resources/application.properties b/backend/src/main/resources/application.properties index b41f31ead6..142a04e157 100644 --- a/backend/src/main/resources/application.properties +++ b/backend/src/main/resources/application.properties @@ -33,7 +33,7 @@ loculus.stream.batch-size=1000 loculus.debug-mode=false loculus.s3.enabled=false -loculus.s3.orphan-file-max-age-days=7 +loculus.s3.max-orphan-age-days= loculus.s3.bucket.endpoint= loculus.s3.bucket.bucket= loculus.s3.bucket.region= diff --git a/kubernetes/loculus/templates/loculus-backend.yaml b/kubernetes/loculus/templates/loculus-backend.yaml index dc6ef7d14d..9bbc6ca627 100644 --- a/kubernetes/loculus/templates/loculus-backend.yaml +++ b/kubernetes/loculus/templates/loculus-backend.yaml @@ -71,6 +71,7 @@ spec: - "--loculus.pipeline-version-upgrade-check.interval-seconds={{- .Values.pipelineVersionUpgradeCheckIntervalSeconds | default 10 }}" - "--loculus.s3.enabled=$(S3_ENABLED)" {{- if $.Values.s3.enabled }} + - "--loculus.s3.max-orphan-age-days=$(S3_MAX_ORPHAN_AGE_DAYS)" - "--loculus.s3.bucket.endpoint=$(S3_BUCKET_ENDPOINT)" - "--loculus.s3.bucket.internal-endpoint=$(S3_BUCKET_INTERNAL_ENDPOINT)" - "--loculus.s3.bucket.region=$(S3_BUCKET_REGION)" @@ -131,6 +132,8 @@ spec: - name: S3_ENABLED value: {{$.Values.s3.enabled | quote }} {{- if $.Values.s3.enabled }} + - name: S3_MAX_ORPHAN_AGE_DAYS + value: {{$.Values.s3.maxOrphanAgeDays | quote }} - name: S3_BUCKET_ENDPOINT value: {{ include "loculus.s3Url" . | quote }} - name: S3_BUCKET_INTERNAL_ENDPOINT @@ -155,4 +158,4 @@ spec: mountPath: /config volumes: {{ include "loculus.configVolume" (dict "name" "loculus-backend-config") | nindent 8 }} -{{- end }} \ No newline at end of file +{{- end }} diff --git a/kubernetes/loculus/values.schema.json b/kubernetes/loculus/values.schema.json index 9efa3dc6f2..5661ebcd96 100644 --- a/kubernetes/loculus/values.schema.json +++ b/kubernetes/loculus/values.schema.json @@ -1704,6 +1704,12 @@ "default": false, "description": "Whether to enable S3. S3 is needed for the file sharing feature." }, + "maxOrphanAgeDays": { + "groups": ["s3"], + "type": "integer", + "default": 7, + "description": "Files uploaded to S3 but never attached to a sequence entry are garbage-collected after this many days." + }, "bucket": { "type": "object", "properties": { diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index ac55e61ca3..91fc8786ae 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -59,6 +59,7 @@ fileSharing: outputFileUrlType: backend s3: enabled: true + maxOrphanAgeDays: 7 bucket: region: us-east-1 bucket: loculus-preview-private From a8df88e0f82248dac6c0b6d96c61a78b7ab86d1e Mon Sep 17 00:00:00 2001 From: maverbiest Date: Wed, 10 Jun 2026 14:17:58 +0200 Subject: [PATCH 31/40] Add back default value in application.properties as fallback when no command line args are given --- backend/src/main/resources/application.properties | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/src/main/resources/application.properties b/backend/src/main/resources/application.properties index 142a04e157..d388064388 100644 --- a/backend/src/main/resources/application.properties +++ b/backend/src/main/resources/application.properties @@ -33,7 +33,7 @@ loculus.stream.batch-size=1000 loculus.debug-mode=false loculus.s3.enabled=false -loculus.s3.max-orphan-age-days= +loculus.s3.max-orphan-age-days=7 loculus.s3.bucket.endpoint= loculus.s3.bucket.bucket= loculus.s3.bucket.region= From 16065cb4b61855fa3974566ab6e95020b4718567 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Wed, 10 Jun 2026 14:52:29 +0200 Subject: [PATCH 32/40] Make consistent with major backend refactor after rebase --- .../service/files/FilesDatabaseService.kt | 6 ++-- .../submission/S3GarbageCollectionTask.kt | 2 +- ...__add_files_upload_requested_at_index.sql} | 0 .../service/files/GetOrphanedFileIdsTest.kt | 28 +++++++++---------- .../submission/S3GarbageCollectionTaskTest.kt | 4 +-- 5 files changed, 19 insertions(+), 21 deletions(-) rename backend/src/main/resources/db/migration/{V1.29__add_files_upload_requested_at_index.sql => V1.31__add_files_upload_requested_at_index.sql} (100%) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt index 9c0700a15d..a00f14dd7a 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt @@ -70,13 +70,13 @@ class FilesDatabaseService(private val dateProvider: DateProvider) { fun getOrphanedFileIds(threshold: LocalDateTime): Set { val sql = """ -- check for files for which an upload was requested > threshold days ago - -- but are not referenced by a submission. For this, check the unprocessed_data - -- and processed_data jsonb objects, but not original_data + -- but are not referenced by a submission. For this, check the submitted_data + -- and processed_data jsonb objects, but not archive_of_submitted_data WITH referenced AS ( -- fetch ids for files uploaded by users and referenced in submissions SELECT (fil->>'fileId')::uuid AS file_id FROM sequence_entries, - LATERAL jsonb_each(COALESCE(NULLIF(unprocessed_data->'files', 'null'::jsonb),'{}'::jsonb)) AS cat(k,v), + LATERAL jsonb_each(COALESCE(NULLIF(submitted_data->'files', 'null'::jsonb),'{}'::jsonb)) AS cat(k,v), LATERAL jsonb_array_elements(cat.v) AS fil UNION -- fetch ids for files produced by preprocessing. diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index 9badef65a3..5541106d68 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -30,7 +30,7 @@ class S3GarbageCollectionTask( /** * Runs once daily (with an initial delay of 15 minutes) and deletes S3 objects older than - * `loculus.s3.max-orphan-age-days` that are not referenced in unprocessed_data or processed_data + * `loculus.s3.max-orphan-age-days` that are not referenced in submitted_data or processed_data */ @Scheduled(initialDelay = 15, fixedDelay = 60 * 24, timeUnit = TimeUnit.MINUTES) fun task() { diff --git a/backend/src/main/resources/db/migration/V1.29__add_files_upload_requested_at_index.sql b/backend/src/main/resources/db/migration/V1.31__add_files_upload_requested_at_index.sql similarity index 100% rename from backend/src/main/resources/db/migration/V1.29__add_files_upload_requested_at_index.sql rename to backend/src/main/resources/db/migration/V1.31__add_files_upload_requested_at_index.sql diff --git a/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt index 75e2ec436b..45596ed1f8 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt @@ -8,7 +8,7 @@ import org.jetbrains.exposed.sql.update import org.junit.jupiter.api.BeforeEach import org.junit.jupiter.api.Test import org.loculus.backend.api.FileIdAndName -import org.loculus.backend.api.OriginalData +import org.loculus.backend.api.SubmittedData import org.loculus.backend.api.ProcessedData import org.loculus.backend.config.BackendSpringProperty import org.loculus.backend.controller.DEFAULT_GROUP @@ -63,7 +63,7 @@ class GetOrphanedFileIdsTest( } @Test - fun `GIVEN a file only referenced in original_data THEN it is orphaned, but unprocessed_data is protected`() { + fun `GIVEN a file only referenced in archive_of_submitted_data THEN it is orphaned, but submitted_data is protected`() { // Simulate a case where a user edited a submission, replacing `editedAway` with `currentFile`. val editedAway = UUID.randomUUID() val currentFile = UUID.randomUUID() @@ -71,8 +71,8 @@ class GetOrphanedFileIdsTest( insertSequenceEntry( accession = "A", version = 2, - original = makeUnprocessedData(editedAway), - unprocessed = makeUnprocessedData(currentFile), + archive = makeUnprocessedData(editedAway), + submitted = makeUnprocessedData(currentFile), ) val orphans = filesDatabaseService.getOrphanedFileIds(daysAgo(5)) @@ -97,7 +97,7 @@ class GetOrphanedFileIdsTest( .forEach { insertFile(it, groupId, daysAgo(10)) } // The sequence entry itself references no files, so only the processed_data references matter. - insertSequenceEntry(accession = "A", version = 1, original = null, unprocessed = makeUnprocessedData(null)) + insertSequenceEntry(accession = "A", version = 1, archive = null, submitted = makeUnprocessedData(null)) insertPreprocessedData( accession = "A", version = 1, @@ -124,17 +124,15 @@ class GetOrphanedFileIdsTest( @Test fun `GIVEN a file referenced only by old version THEN it is still protected`() { - // Branch 1 of the query has no version filter: a file referenced by ANY version's - // unprocessed_data must survive, even if that version is no longer the latest. val fileInOldVersion = UUID.randomUUID() insertFile(fileInOldVersion, groupId, daysAgo(10)) insertSequenceEntry( accession = "A", version = 1, - original = null, - unprocessed = makeUnprocessedData(fileInOldVersion), + archive = null, + submitted = makeUnprocessedData(fileInOldVersion), ) - insertSequenceEntry(accession = "A", version = 2, original = null, unprocessed = makeUnprocessedData(null)) + insertSequenceEntry(accession = "A", version = 2, archive = null, submitted = makeUnprocessedData(null)) val orphans = filesDatabaseService.getOrphanedFileIds(daysAgo(5)) @@ -144,8 +142,8 @@ class GetOrphanedFileIdsTest( private fun insertSequenceEntry( accession: String, version: Long, - original: OriginalData?, - unprocessed: OriginalData?, + archive: SubmittedData?, + submitted: SubmittedData?, ) = transaction { SequenceEntriesTable.insert { it[accessionColumn] = accession @@ -155,8 +153,8 @@ class GetOrphanedFileIdsTest( it[submitterColumn] = "testuser" it[groupIdColumn] = groupId it[submittedAtTimestampColumn] = dateProvider.getCurrentDateTime() - it[originalDataColumn] = original - it[unprocessedDataColumn] = unprocessed + it[archiveOfSubmittedDataColumn] = archive + it[submittedDataColumn] = submitted } } @@ -176,7 +174,7 @@ class GetOrphanedFileIdsTest( } } - private fun makeUnprocessedData(fileId: UUID?): OriginalData = OriginalData( + private fun makeUnprocessedData(fileId: UUID?): SubmittedData = SubmittedData( metadata = emptyMap(), unalignedNucleotideSequences = emptyMap(), files = fileId?.let { mapOf("rawReads" to listOf(FileIdAndName(it, "raw.fastq"))) }, diff --git a/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt index 694c8b7fb7..8d988cfcd5 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt @@ -8,7 +8,7 @@ import org.jetbrains.exposed.sql.insert import org.jetbrains.exposed.sql.transactions.transaction import org.junit.jupiter.api.Test import org.loculus.backend.api.FileIdAndName -import org.loculus.backend.api.OriginalData +import org.loculus.backend.api.SubmittedData import org.loculus.backend.config.BackendSpringProperty import org.loculus.backend.controller.DEFAULT_GROUP import org.loculus.backend.controller.DEFAULT_ORGANISM @@ -59,7 +59,7 @@ class S3GarbageCollectionTaskTest( it[submitterColumn] = "testuser" it[groupIdColumn] = groupId it[submittedAtTimestampColumn] = dateProvider.getCurrentDateTime() - it[unprocessedDataColumn] = OriginalData( + it[submittedDataColumn] = SubmittedData( metadata = emptyMap(), unalignedNucleotideSequences = emptyMap(), files = mapOf("rawReads" to listOf(FileIdAndName(referenced, "raw.fastq"))), From 06a1cd7e1edc9fd4e4e888eb73108ee019905eeb Mon Sep 17 00:00:00 2001 From: maverbiest Date: Wed, 10 Jun 2026 14:54:42 +0200 Subject: [PATCH 33/40] Formatting --- .../loculus/backend/service/files/GetOrphanedFileIdsTest.kt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt index 45596ed1f8..b7738befdf 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt @@ -8,8 +8,8 @@ import org.jetbrains.exposed.sql.update import org.junit.jupiter.api.BeforeEach import org.junit.jupiter.api.Test import org.loculus.backend.api.FileIdAndName -import org.loculus.backend.api.SubmittedData import org.loculus.backend.api.ProcessedData +import org.loculus.backend.api.SubmittedData import org.loculus.backend.config.BackendSpringProperty import org.loculus.backend.controller.DEFAULT_GROUP import org.loculus.backend.controller.DEFAULT_ORGANISM @@ -62,6 +62,7 @@ class GetOrphanedFileIdsTest( assertThat(orphans, `is`(setOf(old))) } + @Suppress("ktlint:standard:max-line-length") @Test fun `GIVEN a file only referenced in archive_of_submitted_data THEN it is orphaned, but submitted_data is protected`() { // Simulate a case where a user edited a submission, replacing `editedAway` with `currentFile`. From f68bf0ca25f8204b47d8922a5a1b3bdc39d235e3 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Thu, 11 Jun 2026 13:49:08 +0200 Subject: [PATCH 34/40] Add gcDryRun flag, turned on by default --- .../backend/config/BackendSpringConfig.kt | 1 + .../submission/S3GarbageCollectionTask.kt | 13 +++++++-- .../src/main/resources/application.properties | 1 + .../submission/S3GarbageCollectionTaskTest.kt | 27 ++++++++++++++++++- .../loculus/templates/loculus-backend.yaml | 3 +++ kubernetes/loculus/values.schema.json | 6 +++++ kubernetes/loculus/values.yaml | 1 + 7 files changed, 49 insertions(+), 3 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt b/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt index f2e4645739..f9d0685bb8 100644 --- a/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt +++ b/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt @@ -43,6 +43,7 @@ object BackendSpringProperty { const val ENABLE_SEQSETS = "loculus.enable-seqsets" const val S3_ENABLED = "loculus.s3.enabled" + const val S3_GC_DRY_RUN = "loculus.s3.gc-dry-run" const val S3_MAX_ORPHAN_AGE_DAYS = "loculus.s3.max-orphan-age-days" const val S3_BUCKET_ENDPOINT = "loculus.s3.bucket.endpoint" const val S3_BUCKET_INTERNAL_ENDPOINT = "loculus.s3.bucket.internal-endpoint" diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index 5541106d68..c9d41fa409 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -26,6 +26,7 @@ class S3GarbageCollectionTask( private val dateProvider: DateProvider, private val auditLogger: AuditLogger, @Value("\${${BackendSpringProperty.S3_MAX_ORPHAN_AGE_DAYS}}") private val maxOrphanAge: Int, + @Value("\${${BackendSpringProperty.S3_GC_DRY_RUN}}") private val dryRun: Boolean, ) { /** @@ -34,16 +35,24 @@ class S3GarbageCollectionTask( */ @Scheduled(initialDelay = 15, fixedDelay = 60 * 24, timeUnit = TimeUnit.MINUTES) fun task() { - // `maxOrphanAge` must be at least 1 or files produced by preprocessing will be + // `maxOrphanAge` must be at least 1 or files produced by preprocessing may be // garbage collected before they're attached to sequence entries val maxOrphanAge = max(maxOrphanAge, 1) - log.info { "Running S3 garbage collection task to clean up orphan files at least $maxOrphanAge days old" } + log.info { + "Running S3 garbage collection task to clean up orphan files at least $maxOrphanAge " + + "days old (dry run = $dryRun)" + } val threshold = dateProvider.getCurrentInstant() .minus(maxOrphanAge, DateTimeUnit.DAY, DateProvider.timeZone) .toLocalDateTime(DateProvider.timeZone) val orphans = filesDatabaseService.getOrphanedFileIds(threshold) + if (dryRun) { + log.info { "S3 garbage collection task would have deleted ${orphans.size} files: $orphans" } + return + } + var deleteFailures = 0 orphans.forEach { fileId: UUID -> try { diff --git a/backend/src/main/resources/application.properties b/backend/src/main/resources/application.properties index d388064388..2dae05df88 100644 --- a/backend/src/main/resources/application.properties +++ b/backend/src/main/resources/application.properties @@ -33,6 +33,7 @@ loculus.stream.batch-size=1000 loculus.debug-mode=false loculus.s3.enabled=false +loculus.s3.gc-dry-run=true loculus.s3.max-orphan-age-days=7 loculus.s3.bucket.endpoint= loculus.s3.bucket.bucket= diff --git a/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt index 8d988cfcd5..fd1647e7b8 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt @@ -16,6 +16,7 @@ import org.loculus.backend.controller.EndpointTest import org.loculus.backend.controller.groupmanagement.GroupManagementControllerClient import org.loculus.backend.controller.groupmanagement.andGetGroupId import org.loculus.backend.controller.jwtForDefaultUser +import org.loculus.backend.log.AuditLogger import org.loculus.backend.service.daysAgo import org.loculus.backend.service.files.FilesDatabaseService import org.loculus.backend.service.files.S3Service @@ -27,6 +28,7 @@ import java.util.UUID @EndpointTest( properties = [ "${BackendSpringProperty.S3_ENABLED}=true", + "${BackendSpringProperty.S3_GC_DRY_RUN}=true", "${BackendSpringProperty.S3_MAX_ORPHAN_AGE_DAYS}=1", ], ) @@ -35,10 +37,25 @@ class S3GarbageCollectionTaskTest( @Autowired val filesDatabaseService: FilesDatabaseService, @Autowired val groupManagementClient: GroupManagementControllerClient, @Autowired val dateProvider: DateProvider, + @Autowired val auditLogger: AuditLogger, ) { @MockkBean(relaxed = true) lateinit var s3Service: S3Service + @Test + fun `GIVEN dry run is enabled WHEN the task runs THEN the orphan is not deleted from S3 or the DB`() { + val groupId = groupManagementClient + .createNewGroup(group = DEFAULT_GROUP, jwt = jwtForDefaultUser) + .andGetGroupId() + val orphan = UUID.randomUUID() + insertFile(orphan, groupId, daysAgo(2)) + + s3GarbageCollectionTask.task() + + verify(exactly = 0) { s3Service.deleteFile(any()) } + assertThat(filesDatabaseService.getNonExistentFileIds(setOf(orphan)), `is`(emptySet())) + } + @Suppress("ktlint:standard:max-line-length") @Test fun `GIVEN an orphan and a referenced file WHEN the task runs THEN only the orphan is deleted from S3 and the DB`() { @@ -67,7 +84,15 @@ class S3GarbageCollectionTaskTest( } } - s3GarbageCollectionTask.task() + val deleteTask = S3GarbageCollectionTask( + filesDatabaseService, + s3Service, + dateProvider, + auditLogger, + maxOrphanAge = 1, + dryRun = false, + ) + deleteTask.task() verify { s3Service.deleteFile(orphan) } verify(exactly = 0) { s3Service.deleteFile(referenced) } diff --git a/kubernetes/loculus/templates/loculus-backend.yaml b/kubernetes/loculus/templates/loculus-backend.yaml index 9bbc6ca627..620f166395 100644 --- a/kubernetes/loculus/templates/loculus-backend.yaml +++ b/kubernetes/loculus/templates/loculus-backend.yaml @@ -71,6 +71,7 @@ spec: - "--loculus.pipeline-version-upgrade-check.interval-seconds={{- .Values.pipelineVersionUpgradeCheckIntervalSeconds | default 10 }}" - "--loculus.s3.enabled=$(S3_ENABLED)" {{- if $.Values.s3.enabled }} + - "--loculus.s3.gc-dry-run=$(S3_GC_DRY_RUN)" - "--loculus.s3.max-orphan-age-days=$(S3_MAX_ORPHAN_AGE_DAYS)" - "--loculus.s3.bucket.endpoint=$(S3_BUCKET_ENDPOINT)" - "--loculus.s3.bucket.internal-endpoint=$(S3_BUCKET_INTERNAL_ENDPOINT)" @@ -132,6 +133,8 @@ spec: - name: S3_ENABLED value: {{$.Values.s3.enabled | quote }} {{- if $.Values.s3.enabled }} + - name: S3_GC_DRY_RUN + value: {{$.Values.s3.gcDryRun | quote }} - name: S3_MAX_ORPHAN_AGE_DAYS value: {{$.Values.s3.maxOrphanAgeDays | quote }} - name: S3_BUCKET_ENDPOINT diff --git a/kubernetes/loculus/values.schema.json b/kubernetes/loculus/values.schema.json index 5661ebcd96..17b8bddc9e 100644 --- a/kubernetes/loculus/values.schema.json +++ b/kubernetes/loculus/values.schema.json @@ -1704,6 +1704,12 @@ "default": false, "description": "Whether to enable S3. S3 is needed for the file sharing feature." }, + "gcDryRun": { + "groups": ["s3"], + "type": "boolean", + "default": true, + "description": "Whether the S3 garbage collection task in the backend should delete orphan files or just output a log with orphan files." + }, "maxOrphanAgeDays": { "groups": ["s3"], "type": "integer", diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 91fc8786ae..08133ecbac 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -59,6 +59,7 @@ fileSharing: outputFileUrlType: backend s3: enabled: true + gcDryRun: true # have garbage collection log instead of actually delete files maxOrphanAgeDays: 7 bucket: region: us-east-1 From 6a83024e793e57c3e5809a7d5ea6f4eb5e355269 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Thu, 11 Jun 2026 15:54:37 +0200 Subject: [PATCH 35/40] Prevent files from previous pipeline version from being garbage collected --- .../backend/service/files/FilesDatabaseService.kt | 14 ++++---------- .../service/files/GetOrphanedFileIdsTest.kt | 4 ++-- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt index a00f14dd7a..75d3a3c797 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt @@ -10,7 +10,6 @@ import org.jetbrains.exposed.sql.not import org.jetbrains.exposed.sql.statements.StatementType import org.jetbrains.exposed.sql.transactions.transaction import org.jetbrains.exposed.sql.update -import org.loculus.backend.utils.DatabaseConstants import org.loculus.backend.utils.DateProvider import org.loculus.backend.utils.chunkedForDatabase import org.springframework.stereotype.Service @@ -71,7 +70,7 @@ class FilesDatabaseService(private val dateProvider: DateProvider) { val sql = """ -- check for files for which an upload was requested > threshold days ago -- but are not referenced by a submission. For this, check the submitted_data - -- and processed_data jsonb objects, but not archive_of_submitted_data + -- and processed_data jsonb objects (but not archive_of_submitted_data) WITH referenced AS ( -- fetch ids for files uploaded by users and referenced in submissions SELECT (fil->>'fileId')::uuid AS file_id @@ -79,18 +78,13 @@ class FilesDatabaseService(private val dateProvider: DateProvider) { LATERAL jsonb_each(COALESCE(NULLIF(submitted_data->'files', 'null'::jsonb),'{}'::jsonb)) AS cat(k,v), LATERAL jsonb_array_elements(cat.v) AS fil UNION - -- fetch ids for files produced by preprocessing. - -- For these, we consider only files referenced by the current pipeline - -- version or newer. - -- (newer than current versions will only exist during rollouts) + -- also need to check processed_data since preprocessing + -- can create files that are never referenced in submissions SELECT (fil->>'fileId')::uuid AS file_id FROM sequence_entries_preprocessed_data sepd JOIN sequence_entries se ON se.accession = sepd.accession - AND se.version = sepd.version - JOIN current_processing_pipeline cpp - ON cpp.organism = se.organism - AND sepd.pipeline_version >= cpp.version, + AND se.version = sepd.version, LATERAL jsonb_each(COALESCE(NULLIF(sepd.processed_data->'files', 'null'::jsonb),'{}'::jsonb)) AS cat(k,v), LATERAL jsonb_array_elements(cat.v) AS fil ) diff --git a/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt index b7738befdf..c00fc49efd 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt @@ -82,7 +82,7 @@ class GetOrphanedFileIdsTest( } @Test - fun `GIVEN processed_data files THEN only those from a pipeline version older than current are orphaned`() { + fun `GIVEN multiple pipeline versions THEN files from all pipeline versions are protected`() { transaction { CurrentProcessingPipelineTable.update( { CurrentProcessingPipelineTable.organismColumn eq DEFAULT_ORGANISM }, @@ -120,7 +120,7 @@ class GetOrphanedFileIdsTest( val orphans = filesDatabaseService.getOrphanedFileIds(daysAgo(5)) - assertThat(orphans, `is`(setOf(fileFromOldPipeline))) + assertThat(orphans, `is`(emptySet())) } @Test From 56caffe3569c69d347f2936d84cc85b5a1af9295 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Thu, 11 Jun 2026 16:18:50 +0200 Subject: [PATCH 36/40] Improve error message for missing S3 file --- .../kotlin/org/loculus/backend/service/files/S3Service.kt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/files/S3Service.kt b/backend/src/main/kotlin/org/loculus/backend/service/files/S3Service.kt index 7c966393f3..6fa72beaac 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/files/S3Service.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/files/S3Service.kt @@ -271,6 +271,11 @@ fun s3ErrorMapping(block: () -> T): T { "by part number.", ) + "NoSuchKey" -> UnprocessableEntityException( + "NoSuchKey: The referenced file does not exist in storage. Uploaded files that are not " + + "referenced by a submission for too long are cleaned up automatically.", + ) + else -> RuntimeException("Unexpected S3 error: ${e.awsErrorDetails().errorCode()}") } } From 13c1444a3d01794d9b3be29624eaf13e9d327fa1 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Tue, 16 Jun 2026 11:16:35 +0200 Subject: [PATCH 37/40] Edit stale comment --- .../loculus/backend/service/files/GetOrphanedFileIdsTest.kt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt index c00fc49efd..cc121d1053 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt @@ -91,9 +91,9 @@ class GetOrphanedFileIdsTest( } } - val fileFromOldPipeline = UUID.randomUUID() // pipeline version 1 (< current) -> orphaned - val fileFromCurrentPipeline = UUID.randomUUID() // pipeline version 2 (current) -> protected - val fileFromNewerPipeline = UUID.randomUUID() // pipeline version 3 (> current) -> protected + val fileFromOldPipeline = UUID.randomUUID() // pipeline version 1 (< current) + val fileFromCurrentPipeline = UUID.randomUUID() // pipeline version 2 (current) + val fileFromNewerPipeline = UUID.randomUUID() // pipeline version 3 (> current) listOf(fileFromOldPipeline, fileFromCurrentPipeline, fileFromNewerPipeline) .forEach { insertFile(it, groupId, daysAgo(10)) } From c15ac0dc58402881d1e4fc3e1142a4ba0ea95c9a Mon Sep 17 00:00:00 2001 From: maverbiest Date: Tue, 16 Jun 2026 12:12:23 +0200 Subject: [PATCH 38/40] Refactor S3 GC config flags --- .../backend/config/BackendSpringConfig.kt | 4 ++-- .../submission/S3GarbageCollectionTask.kt | 24 +++++++++---------- .../src/main/resources/application.properties | 4 ++-- .../submission/S3GarbageCollectionTaskTest.kt | 8 +++---- .../loculus/templates/loculus-backend.yaml | 12 +++++----- kubernetes/loculus/values.schema.json | 12 +++++----- kubernetes/loculus/values.yaml | 4 ++-- 7 files changed, 34 insertions(+), 34 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt b/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt index f9d0685bb8..0d9d8fa9da 100644 --- a/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt +++ b/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt @@ -43,8 +43,8 @@ object BackendSpringProperty { const val ENABLE_SEQSETS = "loculus.enable-seqsets" const val S3_ENABLED = "loculus.s3.enabled" - const val S3_GC_DRY_RUN = "loculus.s3.gc-dry-run" - const val S3_MAX_ORPHAN_AGE_DAYS = "loculus.s3.max-orphan-age-days" + const val S3_GC_ENABLED = "loculus.s3.gc-enabled" + const val S3_GC_GRACE_PERIOD_MINUTES = "loculus.s3.gc-grace-period-minutes" const val S3_BUCKET_ENDPOINT = "loculus.s3.bucket.endpoint" const val S3_BUCKET_INTERNAL_ENDPOINT = "loculus.s3.bucket.internal-endpoint" const val S3_BUCKET_BUCKET = "loculus.s3.bucket.bucket" diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index c9d41fa409..bc4cd403f7 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -25,30 +25,30 @@ class S3GarbageCollectionTask( private val s3Service: S3Service, private val dateProvider: DateProvider, private val auditLogger: AuditLogger, - @Value("\${${BackendSpringProperty.S3_MAX_ORPHAN_AGE_DAYS}}") private val maxOrphanAge: Int, - @Value("\${${BackendSpringProperty.S3_GC_DRY_RUN}}") private val dryRun: Boolean, + @Value("\${${BackendSpringProperty.S3_GC_GRACE_PERIOD_MINUTES}}") private val gracePeriod: Int, + @Value("\${${BackendSpringProperty.S3_GC_ENABLED}}") private val deleteOrphans: Boolean, ) { /** - * Runs once daily (with an initial delay of 15 minutes) and deletes S3 objects older than - * `loculus.s3.max-orphan-age-days` that are not referenced in submitted_data or processed_data + * Deletes S3 objects older than `loculus.s3.gc-grace-period-minutes` that are + * not referenced in submitted_data or processed_data */ @Scheduled(initialDelay = 15, fixedDelay = 60 * 24, timeUnit = TimeUnit.MINUTES) fun task() { - // `maxOrphanAge` must be at least 1 or files produced by preprocessing may be + // `gracePeriod` must be at least 1 or files produced by preprocessing may be // garbage collected before they're attached to sequence entries - val maxOrphanAge = max(maxOrphanAge, 1) + val gracePeriod = max(gracePeriod, 1) log.info { - "Running S3 garbage collection task to clean up orphan files at least $maxOrphanAge " + - "days old (dry run = $dryRun)" + "Running S3 garbage collection task to clean up orphan files at least $gracePeriod " + + "minutes old (garbageCollectionEnabled = $deleteOrphans)" } val threshold = dateProvider.getCurrentInstant() - .minus(maxOrphanAge, DateTimeUnit.DAY, DateProvider.timeZone) + .minus(gracePeriod, DateTimeUnit.MINUTE, DateProvider.timeZone) .toLocalDateTime(DateProvider.timeZone) val orphans = filesDatabaseService.getOrphanedFileIds(threshold) - if (dryRun) { + if (!deleteOrphans) { log.info { "S3 garbage collection task would have deleted ${orphans.size} files: $orphans" } return } @@ -67,13 +67,13 @@ class S3GarbageCollectionTask( if (orphans.isNotEmpty()) { log.info { "S3 garbage collection task deleted ${orphans.size - deleteFailures} orphan(s) not referenced by a " + - "submission after $maxOrphanAge days" + "submission after $gracePeriod minutes" } auditLogger .log( "CLEANUP", "S3 garbage collection task deleted ${orphans.size - deleteFailures} orphan(s) " + - "not referenced by a submission after $maxOrphanAge days", + "not referenced by a submission after $gracePeriod minutes", ) if (deleteFailures > 0) { diff --git a/backend/src/main/resources/application.properties b/backend/src/main/resources/application.properties index 2dae05df88..7ce8a196dc 100644 --- a/backend/src/main/resources/application.properties +++ b/backend/src/main/resources/application.properties @@ -33,8 +33,8 @@ loculus.stream.batch-size=1000 loculus.debug-mode=false loculus.s3.enabled=false -loculus.s3.gc-dry-run=true -loculus.s3.max-orphan-age-days=7 +loculus.s3.gc-enabled=false +loculus.s3.gc-grace-period-minutes=1440 loculus.s3.bucket.endpoint= loculus.s3.bucket.bucket= loculus.s3.bucket.region= diff --git a/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt index fd1647e7b8..81397076b8 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt @@ -28,8 +28,8 @@ import java.util.UUID @EndpointTest( properties = [ "${BackendSpringProperty.S3_ENABLED}=true", - "${BackendSpringProperty.S3_GC_DRY_RUN}=true", - "${BackendSpringProperty.S3_MAX_ORPHAN_AGE_DAYS}=1", + "${BackendSpringProperty.S3_GC_ENABLED}=false", + "${BackendSpringProperty.S3_GC_GRACE_PERIOD_MINUTES}=1", ], ) class S3GarbageCollectionTaskTest( @@ -89,8 +89,8 @@ class S3GarbageCollectionTaskTest( s3Service, dateProvider, auditLogger, - maxOrphanAge = 1, - dryRun = false, + gracePeriod = 1, + deleteOrphans = true, ) deleteTask.task() diff --git a/kubernetes/loculus/templates/loculus-backend.yaml b/kubernetes/loculus/templates/loculus-backend.yaml index 620f166395..bcf59c139e 100644 --- a/kubernetes/loculus/templates/loculus-backend.yaml +++ b/kubernetes/loculus/templates/loculus-backend.yaml @@ -71,8 +71,8 @@ spec: - "--loculus.pipeline-version-upgrade-check.interval-seconds={{- .Values.pipelineVersionUpgradeCheckIntervalSeconds | default 10 }}" - "--loculus.s3.enabled=$(S3_ENABLED)" {{- if $.Values.s3.enabled }} - - "--loculus.s3.gc-dry-run=$(S3_GC_DRY_RUN)" - - "--loculus.s3.max-orphan-age-days=$(S3_MAX_ORPHAN_AGE_DAYS)" + - "--loculus.s3.gc-enabled=$(S3_GC_ENABLED)" + - "--loculus.s3.gc-grace-period-minutes=$(S3_GC_GRACE_PERIOD_MINUTES)" - "--loculus.s3.bucket.endpoint=$(S3_BUCKET_ENDPOINT)" - "--loculus.s3.bucket.internal-endpoint=$(S3_BUCKET_INTERNAL_ENDPOINT)" - "--loculus.s3.bucket.region=$(S3_BUCKET_REGION)" @@ -133,10 +133,10 @@ spec: - name: S3_ENABLED value: {{$.Values.s3.enabled | quote }} {{- if $.Values.s3.enabled }} - - name: S3_GC_DRY_RUN - value: {{$.Values.s3.gcDryRun | quote }} - - name: S3_MAX_ORPHAN_AGE_DAYS - value: {{$.Values.s3.maxOrphanAgeDays | quote }} + - name: S3_GC_ENABLED + value: {{$.Values.s3.garbageCollectionEnabled | quote }} + - name: S3_GC_GRACE_PERIOD_MINUTES + value: {{$.Values.s3.garbageCollectionGracePeriodMinutes | quote }} - name: S3_BUCKET_ENDPOINT value: {{ include "loculus.s3Url" . | quote }} - name: S3_BUCKET_INTERNAL_ENDPOINT diff --git a/kubernetes/loculus/values.schema.json b/kubernetes/loculus/values.schema.json index 17b8bddc9e..f4c0b78cb5 100644 --- a/kubernetes/loculus/values.schema.json +++ b/kubernetes/loculus/values.schema.json @@ -1704,17 +1704,17 @@ "default": false, "description": "Whether to enable S3. S3 is needed for the file sharing feature." }, - "gcDryRun": { + "garbageCollectionEnabled": { "groups": ["s3"], "type": "boolean", - "default": true, - "description": "Whether the S3 garbage collection task in the backend should delete orphan files or just output a log with orphan files." + "default": false, + "description": "Whether the S3 garbage collection task in the backend should delete orphan files (if true) or just output a log with orphan files (if false)." }, - "maxOrphanAgeDays": { + "garbageCollectionGracePeriodMinutes": { "groups": ["s3"], "type": "integer", - "default": 7, - "description": "Files uploaded to S3 but never attached to a sequence entry are garbage-collected after this many days." + "default": 1440, + "description": "Files uploaded to S3 but never attached to a sequence entry are garbage-collected after this many minutes." }, "bucket": { "type": "object", diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 08133ecbac..77d06e99ba 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -59,8 +59,8 @@ fileSharing: outputFileUrlType: backend s3: enabled: true - gcDryRun: true # have garbage collection log instead of actually delete files - maxOrphanAgeDays: 7 + garbageCollectionEnabled: false # have garbage collection log instead of actually delete files + garbageCollectionGracePeriodMinutes: 1440 bucket: region: us-east-1 bucket: loculus-preview-private From f2798aa7e1b79ef17efb8efde84a97536026b536 Mon Sep 17 00:00:00 2001 From: maverbiest Date: Tue, 16 Jun 2026 13:22:26 +0200 Subject: [PATCH 39/40] Make initial delay of GC task configurable --- .../org/loculus/backend/config/BackendSpringConfig.kt | 1 + .../backend/service/submission/S3GarbageCollectionTask.kt | 6 +++++- backend/src/main/resources/application.properties | 1 + kubernetes/loculus/templates/loculus-backend.yaml | 3 +++ kubernetes/loculus/values.schema.json | 6 ++++++ kubernetes/loculus/values.yaml | 1 + 6 files changed, 17 insertions(+), 1 deletion(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt b/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt index 0d9d8fa9da..c7fb6e178c 100644 --- a/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt +++ b/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt @@ -45,6 +45,7 @@ object BackendSpringProperty { const val S3_ENABLED = "loculus.s3.enabled" const val S3_GC_ENABLED = "loculus.s3.gc-enabled" const val S3_GC_GRACE_PERIOD_MINUTES = "loculus.s3.gc-grace-period-minutes" + const val S3_GC_INITIAL_DELAY_MINUTES = "loculus.s3.gc-initial-delay-minutes" const val S3_BUCKET_ENDPOINT = "loculus.s3.bucket.endpoint" const val S3_BUCKET_INTERNAL_ENDPOINT = "loculus.s3.bucket.internal-endpoint" const val S3_BUCKET_BUCKET = "loculus.s3.bucket.bucket" diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt index bc4cd403f7..87e4767e0c 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -33,7 +33,11 @@ class S3GarbageCollectionTask( * Deletes S3 objects older than `loculus.s3.gc-grace-period-minutes` that are * not referenced in submitted_data or processed_data */ - @Scheduled(initialDelay = 15, fixedDelay = 60 * 24, timeUnit = TimeUnit.MINUTES) + @Scheduled( + initialDelayString = "\${${BackendSpringProperty.S3_GC_INITIAL_DELAY_MINUTES}}", + fixedDelay = 60 * 24, + timeUnit = TimeUnit.MINUTES, + ) fun task() { // `gracePeriod` must be at least 1 or files produced by preprocessing may be // garbage collected before they're attached to sequence entries diff --git a/backend/src/main/resources/application.properties b/backend/src/main/resources/application.properties index 7ce8a196dc..7f8d250345 100644 --- a/backend/src/main/resources/application.properties +++ b/backend/src/main/resources/application.properties @@ -35,6 +35,7 @@ loculus.debug-mode=false loculus.s3.enabled=false loculus.s3.gc-enabled=false loculus.s3.gc-grace-period-minutes=1440 +loculus.s3.gc-initial-delay-minutes=15 loculus.s3.bucket.endpoint= loculus.s3.bucket.bucket= loculus.s3.bucket.region= diff --git a/kubernetes/loculus/templates/loculus-backend.yaml b/kubernetes/loculus/templates/loculus-backend.yaml index bcf59c139e..65506b8fee 100644 --- a/kubernetes/loculus/templates/loculus-backend.yaml +++ b/kubernetes/loculus/templates/loculus-backend.yaml @@ -73,6 +73,7 @@ spec: {{- if $.Values.s3.enabled }} - "--loculus.s3.gc-enabled=$(S3_GC_ENABLED)" - "--loculus.s3.gc-grace-period-minutes=$(S3_GC_GRACE_PERIOD_MINUTES)" + - "--loculus.s3.gc-initial-delay-minutes=$(S3_GC_INITIAL_DELAY_MINUTES)" - "--loculus.s3.bucket.endpoint=$(S3_BUCKET_ENDPOINT)" - "--loculus.s3.bucket.internal-endpoint=$(S3_BUCKET_INTERNAL_ENDPOINT)" - "--loculus.s3.bucket.region=$(S3_BUCKET_REGION)" @@ -137,6 +138,8 @@ spec: value: {{$.Values.s3.garbageCollectionEnabled | quote }} - name: S3_GC_GRACE_PERIOD_MINUTES value: {{$.Values.s3.garbageCollectionGracePeriodMinutes | quote }} + - name: S3_GC_INITIAL_DELAY_MINUTES + value: {{$.Values.s3.garbageCollectionInitialDelayMinutes | quote }} - name: S3_BUCKET_ENDPOINT value: {{ include "loculus.s3Url" . | quote }} - name: S3_BUCKET_INTERNAL_ENDPOINT diff --git a/kubernetes/loculus/values.schema.json b/kubernetes/loculus/values.schema.json index f4c0b78cb5..fda6d52bbe 100644 --- a/kubernetes/loculus/values.schema.json +++ b/kubernetes/loculus/values.schema.json @@ -1716,6 +1716,12 @@ "default": 1440, "description": "Files uploaded to S3 but never attached to a sequence entry are garbage-collected after this many minutes." }, + "garbageCollectionInitialDelayMinutes": { + "groups": ["s3"], + "type": "integer", + "default": 15, + "description": "How many minutes it takes before the first garbage collection task runs after the backend starts." + }, "bucket": { "type": "object", "properties": { diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 77d06e99ba..9a0938f3b0 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -61,6 +61,7 @@ s3: enabled: true garbageCollectionEnabled: false # have garbage collection log instead of actually delete files garbageCollectionGracePeriodMinutes: 1440 + garbageCollectionInitialDelayMinutes: 15 bucket: region: us-east-1 bucket: loculus-preview-private From a6a2bd0f6d082328b33b7c15fe90a74ca57f549e Mon Sep 17 00:00:00 2001 From: maverbiest Date: Tue, 16 Jun 2026 13:49:08 +0200 Subject: [PATCH 40/40] Add buildSet suggestion --- .../loculus/backend/service/files/FilesDatabaseService.kt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt index 75d3a3c797..c6161bb651 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt @@ -99,11 +99,11 @@ class FilesDatabaseService(private val dateProvider: DateProvider) { listOf(KotlinLocalDateTimeColumnType() to threshold), explicitStatementType = StatementType.SELECT, ) { rs -> - val ids = mutableSetOf() - while (rs.next()) { - ids += rs.getObject("id", UUID::class.java) + buildSet { + while (rs.next()) { + add(rs.getObject("id", UUID::class.java)) + } } - ids } ?: emptySet() } }