diff --git a/backend/docs/db/schema.sql b/backend/docs/db/schema.sql index f2b4eb3909..9e39e44f3b 100644 --- a/backend/docs/db/schema.sql +++ b/backend/docs/db/schema.sql @@ -935,6 +935,13 @@ ALTER TABLE ONLY public.user_groups_table CREATE INDEX data_use_terms_table_accession_idx ON public.data_use_terms_table USING btree (accession); +-- +-- Name: files_upload_requested_at_idx; Type: INDEX; Schema: public; Owner: postgres +-- + +CREATE INDEX files_upload_requested_at_idx ON public.files USING btree (upload_requested_at); + + -- -- Name: flyway_schema_history_s_idx; Type: INDEX; Schema: public; Owner: postgres -- diff --git a/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt b/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt index e9ab9640a1..c7fb6e178c 100644 --- a/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt +++ b/backend/src/main/kotlin/org/loculus/backend/config/BackendSpringConfig.kt @@ -43,6 +43,9 @@ object BackendSpringProperty { const val ENABLE_SEQSETS = "loculus.enable-seqsets" const val S3_ENABLED = "loculus.s3.enabled" + const val S3_GC_ENABLED = "loculus.s3.gc-enabled" + const val S3_GC_GRACE_PERIOD_MINUTES = "loculus.s3.gc-grace-period-minutes" + const val S3_GC_INITIAL_DELAY_MINUTES = "loculus.s3.gc-initial-delay-minutes" const val S3_BUCKET_ENDPOINT = "loculus.s3.bucket.endpoint" const val S3_BUCKET_INTERNAL_ENDPOINT = "loculus.s3.bucket.internal-endpoint" const val S3_BUCKET_BUCKET = "loculus.s3.bucket.bucket" diff --git a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt index dfc6171106..c6161bb651 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/files/FilesDatabaseService.kt @@ -1,10 +1,15 @@ package org.loculus.backend.service.files +import kotlinx.datetime.LocalDateTime +import org.jetbrains.exposed.sql.SqlExpressionBuilder.eq import org.jetbrains.exposed.sql.and +import org.jetbrains.exposed.sql.deleteWhere import org.jetbrains.exposed.sql.insert +import org.jetbrains.exposed.sql.kotlin.datetime.KotlinLocalDateTimeColumnType import org.jetbrains.exposed.sql.not +import org.jetbrains.exposed.sql.statements.StatementType +import org.jetbrains.exposed.sql.transactions.transaction import org.jetbrains.exposed.sql.update -import org.loculus.backend.utils.DatabaseConstants import org.loculus.backend.utils.DateProvider import org.loculus.backend.utils.chunkedForDatabase import org.springframework.stereotype.Service @@ -26,6 +31,10 @@ class FilesDatabaseService(private val dateProvider: DateProvider) { } } + fun deleteFileEntry(fileId: UUID) { + FilesTable.deleteWhere { FilesTable.idColumn eq fileId } + } + fun getGroupIds(fileIds: Set): Map = fileIds.chunkedForDatabase({ chunk -> FilesTable.select(FilesTable.idColumn, FilesTable.groupIdColumn) .where { FilesTable.idColumn inList chunk } @@ -57,6 +66,48 @@ class FilesDatabaseService(private val dateProvider: DateProvider) { chunk.filterNot { it in existingIds } }, 1).toSet() + fun getOrphanedFileIds(threshold: LocalDateTime): Set { + val sql = """ + -- check for files for which an upload was requested > threshold days ago + -- but are not referenced by a submission. For this, check the submitted_data + -- and processed_data jsonb objects (but not archive_of_submitted_data) + WITH referenced AS ( + -- fetch ids for files uploaded by users and referenced in submissions + SELECT (fil->>'fileId')::uuid AS file_id + FROM sequence_entries, + LATERAL jsonb_each(COALESCE(NULLIF(submitted_data->'files', 'null'::jsonb),'{}'::jsonb)) AS cat(k,v), + LATERAL jsonb_array_elements(cat.v) AS fil + UNION + -- also need to check processed_data since preprocessing + -- can create files that are never referenced in submissions + SELECT (fil->>'fileId')::uuid AS file_id + FROM sequence_entries_preprocessed_data sepd + JOIN sequence_entries se + ON se.accession = sepd.accession + AND se.version = sepd.version, + LATERAL jsonb_each(COALESCE(NULLIF(sepd.processed_data->'files', 'null'::jsonb),'{}'::jsonb)) AS cat(k,v), + LATERAL jsonb_array_elements(cat.v) AS fil + ) + SELECT f.id FROM files f + LEFT JOIN referenced r ON r.file_id = f.id + WHERE r.file_id IS NULL + AND f.upload_requested_at < ?; + """.trimIndent() + return transaction { + exec( + sql, + listOf(KotlinLocalDateTimeColumnType() to threshold), + explicitStatementType = StatementType.SELECT, + ) { rs -> + buildSet { + while (rs.next()) { + add(rs.getObject("id", UUID::class.java)) + } + } + } ?: emptySet() + } + } + /** * Return the subset of file IDs for which the file size hasn't been checked yet or * no file has been uploaded yet (and therefore there's no file size). diff --git a/backend/src/main/kotlin/org/loculus/backend/service/files/S3Service.kt b/backend/src/main/kotlin/org/loculus/backend/service/files/S3Service.kt index a03e5f6f85..6fa72beaac 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/files/S3Service.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/files/S3Service.kt @@ -14,6 +14,7 @@ import software.amazon.awssdk.services.s3.model.CompleteMultipartUploadRequest import software.amazon.awssdk.services.s3.model.CompletedMultipartUpload import software.amazon.awssdk.services.s3.model.CompletedPart import software.amazon.awssdk.services.s3.model.CreateMultipartUploadRequest +import software.amazon.awssdk.services.s3.model.DeleteObjectRequest import software.amazon.awssdk.services.s3.model.GetObjectRequest import software.amazon.awssdk.services.s3.model.HeadObjectRequest import software.amazon.awssdk.services.s3.model.PutObjectRequest @@ -176,6 +177,18 @@ class S3Service(private val s3Config: S3Config) { Unit } + fun deleteFile(fileId: FileId) = s3ErrorMapping { + val config = getS3BucketConfig() + s3Client.deleteObject( + DeleteObjectRequest + .builder() + .bucket(config.bucket) + .key(getFileIdPath(fileId)) + .build(), + ) + Unit + } + private fun assertIsEnabled() { if (!s3Config.enabled) { throw IllegalStateException("S3 is not enabled") @@ -258,6 +271,11 @@ fun s3ErrorMapping(block: () -> T): T { "by part number.", ) + "NoSuchKey" -> UnprocessableEntityException( + "NoSuchKey: The referenced file does not exist in storage. Uploaded files that are not " + + "referenced by a submission for too long are cleaned up automatically.", + ) + else -> RuntimeException("Unexpected S3 error: ${e.awsErrorDetails().errorCode()}") } } diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt new file mode 100644 index 0000000000..87e4767e0c --- /dev/null +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTask.kt @@ -0,0 +1,92 @@ +package org.loculus.backend.service.submission + +import kotlinx.datetime.DateTimeUnit +import kotlinx.datetime.minus +import kotlinx.datetime.toLocalDateTime +import org.loculus.backend.config.BackendSpringProperty +import org.loculus.backend.log.AuditLogger +import org.loculus.backend.service.files.FilesDatabaseService +import org.loculus.backend.service.files.S3Service +import org.loculus.backend.utils.DateProvider +import org.springframework.beans.factory.annotation.Value +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty +import org.springframework.scheduling.annotation.Scheduled +import org.springframework.stereotype.Component +import java.util.UUID +import java.util.concurrent.TimeUnit +import kotlin.math.max + +private val log = mu.KotlinLogging.logger {} + +@Component +@ConditionalOnProperty("loculus.s3.enabled", havingValue = "true") +class S3GarbageCollectionTask( + private val filesDatabaseService: FilesDatabaseService, + private val s3Service: S3Service, + private val dateProvider: DateProvider, + private val auditLogger: AuditLogger, + @Value("\${${BackendSpringProperty.S3_GC_GRACE_PERIOD_MINUTES}}") private val gracePeriod: Int, + @Value("\${${BackendSpringProperty.S3_GC_ENABLED}}") private val deleteOrphans: Boolean, +) { + + /** + * Deletes S3 objects older than `loculus.s3.gc-grace-period-minutes` that are + * not referenced in submitted_data or processed_data + */ + @Scheduled( + initialDelayString = "\${${BackendSpringProperty.S3_GC_INITIAL_DELAY_MINUTES}}", + fixedDelay = 60 * 24, + timeUnit = TimeUnit.MINUTES, + ) + fun task() { + // `gracePeriod` must be at least 1 or files produced by preprocessing may be + // garbage collected before they're attached to sequence entries + val gracePeriod = max(gracePeriod, 1) + log.info { + "Running S3 garbage collection task to clean up orphan files at least $gracePeriod " + + "minutes old (garbageCollectionEnabled = $deleteOrphans)" + } + + val threshold = dateProvider.getCurrentInstant() + .minus(gracePeriod, DateTimeUnit.MINUTE, DateProvider.timeZone) + .toLocalDateTime(DateProvider.timeZone) + val orphans = filesDatabaseService.getOrphanedFileIds(threshold) + + if (!deleteOrphans) { + log.info { "S3 garbage collection task would have deleted ${orphans.size} files: $orphans" } + return + } + + var deleteFailures = 0 + orphans.forEach { fileId: UUID -> + try { + s3Service.deleteFile(fileId) + filesDatabaseService.deleteFileEntry(fileId) + } catch (e: Exception) { + log.warn("Failed to delete $fileId", e) + deleteFailures++ + } + } + + if (orphans.isNotEmpty()) { + log.info { + "S3 garbage collection task deleted ${orphans.size - deleteFailures} orphan(s) not referenced by a " + + "submission after $gracePeriod minutes" + } + auditLogger + .log( + "CLEANUP", + "S3 garbage collection task deleted ${orphans.size - deleteFailures} orphan(s) " + + "not referenced by a submission after $gracePeriod minutes", + ) + + if (deleteFailures > 0) { + log.warn { + "S3 garbage collection task unsuccessfully attempted to delete $deleteFailures orphan file(s)" + } + } + } else { + log.info { "S3 garbage collection task identified no orphan files on S3" } + } + } +} diff --git a/backend/src/main/resources/application.properties b/backend/src/main/resources/application.properties index 1ce6999fef..7f8d250345 100644 --- a/backend/src/main/resources/application.properties +++ b/backend/src/main/resources/application.properties @@ -33,6 +33,9 @@ loculus.stream.batch-size=1000 loculus.debug-mode=false loculus.s3.enabled=false +loculus.s3.gc-enabled=false +loculus.s3.gc-grace-period-minutes=1440 +loculus.s3.gc-initial-delay-minutes=15 loculus.s3.bucket.endpoint= loculus.s3.bucket.bucket= loculus.s3.bucket.region= diff --git a/backend/src/main/resources/db/migration/V1.31__add_files_upload_requested_at_index.sql b/backend/src/main/resources/db/migration/V1.31__add_files_upload_requested_at_index.sql new file mode 100644 index 0000000000..a7e75ce47c --- /dev/null +++ b/backend/src/main/resources/db/migration/V1.31__add_files_upload_requested_at_index.sql @@ -0,0 +1 @@ +CREATE INDEX IF NOT EXISTS files_upload_requested_at_idx ON files (upload_requested_at); diff --git a/backend/src/test/kotlin/org/loculus/backend/service/TestHelpers.kt b/backend/src/test/kotlin/org/loculus/backend/service/TestHelpers.kt new file mode 100644 index 0000000000..42282ab240 --- /dev/null +++ b/backend/src/test/kotlin/org/loculus/backend/service/TestHelpers.kt @@ -0,0 +1,26 @@ +package org.loculus.backend.service + +import kotlinx.datetime.DateTimeUnit +import kotlinx.datetime.LocalDateTime +import kotlinx.datetime.minus +import kotlinx.datetime.toLocalDateTime +import org.jetbrains.exposed.sql.insert +import org.jetbrains.exposed.sql.transactions.transaction +import org.loculus.backend.service.files.FilesTable +import org.loculus.backend.utils.DateProvider +import java.util.UUID +import kotlin.time.Clock + +fun insertFile(id: UUID, groupId: Int, requestedAt: LocalDateTime, uploader: String = "testuser") = transaction { + FilesTable.insert { + it[idColumn] = id + it[uploadRequestedAtColumn] = requestedAt + it[uploaderColumn] = uploader + it[groupIdColumn] = groupId + it[multipartCompleted] = true + } +} + +fun daysAgo(days: Long): LocalDateTime = Clock.System.now() + .minus(days, DateTimeUnit.DAY, DateProvider.timeZone) + .toLocalDateTime(DateProvider.timeZone) diff --git a/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt new file mode 100644 index 0000000000..cc121d1053 --- /dev/null +++ b/backend/src/test/kotlin/org/loculus/backend/service/files/GetOrphanedFileIdsTest.kt @@ -0,0 +1,193 @@ +package org.loculus.backend.service.files + +import org.hamcrest.MatcherAssert.assertThat +import org.hamcrest.Matchers.`is` +import org.jetbrains.exposed.sql.insert +import org.jetbrains.exposed.sql.transactions.transaction +import org.jetbrains.exposed.sql.update +import org.junit.jupiter.api.BeforeEach +import org.junit.jupiter.api.Test +import org.loculus.backend.api.FileIdAndName +import org.loculus.backend.api.ProcessedData +import org.loculus.backend.api.SubmittedData +import org.loculus.backend.config.BackendSpringProperty +import org.loculus.backend.controller.DEFAULT_GROUP +import org.loculus.backend.controller.DEFAULT_ORGANISM +import org.loculus.backend.controller.EndpointTest +import org.loculus.backend.controller.groupmanagement.GroupManagementControllerClient +import org.loculus.backend.controller.groupmanagement.andGetGroupId +import org.loculus.backend.controller.jwtForDefaultUser +import org.loculus.backend.service.daysAgo +import org.loculus.backend.service.insertFile +import org.loculus.backend.service.submission.CompressedSequence +import org.loculus.backend.service.submission.SequenceEntriesPreprocessedDataTable +import org.loculus.backend.service.submission.SequenceEntriesTable +import org.loculus.backend.service.submission.dbtables.CurrentProcessingPipelineTable +import org.loculus.backend.utils.DateProvider +import org.springframework.beans.factory.annotation.Autowired +import java.util.UUID + +/** + * Testing of orphan file detection logic in [FilesDatabaseService.getOrphanedFileIds]. + */ +@EndpointTest( + properties = [ + // set to high value to prevent tests from triggering pipeline version upgrade task + "${BackendSpringProperty.PIPELINE_VERSION_UPGRADE_CHECK_INTERVAL_SECONDS}=999999", + ], +) +class GetOrphanedFileIdsTest( + @Autowired val filesDatabaseService: FilesDatabaseService, + @Autowired val groupManagementClient: GroupManagementControllerClient, + @Autowired val dateProvider: DateProvider, +) { + private var groupId = 0 + + @BeforeEach + fun createGroup() { + groupId = groupManagementClient + .createNewGroup(group = DEFAULT_GROUP, jwt = jwtForDefaultUser) + .andGetGroupId() + } + + @Test + fun `GIVEN unreferenced files THEN only those whose upload was requested before the threshold are orphaned`() { + val old = UUID.randomUUID() + val recent = UUID.randomUUID() + insertFile(old, groupId, daysAgo(10)) + insertFile(recent, groupId, daysAgo(1)) + + val orphans = filesDatabaseService.getOrphanedFileIds(daysAgo(5)) + + assertThat(orphans, `is`(setOf(old))) + } + + @Suppress("ktlint:standard:max-line-length") + @Test + fun `GIVEN a file only referenced in archive_of_submitted_data THEN it is orphaned, but submitted_data is protected`() { + // Simulate a case where a user edited a submission, replacing `editedAway` with `currentFile`. + val editedAway = UUID.randomUUID() + val currentFile = UUID.randomUUID() + listOf(editedAway, currentFile).forEach { insertFile(it, groupId, daysAgo(10)) } + insertSequenceEntry( + accession = "A", + version = 2, + archive = makeUnprocessedData(editedAway), + submitted = makeUnprocessedData(currentFile), + ) + + val orphans = filesDatabaseService.getOrphanedFileIds(daysAgo(5)) + + assertThat(orphans, `is`(setOf(editedAway))) + } + + @Test + fun `GIVEN multiple pipeline versions THEN files from all pipeline versions are protected`() { + transaction { + CurrentProcessingPipelineTable.update( + { CurrentProcessingPipelineTable.organismColumn eq DEFAULT_ORGANISM }, + ) { + it[versionColumn] = 2 + } + } + + val fileFromOldPipeline = UUID.randomUUID() // pipeline version 1 (< current) + val fileFromCurrentPipeline = UUID.randomUUID() // pipeline version 2 (current) + val fileFromNewerPipeline = UUID.randomUUID() // pipeline version 3 (> current) + listOf(fileFromOldPipeline, fileFromCurrentPipeline, fileFromNewerPipeline) + .forEach { insertFile(it, groupId, daysAgo(10)) } + + // The sequence entry itself references no files, so only the processed_data references matter. + insertSequenceEntry(accession = "A", version = 1, archive = null, submitted = makeUnprocessedData(null)) + insertPreprocessedData( + accession = "A", + version = 1, + pipelineVersion = 1, + processed = makeProcessedData(fileFromOldPipeline), + ) + insertPreprocessedData( + accession = "A", + version = 1, + pipelineVersion = 2, + processed = makeProcessedData(fileFromCurrentPipeline), + ) + insertPreprocessedData( + accession = "A", + version = 1, + pipelineVersion = 3, + processed = makeProcessedData(fileFromNewerPipeline), + ) + + val orphans = filesDatabaseService.getOrphanedFileIds(daysAgo(5)) + + assertThat(orphans, `is`(emptySet())) + } + + @Test + fun `GIVEN a file referenced only by old version THEN it is still protected`() { + val fileInOldVersion = UUID.randomUUID() + insertFile(fileInOldVersion, groupId, daysAgo(10)) + insertSequenceEntry( + accession = "A", + version = 1, + archive = null, + submitted = makeUnprocessedData(fileInOldVersion), + ) + insertSequenceEntry(accession = "A", version = 2, archive = null, submitted = makeUnprocessedData(null)) + + val orphans = filesDatabaseService.getOrphanedFileIds(daysAgo(5)) + + assertThat(orphans.isEmpty(), `is`(true)) + } + + private fun insertSequenceEntry( + accession: String, + version: Long, + archive: SubmittedData?, + submitted: SubmittedData?, + ) = transaction { + SequenceEntriesTable.insert { + it[accessionColumn] = accession + it[versionColumn] = version + it[organismColumn] = DEFAULT_ORGANISM + it[submissionIdColumn] = "submission-$accession-$version" + it[submitterColumn] = "testuser" + it[groupIdColumn] = groupId + it[submittedAtTimestampColumn] = dateProvider.getCurrentDateTime() + it[archiveOfSubmittedDataColumn] = archive + it[submittedDataColumn] = submitted + } + } + + private fun insertPreprocessedData( + accession: String, + version: Long, + pipelineVersion: Long, + processed: ProcessedData, + ) = transaction { + SequenceEntriesPreprocessedDataTable.insert { + it[accessionColumn] = accession + it[versionColumn] = version + it[pipelineVersionColumn] = pipelineVersion + it[processedDataColumn] = processed + it[processingStatusColumn] = "PROCESSED" + it[startedProcessingAtColumn] = dateProvider.getCurrentDateTime() + } + } + + private fun makeUnprocessedData(fileId: UUID?): SubmittedData = SubmittedData( + metadata = emptyMap(), + unalignedNucleotideSequences = emptyMap(), + files = fileId?.let { mapOf("rawReads" to listOf(FileIdAndName(it, "raw.fastq"))) }, + ) + + private fun makeProcessedData(fileId: UUID): ProcessedData = ProcessedData( + metadata = emptyMap(), + unalignedNucleotideSequences = emptyMap(), + alignedNucleotideSequences = emptyMap(), + nucleotideInsertions = emptyMap(), + alignedAminoAcidSequences = emptyMap(), + aminoAcidInsertions = emptyMap(), + files = mapOf("processedOutput" to listOf(FileIdAndName(fileId, "aligned.bam"))), + ) +} diff --git a/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt new file mode 100644 index 0000000000..81397076b8 --- /dev/null +++ b/backend/src/test/kotlin/org/loculus/backend/service/submission/S3GarbageCollectionTaskTest.kt @@ -0,0 +1,104 @@ +package org.loculus.backend.service.submission + +import com.ninjasquad.springmockk.MockkBean +import io.mockk.verify +import org.hamcrest.MatcherAssert.assertThat +import org.hamcrest.Matchers.`is` +import org.jetbrains.exposed.sql.insert +import org.jetbrains.exposed.sql.transactions.transaction +import org.junit.jupiter.api.Test +import org.loculus.backend.api.FileIdAndName +import org.loculus.backend.api.SubmittedData +import org.loculus.backend.config.BackendSpringProperty +import org.loculus.backend.controller.DEFAULT_GROUP +import org.loculus.backend.controller.DEFAULT_ORGANISM +import org.loculus.backend.controller.EndpointTest +import org.loculus.backend.controller.groupmanagement.GroupManagementControllerClient +import org.loculus.backend.controller.groupmanagement.andGetGroupId +import org.loculus.backend.controller.jwtForDefaultUser +import org.loculus.backend.log.AuditLogger +import org.loculus.backend.service.daysAgo +import org.loculus.backend.service.files.FilesDatabaseService +import org.loculus.backend.service.files.S3Service +import org.loculus.backend.service.insertFile +import org.loculus.backend.utils.DateProvider +import org.springframework.beans.factory.annotation.Autowired +import java.util.UUID + +@EndpointTest( + properties = [ + "${BackendSpringProperty.S3_ENABLED}=true", + "${BackendSpringProperty.S3_GC_ENABLED}=false", + "${BackendSpringProperty.S3_GC_GRACE_PERIOD_MINUTES}=1", + ], +) +class S3GarbageCollectionTaskTest( + @Autowired val s3GarbageCollectionTask: S3GarbageCollectionTask, + @Autowired val filesDatabaseService: FilesDatabaseService, + @Autowired val groupManagementClient: GroupManagementControllerClient, + @Autowired val dateProvider: DateProvider, + @Autowired val auditLogger: AuditLogger, +) { + @MockkBean(relaxed = true) + lateinit var s3Service: S3Service + + @Test + fun `GIVEN dry run is enabled WHEN the task runs THEN the orphan is not deleted from S3 or the DB`() { + val groupId = groupManagementClient + .createNewGroup(group = DEFAULT_GROUP, jwt = jwtForDefaultUser) + .andGetGroupId() + val orphan = UUID.randomUUID() + insertFile(orphan, groupId, daysAgo(2)) + + s3GarbageCollectionTask.task() + + verify(exactly = 0) { s3Service.deleteFile(any()) } + assertThat(filesDatabaseService.getNonExistentFileIds(setOf(orphan)), `is`(emptySet())) + } + + @Suppress("ktlint:standard:max-line-length") + @Test + fun `GIVEN an orphan and a referenced file WHEN the task runs THEN only the orphan is deleted from S3 and the DB`() { + val groupId = groupManagementClient + .createNewGroup(group = DEFAULT_GROUP, jwt = jwtForDefaultUser) + .andGetGroupId() + val orphan = UUID.randomUUID() + val referenced = UUID.randomUUID() + listOf(orphan, referenced).forEach { insertFile(it, groupId, daysAgo(2)) } + + // `referenced` is referenced by a submission's unprocessed_data, so it must be protected. + transaction { + SequenceEntriesTable.insert { + it[accessionColumn] = "A1" + it[versionColumn] = 1 + it[organismColumn] = DEFAULT_ORGANISM + it[submissionIdColumn] = "submission-A1" + it[submitterColumn] = "testuser" + it[groupIdColumn] = groupId + it[submittedAtTimestampColumn] = dateProvider.getCurrentDateTime() + it[submittedDataColumn] = SubmittedData( + metadata = emptyMap(), + unalignedNucleotideSequences = emptyMap(), + files = mapOf("rawReads" to listOf(FileIdAndName(referenced, "raw.fastq"))), + ) + } + } + + val deleteTask = S3GarbageCollectionTask( + filesDatabaseService, + s3Service, + dateProvider, + auditLogger, + gracePeriod = 1, + deleteOrphans = true, + ) + deleteTask.task() + + verify { s3Service.deleteFile(orphan) } + verify(exactly = 0) { s3Service.deleteFile(referenced) } + assertThat( + filesDatabaseService.getNonExistentFileIds(setOf(orphan, referenced)), + `is`(setOf(orphan)), + ) + } +} diff --git a/kubernetes/loculus/templates/loculus-backend.yaml b/kubernetes/loculus/templates/loculus-backend.yaml index dc6ef7d14d..65506b8fee 100644 --- a/kubernetes/loculus/templates/loculus-backend.yaml +++ b/kubernetes/loculus/templates/loculus-backend.yaml @@ -71,6 +71,9 @@ spec: - "--loculus.pipeline-version-upgrade-check.interval-seconds={{- .Values.pipelineVersionUpgradeCheckIntervalSeconds | default 10 }}" - "--loculus.s3.enabled=$(S3_ENABLED)" {{- if $.Values.s3.enabled }} + - "--loculus.s3.gc-enabled=$(S3_GC_ENABLED)" + - "--loculus.s3.gc-grace-period-minutes=$(S3_GC_GRACE_PERIOD_MINUTES)" + - "--loculus.s3.gc-initial-delay-minutes=$(S3_GC_INITIAL_DELAY_MINUTES)" - "--loculus.s3.bucket.endpoint=$(S3_BUCKET_ENDPOINT)" - "--loculus.s3.bucket.internal-endpoint=$(S3_BUCKET_INTERNAL_ENDPOINT)" - "--loculus.s3.bucket.region=$(S3_BUCKET_REGION)" @@ -131,6 +134,12 @@ spec: - name: S3_ENABLED value: {{$.Values.s3.enabled | quote }} {{- if $.Values.s3.enabled }} + - name: S3_GC_ENABLED + value: {{$.Values.s3.garbageCollectionEnabled | quote }} + - name: S3_GC_GRACE_PERIOD_MINUTES + value: {{$.Values.s3.garbageCollectionGracePeriodMinutes | quote }} + - name: S3_GC_INITIAL_DELAY_MINUTES + value: {{$.Values.s3.garbageCollectionInitialDelayMinutes | quote }} - name: S3_BUCKET_ENDPOINT value: {{ include "loculus.s3Url" . | quote }} - name: S3_BUCKET_INTERNAL_ENDPOINT @@ -155,4 +164,4 @@ spec: mountPath: /config volumes: {{ include "loculus.configVolume" (dict "name" "loculus-backend-config") | nindent 8 }} -{{- end }} \ No newline at end of file +{{- end }} diff --git a/kubernetes/loculus/values.schema.json b/kubernetes/loculus/values.schema.json index 9efa3dc6f2..fda6d52bbe 100644 --- a/kubernetes/loculus/values.schema.json +++ b/kubernetes/loculus/values.schema.json @@ -1704,6 +1704,24 @@ "default": false, "description": "Whether to enable S3. S3 is needed for the file sharing feature." }, + "garbageCollectionEnabled": { + "groups": ["s3"], + "type": "boolean", + "default": false, + "description": "Whether the S3 garbage collection task in the backend should delete orphan files (if true) or just output a log with orphan files (if false)." + }, + "garbageCollectionGracePeriodMinutes": { + "groups": ["s3"], + "type": "integer", + "default": 1440, + "description": "Files uploaded to S3 but never attached to a sequence entry are garbage-collected after this many minutes." + }, + "garbageCollectionInitialDelayMinutes": { + "groups": ["s3"], + "type": "integer", + "default": 15, + "description": "How many minutes it takes before the first garbage collection task runs after the backend starts." + }, "bucket": { "type": "object", "properties": { diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index ac55e61ca3..9a0938f3b0 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -59,6 +59,9 @@ fileSharing: outputFileUrlType: backend s3: enabled: true + garbageCollectionEnabled: false # have garbage collection log instead of actually delete files + garbageCollectionGracePeriodMinutes: 1440 + garbageCollectionInitialDelayMinutes: 15 bucket: region: us-east-1 bucket: loculus-preview-private