Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
884c04e
Initial S3 garbage collection implementation
maverbiest May 20, 2026
e107ba5
Update type
maverbiest May 31, 2026
0443280
Update query
maverbiest Jun 1, 2026
af95fb7
Fix query
maverbiest Jun 1, 2026
6f3d146
Update SQL to also GC files generated by old pipeline versions
maverbiest Jun 2, 2026
a3aa7f0
update comment
maverbiest Jun 2, 2026
aed1fb6
Linting, formatting
maverbiest Jun 2, 2026
a6500b8
fix imports
maverbiest Jun 2, 2026
683c5b0
Fix query
maverbiest Jun 5, 2026
71ca764
Claude: add tests
maverbiest Jun 5, 2026
4797c66
Clean up GetOrphanedFileIdsTest.kt
maverbiest Jun 8, 2026
702ccb3
Add test helper file, small typing fix
maverbiest Jun 8, 2026
0ae53a4
Fix import order
maverbiest Jun 8, 2026
0cf9e12
Clean up S3GarbageCollectionTaskTest.kt
maverbiest Jun 8, 2026
cbf53be
Comment
maverbiest Jun 8, 2026
7fdcd15
Add initialDelay to task, don't abort batch on first failure
maverbiest Jun 8, 2026
9f4e8f8
Fix
maverbiest Jun 8, 2026
5b89ed1
Fix indent
maverbiest Jun 8, 2026
e97e00a
Fix log
maverbiest Jun 8, 2026
d77f923
Add spaces
maverbiest Jun 8, 2026
26968a2
Add index to files.upload_requested_at
maverbiest Jun 8, 2026
f45252d
Update schema documentation based on migration changes
actions-user Jun 9, 2026
dd93d92
TO REVERT: configure garbage collection task for interactive testing
maverbiest Jun 9, 2026
9adaa8e
TO REVERT: set initial delay to 90 minutes to wait out initial prepro…
maverbiest Jun 9, 2026
fc3f17f
log on each GC run
maverbiest Jun 9, 2026
febad3f
Improve logs, enforce maxOrphanAge of at least 1
maverbiest Jun 10, 2026
3cd7cc4
Update tests and logs to account for clamping of maxOrphanAge
maverbiest Jun 10, 2026
27138b7
Revert "TO REVERT: set initial delay to 90 minutes to wait out initia…
maverbiest Jun 10, 2026
2441fa2
Revert "TO REVERT: configure garbage collection task for interactive …
maverbiest Jun 10, 2026
ff618b5
Make maxOrphanAgeDays configurable in values.yaml
maverbiest Jun 10, 2026
a8df88e
Add back default value in application.properties as fallback when no
maverbiest Jun 10, 2026
16065cb
Make consistent with major backend refactor after rebase
maverbiest Jun 10, 2026
06a1cd7
Formatting
maverbiest Jun 10, 2026
f68bf0c
Add gcDryRun flag, turned on by default
maverbiest Jun 11, 2026
6a83024
Prevent files from previous pipeline version from being garbage
maverbiest Jun 11, 2026
56caffe
Improve error message for missing S3 file
maverbiest Jun 11, 2026
13c1444
Edit stale comment
maverbiest Jun 16, 2026
c15ac0d
Refactor S3 GC config flags
maverbiest Jun 16, 2026
f2798aa
Make initial delay of GC task configurable
maverbiest Jun 16, 2026
a6a2bd0
Add buildSet suggestion
maverbiest Jun 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions backend/docs/db/schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -935,6 +935,13 @@ ALTER TABLE ONLY public.user_groups_table
CREATE INDEX data_use_terms_table_accession_idx ON public.data_use_terms_table USING btree (accession);


--
-- Name: files_upload_requested_at_idx; Type: INDEX; Schema: public; Owner: postgres
--

CREATE INDEX files_upload_requested_at_idx ON public.files USING btree (upload_requested_at);


--
-- Name: flyway_schema_history_s_idx; Type: INDEX; Schema: public; Owner: postgres
--
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ object BackendSpringProperty {
const val ENABLE_SEQSETS = "loculus.enable-seqsets"

const val S3_ENABLED = "loculus.s3.enabled"
const val S3_GC_ENABLED = "loculus.s3.gc-enabled"
const val S3_GC_GRACE_PERIOD_MINUTES = "loculus.s3.gc-grace-period-minutes"
const val S3_GC_INITIAL_DELAY_MINUTES = "loculus.s3.gc-initial-delay-minutes"
const val S3_BUCKET_ENDPOINT = "loculus.s3.bucket.endpoint"
const val S3_BUCKET_INTERNAL_ENDPOINT = "loculus.s3.bucket.internal-endpoint"
const val S3_BUCKET_BUCKET = "loculus.s3.bucket.bucket"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
package org.loculus.backend.service.files

import kotlinx.datetime.LocalDateTime
import org.jetbrains.exposed.sql.SqlExpressionBuilder.eq
import org.jetbrains.exposed.sql.and
import org.jetbrains.exposed.sql.deleteWhere
import org.jetbrains.exposed.sql.insert
import org.jetbrains.exposed.sql.kotlin.datetime.KotlinLocalDateTimeColumnType
import org.jetbrains.exposed.sql.not
import org.jetbrains.exposed.sql.statements.StatementType
import org.jetbrains.exposed.sql.transactions.transaction
import org.jetbrains.exposed.sql.update
import org.loculus.backend.utils.DatabaseConstants
import org.loculus.backend.utils.DateProvider
import org.loculus.backend.utils.chunkedForDatabase
import org.springframework.stereotype.Service
Expand All @@ -26,6 +31,10 @@ class FilesDatabaseService(private val dateProvider: DateProvider) {
}
}

/**
 * Deletes the row(s) in [FilesTable] whose id column equals [fileId].
 *
 * NOTE(review): this function does not open a database transaction itself;
 * presumably the caller is expected to invoke it inside one — confirm.
 *
 * @param fileId id of the file entry to remove from the files table.
 */
fun deleteFileEntry(fileId: UUID) {
FilesTable.deleteWhere { FilesTable.idColumn eq fileId }
}

fun getGroupIds(fileIds: Set<FileId>): Map<FileId, Int> = fileIds.chunkedForDatabase({ chunk ->
FilesTable.select(FilesTable.idColumn, FilesTable.groupIdColumn)
.where { FilesTable.idColumn inList chunk }
Expand Down Expand Up @@ -57,6 +66,48 @@ class FilesDatabaseService(private val dateProvider: DateProvider) {
chunk.filterNot { it in existingIds }
}, 1).toSet()

fun getOrphanedFileIds(threshold: LocalDateTime): Set<FileId> {
Comment thread
maverbiest marked this conversation as resolved.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We ran this on staging to do some performance testing (we added one entry to the files table so that the right side of the join actually executes). Scanning every entry is definitely not the greatest, so in the future we might need to refactor, but this should work for now :-)

Details

⏺ What This Query Plan Does

  The logical query: Find rows in files where the file's id does NOT appear as a fileId in any sequence entry's JSON data — i.e., find orphaned files.

  ---
  Why It Finished in 2ms Despite Terrifying Cost Estimates

  The plan executed a Merge Anti Join, which works like this:

  1. Sort the left side (files)
  2. Sort the right side (all fileIds extracted from JSON in sequence entries)
  3. Walk both sorted lists; emit left-side rows that have no match on the right

  The critical detail: the left side (files filtered by upload_requested_at < now()) returned 0 rows. Once the left side is exhausted, the right side is never executed. Every node tagged (never executed) was skipped entirely.

  ---
  The Right Side Would Have Been Catastrophic
  
  If any files had matched the filter, the right side would need to:

  sequence_entries (309k rows)
    → jsonb_each(metadata) → explode into category/value pairs      ~30M rows
      → jsonb_array_elements(fil) → explode arrays of file refs     ~3B rows

  sequence_entries_preprocessed_data (614k rows, joined via parallel hash join)
    → jsonb_each → jsonb_array_elements                             ~6B rows

  Total estimated: 9.2 billion rows to sort and deduplicate

  PostgreSQL has almost no statistics about what jsonb_each and jsonb_array_elements return — it assumes 100 rows per call by default, which compounds wildly across nested loops. The 2.6 trillion cost estimate is the planner's honest (if terrible) guess.

  ---
  Key Observations

  ┌───────────────────────────────────────────┬─────────────────────────────────────────────────────┐
  │                   Thing                   │                    What it means                    │
  ├───────────────────────────────────────────┼─────────────────────────────────────────────────────┤
  │ rows=0 on Seq Scan of files               │ The filter found nothing — query was trivially fast │
  ├───────────────────────────────────────────┼─────────────────────────────────────────────────────┤
  │ (never executed) everywhere on right side │ Short-circuited; never touched                      │
  ├───────────────────────────────────────────┼─────────────────────────────────────────────────────┤
  │ Cost 2645430954                           │ Planner's estimate, not actual work done            │
  ├───────────────────────────────────────────┼─────────────────────────────────────────────────────┤
  │ Actual time=0.010..0.013 on top node      │ 13 microseconds of real work                        │
  ├───────────────────────────────────────────┼─────────────────────────────────────────────────────┤
  │ Workers Launched: 0 (planned 2)           │ Parallel workers weren't even started               │
  └───────────────────────────────────────────┴─────────────────────────────────────────────────────┘

  ---
  The Real Risk
  
  This query has a latent performance bomb. If the files table ever has rows matching the filter, PostgreSQL will attempt to sort and deduplicate ~9 billion JSON-extracted values. The fix would be one of:

  - A materialized CTE or temp table to pre-extract fileIds with proper indexing
  - An index on a normalized file_references table instead of JSON scanning
  - Rewriting as NOT EXISTS (SELECT 1 FROM sequence_entries WHERE metadata @> ...) with a GIN index on the JSONB column

✻ Baked for 25s

❯ how can we insert a few rows just to trigger the right side                                                                                                                                                                                                                                                                                                                                                                                                                                         

⏺ Let me find the files table schema and the query itself.

  Searched for 6 patterns, read 2 files (ctrl+o to expand)
  ⎿  Loaded backend/CLAUDE.md
  ⎿  Loaded backend/AGENTS.md

⏺ Now I have everything. Here's the minimal insert to trigger the right side:

  INSERT INTO files (id, upload_requested_at, uploader, group_id, size, multipart_completed, multipart_upload_id)
  VALUES (
      gen_random_uuid(),
      now() - interval '1 day',  -- must be < now() to pass the filter
      'test-user',
      1,                          -- any group_id that exists
      NULL,
      false,
      NULL
  );
  
  The filter is upload_requested_at < now(), so any timestamp in the past works. The group_id needs to be a valid integer but there's no FK constraint enforced at the DB level based on the schema, so 1 should work regardless.

  After inserting, re-run the EXPLAIN ANALYZE and you'll see the right side execute — and likely see it sorting/deduplicating hundreds of millions of rows (or running out of memory/time). That's the performance cliff.

  To clean up afterwards:
  
  DELETE FROM files WHERE uploader = 'test-user';

✻ Baked for 26s

❯ guess how long it will take

⏺ Hard to say precisely without knowing actual JSON density, but here's the reasoning:

  The 9.2 billion row estimate is almost certainly fiction. PostgreSQL blindly assumes jsonb_each returns 100 rows and jsonb_array_elements returns another 100 — multiplied across 309K + 614K base rows. In reality, most sequences probably have 0 or a handful of file references (file sharing looks like a newer/optional feature in this codebase).

  What actually drives the time:
  
  1. Seq scan of sequence_entries — 309K rows × 966 bytes each ≈ 300MB to read
  2. Seq scan + hash join of sequence_entries_preprocessed_data — 614K rows × 783 bytes ≈ 480MB
  3. JSON expansion — if most submitted_data->'files' are null/empty, jsonb_each('{}') emits 0 rows and it collapses fast
  4. Sort + Unique — only expensive if expansion actually produces many rows

  My guess:

  - If file sharing is sparsely used (most rows have no files): 10–60 seconds, dominated by the table scans
  - If sequences have ~1–5 file refs each: 2–10 minutes, sort of a few million rows
  - If the 9.2B estimate were anywhere close to real: hours, and it would probably spill to disk

  You can tell which world you're in after inserting the test row by watching whether the Sort node in EXPLAIN ANALYZE says Sort Method: quicksort  Memory: XkB (fast, fits in work_mem) or Sort Method: external merge  Disk: XkB (slow, spilling).

✻ Baked for 32s

❯ there is just one file and this came back "Merge Anti Join  (cost=2645430954.56..2837916997.28 rows=105 width=16) (actual time=165112.005..165115.272 rows=1 loops=1)"                                                                                                                                                                                                                                                                                                                              
  "  Merge Cond: (f.id = (((fil.value ->> 'fileId'::text))::uuid))"                                                                                                                                                                                                                                                                                                                                                                                                                                   
  "  Buffers: shared hit=1497697 read=232824"                                                                                                                                                                                                                                                                                                                                                                                                                                                         
  "  I/O Timings: shared read=156340.591"                                                                                                                                                                                                                                                                                                                                                                                                                                                             
  "  ->  Sort  (cost=27.55..28.07 rows=210 width=16) (actual time=0.019..0.020 rows=1 loops=1)"                                                                                                                                                                                                                                                                                                                                                                                                       
  "        Sort Key: f.id"                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
  "        Sort Method: quicksort  Memory: 25kB"                                                                                                                                                                                                                                                                                                                                                                                                                                                      
  "        Buffers: shared hit=1"                                                                                                                                                                                                                                                                                                                                                                                                                                                                     
  "        ->  Seq Scan on files f  (cost=0.00..19.45 rows=210 width=16) (actual time=0.015..0.016 rows=1 loops=1)"                                                                                                                                                                                                                                                                                                                                                                                   
  "              Filter: (upload_requested_at < now())"                                                                                                                                                                                                                                                                                                                                                                                                                                               
  "              Buffers: shared hit=1"                                                                                                                                                                                                                                                                                                                                                                                                                                                               
  "  ->  Unique  (cost=2645430927.01..2691627577.01 rows=9239330000 width=16) (actual time=165111.984..165115.249 rows=0 loops=1)"                                                                                                                                                                                                                                                                                                                                                                    
  "        Buffers: shared hit=1497696 read=232824"                                                                                                                                                                                                                                                                                                                                                                                                                                                   
  "        I/O Timings: shared read=156340.591"                                                                                                                                                                                                                                                                                                                                                                                                                                                       
  "        ->  Sort  (cost=2645430927.01..2668529252.01 rows=9239330000 width=16) (actual time=165111.983..165115.248 rows=0 loops=1)"                                                                                                                                                                                                                                                                                                                                                                
  "              Sort Key: (((fil.value ->> 'fileId'::text))::uuid)"                                                                                                                                                                                                                                                                                                                                                                                                                                  
  "              Sort Method: quicksort  Memory: 25kB"                                                                                                                                                                                                                                                                                                                                                                                                                                                
  "              Buffers: shared hit=1497696 read=232824"                                                                                                                                                                                                                                                                                                                                                                                                                                             
  "              I/O Timings: shared read=156340.591"                                                                                                                                                                                                                                                                                                                                                                                                                                                 
  "              ->  Append  (cost=0.02..326590763.34 rows=9239330000 width=16) (actual time=165111.981..165115.245 rows=0 loops=1)"                                                                                                                                                                                                                                                                                                                                                                  
  "                    Buffers: shared hit=1497696 read=232824"                                                                                                                                                                                                                                                                                                                                                                                                                                       
  "                    I/O Timings: shared read=156340.591"                                                                                                                                                                                                                                                                                                                                                                                                                                           
  "                    ->  Nested Loop  (cost=0.02..62874337.85 rows=3090840000 width=16) (actual time=36509.936..36509.938 rows=0 loops=1)"                                                                                                                                                                                                                                                                                                                                                          
  "                          Buffers: shared hit=238372 read=53174"                                                                                                                                                                                                                                                                                                                                                                                                                                   
  "                          I/O Timings: shared read=34721.468"                                                                                                                                                                                                                                                                                                                                                                                                                                      
  "                          ->  Nested Loop  (cost=0.01..671081.85 rows=30908400 width=32) (actual time=36509.935..36509.936 rows=0 loops=1)"                                                                                                                                                                                                                                                                                                                                                        
  "                                Buffers: shared hit=238372 read=53174"                                                                                                                                                                                                                                                                                                                                                                                                                             
  "                                I/O Timings: shared read=34721.468"                                                                                                                                                                                                                                                                                                                                                                                                                                
  "                                ->  Seq Scan on sequence_entries  (cost=0.00..52913.84 rows=309084 width=966) (actual time=0.002..177.216 rows=309742 loops=1)"                                                                                                                                                                                                                                                                                                                                    
  "                                      Buffers: shared hit=49823"                                                                                                                                                                                                                                                                                                                                                                                                                                   
  "                                ->  Function Scan on jsonb_each cat  (cost=0.01..1.01 rows=100 width=32) (actual time=0.117..0.117 rows=0 loops=309742)"                                                                                                                                                                                                                                                                                                                                           
  "                                      Buffers: shared hit=188549 read=53174"                                                                                                                                                                                                                                                                                                                                                                                                                       
  "                                      I/O Timings: shared read=34721.468"                                                                                                                                                                                                                                                                                                                                                                                                                          
  "                          ->  Memoize  (cost=0.01..1.01 rows=100 width=32) (never executed)"                                                                                                                                                                                                                                                                                                                                                                                                       
  "                                Cache Key: cat.v"                                                                                                                                                                                                                                                                                                                                                                                                                                                  
  "                                Cache Mode: binary"                                                                                                                                                                                                                                                                                                                                                                                                                                                
  "                                ->  Function Scan on jsonb_array_elements fil  (cost=0.00..1.00 rows=100 width=32) (never executed)"                                                                                                                                                                                                                                                                                                                                                               
  "                    ->  Nested Loop  (cost=11917.49..125126475.49 rows=6148490000 width=16) (actual time=128602.042..128605.303 rows=0 loops=1)"                                                                                                                                                                                                                                                                                                                                                   
  "                          Buffers: shared hit=1259324 read=179650"                                                                                                                                                                                                                                                                                                                                                                                                                                 
  "                          I/O Timings: shared read=121619.123"                                                                                                                                                                                                                                                                                                                                                                                                                                     
  "                          ->  Nested Loop  (cost=11917.48..1388013.24 rows=61484900 width=32) (actual time=128602.041..128605.300 rows=0 loops=1)"                                                                                                                                                                                                                                                                                                                                                 
  "                                Buffers: shared hit=1259324 read=179650"                                                                                                                                                                                                                                                                                                                                                                                                                           
  "                                I/O Timings: shared read=121619.123"                                                                                                                                                                                                                                                                                                                                                                                                                               
  "                                ->  Gather  (cost=11917.47..158315.23 rows=614849 width=764) (actual time=61.485..366.351 rows=617729 loops=1)"                                                                                                                                                                                                                                                                                                                                                    
  "                                      Workers Planned: 2"                                                                                                                                                                                                                                                                                                                                                                                                                                          
  "                                      Workers Launched: 2"                                                                                                                                                                                                                                                                                                                                                                                                                                         
  "                                      Buffers: shared hit=133642"                                                                                                                                                                                                                                                                                                                                                                                                                                  
  "                                      ->  Parallel Hash Join  (cost=10917.47..95830.33 rows=256187 width=764) (actual time=56.310..279.466 rows=205910 loops=3)"                                                                                                                                                                                                                                                                                                                                   
  "                                            Hash Cond: ((sepd.accession = se.accession) AND (sepd.version = se.version))"                                                                                                                                                                                                                                                                                                                                                                          
  "                                            Buffers: shared hit=133642"                                                                                                                                                                                                                                                                                                                                                                                                                            
  "                                            ->  Parallel Seq Scan on sequence_entries_preprocessed_data sepd  (cost=0.00..83567.87 rows=256187 width=783) (actual time=0.007..87.833 rows=205910 loops=3)"                                                                                                                                                                                                                                                                                         
  "                                                  Buffers: shared hit=81006"                                                                                                                                                                                                                                                                                                                                                                                                                       
  "                                            ->  Parallel Hash  (cost=8985.69..8985.69 rows=128785 width=19) (actual time=55.291..55.291 rows=103247 loops=3)"                                                                                                                                                                                                                                                                                                                                      
  "                                                  Buckets: 524288  Batches: 1  Memory Usage: 21120kB"                                                                                                                                                                                                                                                                                                                                                                                              
  "                                                  Buffers: shared hit=52490"                                                                                                                                                                                                                                                                                                                                                                                                                       
  "                                                  ->  Parallel Index Only Scan using sequence_entries_pkey on sequence_entries se  (cost=0.42..8985.69 rows=128785 width=19) (actual time=0.082..20.626 rows=103247 loops=3)"                                                                                                                                                                                                                                                                      
  "                                                        Heap Fetches: 1924"                                                                                                                                                                                                                                                                                                                                                                                                                        
  "                                                        Buffers: shared hit=52490"                                                                                                                                                                                                                                                                                                                                                                                                                 
  "                                ->  Function Scan on jsonb_each cat_1  (cost=0.01..1.01 rows=100 width=32) (actual time=0.207..0.207 rows=0 loops=617729)"                                                                                                                                                                                                                                                                                                                                         
  "                                      Buffers: shared hit=1125682 read=179650"                                                                                                                                                                                                                                                                                                                                                                                                                     
  "                                      I/O Timings: shared read=121619.123"                                                                                                                                                                                                                                                                                                                                                                                                                         
  "                          ->  Memoize  (cost=0.01..1.01 rows=100 width=32) (never executed)"                                                                                                                                                                                                                                                                                                                                                                                                       
  "                                Cache Key: cat_1.v"                                                                                                                                                                                                                                                                                                                                                                                                                                                
  "                                Cache Mode: binary"                                                                                                                                                                                                                                                                                                                                                                                                                                                
  "                                ->  Function Scan on jsonb_array_elements fil_1  (cost=0.00..1.00 rows=100 width=32) (never executed)"                                                                                                                                                                                                                                                                                                                                                             
  "Planning:"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         
  "  Buffers: shared hit=310"                                                                                                                                                                                                                                                                                                                                                                                                                                                                         
  "Planning Time: 0.403 ms"                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
  "Execution Time: 165135.771 ms"                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

⏺ 165 seconds, almost entirely I/O. Here's what actually happened:

  The Real Story

  Zero actual file references exist in any sequence. Both jsonb_each calls returned 0 rows across ~930K invocations:

  jsonb_each cat   → rows=0  loops=309,742   (every sequence_entry)
  jsonb_each cat_1 → rows=0  loops=617,729   (every preprocessed_data row)

  Yet it still took 165 seconds because PostgreSQL had to read the entire JSONB blob for every row just to discover there's no files key. You can see it in the I/O:

  ┌───────────────────────────────────────────────┬──────────────────────────┬──────────┐
  │                    Branch                     │  Blocks read from disk   │ I/O time │
  ├───────────────────────────────────────────────┼──────────────────────────┼──────────┤
  │ sequence_entries jsonb_each                   │ 53,174 blocks (~415 MB)  │ 34.7s    │
  ├───────────────────────────────────────────────┼──────────────────────────┼──────────┤
  │ sequence_entries_preprocessed_data jsonb_each │ 179,650 blocks (~1.4 GB) │ 121.6s   │
  ├───────────────────────────────────────────────┼──────────────────────────┼──────────┤
  │ Total                                         │ ~1.8 GB                  │ 156.3s   │
  └───────────────────────────────────────────────┴──────────────────────────┴──────────┘
  
  The data wasn't in shared_buffers — so 156 of the 165 seconds were just waiting on disk reads. CPU work was trivial (sorting 0 rows is instant).

  The Structural Problem
  
  The query has no way to skip a row without reading its JSONB. To check submitted_data->'files', it must load the whole submitted_data blob. With 309K rows at ~966 bytes each (compressed; uncompressed likely much larger), that's a guaranteed full-scan of every JSONB column every time this runs.

  A targeted fix would be to add a GIN index and pre-filter the rows fed into the lateral join:

  -- use the default jsonb_ops operator class: jsonb_path_ops does not support the ? (key exists) operator
  CREATE INDEX ON sequence_entries USING GIN (submitted_data);

  -- then in the CTE:
  FROM sequence_entries,
  LATERAL jsonb_each(...)
  WHERE submitted_data ? 'files'   -- index hit, skips rows with no files key

  That would reduce the 309K jsonb_each calls to however many sequences actually have a files key — likely near zero right now, making the whole query instant.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@corneliusroemer and I also discussed one potential race condition: if a user uploads a file, waits for about a month (or however long the grace period is), and then submits data referencing that file at the exact moment this transaction runs, the fileId will end up in the list of files to delete, but the submission will not be rejected.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't foresee this happening, but it might be good to somehow flag files as "marked for deletion" after running this job, reject submissions that reference flagged files, and then rerun this transaction before actually deleting anything.

val sql = """
-- check for files for which an upload was requested > threshold days ago

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: we're reading through the whole database for this to find referenced files.

If this becomes perf problematic or otherwise we might want to consider normalizing files with proper foreign keys etc to allow full power of SQL.

-- but are not referenced by a submission. For this, check the submitted_data
-- and processed_data jsonb objects (but not archive_of_submitted_data)
Comment thread
anna-parker marked this conversation as resolved.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lets check the file is also not in the archive for consistency

@maverbiest maverbiest Jun 16, 2026

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alright, I'll make it scan the archive as well, although I expect it will make the query take even longer, unfortunately.

The more I think about it the more I become convinced we should take files out of the jsonb and add join tables from files to se and sepd, storing the files type (raw_reads, annotation, etc.) as a column on the files table. That's a pretty big refactor though so maybe something for a follow-up PR

WITH referenced AS (
Comment thread
anna-parker marked this conversation as resolved.
-- fetch ids for files uploaded by users and referenced in submissions
SELECT (fil->>'fileId')::uuid AS file_id
FROM sequence_entries,
LATERAL jsonb_each(COALESCE(NULLIF(submitted_data->'files', 'null'::jsonb),'{}'::jsonb)) AS cat(k,v),
LATERAL jsonb_array_elements(cat.v) AS fil
UNION
-- also need to check processed_data since preprocessing
-- can create files that are never referenced in submissions
SELECT (fil->>'fileId')::uuid AS file_id
FROM sequence_entries_preprocessed_data sepd
JOIN sequence_entries se
ON se.accession = sepd.accession
AND se.version = sepd.version,
LATERAL jsonb_each(COALESCE(NULLIF(sepd.processed_data->'files', 'null'::jsonb),'{}'::jsonb)) AS cat(k,v),
LATERAL jsonb_array_elements(cat.v) AS fil
)
SELECT f.id FROM files f
LEFT JOIN referenced r ON r.file_id = f.id
WHERE r.file_id IS NULL
AND f.upload_requested_at < ?;
""".trimIndent()
return transaction {
exec(
sql,
listOf(KotlinLocalDateTimeColumnType() to threshold),

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm...

Suggested change
listOf(KotlinLocalDateTimeColumnType() to threshold),
listOf(Pair(KotlinLocalDateTimeColumnType(), threshold)),

explicitStatementType = StatementType.SELECT,
) { rs ->
buildSet<FileId> {
while (rs.next()) {
add(rs.getObject("id", UUID::class.java))
}
}
} ?: emptySet()
}
}

/**
* Return the subset of file IDs for which the file size hasn't been checked yet or
* no file has been uploaded yet (and therefore there's no file size).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import software.amazon.awssdk.services.s3.model.CompleteMultipartUploadRequest
import software.amazon.awssdk.services.s3.model.CompletedMultipartUpload
import software.amazon.awssdk.services.s3.model.CompletedPart
import software.amazon.awssdk.services.s3.model.CreateMultipartUploadRequest
import software.amazon.awssdk.services.s3.model.DeleteObjectRequest
import software.amazon.awssdk.services.s3.model.GetObjectRequest
import software.amazon.awssdk.services.s3.model.HeadObjectRequest
import software.amazon.awssdk.services.s3.model.PutObjectRequest
Expand Down Expand Up @@ -176,6 +177,18 @@ class S3Service(private val s3Config: S3Config) {
Unit
}

/**
 * Issues an S3 `DeleteObject` request for the object stored under the path of [fileId]
 * in the configured bucket.
 *
 * The call runs inside [s3ErrorMapping], so S3 errors surface as the exceptions that
 * wrapper produces.
 */
fun deleteFile(fileId: FileId) = s3ErrorMapping {
    val bucketConfig = getS3BucketConfig()
    val deleteRequest = DeleteObjectRequest.builder()
        .bucket(bucketConfig.bucket)
        .key(getFileIdPath(fileId))
        .build()
    s3Client.deleteObject(deleteRequest)
    // Discard the SDK response so the function's result stays Unit.
    Unit
}

private fun assertIsEnabled() {
if (!s3Config.enabled) {
throw IllegalStateException("S3 is not enabled")
Expand Down Expand Up @@ -258,6 +271,11 @@ fun <T> s3ErrorMapping(block: () -> T): T {
"by part number.",
)

"NoSuchKey" -> UnprocessableEntityException(

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This reminds me there are no tests for non-happy-path cases like if we're trying to delete something that doesn't exist.

"NoSuchKey: The referenced file does not exist in storage. Uploaded files that are not " +
"referenced by a submission for too long are cleaned up automatically.",
)

else -> RuntimeException("Unexpected S3 error: ${e.awsErrorDetails().errorCode()}")
}
}
Expand Down

@corneliusroemer corneliusroemer Jun 15, 2026

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason this is under service/submission? To me this seems more service/files related.

Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package org.loculus.backend.service.submission

import kotlinx.datetime.DateTimeUnit
import kotlinx.datetime.minus
import kotlinx.datetime.toLocalDateTime
import org.loculus.backend.config.BackendSpringProperty
import org.loculus.backend.log.AuditLogger
import org.loculus.backend.service.files.FilesDatabaseService
import org.loculus.backend.service.files.S3Service
import org.loculus.backend.utils.DateProvider
import org.springframework.beans.factory.annotation.Value
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty
import org.springframework.scheduling.annotation.Scheduled
import org.springframework.stereotype.Component
import java.util.UUID
import java.util.concurrent.TimeUnit
import kotlin.math.max

private val log = mu.KotlinLogging.logger {}

@Component
@ConditionalOnProperty("loculus.s3.enabled", havingValue = "true")
class S3GarbageCollectionTask(
private val filesDatabaseService: FilesDatabaseService,
private val s3Service: S3Service,
private val dateProvider: DateProvider,
private val auditLogger: AuditLogger,
@Value("\${${BackendSpringProperty.S3_GC_GRACE_PERIOD_MINUTES}}") private val gracePeriod: Int,
@Value("\${${BackendSpringProperty.S3_GC_ENABLED}}") private val deleteOrphans: Boolean,
) {

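/**
* Finds files whose upload was requested more than `loculus.s3.gc-grace-period-minutes`
* ago and that are not referenced in submitted_data or processed_data. When
* `loculus.s3.gc-enabled` is true, each such file is deleted from S3 and its database
* entry is removed; otherwise the orphaned file IDs are only logged (dry run).
*/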
@Scheduled(
initialDelayString = "\${${BackendSpringProperty.S3_GC_INITIAL_DELAY_MINUTES}}",
fixedDelay = 60 * 24,
timeUnit = TimeUnit.MINUTES,
)
fun task() {
// `gracePeriod` must be at least 1 or files produced by preprocessing may be
// garbage collected before they're attached to sequence entries
val gracePeriod = max(gracePeriod, 1)
log.info {
"Running S3 garbage collection task to clean up orphan files at least $gracePeriod " +
"minutes old (garbageCollectionEnabled = $deleteOrphans)"
}

val threshold = dateProvider.getCurrentInstant()
.minus(gracePeriod, DateTimeUnit.MINUTE, DateProvider.timeZone)
.toLocalDateTime(DateProvider.timeZone)
Comment on lines +50 to +52

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this the same function daysAgo that's in testHelpers?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Almost:

fun daysAgo(days: Long): LocalDateTime = Clock.System.now()
    .minus(days, DateTimeUnit.DAY, DateProvider.timeZone)
    .toLocalDateTime(DateProvider.timeZone)

Can probably be unified

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might be using dateprovider for a reason - so maybe shouldn't be replaced by Clock.System.now() in tests

val orphans = filesDatabaseService.getOrphanedFileIds(threshold)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not 100% certain, but this whole task looks like it has no transaction isolation at all. Considering the 2 backends racing, this is probably not a good idea.


if (!deleteOrphans) {
log.info { "S3 garbage collection task would have deleted ${orphans.size} files: $orphans" }
return
}

var deleteFailures = 0
orphans.forEach { fileId: UUID ->
try {
s3Service.deleteFile(fileId)
filesDatabaseService.deleteFileEntry(fileId)
Comment on lines +63 to +64

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Transactionally this might be some of the most critical code here - should we really wrap it in a single try/catch? No retry, no recovery if deletion fails at step 1? Or at step 2?

Off the top of my head, for example, we might want to handle s3 deletion failure by setting a flag in our db about it. Vice versa, if s3 deletion succeeds but then db deletion doesn't what do we do? In the future deletion might fail every time as the file is no longer there? So we end in unrecoverable state?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An actual failure mode that can happen is that we have 2 backends in production - they might happen to cross and run at the same time racing to deletion. We might want to add a lock for this, maybe a table that tracks last cleanup time and if cleanup is in progress so we don't cleanup at the same time twice.

@anna-parker anna-parker Jun 15, 2026

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would be good to check this out... chatGPT tells me the s3 deletion should not raise an error if the file is missing which means we would still try to remove the file from the db:

Usually deleting a missing object from S3 does not raise an error.

DeleteObject in S3 is idempotent: if the key does not exist, S3 still returns a successful response (typically HTTP 204 No Content).

But then Im not sure why we have code to handle NoSuchKey

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Update: NoSuchKey is only thrown when trying to get a file that is missing

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, trying to delete an already-deleted file should be fine as far as I'm aware. The NoSuchKey errors happen when trying to access a deleted file, which came up when GC was deleting files generated by prepro before they were attached to sequence entries.

} catch (e: Exception) {
log.warn("Failed to delete $fileId", e)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thinking about it we definitely at the very least want different error messages for failing to delete file from s3 and from postgres db.

deleteFailures++
}
Comment thread
maverbiest marked this conversation as resolved.
}

if (orphans.isNotEmpty()) {
log.info {
"S3 garbage collection task deleted ${orphans.size - deleteFailures} orphan(s) not referenced by a " +
"submission after $gracePeriod minutes"
}
auditLogger
.log(
"CLEANUP",
"S3 garbage collection task deleted ${orphans.size - deleteFailures} orphan(s) " +
"not referenced by a submission after $gracePeriod minutes",
)

if (deleteFailures > 0) {
log.warn {
"S3 garbage collection task unsuccessfully attempted to delete $deleteFailures orphan file(s)"
}
}
} else {
log.info { "S3 garbage collection task identified no orphan files on S3" }
}
}
}
3 changes: 3 additions & 0 deletions backend/src/main/resources/application.properties
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ loculus.stream.batch-size=1000
loculus.debug-mode=false

loculus.s3.enabled=false
loculus.s3.gc-enabled=false
loculus.s3.gc-grace-period-minutes=1440
loculus.s3.gc-initial-delay-minutes=15
loculus.s3.bucket.endpoint=
loculus.s3.bucket.bucket=
loculus.s3.bucket.region=
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CREATE INDEX IF NOT EXISTS files_upload_requested_at_idx ON files (upload_requested_at);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The real index we'd need is knowing fileIds referenced by se and sepd - this one here doesn't harm but is not very helpful either..

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we do select based on upload_requested_at so it should help some :-)

26 changes: 26 additions & 0 deletions backend/src/test/kotlin/org/loculus/backend/service/TestHelpers.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package org.loculus.backend.service

import kotlinx.datetime.DateTimeUnit
import kotlinx.datetime.LocalDateTime
import kotlinx.datetime.minus
import kotlinx.datetime.toLocalDateTime
import org.jetbrains.exposed.sql.insert
import org.jetbrains.exposed.sql.transactions.transaction
import org.loculus.backend.service.files.FilesTable
import org.loculus.backend.utils.DateProvider
import java.util.UUID
import kotlin.time.Clock

/**
 * Test helper: inserts one row into [FilesTable] inside an Exposed `transaction` block.
 *
 * @param id value for the file ID column
 * @param groupId value for the group ID column
 * @param requestedAt value for the upload-requested-at column
 * @param uploader value for the uploader column; defaults to "testuser"
 *
 * The row is always written with `multipartCompleted` set to true.
 */
fun insertFile(id: UUID, groupId: Int, requestedAt: LocalDateTime, uploader: String = "testuser") = transaction {
    FilesTable.insert { row ->
        row[idColumn] = id
        row[groupIdColumn] = groupId
        row[uploaderColumn] = uploader
        row[uploadRequestedAtColumn] = requestedAt
        row[multipartCompleted] = true
    }
}

/**
 * Test helper: returns the system clock's current instant shifted back by [days] days,
 * expressed as a [LocalDateTime] in [DateProvider.timeZone].
 */
fun daysAgo(days: Long): LocalDateTime {
    val now = Clock.System.now()
    val zone = DateProvider.timeZone
    val shifted = now.minus(days, DateTimeUnit.DAY, zone)
    return shifted.toLocalDateTime(zone)
}
Loading
Loading