From 6c7aeb21eca4b132c238569c7dc483343c7a1b92 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Sun, 23 Nov 2025 14:42:26 +0100 Subject: [PATCH 1/7] chore(ci): log basic details of downloaded ncbi datasets It doesn't hurt to log the md5, file sizes, number of sequences, average length, etc. --- .github/workflows/datasets-mirror.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/datasets-mirror.yml b/.github/workflows/datasets-mirror.yml index 17f207b1d7..c387de6c37 100644 --- a/.github/workflows/datasets-mirror.yml +++ b/.github/workflows/datasets-mirror.yml @@ -24,13 +24,17 @@ jobs: - uses: mamba-org/setup-micromamba@v2 with: environment-name: datasets - create-args: ncbi-datasets-cli s3cmd - - name: Download NCBI Dataset + create-args: ncbi-datasets-cli s3cmd seqkit + - name: Download NCBI Dataset and create tzst archive shell: bash -l {0} run: | datasets download virus genome taxon ${{ inputs.taxon_id }} --no-progressbar --filename ${{ inputs.taxon_id }}.zip unzip -o ${{ inputs.taxon_id }}.zip tar -I 'zstd -T0 -18' -cvf ${{ inputs.taxon_id }}.tar.zst ncbi_dataset + cat md5sum.txt + find ncbi_dataset -type f -exec stat -c "%s %n" {} \; + stat -c "%s %n" ${{ inputs.taxon_id }}.zip ${{ inputs.taxon_id }}.tar.zst + seqkit stats ncbi_dataset/data/genomic.fna - name: Create S3cmd config run: | cat < ~/.s3cfg From 3cc853ce2fac77fc4b07969e01b2ccd5a50bbe3e Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Sun, 23 Nov 2025 14:44:29 +0100 Subject: [PATCH 2/7] Run prio 2 with 1hr offset from 2-hourly prio 1 to not coincide --- .github/workflows/datasets-mirror-priority-2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/datasets-mirror-priority-2.yml b/.github/workflows/datasets-mirror-priority-2.yml index 54c89375e2..cbf59cb175 100644 --- a/.github/workflows/datasets-mirror-priority-2.yml +++ b/.github/workflows/datasets-mirror-priority-2.yml @@ -2,7 +2,7 @@ name: Mirror NCBI Datasets (Priority 2 
- Daily) on: workflow_dispatch: # Allows manual triggering to re-run all priority-2 taxa schedule: - - cron: '12 2 * * *' # Runs daily at 02:12 UTC + - cron: '42 2 * * *' # Runs daily at 02:12 UTC jobs: mirror: strategy: From c0148439f9365b6e61459d69c13f58170aac8c93 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Sun, 23 Nov 2025 14:44:29 +0100 Subject: [PATCH 3/7] Run prio 2 with a 1 hr offset from the 2-hourly prio 1 so the runs do not coincide Also add the conda-forge and bioconda channels to the micromamba create-args. --- .github/workflows/datasets-mirror-priority-2.yml | 2 +- .github/workflows/datasets-mirror.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/datasets-mirror-priority-2.yml b/.github/workflows/datasets-mirror-priority-2.yml index cbf59cb175..fdd1376cd8 100644 --- a/.github/workflows/datasets-mirror-priority-2.yml +++ b/.github/workflows/datasets-mirror-priority-2.yml @@ -2,7 +2,7 @@ name: Mirror NCBI Datasets (Priority 2 - Daily) on: workflow_dispatch: # Allows manual triggering to re-run all priority-2 taxa schedule: - - cron: '42 2 * * *' # Runs daily at 02:12 UTC + - cron: '12 3 * * *' # Runs daily at 03:12 UTC jobs: mirror: strategy: diff --git a/.github/workflows/datasets-mirror.yml b/.github/workflows/datasets-mirror.yml index c387de6c37..0ae25fc662 100644 --- a/.github/workflows/datasets-mirror.yml +++ b/.github/workflows/datasets-mirror.yml @@ -24,7 +24,7 @@ jobs: - uses: mamba-org/setup-micromamba@v2 with: environment-name: datasets - create-args: ncbi-datasets-cli s3cmd seqkit + create-args: -c conda-forge -c bioconda ncbi-datasets-cli s3cmd seqkit - name: Download NCBI Dataset and create tzst archive shell: bash -l {0} run: | From 37ecba54aea6a76c630c4e0098c2a693746d7938 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Sun, 23 Nov 2025 14:49:32 +0100 Subject: [PATCH 4/7] Nicer log --- .github/workflows/datasets-mirror.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/datasets-mirror.yml 
b/.github/workflows/datasets-mirror.yml index 0ae25fc662..37d08b2080 100644 --- a/.github/workflows/datasets-mirror.yml +++ b/.github/workflows/datasets-mirror.yml @@ -29,11 +29,10 @@ jobs: shell: bash -l {0} run: | datasets download virus genome taxon ${{ inputs.taxon_id }} --no-progressbar --filename ${{ inputs.taxon_id }}.zip - unzip -o ${{ inputs.taxon_id }}.zip - tar -I 'zstd -T0 -18' -cvf ${{ inputs.taxon_id }}.tar.zst ncbi_dataset - cat md5sum.txt - find ncbi_dataset -type f -exec stat -c "%s %n" {} \; - stat -c "%s %n" ${{ inputs.taxon_id }}.zip ${{ inputs.taxon_id }}.tar.zst + unzip -o ${{ inputs.taxon_id }}.zip; echo + tar -I 'zstd -T0 -18' -cvf ${{ inputs.taxon_id }}.tar.zst ncbi_dataset; echo + cat md5sum.txt; echo + ls -lh ${{ inputs.taxon_id }}.* ncbi_dataset/data/*; echo seqkit stats ncbi_dataset/data/genomic.fna - name: Create S3cmd config run: | From 6c53e2ec0a78b06bbc28f4d84de44c9a287e4d0a Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Sun, 23 Nov 2025 14:52:12 +0100 Subject: [PATCH 5/7] Try again: use echo "" for the blank log separators --- .github/workflows/datasets-mirror.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/datasets-mirror.yml b/.github/workflows/datasets-mirror.yml index 37d08b2080..4fbed94800 100644 --- a/.github/workflows/datasets-mirror.yml +++ b/.github/workflows/datasets-mirror.yml @@ -29,10 +29,10 @@ jobs: shell: bash -l {0} run: | datasets download virus genome taxon ${{ inputs.taxon_id }} --no-progressbar --filename ${{ inputs.taxon_id }}.zip - unzip -o ${{ inputs.taxon_id }}.zip; echo - tar -I 'zstd -T0 -18' -cvf ${{ inputs.taxon_id }}.tar.zst ncbi_dataset; echo - cat md5sum.txt; echo - ls -lh ${{ inputs.taxon_id }}.* ncbi_dataset/data/*; echo + unzip -o ${{ inputs.taxon_id }}.zip; echo "" + tar -I 'zstd -T0 -18' -cvf ${{ inputs.taxon_id }}.tar.zst ncbi_dataset; echo "" + cat md5sum.txt; echo "" + ls -lh ${{ inputs.taxon_id }}.* ncbi_dataset/data/*; echo "" seqkit stats ncbi_dataset/data/genomic.fna - name: 
Create S3cmd config run: | From 64ff17cc1f88e80f802a66ab76ef250d28d160a7 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Sun, 23 Nov 2025 14:54:12 +0100 Subject: [PATCH 6/7] Try again: use printf "\n" for the blank log separators --- .github/workflows/datasets-mirror.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/datasets-mirror.yml b/.github/workflows/datasets-mirror.yml index 4fbed94800..6f69d7998d 100644 --- a/.github/workflows/datasets-mirror.yml +++ b/.github/workflows/datasets-mirror.yml @@ -29,10 +29,10 @@ jobs: shell: bash -l {0} run: | datasets download virus genome taxon ${{ inputs.taxon_id }} --no-progressbar --filename ${{ inputs.taxon_id }}.zip - unzip -o ${{ inputs.taxon_id }}.zip; echo "" - tar -I 'zstd -T0 -18' -cvf ${{ inputs.taxon_id }}.tar.zst ncbi_dataset; echo "" - cat md5sum.txt; echo "" - ls -lh ${{ inputs.taxon_id }}.* ncbi_dataset/data/*; echo "" + unzip -o ${{ inputs.taxon_id }}.zip; printf "\n" + tar -I 'zstd -T0 -18' -cvf ${{ inputs.taxon_id }}.tar.zst ncbi_dataset; printf "\n" + cat md5sum.txt; printf "\n" + ls -lh ${{ inputs.taxon_id }}.* ncbi_dataset/data/*; printf "\n" seqkit stats ncbi_dataset/data/genomic.fna - name: Create S3cmd config run: | From 477eb02d8bd8128a78d216added655ca93cde7c8 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Sun, 23 Nov 2025 14:55:49 +0100 Subject: [PATCH 7/7] Never mind: drop the blank log separators --- .github/workflows/datasets-mirror.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/datasets-mirror.yml b/.github/workflows/datasets-mirror.yml index 6f69d7998d..fc9be6b73d 100644 --- a/.github/workflows/datasets-mirror.yml +++ b/.github/workflows/datasets-mirror.yml @@ -29,10 +29,10 @@ jobs: shell: bash -l {0} run: | datasets download virus genome taxon ${{ inputs.taxon_id }} --no-progressbar --filename ${{ inputs.taxon_id }}.zip - unzip -o ${{ inputs.taxon_id }}.zip; printf "\n" - tar -I 'zstd -T0 -18' -cvf ${{ inputs.taxon_id }}.tar.zst ncbi_dataset; printf "\n" - cat 
md5sum.txt; printf "\n" - ls -lh ${{ inputs.taxon_id }}.* ncbi_dataset/data/*; printf "\n" + unzip -o ${{ inputs.taxon_id }}.zip + tar -I 'zstd -T0 -18' -cvf ${{ inputs.taxon_id }}.tar.zst ncbi_dataset + cat md5sum.txt + ls -lh ${{ inputs.taxon_id }}.* ncbi_dataset/data/* seqkit stats ncbi_dataset/data/genomic.fna - name: Create S3cmd config run: |