diff --git a/modules/nf-core/genomeuploader/environment.yml b/modules/nf-core/genomeuploader/environment.yml
new file mode 100644
index 000000000000..86b49c57cc83
--- /dev/null
+++ b/modules/nf-core/genomeuploader/environment.yml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - "bioconda::genome-uploader=2.5.0"
diff --git a/modules/nf-core/genomeuploader/main.nf b/modules/nf-core/genomeuploader/main.nf
new file mode 100644
index 000000000000..6f12ed550156
--- /dev/null
+++ b/modules/nf-core/genomeuploader/main.nf
@@ -0,0 +1,64 @@
+process GENOMEUPLOADER {
+    tag "$meta.id"
+    label 'process_single'
+
+    // Request the ENA_WEBIN / ENA_WEBIN_PASSWORD Nextflow secrets only when they are defined
+    secret secrets.ENA_WEBIN ? "ENA_WEBIN" : ""
+    secret secrets.ENA_WEBIN_PASSWORD ? "ENA_WEBIN_PASSWORD" : ""
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/genome-uploader:2.5.0--pyhdfd78af_0':
+        'biocontainers/genome-uploader:2.5.0--pyhdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(metadata_tsv)
+    // Not passed on the command line: staged under genomes/ so the genome_path entries in metadata_tsv resolve
+    path(fastas, stageAs: "genomes/*")
+
+    output:
+    tuple val(meta), path("upload_output/*")                                        , emit: upload_output_dir
+    tuple val(meta), path("upload_output/MAG_upload/submission.xml")                , emit: submission
+    tuple val(meta), path("upload_output/MAG_upload/registered_MAGs_${prefix}.tsv") , emit: registered_mags
+    tuple val(meta), path("upload_output/MAG_upload/genome_samples.xml")            , emit: genome_samples
+    tuple val(meta), path("upload_output/MAG_upload/manifests_${prefix}/*.manifest"), emit: manifests
+    path "versions.yml"                                                             , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    genome_upload \\
+        $args \\
+        --upload_study "${meta.study_accession}" \\
+        --centre_name "${meta.center_name}" \\
+        --genome_info ${metadata_tsv} \\
+        --out upload_output
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        genome_upload: \$( genome_upload --version | sed 's/genome_uploader //' )
+        ena-webin-cli: \$( ena-webin-cli -version )
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    mkdir -p upload_output/MAG_upload/manifests_${prefix}
+    touch upload_output/MAG_upload/submission.xml
+    touch upload_output/MAG_upload/registered_MAGs_${prefix}.tsv
+    touch upload_output/MAG_upload/genome_samples.xml
+    touch upload_output/MAG_upload/manifests_${prefix}/test_mag.manifest
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        genome_upload: \$( genome_upload --version | sed 's/genome_uploader //' )
+        ena-webin-cli: \$( ena-webin-cli -version )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/genomeuploader/meta.yml b/modules/nf-core/genomeuploader/meta.yml
new file mode 100644
index 000000000000..1069a4582059
--- /dev/null
+++ b/modules/nf-core/genomeuploader/meta.yml
@@ -0,0 +1,109 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: genomeuploader
+description: Upload genome bins and MAGs in FASTA format to ENA (European Nucleotide Archive)
+keywords:
+  - archiving
+  - ena
+  - mags
+  - bins
+  - upload
+tools:
+  - genomeuploader:
+      description: Python script to upload bins and MAGs in fasta format to ENA (European Nucleotide Archive).
+      homepage: https://github.com/EBI-Metagenomics/genome_uploader
+      documentation: https://github.com/EBI-Metagenomics/genome_uploader
+      tool_dev_url: https://github.com/EBI-Metagenomics/genome_uploader
+      licence: ["Apache-2.0"]
+      identifier: ""
+
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information, the study_accession to upload the data, and the center name
+          e.g. `[ id:'test', study_accession:'ERP159782', center_name:'nf-core' ]`
+    - metadata_tsv:
+        type: file
+        description: TSV file containing metadata for genomes/bins to upload
+        pattern: "*.tsv"
+        ontologies:
+          - edam: http://edamontology.org/format_3475 # TSV
+  - fastas:
+      type: file
+      description: FASTA files containing genome/bin sequences to upload
+      pattern: "*.{fasta,fa,fna,fasta.gz,fa.gz,fna.gz}"
+      ontologies:
+        - edam: http://edamontology.org/format_1929 # FASTA
+        - edam: http://edamontology.org/format_3989 # GZIP format
+
+output:
+  upload_output_dir:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'test', study_accession:'ERP159782', center_name:'nf-core' ]`
+      - upload_output/*:
+          type: directory
+          description: Directory containing all upload outputs
+          pattern: "upload_output/*"
+          ontologies: []
+  submission:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'test', study_accession:'ERP159782', center_name:'nf-core' ]`
+      - upload_output/MAG_upload/submission.xml:
+          type: file
+          description: ENA submission XML file
+          pattern: "upload_output/MAG_upload/submission.xml"
+          ontologies:
+            - edam: http://edamontology.org/format_2332 # XML
+  registered_mags:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'test', study_accession:'ERP159782', center_name:'nf-core' ]`
+      - upload_output/MAG_upload/registered_MAGs_${prefix}.tsv:
+          type: file
+          description: TSV file mapping genome names to ENA accession numbers
+          pattern: "upload_output/MAG_upload/registered_MAGs_*.tsv"
+          ontologies:
+            - edam: http://edamontology.org/format_3475 # TSV
+  genome_samples:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'test', study_accession:'ERP159782', center_name:'nf-core' ]`
+      - upload_output/MAG_upload/genome_samples.xml:
+          type: file
+          description: ENA genome samples XML file
+          pattern: "upload_output/MAG_upload/genome_samples.xml"
+          ontologies:
+            - edam: http://edamontology.org/format_2332 # XML
+  manifests:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'test', study_accession:'ERP159782', center_name:'nf-core' ]`
+      - upload_output/MAG_upload/manifests_${prefix}/*.manifest:
+          type: file
+          description: ENA manifest files for genome upload
+          pattern: "upload_output/MAG_upload/manifests_*/*.manifest"
+          ontologies: []
+  versions:
+    - versions.yml:
+        type: file
+        description: File containing software versions
+        pattern: "versions.yml"
+        ontologies:
+          - edam: http://edamontology.org/format_3750 # YAML
+
+authors:
+  - "@mberacochea"
+maintainers:
+  - "@mberacochea"
diff --git a/modules/nf-core/genomeuploader/tests/main.nf.test b/modules/nf-core/genomeuploader/tests/main.nf.test
new file mode 100644
index 000000000000..13a6758b9e35
--- /dev/null
+++ b/modules/nf-core/genomeuploader/tests/main.nf.test
@@ -0,0 +1,100 @@
+nextflow_process {
+
+    name "Test Process GENOMEUPLOADER"
+    script "../main.nf"
+    config "./nextflow.config"
+    process "GENOMEUPLOADER"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "genomeuploader"
+
+    test("genome - fasta - gz") {
+
+        when {
+            process {
+                """
+                // This module uses a TSV as input, which contains the paths to the genomes/bins to upload
+                // That is why it takes a second input that accepts the paths of all the FASTA files (MAGs and bins) to upload
+                // and that is why genome_path starts with genomes/ in the metadata TSV (the FASTAs are staged there)
+                def metadata_content = [
+                    ["genome_name", "genome_path", "accessions", "assembly_software", "binning_software", "binning_parameters", "stats_generation_software", "completeness", "contamination", "genome_coverage", "metagenome", "co-assembly", "broad_environment", "local_environment", "environmental_medium", "rRNA_presence", "NCBI_lineage"].join("\t"),
+                    ["test_mag", "genomes/GCA_002688505.1_ASM268850v1_genomic.fna.gz", "ERR4647712", "SPAdes_4.1.0", "nf-core/mag", "default", "CheckM2_1.1.0", "90.0", "1.0", "10.0", "chicken gut metagenome", "False", "chicken gut", "chicken gut mucosa", "chicken gut mucosa", "True", "d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__Lactobacillus_crispatus"].join("\t")
+                ].join("\\n")
+
+                def metadata_file = file('genomes_metadata.tsv')
+                metadata_file.text = metadata_content
+
+                input[0] = [
+                    [
+                        id: 'test',
+                        study_accession: 'ERP159782',
+                        center_name: 'nf-core'
+                    ],
+                    metadata_file
+                ]
+                input[1] = file('https://github.com/nf-core/test-datasets/raw/refs/heads/magmap/testdata/GCA_002688505.1_ASM268850v1_genomic.fna.gz', checkIfExists: true)
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    process.out.submission,
+                    process.out.versions,
+                    // Check registered_MAGs contains expected genome name (starts with test_mag)
+                    file(process.out.registered_mags.get(0).get(1)).readLines()[0].split('\t')[0].startsWith('test_mag'),
+                    // Check genome_samples.xml contains expected elements
+                    // NOTE(review): the next line is truncated in this patch - the text between `it.contains('` and ` 0,` is missing.
+                    // The hunk header declares 100 lines but only 97 of the original lines survive, and the snapshot records five booleans.
+                    // Restore the original assertions from the author's branch before applying; as written this is not valid Groovy.
+                    file(process.out.genome_samples.get(0).get(1)).readLines().any { it.contains(' 0,
+                    file(process.out.manifests.get(0).get(1)).readLines().any { it.contains('STUDY') }
+                    ).match() }
+            )
+        }
+
+    }
+
+    test("genome - fasta - gz - stub") {
+
+        options "-stub"
+
+        when {
+            process {
+                """
+                // This module uses a TSV as input, which contains the paths to the genomes/bins to upload
+                // That is why it takes a second input that accepts the paths of all the FASTA files (MAGs and bins) to upload
+                // and that is why genome_path starts with genomes/ in the metadata TSV (the FASTAs are staged there)
+                def metadata_content = [
+                    ["genome_name", "genome_path", "accessions", "assembly_software", "binning_software", "binning_parameters", "stats_generation_software", "completeness", "contamination", "genome_coverage", "metagenome", "co-assembly", "broad_environment", "local_environment", "environmental_medium", "rRNA_presence", "NCBI_lineage"].join("\t"),
+                    ["test_mag", "genomes/GCA_002688505.1_ASM268850v1_genomic.fna.gz", "ERR4647712", "SPAdes_4.1.0", "nf-core/mag", "default", "CheckM2_1.1.0", "90.0", "1.0", "10.0", "chicken gut metagenome", "False", "chicken gut", "chicken gut mucosa", "chicken gut mucosa", "True", "d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__Lactobacillus_crispatus"].join("\t")
+                ].join("\\n")
+
+                def metadata_file = file('genomes_metadata.tsv')
+                metadata_file.text = metadata_content
+
+                input[0] = [
+                    [
+                        id: 'test',
+                        study_accession: 'ERP159782',
+                        center_name: 'nf-core'
+                    ],
+                    metadata_file
+                ]
+                input[1] = file('https://github.com/nf-core/test-datasets/raw/refs/heads/magmap/testdata/GCA_002688505.1_ASM268850v1_genomic.fna.gz', checkIfExists: true)
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+
+}
diff --git a/modules/nf-core/genomeuploader/tests/main.nf.test.snap b/modules/nf-core/genomeuploader/tests/main.nf.test.snap
new file mode 100644
index 000000000000..9fcd7cf3ae73
--- /dev/null
+++ b/modules/nf-core/genomeuploader/tests/main.nf.test.snap
@@ -0,0 +1,29 @@
+{
+    "genome - fasta - gz": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test",
+                        "study_accession": "ERP159782",
+                        "center_name": "nf-core"
+                    },
+                    "submission.xml:md5,705c441ca687726152f8540dab4bb322"
+                ]
+            ],
+            [
+                "versions.yml:md5,36d7baf63c1face41d2fc0edd6263944"
+            ],
+            true,
+            true,
+            false,
+            true,
+            true
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "25.04.7"
+        },
+        "timestamp": "2025-10-08T18:33:31.006282"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/genomeuploader/tests/nextflow.config b/modules/nf-core/genomeuploader/tests/nextflow.config
new file mode 100644
index 000000000000..49d0320ab8c4
--- /dev/null
+++ b/modules/nf-core/genomeuploader/tests/nextflow.config
@@ -0,0 +1,5 @@
+process {
+    withName: 'GENOMEUPLOADER' {
+        ext.args = '--mags'
+    }
+}