diff --git a/CITATIONS.md b/CITATIONS.md index 77b8cea6..6baeeede 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -102,6 +102,10 @@ > Alcock, B. P., Huynh, W., Chalil, R., Smith, K. W., Raphenya, A. R., Wlodarski, M. A., Edalatmand, A., Petkau, A., Syed, S. A., Tsang, K. K., Baker, S. J. C., Dave, M., McCarthy, M. C., Mukiri, K. M., Nasir, J. A., Golbon, B., Imtiaz, H., Jiang, X., Kaur, K., Kwong, M., Liang, Z. C., Niu, K. C., Shan, P., Yang, J. Y. J., Gray, K. L., Hoad, G. R., Jia, B., Bhando, T., Carfrae, L. A., Farha, M. A., French, S., Gordzevich, R., Rachwalski, K., Tu, M. M., Bordeleau, E., Dooley, D., Griffiths, E., Zubyk, H. L., Brown, E. D., Maguire, F., Beiko, R. G., Hsiao, W. W. L., Brinkman F. S. L., Van Domselaar, G., McArthur, A. G. (2023). CARD 2023: expanded curation, support for machine learning, and resistome prediction at the Comprehensive Antibiotic Resistance Database. Nucleic acids research, 51(D1):D690-D699. [DOI: 10.1093/nar/gkac920](https://doi.org/10.1093/nar/gkac920) +- [dbCAN](https://doi.org/10.1093/nar/gkad328) + + > Jinfang Zheng, Qiwei Ge, Yuchen Yan, Xinpeng Zhang, Le Huang, Yanbin Yin, dbCAN3: automated carbohydrate-active enzyme and substrate annotation, Nucleic Acids Research, Volume 51, Issue W1, 5 July 2023, Pages W115–W121. [DOI:10.1093/nar/gkad328](https://doi.org/10.1093/nar/gkad328) + - [SeqKit](https://bioinf.shenwei.me/seqkit/) > Shen, W., Sipos, B., & Zhao, L. (2024). SeqKit2: A Swiss army knife for sequence and alignment processing. iMeta, e191. [https://doi.org/10.1002/imt2.191](https://doi.org/10.1002/imt2.191) diff --git a/README.md b/README.md index 38667c2c..7b0c1961 100644 --- a/README.md +++ b/README.md @@ -40,8 +40,9 @@ The nf-core/funcscan AWS full test dataset are contigs generated by the MGnify s 5. 
Screening contigs for antimicrobial peptide-like sequences with [`ampir`](https://cran.r-project.org/web/packages/ampir/index.html), [`Macrel`](https://github.com/BigDataBiology/macrel), [`HMMER`](http://hmmer.org/), [`AMPlify`](https://github.com/bcgsc/AMPlify) 6. Screening contigs for antibiotic resistant gene-like sequences with [`ABRicate`](https://github.com/tseemann/abricate), [`AMRFinderPlus`](https://github.com/ncbi/amr), [`fARGene`](https://github.com/fannyhb/fargene), [`RGI`](https://card.mcmaster.ca/analyze/rgi), [`DeepARG`](https://bench.cs.vt.edu/deeparg). [`argNorm`](https://github.com/BigDataBiology/argNorm) is used to map the outputs of `DeepARG`, `AMRFinderPlus`, and `ABRicate` to the [`Antibiotic Resistance Ontology`](https://www.ebi.ac.uk/ols4/ontologies/aro) for consistent ARG classification terms. 7. Screening contigs for biosynthetic gene cluster-like sequences with [`antiSMASH`](https://antismash.secondarymetabolites.org), [`DeepBGC`](https://github.com/Merck/deepbgc), [`GECCO`](https://gecco.embl.de/), [`HMMER`](http://hmmer.org/) -8. Creating aggregated reports for all samples across the workflows with [`AMPcombi`](https://github.com/Darcy220606/AMPcombi) for AMPs, [`hAMRonization`](https://github.com/pha4ge/hAMRonization) for ARGs, and [`comBGC`](https://raw.githubusercontent.com/nf-core/funcscan/master/bin/comBGC.py) for BGCs -9. Software version and methods text reporting with [`MultiQC`](http://multiqc.info/) +8. Screening contigs for carbohydrate-active enzymes (CAZymes), CAZyme gene clusters and substrates with [run_dbcan](https://github.com/bcb-unl/run_dbcan). +9. Creating aggregated reports for all samples across the workflows with [`AMPcombi`](https://github.com/Darcy220606/AMPcombi) for AMPs, [`hAMRonization`](https://github.com/pha4ge/hAMRonization) for ARGs, and [`comBGC`](https://raw.githubusercontent.com/nf-core/funcscan/master/bin/comBGC.py) for BGCs +10. 
Software version and methods text reporting with [`MultiQC`](http://multiqc.info/) ![funcscan metro workflow](docs/images/funcscan_metro_workflow.png) diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 791912cd..c568ad65 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,4 +1,4 @@ -sample,fasta,protein,gbk +sample,fasta,protein,gbk,gff sample_1,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_1.fasta.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_1.faa,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_1.gbk sample_2,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_2.fasta.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_2.faa.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_2.gbk.gz sample_3,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs.fasta diff --git a/assets/schema_input.json b/assets/schema_input.json index b86e9c79..733e8f37 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -33,12 +33,26 @@ "exists": true, "pattern": "^\\S+\\.(gbk|gbff)(\\.gz)?$", "errorMessage": "Input file for feature annotations has incorrect file format. File must end in `.gbk`, `.gbk.gz` or `.gbff`, or `.gbff.gz`" + }, + "gff": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.(gff|gff3)(\\.gz)?$", + "errorMessage": "Input file for feature annotations has incorrect file format. 
File must end in `.gff`, `.gff.gz` or `.gff3`, or `.gff3.gz`" + }, + "gff_type": { + "type": "string", + "enum": ["NCBI_prok", "prodigal", "NCBI_euk", "JGI"], + "errorMessage": "GFF type must be one of: NCBI_prok, prodigal, NCBI_euk, or JGI", + "meta": ["gff_type"] } }, "required": ["sample", "fasta"], "dependentRequired": { "protein": ["gbk"], - "gbk": ["protein"] + "gbk": ["protein"], + "gff": ["protein"] } }, "uniqueItems": true diff --git a/conf/modules.config b/conf/modules.config index 2812093b..a1fa5661 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -740,4 +740,39 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, ] } + + withName: RUNDBCAN_DATABASE { + publishDir = [ + path: { "${params.outdir}/databases/dbcan/" }, + mode: params.publish_dir_mode, + enabled: params.save_db, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + + withName: RUNDBCAN_CAZYMEANNOTATION { + publishDir = [ + path: { "${params.outdir}/cazyme/dbcan/cazyme_annotation/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + + withName: RUNDBCAN_EASYCGC { + publishDir = [ + path: { "${params.outdir}/cazyme/dbcan/cgc/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*_{cgc.gff,cgc_standard_out.tsv,diamond.out.tc,TF_hmm_results.tsv,STP_hmm_results.tsv}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + + withName: RUNDBCAN_EASYSUBSTRATE { + publishDir = [ + path: { "${params.outdir}/cazyme/dbcan/substrate/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*_{total_cgc_info.tsv,substrate_prediction.tsv,synteny_pdf}", + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ] + } } diff --git a/conf/test.config b/conf/test.config index 61ad1c4d..21b32cb0 100644 --- a/conf/test.config +++ b/conf/test.config @@ -33,4 +33,6 @@ params { run_amp_screening = true amp_run_hmmsearch = true amp_hmmsearch_models = params.pipelines_testdata_base_path + 'funcscan/hmms/mybacteriocin.hmm' + + run_cazyme_screening = true } diff --git a/conf/test_cazyme_pyrodigal.config b/conf/test_cazyme_pyrodigal.config new file mode 100644 index 00000000..9d431f78 --- /dev/null +++ b/conf/test_cazyme_pyrodigal.config @@ -0,0 +1,34 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/funcscan -profile test_dbcan_pyrodigal, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'CAZyme Pyrodigal test profile' + config_profile_description = 'Minimal test dataset to check CAZyme workflow function' + + // Input data + input = params.pipelines_testdata_base_path + 'funcscan/samplesheet_reduced.csv' + + annotation_tool = 'pyrodigal' + + run_arg_screening = false + run_amp_screening = false + run_bgc_screening = false + run_cazyme_screening = true +} diff --git a/conf/test_preannotated_dbcan.config b/conf/test_preannotated_dbcan.config new file mode 100644 index 00000000..fde7e1fd --- /dev/null +++ b/conf/test_preannotated_dbcan.config @@ -0,0 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/funcscan -profile test_preannotated_dbcan, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'CAZyme test profile - preannotated input' + config_profile_description = 'Minimal test dataset to check CAZyme workflow function' + + // Input data + input = params.pipelines_testdata_base_path + 'funcscan/samplesheet_preannotated.csv' + + annotation_tool = 'pyrodigal' + + run_arg_screening = false + run_amp_screening = false + run_bgc_screening = false + run_cazyme_screening = true + + dbcan_skip_cgc = true // Skip cgc annotation as .gbk (not .gff) is provided in samplesheet + dbcan_skip_substrate = true // Skip substrate annotation as .gbk (not .gff) is provided in samplesheet +} diff --git a/docs/output.md b/docs/output.md index 289d9086..505a60cb 100644 --- a/docs/output.md +++ b/docs/output.md @@ -7,10 +7,11 @@ The output of nf-core/funcscan provides reports for each of the functional group - **antibiotic resistance genes** (tools: [ABRicate](https://github.com/tseemann/abricate), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), [fARGene](https://github.com/fannyhb/fargene), [RGI](https://card.mcmaster.ca/analyze/rgi) – summarised by [hAMRonization](https://github.com/pha4ge/hAMRonization). Results from ABRicate, AMRFinderPlus, and DeepARG are normalised to [ARO](https://obofoundry.org/ontology/aro.html) by [argNorm](https://github.com/BigDataBiology/argNorm).) 
- **antimicrobial peptides** (tools: [Macrel](https://github.com/BigDataBiology/macrel), [AMPlify](https://github.com/bcgsc/AMPlify), [ampir](https://ampir.marine-omics.net), [hmmsearch](http://hmmer.org) – summarised by [AMPcombi](https://github.com/Darcy220606/AMPcombi)) - **biosynthetic gene clusters** (tools: [antiSMASH](https://docs.antismash.secondarymetabolites.org), [DeepBGC](https://github.com/Merck/deepbgc), [GECCO](https://gecco.embl.de), [hmmsearch](http://hmmer.org) – summarised by [comBGC](#combgc)) +- **carbohydrate-active enzymes (CAZymes)**, CAZyme gene clusters and substrates (tools: [run_dbcan](https://github.com/bcb-unl/run_dbcan)) As a general workflow, we recommend to first look at the summary reports ([ARGs](#hamronization), [AMPs](#ampcombi), [BGCs](#combgc)), to get a general overview of what hits have been found across all the tools of each functional group. After which, you can explore the specific output directories of each tool to get more detailed information about each result. The tool-specific output directories also includes the output from the functional annotation steps of either [prokka](https://github.com/tseemann/prokka), [pyrodigal](https://github.com/althonos/pyrodigal), [prodigal](https://github.com/hyattpd/Prodigal), or [Bakta](https://github.com/oschwengers/bakta) if the `--save_annotations` flag was set. Additionally, taxonomic classifications from [MMseqs2](https://github.com/soedinglab/MMseqs2) are saved if the `--taxa_classification_mmseqs_db_savetmp` and `--taxa_classification_mmseqs_taxonomy_savetmp` flags are set. -Similarly, all downloaded databases are saved (i.e. 
from [MMseqs2](https://github.com/soedinglab/MMseqs2), [antiSMASH](https://docs.antismash.secondarymetabolites.org), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [Bakta](https://github.com/oschwengers/bakta), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), [RGI](https://github.com/arpcard/rgi), and/or [AMPcombi](https://github.com/Darcy220606/AMPcombi)) into the output directory `/databases/` if the `--save_db` flag was set. +Similarly, all downloaded databases are saved (i.e. from [MMseqs2](https://github.com/soedinglab/MMseqs2), [antiSMASH](https://docs.antismash.secondarymetabolites.org), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [Bakta](https://github.com/oschwengers/bakta), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), [RGI](https://github.com/arpcard/rgi), [AMPcombi](https://github.com/Darcy220606/AMPcombi), and/or [run_dbcan](https://github.com/bcb-unl/run_dbcan)) into the output directory `/databases/` if the `--save_db` flag was set. Furthermore, for reproducibility, versions of all software used in the run is presented in a [MultiQC](http://multiqc.info) report. @@ -41,6 +42,8 @@ results/ | ├── deepbgc/ | ├── gecco/ | └── hmmsearch/ +├── cazyme/ +| └── dbcan/ ├── databases/ ├── multiqc/ ├── pipeline_info/ @@ -63,11 +66,11 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes p Input contig QC with: -- [SeqKit](https://bioinf.shenwei.me/seqkit/) (default) - for separating into long- and short- categories +- [SeqKit](https://bioinf.shenwei.me/seqkit/) (default) – for separating into long- and short- categories Taxonomy classification of nucleotide sequences with: -- [MMseqs2](https://github.com/soedinglab/MMseqs2) (default) - for contig taxonomic classification using 2bLCA. +- [MMseqs2](https://github.com/soedinglab/MMseqs2) (default) – for contig taxonomic classification using 2bLCA. 
ORF prediction and annotation with any of: @@ -98,18 +101,22 @@ Antimicrobial Peptides (AMPs): Biosynthetic Gene Clusters (BGCs): - [antiSMASH](#antismash) – biosynthetic gene cluster detection. -- [deepBGC](#deepbgc) - biosynthetic gene cluster detection, using a deep learning model. +- [deepBGC](#deepbgc) – biosynthetic gene cluster detection, using a deep learning model. - [GECCO](#gecco) – biosynthetic gene cluster detection, using Conditional Random Fields (CRFs). - [hmmsearch](#hmmsearch) – biosynthetic gene cluster detection, based on hidden Markov models. +Carbohydrate-active enzymes (CAZymes): + +- [run_dbcan](https://github.com/bcb-unl/run_dbcan) – carbohydrate-active enzyme (CAZyme), CAZyme gene clusters and substrate detection. + Output Summaries: -- [AMPcombi](#ampcombi) – summary report of antimicrobial peptide gene output from various detection tools. -- [hAMRonization](#hamronization) – summary of antimicrobial resistance gene output from various detection tools. -- [argNorm](#argNorm) - Normalize ARG annotations from [ABRicate](#abricate), [AMRFinderPlus](#amrfinderplus), and [DeepARG](#deeparg) to the ARO -- [comBGC](#combgc) – summary of biosynthetic gene cluster output from various detection tools. -- [MultiQC](#multiqc) – report of all software and versions used in the pipeline. -- [Pipeline information](#pipeline-information) – report metrics generated during the workflow execution. 
+- [AMPcombi](#ampcombi) – summary report of antimicrobial peptide gene output from various detection tools +- [hAMRonization](#hamronization) – summary of antimicrobial resistance gene output from various detection tools +- [argNorm](#argnorm) – normalize ARG annotations from [ABRicate](#abricate), [AMRFinderPlus](#amrfinderplus), and [DeepARG](#deeparg) to the ARO +- [comBGC](#combgc) – summary of biosynthetic gene cluster output from various detection tools +- [MultiQC](#multiqc) – report of all software and versions used in the pipeline +- [Pipeline information](#pipeline-information) – report metrics generated during the workflow execution ## Tool details @@ -466,6 +473,35 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation [GECCO](https://gecco.embl.de) is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs). +### CAZyme annotation tools + +#### run_dbcan + +
+Output files + +- `cazyme/` + - `dbcan/` + - `cazyme_annotation/` + - `<samplename>_overview.tsv`: TSV file containing the results of dbCAN CAZyme annotation + - `<samplename>_dbCAN_hmm_results.tsv`: TSV file containing the detailed dbCAN HMM results for CAZyme annotation + - `<samplename>_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation + - `<samplename>_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation + - `cgc/` + - `<samplename>_cgc.gff`: GFF file containing the CAZyme gene clusters (CGC) identified by dbCAN. This file is generated from the dbCAN annotation and contains the locations of CAZyme gene clusters in the genome + - `<samplename>_cgc_standard_out.tsv`: Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. This file summarizes the CAZyme gene clusters identified in the genome + - `<samplename>_diamond.out.tc`: TSV file containing the diamond output for transporter annotation + - `<samplename>_TF_hmm_results.tsv`: TSV file containing the results of transcription factor screening + - `<samplename>_STP_hmm_results.tsv`: TSV file containing the results of signal transduction proteins (STP) annotation + - `substrate/` + - `<samplename>_total_cgc_info.tsv`: TSV file summarizing the total additional genes in the genome + - `<samplename>_substrate_prediction.tsv`: TSV file containing the substrate predictions based on the CGC annotations from dbCAN + - `<samplename>_synteny_pdf/`: Directory containing one or more PDF files showing the syntenic regions of the CGCs in DNA sequence as identified by dbCAN + +
+ +[run_dbcan](https://github.com/bcb-unl/run_dbcan) is an automated tool for carbohydrate-active enzyme (CAZyme), CAZyme gene cluster and substrate annotation. + ### Summary tools [AMPcombi](#ampcombi), [hAMRonization](#hamronization), [comBGC](#combgc), [MultiQC](#multiqc), [pipeline information](#pipeline-information), [argNorm](#argnorm). diff --git a/docs/usage.md b/docs/usage.md index 4fe1028c..11e636c6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -25,6 +25,7 @@ To run any of the three screening workflows (AMP, ARG, and/or BGC), taxonomic cl - `--run_bgc_screening` - `--run_taxa_classification` (for optional additional taxonomic annotations) - `--run_protein_annotation` (for optional additional protein family and domain annotation) +- `--run_cazyme_screening` (for optional additional carbohydrate-active enzyme annotation) When switched on, all tools of the given workflow will be run by default. If you don't need specific tools, you can explicitly skip them. The exception is HMMsearch, which needs to be explicitly switched on and provided with HMM screening files (AMP and BGC workflows, see [parameter documentation](/funcscan/parameters)). For the taxonomic classification, MMseqs2 is currently the only tool implemented in the pipeline. Likewise, InterProScan is the only tool for protein sequence annotation. @@ -556,6 +557,24 @@ interproscan_db/ └── tmhmm ``` +### Run_dbCAN + +The [run_dbcan](https://github.com/bcb-unl/run_dbcan) tool requires a pre-built database to perform carbohydrate-active enzyme (CAZyme) annotation. +To download the database automatically, install the [`dbcan`](https://bioconda.github.io/recipes/dbcan/README.html) package, e.g. with conda: + +```bash +conda create -n dbcan -c bioconda dbcan +conda activate dbcan +``` + +Then, download the database: + +```bash +run_dbcan database --db_dir <db_dir> +``` + +Replace `<db_dir>` with your preferred directory path for storing the database files. 
Once the database download is complete, the files are ready for use with the `run_dbcan` tool without additional configurations or modifications. + ## Updating the pipeline When you run the below command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: diff --git a/modules.json b/modules.json index eac420fd..777fec92 100644 --- a/modules.json +++ b/modules.json @@ -200,6 +200,26 @@ "git_sha": "531d8a465158c5842f416623017997462a240e16", "installed_by": ["modules"] }, + "rundbcan/cazymeannotation": { + "branch": "master", + "git_sha": "d06da24d16537815f9699c4a4edd4a6ec5bc517f", + "installed_by": ["modules"] + }, + "rundbcan/database": { + "branch": "master", + "git_sha": "d06da24d16537815f9699c4a4edd4a6ec5bc517f", + "installed_by": ["modules"] + }, + "rundbcan/easycgc": { + "branch": "master", + "git_sha": "d06da24d16537815f9699c4a4edd4a6ec5bc517f", + "installed_by": ["modules"] + }, + "rundbcan/easysubstrate": { + "branch": "master", + "git_sha": "d06da24d16537815f9699c4a4edd4a6ec5bc517f", + "installed_by": ["modules"] + }, "seqkit/seq": { "branch": "master", "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc", diff --git a/modules/nf-core/rundbcan/cazymeannotation/environment.yml b/modules/nf-core/rundbcan/cazymeannotation/environment.yml new file mode 100644 index 00000000..6d9a56ae --- /dev/null +++ b/modules/nf-core/rundbcan/cazymeannotation/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - bioconda + - conda-forge +dependencies: + - bioconda::dbcan=5.1.2 diff --git a/modules/nf-core/rundbcan/cazymeannotation/main.nf 
b/modules/nf-core/rundbcan/cazymeannotation/main.nf new file mode 100644 index 00000000..208bd4bd --- /dev/null +++ b/modules/nf-core/rundbcan/cazymeannotation/main.nf @@ -0,0 +1,60 @@ +process RUNDBCAN_CAZYMEANNOTATION { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/dbcan:5.1.2--pyhdfd78af_0' : + 'biocontainers/dbcan:5.1.2--pyhdfd78af_0' }" + + input: + tuple val(meta), path(input_raw_data) + path dbcan_db + + output: + tuple val(meta), path("${prefix}_overview.tsv") , emit: cazyme_annotation + tuple val(meta), path("${prefix}_dbCAN_hmm_results.tsv") , emit: dbcanhmm_results + tuple val(meta), path("${prefix}_dbCANsub_hmm_results.tsv"), emit: dbcansub_results + tuple val(meta), path("${prefix}_diamond.out") , emit: dbcandiamond_results + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + run_dbcan CAZyme_annotation \\ + --mode protein \\ + --db_dir ${dbcan_db} \\ + --input_raw_data ${input_raw_data} \\ + --output_dir . 
\\ + ${args} + + mv overview.tsv ${prefix}_overview.tsv + mv dbCAN_hmm_results.tsv ${prefix}_dbCAN_hmm_results.tsv + mv dbCANsub_hmm_results.tsv ${prefix}_dbCANsub_hmm_results.tsv + mv diamond.out ${prefix}_diamond.out + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dbcan: \$(echo \$(run_dbcan version) | cut -f2 -d':' | cut -f2 -d' ') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_overview.tsv + touch ${prefix}_dbCAN_hmm_results.tsv + touch ${prefix}_dbCANsub_hmm_results.tsv + touch ${prefix}_diamond.out + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dbcan: \$(echo \$(run_dbcan version) | cut -f2 -d':' | cut -f2 -d' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/rundbcan/cazymeannotation/meta.yml b/modules/nf-core/rundbcan/cazymeannotation/meta.yml new file mode 100644 index 00000000..a40c515c --- /dev/null +++ b/modules/nf-core/rundbcan/cazymeannotation/meta.yml @@ -0,0 +1,88 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "rundbcan_cazymeannotation" +description: CAZyme annotation module for the dbcan pipeline. This module is used + to annotate carbohydrate-active enzymes (CAZymes) from genomic data using the dbCAN + annotation tool. +keywords: + - dbCAN + - download + - CAZyme + - CAZyme gene Cluster + - genomes +tools: + - "dbcan": + description: "Standalone version of dbCAN annotation tool for automated CAZyme + annotation." + homepage: "https://bcb.unl.edu/dbCAN2/" + documentation: "https://run-dbcan.readthedocs.io/en/latest/" + tool_dev_url: "https://github.com/bcb-unl/run_dbcan" + doi: "10.1093/nar/gkad328" + licence: ["GPL v3-or-later"] + identifier: biotools:dbcan + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - input_raw_data: + type: file + description: FASTA file for protein sequences. + pattern: "*.{fasta,fa,faa}" + ontologies: + - edam: "http://edamontology.org/data_2044" # Sequence + - edam: "http://edamontology.org/format_1929" # FASTA + - - dbcan_db: + type: directory + description: Path to the dbCAN database directory. +output: + - cazyme_annotation: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_overview.tsv: + type: file + description: | + TSV file containing the results of dbCAN CAZyme annotation. + - dbcanhmm_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_dbCAN_hmm_results.tsv: + type: file + description: | + TSV file containing the detailed dbCAN HMM results for CAZyme annotation. + - dbcansub_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - ${prefix}_dbCANsub_hmm_results.tsv: + type: file + description: | + TSV file containing the detailed dbCAN subfamily results for CAZyme annotation. + - dbcandiamond_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - ${prefix}_diamond.out: + type: file + description: | + TSV file containing the detailed dbCAN diamond results for CAZyme annotation. 
+ - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Xinpeng021001" +maintainers: + - "@Xinpeng021001" diff --git a/modules/nf-core/rundbcan/cazymeannotation/tests/main.nf.test b/modules/nf-core/rundbcan/cazymeannotation/tests/main.nf.test new file mode 100644 index 00000000..d3ff7578 --- /dev/null +++ b/modules/nf-core/rundbcan/cazymeannotation/tests/main.nf.test @@ -0,0 +1,72 @@ +nextflow_process { + + name "Test Process RUNDBCAN_CAZYMEANNOTATION" + script "../main.nf" + process "RUNDBCAN_CAZYMEANNOTATION" + + tag "modules" + tag "modules_nfcore" + tag "rundbcan" + tag "rundbcan/database" + tag "rundbcan/cazymeannotation" + + test("dbcancazyme - simplified") { + + setup { + run("RUNDBCAN_DATABASE"){ + script "../../database/main.nf" + process { + """ + """ + } + } + } + + when { + process { + """ + input[0] = [ + [id:'test'], + file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/proteome.fasta', checkIfExists: true) + ] + input[1] = RUNDBCAN_DATABASE.out.dbcan_db + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + process.out, + path(process.out.versions[0]).yaml + ).match() + } + ) + } + } + + test("dbcancazyme - cazyme annotation - stub") { + options "-stub" + + when { + process { + """ + input[0] = [[id: 'stub'],file('stub') ] + input[1] = file('stub_db') + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + process.out, + path(process.out.versions[0]).yaml + ).match() + } + ) + } + } +} diff --git a/modules/nf-core/rundbcan/cazymeannotation/tests/main.nf.test.snap b/modules/nf-core/rundbcan/cazymeannotation/tests/main.nf.test.snap new file mode 100644 index 00000000..cf210a60 --- /dev/null +++ b/modules/nf-core/rundbcan/cazymeannotation/tests/main.nf.test.snap @@ -0,0 +1,174 @@ +{ + "dbcancazyme - cazyme annotation - stub": { + "content": [ + { + "0": [ + [ + { 
+ "id": "stub" + }, + "stub_overview.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "stub" + }, + "stub_dbCAN_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "stub" + }, + "stub_dbCANsub_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "stub" + }, + "stub_diamond.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + "versions.yml:md5,40f0cf24dce2629d444781eaee026c7f" + ], + "cazyme_annotation": [ + [ + { + "id": "stub" + }, + "stub_overview.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcandiamond_results": [ + [ + { + "id": "stub" + }, + "stub_diamond.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcanhmm_results": [ + [ + { + "id": "stub" + }, + "stub_dbCAN_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcansub_results": [ + [ + { + "id": "stub" + }, + "stub_dbCANsub_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,40f0cf24dce2629d444781eaee026c7f" + ] + }, + { + "RUNDBCAN_CAZYMEANNOTATION": { + "dbcan": "5.1.2" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-12T17:00:00.485809769" + }, + "dbcancazyme - simplified": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_overview.tsv:md5,73bd9acee752d61e096370d4fedfee54" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test_dbCAN_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test_dbCANsub_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test_diamond.out:md5,c935cda6778ca2b6aaaa4362b6f24d84" + ] + ], + "4": [ + "versions.yml:md5,40f0cf24dce2629d444781eaee026c7f" + ], + "cazyme_annotation": [ + [ + { + "id": "test" + }, + "test_overview.tsv:md5,73bd9acee752d61e096370d4fedfee54" + ] + ], + "dbcandiamond_results": [ + [ + { + "id": "test" + }, + 
"test_diamond.out:md5,c935cda6778ca2b6aaaa4362b6f24d84" + ] + ], + "dbcanhmm_results": [ + [ + { + "id": "test" + }, + "test_dbCAN_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + "dbcansub_results": [ + [ + { + "id": "test" + }, + "test_dbCANsub_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + "versions": [ + "versions.yml:md5,40f0cf24dce2629d444781eaee026c7f" + ] + }, + { + "RUNDBCAN_CAZYMEANNOTATION": { + "dbcan": "5.1.2" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-12T16:59:56.40131421" + } +} \ No newline at end of file diff --git a/modules/nf-core/rundbcan/database/environment.yml b/modules/nf-core/rundbcan/database/environment.yml new file mode 100644 index 00000000..6d9a56ae --- /dev/null +++ b/modules/nf-core/rundbcan/database/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - bioconda + - conda-forge +dependencies: + - bioconda::dbcan=5.1.2 diff --git a/modules/nf-core/rundbcan/database/main.nf b/modules/nf-core/rundbcan/database/main.nf new file mode 100644 index 00000000..617ab882 --- /dev/null +++ b/modules/nf-core/rundbcan/database/main.nf @@ -0,0 +1,38 @@ +process RUNDBCAN_DATABASE { + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/dbcan:5.1.2--pyhdfd78af_0' : + 'biocontainers/dbcan:5.1.2--pyhdfd78af_0' }" + + output: + path "dbcan_db" , emit: dbcan_db // directory passed to run_dbcan via --db_dir + path "versions.yml", emit: versions // written by the END_VERSIONS heredoc + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' // optional extra CLI flags from ext.args, appended to the command below + """ + run_dbcan database \\ + --db_dir dbcan_db \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dbcan: \$(echo \$(run_dbcan version) | cut -f2 -d':' | cut -f2 -d' ') + END_VERSIONS + """ + + stub: // only creates an empty dbcan_db directory plus versions.yml + """ + mkdir -p dbcan_db + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dbcan: \$(echo \$(run_dbcan version) | cut -f2 -d':' | cut -f2 -d' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/rundbcan/database/meta.yml b/modules/nf-core/rundbcan/database/meta.yml new file mode 100644 index 00000000..0ae300b0 --- /dev/null +++ b/modules/nf-core/rundbcan/database/meta.yml @@ -0,0 +1,36 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "rundbcan_database" +description: command from run_dbcan to prepare the database for dbCAN annotation. +keywords: + - dbCAN + - download + - CAZyme + - CAZyme gene Cluster + - genomes +tools: + - "run_dbcan": + description: "Standalone version of dbCAN annotation tool for automated CAZyme annotation."
+ homepage: "https://bcb.unl.edu/dbCAN2/" + documentation: "https://run-dbcan.readthedocs.io/en/latest/" + tool_dev_url: "https://github.com/bcb-unl/run_dbcan" + doi: "10.1093/nar/gkad328" + licence: ["GPL v3-or-later"] + identifier: biotools:dbcan + +output: + - dbcan_db: + - dbcan_db: + type: directory + description: Download directory for dbCAN databases + pattern: "dbcan_db" + - versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@Xinpeng021001" +maintainers: + - "@Xinpeng021001" diff --git a/modules/nf-core/rundbcan/database/tests/main.nf.test b/modules/nf-core/rundbcan/database/tests/main.nf.test new file mode 100644 index 00000000..d81a3319 --- /dev/null +++ b/modules/nf-core/rundbcan/database/tests/main.nf.test @@ -0,0 +1,56 @@ +nextflow_process { + + name "Test Process RUNDBCAN_DATABASE" + script "../main.nf" + process "RUNDBCAN_DATABASE" + + tag "modules" + tag "modules_nfcore" + tag "rundbcan" + tag "rundbcan/database" + + test("rundbcan - database - basic") { + + when { + process { + """ + """ + } + } + + then { + assert process.success + assert path(process.out.dbcan_db.get(0)).exists() + assert path(process.out.versions[0]).exists() + assertAll( + { assert snapshot( + process.out.versions, + path(process.out.versions[0]).yaml + ).match() } + ) + } + } + + test("rundbcan - database - stub") { + + options "-stub" + + when { + process { + """ + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + file(process.out.dbcan_db.get(0)).name, + process.out.versions, + path(process.out.versions[0]).yaml + ).match() } + ) + } + } +} diff --git a/modules/nf-core/rundbcan/database/tests/main.nf.test.snap b/modules/nf-core/rundbcan/database/tests/main.nf.test.snap new file mode 100644 index 00000000..e54b59a8 --- /dev/null +++ b/modules/nf-core/rundbcan/database/tests/main.nf.test.snap @@ -0,0 +1,37 @@ +{ + "rundbcan - database - basic": { + "content": [ 
+ [ + "versions.yml:md5,b064fe90159963e182ec980f0f4677c5" + ], + { + "RUNDBCAN_DATABASE": { + "dbcan": "5.1.2" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-12T17:01:26.958626278" + }, + "rundbcan - database - stub": { + "content": [ + "dbcan_db", + [ + "versions.yml:md5,b064fe90159963e182ec980f0f4677c5" + ], + { + "RUNDBCAN_DATABASE": { + "dbcan": "5.1.2" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-12T17:01:31.197377024" + } +} \ No newline at end of file diff --git a/modules/nf-core/rundbcan/easycgc/environment.yml b/modules/nf-core/rundbcan/easycgc/environment.yml new file mode 100644 index 00000000..6d9a56ae --- /dev/null +++ b/modules/nf-core/rundbcan/easycgc/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - bioconda + - conda-forge +dependencies: + - bioconda::dbcan=5.1.2 diff --git a/modules/nf-core/rundbcan/easycgc/main.nf b/modules/nf-core/rundbcan/easycgc/main.nf new file mode 100644 index 00000000..0095e81d --- /dev/null +++ b/modules/nf-core/rundbcan/easycgc/main.nf @@ -0,0 +1,80 @@ +process RUNDBCAN_EASYCGC { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/dbcan:5.1.2--pyhdfd78af_0' : + 'biocontainers/dbcan:5.1.2--pyhdfd78af_0' }" + + input: + tuple val(meta), path(input_raw_data) + tuple val(meta2), path(input_gff), val(gff_type) + path dbcan_db + + output: + tuple val(meta), path("${prefix}_overview.tsv") , emit: cazyme_annotation + tuple val(meta), path("${prefix}_dbCAN_hmm_results.tsv") , emit: dbcanhmm_results + tuple val(meta), path("${prefix}_dbCANsub_hmm_results.tsv"), emit: dbcansub_results + tuple val(meta), path("${prefix}_diamond.out") , emit: dbcandiamond_results + tuple val(meta), path("${prefix}_cgc.gff") , emit: cgc_gff + tuple val(meta), path("${prefix}_cgc_standard_out.tsv") , emit: cgc_standard_out + tuple val(meta), path("${prefix}_diamond.out.tc") , emit: diamond_out_tc + tuple val(meta), path("${prefix}_TF_hmm_results.tsv") , emit: tf_hmm_results + tuple val(meta), path("${prefix}_STP_hmm_results.tsv") , emit: stp_hmm_results + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + run_dbcan easy_CGC \\ + --mode protein \\ + --db_dir ${dbcan_db} \\ + --input_raw_data ${input_raw_data} \\ + --output_dir . 
\\ + --input_gff ${input_gff} \\ + --gff_type ${gff_type} \\ + ${args} + + mv overview.tsv ${prefix}_overview.tsv + mv dbCAN_hmm_results.tsv ${prefix}_dbCAN_hmm_results.tsv + mv dbCANsub_hmm_results.tsv ${prefix}_dbCANsub_hmm_results.tsv + mv diamond.out ${prefix}_diamond.out + mv cgc.gff ${prefix}_cgc.gff + mv cgc_standard_out.tsv ${prefix}_cgc_standard_out.tsv + mv diamond.out.tc ${prefix}_diamond.out.tc + mv TF_hmm_results.tsv ${prefix}_TF_hmm_results.tsv + mv STP_hmm_results.tsv ${prefix}_STP_hmm_results.tsv + mv total_cgc_info.tsv ${prefix}_total_cgc_info.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dbcan: \$(echo \$(run_dbcan version) | cut -f2 -d':' | cut -f2 -d' ') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_overview.tsv + touch ${prefix}_dbCAN_hmm_results.tsv + touch ${prefix}_dbCANsub_hmm_results.tsv + touch ${prefix}_diamond.out + touch ${prefix}_cgc.gff + touch ${prefix}_cgc_standard_out.tsv + touch ${prefix}_diamond.out.tc + touch ${prefix}_TF_hmm_results.tsv + touch ${prefix}_STP_hmm_results.tsv + touch ${prefix}_total_cgc_info.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dbcan: \$(echo \$(run_dbcan version) | cut -f2 -d':' | cut -f2 -d' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/rundbcan/easycgc/meta.yml b/modules/nf-core/rundbcan/easycgc/meta.yml new file mode 100644 index 00000000..8e6e1f99 --- /dev/null +++ b/modules/nf-core/rundbcan/easycgc/meta.yml @@ -0,0 +1,151 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "rundbcan_easycgc" +description: CGC annotation module for the dbcan pipeline. This module is used to + annotate carbohydrate-active enzymes (CAZymes) from genomic data using the dbCAN + annotation tool. 
+keywords: + - dbCAN + - download + - CAZyme + - CAZyme gene Cluster + - genomes +tools: + - "dbcan": + description: "Standalone version of dbCAN annotation tool for automated CAZyme + annotation." + homepage: "https://bcb.unl.edu/dbCAN2/" + documentation: "https://run-dbcan.readthedocs.io/en/latest/" + tool_dev_url: "https://github.com/bcb-unl/run_dbcan" + doi: "10.1093/nar/gkad328" + licence: ["GPL v3-or-later"] + identifier: biotools:dbcan + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - input_raw_data: + type: file + description: FASTA file for protein sequences. + pattern: "*.{fasta,fa,faa}" + ontologies: + - edam: "http://edamontology.org/data_2044" # Sequence + - edam: "http://edamontology.org/format_1929" # FASTA + - - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - input_gff: + type: file + description: GFF file for protein sequences. + - gff_type: + type: string + description: | + Type of GFF file. Options are `NCBI_prok`, `JGI`, `NCBI_euk`, and `prodigal`. This is used to parse the GFF file correctly. + - - dbcan_db: + type: directory + description: Path to the dbCAN database directory. + +output: + - cazyme_annotation: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_overview.tsv: + type: file + description: | + TSV file containing the results of dbCAN CAZyme annotation. + - dbcanhmm_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_dbCAN_hmm_results.tsv: + type: file + description: | + TSV file containing the detailed dbCAN HMM results for CAZyme annotation. + - dbcansub_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1']` + - ${prefix}_dbCANsub_hmm_results.tsv: + type: file + description: | + TSV file containing the detailed dbCAN subfamily results for CAZyme annotation. + - dbcandiamond_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_diamond.out: + type: file + description: | + TSV file containing the detailed dbCAN diamond results for CAZyme annotation. + - cgc_gff: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_cgc.gff: + type: file + description: | + GFF file containing the CAZyme gene clusters (CGC) identified by dbCAN. This file is generated from the dbCAN annotation and contains the locations of CAZyme gene clusters in the genome. + - cgc_standard_out: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_cgc_standard_out.tsv: + type: file + description: | + Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. This file summarizes the CAZyme gene clusters identified in the genome. + - diamond_out_tc: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_diamond.out.tc: + type: file + description: | + TSV file containing the diamond output for transporter annotation. + - tf_hmm_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_TF_hmm_results.tsv: + type: file + description: | + TSV file containing the HMM search results for transcription factor (TF) annotation. + - stp_hmm_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_STP_hmm_results.tsv: + type: file + description: | + TSV file containing the HMM search results for signal transduction protein (STP) annotation.
+ - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Xinpeng021001" +maintainers: + - "@Xinpeng021001" diff --git a/modules/nf-core/rundbcan/easycgc/tests/main.nf.test b/modules/nf-core/rundbcan/easycgc/tests/main.nf.test new file mode 100644 index 00000000..fe316389 --- /dev/null +++ b/modules/nf-core/rundbcan/easycgc/tests/main.nf.test @@ -0,0 +1,80 @@ +nextflow_process { + + name "Test Process RUNDBCAN_EASYCGC" + script "../main.nf" + process "RUNDBCAN_EASYCGC" + + tag "modules" + tag "modules_nfcore" + tag "rundbcan" + tag "rundbcan/database" + tag "rundbcan/easycgc" + + + test("easycgc - simplified") { + + setup { + run("RUNDBCAN_DATABASE"){ + script "../../database/main.nf" + process { + """ + """ + } + } + } + + when { + process { + """ + input[0] = [ + [id:'test'], + file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/proteome.fasta', checkIfExists: true) + ] + input[1] = [ + [id:'test'], + file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/gff/test1.gff', checkIfExists: true) + ,"prodigal" + ] + input[2] = RUNDBCAN_DATABASE.out.dbcan_db + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + process.out, + path(process.out.versions[0]).yaml + ).match() + } + ) + } + } + + test("easycgc - stub") { + options "-stub" + + when { + process { + """ + input[0] = [[id: 'stub'], file('stub')] + input[1] = [[id: 'stub'], file('stub.gff'), "prodigal"] + input[2] = file('stub_db') + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + process.out, + path(process.out.versions[0]).yaml + ).match() + } + ) + } + } +} + diff --git a/modules/nf-core/rundbcan/easycgc/tests/main.nf.test.snap b/modules/nf-core/rundbcan/easycgc/tests/main.nf.test.snap new file mode 100644 index 00000000..664f7670 --- /dev/null +++ 
b/modules/nf-core/rundbcan/easycgc/tests/main.nf.test.snap @@ -0,0 +1,334 @@ +{ + "easycgc - simplified": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_overview.tsv:md5,73bd9acee752d61e096370d4fedfee54" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test_dbCAN_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test_dbCANsub_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test_diamond.out:md5,c935cda6778ca2b6aaaa4362b6f24d84" + ] + ], + "4": [ + [ + { + "id": "test" + }, + "test_cgc.gff:md5,cb1bd08c0276b4a0a37540032863a0ac" + ] + ], + "5": [ + [ + { + "id": "test" + }, + "test_cgc_standard_out.tsv:md5,6be9ab29b289ff46cc6e4b6fe48dc3d7" + ] + ], + "6": [ + [ + { + "id": "test" + }, + "test_diamond.out.tc:md5,4b9747475aaf438eede8556832060f83" + ] + ], + "7": [ + [ + { + "id": "test" + }, + "test_TF_hmm_results.tsv:md5,f63f2b2b3c4439304fa1c20c19cf1e99" + ] + ], + "8": [ + [ + { + "id": "test" + }, + "test_STP_hmm_results.tsv:md5,e03c881b99bb7c7637ed600324816b5c" + ] + ], + "9": [ + "versions.yml:md5,98440d2e11ce6a66cf63395467603bb9" + ], + "cazyme_annotation": [ + [ + { + "id": "test" + }, + "test_overview.tsv:md5,73bd9acee752d61e096370d4fedfee54" + ] + ], + "cgc_gff": [ + [ + { + "id": "test" + }, + "test_cgc.gff:md5,cb1bd08c0276b4a0a37540032863a0ac" + ] + ], + "cgc_standard_out": [ + [ + { + "id": "test" + }, + "test_cgc_standard_out.tsv:md5,6be9ab29b289ff46cc6e4b6fe48dc3d7" + ] + ], + "dbcandiamond_results": [ + [ + { + "id": "test" + }, + "test_diamond.out:md5,c935cda6778ca2b6aaaa4362b6f24d84" + ] + ], + "dbcanhmm_results": [ + [ + { + "id": "test" + }, + "test_dbCAN_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + "dbcansub_results": [ + [ + { + "id": "test" + }, + "test_dbCANsub_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + "diamond_out_tc": [ + [ + { + "id": "test" + }, + 
"test_diamond.out.tc:md5,4b9747475aaf438eede8556832060f83" + ] + ], + "stp_hmm_results": [ + [ + { + "id": "test" + }, + "test_STP_hmm_results.tsv:md5,e03c881b99bb7c7637ed600324816b5c" + ] + ], + "tf_hmm_results": [ + [ + { + "id": "test" + }, + "test_TF_hmm_results.tsv:md5,f63f2b2b3c4439304fa1c20c19cf1e99" + ] + ], + "versions": [ + "versions.yml:md5,98440d2e11ce6a66cf63395467603bb9" + ] + }, + { + "RUNDBCAN_EASYCGC": { + "dbcan": "5.1.2" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-12T17:03:26.553650524" + }, + "easycgc - stub": { + "content": [ + { + "0": [ + [ + { + "id": "stub" + }, + "stub_overview.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "stub" + }, + "stub_dbCAN_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "stub" + }, + "stub_dbCANsub_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "stub" + }, + "stub_diamond.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + [ + { + "id": "stub" + }, + "stub_cgc.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "5": [ + [ + { + "id": "stub" + }, + "stub_cgc_standard_out.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + [ + { + "id": "stub" + }, + "stub_diamond.out.tc:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "7": [ + [ + { + "id": "stub" + }, + "stub_TF_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "8": [ + [ + { + "id": "stub" + }, + "stub_STP_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "9": [ + "versions.yml:md5,98440d2e11ce6a66cf63395467603bb9" + ], + "cazyme_annotation": [ + [ + { + "id": "stub" + }, + "stub_overview.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "cgc_gff": [ + [ + { + "id": "stub" + }, + "stub_cgc.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "cgc_standard_out": [ + [ + { + "id": "stub" + }, + 
"stub_cgc_standard_out.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcandiamond_results": [ + [ + { + "id": "stub" + }, + "stub_diamond.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcanhmm_results": [ + [ + { + "id": "stub" + }, + "stub_dbCAN_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcansub_results": [ + [ + { + "id": "stub" + }, + "stub_dbCANsub_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "diamond_out_tc": [ + [ + { + "id": "stub" + }, + "stub_diamond.out.tc:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "stp_hmm_results": [ + [ + { + "id": "stub" + }, + "stub_STP_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tf_hmm_results": [ + [ + { + "id": "stub" + }, + "stub_TF_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,98440d2e11ce6a66cf63395467603bb9" + ] + }, + { + "RUNDBCAN_EASYCGC": { + "dbcan": "5.1.2" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-12T17:03:30.257931353" + } +} \ No newline at end of file diff --git a/modules/nf-core/rundbcan/easysubstrate/environment.yml b/modules/nf-core/rundbcan/easysubstrate/environment.yml new file mode 100644 index 00000000..6d9a56ae --- /dev/null +++ b/modules/nf-core/rundbcan/easysubstrate/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - bioconda + - conda-forge +dependencies: + - bioconda::dbcan=5.1.2 diff --git a/modules/nf-core/rundbcan/easysubstrate/main.nf b/modules/nf-core/rundbcan/easysubstrate/main.nf new file mode 100644 index 00000000..442debfc --- /dev/null +++ b/modules/nf-core/rundbcan/easysubstrate/main.nf @@ -0,0 +1,92 @@ +process RUNDBCAN_EASYSUBSTRATE { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && 
!task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/dbcan:5.1.2--pyhdfd78af_0' : + 'biocontainers/dbcan:5.1.2--pyhdfd78af_0' }" + + input: + tuple val(meta), path(input_raw_data) + tuple val(meta2), path(input_gff), val(gff_type) + path dbcan_db + + output: + tuple val(meta), path("${prefix}_overview.tsv") , emit: cazyme_annotation + tuple val(meta), path("${prefix}_dbCAN_hmm_results.tsv") , emit: dbcanhmm_results + tuple val(meta), path("${prefix}_dbCANsub_hmm_results.tsv"), emit: dbcansub_results + tuple val(meta), path("${prefix}_diamond.out") , emit: dbcandiamond_results + tuple val(meta), path("${prefix}_cgc.gff") , emit: cgc_gff + tuple val(meta), path("${prefix}_cgc_standard_out.tsv") , emit: cgc_standard_out + tuple val(meta), path("${prefix}_diamond.out.tc") , emit: diamond_out_tc + tuple val(meta), path("${prefix}_TF_hmm_results.tsv") , emit: tf_hmm_results + tuple val(meta), path("${prefix}_STP_hmm_results.tsv") , emit: stp_hmm_results + tuple val(meta), path("${prefix}_total_cgc_info.tsv") , emit: total_cgc_info + tuple val(meta), path("${prefix}_substrate_prediction.tsv"), emit: substrate_prediction + tuple val(meta), path("${prefix}_synteny_pdf/") , emit: synteny_pdf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + + run_dbcan easy_substrate \\ + --mode protein \\ + --db_dir ${dbcan_db} \\ + --input_raw_data ${input_raw_data} \\ + --output_dir . 
\\ + --input_gff ${input_gff} \\ + --gff_type ${gff_type} \\ + ${args} + + mv overview.tsv ${prefix}_overview.tsv + mv dbCAN_hmm_results.tsv ${prefix}_dbCAN_hmm_results.tsv + mv dbCANsub_hmm_results.tsv ${prefix}_dbCANsub_hmm_results.tsv + mv diamond.out ${prefix}_diamond.out + mv cgc.gff ${prefix}_cgc.gff + mv cgc_standard_out.tsv ${prefix}_cgc_standard_out.tsv + mv diamond.out.tc ${prefix}_diamond.out.tc + mv TF_hmm_results.tsv ${prefix}_TF_hmm_results.tsv + mv STP_hmm_results.tsv ${prefix}_STP_hmm_results.tsv + mv total_cgc_info.tsv ${prefix}_total_cgc_info.tsv + mv CGC.faa ${prefix}_CGC.faa + mv PUL_blast.out ${prefix}_PUL_blast.out + mv substrate_prediction.tsv ${prefix}_substrate_prediction.tsv + mv synteny_pdf/ ${prefix}_synteny_pdf/ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dbcan: \$(echo \$(run_dbcan version) | cut -f2 -d':' | cut -f2 -d' ') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_overview.tsv + touch ${prefix}_dbCAN_hmm_results.tsv + touch ${prefix}_dbCANsub_hmm_results.tsv + touch ${prefix}_diamond.out + touch ${prefix}_cgc.gff + touch ${prefix}_cgc_standard_out.tsv + touch ${prefix}_diamond.out.tc + touch ${prefix}_TF_hmm_results.tsv + touch ${prefix}_STP_hmm_results.tsv + touch ${prefix}_total_cgc_info.tsv + touch ${prefix}_CGC.faa + touch ${prefix}_PUL_blast.out + touch ${prefix}_substrate_prediction.tsv + mkdir -p ${prefix}_synteny_pdf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dbcan: \$(echo \$(run_dbcan version) | cut -f2 -d':' | cut -f2 -d' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/rundbcan/easysubstrate/meta.yml b/modules/nf-core/rundbcan/easysubstrate/meta.yml new file mode 100644 index 00000000..32672c50 --- /dev/null +++ b/modules/nf-core/rundbcan/easysubstrate/meta.yml @@ -0,0 +1,181 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json 
+name: "rundbcan_easysubstrate" +description: Substrate annotation module for the dbcan pipeline. This module is used + to annotate carbohydrate-active enzymes (CAZymes) from genomic data using the dbCAN + annotation tool. +keywords: + - dbCAN + - download + - CAZyme + - CAZyme gene Cluster + - genomes +tools: + - "dbcan": + description: "Standalone version of dbCAN annotation tool for automated CAZyme + annotation." + homepage: "https://bcb.unl.edu/dbCAN2/" + documentation: "https://run-dbcan.readthedocs.io/en/latest/" + tool_dev_url: "https://github.com/bcb-unl/run_dbcan" + doi: "10.1093/nar/gkad328" + licence: ["GPL v3-or-later"] + identifier: biotools:dbcan + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - input_raw_data: + type: file + description: FASTA file for protein sequences. + pattern: "*.{fasta,fa,faa}" + ontologies: + - edam: "http://edamontology.org/data_2044" # Sequence + - edam: "http://edamontology.org/format_1929" # FASTA + - - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - input_gff: + type: file + description: GFF file for protein sequences. + - gff_type: + type: string + description: | + Type of GFF file. Options are `NCBI_prok`, `JGI`, `NCBI_euk`, and `prodigal`. This is used to parse the GFF file correctly. + - - dbcan_db: + type: directory + description: Path to the dbCAN database directory. + +output: + - cazyme_annotation: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_overview.tsv: + type: file + description: | + TSV file containing the results of dbCAN CAZyme annotation. + - dbcanhmm_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1']` + - ${prefix}_dbCAN_hmm_results.tsv: + type: file + description: | + TSV file containing the detailed dbCAN HMM results for CAZyme annotation. + - dbcansub_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_dbCANsub_hmm_results.tsv: + type: file + description: | + TSV file containing the detailed dbCAN subfamily results for CAZyme annotation. + - dbcandiamond_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_diamond.out: + type: file + description: | + TSV file containing the detailed dbCAN diamond results for CAZyme annotation. + - cgc_gff: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_cgc.gff: + type: file + description: | + GFF file containing the CAZyme gene clusters (CGC) identified by dbCAN. This file is generated from the dbCAN annotation and contains the locations of CAZyme gene clusters in the genome. + - cgc_standard_out: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_cgc_standard_out.tsv: + type: file + description: | + Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. This file summarizes the CAZyme gene clusters identified in the genome. + - diamond_out_tc: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_diamond.out.tc: + type: file + description: | + TSV file containing the diamond output for transporter annotation. + - tf_hmm_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_TF_hmm_results.tsv: + type: file + description: | + TSV file containing the HMM search results for transcription factor (TF) annotation.
+ - stp_hmm_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_STP_hmm_results.tsv: + type: file + description: | + TSV file containing the HMM search results for signal transduction protein (STP) annotation. + - total_cgc_info: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_total_cgc_info.tsv: + type: file + description: | + TSV file summarizing the total additional genes in the genome. + - substrate_prediction: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_substrate_prediction.tsv: + type: file + description: | + TSV file containing the substrate predictions based on the CGC annotations from dbCAN. + - synteny_pdf: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_synteny_pdf/: + type: directory + description: | + Directory containing the synteny plots in PDF format for the CAZyme gene clusters (CGC) identified by dbCAN. This directory will contain one or more PDF files showing the syntenic regions of the CGC in the genome.
+ - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Xinpeng021001" +maintainers: + - "@Xinpeng021001" diff --git a/modules/nf-core/rundbcan/easysubstrate/tests/main.nf.test b/modules/nf-core/rundbcan/easysubstrate/tests/main.nf.test new file mode 100644 index 00000000..b6c016a1 --- /dev/null +++ b/modules/nf-core/rundbcan/easysubstrate/tests/main.nf.test @@ -0,0 +1,90 @@ +// nf-core modules test rundbcan/easysubstrate +nextflow_process { + + name "Test Process RUNDBCAN_EASYSUBSTRATE" + script "../main.nf" + process "RUNDBCAN_EASYSUBSTRATE" + + tag "modules" + tag "modules_nfcore" + tag "rundbcan" + tag "rundbcan/database" + tag "rundbcan/easysubstrate" + + test("easysubstrate - simplified") { + + setup { + run("RUNDBCAN_DATABASE"){ + script "../../database/main.nf" + process { + """ + """ + } + } + } + + when { + process { + """ + input[0] = [ + [id:'test'], + file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/proteome.fasta', checkIfExists: true) + ] + input[1] = [ + [id:'test'], + file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/gff/test1.gff', checkIfExists: true), + "prodigal" + ] + input[2] = RUNDBCAN_DATABASE.out.dbcan_db + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + process.out.cazyme_annotation, + process.out.dbcanhmm_results, + process.out.dbcansub_results, + process.out.dbcandiamond_results, + process.out.cgc_gff, + process.out.cgc_standard_out, + process.out.diamond_out_tc, + process.out.tf_hmm_results, + process.out.stp_hmm_results, + process.out.total_cgc_info, + process.out.versions, + path(process.out.versions[0]).yaml + ).match() + } + ) + } + } + + test("easysubstrate - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [[id: 'stub'], file('stub')] + input[1] = [[id: 'stub'], file('stub.gff'), "prodigal"] + input[2] =
file('stub_db') + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + process.out, + path(process.out.versions[0]).yaml + ).match() + } + ) + } + } +} diff --git a/modules/nf-core/rundbcan/easysubstrate/tests/main.nf.test.snap b/modules/nf-core/rundbcan/easysubstrate/tests/main.nf.test.snap new file mode 100644 index 00000000..936e9472 --- /dev/null +++ b/modules/nf-core/rundbcan/easysubstrate/tests/main.nf.test.snap @@ -0,0 +1,317 @@ +{ + "easysubstrate - stub": { + "content": [ + { + "0": [ + [ + { + "id": "stub" + }, + "stub_overview.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "stub" + }, + "stub_dbCAN_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "10": [ + [ + { + "id": "stub" + }, + "stub_substrate_prediction.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "11": [ + [ + { + "id": "stub" + }, + [ + + ] + ] + ], + "12": [ + "versions.yml:md5,d5dd0946a8485d35c9593ca672a9387c" + ], + "2": [ + [ + { + "id": "stub" + }, + "stub_dbCANsub_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "stub" + }, + "stub_diamond.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + [ + { + "id": "stub" + }, + "stub_cgc.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "5": [ + [ + { + "id": "stub" + }, + "stub_cgc_standard_out.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + [ + { + "id": "stub" + }, + "stub_diamond.out.tc:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "7": [ + [ + { + "id": "stub" + }, + "stub_TF_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "8": [ + [ + { + "id": "stub" + }, + "stub_STP_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "9": [ + [ + { + "id": "stub" + }, + "stub_total_cgc_info.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "cazyme_annotation": [ + [ + { + "id": "stub" + }, + "stub_overview.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "cgc_gff": [ + [ 
+ { + "id": "stub" + }, + "stub_cgc.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "cgc_standard_out": [ + [ + { + "id": "stub" + }, + "stub_cgc_standard_out.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcandiamond_results": [ + [ + { + "id": "stub" + }, + "stub_diamond.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcanhmm_results": [ + [ + { + "id": "stub" + }, + "stub_dbCAN_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcansub_results": [ + [ + { + "id": "stub" + }, + "stub_dbCANsub_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "diamond_out_tc": [ + [ + { + "id": "stub" + }, + "stub_diamond.out.tc:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "stp_hmm_results": [ + [ + { + "id": "stub" + }, + "stub_STP_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "substrate_prediction": [ + [ + { + "id": "stub" + }, + "stub_substrate_prediction.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "synteny_pdf": [ + [ + { + "id": "stub" + }, + [ + + ] + ] + ], + "tf_hmm_results": [ + [ + { + "id": "stub" + }, + "stub_TF_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "total_cgc_info": [ + [ + { + "id": "stub" + }, + "stub_total_cgc_info.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,d5dd0946a8485d35c9593ca672a9387c" + ] + }, + { + "RUNDBCAN_EASYSUBSTRATE": { + "dbcan": "5.1.2" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-12T17:05:25.261676793" + }, + "easysubstrate - simplified": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_overview.tsv:md5,73bd9acee752d61e096370d4fedfee54" + ] + ], + [ + [ + { + "id": "test" + }, + "test_dbCAN_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + [ + [ + { + "id": "test" + }, + "test_dbCANsub_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + [ + [ + { + "id": "test" + }, + 
"test_diamond.out:md5,c935cda6778ca2b6aaaa4362b6f24d84" + ] + ], + [ + [ + { + "id": "test" + }, + "test_cgc.gff:md5,cb1bd08c0276b4a0a37540032863a0ac" + ] + ], + [ + [ + { + "id": "test" + }, + "test_cgc_standard_out.tsv:md5,6be9ab29b289ff46cc6e4b6fe48dc3d7" + ] + ], + [ + [ + { + "id": "test" + }, + "test_diamond.out.tc:md5,4b9747475aaf438eede8556832060f83" + ] + ], + [ + [ + { + "id": "test" + }, + "test_TF_hmm_results.tsv:md5,f63f2b2b3c4439304fa1c20c19cf1e99" + ] + ], + [ + [ + { + "id": "test" + }, + "test_STP_hmm_results.tsv:md5,e03c881b99bb7c7637ed600324816b5c" + ] + ], + [ + [ + { + "id": "test" + }, + "test_total_cgc_info.tsv:md5,0b1411698abea697723acd7be2ff03a7" + ] + ], + [ + "versions.yml:md5,d5dd0946a8485d35c9593ca672a9387c" + ], + { + "RUNDBCAN_EASYSUBSTRATE": { + "dbcan": "5.1.2" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-12T17:05:21.376354173" + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 9c2b60fb..de530c14 100644 --- a/nextflow.config +++ b/nextflow.config @@ -257,6 +257,15 @@ params { bgc_hmmsearch_savetargets = false bgc_hmmsearch_savedomains = false + // RUNDBCAN options + run_cazyme_screening = false + + cazyme_skip_dbcan = false + cazyme_dbcan_db = null + + dbcan_skip_cgc = false + dbcan_skip_substrate = false + // Schema validation default options validate_params = true } @@ -413,6 +422,12 @@ profiles { test_preannotated_bgc { includeConfig 'conf/test_preannotated_bgc.config' } + test_cazyme_pyrodigal { + includeConfig 'conf/test_cazyme_pyrodigal.config' + } + test_preannotated_dbcan { + includeConfig 'conf/test_preannotated_dbcan.config' + } } // Load nf-core custom profiles from different institutions diff --git a/nextflow_schema.json b/nextflow_schema.json index 4c70e28d..c2c3f76e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -63,6 +63,11 @@ "type": "boolean", "description": "Activate biosynthetic gene cluster screening 
tools.", "fa_icon": "fas fa-check-circle" + }, + "run_cazyme_screening": { + "type": "boolean", + "description": "Activate CAZyme and CAZyme gene cluster screening tools.", + "fa_icon": "fas fa-check-circle" } }, "fa_icon": "fa fa-list" @@ -1457,6 +1462,35 @@ }, "fa_icon": "fas fa-angle-double-right" }, + "cazyme_dbcan": { + "title": "dbCAN", + "type": "object", + "description": "Carbohydrate-active enzyme annotation based on pre-defined HMM models.\n\nFor more information check the dbCAN [documentation](https://run-dbcan.readthedocs.io/en/latest)", + "default": "", + "properties": { + "cazyme_skip_dbcan": { + "type": "boolean", + "description": "Skip dbCAN during the CAZyme screening.", + "fa_icon": "fas fa-ban" + }, + "cazyme_dbcan_db": { + "type": "string", + "fa_icon": "fas fa-database", + "description": "Path to local dbCAN database folder.", + "help_text": "For more information on preparing the dbCAN database, refer to the [documentation](https://run-dbcan.readthedocs.io/en/latest/user_guide/prepare_the_database.html)."
+ }, + "dbcan_skip_cgc": { + "type": "boolean", + "description": "Skip CAZyme gene cluster (CGC) annotation during the dbCAN screening.", + "fa_icon": "fas fa-ban" + }, + "dbcan_skip_substrate": { + "type": "boolean", + "description": "Skip substrate prediction during the dbCAN screening.", + "fa_icon": "fas fa-ban" + } + } + }, "institutional_config_options": { "title": "Institutional config options", "type": "object", @@ -1706,6 +1740,9 @@ { "$ref": "#/$defs/bgc_hmmsearch" }, + { + "$ref": "#/$defs/cazyme_dbcan" + }, { "$ref": "#/$defs/institutional_config_options" }, diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf index a59fe561..2d990958 100644 --- a/subworkflows/local/annotation.nf +++ b/subworkflows/local/annotation.nf @@ -3,16 +3,20 @@ */ include { PROKKA } from '../../modules/nf-core/prokka/main' -include { PRODIGAL } from '../../modules/nf-core/prodigal/main' -include { PYRODIGAL } from '../../modules/nf-core/pyrodigal/main' +include { PRODIGAL as PRODIGAL_GBK } from '../../modules/nf-core/prodigal/main' +include { PRODIGAL as PRODIGAL_GFF } from '../../modules/nf-core/prodigal/main' +include { PYRODIGAL as PYRODIGAL_GBK } from '../../modules/nf-core/pyrodigal/main' +include { PYRODIGAL as PYRODIGAL_GFF } from '../../modules/nf-core/pyrodigal/main' include { BAKTA_BAKTADBDOWNLOAD } from '../../modules/nf-core/bakta/baktadbdownload/main' include { BAKTA_BAKTA } from '../../modules/nf-core/bakta/bakta/main' include { GUNZIP as GUNZIP_PRODIGAL_FNA } from '../../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PRODIGAL_FAA } from '../../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PRODIGAL_GBK } from '../../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_PRODIGAL_GFF } from '../../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PYRODIGAL_FNA } from '../../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PYRODIGAL_FAA } from '../../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PYRODIGAL_GBK } from
'../../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_PYRODIGAL_GFF } from '../../modules/nf-core/gunzip/main' workflow ANNOTATION { take: @@ -32,31 +36,41 @@ workflow ANNOTATION { log.warn("[nf-core/funcscan] Switching annotation tool to: Pyrodigal. This is because Prodigal annotations (in GBK format) are incompatible with AMPcombi. If you specifically wish to run Prodigal instead, please skip AMP workflow or provide a pre-annotated GBK file in the samplesheet.") } - PYRODIGAL(fasta, "gbk") - GUNZIP_PYRODIGAL_FAA(PYRODIGAL.out.faa) - GUNZIP_PYRODIGAL_FNA(PYRODIGAL.out.fna) - GUNZIP_PYRODIGAL_GBK(PYRODIGAL.out.annotations) - ch_versions = ch_versions.mix(PYRODIGAL.out.versions) + PYRODIGAL_GBK(fasta, "gbk") + PYRODIGAL_GFF(fasta, "gff") + GUNZIP_PYRODIGAL_FAA(PYRODIGAL_GBK.out.faa) + GUNZIP_PYRODIGAL_FNA(PYRODIGAL_GBK.out.fna) + GUNZIP_PYRODIGAL_GBK(PYRODIGAL_GBK.out.annotations) + GUNZIP_PYRODIGAL_GFF(PYRODIGAL_GFF.out.annotations) + ch_versions = ch_versions.mix(PYRODIGAL_GBK.out.versions) + ch_versions = ch_versions.mix(PYRODIGAL_GFF.out.versions) ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_FAA.out.versions) ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_FNA.out.versions) ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_GBK.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_GFF.out.versions) ch_annotation_faa = GUNZIP_PYRODIGAL_FAA.out.gunzip ch_annotation_fna = GUNZIP_PYRODIGAL_FNA.out.gunzip ch_annotation_gbk = GUNZIP_PYRODIGAL_GBK.out.gunzip + ch_annotation_gff = GUNZIP_PYRODIGAL_GFF.out.gunzip } else if (params.annotation_tool == "prodigal") { - PRODIGAL(fasta, "gbk") - GUNZIP_PRODIGAL_FAA(PRODIGAL.out.amino_acid_fasta) - GUNZIP_PRODIGAL_FNA(PRODIGAL.out.nucleotide_fasta) - GUNZIP_PRODIGAL_GBK(PRODIGAL.out.gene_annotations) - ch_versions = ch_versions.mix(PRODIGAL.out.versions) + PRODIGAL_GBK(fasta, "gbk") + PRODIGAL_GFF(fasta, "gff") + GUNZIP_PRODIGAL_FAA(PRODIGAL_GBK.out.amino_acid_fasta) + 
GUNZIP_PRODIGAL_FNA(PRODIGAL_GBK.out.nucleotide_fasta) + GUNZIP_PRODIGAL_GBK(PRODIGAL_GBK.out.gene_annotations) + GUNZIP_PRODIGAL_GFF(PRODIGAL_GFF.out.gene_annotations) + ch_versions = ch_versions.mix(PRODIGAL_GBK.out.versions) + ch_versions = ch_versions.mix(PRODIGAL_GFF.out.versions) ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_FAA.out.versions) ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_FNA.out.versions) ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_GBK.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_GFF.out.versions) ch_annotation_faa = GUNZIP_PRODIGAL_FAA.out.gunzip ch_annotation_fna = GUNZIP_PRODIGAL_FNA.out.gunzip ch_annotation_gbk = GUNZIP_PRODIGAL_GBK.out.gunzip + ch_annotation_gff = GUNZIP_PRODIGAL_GFF.out.gunzip } else if (params.annotation_tool == "prokka") { @@ -66,6 +80,7 @@ workflow ANNOTATION { ch_annotation_faa = PROKKA.out.faa ch_annotation_fna = PROKKA.out.fna ch_annotation_gbk = PROKKA.out.gbk + ch_annotation_gff = PROKKA.out.gff } else if (params.annotation_tool == "bakta") { @@ -87,6 +102,7 @@ workflow ANNOTATION { ch_annotation_faa = BAKTA_BAKTA.out.faa ch_annotation_fna = BAKTA_BAKTA.out.fna ch_annotation_gbk = BAKTA_BAKTA.out.gbff + ch_annotation_gff = BAKTA_BAKTA.out.gff } emit: @@ -95,4 +111,5 @@ workflow ANNOTATION { faa = ch_annotation_faa // [ [meta], path(faa) ] fna = ch_annotation_fna // [ [meta], path(fna) ] gbk = ch_annotation_gbk // [ [meta], path(gbk) ] + gff = ch_annotation_gff // [ [meta], path(gff) ] } diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index 6bdc2881..b1157f81 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -25,7 +25,7 @@ workflow BGC { ch_bgcresults_for_combgc = Channel.empty() // When adding new tool that requires FAA, make sure to update conditions - // in funcscan.nf around annotation and AMP subworkflow execution + // in funcscan.nf around annotation and BGC subworkflow execution // to ensure annotation is executed! 
ch_faa_for_bgc_hmmsearch = faas diff --git a/subworkflows/local/cazyme.nf b/subworkflows/local/cazyme.nf new file mode 100644 index 00000000..b957ea29 --- /dev/null +++ b/subworkflows/local/cazyme.nf @@ -0,0 +1,86 @@ +/* + Run dbCAN (run_dbcan) screening: CAZyme annotation, CAZyme gene cluster (CGC) annotation and substrate annotation +*/ + +include { RUNDBCAN_DATABASE } from '../../modules/nf-core/rundbcan/database/main' +include { RUNDBCAN_CAZYMEANNOTATION } from '../../modules/nf-core/rundbcan/cazymeannotation/main' +include { RUNDBCAN_EASYCGC } from '../../modules/nf-core/rundbcan/easycgc/main' +include { RUNDBCAN_EASYSUBSTRATE } from '../../modules/nf-core/rundbcan/easysubstrate/main' + + +workflow CAZYME { + + take: + faas // [ [meta], path(faa) ] protein FASTA per sample + gffs // [ [meta], path(gff) ] per sample; meta.gff_type must be set or the CGC/substrate steps skip the sample + + main: + + ch_versions = Channel.empty() + + // When adding new tool that requires FAA, make sure to update conditions + // in funcscan.nf around annotation and dbCAN subworkflow execution + // to ensure annotation is executed!
+ ch_faas_for_rundbcan = faas + ch_gffs_for_rundbcan = gffs + + // Prepare database channel: use the local folder from params.cazyme_dbcan_db if set, otherwise download it + if (!params.cazyme_skip_dbcan && params.cazyme_dbcan_db) { + ch_dbcan_db = Channel + .fromPath(params.cazyme_dbcan_db, checkIfExists: true) + .first() + } + else if (!params.cazyme_skip_dbcan && !params.cazyme_dbcan_db) { + // Download dbCAN database + RUNDBCAN_DATABASE () + ch_versions = ch_versions.mix(RUNDBCAN_DATABASE.out.versions) + ch_dbcan_db = RUNDBCAN_DATABASE.out.dbcan_db + } + + if (!params.cazyme_skip_dbcan) { + // CAZyme annotation + RUNDBCAN_CAZYMEANNOTATION (ch_faas_for_rundbcan, ch_dbcan_db) + ch_versions = ch_versions.mix(RUNDBCAN_CAZYMEANNOTATION.out.versions) + + // Prepare input for dbCAN CGC and substrate annotation; samples without a GFF or meta.gff_type are dropped with a warning + if ( !params.dbcan_skip_cgc || !params.dbcan_skip_substrate ) { + ch_input_for_dbcan = ch_faas_for_rundbcan + .join(ch_gffs_for_rundbcan) + .filter { meta, faa, gff -> + if (!gff || !meta.gff_type) { + log.warn "Skipping sample: ${meta.id ?: 'unknown'} for dbcan cgc and substrate annotation due to empty gff or gff_type" + return false + } + return true + } + .multiMap { meta, faa, gff -> + faa: [meta, faa] + gff: [meta, gff, meta.gff_type] + } + + // CGC annotation + if ( !params.dbcan_skip_cgc ) { + RUNDBCAN_EASYCGC ( + ch_input_for_dbcan.faa, + ch_input_for_dbcan.gff, + ch_dbcan_db + ) + ch_versions = ch_versions.mix(RUNDBCAN_EASYCGC.out.versions) + } + + + // Substrate annotation + if ( !params.dbcan_skip_substrate ) { + RUNDBCAN_EASYSUBSTRATE ( + ch_input_for_dbcan.faa, + ch_input_for_dbcan.gff, + ch_dbcan_db + ) + ch_versions = ch_versions.mix(RUNDBCAN_EASYSUBSTRATE.out.versions) + } + } + } + + emit: + versions = ch_versions +} diff --git a/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf b/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf index 929b5cca..b67f5585 100644 --- 
a/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf @@ -226,6
+226,11 @@ def toolCitationText() { ". The output from the biosynthetic gene cluster screening tools were standardised and summarised with comBGC (Frangenberg et al. 2023).", ].join(' ').replaceAll(', +.', ".").trim() + def cazyme_text = [ + "The following carbohydrate-active enzymes (CAZymes) screening tools were used:", + !params.cazyme_skip_dbcan ? "dbCAN3 (Zheng et al. 2023)." : "", // only tool in this list, so the entry itself closes the sentence + ].join(' ').replaceAll(', +.', ".").trim() + def postprocessing_text = "Run statistics were reported using MultiQC (Ewels et al. 2016)." def citation_text = [ @@ -234,6 +239,7 @@ def toolCitationText() { params.run_amp_screening ? amp_text : "", params.run_arg_screening ? arg_text : "", params.run_bgc_screening ? bgc_text : "", + params.run_cazyme_screening ? cazyme_text : "", postprocessing_text, ].join(' ').trim() @@ -277,6 +283,10 @@ def toolBibliographyText() { '
  • Frangenberg, J. Fellows Yates, J. A., Ibrahim, A., Perelo, L., & Beber, M. E. (2023). nf-core/funcscan: 1.0.0 - German Rollmops - 2023-02-15. https://doi.org/10.5281/zenodo.7643100
  • ', ].join(' ').replaceAll(', +.', ".").trim() + def cazyme_text = [ + !params.cazyme_skip_dbcan ? '
  • Jinfang Zheng, Qiwei Ge, Yuchen Yan, Xinpeng Zhang, Le Huang, Yanbin Yin, dbCAN3: automated carbohydrate-active enzyme and substrate annotation, Nucleic Acids Research, Volume 51, Issue W1, 5 July 2023, Pages W115–W121. DOI: 10.1093/nar/gkad328
  • ' : "" + ].join(' ').replaceAll(', +.', ".").trim() + def postprocessing_text = '
  • Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. https://doi.org/10.1093/bioinformatics/btw354
  • ' // Special as reused in multiple subworkflows, and we don't want to cause duplicates @@ -288,6 +298,7 @@ def toolBibliographyText() { params.run_amp_screening ? amp_text : "", params.run_arg_screening ? arg_text : "", params.run_bgc_screening ? bgc_text : "", + params.run_cazyme_screening ? cazyme_text : "", hmmsearch_text, postprocessing_text, ].join(' ').trim() diff --git a/tests/default.nf.test b/tests/default.nf.test index 2d484c10..e572530a 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ -114,7 +114,17 @@ nextflow_pipeline { { assert path("$outputDir/arg/fargene/sample_2/sample_2-class_b_1_2.log").text.contains("fARGene is done.") }, // hAMRonization - { assert snapshot(path("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv")).match("hamronization_summarize") } + { assert snapshot(path("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv")).match("hamronization_summarize") }, + + // dbCAN + { assert path("$outputDir/cazyme/dbcan/cazyme_annotation/sample_2/sample_2_overview.tsv").text.contains("dbCAN_hmm") }, + { assert path("$outputDir/cazyme/dbcan/cgc/sample_2/sample_2_cgc_standard_out.tsv").text.contains("CGC#") }, + { assert path("$outputDir/cazyme/dbcan/substrate/sample_2/sample_2_substrate_prediction.tsv").text.contains("#cgcid") }, + { assert snapshot( + path("$outputDir/cazyme/dbcan/cazyme_annotation/sample_2/sample_2_overview.tsv"), + path("$outputDir/cazyme/dbcan/cgc/sample_2/sample_2_cgc_standard_out.tsv"), + path("$outputDir/cazyme/dbcan/substrate/sample_2/sample_2_substrate_prediction.tsv") + ).match('dbcan') } ) } } diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap index 9b2d87cc..e3678fe8 100644 --- a/tests/default.nf.test.snap +++ b/tests/default.nf.test.snap @@ -97,6 +97,9 @@ "GUNZIP_PYRODIGAL_GBK": { "gunzip": 1.13 }, + "GUNZIP_PYRODIGAL_GFF": { + "gunzip": 1.13 + }, "HAMRONIZATION_ABRICATE": { "hamronization": "1.1.9" }, @@ -115,7 +118,10 @@ 
"MACREL_CONTIGS": { "macrel": "1.4.0" }, - "PYRODIGAL": { + "PYRODIGAL_GBK": { + "pyrodigal": "3.6.3" + }, + "PYRODIGAL_GFF": { "pyrodigal": "3.6.3" }, "RGI_CARDANNOTATION": { @@ -136,9 +142,9 @@ ], "meta": { "nf-test": "0.9.2", - "nextflow": "25.04.7" + "nextflow": "25.04.6" }, - "timestamp": "2025-10-04T11:45:22.53155218" + "timestamp": "2025-09-28T00:25:27.627242047" }, "rgi": { "content": [ @@ -199,6 +205,18 @@ }, "timestamp": "2025-06-12T13:50:58.955107983" }, + "dbcan": { + "content": [ + "sample_2_overview.tsv:md5,f1f42b20b6438a6d9cde75415276ded6", + "sample_2_cgc_standard_out.tsv:md5,6be9ab29b289ff46cc6e4b6fe48dc3d7", + "sample_2_substrate_prediction.tsv:md5,fe2a5ea9e19c4f1108798103547ff98d" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-09-28T00:25:27.655500872" + }, "macrel": { "content": [ "sample_1.macrel.smorfs.faa.gz:md5,1b5e2434860e635e95324d1804a3be7b", diff --git a/tests/test_cazyme_pyrodigal.nf.test b/tests/test_cazyme_pyrodigal.nf.test new file mode 100644 index 00000000..3b5f74f7 --- /dev/null +++ b/tests/test_cazyme_pyrodigal.nf.test @@ -0,0 +1,51 @@ +nextflow_pipeline { + + name "Test pipeline: NFCORE_FUNCSCAN" + script "main.nf" + tag "pipeline" + tag "nfcore_funcscan" + tag "test_cazyme_pyrodigal" + profile "test_cazyme_pyrodigal" + + test("-profile test_cazyme_pyrodigal") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + assertAll( + { assert workflow.success }, + { assert new File("$outputDir/pipeline_info/nf_core_funcscan_software_mqc_versions.yml").exists() }, + { assert new File("$outputDir/multiqc/multiqc_report.html").exists() }, + + // dbCAN annotation + { assert path("$outputDir/cazyme/dbcan/cazyme_annotation/sample_2/sample_2_dbCAN_hmm_results.tsv").text.contains("dbCAN") }, + { assert path("$outputDir/cazyme/dbcan/cazyme_annotation/sample_2/sample_2_dbCANsub_hmm_results.tsv").text.contains("dbCAN-sub") }, + { assert 
path("$outputDir/cazyme/dbcan/cazyme_annotation/sample_2/sample_2_diamond.out").exists() }, + { assert path("$outputDir/cazyme/dbcan/cazyme_annotation/sample_2/sample_2_overview.tsv").text.contains("dbCAN_hmm") }, + + // dbCAN cgc + { assert path("$outputDir/cazyme/dbcan/cgc/sample_2/sample_2_cgc.gff").exists() }, + { assert path("$outputDir/cazyme/dbcan/cgc/sample_2/sample_2_cgc_standard_out.tsv").text.contains("CGC#") }, + { assert path("$outputDir/cazyme/dbcan/cgc/sample_2/sample_2_diamond.out.tc").exists() }, + { assert path("$outputDir/cazyme/dbcan/cgc/sample_2/sample_2_STP_hmm_results.tsv").text.contains("HMM") }, + { assert path("$outputDir/cazyme/dbcan/cgc/sample_2/sample_2_TF_hmm_results.tsv").text.contains("HMM") }, + + // dbCAN substrate + { assert path("$outputDir/cazyme/dbcan/substrate/sample_2/sample_2_substrate_prediction.tsv").text.contains("#cgcid") }, + { assert path("$outputDir/cazyme/dbcan/substrate/sample_2/sample_2_synteny_pdf").exists() }, + { assert path("$outputDir/cazyme/dbcan/substrate/sample_2/sample_2_total_cgc_info.tsv").text.contains("Annotate") }, + + // snap shot + { assert snapshot( + path("$outputDir/cazyme/dbcan/cazyme_annotation/sample_2/sample_2_overview.tsv"), + path("$outputDir/cazyme/dbcan/cgc/sample_2/sample_2_cgc_standard_out.tsv"), + path("$outputDir/cazyme/dbcan/substrate/sample_2/sample_2_substrate_prediction.tsv") + ).match('dbcan') } + ) + } + } +} diff --git a/tests/test_cazyme_pyrodigal.nf.test.snap b/tests/test_cazyme_pyrodigal.nf.test.snap new file mode 100644 index 00000000..8a1d5e2c --- /dev/null +++ b/tests/test_cazyme_pyrodigal.nf.test.snap @@ -0,0 +1,14 @@ +{ + "dbcan": { + "content": [ + "sample_2_overview.tsv:md5,f1f42b20b6438a6d9cde75415276ded6", + "sample_2_cgc_standard_out.tsv:md5,6be9ab29b289ff46cc6e4b6fe48dc3d7", + "sample_2_substrate_prediction.tsv:md5,fe2a5ea9e19c4f1108798103547ff98d" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-09-28T00:25:17.456937683" + 
} +} \ No newline at end of file diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index ba8f997a..da2d7310 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -24,6 +24,7 @@ include { PROTEIN_ANNOTATION } from '../subworkflows/local/protein_annota include { AMP } from '../subworkflows/local/amp' include { ARG } from '../subworkflows/local/arg' include { BGC } from '../subworkflows/local/bgc' +include { CAZYME } from '../subworkflows/local/cazyme' include { TAXA_CLASS } from '../subworkflows/local/taxa_class' /* @@ -68,7 +69,7 @@ workflow FUNCSCAN { // Some tools require uncompressed input ch_input_prep = ch_samplesheet - .map { meta, fasta, faa, gbk -> [meta + [category: 'all'], [fasta, faa, gbk]] } + .map { meta, fasta, faa, gbk, gff -> [meta + [category: 'all'], [fasta, faa, gbk, gff]] } .transpose() .branch { compressed: it[1].toString().endsWith('.gz') @@ -86,24 +87,26 @@ workflow FUNCSCAN { .map { meta, files -> def fasta_found = files.find { it.toString().tokenize('.').last().matches('fasta|fas|fna|fa') } def faa_found = files.find { it.toString().endsWith('.faa') } + def gff_found = files.find { it.toString().endsWith('.gff') } def gbk_found = files.find { it.toString().tokenize('.').last().matches('gbk|gbff') } def fasta = fasta_found != null ? fasta_found : [] def faa = faa_found != null ? faa_found : [] + def gff = gff_found != null ? gff_found : [] def gbk = gbk_found != null ? 
gbk_found : [] - [meta, fasta, faa, gbk] + [meta, fasta, faa, gff, gbk] } - .branch { meta, fasta, faa, gbk -> - preannotated: gbk != [] + .branch { meta, fasta, faa, gff, gbk -> + preannotated: gff != [] || gbk != [] fastas: true } // Duplicate and filter the duplicated file for long contigs only for BGC // This is to speed up BGC run and prevent 'no hits found' fails if (params.run_bgc_screening) { - SEQKIT_SEQ_LENGTH(ch_intermediate_input.fastas.map { meta, fasta, faa, gbk -> [meta, fasta] }) + SEQKIT_SEQ_LENGTH(ch_intermediate_input.fastas.map { meta, fasta, faa, gff, gbk -> [meta, fasta] }) ch_input_for_annotation = ch_intermediate_input.fastas - .map { meta, fasta, protein, gbk -> [meta, fasta] } + .map { meta, fasta, protein, gff, gbk -> [meta, fasta] } .mix(SEQKIT_SEQ_LENGTH.out.fastx.map { meta, fasta -> [meta + [category: 'long'], fasta] }) .filter { meta, fasta -> if (fasta != [] && fasta.isEmpty()) { @@ -114,7 +117,7 @@ workflow FUNCSCAN { ch_versions = ch_versions.mix(SEQKIT_SEQ_LENGTH.out.versions) } else { - ch_input_for_annotation = ch_intermediate_input.fastas.map { meta, fasta, protein, gbk -> [meta, fasta] } + ch_input_for_annotation = ch_intermediate_input.fastas.map { meta, fasta, protein, gff, gbk -> [meta, fasta] } } /* @@ -122,12 +125,13 @@ workflow FUNCSCAN { */ // Some tools require annotated FASTAs - if ((params.run_arg_screening && !params.arg_skip_deeparg) || params.run_amp_screening || params.run_bgc_screening) { + if ((params.run_arg_screening && !params.arg_skip_deeparg) || params.run_amp_screening || params.run_bgc_screening || params.run_cazyme_screening) { ANNOTATION(ch_input_for_annotation) ch_versions = ch_versions.mix(ANNOTATION.out.versions) ch_new_annotation = ch_input_for_annotation .join(ANNOTATION.out.faa) + .join(ANNOTATION.out.gff) .join(ANNOTATION.out.gbk) } else { @@ -135,23 +139,35 @@ workflow FUNCSCAN { } // Mix back the preannotated samples with the newly annotated ones - ch_prepped_input = ch_new_annotation - 
.filter { meta, fasta, faa, gbk -> meta.category != 'long' } + ch_new_annotation_short = ch_new_annotation + .filter { meta, fasta, faa, gff, gbk -> meta.category != 'long' } + + // Add gff_type to meta for cazyme screening + if ((params.run_cazyme_screening && !params.cazyme_skip_dbcan && (!params.dbcan_skip_cgc || !params.dbcan_skip_substrate)) && params.annotation_tool in ['pyrodigal', 'prodigal', 'prokka', 'bakta']) { + ch_new_annotation_short.map { meta, fasta, faa, gff, gbk -> + def new_meta = meta + [gff_type: 'prodigal'] // Only Use 'prodigal' as dbcan does not distinguish 'pyrodigal' and 'prodigal' + [new_meta, fasta, faa, gff, gbk] + }.set { ch_new_annotation_short } + } + + ch_prepped_input = ch_new_annotation_short .mix(ch_intermediate_input.preannotated) - .multiMap { meta, fasta, faa, gbk -> + .multiMap { meta, fasta, faa, gff, gbk -> fastas: [meta, fasta] faas: [meta, faa] + gffs: [meta, gff] gbks: [meta, gbk] } if (params.run_bgc_screening) { ch_prepped_input_long = ch_new_annotation - .filter { meta, fasta, faa, gbk -> meta.category == 'long' } + .filter { meta, fasta, faa, gff, gbk -> meta.category == 'long' } .mix(ch_intermediate_input.preannotated) - .multiMap { meta, fasta, faa, gbk -> + .multiMap { meta, fasta, faa, gff, gbk -> fastas: [meta, fasta] faas: [meta, faa] + gffs: [meta, gff] gbks: [meta, gbk] } } @@ -357,6 +373,22 @@ workflow FUNCSCAN { ch_versions = ch_versions.mix(BGC.out.versions) } + /* + CAZYMEs + */ + if ( params.run_cazyme_screening ) { + CAZYME ( + ch_prepped_input.faas.filter { meta, file -> + if (file != [] && file.isEmpty()) { + log.warn("[nf-core/funcscan] Annotation of following sample produced an empty FAA file. CAZyme screening tools requiring this file will not be executed: ${meta.id}") + } + !file.isEmpty() + }, + ch_prepped_input.gffs + ) + ch_versions = ch_versions.mix(CAZYME.out.versions) // collect dbCAN tool versions for the collated software versions + } + // // Collate and save software versions //