diff --git a/CONTRIBUTORS.yaml b/CONTRIBUTORS.yaml index 42cecf54591a0f..49cb25e1c81c86 100644 --- a/CONTRIBUTORS.yaml +++ b/CONTRIBUTORS.yaml @@ -3291,3 +3291,14 @@ sfragkoul: elixir_node: gr affiliations: - elixir-europe + +minamehr: + name: Mina Hojat Ansari + email: mina.ansari82@gmail.com + orcid: 0000-0002-3602-7884 + matrix: 'mina24:matrix.org' + joined: 2024-03 + elixir_node: de + affiliations: + - uni-freiburg + - elixir-europe diff --git a/topics/microbiome/tutorials/host-removal/tutorial.bib b/topics/microbiome/tutorials/host-removal/tutorial.bib new file mode 100644 index 00000000000000..9ddf83b46f30d7 --- /dev/null +++ b/topics/microbiome/tutorials/host-removal/tutorial.bib @@ -0,0 +1,37 @@ +@article{Ewels2016, + + author = {Ewels, Philip and Magnusson, M{\aa}ns and Lundin, Sverker and K{\"a}ller, Max}, + journal = {Bioinformatics}, + number = {19}, + year = {2016}, + month = {6}, + pages = {3047--3048}, + publisher = {Oxford University Press (OUP)}, + title = {MultiQC: summarize analysis results for multiple tools and samples in a single report}, + volume = {32}, + doi={10.1093/bioinformatics/btw354}, +} +@article{Langmead2009, + author = {Langmead, Ben and Trapnell, Cole and Pop, Mihai and Salzberg, Steven L.}, + title = {Ultrafast and memory-efficient alignment of short DNA sequences to the human genome}, + journal = {Genome Biology}, + year = {2009}, + volume = {10}, + number = {3}, + pages = {R25}, + doi = {10.1186/gb-2009-10-3-r25}, + url = {https://doi.org/10.1186/gb-2009-10-3-r25} +} +@article{Langmead2012, + author = {Langmead, Ben and Salzberg, Steven L.}, + title = {Fast gapped-read alignment with Bowtie 2}, + journal = {Nature Methods}, + year = {2012}, + volume = {9}, + number = {4}, + pages = {357--359}, + doi = {10.1038/nmeth.1923}, + url = {https://doi.org/10.1038/nmeth.1923}, + abstract = {The Bowtie 2 software achieves fast, sensitive, accurate and memory-efficient gapped alignment of sequencing reads 
using the full-text minute index and hardware-accelerated dynamic programming algorithms.}, + issn = {1548-7105} +} \ No newline at end of file diff --git a/topics/microbiome/tutorials/host-removal/tutorial.md b/topics/microbiome/tutorials/host-removal/tutorial.md new file mode 100644 index 00000000000000..baf17c60036572 --- /dev/null +++ b/topics/microbiome/tutorials/host-removal/tutorial.md @@ -0,0 +1,243 @@ +--- +layout: tutorial_hands_on + +title: 'Remove contamination and host reads' +zenodo_link: '' +questions: +- What preprocessing steps are required to obtain cleaned reads for downstream analysis? +- How can we identify and remove contaminant or host-derived reads from raw sequencing data? +objectives: +- Identify reads originating from contaminants or host genomes. +- Remove those reads to produce high-quality, clean metagenomic data suitable for downstream analyses. +- Summarize mapping statistics to assess how many reads were removed and how many remain. +time_estimation: 1H +key_points: +- Identifying and removing contaminant and host reads is a critical preprocessing step in metagenomic workflows. +- Clean reads improve the accuracy of downstream assembly, binning, and taxonomic profiling. +contributions: + authorship: + - minamehr + - bebatut +--- + + +Metagenomic sequencing generates reads from all DNA present in a sample, including the **microbial community**, **host DNA**, and potential **environmental contaminants** (for example: sometimes human sequences introduced during sampling or processing). +Before taxonomic or functional analysis, it is essential to remove reads belonging to the host or other contaminants to avoid misleading results. + +In this tutorial, we will learn how to identify and remove host or contaminant reads using Galaxy. +We will: +- Map raw reads to a **host reference genome** using Bowtie2 and extract unmapped reads. +- Repeat the process with unmapped reads against a **human reference genome** to remove potential human contamination. 
+- Generate a final set of **clean, non-host reads** ready for downstream analyses such as assembly, binning, or profiling. + +To test and illustrate the process, we will use data from .... + + +> +> +> In this tutorial, we will cover: +> +> 1. TOC +> {:toc} +> +{: .agenda} + + +## Prepare Galaxy and data +Any analysis should get its own Galaxy history. So let's start by creating a new one: + +> Data Upload +> +> 1. Create a new history for this tutorial +> +> {% snippet faqs/galaxy/histories_create_new.md %} +> +> 2. Rename the history +> +> {% snippet faqs/galaxy/histories_rename.md %} +> +{: .hands_on} + +Now, we need to import the data: + +> Import datasets +> +> 1. Import the files from [Zenodo]({{ page.zenodo_link }}) or from +> the shared data library (`GTN - Material` -> `{{ page.topic_name }}` +> -> `{{ page.title }}`): +> +> +> {% snippet faqs/galaxy/datasets_import_via_link.md %} +> +> {% snippet faqs/galaxy/datasets_import_from_data_library.md %} +> +> 2. Create a paired collection. +> +> {% snippet faqs/galaxy/collections_build_list_paired.md %} +> +{: .hands_on} + +## Map reads to a host genome with Bowtie2 +To remove host contamination, we start by mapping the reads to the host genome using Bowtie2 to detect and remove host-derived sequences. + +> Remove host reads +> +> +> 1. 
{% tool [Bowtie2](toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/bowtie2/2.5.3+galaxy1) %} with the following parameters: +> - *"Is this single or paired library"*: `Paired-end Dataset Collection` +> - {% icon param-collection %} *"FASTQ Paired Dataset"*: `Input reads` +> - *"Write unaligned reads (in fastq format) to separate file(s)"*: `Yes` +> - *"Do you want to set paired-end options?"*: `No` +> - *"Will you select a reference genome from your history or use a built-in index?"*: `Use a built-in genome index` +> - *"Select reference genome"*: `the target host genome` +> - *"Set read groups information?"*: `Do not set` +> - *"Select analysis mode"*: `1: Default setting only` +> - *"Do you want to tweak SAM/BAM Options?"*: `No` +> - *"Save the bowtie2 mapping statistics to the history"*: `Yes` +> +> +> 2. Run the tool. The outputs will include: +> - Mapping statistics report (`mapping_stats`) +> - Unaligned (unmapped) forward and reverse reads +> +> 3. These unmapped reads represent sequences **not belonging to the host** and will be used in the next step. +> +> > Tip +> > Host reference genomes vary depending on the study organism. You can upload a FASTA file of your host genome if it is not available as a built-in index. +> {: .comment} +> +{: .hands_on} + +> +> +> 1. What percentage of reads mapped to the host genome? +> 2. Why might different datasets show different mapping percentages? +> +> > +> > +> > 1. The mapping rate depends on the host content of the sample. +> > 2. Host DNA contamination varies depending on tissue type, sampling method, and extraction procedure. +> > +> {: .solution} +> +{: .question} + +## Re-pair unmapped reads +We now combine the unmapped forward and reverse reads into a new paired-end dataset for further processing. + +> Combine unmapped forward and reverse reads into a paired collection +> +> 1. 
{% tool [Zip collections](__ZIP_COLLECTION__) %} with the following parameters: +> - {% icon param-file %} *"Input Dataset (Forward)"*: `output_unaligned_reads_l` (output of **Bowtie2** {% icon tool %}) +> - {% icon param-file %} *"Input Dataset (Reverse)"*: `output_unaligned_reads_r` (output of **Bowtie2** {% icon tool %}) +> +> 2. This step creates a new paired-end collection that represents all reads **not aligned to the host genome**. +> +> > Note +> > +> > Zipping restores the normal paired-end structure, which is required for downstream tools or for rerunning the workflow on another reference. +> {: .comment} +> +{: .hands_on} + +> +> +> 1. How many reads remain after host-read removal? +> 2. Why is it important to re-pair the unmapped reads before further analysis? +> +> > +> > +> > 1. The total depends on the dataset; usually 10–50 % of reads remain after host removal. +> > 2. Paired-end structure ensures that downstream tools (e.g. assemblers) correctly interpret forward/reverse relationships. +> > +> {: .solution} +> +{: .question} + +## Summarize mapping statistics +Once the host mapping is complete, we use MultiQC to summarize and visualize the mapping statistics, helping us assess how many reads were removed and how many remain. + +> Evaluate host read removal results +> +> 1. {% tool [MultiQC](toolshed.g2.bx.psu.edu/repos/iuc/multiqc/multiqc/1.27+galaxy3) %} with the following parameters: +> - In *"Results"*: +> - {% icon param-repeat %} *"Insert Results"* +> - *"Which tool was used generate logs?"*: `Bowtie 2` +> - {% icon param-file %} *"Output of Bowtie 2"*: `mapping_stats` (output of **Bowtie2** {% icon tool %}) +> - *"Report title"*: `Host Removal` +> +> 2. Run the tool and open the generated HTML report. +> 3. Review the mapping percentage, number of reads aligned, and number of unmapped reads. +> +> > Tip +> > +> > The mapping percentage is the fraction of input reads that aligned to the host genome and were therefore removed. A low percentage means the sample contained little host DNA to begin with; it does not by itself indicate that removal failed. +> {: .comment} +> +{: .hands_on} + +> +> +> 1. How does the mapping percentage differ between the host and human filtering runs? +> 2. 
What does a low mapping percentage in both runs indicate? +> +> > +> > +> > 1. The human filtering step usually removes only a small additional fraction of reads. +> > 2. Low mapping in both runs means the dataset is now largely free of host and human sequences and ready for downstream analysis. +> > +> {: .solution} +> +{: .question} + +## Remove potential human contamination +After removing host reads, we can run the **same workflow again** to eliminate possible **human contamination** that may remain in the dataset. + +> Rerun the workflow using the human genome as reference +> +> 1. Use the re-paired **unmapped reads** (the paired collection produced by **Zip collections** in the first run) as the input for this second run. +> 2. In the **Bowtie2** step: +> - *"Will you select a reference genome from your history or use a built-in index?"*: `Use a built-in genome index` +> - *"Select reference genome"*: `Human (GRCh38)` +> - Keep all other parameters the same as in the first run. +> +> 3. Continue through the **Zip collections** and **MultiQC** steps as before. +> 4. The output of this second run represents your **final cleaned reads**, free from both host and human sequences. +> +> > Note +> > Rerunning the same workflow maintains reproducibility. +> > Only the reference genome and the input data change between the two runs. +> {: .comment} +> +{: .hands_on} + +> Verify your final dataset +> +> 1. How does the mapping percentage differ between the host and human filtering runs? +> 2. What does a low mapping percentage in both runs indicate? +> +> > +> > +> > 1. The second run (against the human genome) usually removes only a small number of additional reads. +> > 2. Low mapping rates in both runs confirm that the dataset is largely free of host and human contamination. +> > +> {: .solution} +> +{: .question} + + +# Conclusion + +In this tutorial, you learned how to: + +- Identify and remove reads originating from host or contaminant genomes using **Bowtie2**. 
+- Combine unmapped forward and reverse reads into a paired collection for reuse. +- Summarize mapping statistics and verify host-read removal using **MultiQC**. +- Rerun the same workflow with a human reference genome to remove residual human contamination. + +The resulting **clean reads** are now ready for downstream metagenomic analyses such as: +- **Assembly** +- **Binning** +- **Functional or taxonomic profiling** + +These preprocessing steps are essential to ensure accurate microbial community reconstruction without interference from host DNA. \ No newline at end of file diff --git a/topics/microbiome/tutorials/host-removal/workflows/index.md b/topics/microbiome/tutorials/host-removal/workflows/index.md new file mode 100644 index 00000000000000..e092e0ae66ddd4 --- /dev/null +++ b/topics/microbiome/tutorials/host-removal/workflows/index.md @@ -0,0 +1,3 @@ +--- +layout: workflow-list +--- diff --git a/topics/microbiome/tutorials/host-removal/workflows/main_workflow.ga b/topics/microbiome/tutorials/host-removal/workflows/main_workflow.ga new file mode 100644 index 00000000000000..03adb02468b620 --- /dev/null +++ b/topics/microbiome/tutorials/host-removal/workflows/main_workflow.ga @@ -0,0 +1 @@ +{"a_galaxy_workflow": "true", "annotation": "", "comments": [], "creator": [{"class": "Person", "identifier": "0000-0003-2982-388X", "name": "Paul Zierep"}], "format-version": "0.1", "license": "MIT", "name": "Host contamination removal", "report": {"markdown": "\n# Workflow Execution Report\n\n## Workflow Inputs\n```galaxy\ninvocation_inputs()\n```\n\n## Workflow Outputs\n```galaxy\ninvocation_outputs()\n```\n\n## Workflow\n```galaxy\nworkflow_display()\n```\n"}, "steps": {"0": {"annotation": "", "content_id": null, "errors": null, "id": 0, "input_connections": {}, "inputs": [{"description": "", "name": "Input paired fastq "}], "label": "Input paired fastq ", "name": "Input dataset collection", "outputs": [], "position": {"left": 10, "top": 50}, "tool_id": null, 
"tool_state": "{\"optional\": false, \"tag\": null, \"collection_type\": \"list:paired\", \"fields\": null}", "tool_version": null, "type": "data_collection_input", "uuid": "13e4060f-f337-4f44-824f-ee85235fcc8e", "when": null, "workflow_outputs": []}, "1": {"annotation": "", "content_id": null, "errors": null, "id": 1, "input_connections": {}, "inputs": [{"description": "", "name": "Reference Genome Build In"}], "label": "Reference Genome Build In", "name": "Input parameter", "outputs": [], "position": {"left": 0, "top": 240}, "tool_id": null, "tool_state": "{\"multiple\": false, \"validators\": [], \"restrictOnConnections\": true, \"parameter_type\": \"text\", \"optional\": false}", "tool_version": null, "type": "parameter_input", "uuid": "47ad0b2d-0d31-4260-82df-8fed2da6b150", "when": null, "workflow_outputs": []}, "2": {"annotation": "", "content_id": "toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/bowtie2/2.5.3+galaxy1", "errors": null, "id": 2, "input_connections": {"library|input_1": {"id": 0, "output_name": "output"}, "reference_genome|index": {"id": 1, "output_name": "output"}}, "inputs": [{"description": "runtime parameter for tool Bowtie2", "name": "library"}, {"description": "runtime parameter for tool Bowtie2", "name": "reference_genome"}], "label": null, "name": "Bowtie2", "outputs": [{"name": "output_unaligned_reads_l", "type": "fastqsanger"}, {"name": "output_unaligned_reads_r", "type": "fastqsanger"}, {"name": "output", "type": "bam"}, {"name": "mapping_stats", "type": "txt"}], "position": {"left": 570, "top": 10}, "post_job_actions": {}, "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/bowtie2/2.5.3+galaxy1", "tool_shed_repository": {"changeset_revision": "d5ceb9f3c25b", "name": "bowtie2", "owner": "devteam", "tool_shed": "toolshed.g2.bx.psu.edu"}, "tool_state": "{\"analysis_type\": {\"analysis_type_selector\": \"simple\", \"__current_case__\": 0, \"presets\": \"no_presets\"}, \"library\": {\"type\": \"paired_collection\", 
\"__current_case__\": 2, \"input_1\": {\"__class__\": \"ConnectedValue\"}, \"unaligned_file\": true, \"aligned_file\": false, \"paired_options\": {\"paired_options_selector\": \"no\", \"__current_case__\": 1}}, \"reference_genome\": {\"source\": \"indexed\", \"__current_case__\": 0, \"index\": {\"__class__\": \"ConnectedValue\"}}, \"rg\": {\"rg_selector\": \"do_not_set\", \"__current_case__\": 3}, \"sam_options\": {\"sam_options_selector\": \"no\", \"__current_case__\": 1}, \"save_mapping_stats\": true, \"__page__\": 0, \"__rerun_remap_job_id__\": null}", "tool_version": "2.5.3+galaxy1", "type": "tool", "uuid": "c5da8956-2e29-45d3-8a38-7104c7408a1e", "when": null, "workflow_outputs": []}, "3": {"annotation": "", "content_id": "__ZIP_COLLECTION__", "errors": null, "id": 3, "input_connections": {"input_forward": {"id": 2, "output_name": "output_unaligned_reads_l"}, "input_reverse": {"id": 2, "output_name": "output_unaligned_reads_r"}}, "inputs": [{"description": "runtime parameter for tool Zip collections", "name": "input_forward"}, {"description": "runtime parameter for tool Zip collections", "name": "input_reverse"}], "label": null, "name": "Zip collections", "outputs": [{"name": "output", "type": "input"}], "position": {"left": 960, "top": 0}, "post_job_actions": {}, "tool_id": "__ZIP_COLLECTION__", "tool_state": "{\"input_forward\": {\"__class__\": \"RuntimeValue\"}, \"input_reverse\": {\"__class__\": \"RuntimeValue\"}, \"__page__\": 0, \"__rerun_remap_job_id__\": null}", "tool_version": "1.0.0", "type": "tool", "uuid": "944b30b8-dfa2-459c-be49-5f62db677b84", "when": null, "workflow_outputs": []}, "4": {"annotation": "", "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/multiqc/multiqc/1.27+galaxy3", "errors": null, "id": 4, "input_connections": {"results_0|software_cond|input": {"id": 2, "output_name": "mapping_stats"}}, "inputs": [{"description": "runtime parameter for tool MultiQC", "name": "image_content_input"}], "label": null, "name": "MultiQC", "outputs": 
[{"name": "html_report", "type": "html"}, {"name": "stats", "type": "tabular"}], "position": {"left": 970, "top": 290}, "post_job_actions": {}, "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/multiqc/multiqc/1.27+galaxy3", "tool_shed_repository": {"changeset_revision": "31c42a2c02d3", "name": "multiqc", "owner": "iuc", "tool_shed": "toolshed.g2.bx.psu.edu"}, "tool_state": "{\"comment\": \"\", \"export\": false, \"flat\": false, \"image_content_input\": {\"__class__\": \"RuntimeValue\"}, \"results\": [{\"__index__\": 0, \"software_cond\": {\"software\": \"bowtie2\", \"__current_case__\": 3, \"input\": {\"__class__\": \"ConnectedValue\"}}}], \"title\": \"Host Removal\", \"__page__\": 0, \"__rerun_remap_job_id__\": null}", "tool_version": "1.27+galaxy3", "type": "tool", "uuid": "5d2f5ec8-c386-4ca9-8c2f-d6baccc4e5b8", "when": null, "workflow_outputs": []}}, "tags": ["name:FAIRyMAGs"], "uuid": "fb860ecd-f176-4e48-be10-9153a8a9032c", "version": 4} \ No newline at end of file