galaxyproject · Deeptivarshney · Oct 18, 2025
diff --git a/tools/oatk/.shed.yml b/tools/oatk/.shed.yml
@@ -0,0 +1,12 @@
+# Tool Shed metadata for the Galaxy wrapper of Oatk.
+name: oatk  # repository names are restricted to lowercase letters, digits and underscores
+owner: iuc
+description: Organelle genome assembly toolkit
+long_description: |
+  Oatk is designed for de novo assembly of complex plant organelle genomes using PacBio HiFi data. It can also be used to assemble other simple organelle genomes such as animal mitochondria.
+categories:
+  - Assembly
+  - Genome annotation
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/oatk  # wrapper source, not the upstream program
+homepage_url: https://github.com/c-zhou/oatk
+type: unrestricted
diff --git a/tools/oatk/oatk.xml b/tools/oatk/oatk.xml
@@ -0,0 +1,238 @@
+<tool id="oatk" name="oatk" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" python_template_version="3.5" profile="21.05">
+    <description>Organelle Genome Assembly Toolkit</description>
+    <macros>
+        <token name="@TOOL_VERSION@">1.0.0</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+    </macros>
+    <!-- <xrefs>
+        <xref type="bio.tools">oatk</xref>
+    </xrefs>  -->
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">oatk</requirement>
+        <!-- nhmmscan and hmmpress ship with HMMER; they are not a separate package versioned like oatk. -->
+        <requirement type="package" version="3.4">hmmer</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        ## nhmmscan only reads profile databases that were indexed with hmmpress.
+        ## Link each supplied database into the job directory and index it there,
+        ## so the index files are never written next to the Galaxy dataset.
+        #if $Annotation.annotation_hmm_mito
+            ln -s '$Annotation.annotation_hmm_mito' mito.fam &&
+            hmmpress -f mito.fam &&
+        #end if
+        #if $Annotation.annotation_hmm_plastid
+            ln -s '$Annotation.annotation_hmm_plastid' pltd.fam &&
+            hmmpress -f pltd.fam &&
+        #end if
+        oatk
+            ## Explicit prefix: every from_work_dir in the outputs expects oatk.asm.* files.
+            -o oatk.asm
+            -t \${GALAXY_SLOTS:-2}
+            #if $Annotation.annotation_hmm_mito
+                -m mito.fam
+            #end if
+            #if $Annotation.annotation_hmm_plastid
+                -p pltd.fam
+            #end if
+            ## Parameters declared inside a section are referenced through the section name.
+            $Syncasm.use_graph_mode
+            $Syncasm.minicircle_mode
+            -k $Syncasm.kmer_size
+            -s $Syncasm.smer_size
+            -c $Syncasm.min_kmer_coverage
+            -a $Syncasm.min_arc_coverage
+            -D '$Syncasm.max_data_usage'
+            --max-bubble $Syncasm.max_bubble_size
+            --max-tip $Syncasm.max_tip_size
+            --weak-cross $Syncasm.weak_crosslink
+            --unzip-round $Syncasm.unzip_rounds
+            $Syncasm.disable_read_ec
+            -b $Annotation.batch_size
+            -f $Pathfinder.prefer_circular_path
+            -S $Pathfinder.min_annotation_score
+            -e $Pathfinder.max_evalue
+            -g '$Pathfinder.min_core_gene_gain'
+            -l $Pathfinder.min_seq_length
+            -q $Pathfinder.min_seq_coverage
+            -C $Pathfinder.max_copy_number
+            ## NOTE(review): -N was disabled in the original wrapper; re-enable it once
+            ## it is confirmed that the packaged oatk accepts the option.
+            ## -N $Pathfinder.max_graph_paths
+            $Pathfinder.include_trna
+            $Pathfinder.include_rrna
+            $Pathfinder.disable_graph_clean
+            --edge-c-tag '$Tagging.edge_coverage_tag'
+            --kmer-c-tag '$Tagging.kmer_coverage_tag'
+            --seq-c-tag '$Tagging.seq_coverage_tag'
+            '$input_reads'
+    ]]></command>
+    <inputs>
+        <!-- Reads to assemble; fasta.gz is accepted because the bundled test reads are gzipped FASTA. -->
+        <param name="input_reads" type="data" format="fastq,fastq.gz,fasta,fasta.gz" label="Input PacBio HiFi Reads"/>
+        <!-- Syncasm options -->
+        <section name="Syncasm" title="Syncasm Options" expanded="false">
+            <param name="use_graph_mode" type="boolean" truevalue="-G" falsevalue="" label="Use Graph Mode?"/>
+            <param name="minicircle_mode" type="boolean" truevalue="-M" falsevalue="" label="Minicircle Mode?"/>
+            <param name="kmer_size" type="integer" value="1001" min="21" label="K-mer Size"/>
+            <param name="smer_size" type="integer" value="31" min="1" max="31" label="S-mer Size"/>
+            <param name="min_kmer_coverage" type="integer" value="30" min="1" label="Minimum K-mer Coverage"/>
+            <param name="min_arc_coverage" type="float" value="0.35" min="0" label="Minimum Arc Coverage"/>
+            <param name="max_data_usage" type="text" value="0" label="Maximum Data Usage (e.g., 0, 10K, 10M)">
+                <validator type="regex" message="Enter a whole number, optionally followed by K, M or G">^[0-9]+[KMG]?$</validator>
+            </param>
+            <param name="max_bubble_size" type="integer" value="100000" min="1" label="Maximum Bubble Size"/>
+            <param name="max_tip_size" type="integer" value="10000" min="1" label="Maximum Tip Size"/>
+            <param name="weak_crosslink" type="float" value="0.30" min="0" label="Maximum Relative Edge Coverage for Weak Crosslink"/>
+            <param name="unzip_rounds" type="integer" value="3" min="1" label="Maximum Unzip Rounds"/>
+            <param name="disable_read_ec" type="boolean" truevalue="--no-read-ec" falsevalue="" label="Disable Read Error Correction?"/>
+        </section>
+        <!-- Annotation options (hmmannot) -->
+        <section name="Annotation" title="Annotation Options" expanded="false">
+            <param name="annotation_hmm_mito" type="data" format="hmm3" optional="true" label="Mitochondria HMM Profile Database"/>
+            <param name="annotation_hmm_plastid" type="data" format="hmm3" optional="true" label="Plastid HMM Profile Database"/>
+            <param name="batch_size" type="integer" value="100000" min="1" label="Batch Size"/>
+            <!-- NOTE(review): temp_dir and nhmmscan_path are not passed to oatk; handing user-chosen paths or executables to the command line would be unsafe. Consider removing both. -->
+            <param name="temp_dir" type="text" optional="true" label="Temporary Directory"/>
+            <param name="nhmmscan_path" type="text" value="nhmmscan" label="nhmmscan Executable Path"/>
+        </section>
+        <!-- Pathfinder options -->
+        <section name="Pathfinder" title="Pathfinder Options" expanded="false">
+            <param name="prefer_circular_path" type="float" value="0.90" min="0" max="1" label="Prefer Circular Path Threshold"/>
+            <param name="min_annotation_score" type="float" value="300.0" min="0" label="Minimum Annotation Score"/>
+            <param name="max_evalue" type="float" value="1.0e-06" min="0" label="Maximum E-value"/>
+            <param name="min_core_gene_gain" type="text" value="3,1" label="Minimum Core Gene Gain (General, Mitochondria)">
+                <validator type="regex" message="Enter one whole number, or two separated by a comma (e.g. 3,1)">^[0-9]+(,[0-9]+)?$</validator>
+            </param>
+            <param name="min_seq_length" type="integer" value="10000" min="1" label="Minimum Sequence Length to Keep"/>
+            <param name="min_seq_coverage" type="float" value="0.20" min="0" label="Minimum Sequence Coverage Relative to Subgraph Average"/>
+            <param name="max_copy_number" type="integer" value="10" min="1" label="Maximum Copy Number"/>
+            <param name="max_graph_paths" type="integer" value="1000000" min="1" label="Maximum Graph Paths to Explore"/>
+            <param name="include_trna" type="boolean" truevalue="--include-trn" falsevalue="" label="Include tRNA Genes for Classification?"/>
+            <param name="include_rrna" type="boolean" truevalue="--include-rrn" falsevalue="" label="Include rRNA Genes for Classification?"/>
+            <param name="disable_graph_clean" type="boolean" truevalue="--no-graph-clean" falsevalue="" label="Disable Graph Cleaning?"/>
+        </section>
+        <!-- Tagging options; the section name has to be a valid identifier so the command template can reference it. -->
+        <section name="Tagging" title="Tagging Options" expanded="false">
+            <param name="edge_coverage_tag" type="text" value="EC:i" label="Edge Coverage Tag in GFA File"/>
+            <param name="kmer_coverage_tag" type="text" value="KC:i" label="K-mer Coverage Tag in GFA File"/>
+            <param name="seq_coverage_tag" type="text" value="SC:f" label="Sequence Coverage Tag in GFA File"/>
+        </section>
+    </inputs>
+    <outputs>
+        <!-- Mitochondrial (MT) and plastid (PT) outputs are only collected when the matching HMM database was supplied. -->
+        <!-- NOTE(review): gfa1 assumes oatk writes GFA 1 segment/link records; confirm against real output. -->
+        <data name="final_gfa_file" format="gfa1" from_work_dir="oatk.asm.utg.final.gfa" label="${tool.name} on ${on_string}: final assembly graph (GFA)"/>
+        <data name="mt_gene" format="txt" from_work_dir="oatk.asm.annot_mito.txt" label="${tool.name} on ${on_string}: MT gene annotation of assembled sequences">
+            <filter>Annotation['annotation_hmm_mito']</filter>
+        </data>
+        <data name="pt_gene" format="txt" from_work_dir="oatk.asm.annot_pltd.txt" label="${tool.name} on ${on_string}: PT gene annotation of assembled sequences">
+            <filter>Annotation['annotation_hmm_plastid']</filter>
+        </data>
+        <data name="MT_subgraph" format="gfa1" from_work_dir="oatk.asm.mito.gfa" label="${tool.name} on ${on_string}: MT genome subgraph (GFA)">
+            <filter>Annotation['annotation_hmm_mito']</filter>
+        </data>
+        <data name="MT_structure_solved" format="fasta" from_work_dir="oatk.asm.mito.ctg.fasta" label="${tool.name} on ${on_string}: structure-solved MT contigs">
+            <filter>Annotation['annotation_hmm_mito']</filter>
+        </data>
+        <data name="MT_genome_annotation" format="bed" from_work_dir="oatk.asm.mito.ctg.bed" label="${tool.name} on ${on_string}: genome annotation of MT contigs">
+            <filter>Annotation['annotation_hmm_mito']</filter>
+        </data>
+        <data name="MT_gene_annotation" format="bed" from_work_dir="oatk.asm.mito.bed" label="${tool.name} on ${on_string}: gene annotation of MT sequences">
+            <filter>Annotation['annotation_hmm_mito']</filter>
+        </data>
+        <data name="PT_subgraph" format="gfa1" from_work_dir="oatk.asm.pltd.gfa" label="${tool.name} on ${on_string}: PT genome subgraph (GFA)">
+            <filter>Annotation['annotation_hmm_plastid']</filter>
+        </data>
+        <data name="PT_structure_solved" format="fasta" from_work_dir="oatk.asm.pltd.ctg.fasta" label="${tool.name} on ${on_string}: structure-solved PT contigs">
+            <filter>Annotation['annotation_hmm_plastid']</filter>
+        </data>
+        <data name="PT_genome_annotation" format="bed" from_work_dir="oatk.asm.pltd.ctg.bed" label="${tool.name} on ${on_string}: genome annotation of PT contigs">
+            <filter>Annotation['annotation_hmm_plastid']</filter>
+        </data>
+        <data name="PT_gene_annotation" format="bed" from_work_dir="oatk.asm.pltd.bed" label="${tool.name} on ${on_string}: gene annotation of PT sequences">
+            <filter>Annotation['annotation_hmm_plastid']</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- Test 1: gzipped FASTA reads with default options and both HMM databases. -->
+        <test expect_num_outputs="11">
+            <param name="input_reads" value="ddAraThal4_organelle.hifi.fa.gz" ftype="fasta.gz"/>
+            <section name="Annotation">
+                <param name="annotation_hmm_mito" value="embryophyta_mito.fam" ftype="hmm3"/>
+                <param name="annotation_hmm_plastid" value="embryophyta_pltd.fam" ftype="hmm3"/>
+            </section>
+            <output name="final_gfa_file" file="expected_output.utg.final.gfa"/>
+            <output name="mt_gene" file="expected_output.annot_mito.txt"/>
+            <output name="pt_gene" file="expected_output.annot_pltd.txt"/>
+            <output name="MT_subgraph" file="expected_output.mito.gfa"/>
+            <output name="MT_structure_solved" file="expected_output.mito.ctg.fasta"/>
+            <output name="MT_genome_annotation" file="expected_output.mito.ctg.bed"/>
+            <output name="MT_gene_annotation" file="expected_output.mito.bed"/>
+            <output name="PT_subgraph" file="expected_output.pltd.gfa"/>
+            <output name="PT_structure_solved" file="expected_output.pltd.ctg.fasta"/>
+            <output name="PT_genome_annotation" file="expected_output.pltd.ctg.bed"/>
+            <output name="PT_gene_annotation" file="expected_output.pltd.bed"/>
+        </test>
+        <!-- Test 2: uncompressed FASTQ reads with default options and both HMM databases. -->
+        <test expect_num_outputs="11">
+            <param name="input_reads" value="test.fq"/>
+            <section name="Annotation">
+                <param name="annotation_hmm_mito" value="embryophyta_mito.fam" ftype="hmm3"/>
+                <param name="annotation_hmm_plastid" value="embryophyta_pltd.fam" ftype="hmm3"/>
+            </section>
+            <output name="final_gfa_file" file="1000trimmed.fastq.utg.final.gfa"/>
+            <output name="mt_gene" file="1000trimmed.fastq.annot_mito.txt"/>
+            <output name="pt_gene" file="1000trimmed.fastq.annot_pltd.txt"/>
+            <output name="MT_subgraph" file="1000trimmed.fastq.mito.gfa"/>
+            <output name="MT_structure_solved" file="1000trimmed.fastq.mito.ctg.fasta"/>
+            <output name="MT_genome_annotation" file="1000trimmed.fastq.mito.ctg.bed"/>
+            <output name="MT_gene_annotation" file="1000trimmed.fastq.mito.bed"/>
+            <output name="PT_subgraph" file="1000trimmed.fastq.pltd.gfa"/>
+            <output name="PT_structure_solved" file="1000trimmed.fastq.pltd.ctg.fasta"/>
+            <output name="PT_genome_annotation" file="1000trimmed.fastq.pltd.ctg.bed"/>
+            <output name="PT_gene_annotation" file="1000trimmed.fastq.pltd.bed"/>
+        </test>
+        <!-- Test 3: same reads as test 1 with a non-default k-mer size. -->
+        <test expect_num_outputs="11">
+            <param name="input_reads" value="ddAraThal4_organelle.hifi.fa.gz" ftype="fasta.gz"/>
+            <section name="Syncasm">
+                <param name="kmer_size" value="500"/>
+            </section>
+            <section name="Annotation">
+                <param name="annotation_hmm_mito" value="embryophyta_mito.fam" ftype="hmm3"/>
+                <param name="annotation_hmm_plastid" value="embryophyta_pltd.fam" ftype="hmm3"/>
+            </section>
+            <output name="final_gfa_file" file="expected_output.utg.final.kmer.gfa"/>
+            <output name="mt_gene" file="expected_output.annot_mito.kmer.txt"/>
+            <output name="pt_gene" file="expected_output.annot_pltd.kmer.txt"/>
+            <output name="MT_subgraph" file="expected_output.mito.kmer.gfa"/>
+            <output name="MT_structure_solved" file="expected_output.mito.ctg.kmer.fasta"/>
+            <output name="MT_genome_annotation" file="expected_output.mito.kmer.ctg.bed"/>
+            <output name="MT_gene_annotation" file="expected_output.mito.kmer.bed"/>
+            <output name="PT_subgraph" file="expected_output.pltd.kmer.gfa"/>
+            <output name="PT_structure_solved" file="expected_output.pltd.kmer.ctg.fasta"/>
+            <output name="PT_genome_annotation" file="expected_output.pltd.kmer.ctg.bed"/>
+            <output name="PT_gene_annotation" file="expected_output.kmer.pltd.bed"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+        **Oatk** is designed for de novo assembly of complex plant organelle genomes using PacBio HiFi data.
+        It can also be used to assemble other simple organelle genomes such as animal mitochondria.
+
+        **Toolkit Components**:
+
+        - **syncasm**: A de novo HiFi read assembler using a sparse de Bruijn graph constructed from closed syncmers (Edgar, R. 2021).
+        - **hmmannot**: A HMMER wrapper for annotating organelle sequences using a pre-built HMM profile database (available at OatkDB, https://github.com/c-zhou/OatkDB).
+        - **pathfinder**: Parses and circularises organelle genomes from assemblies by integrating HMM annotations with assembly graph structure.
+        - **oatk**: Runs syncasm, hmmannot, and pathfinder collectively.
+
+        **Auxiliary Tools**:
+
+        - **path_to_fasta**: Extracts FASTA sequences from a GFA file given a path.
+        - **rotate**: Rotates a circular sequence to a specified position.
+
+        **HMM profile databases**: select the mitochondria and/or plastid database under *Annotation Options*; the outputs for an organelle are only collected when its database is supplied.
+
+        This Galaxy wrapper provides a user-friendly interface to configure and run Oatk for organelle genome assembly.
+    ]]></help>
+</tool>