Merge pull request #7322 from veg/cawlign_truly_clean_wrapper

bgruening · web-flow · commit 848922d15b3e · 2025-10-15T17:22:37.000+02:00
feat: Add cawlign Galaxy tool wrapper
diff --git a/tools/cawlign/.shed.yml b/tools/cawlign/.shed.yml
@@ -0,0 +1,9 @@
+name: cawlign
+owner: iuc
+categories:
+  - Sequence Analysis
+description: Codon-aware alignment of sequences to a reference.
+homepage_url: https://github.com/veg/cawlign
+long_description: |
+  cawlign is a standalone C++ port of bealign, used for codon-aware alignment of sequences in a FASTA file to a reference sequence.
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/cawlign
diff --git a/tools/cawlign/cawlign.xml b/tools/cawlign/cawlign.xml
@@ -0,0 +1,264 @@
+<tool id="cawlign" name="cawlign" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="24.2">
+    <description>Codon-aware (pairwise) alignment</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <expand macro="stdio"/>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Align the query FASTA against the selected reference; the alignment is
+        ## written to stdout and redirected into the output dataset.
+        cawlign
+            ## Reference: a FASTA dataset from the history or a built-in reference name.
+            #if $reference_cond.reference_source == "history"
+            -r '$reference_cond.reference_history'
+            #else
+            -r '$reference_cond.reference_builtin'
+            #end if
+            ## The scoring matrix is a built-in name or a history dataset path,
+            ## so it is quoted like every other path on this command line.
+            -s '$scoring_matrix_cond.scoring_matrix_source.scoring_matrix'
+            -t $scoring_matrix_cond.datatype
+            -l $local_alignment
+            -f $format
+            -R $reverse_complement
+            ## Boolean flags expand to the flag itself or to an empty string.
+            $affine_gap
+            $write_reference
+            ## Query sequences are the trailing positional argument.
+            '$fasta'
+            > '$output'
+    ]]></command>
+    <inputs>
+        <!-- Query sequences; passed to cawlign as the trailing positional argument. -->
+        <param name="fasta" type="data" format="fasta" label="Sequences to align" help="Input FASTA file containing sequences to align."/>
+        <!-- Reference for -r: either a built-in reference name or a FASTA dataset from the history. -->
+        <conditional name="reference_cond">
+            <param name="reference_source" type="select" label="Reference sequence source">
+                <option value="builtin" selected="true">Use a built-in reference</option>
+                <option value="history">Use a custom reference from history</option>
+            </param>
+            <when value="builtin">
+                <param name="reference_builtin" type="select" argument="-r" label="Select a built-in reference">
+                    <option value="CoV2-E">CoV2-E</option>
+                    <option value="CoV2-endornase">CoV2-endornase</option>
+                    <option value="CoV2-exonuclease">CoV2-exonuclease</option>
+                    <option value="CoV2-helicase">CoV2-helicase</option>
+                    <option value="CoV2-leader">CoV2-leader</option>
+                    <option value="CoV2-M">CoV2-M</option>
+                    <option value="CoV2-methyltransferase">CoV2-methyltransferase</option>
+                    <option value="CoV2-N">CoV2-N</option>
+                    <option value="CoV2-nsp10">CoV2-nsp10</option>
+                    <option value="CoV2-nsp2">CoV2-nsp2</option>
+                    <option value="CoV2-nsp3">CoV2-nsp3</option>
+                    <option value="CoV2-nsp4">CoV2-nsp4</option>
+                    <option value="CoV2-nsp6">CoV2-nsp6</option>
+                    <option value="CoV2-nsp7">CoV2-nsp7</option>
+                    <option value="CoV2-nsp8">CoV2-nsp8</option>
+                    <option value="CoV2-nsp9">CoV2-nsp9</option>
+                    <option value="CoV2-ORF10">CoV2-ORF10</option>
+                    <option value="CoV2-ORF1a">CoV2-ORF1a</option>
+                    <option value="CoV2-ORF1b">CoV2-ORF1b</option>
+                    <option value="CoV2-ORF3a">CoV2-ORF3a</option>
+                    <option value="CoV2-ORF5">CoV2-ORF5</option>
+                    <option value="CoV2-ORF6">CoV2-ORF6</option>
+                    <option value="CoV2-ORF7a">CoV2-ORF7a</option>
+                    <option value="CoV2-ORF7b">CoV2-ORF7b</option>
+                    <option value="CoV2-ORF8">CoV2-ORF8</option>
+                    <option value="CoV2-RdRp">CoV2-RdRp</option>
+                    <option value="CoV2-S">CoV2-S</option>
+                    <option value="CoV2-threeC">CoV2-threeC</option>
+                    <option value="HXB2_gag">HXB2_gag</option>
+                    <option value="HXB2_int">HXB2_int</option>
+                    <option value="HXB2_nef">HXB2_nef</option>
+                    <option value="HXB2_pol" selected="true">HXB2_pol</option>
+                    <option value="HXB2_pr">HXB2_pr</option>
+                    <option value="HXB2_prrt">HXB2_prrt</option>
+                    <option value="HXB2_rev">HXB2_rev</option>
+                    <option value="HXB2_rt">HXB2_rt</option>
+                    <option value="HXB2_tat">HXB2_tat</option>
+                    <option value="HXB2_vif">HXB2_vif</option>
+                </param>
+            </when>
+            <when value="history">
+                <param name="reference_history" type="data" argument="-r" format="fasta" label="Reference sequence" help="Reference sequence FASTA file from your history."/>
+            </when>
+        </conditional>
+        <!-- The data type (-t) decides which scoring matrices (-s) are offered. Every branch
+             repeats a nested "scoring_matrix_source" conditional holding a "scoring_matrix"
+             parameter, so the command template reads one path for all cases. -->
+        <conditional name="scoring_matrix_cond">
+            <param name="datatype" type="select" argument="-t" label="Data type" help="Choose the alignment space (nucleotide, protein, or codon).">
+                <option value="codon" selected="true">Align sequences in codon space</option>
+                <option value="nucleotide">Align sequences in nucleotide space</option>
+                <option value="protein">Align sequences in protein space</option>
+            </param>
+            <when value="codon">
+                <conditional name="scoring_matrix_source">
+                    <param name="matrix_source" type="select" label="Scoring matrix source">
+                        <option value="builtin" selected="true">Use a built-in scoring matrix</option>
+                        <option value="history">Use a custom scoring matrix from history</option>
+                    </param>
+                    <when value="builtin">
+                        <param name="scoring_matrix" type="select" argument="-s" label="Scoring matrix" help="Select the scoring matrix to use.">
+                            <option value="BLOSUM62" selected="true">BLOSUM62 (for protein/codon alignments)</option>
+                            <option value="HIV_BETWEEN_F">HIV_BETWEEN_F (for HIV alignments)</option>
+                        </param>
+                    </when>
+                    <when value="history">
+                        <param name="scoring_matrix" type="data" argument="-s" format="tabular" label="Scoring matrix file" help="Scoring matrix file from your history. The file should be a tabular matrix with rows and columns representing amino acids or nucleotides, and cells containing substitution scores."/>
+                    </when>
+                </conditional>
+            </when>
+            <when value="nucleotide">
+                <conditional name="scoring_matrix_source">
+                    <param name="matrix_source" type="select" label="Scoring matrix source">
+                        <option value="builtin" selected="true">Use a built-in scoring matrix</option>
+                        <option value="history">Use a custom scoring matrix from history</option>
+                    </param>
+                    <when value="builtin">
+                        <param name="scoring_matrix" type="select" argument="-s" label="Scoring matrix" help="Select the scoring matrix to use.">
+                            <option value="NUC4.4" selected="true">NUC4.4 (for nucleotide alignments)</option>
+                        </param>
+                    </when>
+                    <when value="history">
+                        <param name="scoring_matrix" type="data" argument="-s" format="tabular" label="Scoring matrix file" help="Scoring matrix file from your history. The file should be a tabular matrix with rows and columns representing amino acids or nucleotides, and cells containing substitution scores."/>
+                    </when>
+                </conditional>
+            </when>
+            <!-- No built-in matrix is offered for protein space; a history dataset is the only choice. -->
+            <when value="protein">
+                <conditional name="scoring_matrix_source">
+                    <param name="matrix_source" type="select" label="Scoring matrix source">
+                        <option value="history" selected="true">Use a custom scoring matrix from history</option>
+                    </param>
+                    <when value="history">
+                        <param name="scoring_matrix" type="data" argument="-s" format="tabular" label="Scoring matrix file" help="Scoring matrix file from your history. The file should be a tabular matrix with rows and columns representing amino acids or nucleotides, and cells containing substitution scores."/>
+                    </when>
+                </conditional>
+            </when>
+        </conditional>
+        <!-- Alignment strategy, output layout and strand handling; the selected value is passed
+             as-is to the corresponding flag in the command template. -->
+        <param name="local_alignment" type="select" argument="-l" label="Global/local alignment" help="Select the alignment type.">
+            <option value="trim" selected="true">Trim alignment (global to query, local to reference)</option>
+            <option value="global">Global alignment (all gaps scored equally)</option>
+            <option value="local">Local alignment (Smith-Waterman)</option>
+        </param>
+        <param name="format" type="select" argument="-f" label="Output format" help="Control the output format.">
+            <option value="refmap" selected="true">Reference map (insertions not retained)</option>
+            <option value="refalign">Reference align (insertions retained)</option>
+            <option value="pairwise">Pairwise alignment (insertions retained, all pairwise alignments reported)</option>
+        </param>
+        <param name="reverse_complement" type="select" argument="-R" label="Reverse complementation" help="Handle reverse complementation.">
+            <option value="none" selected="true">None</option>
+            <option value="silent">Silent (try both strands, report best score)</option>
+            <option value="annotated">Annotated (like silent, but annotates strand)</option>
+        </param>
+        <!-- Boolean switches: the flag is emitted when checked, nothing otherwise.
+             Note that checking "affine_gap" passes -a, which DISABLES affine gap scoring. -->
+        <param name="affine_gap" type="boolean" argument="-a" truevalue="-a" falsevalue="" checked="false" label="Disable affine gap scoring" help="Disable affine gap scoring (enabled by default)."/>
+        <param name="write_reference" type="boolean" argument="-I" truevalue="-I" falsevalue="" checked="false" label="Write out the reference sequence" help="Include the reference sequence in the output."/>
+    </inputs>
+    <outputs>
+        <!-- Receives the redirected stdout of the cawlign command; the history label
+             ends with the selected output format value. -->
+        <data name="output" format="fasta" label="${tool.name} on ${on_string}: ${format}"/>
+    </outputs>
+    <tests>
+        <!-- Codon-space alignment of the indel test sequence against the built-in HXB2_pol
+             reference with BLOSUM62, using the "refalign" output format. Parameters not
+             listed here keep their defaults. -->
+        <test>
+            <param name="fasta" value="OQ948666-pr-int-indel.fa" ftype="fasta"/>
+            <conditional name="reference_cond">
+                <param name="reference_source" value="builtin"/>
+                <param name="reference_builtin" value="HXB2_pol"/>
+            </conditional>
+            <param name="format" value="refalign"/>
+            <conditional name="scoring_matrix_cond">
+                <param name="datatype" value="codon"/>
+                <conditional name="scoring_matrix_source">
+                    <param name="matrix_source" value="builtin"/>
+                    <param name="scoring_matrix" value="BLOSUM62"/>
+                </conditional>
+            </conditional>
+            <output name="output" file="expected-result.fa" ftype="fasta"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+        .. class:: infomark
+
+        **What it does**
+
+        `cawlign` is a codon-aware aligner that maps sequences from a FASTA file to a reference sequence. It can perform nucleotide, protein, and codon-aware alignments.
+
+        **Input**
+
+        - **Sequences to align**: A FASTA file containing the sequences to be aligned.
+        - **Reference sequence**: You can use a built-in reference sequence or provide one from your history.
+        - **Scoring matrix**: You can use a built-in scoring matrix or provide one from your history. The available built-in matrices depend on the selected data type.
+
+        **Output**
+
+        A FASTA file containing the alignments. The structure of the output depends on the selected output format parameter (see Output Examples below).
+
+        .. class:: infomark
+
+        **Alignment Methods**
+
+        `cawlign` can perform three types of alignment: nucleotide, protein, and codon-aware.
+
+        *   **Nucleotide Alignment**: This is a standard pairwise alignment of nucleotide sequences using the Smith-Waterman-Gotoh algorithm with affine gap penalties.
+
+        *   **Protein Alignment**: This is a standard pairwise alignment of protein sequences, also using the Smith-Waterman-Gotoh algorithm with affine gap penalties. The nucleotide sequences are translated into amino acid sequences before alignment.
+
+        *   **Codon-aware Alignment**: This is a more complex alignment method that is aware of the codon structure of the sequences. It aligns nucleotide sequences in codon space, which allows it to handle frameshift mutations (insertions or deletions that are not a multiple of 3 nucleotides) more accurately than a simple nucleotide alignment. This is achieved by using a dynamic programming algorithm that considers various types of codon matches and mismatches, including 3-to-1, 3-to-2, 3-to-4, and 3-to-5 matches. This makes it particularly useful for aligning coding sequences where frameshift mutations may have occurred, such as in viral genomes.
+
+        **Options**
+
+        - **Data type**: The type of alignment to perform.
+            - **Nucleotide**: Align sequences in nucleotide space.
+            - **Protein**: Align sequences in protein space.
+            - **Codon**: Align sequences in codon space. This requires the reference to be in-frame.
+
+        - **Global/local alignment**: The type of alignment strategy.
+            - **Trim**: A trimming alignment that is global with respect to the query and local with respect to the reference.
+            - **Global**: Full string alignment; all gaps are scored equally.
+            - **Local**: Partial string local (Smith-Waterman type) alignment that maximizes the alignment score.
+
+        - **Output format**: The format of the output file.
+            - **Reference map**: Aligns query sequences to the reference and does not retain insertions relative to the reference.
+            - **Reference align**: Aligns query sequences to the reference and does retain insertions relative to the reference. Insertions are shown in lowercase.
+            - **Pairwise**: Aligns query sequences to the reference and does retain insertions relative to the reference; reports all pairwise alignments.
+
+        - **Reverse complementation**: How to handle reverse complementation.
+            - **None**: No reverse complementation.
+            - **Silent**: Try both forward and reverse-complemented query sequences and report the alignment with the best score.
+            - **Annotated**: Like "Silent", but also annotates which strand was used.
+
+        - **Disable affine gap scoring**: By default, `cawlign` uses affine gap scoring. Check this option to disable it.
+
+        - **Write out the reference sequence**: Include the reference sequence in the output.
+
+        .. class:: infomark
+
+        **Output Examples**
+
+        Here are examples of what the different output formats look like using an example with an insertion ('gataca') and a deletion. The sequences are truncated for clarity.
+
+        **refmap**
+
+        The `refmap` output format aligns the query sequences to the reference but does not retain insertions relative to the reference.
+
+        .. code-block:: text
+
+            >OQ948666.1-indel
+            CCTCAAATCACTCTTTGGCAGCGACCCATTGTCACAATAAGGGTAGGGGGGCAATTAAAG...
+
+        **refalign**
+
+        The `refalign` output format also aligns the query sequences to the reference but *does* retain insertions, which are shown in lowercase.
+
+        .. code-block:: text
+
+            >OQ948666.1-indel
+            CCTCAAATCACTCTTTGGCAGCGACCCATTGTCACgatacaAATAAGGGTAGGGGGGCAATTAAAG...
+
+        **pairwise**
+
+        The `pairwise` output format reports the full pairwise alignment, including the reference sequence, with insertions and deletions shown as gaps in the corresponding sequence.
+
+        .. code-block:: text
+
+            >HXB2_pol
+            CCTCAGGTCACTCTTTGGCAACGACCCCTCGTCAC------AATAAAGATAGGGGGGCAACTAAAG...
+            >OQ948666.1-indel
+            CCTCAAATCACTCTTTGGCAGCGACCCATTGTCACGATACAAATAAGGGTAGGGGGGCAATTAAAG...
+
+        .. class:: infomark
+
+        **References**
+
+        For more information, please visit the `cawlign` GitHub repository: https://github.com/veg/cawlign
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/cawlign/macros.xml b/tools/cawlign/macros.xml
@@ -0,0 +1,26 @@
+<macros>
+    <!-- Upstream cawlign version; used in the tool version string and to pin the package requirement. -->
+    <token name="@TOOL_VERSION@">0.1.14</token>
+    <!-- Wrapper revision, appended after "+galaxy" in the tool version string. -->
+    <token name="@VERSION_SUFFIX@">0</token>
+    <!-- Package dependency for the tool, pinned to @TOOL_VERSION@. -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">cawlign</requirement>
+        </requirements>
+    </xml>
+    <!-- Exit-code rule covering codes 1 and above. No level attribute is set,
+         so Galaxy's default level applies (presumably fatal; confirm against the schema). -->
+    <xml name="stdio">
+        <stdio>
+            <exit_code range="1:"/>
+        </stdio>
+    </xml>
+    <!-- Citation for the cawlign GitHub repository (BibTeX). -->
+    <xml name="citations">
+        <citations>
+            <citation type="bibtex">@misc{githubcawlign,
+                author = {Pond, Sergei and Weaver, Steven and Moshiri, Niema and Hepler, Lance},
+                year = {2025},
+                title = {cawlign: a C++ port of bealign},
+                publisher = {GitHub},
+                journal = {GitHub repository},
+                url = {https://github.com/veg/cawlign}
+            }</citation>
+        </citations>
+    </xml>
+</macros>
diff --git a/tools/cawlign/test-data/OQ948666-pr-int-indel.fa b/tools/cawlign/test-data/OQ948666-pr-int-indel.fa
@@ -0,0 +1,2 @@
+>OQ948666.1-indel
+CCTCAAATCACTCTTTGGCAGCGACCCATTGTCACgatacaAATAAGGGTAGGGGGGCAATTAAAGGAAGCCCTATTAGATACAGGAGCAGATGATACAGTGTTAGAAGAAATGGATTTGCCAGGAAGATGGAAGCCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGAGCAGATAACCATAGAAATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTAACTCAGCTTGGTTGCACTTTAAATTTTATGTAGATGGGGCAGCCAACAGGGAGACTAAATTGGGAAAAGCAGGATATGTTACTAACAGAGGAAGACAAAGGGTTGTCACCTTAACTGACACAACAAATCAGAAGACTGAGTTACAAGCAATTCATCTAGCTTTGCAGGATTCAGGATCAGAAGTAAACATAGTAACAGACTCACAGTATGCACTTGGAATCATCCAAGCACACCCAGATAGAAGTGAATCAGGGATAGTCAATCAAATCATAGAGCAGCTAATAAAAAAGGAAAAGGTCTACCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAATTAGTTAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGAAAAGGCACAGGAAGAACATGAGAAATATCACAATAATTGGAGAGCAATGGCTAGTGATTTTAACCTGCCAGCTGTAGTAGCAAAAGAAATAGTAGCCTGCTGTGATAAATGCCAGATAAAGGGAGAAGCTATACATGGACAAGTAGACTGTAGTCCAGGAATATGGCAACTAGATTGTACACATTTAGAAGGAAAAGTTATCCTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTTATTTCAGCAGAGACAGGGCAGGAAACAGCCTACTTCATCTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAATACATACAGACAATGGCAGCAATTTCACCAGTGCTACAGTTAAGGCCGCCTGTTGGTGGGCAGGGATCAAGCAGGAATTTGGCATTCCCTACAATCCCCAAAGTCAAGGAGTAGTAGAATCTATGAATAATGCATTAAAGAAAATTATAGGACAGGTAAGAGAGCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATTCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAATAGACATAATAGCATCAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAACTTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAGACCCACTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATCAGGACATAAAAGTAGTGCCAAGAAGGAAAGCAAAGATTATCAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAGGTAGACAGGATGAGGA
diff --git a/tools/cawlign/test-data/expected-result.fa b/tools/cawlign/test-data/expected-result.fa

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+>OQ948666.1-indel`
	`2`	+CCTCAAATCACTCTTTGGCAGCGACCCATTGTCACgatacaAATAAGGGTAGGGGGGCAATTAAAGGAAGCCCTATTAGATACAGGAGCAGATGATACAGTGTTAGAAGAAATGGATTTGCCAGGAAGATGGAAGCCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGAGCAGATAACCATAGAAATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTAACTCAGCTTGGTTGCACTTTAAATTTTATGTAGATGGGGCAGCCAACAGGGAGACTAAATTGGGAAAAGCAGGATATGTTACTAACAGAGGAAGACAAAGGGTTGTCACCTTAACTGACACAACAAATCAGAAGACTGAGTTACAAGCAATTCATCTAGCTTTGCAGGATTCAGGATCAGAAGTAAACATAGTAACAGACTCACAGTATGCACTTGGAATCATCCAAGCACACCCAGATAGAAGTGAATCAGGGATAGTCAATCAAATCATAGAGCAGCTAATAAAAAAGGAAAAGGTCTACCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAATTAGTTAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGAAAAGGCACAGGAAGAACATGAGAAATATCACAATAATTGGAGAGCAATGGCTAGTGATTTTAACCTGCCAGCTGTAGTAGCAAAAGAAATAGTAGCCTGCTGTGATAAATGCCAGATAAAGGGAGAAGCTATACATGGACAAGTAGACTGTAGTCCAGGAATATGGCAACTAGATTGTACACATTTAGAAGGAAAAGTTATCCTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTTATTTCAGCAGAGACAGGGCAGGAAACAGCCTACTTCATCTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAATACATACAGACAATGGCAGCAATTTCACCAGTGCTACAGTTAAGGCCGCCTGTTGGTGGGCAGGGATCAAGCAGGAATTTGGCATTCCCTACAATCCCCAAAGTCAAGGAGTAGTAGAATCTATGAATAATGCATTAAAGAAAATTATAGGACAGGTAAGAGAGCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATTCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAATAGACATAATAGCATCAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAACTTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAGACCCACTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATCAGGACATAAAAGTAGTGCCAAGAAGGAAAGCAAAGATTATCAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAGGTAGACAGGATGAGGA