Skip to content

Commit f5c7c76

Browse files
Added an option to reverse complement only some sequences in a given input file
1 parent f75f0b3 commit f5c7c76

File tree

4 files changed

+79
-13
lines changed

4 files changed

+79
-13
lines changed

align/seqbag.go

+30-5
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,12 @@ type SeqBag interface {
6464
RarefySeqBag(nb int, counts map[string]int) (SeqBag, error) // Take a new rarefied sample taking into accounts weights
6565
Rename(namemap map[string]string)
6666
RenameRegexp(regex, replace string, namemap map[string]string) error
67-
Replace(old, new string, regex bool) error // Replaces old string with new string in sequences of the alignment
68-
ShuffleSequences() // Shuffle sequence order
69-
String() string // Raw string representation (just write all sequences)
70-
Translate(phase int, geneticcode int) (err error) // Translates nt sequence in aa
71-
ReverseComplement() (err error) // Reverse-complements the alignment
67+
Replace(old, new string, regex bool) error // Replaces old string with new string in sequences of the alignment
68+
ShuffleSequences() // Shuffle sequence order
69+
String() string // Raw string representation (just write all sequences)
70+
Translate(phase int, geneticcode int) (err error) // Translates nt sequence in aa
71+
ReverseComplement() (err error) // Reverse-complements the alignment
72+
ReverseComplementSequences(name ...string) (err error) // Reverse-complements some sequences in the alignment
7273
TrimNames(namemap map[string]string, size int) error
7374
TrimNamesAuto(namemap map[string]string, curid *int) error
7475
Sort() // Sorts the sequences by name
@@ -945,6 +946,30 @@ func (sb *seqbag) ReverseComplement() (err error) {
945946
return
946947
}
947948

949+
/*
950+
ReverseComplement reverse complements all input seuqences.
951+
- if the alphabet is not NUCLEOTIDES: returns an error
952+
- IUPAC characters are supported
953+
*/
954+
func (sb *seqbag) ReverseComplementSequences(names ...string) (err error) {
955+
if sb.Alphabet() != NUCLEOTIDS {
956+
err = errors.New("wrong alphabet, cannot reverse complement")
957+
return
958+
}
959+
960+
for _, name := range names {
961+
s, found := sb.SequenceByName(name)
962+
if found {
963+
if err = Complement(s.SequenceChar()); err != nil {
964+
return
965+
}
966+
Reverse(s.SequenceChar())
967+
}
968+
}
969+
970+
return
971+
}
972+
948973
// Translate sequences in 3 phases (or 6 phases if reverse strand is true)
949974
// And return the longest orf found
950975
func (sb *seqbag) LongestORF(reverse bool) (orf Sequence, err error) {

cmd/revcomp.go

+26-6
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ var revCompCmd = &cobra.Command{
2020
If the input alignment is not nucleotides, then returns an error.
2121
2222
IUPAC codes are taken into account for the reverse complement.
23+
24+
If sequence names are given in the command line (e.g. goalign revcomp -i al.fasta s1 s2 s3),
25+
only given sequences are reverse-complemented, if they exist in the alignment.
2326
`,
2427
RunE: func(cmd *cobra.Command, args []string) (err error) {
2528
var f utils.StringWriterCloser
@@ -37,9 +40,18 @@ IUPAC codes are taken into account for the reverse complement.
3740
io.LogError(err)
3841
return
3942
}
40-
if err = seqs.ReverseComplement(); err != nil {
41-
io.LogError(err)
42-
return
43+
44+
// If names are given as arguments, we reverse complement only these sequences
45+
if len(args) > 0 {
46+
if err = seqs.ReverseComplementSequences(args...); err != nil {
47+
io.LogError(err)
48+
return
49+
}
50+
} else {
51+
if err = seqs.ReverseComplement(); err != nil {
52+
io.LogError(err)
53+
return
54+
}
4355
}
4456
writeSequences(seqs, f)
4557
} else {
@@ -51,9 +63,17 @@ IUPAC codes are taken into account for the reverse complement.
5163
return
5264
}
5365
for al = range aligns.Achan {
54-
if err = al.ReverseComplement(); err != nil {
55-
io.LogError(err)
56-
return
66+
// If names are given as arguments, we reverse complement only these sequences
67+
if len(args) > 0 {
68+
if err = al.ReverseComplementSequences(args...); err != nil {
69+
io.LogError(err)
70+
return
71+
}
72+
} else {
73+
if err = al.ReverseComplement(); err != nil {
74+
io.LogError(err)
75+
return
76+
}
5777
}
5878
writeAlign(al, f)
5979
}

docs/commands/revcomp.md

+3
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ If `--unaligned` is specified, then input sequences may be unaligned.
1111

1212
IUPAC codes are taken into account.
1313

14+
If sequence names are given in the command line (e.g. goalign revcomp -i al.fasta s1 s2 s3),
15+
only given sequences are reverse-complemented, if they exist in the alignment.
16+
1417
#### Usage
1518
```
1619
Usage:

test.sh

+20-2
Original file line numberDiff line numberDiff line change
@@ -5918,10 +5918,19 @@ cat > expected <<EOF
59185918
>s2
59195919
.*NVHDBMKWSRYCGTTA
59205920
EOF
5921+
cat > expected.2 <<EOF
5922+
>s1
5923+
.*NBDHVKMWSYRGCAAT
5924+
>s2
5925+
TAACGRYSWMKVHDBN*.
5926+
EOF
59215927

59225928
${GOALIGN} revcomp -i input > output
59235929
diff -q -b expected output
5924-
rm -rf input output expected
5930+
5931+
${GOALIGN} revcomp -i input s1 > output
5932+
diff -q -b expected.2 output
5933+
rm -rf input output expected expected.2
59255934

59265935

59275936
echo "->goalign revcomp --unaligned"
@@ -5937,7 +5946,16 @@ cat > expected <<EOF
59375946
>s2
59385947
ACGT.*NVHDBMKWSRYCGTTA
59395948
EOF
5949+
cat > expected.2 <<EOF
5950+
>s1
5951+
ATUGCYRSWKMBDHVN*.
5952+
>s2
5953+
ACGT.*NVHDBMKWSRYCGTTA
5954+
EOF
59405955

59415956
${GOALIGN} revcomp -i input --unaligned > output
59425957
diff -q -b expected output
5943-
rm -rf input output expected
5958+
5959+
${GOALIGN} revcomp -i input --unaligned s2 > output
5960+
diff -q -b expected.2 output
5961+
rm -rf input output expected expected.2

0 commit comments

Comments
 (0)