-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathremove_uninformative_sites_from_aln.pl
executable file
·135 lines (119 loc) · 4.45 KB
/
remove_uninformative_sites_from_aln.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env perl
# Removes uninformative sites from a fasta multiple sequence alignment: Ns, gaps, and invariant sites
# Author: Lee Katz
# https://github.com/lskatz/lyve-SET/blob/master/scripts/removeUninformativeSites.pl
use strict;
use warnings;
use Data::Dumper;
use Getopt::Long;
use FindBin '$Bin'; # added by Bruno May2017
use lib "$Bin/lib/perl/bioperl-1.5.2_102/";
use Bio::SeqIO;
sub logmsg{$|++;print STDERR "@_\n"; $|--;}
exit main();
sub main{
my $settings={};
GetOptions($settings,qw(help verbose ambiguities-allowed gaps-allowed sort=s)) or die $!;
die usage() if($$settings{help});
$$settings{"ambiguities-allowed"} ||=0;
$$settings{"gaps-allowed"} ||=0;
$$settings{sort}||="";
$$settings{sort}=lc($$settings{sort});
## read in the fasta file into @seq and %seq, and keep the deflines
my $in;
if(defined($ARGV[0]) && -f $ARGV[0]){
$in=Bio::SeqIO->new(-file=>$ARGV[0]);
} else {
$in=Bio::SeqIO->new(-fh=>\*STDIN,-format=>"fasta");
}
my($length,$defline,@defline,@seq);
while(my $seqObj=$in->next_seq){
push(@seq,$seqObj->seq);
push(@defline,$seqObj->id);
$length=$seqObj->length;
}
$in->close;
## read informative positions into @pos
# compare each sequence to the reference sequence (the first sequence)
my $removeAmbiguities=!$$settings{"ambiguities-allowed"};
my $removeGaps=!$$settings{"gaps-allowed"};
my (%aln,@pos);
#my $numOtherSeq=@seq;
my $numSeq=@seq;
my $informativeCount=0;
POSITION:for(my $j=0;$j<$length;$j++){
# Find a good reference base to compare against
my $refNt='N'; # a dummy reference base to start
for(my $i=0;$i<$numSeq;$i++){
$refNt=uc(substr($seq[$i],$j,1)); # uppercase for comparison
last if($refNt ne 'N');
}
# Start checking if the site is informative or not
next if($removeAmbiguities && $refNt eq 'N'); # if it's informative, but the ref base isn't good, then skip
next if($removeGaps && $refNt eq '-'); # if it's informative, but the ref base isn't good, then skip
# See if the rest of this column shows that it is informative.
# Cycle through the rest of the members of this column to check.
my $informative=0; # guilty until proven innocent
for(my $i=0;$i<$numSeq;$i++){
my $nt=uc(substr($seq[$i],$j,1)); # compare everything on the uppercase level
die "ERROR: Sequence $i does not have a nucleotide at position $j! Is the MSA flush?" if(!$nt);
# It's informative, but you have to continue reviewing
# the other sequences if you are ignoring N or gap columns.
# Also, do not consider ambiguities when deciding if this is an invariant site or not
if($nt ne 'N' && $nt ne $refNt){
$informative=1;
}
# Check to make sure it doesn't have an N anywhere
next POSITION if($removeAmbiguities && $nt eq 'N');
# Check to make sure there is no gap at all
next POSITION if($removeGaps && $nt eq '-');
}
next if(!$informative);
push(@pos,$j); # retain this informative position
$informativeCount++;
logmsg $j if($$settings{verbose});
}
## Print all nucleotides found at informative positions.
my %seq;
@seq{@defline}=@seq;
# sort the sequences by identifier
my @sortedId;
if($$settings{sort} eq 'alpha'){
@sortedId=sort {$a cmp $b} @defline;
} elsif($$settings{sort} eq 'num'){
@sortedId=sort {$a <=> $b} @defline;
} else {
@sortedId=@defline;
}
for (my $i=0;$i<@sortedId;$i++){
my $id=$sortedId[$i];
my $sequence=$seq{$id};
print ">$id\n";
for my $pos(@pos){
print substr($sequence,$pos,1);
}
print "\n";
}
return 0;
for (my $i=0;$i<@seq;$i++){
my $sequence=$seq[$i];
my $id=$defline[$i];
print ">$id\n";
for my $pos(@pos){
print substr($sequence,$pos,1);
}
print "\n";
}
return 0;
}
sub usage{
"Removes all the uninformative sites in a multiple sequence alignment fasta file: Ns, gaps, and invariant sites
An invariant site will also be defined as a site with only ambiguities and no other variations
Usage: $0 < aln.fasta > informative.fasta
-v for verbose (technically makes the script slower)
Using the following two options will allow you to keep a master MSA list at-hand if you are converting all VCFs in a project
--gaps-allowed Allow gaps in the alignment
--ambiguities-allowed Allow ambiguous bases in the alignment
--sort alpha,num Sort the sequences by their deflines. Values for sort are either ALPHA or NUM (case insensitive)
"
}