-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_protein_from_refseq1.pl
More file actions
100 lines (94 loc) · 1.92 KB
/
Copy pathgenerate_protein_from_refseq1.pl
File metadata and controls
100 lines (94 loc) · 1.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#! perl -w
my $new_name = shift;
my $pep_infile = "protein.faa";
my $pep_outfile = "$new_name.pep.fa";
my $cds_infile = "rna.fna";
my $cds_outfile = "$new_name.cds.fa";
my $gff_file ="genomic.gff";
open IN, "<", $gff_file;
while(<IN>){
next if /^#/;
my @in = split /\t/;
if($in[2] eq 'mRNA'){
if($in[8] =~ m/ID=(\S+?);Parent=(\S+?);[\d\D]+?product=([\d\D]+?);/){
$gid{$1} = $2;
$anno{$1} = $3;
print "$1\t$2\t$3\n";
}
}
elsif($in[2] eq 'exon'){
if($in[8] =~ m/Parent=(\S+?);[\d\D]+?transcript_id=(\S+)[;\s]/){
$mid{$2} = $1;
$rid{$1} = $2;
#print "$1\t$2\n";
}
}
elsif($in[2] eq 'CDS'){
if($in[8] =~ m/Parent=(\S+?);[\d\D]+?protein_id=(\S+)[;\s]/){
my $m = (split /;/,$2)[0];
$mid{$m} = $1;
$pid{$1} = $m;
#print "$1\t$m\n";
}
}
}
close IN;
my $flag = 0;
open IN, "<", $cds_infile;
open BUG, ">", "not_used.list";
while(<IN>){
if(/^>(\S+)/){
if(defined($mid{$1})){
$id = $mid{$1};
print "$1\t$id\n";
$flag = 1;
}
else{
$flag = 0;
print BUG "$1\n";
}
}
elsif($flag){
s/\s+//g;
$cds{$id} .= $_;
}
}
close IN;
$flag = 0;
open IN, "<", $pep_infile;
while(<IN>){
if(/^>(\S+)/){
if(defined($mid{$1})){
$id = $mid{$1};
$flag = 1;
}
else{
$flag = 0;
print BUG "$1\n";
}
}
elsif($flag){
s/\s+//g;
$pep{$id} .= $_;
$len{$gid{$id}}{$id} += length($_) if defined $gid{$id};
}
}
close IN;
close BUG;
open OUT1, ">", $pep_outfile;
open OUT2, ">", $cds_outfile;
open DUP, ">", "duplicate.list";
foreach my $gid (keys %len){
my $flag = 0;
foreach my $id (sort {$len{$gid}{$b} <=> $len{$gid}{$a}} keys %{$len{$gid}}){
$flag ++;
if($flag == 1){
print OUT1 ">$gid{$id} $pid{$id} $anno{$id}\n$pep{$id}\n" if defined $pep{$id};
print OUT2 ">$gid{$id} $pid{$id} $anno{$id}\n$cds{$id}\n" if defined $cds{$id};
}
else{ print DUP "$id\t$gid\t$pid{$id}\t$rid{$id}\n"; }
}
}
close OUT1;
close OUT2;
close DUP;