-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtsva.py
222 lines (210 loc) · 9.32 KB
/
tsva.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# Author: Liping Hou
# Email: [email protected]
# May 18th, 2015
import argparse
import datetime
import sys
import gzip
parser = argparse.ArgumentParser(description="Tissue specific variant annotator")
parser.add_argument('vep', help="VEP result file name (generated by the VEP)")
parser.add_argument('gtex', help="Tissue specific RPKM (based on GTEx)")
parser.add_argument('output', help='output file name')
parser.add_argument('--tissue', help='which tissue to use', nargs='+', required=True)
parser.add_argument('--verbose', action='store_true', help='use this flag to get more details for variants')
parser.add_argument('--list', action='store_true', help='use this flag to show the full list of tissues supported so far')
args = parser.parse_args()
def tsva(vep, gtex, tissue, file, verbose, list):
tissues = ['Adipose Tissue','Adipose - Subcutaneous','Adipose - Visceral','Adrenal Gland','Bladder','Blood','Whole Blood','Cells - EBV-transformed lymphocytes','Blood Vessel','Artery - Coronary','Artery - Aorta','Artery - Tibial','Brain','Brain - Spinal cord','Brain - Putamen','Brain - Cerebellum','Brain - Frontal Cortex','Brain - Hippocampus','Brain - Substantia nigra','Brain - Cerebellar Hemisphere','Brain - Amygdala','Brain - Anterior cingulate cortex','Brain - Hypothalamus','Brain - Caudate','Brain - Cortex','Brain - Nucleus accumbens','Breast','Breast - Mammary Tissue','Cervix Uteri','Cervix - Ectocervix','Cervix - Endocervix','Colon','Colon - Sigmoid','Colon - Transverse','Esophagus','Esophagus - Muscularis','Esophagus - Mucosa','Esophagus - Gastroesophageal Junction','Fallopian Tube','Heart','Heart - Left Ventricle','Heart - Atrial Appendage','Kidney','Kidney - Cortex','Liver','Lung','Muscle','Muscle - Skeletal','Nerve','Nerve - Tibial','Ovary','Pancreas','Pituitary','Prostate','Salivary Gland','Minor Salivary Gland','Skin','Cells - Transformed fibroblasts','Skin - Sun Exposed','Skin - Not Sun Exposed','Small Intestine','Small Intestine - Terminal Ileum','Spleen','Stomach','Testis','Thyroid','Uterus','Vagina']
tissue = ' '.join(i.strip("'\"") for i in tissue)
print(tissue)
try:
list
except NameError:
list = False
if tissue not in tissues or list:
error = """
Note: only supports the following tissues so far
Adipose Tissue
Adipose - Subcutaneous
Adipose - Visceral
Adrenal Gland
Adrenal Gland
Bladder
Bladder
Blood
Whole Blood
Cells - EBV-transformed lymphocytes
Blood Vessel
Artery - Tibial
Artery - Aorta
Artery - Coronary
Brain
Brain - Hippocampus
Brain - Hypothalamus
Brain - Cerebellar Hemisphere
Brain - Nucleus accumbens
Brain - Amygdala
Brain - Frontal Cortex
Brain - Substantia nigra
Brain - Anterior cingulate cortex
Brain - Cerebellum
Brain - Caudate
Brain - Spinal cord
Brain - Putamen
Brain - Cortex
Breast
Breast - Mammary Tissue
Cervix Uteri
Cervix - Ectocervix
Cervix - Endocervix
Colon
Colon - Transverse
Colon - Sigmoid
Esophagus
Esophagus - Muscularis
Esophagus - Mucosa
Esophagus - Gastroesophageal Junction
Fallopian Tube
Fallopian Tube
Heart
Heart - Left Ventricle
Heart - Atrial Appendage
Kidney
Kidney - Cortex
Liver
Liver
Lung
Lung
Muscle
Muscle - Skeletal
Nerve
Nerve - Tibial
Ovary
Ovary
Pancreas
Pancreas
Pituitary
Pituitary
Prostate
Prostate
Salivary Gland
Minor Salivary Gland
Skin
Cells - Transformed fibroblasts
Skin - Not Sun Exposed
Skin - Sun Exposed
Small Intestine
Small Intestine - Terminal Ileum
Spleen
Spleen
Stomach
Stomach
Testis
Testis
Thyroid
Thyroid
Uterus
Uterus
Vagina
Vagina
"""
sys.exit(error)
print("Analysis started: {}".format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
print()
print("Reading VEP annotation results from [ {} ]".format(vep))
input = open(vep)
rpkm = gzip.open(gtex, 'rt')
output = open(file, 'w')
try:
verbose
except NameError:
verbose = False
header = next(rpkm).strip().split("\t")
try:
tissue_index = header.index(tissue + " median")
except ValueError:
print("Error: can not find {} in the header of the {}".format(tissue, gtex))
TRANSCRIPTS = set()
for line in rpkm:
data = line.strip().split("\t")
val = data[tissue_index]
if val == 'NA':
val = 0
else:
val = float(val)
if val > 0:
TRANSCRIPTS.add(data[0].split(".")[0])
print("Found {} transcripts with a RPKM greater than 0 for {} tissue".format(len(TRANSCRIPTS), tissue))
def mostSevere(x, verbose):
#orderOfSeverity = ["transcript_ablation","splice_acceptor_variant","splice_donor_variant","stop_gained","frameshift_variant","stop_lost","initiator_codon_variant","transcript_amplification","inframe_insertion","inframe_deletion","missense_variant","splice_region_variant","incomplete_terminal_codon_variant","stop_retained_variant","synonymous_variant","coding_sequence_variant","mature_miRNA_variant","5_prime_UTR_variant","3_prime_UTR_variant","non_coding_transcript_exon_variant","intron_variant","NMD_transcript_variant","non_coding_transcript_variant","upstream_gene_variant","downstream_gene_variant","TFBS_ablation","TFBS_amplification","TF_binding_site_variant","regulatory_region_ablation","regulatory_region_amplification","regulatory_region_variant","feature_elongation","feature_truncation","intergenic_variant"]
orderOfSeverity = ['transcript_ablation','splice_acceptor_variant','splice_donor_variant','stop_gained','frameshift_variant','stop_lost','start_lost','transcript_amplification','inframe_insertion','inframe_deletion','missense_variant','protein_altering_variant','splice_region_variant','incomplete_terminal_codon_variant','stop_retained_variant','synonymous_variant','coding_sequence_variant','mature_miRNA_variant','5_prime_UTR_variant','3_prime_UTR_variant','non_coding_transcript_exon_variant','intron_variant','NMD_transcript_variant','non_coding_transcript_variant','upstream_gene_variant','downstream_gene_variant','TFBS_ablation','TFBS_amplification','TF_binding_site_variant','regulatory_region_ablation','regulatory_region_amplification','feature_elongation','regulatory_region_variant','feature_truncation','intergenic_variant']
if len(x) == 1:
if verbose:
return([0])
else:
return(0)
else:
res = []
multi = []
for i in x:
if len(i.split(",")) == 1:
res.append(orderOfSeverity.index(i))
elif len(i.split(",")) > 1:
for j in i.split(","):
multi.append(orderOfSeverity.index(j))
res.append(min(multi))
if verbose:
indices = [n for n, m in enumerate(res) if m == min(res)]
return(indices)
else:
return(res.index(min(res)))
variant_list = []
VARIANTS = set()
ANNOTATION = {}
VEP = {}
for line in input:
if line.startswith("##"):
output.write(line)
elif line.startswith("#Uploaded_variation"):
if verbose:
output.write(line)
else:
header = line.strip().split()
output.write("{}\t{}\t{}\t{}\n".format(header[0], header[1], header[2], header[6]))
else:
data = line.strip().split()
transcript = data[4]
variant = data[0]
ANNOTATION.setdefault(variant, [])
VEP.setdefault(variant, [])
func = data[6]
if transcript in TRANSCRIPTS or transcript == '-':
#if variant not in variant_list:
if variant not in VARIANTS:
variant_list.append(variant)
VARIANTS.add(variant)
ANNOTATION[variant].append(func)
if verbose:
VEP[variant].append(line.strip())
else:
if len(func.split(",")) > 1:
severeFunc = func.split(",")[mostSevere(func.split(","), verbose)]
info = "{}\t{}\t{}\t{}".format(data[0], data[1], data[2], severeFunc)
else:
info = "{}\t{}\t{}\t{}".format(data[0], data[1], data[2], data[6])
VEP[variant].append(info)
print("Found {} unique variants from {}".format(len(variant_list), vep))
for i in variant_list:
if verbose:
for j in mostSevere(ANNOTATION[i], verbose):
output.write("{}\n".format(VEP[i][j]))
else:
output.write("{}\n".format(VEP[i][mostSevere(ANNOTATION[i], verbose)]))
input.close()
rpkm.close()
output.close()
print("Output file was written to: [ {} ]".format(file))
print()
print("Analysis finished: {}".format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
print()
tsva(args.vep, args.gtex, args.tissue, args.output, args.verbose, args.list)