-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathlinkStarToRSID.py
53 lines (39 loc) · 1.63 KB
/
linkStarToRSID.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import re
import os
import argparse
import kindred
# Matches "<gene> * <star>" (e.g. "CYP2D6*4"). Only the gene group is used, to
# record where candidate gene mentions start within a sentence.
GENE_STAR_PATTERN = re.compile(r'(?P<gene>\w+)\s*\*\s*(?P<star>\w+)')

# Matches "*<star> ... (rs123)": a star allele followed, after at most 40
# comma-free characters, by an rs ID in parentheses.
STAR_RSID_PATTERN = re.compile(r'\*\s*(?P<star>\w+)(?P<inbetween>[^,]{0,40}?)\((?P<rs>rs\d+)\)')


def loadGeneNames(lines):
    """Collect the lower-cased gene names from the lines of a gene file.

    Each non-blank line must hold exactly four tab-separated columns; the
    second column is the name that is kept. Blank lines are skipped.

    Args:
        lines: Iterable of text lines (e.g. an open file handle).

    Returns:
        A set of lower-cased names taken from the second column.

    Raises:
        ValueError: If a non-blank line does not have exactly four columns.
    """
    geneNames = set()
    for lineNo, line in enumerate(lines, start=1):
        row = line.rstrip('\n')
        if not row.strip():
            # A blank (often trailing) line would otherwise fail the column
            # check below with an unhelpful error.
            continue
        fields = row.split('\t')
        if len(fields) != 4:
            raise ValueError(
                "Gene file line %d: expected 4 tab-separated columns, found %d"
                % (lineNo, len(fields))
            )
        geneNames.add(fields[1].lower())
    return geneNames


def linkStarsToRsids(sentence, geneList):
    """Find star-allele / rs ID pairs in a sentence and attach a gene to each.

    A gene mention is a word directly followed by '*<star>' whose lower-cased
    form is in geneList. Every '*<star> ... (rs123)' match is assigned the
    closest gene mention starting before it; matches with no preceding gene
    mention are dropped.

    Args:
        sentence: Text to search.
        geneList: Container of lower-cased gene names.

    Returns:
        A list of (gene, star, rsid, inbetween) string tuples in order of
        appearance. Tabs, carriage returns and line feeds in 'inbetween' are
        replaced by spaces so that each tuple fits on one tab-delimited row.
    """
    # finditer yields matches left to right, so this list is already ordered
    # by start position and needs no explicit sort.
    genes = [
        (match.start('gene'), match.group('gene'))
        for match in GENE_STAR_PATTERN.finditer(sentence)
        if match.group('gene').lower() in geneList
    ]

    links = []
    for match in STAR_RSID_PATTERN.finditer(sentence):
        startPos = match.start()
        preceding = [gene for pos, gene in genes if pos < startPos]
        if not preceding:
            continue
        # The in-between text may span a line break or contain a tab, either
        # of which would corrupt the tab-delimited output row.
        inbetween = re.sub(r'[\t\r\n]', ' ', match.group('inbetween'))
        links.append((preceding[-1], match.group('star'), match.group('rs'), inbetween))
    return links


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Try to map star alleles to rs IDs with some basic text mining')
    parser.add_argument('--inFile',required=True,type=str,help='Input directory of BioC xml files')
    parser.add_argument('--genes',required=True,type=str,help='Gene file')
    parser.add_argument('--outFile',required=True,type=str,help='Output file')
    args = parser.parse_args()

    with open(args.genes) as f:
        geneList = loadGeneNames(f)

    with open(args.outFile,'w') as outF:
        for corpus in kindred.iterLoad('biocxml',args.inFile):
            for doc in corpus.documents:
                # Cheap pre-filter: a document without both "rs" and "*"
                # cannot produce a star allele / rs ID match.
                if 'rs' not in doc.text or '*' not in doc.text:
                    continue
                # Naive sentence split on full stops.
                for sentence in doc.text.split('.'):
                    for link in linkStarsToRsids(sentence, geneList):
                        outF.write("\t".join(link) + '\n')