-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathprepareForAnnotation.py
99 lines (80 loc) · 4.29 KB
/
prepareForAnnotation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import argparse
import json
import os
import kindred
import random
import pickle
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
import numpy as np
import math
import time
import random
import itertools
import pgmine
def entitiesOverlap(candidateRelation):
    """Return True if any two entities of the candidate relation overlap in the text.

    Every unordered pair of entities in ``candidateRelation.entities`` is
    compared. Each entity's ``position`` is iterated as ``(start, end)`` spans,
    and spans are treated as half-open intervals: two spans that merely touch
    (one ends exactly where the other starts) do NOT count as overlapping.

    Args:
        candidateRelation: object exposing an ``entities`` iterable whose items
            each have a ``position`` iterable of ``(start, end)`` pairs.

    Returns:
        bool: True as soon as one overlapping pair of spans is found, False if
        there is none (including when there are fewer than two entities).
    """
    return any(
        # Equivalent to: not (end1 <= start2 or end2 <= start1)
        start1 < end2 and start2 < end1
        for e1, e2 in itertools.combinations(candidateRelation.entities, 2)
        for start1, end1 in e1.position
        for start2, end2 in e2.position
    )
def _mutationAllowedForMode(mutationText, mode):
    """Return True if a Mutation mention should be kept under the given --mode.

    The decision looks only at how the whitespace-stripped mention starts:
    'star_allele' keeps mentions starting with '*', 'rs' keeps mentions
    starting with 'rs', 'star_rs' keeps either, and any other mode ('other')
    keeps only mentions that start with neither.
    """
    stripped = mutationText.strip()
    isStarAllele = stripped.startswith('*')
    isRsID = stripped.startswith('rs')
    if mode == 'star_allele':
        return isStarAllele
    if mode == 'rs':
        return isRsID
    if mode == 'star_rs':
        return isStarAllele or isRsID
    return not (isStarAllele or isRsID)


def _keepEntity(entity, textLength, mode, drugMeshIDs, variantStopwords):
    """Return True if an entity survives all of the annotation-prep filters.

    Checks run in the same order as the original chain of list filters, so a
    later check (e.g. the metadata lookup) is only evaluated for entities that
    passed the earlier ones.
    """
    isMutation = entity.entityType == 'Mutation'
    isChemical = entity.entityType == 'Chemical'

    if isMutation and not _mutationAllowedForMode(entity.text, mode):
        return False

    # Only the first span is checked against the document bounds.
    if entity.position[0][0] < 0:
        return False
    if entity.position[0][1] > textLength:
        return False

    if isChemical:
        # Keep only the selected drugs, and drop very short chemical names.
        if entity.metadata['conceptid'] not in drugMeshIDs:
            return False
        if len(entity.text) <= 4:
            return False

    if isMutation:
        if pgmine.normalizeMutation(entity.text) is None:
            return False
        if entity.text.lower() in variantStopwords:
            return False

    return True


if __name__ == '__main__':
    # Build a corpus of sentences mentioning both a selected chemical and a
    # mutation of the requested kind, then save a random sample for annotation.
    parser = argparse.ArgumentParser(description='Merge sentence data and do some annotation')
    # `choices` replaces the former `assert` so validation survives `python -O`.
    parser.add_argument('--mode',required=True,type=str,choices=['star_allele','rs','star_rs','other'],help='Whether to focus on star_allele, rs or other')
    parser.add_argument('--selectedChemicals',required=True,type=str,help='Which chemicals to filter for')
    parser.add_argument('--sentenceData',required=True,type=str,help='Directory with sentence data')
    parser.add_argument('--variantStopwords',required=True,type=str,help='Mutations to remove')
    parser.add_argument('--outDir',required=True,type=str,help='Directory to dump standoff format corpus')
    parser.add_argument('--fileCount',required=False,type=int,help='Optionally limit the number of BioC file to load')
    parser.add_argument('--filterTerms',type=str,help='Optional file of filter terms')
    parser.add_argument('--sampleSize',required=False,type=int,default=500,help='Maximum number of documents to sample for annotation')
    args = parser.parse_args()

    with open(args.selectedChemicals) as f:
        drugData = json.load(f)
    drugMeshIDs = { 'MESH:'+d['MeSH'] for d in drugData }

    filterTerms = set()
    if args.filterTerms:
        with open(args.filterTerms) as f:
            for line in f:
                filterTerms.add(line.strip().lower())

    with open(args.variantStopwords) as f:
        variantStopwords = set( line.strip().lower() for line in f )

    # Documents whose text contains any of these substrings are discarded.
    stopwords = {'dopamine','insulin','caffeine','nicotine','choline'}

    print("Loading sentences...")
    corpus = kindred.Corpus()
    filenames = sorted(os.listdir(args.sentenceData))
    if args.fileCount:
        # Cap at the number of available files: random.sample raises
        # ValueError when asked for more items than the population holds.
        filenames = random.sample(filenames, min(args.fileCount, len(filenames)))

    for i,filename in enumerate(filenames):
        print(i, filename)
        tmpCorpus = kindred.load('biocxml',os.path.join(args.sentenceData,filename))

        filtered = []
        for doc in tmpCorpus.documents:
            loweredText = doc.text.lower()

            # With --filterTerms, a document must mention at least one term.
            if args.filterTerms and not any( filterTerm in loweredText for filterTerm in filterTerms ):
                continue

            # NOTE(review): the source indentation was lost; the stopword
            # filter is assumed to apply whether or not --filterTerms is given.
            if any( stopword in loweredText for stopword in stopwords ):
                continue

            # NOTE(review): the position/chemical/mutation filters are assumed
            # to apply for every mode, not only inside the 'other' branch.
            textLength = len(doc.text)
            doc.entities = [ e for e in doc.entities if _keepEntity(e, textLength, args.mode, drugMeshIDs, variantStopwords) ]

            # Keep only documents that still pair a chemical with a mutation.
            entityTypes = set( e.entityType for e in doc.entities )
            if 'Chemical' in entityTypes and 'Mutation' in entityTypes:
                filtered.append(doc)

        corpus.documents += filtered

    print("Found: ", len(corpus.documents))
    # Never request more documents than were found; previously this crashed
    # with ValueError whenever fewer than 500 documents survived filtering.
    sampleSize = min(args.sampleSize, len(corpus.documents))
    corpus.documents = random.sample(corpus.documents, sampleSize)

    kindred.save(corpus,'standoff',args.outDir)
    kindred.save(corpus,'biocxml',os.path.join(args.outDir,'corpus.bioc.xml'))