-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathfilterAndCollate.py
112 lines (80 loc) · 4.23 KB
/
filterAndCollate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import argparse
import csv
import os
import hashlib
from collections import defaultdict
# Columns that together identify a unique chemical/variant association.
# Used both as the collation key and to build the collated output header.
collatedKeyFields = 'chemical_mesh_id,chemical_pharmgkb_id,chemical_drugbank_id,chemical_normalized,variant_id,variant_normalized,variant_type,gene_ids,gene_names'

def findInputFiles(inData):
    """Return input TSV filenames from inData, ordered so the newest version
    of each document is encountered first.

    Args:
        inData: directory containing pubmed*.tsv and pmc*.tsv files.

    Returns:
        List of filenames: PMC files before PubMed files, newer before older
        within each group, so later duplicates can be skipped.
    """
    pubmedInputFiles = [ f for f in os.listdir(inData) if f.startswith('pubmed') and f.endswith('.tsv') ]
    pmcInputFiles = [ f for f in os.listdir(inData) if f.startswith('pmc') and f.endswith('.tsv') ]

    print("Found %d PubMed files" % len(pubmedInputFiles))
    print("Found %d PMC files" % len(pmcInputFiles))

    # Process PMC files before PubMed, and newer before older to get the
    # latest version of each document
    return sorted(pmcInputFiles,reverse=True) + sorted(pubmedInputFiles,reverse=True)

def filterAndCollate(inData, outUnfilteredPath, outCollatedPath, outSentencesPath, threshold=0.75):
    """Filter PGxMine sentence-level predictions and collate them into
    chemical/variant associations.

    Reads every TSV in inData (header row expected to match across files),
    deduplicates sentences by (pmid, formatted_sentence), writes all
    deduplicated rows to outUnfilteredPath, writes rows scoring above
    threshold to outSentencesPath (prefixed with a matching_id), and writes
    per-association paper counts to outCollatedPath.

    Args:
        inData: directory with pubmed*.tsv / pmc*.tsv input files.
        outUnfilteredPath: path for the deduplicated, unfiltered rows.
        outCollatedPath: path for the collated association table.
        outSentencesPath: path for the filtered sentences.
        threshold: minimum score (exclusive) for a sentence to be kept.

    Raises:
        AssertionError: if a file's header or a row's column count mismatches.
        ValueError: if a row's 'score' field is not parseable as a float.
    """
    collated = defaultdict(set)               # collation key -> set of PMIDs
    collated_pediatricOnly = defaultdict(set) # collation key -> PMIDs of pediatric papers
    collatedMatchingID = {}                   # collation key -> md5 matching ID

    keyColumns = collatedKeyFields.split(',')

    inputFiles = findInputFiles(inData)

    inputFilesHeader = None
    alreadySeen = set()   # (pmid, formatted_sentence) pairs already written
    recordCount,filteredRecordCount = 0,0

    with open(outUnfilteredPath,'w',encoding='utf-8') as outUnfiltered, open(outSentencesPath,'w',encoding='utf-8') as outSentences:
        for inputFile in inputFiles:
            with open(os.path.join(inData,inputFile),encoding='utf-8') as inF:
                headers = inF.readline().strip('\n').split('\t')
                if inputFilesHeader is None:
                    # The first file defines the expected header and it is
                    # echoed to both sentence-level outputs
                    inputFilesHeader = headers
                    outUnfiltered.write("\t".join(headers) + '\n')
                    outSentences.write("matching_id\t" + "\t".join(headers) + '\n')
                else:
                    assert inputFilesHeader == headers, "Headers don't match expected in file %s" % inputFile

                for i,line in enumerate(inF):
                    row = line.strip('\n').split('\t')
                    assert len(row) == len(headers), "Got %d columns, expected %d in row %d, file %s" % (len(row),len(headers),i+1,inputFile)
                    r = { h:v for h,v in zip(headers,row) }

                    score = float(r['score'])
                    recordCount += 1

                    pmid = r['pmid']
                    if pmid == 'None':
                        continue

                    # Skip a (pmid, sentence) pair already seen in an earlier
                    # (i.e. newer) file, so only the latest version is kept
                    key = (r['pmid'],r['formatted_sentence'])
                    if key in alreadySeen:
                        continue
                    alreadySeen.add(key)

                    outUnfiltered.write("\t".join(r[h] for h in headers) + "\n")

                    if score > threshold:
                        collatedKey = tuple( r[k] for k in keyColumns )
                        collated[collatedKey].add(pmid)

                        if r['is_pediatric_paper'] == 'True':
                            collated_pediatricOnly[collatedKey].add(pmid)

                        # Make a field using the key data that can be used to
                        # match between tables
                        matchingID = hashlib.md5("|".join(collatedKey).encode('utf-8')).hexdigest()
                        collatedMatchingID[collatedKey] = matchingID

                        outSentences.write(matchingID + "\t" + "\t".join(r[h] for h in headers) + "\n")
                        filteredRecordCount += 1

    with open(outCollatedPath,'w',encoding='utf-8') as outF:
        headers = 'matching_id,%s,paper_count,pediatric_paper_count' % collatedKeyFields
        headerCount = len(headers.split(','))
        outF.write(headers.replace(',','\t') + '\n')

        # Associations with the most supporting papers come first
        collatedCounts = sorted(( (len(pmids),key) for key,pmids in collated.items() ), reverse=True)
        for paperCount,collatedKey in collatedCounts:
            matchingID = collatedMatchingID[collatedKey]
            pediatricOnlyCount = len(collated_pediatricOnly[collatedKey])
            outData = [matchingID] + list(collatedKey) + [str(paperCount),str(pediatricOnlyCount)]
            assert len(outData) == headerCount
            outF.write("\t".join(outData) + "\n")

    print("%d records filtered to %d sentences and collated to %d chemical/variant associations" % (recordCount, filteredRecordCount, len(collated)))
    print("Written to %s and %s" % (outSentencesPath, outCollatedPath))

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Filter PGxMine for more conservative predictions')
    parser.add_argument('--inData',required=True,type=str,help='Input directory with TSV files to be filtered')
    parser.add_argument('--outUnfiltered',required=True,type=str,help='Output unfiltered data')
    parser.add_argument('--outCollated',required=True,type=str,help='Output collated and filtered data')
    parser.add_argument('--outSentences',required=True,type=str,help='Output filtered sentences that match collated data')
    args = parser.parse_args()

    assert os.path.isdir(args.inData)

    filterAndCollate(args.inData, args.outUnfiltered, args.outCollated, args.outSentences)