-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathgetRelevantMeSH.py
55 lines (41 loc) · 1.84 KB
/
getRelevantMeSH.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import argparse
import bioc
import json
import gzip
def extractRelevantMeshNames(meshHeadingsField, relevantTerms):
    """Return the descriptor names in a MeSH headings field that are relevant.

    The field is split on tabs into headings. Each heading is split on '~'
    and only its first piece (the descriptor) is examined; any remaining
    pieces are ignored (presumably qualifiers - confirm against the producer
    of the BioC file). The descriptor must hold exactly four '|'-delimited
    columns, and the fourth column is taken as the descriptor name.

    Args:
        meshHeadingsField: The raw 'meshHeadings' string of one document.
        relevantTerms: Container of descriptor names to keep (a set gives
            constant-time membership tests).

    Returns:
        A list of the matching descriptor names, in the order they occur in
        the field. Repeated names are kept.

    Raises:
        ValueError: If a descriptor does not have four '|'-delimited columns.
    """
    relevantNames = []
    for heading in meshHeadingsField.split('\t'):
        descriptorText = heading.split('~')[0]
        columns = descriptorText.split('|')
        # Raise instead of assert so the check survives `python -O`.
        if len(columns) != 4:
            raise ValueError("Expected four pipe-delimited columns. Got: %s" % descriptorText)
        name = columns[3]
        if name in relevantTerms:
            relevantNames.append(name)
    return relevantNames


if __name__ == '__main__':
    # Pass the text as `description`; the first positional argument of
    # ArgumentParser is `prog`, which would put this sentence in the usage line.
    parser = argparse.ArgumentParser(description='Check if a document has relevant MeSH tags')
    parser.add_argument('--inBioc',required=True,type=str,help='Input BioC file')
    parser.add_argument('--outJSONGZ',required=True,type=str,help='JSON file with PMIDs mapped to relevant MeSH terms')
    args = parser.parse_args()

    # NOTE(review): nothing has been loaded at this point; the message looks
    # like a leftover. It is kept so the script's stdout stays unchanged.
    print("Loaded PMIDs from corpus file...")

    relevantTerms = {
        'Pediatrics', 'Infant', 'Infant, Newborn', 'Child', 'Child, Preschool',
        'Adolescent', 'Birth Cohort',
        'Adult', 'Aged', 'Middle Aged', 'Young Adult',
    }

    print("Searching for MeSH terms in: ", sorted(relevantTerms))
    print()

    seenPMIDs = set()
    pmidToMesh = {}

    # Open the input in its own `with` so the file handle is always closed.
    with open(args.inBioc, 'rb') as inFile:
        with bioc.biocxml.iterparse(inFile) as docReader:
            for doc in docReader:
                # Skip documents with a missing, empty or literal 'None' value
                # for either the PMID or the MeSH headings.
                pmidText = doc.infons.get('pmid')
                if not pmidText or pmidText == 'None':
                    continue

                meshText = doc.infons.get('meshHeadings')
                if not meshText or meshText == 'None':
                    continue

                # Only the first document seen for each PMID is processed.
                pmid = int(pmidText)
                if pmid in seenPMIDs:
                    continue
                seenPMIDs.add(pmid)

                descriptorNames = extractRelevantMeshNames(meshText, relevantTerms)
                if descriptorNames:
                    pmidToMesh[pmid] = descriptorNames

    print("Found %d PubMed ID(s) with relevant MeSH terms" % len(pmidToMesh))

    # Keys are ints in memory; json.dump writes them out as strings.
    with gzip.open(args.outJSONGZ,'wt') as f:
        json.dump(pmidToMesh,f)