-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathgetPregnantPMIDs.py
45 lines (32 loc) · 1.5 KB
/
getPregnantPMIDs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import argparse
import bioc
# MeSH descriptor names that mark a document as pregnancy-related.
PREGNANT_MESH_TERMS = frozenset(['Pregnant Women', 'Pregnancy'])


def meshDescriptorNames(meshHeadingsField):
    """Return the descriptor name of every heading in a 'meshHeadings' infon.

    The field is split on tabs into headings, each heading is split on '~',
    and only the first '~'-separated element (the descriptor) is inspected;
    any following elements are ignored. The descriptor must consist of exactly
    four pipe-delimited columns, the last of which is taken as its name.

    Raises:
        ValueError: if a descriptor does not have exactly four columns.
    """
    names = []
    for heading in meshHeadingsField.split('\t'):
        descriptorText = heading.split('~')[0]
        columns = descriptorText.split('|')
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if len(columns) != 4:
            raise ValueError("Expected four pipe-delimited columns. Got: %s" % descriptorText)
        names.append(columns[3])
    return names


def hasPregnantMeshTerm(meshHeadingsField, pregnantTerms=PREGNANT_MESH_TERMS):
    """Return True if any descriptor name in the field is in pregnantTerms.

    Every heading is parsed (and validated) before the membership test, so a
    malformed heading raises ValueError even when an earlier heading matches.
    """
    return not set(pregnantTerms).isdisjoint(meshDescriptorNames(meshHeadingsField))


def collectPregnantPMIDs(documents, pregnantTerms=PREGNANT_MESH_TERMS):
    """Return the sorted, de-duplicated PMIDs of documents with a pregnant MeSH term.

    Args:
        documents: iterable of objects exposing an `infons` mapping.
        pregnantTerms: collection of descriptor names to look for.

    Documents whose 'pmid' or 'meshHeadings' infon is missing, empty, or the
    literal string 'None' are skipped.
    """
    found = set()
    for doc in documents:
        pmidText = doc.infons.get('pmid')
        if not pmidText or pmidText == 'None':
            continue
        meshHeadingsField = doc.infons.get('meshHeadings')
        if not meshHeadingsField or meshHeadingsField == 'None':
            continue
        if hasPregnantMeshTerm(meshHeadingsField, pregnantTerms):
            found.add(int(pmidText))
    return sorted(found)


def main():
    """Parse command-line arguments, scan the BioC file and write matching PMIDs."""
    argParser = argparse.ArgumentParser('Check if a document has pregnant MeSH terms')
    argParser.add_argument('--inBioc', required=True, type=str, help='Input BioC file')
    argParser.add_argument('--outTxt', required=True, type=str, help='Text file with PMIDs for pregnant documents')
    args = argParser.parse_args()

    # NOTE: status message kept verbatim for output compatibility; nothing has
    # actually been loaded at this point.
    print("Loaded PMIDs from corpus file...")

    # Open the input in its own `with` so the handle is closed even if parsing
    # fails, regardless of whether the BioC reader closes it itself.
    with open(args.inBioc, 'rb') as inHandle:
        with bioc.biocxml.iterparse(inHandle) as reader:
            pregnantPMIDs = collectPregnantPMIDs(reader, PREGNANT_MESH_TERMS)

    print("Found %d PubMed ID(s) with pregnant MeSH terms" % len(pregnantPMIDs))

    # One PMID per line, in ascending order.
    with open(args.outTxt, 'w') as f:
        f.writelines("%d\n" % pmid for pmid in pregnantPMIDs)


if __name__ == '__main__':
    main()