-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess-data.py
More file actions
94 lines (62 loc) · 2.2 KB
/
process-data.py
File metadata and controls
94 lines (62 loc) · 2.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# import regular expression library
import re as re
def preprocessLine(inputLine):
    """Decode XML character references in a line and strip its markup.

    The line is expected to be one row of the input XML file whose text
    content is entity-escaped (possibly twice, e.g. ``&amp;lt;`` for ``<``).
    The references are converted back to their literal characters, every
    double quote and every ``/>`` terminator is dropped, and finally anything
    that looks like a tag (``<...>``) is removed.

    input:
        (string) inputLine, a line from input XML file
    returns:
        (string) cleanTxt, cleaned body with HTML tags removed
    """
    # Order matters: each replacement runs over the result of the previous
    # one, so the table is applied sequentially rather than in a single pass.
    replacements = (
        # first pass over "&amp;" unwraps double-escaped references such as
        # "&amp;lt;" so the entries below can decode them
        ('&amp;', '&'),
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&apos;', '\''),
        ('&quot;', '"'),
        # second pass catches references that were escaped one level deeper
        ('&amp;', '&'),
        # encoded line feed / carriage return become plain spaces so that a
        # record stays on a single output line
        ('&#xA;', ' '),
        ('&#xD;', ' '),
        # dropping the quotes and the self-closing terminator lets the tag
        # pattern below swallow the row's attribute prefix together with the
        # first tag that follows it
        ('"', ''),
        ('/>', ''),
    )
    for reference, literal in replacements:
        inputLine = inputLine.replace(reference, literal)

    # remove the shortest "<...>" runs, i.e. every remaining tag
    cleanTxt = re.sub(r'<.*?>', '', inputLine)
    return cleanTxt
def splitFile(inFile, outFileQ, outFileA):
    """Split the posts of an XML file into a question file and an answer file.

    The input is read line by line. A line whose ``PostTypeId`` attribute is
    exactly ``"1"`` is treated as a question and one whose value is exactly
    ``"2"`` as an answer; each such line is cleaned with ``preprocessLine``
    and written to the matching output file. Lines without the attribute, or
    with any other value, are skipped.

    input:
        (file) inFile, path of the input file to be read
        (file) outFileQ, path of the output file to be written for questions
        (file) outFileA, path of the output file to be written for answers
    returns:
        None
    """
    # Capture the whole quoted value instead of peeking at a single character:
    # this avoids an IndexError on a truncated line and keeps values such as
    # "10" or "25" from being mistaken for "1" or "2".
    postTypePattern = re.compile(r'PostTypeId="([^"]*)"')

    # distinct handle name so the path parameter is not shadowed
    with open(inFile, 'r', encoding='utf-8') as fIn, \
            open(outFileQ, 'w', encoding='utf-8') as fOutQ, \
            open(outFileA, 'w', encoding='utf-8') as fOutA:
        # post type value -> destination file
        destinations = {'1': fOutQ, '2': fOutA}
        for line in fIn:
            postIdType = postTypePattern.search(line)
            if postIdType is None:
                continue
            target = destinations.get(postIdType.group(1))
            if target is not None:
                target.write(preprocessLine(line))
if __name__ == '__main__':
    # Script entry point: read the XML dump from the working directory and
    # write the cleaned questions and answers next to it.
    data_path = 'data.xml'
    question_path = 'question.txt'
    answer_path = 'answer.txt'
    splitFile(inFile=data_path, outFileQ=question_path, outFileA=answer_path)