-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokenize_utils.py
73 lines (71 loc) · 2.72 KB
/
tokenize_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import re
# Separators used to cut English text into fragments.  The outer capturing
# group makes re.split() return the separators too, so the loop below can
# react to them (e.g. re-attach "?" or carry a "(iv)" marker forward).
_ENG_SPLIT_RE = re.compile(
    r'((?<=[^A-Z])\. *(?=[A-Z|"])|\? |\n|\*\*|\([MDCLXVImdclxvi]+\)|^[0-9]+\.[^0-9])'
)
_ENG_DIGITS_DOT_RE = re.compile(r'(?<=[0-9][0-9])\.')
_ENG_CAPS_DOT_RE = re.compile(r'(?<=[A-Z][A-Z])\. ')
_ENG_LOWER_DOT_RE = re.compile(r'(?<=[a-z])\. +')
_ENG_ROMAN_TAIL_RE = re.compile(r'\([MDCLXVImdclxvi]+\)$')
_ENG_QUESTION_TAIL_RE = re.compile(r'\? $')
_ENG_DIGITS_ONLY_RE = re.compile(r'[0-9]+')
_ENG_LOWER_ONLY_RE = re.compile(r'[a-z]+')

# Abbreviations whose trailing space is removed before splitting.
_ENG_ABBREVIATIONS = (
    'Prof.', 'Dr.', 'Mr.', 'Mrs.', 'Ms.', 'viz.', 'Hon.',
    'i.e.', 'Smt.', 'Shri.', 'St.', 'Lt.',
)


def tokenize_eng_file(mainString):
    """Split English text into a list of sentence strings.

    The text is first normalised (periods after two digits get a trailing
    space, a period after two capitals is moved behind its space, runs of
    spaces after a lowercase-letter period collapse to one, and the space
    after a fixed set of abbreviations is removed).  It is then split on
    sentence-ending periods, "? ", newlines, "**", parenthesised Roman
    numerals and a leading "<number>." marker.

    Fragments that are blank, a lone ".", only ASCII digits, only lowercase
    ASCII letters, or that contain "*" are discarded.  A "? " separator
    re-attaches "?" to the previously stored sentence; a Roman-numeral marker
    such as "(iv)" is prepended to the fragment that follows it.

    Args:
        mainString: The raw English text.

    Returns:
        A list of stripped sentence strings, with every ". " inside a
        sentence compacted to ".".
    """
    text = _ENG_DIGITS_DOT_RE.sub('. ', mainString)
    text = _ENG_CAPS_DOT_RE.sub(' .', text)
    text = _ENG_LOWER_DOT_RE.sub('. ', text)
    for abbreviation in _ENG_ABBREVIATIONS:
        text = text.replace(abbreviation + ' ', abbreviation)
    # Every "i.e." ends up followed by exactly one added space.
    text = text.replace('i.e.', 'i.e. ')
    # NOTE(review): the split pattern accepts zero spaces after a period, so
    # "Dr.Smith" produced above is still split at the period -- confirm
    # whether the abbreviation handling is meant to prevent that.

    sentences = []
    carried_marker = ''  # Roman-numeral marker waiting for its sentence text
    for fragment in _ENG_SPLIT_RE.split(text):
        fragment = carried_marker + fragment
        carried_marker = ''
        stripped = fragment.strip()

        if not stripped:
            continue
        if _ENG_ROMAN_TAIL_RE.search(fragment):
            # Carry the marker into the next fragment.  Using a carry rather
            # than writing into the next list slot means a marker at the very
            # end of the text is dropped instead of raising IndexError.
            carried_marker = fragment
            continue
        if stripped == '.':
            continue
        if _ENG_QUESTION_TAIL_RE.search(fragment):
            # Give the question mark back to the sentence it terminated.  If
            # nothing has been stored yet (text starts with "? ", or the
            # preceding fragment was filtered out) there is nothing to attach
            # it to, so it is dropped rather than raising IndexError.
            if sentences:
                sentences[-1] += fragment[0]
            continue
        if _ENG_DIGITS_ONLY_RE.fullmatch(stripped):
            continue
        if _ENG_LOWER_ONLY_RE.fullmatch(stripped):
            continue
        if '*' in fragment:
            continue
        sentences.append(stripped.replace('. ', '.'))
    return sentences
# Separators used to cut Hindi text into fragments.  The outer capturing group
# makes re.split() return the separators too, so the loop below can react to
# them (e.g. re-attach "?").
_HI_SPLIT_RE = re.compile(
    r'(। +|\? |\n|\*\*|\([MDCLXVImdclxvi]+\)|[0-9]+\.[^0-9])'
)
_HI_QUESTION_TAIL_RE = re.compile(r'\? $')
_HI_ROMAN_TAIL_RE = re.compile(r'\([MDCLXVImdclxvi]+\)$')
_HI_DANDA_TAIL_RE = re.compile(r'। +$')
_HI_NUMBER_DOT_RE = re.compile(r'[0-9]+\.')
_HI_LOWER_ONLY_RE = re.compile(r'[a-z]+')


def tokenize_hi_file(mainHinString):
    """Split Hindi text into a list of sentence strings.

    The text is split on a danda followed by spaces, "? ", newlines, "**",
    parenthesised Roman numerals and "<number>." followed by a non-digit.

    Fragments that are blank, end in a Roman-numeral marker, end in a danda
    plus spaces, consist of "<number>." only, consist of lowercase ASCII
    letters only, or contain "*" are discarded.  A "? " separator re-attaches
    "?" to the previously stored sentence.

    Args:
        mainHinString: The raw Hindi text.

    Returns:
        A list of stripped sentence strings.
    """
    sentences = []
    for fragment in _HI_SPLIT_RE.split(mainHinString):
        stripped = fragment.strip()
        if not stripped:
            continue
        if _HI_QUESTION_TAIL_RE.search(fragment):
            # Give the question mark back to the sentence it terminated.  If
            # nothing has been stored yet (text starts with "? ", or the
            # preceding fragment was filtered out) there is nothing to attach
            # it to, so it is dropped rather than raising IndexError.
            if sentences:
                sentences[-1] += fragment[0]
            continue
        # Pure separator / numbering fragments carry no sentence text.
        if (
            _HI_ROMAN_TAIL_RE.search(fragment)
            or _HI_DANDA_TAIL_RE.search(fragment)
            or _HI_NUMBER_DOT_RE.fullmatch(stripped)
            or _HI_LOWER_ONLY_RE.fullmatch(stripped)
            or '*' in fragment
        ):
            continue
        sentences.append(stripped)
    return sentences