-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_analysis.py
More file actions
32 lines (30 loc) · 1.13 KB
/
data_analysis.py
File metadata and controls
32 lines (30 loc) · 1.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import re
import html
# Preprocess text function
def preprocess_text(text: str) -> str:
    """Clean a raw text snippet by decoding HTML entities and stripping noise.

    The steps run in this order:

    1. Repair numeric character references that lost their leading ``&``
       (``#39;`` -> ``&#39;``) so they can be decoded.
    2. Decode HTML entities with ``html.unescape``.
    3. Remove every token that starts with ``http``.
    4. Remove e-mail-like tokens (``something@something``).
    5. Remove ``@name`` handles.
    6. Remove literal ``</b>`` tags.
    7. Remove double quotes and leftover ``quot;`` fragments.
    8. Turn leftover ``amp;``, ``lt;`` and ``gt;`` fragments into
       ``&``, ``<`` and ``>``.
    9. Remove parenthesised text (non-greedy, single line).
    10. Collapse whitespace runs to one space and trim both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # 1. Re-attach the missing '&' to bare numeric references. The lookbehind
    #    skips references that already have their '&' (otherwise '&#39;'
    #    would turn into '&&#39;') and ones glued to a preceding digit. The
    #    lookahead restricts the repair to real decimal/hex references, so
    #    something like '#hashtag;' is left alone. Both are zero-width, which
    #    lets back-to-back references ('#39;#39;') all be repaired.
    text = re.sub(r'(?<![&\d])#(?=(?:\d+|[xX][0-9a-fA-F]+);)', '&#', text)
    # 2. Convert HTML character references to their unicode characters.
    text = html.unescape(text)
    # 3. Remove URLs ('http\S+' already covers the 'https' case).
    text = re.sub(r'http\S+', '', text)
    # 4. Remove e-mail addresses (must run before the handle removal below,
    #    otherwise only the '@domain' half would be stripped).
    text = re.sub(r'\S+@\S+', '', text)
    # 5. Remove twitter ids.
    text = re.sub(r'@\w+', '', text)
    # 6. Remove "</b>".
    text = text.replace('</b>', '')
    # 7. Remove double quotes and leftover 'quot;' fragments.
    text = re.sub(r'"|quot;', '', text)
    # 8-10. Turn entity fragments that lost their '&' into the real character.
    text = text.replace('amp;', '&')
    text = text.replace('lt;', '<')
    text = text.replace('gt;', '>')
    # 11. Remove the text inside parentheses (shortest match, no newlines).
    text = re.sub(r'\(.*?\)', '', text)
    # 12. Remove extra spaces.
    return re.sub(r'\s+', ' ', text).strip()