-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPreProcess.py
More file actions
37 lines (29 loc) · 772 Bytes
/
PreProcess.py
File metadata and controls
37 lines (29 loc) · 772 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
# Import-time side effect: request the NLTK resources this module relies on.
# 'stopwords' backs stopwords.words('english') in RemoveStopWords, and
# 'wordnet' is the corpus used by WordNetLemmatizer in Lemmatize.
nltk.download('stopwords')
nltk.download('wordnet')
def Tokenize(string):
    """Split *string* into a list of whitespace-separated tokens.

    ``str.split()`` is called without arguments, so any run of whitespace
    acts as a single separator and leading/trailing whitespace produces no
    empty tokens.
    """
    return string.split()
def RemoveStopWords(string, stop_words=None):
    """Remove stop words from a list of tokens, in place.

    Args:
        string: Mutable list of word tokens (despite the name, not a ``str``).
        stop_words: Optional iterable of lowercase words to drop. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        The same list object that was passed in, with every token whose
        lowercase form is a stop word removed. Order of the remaining
        tokens is preserved.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives O(1) membership tests instead of scanning a list per token.
    stop_set = set(stop_words)
    # Build the filtered result first and assign it back in one step. Calling
    # list.remove() while iterating the same list skips the element that
    # follows each removal, which let consecutive stop words slip through.
    string[:] = [word for word in string if word.lower() not in stop_set]
    return string
def Lemmatize(string):
    """Lemmatize every token with NLTK's ``WordNetLemmatizer``.

    Args:
        string: Iterable of word tokens (a list in this module's pipeline).

    Returns:
        A ``(lemmas, tokens)`` tuple: ``lemmas`` is a new list holding
        ``lemmatize(token)`` (called with no part-of-speech argument) for
        each token in order, and ``tokens`` is the argument itself,
        returned untouched.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in string]
    return lemmas, string
def Refine(string):
    """Run the full preprocessing pipeline on a raw text string.

    The text is tokenized with ``Tokenize``, filtered with
    ``RemoveStopWords``, lemmatized with ``Lemmatize``, and the resulting
    lemmas are joined back together with single spaces.

    Returns:
        The processed text as one ``str``.
    """
    tokens = Tokenize(string)
    content_tokens = RemoveStopWords(tokens)
    # Lemmatize also hands back its input tokens; only the lemmas are used.
    lemmas, _ = Lemmatize(content_tokens)
    return " ".join(lemmas)