# nlp_chatbot.py
import pandas as pd
import nltk
import numpy as np
import re
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk.stem import wordnet # to perform lemmatization
from sklearn.feature_extraction.text import CountVectorizer # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from nltk import pos_tag # for parts of speech
from sklearn.metrics import pairwise_distances # to perform cosine similarity
from nltk import word_tokenize # to create tokens
from nltk.corpus import stopwords # for stop words
df=pd.read_excel('dialog_talk_agent.xlsx')
df.head(20)
df.shape[0] # returns the number of rows in dataset
df.ffill(axis=0, inplace=True) # forward-fills each null value with the previous row's value
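"""- For intuition, ffill copies the last non-null value downward, so a question row followed by blank rows inherits the question text; a minimal sketch on toy data, not the actual dataset:"""
pd.DataFrame({'Context': ['hi there', None, None]}).ffill(axis=0) # all three rows become 'hi there'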
"""faq = pd.read_excel("faq_data.xls")
faq = faq.drop(['Unnamed: 0'], axis=1)
faq.shape[0]
faq = faq.replace(r'^\s*$', np.nan, regex=True)
faq.isnull().sum()
faq.dropna(inplace=True)
faq.isnull().sum()
faq.shape[0]
faq = faq.rename(columns={'Question': 'Context', 'Answer': 'Text Response'})
df = pd.concat([df, faq], ignore_index=True)
"""
# word tokenizing
s='tell me about your personality'
words=word_tokenize(s)
lemma = wordnet.WordNetLemmatizer() # initializing the lemmatizer
lemma.lemmatize('absorbed', pos = 'v')
pos_tag(nltk.word_tokenize(s),tagset = None) # returns the parts of speech of every word
# function that performs the text normalization steps
def text_normalization(text):
    text = str(text).lower() # text to lower case
    spl_char_text = re.sub(r'[^ a-z]', '', text) # removing special characters
    tokens = nltk.word_tokenize(spl_char_text) # word tokenizing
    lema = wordnet.WordNetLemmatizer() # initializing the lemmatizer
    tags_list = pos_tag(tokens, tagset=None) # parts of speech
    lema_words = [] # empty list
    for token, pos_token in tags_list:
        if pos_token.startswith('V'): # verb
            pos_val = 'v'
        elif pos_token.startswith('J'): # adjective
            pos_val = 'a'
        elif pos_token.startswith('R'): # adverb
            pos_val = 'r'
        else:
            pos_val = 'n' # noun
        lema_token = lema.lemmatize(token, pos_val) # performing lemmatization
        lema_words.append(lema_token) # appending the lemmatized token to the list
    return " ".join(lema_words) # returns the lemmatized tokens as a sentence
text_normalization('telling you some stuff about me')
df['lemmatized_text'] = df['Context'].apply(text_normalization) # applying the function to the dataset to get clean text
df.tail(15)
# all the stop words we have
stop = stopwords.words('english')
"""# bag of words"""
cv = CountVectorizer() # initializing the count vectorizer
X = cv.fit_transform(df['lemmatized_text']).toarray()
# returns all the unique words from the data
features = cv.get_feature_names_out() # get_feature_names() was removed in newer scikit-learn
df_bow = pd.DataFrame(X, columns = features)
df_bow.head()
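"""- For intuition, bow simply counts word occurrences per document; a minimal sketch on a toy corpus, unrelated to the chatbot data:"""
toy_cv = CountVectorizer()
toy_X = toy_cv.fit_transform(['hello world', 'hello hello bot']).toarray()
pd.DataFrame(toy_X, columns=toy_cv.get_feature_names_out()) # counts of 'bot', 'hello', 'world' per row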
Question = 'Will you help me and tell me about yourself more' # considering an example query
# removing stop words from the query
Q = []
a = Question.split()
for i in a:
    if i not in stop:
        Q.append(i)
b = " ".join(Q)
Question_lemma = text_normalization(b) # applying the function we created for text normalization
Question_bow = cv.transform([Question_lemma]).toarray() # applying bow
Question_bow
"""# similarity"""
# cosine similarity for the above question we considered.
cosine_value = 1 - pairwise_distances(df_bow, Question_bow, metric='cosine')
cosine_value
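"""- As a sanity check, the cosine similarity for a single row can be computed directly from its definition, dot(u, v) / (||u|| * ||v||); a minimal sketch that should match the first entry of cosine_value (an all-zero row would need a guard against division by zero):"""
u = df_bow.iloc[0].values.astype(float)
v = Question_bow[0].astype(float)
np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)) # should equal cosine_value[0][0]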
df['similarity_bow']=cosine_value # creating a new column
df_simi = pd.DataFrame(df, columns=['Text Response', 'similarity_bow']) # pairing each response with its similarity to the query
df_simi
df_simi_sort = df_simi.sort_values(by='similarity_bow', ascending=False) # sorting the values
df_simi_sort.head()
threshold = 0.2 # keeping only responses with similarity greater than 0.2
df_threshold = df_simi_sort[df_simi_sort['similarity_bow'] > threshold]
df_threshold
"""- Finally using bow for the question 'Will you help me and tell me about yourself more' , the above are the responses we got using bow and the smiliarity value of responses, we consider the response with highest similarity"""
index_value = cosine_value.argmax() # returns the index number of highest value
index_value
Question
df['Text Response'].loc[index_value] # The text at the above index becomes the response for the question
"""# tf-idf"""
# using tf-idf
tfidf = TfidfVectorizer() # initializing tf-idf
x_tfidf=tfidf.fit_transform(df['lemmatized_text']).toarray() # transforming the data into array
Question1 ='Tell me about yourself.'
Question_lemma1 = text_normalization(Question1)
Question_tfidf = tfidf.transform([Question_lemma1]).toarray() # applying tf-idf
# returns all the unique word from data with a score of that word
df_tfidf = pd.DataFrame(x_tfidf, columns=tfidf.get_feature_names_out()) # get_feature_names() was removed in newer scikit-learn
df_tfidf.head()
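"""- A quick illustration of how tf-idf down-weights common words: on a toy corpus, a word appearing in every document gets the lowest idf; a minimal sketch, independent of the chatbot data:"""
toy_tfidf = TfidfVectorizer()
toy_tfidf.fit(['the cat sat', 'the dog sat', 'the bird flew'])
dict(zip(toy_tfidf.get_feature_names_out(), toy_tfidf.idf_)) # 'the' gets the lowest idf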
"""# similarity"""
cos = 1 - pairwise_distances(df_tfidf, Question_tfidf, metric='cosine') # applying cosine similarity
cos
df['similarity_tfidf']=cos # creating a new column
df_simi_tfidf = pd.DataFrame(df, columns=['Text Response', 'similarity_tfidf']) # pairing each response with its similarity to the query
df_simi_tfidf
df_simi_tfidf_sort = df_simi_tfidf.sort_values(by='similarity_tfidf', ascending=False) # sorting the values
df_simi_tfidf_sort.head(10)
threshold = 0.2 # keeping only responses with similarity greater than 0.2
df_threshold = df_simi_tfidf_sort[df_simi_tfidf_sort['similarity_tfidf'] > threshold]
df_threshold
"""- by using tfidf for the question 'Will you help me and tell me about yourself more' , the above are the responses we got and the smiliarity value of responses, we consider the response with highest similarity"""
index_value1 = cos.argmax() # returns the index number of highest value
index_value1
Question1
df['Text Response'].loc[index_value1] # returns the text at that index
"""# Model Using Bag of Words"""
# function that removes stop words and lemmatizes the text
def stopword_(text):
    tag_list = pos_tag(nltk.word_tokenize(text), tagset=None)
    stop = stopwords.words('english')
    lema = wordnet.WordNetLemmatizer()
    lema_word = []
    for token, pos_token in tag_list:
        if token in stop:
            continue
        if pos_token.startswith('V'): # verb
            pos_val = 'v'
        elif pos_token.startswith('J'): # adjective
            pos_val = 'a'
        elif pos_token.startswith('R'): # adverb
            pos_val = 'r'
        else:
            pos_val = 'n' # noun
        lema_token = lema.lemmatize(token, pos_val)
        lema_word.append(lema_token)
    return " ".join(lema_word)
# defining a function that returns a response to a query using bow
def chat_bow(text):
    s = stopword_(text)
    lemma = text_normalization(s) # calling the text-normalization function
    bow = cv.transform([lemma]).toarray() # applying bow
    cosine_value = 1 - pairwise_distances(df_bow, bow, metric='cosine')
    index_value = cosine_value.argmax() # index of the highest similarity
    return df['Text Response'].loc[index_value]
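"""- Example call (the exact response depends on the loaded dataset):"""
chat_bow('Will you help me and tell me about yourself more')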
"""# Model Using tf-idf"""
# defining a function that returns a response to a query using tf-idf
def chat_tfidf(text):
    lemma = text_normalization(text) # calling the text-normalization function
    tf = tfidf.transform([lemma]).toarray() # applying tf-idf
    cos = 1 - pairwise_distances(df_tfidf, tf, metric='cosine') # applying cosine similarity
    index_value = cos.argmax() # index of the highest similarity
    return df['Text Response'].loc[index_value]
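"""- Example call, plus a minimal interactive-loop sketch (the loop is an assumption about how the bot might be driven, not part of the original script):"""
chat_tfidf('Tell me about yourself.')

def run_chat():
    while True:
        user_input = input('You: ')
        if user_input.strip().lower() == 'quit': # hypothetical exit keyword
            break
        print('Bot:', chat_tfidf(user_input))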
"""# Conclusion
- Our chat bot worked well with both bow and tf-idf, tf-idf model worked well even with stop words compared to bow where we had to remove stop words before using bag of words.
"""