Commit 545e409 (0 parents)
Showing 12 changed files with 956 additions and 0 deletions.
@@ -0,0 +1,2 @@
# NLP_Based_Chatbot
Chatbot with a GUI interface
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,36 @@
# imports
from flask import Flask, render_template, request
from nlp_chatbot import chat_tfidf
# from chatbot import bot
from flask_cors import CORS
import json
from urllib.request import urlopen

# look up the visitor's public IP details via the ipinfo.io API
url = 'http://ipinfo.io/json'
response = urlopen(url)
data = json.load(response)

IP = data['ip']
org = data['org']
city = data['city']
country = data['country']
region = data['region']

app = Flask(__name__)
CORS(app)

print('Your IP details:\n')
print('IP : {0} \nRegion : {1} \nCountry : {2} \nCity : {3} \nOrg : {4}'.format(IP, region, country, city, org))


@app.route("/", methods=['GET'])
def index():
    return render_template("index1.html")


@app.route("/get")
def get_bot_response():
    userText = request.args.get('msg')
    message = str(chat_tfidf(userText))
    return message


if __name__ == "__main__":
    app.run()
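
A quick way to exercise the /get route (an illustrative addition, not part of the commit, assuming the app is running locally on Flask's default port 5000):

import requests

reply = requests.get('http://127.0.0.1:5000/get', params={'msg': 'hi'})
print(reply.text)  # the chatbot's reply to "hi"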
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,234 @@
import pandas as pd
import nltk
import numpy as np
import re
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk.stem import wordnet  # to perform lemmatization
from sklearn.feature_extraction.text import CountVectorizer  # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer  # to perform tf-idf
from nltk import pos_tag  # for parts of speech
from sklearn.metrics import pairwise_distances  # to perform cosine similarity
from nltk import word_tokenize  # to create tokens
from nltk.corpus import stopwords  # for stop words

df = pd.read_excel('dialog_talk_agent.xlsx')
df.head(20)

df.shape[0]  # returns the number of rows in the dataset

df.ffill(axis=0, inplace=True)  # fills each null value with the previous value
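
# Illustrative aside (added, not in the original): what ffill does on a toy frame.
toy = pd.DataFrame({'Context': ['hi there', None], 'Text Response': ['hello', 'hey']})
toy.ffill()  # the None in 'Context' is replaced by 'hi there' from the row above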
|
||
"""faq = pd.read_excel("faq_data.xls") | ||
faq = faq.drop(['Unnamed: 0'], axis=1) | ||
faq.shape[0] | ||
faq = faq.replace(r'^\s*$', np.nan, regex=True) | ||
faq.isnull().sum() | ||
faq.dropna(inplace=True) | ||
faq.isnull().sum() | ||
faq.shape[0] | ||
faq = faq.rename(columns={'Question': 'Context', 'Answer': 'Text Response'}) | ||
df = pd.concat([df, faq], ignore_index=True) | ||
""" | ||
|
||
# word tokenizing | ||
|
||
s='tell me about your personality' | ||
words=word_tokenize(s) | ||
|
||
lemma = wordnet.WordNetLemmatizer() # intializing lemmatizer | ||
lemma.lemmatize('absorbed', pos = 'v') | ||
|
||
pos_tag(nltk.word_tokenize(s),tagset = None) # returns the parts of speech of every word | ||
|
||
# function that performs text normalization steps | ||
|
||
def text_normalization(text): | ||
text=str(text).lower() # text to lower case | ||
spl_char_text=re.sub(r'[^ a-z]','',text) # removing special characters | ||
tokens=nltk.word_tokenize(spl_char_text) # word tokenizing | ||
lema=wordnet.WordNetLemmatizer() # intializing lemmatization | ||
tags_list=pos_tag(tokens,tagset=None) # parts of speech | ||
lema_words=[] # empty list | ||
for token,pos_token in tags_list: | ||
if pos_token.startswith('V'): # Verb | ||
pos_val='v' | ||
elif pos_token.startswith('J'): # Adjective | ||
pos_val='a' | ||
elif pos_token.startswith('R'): # Adverb | ||
pos_val='r' | ||
else: | ||
pos_val='n' # Noun | ||
lema_token=lema.lemmatize(token,pos_val) # performing lemmatization | ||
lema_words.append(lema_token) # appending the lemmatized token into a list | ||
|
||
return " ".join(lema_words) # returns the lemmatized tokens as a sentence | ||
|
||
text_normalization('telling you some stuff about me') | ||
|
||
df['lemmatized_text']=df['Context'].apply(text_normalization) # applying the fuction to the dataset to get clean text | ||
df.tail(15) | ||
|
||
# all the stop words we have | ||
|
||
stop = stopwords.words('english') | ||
|
||
"""# bag of words""" | ||
|
||
cv = CountVectorizer() # intializing the count vectorizer | ||
X = cv.fit_transform(df['lemmatized_text']).toarray() | ||
|
||
# returns all the unique word from data | ||
|
||
features = cv.get_feature_names() | ||
df_bow = pd.DataFrame(X, columns = features) | ||
df_bow.head() | ||
|
||
Question ='Will you help me and tell me about yourself more' # considering an example query | ||
|
||
# checking for stop words | ||
|
||
Q=[] | ||
a=Question.split() | ||
for i in a: | ||
if i in stop: | ||
continue | ||
else: | ||
Q.append(i) | ||
b=" ".join(Q) | ||
|
||
Question_lemma = text_normalization(b) # applying the function that we created for text normalizing | ||
Question_bow = cv.transform([Question_lemma]).toarray() # applying bow | ||
|
||
text_normalization | ||
|
||
Question_bow | ||
|
||
"""# similarity""" | ||
|
||
# cosine similarity for the above question we considered. | ||
|
||
cosine_value = 1- pairwise_distances(df_bow, Question_bow, metric = 'cosine' ) | ||
(cosine_value) | ||
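
# Illustrative check (added, not in the original): 1 - cosine distance equals
# the usual dot-product formula cos(u, v) = (u . v) / (||u|| * ||v||).
u = np.array([[1.0, 0.0, 1.0]])
v = np.array([[1.0, 1.0, 0.0]])
manual = (u @ v.T) / (np.linalg.norm(u) * np.linalg.norm(v))  # 0.5
via_sklearn = 1 - pairwise_distances(u, v, metric='cosine')   # also 0.5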

df['similarity_bow'] = cosine_value  # creating a new column

df_simi = pd.DataFrame(df, columns=['Text Response', 'similarity_bow'])  # taking the similarity value of each response for the question we took
df_simi

df_simi_sort = df_simi.sort_values(by='similarity_bow', ascending=False)  # sorting the values
df_simi_sort.head()

threshold = 0.2  # keeping only responses whose similarity is greater than 0.2
df_threshold = df_simi_sort[df_simi_sort['similarity_bow'] > threshold]
df_threshold

"""- Finally, using bow for the question 'Will you help me and tell me about yourself more', the above are the responses we got along with their similarity values; we take the response with the highest similarity."""

index_value = cosine_value.argmax()  # returns the index of the highest value
index_value

Question

df['Text Response'].loc[index_value]  # the text at the above index becomes the response to the question

"""# tf-idf"""

# using tf-idf

tfidf = TfidfVectorizer()  # initializing tf-idf
x_tfidf = tfidf.fit_transform(df['lemmatized_text']).toarray()  # transforming the data into an array
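
# Illustrative aside (added, not in the original): on a toy corpus, tf-idf
# down-weights words that occur in every document, unlike raw counts.
toy_tfidf = TfidfVectorizer().fit(['the cat sat', 'the dog sat', 'the bird flew'])
toy_tfidf.idf_  # 'the' appears in all three documents, so it gets the smallest idf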

Question1 = 'Tell me about yourself.'

Question_lemma1 = text_normalization(Question1)
Question_tfidf = tfidf.transform([Question_lemma1]).toarray()  # applying tf-idf

# returns all the unique words from the data with a score for each word

df_tfidf = pd.DataFrame(x_tfidf, columns=tfidf.get_feature_names_out())  # get_feature_names() was removed in scikit-learn 1.2
df_tfidf.head()

"""# similarity"""

cos = 1 - pairwise_distances(df_tfidf, Question_tfidf, metric='cosine')  # applying cosine similarity
cos

df['similarity_tfidf'] = cos  # creating a new column
df_simi_tfidf = pd.DataFrame(df, columns=['Text Response', 'similarity_tfidf'])  # taking the similarity value of each response for the question we took
df_simi_tfidf

df_simi_tfidf_sort = df_simi_tfidf.sort_values(by='similarity_tfidf', ascending=False)  # sorting the values
df_simi_tfidf_sort.head(10)

threshold = 0.2  # keeping only responses whose similarity is greater than 0.2
df_threshold = df_simi_tfidf_sort[df_simi_tfidf_sort['similarity_tfidf'] > threshold]
df_threshold

"""- By using tf-idf for the question 'Tell me about yourself.', the above are the responses we got along with their similarity values; we take the response with the highest similarity."""

index_value1 = cos.argmax()  # returns the index of the highest value
index_value1

Question1

df['Text Response'].loc[index_value1]  # returns the text at that index

"""# Model Using Bag of Words"""

# function that removes stop words and processes the text

def stopword_(text):
    tag_list = pos_tag(nltk.word_tokenize(text), tagset=None)
    stop = stopwords.words('english')
    lema = wordnet.WordNetLemmatizer()
    lema_word = []
    for token, pos_token in tag_list:
        if token in stop:
            continue
        if pos_token.startswith('V'):
            pos_val = 'v'
        elif pos_token.startswith('J'):
            pos_val = 'a'
        elif pos_token.startswith('R'):
            pos_val = 'r'
        else:
            pos_val = 'n'
        lema_token = lema.lemmatize(token, pos_val)
        lema_word.append(lema_token)
    return " ".join(lema_word)

# defining a function that returns a response to a query using bow

def chat_bow(text):
    s = stopword_(text)  # removing stop words
    lemma = text_normalization(s)  # calling the function to perform text normalization
    bow = cv.transform([lemma]).toarray()  # applying bow
    cosine_value = 1 - pairwise_distances(df_bow, bow, metric='cosine')
    index_value = cosine_value.argmax()  # getting the index value
    return df['Text Response'].loc[index_value]
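
# Example call (added for illustration, not in the original commit):
chat_bow('Will you help me and tell me about yourself more')  # returns the closest stored response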

"""# Model Using tf-idf"""

# defining a function that returns a response to a query using tf-idf

def chat_tfidf(text):
    lemma = text_normalization(text)  # calling the function to perform text normalization
    tf = tfidf.transform([lemma]).toarray()  # applying tf-idf
    cos = 1 - pairwise_distances(df_tfidf, tf, metric='cosine')  # applying cosine similarity
    index_value = cos.argmax()  # getting the index value
    return df['Text Response'].loc[index_value]
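
# Example call (added for illustration, not in the original commit):
chat_tfidf('Tell me about yourself.')  # the function the Flask app calls for each message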

"""# Conclusion
- Our chatbot worked well with both bow and tf-idf. The tf-idf model handled queries well even with stop words left in, whereas with bag of words we had to remove stop words before matching.
"""
Binary file not shown.