Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added assets/profile-white-logo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
71 changes: 71 additions & 0 deletions clustering/agglomerative.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from os import truncate
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from keybert import KeyBERT
import pandas as pd


class Agglomerative:
    """Cluster sentences with agglomerative clustering over Cohere embeddings.

    Constructor arguments are stored unchanged on the instance:
        sentences: sequence of utterances to cluster.
        parameter: value passed as ``distance_threshold`` to
            ``AgglomerativeClustering``.
        cohere_client: object whose ``embed(texts=..., model=..., truncate=...)``
            result exposes an ``embeddings`` attribute.
    """

    def __init__(self, sentences, parameter, cohere_client):
        self.sentences = sentences
        self.parameter = parameter
        self.cohere_client = cohere_client

    def find_cluster_name(self, sentences, embedding_model):
        """Get the cluster name.

        Args:
            sentences (list): utterances that belong to one cluster.
            embedding_model: value forwarded to ``KeyBERT(model=...)``.

        Returns:
            str: the first keyphrase KeyBERT returns for the comma-joined
            utterances. If KeyBERT returns no keyphrase at all, the first
            utterance is used instead (or '' when there is none).
        """
        # NOTE(review): get_clusters passes the Cohere embed *response* here,
        # not an embedding model -- confirm KeyBERT handles that as intended.
        kw_model = KeyBERT(model=embedding_model)
        document = ','.join(str(sentence) for sentence in sentences)
        keywords = kw_model.extract_keywords(
            document,
            keyphrase_ngram_range=(1, 3),
            stop_words='english',
            use_mmr=True,
            diversity=0.7,
        )
        if not keywords:
            # No candidate keyphrase was extracted; avoid an IndexError and
            # still give the cluster a usable name.
            return str(sentences[0]) if len(sentences) > 0 else ''
        return keywords[0][0]

    def get_clusters(self):
        """Return clusters of sentences.

        Returns:
            tuple: ``(df, embeddings)`` where ``df`` is a dataframe with the
            columns ``Cluster_name`` and ``Utterance`` (one row per sentence)
            and ``embeddings`` is the array of unit-normalised embeddings.
            For empty input an empty dataframe and an empty array are
            returned without calling the embedding client.
        """
        columns = ['Cluster_name', 'Utterance']
        if len(self.sentences) == 0:
            # Nothing to embed or cluster.
            return pd.DataFrame(columns=columns), np.empty((0, 0))

        corpus_embeddings_model = self.cohere_client.embed(
            texts=self.sentences, model='large', truncate='LEFT')
        corpus_embeddings_embed = np.asarray(
            corpus_embeddings_model.embeddings, dtype=float)
        # Normalize the embeddings to unit length
        corpus_embeddings_embed = corpus_embeddings_embed / np.linalg.norm(
            corpus_embeddings_embed, axis=1, keepdims=True)

        if len(self.sentences) < 2:
            # AgglomerativeClustering needs at least two samples; a single
            # sentence trivially forms one cluster.
            cluster_assignment = np.zeros(len(self.sentences), dtype=int)
        else:
            clustering_model = AgglomerativeClustering(
                n_clusters=None, distance_threshold=self.parameter)
            clustering_model.fit(corpus_embeddings_embed)
            cluster_assignment = clustering_model.labels_

        # Group the sentences by the cluster label assigned to them.
        clustered_sentences = {}
        for sentence_id, cluster_id in enumerate(cluster_assignment):
            clustered_sentences.setdefault(cluster_id, []).append(
                self.sentences[sentence_id])

        for cluster_id, cluster in clustered_sentences.items():
            print(f'Cluster: {cluster_id+1}')
            print(cluster)
            print("")

        # One [cluster name, utterance] row per sentence.
        final_clustered_names = []
        for cluster in clustered_sentences.values():
            cluster_name = self.find_cluster_name(
                cluster, corpus_embeddings_model)
            final_clustered_names.extend(
                [cluster_name, utterance] for utterance in cluster)

        df = pd.DataFrame(final_clustered_names, columns=columns)
        return df, corpus_embeddings_embed
47 changes: 47 additions & 0 deletions clustering/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@


from mimetypes import init
from concurrent.futures import ThreadPoolExecutor
import getpass
import logging

import cohere
import numpy as np
from clustering.agglomerative import Agglomerative


class Clustering:
    """Entry point that clusters sentences with the selected technique.

    Args:
        technique: name of the clustering technique; only 'Agglomerative'
            is handled by ``get_clusters``.
        api_key: Cohere API key. When None (and ``mockAPI`` is False) the key
            is requested interactively through ``getpass``.
        mockAPI: when True a ``MockCohereAPI`` is used instead of a real
            ``cohere.Client``.
    """

    def __init__(self, technique: str = 'Agglomerative', api_key: str = None, mockAPI: bool = False):
        self.technique = technique
        if mockAPI:
            self.co = MockCohereAPI()
        else:
            if api_key is None:
                api_key = getpass.getpass('Enter your Cohere API Key')

            self.co = cohere.Client(api_key)

    def get_clusters(self, sentences, parameter):
        """Cluster ``sentences`` using the configured technique.

        Args:
            sentences: utterances to cluster.
            parameter: technique-specific parameter, forwarded to the
                technique implementation.

        Returns:
            tuple: ``(cluster_assignments, embeddings)`` as returned by
            ``Agglomerative.get_clusters``.

        Raises:
            ValueError: if ``self.technique`` is not a supported technique.
                Previously this case fell through and returned None, which
                made callers fail later while unpacking the result.
        """
        if self.technique == 'Agglomerative':
            agglomerative = Agglomerative(sentences, parameter, self.co)
            cluster_assignments, embeddings = agglomerative.get_clusters()
            return cluster_assignments, embeddings

        raise ValueError(
            f"Unsupported clustering technique: {self.technique!r}")


class MockCohereAPI:
    """Mock Cohere API for testing."""

    def __init__(self):
        pass

    def generate(self, **kwargs):
        """Return two fixed generations ('two' and 'one'); kwargs are ignored."""
        mock_gens_list = [
            cohere.generation.Generation(
                text='two', likelihood=-10, token_likelihoods=None),
            cohere.generation.Generation(
                text='one', likelihood=-5, token_likelihoods=None)
        ]
        mock_gens = cohere.generation.Generations(
            generations=mock_gens_list, return_likelihoods=None)

        return mock_gens

    def embed(self, texts, **kwargs):
        """Return deterministic fake embeddings, one vector per text.

        The clustering pipeline calls ``embed(texts=..., model=...,
        truncate=...)`` and reads ``.embeddings`` from the result; without
        this method the mock could not be used with ``Clustering``.

        Args:
            texts: iterable of texts to embed.
            **kwargs: accepted for call compatibility and ignored.

        Returns:
            An object with an ``embeddings`` attribute: a list with one
            16-dimensional list of floats per input text. Every component is
            >= 1.0, so no vector has zero norm.
        """
        # Local imports keep the mock self-contained.
        import hashlib
        from types import SimpleNamespace

        embeddings = []
        for text in texts:
            digest = hashlib.sha256(str(text).encode('utf-8')).digest()
            embeddings.append([1.0 + byte / 255.0 for byte in digest[:16]])
        return SimpleNamespace(embeddings=embeddings)
91 changes: 91 additions & 0 deletions streamlit_app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import streamlit as st
import numpy as np
from collections import OrderedDict
import pandas as pd
from streamlit_option_menu import option_menu
from clustering.model import Clustering
import topically
from topic_visualization import umap_reduce
import plotly.express as px

APP_NAME = "Cohere Topic Modelling co:topic 🚀"


###### HELPER METHOD ######

def display_clustering(selected2, sentences, parameter):
    """Cluster ``sentences`` with the technique named by ``selected2``.

    Returns the ``(dataframe, embeddings)`` pair produced by
    ``Clustering.get_clusters``.
    """
    # NOTE(review): the second argument is the literal string 'api_key',
    # presumably a placeholder for a real Cohere key -- confirm.
    clusterer = Clustering(selected2, 'api_key')
    clustered_df, sentence_embeddings = clusterer.get_clusters(
        sentences, parameter)
    return clustered_df, sentence_embeddings

####### STREAMLIT COMPONENTS FOR CONTROLLING PARAMETERS ########

######### CREATE SCATTER PLOT OF VISUALIZING TOPICS #########

def create_scatter(table_df, embeddings):
    """Render the topic scatter plot in the Streamlit page.

    The embeddings are reduced with ``umap_reduce`` and plotted with the
    reduced coordinate on the x axis and the topic label on the y axis.
    """
    figure = px.scatter(
        umap_reduce(table_df, embeddings),
        x='x',
        y='label',
        color='label',
        labels={'color': 'label'},
        hover_data=['text'],
        title='Topic Visualization',
    )
    st.plotly_chart(figure)


##########################
# UI
##########################
if __name__ == '__main__':
    # Streamlit page: upload a .txt file of utterances, cluster them, name
    # the topics and show a table plus a scatter plot.
    st.set_page_config(
        page_title=APP_NAME,
        page_icon=":bar_chart:",
        layout="wide",
        initial_sidebar_state="expanded",
    )
    # title and image
    sentences = []
    h1, h2 = st.columns([1, 10])
    h1.image('./assets/profile-white-logo.png', width=90)
    h2.title('Topic Modelling with Cohere')
    with st.empty():
        st.write('\n#\n')
    with st.form('Utterances'):
        uploaded_file = st.file_uploader(
            "Add lines of text for Topic modelling(.txt file)", type=["txt"])
        selected2 = option_menu(None, ["Agglomerative", "K-Means", "DBSCAN"],
                                icons=['tree', 'collection', 'upc-scan'],
                                menu_icon="cast", orientation="horizontal")
        # Only some techniques expose a slider; keep `parameter` bound for
        # the others so submitting the form cannot raise a NameError.
        parameter = None
        if selected2 == 'Agglomerative':
            parameter = st.sidebar.slider(
                "distance_threshold", min_value=1.4, max_value=2.0, value=1.4, step=0.05
            )
        elif selected2 == 'K-Means':
            parameter = st.sidebar.slider(
                "Value of K", min_value=2, max_value=10, value=2, step=1
            )
        option = st.selectbox(
            'Choose a Topic Modelling Approach', ('KeyBert + Prompt', 'BERTopic', 'Latent Dirichlet Allocation'))
        submitted = st.form_submit_button('Submit')

    if submitted:
        print(selected2)
        if uploaded_file:
            for line in uploaded_file:
                # Drop the line terminator (including '\r' from CRLF files)
                # and skip blank lines so they are not treated as utterances.
                text = line.decode("utf-8").rstrip('\r\n')
                if text.strip():
                    sentences.append(text)
            print(sentences)

        if not uploaded_file or not sentences:
            st.write('Please check the input file 💥')
        elif parameter is None:
            # No parameter control exists for this technique.
            st.write(f'{selected2} clustering is not available yet')
        else:
            with st.spinner(text='Clustering in progress'):
                df, embeddings = display_clustering(selected2, sentences, parameter)
                print(embeddings)
                app = topically.Topically(option, 'api_key')
                df['topic_name'], topic_names = app.name_topics((df['Utterance'], df['Cluster_name']))
                table_df = df.drop('Cluster_name', axis=1)

            st.table(table_df)
            with st.container():
                with st.empty():
                    st.write('\n#\n#\n')
                with st.expander('Topic Visualization'):
                    create_scatter(table_df, embeddings)
44 changes: 44 additions & 0 deletions test.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
Please help me with my card. It won't activate
I tired but an unable to activate my card
I want to start using my card., How do I verify my new card?
I tried activating my plug-in and it didn't piece of work
I want to open an account for my children
How old do you need to be to use the banks services?
Whats the minimum age to have an account
Can my children open an account?, How old do I need to be?
Cancel a transaction
Am I able to cancel a transfer I just made
I made a mistake this morning when I did a transfer. How do I reverse it?Can I cancel a transaction?
What do I do if the ATM took my card?
An ATM machine didn't give me back my card., My card got trapped inside an ATM, what should I do?
What do I do if I can't get my card out of the ATM?
Is there a top up fee for transfer?
Will there be a charge for topping up by account with a SEPA transfer?
What are the charges for receiving a SEPA transfer?
Is there a charge for SEPA transfers?
Will I be charged a fee for a SEPA transfer?
Why do you have an identity check?
I do not feel comfortable verifying my identity
Why on earth do you need so much personal id info from me?
DO you know the reason for the identity check?
I answered so many questions about my identity. Why do you need this info?
atm cash limits
atm withdrawal limit
automated teller machine cash withdrawal limit
how much can i take from an atm
how much can i take out from the atm
how much can i take out from the atm at once
how much can i take out from the atm per day
how much cash can be withdrawn from an atm on a daily basis
How much cash can I get from an ATM?
how much money can i get from an atm
how much money can i withdrawal from an atm
how much money can i withdrawal from an atm in 24 hours
is there a limit on how much money i can withdrawal from an atm
Is there a maximum amount of money I can withdrawal from an ATM?
limits to atm withdrawal
tell me the atm cash limit
what are the bank atm limits
whats the limits of withdrawal from an atm
whats the max amount of cash i can take out at a time
withdrawal limits from atm
28 changes: 28 additions & 0 deletions topic_visualization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import umap.umap_ as umap
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE


def umap_reduce(table_df, embeds):
    """
    Reduce the embeddings to a single t-SNE dimension for plotting.

    Despite its name, this function uses scikit-learn's TSNE, not UMAP.

    param table_df: dataframe with 'topic_name' and 'Utterance' columns,
                    one row per embedding, in the same order as embeds
    param embeds: embedding array of the sentences

    returns: dataframe with the reduced coordinate in column 'x', the topic
             name in 'label' and the utterance in 'text'
    """
    # t-SNE requires perplexity < number of samples; its default of 30 would
    # fail for small inputs, so cap it. Larger inputs keep the default.
    n_samples = len(embeds)
    perplexity = min(30.0, max(n_samples - 1, 1))
    X_embedded = TSNE(
        random_state=0, n_components=1, perplexity=perplexity
    ).fit_transform(embeds)

    df_embeddings = pd.DataFrame(X_embedded)
    # Only column 0 exists with n_components=1; the 1 -> 'y' mapping is a
    # no-op kept for callers that may switch to two components.
    df_embeddings = df_embeddings.rename(columns={0: 'x', 1: 'y'})
    df_embeddings = df_embeddings.assign(label=table_df['topic_name'].values)
    df_embeddings = df_embeddings.assign(text=table_df['Utterance'].values)
    return df_embeddings





19 changes: 11 additions & 8 deletions topically/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,21 @@
import numpy as np

from .cluster_namers import ClusterNamer
from .prompts.prompts import generic_cluster_naming_prompt
# from .prompts.prompts import generic_cluster_naming_prompt
from .prompts.prompts_keybert import generic_cluster_naming_prompt_keybert


class Topically(object):

def __init__(self, api_key: str = None, mockAPI: bool = False):
def __init__(self, source_model='KeyBert + Prompt',api_key: str = None, mockAPI: bool = False):
if mockAPI:
self.co = MockCohereAPI()
else:
if api_key is None:
api_key = getpass.getpass('Enter your Cohere API Key')

self.co = cohere.Client(api_key)
self.source_model = source_model

# TODO: Encapsulate this functionality into cluster_namers
def name_topics(self, X, prompt: str = '', num_generations=1, num_sample_texts=10):
Expand Down Expand Up @@ -55,14 +57,16 @@ def name_topics(self, X, prompt: str = '', num_generations=1, num_sample_texts=1
if isinstance(texts, list):
texts = np.array(texts)

if prompt == '':
prompt = generic_cluster_naming_prompt

if self.source_model == 'KeyBert + Prompt':
prompt = generic_cluster_naming_prompt_keybert

# Instantiate ClusterNamer
cluster_namer = ClusterNamer(self.co, prompt, num_generations=num_generations)
cluster_namer = ClusterNamer(self.co, prompt, num_generations=num_generations,source_model=self.source_model)

# Get the unique cluster assignments
unique_cluster_assignments = np.unique(cluster_assignments)
print(unique_cluster_assignments)

# Create a dictionary to store the cluster names for each cluster
cluster_names = {}
Expand All @@ -79,7 +83,7 @@ def name_cluster(cluster_number):
else:
sample_texts_from_cluster = cluster_texts

cluster_name = cluster_namer.predict(sample_texts_from_cluster)
cluster_name = cluster_namer.predict(sample_texts_from_cluster,cluster_number)

logging.info(f'naming cluster {cluster_number}: {cluster_name}')

Expand Down Expand Up @@ -111,12 +115,11 @@ def name_cluster(self, cluster_texts, temperature=0.6, num_generations=1):
The cluster name assigned to the cluster

"""

# Create the prompt, starting with the global task description
prompt = 'The following texts are from the same cluster. Please name the cluster.'

# Add the data of the current cluster we want to label
prompt += self.construct_example_for_prompt(cluster_texts)
prompt += self.construct_example_for_prompt(cluster_texts,self.source_model)

# Generate the cluster name
cluster_name = self.generate(prompt, temperature=temperature, num_generations=num_generations)[0]
Expand Down
Loading