cohere-ai · SangeethaVenkatesan · Dec 15, 2022
diff --git a/assets/profile-white-logo.png b/assets/profile-white-logo.png
diff --git a/clustering/agglomerative.py b/clustering/agglomerative.py
@@ -0,0 +1,71 @@
+from os import truncate
+from sklearn.cluster import AgglomerativeClustering
+import numpy as np
+from keybert import KeyBERT
+import pandas as pd
+
+
+class Agglomerative:
+    """_Agglomerative clustering of sentences over Cohere embeddings_
+
+    Args:
+        sentences (_List_): _Utterances to cluster_
+        parameter: _Value forwarded as distance_threshold to AgglomerativeClustering_
+        cohere_client: _Client object whose embed() method is called_
+    """
+
+    def __init__(self, sentences, parameter, cohere_client):
+        self.cohere_client = cohere_client
+        self.parameter = parameter
+        self.sentences = sentences
+
+    def find_cluster_name(self, sentences, embedding_model):
+        """_Name a cluster after the first keyphrase KeyBERT returns_
+        Args:
+            sentences (_List_): _Utterances belonging to one cluster_
+            embedding_model: _Passed unchanged as the model argument of KeyBERT_
+        Returns:
+            _str_: _cluster name_
+        """
+        extractor = KeyBERT(model=embedding_model)
+        # The whole cluster is handed to KeyBERT as one comma-joined document.
+        document = ','.join(map(str, sentences))
+        ranked = extractor.extract_keywords(
+            document,
+            keyphrase_ngram_range=(1, 3),
+            stop_words='english',
+            use_mmr=True,
+            diversity=0.7,
+        )
+        first_hit = ranked[0]
+        return first_hit[0]
+
+    def get_clusters(self):
+        """_Embed, cluster and name the stored sentences_
+
+        Returns:
+            _tuple_: _(dataframe with Cluster_name and Utterance columns, normalized embeddings)_
+        """
+        embed_response = self.cohere_client.embed(
+            texts=self.sentences, model='large', truncate='LEFT')
+        raw_vectors = embed_response.embeddings
+        # Scale every embedding row to unit length before clustering.
+        unit_vectors = raw_vectors / np.linalg.norm(
+            raw_vectors, axis=1, keepdims=True)
+
+        model = AgglomerativeClustering(
+            n_clusters=None, distance_threshold=self.parameter)
+        model.fit(unit_vectors)
+
+        # Bucket the sentences by label, keeping labels in first-seen order.
+        buckets = {}
+        for position, label in enumerate(model.labels_):
+            buckets.setdefault(label, []).append(self.sentences[position])
+
+        # Re-key the clusters starting from 1 and echo each one to stdout.
+        numbered = {}
+        for label, members in buckets.items():
+            number = label + 1
+            print(f'Cluster: {number}')
+            print(members)
+            print("")
+            numbered[number] = members
+
+        rows = []
+        for members in numbered.values():
+            # NOTE(review): the embed response object (not a model or model
+            # name) is what reaches KeyBERT here - confirm this is intended.
+            name = self.find_cluster_name(members, embed_response)
+            rows.extend([name, utterance] for utterance in members)
+
+        frame = pd.DataFrame(rows, columns=['Cluster_name', 'Utterance'])
+        return frame, unit_vectors
diff --git a/clustering/model.py b/clustering/model.py
@@ -0,0 +1,47 @@
+
+
+from mimetypes import init
+from concurrent.futures import ThreadPoolExecutor
+import getpass
+import logging
+
+import cohere
+import numpy as np
+from clustering.agglomerative import Agglomerative
+
+
+class Clustering:
+    """Select a clustering technique and run it with a Cohere client.
+
+    Args:
+        technique: Name of the clustering technique; only 'Agglomerative'
+            is handled by ``get_clusters``.
+        api_key: Cohere API key. When None (and ``mockAPI`` is False) the key
+            is requested interactively through ``getpass``.
+        mockAPI: When True, a ``MockCohereAPI`` is used instead of a real
+            ``cohere.Client``.
+    """
+
+    def __init__(self, technique: str = 'Agglomerative', api_key: str = None, mockAPI: bool = False):
+        self.technique = technique
+        if mockAPI:
+            self.co = MockCohereAPI()
+        else:
+            if api_key is None:
+                api_key = getpass.getpass('Enter your Cohere API Key')
+
+            self.co = cohere.Client(api_key)
+
+    def get_clusters(self, sentences, parameter):
+        """Cluster ``sentences`` with the configured technique.
+
+        Args:
+            sentences: Utterances to cluster.
+            parameter: Technique-specific setting, forwarded to the technique.
+
+        Returns:
+            The ``(cluster_assignments, embeddings)`` pair produced by
+            ``Agglomerative.get_clusters``.
+
+        Raises:
+            ValueError: If ``self.technique`` is not a supported technique.
+        """
+        if self.technique == 'Agglomerative':
+            agglomerative = Agglomerative(sentences, parameter, self.co)
+            cluster_assignments, embeddings = agglomerative.get_clusters()
+            return cluster_assignments, embeddings
+
+        # Falling through used to return None, which callers then failed to
+        # unpack; fail here with a message that names the bad technique.
+        raise ValueError(
+            f"Unsupported clustering technique: {self.technique!r}. "
+            "Supported techniques: 'Agglomerative'.")
+
+
+class MockCohereAPI:
+    """Stand-in for the Cohere client used in tests.
+
+    Only ``generate`` is provided by this mock.
+    """
+
+    def __init__(self):
+        """Nothing to set up; the mock keeps no state."""
+
+    def generate(self, **kwargs):
+        """Return the same two canned generations, ignoring ``kwargs``."""
+        canned = [
+            cohere.generation.Generation(
+                text=text, likelihood=likelihood, token_likelihoods=None)
+            for text, likelihood in (('two', -10), ('one', -5))
+        ]
+        return cohere.generation.Generations(
+            generations=canned, return_likelihoods=None)
diff --git a/streamlit_app.py b/streamlit_app.py
@@ -0,0 +1,91 @@
+import streamlit as st
+import numpy as np
+from collections import OrderedDict
+import pandas as pd
+from streamlit_option_menu import option_menu
+from clustering.model import Clustering
+import topically
+from topic_visualization import umap_reduce
+import plotly.express as px
+
+APP_NAME = "Cohere Topic Modelling co:topic 🚀"
+
+
+###### HELPER METHOD ######
+
+def display_clustering(selected2, sentences, parameter):
+    """Cluster ``sentences`` with the technique named by ``selected2``.
+
+    Returns the ``(df, embeddings)`` pair produced by ``Clustering.get_clusters``.
+    """
+    # NOTE(review): the literal string 'api_key' is what gets passed as the
+    # Cohere key - presumably a placeholder; confirm before deploying.
+    clusterer = Clustering(technique=selected2, api_key='api_key')
+    frame, vectors = clusterer.get_clusters(sentences, parameter)
+    return frame, vectors
+
+####### STREAMLIT COMPONENTS FOR CONTROLLING PARAMETERS ########
+
+######### CREATE SCATTER PLOT OF VISUALIZING TOPICS #########
+
+def create_scatter(table_df,embeddings):
+    """Draw the reduced embeddings as a Plotly scatter inside the Streamlit page.
+
+    The reduced 'x' coordinate is plotted against the topic label, points are
+    coloured by label, and the utterance text is shown on hover.
+    """
+    points = umap_reduce(table_df, embeddings)
+    plot_options = dict(
+        x='x',
+        y='label',
+        color='label',
+        labels={'color': 'label'},
+        hover_data=['text'],
+        title='Topic Visualization',
+    )
+    figure = px.scatter(points, **plot_options)
+    st.plotly_chart(figure)
+
+
+##########################
+# UI
+##########################
+if __name__ == '__main__':
+    # Page flow: upload a .txt file of utterances, pick a clustering technique
+    # and a topic-naming approach, then show the named topics as a table and a
+    # scatter plot once the form is submitted.
+    st.set_page_config(
+        page_title=APP_NAME,
+        page_icon=":bar_chart:",
+        layout="wide",
+        initial_sidebar_state="expanded",
+    )
+    # title and image
+    sentences = []
+    h1, h2 = st.columns([1, 10])
+    h1.image('./assets/profile-white-logo.png', width=90)
+    h2.title('Topic Modelling with Cohere')
+    with st.empty():
+        st.write('\n#\n')
+    with st.form('Utterances'):
+        uploaded_file = st.file_uploader(
+            "Add lines of text for Topic modelling(.txt file)",type=["txt"])
+        selected2 = option_menu(None, ["Agglomerative", "K-Means", "DBSCAN"],
+                                icons=['tree', 'collection', 'upc-scan'],
+                                menu_icon="cast", orientation="horizontal")
+        if selected2 == 'Agglomerative':
+            parameter = st.sidebar.slider(
+                "distance_threshold", min_value=1.4, max_value=2.0, value=1.4, step=0.05
+            )
+        elif selected2 == 'K-Means':
+            parameter = st.sidebar.slider(
+                "Value of K", min_value=2, max_value=10, value=2, step=1
+            )
+        else:
+            # No slider exists for the remaining technique; bind the name so
+            # the submit path below can never hit a NameError.
+            parameter = None
+        option = st.selectbox(
+            'Choose a Topic Modelling Approach', ('KeyBert + Prompt', 'BERTopic', 'Latent Dirichlet Allocation'))
+        submitted = st.form_submit_button('Submit')
+
+    if submitted:
+        print(selected2)
+        if uploaded_file:
+            for line in uploaded_file:
+                # Drop the line terminator (LF or CRLF) and skip blank lines
+                # so no empty text is sent for embedding.
+                text = line.decode("utf-8").rstrip('\r\n')
+                if text.strip():
+                    sentences.append(text)
+
+        if not sentences:
+            # Covers both "no file uploaded" and "file had no usable lines".
+            st.write('Please check the input file 💥')
+        elif selected2 != 'Agglomerative':
+            # Clustering.get_clusters only implements 'Agglomerative'; tell the
+            # user instead of crashing on the other menu entries.
+            st.write(f'{selected2} clustering is not supported yet. Please select Agglomerative.')
+        else:
+            print(sentences)
+            with st.spinner(text='Clustering in progress'):
+                df,embeddings = display_clustering(selected2, sentences, parameter)
+            print(embeddings)
+            app = topically.Topically(option,'api_key')
+            df['topic_name'], topic_names = app.name_topics((df['Utterance'],df['Cluster_name']))
+            table_df = df.drop('Cluster_name',axis=1)
+
+            st.table(table_df)
+            with st.container():
+                with st.empty():
+                    st.write('\n#\n#\n')
+                with st.expander('Topic Visualization'):
+                    create_scatter(table_df,embeddings)
diff --git a/test.txt b/test.txt
@@ -0,0 +1,44 @@
+Please help me with my card. It won't activate
+I tired but an unable to activate my card
+I want to start using my card., How do I verify my new card?
+I tried activating my plug-in and it didn't piece of work
+I want to open an account for my children
+How old do you need to be to use the banks services?
+Whats the minimum age to have an account
+Can my children open an account?, How old do I need to be?
+Cancel a transaction
+Am I able to cancel a transfer I just made
+I made a mistake this morning when I did a transfer. How do I reverse it?Can I cancel a transaction?
+What do I do if the ATM took my card?
+An ATM machine didn't give me back my card., My card got trapped inside an ATM, what should I do?
+What do I do if I can't get my card out of the ATM?
+Is there a top up fee for transfer?
+Will there be a charge for topping up by account with a SEPA transfer?
+What are the charges for receiving a SEPA transfer?
+Is there a charge for SEPA transfers?
+Will I be charged a fee for a SEPA transfer?
+Why do you have an identity check?
+I do not feel comfortable verifying my identity
+Why on earth do you need so much personal id info from me?
+DO you know the reason for the identity check?
+I answered so many questions about my identity. Why do you need this info?
+atm cash limits
+atm withdrawal limit
+automated teller machine cash withdrawal limit
+how much can i take from an atm
+how much can i take out from the atm
+how much can i take out from the atm at once
+how much can i take out from the atm per day
+how much cash can be withdrawn from an atm on a daily basis
+How much cash can I get from an ATM?
+how much money can i get from an atm
+how much money can i withdrawal from an atm
+how much money can i withdrawal from an atm in 24 hours
+is there a limit on how much money i can withdrawal from an atm
+Is there a maximum amount of money I can withdrawal from an ATM?
+limits to atm withdrawal
+tell me the atm cash limit
+what are the bank atm limits
+whats the limits of withdrawal from an atm
+whats the max amount of cash i can take out at a time
+withdrawal limits from atm
diff --git a/topic_visualization.py b/topic_visualization.py
@@ -0,0 +1,28 @@
+import umap.umap_ as umap
+import numpy as np
+import pandas as pd
+from sklearn.manifold import TSNE
+
+
+def umap_reduce(table_df,embeds):
+    """
+    this will reduce the embeddings to a single dimension for plotting
+
+    Despite the function name, the reduction is done with sklearn's TSNE
+    (one component, fixed random_state), not with UMAP.
+
+    param table_df: dataframe with 'topic_name' and 'Utterance' columns, one row per embedding
+    param embeds: embedding array of the sentences
+
+    returns: dataframe with columns 'x' (reduced coordinate), 'label' (topic name) and 'text' (utterance)
+    """
+    n_samples = len(embeds)
+    # TSNE requires perplexity < n_samples; its default of 30 is rejected for
+    # small uploads, so cap it. Inputs with more than 30 rows are unaffected.
+    perplexity = min(30.0, max(n_samples - 1, 1))
+    X_embedded = TSNE(
+        random_state=0, n_components=1, perplexity=perplexity).fit_transform(embeds)
+    df_embeddings = pd.DataFrame(X_embedded, columns=['x'])
+    df_embeddings = df_embeddings.assign(
+        label=table_df['topic_name'].values,
+        text=table_df['Utterance'].values,
+    )
+    return df_embeddings
+
+
+
+
+
diff --git a/topically/app.py b/topically/app.py
@@ -14,19 +14,21 @@
 import numpy as np
 
 from .cluster_namers import ClusterNamer
-from .prompts.prompts import generic_cluster_naming_prompt
+# from .prompts.prompts import generic_cluster_naming_prompt
+from .prompts.prompts_keybert import generic_cluster_naming_prompt_keybert
 
 
 class Topically(object):
 
-    def __init__(self, api_key: str = None, mockAPI: bool = False):
+    def __init__(self, source_model='KeyBert + Prompt',api_key: str = None, mockAPI: bool = False):
+        """Create the Cohere client (real or mock) and remember the source model.
+
+        Args:
+            source_model: Name of the topic-modelling approach; compared
+                against 'KeyBert + Prompt' in ``name_topics``.
+            api_key: Cohere API key. When None (and ``mockAPI`` is False) the
+                key is requested interactively through ``getpass``.
+            mockAPI: When True, a ``MockCohereAPI`` replaces the real client.
+        """
         if mockAPI:
             self.co = MockCohereAPI()
         else:
             if api_key is None:
                 api_key = getpass.getpass('Enter your Cohere API Key')
 
             self.co = cohere.Client(api_key)
+        # Set on both paths: name_topics reads self.source_model even when the
+        # mock client is in use.
+        self.source_model = source_model
 
     #TODO: Encapsulate this functionality into cluter_namers 
     def name_topics(self, X, prompt: str = '', num_generations=1, num_sample_texts=10):
@@ -55,14 +57,16 @@ def name_topics(self, X, prompt: str = '', num_generations=1, num_sample_texts=1
         if isinstance(texts, list):
             texts = np.array(texts)
 
-        if prompt == '':
-            prompt = generic_cluster_naming_prompt
+
+        if self.source_model == 'KeyBert + Prompt':
+            prompt = generic_cluster_naming_prompt_keybert
 
         # Instantiate ClusterNamer
-        cluster_namer = ClusterNamer(self.co, prompt, num_generations=num_generations)
+        cluster_namer = ClusterNamer(self.co, prompt, num_generations=num_generations,source_model=self.source_model)
 
         # Get the unique cluster assignments
         unique_cluster_assignments = np.unique(cluster_assignments)
+        print(unique_cluster_assignments)
 
         # Create a dictionary to store the cluster names for each cluster
         cluster_names = {}
@@ -79,7 +83,7 @@ def name_cluster(cluster_number):
             else:
                 sample_texts_from_cluster = cluster_texts
 
-            cluster_name = cluster_namer.predict(sample_texts_from_cluster)
+            cluster_name = cluster_namer.predict(sample_texts_from_cluster,cluster_number)
 
             logging.info(f'naming cluster {cluster_number}: {cluster_name}')
 
@@ -111,12 +115,11 @@ def name_cluster(self, cluster_texts, temperature=0.6, num_generations=1):
                The cluster name assigned to the cluster
 
         """
-
         # Create the prompt, starting with the global task description
         prompt = 'The following texts are from the same cluster. Please name the cluster.'
 
         # Add the data of the current cluster we want to label
-        prompt += self.construct_example_for_prompt(cluster_texts)
+        prompt += self.construct_example_for_prompt(cluster_texts,self.source_model)
 
         # Generate the cluster name
         cluster_name = self.generate(prompt, temperature=temperature, num_generations=num_generations)[0]