From 3b7066d3e66f8ca42c830d7a20d4e5a618723e3d Mon Sep 17 00:00:00 2001
From: Akash A Desai <62583018+akashAD98@users.noreply.github.com>
Date: Tue, 13 Feb 2024 20:02:01 +0530
Subject: [PATCH] Application/multilingual rag (#135)

* Create README.md
* updated main file
* Create .env-example
* Update README.md
* added docstring for each function
* Update doc string
* fixing lint format
* Update main.py
* linting

---------

Co-authored-by: PrashantDixit-dev
---
 README.md                                     |   3 +-
 applications/Multilingual_RAG/.env-example    |   1 +
 applications/Multilingual_RAG/README.md       |  45 ++++
 applications/Multilingual_RAG/main.py         | 218 ++++++++++++++++++
 .../Multilingual_RAG/requirements.txt         |   5 +
 applications/chat_with_anywebsite/app.py      |   2 -
 .../chat_with_anywebsite/main_app.ipynb       |   2 -
 applications/docchat-with-langroid/app.py     |   1 -
 applications/docchat-with-langroid/utils.py   |   1 -
 .../talk-with-podcast/langroid_utils.py       |   1 -
 examples/RAG_Fusion/main.ipynb                |   1 -
 .../movie-recommender/lancedb_cloud/main.py   |   1 -
 examples/movie-recommender/main.py            |   1 -
 examples/multi-lingual-wiki-qa/main.ipynb     |   2 -
 examples/multi-lingual-wiki-qa/main.py        |   2 -
 examples/parent_document_retriever/main.ipynb |   6 +-
 .../lancedb_cloud/main.ipynb                  |   1 -
 examples/product-recommender/main.ipynb       |   1 -
 examples/product-recommender/main.py          |   1 -
 .../main.ipynb                                |   1 -
 .../clip_text_image_search.ipynb              |   4 -
 21 files changed, 273 insertions(+), 27 deletions(-)
 create mode 100644 applications/Multilingual_RAG/.env-example
 create mode 100644 applications/Multilingual_RAG/README.md
 create mode 100644 applications/Multilingual_RAG/main.py
 create mode 100644 applications/Multilingual_RAG/requirements.txt

diff --git a/README.md b/README.md
index 3df35704..e7220da8 100644
--- a/README.md
+++ b/README.md
@@ -79,8 +79,7 @@ These are ready to use applications built using LanceDB serverless vector databa
 | [ Document Chat with Langroid ](https://github.com/lancedb/vectordb-recipes/tree/main/applications/docchat-with-langroid) | Talk with your Documents using Langroid | ![demo](./assets/document-chat-langroid.png)|
 | [ Fastapi RAG template ](https://github.com/lancedb/vectordb-recipes/tree/main/applications/Chatbot_RAG_with_FASTAPI) | FastAPI based RAG template with Websocket support | ![image](./assets/chatbot_fastapi.png)|
 | [ GTE MLX RAG ](https://github.com/lancedb/vectordb-recipes/tree/main/applications/GTE_mlx_RAG/CLI_example.ipynb) | mlx based RAG model using lancedb api support | ![image](./assets/apple_mlx.png)|
-
-
+| [Multilingual-RAG](https://github.com/lancedb/vectordb-recipes/tree/main/applications/Multilingual_RAG/) | Multilingual RAG with Cohere embeddings & support for 100+ languages |![image](https://github.com/akashAD98/vectordb-recipes/assets/62583018/be65eb39-25c4-4441-98fc-6ded09689819)|
 
 ## Tutorials
 Looking to get started with LLMs, vectorDBs, and the world of Generative AI? These in-depth tutorials and courses cover these concepts with practical follow along colabs where possible.
diff --git a/applications/Multilingual_RAG/.env-example b/applications/Multilingual_RAG/.env-example
new file mode 100644
index 00000000..39553fb5
--- /dev/null
+++ b/applications/Multilingual_RAG/.env-example
@@ -0,0 +1 @@
+COHERE_API_KEY = pastyourapikeyhere
diff --git a/applications/Multilingual_RAG/README.md b/applications/Multilingual_RAG/README.md
new file mode 100644
index 00000000..7f773405
--- /dev/null
+++ b/applications/Multilingual_RAG/README.md
@@ -0,0 +1,45 @@
+# Multilingual-RAG
+
+![Multilingual-RAG](https://github.com/akashAD98/Multilingual-RAG/assets/62583018/a84e1839-a311-496c-b545-3533ef348dea.png)
+
+## Overview
+Multilingual-RAG is a question-answering system that understands questions and generates answers in multiple languages. It is built on Large Language Models (LLMs) with Retrieval-Augmented Generation (RAG). The application combines Cohere's multilingual embeddings, the LanceDB vector store, LangChain for question answering, and Argos Translate for translation between languages. The user interface is built with Gradio for a smooth, interactive experience.
+
+## Supported Languages
+Multilingual-RAG is designed to support over 100 languages. The exact list depends on the capabilities of the Cohere multilingual model and Argos Translate. By default it supports English, Hindi, French, and Turkish; additional languages can be added to suit your use case (see "Adding a Language" below).
+
+## Getting Started
+Follow these instructions to set up Multilingual-RAG in your local environment.
+
+### Prerequisites
+Ensure you have the following prerequisites installed:
+- Python 3.x
+
+Create a `.env` file with your Cohere API key: rename `.env-example` to `.env` and paste your key after `COHERE_API_KEY =`.
+
+## Installation
+Install the required dependencies with:
+
+```
+pip install -r requirements.txt
+```
+
+Argos Translate can be installed from source as follows:
+
+```
+git clone https://github.com/argosopentech/argos-translate.git
+cd argos-translate
+virtualenv env
+source env/bin/activate
+pip install -e .
+```
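+
+## Adding a Language
+Languages are wired up through the `LANGUAGE_ISO_CODES` dictionary in `main.py`, which maps display names to the ISO codes Argos Translate expects. A minimal sketch is below; the Spanish entry is an illustrative addition and assumes both the Cohere multilingual model and Argos Translate cover that language:
+
+```python
+# main.py: display names mapped to the ISO codes Argos Translate expects
+LANGUAGE_ISO_CODES = {
+    "English": "en",
+    "Hindi": "hi",
+    "Turkish": "tr",
+    "French": "fr",
+    "Spanish": "es",  # example addition; needs an en<->es Argos package
+}
+```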
+
+## Running the App
+To run the Multilingual-RAG app, use the command below.
+Currently only .txt and .pdf input files are supported; set the input file path (`INPUT_FILE_PATH`) inside `main.py`.
+
+```
+python3 main.py
+```
diff --git a/applications/Multilingual_RAG/main.py b/applications/Multilingual_RAG/main.py
new file mode 100644
index 00000000..269d33b6
--- /dev/null
+++ b/applications/Multilingual_RAG/main.py
@@ -0,0 +1,218 @@
+import os
+import dotenv
+import gradio as gr
+import lancedb
+import logging
+from langchain.embeddings.cohere import CohereEmbeddings
+from langchain.llms import Cohere
+from langchain.prompts import PromptTemplate
+from langchain.chains import RetrievalQA
+from langchain.vectorstores import LanceDB
+from langchain.document_loaders import TextLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyPDFLoader
+import argostranslate.package
+import argostranslate.translate
+
+
+# Configuration Management
+dotenv.load_dotenv(".env")
+DB_PATH = "/tmp/lancedb"
+
+COHERE_MODEL_NAME = "multilingual-22-12"
+LANGUAGE_ISO_CODES = {
+    "English": "en",
+    "Hindi": "hi",
+    "Turkish": "tr",
+    "French": "fr",
+}
+
+# Logging Configuration
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def initialize_documents_and_embeddings(input_file_path):
+    """
+    Initialize documents and their embeddings from a given file.
+
+    Parameters:
+    - input_file_path (str): The path to the input file. Supported formats are .txt and .pdf.
+
+    Returns:
+    - tuple: A tuple containing a list of texts split from the document and the embeddings object.
+    """
+    file_extension = os.path.splitext(input_file_path)[1]
+    if file_extension == ".txt":
+        logger.info("txt file processing")
+        # Handle text file
+        loader = TextLoader(input_file_path)
+        documents = loader.load()
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
+        texts = text_splitter.split_documents(documents)
+    elif file_extension == ".pdf":
+        logger.info("pdf file processing")
+        # Handle PDF file
+        loader = PyPDFLoader(input_file_path)
+        texts = loader.load_and_split()
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
+        texts = text_splitter.split_documents(texts)
+    else:
+        raise ValueError(
+            "Unsupported file type. Supported files are .txt and .pdf only."
+        )
+
+    embeddings = CohereEmbeddings(model=COHERE_MODEL_NAME)
+    return texts, embeddings
+
+
+# Database Initialization
+def initialize_database(texts, embeddings):
+    """
+    Initialize and populate a LanceDB database with documents and their embeddings.
+
+    Parameters:
+    - texts (list): A list of texts to be stored in the database.
+    - embeddings (CohereEmbeddings): An embeddings object used to generate vector embeddings for the texts.
+
+    Returns:
+    - LanceDB: An instance of LanceDB with the documents and their embeddings stored.
+    """
+    db = lancedb.connect(DB_PATH)
+    table = db.create_table(
+        "multiling-rag",
+        data=[
+            {
+                "vector": embeddings.embed_query("Hello World"),
+                "text": "Hello World",
+                "id": "1",
+            }
+        ],
+        mode="overwrite",
+    )
+    return LanceDB.from_documents(texts, embeddings, connection=table)
+
+
+# Translation Function
+def translate_text(text, from_code, to_code):
+    """
+    Translate a given text from one language to another.
+
+    Parameters:
+    - text (str): The text to translate.
+    - from_code (str): The ISO language code of the source language.
+    - to_code (str): The ISO language code of the target language.
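+
+    Note: Each call refreshes the Argos package index and installs the
+    required translation package if it is missing, which can add noticeable
+    latency on a cold start.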
+
+    Returns:
+    - str: The translated text.
+    """
+    try:
+        argostranslate.package.update_package_index()
+        available_packages = argostranslate.package.get_available_packages()
+        package_to_install = next(
+            filter(
+                lambda x: x.from_code == from_code and x.to_code == to_code,
+                available_packages,
+            )
+        )
+        argostranslate.package.install_from_path(package_to_install.download())
+        return argostranslate.translate.translate(text, from_code, to_code)
+    except Exception as e:
+        logger.error(f"Error in translate_text: {str(e)}")
+        return "Translation error"
+
+
+prompt_template = """Text: {context}
+
+Question: {question}
+
+Answer the question based on the text provided. If the text doesn't contain the answer, reply that the answer is not available."""
+PROMPT = PromptTemplate(
+    template=prompt_template, input_variables=["context", "question"]
+)
+
+
+# Question Answering Function
+def answer_question(question, input_language, output_language, db):
+    """
+    Answer a given question by retrieving relevant information from a database,
+    translating the question and answer if necessary.
+
+    Parameters:
+    - question (str): The question to answer.
+    - input_language (str): The language of the input question.
+    - output_language (str): The desired language of the answer.
+    - db (LanceDB): The LanceDB instance to use for information retrieval.
+
+    Returns:
+    - str: The answer to the question, in the desired output language.
+    """
+    try:
+        input_lang_code = LANGUAGE_ISO_CODES[input_language]
+        output_lang_code = LANGUAGE_ISO_CODES[output_language]
+
+        question_in_english = (
+            translate_text(question, from_code=input_lang_code, to_code="en")
+            if input_language != "English"
+            else question
+        )
+        qa = RetrievalQA.from_chain_type(
+            llm=Cohere(model="command", temperature=0),
+            chain_type="stuff",
+            retriever=db.as_retriever(),
+            chain_type_kwargs={"prompt": PROMPT},  # reuse the module-level prompt
+            return_source_documents=True,
+        )
+
+        answer = qa({"query": question_in_english})
+        result_in_english = answer["result"].replace("\n", "").replace("Answer:", "")
+
+        return (
+            translate_text(result_in_english, from_code="en", to_code=output_lang_code)
+            if output_language != "English"
+            else result_in_english
+        )
+    except Exception as e:
+        logger.error(f"Error in answer_question: {str(e)}")
+        return "An error occurred while processing your question. Please try again."
+
+
+def setup_gradio_interface(db):
+    """
+    Set up a Gradio interface for interacting with the multilingual chatbot.
+
+    Parameters:
+    - db (LanceDB): The database instance to use for information retrieval.
+
+    Returns:
+    - gr.Interface: A Gradio interface object for the chatbot.
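+
+    Note: `answer_question` is wrapped in a lambda so the callback can close
+    over the `db` instance; Gradio passes only the declared input values.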
+ """ + + return gr.Interface( + fn=lambda question, input_language, output_language: answer_question( + question, input_language, output_language, db + ), + inputs=[ + gr.Textbox(lines=2, placeholder="Type your question here..."), + gr.Dropdown(list(LANGUAGE_ISO_CODES.keys()), label="Input Language"), + gr.Dropdown(list(LANGUAGE_ISO_CODES.keys()), label="Output Language"), + ], + outputs="text", + title="Multilingual Chatbot", + description="Ask any question in your chosen language and get an answer in the language of your choice.", + ) + + +# Main Function +def main(): + INPUT_FILE_PATH = "healthy-diet-fact-sheet-394.pdf" + texts, embeddings = initialize_documents_and_embeddings(INPUT_FILE_PATH) + db = initialize_database(texts, embeddings) + iface = setup_gradio_interface(db) + iface.launch(share=True, debug=True) + + +if __name__ == "__main__": + main() diff --git a/applications/Multilingual_RAG/requirements.txt b/applications/Multilingual_RAG/requirements.txt new file mode 100644 index 00000000..f8e08ae1 --- /dev/null +++ b/applications/Multilingual_RAG/requirements.txt @@ -0,0 +1,5 @@ +cohere +langchain +lancedb +python-dotenv +gradio diff --git a/applications/chat_with_anywebsite/app.py b/applications/chat_with_anywebsite/app.py index 8b8d076f..7c5e7460 100644 --- a/applications/chat_with_anywebsite/app.py +++ b/applications/chat_with_anywebsite/app.py @@ -13,7 +13,6 @@ class ChatbotHelper: - def __init__(self): self.chatbot_instance = None self.chat_history = [] @@ -135,7 +134,6 @@ def respond(self, message): return bot_message def run_interface(self): - iface = gr.Interface( fn=self.respond, title="Chatbot with URL or any website ", diff --git a/applications/chat_with_anywebsite/main_app.ipynb b/applications/chat_with_anywebsite/main_app.ipynb index 54226f94..699034aa 100644 --- a/applications/chat_with_anywebsite/main_app.ipynb +++ b/applications/chat_with_anywebsite/main_app.ipynb @@ -79,7 +79,6 @@ "\n", "\n", "class ChatbotHelper:\n", - "\n", " def __init__(self):\n", " self.chatbot_instance = None\n", " self.chat_history = []\n", @@ -207,7 +206,6 @@ " return bot_message\n", "\n", " def run_interface(self):\n", - "\n", " iface = gr.Interface(\n", " fn=self.respond,\n", " title=\"Chatbot with URL or any website \",\n", diff --git a/applications/docchat-with-langroid/app.py b/applications/docchat-with-langroid/app.py index d4f4a318..f92949de 100644 --- a/applications/docchat-with-langroid/app.py +++ b/applications/docchat-with-langroid/app.py @@ -11,7 +11,6 @@ uploadedFile = st.file_uploader("Choose a txt file") if uploadedFile is not None: - with open(os.path.join("tempDir", uploadedFile.name), "wb") as f: f.write(uploadedFile.getbuffer()) diff --git a/applications/docchat-with-langroid/utils.py b/applications/docchat-with-langroid/utils.py index ea50cdcc..3e7871c0 100644 --- a/applications/docchat-with-langroid/utils.py +++ b/applications/docchat-with-langroid/utils.py @@ -41,7 +41,6 @@ def configure(filename): def agent(cfg, prompt): - # Creating DocChatAgent rag_agent = DocChatAgent(cfg) diff --git a/applications/talk-with-podcast/langroid_utils.py b/applications/talk-with-podcast/langroid_utils.py index 0112811e..28b8732d 100644 --- a/applications/talk-with-podcast/langroid_utils.py +++ b/applications/talk-with-podcast/langroid_utils.py @@ -41,7 +41,6 @@ def configure(filename): def agent(cfg, prompt): - # Creating DocChatAgent rag_agent = DocChatAgent(cfg) diff --git a/examples/RAG_Fusion/main.ipynb b/examples/RAG_Fusion/main.ipynb index 03a60e25..fd9e80d0 100644 --- 
a/examples/RAG_Fusion/main.ipynb +++ b/examples/RAG_Fusion/main.ipynb @@ -556,7 +556,6 @@ "outputs": [], "source": [ "def generate_queries_chatgpt(original_query):\n", - "\n", " response = openai.chat.completions.create(\n", " model=\"gpt-3.5-turbo\",\n", " messages=[\n", diff --git a/examples/movie-recommender/lancedb_cloud/main.py b/examples/movie-recommender/lancedb_cloud/main.py index ed52d475..f57fe691 100644 --- a/examples/movie-recommender/lancedb_cloud/main.py +++ b/examples/movie-recommender/lancedb_cloud/main.py @@ -36,7 +36,6 @@ def get_recommendations(title): if __name__ == "__main__": - # Load and prepare data ratings = pd.read_csv( "./ml-latest-small/ratings.csv", diff --git a/examples/movie-recommender/main.py b/examples/movie-recommender/main.py index 7458eec3..c7775840 100644 --- a/examples/movie-recommender/main.py +++ b/examples/movie-recommender/main.py @@ -36,7 +36,6 @@ def get_recommendations(title): if __name__ == "__main__": - # Load and prepare data ratings = pd.read_csv( "./ml-latest-small/ratings.csv", diff --git a/examples/multi-lingual-wiki-qa/main.ipynb b/examples/multi-lingual-wiki-qa/main.ipynb index a6fd00de..20a96123 100644 --- a/examples/multi-lingual-wiki-qa/main.ipynb +++ b/examples/multi-lingual-wiki-qa/main.ipynb @@ -438,9 +438,7 @@ "data = []\n", "\n", "for i in tqdm(range(0, num_records, batch_size)):\n", - "\n", " for lang, dataset in datasets.items():\n", - "\n", " batch = [next(dataset) for _ in range(batch_size)]\n", "\n", " texts = [x[\"text\"] for x in batch]\n", diff --git a/examples/multi-lingual-wiki-qa/main.py b/examples/multi-lingual-wiki-qa/main.py index 3db185bf..393c8e2b 100644 --- a/examples/multi-lingual-wiki-qa/main.py +++ b/examples/multi-lingual-wiki-qa/main.py @@ -48,9 +48,7 @@ class Schema(LanceModel): data = [] for i in tqdm(range(0, num_records, batch_size)): - for lang, dataset in datasets.items(): - batch = [next(dataset) for _ in range(batch_size)] texts = [x["text"] for x in batch] diff --git a/examples/parent_document_retriever/main.ipynb b/examples/parent_document_retriever/main.ipynb index eead31ea..9b7e51a3 100644 --- a/examples/parent_document_retriever/main.ipynb +++ b/examples/parent_document_retriever/main.ipynb @@ -109,9 +109,9 @@ "metadata": {}, "outputs": [], "source": [ - "os.environ[\"OPENAI_API_KEY\"] = (\n", - " \"YOUR_API_KEY_HERE\" # NEEDED if you run LLM Experiment below\n", - ")" + "os.environ[\n", + " \"OPENAI_API_KEY\"\n", + "] = \"YOUR_API_KEY_HERE\" # NEEDED if you run LLM Experiment below" ] }, { diff --git a/examples/product-recommender/lancedb_cloud/main.ipynb b/examples/product-recommender/lancedb_cloud/main.ipynb index caa91495..f3c6f44c 100644 --- a/examples/product-recommender/lancedb_cloud/main.ipynb +++ b/examples/product-recommender/lancedb_cloud/main.ipynb @@ -1491,7 +1491,6 @@ "outputs": [], "source": [ "def products_bought_by_user_in_the_past(user_id: int, top: int = 10):\n", - "\n", " selected = data[data.user_id == user_id].sort_values(\n", " by=[\"total_orders\"], ascending=False\n", " )\n", diff --git a/examples/product-recommender/main.ipynb b/examples/product-recommender/main.ipynb index 2a802a79..66c5c688 100644 --- a/examples/product-recommender/main.ipynb +++ b/examples/product-recommender/main.ipynb @@ -1107,7 +1107,6 @@ "outputs": [], "source": [ "def products_bought_by_user_in_the_past(user_id: int, top: int = 10):\n", - "\n", " selected = data[data.user_id == user_id].sort_values(\n", " by=[\"total_orders\"], ascending=False\n", " )\n", diff --git 
a/examples/product-recommender/main.py b/examples/product-recommender/main.py index ffcb218c..776c04f0 100644 --- a/examples/product-recommender/main.py +++ b/examples/product-recommender/main.py @@ -12,7 +12,6 @@ def products_bought_by_user_in_the_past(user_id: int, top: int = 10): - selected = data[data.user_id == user_id].sort_values( by=["total_orders"], ascending=False ) diff --git a/examples/search-within-images-with-sam-and-clip/main.ipynb b/examples/search-within-images-with-sam-and-clip/main.ipynb index 5e2dfe89..f45ef591 100644 --- a/examples/search-within-images-with-sam-and-clip/main.ipynb +++ b/examples/search-within-images-with-sam-and-clip/main.ipynb @@ -533,7 +533,6 @@ "source": [ "# find the image using natural language query\n", "def search_image_with_user_query(vector_table, img_id, user_query):\n", - "\n", " text = tokenizer(user_query)\n", " k_embedding = model.encode_text(text).tolist() # Use tolist() instead of to_list()\n", " # Flatten k_embedding to a List[float]\n", diff --git a/tutorials/Accelerate-Vector-Search-Applications-Using-OpenVINO/clip_text_image_search.ipynb b/tutorials/Accelerate-Vector-Search-Applications-Using-OpenVINO/clip_text_image_search.ipynb index 82ec6b1d..87f62b2d 100644 --- a/tutorials/Accelerate-Vector-Search-Applications-Using-OpenVINO/clip_text_image_search.ipynb +++ b/tutorials/Accelerate-Vector-Search-Applications-Using-OpenVINO/clip_text_image_search.ipynb @@ -537,7 +537,6 @@ "\n", "\n", "def get_image(image_URL):\n", - "\n", " response = requests.get(image_URL)\n", " image = Image.open(BytesIO(response.content)).convert(\"RGB\")\n", "\n", @@ -545,7 +544,6 @@ "\n", "\n", "def get_image_caption(image_ID):\n", - "\n", " return image_data[image_ID][\"caption\"]" ] }, @@ -1119,14 +1117,12 @@ "\n", "\n", "def plot_images(images):\n", - "\n", " for image in images:\n", " plt.imshow(image)\n", " plt.show()\n", "\n", "\n", "def plot_images_by_side(top_images):\n", - "\n", " index_values = list(top_images.index.values)\n", " list_images = [top_images.iloc[idx].image for idx in index_values]\n", " list_captions = [top_images.iloc[idx].caption for idx in index_values]\n",