diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml index 31e36225..3b0b139a 100644 --- a/.github/workflows/black.yml +++ b/.github/workflows/black.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - uses: psf/black@stable + - uses: psf/black with: options: "--check --verbose" src: "../../vectordb-recipes" diff --git a/README.md b/README.md index acf06192..5374d578 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ Develop a Retrieval-Augmented Generation (RAG) application using LanceDB for eff | [Corrective RAG with Langgraph](./tutorials/Corrective-RAG-with_Langgraph/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/Corrective-RAG-with_Langgraph/CRAG_with_Langgraph.ipynb) [![LLM](https://img.shields.io/badge/openai-api-white)](#) [![intermediate](https://img.shields.io/badge/intermediate-FFDA33)](#)| [![Ghost](https://img.shields.io/badge/ghost-000?style=for-the-badge&logo=ghost&logoColor=%23F7DF1E)](https://blog.lancedb.com/implementing-corrective-rag-in-the-easiest-way-2/)| | [Contextual-Compression-with-RAG](/examples/Contextual-Compression-with-RAG/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/Contextual-Compression-with-RAG/main.ipynb) [![local LLM](https://img.shields.io/badge/local-llm-green)](#) [![intermediate](https://img.shields.io/badge/intermediate-FFDA33)](#)|[![Ghost](https://img.shields.io/badge/ghost-000?style=for-the-badge&logo=ghost&logoColor=%23F7DF1E)](https://blog.lancedb.com/enhance-rag-integrate-contextual-compression-and-filtering-for-precision-a29d4a810301/) | | [Improve RAG with FLARE](./examples/better-rag-FLAIR) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/examples/better-rag-FLAIR/main.ipynb) [![local LLM](https://img.shields.io/badge/local-llm-green)](#) [![LLM](https://img.shields.io/badge/openai-api-white)](#) [![advanced](https://img.shields.io/badge/advanced-FF3333)](#)|[![Ghost](https://img.shields.io/badge/ghost-000?style=for-the-badge&logo=ghost&logoColor=%23F7DF1E)](https://blog.lancedb.com/better-rag-with-active-retrieval-augmented-generation-flare-3b66646e2a9f/) | -| [Query Expansion and Reranker ](/examples/QueryExpansion&Reranker/) | Open In Colab [![LLM](https://img.shields.io/badge/openai-api-white)](#) [![advanced](https://img.shields.io/badge/advanced-FF3333)](#)|[![Ghost](https://img.shields.io/badge/ghost-000?style=for-the-badge&logo=ghost&logoColor=%23F7DF1E)](https://blog.lancedb.com/improving-rag-with-query-expansion-reranking-models/)| +| [Query Expansion and Reranker ](/examples/QueryExpansion&Reranker/) | Open In Colab [![LLM](https://img.shields.io/badge/openai-api-white)](#) [![advanced](https://img.shields.io/badge/advanced-FF3333)](#)|| | [RAG Fusion](/examples/RAG_Fusion/) | Open In Colab [![LLM](https://img.shields.io/badge/openai-api-white)](#) [![advanced](https://img.shields.io/badge/advanced-FF3333)](#)| | [Agentic RAG ](/tutorials/Agentic_RAG/) | Open In Colab [![LLM](https://img.shields.io/badge/openai-api-white)](#) [![advanced](https://img.shields.io/badge/advanced-FF3333)](#)| |||| @@ -139,7 +139,7 @@ Create a recommender system application with LanceDB for efficient vector-based |||| | [Movie Recommender](/examples/movie-recommender/) | Open In Colab [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](./examples/movie-recommender/main.py) [![beginner](https://img.shields.io/badge/beginner-B5FF33)](#)| | | [Movie Recommender with Genre](./examples/movie-recommendation-with-genres/) | 
Open In Colab [![beginner](https://img.shields.io/badge/beginner-B5FF33)](#)| [![Ghost](https://img.shields.io/badge/ghost-000?style=for-the-badge&logo=ghost&logoColor=%23F7DF1E)](https://blog.lancedb.com/movie-recommendation-system-using-lancedb-and-doc2vec/)| -| [Product Recommender](./examples/product-recommender/) | Open In Colab [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](./examples/product-recommender/main.py)[![intermediate](https://img.shields.io/badge/intermediate-FFDA33)](#)| | +| [Product Recommender](./examples/product-recommender/) | Open In Colab [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](./examples/product-recommender/main.py) [![intermediate](https://img.shields.io/badge/intermediate-FFDA33)](#)| | | [Arxiv paper recommender](/examples/arxiv-recommender) | Open In Colab [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](./examples/arxiv-recommender/main.py) [![LLM](https://img.shields.io/badge/local-llm-green)](#) [![beginner](https://img.shields.io/badge/beginner-B5FF33)](#)| | |||| @@ -152,6 +152,7 @@ Checkout concepts of LLM applications pipeline to ensures accurate information r | | | | | [A Primer on Text Chunking and its Types](./tutorials/different-types-text-chunking-in-RAG) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/different-types-text-chunking-in-RAG/Text_Chunking_on_RAG_application_with_LanceDB.ipynb) [![beginner](https://img.shields.io/badge/beginner-B5FF33)](#)| [![Ghost](https://img.shields.io/badge/ghost-000?style=for-the-badge&logo=ghost&logoColor=%23F7DF1E)](https://blog.lancedb.com/a-primer-on-text-chunking-and-its-types-a420efc96a13) | | [Langchain LlamaIndex Chunking](./tutorials/Langchain-LlamaIndex-Chunking) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lancedb/vectordb-recipes/blob/main/tutorials/Langchain-LlamaIndex-Chunking/Langchain_Llamaindex_chunking.ipynb) [![beginner](https://img.shields.io/badge/beginner-B5FF33)](#)| [![Ghost](https://img.shields.io/badge/ghost-000?style=for-the-badge&logo=ghost&logoColor=%23F7DF1E)](https://blog.lancedb.com/chunking-techniques-with-langchain-and-llamaindex/) | +| [Create structured dataset using Instructor](./tutorials/NER-dataset-with-Instructor/) | [![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)](./tutorials/NER-dataset-with-Instructor/main.py) [![beginner](https://img.shields.io/badge/beginner-B5FF33)](#)| | | [Comparing Cohere Rerankers with LanceDB](./tutorials/cohere-reranker) | [![beginner](https://img.shields.io/badge/beginner-B5FF33)](#)| [![Ghost](https://img.shields.io/badge/ghost-000?style=for-the-badge&logo=ghost&logoColor=%23F7DF1E)](https://blog.lancedb.com/benchmarking-cohere-reranker-with-lancedb/) | | [Product Quantization: Compress High Dimensional Vectors](https://blog.lancedb.com/benchmarking-lancedb-92b01032874a-2/) |[![intermediate](https://img.shields.io/badge/intermediate-FFDA33)](#) | [![Ghost](https://img.shields.io/badge/ghost-000?style=for-the-badge&logo=ghost&logoColor=%23F7DF1E)](https://blog.lancedb.com/benchmarking-lancedb-92b01032874a-2/) | | [LLMs, RAG, & the missing storage layer for AI](https://blog.lancedb.com/llms-rag-the-missing-storage-layer-for-ai-28ded35fa984) | [![intermediate](https://img.shields.io/badge/intermediate-FFDA33)](#)| [![Ghost](https://img.shields.io/badge/ghost-000?style=for-the-badge&logo=ghost&logoColor=%23F7DF1E)](https://blog.lancedb.com/llms-rag-the-missing-storage-layer-for-ai-28ded35fa984/) | diff --git a/assets/critique-based-contexting.png b/assets/critique-based-contexting.png index 5dc3326a..8903c89f 100644 Binary files 
a/assets/critique-based-contexting.png and b/assets/critique-based-contexting.png differ diff --git a/assets/instructor.png b/assets/instructor.png new file mode 100644 index 00000000..a61b9c3b Binary files /dev/null and b/assets/instructor.png differ diff --git a/examples/parent_document_retriever/main.ipynb b/examples/parent_document_retriever/main.ipynb index 9b7e51a3..f0bdd9bd 100644 --- a/examples/parent_document_retriever/main.ipynb +++ b/examples/parent_document_retriever/main.ipynb @@ -309,176 +309,7 @@ "id": "F4xuShlW1yY_", "outputId": "cb4bfd0e-553c-4764-c0f4-e5fcd0583c36" }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e703d70a7cb44ad8a47330773d1f732a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - ".gitattributes: 0%| | 0.00/1.52k [00:00" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Debug output: {'agent': {'messages': [AIMessage(content='The PM Gati Shakti National Master Plan (NMP) is an ambitious initiative launched by the Government of India aimed at improving infrastructure development across the country. Launched by Prime Minister Narendra Modi, the plan seeks to integrate the planning and coordination of various infrastructure projects across different sectors and ministries.\\n\\nThe core objective of the Gati Shakti NMP is to enhance multi-modal connectivity and reduce logistics costs by bringing together rail, road, air, and waterways projects under a single, unified framework. This holistic approach is intended to boost economic growth, create jobs, and promote regional connectivity.\\n\\nThe plan utilizes digital technology to map and synchronize projects, ensuring that all related departments and stakeholders are aligned, which helps in eliminating bottlenecks, improving project execution speed, and enhancing overall efficiency. 
The Gati Shakti NMP is seen as a transformative step towards making India a global manufacturing hub and improving the ease of doing business.', response_metadata={'finish_reason': 'stop'}, id='run-8bc90fb4-007b-4384-9320-35c3621eb9b8-0')]}}\n", - "Extracted content: The PM Gati Shakti National Master Plan (NMP) is an ambitious initiative launched by the Government of India aimed at improving infrastructure development across the country. Launched by Prime Minister Narendra Modi, the plan seeks to integrate the planning and coordination of various infrastructure projects across different sectors and ministries.\n", - "\n", - "The core objective of the Gati Shakti NMP is to enhance multi-modal connectivity and reduce logistics costs by bringing together rail, road, air, and waterways projects under a single, unified framework. This holistic approach is intended to boost economic growth, create jobs, and promote regional connectivity.\n", - "\n", - "The plan utilizes digital technology to map and synchronize projects, ensuring that all related departments and stakeholders are aligned, which helps in eliminating bottlenecks, improving project execution speed, and enhancing overall efficiency. 
The Gati Shakti NMP is seen as a transformative step towards making India a global manufacturing hub and improving the ease of doing business.\n", - "Debug output: {'agent': {'messages': [AIMessage(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_AE3GmC26FUSF63Nb4NhJCSMb', 'function': {'arguments': '{\"query\": \"steps for export import procedure\"}', 'name': 'retrieve_blog_posts'}, 'type': 'function'}, {'index': 1, 'id': 'call_B56lmGd2JIHRDCdxf9Um9Ml7', 'function': {'arguments': '{\"query\": \"customs import export procedure\"}', 'name': 'retrieve_blog_posts'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-2c42f15d-390b-4509-8b23-fbc6d1fb203d-0', tool_calls=[{'name': 'retrieve_blog_posts', 'args': {'query': 'steps for export import procedure'}, 'id': 'call_AE3GmC26FUSF63Nb4NhJCSMb'}, {'name': 'retrieve_blog_posts', 'args': {'query': 'customs import export procedure'}, 'id': 'call_B56lmGd2JIHRDCdxf9Um9Ml7'}])]}}\n", - "Extracted content: \n", - "Debug output: {'retrieve': {'messages': [ToolMessage(content='<Microsoft Word - CUSTOMS IMPORT EXPORT PROCEDURES _final_Admin\\n\\nendobj\\r\\n118 0 obj\\r\\n<>/F 4/A<>/StructParent 32>>\\r\\nendobj\\r\\n119 0 obj\\n\\nEP\\x12t1Ԗ�m�l1�PI�����ٲW$��`�B[C��\\x1e�2�R�ν7ȗ���C�3�a���\\x1fZ��U\\x7f��\\x10�5y<:', name='retrieve_blog_posts', id='8714e8e7-ed08-4452-8d1f-621f4f25af81', tool_call_id='call_AE3GmC26FUSF63Nb4NhJCSMb'), ToolMessage(content='<Microsoft Word - CUSTOMS IMPORT EXPORT PROCEDURES _final_Admin\\n\\nCdTa�x\\t&��A\\x0eb�1����\\x0errMM�q�\\x7f1Q2cҩ0de�˂���>�\\x1b\\x12�\\x19?�\\x0fa�L���Q\\n\\nt\\'�P��h\\x13��u\\u05eex����|^���(I�$�%\\x1f�Q%�.\\x19U�B��jY��\\x16�?\\x19\\x19�\\x0e�$��W�BԸ���\\x1ck��\\x19��l!K�\\x05�!z�\\x1dq\\x12�\\x1d�v��]�P\"\\x11ROC��\\x14', name='retrieve_blog_posts', id='8a56ed96-cd2a-4846-a60d-258e6430aa79', tool_call_id='call_B56lmGd2JIHRDCdxf9Um9Ml7')]}}\n", - "Debug output: {'generate': {'messages': [\"I don't know.\"]}}\n", - "Debug 
output: {'agent': {'messages': [AIMessage(content='The term \"RCMC\" stands for Registration Cum Membership Certificate. It is a certificate that is provided by the Export Promotion Councils (EPCs) or commodity boards in India. An RCMC is issued to exporters dealing in products registered with these agencies. Holding an RCMC is mandatory for exporters to avail benefits under the Foreign Trade Policy like duty drawback, concessions, and other support.\\n\\nHere are some key points about RCMC:\\n\\n1. **Purpose**: The RCMC is used to certify that an exporter is registered with the respective EPC and is eligible for various benefits under the export-import policy.\\n\\n2. **Validity**: Typically, an RCMC is valid for five years.\\n\\n3. **Application**: Exporters must apply for an RCMC with the relevant EPC that pertains to their main line of business. If the exporter wishes to export items that are not covered by any EPC, they can obtain an RCMC from the Federation of Indian Export Organisations (FIEO).\\n\\n4. **Benefits**: With an RCMC, exporters can participate in international trade fairs, get sponsorship for trade delegations, and access market development assistance among other benefits.\\n\\n5. **Renewal and Cancellation**: The certificate needs to be renewed upon expiry. It can also be cancelled or suspended if the holder fails to abide by the regulatory requirements.\\n\\nIf you need detailed information or specific guidance related to obtaining an RCMC, please let me know!', response_metadata={'finish_reason': 'stop'}, id='run-4fa5e544-510a-4140-984c-89dedd855e71-0')]}}\n", - "Extracted content: The term \"RCMC\" stands for Registration Cum Membership Certificate. It is a certificate that is provided by the Export Promotion Councils (EPCs) or commodity boards in India. An RCMC is issued to exporters dealing in products registered with these agencies. 
Holding an RCMC is mandatory for exporters to avail benefits under the Foreign Trade Policy like duty drawback, concessions, and other support.\n", - "\n", - "Here are some key points about RCMC:\n", - "\n", - "1. **Purpose**: The RCMC is used to certify that an exporter is registered with the respective EPC and is eligible for various benefits under the export-import policy.\n", - "\n", - "2. **Validity**: Typically, an RCMC is valid for five years.\n", - "\n", - "3. **Application**: Exporters must apply for an RCMC with the relevant EPC that pertains to their main line of business. If the exporter wishes to export items that are not covered by any EPC, they can obtain an RCMC from the Federation of Indian Export Organisations (FIEO).\n", - "\n", - "4. **Benefits**: With an RCMC, exporters can participate in international trade fairs, get sponsorship for trade delegations, and access market development assistance among other benefits.\n", - "\n", - "5. **Renewal and Cancellation**: The certificate needs to be renewed upon expiry. It can also be cancelled or suspended if the holder fails to abide by the regulatory requirements.\n", - "\n", - "If you need detailed information or specific guidance related to obtaining an RCMC, please let me know!\n" - ] - } + "data": { + "text/html": [ + "
" ], - "source": [ - "\n", - "# Function to set environment variables securely\n", - "def _set_env(key: str):\n", - " if key not in os.environ:\n", - " os.environ[key] = getpass.getpass(f\"{key}:\")\n", - "\n", - "_set_env(\"OPENAI_API_KEY\")\n", - "\n", - "# (Optional) For tracing\n", - "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"False\"\n", - "_set_env(\"LANGCHAIN_API_KEY\")\n", - "\n", - "\n", - "# upload the data based on your usecase\n", - "\n", - "urls = [\n", - " 'https://content.dgft.gov.in/Website/CIEP.pdf',\n", - " 'https://content.dgft.gov.in/Website/GAE.pdf',\n", - " 'https://content.dgft.gov.in/Website/HTE.pdf',\n", - "]\n", - "\n", - "\n", - "docs = [WebBaseLoader(url).load() for url in urls]\n", - "docs_list = [item for sublist in docs for item in sublist]\n", - "\n", - "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n", - " chunk_size=100, chunk_overlap=50\n", - ")\n", - "doc_splits = text_splitter.split_documents(docs_list)\n", - "\n", - "# Add to lancedb as vectordb\n", - "\n", - "vectorstore = LanceDB.from_documents(\n", - " documents=doc_splits,\n", - " embedding=OpenAIEmbeddings(),\n", - ")\n", - "retriever = vectorstore.as_retriever()\n", - "\n", - "\n", - "# create the tools\n", - "retriever_tool = create_retriever_tool(\n", - " retriever,\n", - " \"retrieve_blog_posts\",\n", - " \"Search and return information about customs import export procedure,GST & EXPORTS , How to export\",\n", - ")\n", - "\n", - "tools = [retriever_tool]\n", - "tool_executor = ToolExecutor(tools)\n", - "\n", - "\n", - "\n", - "class AgentState(TypedDict):\n", - " messages: Annotated[Sequence[BaseMessage], add_messages]\n", - "\n", - "def grade_documents(state) -> Literal[\"generate\", \"rewrite\"]:\n", - " class grade(BaseModel):\n", - " binary_score: str = Field(description=\"Relevance score 'yes' or 'no'\")\n", - "\n", - " model = ChatOpenAI(temperature=0, model=\"gpt-4-0125-preview\", streaming=True)\n", - " llm_with_tool = 
model.with_structured_output(grade)\n", - " prompt = PromptTemplate(\n", - " template=\"\"\"You are a grader assessing relevance of a retrieved document to a user question. \\n\n", - " Here is the retrieved document: \\n\\n {context} \\n\\n\n", - " Here is the user question: {question} \\n\n", - " If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \\n\n", - " Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.\"\"\",\n", - " input_variables=[\"context\", \"question\"],\n", - " )\n", - " chain = prompt | llm_with_tool\n", - "\n", - " messages = state[\"messages\"]\n", - " last_message = messages[-1]\n", - " question = messages[0].content\n", - " docs = last_message.content\n", - "\n", - " scored_result = chain.invoke({\"question\": question, \"context\": docs})\n", - " score = scored_result.binary_score\n", - "\n", - " return \"generate\" if score == \"yes\" else \"rewrite\"\n", - "\n", - "def agent(state):\n", - " messages = state[\"messages\"]\n", - " model = ChatOpenAI(temperature=0, streaming=True, model=\"gpt-4-turbo\")\n", - " model = model.bind_tools(tools)\n", - " response = model.invoke(messages)\n", - " return {\"messages\": [response]}\n", - "\n", - "def rewrite(state):\n", - " messages = state[\"messages\"]\n", - " question = messages[0].content\n", - " msg = [\n", - " HumanMessage(\n", - " content=f\"\"\" \\n\n", - " Look at the input and try to reason about the underlying semantic intent / meaning. 
\\n\n", - " Here is the initial question:\n", - " \\n ------- \\n\n", - " {question}\n", - " \\n ------- \\n\n", - " Formulate an improved question: \"\"\",\n", - " )\n", - " ]\n", - " model = ChatOpenAI(temperature=0, model=\"gpt-4-0125-preview\", streaming=True)\n", - " response = model.invoke(msg)\n", - " return {\"messages\": [response]}\n", - "\n", - "def generate(state):\n", - " messages = state[\"messages\"]\n", - " question = messages[0].content\n", - " last_message = messages[-1]\n", - " docs = last_message.content\n", - "\n", - " prompt = hub.pull(\"rlm/rag-prompt\")\n", - " llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0, streaming=True)\n", - "\n", - " def format_docs(docs):\n", - " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", - "\n", - " rag_chain = prompt | llm | StrOutputParser()\n", - " response = rag_chain.invoke({\"context\": docs, \"question\": question})\n", - " return {\"messages\": [response]}\n", - "\n", - "workflow = StateGraph(AgentState)\n", - "workflow.add_node(\"agent\", agent)\n", - "retrieve = ToolNode([retriever_tool])\n", - "workflow.add_node(\"retrieve\", retrieve)\n", - "workflow.add_node(\"rewrite\", rewrite)\n", - "workflow.add_node(\"generate\", generate)\n", - "workflow.set_entry_point(\"agent\")\n", - "workflow.add_conditional_edges(\"agent\", tools_condition, {\"tools\": \"retrieve\", END: END})\n", - "workflow.add_conditional_edges(\"retrieve\", grade_documents)\n", - "workflow.add_edge(\"generate\", END)\n", - "workflow.add_edge(\"rewrite\", \"agent\")\n", - "graph = workflow.compile()\n", - "\n", - "\n", - "def process_message(user_message):\n", - " inputs = {\n", - " \"messages\": [(\"user\", user_message)]\n", - " }\n", - " content_output = None\n", - " for output in graph.stream(inputs):\n", - " print(f\"Debug output: {output}\") # Debugging line to print the output\n", - " if 'agent' in output and 'messages' in output['agent']:\n", - " messages = output['agent']['messages']\n", - " if 
messages and hasattr(messages[0], 'content'):\n", - " content_output = messages[0].content # Accessing attribute directly\n", - " print(f\"Extracted content: {content_output}\") # Print extracted content\n", - " return content_output if content_output else \"No relevant output found.\"\n", - "\n", - "\n", - "# Define example questions to guide the user\n", - "example_questions = [\n", - "\"explain me in short what is PM Gati Shakti National Master Plan (NMP)?\"\n", - "\n", - "]\n", - "\n", - "# Create a Gradio interface\n", - "iface = gr.Interface(\n", - " fn=process_message,\n", - " inputs=\"text\",\n", - " outputs=\"text\",\n", - " title=\"Agentic RAG \",\n", - " description=\"Enter a message to query related to export import .\",\n", - " examples=example_questions,\n", - ")\n", - "\n", - "# Launch the Gradio app\n", - "iface.launch(debug=True)\n", - "\n", - "\n", - "\n" + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JjmUVVn1TdH0" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "source": [ - "#some quetions for testing\n", - "explain me in short what is PM Gati Shakti National Master Plan (NMP)?\n", - "\n", - "what is Zero Rating of Exports?\n", - "\n", - "what is Export Inspection Council of India?\n", - "\n", - "please give us some Details of some of the major initiatives /schemes please ?" 
- ], - "metadata": { - "id": "c921cw61mPdh" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "k6Tq9E1Lqwtj" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" + "name": "stdout", + "output_type": "stream", + "text": [ + "Debug output: {'agent': {'messages': [AIMessage(content='The PM Gati Shakti National Master Plan (NMP) is an ambitious initiative launched by the Government of India aimed at improving infrastructure development across the country. Launched by Prime Minister Narendra Modi, the plan seeks to integrate the planning and coordination of various infrastructure projects across different sectors and ministries.\\n\\nThe core objective of the Gati Shakti NMP is to enhance multi-modal connectivity and reduce logistics costs by bringing together rail, road, air, and waterways projects under a single, unified framework. This holistic approach is intended to boost economic growth, create jobs, and promote regional connectivity.\\n\\nThe plan utilizes digital technology to map and synchronize projects, ensuring that all related departments and stakeholders are aligned, which helps in eliminating bottlenecks, improving project execution speed, and enhancing overall efficiency. The Gati Shakti NMP is seen as a transformative step towards making India a global manufacturing hub and improving the ease of doing business.', response_metadata={'finish_reason': 'stop'}, id='run-8bc90fb4-007b-4384-9320-35c3621eb9b8-0')]}}\n", + "Extracted content: The PM Gati Shakti National Master Plan (NMP) is an ambitious initiative launched by the Government of India aimed at improving infrastructure development across the country. 
Launched by Prime Minister Narendra Modi, the plan seeks to integrate the planning and coordination of various infrastructure projects across different sectors and ministries.\n", + "\n", + "The core objective of the Gati Shakti NMP is to enhance multi-modal connectivity and reduce logistics costs by bringing together rail, road, air, and waterways projects under a single, unified framework. This holistic approach is intended to boost economic growth, create jobs, and promote regional connectivity.\n", + "\n", + "The plan utilizes digital technology to map and synchronize projects, ensuring that all related departments and stakeholders are aligned, which helps in eliminating bottlenecks, improving project execution speed, and enhancing overall efficiency. The Gati Shakti NMP is seen as a transformative step towards making India a global manufacturing hub and improving the ease of doing business.\n", + "Debug output: {'agent': {'messages': [AIMessage(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_AE3GmC26FUSF63Nb4NhJCSMb', 'function': {'arguments': '{\"query\": \"steps for export import procedure\"}', 'name': 'retrieve_blog_posts'}, 'type': 'function'}, {'index': 1, 'id': 'call_B56lmGd2JIHRDCdxf9Um9Ml7', 'function': {'arguments': '{\"query\": \"customs import export procedure\"}', 'name': 'retrieve_blog_posts'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-2c42f15d-390b-4509-8b23-fbc6d1fb203d-0', tool_calls=[{'name': 'retrieve_blog_posts', 'args': {'query': 'steps for export import procedure'}, 'id': 'call_AE3GmC26FUSF63Nb4NhJCSMb'}, {'name': 'retrieve_blog_posts', 'args': {'query': 'customs import export procedure'}, 'id': 'call_B56lmGd2JIHRDCdxf9Um9Ml7'}])]}}\n", + "Extracted content: \n", + "Debug output: {'retrieve': {'messages': [ToolMessage(content='<Microsoft Word - CUSTOMS IMPORT EXPORT PROCEDURES _final_Admin\\n\\nendobj\\r\\n118 0 obj\\r\\n<>/F 4/A<>/StructParent 32>>\\r\\nendobj\\r\\n119 0 
obj\\n\\nEP\\x12t1Ԗ�m�l1�PI�����ٲW$��`�B[C��\\x1e�2�R�ν7ȗ���C�3�a���\\x1fZ��U\\x7f��\\x10�5y<:', name='retrieve_blog_posts', id='8714e8e7-ed08-4452-8d1f-621f4f25af81', tool_call_id='call_AE3GmC26FUSF63Nb4NhJCSMb'), ToolMessage(content='<Microsoft Word - CUSTOMS IMPORT EXPORT PROCEDURES _final_Admin\\n\\nCdTa�x\\t&��A\\x0eb�1����\\x0errMM�q�\\x7f1Q2cҩ0de�˂���>�\\x1b\\x12�\\x19?�\\x0fa�L���Q\\n\\nt\\'�P��h\\x13��u\\u05eex����|^���(I�$�%\\x1f�Q%�.\\x19U�B��jY��\\x16�?\\x19\\x19�\\x0e�$��W�BԸ���\\x1ck��\\x19��l!K�\\x05�!z�\\x1dq\\x12�\\x1d�v��]�P\"\\x11ROC��\\x14', name='retrieve_blog_posts', id='8a56ed96-cd2a-4846-a60d-258e6430aa79', tool_call_id='call_B56lmGd2JIHRDCdxf9Um9Ml7')]}}\n", + "Debug output: {'generate': {'messages': [\"I don't know.\"]}}\n", + "Debug output: {'agent': {'messages': [AIMessage(content='The term \"RCMC\" stands for Registration Cum Membership Certificate. It is a certificate that is provided by the Export Promotion Councils (EPCs) or commodity boards in India. An RCMC is issued to exporters dealing in products registered with these agencies. Holding an RCMC is mandatory for exporters to avail benefits under the Foreign Trade Policy like duty drawback, concessions, and other support.\\n\\nHere are some key points about RCMC:\\n\\n1. **Purpose**: The RCMC is used to certify that an exporter is registered with the respective EPC and is eligible for various benefits under the export-import policy.\\n\\n2. **Validity**: Typically, an RCMC is valid for five years.\\n\\n3. **Application**: Exporters must apply for an RCMC with the relevant EPC that pertains to their main line of business. If the exporter wishes to export items that are not covered by any EPC, they can obtain an RCMC from the Federation of Indian Export Organisations (FIEO).\\n\\n4. **Benefits**: With an RCMC, exporters can participate in international trade fairs, get sponsorship for trade delegations, and access market development assistance among other benefits.\\n\\n5. 
**Renewal and Cancellation**: The certificate needs to be renewed upon expiry. It can also be cancelled or suspended if the holder fails to abide by the regulatory requirements.\\n\\nIf you need detailed information or specific guidance related to obtaining an RCMC, please let me know!', response_metadata={'finish_reason': 'stop'}, id='run-4fa5e544-510a-4140-984c-89dedd855e71-0')]}}\n", + "Extracted content: The term \"RCMC\" stands for Registration Cum Membership Certificate. It is a certificate that is provided by the Export Promotion Councils (EPCs) or commodity boards in India. An RCMC is issued to exporters dealing in products registered with these agencies. Holding an RCMC is mandatory for exporters to avail benefits under the Foreign Trade Policy like duty drawback, concessions, and other support.\n", + "\n", + "Here are some key points about RCMC:\n", + "\n", + "1. **Purpose**: The RCMC is used to certify that an exporter is registered with the respective EPC and is eligible for various benefits under the export-import policy.\n", + "\n", + "2. **Validity**: Typically, an RCMC is valid for five years.\n", + "\n", + "3. **Application**: Exporters must apply for an RCMC with the relevant EPC that pertains to their main line of business. If the exporter wishes to export items that are not covered by any EPC, they can obtain an RCMC from the Federation of Indian Export Organisations (FIEO).\n", + "\n", + "4. **Benefits**: With an RCMC, exporters can participate in international trade fairs, get sponsorship for trade delegations, and access market development assistance among other benefits.\n", + "\n", + "5. **Renewal and Cancellation**: The certificate needs to be renewed upon expiry. 
It can also be cancelled or suspended if the holder fails to abide by the regulatory requirements.\n", + "\n", + "If you need detailed information or specific guidance related to obtaining an RCMC, please let me know!\n" + ] } + ], + "source": [ + "# Function to set environment variables securely\n", + "def _set_env(key: str):\n", + " if key not in os.environ:\n", + " os.environ[key] = getpass.getpass(f\"{key}:\")\n", + "\n", + "\n", + "_set_env(\"OPENAI_API_KEY\")\n", + "\n", + "# (Optional) For tracing\n", + "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"False\"\n", + "_set_env(\"LANGCHAIN_API_KEY\")\n", + "\n", + "\n", + "# upload the data based on your usecase\n", + "\n", + "urls = [\n", + " \"https://content.dgft.gov.in/Website/CIEP.pdf\",\n", + " \"https://content.dgft.gov.in/Website/GAE.pdf\",\n", + " \"https://content.dgft.gov.in/Website/HTE.pdf\",\n", + "]\n", + "\n", + "\n", + "docs = [WebBaseLoader(url).load() for url in urls]\n", + "docs_list = [item for sublist in docs for item in sublist]\n", + "\n", + "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n", + " chunk_size=100, chunk_overlap=50\n", + ")\n", + "doc_splits = text_splitter.split_documents(docs_list)\n", + "\n", + "# Add to lancedb as vectordb\n", + "\n", + "vectorstore = LanceDB.from_documents(\n", + " documents=doc_splits,\n", + " embedding=OpenAIEmbeddings(),\n", + ")\n", + "retriever = vectorstore.as_retriever()\n", + "\n", + "\n", + "# create the tools\n", + "retriever_tool = create_retriever_tool(\n", + " retriever,\n", + " \"retrieve_blog_posts\",\n", + " \"Search and return information about customs import export procedure,GST & EXPORTS , How to export\",\n", + ")\n", + "\n", + "tools = [retriever_tool]\n", + "tool_executor = ToolExecutor(tools)\n", + "\n", + "\n", + "class AgentState(TypedDict):\n", + " messages: Annotated[Sequence[BaseMessage], add_messages]\n", + "\n", + "\n", + "def grade_documents(state) -> Literal[\"generate\", \"rewrite\"]:\n", + " class 
grade(BaseModel):\n", + " binary_score: str = Field(description=\"Relevance score 'yes' or 'no'\")\n", + "\n", + " model = ChatOpenAI(temperature=0, model=\"gpt-4-0125-preview\", streaming=True)\n", + " llm_with_tool = model.with_structured_output(grade)\n", + " prompt = PromptTemplate(\n", + " template=\"\"\"You are a grader assessing relevance of a retrieved document to a user question. \\n\n", + " Here is the retrieved document: \\n\\n {context} \\n\\n\n", + " Here is the user question: {question} \\n\n", + " If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \\n\n", + " Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.\"\"\",\n", + " input_variables=[\"context\", \"question\"],\n", + " )\n", + " chain = prompt | llm_with_tool\n", + "\n", + " messages = state[\"messages\"]\n", + " last_message = messages[-1]\n", + " question = messages[0].content\n", + " docs = last_message.content\n", + "\n", + " scored_result = chain.invoke({\"question\": question, \"context\": docs})\n", + " score = scored_result.binary_score\n", + "\n", + " return \"generate\" if score == \"yes\" else \"rewrite\"\n", + "\n", + "\n", + "def agent(state):\n", + " messages = state[\"messages\"]\n", + " model = ChatOpenAI(temperature=0, streaming=True, model=\"gpt-4-turbo\")\n", + " model = model.bind_tools(tools)\n", + " response = model.invoke(messages)\n", + " return {\"messages\": [response]}\n", + "\n", + "\n", + "def rewrite(state):\n", + " messages = state[\"messages\"]\n", + " question = messages[0].content\n", + " msg = [\n", + " HumanMessage(\n", + " content=f\"\"\" \\n\n", + " Look at the input and try to reason about the underlying semantic intent / meaning. 
\\n\n", + " Here is the initial question:\n", + " \\n ------- \\n\n", + " {question}\n", + " \\n ------- \\n\n", + " Formulate an improved question: \"\"\",\n", + " )\n", + " ]\n", + " model = ChatOpenAI(temperature=0, model=\"gpt-4-0125-preview\", streaming=True)\n", + " response = model.invoke(msg)\n", + " return {\"messages\": [response]}\n", + "\n", + "\n", + "def generate(state):\n", + " messages = state[\"messages\"]\n", + " question = messages[0].content\n", + " last_message = messages[-1]\n", + " docs = last_message.content\n", + "\n", + " prompt = hub.pull(\"rlm/rag-prompt\")\n", + " llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0, streaming=True)\n", + "\n", + " def format_docs(docs):\n", + " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", + "\n", + " rag_chain = prompt | llm | StrOutputParser()\n", + " response = rag_chain.invoke({\"context\": docs, \"question\": question})\n", + " return {\"messages\": [response]}\n", + "\n", + "\n", + "workflow = StateGraph(AgentState)\n", + "workflow.add_node(\"agent\", agent)\n", + "retrieve = ToolNode([retriever_tool])\n", + "workflow.add_node(\"retrieve\", retrieve)\n", + "workflow.add_node(\"rewrite\", rewrite)\n", + "workflow.add_node(\"generate\", generate)\n", + "workflow.set_entry_point(\"agent\")\n", + "workflow.add_conditional_edges(\n", + " \"agent\", tools_condition, {\"tools\": \"retrieve\", END: END}\n", + ")\n", + "workflow.add_conditional_edges(\"retrieve\", grade_documents)\n", + "workflow.add_edge(\"generate\", END)\n", + "workflow.add_edge(\"rewrite\", \"agent\")\n", + "graph = workflow.compile()\n", + "\n", + "\n", + "def process_message(user_message):\n", + " inputs = {\"messages\": [(\"user\", user_message)]}\n", + " content_output = None\n", + " for output in graph.stream(inputs):\n", + " print(f\"Debug output: {output}\") # Debugging line to print the output\n", + " if \"agent\" in output and \"messages\" in output[\"agent\"]:\n", + " messages = 
output[\"agent\"][\"messages\"]\n", + " if messages and hasattr(messages[0], \"content\"):\n", + " content_output = messages[0].content # Accessing attribute directly\n", + " print(f\"Extracted content: {content_output}\") # Print extracted content\n", + " return content_output if content_output else \"No relevant output found.\"\n", + "\n", + "\n", + "# Define example questions to guide the user\n", + "example_questions = [\n", + " \"explain me in short what is PM Gati Shakti National Master Plan (NMP)?\"\n", + "]\n", + "\n", + "# Create a Gradio interface\n", + "iface = gr.Interface(\n", + " fn=process_message,\n", + " inputs=\"text\",\n", + " outputs=\"text\",\n", + " title=\"Agentic RAG \",\n", + " description=\"Enter a message to query related to export import .\",\n", + " examples=example_questions,\n", + ")\n", + "\n", + "# Launch the Gradio app\n", + "iface.launch(debug=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JjmUVVn1TdH0" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Some questions for testing\n", + "explain me in short what is PM Gati Shakti National Master Plan (NMP)?\n", + "\n", + "what is Zero Rating of Exports?\n", + "\n", + "what is Export Inspection Council of India?\n", + "\n", + "please give us some Details of some of the major initiatives /schemes please ?"
+ ], + "metadata": { + "id": "c921cw61mPdh" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "k6Tq9E1Lqwtj" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } \ No newline at end of file diff --git a/tutorials/NER-dataset-with-Instructor/README.md b/tutorials/NER-dataset-with-Instructor/README.md new file mode 100644 index 00000000..39ea3d8e --- /dev/null +++ b/tutorials/NER-dataset-with-Instructor/README.md @@ -0,0 +1,23 @@ +# Create a structured dataset using the Instructor +Structured data is still crucial when working with structured datasets to ensure high data quality, easy management, and meaningful analysis. Using LLMs to extract structured data automates and improves this process, making it faster and more scalable for different fields. + +Pydantic, the backbone of [Instructor](https://github.com/jxnl/instructor/), enables high customization and utilizes return datatype hints for seamless schema validation. It seamlessly integrates with LanceDB and directly inserts data into tables. + +![instructor](../../assets/instructor.png) + +## Run Code + +### Set Env variable +Add `OPENAI_API_KEY` as an environment variable: + +```bash +export OPENAI_API_KEY=sk-... +``` + +Once you have added the key as an environment variable, you are ready to run the following code. +```bash +python3 main.py +``` +**NOTE**: For this example, we are using the [**medium article dataset**](https://huggingface.co/datasets/fabiochiu/medium-articles). See `main.py` to change the dataset or the entities to be extracted in the schemas defined there.
+ +Once you have created this dataset, You are ready to do [NER-powered semantic search](https://blog.lancedb.com/ner-powered-semantic-search-using-lancedb-51051dc3e493/) diff --git a/tutorials/NER-dataset-with-Instructor/main.py b/tutorials/NER-dataset-with-Instructor/main.py new file mode 100644 index 00000000..8e2122c6 --- /dev/null +++ b/tutorials/NER-dataset-with-Instructor/main.py @@ -0,0 +1,70 @@ +# import libraries +import instructor +from pydantic import BaseModel +from openai import OpenAI +from lancedb.pydantic import Vector, LanceModel +import lancedb +from langchain_openai import OpenAIEmbeddings +from datasets import load_dataset + + +# load the dataset and convert to pandas dataframe +df = load_dataset( + "fabiochiu/medium-articles", data_files="medium_articles.csv", split="train" +).to_pandas() + +df = df.dropna().sample(20000, random_state=32) +# select first 1000 characters from each article +df["text"] = df["text"].str[:1000] +# join article title and the text +df["title_text"] = df["title"] + ". 
" + df["text"] + + +# schema for table +class UserData(LanceModel): + vector: Vector(1536) + headline: str + content: str + entity: str + sentiment: str + news_article: bool + + +# schema for instructor output +class structureData(BaseModel): + headline: str + entity: str + sentiment: str + news: bool + + +# Patch the OpenAI client +client = instructor.from_openai(OpenAI()) +openai_embedding = OpenAIEmbeddings(model="text-embedding-3-small") + +# connect lancedb +db = lancedb.connect("~/.lancedb") +table_name = "instructor_lancedb" +table = db.create_table(table_name, schema=UserData, mode="overwrite") + +for index, row in df[:10].iterrows(): + # generate response + structured_info = client.chat.completions.create( + model="gpt-3.5-turbo", + response_model=structureData, + messages=[{"role": "user", "content": row["title_text"]}], + ) + + embedding = openai_embedding.embed_query(row["title_text"]) + userdata = UserData( + vector=embedding, + headline=structured_info.headline, + content=row["title_text"], + entity=structured_info.entity, + sentiment=structured_info.sentiment, + news_article=structured_info.news, + ) + table.add([userdata]) + +# show table content +print(table.to_pandas())