214 changes: 142 additions & 72 deletions examples/3-applied-rag/embeddings.ipynb
@@ -5,54 +5,67 @@
"execution_count": 1,
"id": "d566bb99-6808-4976-8476-ed05b7941b80",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/fh/_0hrdmkn5bdcs8mzhjtd7w7c0000gn/T/ipykernel_29348/3557198016.py:1: DeprecationWarning: \n",
"Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n",
"(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n",
"but was not found to be installed on your system.\n",
"If this would cause problems for you,\n",
"please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n",
" \n",
" import pandas as pd\n"
]
},
{
"data": {
"text/plain": [
"700"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"import pandas as pd\n",
"df = pd.read_csv('../../top_rated_wines.csv')\n",
"df = df[df['variety'].notna()] # remove any NaN values as it blows up serialization\n",
"data = df.sample(700).to_dict('records') # Get only 700 records. More records will make it slower to index\n",
"len(data)"
"# "
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "af8bce2c-e123-498a-a5f2-cefffd17fc74",
"id": "cf4f8184",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"with open(\"../../rag_dataset.json\", \"r\", encoding=\"utf-8\") as f:\n",
" data = json.load(f)"
]
},
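The cell above replaces the wine CSV load with a JSON dataset. A minimal sanity check, assuming `rag_dataset.json` is a list of records with `content` and `metadata` keys (the splitter cell below relies on exactly that shape, and the search cells expect a `file_name` field inside the metadata):

```python
# Sketch: sanity-check the assumed dataset shape -- a list of
# {"content": str, "metadata": dict} records. Adjust if your schema differs.
assert isinstance(data, list) and len(data) > 0
sample = data[0]
print(sample["metadata"])        # expected to include "file_name" (used when searching)
print(sample["content"][:200])   # first 200 characters of the raw text
```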
{
"cell_type": "code",
"execution_count": 3,
"id": "22ada10c",
"metadata": {},
"outputs": [],
"source": [
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"\n",
"splitter = RecursiveCharacterTextSplitter(\n",
" chunk_size=800,\n",
" chunk_overlap=100,\n",
" separators=[\"\\n\\n\", \"\\n\", \".\", \" \"]\n",
")\n",
"\n",
"documents = splitter.create_documents(\n",
" texts=[item[\"content\"] for item in data],\n",
" metadatas=[item[\"metadata\"] for item in data]\n",
")\n"
]
},
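`create_documents` returns LangChain `Document` objects, each carrying a `page_content` chunk of at most ~800 characters (the default length function counts characters) plus the metadata of the record it came from; the 100-character overlap preserves context across chunk boundaries. A quick way to verify the output:

```python
# Sketch: inspect what the splitter produced.
print(len(documents), "chunks from", len(data), "records")
print("longest chunk:", max(len(d.page_content) for d in documents), "chars")
print(documents[0].metadata)   # metadata is copied onto every chunk of a record
```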
{
"cell_type": "code",
"execution_count": 4,
"id": "af8bce2c-e123-498a-a5f2-cefffd17fc74",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\d.fernandez.macias\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"from qdrant_client import models, QdrantClient\n",
"from sentence_transformers import SentenceTransformer"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"id": "2b0e4be5-7518-4458-bf47-6913ef9a72a9",
"metadata": {},
"outputs": [],
@@ -62,7 +75,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 6,
"id": "5efa031d-b18a-4db1-9c34-9989a15c822b",
"metadata": {},
"outputs": [],
@@ -73,49 +86,68 @@
},
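The two collapsed cells above are hidden in this diff; judging by the names used later (`encoder`, `qdrant`), they instantiate the embedding model and the Qdrant client. A hypothetical reconstruction, not the PR's actual code, with the model name and client target as guesses:

```python
# Hypothetical reconstruction of the collapsed cells -- the real model name and
# Qdrant location are hidden by the diff; these are common defaults, not the PR's code.
encoder = SentenceTransformer("all-MiniLM-L6-v2")
qdrant = QdrantClient(":memory:")  # or QdrantClient(url="http://localhost:6333")
```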
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"id": "6c03be93-a076-425e-8df1-5a8b6367e558",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\d.fernandez.macias\\AppData\\Local\\Temp\\ipykernel_26240\\439416039.py:2: DeprecationWarning: `recreate_collection` method is deprecated and will be removed in the future. Use `collection_exists` to check collection existence and `create_collection` instead.\n",
" qdrant.recreate_collection(\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create collection to store wines\n",
"qdrant.recreate_collection(\n",
" collection_name=\"top_wines\",\n",
" collection_name=\"resumes\",\n",
" vectors_config=models.VectorParams(\n",
" size=encoder.get_sentence_embedding_dimension(), # Vector size is defined by used model\n",
" size=encoder.get_sentence_embedding_dimension(),\n",
" distance=models.Distance.COSINE\n",
" )\n",
")"
]
},
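The output above flags `recreate_collection` as deprecated. The replacement pattern the warning points to, sketched with the same parameters:

```python
# Non-deprecated equivalent, per the DeprecationWarning above:
# check existence explicitly, then (re)create.
if qdrant.collection_exists("resumes"):
    qdrant.delete_collection("resumes")
qdrant.create_collection(
    collection_name="resumes",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
    ),
)
```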
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"id": "655d08af-758f-4338-b112-cf94045c7b0d",
"metadata": {},
"outputs": [],
"source": [
"# vectorize!\n",
"qdrant.upload_points(\n",
" collection_name=\"top_wines\",\n",
" collection_name=\"resumes\",\n",
" points=[\n",
" models.PointStruct(\n",
" id=idx,\n",
" vector=encoder.encode(doc[\"notes\"]).tolist(),\n",
" payload=doc,\n",
" ) for idx, doc in enumerate(data) # data is the variable holding all the wines\n",
" vector=encoder.encode(doc.page_content).tolist(),\n",
" payload={**doc.metadata, \"text\": doc.page_content}\n",
" ) for idx, doc in enumerate(documents)\n",
" ]\n",
")"
]
},
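One `encode()` call per chunk is the slow path; sentence-transformers batches internally when given a list, so encoding all chunks up front is usually much faster. A sketch of the same upload with batched embeddings:

```python
# Sketch: batch-encode all chunks in one call, then upload.
vectors = encoder.encode([d.page_content for d in documents])
qdrant.upload_points(
    collection_name="resumes",
    points=[
        models.PointStruct(
            id=idx,
            vector=vec.tolist(),
            payload={**doc.metadata, "text": doc.page_content},
        )
        for idx, (doc, vec) in enumerate(zip(documents, vectors))
    ],
)
```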
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"id": "f23bc999",
"metadata": {},
"outputs": [],
"source": [
"user_prompt = \"Suggest me an amazing Malbec wine from Argentina\""
"user_prompt = \"I need someone to program in javascript\""
]
},
{
Expand All @@ -125,56 +157,94 @@
"metadata": {},
"outputs": [],
"source": [
"# Search time for awesome wines!\n",
"\n",
"hits = qdrant.search(\n",
" collection_name=\"top_wines\",\n",
" collection_name=\"resumes\",\n",
" query_vector=encoder.encode(user_prompt).tolist(),\n",
" limit=3\n",
")\n",
"for hit in hits:\n",
" print(hit.payload, \"score:\", hit.score)"
" limit=50\n",
")"
]
},
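With `limit=50` the search returns chunk-level hits, several of which can come from the same resume; the next cell aggregates them per file. To peek at the raw hits first (each payload carries the `file_name` field set during upload):

```python
# Sketch: inspect the top raw hits before aggregating per file.
for hit in hits[:5]:
    print(round(hit.score, 3), hit.payload.get("file_name"))
```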
{
"cell_type": "code",
"execution_count": null,
"id": "33243e5d-9e0d-4ec4-98e9-3fc56b8bdb10",
"execution_count": 11,
"id": "60b15e00",
"metadata": {},
"outputs": [],
"source": [
"# define a variable to hold the search results\n",
"search_results = [hit.payload for hit in hits]"
"from collections import defaultdict\n",
"\n",
"file_stats = defaultdict(lambda: {\"count\": 0, \"max_score\": 0, \"chunks\": []})\n",
"\n",
"for hit in hits:\n",
" fname = hit.payload[\"file_name\"]\n",
" file_stats[fname][\"count\"] += 1\n",
" file_stats[fname][\"max_score\"] = max(file_stats[fname][\"max_score\"], hit.score)\n",
" file_stats[fname][\"chunks\"].append(hit.payload[\"text\"])\n",
"\n",
"# (count * max_score) to prioritize\n",
"sorted_files = sorted(\n",
" file_stats.items(),\n",
" key=lambda x: (x[1][\"count\"] * x[1][\"max_score\"]),\n",
" reverse=True\n",
")\n",
"\n",
"# Step 3: Build a context string from the top 2 results\n",
"top_n = 2\n",
"context_chunks = []\n",
"\n",
"for i, (fname, stats) in enumerate(sorted_files[:top_n]):\n",
" joined_chunks = \"\\n\".join(stats[\"chunks\"])\n",
" context_chunks.append(f\"From {fname}:\\n{joined_chunks}\")\n",
"\n",
"main_context = \"\\n\\n---\\n\\n\".join(context_chunks)\n",
"\n",
"# Optional: get names of other people you *didn't* include\n",
"excluded = [fname for fname, _ in sorted_files[top_n:]]\n",
"also_mentioned = \", \".join(set(excluded))\n"
]
},
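A quick printout makes the heuristic's ranking visible before it goes into the prompt:

```python
# Sketch: show the per-file ranking produced above.
for fname, stats in sorted_files[:top_n]:
    print(f"{fname}: {stats['count']} chunks, max score {stats['max_score']:.3f}")
print("Also mentioned but excluded:", also_mentioned)
```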
{
"cell_type": "code",
"execution_count": null,
"id": "e6c2b91e",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"I'm sorry, but as an AI text-based model, I don't have the capability to remember personal data or previous interactions unless they are included in the current session's input. If you mentioned your name earlier in this conversation and want me to use it again, please feel free to remind me of it!\n"
]
}
],
"source": [
"# Now time to connect to the local large language model\n",
"from openai import OpenAI\n",
"client = OpenAI(\n",
" base_url=\"http://127.0.0.1:8080/v1\", # \"http://<Your api-server IP>:port\"\n",
" api_key = \"sk-no-key-required\"\n",
")\n",
"completion = client.chat.completions.create(\n",
" model=\"LLaMA_CPP\",\n",
"import ollama\n",
"o_models=[\"deepseek-r1:1.5b\",\"phi4:latest\"]\n",
"# Construct your RAG prompt\n",
"prompt_text = f\"\"\"\n",
"You are an assistant specialized in analyzing resumes. Based on the following resume information:\n",
"\n",
"{main_context}\n",
"\n",
"Answer the question: {user_prompt}\n",
"\"\"\"\n",
"# Call the model with the chat interface\n",
"response = ollama.chat(\n",
" model=o_models[1],\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are chatbot, a wine specialist. Your top priority is to help guide users into selecting amazing wine and guide them with their requests.\"},\n",
" {\"role\": \"user\", \"content\": \"Suggest me an amazing Malbec wine from Argentina\"},\n",
" {\"role\": \"assistant\", \"content\": str(search_results)}\n",
" ]\n",
" {\"role\": \"system\", \"content\": \"You are a helpful assistant for analyzing CVs and finding relevant candidates.\"},\n",
" {\"role\": \"user\", \"content\": prompt_text}\n",
" ],\n",
"\n",
")\n",
"print(completion.choices[0].message)"
"\n",
"print(response['message']['content'])\n"
]
}
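Since `o_models` lists two local models, a natural follow-up is to run the same RAG prompt through each and compare answers (assuming both have been pulled with `ollama pull` beforehand):

```python
# Sketch: compare both local models on the same RAG prompt.
for m in o_models:
    r = ollama.chat(
        model=m,
        messages=[
            {"role": "system", "content": "You are a helpful assistant for analyzing CVs and finding relevant candidates."},
            {"role": "user", "content": prompt_text},
        ],
    )
    print(f"--- {m} ---\n{r['message']['content']}\n")
```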
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
@@ -188,7 +258,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.13.1"
}
},
"nbformat": 4,