mongodb
diff --git a/‎create-embeddings/README.md
Lines changed: 4 additions & 2 deletions b/‎create-embeddings/README.md
Lines changed: 4 additions & 2 deletions
diff --git a/‎create-embeddings/voyage-existing-data.ipynb
Lines changed: 271 additions & 0 deletions b/‎create-embeddings/voyage-existing-data.ipynb
Lines changed: 271 additions & 0 deletions
@@ -11,5 +11,7 @@ new data or from data you already have in MongoDB Atlas.
 |----------|-------------|
 | [open-source-new-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/open-source-new-data.ipynb) | Generate embeddings from new data using an open-source embedding model |
 | [open-source-existing-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/open-source-existing-data.ipynb) | Generate embeddings from  existing data in Atlas using an open-source embedding model |
-| [openai-new-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/openai-new-data.ipynb) | Generate embeddings from new data using an OpenAI embedding model |
-| [openai-existing-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/openai-existing-data.ipynb) | Generate embeddings from existing data in Atlas using an OpenAI embedding model |
+| [voyage-new-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/voyage-new-data.ipynb) | Generate embeddings from new data using an embedding model from Voyage AI |
+| [voyage-existing-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/voyage-existing-data.ipynb) | Generate embeddings from existing data in Atlas using an embedding model from Voyage AI |
+| [openai-new-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/openai-new-data.ipynb) | Generate embeddings from new data using an embedding model from OpenAI |
+| [openai-existing-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/voyage-existing-data.ipynb) | Generate embeddings from existing data in Atlas using an embedding model from OpenAI |
@@ -0,0 +1,271 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Atlas Vector Search - Create Embeddings - Voyage AI - Existing Data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook is a companion to the [Create Embeddings](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/) page. Refer to the page for set-up instructions and detailed explanations.\n",
+    "\n",
+    "This notebook takes you through how to generate embeddings from **existing data in Atlas** by using the ``voyage-3`` model from Voyage AI.\n",
+    "\n",
+    "<a target=\"_blank\" href=\"https://colab.research.google.com/github/mongodb/docs-notebooks/blob/main/create-embeddings/voyage-existing-data.ipynb\">\n",
+    "  <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n",
+    "</a>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "shellscript"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "pip install --quiet --upgrade voyageai pymongo"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "vscode": {
+     "languageId": "shellscript"
+    }
+   },
+   "source": [
+    "## Use an Embedding Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import voyageai\n",
+    "\n",
+    "# Specify your Voyage API key and embedding model\n",
+    "os.environ[\"VOYAGE_API_KEY\"] = \"<api-key>\"\n",
+    "model = \"voyage-3\"\n",
+    "vo = voyageai.Client()\n",
+    "\n",
+    "# Define a function to generate embeddings\n",
+    "def get_embedding(data, input_type = \"document\"):\n",
+    "  embeddings = vo.embed(\n",
+    "      data, model = model, input_type = input_type\n",
+    "  ).embeddings\n",
+    "  return embeddings[0]\n",
+    "\n",
+    "# Generate an embedding\n",
+    "embedding = get_embedding(\"foo\")\n",
+    "print(embedding)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### (Optional) Compress your embeddings\n",
+    "\n",
+    "Optionally, run the following code to define a function that converts your embeddings into BSON `binData` vectors for [efficient storage and retrieval](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/#vector-compression)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from bson.binary import Binary \n",
+    "from bson.binary import BinaryVectorDtype\n",
+    "\n",
+    "# Define a function to generate BSON vectors\n",
+    "def generate_bson_vector(vector, vector_dtype):\n",
+    "   return Binary.from_vector(vector, vector_dtype)\n",
+    "\n",
+    "# Generate BSON vector from the sample float32 embedding\n",
+    "bson_float32_embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n",
+    "\n",
+    "# Print the converted embedding\n",
+    "print(f\"The converted BSON embedding is: {bson_float32_embedding}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Generate Embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pymongo\n",
+    "\n",
+    "# Connect to your Atlas cluster\n",
+    "mongo_client = pymongo.MongoClient(\"<connection-string>\")\n",
+    "db = mongo_client[\"sample_airbnb\"]\n",
+    "collection = db[\"listingsAndReviews\"]\n",
+    "\n",
+    "# Define a filter to exclude documents with null or empty 'summary' fields\n",
+    "filter = { 'summary': { '$exists': True, \"$nin\": [ None, \"\" ] } }\n",
+    "\n",
+    "# Get a subset of documents in the collection\n",
+    "documents = collection.find(filter, {'_id': 1, 'summary': 1}).limit(50)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pymongo import UpdateOne\n",
+    "\n",
+    "# Generate the list of bulk write operations\n",
+    "operations = []\n",
+    "for doc in documents:\n",
+    "   summary = doc[\"summary\"]\n",
+    "   # Generate embeddings for this document\n",
+    "   embedding = get_embedding(summary)\n",
+    "\n",
+    "   # Uncomment the following line to convert to BSON vectors\n",
+    "   # embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n",
+    "\n",
+    "   # Add the update operation to the list\n",
+    "   operations.append(UpdateOne(\n",
+    "      {\"_id\": doc[\"_id\"]},\n",
+    "      {\"$set\": {\n",
+    "         \"embedding\": embedding\n",
+    "      }}\n",
+    "   ))\n",
+    "\n",
+    "# Execute the bulk write operation\n",
+    "if operations:\n",
+    "   result = collection.bulk_write(operations)\n",
+    "   updated_doc_count = result.modified_count\n",
+    "\n",
+    "print(f\"Updated {updated_doc_count} documents.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Index and Query Your Embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pymongo.operations import SearchIndexModel\n",
+    "import time\n",
+    "\n",
+    "# Create your index model, then create the search index\n",
+    "search_index_model = SearchIndexModel(\n",
+    "  definition = {\n",
+    "    \"fields\": [\n",
+    "      {\n",
+    "        \"type\": \"vector\",\n",
+    "        \"path\": \"embedding\",\n",
+    "        \"similarity\": \"dotProduct\",\n",
+    "        \"numDimensions\": 1024\n",
+    "      }\n",
+    "    ]\n",
+    "  },\n",
+    "  name=\"vector_index\",\n",
+    "  type=\"vectorSearch\"\n",
+    ")\n",
+    "result = collection.create_search_index(model=search_index_model)\n",
+    "\n",
+    "# Wait for initial sync to complete\n",
+    "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n",
+    "predicate=None\n",
+    "if predicate is None:\n",
+    "  predicate = lambda index: index.get(\"queryable\") is True\n",
+    "\n",
+    "while True:\n",
+    "  indices = list(collection.list_search_indexes(result))\n",
+    "  if len(indices) and predicate(indices[0]):\n",
+    "    break\n",
+    "  time.sleep(5)\n",
+    "print(result + \" is ready for querying.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Generate embedding for the search query\n",
+    "query_embedding = get_embedding(\"beach house\", input_type=\"query\")\n",
+    "\n",
+    "# Sample vector search pipeline\n",
+    "pipeline = [\n",
+    "   {\n",
+    "      \"$vectorSearch\": {\n",
+    "            \"index\": \"vector_index\",\n",
+    "            \"queryVector\": query_embedding,\n",
+    "            \"path\": \"embedding\",\n",
+    "            \"exact\": True,\n",
+    "            \"limit\": 5\n",
+    "      }\n",
+    "   }, \n",
+    "   {\n",
+    "      \"$project\": {\n",
+    "         \"_id\": 0, \n",
+    "         \"summary\": 1,\n",
+    "         \"score\": {\n",
+    "            \"$meta\": \"vectorSearchScore\"\n",
+    "         }\n",
+    "      }\n",
+    "   }\n",
+    "]\n",
+    "\n",
+    "# Execute the search\n",
+    "results = collection.aggregate(pipeline)\n",
+    "\n",
+    "# Print results\n",
+    "for i in results:\n",
+    "   print(i)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}