30 | 30 | },
31 | 31 | "outputs": [],
32 | 32 | "source": [
33 | | - "pip install --quiet --upgrade pymongo cohere"
| 33 | + "pip install --quiet --upgrade voyageai pymongo"
34 | 34 | ]
35 | 35 | },
36 | 36 | {

40 | 40 | "outputs": [],
41 | 41 | "source": [
42 | 42 | "import os\n",
43 | | - "import pymongo\n",
44 | | - "import cohere\n",
| 43 | + "import voyageai\n",
45 | 44 | "from bson.binary import Binary, BinaryVectorDtype\n",
46 | 45 | "\n",
47 | | - "# Specify your Cohere API key\n",
48 | | - "os.environ[\"COHERE_API_KEY\"] = \"<COHERE-API-KEY>\"\n",
49 | | - "cohere_client = cohere.Client(os.environ[\"COHERE_API_KEY\"])\n",
| 46 | + "# Initialize the VoyageAI Client\n",
| 47 | + "os.environ[\"VOYAGE_API_KEY\"] = \"<VOYAGEAI-API-KEY>\"\n",
| 48 | + "vo = voyageai.Client()\n",
50 | 49 | "\n",
51 | | - "# Define function to generate embeddings using the embed-english-v3.0 model\n",
52 | | - "def get_embedding(text):\n",
53 | | - " response = cohere_client.embed(\n",
54 | | - " texts=[text],\n",
55 | | - " model='embed-english-v3.0',\n",
56 | | - " input_type='search_document',\n",
57 | | - " embedding_types=[\"float\"] # Can also be \"int8\" or \"ubinary\" (int1)\n",
58 | | - " )\n",
59 | | - " embedding = response.embeddings.float[0]\n",
| 50 | + "# Define a function to generate embeddings for all strings in `texts`\n",
| 51 | + "def generate_embeddings(texts, model: str, dtype: str, output_dimension: int):\n",
| 52 | + " embeddings = []\n",
| 53 | + " for text in texts: # Process each string in the data list\n",
| 54 | + " embedding = vo.embed(\n",
| 55 | + " texts=[text], # Pass each string as a list with a single item\n",
| 56 | + " model=model,\n",
| 57 | + " output_dtype=dtype,\n",
| 58 | + " output_dimension=output_dimension,\n",
| 59 | + " ).embeddings[0]\n",
| 60 | + " embeddings.append(embedding) # Collect the embedding for the current text\n",
| 61 | + " return embeddings\n",
60 | 62 | "\n",
61 | | - " # If you specified a different data type, uncomment one of following lines and delete the preceding line\n",
62 | | - " # embedding = response.embeddings.int8[0]\n",
63 | | - " # embedding = response.embeddings.ubinary[0] # refers to int1 data type\n",
64 | | - "\n",
65 | | - " return embedding\n",
66 | | - "\n",
67 | | - "# Define function to convert embeddings to BSON-compatible format\n",
| 63 | + "# Convert embeddings to BSON vectors\n",
68 | 64 | "def generate_bson_vector(vector, vector_dtype):\n",
69 | | - " return Binary.from_vector(vector, vector_dtype)"
| 65 | + " return Binary.from_vector(vector, vector_dtype)"
70 | 66 | ]
71 | 67 | },
72 | 68 | {

75 | 71 | "metadata": {},
76 | 72 | "outputs": [],
77 | 73 | "source": [
| 74 | + "import pymongo \n",
| 75 | + "\n",
78 | 76 | "# Connect to your Atlas cluster\n",
79 | | - "mongo_client = pymongo.MongoClient(\"<ATLAS-CONNECTION-STRING>\")\n",
| 77 | + "mongo_client = pymongo.MongoClient(\"<CONNECTION-STRING>\")\n",
80 | 78 | "db = mongo_client[\"sample_airbnb\"]\n",
81 | 79 | "collection = db[\"listingsAndReviews\"]\n",
82 | 80 | "\n",

96 | 94 | "metadata": {},
97 | 95 | "outputs": [],
98 | 96 | "source": [
99 | | - "for doc in documents:\n",
100 | | - " # Generate embeddings based on the summary\n",
101 | | - " summary = doc[\"summary\"]\n",
102 | | - " embedding = get_embedding(summary) # Get float32 embedding\n",
103 | | - "\n",
104 | | - " # Convert float32 embeddings into BSON format\n",
105 | | - " bson_vector = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n",
106 | | - "\n",
107 | | - " # If you specified a different data type, uncomment one of following lines and delete the preceding line\n",
108 | | - " # bson_vector = generate_bson_vector(embedding, BinaryVectorDtype.INT8)\n",
109 | | - " # bson_vector = generate_bson_vector(embedding, BinaryVectorDtype.PACKED_BIT) # refers to int1 data type\n",
110 | | - "\n",
111 | | - " # Update the document with the BSON embedding\n",
112 | | - " collection.update_one(\n",
113 | | - " {\"_id\": doc[\"_id\"]},\n",
114 | | - " {\"$set\": {\"embedding\": bson_vector}}\n",
115 | | - " )\n",
116 | | - " updated_doc_count += 1\n",
117 | | - "\n",
118 | | - "print(f\"Updated {updated_doc_count} documents with BSON embeddings.\")"
| 97 | + "model_name = \"voyage-3-large\"\n",
| 98 | + "output_dimension = 1024\n",
| 99 | + "float32_field = \"float32-embedding\"\n",
| 100 | + "int8_field = \"int8-embedding\"\n",
| 101 | + "int1_field = \"int1-embedding\"\n",
| 102 | + "\n",
| 103 | + "# Process and update each document\n",
| 104 | + "updated_doc_count = 0 \n",
| 105 | + "for document in documents: \n",
| 106 | + " summary = document.get(\"summary\") \n",
| 107 | + " if not summary: \n",
| 108 | + " continue \n",
| 109 | + " \n",
| 110 | + " # Generate embeddings for the summary field \n",
| 111 | + " float_embeddings = generate_embeddings([summary], model=model_name, dtype=\"float\", output_dimension=output_dimension) \n",
| 112 | + " int8_embeddings = generate_embeddings([summary], model=model_name, dtype=\"int8\", output_dimension=output_dimension) \n",
| 113 | + " ubinary_embeddings = generate_embeddings([summary], model=model_name, dtype=\"ubinary\", output_dimension=output_dimension) \n",
| 114 | + " \n",
| 115 | + " # Convert embeddings to BSON-compatible format \n",
| 116 | + " bson_float = generate_bson_vector(float_embeddings[0], BinaryVectorDtype.FLOAT32) \n",
| 117 | + " bson_int8 = generate_bson_vector(int8_embeddings[0], BinaryVectorDtype.INT8) \n",
| 118 | + " bson_ubinary = generate_bson_vector(ubinary_embeddings[0], BinaryVectorDtype.PACKED_BIT) \n",
| 119 | + " \n",
| 120 | + " # Prepare the updated document \n",
| 121 | + " updated_fields = { \n",
| 122 | + " float32_field: bson_float, \n",
| 123 | + " int8_field: bson_int8, \n",
| 124 | + " int1_field: bson_ubinary,\n",
| 125 | + " } \n",
| 126 | + " \n",
| 127 | + " # Update the document in MongoDB \n",
| 128 | + " result = collection.update_one({\"_id\": document[\"_id\"]}, {\"$set\": updated_fields}) \n",
| 129 | + " if result.modified_count > 0: \n",
| 130 | + " updated_doc_count += 1 \n",
| 131 | + " \n",
| 132 | + "# Print the results \n",
| 133 | + "print(f\"Number of documents updated: {updated_doc_count}\") "
119 | 134 | ]
120 | 135 | },
121 | 136 | {

128 | 143 | "import time\n",
129 | 144 | "\n",
130 | 145 | "# Define and create the vector search index\n",
131 | | - "index_name = \"<INDEX-NAME>\"\n",
| 146 | + "index_name = \"vector_index\"\n",
132 | 147 | "search_index_model = SearchIndexModel(\n",
133 | 148 | " definition={\n",
134 | 149 | " \"fields\": [\n",
135 | 150 | " {\n",
136 | 151 | " \"type\": \"vector\",\n",
137 | | - " \"path\": \"embedding\",\n",
| 152 | + " \"path\": float32_field,\n",
| 153 | + " \"similarity\": \"dotProduct\",\n",
| 154 | + " \"numDimensions\": 1024\n",
| 155 | + " },\n",
| 156 | + " {\n",
| 157 | + " \"type\": \"vector\",\n",
| 158 | + " \"path\": int8_field,\n",
| 159 | + " \"similarity\": \"dotProduct\",\n",
| 160 | + " \"numDimensions\": 1024\n",
| 161 | + " },\n",
| 162 | + " {\n",
| 163 | + " \"type\": \"vector\",\n",
| 164 | + " \"path\": int1_field,\n",
138 | 165 | " \"similarity\": \"euclidean\",\n",
139 | 166 | " \"numDimensions\": 1024\n",
140 | 167 | " }\n",

165 | 192 | "metadata": {},
166 | 193 | "outputs": [],
167 | 194 | "source": [
168 | | - "# Define function to run a vector search query\n",
| 195 | + "import voyageai\n",
| 196 | + "from bson.binary import Binary, BinaryVectorDtype\n",
| 197 | + "\n",
| 198 | + "# Define a function to run a vector search query\n",
169 | 199 | "def run_vector_search(query_text, collection, path):\n",
170 | | - " query_embedding = get_embedding(\"query_text\")\n",
171 | | - " bson_query_vector = generate_bson_vector(query_embedding, BinaryVectorDtype.FLOAT32)\n",
172 | | - "\n",
173 | | - " # If you specified a different data type, uncomment one of following lines and delete the preceding line\n",
174 | | - " # bson_query_vector = generate_bson_vector(query_embedding, BinaryVectorDtype.INT8)\n",
175 | | - " # bson_query_vector = generate_bson_vector(query_embedding, BinaryVectorDtype.PACKED_BIT) # refers to int1 data type\n",
176 | | - "\n",
177 | | - " pipeline = [\n",
178 | | - " {\n",
179 | | - " '$vectorSearch': {\n",
180 | | - " 'index': index_name,\n",
181 | | - " 'path': path,\n",
182 | | - " 'queryVector': bson_query_vector,\n",
183 | | - " 'numCandidates': 20,\n",
184 | | - " 'limit': 5\n",
185 | | - " }\n",
186 | | - " },\n",
187 | | - " {\n",
188 | | - " '$project': {\n",
189 | | - " '_id': 0,\n",
190 | | - " 'name': 1,\n",
191 | | - " 'summary': 1,\n",
192 | | - " 'score': { '$meta': 'vectorSearchScore' }\n",
| 200 | + " # Map path to output dtype and BSON vector type\n",
| 201 | + " path_to_dtype = {\n",
| 202 | + " float32_field: (\"float\", BinaryVectorDtype.FLOAT32),\n",
| 203 | + " int8_field: (\"int8\", BinaryVectorDtype.INT8),\n",
| 204 | + " int1_field: (\"ubinary\", BinaryVectorDtype.PACKED_BIT),\n",
| 205 | + " }\n",
| 206 | + "\n",
| 207 | + " if path not in path_to_dtype:\n",
| 208 | + " raise ValueError(\"Invalid path. Must be one of float32_field, int8_field, int1_field.\")\n",
| 209 | + "\n",
| 210 | + " # Get Voyage AI output dtype and BSON vector type based on the path\n",
| 211 | + " output_dtype, bson_dtype = path_to_dtype[path]\n",
| 212 | + "\n",
| 213 | + " # Generate query embeddings using Voyage AI\n",
| 214 | + " query_vector = vo.embed(\n",
| 215 | + " texts=[query_text],\n",
| 216 | + " model=\"voyage-3-large\",\n",
| 217 | + " input_type=\"query\",\n",
| 218 | + " output_dtype=output_dtype\n",
| 219 | + " ).embeddings[0]\n",
| 220 | + "\n",
| 221 | + " # Convert the query vector to BSON format\n",
| 222 | + " bson_query_vector = Binary.from_vector(query_vector, bson_dtype)\n",
| 223 | + "\n",
| 224 | + " # Define the aggregation pipeline for vector search\n",
| 225 | + " pipeline = [\n",
| 226 | + " {\n",
| 227 | + " \"$vectorSearch\": {\n",
| 228 | + " \"index\": index_name, # Replace with your index name\n",
| 229 | + " \"path\": path, # Path to the embedding field\n",
| 230 | + " \"queryVector\": bson_query_vector, # BSON-encoded query vector\n",
| 231 | + " \"numCandidates\": 20,\n",
| 232 | + " \"limit\": 5\n",
| 233 | + " }\n",
| 234 | + " },\n",
| 235 | + " {\n",
| 236 | + " \"$project\": {\n",
| 237 | + " \"_id\": 0,\n",
| 238 | + " \"summary\": 1,\n",
| 239 | + " \"score\": { \"$meta\": \"vectorSearchScore\" } # Include the similarity score\n",
| 240 | + " }\n",
193 | 241 | " }\n",
194 | | - " }\n",
195 | | - " ]\n",
| 242 | + " ]\n",
196 | 243 | "\n",
197 | | - " return collection.aggregate(pipeline)"
| 244 | + " # Run the aggregation pipeline and return results\n",
| 245 | + " return collection.aggregate(pipeline)"
198 | 246 | ]
199 | 247 | },
200 | 248 | {

205 | 253 | "source": [
206 | 254 | "from pprint import pprint\n",
207 | 255 | "\n",
208 | | - "# Run a vector search query\n",
| 256 | + "# Define a list of embedding fields to query\n",
| 257 | + "embedding_fields = [float32_field, int8_field, int1_field] \n",
| 258 | + "results = {}\n",
| 259 | + "\n",
| 260 | + "# Run vector search queries for each embedding type\n",
209 | 261 | "query_text = \"ocean view\"\n",
210 | | - "query_results = run_vector_search(query_text, collection, \"embedding\")\n",
| 262 | + "for field in embedding_fields:\n",
| 263 | + " results[field] = list(run_vector_search(query_text, collection, field)) \n",
211 | 264 | "\n",
212 | | - "print(\"query results:\")\n",
213 | | - "pprint(list(query_results))"
| 265 | + "# Print the results\n",
| 266 | + "for field, field_results in results.items():\n",
| 267 | + " print(f\"Results from {field}\")\n",
| 268 | + " pprint(field_results)"
214 | 269 | ]
215 | 270 | }
216 | 271 | ],