Skip to content

Commit 7ee4fe4

Browse files
committed
use voyage embeddings
1 parent ae70be1 commit 7ee4fe4

File tree

4 files changed

+250
-169
lines changed

4 files changed

+250
-169
lines changed

ai-integrations/langchain-parent-document-retrieval.ipynb

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
},
2929
"outputs": [],
3030
"source": [
31-
"pip install --quiet --upgrade langchain langchain-community langchain-core langchain-mongodb langchain-openai pymongo pypdf"
31+
"pip install --quiet --upgrade langchain langchain-community langchain-core langchain-mongodb langchain-voyageai langchain-openai pymongo pypdf"
3232
]
3333
},
3434
{
@@ -39,7 +39,8 @@
3939
"source": [
4040
"import os\n",
4141
"\n",
42-
"os.environ[\"OPENAI_API_KEY\"] = \"<api-key>\"\n",
42+
"os.environ[\"VOYAGE_API_KEY\"] = \"<voyage-api-key>\"\n",
43+
"os.environ[\"OPENAI_API_KEY\"] = \"<openai-api-key>\"\n",
4344
"ATLAS_CONNECTION_STRING = \"<connection-string>\""
4445
]
4546
},
@@ -71,15 +72,15 @@
7172
"outputs": [],
7273
"source": [
7374
"from langchain_mongodb.retrievers import MongoDBAtlasParentDocumentRetriever\n",
74-
"from langchain_openai import OpenAIEmbeddings\n",
75+
"from langchain_voyageai import VoyageAIEmbeddings\n",
7576
"\n",
7677
"# Define the embedding model to use\n",
77-
"embedding_model = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n",
78+
"embedding_model = VoyageAIEmbeddings(model=\"voyage-3-large\")\n",
7879
"\n",
7980
"# Define the chunking method for the child documents\n",
8081
"child_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)\n",
8182
"\n",
82-
"# Database and collection name\n",
83+
"# Specify the database and collection name\n",
8384
"database_name = \"langchain_db\"\n",
8485
"collection_name = \"parent_document\"\n",
8586
"\n",
@@ -117,7 +118,7 @@
117118
"\n",
118119
"# Use helper method to create the vector search index\n",
119120
"vector_store.create_vector_search_index(\n",
120-
" dimensions = 1536, # The dimensions of the vector embeddings to be indexed\n",
121+
" dimensions = 1024, # The dimensions of the vector embeddings to be indexed\n",
121122
" wait_until_complete = 60 # Number of seconds to wait for the index to build (can take around a minute)\n",
122123
")\n"
123124
]

ai-integrations/langchain.ipynb

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
},
2929
"outputs": [],
3030
"source": [
31-
"pip install --quiet --upgrade langchain langchain-community langchain-core langchain-mongodb langchain-openai pymongo pypdf"
31+
"pip install --quiet --upgrade langchain langchain-community langchain-core langchain-mongodb langchain-voyageai langchain-openai pymongo pypdf"
3232
]
3333
},
3434
{
@@ -39,7 +39,8 @@
3939
"source": [
4040
"import os\n",
4141
"\n",
42-
"os.environ[\"OPENAI_API_KEY\"] = \"<api-key>\"\n",
42+
"os.environ[\"VOYAGE_API_KEY\"] = \"<voyage-api-key>\"\n",
43+
"os.environ[\"OPENAI_API_KEY\"] = \"<openai-api-key>\"\n",
4344
"ATLAS_CONNECTION_STRING = \"<connection-string>\""
4445
]
4546
},
@@ -77,7 +78,7 @@
7778
"vector_store = MongoDBAtlasVectorSearch.from_connection_string(\n",
7879
" connection_string = ATLAS_CONNECTION_STRING,\n",
7980
" namespace = \"langchain_db.test\",\n",
80-
" embedding = OpenAIEmbeddings(model=\"text-embedding-3-large\"),\n",
81+
" embedding = VoyageAIEmbeddings(model=\"voyage-3-large\"),\n",
8182
" index_name = \"vector_index\"\n",
8283
")\n",
8384
"\n",
@@ -93,7 +94,7 @@
9394
"source": [
9495
"# Use helper method to create the vector search index\n",
9596
"vector_store.create_vector_search_index(\n",
96-
" dimensions = 3072, # The dimensions of the vector embeddings to be indexed\n",
97+
" dimensions = 1024, # The dimensions of the vector embeddings to be indexed\n",
9798
" filters = [ \"page_label\" ],\n",
9899
" wait_until_complete = 60 # Number of seconds to wait for the index to build (can take around a minute)\n",
99100
")"

quantization/existing-data.ipynb

Lines changed: 132 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
},
3131
"outputs": [],
3232
"source": [
33-
"pip install --quiet --upgrade pymongo cohere"
33+
"pip install --quiet --upgrade voyageai pymongo"
3434
]
3535
},
3636
{
@@ -40,33 +40,29 @@
4040
"outputs": [],
4141
"source": [
4242
"import os\n",
43-
"import pymongo\n",
44-
"import cohere\n",
43+
"import voyageai\n",
4544
"from bson.binary import Binary, BinaryVectorDtype\n",
4645
"\n",
47-
"# Specify your Cohere API key\n",
48-
"os.environ[\"COHERE_API_KEY\"] = \"<COHERE-API-KEY>\"\n",
49-
"cohere_client = cohere.Client(os.environ[\"COHERE_API_KEY\"])\n",
46+
"# Initialize the VoyageAI Client\n",
47+
"os.environ[\"VOYAGE_API_KEY\"] = \"<VOYAGEAI-API-KEY>\"\n",
48+
"vo = voyageai.Client()\n",
5049
"\n",
51-
"# Define function to generate embeddings using the embed-english-v3.0 model\n",
52-
"def get_embedding(text):\n",
53-
" response = cohere_client.embed(\n",
54-
" texts=[text],\n",
55-
" model='embed-english-v3.0',\n",
56-
" input_type='search_document',\n",
57-
" embedding_types=[\"float\"] # Can also be \"int8\" or \"ubinary\" (int1)\n",
58-
" )\n",
59-
" embedding = response.embeddings.float[0]\n",
50+
"# Define a function to generate embeddings for all strings in `texts`\n",
51+
"def generate_embeddings(texts, model: str, dtype: str, output_dimension: int):\n",
52+
" embeddings = []\n",
53+
" for text in texts: # Process each string in the texts list\n",
54+
" embedding = vo.embed(\n",
55+
" texts=[text], # Pass each string as a list with a single item\n",
56+
" model=model,\n",
57+
" output_dtype=dtype,\n",
58+
" output_dimension=output_dimension,\n",
59+
" ).embeddings[0]\n",
60+
" embeddings.append(embedding) # Collect the embedding for the current text\n",
61+
" return embeddings\n",
6062
"\n",
61-
" # If you specified a different data type, uncomment one of following lines and delete the preceding line\n",
62-
" # embedding = response.embeddings.int8[0]\n",
63-
" # embedding = response.embeddings.ubinary[0] # refers to int1 data type\n",
64-
"\n",
65-
" return embedding\n",
66-
"\n",
67-
"# Define function to convert embeddings to BSON-compatible format\n",
63+
"# Convert embeddings to BSON vectors\n",
6864
"def generate_bson_vector(vector, vector_dtype):\n",
69-
" return Binary.from_vector(vector, vector_dtype)"
65+
" return Binary.from_vector(vector, vector_dtype)"
7066
]
7167
},
7268
{
@@ -75,8 +71,10 @@
7571
"metadata": {},
7672
"outputs": [],
7773
"source": [
74+
"import pymongo \n",
75+
"\n",
7876
"# Connect to your Atlas cluster\n",
79-
"mongo_client = pymongo.MongoClient(\"<ATLAS-CONNECTION-STRING>\")\n",
77+
"mongo_client = pymongo.MongoClient(\"<CONNECTION-STRING>\")\n",
8078
"db = mongo_client[\"sample_airbnb\"]\n",
8179
"collection = db[\"listingsAndReviews\"]\n",
8280
"\n",
@@ -96,26 +94,43 @@
9694
"metadata": {},
9795
"outputs": [],
9896
"source": [
99-
"for doc in documents:\n",
100-
" # Generate embeddings based on the summary\n",
101-
" summary = doc[\"summary\"]\n",
102-
" embedding = get_embedding(summary) # Get float32 embedding\n",
103-
"\n",
104-
" # Convert float32 embeddings into BSON format\n",
105-
" bson_vector = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n",
106-
"\n",
107-
" # If you specified a different data type, uncomment one of following lines and delete the preceding line\n",
108-
" # bson_vector = generate_bson_vector(embedding, BinaryVectorDtype.INT8)\n",
109-
" # bson_vector = generate_bson_vector(embedding, BinaryVectorDtype.PACKED_BIT) # refers to int1 data type\n",
110-
"\n",
111-
" # Update the document with the BSON embedding\n",
112-
" collection.update_one(\n",
113-
" {\"_id\": doc[\"_id\"]},\n",
114-
" {\"$set\": {\"embedding\": bson_vector}}\n",
115-
" )\n",
116-
" updated_doc_count += 1\n",
117-
"\n",
118-
"print(f\"Updated {updated_doc_count} documents with BSON embeddings.\")"
97+
"model_name = \"voyage-3-large\"\n",
98+
"output_dimension = 1024\n",
99+
"float32_field = \"float32-embedding\"\n",
100+
"int8_field = \"int8-embedding\"\n",
101+
"int1_field = \"int1-embedding\"\n",
102+
"\n",
103+
"# Process and update each document\n",
104+
"updated_doc_count = 0 \n",
105+
"for document in documents: \n",
106+
" summary = document.get(\"summary\") \n",
107+
" if not summary: \n",
108+
" continue \n",
109+
" \n",
110+
" # Generate embeddings for the summary field \n",
111+
" float_embeddings = generate_embeddings([summary], model=model_name, dtype=\"float\", output_dimension=output_dimension) \n",
112+
" int8_embeddings = generate_embeddings([summary], model=model_name, dtype=\"int8\", output_dimension=output_dimension) \n",
113+
" ubinary_embeddings = generate_embeddings([summary], model=model_name, dtype=\"ubinary\", output_dimension=output_dimension) \n",
114+
" \n",
115+
" # Convert embeddings to BSON-compatible format \n",
116+
" bson_float = generate_bson_vector(float_embeddings[0], BinaryVectorDtype.FLOAT32) \n",
117+
" bson_int8 = generate_bson_vector(int8_embeddings[0], BinaryVectorDtype.INT8) \n",
118+
" bson_ubinary = generate_bson_vector(ubinary_embeddings[0], BinaryVectorDtype.PACKED_BIT) \n",
119+
" \n",
120+
" # Prepare the updated document \n",
121+
" updated_fields = { \n",
122+
" float32_field: bson_float, \n",
123+
" int8_field: bson_int8, \n",
124+
" int1_field: bson_ubinary,\n",
125+
" } \n",
126+
" \n",
127+
" # Update the document in MongoDB \n",
128+
" result = collection.update_one({\"_id\": document[\"_id\"]}, {\"$set\": updated_fields}) \n",
129+
" if result.modified_count > 0: \n",
130+
" updated_doc_count += 1 \n",
131+
" \n",
132+
"# Print the results \n",
133+
"print(f\"Number of documents updated: {updated_doc_count}\") "
119134
]
120135
},
121136
{
@@ -128,13 +143,25 @@
128143
"import time\n",
129144
"\n",
130145
"# Define and create the vector search index\n",
131-
"index_name = \"<INDEX-NAME>\"\n",
146+
"index_name = \"vector_index\"\n",
132147
"search_index_model = SearchIndexModel(\n",
133148
" definition={\n",
134149
" \"fields\": [\n",
135150
" {\n",
136151
" \"type\": \"vector\",\n",
137-
" \"path\": \"embedding\",\n",
152+
" \"path\": float32_field,\n",
153+
" \"similarity\": \"dotProduct\",\n",
154+
" \"numDimensions\": 1024\n",
155+
" },\n",
156+
" {\n",
157+
" \"type\": \"vector\",\n",
158+
" \"path\": int8_field,\n",
159+
" \"similarity\": \"dotProduct\",\n",
160+
" \"numDimensions\": 1024\n",
161+
" },\n",
162+
" {\n",
163+
" \"type\": \"vector\",\n",
164+
" \"path\": int1_field,\n",
138165
" \"similarity\": \"euclidean\",\n",
139166
" \"numDimensions\": 1024\n",
140167
" }\n",
@@ -165,36 +192,57 @@
165192
"metadata": {},
166193
"outputs": [],
167194
"source": [
168-
"# Define function to run a vector search query\n",
195+
"import voyageai\n",
196+
"from bson.binary import Binary, BinaryVectorDtype\n",
197+
"\n",
198+
"# Define a function to run a vector search query\n",
169199
"def run_vector_search(query_text, collection, path):\n",
170-
" query_embedding = get_embedding(\"query_text\")\n",
171-
" bson_query_vector = generate_bson_vector(query_embedding, BinaryVectorDtype.FLOAT32)\n",
172-
"\n",
173-
" # If you specified a different data type, uncomment one of following lines and delete the preceding line\n",
174-
" # bson_query_vector = generate_bson_vector(query_embedding, BinaryVectorDtype.INT8)\n",
175-
" # bson_query_vector = generate_bson_vector(query_embedding, BinaryVectorDtype.PACKED_BIT) # refers to int1 data type\n",
176-
"\n",
177-
" pipeline = [\n",
178-
" {\n",
179-
" '$vectorSearch': {\n",
180-
" 'index': index_name,\n",
181-
" 'path': path,\n",
182-
" 'queryVector': bson_query_vector,\n",
183-
" 'numCandidates': 20,\n",
184-
" 'limit': 5\n",
185-
" }\n",
186-
" },\n",
187-
" {\n",
188-
" '$project': {\n",
189-
" '_id': 0,\n",
190-
" 'name': 1,\n",
191-
" 'summary': 1,\n",
192-
" 'score': { '$meta': 'vectorSearchScore' }\n",
200+
" # Map path to output dtype and BSON vector type\n",
201+
" path_to_dtype = {\n",
202+
" float32_field: (\"float\", BinaryVectorDtype.FLOAT32),\n",
203+
" int8_field: (\"int8\", BinaryVectorDtype.INT8),\n",
204+
" int1_field: (\"ubinary\", BinaryVectorDtype.PACKED_BIT),\n",
205+
" }\n",
206+
"\n",
207+
" if path not in path_to_dtype:\n",
208+
" raise ValueError(\"Invalid path. Must be one of float32_field, int8_field, int1_field.\")\n",
209+
"\n",
210+
" # Get Voyage AI output dtype and BSON vector type based on the path\n",
211+
" output_dtype, bson_dtype = path_to_dtype[path]\n",
212+
"\n",
213+
" # Generate query embeddings using Voyage AI\n",
214+
" query_vector = vo.embed(\n",
215+
" texts=[query_text],\n",
216+
" model=\"voyage-3-large\",\n",
217+
" input_type=\"query\",\n",
218+
" output_dtype=output_dtype\n",
219+
" ).embeddings[0]\n",
220+
"\n",
221+
" # Convert the query vector to BSON format\n",
222+
" bson_query_vector = Binary.from_vector(query_vector, bson_dtype)\n",
223+
"\n",
224+
" # Define the aggregation pipeline for vector search\n",
225+
" pipeline = [\n",
226+
" {\n",
227+
" \"$vectorSearch\": {\n",
228+
" \"index\": index_name, # Replace with your index name\n",
229+
" \"path\": path, # Path to the embedding field\n",
230+
" \"queryVector\": bson_query_vector, # BSON-encoded query vector\n",
231+
" \"numCandidates\": 20,\n",
232+
" \"limit\": 5\n",
233+
" }\n",
234+
" },\n",
235+
" {\n",
236+
" \"$project\": {\n",
237+
" \"_id\": 0,\n",
238+
" \"summary\": 1,\n",
239+
" \"score\": { \"$meta\": \"vectorSearchScore\" } # Include the similarity score\n",
240+
" }\n",
193241
" }\n",
194-
" }\n",
195-
" ]\n",
242+
" ]\n",
196243
"\n",
197-
" return collection.aggregate(pipeline)"
244+
" # Run the aggregation pipeline and return results\n",
245+
" return collection.aggregate(pipeline)"
198246
]
199247
},
200248
{
@@ -205,12 +253,19 @@
205253
"source": [
206254
"from pprint import pprint\n",
207255
"\n",
208-
"# Run a vector search query\n",
256+
"# Define a list of embedding fields to query\n",
257+
"embedding_fields = [float32_field, int8_field, int1_field] \n",
258+
"results = {}\n",
259+
"\n",
260+
"# Run vector search queries for each embedding type\n",
209261
"query_text = \"ocean view\"\n",
210-
"query_results = run_vector_search(query_text, collection, \"embedding\")\n",
262+
"for field in embedding_fields:\n",
263+
" results[field] = list(run_vector_search(query_text, collection, field)) \n",
211264
"\n",
212-
"print(\"query results:\")\n",
213-
"pprint(list(query_results))"
265+
"# Print the results\n",
266+
"for field, field_results in results.items():\n",
267+
" print(f\"Results from {field}\")\n",
268+
" pprint(field_results)"
214269
]
215270
}
216271
],

0 commit comments

Comments
 (0)