
Commit 0a61a81

Fede Kamelhar (fede-kamel) authored and committed
feat: Add configurable batch_size and max_workers to embed method
Fixes #534

This PR makes the embed batch size configurable, allowing users to customize the batch size based on their specific use cases and constraints.

Changes:
- Add optional batch_size parameter to Client.embed() and AsyncClient.embed()
- Add optional max_workers parameter to Client.embed() for thread pool control
- Default behavior remains unchanged (batch_size=96 from config)
- Full backward compatibility maintained

The implementation allows users to:
- Use smaller batches to reduce memory usage
- Use larger batches to reduce API calls
- Control thread pool size for rate limiting scenarios
- Optimize for their specific embedding model and text sizes
1 parent af1aee2 commit 0a61a81
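
At a glance, the new parameters slot straight into embed(). A minimal sketch (the model name and CO_API_KEY setup are illustrative, mirroring the demo script below):

# Illustrative quick example; assumes CO_API_KEY is set in the environment.
import cohere

client = cohere.Client()
response = client.embed(
    texts=["hello", "world"],
    model="embed-english-v3.0",
    input_type="search_document",
    batch_size=5,    # texts per API call; defaults to 96 from config
    max_workers=2,   # threads used to send batches concurrently
)
print(len(response.embeddings))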

File tree

3 files changed: +386 -29 lines changed


demo_configurable_batch_size.py

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
#!/usr/bin/env python3
"""
Demo script for the configurable batch size feature in Cohere SDK.

This demonstrates how to use the new batch_size and max_workers parameters
to control embedding batch processing.
"""

import os
import time
import cohere

# Initialize client (requires CO_API_KEY environment variable)
client = cohere.Client()

# Sample texts for embedding
texts = [f"Text document number {i}" for i in range(20)]

print(f"Embedding {len(texts)} texts...")
print()

# Example 1: Default behavior (batch_size=96)
print("1. Default behavior (batch_size=96):")
start = time.time()
response = client.embed(
    texts=texts,
    model="embed-english-v3.0",
    input_type="search_document"
)
print(f" Time: {time.time() - start:.2f}s")
print(f" Number of embeddings: {len(response.embeddings)}")
print()

# Example 2: Custom small batch size
print("2. Custom small batch size (batch_size=5):")
start = time.time()
response = client.embed(
    texts=texts,
    model="embed-english-v3.0",
    input_type="search_document",
    batch_size=5  # Will make 4 API calls for 20 texts
)
print(f" Time: {time.time() - start:.2f}s")
print(f" Number of embeddings: {len(response.embeddings)}")
print()

# Example 3: Custom batch size with fewer workers
print("3. Custom batch size with fewer workers (batch_size=5, max_workers=2):")
start = time.time()
response = client.embed(
    texts=texts,
    model="embed-english-v3.0",
    input_type="search_document",
    batch_size=5,
    max_workers=2  # Limit concurrency to 2 threads
)
print(f" Time: {time.time() - start:.2f}s")
print(f" Number of embeddings: {len(response.embeddings)}")
print()

# Example 4: Large batch size (all in one API call)
print("4. Large batch size (batch_size=100):")
start = time.time()
response = client.embed(
    texts=texts,
    model="embed-english-v3.0",
    input_type="search_document",
    batch_size=100  # All texts in a single API call
)
print(f" Time: {time.time() - start:.2f}s")
print(f" Number of embeddings: {len(response.embeddings)}")
print()

print("Demo completed!")
print()
print("Key benefits of configurable batch size:")
print("- batch_size: Control memory usage and API call granularity")
print("- max_workers: Control concurrency for rate limiting or resource constraints")
print("- Backward compatible: Defaults to existing behavior if not specified")

src/cohere/client.py

Lines changed: 50 additions & 29 deletions
@@ -1,24 +1,23 @@
 import asyncio
+import logging
 import os
 import typing
 from concurrent.futures import ThreadPoolExecutor
-from tokenizers import Tokenizer  # type: ignore
-import logging

 import httpx
-
-from cohere.types.detokenize_response import DetokenizeResponse
-from cohere.types.tokenize_response import TokenizeResponse
-
-from . import EmbedResponse, EmbedInputType, EmbeddingType, EmbedRequestTruncate
-from .base_client import BaseCohere, AsyncBaseCohere, OMIT
+from . import EmbeddingType, EmbedInputType, EmbedRequestTruncate, EmbedResponse
+from .base_client import OMIT, AsyncBaseCohere, BaseCohere
 from .config import embed_batch_size
 from .core import RequestOptions
 from .environment import ClientEnvironment
-from .manually_maintained.cache import CacheMixin
 from .manually_maintained import tokenizers as local_tokenizers
+from .manually_maintained.cache import CacheMixin
 from .overrides import run_overrides
-from .utils import wait, async_wait, merge_embed_responses, SyncSdkUtils, AsyncSdkUtils
+from .utils import AsyncSdkUtils, SyncSdkUtils, async_wait, merge_embed_responses, wait
+from tokenizers import Tokenizer  # type: ignore
+
+from cohere.types.detokenize_response import DetokenizeResponse
+from cohere.types.tokenize_response import TokenizeResponse

 logger = logging.getLogger(__name__)
 run_overrides()
@@ -188,6 +187,8 @@ def embed(
         truncate: typing.Optional[EmbedRequestTruncate] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
         batching: typing.Optional[bool] = True,
+        batch_size: typing.Optional[int] = None,
+        max_workers: typing.Optional[int] = None,
     ) -> EmbedResponse:
         # skip batching for images for now
         if batching is False or images is not OMIT:
@@ -203,23 +204,34 @@
             )

         textsarr: typing.Sequence[str] = texts if texts is not OMIT and texts is not None else []
-        texts_batches = [textsarr[i : i + embed_batch_size] for i in range(0, len(textsarr), embed_batch_size)]
-
-        responses = [
-            response
-            for response in self._executor.map(
-                lambda text_batch: BaseCohere.embed(
-                    self,
-                    texts=text_batch,
-                    model=model,
-                    input_type=input_type,
-                    embedding_types=embedding_types,
-                    truncate=truncate,
-                    request_options=request_options,
-                ),
-                texts_batches,
-            )
-        ]
+        effective_batch_size = batch_size if batch_size is not None else embed_batch_size
+        texts_batches = [textsarr[i : i + effective_batch_size] for i in range(0, len(textsarr), effective_batch_size)]
+
+        # Use custom executor if max_workers is specified
+        executor = self._executor
+        if max_workers is not None:
+            executor = ThreadPoolExecutor(max_workers=max_workers)
+
+        try:
+            responses = [
+                response
+                for response in executor.map(
+                    lambda text_batch: BaseCohere.embed(
+                        self,
+                        texts=text_batch,
+                        model=model,
+                        input_type=input_type,
+                        embedding_types=embedding_types,
+                        truncate=truncate,
+                        request_options=request_options,
+                    ),
+                    texts_batches,
+                )
+            ]
+        finally:
+            # Clean up custom executor if created
+            if max_workers is not None:
+                executor.shutdown(wait=False)

         return merge_embed_responses(responses)

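Stripped of diff noise, the new sync path is: slice the texts into batch_size chunks, map the chunks over a thread pool, then merge the per-batch responses. A self-contained sketch of that pattern, where fake_embed and embed_in_batches are illustrative stand-ins rather than SDK APIs:

# Standalone sketch of the chunk-and-map pattern used above; fake_embed
# stands in for BaseCohere.embed and is purely illustrative.
from concurrent.futures import ThreadPoolExecutor
from typing import List, Sequence

def fake_embed(batch: Sequence[str]) -> List[List[float]]:
    # Placeholder for one embed API call over a single batch.
    return [[0.0] * 4 for _ in batch]

def embed_in_batches(texts: Sequence[str], batch_size: int, max_workers: int) -> List[List[float]]:
    batches = [texts[i : i + batch_size] for i in range(0, len(texts), batch_size)]
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(fake_embed, batches))  # map preserves batch order
    merged: List[List[float]] = []
    for result in results:
        merged.extend(result)  # analogous to merge_embed_responses
    return merged

print(len(embed_in_batches([f"doc {i}" for i in range(20)], batch_size=5, max_workers=2)))  # 20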
@@ -380,6 +392,8 @@ async def embed(
         truncate: typing.Optional[EmbedRequestTruncate] = OMIT,
         request_options: typing.Optional[RequestOptions] = None,
         batching: typing.Optional[bool] = True,
+        batch_size: typing.Optional[int] = None,
+        max_workers: typing.Optional[int] = None,
     ) -> EmbedResponse:
         # skip batching for images for now
         if batching is False or images is not OMIT:
@@ -395,8 +409,15 @@
             )

         textsarr: typing.Sequence[str] = texts if texts is not OMIT and texts is not None else []
-        texts_batches = [textsarr[i : i + embed_batch_size] for i in range(0, len(textsarr), embed_batch_size)]
-
+        effective_batch_size = batch_size if batch_size is not None else embed_batch_size
+        texts_batches = [textsarr[i : i + effective_batch_size] for i in range(0, len(textsarr), effective_batch_size)]
+
+        # Note: max_workers parameter is not used in async version since asyncio.gather
+        # handles concurrency differently than ThreadPoolExecutor
+        if max_workers is not None:
+            # Log a warning or silently ignore - asyncio manages its own concurrency
+            pass
+
         responses = typing.cast(
             typing.List[EmbedResponse],
             await asyncio.gather(
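
As the comment in this hunk notes, the async path accepts max_workers but ignores it, because asyncio.gather already runs the batch requests concurrently. For readers wondering what bounding concurrency would look like in asyncio, the usual pattern is a semaphore; the sketch below is illustrative only and not part of this commit:

# Not part of this commit: if bounded concurrency were ever wanted on the
# async path, an asyncio.Semaphore is the usual pattern. All names here
# (fake_embed_async, bounded_gather) are illustrative.
import asyncio
from typing import List, Sequence

async def fake_embed_async(batch: Sequence[str]) -> List[List[float]]:
    await asyncio.sleep(0.01)  # stand-in for one embed API call
    return [[0.0] * 4 for _ in batch]

async def bounded_gather(batches: Sequence[Sequence[str]], max_concurrency: int) -> List[List[List[float]]]:
    semaphore = asyncio.Semaphore(max_concurrency)

    async def run(batch: Sequence[str]) -> List[List[float]]:
        async with semaphore:  # at most max_concurrency batches in flight
            return await fake_embed_async(batch)

    return await asyncio.gather(*(run(b) for b in batches))

texts = [f"doc {i}" for i in range(20)]
batches = [texts[i : i + 5] for i in range(0, len(texts), 5)]
print(len(asyncio.run(bounded_gather(batches, max_concurrency=2))))  # 4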
