Skip to content

Commit 480ead9

Browse files
committed
[ENH] Add nomic embedding function
1 parent acf3839 commit 480ead9

File tree

4 files changed

+141
-0
lines changed

4 files changed

+141
-0
lines changed

chromadb/test/ef/test_ef.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def test_get_builtins_holds() -> None:
3737
"InstructorEmbeddingFunction",
3838
"JinaEmbeddingFunction",
3939
"MistralEmbeddingFunction",
40+
"NomicEmbeddingFunction",
4041
"ONNXMiniLM_L6_V2",
4142
"OllamaEmbeddingFunction",
4243
"OpenAIEmbeddingFunction",

chromadb/utils/embedding_functions/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,10 @@
6565
from chromadb.utils.embedding_functions.mistral_embedding_function import (
6666
MistralEmbeddingFunction,
6767
)
68+
from chromadb.utils.embedding_functions.nomic_embedding_function import (
69+
NomicEmbeddingFunction,
70+
NomicQueryConfig,
71+
)
6872

6973
try:
7074
from chromadb.is_thin_client import is_thin_client
@@ -84,7 +88,10 @@
8488
"OllamaEmbeddingFunction",
8589
"InstructorEmbeddingFunction",
8690
"JinaEmbeddingFunction",
91+
"JinaQueryConfig",
8792
"MistralEmbeddingFunction",
93+
"NomicEmbeddingFunction",
94+
"NomicQueryConfig",
8895
"VoyageAIEmbeddingFunction",
8996
"ONNXMiniLM_L6_V2",
9097
"OpenCLIPEmbeddingFunction",
@@ -146,6 +153,7 @@ def validate_config(config: Dict[str, Any]) -> None:
146153
"instructor": InstructorEmbeddingFunction,
147154
"jina": JinaEmbeddingFunction,
148155
"mistral": MistralEmbeddingFunction,
156+
"nomic": NomicEmbeddingFunction,
149157
"voyageai": VoyageAIEmbeddingFunction,
150158
"onnx_mini_lm_l6_v2": ONNXMiniLM_L6_V2,
151159
"open_clip": OpenCLIPEmbeddingFunction,
@@ -235,6 +243,8 @@ def config_to_embedding_function(config: Dict[str, Any]) -> EmbeddingFunction:
235243
"JinaEmbeddingFunction",
236244
"JinaQueryConfig",
237245
"MistralEmbeddingFunction",
246+
"NomicEmbeddingFunction",
247+
"NomicQueryConfig",
238248
"VoyageAIEmbeddingFunction",
239249
"ONNXMiniLM_L6_V2",
240250
"OpenCLIPEmbeddingFunction",
chromadb/utils/embedding_functions/nomic_embedding_function.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
from chromadb.api.types import (
2+
Embeddings,
3+
Documents,
4+
EmbeddingFunction,
5+
Space,
6+
QueryConfig,
7+
)
8+
from chromadb.utils.embedding_functions.schemas import validate_config_schema
9+
from typing import List, Dict, Any
10+
from typing_extensions import override
11+
import os
12+
import numpy as np
13+
14+
15+
class NomicEmbeddingFunction(EmbeddingFunction[Documents]):
    """
    Embedding function that embeds text documents through the Nomic API.

    The ``nomic`` package is imported lazily in ``__init__`` so that it is
    only required when this embedding function is actually constructed.
    """

    def __init__(
        self,
        model: str,
        api_key_env_var: str = "NOMIC_API_KEY",
        task_type: str = "search_document",
    ):
        """
        Initialize the NomicEmbeddingFunction.

        Args:
            model: Model name forwarded as ``model`` to ``nomic.embed.text``.
            api_key_env_var: Name of the environment variable that must hold
                the Nomic API key.
            task_type: Value forwarded as ``task_type`` to
                ``nomic.embed.text``.

        Raises:
            ValueError: If the ``nomic`` package is not installed, or if the
                environment variable named by ``api_key_env_var`` is unset
                or empty.
        """
        try:
            from nomic import embed
        except ImportError as exc:
            raise ValueError(
                "The nomic python package is not installed. Please install it with `pip install nomic`"
            ) from exc

        self.model = model
        self.task_type = task_type
        self.api_key_env_var = api_key_env_var
        # NOTE(review): the key is only checked for presence here; this class
        # never hands it to the nomic client. Presumably the client resolves
        # credentials on its own -- confirm, especially for a non-default
        # ``api_key_env_var``.
        self.api_key = os.getenv(api_key_env_var)
        if not self.api_key:
            raise ValueError(f"The {api_key_env_var} environment variable is not set.")
        self.embed = embed

    def __call__(self, input: Documents) -> Embeddings:
        """
        Embed the given documents.

        Args:
            input: Text documents to embed.

        Returns:
            One float32 numpy array per input document.

        Raises:
            ValueError: If any item of ``input`` is not a string.
        """
        if not all(isinstance(item, str) for item in input):
            raise ValueError("Nomic only supports text documents, not images")
        output = self.embed.text(
            model=self.model,
            texts=input,
            task_type=self.task_type,
        )
        return [
            np.array(vector, dtype=np.float32)
            for vector in self._extract_vectors(output)
        ]

    @staticmethod
    def _extract_vectors(output: Any) -> List[Any]:
        """Pull the raw embedding vectors out of an ``embed.text`` response."""
        # The nomic client returns a mapping with an "embeddings" entry; the
        # attribute-style shape is kept as a fallback for response objects
        # exposing ``.data[i].embedding``.
        if isinstance(output, dict):
            return list(output["embeddings"])
        return [item.embedding for item in output.data]

    @staticmethod
    def name() -> str:
        """Return the registry name of this embedding function."""
        return "nomic"

    def default_space(self) -> Space:
        """Return the distance space used by default for these embeddings."""
        return "cosine"

    def supported_spaces(self) -> List[Space]:
        """Return every distance space this embedding function supports."""
        return ["cosine", "l2", "ip"]

    @staticmethod
    def build_from_config(config: Dict[str, Any]) -> "EmbeddingFunction[Documents]":
        """
        Build a NomicEmbeddingFunction from a configuration dictionary.

        Args:
            config: Dictionary holding ``model``, ``api_key_env_var`` and
                ``task_type``, as produced by ``get_config``.

        Returns:
            A NomicEmbeddingFunction constructed from ``config``.

        Raises:
            ValueError: If any of the three keys is missing or ``None``.
        """
        required_keys = ("model", "api_key_env_var", "task_type")
        missing = [key for key in required_keys if config.get(key) is None]
        if missing:
            # An explicit raise (rather than ``assert``) keeps this check
            # active when Python runs with -O.
            raise ValueError(
                f"Nomic embedding function config is missing: {', '.join(missing)}"
            )
        return NomicEmbeddingFunction(
            model=config["model"],
            api_key_env_var=config["api_key_env_var"],
            task_type=config["task_type"],
        )

    def get_config(self) -> Dict[str, Any]:
        """
        Return the configuration needed to rebuild this embedding function.

        Only the name of the API-key environment variable is included, never
        the key itself.
        """
        return {
            "model": self.model,
            "api_key_env_var": self.api_key_env_var,
            "task_type": self.task_type,
        }

    def validate_config_update(
        self, old_config: Dict[str, Any], new_config: Dict[str, Any]
    ) -> None:
        """
        Reject configuration updates that try to change the model.

        Args:
            old_config: The current configuration (not inspected).
            new_config: The proposed changes.

        Raises:
            ValueError: If ``new_config`` contains a ``model`` key.
        """
        if "model" in new_config:
            raise ValueError(
                "The model cannot be changed after the embedding function has been initialized."
            )

    @staticmethod
    def validate_config(config: Dict[str, Any]) -> None:
        """
        Validate the configuration using the JSON schema.

        Args:
            config: Configuration to validate
        """
        validate_config_schema(config, "nomic")
96+
97+
98+
class NomicQueryConfig(QueryConfig):
    """Query-time configuration for the Nomic embedding function."""

    def __init__(self, task_type: str = "search_query"):
        # Kept as given; reported back unchanged by get_config().
        self.task_type = task_type

    @override
    def name(self) -> str:
        """Return the identifier of this query configuration."""
        identifier = "nomic"
        return identifier

    def get_config(self) -> Dict[str, Any]:
        """Return this configuration as a plain dictionary."""
        settings: Dict[str, Any] = {}
        settings["task_type"] = self.task_type
        return settings
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"$schema": "http://json-schema.org/draft-07/schema#",
3+
"title": "Nomic Embedding Function Schema",
4+
"description": "Schema for the Nomic embedding function configuration",
5+
"version": "1.0.0",
6+
"type": "object",
7+
"properties": {
8+
"model": {
9+
"type": "string",
10+
"description": "Parameter model for the Nomic embedding function"
11+
},
12+
"api_key_env_var": {
13+
"type": "string",
14+
"description": "Parameter api_key_env_var for the Nomic embedding function"
15+
},
16+
"task_type": {
17+
"type": "string",
18+
"description": "Parameter task_type for the Nomic embedding function"
19+
}
20+
}
21+
}

0 commit comments

Comments
 (0)