Skip to content

Commit 3d9843b

Browse files
authored
[CLN] schema: build default with config ef & default_knn_index, remove #document population in defaults (#5775)
## Description of changes _Summarize the changes made by this PR._ - Improvements & Bug fixes - This PR is to clean up how collections are created: 1. when collection configuration was hnsw default and schema was any default, it blindly converted config -> schema. instead, this should use the knn index to build the correct default schema, and only take the embedding function from config 2. when converting config -> schema, it writes #document as the source key for both defaults and #embedding vector indexes. Instead, it should only write #document as the source key for #embedding 3. On the distributed modify path, the json mapping for the boolean type in go does not match the rust type - New functionality - ... ## Test plan _How are these changes tested?_ added unit tests for all 8 default cases (config hnsw or spann default, schema hnsw or spann default, default_knn_index), test thta #document does not populate for defaults, and embedding functions do. - [ x] Tests pass locally with `pytest` for python, `yarn test` for js, `cargo test` for rust ## Migration plan _Are there any migrations, or any forwards/backwards compatibility changes needed in order to make sure this change deploys reliably?_ ## Observability plan _What is the plan to instrument and monitor this change?_ ## Documentation Changes _Are all docstrings for user-facing APIs updated if required? Do we need to make documentation changes in the [docs section](https://github.com/chroma-core/chroma/tree/main/docs/docs.trychroma.com)?_
1 parent 5a60dee commit 3d9843b

File tree

12 files changed

+913
-280
lines changed

12 files changed

+913
-280
lines changed

go/pkg/sysdb/coordinator/model/collection_configuration.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ type ValueTypes struct {
249249
SparseVector *SparseVectorValueType `json:"sparse_vector,omitempty"`
250250
Int *IntValueType `json:"int,omitempty"`
251251
Float *FloatValueType `json:"float,omitempty"`
252-
Boolean *BoolValueType `json:"boolean,omitempty"`
252+
Boolean *BoolValueType `json:"bool,omitempty"`
253253
}
254254

255255
type Schema struct {

go/pkg/sysdb/coordinator/model/collection_configuration_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ func TestUpdateSchemaFromConfig_HnswSuccess(t *testing.T) {
147147
"config": {}
148148
}
149149
},
150-
"boolean": {
150+
"bool": {
151151
"bool_inverted_index": {
152152
"enabled": true,
153153
"config": {}
@@ -312,7 +312,7 @@ func TestUpdateSchemaFromConfig_SpannSuccess(t *testing.T) {
312312
"config": {}
313313
}
314314
},
315-
"boolean": {
315+
"bool": {
316316
"bool_inverted_index": {
317317
"enabled": true,
318318
"config": {}
@@ -481,7 +481,7 @@ func TestUpdateSchemaFromConfig_EmbeddingFunction(t *testing.T) {
481481
"config": {}
482482
}
483483
},
484-
"boolean": {
484+
"bool": {
485485
"bool_inverted_index": {
486486
"enabled": true,
487487
"config": {}

go/pkg/sysdb/coordinator/table_catalog_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1555,7 +1555,7 @@ func TestUpdateCollection_WithSchema(t *testing.T) {
15551555
"config": {}
15561556
}
15571557
},
1558-
"boolean": {
1558+
"bool": {
15591559
"bool_inverted_index": {
15601560
"enabled": true,
15611561
"config": {}

rust/cli/src/commands/vacuum.rs

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ use chroma_segment::local_segment_manager::LocalSegmentManager;
1111
use chroma_sqlite::db::SqliteDb;
1212
use chroma_sysdb::SysDb;
1313
use chroma_system::System;
14-
use chroma_types::{CollectionUuid, KnnIndex, ListCollectionsRequest};
14+
use chroma_types::{CollectionUuid, ListCollectionsRequest, Schema};
1515
use clap::Parser;
1616
use colored::Colorize;
1717
use dialoguer::Confirm;
@@ -101,17 +101,18 @@ async fn trigger_vector_segments_max_seq_id_migration(
101101
sqlite: &SqliteDb,
102102
sysdb: &mut SysDb,
103103
segment_manager: &LocalSegmentManager,
104-
default_knn_index: KnnIndex,
105104
) -> Result<(), Box<dyn Error>> {
106105
let collection_ids = get_collection_ids_to_migrate(sqlite).await?;
107106

108107
for collection_id in collection_ids {
109108
let mut collection = sysdb.get_collection_with_segments(collection_id).await?;
110109

111-
collection
112-
.collection
113-
.reconcile_schema_with_config(default_knn_index)
114-
.map_err(|e| Box::new(e) as Box<dyn Error>)?;
110+
if collection.collection.schema.is_none() {
111+
collection.collection.schema = Some(
112+
Schema::try_from(&collection.collection.config)
113+
.map_err(|e| Box::new(e) as Box<dyn Error>)?,
114+
);
115+
}
115116

116117
// If collection is uninitialized, that means nothing has been written yet.
117118
let dim = match collection.collection.dimension {
@@ -153,13 +154,7 @@ pub async fn vacuum_chroma(config: FrontendConfig) -> Result<(), Box<dyn Error>>
153154

154155
println!("Purging the log...\n");
155156

156-
trigger_vector_segments_max_seq_id_migration(
157-
&sqlite,
158-
&mut sysdb,
159-
&segment_manager,
160-
config.default_knn_index,
161-
)
162-
.await?;
157+
trigger_vector_segments_max_seq_id_migration(&sqlite, &mut sysdb, &segment_manager).await?;
163158

164159
let tenant = String::from("default_tenant");
165160
let database = String::from("default_database");

rust/frontend/src/get_collection_with_segments_provider.rs

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@ use chroma_config::Configurable;
44
use chroma_error::{ChromaError, ErrorCodes};
55
use chroma_sysdb::SysDb;
66
use chroma_types::{
7-
CollectionAndSegments, CollectionUuid, GetCollectionWithSegmentsError, KnnIndex, Schema,
8-
SchemaError,
7+
CollectionAndSegments, CollectionUuid, GetCollectionWithSegmentsError, Schema, SchemaError,
98
};
109
use serde::{Deserialize, Serialize};
1110
use std::{
@@ -143,7 +142,6 @@ impl CollectionsWithSegmentsProvider {
143142
pub(crate) async fn get_collection_with_segments(
144143
&mut self,
145144
collection_id: CollectionUuid,
146-
knn_index: KnnIndex,
147145
) -> Result<CollectionAndSegments, CollectionsWithSegmentsProviderError> {
148146
if let Some(collection_and_segments_with_ttl) = self
149147
.collections_with_segments_cache
@@ -185,14 +183,13 @@ impl CollectionsWithSegmentsProvider {
185183
.await?
186184
};
187185

188-
// reconcile schema and config
189-
let reconciled_schema = Schema::reconcile_schema_and_config(
190-
collection_and_segments_sysdb.collection.schema.as_ref(),
191-
Some(&collection_and_segments_sysdb.collection.config),
192-
knn_index,
193-
)
194-
.map_err(CollectionsWithSegmentsProviderError::InvalidSchema)?;
195-
collection_and_segments_sysdb.collection.schema = Some(reconciled_schema);
186+
if collection_and_segments_sysdb.collection.schema.is_none() {
187+
collection_and_segments_sysdb.collection.schema = Some(
188+
Schema::try_from(&collection_and_segments_sysdb.collection.config)
189+
.map_err(CollectionsWithSegmentsProviderError::InvalidSchema)?,
190+
);
191+
}
192+
196193
self.set_collection_with_segments(collection_and_segments_sysdb.clone())
197194
.await;
198195
Ok(collection_and_segments_sysdb)

rust/frontend/src/impls/in_memory_frontend.rs

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ use chroma_types::operator::{Filter, KnnBatch, KnnProjection, Limit, Projection,
66
use chroma_types::plan::{Count, Get, Knn};
77
use chroma_types::{
88
test_segment, Collection, CollectionAndSegments, CreateCollectionError, Database, Include,
9-
IncludeList, InternalCollectionConfiguration, KnnIndex, Segment, VectorIndexConfiguration,
9+
IncludeList, InternalCollectionConfiguration, KnnIndex, Schema, SchemaError, Segment,
10+
VectorIndexConfiguration,
1011
};
1112
use std::collections::HashSet;
1213

@@ -221,22 +222,27 @@ impl InMemoryFrontend {
221222
));
222223
}
223224

224-
let mut collection = Collection {
225+
let schema = Schema::reconcile_schema_and_config(
226+
request.schema.as_ref(),
227+
request.configuration.as_ref(),
228+
KnnIndex::Hnsw,
229+
)
230+
.map_err(CreateCollectionError::InvalidSchema)?;
231+
232+
let config = InternalCollectionConfiguration::try_from(&schema).map_err(|e| {
233+
CreateCollectionError::InvalidSchema(SchemaError::InvalidSchema { reason: e })
234+
})?;
235+
236+
let collection = Collection {
225237
name: request.name.clone(),
226238
tenant: request.tenant_id.clone(),
227239
database: request.database_name.clone(),
228240
metadata: request.metadata,
229-
config: request
230-
.configuration
231-
.unwrap_or(InternalCollectionConfiguration::default_hnsw()),
232-
schema: request.schema,
241+
config,
242+
schema: Some(schema),
233243
..Default::default()
234244
};
235245

236-
collection
237-
.reconcile_schema_with_config(KnnIndex::Hnsw)
238-
.map_err(CreateCollectionError::InvalidSchema)?;
239-
240246
// Prevent SPANN usage in InMemoryFrontend
241247
if matches!(
242248
collection.config.vector_index,

rust/frontend/src/impls/service_based_frontend.rs

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ impl ServiceBasedFrontend {
176176
) -> Result<Collection, GetCollectionError> {
177177
Ok(self
178178
.collections_with_segments_provider
179-
.get_collection_with_segments(collection_id, self.default_knn_index)
179+
.get_collection_with_segments(collection_id)
180180
.await
181181
.map_err(|err| Box::new(err) as Box<dyn ChromaError>)?
182182
.collection)
@@ -188,7 +188,7 @@ impl ServiceBasedFrontend {
188188
) -> Result<Option<u32>, GetCollectionError> {
189189
Ok(self
190190
.collections_with_segments_provider
191-
.get_collection_with_segments(collection_id, self.default_knn_index)
191+
.get_collection_with_segments(collection_id)
192192
.await
193193
.map_err(|err| Box::new(err) as Box<dyn ChromaError>)?
194194
.collection
@@ -381,7 +381,7 @@ impl ServiceBasedFrontend {
381381
if self.enable_schema {
382382
for collection in collections.iter_mut() {
383383
collection
384-
.reconcile_schema_with_config(self.default_knn_index)
384+
.reconcile_schema_for_read()
385385
.map_err(GetCollectionsError::InvalidSchema)?;
386386
}
387387
}
@@ -425,7 +425,7 @@ impl ServiceBasedFrontend {
425425
if self.enable_schema {
426426
for collection in &mut collections {
427427
collection
428-
.reconcile_schema_with_config(self.default_knn_index)
428+
.reconcile_schema_for_read()
429429
.map_err(GetCollectionError::InvalidSchema)?;
430430
}
431431
}
@@ -450,7 +450,7 @@ impl ServiceBasedFrontend {
450450

451451
if self.enable_schema {
452452
collection
453-
.reconcile_schema_with_config(self.default_knn_index)
453+
.reconcile_schema_for_read()
454454
.map_err(GetCollectionByCrnError::InvalidSchema)?;
455455
}
456456
Ok(collection)
@@ -630,9 +630,10 @@ impl ServiceBasedFrontend {
630630
// that was retrieved from sysdb, rather than the one that was passed in
631631
if self.enable_schema {
632632
collection
633-
.reconcile_schema_with_config(self.default_knn_index)
633+
.reconcile_schema_for_read()
634634
.map_err(CreateCollectionError::InvalidSchema)?;
635635
}
636+
636637
Ok(collection)
637638
}
638639

@@ -735,7 +736,7 @@ impl ServiceBasedFrontend {
735736
.await?;
736737
collection_and_segments
737738
.collection
738-
.reconcile_schema_with_config(self.default_knn_index)
739+
.reconcile_schema_for_read()
739740
.map_err(ForkCollectionError::InvalidSchema)?;
740741
let collection = collection_and_segments.collection.clone();
741742
let latest_collection_logical_size_bytes = collection_and_segments
@@ -1099,7 +1100,7 @@ impl ServiceBasedFrontend {
10991100
let read_event = if let Some(where_clause) = r#where {
11001101
let collection_and_segments = self
11011102
.collections_with_segments_provider
1102-
.get_collection_with_segments(collection_id, self.default_knn_index)
1103+
.get_collection_with_segments(collection_id)
11031104
.await
11041105
.map_err(|err| Box::new(err) as Box<dyn ChromaError>)?;
11051106
if self.enable_schema {
@@ -1309,7 +1310,7 @@ impl ServiceBasedFrontend {
13091310
) -> Result<CountResponse, QueryError> {
13101311
let collection_and_segments = self
13111312
.collections_with_segments_provider
1312-
.get_collection_with_segments(collection_id, self.default_knn_index)
1313+
.get_collection_with_segments(collection_id)
13131314
.await
13141315
.map_err(|err| Box::new(err) as Box<dyn ChromaError>)?;
13151316
let latest_collection_logical_size_bytes = collection_and_segments
@@ -1424,7 +1425,7 @@ impl ServiceBasedFrontend {
14241425
) -> Result<GetResponse, QueryError> {
14251426
let collection_and_segments = self
14261427
.collections_with_segments_provider
1427-
.get_collection_with_segments(collection_id, self.default_knn_index)
1428+
.get_collection_with_segments(collection_id)
14281429
.await
14291430
.map_err(|err| Box::new(err) as Box<dyn ChromaError>)?;
14301431
if self.enable_schema {
@@ -1569,7 +1570,7 @@ impl ServiceBasedFrontend {
15691570
) -> Result<QueryResponse, QueryError> {
15701571
let collection_and_segments = self
15711572
.collections_with_segments_provider
1572-
.get_collection_with_segments(collection_id, self.default_knn_index)
1573+
.get_collection_with_segments(collection_id)
15731574
.await
15741575
.map_err(|err| Box::new(err) as Box<dyn ChromaError>)?;
15751576
if self.enable_schema {
@@ -1726,7 +1727,7 @@ impl ServiceBasedFrontend {
17261727
// Get collection and segments once for all queries
17271728
let collection_and_segments = self
17281729
.collections_with_segments_provider
1729-
.get_collection_with_segments(request.collection_id, self.default_knn_index)
1730+
.get_collection_with_segments(request.collection_id)
17301731
.await
17311732
.map_err(|err| QueryError::Other(Box::new(err) as Box<dyn ChromaError>))?;
17321733
if self.enable_schema {

rust/log/src/local_compaction_manager.rs

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use chroma_sysdb::SysDb;
1515
use chroma_system::Handler;
1616
use chroma_system::{Component, ComponentContext};
1717
use chroma_types::{
18-
Chunk, CollectionUuid, GetCollectionWithSegmentsError, KnnIndex, LogRecord, SchemaError,
18+
Chunk, CollectionUuid, GetCollectionWithSegmentsError, LogRecord, Schema, SchemaError,
1919
};
2020
use serde::{Deserialize, Serialize};
2121
use thiserror::Error;
@@ -141,9 +141,12 @@ impl Handler<BackfillMessage> for LocalCompactionManager {
141141
.get_collection_with_segments(message.collection_id)
142142
.await?;
143143
let schema_previously_persisted = collection_and_segments.collection.schema.is_some();
144-
collection_and_segments
145-
.collection
146-
.reconcile_schema_with_config(KnnIndex::Hnsw)?;
144+
if !schema_previously_persisted {
145+
collection_and_segments.collection.schema = Some(
146+
Schema::try_from(&collection_and_segments.collection.config)
147+
.map_err(CompactionManagerError::SchemaReconcileError)?,
148+
);
149+
}
147150
// If collection is uninitialized, that means nothing has been written yet.
148151
let dim = match collection_and_segments.collection.dimension {
149152
Some(dim) => dim,
@@ -267,7 +270,12 @@ impl Handler<PurgeLogsMessage> for LocalCompactionManager {
267270
.get_collection_with_segments(message.collection_id)
268271
.await?;
269272
let mut collection = collection_segments.collection.clone();
270-
collection.reconcile_schema_with_config(KnnIndex::Hnsw)?;
273+
if collection.schema.is_none() {
274+
collection.schema = Some(
275+
Schema::try_from(&collection.config)
276+
.map_err(CompactionManagerError::SchemaReconcileError)?,
277+
);
278+
}
271279
// If dimension is None, that means nothing has been written yet.
272280
let dim = match collection.dimension {
273281
Some(dim) => dim,

0 commit comments

Comments
 (0)