Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 15 additions & 18 deletions colgrep/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1114,16 +1114,16 @@ impl IndexBuilder {
filtering_count, vector_count
);
if filtering_count > vector_count {
// Filtering DB has orphan entries (docs without embeddings)
// Get all doc IDs from filtering that exceed the vector index count
// The vector index uses sequential IDs starting from 0, so any ID >= vector_count is orphan
let all_metadata = filtering::get(index_path, None, &[], None)?;

let orphan_ids: Vec<i64> = all_metadata
.iter()
.filter_map(|meta| meta.get("_subset_").and_then(|v| v.as_i64()))
.filter(|&id| id >= vector_count as i64)
.collect();
// Filtering DB has orphan entries (docs without embeddings).
// The vector index uses sequential IDs starting from 0, so any
// `_subset_` ID >= vector_count is an orphan. Push the filter into
// SQL so we don't materialize every row's metadata just to find a
// few stray IDs.
let orphan_ids = filtering::where_condition(
index_path,
"_subset_ >= ?",
&[serde_json::json!(vector_count as i64)],
)?;

if !orphan_ids.is_empty() {
// Delete orphan entries from filtering DB
Expand Down Expand Up @@ -2241,14 +2241,11 @@ impl IndexBuilder {
/// Clean up orphaned entries: files in index but not on disk
/// This handles directory deletion/rename and any state inconsistencies
fn cleanup_orphaned_entries(&self, index_path: &str) -> Result<usize> {
// Get all indexed files from filtering DB
let all_metadata = filtering::get(index_path, None, &[], None).unwrap_or_default();
let mut files: HashSet<String> = HashSet::new();
for meta in &all_metadata {
if let Some(file) = meta.get("file").and_then(|v| v.as_str()) {
files.insert(file.to_string());
}
}
// Pull only the distinct file paths from the metadata DB. The previous
// implementation called `filtering::get` which streams every column of
// every row (code text, embeddings metadata, etc.) on every search — a
// tens-of-megabytes JSON deserialize on large indexes.
let files = filtering::get_distinct_strings(index_path, "file").unwrap_or_default();

let mut deleted_count = 0;
for file_str in files {
Expand Down
107 changes: 107 additions & 0 deletions next-plaid/src/filtering.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1327,6 +1327,62 @@ pub fn where_condition_regexp(
Ok(result)
}

/// Get distinct non-NULL string values from a single METADATA column.
///
/// This is a focused, low-cost alternative to [`get`] when callers only need
/// to enumerate the unique values of a single string column (for example, the
/// distinct file paths represented in the index). It runs a single
/// `SELECT DISTINCT` query and avoids loading every row's full metadata.
///
/// # Arguments
///
/// * `index_path` - Path to the index directory
/// * `column` - Column name (validated against the METADATA schema)
///
/// # Returns
///
/// * `Ok(values)` - Distinct non-NULL string values from the column
/// * `Ok(vec![])` - The database does not exist or the column is not present
/// * `Err(_)` - Invalid column name or a database error
pub fn get_distinct_strings(index_path: &str, column: &str) -> Result<Vec<String>> {
let db_path = get_db_path(index_path);
if !db_path.exists() {
return Ok(Vec::new());
}

// Reject column names that aren't safe SQL identifiers up front (defense in
// depth — the schema check below would also catch unknown names).
if !is_valid_column_name(column) {
return Err(Error::Filtering(format!(
"Invalid column name: '{}'",
column
)));
}

let conn = open_db(&db_path)?;

let columns = get_schema_columns(&conn)?;
if !columns.contains(column) {
return Ok(Vec::new());
}

let query = format!(
"SELECT DISTINCT \"{0}\" FROM METADATA WHERE \"{0}\" IS NOT NULL",
column
);
let mut stmt = conn.prepare(&query)?;
let rows = stmt.query_map([], |row| row.get::<_, Option<String>>(0))?;

let mut values: Vec<String> = Vec::new();
for row in rows {
if let Some(value) = row? {
values.push(value);
}
}

Ok(values)
}

/// Get full metadata rows by condition or subset IDs.
///
/// Returns metadata as JSON objects with the `_subset_` field included.
Expand Down Expand Up @@ -1703,6 +1759,57 @@ mod tests {
assert_eq!(subset, vec![0]);
}

#[test]
fn test_get_distinct_strings_returns_unique_values() {
    let tmp = setup_test_dir();
    let index_path = tmp.path().to_str().unwrap();

    // "src/a.rs" is stored twice on purpose: the duplicate must collapse.
    let rows = vec![
        json!({"file": "src/a.rs", "code": "x"}),
        json!({"file": "src/a.rs", "code": "y"}),
        json!({"file": "src/b.rs", "code": "z"}),
    ];
    let ids: Vec<i64> = vec![0, 1, 2];
    create(index_path, &rows, &ids).unwrap();

    // Sort before comparing so the assertion does not depend on row order.
    let mut distinct = get_distinct_strings(index_path, "file").unwrap();
    distinct.sort();
    assert_eq!(distinct, ["src/a.rs", "src/b.rs"]);
}

#[test]
fn test_get_distinct_strings_missing_db_returns_empty() {
    let tmp = setup_test_dir();
    let index_path = tmp.path().to_str().unwrap();

    // create() is deliberately never called, so no database exists yet.
    let distinct = get_distinct_strings(index_path, "file").unwrap();
    assert_eq!(distinct.len(), 0);
}

#[test]
fn test_get_distinct_strings_unknown_column_returns_empty() {
    let tmp = setup_test_dir();
    let index_path = tmp.path().to_str().unwrap();

    let rows = vec![json!({"file": "src/a.rs"})];
    create(index_path, &rows, &[0]).unwrap();

    // A valid identifier that is not part of the schema yields no values
    // rather than an error.
    let distinct = get_distinct_strings(index_path, "not_a_column").unwrap();
    assert_eq!(distinct.len(), 0);
}

#[test]
fn test_get_distinct_strings_rejects_invalid_column_name() {
    let tmp = setup_test_dir();
    let index_path = tmp.path().to_str().unwrap();

    // The database must exist, otherwise the call returns early with Ok.
    let rows = vec![json!({"file": "src/a.rs"})];
    create(index_path, &rows, &[0]).unwrap();

    // An injection-style "column name" must be refused outright.
    let outcome = get_distinct_strings(index_path, "file; DROP TABLE METADATA --");
    assert!(matches!(outcome, Err(_)));
}

#[test]
fn test_get_all() {
let dir = setup_test_dir();
Expand Down
Loading