Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions colgrep/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ pub struct Cli {
#[arg(long = "semantic-only")]
pub no_fts: bool,

/// Hybrid search alpha: balance between keyword (0.0) and semantic (1.0). Default: 0.75.
/// Hybrid search alpha: balance between keyword (0.0) and semantic (1.0). Default: 0.60.
#[arg(long, value_name = "FLOAT")]
pub alpha: Option<f32>,

Expand Down Expand Up @@ -508,7 +508,7 @@ pub enum Commands {
#[arg(long = "semantic-only")]
no_fts: bool,

/// Hybrid search alpha: balance between keyword (0.0) and semantic (1.0). Default: 0.75.
/// Hybrid search alpha: balance between keyword (0.0) and semantic (1.0). Default: 0.60.
#[arg(long, value_name = "FLOAT")]
alpha: Option<f32>,

Expand Down
2 changes: 1 addition & 1 deletion colgrep/src/commands/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ pub fn cmd_config(
if let Some(a) = alpha {
if a == 0.0 {
config.clear_hybrid_alpha();
println!("✅ Reset hybrid alpha to 0.75 (default)");
println!("✅ Reset hybrid alpha to 0.60 (default)");
} else {
config.set_hybrid_alpha(a);
println!("✅ Set hybrid alpha to {:.2}", config.get_hybrid_alpha());
Expand Down
76 changes: 39 additions & 37 deletions colgrep/src/commands/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use crate::display::{
calc_display_ranges, find_representative_lines, group_results_by_file,
print_highlighted_content, print_highlighted_ranges,
};
use crate::scoring::{compute_final_score, should_search_from_root};
use crate::scoring::should_search_from_root;

/// Pre-compiled pattern matcher for efficient repeated matching.
/// Compiling regex is expensive (~microseconds), so we do it once and reuse.
Expand Down Expand Up @@ -1340,7 +1340,7 @@ fn search_single_path(
!config.use_hybrid_search()
};

// CLI --alpha overrides config, config overrides default (0.75)
// CLI --alpha overrides config, config overrides default (0.60)
let hybrid_alpha = alpha.unwrap_or_else(|| config.get_hybrid_alpha());

// When no -e flag is provided, run BOTH semantic/hybrid search and text-pattern search
Expand Down Expand Up @@ -1413,13 +1413,17 @@ fn search_single_path(
Some(&hybrid_subset),
)?
} else {
// Pass `None` so FTS5 is refetched *within* the subset.
// Reusing the global `fts5_results` here would carry
// BM25 hits from outside the subset; they'd get filtered
// down to a tiny intersection, hurting recall.
searcher.search_hybrid_with_embedding(
&query_emb,
query,
search_top_k,
Some(&hybrid_subset),
hybrid_alpha,
fts5_results.as_ref(),
None,
)?
}
} else {
Expand All @@ -1429,47 +1433,45 @@ fn search_single_path(
vec![]
};

// 3. Merge results: keep max score for each unique code unit (by file + line)
let mut merged: HashMap<(PathBuf, usize), colgrep::SearchResult> = HashMap::new();

for result in semantic_results {
let key = (result.unit.file.clone(), result.unit.line);
merged
.entry(key)
.and_modify(|existing| {
// 3. Merge results: one entry per file, span covers every matched
// unit, score is the max across both calls.
//
// The previous `(file, line)` dedup was buggy: both
// `search_hybrid_with_embedding` calls run `collapse_by_file`
// internally, which sets `unit.line = min(line_i)` *across that
// call's candidate pool*. Two pools → two different mins for the
// same file → same file occupied two top-K slots.
use std::collections::hash_map::Entry;
let mut merged: HashMap<PathBuf, colgrep::SearchResult> = HashMap::new();
for result in semantic_results.into_iter().chain(hybrid_results) {
let key = result.unit.file.clone();
match merged.entry(key) {
Entry::Occupied(mut e) => {
let existing = e.get_mut();
let new_start = existing.unit.line.min(result.unit.line);
let new_end = existing.unit.end_line.max(result.unit.end_line);
if result.score > existing.score {
*existing = result.clone();
*existing = result;
}
})
.or_insert(result);
}

for result in hybrid_results {
let key = (result.unit.file.clone(), result.unit.line);
merged
.entry(key)
.and_modify(|existing| {
if result.score > existing.score {
*existing = result.clone();
}
})
.or_insert(result);
existing.unit.line = new_start;
existing.unit.end_line = new_end;
}
Entry::Vacant(e) => {
e.insert(result);
}
}
}

merged.into_values().collect::<Vec<_>>()
};

// Note: When -e is used, results are already filtered to units containing the pattern
// via filter_by_text_pattern_with_options() above, which supports -E, -F, -w flags

// Apply query boost and re-sort results
let mut results: Vec<_> = results
.into_iter()
.map(|mut r| {
r.score = compute_final_score(r.score, query, &r.unit, text_pattern);
r
})
.collect();
// via filter_by_text_pattern_with_options() above, which supports -E, -F, -w flags.
//
// The legacy `compute_final_score` test-name demotion was removed; the
// hybrid pipeline now applies `ranking::file_path_penalty` (a much more
// complete language-aware test/bench/example/compat penalty) inside
// `Searcher::search_hybrid_with_embedding`.
let mut results: Vec<_> = results;
results.sort_by(|a, b| {
b.score
.partial_cmp(&a.score)
Expand Down
19 changes: 16 additions & 3 deletions colgrep/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -403,17 +403,30 @@ impl Config {
}

/// Get hybrid search alpha (keyword vs semantic balance).
/// Defaults to 0.75 (favors semantic).
///
/// Defaults to 0.60. With the dedup + fts5-refetch fixes and the
/// path-stem / definition / file-coherence / file-collapse stack in
/// place, the plateau across alpha is broad (0.55–0.70 all land in
/// the 0.829–0.831 NDCG@10 band on the semble bench) and 0.60 is the
/// empirical peak.
///
/// Overrideable at runtime via `COLGREP_ALPHA` env var (used by the
/// benchmark harness to grid-search without rebuilding).
pub fn get_hybrid_alpha(&self) -> f32 {
self.hybrid_alpha.unwrap_or(0.75)
if let Ok(env_alpha) = std::env::var("COLGREP_ALPHA") {
if let Ok(v) = env_alpha.parse::<f32>() {
return v.clamp(0.0, 1.0);
}
}
self.hybrid_alpha.unwrap_or(0.60)
}

/// Store the hybrid search alpha (0.0 = pure keyword, 1.0 = pure semantic).
///
/// Out-of-range values are pulled to the nearest bound of `[0.0, 1.0]`
/// before being stored. `f32::clamp` returns NaN for a NaN input, so a NaN
/// `alpha` is stored unchanged.
pub fn set_hybrid_alpha(&mut self, alpha: f32) {
    let bounded = alpha.clamp(0.0, 1.0);
    self.hybrid_alpha = Some(bounded);
}

/// Clear the stored hybrid alpha setting.
///
/// Afterwards `get_hybrid_alpha` no longer sees a configured value and
/// falls back to its built-in default (0.60), unless the `COLGREP_ALPHA`
/// environment variable supplies an override.
pub fn clear_hybrid_alpha(&mut self) {
self.hybrid_alpha = None;
}
Expand Down
Loading
Loading