From c989487d631f03b58e0be3d653cf7571797f09e8 Mon Sep 17 00:00:00 2001 From: nuri-yoo Date: Wed, 29 Apr 2026 01:23:45 +0900 Subject: [PATCH 1/4] feat(speedwagon): add DescriptionAgent for KB-level description generation --- speedwagon/src/store/description.rs | 367 ++++++++++++++++++++++++++++ speedwagon/src/store/mod.rs | 29 +++ 2 files changed, 396 insertions(+) create mode 100644 speedwagon/src/store/description.rs diff --git a/speedwagon/src/store/description.rs b/speedwagon/src/store/description.rs new file mode 100644 index 0000000..6233265 --- /dev/null +++ b/speedwagon/src/store/description.rs @@ -0,0 +1,367 @@ +//! KB-level description generation from per-document `(title, purpose)` tuples. +//! +//! Pairs with `parser::PurposeAgent`: where `PurposeAgent` produces one +//! line of search-tuned metadata per document, `DescriptionAgent` produces +//! one line of routing-tuned metadata per knowledge base. +//! +//! This module is a library — it does not own any database, HTTP surface, +//! or "Manual vs Auto" policy. Callers (backend, CLI) decide when to call +//! `Store::describe`, what to do with the result, and how to persist it. +//! +//! Output shape chosen by offline experiments on financebench / cuad / +//! kpaperqa / bioasq: +//! +//! - The description is self-contained: it states what is *inside* the +//! KB (document types, entities covered, topics it can answer) without +//! referencing any other KB. The routing agent that reads several KB +//! descriptions side-by-side does the comparison from a higher level; +//! we don't bake that comparison into each description, so an +//! adjacent KB being renamed or removed cannot stale a description. +//! - Target output ~200 chars: shortest length that still preserved a +//! coherent identity sentence in the experiments. +//! - Input is each doc's `purpose` only — adding `title` did not improve +//! routing on the probe set and increased input tokens by ~25%. +//! - On LLM failure, `fallback_description` returns +//! `"{N} documents including: {top-5 titles}"`. Empty string would have +//! silently degraded routing (an empty description anchored a wrong KB +//! in the experiments). + +use ailoy::{ + agent::{Agent, AgentProvider, AgentSpec}, + message::{Message, Part, Role}, +}; +use anyhow::{Context as _, Result}; +use futures::StreamExt as _; + +const MODEL: &str = "openai/gpt-5.4-mini"; + +const DESCRIPTION_INSTRUCTION: &str = concat!( + "You write a description of a knowledge base for a routing agent. ", + "The routing agent reads this description alongside descriptions of other ", + "knowledge bases and picks the right one for a user's question. ", + "Inputs: KB name, optional instruction, and a list of one-line document purposes. ", + "Describe what is INSIDE this knowledge base — its document types, ", + "the entities and time periods covered, and the topics it can answer. ", + "Lead with the collective identity of the documents. ", + "Do not compare this KB to others, list what it excludes, or mention ", + "neighboring KB names. The routing agent does that comparison from a ", + "higher level; your job is to describe this KB on its own terms. ", + "Output must NOT mention dataset names, QA pairs, paper IDs, contract IDs, ", + "or any metadata about how this knowledge base was assembled. ", + "Describe ONLY what documents are inside, as if a curator wrote it. ", + "Length: ~200 characters. Output a JSON object: {\"description\": \"\"}." +); + +/// Wraps an Ailoy agent that turns a `(title, purpose)` document list into +/// a single KB-level description string. +pub struct DescriptionAgent { + spec: AgentSpec, + provider: Option, +} + +impl DescriptionAgent { + pub fn new(provider: Option) -> Self { + Self { + spec: AgentSpec::new(MODEL).instruction(DESCRIPTION_INSTRUCTION), + provider, + } + } + + /// Generate a KB-level description. + /// + /// `docs` is the full `(title, purpose)` list for the KB. Only `purpose` + /// is rendered into the user message; `title` is kept in the signature + /// so callers can feed the same slice to `fallback_description` when + /// the LLM call returns empty. + pub async fn generate( + &self, + kb_name: &str, + instruction: Option<&str>, + docs: &[(String, String)], + ) -> Result { + let user = build_user_message(kb_name, instruction, docs); + let query = Message::new(Role::User).with_contents([Part::text(user)]); + + let mut agent = match &self.provider { + Some(provider) => Agent::try_with_provider(self.spec.clone(), provider).await?, + None => Agent::try_new(self.spec.clone()).await?, + }; + + let mut text_parts: Vec = Vec::new(); + { + let mut stream = agent.run(query); + while let Some(result) = stream.next().await { + let output = result?; + for part in &output.message.contents { + if let Some(text) = part.as_text() { + text_parts.push(text.to_string()); + } + } + } + } + let raw = text_parts.join(""); + Ok(parse_description_response(&raw)) + } +} + +fn build_user_message( + kb_name: &str, + instruction: Option<&str>, + docs: &[(String, String)], +) -> String { + let mut s = String::new(); + s.push_str(&format!("KB name: {kb_name}\n")); + if let Some(instr) = instruction { + if !instr.trim().is_empty() { + s.push_str(&format!("KB instruction: {instr}\n")); + } + } + s.push_str(&format!("\nDocuments ({}):\n", docs.len())); + for (_title, purpose) in docs { + let p = if purpose.is_empty() { "(no purpose)" } else { purpose.as_str() }; + s.push_str(&format!("- {p}\n")); + } + s +} + +/// Mirrors `parser::parse_purpose_response`: try direct JSON, retry after +/// stripping any text around the JSON object, fall back to empty string. +/// An empty return signals the caller to use `fallback_description`. +fn parse_description_response(raw: &str) -> String { + let trimmed = raw.trim(); + if trimmed.is_empty() { + return String::new(); + } + + if let Ok(value) = serde_json::from_str::(trimmed) { + if let Some(d) = value.get("description").and_then(|v| v.as_str()) { + return d.trim().to_string(); + } + } + + if let (Some(start), Some(end)) = (trimmed.find('{'), trimmed.rfind('}')) { + if start < end { + if let Ok(value) = serde_json::from_str::(&trimmed[start..=end]) { + if let Some(d) = value.get("description").and_then(|v| v.as_str()) { + return d.trim().to_string(); + } + } + } + } + + String::new() +} + +/// Module-level entry point that mirrors `parser::get_title` / +/// `parser::get_purpose`: read `OPENAI_API_KEY` from the environment, +/// build the provider, run `DescriptionAgent`, and substitute +/// `fallback_description` if the LLM call returns an empty body. Pure +/// transport errors (missing key, network) propagate so the caller can +/// surface them. +pub async fn get_description( + kb_name: &str, + instruction: Option<&str>, + docs: &[(String, String)], +) -> Result { + dotenvy::dotenv().ok(); + + let mut provider = AgentProvider::new(); + provider.model_openai( + std::env::var("OPENAI_API_KEY").context("OPENAI_API_KEY not set in environment")?, + ); + + let agent = DescriptionAgent::new(Some(provider)); + let result = agent.generate(kb_name, instruction, docs).await?; + if result.is_empty() { + log::warn!("description generation returned empty string; using fallback"); + let titles: Vec = docs.iter().map(|(t, _)| t.clone()).collect(); + Ok(fallback_description(docs.len(), &titles)) + } else { + Ok(result) + } +} + +/// Deterministic fallback used when the LLM call fails or returns empty. +/// +/// The string is short enough to fit a system prompt's KB list and still +/// names the entities a routing agent needs. Doc count of zero produces an +/// empty string — there is nothing to describe. +pub fn fallback_description(doc_count: usize, top_titles: &[String]) -> String { + if doc_count == 0 { + return String::new(); + } + let titles: Vec<&str> = top_titles + .iter() + .filter(|t| !t.is_empty()) + .take(5) + .map(String::as_str) + .collect(); + if titles.is_empty() { + format!("{doc_count} documents") + } else { + format!("{doc_count} documents including: {}", titles.join("; ")) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_direct_json() { + let raw = r#"{"description": "hello"}"#; + assert_eq!(parse_description_response(raw), "hello"); + } + + #[test] + fn parse_json_with_surrounding_text() { + let raw = r#"Here you go: {"description": "hello"} done."#; + assert_eq!(parse_description_response(raw), "hello"); + } + + #[test] + fn parse_trims_inner_whitespace() { + let raw = r#"{"description": " hello "}"#; + assert_eq!(parse_description_response(raw), "hello"); + } + + #[test] + fn parse_empty_input() { + assert_eq!(parse_description_response(""), ""); + assert_eq!(parse_description_response(" "), ""); + } + + #[test] + fn parse_missing_field() { + let raw = r#"{"other": "value"}"#; + assert_eq!(parse_description_response(raw), ""); + } + + #[test] + fn parse_malformed_json() { + assert_eq!(parse_description_response("{not json"), ""); + } + + #[test] + fn fallback_zero_docs() { + assert_eq!(fallback_description(0, &[]), ""); + } + + #[test] + fn fallback_no_titles_falls_back_to_count_only() { + assert_eq!(fallback_description(7, &[]), "7 documents"); + assert_eq!( + fallback_description(7, &["".to_string(), "".to_string()]), + "7 documents" + ); + } + + #[test] + fn fallback_with_titles_takes_top_five() { + let titles = vec![ + "A".to_string(), + "B".to_string(), + "C".to_string(), + "D".to_string(), + "E".to_string(), + "F".to_string(), + "G".to_string(), + ]; + assert_eq!( + fallback_description(7, &titles), + "7 documents including: A; B; C; D; E" + ); + } + + #[test] + fn user_message_skips_blank_instruction() { + let docs = vec![ + ("T1".to_string(), "P1".to_string()), + ("T2".to_string(), "".to_string()), + ]; + let msg = build_user_message("kb1", Some(" "), &docs); + assert!(msg.contains("KB name: kb1")); + assert!(!msg.contains("KB instruction")); + assert!(msg.contains("- P1")); + assert!(msg.contains("- (no purpose)")); + } + + #[test] + fn user_message_renders_purpose_only_not_title() { + let docs = vec![("Apple 10-K".to_string(), "Apple FY2021 annual report".to_string())]; + let msg = build_user_message("finance", None, &docs); + // purpose is in, title is not + assert!(msg.contains("Apple FY2021 annual report")); + assert!(!msg.contains("Apple 10-K")); + } + + // ---- Store integration ---- + + #[tokio::test] + async fn describe_returns_empty_for_empty_store() { + // Empty store → describe short-circuits before any LLM call, + // so this runs without OPENAI_API_KEY. + let tmp = tempfile::tempdir().expect("tempdir"); + let store = crate::store::Store::new(tmp.path()).expect("open store"); + let result = store + .describe("kb-empty", None) + .await + .expect("describe should not fail on empty store"); + assert_eq!(result, ""); + } + + #[tokio::test] + #[ignore = "requires OPENAI_API_KEY"] + async fn describe_round_trips_against_pre_populated_index() { + // Pre-populate the index with deterministic (title, purpose) pairs + // so this test only exercises the description path, not PurposeAgent + // / TitleAgent. Mirrors the indexer round-trip tests in spirit. + let tmp = tempfile::tempdir().expect("tempdir"); + let root = tmp.path(); + let index_dir = root.join("index"); + let index = crate::store::indexer::open_or_create(&index_dir).expect("open index"); + crate::store::indexer::add_document( + &index, + "doc1", + "Apple FY2021 10-K", + "Apple Inc. 2021 Form 10-K annual report — revenue, iPhone, services, SEC filing", + "body 1", + ) + .expect("add doc1"); + crate::store::indexer::add_document( + &index, + "doc2", + "Walmart FY2023 10-K", + "Walmart Inc. FY2023 Form 10-K annual report — omnichannel, retail, SEC filing", + "body 2", + ) + .expect("add doc2"); + // drop the writer-backed Index so Store::new can reopen the same dir. + drop(index); + + // We need origin/ and corpus/ to exist for Store::new. + std::fs::create_dir_all(root.join("origin")).unwrap(); + std::fs::create_dir_all(root.join("corpus")).unwrap(); + + let store = crate::store::Store::new(root).expect("open store"); + let description = store + .describe("finance", Some("public-company financial filings")) + .await + .expect("describe"); + + assert!(!description.is_empty(), "description should not be empty"); + assert!( + description.chars().count() < 600, + "description should stay short: {description:?}" + ); + // The model should pick up at least one of the entity tokens we fed in. + let lowered = description.to_lowercase(); + let mentions_finance_token = ["apple", "walmart", "10-k", "filing", "financial"] + .iter() + .any(|t| lowered.contains(t)); + assert!( + mentions_finance_token, + "description did not mention any expected token: {description:?}" + ); + } +} diff --git a/speedwagon/src/store/mod.rs b/speedwagon/src/store/mod.rs index 4e8f024..9a85736 100644 --- a/speedwagon/src/store/mod.rs +++ b/speedwagon/src/store/mod.rs @@ -1,3 +1,4 @@ +mod description; mod document; mod indexer; mod parser; @@ -15,6 +16,7 @@ use anyhow::{Context as _, Result}; use tantivy::Index; use uuid::Uuid; +pub use description::{DescriptionAgent, fallback_description}; pub use document::{Document, FindResult}; pub use searcher::{SearchPage, SearchResult}; @@ -254,6 +256,33 @@ impl Store { context_bytes, )) } + + /// Generate a single KB-level description from every document's + /// `(title, purpose)` already in the index. + /// + /// Reads `OPENAI_API_KEY` from the environment (mirrors + /// `parser::get_title` / `parser::get_purpose`). On an empty LLM + /// body the call substitutes `description::fallback_description`, + /// so routing always has some signal. + /// + /// The output describes only this KB; comparison with adjacent KBs + /// is the routing agent's job, not this function's, so there is no + /// peer-KB input. + pub async fn describe( + &self, + kb_name: &str, + instruction: Option<&str>, + ) -> Result { + let docs = indexer::list_documents(&self.index, false)?; + if docs.is_empty() { + return Ok(String::new()); + } + let pairs: Vec<(String, String)> = docs + .iter() + .map(|d| (d.title.clone(), d.purpose.clone())) + .collect(); + description::get_description(kb_name, instruction, &pairs).await + } } #[cfg(test)] From e4b0646b1f243fa54ea7e1c082de0175bbb6219e Mon Sep 17 00:00:00 2001 From: nuri-yoo Date: Wed, 29 Apr 2026 12:06:26 +0900 Subject: [PATCH 2/4] refactor(speedwagon): self-anchor description prompt and trim doc-comments - Drop the "alongside other KBs" framing from the prompt opening so the model isn't primed to emit comparison vocabulary; the existing "do not compare" clause now matches the framing. - Note Korean output's ~1/3 char density at the same budget; per-language budgets are deferred (LLM can't count Korean words reliably either). - Trim verbose doc-comments across description.rs and Store::describe; add cost note (~24K input chars at N=200) to discourage synchronous use on indexing hot paths. Verified against the 4-KB / 12-probe harness: routing accuracy stays 12/12 across the shipped baseline, the prompt-only change, and word-budget variants. cargo test -p speedwagon --lib description: 12 passed, 1 ignored. --- speedwagon/src/store/description.rs | 72 +++++++---------------------- speedwagon/src/store/mod.rs | 15 ++---- 2 files changed, 21 insertions(+), 66 deletions(-) diff --git a/speedwagon/src/store/description.rs b/speedwagon/src/store/description.rs index 6233265..e1490f6 100644 --- a/speedwagon/src/store/description.rs +++ b/speedwagon/src/store/description.rs @@ -1,30 +1,8 @@ -//! KB-level description generation from per-document `(title, purpose)` tuples. -//! -//! Pairs with `parser::PurposeAgent`: where `PurposeAgent` produces one -//! line of search-tuned metadata per document, `DescriptionAgent` produces -//! one line of routing-tuned metadata per knowledge base. -//! -//! This module is a library — it does not own any database, HTTP surface, -//! or "Manual vs Auto" policy. Callers (backend, CLI) decide when to call -//! `Store::describe`, what to do with the result, and how to persist it. -//! -//! Output shape chosen by offline experiments on financebench / cuad / -//! kpaperqa / bioasq: -//! -//! - The description is self-contained: it states what is *inside* the -//! KB (document types, entities covered, topics it can answer) without -//! referencing any other KB. The routing agent that reads several KB -//! descriptions side-by-side does the comparison from a higher level; -//! we don't bake that comparison into each description, so an -//! adjacent KB being renamed or removed cannot stale a description. -//! - Target output ~200 chars: shortest length that still preserved a -//! coherent identity sentence in the experiments. -//! - Input is each doc's `purpose` only — adding `title` did not improve -//! routing on the probe set and increased input tokens by ~25%. -//! - On LLM failure, `fallback_description` returns -//! `"{N} documents including: {top-5 titles}"`. Empty string would have -//! silently degraded routing (an empty description anchored a wrong KB -//! in the experiments). +//! KB-level description for a routing agent. Built from each doc's `purpose` +//! (titles are kept for the fallback). Output stays self-contained — no +//! mention of peer KBs — so a renamed or removed neighbor cannot stale it. +//! Korean output lands ~1/3 the chars of English at the same budget; +//! per-language budgets are deferred until a near-domain Korean KB shows up. use ailoy::{ agent::{Agent, AgentProvider, AgentSpec}, @@ -36,24 +14,21 @@ use futures::StreamExt as _; const MODEL: &str = "openai/gpt-5.4-mini"; const DESCRIPTION_INSTRUCTION: &str = concat!( - "You write a description of a knowledge base for a routing agent. ", - "The routing agent reads this description alongside descriptions of other ", - "knowledge bases and picks the right one for a user's question. ", + "You write a self-contained description of a knowledge base. ", + "This description will be read by a routing agent that picks the right ", + "knowledge base for a user's question. ", "Inputs: KB name, optional instruction, and a list of one-line document purposes. ", "Describe what is INSIDE this knowledge base — its document types, ", "the entities and time periods covered, and the topics it can answer. ", "Lead with the collective identity of the documents. ", - "Do not compare this KB to others, list what it excludes, or mention ", - "neighboring KB names. The routing agent does that comparison from a ", - "higher level; your job is to describe this KB on its own terms. ", + "Describe this KB on its own terms — do not compare it to other KBs, ", + "list what it excludes, or mention neighboring KB names. ", "Output must NOT mention dataset names, QA pairs, paper IDs, contract IDs, ", "or any metadata about how this knowledge base was assembled. ", "Describe ONLY what documents are inside, as if a curator wrote it. ", "Length: ~200 characters. Output a JSON object: {\"description\": \"\"}." ); -/// Wraps an Ailoy agent that turns a `(title, purpose)` document list into -/// a single KB-level description string. pub struct DescriptionAgent { spec: AgentSpec, provider: Option, @@ -67,12 +42,8 @@ impl DescriptionAgent { } } - /// Generate a KB-level description. - /// - /// `docs` is the full `(title, purpose)` list for the KB. Only `purpose` - /// is rendered into the user message; `title` is kept in the signature - /// so callers can feed the same slice to `fallback_description` when - /// the LLM call returns empty. + /// Only `purpose` is sent to the LLM; `title` is kept in the signature + /// so the caller can feed the same slice to `fallback_description`. pub async fn generate( &self, kb_name: &str, @@ -124,9 +95,7 @@ fn build_user_message( s } -/// Mirrors `parser::parse_purpose_response`: try direct JSON, retry after -/// stripping any text around the JSON object, fall back to empty string. -/// An empty return signals the caller to use `fallback_description`. +/// Empty return signals the caller to use `fallback_description`. fn parse_description_response(raw: &str) -> String { let trimmed = raw.trim(); if trimmed.is_empty() { @@ -152,12 +121,9 @@ fn parse_description_response(raw: &str) -> String { String::new() } -/// Module-level entry point that mirrors `parser::get_title` / -/// `parser::get_purpose`: read `OPENAI_API_KEY` from the environment, -/// build the provider, run `DescriptionAgent`, and substitute -/// `fallback_description` if the LLM call returns an empty body. Pure -/// transport errors (missing key, network) propagate so the caller can -/// surface them. +/// Reads `OPENAI_API_KEY` from the environment, runs `DescriptionAgent`, and +/// substitutes `fallback_description` if the LLM body is empty. Transport +/// errors (missing key, network) propagate. pub async fn get_description( kb_name: &str, instruction: Option<&str>, @@ -181,11 +147,7 @@ pub async fn get_description( } } -/// Deterministic fallback used when the LLM call fails or returns empty. -/// -/// The string is short enough to fit a system prompt's KB list and still -/// names the entities a routing agent needs. Doc count of zero produces an -/// empty string — there is nothing to describe. +/// Deterministic fallback when the LLM call fails or returns empty. pub fn fallback_description(doc_count: usize, top_titles: &[String]) -> String { if doc_count == 0 { return String::new(); diff --git a/speedwagon/src/store/mod.rs b/speedwagon/src/store/mod.rs index 9a85736..0084caf 100644 --- a/speedwagon/src/store/mod.rs +++ b/speedwagon/src/store/mod.rs @@ -257,17 +257,10 @@ impl Store { )) } - /// Generate a single KB-level description from every document's - /// `(title, purpose)` already in the index. - /// - /// Reads `OPENAI_API_KEY` from the environment (mirrors - /// `parser::get_title` / `parser::get_purpose`). On an empty LLM - /// body the call substitutes `description::fallback_description`, - /// so routing always has some signal. - /// - /// The output describes only this KB; comparison with adjacent KBs - /// is the routing agent's job, not this function's, so there is no - /// peer-KB input. + /// One LLM call over every doc's `(title, purpose)` in the index. Input + /// is proportional to doc count (~24K chars at N=200), so don't run this + /// synchronously on indexing hot paths — use a finalize hook or + /// background job. Empty LLM body falls back to a deterministic string. pub async fn describe( &self, kb_name: &str, From 287b649bbec78a1d42989b3f3ddebcc4ce3cf2b4 Mon Sep 17 00:00:00 2001 From: nuri-yoo Date: Thu, 30 Apr 2026 11:26:35 +0900 Subject: [PATCH 3/4] feat(speedwagon): force description output to English --- speedwagon/src/store/description.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/speedwagon/src/store/description.rs b/speedwagon/src/store/description.rs index e1490f6..0b77bdd 100644 --- a/speedwagon/src/store/description.rs +++ b/speedwagon/src/store/description.rs @@ -26,6 +26,7 @@ const DESCRIPTION_INSTRUCTION: &str = concat!( "Output must NOT mention dataset names, QA pairs, paper IDs, contract IDs, ", "or any metadata about how this knowledge base was assembled. ", "Describe ONLY what documents are inside, as if a curator wrote it. ", + "Write the description in English regardless of the document language. ", "Length: ~200 characters. Output a JSON object: {\"description\": \"\"}." ); From 57fa2dd1db26c8f7b5fe7c9fc6e624cdf9bd217f Mon Sep 17 00:00:00 2001 From: nuri-yoo Date: Mon, 4 May 2026 11:22:47 +0900 Subject: [PATCH 4/4] refactor(speedwagon): borrow doc slices in description path Switch `&[(String, String)]` / `&[String]` to `&[(&str, &str)]` / `&[&str]` in `generate`, `get_description`, `build_user_message`, and `fallback_description`, and drop the upfront title/purpose clone in `Store::describe`. Caller-side `Document` strings are borrowed directly, and the fallback title vec is built only on the empty-LLM-response branch. --- speedwagon/src/store/description.rs | 36 +++++++++-------------------- speedwagon/src/store/mod.rs | 4 ++-- 2 files changed, 13 insertions(+), 27 deletions(-) diff --git a/speedwagon/src/store/description.rs b/speedwagon/src/store/description.rs index 0b77bdd..5b5de60 100644 --- a/speedwagon/src/store/description.rs +++ b/speedwagon/src/store/description.rs @@ -49,7 +49,7 @@ impl DescriptionAgent { &self, kb_name: &str, instruction: Option<&str>, - docs: &[(String, String)], + docs: &[(&str, &str)], ) -> Result { let user = build_user_message(kb_name, instruction, docs); let query = Message::new(Role::User).with_contents([Part::text(user)]); @@ -79,7 +79,7 @@ impl DescriptionAgent { fn build_user_message( kb_name: &str, instruction: Option<&str>, - docs: &[(String, String)], + docs: &[(&str, &str)], ) -> String { let mut s = String::new(); s.push_str(&format!("KB name: {kb_name}\n")); @@ -90,7 +90,7 @@ fn build_user_message( } s.push_str(&format!("\nDocuments ({}):\n", docs.len())); for (_title, purpose) in docs { - let p = if purpose.is_empty() { "(no purpose)" } else { purpose.as_str() }; + let p = if purpose.is_empty() { "(no purpose)" } else { *purpose }; s.push_str(&format!("- {p}\n")); } s @@ -128,7 +128,7 @@ fn parse_description_response(raw: &str) -> String { pub async fn get_description( kb_name: &str, instruction: Option<&str>, - docs: &[(String, String)], + docs: &[(&str, &str)], ) -> Result { dotenvy::dotenv().ok(); @@ -141,7 +141,7 @@ pub async fn get_description( let result = agent.generate(kb_name, instruction, docs).await?; if result.is_empty() { log::warn!("description generation returned empty string; using fallback"); - let titles: Vec = docs.iter().map(|(t, _)| t.clone()).collect(); + let titles: Vec<&str> = docs.iter().map(|(t, _)| *t).collect(); Ok(fallback_description(docs.len(), &titles)) } else { Ok(result) @@ -149,7 +149,7 @@ pub async fn get_description( } /// Deterministic fallback when the LLM call fails or returns empty. -pub fn fallback_description(doc_count: usize, top_titles: &[String]) -> String { +pub fn fallback_description(doc_count: usize, top_titles: &[&str]) -> String { if doc_count == 0 { return String::new(); } @@ -157,7 +157,7 @@ pub fn fallback_description(doc_count: usize, top_titles: &[String]) -> String { .iter() .filter(|t| !t.is_empty()) .take(5) - .map(String::as_str) + .copied() .collect(); if titles.is_empty() { format!("{doc_count} documents") @@ -213,23 +213,12 @@ mod tests { #[test] fn fallback_no_titles_falls_back_to_count_only() { assert_eq!(fallback_description(7, &[]), "7 documents"); - assert_eq!( - fallback_description(7, &["".to_string(), "".to_string()]), - "7 documents" - ); + assert_eq!(fallback_description(7, &["", ""]), "7 documents"); } #[test] fn fallback_with_titles_takes_top_five() { - let titles = vec![ - "A".to_string(), - "B".to_string(), - "C".to_string(), - "D".to_string(), - "E".to_string(), - "F".to_string(), - "G".to_string(), - ]; + let titles = ["A", "B", "C", "D", "E", "F", "G"]; assert_eq!( fallback_description(7, &titles), "7 documents including: A; B; C; D; E" @@ -238,10 +227,7 @@ mod tests { #[test] fn user_message_skips_blank_instruction() { - let docs = vec![ - ("T1".to_string(), "P1".to_string()), - ("T2".to_string(), "".to_string()), - ]; + let docs = [("T1", "P1"), ("T2", "")]; let msg = build_user_message("kb1", Some(" "), &docs); assert!(msg.contains("KB name: kb1")); assert!(!msg.contains("KB instruction")); @@ -251,7 +237,7 @@ mod tests { #[test] fn user_message_renders_purpose_only_not_title() { - let docs = vec![("Apple 10-K".to_string(), "Apple FY2021 annual report".to_string())]; + let docs = [("Apple 10-K", "Apple FY2021 annual report")]; let msg = build_user_message("finance", None, &docs); // purpose is in, title is not assert!(msg.contains("Apple FY2021 annual report")); diff --git a/speedwagon/src/store/mod.rs b/speedwagon/src/store/mod.rs index 0084caf..b1e37db 100644 --- a/speedwagon/src/store/mod.rs +++ b/speedwagon/src/store/mod.rs @@ -270,9 +270,9 @@ impl Store { if docs.is_empty() { return Ok(String::new()); } - let pairs: Vec<(String, String)> = docs + let pairs: Vec<(&str, &str)> = docs .iter() - .map(|d| (d.title.clone(), d.purpose.clone())) + .map(|d| (d.title.as_str(), d.purpose.as_str())) .collect(); description::get_description(kb_name, instruction, &pairs).await }