diff --git a/.gitignore b/.gitignore
index a5e3f66..501c859 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,7 +7,6 @@ npm-debug.log*
 yarn-debug.log*
 yarn-error.log*
 pnpm-debug.log*
 
-package-lock.json
 yarn.lock
 pnpm-lock.yaml
diff --git a/README.md b/README.md
index ba247f3..b464096 100644
--- a/README.md
+++ b/README.md
@@ -104,6 +104,44 @@ Hybrid semantic reranking is optional and **never replaces lexical grounding**.
 | `create-knolo-app` | Next.js scaffolding with playground |
 | `@knolo/langchain` | LangChain-style retriever interface |
 | `@knolo/llamaindex` | LlamaIndex-style retriever interface |
+| `knolo-core-rust` | Native Rust pack mount + lexical query runtime |
+
+---
+
+# 🦀 Rust Runtime Support (New)
+
+Knolo now includes an initial Rust runtime in `packages/core-rust`.
+
+Current Rust support includes:
+
+* Mounting `.knolo` packs from bytes
+* Parsing v1/v3-compatible core sections (`meta`, `lexicon`, `postings`, `blocks`)
+* Deterministic lexical querying with `top_k`, `min_score`, `namespace`, and `source` filters (see the sketch below)
+
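+For example, a filtered lexical query (a minimal sketch; the pack path and query string are illustrative):
+
+```rust
+use knolo_core_rust::{mount_pack_from_bytes, query, QueryOptions};
+
+let pack = mount_pack_from_bytes(&std::fs::read("knowledge.knolo")?)?;
+let hits = query(
+    &pack,
+    "bridge throttling",
+    QueryOptions {
+        top_k: 3,
+        min_score: 0.5,
+        namespace: Some(vec!["docs".to_string()]),
+        ..Default::default()
+    },
+);
+```
+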
"@knolo/langchain", + "version": "0.0.0", + "dependencies": { + "@knolo/core": "file:../core" + } + }, + "packages/llamaindex": { + "name": "@knolo/llamaindex", + "version": "0.0.0", + "dependencies": { + "@knolo/core": "file:../core" + } + }, + "packages/semantic-ollama": { + "name": "@knolo/semantic-ollama", + "version": "0.1.0", + "dependencies": { + "@knolo/core": "^3.2.1" + }, + "devDependencies": { + "typescript": "^5.5.0" + } + } + } +} diff --git a/package.json b/package.json index 8d3fdb4..86ffbed 100644 --- a/package.json +++ b/package.json @@ -9,6 +9,7 @@ "test": "npm run test --workspaces --if-present", "format": "npm run format --workspace @knolo/core", "format:check": "npm run format:check --workspace @knolo/core", - "knolo": "node ./packages/cli/bin/knolo.mjs" + "knolo": "node ./packages/cli/bin/knolo.mjs", + "test:rust": "cargo test --manifest-path packages/core-rust/Cargo.toml" } } diff --git a/packages/core-rust/.gitignore b/packages/core-rust/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/packages/core-rust/.gitignore @@ -0,0 +1 @@ +/target diff --git a/packages/core-rust/Cargo.lock b/packages/core-rust/Cargo.lock new file mode 100644 index 0000000..f4325fb --- /dev/null +++ b/packages/core-rust/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "knolo-core-rust" +version = "0.1.0" diff --git a/packages/core-rust/Cargo.toml b/packages/core-rust/Cargo.toml new file mode 100644 index 0000000..4d7ee19 --- /dev/null +++ b/packages/core-rust/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "knolo-core-rust" +version = "0.1.0" +edition = "2021" +license = "Apache-2.0" +description = "Rust runtime support for reading and querying .knolo packs" + +[lib] +name = "knolo_core_rust" +path = "src/lib.rs" diff --git a/packages/core-rust/README.md b/packages/core-rust/README.md new file mode 100644 index 0000000..e9e5a1e --- /dev/null +++ b/packages/core-rust/README.md @@ -0,0 +1,36 @@ +# knolo-core-rust + +Native Rust runtime support for Knolo `.knolo` packs. 
+
+## Example
+
+```rust
+use knolo_core_rust::{mount_pack_from_bytes, query, QueryOptions};
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let bytes: Vec<u8> = std::fs::read("knowledge.knolo")?;
+    let pack = mount_pack_from_bytes(&bytes)?;
+
+    let hits = query(
+        &pack,
+        "react native bridge throttling",
+        QueryOptions {
+            top_k: 5,
+            ..Default::default()
+        },
+    );
+
+    for hit in hits {
+        println!("{} => {}", hit.source.unwrap_or_default(), hit.score);
+    }
+    Ok(())
+}
+```
diff --git a/packages/core-rust/src/lib.rs b/packages/core-rust/src/lib.rs
new file mode 100644
index 0000000..108f0e6
--- /dev/null
+++ b/packages/core-rust/src/lib.rs
@@ -0,0 +1,505 @@
+use std::collections::{HashMap, HashSet};
+use std::error::Error;
+use std::fmt::{Display, Formatter};
+
+#[derive(Debug, Clone)]
+pub enum KnoloError {
+    InvalidPack(String),
+}
+
+impl Display for KnoloError {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        match self {
+            KnoloError::InvalidPack(msg) => write!(f, "invalid pack: {msg}"),
+        }
+    }
+}
+
+impl Error for KnoloError {}
+
+#[derive(Debug, Clone)]
+pub struct PackMeta {
+    pub version: u32,
+    pub stats: PackStats,
+}
+
+#[derive(Debug, Clone)]
+pub struct PackStats {
+    pub docs: usize,
+    pub blocks: usize,
+    pub terms: usize,
+    pub avg_block_len: Option<f64>,
+}
+
+#[derive(Debug, Clone)]
+pub struct Pack {
+    pub meta: PackMeta,
+    pub lexicon: HashMap<String, u32>,
+    pub postings: Vec<u32>,
+    pub blocks: Vec<String>,
+    pub headings: Vec<Option<String>>,
+    pub doc_ids: Vec<Option<String>>,
+    pub namespaces: Vec<Option<String>>,
+    pub block_token_lens: Vec<usize>,
+}
+
+#[derive(Debug, Clone)]
+pub struct QueryOptions {
+    pub top_k: usize,
+    pub min_score: f64,
+    pub namespace: Option<Vec<String>>,
+    pub source: Option<Vec<String>>,
+}
+
+impl Default for QueryOptions {
+    fn default() -> Self {
+        Self {
+            top_k: 10,
+            min_score: 0.0,
+            namespace: None,
+            source: None,
+        }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq)]
+pub struct Hit {
+    pub block_id: usize,
+    pub score: f64,
+    pub text: String,
+    pub source: Option<String>,
+    pub namespace: Option<String>,
+}
+
+pub fn mount_pack_from_bytes(bytes: &[u8]) -> Result<Pack, KnoloError> {
+    let mut cursor = 0usize;
+
+    let meta_len = read_u32(bytes, &mut cursor)? as usize;
+    let meta_json = read_slice(bytes, &mut cursor, meta_len)?;
+    let meta = parse_meta(std::str::from_utf8(meta_json).map_err(|_| KnoloError::InvalidPack("meta utf8".into()))?)?;
+
+    let lex_len = read_u32(bytes, &mut cursor)? as usize;
+    let lex_json = read_slice(bytes, &mut cursor, lex_len)?;
+    let lexicon = parse_lexicon(std::str::from_utf8(lex_json).map_err(|_| KnoloError::InvalidPack("lexicon utf8".into()))?)?;
+
+    let post_count = read_u32(bytes, &mut cursor)? as usize;
+    let postings = read_u32_array(bytes, &mut cursor, post_count)?;
+
+    let blocks_len = read_u32(bytes, &mut cursor)? as usize;
+    let blocks_json = read_slice(bytes, &mut cursor, blocks_len)?;
+    let blocks_str = std::str::from_utf8(blocks_json).map_err(|_| KnoloError::InvalidPack("blocks utf8".into()))?;
+    let parsed_blocks = parse_blocks(blocks_str)?;
+
+    Ok(Pack {
+        meta,
+        lexicon,
+        postings,
+        blocks: parsed_blocks.texts,
+        headings: parsed_blocks.headings,
+        doc_ids: parsed_blocks.doc_ids,
+        namespaces: parsed_blocks.namespaces,
+        block_token_lens: parsed_blocks.lens,
+    })
+}
+
+pub fn query(pack: &Pack, q: &str, opts: QueryOptions) -> Vec<Hit> {
+    if q.trim().is_empty() {
+        return vec![];
+    }
+    let tokens = tokenize(q);
+    if tokens.is_empty() {
+        return vec![];
+    }
+
+    let term_ids = tokens
+        .iter()
+        .filter_map(|t| pack.lexicon.get(t).copied())
+        .collect::<HashSet<u32>>();
+    if term_ids.is_empty() {
+        return vec![];
+    }
+
+    let namespace_filter = normalize_filter(opts.namespace.as_ref());
+    let source_filter = normalize_filter(opts.source.as_ref());
+
+    let mut candidates: HashMap<usize, HashMap<u32, f64>> = HashMap::new();
+    let mut dfs: HashMap<u32, usize> = HashMap::new();
+    let uses_offset_block_ids = pack.meta.version >= 3;
+
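+    // Walk the flat postings stream. Each term's section is: term_id, then
+    // one (block_id, positions..., 0) group per block, with a final 0 that
+    // closes the term's block list. v3 packs store block ids offset by +1
+    // so that 0 can stay a reserved terminator.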
+    let mut i = 0usize;
+    while i < pack.postings.len() {
+        let tid = pack.postings[i];
+        i += 1;
+        if tid == 0 {
+            continue;
+        }
+        let relevant = term_ids.contains(&tid);
+        let mut term_df = 0usize;
+
+        if i >= pack.postings.len() { break; }
+        let mut encoded_bid = pack.postings[i];
+        i += 1;
+
+        while encoded_bid != 0 && i < pack.postings.len() {
+            let bid = if uses_offset_block_ids {
+                encoded_bid.saturating_sub(1) as usize
+            } else {
+                encoded_bid as usize
+            };
+
+            let mut tf = 0usize;
+            while i < pack.postings.len() {
+                let pos = pack.postings[i];
+                i += 1;
+                if pos == 0 {
+                    break;
+                }
+                tf += 1;
+            }
+
+            term_df += 1;
+            if relevant && bid < pack.blocks.len() {
+                let entry = candidates.entry(bid).or_default();
+                *entry.entry(tid).or_insert(0.0) += tf as f64;
+            }
+
+            if i >= pack.postings.len() { break; }
+            encoded_bid = pack.postings[i];
+            i += 1;
+        }
+
+        if relevant {
+            dfs.insert(tid, term_df);
+        }
+    }
+
+    if !namespace_filter.is_empty() {
+        candidates.retain(|bid, _| {
+            pack.namespaces
+                .get(*bid)
+                .and_then(|n| n.clone())
+                .map(|n| namespace_filter.contains(&normalize(&n)))
+                .unwrap_or(false)
+        });
+    }
+
+    if !source_filter.is_empty() {
+        candidates.retain(|bid, _| {
+            pack.doc_ids
+                .get(*bid)
+                .and_then(|n| n.clone())
+                .map(|n| source_filter.contains(&normalize(&n)))
+                .unwrap_or(false)
+        });
+    }
+
+    let doc_count = pack.meta.stats.blocks.max(1) as f64;
+    let avg_len = pack
+        .meta
+        .stats
+        .avg_block_len
+        .unwrap_or_else(|| {
+            if pack.block_token_lens.is_empty() {
+                1.0
+            } else {
+                pack.block_token_lens.iter().sum::<usize>() as f64 / pack.block_token_lens.len() as f64
+            }
+        })
+        .max(1.0);
+
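+    // BM25-style scoring with k1 = 1.5 and b = 0.75:
+    //   idf = ln(1 + (N - df + 0.5) / (df + 0.5))
+    //   score += idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len / avg_len))
+    // where N is the block count and len is the block's token length.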
+    let mut scored = candidates
+        .into_iter()
+        .map(|(bid, tf_map)| {
+            let mut score = 0.0;
+            let len = *pack.block_token_lens.get(bid).unwrap_or(&1) as f64;
+            for (tid, tf) in tf_map {
+                let df = *dfs.get(&tid).unwrap_or(&0) as f64;
+                let idf = (1.0 + (doc_count - df + 0.5) / (df + 0.5)).ln();
+                let k1 = 1.5;
+                let b = 0.75;
+                let numer = tf * (k1 + 1.0);
+                let denom = tf + k1 * (1.0 - b + b * (len / avg_len));
+                score += idf * (numer / denom);
+            }
+            (bid, score)
+        })
+        .filter(|(_, score)| *score >= opts.min_score)
+        .collect::<Vec<_>>();
+
+    scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
+
+    scored
+        .into_iter()
+        .take(opts.top_k.max(1))
+        .map(|(bid, score)| Hit {
+            block_id: bid,
+            score,
+            text: pack.blocks.get(bid).cloned().unwrap_or_default(),
+            source: pack.doc_ids.get(bid).and_then(|s| s.clone()),
+            namespace: pack.namespaces.get(bid).and_then(|s| s.clone()),
+        })
+        .collect()
+}
+
+struct ParsedBlocks {
+    texts: Vec<String>,
+    headings: Vec<Option<String>>,
+    doc_ids: Vec<Option<String>>,
+    namespaces: Vec<Option<String>>,
+    lens: Vec<usize>,
+}
+
+fn parse_meta(json: &str) -> Result<PackMeta, KnoloError> {
+    Ok(PackMeta {
+        version: parse_u32_field(json, "version")?,
+        stats: PackStats {
+            docs: parse_u32_field(json, "docs")? as usize,
+            blocks: parse_u32_field(json, "blocks")? as usize,
+            terms: parse_u32_field(json, "terms")? as usize,
+            avg_block_len: parse_f64_field(json, "avgBlockLen"),
+        },
+    })
+}
+
+fn parse_lexicon(json: &str) -> Result<HashMap<String, u32>, KnoloError> {
+    let mut map = HashMap::new();
+    let s = compact(json);
+    let mut i = 0usize;
+    while let Some(start) = s[i..].find("[\"") {
+        let abs = i + start + 2;
+        let rest = &s[abs..];
+        let end = rest.find('"').ok_or_else(|| KnoloError::InvalidPack("lexicon key".into()))?;
+        let key = rest[..end].to_string();
+        let rest2 = &rest[end + 1..];
+        let comma = rest2.find(',').ok_or_else(|| KnoloError::InvalidPack("lexicon comma".into()))?;
+        let rest3 = &rest2[comma + 1..];
+        let mut n = String::new();
+        for ch in rest3.chars() {
+            if ch.is_ascii_digit() {
+                n.push(ch);
+            } else {
+                break;
+            }
+        }
+        if !n.is_empty() {
+            map.insert(key, n.parse::<u32>().map_err(|_| KnoloError::InvalidPack("lexicon tid".into()))?);
+        }
+        i = abs + end + 1;
+    }
+    Ok(map)
+}
+
+fn parse_blocks(json: &str) -> Result<ParsedBlocks, KnoloError> {
+    let s = compact(json);
+    if s.starts_with("[\"") {
+        let mut texts = Vec::new();
+        let mut i = 2usize;
+        while i < s.len() {
+            if let Some(end) = s[i..].find('"') {
+                let piece = &s[i..i + end];
+                texts.push(unescape(piece));
+                i += end + 1;
+                if let Some(next) = s[i..].find('"') {
+                    i += next + 1;
+                } else {
+                    break;
+                }
+            } else {
+                break;
+            }
+        }
+        let lens = texts.iter().map(|t| tokenize(t).len()).collect::<Vec<_>>();
+        return Ok(ParsedBlocks {
+            headings: vec![None; texts.len()],
+            doc_ids: vec![None; texts.len()],
+            namespaces: vec![None; texts.len()],
+            lens,
+            texts,
+        });
+    }
+
+    let objects = split_top_level_objects(&s)?;
+    let mut texts = Vec::new();
+    let mut headings = Vec::new();
+    let mut doc_ids = Vec::new();
+    let mut namespaces = Vec::new();
+    let mut lens = Vec::new();
+
+    for obj in objects {
+        let text = parse_string_or_null(&obj, "text").unwrap_or_default();
+        let len = parse_u32_field_optional(&obj, "len").map(|v| v as usize).unwrap_or_else(|| tokenize(&text).len());
+        texts.push(text);
+        headings.push(parse_string_or_null(&obj, "heading"));
+        doc_ids.push(parse_string_or_null(&obj, "docId"));
+        namespaces.push(parse_string_or_null(&obj, "namespace"));
+        lens.push(len);
+    }
+
+    Ok(ParsedBlocks { texts, headings, doc_ids, namespaces, lens })
+}
+
+fn split_top_level_objects(s: &str) -> Result<Vec<String>, KnoloError> {
+    let mut out = Vec::new();
+    let mut depth = 0i32;
+    let mut start = None;
+    let chars: Vec<char> = s.chars().collect();
+    for (i, ch) in chars.iter().enumerate() {
+        if *ch == '{' {
+            if depth == 0 {
+                start = Some(i);
+            }
+            depth += 1;
+        } else if *ch == '}' {
+            depth -= 1;
+            if depth == 0 {
+                if let Some(st) = start {
+                    out.push(chars[st..=i].iter().collect());
+                }
+                start = None;
+            }
+        }
+    }
+    if out.is_empty() {
+        return Err(KnoloError::InvalidPack("blocks objects".into()));
+    }
+    Ok(out)
+}
+
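+// NOTE: the scanners here and the `parse_*` helpers above are deliberately
+// minimal rather than a general JSON parser; they assume well-formed pack
+// JSON with unique keys and only `\"` escapes inside string values.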
+fn parse_string_or_null(obj: &str, key: &str) -> Option<String> {
+    let needle = format!("\"{}\":", key);
+    let idx = obj.find(&needle)? + needle.len();
+    let tail = &obj[idx..];
+    if tail.starts_with("null") {
+        return None;
+    }
+    if !tail.starts_with('"') {
+        return None;
+    }
+    let rest = &tail[1..];
+    let end = rest.find('"')?;
+    Some(unescape(&rest[..end]))
+}
+
+fn parse_u32_field(json: &str, key: &str) -> Result<u32, KnoloError> {
+    parse_u32_field_optional(json, key).ok_or_else(|| KnoloError::InvalidPack(format!("missing {key}")))
+}
+
+fn parse_u32_field_optional(json: &str, key: &str) -> Option<u32> {
+    let needle = format!("\"{}\":", key);
+    let idx = json.find(&needle)? + needle.len();
+    let tail = &json[idx..];
+    let mut n = String::new();
+    for ch in tail.chars() {
+        if ch.is_ascii_digit() {
+            n.push(ch);
+        } else if !n.is_empty() {
+            break;
+        }
+    }
+    n.parse().ok()
+}
+
+fn parse_f64_field(json: &str, key: &str) -> Option<f64> {
+    let needle = format!("\"{}\":", key);
+    let idx = json.find(&needle)? + needle.len();
+    let tail = &json[idx..];
+    let mut n = String::new();
+    for ch in tail.chars() {
+        if ch.is_ascii_digit() || ch == '.' {
+            n.push(ch);
+        } else if !n.is_empty() {
+            break;
+        }
+    }
+    n.parse().ok()
+}
+
+fn normalize_filter(values: Option<&Vec<String>>) -> HashSet<String> {
+    values
+        .map(|arr| arr.iter().map(|s| normalize(s)).collect::<HashSet<_>>())
+        .unwrap_or_default()
+}
+
+fn normalize(s: &str) -> String {
+    s.to_lowercase().trim().to_string()
+}
+
+fn tokenize(text: &str) -> Vec<String> {
+    let mut out = Vec::new();
+    let mut cur = String::new();
+    for ch in text.chars() {
+        if ch.is_alphanumeric() {
+            cur.push(ch.to_ascii_lowercase());
+        } else if !cur.is_empty() {
+            out.push(std::mem::take(&mut cur));
+        }
+    }
+    if !cur.is_empty() {
+        out.push(cur);
+    }
+    out
+}
+
+fn compact(s: &str) -> String {
+    // Drop whitespace outside JSON string literals only; whitespace inside
+    // quoted values (block text, ids) is significant and must survive.
+    let mut out = String::with_capacity(s.len());
+    let mut in_string = false;
+    let mut escaped = false;
+    for ch in s.chars() {
+        if in_string {
+            out.push(ch);
+            if escaped {
+                escaped = false;
+            } else if ch == '\\' {
+                escaped = true;
+            } else if ch == '"' {
+                in_string = false;
+            }
+        } else if !ch.is_whitespace() {
+            if ch == '"' {
+                in_string = true;
+            }
+            out.push(ch);
+        }
+    }
+    out
+}
+
+fn unescape(s: &str) -> String {
+    s.replace("\\\"", "\"")
+}
+
+fn read_u32(bytes: &[u8], cursor: &mut usize) -> Result<u32, KnoloError> {
+    let chunk = read_slice(bytes, cursor, 4)?;
+    Ok(u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
+}
+
+fn read_u32_array(bytes: &[u8], cursor: &mut usize, len: usize) -> Result<Vec<u32>, KnoloError> {
+    let mut out = Vec::with_capacity(len);
+    for _ in 0..len {
+        out.push(read_u32(bytes, cursor)?);
+    }
+    Ok(out)
+}
+
+fn read_slice<'a>(bytes: &'a [u8], cursor: &mut usize, len: usize) -> Result<&'a [u8], KnoloError> {
+    let end = cursor.saturating_add(len);
+    if end > bytes.len() {
+        return Err(KnoloError::InvalidPack("unexpected end-of-buffer".into()));
+    }
+    let slice = &bytes[*cursor..end];
+    *cursor = end;
+    Ok(slice)
+}
diff --git a/packages/core-rust/tests/core_rust_test.rs b/packages/core-rust/tests/core_rust_test.rs
new file mode 100644
index 0000000..7f3b3db
--- /dev/null
+++ b/packages/core-rust/tests/core_rust_test.rs
@@ -0,0 +1,105 @@
+use knolo_core_rust::{mount_pack_from_bytes, query, QueryOptions};
+
+fn build_test_pack_bytes() -> Vec<u8> {
+    let meta = b"{\"version\":3,\"stats\":{\"docs\":2,\"blocks\":2,\"terms\":4,\"avgBlockLen\":2.5}}".to_vec();
+    let lexicon = b"[[\"alpha\",1],[\"beta\",2],[\"gamma\",3],[\"delta\",4]]".to_vec();
+
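+    // Postings stream, one section per term: term_id, then per block a
+    // ((block_id + 1), positions..., 0) group, closed by a final 0.
+    // e.g. "2, 1, 2, 0, 2, 1, 0, 0": term 2 in block 0 (pos 2) and block 1 (pos 1).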
+    let postings: Vec<u32> = vec![
+        1, 1, 1, 0, 0,
+        2, 1, 2, 0, 2, 1, 0, 0,
+        3, 1, 3, 0, 0,
+        4, 2, 2, 0, 0,
+    ];
+
+    let blocks = b"[{\"text\":\"alpha beta gamma\",\"heading\":\"A\",\"docId\":\"a\",\"namespace\":\"docs\",\"len\":3},{\"text\":\"beta delta\",\"heading\":\"B\",\"docId\":\"b\",\"namespace\":\"guides\",\"len\":2}]".to_vec();
+
+    let mut out = Vec::new();
+    push_section(&mut out, &meta);
+    push_section(&mut out, &lexicon);
+    out.extend_from_slice(&(postings.len() as u32).to_le_bytes());
+    for p in postings {
+        out.extend_from_slice(&p.to_le_bytes());
+    }
+    push_section(&mut out, &blocks);
+    out
+}
+
+fn push_section(out: &mut Vec<u8>, bytes: &[u8]) {
+    out.extend_from_slice(&(bytes.len() as u32).to_le_bytes());
+    out.extend_from_slice(bytes);
+}
+
+#[test]
+fn mounts_pack_and_exposes_meta() {
+    let bytes = build_test_pack_bytes();
+    let pack = mount_pack_from_bytes(&bytes).expect("mount should succeed");
+
+    assert_eq!(pack.meta.version, 3);
+    assert_eq!(pack.meta.stats.blocks, 2);
+    assert_eq!(pack.blocks.len(), 2);
+    assert_eq!(pack.doc_ids[0].as_deref(), Some("a"));
+    assert_eq!(pack.namespaces[1].as_deref(), Some("guides"));
+}
+
+#[test]
+fn lexical_query_returns_expected_rank() {
+    let bytes = build_test_pack_bytes();
+    let pack = mount_pack_from_bytes(&bytes).expect("mount should succeed");
+
+    let hits = query(
+        &pack,
+        "alpha beta",
+        QueryOptions {
+            top_k: 2,
+            ..Default::default()
+        },
+    );
+
+    assert_eq!(hits.len(), 2);
+    assert_eq!(hits[0].source.as_deref(), Some("a"));
+    assert_eq!(hits[1].source.as_deref(), Some("b"));
+    assert!(hits[0].score > hits[1].score);
+}
+
+#[test]
+fn namespace_filter_works() {
+    let bytes = build_test_pack_bytes();
+    let pack = mount_pack_from_bytes(&bytes).expect("mount should succeed");
+
+    let hits = query(
+        &pack,
+        "beta",
+        QueryOptions {
+            top_k: 5,
+            namespace: Some(vec!["docs".to_string()]),
+            ..Default::default()
+        },
+    );
+
+    assert_eq!(hits.len(), 1);
+    assert_eq!(hits[0].namespace.as_deref(), Some("docs"));
+}
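+
+// Added sketch: exercises the `source` filter the same way the namespace
+// test above does; "b" matches the second test block's docId.
+#[test]
+fn source_filter_works() {
+    let bytes = build_test_pack_bytes();
+    let pack = mount_pack_from_bytes(&bytes).expect("mount should succeed");
+
+    let hits = query(
+        &pack,
+        "beta",
+        QueryOptions {
+            top_k: 5,
+            source: Some(vec!["b".to_string()]),
+            ..Default::default()
+        },
+    );
+
+    assert_eq!(hits.len(), 1);
+    assert_eq!(hits[0].source.as_deref(), Some("b"));
+}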