Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions helix-db/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,18 @@ helix-metrics = { path = "../metrics" }
# external dependencies
tokio = { version = "1.44.2", features = ["full"] }
serde = { version = "1.0.217", features = ["derive"] }
bincode = "1.3.3" # TODO: Figure out bincode 2 impl with current serde impl
bincode = "1.3.3" # TODO: Figure out bincode 2 impl with current serde impl
sonic-rs = "0.5.0"
inventory = "0.3.16"
twox-hash = "2.1.0"
heed3 = "0.22.0"
uuid = { version = "1.12.1", features = ["v4", "v6", "fast-rng"] }
rand = "0.9.0"
chrono = "0.4.39"
flume = { version = "0.11.1", default-features = false, features = ["async", "select"] }
flume = { version = "0.11.1", default-features = false, features = [
"async",
"select",
] }
itertools = "0.14.0"
tempfile = "3.20.0"
paste = "1.0.15"
Expand Down Expand Up @@ -57,12 +60,8 @@ num_cpus = "1.17" # TODO:
[features]
debug-output = ["helix-macros/debug-output"]
compiler = ["pest", "pest_derive"]

# vector features
cosine = []

build = ["compiler"]
vectors = ["cosine", "url"]
vectors = ["url"]
server = ["build", "compiler", "vectors", "reqwest"]
full = ["build", "compiler", "vectors"]
dev = ["debug-output", "server"]
Expand Down
41 changes: 21 additions & 20 deletions helix-db/benches/hnsw_benches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,17 @@
mod tests {
use heed3::{Env, EnvOpenOptions, RoTxn};
use helix_db::{
helix_engine::vector_core::{
helix_engine::{traversal_core::config::SimilarityMethod, vector_core::{
hnsw::HNSW,
vector::HVector,
vector_core::{HNSWConfig, VectorCore},
},
}},
utils::tqdm::tqdm,
};
use polars::prelude::*;
use rand::{
prelude::SliceRandom,
Rng,
};
use rand::{Rng, prelude::SliceRandom};
use std::{
collections::{HashSet, HashMap},
collections::{HashMap, HashSet},
fs::{self, File},
sync::{Arc, Mutex},
thread,
Expand Down Expand Up @@ -84,30 +81,27 @@ mod tests {
.iter()
.filter_map(|base_vec| {
query_hvector
.distance_to(base_vec)
.map(|dist| (base_vec.id.clone(), dist))
.distance_to(base_vec, &SimilarityMethod::default())
.map(|dist| (base_vec.id.clone(), *dist))
.ok()
})
.collect();
.collect();

distances.sort_by(|a, b| {
a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)
});

let top_k_ids: Vec<u128> = distances
.into_iter()
.take(k)
.map(|(id, _)| id)
.collect();
let top_k_ids: Vec<u128> =
distances.into_iter().take(k).map(|(id, _)| id).collect();

(query_id, top_k_ids)
})
.collect();
.collect();

results.lock().unwrap().extend(local_results);
})
})
.collect();
.collect();

for handle in handles {
handle.join().unwrap();
Expand Down Expand Up @@ -312,7 +306,13 @@ mod tests {

let env = setup_temp_env();
let mut txn = env.write_txn().unwrap();
let index = VectorCore::new(&env, &mut txn, HNSWConfig::new(None, None, None)).unwrap();
let index = VectorCore::new(
&env,
&mut txn,
HNSWConfig::new(None, None, None),
Some(SimilarityMethod::default()),
)
.unwrap();
let mut total_insertion_time = std::time::Duration::from_secs(0);

let mut base_all_vectors: Vec<HVector> = Vec::new();
Expand Down Expand Up @@ -354,7 +354,9 @@ mod tests {
let mut total_search_time = std::time::Duration::from_secs(0);
for (qid, query) in query_vectors.iter() {
let start_time = Instant::now();
let results = index.search::<Filter>(&txn, query, k, "vector", None, false).unwrap();
let results = index
.search::<Filter>(&txn, query, k, "vector", None, false)
.unwrap();
let search_duration = start_time.elapsed();
total_search_time += search_duration;

Expand Down Expand Up @@ -400,4 +402,3 @@ mod tests {
}

// TODO: memory benchmark (only the hnsw index ofc)

45 changes: 22 additions & 23 deletions helix-db/src/helix_engine/bm25/bm25.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
use crate::{
debug_println,
helix_engine::{
storage_core::HelixGraphStorage,
types::GraphError,
vector_core::{hnsw::HNSW, vector::HVector},
},
protocol::value::Value,
debug_println,
};

use heed3::{types::*, Database, Env, RoTxn, RwTxn};
use heed3::{Database, Env, RoTxn, RwTxn, types::*};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use tokio::task;
Expand Down Expand Up @@ -82,10 +82,10 @@ impl HBM25Config {

let doc_lengths_db: Database<U128<heed3::byteorder::BE>, U32<heed3::byteorder::BE>> =
graph_env
.database_options()
.types::<U128<heed3::byteorder::BE>, U32<heed3::byteorder::BE>>()
.name(DB_BM25_DOC_LENGTHS)
.create(wtxn)?;
.database_options()
.types::<U128<heed3::byteorder::BE>, U32<heed3::byteorder::BE>>()
.name(DB_BM25_DOC_LENGTHS)
.create(wtxn)?;

let term_frequencies_db: Database<Bytes, U32<heed3::byteorder::BE>> = graph_env
.database_options()
Expand Down Expand Up @@ -188,7 +188,7 @@ impl BM25 for HBM25Config {
let current_df = self.term_frequencies_db.get(txn, term_bytes)?.unwrap_or(0);
self.term_frequencies_db
.put(txn, term_bytes, &(current_df + 1))?;
}
}

let mut metadata = if let Some(data) = self.metadata_db.get(txn, METADATA_KEY)? {
bincode::deserialize::<BM25Metadata>(data)?
Expand Down Expand Up @@ -404,7 +404,6 @@ impl HybridSearch for HelixGraphStorage {

let graph_env_bm25 = self.graph_env.clone();
let graph_env_vector = self.graph_env.clone();

let bm25_handle = task::spawn_blocking(move || -> Result<Vec<(u128, f32)>, GraphError> {
let txn = graph_env_bm25.read_txn()?;
match self.bm25.as_ref() {
Expand All @@ -413,18 +412,19 @@ impl HybridSearch for HelixGraphStorage {
}
});

let vector_handle = task::spawn_blocking(move || -> Result<Option<Vec<HVector>>, GraphError> {
let txn = graph_env_vector.read_txn()?;
let results = self.vectors.search::<fn(&HVector, &RoTxn) -> bool>(
&txn,
&query_vector_owned,
limit * 2,
"vector",
None,
false,
)?;
Ok(Some(results))
});
let vector_handle =
task::spawn_blocking(move || -> Result<Option<Vec<HVector>>, GraphError> {
let txn = graph_env_vector.read_txn()?;
let results = self.vectors.search::<fn(&HVector, &RoTxn) -> bool>(
&txn,
&query_vector_owned,
limit * 2,
"vector",
None,
false,
)?;
Ok(Some(results))
});

let (bm25_results, vector_results) = match tokio::try_join!(bm25_handle, vector_handle) {
Ok((a, b)) => (a, b),
Expand All @@ -441,13 +441,13 @@ impl HybridSearch for HelixGraphStorage {
if let Some(vector_results) = vector_results? {
for doc in vector_results {
let doc_id = doc.id;
let score = doc.distance.unwrap_or(0.0);
let score = *doc.distance;
let similarity = (1.0 / (1.0 + score)) as f32;
combined_scores
.entry(doc_id)
.and_modify(|existing_score| *existing_score += (1.0 - alpha) * similarity)
.or_insert((1.0 - alpha) * similarity); // correction made here from score as f32 to similarity
}
}
}

let mut results = combined_scores.into_iter().collect::<Vec<(u128, f32)>>();
Expand Down Expand Up @@ -475,4 +475,3 @@ impl BM25Flatten for HashMap<String, Value> {
})
}
}

9 changes: 2 additions & 7 deletions helix-db/src/helix_engine/storage_core/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use crate::{
storage_methods::{DBMethods, StorageMethods},
version_info::VersionInfo,
},
traversal_core::config::Config,
traversal_core::config::{Config},
types::GraphError,
vector_core::{
hnsw::HNSW,
Expand Down Expand Up @@ -41,13 +41,11 @@ pub type EdgeId = u128;

pub struct StorageConfig {
pub schema: String,
pub graphvis_node_label: Option<String>,
pub embedding_model: Option<String>,
}

pub struct HelixGraphStorage {
pub graph_env: Env,

pub nodes_db: Database<U128<BE>, Bytes>,
pub edges_db: Database<U128<BE>, Bytes>,
pub out_edges_db: Database<Bytes, Bytes>,
Expand All @@ -56,7 +54,6 @@ pub struct HelixGraphStorage {
pub vectors: VectorCore,
pub bm25: Option<HBM25Config>,
pub version_info: VersionInfo,

pub storage_config: StorageConfig,
}

Expand Down Expand Up @@ -152,6 +149,7 @@ impl HelixGraphStorage {
vector_config.ef_construction,
vector_config.ef_search,
),
vector_config.vector_similarity,
)?;

let bm25 = config
Expand All @@ -161,7 +159,6 @@ impl HelixGraphStorage {

let storage_config = StorageConfig::new(
config.schema.unwrap_or("".to_string()),
config.graphvis_node_label,
config.embedding_model,
);

Expand Down Expand Up @@ -265,12 +262,10 @@ impl HelixGraphStorage {
impl StorageConfig {
pub fn new(
schema: String,
graphvis_node_label: Option<String>,
embedding_model: Option<String>,
) -> StorageConfig {
Self {
schema,
graphvis_node_label,
embedding_model,
}
}
Expand Down
Loading
Loading