Skip to content

Commit ee43cb6

Browse files
authored
perf: optimize map_default_isolates
1 parent f1ce2c0 commit ee43cb6

File tree

4 files changed

+530
-64
lines changed

4 files changed

+530
-64
lines changed

src/lib.rs

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@ mod coverage;
33
mod matrix;
44
mod em;
55
mod parse_sam;
6+
mod stream_processor;
67

78
use subtraction::eliminate_subtraction;
89
use matrix::build_matrix;
910
use em::{em, compute_best_hit};
1011
use pyo3::exceptions::PyIOError;
1112
use pyo3::prelude::*;
12-
use std::collections::HashMap;
13+
use std::collections::{HashMap, HashSet};
1314

1415
// Type aliases for complex HashMap types used throughout the codebase
1516
pub type UniqueReads = HashMap<i32, (i32, f64)>;
@@ -54,6 +55,8 @@ fn rust(_py: Python, m: &PyModule) -> PyResult<()> {
5455
m.add_function(wrap_pyfunction!(run_expectation_maximization, m)?)?;
5556
m.add_function(wrap_pyfunction!(run_eliminate_subtraction, m)?)?;
5657
m.add_function(wrap_pyfunction!(calculate_coverage_from_em_results, m)?)?;
58+
m.add_function(wrap_pyfunction!(find_candidate_otus, m)?)?;
59+
m.add_function(wrap_pyfunction!(find_candidate_otus_from_bytes, m)?)?;
5760
Ok(())
5861
}
5962

@@ -206,8 +209,48 @@ pub fn run_expectation_maximization(
206209
})
207210
}
208211

212+
#[pyfunction]
213+
/// Extract candidate OTU reference IDs from a SAM/BAM file
214+
///
215+
/// This function replaces the Python line-by-line processing in map_default_isolates
216+
/// with high-performance Rust processing. It reads a SAM/BAM file and extracts
217+
/// reference IDs for reads that meet the score cutoff.
218+
///
219+
/// # Arguments
220+
/// * `sam_path` - Path to the SAM/BAM file to process
221+
/// * `p_score_cutoff` - Minimum score threshold (AS:i score + read length)
222+
///
223+
/// # Returns
224+
/// Set of reference IDs that have reads meeting the score cutoff
225+
pub fn find_candidate_otus(
226+
_py: Python,
227+
sam_path: String,
228+
p_score_cutoff: f64,
229+
) -> PyResult<HashSet<String>> {
230+
stream_processor::extract_candidate_otus_from_sam_file(&sam_path, p_score_cutoff)
231+
.map_err(|e| PyErr::new::<PyIOError, _>(e.to_string()))
232+
}
209233

210-
234+
#[pyfunction]
235+
/// Extract candidate OTU reference IDs from SAM text data
236+
///
237+
/// This function parses SAM format data directly from bytes without using rust-htslib.
238+
/// It provides memory-based processing that doesn't require temporary files or unsafe code.
239+
///
240+
/// # Arguments
241+
/// * `sam_bytes` - Raw SAM format data as bytes (typically from subprocess stdout)
242+
/// * `p_score_cutoff` - Minimum score threshold (AS:i score + read length)
243+
///
244+
/// # Returns
245+
/// Set of reference IDs that have reads meeting the score cutoff
246+
pub fn find_candidate_otus_from_bytes(
247+
_py: Python,
248+
sam_bytes: &[u8],
249+
p_score_cutoff: f64,
250+
) -> PyResult<HashSet<String>> {
251+
stream_processor::extract_candidate_otus_from_bytes(sam_bytes, p_score_cutoff)
252+
.map_err(|e| PyErr::new::<PyIOError, _>(e.to_string()))
253+
}
211254

212255

213256

0 commit comments

Comments
 (0)