@@ -3,13 +3,14 @@ mod coverage;
33mod matrix;
44mod em;
55mod parse_sam;
6+ mod stream_processor;
67
78use subtraction:: eliminate_subtraction;
89use matrix:: build_matrix;
910use em:: { em, compute_best_hit} ;
1011use pyo3:: exceptions:: PyIOError ;
1112use pyo3:: prelude:: * ;
12- use std:: collections:: HashMap ;
13+ use std:: collections:: { HashMap , HashSet } ;
1314
1415// Type aliases for complex HashMap types used throughout the codebase
1516pub type UniqueReads = HashMap < i32 , ( i32 , f64 ) > ;
@@ -54,6 +55,8 @@ fn rust(_py: Python, m: &PyModule) -> PyResult<()> {
5455 m. add_function ( wrap_pyfunction ! ( run_expectation_maximization, m) ?) ?;
5556 m. add_function ( wrap_pyfunction ! ( run_eliminate_subtraction, m) ?) ?;
5657 m. add_function ( wrap_pyfunction ! ( calculate_coverage_from_em_results, m) ?) ?;
58+ m. add_function ( wrap_pyfunction ! ( find_candidate_otus, m) ?) ?;
59+ m. add_function ( wrap_pyfunction ! ( find_candidate_otus_from_bytes, m) ?) ?;
5760 Ok ( ( ) )
5861}
5962
@@ -206,8 +209,48 @@ pub fn run_expectation_maximization(
206209 } )
207210}
208211
212+ #[ pyfunction]
213+ /// Extract candidate OTU reference IDs from a SAM/BAM file
214+ ///
215+ /// This function replaces the Python line-by-line processing in map_default_isolates
216+ /// with high-performance Rust processing. It reads a SAM/BAM file and extracts
217+ /// reference IDs for reads that meet the score cutoff.
218+ ///
219+ /// # Arguments
220+ /// * `sam_path` - Path to the SAM/BAM file to process
221+ /// * `p_score_cutoff` - Minimum score threshold (AS:i score + read length)
222+ ///
223+ /// # Returns
224+ /// Set of reference IDs that have reads meeting the score cutoff
225+ pub fn find_candidate_otus (
226+ _py : Python ,
227+ sam_path : String ,
228+ p_score_cutoff : f64 ,
229+ ) -> PyResult < HashSet < String > > {
230+ stream_processor:: extract_candidate_otus_from_sam_file ( & sam_path, p_score_cutoff)
231+ . map_err ( |e| PyErr :: new :: < PyIOError , _ > ( e. to_string ( ) ) )
232+ }
209233
210-
234+ #[ pyfunction]
235+ /// Extract candidate OTU reference IDs from SAM text data
236+ ///
237+ /// This function parses SAM format data directly from bytes without using rust-htslib.
238+ /// It provides memory-based processing that doesn't require temporary files or unsafe code.
239+ ///
240+ /// # Arguments
241+ /// * `sam_bytes` - Raw SAM format data as bytes (typically from subprocess stdout)
242+ /// * `p_score_cutoff` - Minimum score threshold (AS:i score + read length)
243+ ///
244+ /// # Returns
245+ /// Set of reference IDs that have reads meeting the score cutoff
246+ pub fn find_candidate_otus_from_bytes (
247+ _py : Python ,
248+ sam_bytes : & [ u8 ] ,
249+ p_score_cutoff : f64 ,
250+ ) -> PyResult < HashSet < String > > {
251+ stream_processor:: extract_candidate_otus_from_bytes ( sam_bytes, p_score_cutoff)
252+ . map_err ( |e| PyErr :: new :: < PyIOError , _ > ( e. to_string ( ) ) )
253+ }
211254
212255
213256
0 commit comments