@@ -183,48 +183,83 @@ pub fn parse_isolate_scores(
183183 p_score_cutoff : f64 ,
184184) -> PyResult < HashMap < String , f64 > > {
185185 use rust_htslib:: { bam, bam:: Read } ;
186+ use std:: time:: Instant ;
186187
187188 info ! ( "parsing isolate scores from {} with cutoff {}" , alignment_path, p_score_cutoff) ;
188189
189190 // Release the GIL during the CPU-intensive BAM file processing
190191 py. allow_threads ( || {
192+ let start = Instant :: now ( ) ;
193+
191194 let mut reader = bam:: Reader :: from_path ( & alignment_path)
192195 . map_err ( |e| PyErr :: new :: < PyIOError , _ > ( format ! ( "Failed to open alignment file '{}': {}" , alignment_path, e) ) ) ?;
193196
194- let mut isolate_high_scores: HashMap < String , f64 > = HashMap :: new ( ) ;
197+ // Pre-allocate HashMap capacity based on typical BAM files
198+ // For 11M records, assuming ~30% pass cutoff = ~3.3M unique reads
199+ let estimated_capacity = 3_500_000 ;
200+ let mut isolate_high_scores: HashMap < String , f64 > = HashMap :: with_capacity ( estimated_capacity) ;
201+
202+ info ! ( "initialized hashmap with capacity {}" , estimated_capacity) ;
203+
204+ let mut total_records = 0u64 ;
205+ let mut mapped_records = 0u64 ;
206+ let mut passed_cutoff = 0u64 ;
207+ let mut last_log = Instant :: now ( ) ;
195208
196209 for result in reader. records ( ) {
197- let record = result. map_err ( |e| PyErr :: new :: < PyIOError , _ > ( format ! ( "Error reading record: {}" , e) ) ) ?;
210+ total_records += 1 ;
211+
212+ if total_records % 1_000_000 == 0 {
213+ let elapsed = last_log. elapsed ( ) ;
214+ let rate = 1_000_000.0 / elapsed. as_secs_f64 ( ) ;
215+ info ! (
216+ "processed {}m records ({} mapped, {} passed cutoff) - {:.0} records/sec" ,
217+ total_records / 1_000_000 ,
218+ mapped_records,
219+ passed_cutoff,
220+ rate
221+ ) ;
222+ last_log = Instant :: now ( ) ;
223+ }
224+
225+ let record = result. map_err ( |e| PyErr :: new :: < PyIOError , _ > ( format ! ( "error reading record: {}" , e) ) ) ?;
198226
199- // Skip unmapped reads
200227 if record. is_unmapped ( ) {
201228 continue ;
202229 }
230+ mapped_records += 1 ;
203231
204- // Get read ID (qname)
205232 let read_id = std:: str:: from_utf8 ( record. qname ( ) )
206- . map_err ( |e| PyErr :: new :: < PyIOError , _ > ( format ! ( "Invalid UTF -8 in read ID : {}" , e) ) ) ?
233+ . map_err ( |e| PyErr :: new :: < PyIOError , _ > ( format ! ( "invalid utf -8 in read id : {}" , e) ) ) ?
207234 . to_string ( ) ;
208235
209- // Get read length (unused but kept for potential future use)
210- let _read_length = record. seq_len ( ) ;
211-
212- // Get alignment score using shared function
213236 let total_score = match parse_sam:: extract_alignment_score ( & record) {
214237 Some ( score) => score,
215- None => continue , // Skip records without valid AS field
238+ None => continue ,
216239 } ;
217240
218- // Apply score cutoff
219241 if total_score >= p_score_cutoff {
220- // Track highest score for this read
221- if total_score > * isolate_high_scores. get ( & read_id) . unwrap_or ( & 0.0 ) {
222- isolate_high_scores. insert ( read_id, total_score) ;
223- }
242+ passed_cutoff += 1 ;
243+
244+ isolate_high_scores
245+ . entry ( read_id)
246+ . and_modify ( |existing_score| {
247+ if total_score > * existing_score {
248+ * existing_score = total_score;
249+ }
250+ } )
251+ . or_insert ( total_score) ;
224252 }
225253 }
226254
227- info ! ( "parsed {} isolate scores" , isolate_high_scores. len( ) ) ;
255+ let elapsed = start. elapsed ( ) ;
256+ info ! (
257+ "parsed {} isolate scores from {} total records in {:.2}s ({:.0} records/sec)" ,
258+ isolate_high_scores. len( ) ,
259+ total_records,
260+ elapsed. as_secs_f64( ) ,
261+ total_records as f64 / elapsed. as_secs_f64( )
262+ ) ;
228263 Ok ( isolate_high_scores)
229264 } )
230265}
0 commit comments