Skip to content

Commit bf300aa

Browse files
authored
perf: optimize isolate bam parsing
1 parent ef46cf8 commit bf300aa

File tree

1 file changed

+51
-16
lines changed

1 file changed

+51
-16
lines changed

src/lib.rs

Lines changed: 51 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -183,48 +183,83 @@ pub fn parse_isolate_scores(
183183
p_score_cutoff: f64,
184184
) -> PyResult<HashMap<String, f64>> {
185185
use rust_htslib::{bam, bam::Read};
186+
use std::time::Instant;
186187

187188
info!("parsing isolate scores from {} with cutoff {}", alignment_path, p_score_cutoff);
188189

189190
// Release the GIL during the CPU-intensive BAM file processing
190191
py.allow_threads(|| {
192+
let start = Instant::now();
193+
191194
let mut reader = bam::Reader::from_path(&alignment_path)
192195
.map_err(|e| PyErr::new::<PyIOError, _>(format!("Failed to open alignment file '{}': {}", alignment_path, e)))?;
193196

194-
let mut isolate_high_scores: HashMap<String, f64> = HashMap::new();
197+
// Pre-allocate HashMap capacity based on typical BAM files
198+
// For 11M records, assuming ~30% pass cutoff = ~3.3M unique reads
199+
let estimated_capacity = 3_500_000;
200+
let mut isolate_high_scores: HashMap<String, f64> = HashMap::with_capacity(estimated_capacity);
201+
202+
info!("initialized hashmap with capacity {}", estimated_capacity);
203+
204+
let mut total_records = 0u64;
205+
let mut mapped_records = 0u64;
206+
let mut passed_cutoff = 0u64;
207+
let mut last_log = Instant::now();
195208

196209
for result in reader.records() {
197-
let record = result.map_err(|e| PyErr::new::<PyIOError, _>(format!("Error reading record: {}", e)))?;
210+
total_records += 1;
211+
212+
if total_records % 1_000_000 == 0 {
213+
let elapsed = last_log.elapsed();
214+
let rate = 1_000_000.0 / elapsed.as_secs_f64();
215+
info!(
216+
"processed {}m records ({} mapped, {} passed cutoff) - {:.0} records/sec",
217+
total_records / 1_000_000,
218+
mapped_records,
219+
passed_cutoff,
220+
rate
221+
);
222+
last_log = Instant::now();
223+
}
224+
225+
let record = result.map_err(|e| PyErr::new::<PyIOError, _>(format!("error reading record: {}", e)))?;
198226

199-
// Skip unmapped reads
200227
if record.is_unmapped() {
201228
continue;
202229
}
230+
mapped_records += 1;
203231

204-
// Get read ID (qname)
205232
let read_id = std::str::from_utf8(record.qname())
206-
.map_err(|e| PyErr::new::<PyIOError, _>(format!("Invalid UTF-8 in read ID: {}", e)))?
233+
.map_err(|e| PyErr::new::<PyIOError, _>(format!("invalid utf-8 in read id: {}", e)))?
207234
.to_string();
208235

209-
// Get read length (unused but kept for potential future use)
210-
let _read_length = record.seq_len();
211-
212-
// Get alignment score using shared function
213236
let total_score = match parse_sam::extract_alignment_score(&record) {
214237
Some(score) => score,
215-
None => continue, // Skip records without valid AS field
238+
None => continue,
216239
};
217240

218-
// Apply score cutoff
219241
if total_score >= p_score_cutoff {
220-
// Track highest score for this read
221-
if total_score > *isolate_high_scores.get(&read_id).unwrap_or(&0.0) {
222-
isolate_high_scores.insert(read_id, total_score);
223-
}
242+
passed_cutoff += 1;
243+
244+
isolate_high_scores
245+
.entry(read_id)
246+
.and_modify(|existing_score| {
247+
if total_score > *existing_score {
248+
*existing_score = total_score;
249+
}
250+
})
251+
.or_insert(total_score);
224252
}
225253
}
226254

227-
info!("parsed {} isolate scores", isolate_high_scores.len());
255+
let elapsed = start.elapsed();
256+
info!(
257+
"parsed {} isolate scores from {} total records in {:.2}s ({:.0} records/sec)",
258+
isolate_high_scores.len(),
259+
total_records,
260+
elapsed.as_secs_f64(),
261+
total_records as f64 / elapsed.as_secs_f64()
262+
);
228263
Ok(isolate_high_scores)
229264
})
230265
}

0 commit comments

Comments
 (0)