Skip to content

Commit 2340210

Browse files
authored
fix: improve performance and memory usage
- Consolidate SAM/BAM parsing to a streaming-based solution. - Clean up Rust code. - Make many minor Rust performance improvements. - Use batching and parallel processing (rayon) in subtraction elimination.
1 parent 424801c commit 2340210

File tree

17 files changed

+565
-1357
lines changed

17 files changed

+565
-1357
lines changed

.github/workflows/ci.yml renamed to .github/workflows/ci.yaml

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ env:
1515

1616
jobs:
1717
commitlint:
18+
name: Commitlint
1819
runs-on: ubuntu-24.04
1920
steps:
2021
- name: Checkout
@@ -23,6 +24,42 @@ jobs:
2324
fetch-depth: 0
2425
- name: commitlint
2526
uses: wagoid/commitlint-github-action@v5
27+
ruff-format:
28+
name: Ruff / Format
29+
runs-on: ubuntu-latest
30+
steps:
31+
- name: Checkout
32+
uses: actions/checkout@v4
33+
- name: Install mise
34+
uses: jdx/mise-action@v2
35+
- name: Install uv
36+
uses: astral-sh/setup-uv@v4
37+
with:
38+
version: "latest"
39+
- name: Set up Python
40+
run: uv python install
41+
- name: Install dependencies
42+
run: uv sync
43+
- name: Check formatting
44+
run: uv run ruff format --check .
45+
ruff-lint:
46+
name: Ruff / Lint
47+
runs-on: ubuntu-latest
48+
steps:
49+
- name: Checkout
50+
uses: actions/checkout@v4
51+
- name: Install mise
52+
uses: jdx/mise-action@v2
53+
- name: Install uv
54+
uses: astral-sh/setup-uv@v4
55+
with:
56+
version: "latest"
57+
- name: Set up Python
58+
run: uv python install
59+
- name: Install dependencies
60+
run: uv sync
61+
- name: Check linting
62+
run: uv run ruff check .
2663
build:
2764
name: Build
2865
runs-on: ubuntu-24.04

Cargo.lock

Lines changed: 46 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ rustc-hash = "2.0"
1616
thiserror = "1.0"
1717
log = "0.4"
1818
env_logger = "0.11"
19+
rayon = "1.8"
1920

2021
[dependencies.pyo3]
2122
version = "^0.22.0"

fixtures.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
def intermediate():
99
"""A namespace for storing intermediate values."""
1010
return SimpleNamespace(
11-
isolate_high_scores={},
1211
to_otus=set(),
1312
)
1413

python/workflow_pathoscope/rust.pyi

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
def run_eliminate_subtraction(
2-
isolate_sam_path: str, subtraction_sam_path: str, output_sam_path: str
3-
) -> None:
4-
"""Eliminate subtraction reads from isolate reads using Rust."""
2+
isolate_sam_path: str,
3+
subtraction_sam_path: str,
4+
output_sam_path: str,
5+
input_fastq_path: str,
6+
output_fastq_path: str,
7+
proc: int,
8+
) -> int:
9+
"""Eliminate subtraction reads from BAM and filter FASTQ file. Returns number of eliminated reads."""
510

611
class PathoscopeResults:
712
best_hit_initial_reads: list[float]
@@ -25,12 +30,6 @@ def run_expectation_maximization(
2530
) -> PathoscopeResults:
2631
"""Run Pathoscope expectation maximization algorithm using Rust on SAM/BAM files."""
2732

28-
def parse_isolate_scores(
29-
alignment_path: str,
30-
p_score_cutoff: float,
31-
) -> dict[str, float]:
32-
"""Parse isolate alignment file (SAM or BAM) and extract high scores for each read."""
33-
3433
def find_candidate_otus(
3534
alignment_path: str,
3635
p_score_cutoff: float,

src/candidates.rs

Lines changed: 22 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,31 @@ use log::info;
33
use pyo3::prelude::*;
44
use pyo3::exceptions::PyIOError;
55

6+
const AS_TAG_PREFIX: &str = "AS:i:";
67

7-
8+
/// Extract AS:i alignment score from SAM optional fields
9+
///
10+
/// # Arguments
11+
/// * `fields` - SAM fields starting from the optional fields (field 11+)
12+
///
13+
/// # Returns
14+
/// Option containing the AS:i score as f64, None if not found or invalid
15+
fn extract_as_score(fields: &[&str]) -> Option<f64> {
16+
for field in fields {
17+
if let Some(stripped) = field.strip_prefix(AS_TAG_PREFIX) {
18+
if let Ok(score) = stripped.parse::<i32>() {
19+
return Some(score as f64);
20+
}
21+
}
22+
}
23+
None
24+
}
825

926
/// Parse a single SAM line and extract candidate OTU information
1027
///
1128
/// This function processes one SAM line and determines if the read meets the score cutoff.
1229
/// Used for testing and by the streaming functions.
13-
///
30+
///
1431
/// # Arguments
1532
/// * `line` - A SAM format line as string
1633
/// * `p_score_cutoff` - Minimum score threshold (AS:i score + read length)
@@ -44,19 +61,8 @@ pub fn parse_sam_line(line: &str, p_score_cutoff: f64) -> Option<String> {
4461
return None;
4562
}
4663

47-
// Find AS:i score in the optional fields (starting from field 11)
48-
let mut as_score: Option<f64> = None;
49-
for field in &fields[11..] {
50-
if let Some(stripped) = field.strip_prefix("AS:i:") {
51-
if let Ok(score) = stripped.parse::<i32>() {
52-
as_score = Some(score as f64);
53-
break;
54-
}
55-
}
56-
}
57-
58-
// Skip if no AS score found
59-
if let Some(as_score) = as_score {
64+
// Extract AS:i score from optional fields (starting from field 11)
65+
if let Some(as_score) = extract_as_score(&fields[11..]) {
6066
// Calculate total score (AS score + read length)
6167
let total_score = as_score + seq_len;
6268

@@ -69,31 +75,6 @@ pub fn parse_sam_line(line: &str, p_score_cutoff: f64) -> Option<String> {
6975
None
7076
}
7177

72-
/// Extract candidate OTU reference IDs from SAM text data
73-
///
74-
/// This function parses SAM format data directly from text without using rust-htslib.
75-
/// Used for testing and can be called by other functions that need SAM text parsing.
76-
///
77-
/// # Arguments
78-
/// * `sam_text` - Raw SAM format data as string
79-
/// * `p_score_cutoff` - Minimum score threshold (AS:i score + read length)
80-
///
81-
/// # Returns
82-
/// HashSet of reference IDs that have reads meeting the score cutoff
83-
pub fn extract_candidates_from_sam_text(
84-
sam_text: &str,
85-
p_score_cutoff: f64,
86-
) -> HashSet<String> {
87-
let mut candidate_otus = HashSet::new();
88-
89-
for line in sam_text.lines() {
90-
if let Some(ref_name) = parse_sam_line(line, p_score_cutoff) {
91-
candidate_otus.insert(ref_name);
92-
}
93-
}
94-
95-
candidate_otus
96-
}
9778

9879
/// Extract candidate OTU reference IDs by running bowtie2 directly with streaming
9980
///
@@ -119,7 +100,7 @@ pub fn find_candidate_otus_with_bowtie2(
119100
use std::process::{Command, Stdio};
120101
use std::io::{BufRead, BufReader};
121102

122-
info!("running bowtie2 directly from rust: index={}, reads={:?}, cutoff={}",
103+
info!("running bowtie2: index={}, reads={:?}, cutoff={}",
123104
bowtie_index_path, read_paths, p_score_cutoff);
124105
py.allow_threads(|| {
125106
let mut cmd = Command::new("bowtie2");
@@ -236,65 +217,5 @@ mod tests {
236217
assert_eq!(result, None);
237218
}
238219

239-
#[test]
240-
fn test_extract_candidates_from_sam_text_basic() {
241-
let sam_data = "@HD\tVN:1.0\tSO:unsorted
242-
@SQ\tSN:ref1\tLN:1000
243-
@SQ\tSN:ref2\tLN:2000
244-
read1\t0\tref1\t100\t255\t50M\t*\t0\t0\tAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\t*\tAS:i:45
245-
read2\t0\tref2\t200\t255\t30M\t*\t0\t0\tTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\t*\tAS:i:25
246-
read3\t0\tref1\t300\t255\t40M\t*\t0\t0\tCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC\t*\tAS:i:2";
247-
248-
let result = extract_candidates_from_sam_text(sam_data, 0.01);
249-
250-
// Should find 2 unique references since scores are:
251-
// read1: AS:i:45 + 50 = 95.0
252-
// read2: AS:i:25 + 30 = 55.0
253-
// read3: AS:i:2 + 40 = 42.0
254-
assert_eq!(result.len(), 2, "Should find 2 unique references");
255-
assert!(result.contains("ref1"), "Should contain ref1");
256-
assert!(result.contains("ref2"), "Should contain ref2");
257-
}
258-
259-
#[test]
260-
fn test_extract_candidates_from_sam_text_with_cutoff() {
261-
let sam_data = "@HD\tVN:1.0\tSO:unsorted
262-
read1\t0\tref1\t100\t255\t50M\t*\t0\t0\tAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\t*\tAS:i:45
263-
read2\t0\tref2\t200\t255\t30M\t*\t0\t0\tTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\t*\tAS:i:25
264-
read3\t0\tref1\t300\t255\t40M\t*\t0\t0\tCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC\t*\tAS:i:2";
265-
266-
let result = extract_candidates_from_sam_text(sam_data, 50.0);
267-
268-
// Only read1 (95.0) and read2 (55.0) should pass, read3 (42.0) should be filtered
269-
assert_eq!(result.len(), 2, "Should find 2 references with high cutoff");
270-
assert!(result.contains("ref1"), "Should contain ref1");
271-
assert!(result.contains("ref2"), "Should contain ref2");
272-
}
273-
274-
#[test]
275-
fn test_extract_candidates_from_sam_text_very_high_cutoff() {
276-
let sam_data = "@HD\tVN:1.0\tSO:unsorted
277-
read1\t0\tref1\t100\t255\t50M\t*\t0\t0\tAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\t*\tAS:i:45";
278-
279-
let result = extract_candidates_from_sam_text(sam_data, 100.0);
280-
281-
// No reads should pass this cutoff (read1 is 95.0)
282-
assert_eq!(result.len(), 0, "Should find no references with very high cutoff");
283-
}
284-
285-
#[test]
286-
fn test_extract_candidates_from_sam_text_deduplication() {
287-
let sam_data = "@HD\tVN:1.0\tSO:unsorted
288-
read1\t0\tref1\t100\t255\t50M\t*\t0\t0\tAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\t*\tAS:i:45
289-
read2\t0\tref1\t200\t255\t30M\t*\t0\t0\tTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\t*\tAS:i:25";
290-
291-
let result = extract_candidates_from_sam_text(sam_data, 0.01);
292-
293-
// Even if multiple reads map to ref1, it should only appear once in the set
294-
assert_eq!(result.len(), 1, "Should deduplicate reference names");
295-
assert!(result.contains("ref1"), "Should contain ref1");
296-
let ref1_count = result.iter().filter(|&r| r == "ref1").count();
297-
assert_eq!(ref1_count, 1, "Each reference should appear only once in the result set");
298-
}
299220

300221
}

src/coverage.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use crate::parse_sam::MinimalAlignment;
1+
use crate::sam::MinimalAlignment;
22
use crate::em::find_updated_score;
33
use crate::{UniqueReads, MultiMappingReads};
44
use rustc_hash::FxHashMap;

src/em.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -632,7 +632,7 @@ mod tests {
632632

633633
#[test]
634634
fn test_em_integration_with_real_sam_data() {
635-
use crate::build_matrix;
635+
use crate::matrix::build_matrix;
636636

637637
// Use real SAM data with multi-mapping reads to test the full pipeline
638638
let sam_path = "example/rust/test_em_with_multimapping.sam";

0 commit comments

Comments
 (0)