Skip to content

Commit ed5c6a7

Browse files
authored
perf: improve subtraction and EM performance
* Remove some unused code.
* Use Rust for paring down eliminated reads during subtraction.
* Improve efficiency of subtraction code.
* Replace all SAM files with BAM in subtraction elimination.
1 parent ee43cb6 commit ed5c6a7

File tree

14 files changed

+617737
-923980
lines changed

14 files changed

+617737
-923980
lines changed

example/to_isolates.bam

1.79 MB
Binary file not shown.

fixtures.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,6 @@ def p_score_cutoff():
4747

4848

4949
@fixture
50-
def subtracted_sam_path(work_path: Path) -> Path:
51-
"""The path to the SAM file after subtraction reads have been eliminated."""
52-
return work_path / "subtracted.sam"
50+
def subtracted_bam_path(work_path: Path) -> Path:
51+
"""The path to the BAM file after subtraction reads have been eliminated."""
52+
return work_path / "subtracted.bam"

python/workflow_pathoscope/rust.pyi

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,33 @@ class PathoscopeResults:
2020
coverage: dict[str, list[int]]
2121

2222
def run_expectation_maximization(
23-
sam_path: str,
23+
alignment_path: str,
2424
p_score_cutoff: float,
2525
ref_lengths: dict[str, int],
2626
) -> PathoscopeResults:
27-
"""Run Pathoscope expectation maximization algorithm using Rust."""
27+
"""Run Pathoscope expectation maximization algorithm using Rust on SAM/BAM files."""
28+
29+
def parse_isolate_scores(
30+
alignment_path: str,
31+
p_score_cutoff: float,
32+
) -> dict[str, float]:
33+
"""Parse isolate alignment file (SAM or BAM) and extract high scores for each read."""
34+
35+
def find_candidate_otus(
36+
alignment_path: str,
37+
p_score_cutoff: float,
38+
) -> set[str]:
39+
"""Extract candidate OTU reference IDs from an alignment file (SAM/BAM)."""
40+
41+
def find_candidate_otus_from_bytes(
42+
sam_bytes: bytes,
43+
p_score_cutoff: float,
44+
) -> set[str]:
45+
"""Extract candidate OTU reference IDs from SAM text data."""
46+
47+
def calculate_coverage_from_em_results(
48+
alignment_path: str,
49+
p_score_cutoff: float,
50+
ref_lengths: dict[str, int],
51+
) -> dict[str, list[int]]:
52+
"""Calculate coverage directly from EM results and alignment data."""

python/workflow_pathoscope/utils.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -218,20 +218,32 @@ def write_report(
218218

219219

220220
def run_pathoscope(
221-
sam_path: Path,
221+
alignment_path: Path,
222222
p_score_cutoff: float,
223223
ref_lengths: dict[str, int],
224224
):
225-
"""Run Pathoscope on the SAM file at ``sam_path`` with the given ``p_score_cutoff``.
225+
"""Run Pathoscope on an alignment file.
226226
227227
Returns PathoscopeResults containing EM results and coverage data.
228228
229-
:param sam_path: The path to the SAM file.
229+
:param alignment_path: The path to the SAM or BAM file.
230230
:param p_score_cutoff: The minimum allowed ``p_score`` for an alignment.
231231
:param ref_lengths: Dictionary mapping reference IDs to their lengths.
232232
"""
233233
return run_expectation_maximization(
234-
str(sam_path),
234+
str(alignment_path),
235235
p_score_cutoff,
236236
ref_lengths,
237237
)
238+
239+
240+
# Backward compatibility alias - DEPRECATED
241+
def run_pathoscope_sam(
242+
sam_path: Path, p_score_cutoff: float, ref_lengths: dict[str, int]
243+
):
244+
"""
245+
Deprecated: Use run_pathoscope instead.
246+
247+
This function is kept for backward compatibility.
248+
"""
249+
return run_pathoscope(sam_path, p_score_cutoff, ref_lengths)

0 commit comments

Comments
 (0)