@@ -3,14 +3,31 @@ use log::info;
33use pyo3:: prelude:: * ;
44use pyo3:: exceptions:: PyIOError ;
55
6+ const AS_TAG_PREFIX : & str = "AS:i:" ;
67
7-
8+ /// Extract AS:i alignment score from SAM optional fields
9+ ///
10+ /// # Arguments
11+ /// * `fields` - SAM fields starting from the optional fields (field 11+)
12+ ///
13+ /// # Returns
14+ /// Option containing the AS:i score as f64, None if not found or invalid
15+ fn extract_as_score ( fields : & [ & str ] ) -> Option < f64 > {
16+ for field in fields {
17+ if let Some ( stripped) = field. strip_prefix ( AS_TAG_PREFIX ) {
18+ if let Ok ( score) = stripped. parse :: < i32 > ( ) {
19+ return Some ( score as f64 ) ;
20+ }
21+ }
22+ }
23+ None
24+ }
825
926/// Parse a single SAM line and extract candidate OTU information
1027///
1128/// This function processes one SAM line and determines if the read meets the score cutoff.
1229/// Used for testing and by the streaming functions.
13- ///
30+ ///
1431/// # Arguments
1532/// * `line` - A SAM format line as string
1633/// * `p_score_cutoff` - Minimum score threshold (AS:i score + read length)
@@ -44,19 +61,8 @@ pub fn parse_sam_line(line: &str, p_score_cutoff: f64) -> Option<String> {
4461 return None ;
4562 }
4663
47- // Find AS:i score in the optional fields (starting from field 11)
48- let mut as_score: Option < f64 > = None ;
49- for field in & fields[ 11 ..] {
50- if let Some ( stripped) = field. strip_prefix ( "AS:i:" ) {
51- if let Ok ( score) = stripped. parse :: < i32 > ( ) {
52- as_score = Some ( score as f64 ) ;
53- break ;
54- }
55- }
56- }
57-
58- // Skip if no AS score found
59- if let Some ( as_score) = as_score {
64+ // Extract AS:i score from optional fields (starting from field 11)
65+ if let Some ( as_score) = extract_as_score ( & fields[ 11 ..] ) {
6066 // Calculate total score (AS score + read length)
6167 let total_score = as_score + seq_len;
6268
@@ -69,31 +75,6 @@ pub fn parse_sam_line(line: &str, p_score_cutoff: f64) -> Option<String> {
6975 None
7076}
7177
72- /// Extract candidate OTU reference IDs from SAM text data
73- ///
74- /// This function parses SAM format data directly from text without using rust-htslib.
75- /// Used for testing and can be called by other functions that need SAM text parsing.
76- ///
77- /// # Arguments
78- /// * `sam_text` - Raw SAM format data as string
79- /// * `p_score_cutoff` - Minimum score threshold (AS:i score + read length)
80- ///
81- /// # Returns
82- /// HashSet of reference IDs that have reads meeting the score cutoff
83- pub fn extract_candidates_from_sam_text (
84- sam_text : & str ,
85- p_score_cutoff : f64 ,
86- ) -> HashSet < String > {
87- let mut candidate_otus = HashSet :: new ( ) ;
88-
89- for line in sam_text. lines ( ) {
90- if let Some ( ref_name) = parse_sam_line ( line, p_score_cutoff) {
91- candidate_otus. insert ( ref_name) ;
92- }
93- }
94-
95- candidate_otus
96- }
9778
9879/// Extract candidate OTU reference IDs by running bowtie2 directly with streaming
9980///
@@ -119,7 +100,7 @@ pub fn find_candidate_otus_with_bowtie2(
119100 use std:: process:: { Command , Stdio } ;
120101 use std:: io:: { BufRead , BufReader } ;
121102
122- info ! ( "running bowtie2 directly from rust : index={}, reads={:?}, cutoff={}" ,
103+ info ! ( "running bowtie2: index={}, reads={:?}, cutoff={}" ,
123104 bowtie_index_path, read_paths, p_score_cutoff) ;
124105 py. allow_threads ( || {
125106 let mut cmd = Command :: new ( "bowtie2" ) ;
@@ -236,65 +217,5 @@ mod tests {
236217 assert_eq ! ( result, None ) ;
237218 }
238219
239- #[ test]
240- fn test_extract_candidates_from_sam_text_basic ( ) {
241- let sam_data = "@HD\t VN:1.0\t SO:unsorted
242- @SQ\t SN:ref1\t LN:1000
243- @SQ\t SN:ref2\t LN:2000
244- read1\t 0\t ref1\t 100\t 255\t 50M\t *\t 0\t 0\t AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\t *\t AS:i:45
245- read2\t 0\t ref2\t 200\t 255\t 30M\t *\t 0\t 0\t TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\t *\t AS:i:25
246- read3\t 0\t ref1\t 300\t 255\t 40M\t *\t 0\t 0\t CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC\t *\t AS:i:2" ;
247-
248- let result = extract_candidates_from_sam_text ( sam_data, 0.01 ) ;
249-
250- // Should find 2 unique references since scores are:
251- // read1: AS:i:45 + 50 = 95.0
252- // read2: AS:i:25 + 30 = 55.0
253- // read3: AS:i:2 + 40 = 42.0
254- assert_eq ! ( result. len( ) , 2 , "Should find 2 unique references" ) ;
255- assert ! ( result. contains( "ref1" ) , "Should contain ref1" ) ;
256- assert ! ( result. contains( "ref2" ) , "Should contain ref2" ) ;
257- }
258-
259- #[ test]
260- fn test_extract_candidates_from_sam_text_with_cutoff ( ) {
261- let sam_data = "@HD\t VN:1.0\t SO:unsorted
262- read1\t 0\t ref1\t 100\t 255\t 50M\t *\t 0\t 0\t AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\t *\t AS:i:45
263- read2\t 0\t ref2\t 200\t 255\t 30M\t *\t 0\t 0\t TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\t *\t AS:i:25
264- read3\t 0\t ref1\t 300\t 255\t 40M\t *\t 0\t 0\t CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC\t *\t AS:i:2" ;
265-
266- let result = extract_candidates_from_sam_text ( sam_data, 50.0 ) ;
267-
268- // Only read1 (95.0) and read2 (55.0) should pass, read3 (42.0) should be filtered
269- assert_eq ! ( result. len( ) , 2 , "Should find 2 references with high cutoff" ) ;
270- assert ! ( result. contains( "ref1" ) , "Should contain ref1" ) ;
271- assert ! ( result. contains( "ref2" ) , "Should contain ref2" ) ;
272- }
273-
274- #[ test]
275- fn test_extract_candidates_from_sam_text_very_high_cutoff ( ) {
276- let sam_data = "@HD\t VN:1.0\t SO:unsorted
277- read1\t 0\t ref1\t 100\t 255\t 50M\t *\t 0\t 0\t AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\t *\t AS:i:45" ;
278-
279- let result = extract_candidates_from_sam_text ( sam_data, 100.0 ) ;
280-
281- // No reads should pass this cutoff (read1 is 95.0)
282- assert_eq ! ( result. len( ) , 0 , "Should find no references with very high cutoff" ) ;
283- }
284-
285- #[ test]
286- fn test_extract_candidates_from_sam_text_deduplication ( ) {
287- let sam_data = "@HD\t VN:1.0\t SO:unsorted
288- read1\t 0\t ref1\t 100\t 255\t 50M\t *\t 0\t 0\t AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\t *\t AS:i:45
289- read2\t 0\t ref1\t 200\t 255\t 30M\t *\t 0\t 0\t TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\t *\t AS:i:25" ;
290-
291- let result = extract_candidates_from_sam_text ( sam_data, 0.01 ) ;
292-
293- // Even if multiple reads map to ref1, it should only appear once in the set
294- assert_eq ! ( result. len( ) , 1 , "Should deduplicate reference names" ) ;
295- assert ! ( result. contains( "ref1" ) , "Should contain ref1" ) ;
296- let ref1_count = result. iter ( ) . filter ( |& r| r == "ref1" ) . count ( ) ;
297- assert_eq ! ( ref1_count, 1 , "Each reference should appear only once in the result set" ) ;
298- }
299220
300221}
0 commit comments