diff --git a/examples/advanced.rs b/examples/advanced.rs index 57f6ae9..56d575d 100644 --- a/examples/advanced.rs +++ b/examples/advanced.rs @@ -17,23 +17,23 @@ fn main() { println!("Encoding with pipe delimiter:\n"); let pipe_options = EncodeOptions::new().delimiter(Delimiter::Pipe); let toon_pipe = encode(&data, Some(&pipe_options)).unwrap(); - println!("{}", toon_pipe); + println!("{toon_pipe}"); // Encode with length marker println!("\nEncoding with length marker:\n"); let marker_options = EncodeOptions::new().length_marker('#'); let toon_marker = encode(&data, Some(&marker_options)).unwrap(); - println!("{}", toon_marker); + println!("{toon_marker}"); // Encode with custom indentation println!("\nEncoding with 4-space indentation:\n"); let indent_options = EncodeOptions::new().indent(4); let toon_indent = encode(&data, Some(&indent_options)).unwrap(); - println!("{}", toon_indent); + println!("{toon_indent}"); // Decode with custom options println!("\nDecoding with custom indentation:\n"); let decode_options = DecodeOptions::new().indent(4).strict(true); let decoded = decode(&toon_indent, Some(&decode_options)).unwrap(); - println!("Decoded: {}", decoded); + println!("Decoded: {decoded}"); } diff --git a/examples/basic.rs b/examples/basic.rs index 597e66b..560ab8a 100644 --- a/examples/basic.rs +++ b/examples/basic.rs @@ -18,10 +18,10 @@ fn main() { // Encode to TOON println!("Encoding to TOON format:\n"); let toon = encode(&data, None).unwrap(); - println!("{}", toon); + println!("{toon}"); // Decode from TOON println!("\nDecoding from TOON format:\n"); let decoded = decode(&toon, None).unwrap(); - println!("Decoded: {}", decoded); + println!("Decoded: {decoded}"); } diff --git a/examples/serde.rs b/examples/serde.rs index db9e11f..87d8d7a 100644 --- a/examples/serde.rs +++ b/examples/serde.rs @@ -38,11 +38,11 @@ fn main() { // Serialize to TOON println!("Serializing to TOON:\n"); let toon = to_string(&user).unwrap(); - println!("{}", toon); + 
println!("{toon}"); // Deserialize from TOON println!("\nDeserializing from TOON:\n"); let decoded: User = from_str(&toon).unwrap(); - println!("Decoded: {:?}", decoded); + println!("Decoded: {decoded:?}"); assert_eq!(user, decoded); } diff --git a/src/decode.rs b/src/decode.rs index 3241a55..37a1547 100644 --- a/src/decode.rs +++ b/src/decode.rs @@ -2,6 +2,7 @@ use crate::error::Error; use crate::options::DecodeOptions; +use crate::simd; use serde_json::{Map, Value}; /// Decode a TOON-formatted string to a JSON value @@ -671,35 +672,24 @@ impl<'a> Parser<'a> { fn detect_delimiter(&self) -> char { // Look ahead to detect delimiter let remaining = &self.input[self.pos..]; - if remaining.contains('\t') { - '\t' - } else if remaining.contains('|') { - '|' + + // Use SIMD for larger inputs, fallback for small ones + // Threshold: use SIMD if input is large enough to benefit (>= 32 bytes) + if remaining.len() >= 32 { + simd::detect_delimiter_simd(remaining) } else { - ',' + simd::detect_delimiter_fallback(remaining) } } fn split_row<'b>(&self, row: &'b str, delimiter: char) -> Vec<&'b str> { - let mut result = Vec::new(); - let mut start = 0; - let mut in_quotes = false; - let chars: Vec<char> = row.chars().collect(); - - for (i, ch) in chars.iter().enumerate() { - match ch { - '"' if i == 0 || chars[i - 1] != '\\' => { - in_quotes = !in_quotes; - } - _ if *ch == delimiter && !in_quotes => { - result.push(&row[start..i]); - start = i + 1; - } - _ => {} - } + // Use SIMD for larger inputs, fallback for small ones + // Threshold: use SIMD if row is large enough to benefit (>= 32 bytes) + if row.len() >= 32 { + simd::split_row_simd(row, delimiter) + } else { + simd::split_row_fallback(row, delimiter) } - result.push(&row[start..]); - result } fn count_indent(&mut self, indent_size: usize) -> usize { diff --git a/src/lib.rs b/src/lib.rs index a7f9973..b0cf2bd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -48,6 +48,7 @@ pub mod decode; pub mod encode; pub mod error; pub mod options; 
+mod simd; pub use decode::decode; pub use encode::encode; diff --git a/src/simd.rs b/src/simd.rs new file mode 100644 index 0000000..33f9ed7 --- /dev/null +++ b/src/simd.rs @@ -0,0 +1,318 @@ +//! SIMD-optimized parsing functions for tabular arrays +//! +//! This module provides high-performance implementations of delimiter detection +//! and row splitting using SIMD instructions for parallel processing. + +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +/// Detect delimiter character using SIMD for fast scanning +/// +/// Scans the input string for tab ('\t'), pipe ('|'), or comma (',') delimiters +/// in parallel using SIMD instructions. +/// +/// # Returns +/// +/// The first delimiter found in priority order: tab > pipe > comma +/// Defaults to comma if none found. +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "sse2")] +unsafe fn detect_delimiter_simd_x86_64(input: &str) -> char { + let bytes = input.as_bytes(); + if bytes.is_empty() { + return ','; + } + + // Create SIMD vectors for each delimiter (16 bytes for SSE2) + let tab_vec = _mm_set1_epi8(b'\t' as i8); + let pipe_vec = _mm_set1_epi8(b'|' as i8); + + let mut found_tab = false; + let mut found_pipe = false; + + // Process in chunks of 16 bytes (SSE2 register size) + let chunks = bytes.chunks_exact(16); + let remainder = chunks.remainder(); + + for chunk in chunks { + // Load 16 bytes (unaligned is fine for most cases) + let chunk_vec = _mm_loadu_si128(chunk.as_ptr() as *const __m128i); + + // Compare with each delimiter + let tab_mask = _mm_cmpeq_epi8(chunk_vec, tab_vec); + let pipe_mask = _mm_cmpeq_epi8(chunk_vec, pipe_vec); + + // Check if any byte matches (movemask gives us a bitmask) + let tab_bits = _mm_movemask_epi8(tab_mask); + let pipe_bits = _mm_movemask_epi8(pipe_mask); + + if tab_bits != 0 { + found_tab = true; + } + if pipe_bits != 0 { + found_pipe = true; + } + + // Early exit if we found tab (highest priority) + if found_tab { + return '\t'; + } + } + + // Process 
remainder + for &byte in remainder { + if byte == b'\t' { + return '\t'; + } else if byte == b'|' { + found_pipe = true; + } + } + + // Return in priority order + if found_tab { + '\t' + } else if found_pipe { + '|' + } else { + ',' // default (comma or none found) + } +} + +/// Split a row by delimiter while respecting quoted strings, using SIMD +/// +/// This function uses SIMD to quickly find delimiter positions and quote positions, +/// then processes them to handle quote tracking correctly. +/// +/// # Arguments +/// +/// * `row` - The row string to split +/// * `delimiter` - The delimiter character to split on +/// +/// # Returns +/// +/// A vector of string slices representing the split fields +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "sse2")] +unsafe fn split_row_simd_x86_64(row: &str, delimiter: char) -> Vec<&str> { + let bytes = row.as_bytes(); + if bytes.is_empty() { + return vec![row]; + } + + let delimiter_byte = delimiter as u8; + let quote_byte = b'"'; + let backslash_byte = b'\\'; + + let delim_vec = _mm_set1_epi8(delimiter_byte as i8); + let quote_vec = _mm_set1_epi8(quote_byte as i8); + let backslash_vec = _mm_set1_epi8(backslash_byte as i8); + + let mut result = Vec::new(); + let mut start = 0; + let mut in_quotes = false; + let mut prev_was_backslash = false; + + // Process in chunks of 16 bytes + let chunks = bytes.chunks_exact(16); + let remainder_start = chunks.len() * 16; + + for (chunk_idx, chunk) in chunks.enumerate() { + let chunk_start = chunk_idx * 16; + let chunk_vec = _mm_loadu_si128(chunk.as_ptr() as *const __m128i); + + // Find delimiters, quotes, and backslashes in parallel + let delim_mask = _mm_cmpeq_epi8(chunk_vec, delim_vec); + let quote_mask = _mm_cmpeq_epi8(chunk_vec, quote_vec); + let backslash_mask = _mm_cmpeq_epi8(chunk_vec, backslash_vec); + + // Get bitmasks + let delim_bits = _mm_movemask_epi8(delim_mask) as u16; + let quote_bits = _mm_movemask_epi8(quote_mask) as u16; + let backslash_bits = 
_mm_movemask_epi8(backslash_mask) as u16; + + // Process each byte in the chunk + for i in 0..16 { + let pos = chunk_start + i; + if pos >= bytes.len() { + break; + } + let is_backslash = (backslash_bits >> i) & 1 != 0; + let is_quote = (quote_bits >> i) & 1 != 0; + let is_delimiter = (delim_bits >> i) & 1 != 0; + + // Handle backslash: if we see a backslash, the next character might be escaped + // But if the previous was a backslash, this one escapes it, so reset + if is_backslash { + prev_was_backslash = !prev_was_backslash; + } else { + // Not a backslash - if prev was backslash, this char is escaped + let is_escaped = prev_was_backslash; + prev_was_backslash = false; + + // Handle quotes (only if not escaped) + if is_quote && !is_escaped { + in_quotes = !in_quotes; + } + + // Handle delimiter (only if not in quotes) + if is_delimiter && !in_quotes { + result.push(&row[start..pos]); + start = pos + 1; + } + } + } + } + + // Process remainder + for (i, &byte) in bytes[remainder_start..].iter().enumerate() { + let pos = remainder_start + i; + + // Handle backslash tracking + if byte == backslash_byte { + prev_was_backslash = !prev_was_backslash; + } else { + // Not a backslash - check if previous was backslash (this char is escaped) + let is_escaped = prev_was_backslash; + prev_was_backslash = false; + + // Handle quotes (only if not escaped) + if byte == quote_byte && !is_escaped { + in_quotes = !in_quotes; + } + + // Handle delimiter (only if not in quotes) + if byte == delimiter_byte && !in_quotes { + result.push(&row[start..pos]); + start = pos + 1; + } + } + } + + // Add the final segment + result.push(&row[start..]); + result +} + +/// Public wrapper for SIMD delimiter detection with fallback +pub fn detect_delimiter_simd(input: &str) -> char { + #[cfg(target_arch = "x86_64")] + { + if is_x86_feature_detected!("sse2") && input.len() >= 16 { + unsafe { + return detect_delimiter_simd_x86_64(input); + } + } + } + + // Fallback for other architectures or small 
inputs + detect_delimiter_fallback(input) +} + +/// Public wrapper for SIMD row splitting with fallback +pub fn split_row_simd(row: &str, delimiter: char) -> Vec<&str> { + #[cfg(target_arch = "x86_64")] + { + if is_x86_feature_detected!("sse2") && row.len() >= 16 { + unsafe { + return split_row_simd_x86_64(row, delimiter); + } + } + } + + // Fallback for other architectures or small inputs + split_row_fallback(row, delimiter) +} + +/// Fallback implementation for small inputs or when SIMD isn't beneficial +/// +/// This is used when the input is too small to benefit from SIMD operations. +pub fn detect_delimiter_fallback(input: &str) -> char { + if input.contains('\t') { + '\t' + } else if input.contains('|') { + '|' + } else { + ',' + } +} + +/// Fallback implementation for row splitting +pub fn split_row_fallback(row: &str, delimiter: char) -> Vec<&str> { + let mut result = Vec::new(); + let mut start = 0; + let mut in_quotes = false; + let chars: Vec<char> = row.chars().collect(); + + for (i, ch) in chars.iter().enumerate() { + match ch { + '"' if i == 0 || chars[i - 1] != '\\' => { + in_quotes = !in_quotes; + } + _ if *ch == delimiter && !in_quotes => { + result.push(&row[start..i]); + start = i + 1; + } + _ => {} + } + } + result.push(&row[start..]); + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_detect_delimiter_tab() { + let input = "field1\tfield2\tfield3"; + assert_eq!(detect_delimiter_simd(input), '\t'); + } + + #[test] + fn test_detect_delimiter_pipe() { + let input = "field1|field2|field3"; + assert_eq!(detect_delimiter_simd(input), '|'); + } + + #[test] + fn test_detect_delimiter_comma() { + let input = "field1,field2,field3"; + assert_eq!(detect_delimiter_simd(input), ','); + } + + #[test] + fn test_detect_delimiter_priority() { + // Tab should have priority over pipe and comma + let input = "field1,field2|field3\tfield4"; + assert_eq!(detect_delimiter_simd(input), '\t'); + } + + #[test] + fn test_split_row_simple() { + let 
row = "a,b,c"; + let result = split_row_simd(row, ','); + assert_eq!(result, vec!["a", "b", "c"]); + } + + #[test] + fn test_split_row_with_quotes() { + let row = r#"a,"b,c",d"#; + let result = split_row_simd(row, ','); + assert_eq!(result, vec!["a", r#""b,c""#, "d"]); + } + + #[test] + fn test_split_row_with_escaped_quotes() { + let row = r#"a,"b\"c",d"#; + let result = split_row_simd(row, ','); + assert_eq!(result, vec!["a", r#""b\"c""#, "d"]); + } + + #[test] + fn test_split_row_tab_delimiter() { + let row = "a\tb\tc"; + let result = split_row_simd(row, '\t'); + assert_eq!(result, vec!["a", "b", "c"]); + } +} diff --git a/tests/roundtrip.rs b/tests/roundtrip.rs index 26d21a9..26cb158 100644 --- a/tests/roundtrip.rs +++ b/tests/roundtrip.rs @@ -58,7 +58,7 @@ fn test_roundtrip_mixed_types() { let original = json!({ "string": "hello", "number": 42, - "float": 3.14, + "float": std::f64::consts::PI, "boolean": true, "null": null, "array": [1, 2, 3],