diff --git a/examples/streaming.rs b/examples/streaming.rs new file mode 100644 index 0000000..3749498 --- /dev/null +++ b/examples/streaming.rs @@ -0,0 +1,60 @@ +//! Streaming API example for TOON format +//! +//! This example demonstrates how to use the streaming API to encode and decode +//! large datasets without loading everything into memory. + +use serde_json::json; +use std::fs::File; +use std::io::{BufReader, BufWriter}; +use toon_rust::{decode_stream, encode_stream}; + +fn main() -> Result<(), Box> { + // Create a large dataset + let items: Vec<_> = (0..100) + .map(|i| { + json!({ + "id": i, + "name": format!("Product {}", i), + "price": (i as f64) * 1.5, + "in_stock": i % 2 == 0 + }) + }) + .collect(); + let data = json!({ + "products": items, + "metadata": { + "total": 100, + "source": "example" + } + }); + + println!("Encoding large dataset to file using streaming..."); + + // Encode using streaming API - writes directly to file without building string in memory + let file = File::create("example_output.toon")?; + let mut writer = BufWriter::new(file); + encode_stream(&data, &mut writer, None)?; + drop(writer); // Ensure buffer is flushed + + println!("āœ“ Encoded to example_output.toon"); + + println!("\nDecoding from file using streaming..."); + + // Decode using streaming API - reads incrementally without loading entire file + let file = File::open("example_output.toon")?; + let decoded = decode_stream(file, None)?; + + println!("āœ“ Decoded successfully"); + println!("\nDecoded data summary:"); + println!( + " Products: {}", + decoded["products"].as_array().unwrap().len() + ); + println!(" Metadata total: {}", decoded["metadata"]["total"]); + + // Clean up + std::fs::remove_file("example_output.toon")?; + println!("\nāœ“ Cleaned up example_output.toon"); + + Ok(()) +} diff --git a/src/decode.rs b/src/decode.rs index 37a1547..763c985 100644 --- a/src/decode.rs +++ b/src/decode.rs @@ -4,6 +4,7 @@ use crate::error::Error; use crate::options::DecodeOptions; use crate::simd; use serde_json::{Map, Value}; +use std::io::{BufReader, Read}; /// Decode a TOON-formatted string to a JSON value /// @@ -762,3 +763,864 @@ impl<'a> Parser<'a> { } } } + +/// Decode a TOON-formatted stream from a reader to a JSON value +/// +/// This function reads and parses the TOON format incrementally from the reader +/// without loading the entire input into memory, making it suitable for large datasets. +/// +/// # Arguments +/// +/// * `reader` - The reader to read the TOON-formatted input from (will be wrapped in BufReader) +/// * `options` - Optional decoding options +/// +/// # Returns +/// +/// A `Result` containing the decoded JSON value or an error +/// +/// # Example +/// +/// ```rust,no_run +/// use std::fs::File; +/// use toon_rust::decode_stream; +/// +/// let file = File::open("input.toon").unwrap(); +/// let value = decode_stream(file, None).unwrap(); +/// ``` +pub fn decode_stream(reader: R, options: Option<&DecodeOptions>) -> Result { + let default_opts = DecodeOptions::default(); + let opts = options.unwrap_or(&default_opts); + let mut buf_reader = BufReader::with_capacity(8192, reader); + let mut parser = StreamingParser::new(&mut buf_reader, opts)?; + parser.parse() +} + +struct StreamingParser<'a, R: Read> { + reader: &'a mut BufReader, + buffer: String, + pos: usize, + options: &'a DecodeOptions, + eof: bool, +} + +impl<'a, R: Read> StreamingParser<'a, R> { + fn new(reader: &'a mut BufReader, options: &'a DecodeOptions) -> Result { + let mut parser = Self { + reader, + buffer: String::new(), + pos: 0, + options, + eof: false, + }; + // Read initial chunk (8KB buffer) + parser.fill_buffer(8192)?; + Ok(parser) + } + + fn fill_buffer(&mut self, min_size: usize) -> Result<(), Error> { + if self.eof { + return Ok(()); + } + + // Only drain if we've consumed a significant portion and need more space + // This helps with memory efficiency for very large streams + if self.pos > 8192 && self.pos > self.buffer.len() / 2 { + self.buffer.drain(..self.pos); + self.pos = 0; + } + + // Read more data if needed + let needed = if self.pos + min_size > self.buffer.len() { + self.pos + min_size - self.buffer.len() + } else { + 0 + }; + + if needed > 0 { + let mut temp_buf = vec![0u8; needed.max(8192)]; + match self.reader.read(&mut temp_buf) { + Ok(0) => { + self.eof = true; + } + Ok(n) => { + let chunk = String::from_utf8(temp_buf[..n].to_vec()) + .map_err(|e| Error::Io(format!("Invalid UTF-8: {}", e)))?; + self.buffer.push_str(&chunk); + } + Err(e) => return Err(Error::Io(e.to_string())), + } + } + + Ok(()) + } + + fn ensure_buffer(&mut self, needed: usize) -> Result<(), Error> { + if self.pos + needed > self.buffer.len() && !self.eof { + self.fill_buffer(self.pos + needed)?; + } + Ok(()) + } + + fn parse(&mut self) -> Result { + self.skip_whitespace(); + if self.pos >= self.buffer.len() && self.eof { + return Ok(Value::Object(Map::new())); + } + + // Check if it's a root array (starts with [) + if self.peek_char() == Some('[') { + self.parse_array_value() + } else { + self.parse_object() + } + } + + fn parse_object(&mut self) -> Result { + let mut map = Map::new(); + let indent = self.options.get_indent(); + let initial_indent = self.count_indent(indent)?; + + loop { + // Count indentation first (before skipping whitespace) + let line_indent = self.count_indent(indent)?; + if line_indent < initial_indent { + // We've gone back to a lower indentation level + break; + } + + // Now skip the indentation whitespace + for _ in 0..(line_indent * indent) { + if self.peek_char() == Some(' ') { + self.advance(); + } else { + break; + } + } + + if self.pos >= self.buffer.len() && self.eof { + break; + } + if line_indent == 0 && !map.is_empty() && initial_indent == 0 { + // Check if there's actually more content on this line + let saved_pos = self.pos; + let key_result = self.parse_key(); + self.pos = saved_pos; + if key_result.is_err() { + break; + } + } + + // Parse key (may include array notation like "tags[3]") + let key = self.parse_key()?; + self.skip_whitespace(); + + // Check if we have array notation in the key (e.g., "tags[3]:") + let has_array_notation = self.peek_char() == Some('['); + + if !has_array_notation { + // Normal key-value: key: value + if self.peek_char() != Some(':') { + return Err(Error::parse( + self.pos, + format!("Expected ':' after key '{key}'"), + )); + } + self.advance(); // consume ':' + self.skip_whitespace(); + } else { + // Key with array notation: key[3]: value + // The array part will be parsed as the value + } + + // Check if value is on next line (indented) or inline + let value = if has_array_notation { + // Array notation: key[3]: value + // Parse the array value + let value = self.parse_array_value()?; + // Skip to next line + if self.pos < self.buffer.len() && self.peek_char() == Some('\n') { + self.advance(); + } + value + } else if self.peek_char() == Some('\n') { + self.advance(); // consume '\n' + // Check if next line is more indented (nested object/array) + let next_indent = self.count_indent(indent)?; + if next_indent > line_indent { + // Parse nested object or array + if self.peek_char() == Some('[') { + self.parse_array_value()? + } else { + // Parse nested object + self.parse_object()? + } + } else { + // Same or less indent means we're done with this value + Value::Null + } + } else { + // Inline value - parse until end of line or newline + let value = self.parse_value_until_newline()?; + // Skip to next line (if not already at end) + if self.pos < self.buffer.len() && self.peek_char() != Some('\n') { + self.skip_to_next_line(); + } else if self.peek_char() == Some('\n') { + self.advance(); // consume newline + } + value + }; + + map.insert(key, value); + + // After inserting a nested object, check if we should continue + if self.pos >= self.buffer.len() && self.eof { + break; + } + + // Check indentation for next iteration + let next_line_indent = self.count_indent(indent)?; + if next_line_indent < initial_indent { + break; + } + if next_line_indent == 0 && initial_indent == 0 && !map.is_empty() { + // Check if there's actually a key to parse + let saved_pos = self.pos; + let key_result = self.parse_key(); + self.pos = saved_pos; + if key_result.is_err() { + break; + } + } + } + + Ok(Value::Object(map)) + } + + fn parse_value(&mut self) -> Result { + self.skip_whitespace(); + match self.peek_char() { + Some('[') => self.parse_array_value(), + Some('"') => self.parse_string(), + Some('-') => { + // List item marker + self.advance(); + self.skip_whitespace(); + self.parse_value() + } + Some(ch) if ch.is_ascii_digit() || ch == '-' => self.parse_number(), + Some(ch) if ch.is_ascii_alphabetic() => { + // Try boolean/null first, then fall back to string + let start = self.pos; + let value = self.parse_boolean_or_null(); + if value.is_ok() { + return value; + } + // Reset and parse as string + self.pos = start; + self.parse_unquoted_string() + } + _ => self.parse_unquoted_string(), + } + } + + fn parse_unquoted_string(&mut self) -> Result { + let start = self.pos; + // Parse until we hit whitespace, newline, or end + while self.pos < self.buffer.len() || !self.eof { + self.ensure_buffer(1)?; + if self.pos >= self.buffer.len() { + break; + } + match self.peek_char() { + Some(ch) if ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r' => break, + Some(_) => self.advance(), + None => break, + } + } + if self.pos == start { + return Err(Error::parse(self.pos, "Expected value")); + } + Ok(Value::String(self.buffer[start..self.pos].to_string())) + } + + fn parse_value_until_newline(&mut self) -> Result { + self.skip_whitespace(); + + // Check what type of value we have + match self.peek_char() { + Some('[') => { + // Array - parse array value + self.parse_array_value() + } + Some('"') => self.parse_string(), + Some(ch) if ch.is_ascii_digit() || ch == '-' => self.parse_number(), + Some(ch) if ch.is_ascii_alphabetic() => { + // Try boolean/null first, then fall back to string + let start_pos = self.pos; + let value = self.parse_boolean_or_null(); + if value.is_ok() { + return value; + } + // Reset and parse as string + self.pos = start_pos; + self.parse_unquoted_string() + } + _ => self.parse_unquoted_string(), + } + } + + fn parse_array_value(&mut self) -> Result { + if self.peek_char() != Some('[') { + return Err(Error::parse(self.pos, "Expected '['")); + } + self.advance(); // consume '[' + + // Parse length marker (optional #) and length + let has_length_marker = self.peek_char() == Some('#'); + if has_length_marker { + self.advance(); // consume '#' + } + + let length_str = self.parse_while(|ch| ch.is_ascii_digit())?; + let length: usize = length_str + .parse() + .map_err(|_| Error::parse(self.pos, "Invalid array length"))?; + + if self.peek_char() != Some(']') { + return Err(Error::parse(self.pos, "Expected ']'")); + } + self.advance(); // consume ']' + + // Check for tabular format: {field1,field2}: + if self.peek_char() == Some('{') { + self.parse_tabular_array(length) + } else if self.peek_char() == Some(':') { + self.advance(); // consume ':' + self.skip_whitespace(); + + // Check if it's inline (same line) or list format (next line) + if length == 0 { + // Empty array - skip any whitespace and newline + self.skip_whitespace(); + if self.peek_char() == Some('\n') { + self.advance(); + } + Ok(Value::Array(Vec::new())) + } else if self.peek_char() == Some('\n') || (self.pos >= self.buffer.len() && self.eof) + { + self.parse_list_array(length) + } else { + self.parse_inline_array(length) + } + } else { + Err(Error::parse( + self.pos, + "Expected ':' or '{' after array length", + )) + } + } + + fn parse_tabular_array(&mut self, expected_length: usize) -> Result { + if self.peek_char() != Some('{') { + return Err(Error::parse(self.pos, "Expected '{'")); + } + self.advance(); // consume '{' + + // Parse field names + let fields_str = self.parse_while(|ch| ch != '}')?; + let fields: Vec<&str> = fields_str.split(',').map(|s| s.trim()).collect(); + let delimiter = self.detect_delimiter(); + + if self.peek_char() != Some('}') { + return Err(Error::parse(self.pos, "Expected '}'")); + } + self.advance(); // consume '}' + + if self.peek_char() != Some(':') { + return Err(Error::parse(self.pos, "Expected ':'")); + } + self.advance(); // consume ':' + // Skip to next line (consume newline if present) + if self.peek_char() == Some('\n') { + self.advance(); + } + + // Parse rows + let mut items = Vec::new(); + let indent = self.options.get_indent(); + // Count base indentation of first row + let base_indent = self.count_indent(indent)?; + + for _ in 0..expected_length { + // Ensure we have enough buffer to read a line + self.ensure_buffer(100)?; + + if self.pos >= self.buffer.len() && self.eof { + break; + } + + // Count indentation of current line + let line_indent = self.count_indent(indent)?; + if line_indent < base_indent { + break; // Back at lower indentation level + } + + // Skip the indentation whitespace + for _ in 0..(line_indent * indent) { + if self.peek_char() == Some(' ') { + self.advance(); + } else { + break; + } + } + + let mut obj = Map::new(); + let start = self.pos; + // Parse until newline - keep reading if we hit buffer end + loop { + self.ensure_buffer(1)?; + if self.pos >= self.buffer.len() && self.eof { + break; + } + if self.pos < self.buffer.len() && self.peek_char() == Some('\n') { + break; + } + if self.pos < self.buffer.len() { + self.advance(); + } else { + break; + } + } + let row = &self.buffer[start..self.pos]; + let values: Vec<&str> = self.split_row(row, delimiter); + + if values.len() != fields.len() && self.options.get_strict() { + return Err(Error::LengthMismatch { + expected: fields.len(), + found: values.len(), + }); + } + + for (i, field) in fields.iter().enumerate() { + let value_str = values.get(i).unwrap_or(&""); + let value = self.parse_primitive_value(value_str.trim())?; + obj.insert(field.to_string(), value); + } + + items.push(Value::Object(obj)); + // Skip to next line + if self.pos < self.buffer.len() && self.peek_char() == Some('\n') { + self.advance(); + } + } + + if self.options.get_strict() && items.len() != expected_length { + return Err(Error::LengthMismatch { + expected: expected_length, + found: items.len(), + }); + } + + Ok(Value::Array(items)) + } + + fn parse_inline_array(&mut self, expected_length: usize) -> Result { + let delimiter = self.detect_delimiter(); + let start = self.pos; + // Parse until newline + while self.pos < self.buffer.len() && self.peek_char() != Some('\n') { + self.advance(); + } + let row = &self.buffer[start..self.pos]; + let values: Vec<&str> = self.split_row(row, delimiter); + + let mut items = Vec::new(); + for value_str in values { + let trimmed = value_str.trim(); + if !trimmed.is_empty() { + items.push(self.parse_primitive_value(trimmed)?); + } + } + + if self.options.get_strict() && items.len() != expected_length { + return Err(Error::LengthMismatch { + expected: expected_length, + found: items.len(), + }); + } + + Ok(Value::Array(items)) + } + + fn parse_list_array(&mut self, expected_length: usize) -> Result { + // Skip to next line if we're not already there + if self.peek_char() == Some('\n') { + self.advance(); + } + let indent = self.options.get_indent(); + // Count base indentation of first item + let base_indent = self.count_indent(indent)?; + let mut items = Vec::new(); + + for _ in 0..expected_length { + if self.pos >= self.buffer.len() && self.eof { + break; + } + + // Count indentation of current line + let line_indent = self.count_indent(indent)?; + if line_indent < base_indent { + break; // Back at lower indentation level + } + + // Skip the indentation whitespace + for _ in 0..(line_indent * indent) { + if self.peek_char() == Some(' ') { + self.advance(); + } else { + break; + } + } + + // Check if there's a '-' marker (optional in some formats) + let has_dash = self.peek_char() == Some('-'); + if has_dash { + self.advance(); // consume '-' + self.skip_whitespace(); + } + + // Parse the value + let line_start = self.pos; + let line_end = self.buffer[line_start..] + .find('\n') + .map(|i| line_start + i) + .unwrap_or(self.buffer.len()); + let line = &self.buffer[line_start..line_end].trim(); + + let value = if self.peek_char() == Some('[') { + self.parse_array_value()? + } else if line.contains(':') + && !line.starts_with('"') + && line.matches(':').count() == 1 + && !line.trim_start().starts_with('-') + { + // It's an object (single key:value on this line) + let key = self.parse_key()?; + self.skip_whitespace(); + if self.peek_char() != Some(':') { + return Err(Error::parse( + self.pos, + format!("Expected ':' after key '{key}'"), + )); + } + self.advance(); // consume ':' + self.skip_whitespace(); + let val = self.parse_value()?; + let mut obj = Map::new(); + obj.insert(key, val); + Value::Object(obj) + } else { + // Primitive value + self.parse_value()? + }; + items.push(value); + // Skip to next line + if self.pos < self.buffer.len() && self.peek_char() == Some('\n') { + self.advance(); + } + } + + if self.options.get_strict() && items.len() != expected_length { + return Err(Error::LengthMismatch { + expected: expected_length, + found: items.len(), + }); + } + + Ok(Value::Array(items)) + } + + fn parse_primitive_value(&self, s: &str) -> Result { + if s.is_empty() { + return Ok(Value::Null); + } + + // Try boolean + if s == "true" { + return Ok(Value::Bool(true)); + } + if s == "false" { + return Ok(Value::Bool(false)); + } + + // Try number + if let Ok(n) = s.parse::() { + return Ok(Value::Number(n.into())); + } + if let Ok(n) = s.parse::() { + return Ok(Value::Number( + serde_json::Number::from_f64(n) + .ok_or_else(|| Error::InvalidNumber(s.to_string()))?, + )); + } + + // Must be a string (possibly quoted) + if s.starts_with('"') && s.ends_with('"') { + self.parse_quoted_string(s) + } else { + Ok(Value::String(s.to_string())) + } + } + + fn parse_quoted_string(&self, s: &str) -> Result { + let mut result = String::new(); + let chars: Vec = s.chars().collect(); + let mut i = 1; // Skip opening quote + + while i < chars.len() - 1 { + // Skip closing quote + match chars[i] { + '\\' => { + i += 1; + if i >= chars.len() - 1 { + return Err(Error::InvalidEscape("Unterminated escape".to_string())); + } + match chars[i] { + '"' => result.push('"'), + '\\' => result.push('\\'), + 'n' => result.push('\n'), + 'r' => result.push('\r'), + 't' => result.push('\t'), + _ => { + return Err(Error::InvalidEscape(format!("\\{}", chars[i]))); + } + } + } + ch => result.push(ch), + } + i += 1; + } + + Ok(Value::String(result)) + } + + fn parse_string(&mut self) -> Result { + if self.peek_char() != Some('"') { + return Err(Error::parse(self.pos, "Expected '\"'")); + } + self.advance(); // consume opening quote + + let start = self.pos; + let mut escaped = false; + + while self.pos < self.buffer.len() || !self.eof { + self.ensure_buffer(1)?; + if self.pos >= self.buffer.len() { + break; + } + let ch = self.buffer.chars().nth(self.pos).unwrap(); + if escaped { + escaped = false; + } else if ch == '\\' { + escaped = true; + } else if ch == '"' { + let s = self.buffer[start..self.pos].to_string(); + self.advance(); // consume closing quote + return self.parse_quoted_string(&format!("\"{s}\"")); + } + self.advance(); + } + + Err(Error::UnterminatedString) + } + + fn parse_number(&mut self) -> Result { + let start = self.pos; + let mut has_dot = false; + + if self.peek_char() == Some('-') { + self.advance(); + } + + while self.pos < self.buffer.len() || !self.eof { + self.ensure_buffer(1)?; + if self.pos >= self.buffer.len() { + break; + } + match self.peek_char() { + Some(ch) if ch.is_ascii_digit() => { + self.advance(); + } + Some('.') if !has_dot => { + has_dot = true; + self.advance(); + } + _ => break, + } + } + + let s = &self.buffer[start..self.pos]; + if has_dot { + let n = s + .parse::() + .map_err(|_| Error::InvalidNumber(s.to_string()))?; + serde_json::Number::from_f64(n) + .ok_or_else(|| Error::InvalidNumber(s.to_string())) + .map(Value::Number) + } else { + s.parse::() + .map(|n| Value::Number(n.into())) + .map_err(|_| Error::InvalidNumber(s.to_string())) + } + } + + fn parse_boolean_or_null(&mut self) -> Result { + let start = self.pos; + self.parse_while(|ch| ch.is_ascii_alphabetic())?; + let s = &self.buffer[start..self.pos]; + + match s { + "true" => Ok(Value::Bool(true)), + "false" => Ok(Value::Bool(false)), + "null" => Ok(Value::Null), + _ => { + // Not a boolean/null, reset position + self.pos = start; + Err(Error::parse( + self.pos, + format!("Not a boolean or null: {s}"), + )) + } + } + } + + fn parse_key(&mut self) -> Result { + self.skip_whitespace(); + let start = self.pos; + // Parse key - stop at ':', '[', space, newline, or tab + while self.pos < self.buffer.len() || !self.eof { + self.ensure_buffer(1)?; + if self.pos >= self.buffer.len() { + break; + } + match self.peek_char() { + Some(ch) if ch == ':' || ch == '[' || ch == ' ' || ch == '\n' || ch == '\t' => { + break + } + Some(_) => self.advance(), + None => break, + } + } + if self.pos == start { + return Err(Error::parse(self.pos, "Expected key")); + } + Ok(self.buffer[start..self.pos].to_string()) + } + + fn detect_delimiter(&self) -> char { + // Look ahead to detect delimiter + let remaining = &self.buffer[self.pos..]; + + // Use SIMD for larger inputs, fallback for small ones + if remaining.len() >= 32 { + simd::detect_delimiter_simd(remaining) + } else { + simd::detect_delimiter_fallback(remaining) + } + } + + fn split_row<'b>(&self, row: &'b str, delimiter: char) -> Vec<&'b str> { + // Use SIMD for larger inputs, fallback for small ones + if row.len() >= 32 { + simd::split_row_simd(row, delimiter) + } else { + simd::split_row_fallback(row, delimiter) + } + } + + fn count_indent(&mut self, indent_size: usize) -> Result { + let start = self.pos; + let mut count = 0; + let indent_str = " ".repeat(indent_size); + while self.pos < self.buffer.len() || !self.eof { + self.ensure_buffer(indent_size)?; + if self.pos + indent_size > self.buffer.len() { + break; + } + let slice = &self.buffer[self.pos..self.pos + indent_size]; + if slice == indent_str { + count += 1; + self.pos += indent_size; + } else { + break; + } + } + let indent_level = count; + self.pos = start; + Ok(indent_level) + } + + fn skip_whitespace(&mut self) { + while self.pos < self.buffer.len() || !self.eof { + self.ensure_buffer(1).ok(); + if self.pos >= self.buffer.len() { + break; + } + match self.buffer.chars().nth(self.pos) { + Some(' ') | Some('\t') => self.pos += 1, + _ => break, + } + } + } + + fn skip_to_next_line(&mut self) { + while self.pos < self.buffer.len() || !self.eof { + self.ensure_buffer(1).ok(); + if self.pos >= self.buffer.len() { + break; + } + if self.buffer.chars().nth(self.pos) == Some('\n') { + self.pos += 1; + break; + } + self.pos += 1; + } + } + + fn parse_while(&mut self, mut pred: F) -> Result + where + F: FnMut(char) -> bool, + { + let start = self.pos; + while self.pos < self.buffer.len() || !self.eof { + self.ensure_buffer(1)?; + if self.pos >= self.buffer.len() { + break; + } + if let Some(ch) = self.buffer.chars().nth(self.pos) { + if pred(ch) { + self.pos += 1; + } else { + break; + } + } else { + break; + } + } + Ok(self.buffer[start..self.pos].to_string()) + } + + fn peek_char(&self) -> Option { + if self.pos < self.buffer.len() { + self.buffer.chars().nth(self.pos) + } else { + None + } + } + + fn advance(&mut self) { + if self.pos < self.buffer.len() { + self.pos += 1; + } + } +} diff --git a/src/encode.rs b/src/encode.rs index c303370..2296440 100644 --- a/src/encode.rs +++ b/src/encode.rs @@ -3,6 +3,7 @@ use crate::error::Error; use crate::options::EncodeOptions; use serde_json::Value; +use std::io::Write; /// Encode a JSON value to TOON format /// @@ -383,3 +384,503 @@ fn encode_object( Ok(()) } + +/// Encode a JSON value to TOON format and write it to a writer +/// +/// This function streams the output directly to the writer without building +/// the entire string in memory, making it suitable for large datasets. +/// +/// # Arguments +/// +/// * `value` - The JSON value to encode +/// * `writer` - The writer to write the TOON-formatted output to +/// * `options` - Optional encoding options +/// +/// # Returns +/// +/// A `Result` indicating success or failure +/// +/// # Example +/// +/// ```rust,no_run +/// use std::fs::File; +/// use std::io::BufWriter; +/// use serde_json::json; +/// use toon_rust::encode_stream; +/// +/// let data = json!({"name": "Alice", "age": 30}); +/// let file = File::create("output.toon").unwrap(); +/// let mut writer = BufWriter::new(file); +/// encode_stream(&data, &mut writer, None).unwrap(); +/// ``` +pub fn encode_stream( + value: &Value, + writer: &mut W, + options: Option<&EncodeOptions>, +) -> Result<(), Error> { + let default_opts = EncodeOptions::default(); + let opts = options.unwrap_or(&default_opts); + encode_value_to_writer(value, writer, 0, opts)?; + writer.flush().map_err(|e| Error::Io(e.to_string()))?; + Ok(()) +} + +fn encode_value_to_writer( + value: &Value, + writer: &mut W, + indent_level: usize, + options: &EncodeOptions, +) -> Result<(), Error> { + match value { + Value::Null => { + // Null values are typically omitted or represented as empty + } + Value::Bool(b) => { + let s = if *b { "true" } else { "false" }; + writer + .write_all(s.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + } + Value::Number(n) => { + if let Some(i) = n.as_i64() { + let s = i.to_string(); + writer + .write_all(s.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + } else if let Some(f) = n.as_f64() { + let s = f.to_string(); + writer + .write_all(s.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + } else { + return Err(Error::Serialization("Invalid number".to_string())); + } + } + Value::String(s) => { + encode_string_to_writer(s, writer, options.get_delimiter())?; + } + Value::Array(arr) => { + encode_array_to_writer(arr, writer, indent_level, options)?; + } + Value::Object(obj) => { + encode_object_to_writer(obj, writer, indent_level, options)?; + } + } + Ok(()) +} + +fn encode_string_to_writer( + s: &str, + writer: &mut W, + delimiter: char, +) -> Result<(), Error> { + // Check if we need to quote the string + let needs_quoting = s.contains(delimiter) + || s.contains(' ') + || s.contains('\n') + || s.contains('\t') + || s == "true" + || s == "false" + || s == "null" + || s.parse::().is_ok(); + + if needs_quoting { + writer + .write_all(b"\"") + .map_err(|e| Error::Io(e.to_string()))?; + for ch in s.chars() { + match ch { + '"' => writer + .write_all(b"\\\"") + .map_err(|e| Error::Io(e.to_string()))?, + '\\' => writer + .write_all(b"\\\\") + .map_err(|e| Error::Io(e.to_string()))?, + '\n' => writer + .write_all(b"\\n") + .map_err(|e| Error::Io(e.to_string()))?, + '\r' => writer + .write_all(b"\\r") + .map_err(|e| Error::Io(e.to_string()))?, + '\t' => writer + .write_all(b"\\t") + .map_err(|e| Error::Io(e.to_string()))?, + _ => { + let mut buf = [0; 4]; + let bytes = ch.encode_utf8(&mut buf).as_bytes(); + writer + .write_all(bytes) + .map_err(|e| Error::Io(e.to_string()))?; + } + } + } + writer + .write_all(b"\"") + .map_err(|e| Error::Io(e.to_string()))?; + } else { + writer + .write_all(s.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + } + Ok(()) +} + +fn encode_array_to_writer( + arr: &[Value], + writer: &mut W, + indent_level: usize, + options: &EncodeOptions, +) -> Result<(), Error> { + if arr.is_empty() { + writer + .write_all(b"[0]:") + .map_err(|e| Error::Io(e.to_string()))?; + return Ok(()); + } + + // Check if array contains uniform objects (tabular format) + if let Some(keys) = check_uniform_objects(arr) { + // For root-level arrays, include the header + let length_marker = options + .length_marker + .map(|m| format!("{m}")) + .unwrap_or_default(); + let header = format!("[{}{}]", length_marker, arr.len()); + writer + .write_all(header.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + writer + .write_all(b"{") + .map_err(|e| Error::Io(e.to_string()))?; + let keys_str = keys.join(&options.get_delimiter().to_string()); + writer + .write_all(keys_str.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + writer + .write_all(b"}:\n") + .map_err(|e| Error::Io(e.to_string()))?; + encode_tabular_array_rows_to_writer(arr, keys, writer, indent_level, options)?; + return Ok(()); + } + + // Check if all elements are primitives (inline format) + if arr.iter().all(is_primitive) { + encode_inline_array_to_writer(arr, writer, options)?; + return Ok(()); + } + + // Otherwise, use list format + encode_list_array_to_writer(arr, writer, indent_level, options)?; + Ok(()) +} + +fn encode_tabular_array_rows_to_writer( + arr: &[Value], + keys: Vec, + writer: &mut W, + indent_level: usize, + options: &EncodeOptions, +) -> Result<(), Error> { + let indent = options.get_indent(); + let indent_str = " ".repeat(indent_level * indent); + let delimiter = options.get_delimiter(); + + // Write rows (header already written by caller) + for item in arr { + writer + .write_all(indent_str.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + writer + .write_all(" ".repeat(indent).as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + let obj = item + .as_object() + .ok_or_else(|| Error::Serialization("Expected object in tabular array".to_string()))?; + + let mut first = true; + for key in &keys { + if !first { + let delim_bytes = [delimiter as u8]; + writer + .write_all(&delim_bytes) + .map_err(|e| Error::Io(e.to_string()))?; + } + let value = obj + .get(key) + .ok_or_else(|| Error::Serialization(format!("Missing key: {key}")))?; + encode_primitive_value_to_writer(value, writer, delimiter)?; + first = false; + } + writer + .write_all(b"\n") + .map_err(|e| Error::Io(e.to_string()))?; + } + + Ok(()) +} + +fn encode_primitive_value_to_writer( + value: &Value, + writer: &mut W, + delimiter: char, +) -> Result<(), Error> { + match value { + Value::Null => {} + Value::Bool(b) => { + let s = if *b { "true" } else { "false" }; + writer + .write_all(s.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + } + Value::Number(n) => { + if let Some(i) = n.as_i64() { + let s = i.to_string(); + writer + .write_all(s.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + } else if let Some(f) = n.as_f64() { + let s = f.to_string(); + writer + .write_all(s.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + } else { + return Err(Error::Serialization("Invalid number".to_string())); + } + } + Value::String(s) => { + encode_string_to_writer(s, writer, delimiter)?; + } + _ => { + return Err(Error::Serialization( + "Non-primitive value in tabular array".to_string(), + )); + } + } + Ok(()) +} + +fn encode_inline_array_to_writer( + arr: &[Value], + writer: &mut W, + options: &EncodeOptions, +) -> Result<(), Error> { + let length_marker = options + .length_marker + .map(|m| format!("{m}")) + .unwrap_or_default(); + let header = format!("[{}{}]:", length_marker, arr.len()); + writer + .write_all(header.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + + let delimiter = options.get_delimiter(); + let mut first = true; + for item in arr { + if !first { + let delim_bytes = [delimiter as u8]; + writer + .write_all(&delim_bytes) + .map_err(|e| Error::Io(e.to_string()))?; + } + match item { + Value::Null => {} + Value::Bool(b) => { + let s = if *b { "true" } else { "false" }; + writer + .write_all(s.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + } + Value::Number(n) => { + if let Some(i) = n.as_i64() { + let s = i.to_string(); + writer + .write_all(s.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + } else if let Some(f) = n.as_f64() { + let s = f.to_string(); + writer + .write_all(s.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + } + } + Value::String(s) => { + encode_string_to_writer(s, writer, delimiter)?; + } + _ => { + return Err(Error::Serialization( + "Non-primitive in inline array".to_string(), + )); + } + } + first = false; + } + + Ok(()) +} + +fn encode_list_array_to_writer( + arr: &[Value], + writer: &mut W, + indent_level: usize, + options: &EncodeOptions, +) -> Result<(), Error> { + let indent = options.get_indent(); + let indent_str = " ".repeat(indent_level * indent); + + for item in arr { + writer + .write_all(indent_str.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + writer + .write_all(" ".repeat(indent).as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + writer + .write_all(b"- ") + .map_err(|e| Error::Io(e.to_string()))?; + // For objects in list arrays, encode them inline as key: value + match item { + Value::Object(obj) => { + let mut first = true; + for (key, val) in obj { + if !first { + writer + .write_all(b" ") + .map_err(|e| Error::Io(e.to_string()))?; + } + writer + .write_all(key.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + writer + .write_all(b": ") + .map_err(|e| Error::Io(e.to_string()))?; + encode_primitive_value_to_writer(val, writer, options.get_delimiter())?; + first = false; + } + } + _ => { + encode_value_to_writer(item, writer, indent_level + 1, options)?; + } + } + writer + .write_all(b"\n") + .map_err(|e| Error::Io(e.to_string()))?; + } + + Ok(()) +} + +fn encode_object_to_writer( + obj: &serde_json::Map, + writer: &mut W, + indent_level: usize, + options: &EncodeOptions, +) -> Result<(), Error> { + if obj.is_empty() { + return Ok(()); + } + + let indent = options.get_indent(); + let indent_str = " ".repeat(indent_level * indent); + + let mut first = true; + for (key, value) in obj { + if !first { + writer + .write_all(b"\n") + .map_err(|e| Error::Io(e.to_string()))?; + } + writer + .write_all(indent_str.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + writer + .write_all(key.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + + match value { + Value::Array(arr) => { + // For arrays, check the format and encode appropriately + if arr.is_empty() { + writer + .write_all(b"[0]:") + .map_err(|e| Error::Io(e.to_string()))?; + } else if let Some(keys) = check_uniform_objects(arr) { + // Tabular array - output on same line: key[N]{...}: + let length_marker = options + .length_marker + .map(|m| format!("{m}")) + .unwrap_or_default(); + let header = format!("[{}{}]", length_marker, arr.len()); + writer + .write_all(header.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + writer + .write_all(b"{") + .map_err(|e| Error::Io(e.to_string()))?; + let keys_str = keys.join(&options.get_delimiter().to_string()); + writer + .write_all(keys_str.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + writer + .write_all(b"}:\n") + .map_err(|e| Error::Io(e.to_string()))?; + // Now output the rows + encode_tabular_array_rows_to_writer(arr, keys, writer, indent_level, options)?; + } else if arr.iter().all(is_primitive) { + // Inline array - output on same line: key[N]: value1,value2 + let length_marker = options + .length_marker + .map(|m| format!("{m}")) + .unwrap_or_default(); + let header = format!("[{}{}]:", length_marker, arr.len()); + writer + .write_all(header.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + let delimiter = options.get_delimiter(); + let mut first = true; + for item in arr { + if !first { + let delim_bytes = [delimiter as u8]; + writer + .write_all(&delim_bytes) + .map_err(|e| Error::Io(e.to_string()))?; + } + encode_primitive_value_to_writer(item, writer, delimiter)?; + first = false; + } + } else { + // List array - output on same line: key[N]: + let length_marker = options + .length_marker + .map(|m| format!("{m}")) + .unwrap_or_default(); + let header = format!("[{}{}]:", length_marker, arr.len()); + writer + .write_all(header.as_bytes()) + .map_err(|e| Error::Io(e.to_string()))?; + writer + .write_all(b"\n") + .map_err(|e| Error::Io(e.to_string()))?; + encode_list_array_to_writer(arr, writer, indent_level, options)?; + } + } + Value::Object(_) => { + writer + .write_all(b": ") + .map_err(|e| Error::Io(e.to_string()))?; + writer + .write_all(b"\n") + .map_err(|e| Error::Io(e.to_string()))?; + encode_value_to_writer(value, writer, indent_level + 1, options)?; + } + _ => { + writer + .write_all(b": ") + .map_err(|e| Error::Io(e.to_string()))?; + encode_value_to_writer(value, writer, indent_level, options)?; + } + } + first = false; + } + + Ok(()) +} diff --git a/src/lib.rs b/src/lib.rs index b0cf2bd..bdcbd8b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -22,6 +22,27 @@ //! let decoded = decode(&toon, None).unwrap(); //! ``` //! +//! ## Streaming API +//! +//! For large datasets, use the streaming API to avoid loading everything into memory: +//! +//! ```rust,no_run +//! use std::fs::File; +//! use std::io::BufWriter; +//! use serde_json::json; +//! use toon_rust::{encode_stream, decode_stream}; +//! +//! // Encode to file +//! let data = json!({"name": "Alice", "age": 30}); +//! let file = File::create("output.toon").unwrap(); +//! let mut writer = BufWriter::new(file); +//! encode_stream(&data, &mut writer, None).unwrap(); +//! +//! // Decode from file +//! let file = File::open("output.toon").unwrap(); +//! let decoded = decode_stream(file, None).unwrap(); +//! ``` +//! //! ## Serde API (requires `serde` feature) //! //! ```rust,no_run @@ -50,8 +71,8 @@ pub mod error; pub mod options; mod simd; -pub use decode::decode; -pub use encode::encode; +pub use decode::{decode, decode_stream}; +pub use encode::{encode, encode_stream}; pub use error::Error; pub use options::{DecodeOptions, EncodeOptions}; diff --git a/tests/streaming.rs b/tests/streaming.rs new file mode 100644 index 0000000..9e47aae --- /dev/null +++ b/tests/streaming.rs @@ -0,0 +1,314 @@ +//! Tests for TOON streaming API + +use serde_json::json; +use std::io::{BufWriter, Cursor}; +use toon_rust::{decode, decode_stream, encode, encode_stream, DecodeOptions, EncodeOptions}; + +#[test] +fn test_encode_stream_simple_object() { + let data = json!({ + "name": "Alice", + "age": 30 + }); + + let mut buffer = Vec::new(); + encode_stream(&data, &mut buffer, None).unwrap(); + let output = String::from_utf8(buffer).unwrap(); + + // Compare with non-streaming version + let expected = encode(&data, None).unwrap(); + assert_eq!(output, expected); + assert!(output.contains("name: Alice")); + assert!(output.contains("age: 30")); +} + +#[test] +fn test_decode_stream_simple_object() { + let toon = "name: Alice\nage: 30"; + let mut cursor = Cursor::new(toon.as_bytes()); + let result = decode_stream(&mut cursor, None).unwrap(); + + // Compare with non-streaming version + let expected = decode(toon, None).unwrap(); + assert_eq!(result, expected); + assert_eq!(result["name"], "Alice"); + assert_eq!(result["age"], 30); +} + +#[test] +fn test_encode_stream_primitive_array() { + let data = json!({ + "tags": ["reading", "gaming", "coding"] + }); + + let mut buffer = Vec::new(); + encode_stream(&data, &mut buffer, None).unwrap(); + let output = String::from_utf8(buffer).unwrap(); + + let expected = encode(&data, None).unwrap(); + assert_eq!(output, expected); +} + +#[test] +fn test_decode_stream_primitive_array() { + let toon = "tags[3]: reading,gaming,coding"; + let mut cursor = Cursor::new(toon.as_bytes()); + let result = decode_stream(&mut cursor, None).unwrap(); + + let expected = decode(toon, None).unwrap(); + assert_eq!(result, expected); +} + +#[test] +fn test_encode_stream_tabular_array() { + let data = json!({ + "items": [ + {"sku": "A1", "qty": 2, "price": 9.99}, + {"sku": "B2", "qty": 1, "price": 14.5} + ] + }); + + let mut buffer = Vec::new(); + encode_stream(&data, &mut buffer, None).unwrap(); + let output = String::from_utf8(buffer).unwrap(); + + let expected = encode(&data, None).unwrap(); + assert_eq!(output, expected); +} + +#[test] +fn test_decode_stream_tabular_array() { + let toon = "items[2]{sku,qty,price}:\n A1,2,9.99\n B2,1,14.5"; + let mut cursor = Cursor::new(toon.as_bytes()); + let result = decode_stream(&mut cursor, None).unwrap(); + + let expected = decode(toon, None).unwrap(); + assert_eq!(result, expected); +} + +#[test] +fn test_encode_stream_nested_object() { + let data = json!({ + "user": { + "id": 1, + "name": "Alice" + } + }); + + let mut buffer = Vec::new(); + encode_stream(&data, &mut buffer, None).unwrap(); + let output = String::from_utf8(buffer).unwrap(); + + let expected = encode(&data, None).unwrap(); + assert_eq!(output, expected); +} + +#[test] +fn test_decode_stream_nested_object() { + let toon = "user:\n id: 1\n name: Alice"; + let mut cursor = Cursor::new(toon.as_bytes()); + let result = decode_stream(&mut cursor, None).unwrap(); + + let expected = decode(toon, None).unwrap(); + assert_eq!(result, expected); +} + +#[test] +fn test_streaming_roundtrip() { + let data = json!({ + "name": "Alice", + "age": 30, + "tags": ["reading", "gaming"], + "items": [ + {"sku": "A1", "qty": 2, "price": 9.99} + ] + }); + + // Encode using streaming + let mut buffer = Vec::new(); + encode_stream(&data, &mut buffer, None).unwrap(); + + // Decode using streaming + let mut cursor = Cursor::new(&buffer); + let decoded = decode_stream(&mut cursor, None).unwrap(); + + assert_eq!(data, decoded); +} + +#[test] +fn test_streaming_roundtrip_large_array() { + // Create a large array of objects + let items: Vec<_> = (0..1000) + .map(|i| { + json!({ + "id": i, + "name": format!("Item {}", i), + "value": i * 2 + }) + }) + .collect(); + let data = json!({ "items": items }); + + // Encode using streaming + let mut buffer = Vec::new(); + encode_stream(&data, &mut buffer, None).unwrap(); + + // Decode using streaming + let mut cursor = Cursor::new(&buffer); + let decoded = decode_stream(&mut cursor, None).unwrap(); + + assert_eq!(data, decoded); +} + +#[test] +fn test_encode_stream_with_options() { + let data = json!({ + "items": [ + {"sku": "A1", "qty": 2}, + {"sku": "B2", "qty": 1} + ] + }); + + let options = EncodeOptions::new().delimiter(toon_rust::options::Delimiter::Pipe); + let mut buffer = Vec::new(); + encode_stream(&data, &mut buffer, Some(&options)).unwrap(); + let output = String::from_utf8(buffer).unwrap(); + + let expected = encode(&data, Some(&options)).unwrap(); + assert_eq!(output, expected); + assert!(output.contains("|")); +} + +#[test] +fn test_decode_stream_with_options() { + let toon = "user:\n id: 1\n name: Alice"; + let options = DecodeOptions::new().indent(4); + let mut cursor = Cursor::new(toon.as_bytes()); + let result = decode_stream(&mut cursor, Some(&options)).unwrap(); + + let expected = decode(toon, Some(&options)).unwrap(); + assert_eq!(result, expected); +} + +#[test] +fn test_encode_stream_to_bufwriter() { + let data = json!({"test": "value"}); + let mut buffer = Vec::new(); + let mut writer = BufWriter::new(&mut buffer); + encode_stream(&data, &mut writer, None).unwrap(); + drop(writer); // Ensure buffer is flushed + + let output = String::from_utf8(buffer).unwrap(); + let expected = encode(&data, None).unwrap(); + assert_eq!(output, expected); +} + +#[test] +fn test_streaming_empty_array() { + let data = json!({"items": []}); + let mut buffer = Vec::new(); + encode_stream(&data, &mut buffer, None).unwrap(); + let output = String::from_utf8(buffer).unwrap(); + + let mut cursor = Cursor::new(output.as_bytes()); + let decoded = decode_stream(&mut cursor, None).unwrap(); + assert_eq!(data, decoded); +} + +#[test] +fn test_streaming_empty_object() { + let data = json!({}); + let mut buffer = Vec::new(); + encode_stream(&data, &mut buffer, None).unwrap(); + let output = String::from_utf8(buffer).unwrap(); + + let mut cursor = Cursor::new(output.as_bytes()); + let decoded = decode_stream(&mut cursor, None).unwrap(); + assert_eq!(data, decoded); +} + +#[test] +fn test_streaming_mixed_types() { + let data = json!({ + "string": "hello", + "number": 42, + "float": 3.14, + "boolean": true, + "null": null, + "array": [1, 2, 3], + "object": {"key": "value"} + }); + + let mut buffer = Vec::new(); + encode_stream(&data, &mut buffer, None).unwrap(); + + let mut cursor = Cursor::new(&buffer); + let decoded = decode_stream(&mut cursor, None).unwrap(); + + assert_eq!(data, decoded); +} + +#[test] +fn test_streaming_root_array() { + let data = json!([ + {"id": 1, "name": "Alice"}, + {"id": 2, "name": "Bob"} + ]); + + let mut buffer = Vec::new(); + encode_stream(&data, &mut buffer, None).unwrap(); + + let mut cursor = Cursor::new(&buffer); + let decoded = decode_stream(&mut cursor, None).unwrap(); + + assert_eq!(data, decoded); +} + +#[test] +fn test_streaming_quoted_strings() { + let data = json!({ + "note": "hello, world", + "quote": "She said \"hello\"" + }); + + let mut buffer = Vec::new(); + encode_stream(&data, &mut buffer, None).unwrap(); + + let mut cursor = Cursor::new(&buffer); + let decoded = decode_stream(&mut cursor, None).unwrap(); + + assert_eq!(data, decoded); +} + +// Test that streaming produces identical output to non-streaming +#[test] +fn test_streaming_output_identical() { + let test_cases = vec![ + json!({"simple": "value"}), + json!({"array": [1, 2, 3]}), + json!({"nested": {"key": "value"}}), + json!({"mixed": [1, "two", true, null]}), + json!({ + "items": [ + {"sku": "A1", "qty": 2, "price": 9.99}, + {"sku": "B2", "qty": 1, "price": 14.5} + ] + }), + ]; + + for data in test_cases { + // Non-streaming + let non_streaming = encode(&data, None).unwrap(); + + // Streaming + let mut buffer = Vec::new(); + encode_stream(&data, &mut buffer, None).unwrap(); + let streaming = String::from_utf8(buffer).unwrap(); + + assert_eq!( + non_streaming, streaming, + "Output mismatch for data: {:?}", + data + ); + } +}