Normalize CRLF (Windows) and bare CR (old Mac) line endings to LF in
to_parsed_lines() before the source is split on "\n", so that no stray
"\r" ends up in parsed line content. Adds decoder and scanner tests
covering CRLF, bare CR and mixed line endings.

diff --git a/src/toon_format/_scanner.py b/src/toon_format/_scanner.py
index cb927a2..e514857 100644
--- a/src/toon_format/_scanner.py
+++ b/src/toon_format/_scanner.py
@@ -207,6 +207,13 @@ def to_parsed_lines(
     if not source.strip():
         return [], []
 
+    # Normalize Windows CRLF line endings to LF
+    # This prevents stray \r characters from appearing in content
+    source = source.replace("\r\n", "\n")
+
+    # Replace any remaining standalone \r characters (old Mac format) with \n
+    source = source.replace("\r", "\n")
+
     lines = source.split("\n")
     parsed: List[ParsedLine] = []
     blank_lines: List[BlankLineInfo] = []
diff --git a/tests/test_decoder.py b/tests/test_decoder.py
index 13c7736..784e614 100644
--- a/tests/test_decoder.py
+++ b/tests/test_decoder.py
@@ -140,3 +140,58 @@ def test_object_key_order_preserved(self):
         assert keys == ["z", "a", "m", "b"]
         # Verify order is not alphabetical
         assert keys != ["a", "b", "m", "z"]
+
+
+class TestCRLFDecoding:
+    """Test CRLF (Windows) line ending handling in decoder."""
+
+    def test_decode_object_with_crlf(self):
+        """Test decoding objects with CRLF line endings."""
+        toon = "name: Alice\r\nage: 30\r\n"
+        result = decode(toon)
+        assert result == {"name": "Alice", "age": 30}
+
+    def test_decode_nested_object_with_crlf(self):
+        """Test decoding nested objects with CRLF line endings."""
+        toon = "person:\r\n  name: Alice\r\n  age: 30\r\n"
+        result = decode(toon)
+        assert result == {"person": {"name": "Alice", "age": 30}}
+
+    def test_decode_array_with_crlf(self):
+        """Test decoding arrays with CRLF line endings."""
+        toon = "items[3]:\r\n  - apple\r\n  - banana\r\n  - cherry\r\n"
+        result = decode(toon)
+        assert result == {"items": ["apple", "banana", "cherry"]}
+
+    def test_decode_delimited_array_with_crlf(self):
+        """Test decoding delimited arrays with CRLF line endings."""
+        toon = "items[3]: apple,banana,cherry\r\n"
+        result = decode(toon)
+        assert result == {"items": ["apple", "banana", "cherry"]}
+
+    def test_decode_with_old_mac_cr(self):
+        """Test decoding with old Mac CR line endings."""
+        toon = "name: Alice\rage: 30\r"
+        result = decode(toon)
+        assert result == {"name": "Alice", "age": 30}
+
+    def test_decode_with_mixed_line_endings(self):
+        """Test decoding with mixed line endings."""
+        toon = "name: Alice\r\nage: 30\ncity: NYC\r"
+        result = decode(toon)
+        assert result == {"name": "Alice", "age": 30, "city": "NYC"}
+
+    def test_crlf_does_not_affect_quoted_strings(self):
+        """Test that CRLF normalization doesn't affect escaped \\r in strings."""
+        toon = 'text: "line1\\r\\nline2"\r\n'
+        result = decode(toon)
+        # The string should contain the escaped sequences
+        assert result == {"text": "line1\r\nline2"}
+
+    def test_crlf_in_strict_mode(self):
+        """Test CRLF works correctly in strict mode."""
+        toon = "name:\r\n  first: Alice\r\n  age: 30\r\n"
+        options = DecodeOptions(strict=True)
+        result = decode(toon, options)
+        assert result == {"name": {"first": "Alice", "age": 30}}
+
diff --git a/tests/test_scanner.py b/tests/test_scanner.py
index 3870e94..b2e2621 100644
--- a/tests/test_scanner.py
+++ b/tests/test_scanner.py
@@ -241,3 +241,62 @@ def test_blank_lines_not_validated_in_strict_mode(self):
         # Should not raise error for blank line with invalid indentation
         assert len(blanks) == 1
         assert blanks[0].line_num == 2
+
+
+class TestCRLFHandling:
+    """Tests for CRLF and CR normalization."""
+
+    def test_crlf_normalization(self):
+        """Test Windows CRLF line endings are normalized to LF."""
+        source = "name: Alice\r\nage: 30\r\n"
+        lines, blanks = to_parsed_lines(source, 2, False)
+        # Verify no \r remains in content
+        assert "\r" not in lines[0].content
+        assert "\r" not in lines[1].content
+        assert lines[0].content == "name: Alice"
+        assert lines[1].content == "age: 30"
+        assert len(lines) == 3  # name, age, and trailing empty line
+
+    def test_standalone_cr_normalization(self):
+        """Test old Mac CR line endings are normalized to LF."""
+        source = "name: Alice\rage: 30\r"
+        lines, blanks = to_parsed_lines(source, 2, False)
+        # Verify no \r remains in content
+        assert "\r" not in lines[0].content
+        assert "\r" not in lines[1].content
+        assert lines[0].content == "name: Alice"
+        assert lines[1].content == "age: 30"
+        assert len(lines) == 3  # name, age, and trailing empty line
+
+    def test_mixed_line_endings(self):
+        """Test mixed line endings are all normalized."""
+        source = "line1\r\nline2\nline3\rline4"
+        lines, blanks = to_parsed_lines(source, 2, False)
+        assert len(lines) == 4
+        for line in lines:
+            assert "\r" not in line.content
+        assert lines[0].content == "line1"
+        assert lines[1].content == "line2"
+        assert lines[2].content == "line3"
+        assert lines[3].content == "line4"
+
+    def test_crlf_with_indentation(self):
+        """Test CRLF handling preserves indentation."""
+        source = "parent:\r\n  child: value\r\n"
+        lines, blanks = to_parsed_lines(source, 2, False)
+        assert lines[0].content == "parent:"
+        assert lines[0].depth == 0
+        assert lines[1].content == "child: value"
+        assert lines[1].depth == 1
+        assert lines[1].indent == 2
+
+    def test_crlf_in_strict_mode(self):
+        """Test CRLF normalization works in strict mode."""
+        source = "name: Alice\r\n  age: 30\r\n"
+        lines, blanks = to_parsed_lines(source, 2, True)
+        # Should not raise error and should properly normalize
+        assert len(lines) == 3
+        assert "\r" not in lines[0].content
+        assert "\r" not in lines[1].content
+        assert lines[1].depth == 1
+