diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 77138f5..a8c6213 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -3,28 +3,78 @@ name: Publish to PyPI on: release: types: [published] + workflow_dispatch: + +permissions: + contents: read jobs: - publish: - name: Publish to PyPI + build: + name: Build distribution runs-on: ubuntu-latest - permissions: - id-token: write - contents: read steps: - uses: actions/checkout@v4 - - name: Install uv - uses: astral-sh/setup-uv@v5 - - name: Set up Python uses: actions/setup-python@v5 with: - python-version: "3.12" + python-version: "3.x" + + - name: Install build dependencies + run: | + python -m pip install --upgrade pip + pip install build - name: Build package - run: uv build + run: python -m build + + - name: Store distribution packages + uses: actions/upload-artifact@v4 + with: + name: python-package-distributions + path: dist/ + + publish-to-pypi: + name: Publish to PyPI + if: github.event_name == 'release' && github.event.action == 'published' + needs: build + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/toon-format + permissions: + id-token: write + + steps: + - name: Download distributions + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@release/v1 + + publish-to-testpypi: + name: Publish to TestPyPI + if: github.event_name == 'workflow_dispatch' + needs: build + runs-on: ubuntu-latest + environment: + name: testpypi + url: https://test.pypi.org/p/toon-format + permissions: + id-token: write + + steps: + - name: Download distributions + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + + - name: Publish to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ diff --git a/.github/workflows/test.yml 
b/.github/workflows/test.yml index 171c10d..e2ae360 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,47 +2,62 @@ name: Tests on: push: - branches: [main] + branches: [main, develop] pull_request: - branches: [main] + branches: [main, develop] jobs: test: - name: Python ${{ matrix.python-version }} on ${{ matrix.os }} - runs-on: ${{ matrix.os }} + name: Test Python ${{ matrix.python-version }} + runs-on: ubuntu-latest strategy: - fail-fast: false matrix: - os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ["3.11", "3.12", "3.13", "3.14"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v4 - - name: Install uv - uses: astral-sh/setup-uv@v5 - with: - enable-cache: true - - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies - run: uv sync + run: | + python -m pip install --upgrade pip + pip install -e . 
+ pip install pytest pytest-cov - name: Run tests - run: uv run pytest tests/ -v - - - name: Run tests with coverage - if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.12' - run: | - uv run pytest tests/ --cov=src/toon_format --cov-report=xml --cov-report=term-missing + run: pytest --cov=toon_format --cov-report=xml --cov-report=term - - name: Upload coverage to Codecov - if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.12' + - name: Upload coverage uses: codecov/codecov-action@v4 + if: matrix.python-version == '3.12' with: file: ./coverage.xml fail_ci_if_error: false + + lint: + name: Lint + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install ruff mypy + + - name: Run ruff + run: ruff check src/toon_format tests + + - name: Run mypy + run: mypy src/toon_format + continue-on-error: true # Mypy is informational only diff --git a/.gitignore b/.gitignore index 38f0c6c..f291515 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,12 @@ -# Python +# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class + +# C extensions *.so + +# Distribution / packaging .Python build/ develop-eggs/ @@ -23,7 +27,36 @@ share/python-wheels/ *.egg MANIFEST -# Virtual environments +# Package-specific +toon_format.egg-info/ + +# Ruff cache +.ruff_cache/ + +# Mypy cache +.mypy_cache/ +.dmypy.json +dmypy.json + +# PyInstaller +*.manifest +*.spec + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Environments .env .venv env/ @@ -38,20 +71,30 @@ venv.bak/ *.swp *.swo *~ +.claude/ +CLAUDE.md + +# macOS .DS_Store +.AppleDouble +.LSOverride +._* -# Testing -.pytest_cache/ -.coverage -htmlcov/ -.tox/ -.nox/ +# Files that might appear in 
the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent -# Type checking -.mypy_cache/ -.pytype/ -.pyre/ -.pyright/ +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk # uv .uv/ diff --git a/README.md b/README.md index 85fbdc2..68655e5 100644 --- a/README.md +++ b/README.md @@ -1,57 +1,496 @@ # TOON Format for Python -[![PyPI version](https://img.shields.io/pypi/v/toon-format.svg)](https://pypi.org/project/toon-format/) -[![Python versions](https://img.shields.io/pypi/pyversions/toon-format.svg)](https://pypi.org/project/toon-format/) -[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](./LICENSE) +A compact, human-readable serialization format designed for passing structured data to Large Language Models with significantly reduced token usage. -**Token-Oriented Object Notation** is a compact, human-readable format designed for passing structured data to Large Language Models with significantly reduced token usage. +[![Tests](https://github.com/toon-format/toon-python/actions/workflows/test.yml/badge.svg)](https://github.com/toon-format/toon-python/actions) +[![PyPI](https://img.shields.io/pypi/v/toon-format.svg)](https://pypi.org/project/toon-format/) +[![Python Versions](https://img.shields.io/pypi/pyversions/toon-format.svg)](https://pypi.org/project/toon-format/) -## Status +## Installation -🚧 **This package is currently a namespace reservation.** Full implementation coming soon! +```bash +# With pip +pip install toon-format -### Example +# With uv (recommended) +uv pip install toon-format +``` -**JSON** (verbose): -```json -{ - "users": [ - { "id": 1, "name": "Alice", "role": "admin" }, - { "id": 2, "name": "Bob", "role": "user" } - ] +## What is TOON? 
+ +TOON (Token-Oriented Object Notation) combines YAML's indentation-based structure for nested objects and CSV's tabular format for uniform data rows, optimized specifically for token efficiency in LLM contexts. + +This is a faithful Python implementation maintaining 100% output compatibility with the [official TOON specification](https://github.com/toon-format/spec). + +### Key Features + +- **30-60% token reduction** compared to standard JSON +- **Minimal syntax**: Eliminates redundant punctuation (braces, brackets, most quotes) +- **Tabular arrays**: CSV-like row format for uniform object collections +- **Explicit metadata**: Array length indicators `[N]` for validation +- **LLM-friendly**: Maintains semantic clarity while reducing token count +- **100% compatible** with original TypeScript implementation + + +## Quick Start + +```python +from toon_format import encode + +# Simple object +data = {"name": "Alice", "age": 30} +print(encode(data)) +# Output: +# name: Alice +# age: 30 + +# Tabular array (uniform objects) +users = [ + {"id": 1, "name": "Alice", "age": 30}, + {"id": 2, "name": "Bob", "age": 25}, + {"id": 3, "name": "Charlie", "age": 35}, +] +print(encode(users)) +# Output: +# [3,]{id,name,age}: +# 1,Alice,30 +# 2,Bob,25 +# 3,Charlie,35 + +# Complex nested structure +data = { + "metadata": {"version": 1, "author": "test"}, + "items": [ + {"id": 1, "name": "Item1"}, + {"id": 2, "name": "Item2"}, + ], + "tags": ["alpha", "beta", "gamma"], } +print(encode(data)) +# Output: +# metadata: +# version: 1 +# author: test +# items[2,]{id,name}: +# 1,Item1 +# 2,Item2 +# tags[3]: alpha,beta,gamma ``` -**TOON** (compact): +## CLI Usage + +Command-line tool for converting between JSON and TOON formats. 
+ +```bash +# Encode JSON to TOON (auto-detected by .json extension) +toon input.json -o output.toon + +# Decode TOON to JSON (auto-detected by .toon extension) +toon data.toon -o output.json + +# Use stdin/stdout +echo '{"name": "Ada"}' | toon - +# Output: name: Ada + +# Force encode mode +toon data.json --encode + +# Force decode mode +toon data.toon --decode + +# Custom delimiter (pass a literal tab character; $'\t' is bash/zsh ANSI-C quoting) +toon data.json --delimiter $'\t' -o output.toon + +# With length markers +toon data.json --length-marker -o output.toon + +# Lenient decoding (disable strict validation) +toon data.toon --no-strict -o output.json ``` -users[2]{id,name,role}: - 1,Alice,admin - 2,Bob,user + +### CLI Options + +| Option | Description | +|--------|-------------| +| `-o, --output <file>` | Output file path (prints to stdout if omitted) | +| `-e, --encode` | Force encode mode (overrides auto-detection) | +| `-d, --decode` | Force decode mode (overrides auto-detection) | +| `--delimiter <char>` | Array delimiter: `,` (comma), `\t` (tab), `\|` (pipe) | +| `--indent <number>` | Indentation size (default: 2) | +| `--length-marker` | Add `#` prefix to array lengths (e.g., `items[#3]`) | +| `--no-strict` | Disable strict validation when decoding | + +## API Reference + +### `encode(value, options=None)` + +Converts a Python value to TOON format. + +**Parameters:** +- `value` (Any): JSON-serializable value to encode +- `options` (dict, optional): Encoding options + +**Returns:** `str` - TOON-formatted string + +**Example:** + +```python +from toon_format import encode + +data = {"id": 123, "name": "Ada"} +toon_str = encode(data) +print(toon_str) +# Output: +# id: 123 +# name: Ada ``` -## Resources +### `decode(input_str, options=None)` + +Converts a TOON-formatted string back to Python values.
-- [TOON Specification](https://github.com/johannschopplich/toon/blob/main/SPEC.md) -- [Main Repository](https://github.com/johannschopplich/toon) -- [Benchmarks & Performance](https://github.com/johannschopplich/toon#benchmarks) -- [Other Language Implementations](https://github.com/johannschopplich/toon#other-implementations) +**Parameters:** +- `input_str` (str): TOON-formatted string to parse +- `options` (DecodeOptions, optional): Decoding options -## Future Usage +**Returns:** Python value (dict, list, or primitive) -Once implemented, the package will provide: +**Example:** ```python -from toon_format import encode, decode +from toon_format import decode -data = # your data structure -toon_string = encode(data) -decoded = decode(toon_string) +toon_str = """items[2]{sku,qty,price}: + A1,2,9.99 + B2,1,14.5""" + +data = decode(toon_str) +print(data) +# Output: {'items': [{'sku': 'A1', 'qty': 2, 'price': 9.99}, {'sku': 'B2', 'qty': 1, 'price': 14.5}]} ``` -## Contributing +### Encoding Options + +```python +from toon_format import encode -Interested in implementing TOON for Python? Check out the [specification](https://github.com/johannschopplich/toon/blob/main/SPEC.md) and feel free to contribute! 
+encode(data, { + "indent": 2, # Spaces per indentation level (default: 2) + "delimiter": ",", # Delimiter for arrays: "," | "\t" | "|" (default: ",") + "lengthMarker": "#" # Optional marker prefix: "#" | False (default: False) +}) +``` + +### Decoding Options + +```python +from toon_format import decode, DecodeOptions + +options = DecodeOptions( + indent=2, # Expected number of spaces per indentation level (default: 2) + strict=True # Enable strict validation (default: True) +) + +data = decode(toon_str, options) +``` + +**Strict Mode:** + +By default, the decoder validates input strictly: +- **Invalid escape sequences**: Raises an error on `"\x"`, unterminated strings +- **Syntax errors**: Raises an error on missing colons, malformed headers +- **Array length mismatches**: Raises an error when the declared length doesn't match the actual count +- **Delimiter mismatches**: Raises an error when row delimiters don't match the header + +Set `strict=False` to allow lenient parsing. + +### Delimiter Options + +You can use string literals directly: + +```python +data = [1, 2, 3, 4, 5] + +# Comma (default) +print(encode(data)) +# [5]: 1,2,3,4,5 + +# Tab +print(encode(data, {"delimiter": "\t"})) +# [5 ]: 1 2 3 4 5 + +# Pipe +print(encode(data, {"delimiter": "|"})) +# [5|]: 1|2|3|4|5 +``` + +Or use the string keys: + +```python +encode(data, {"delimiter": "comma"}) # Default +encode(data, {"delimiter": "tab"}) # Tab-separated +encode(data, {"delimiter": "pipe"}) # Pipe-separated +``` + +### Length Markers + +Add the `#` prefix to array length indicators: + +```python +users = [ + {"id": 1, "name": "Alice"}, + {"id": 2, "name": "Bob"}, +] + +# Without marker (default) +print(encode(users)) +# [2,]{id,name}: +# 1,Alice +# 2,Bob + +# With marker +print(encode(users, {"lengthMarker": "#"})) +# [#2,]{id,name}: +# 1,Alice +# 2,Bob +``` + +## Format Rules + +### Objects +Key-value pairs with primitives or nested structures: +```python +{"name": "Alice", "age": 30} +# => +# name: Alice +# age: 30 +``` + +### Primitive Arrays
+Arrays always include length `[N]`: +```python +[1, 2, 3, 4, 5] +# => [5]: 1,2,3,4,5 + +["alpha", "beta", "gamma"] +# => [3]: alpha,beta,gamma +``` + +### Tabular Arrays +Uniform objects with identical primitive-only fields use CSV-like format: +```python +[ + {"id": 1, "name": "Alice"}, + {"id": 2, "name": "Bob"}, +] +# => +# [2,]{id,name}: +# 1,Alice +# 2,Bob +``` + +**Note**: The delimiter appears in the length bracket `[2,]` for tabular arrays. + +### Mixed Arrays +Non-uniform data using list format with `-` markers: +```python +[{"name": "Alice"}, 42, "hello"] +# => +# [3]: +# - name: Alice +# - 42 +# - hello +``` + +### Array Length Format + +The length bracket format depends on the array type: + +**Tabular arrays (with fields):** +- Delimiter always shown: `[2,]{fields}:` or `[2|]{fields}:` or `[2\t]{fields}:` + +**Primitive arrays (no fields):** +- Comma: `[3]:` (delimiter hidden) +- Other: `[3|]:` or `[3\t]:` (delimiter shown) + +### Quoting Rules + +Strings are quoted only when necessary (following the [TOON specification](https://github.com/toon-format/spec)): + +- Empty strings +- Keywords: `null`, `true`, `false` +- Numeric strings: `42`, `-3.14` +- Leading or trailing whitespace +- Contains structural characters: `:`, `[`, `]`, `{`, `}`, `-`, `"` +- Contains the current delimiter (`,`, `|`, or tab) +- Contains control characters (newline, carriage return, tab) or a backslash + +```python +"hello" # => hello (no quotes) +"hello world" # => hello world (internal spaces OK) +" hello" # => " hello" (leading space requires quotes) +"null" # => "null" (keyword) +"42" # => "42" (looks like number) +"" # => "" (empty) +``` + +## Type Conversions + +Non-JSON types are normalized automatically: +- **Numbers**: Decimal form (no scientific notation) +- **Dates/DateTime**: ISO 8601 strings (quoted) +- **Decimal**: Converted to float +- **Infinity/NaN**: Converted to `null` +- **Functions/Callables**: Converted to `null` +- **-0**: Normalized to `0` + +## LLM Integration
Best Practices + +When using TOON with LLMs: + +1. **Wrap in code blocks** for clarity: + ````markdown + ```toon + name: Alice + age: 30 + ``` + ```` + +2. **Instruct the model** about the format: + > "Respond using TOON format (Token-Oriented Object Notation). Use `key: value` syntax, indentation for nesting, and tabular format `[N,]{fields}:` for uniform arrays." + +3. **Leverage length markers** for validation: + ```python + encode(data, {"lengthMarker": "#"}) + ``` + Tell the model: "Array lengths are marked with `[#N]`. Ensure your response matches these counts." + +4. **Acknowledge tokenizer variance**: Token savings depend on the specific tokenizer and model being used. + +## Token Efficiency Example + +```python +import json +from toon_format import encode + +data = { + "users": [ + {"id": 1, "name": "Alice", "age": 30, "active": True}, + {"id": 2, "name": "Bob", "age": 25, "active": True}, + {"id": 3, "name": "Charlie", "age": 35, "active": False}, + ] +} + +json_str = json.dumps(data) +toon_str = encode(data) + +print(f"JSON: {len(json_str)} characters") +print(f"TOON: {len(toon_str)} characters") +print(f"Reduction: {100 * (1 - len(toon_str) / len(json_str)):.1f}%") + +# Output: +# JSON: 177 characters +# TOON: 85 characters +# Reduction: 52.0% +``` + +**JSON output:** +```json +{"users": [{"id": 1, "name": "Alice", "age": 30, "active": true}, {"id": 2, "name": "Bob", "age": 25, "active": true}, {"id": 3, "name": "Charlie", "age": 35, "active": false}]} +``` + +**TOON output:** +``` +users[3,]{id,name,age,active}: + 1,Alice,30,true + 2,Bob,25,true + 3,Charlie,35,false +``` + +## Development + +This project uses [uv](https://docs.astral.sh/uv/) for fast, reliable package and environment management. 
+ +### Setup with uv (Recommended) + +```bash +# Install uv if you haven't already +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Clone the repository +git clone https://github.com/toon-format/toon-python.git +cd toon-python + +# Create virtual environment and install dependencies +uv venv +source .venv/bin/activate # On Windows: .venv\Scripts\activate + +# Install package in editable mode with dev dependencies +uv pip install -e ".[dev]" +``` + +### Setup with pip (Alternative) + +```bash +# Clone the repository +git clone https://github.com/toon-format/toon-python.git +cd toon-python + +# Create virtual environment +python -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate + +# Install in development mode +pip install -e . + +# Install development dependencies +pip install -r requirements-dev.txt +``` + +### Running Tests + +```bash +# Run all tests +pytest + +# Run with coverage +pytest --cov=toon_format --cov-report=term +``` + +### Type Checking + +```bash +mypy src/toon_format +``` + +### Linting + +```bash +ruff check src/toon_format tests +``` + +## Credits + +This project is a Python implementation of the TOON format. ## License -MIT License © 2025-PRESENT [Johann Schopplich](https://github.com/johannschopplich) +MIT License - see [LICENSE](LICENSE) file for details + +## Related + +- [TOON Format Specification](https://github.com/toon-format/spec) - Official specification with normative encoding rules +- [TOON Format Organization](https://github.com/toon-format) - Official TOON format organization + +## Contributing + +Contributions are welcome! Please feel free to submit a Pull Request. + +When contributing, please: +- Add tests for new features +- Update documentation as needed +- Ensure compatibility with the TOON specification + +## Support + +For bugs and feature requests, please [open an issue](https://github.com/toon-format/toon-python/issues). 
diff --git a/examples.py b/examples.py new file mode 100644 index 0000000..aebb67d --- /dev/null +++ b/examples.py @@ -0,0 +1,99 @@ +"""Examples demonstrating toon-format usage.""" + +from toon_format import encode + +# Example 1: Simple object +print("=" * 60) +print("Example 1: Simple Object") +print("=" * 60) +data = {"name": "Alice", "age": 30, "city": "New York"} +print("Input:", data) +print("\nTOON Output:") +print(encode(data)) + +# Example 2: Tabular array +print("\n" + "=" * 60) +print("Example 2: Tabular Array (Uniform Objects)") +print("=" * 60) +users = [ + {"id": 1, "name": "Alice", "age": 30}, + {"id": 2, "name": "Bob", "age": 25}, + {"id": 3, "name": "Charlie", "age": 35}, +] +print("Input:", users) +print("\nTOON Output:") +print(encode(users)) + +# Example 3: Complex nested structure +print("\n" + "=" * 60) +print("Example 3: Complex Nested Structure") +print("=" * 60) +data = { + "metadata": {"version": 1, "author": "test"}, + "items": [ + {"id": 1, "name": "Item1", "price": 9.99}, + {"id": 2, "name": "Item2", "price": 19.99}, + ], + "tags": ["alpha", "beta", "gamma"], +} +print("Input:", data) +print("\nTOON Output:") +print(encode(data)) + +# Example 4: Different delimiters +print("\n" + "=" * 60) +print("Example 4: Different Delimiters") +print("=" * 60) +arr = [1, 2, 3, 4, 5] +print("Input:", arr) +print("\nComma (default):") +print(encode(arr)) +print("\nTab delimiter:") +print(encode(arr, {"delimiter": "\t"})) +print("\nPipe delimiter:") +print(encode(arr, {"delimiter": "|"})) + +# Example 5: Length markers (the option key is "lengthMarker"; "#" prefixes array lengths) +print("\n" + "=" * 60) +print("Example 5: Length Markers") +print("=" * 60) +users = [ + {"id": 1, "name": "Alice"}, + {"id": 2, "name": "Bob"}, +] +print("Input:", users) +print("\nWith length marker:") +print(encode(users, {"lengthMarker": "#"})) + +# Example 6: Primitive arrays +print("\n" + "=" * 60) +print("Example 6: Primitive Arrays") +print("=" * 60) +print("Numbers:", encode([1, 2, 3, 4, 5])) +print("Strings:",
encode(["apple", "banana", "cherry"])) +print("Mixed:", encode([1, "two", True, None])) + +# Example 7: Token comparison +print("\n" + "=" * 60) +print("Example 7: Token Efficiency Demo") +print("=" * 60) +import json + +data = { + "users": [ + {"id": 1, "name": "Alice", "age": 30, "active": True}, + {"id": 2, "name": "Bob", "age": 25, "active": True}, + {"id": 3, "name": "Charlie", "age": 35, "active": False}, + ] +} + +json_str = json.dumps(data) +toon_str = encode(data) + +print(f"JSON length: {len(json_str)} characters") +print(f"TOON length: {len(toon_str)} characters") +print(f"Reduction: {100 * (1 - len(toon_str) / len(json_str)):.1f}%") +print("\nJSON:") +print(json_str) +print("\nTOON:") +print(toon_str) diff --git a/pyproject.toml b/pyproject.toml index c3adf51..4ed81cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,20 +1,23 @@ [project] name = "toon-format" version = "0.1.0" -description = "Token-Oriented Object Notation – a token-efficient JSON alternative for LLM prompts" +description = "A compact, human-readable serialization format designed for passing structured data to Large Language Models with significantly reduced token usage" readme = "README.md" authors = [ { name = "Johann Schopplich", email = "hello@johannschopplich.com" } ] -requires-python = ">=3.11" +requires-python = ">=3.8" dependencies = [] license = { text = "MIT" } keywords = ["toon", "serialization", "llm", "data-format", "token-efficient"] classifiers = [ - "Development Status :: 3 - Alpha", + "Development Status :: 4 - Beta", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", @@ -23,11 +26,14 @@ classifiers = [ ] [project.urls] -Homepage = 
"https://toonformat.dev" +Homepage = "https://github.com/toon-format/toon-python" Repository = "https://github.com/toon-format/toon-python" -Documentation = "https://github.com/toon-format/toon" +Documentation = "https://github.com/toon-format/spec" "Bug Tracker" = "https://github.com/toon-format/toon-python/issues" +[project.scripts] +toon = "toon_format.cli:main" + [dependency-groups] dev = [ "pytest>=8.0.0", @@ -47,8 +53,8 @@ addopts = [ ] [tool.ruff] -target-version = "py311" -line-length = 88 +target-version = "py38" +line-length = 100 [tool.ruff.lint] select = [ @@ -56,29 +62,20 @@ select = [ "W", # pycodestyle warnings "F", # pyflakes "I", # isort - "B", # flake8-bugbear - "C4", # flake8-comprehensions "UP", # pyupgrade ] -ignore = [] +ignore = ["N"] [tool.ruff.format] quote-style = "double" indent-style = "space" [tool.mypy] -python_version = "3.11" -strict = true -warn_return_any = true +python_version = "3.9" +warn_return_any = false warn_unused_configs = true -disallow_untyped_defs = true -disallow_any_generics = true -check_untyped_defs = true -no_implicit_optional = true -warn_redundant_casts = true -warn_unused_ignores = true -warn_no_return = true -show_error_codes = true +disallow_untyped_defs = false +check_untyped_defs = false [build-system] requires = ["uv_build>=0.9.7,<0.10.0"] diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..e593301 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,7 @@ +# Development dependencies +pytest>=8.0.0 +pytest-cov>=4.1.0 +mypy>=1.8.0 +ruff>=0.1.0 +build>=1.0.0 +twine>=5.0.0 diff --git a/src/toon_format/__init__.py b/src/toon_format/__init__.py index ec15242..cb4063a 100644 --- a/src/toon_format/__init__.py +++ b/src/toon_format/__init__.py @@ -1,13 +1,21 @@ """ -Token-Oriented Object Notation (TOON) for Python. 
+pytoon - Token-Oriented Object Notation for Python -A compact, human-readable format designed for passing structured data -to Large Language Models with significantly reduced token usage. +A compact data format optimized for transmitting structured information to LLMs +with 30-60% fewer tokens than JSON. """ -from toon_format.decoder import decode -from toon_format.encoder import encode -from toon_format.types import DecodeOptions, EncodeOptions +from .decoder import ToonDecodeError, decode +from .encoder import encode +from .types import DecodeOptions, Delimiter, DelimiterKey, EncodeOptions -__version__ = "0.1.0" -__all__ = ["encode", "decode", "EncodeOptions", "DecodeOptions"] +__version__ = "0.1.1" +__all__ = [ + "encode", + "decode", + "ToonDecodeError", + "Delimiter", + "DelimiterKey", + "EncodeOptions", + "DecodeOptions", +] diff --git a/src/toon_format/__main__.py b/src/toon_format/__main__.py new file mode 100644 index 0000000..64696d4 --- /dev/null +++ b/src/toon_format/__main__.py @@ -0,0 +1,8 @@ +"""CLI entry point for TOON.""" + +import sys + +from .cli import main + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/toon_format/cli.py b/src/toon_format/cli.py new file mode 100644 index 0000000..509bdf2 --- /dev/null +++ b/src/toon_format/cli.py @@ -0,0 +1,210 @@ +"""Command-line interface for TOON encoding/decoding.""" + +import argparse +import json +import sys +from pathlib import Path + +from . 
def _detect_mode(input_path, input_text: str) -> str:
    """Guess the conversion direction when neither --encode nor --decode is given.

    Args:
        input_path: Path of the input file, or None when reading from stdin
        input_text: Full input text

    Returns:
        "encode" for JSON input, "decode" otherwise
    """
    # A recognised file extension wins over content sniffing.
    if input_path is not None:
        suffix = input_path.suffix.lower()
        if suffix == ".json":
            return "encode"
        if suffix == ".toon":
            return "decode"

    # Unknown extension or stdin: anything that parses as JSON gets encoded.
    try:
        json.loads(input_text)
    except json.JSONDecodeError:
        return "decode"
    return "encode"


def main() -> int:
    """Main CLI entry point.

    Parses command-line arguments, reads the input (file or stdin), converts
    it between JSON and TOON, and writes the result to a file or stdout.

    Returns:
        Process exit code: 0 on success, 1 on any reported error
    """
    parser = argparse.ArgumentParser(
        prog="toon",
        description="Convert between JSON and TOON formats",
    )

    parser.add_argument(
        "input",
        type=str,
        help="Input file path (or - for stdin)",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        help="Output file path (prints to stdout if omitted)",
    )
    parser.add_argument(
        "-e",
        "--encode",
        action="store_true",
        help="Force encode mode (overrides auto-detection)",
    )
    parser.add_argument(
        "-d",
        "--decode",
        action="store_true",
        help="Force decode mode (overrides auto-detection)",
    )
    parser.add_argument(
        "--delimiter",
        type=str,
        choices=[",", "\t", "|"],
        default=",",
        help='Array delimiter: , (comma), \\t (tab), | (pipe) (default: ",")',
    )
    parser.add_argument(
        "--indent",
        type=int,
        default=2,
        help="Indentation size (default: 2)",
    )
    parser.add_argument(
        "--length-marker",
        action="store_true",
        help="Add # prefix to array lengths (e.g., items[#3])",
    )
    parser.add_argument(
        "--no-strict",
        action="store_true",
        help="Disable strict validation when decoding",
    )

    args = parser.parse_args()

    # Reject contradictory flags before touching the filesystem or stdin, so a
    # usage mistake is reported without first blocking on (or consuming) input.
    if args.encode and args.decode:
        print("Error: Cannot specify both --encode and --decode", file=sys.stderr)
        return 1

    # Read input
    try:
        if args.input == "-":
            input_text = sys.stdin.read()
            input_path = None
        else:
            input_path = Path(args.input)
            if not input_path.exists():
                print(f"Error: Input file not found: {args.input}", file=sys.stderr)
                return 1
            input_text = input_path.read_text(encoding="utf-8")
    except Exception as e:
        print(f"Error reading input: {e}", file=sys.stderr)
        return 1

    # Determine operation mode: explicit flags first, then auto-detection
    if args.encode:
        mode = "encode"
    elif args.decode:
        mode = "decode"
    else:
        mode = _detect_mode(input_path, input_text)

    # Process
    try:
        if mode == "encode":
            output_text = encode_json_to_toon(
                input_text,
                delimiter=args.delimiter,
                indent=args.indent,
                length_marker=args.length_marker,
            )
        else:
            output_text = decode_toon_to_json(
                input_text,
                indent=args.indent,
                strict=not args.no_strict,
            )
    except Exception as e:
        print(f"Error during {mode}: {e}", file=sys.stderr)
        return 1

    # Write output
    try:
        if args.output:
            Path(args.output).write_text(output_text, encoding="utf-8")
        else:
            print(output_text)
    except Exception as e:
        print(f"Error writing output: {e}", file=sys.stderr)
        return 1

    return 0
+ + Args: + toon_text: TOON input string + indent: Indentation size + strict: Whether to use strict validation + + Returns: + JSON-formatted string + + Raises: + ToonDecodeError: If TOON is invalid + """ + options = DecodeOptions(indent=indent, strict=strict) + data = decode(toon_text, options) + + return json.dumps(data, indent=2, ensure_ascii=False) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/toon_format/constants.py b/src/toon_format/constants.py new file mode 100644 index 0000000..d0541da --- /dev/null +++ b/src/toon_format/constants.py @@ -0,0 +1,38 @@ +"""Constants for TOON encoding.""" + +# List markers +LIST_ITEM_MARKER = "-" +LIST_ITEM_PREFIX = "- " + +# Structural characters +COMMA = "," +COLON = ":" +SPACE = " " +PIPE = "|" + +# Brackets/braces +OPEN_BRACKET = "[" +CLOSE_BRACKET = "]" +OPEN_BRACE = "{" +CLOSE_BRACE = "}" + +# Literals +NULL_LITERAL = "null" +TRUE_LITERAL = "true" +FALSE_LITERAL = "false" + +# Escape characters +BACKSLASH = "\\" +DOUBLE_QUOTE = '"' +NEWLINE = "\n" +CARRIAGE_RETURN = "\r" +TAB = "\t" + +# Delimiters +DELIMITERS = { + "comma": ",", + "tab": "\t", + "pipe": "|", +} + +DEFAULT_DELIMITER = DELIMITERS["comma"] diff --git a/src/toon_format/decoder.py b/src/toon_format/decoder.py index 6cd01d3..915ba85 100644 --- a/src/toon_format/decoder.py +++ b/src/toon_format/decoder.py @@ -1,31 +1,902 @@ -"""TOON decoder implementation.""" +"""TOON decoder implementation following v1.2 spec.""" -from toon_format.types import DecodeOptions, JsonValue +import re +from typing import Any, Dict, List, Optional, Tuple +from .constants import ( + BACKSLASH, + CARRIAGE_RETURN, + CLOSE_BRACE, + CLOSE_BRACKET, + COLON, + COMMA, + DOUBLE_QUOTE, + FALSE_LITERAL, + LIST_ITEM_MARKER, + NEWLINE, + NULL_LITERAL, + OPEN_BRACE, + OPEN_BRACKET, + PIPE, + TAB, + TRUE_LITERAL, +) +from .types import DecodeOptions, JsonValue -def decode(input: str, options: DecodeOptions | None = None) -> JsonValue: - """Convert a TOON-formatted string to 
class ToonDecodeError(Exception):
    """Raised when TOON input is malformed and cannot be decoded."""


class Line:
    """Represents a line in the TOON document."""

    def __init__(self, content: str, depth: int, line_number: int):
        self.content = content
        self.depth = depth
        self.line_number = line_number
        # True when the content holds nothing but whitespace.
        self.is_blank = not content.strip()


def compute_depth(line: str, indent_size: int, strict: bool) -> int:
    """Compute indentation depth for a line.

    Args:
        line: Raw line content, including its leading whitespace
        indent_size: Number of spaces per indentation level
        strict: Whether to enforce strict indentation rules

    Returns:
        Indentation depth (leading spaces floor-divided by indent_size)

    Raises:
        ToonDecodeError: In strict mode, if the indentation of a non-blank
            line contains a tab or is not an exact multiple of indent_size
    """
    if not line:
        return 0

    # Only spaces count towards depth.
    leading_spaces = len(line) - len(line.lstrip(" "))

    if strict:
        # The tab check must look at ALL leading whitespace: lstrip(" ") stops
        # at the first tab, so the space-only prefix can never contain one.
        indent_prefix = line[: len(line) - len(line.lstrip(" \t"))]
        # Whitespace-only lines are exempt here; blank-line policy belongs to
        # the caller, and their handling is left exactly as it was.
        if "\t" in indent_prefix and line.strip():
            raise ToonDecodeError("Tabs are not allowed in indentation")

        if leading_spaces % indent_size != 0:
            raise ToonDecodeError(
                f"Indentation must be an exact multiple of {indent_size} spaces"
            )

    return leading_spaces // indent_size
# Number grammar accepted for unquoted tokens: optional minus, ASCII digits,
# optional fraction, optional exponent. Python's int()/float() accept far more
# ("1_000", "+5", ".5", "5.", non-ASCII digits), which must stay strings.
_NUMBER_TOKEN = re.compile(r"-?[0-9]+(?:\.[0-9]+)?(?:[eE][+-]?[0-9]+)?")

# Integers written with forbidden leading zeros ("05", "0001") stay strings.
_LEADING_ZERO_TOKEN = re.compile(r"0[0-9]+")


def parse_primitive(token: str) -> JsonValue:
    """Parse a primitive token.

    Surrounding whitespace is ignored. Quoted tokens are unescaped into
    strings; `true`/`false`/`null` literals and well-formed numbers are
    converted; everything else is returned as an unquoted string.

    Args:
        token: Token string

    Returns:
        Parsed value (str, bool, None, int or float)

    Raises:
        ToonDecodeError: If quoted string is malformed
    """
    token = token.strip()

    # Quoted string
    if token.startswith(DOUBLE_QUOTE):
        if not token.endswith(DOUBLE_QUOTE) or len(token) < 2:
            raise ToonDecodeError("Unterminated string: missing closing quote")
        return unescape_string(token[1:-1])

    # Literals
    if token == TRUE_LITERAL:
        return True
    if token == FALSE_LITERAL:
        return False
    if token == NULL_LITERAL:
        return None

    # Numbers: 42, -3.14, 1e-6, -1E+9. Validate the shape first instead of
    # relying on int()/float(), which would turn strings such as "1_000" or
    # "+5" into numbers and break round-tripping.
    if _LEADING_ZERO_TOKEN.fullmatch(token):
        return token
    if _NUMBER_TOKEN.fullmatch(token):
        try:
            if "." not in token and "e" not in token.lower():
                return int(token)
            return float(token)
        except ValueError:
            # e.g. the interpreter's limit on int digit count; keep as string
            pass

    # Otherwise it's an unquoted string
    return token
+ + Args: + line: Line content + + Returns: + Tuple of (key, length, delimiter, fields) or None if not a header + + Raises: + ToonDecodeError: If header is malformed + """ + line = line.strip() + + # Find the bracket segment + bracket_start = line.find(OPEN_BRACKET) + if bracket_start == -1: + return None + + # Extract key (if any) + key = None + if bracket_start > 0: + key_part = line[:bracket_start].strip() + key = parse_key(key_part) if key_part else None + + # Find closing bracket + bracket_end = line.find(CLOSE_BRACKET, bracket_start) + if bracket_end == -1: + return None + + # Parse bracket content: [#?N] + bracket_content = line[bracket_start + 1:bracket_end] + + # Remove optional # marker + if bracket_content.startswith('#'): + bracket_content = bracket_content[1:] + + # Determine delimiter from bracket content + delimiter = COMMA # default + length_str = bracket_content + + if bracket_content.endswith(TAB): + delimiter = TAB + length_str = bracket_content[:-1] + elif bracket_content.endswith(PIPE): + delimiter = PIPE + length_str = bracket_content[:-1] + elif bracket_content.endswith(COMMA): + # Explicit comma delimiter (for tabular arrays) + delimiter = COMMA + length_str = bracket_content[:-1] + + # Parse length + try: + length = int(length_str) + except ValueError: + return None + + # Check for fields segment + fields = None + after_bracket = line[bracket_end + 1:].strip() + + if after_bracket.startswith(OPEN_BRACE): + brace_end = after_bracket.find(CLOSE_BRACE) + if brace_end == -1: + raise ToonDecodeError("Unterminated fields segment") + + fields_content = after_bracket[1:brace_end] + # Parse fields using the delimiter + field_tokens = parse_delimited_values(fields_content, delimiter) + fields = [parse_key(f.strip()) for f in field_tokens] + + after_bracket = after_bracket[brace_end + 1:].strip() + + # Must end with colon + if not after_bracket.startswith(COLON): + return None + + return (key, length, delimiter, fields) + + +def parse_key(key_str: 
def decode(input_str: str, options: Optional[DecodeOptions] = None) -> JsonValue:
    """Decode a TOON-formatted string to a Python value.

    The root form is chosen from the first non-blank line: a keyless array
    header yields a list, a lone line without a colon yields a primitive,
    and anything else is decoded as an object.

    Args:
        input_str: TOON-formatted string
        options: Optional decoding options (indent size and strict flag)

    Returns:
        Decoded Python value; None for empty input in non-strict mode

    Raises:
        ToonDecodeError: If input is malformed (or empty in strict mode)
    """
    if options is None:
        options = DecodeOptions()

    indent_size = options.indent
    strict = options.strict

    raw_lines = input_str.split('\n')
    last_idx = len(raw_lines) - 1

    lines: List[Line] = []
    for i, raw in enumerate(raw_lines):
        if not raw.strip():
            # The final empty segment is just the trailing newline. In strict
            # mode blank lines are never kept, so drop them BEFORE depth
            # validation: a whitespace-only line must not raise an
            # indentation error.
            if i == last_idx or strict:
                continue

        depth = compute_depth(raw, indent_size, strict)
        lines.append(Line(raw.strip(), depth, i + 1))

    # Locate the first meaningful line; lenient mode may keep leading blanks.
    first_idx = next((idx for idx, ln in enumerate(lines) if not ln.is_blank), None)
    if first_idx is None:
        if strict:
            raise ToonDecodeError("Empty input")
        return None

    first_line = lines[first_idx]
    non_blank_count = sum(1 for ln in lines if not ln.is_blank)

    # Root array: a header without a key. Start at the header's real index
    # rather than 0, which may be a blank line in lenient mode.
    header_info = parse_header(first_line.content)
    if header_info is not None and header_info[0] is None:
        return decode_array(lines, first_idx, 0, header_info, strict)

    # Single primitive: exactly one line that is neither header nor key-value
    if non_blank_count == 1 and header_info is None:
        try:
            split_key_value(first_line.content)
        except ToonDecodeError:
            return parse_primitive(first_line.content)

    # Otherwise, root object
    return decode_object(lines, 0, 0, strict)
def _header_colon_index(header_line: str) -> int:
    """Return the index of the colon that terminates an array header.

    The colon is searched for after the `[N]` segment and, when present,
    after the `{fields}` segment, so colons inside inline values are ignored.
    Falls back to the last colon on the line if the header shape is not
    recognised.
    """
    bracket_start = header_line.find(OPEN_BRACKET)
    if bracket_start == -1:
        return header_line.rfind(COLON)

    bracket_end = header_line.find(CLOSE_BRACKET, bracket_start)
    if bracket_end == -1:
        return header_line.rfind(COLON)

    search_from = bracket_end + 1
    if header_line[search_from:].lstrip().startswith(OPEN_BRACE):
        brace_end = header_line.find(CLOSE_BRACE, search_from)
        if brace_end != -1:
            search_from = brace_end + 1

    colon_idx = header_line.find(COLON, search_from)
    return colon_idx if colon_idx != -1 else header_line.rfind(COLON)


def decode_array_from_header(
    lines: List[Line],
    header_idx: int,
    header_depth: int,
    header_info: Tuple[Optional[str], int, str, Optional[List[str]]],
    strict: bool
) -> Tuple[List[Any], int]:
    """Decode array starting from a header line.

    Dispatches on the header shape: values after the colon form an inline
    primitive array, a fields segment selects the tabular decoder, and
    anything else is read as a list-format array.

    Args:
        lines: List of lines
        header_idx: Index of header line
        header_depth: Depth of header line
        header_info: Parsed header info (key, length, delimiter, fields)
        strict: Strict mode flag

    Returns:
        Tuple of (decoded array, next line index)
    """
    _, length, delimiter, fields = header_info
    header_line = lines[header_idx].content

    # Use the colon that ends the header, not the last colon on the line:
    # inline values may themselves contain colons (`t[1]: 12:30`, `u[1]: "a:b"`).
    colon_idx = _header_colon_index(header_line)
    inline_content = header_line[colon_idx + 1:].strip()

    if inline_content:
        # Inline primitive array
        return decode_inline_array(inline_content, delimiter, length, strict), header_idx + 1

    if fields is not None:
        # Tabular array
        return decode_tabular_array(
            lines, header_idx + 1, header_depth, fields, delimiter, length, strict
        )

    # List format (mixed/non-uniform)
    return decode_list_array(lines, header_idx + 1, header_depth, delimiter, length, strict)
+ + Args: + content: Inline content after colon + delimiter: Active delimiter + expected_length: Expected array length + strict: Strict mode flag + + Returns: + Decoded array + + Raises: + ToonDecodeError: If length mismatch in strict mode + """ + if not content and expected_length == 0: + return [] + + tokens = parse_delimited_values(content, delimiter) + values = [parse_primitive(token) for token in tokens] + + if strict and len(values) != expected_length: + raise ToonDecodeError( + f"Expected {expected_length} values, but got {len(values)}" + ) + + return values + + +def decode_tabular_array( + lines: List[Line], + start_idx: int, + header_depth: int, + fields: List[str], + delimiter: str, + expected_length: int, + strict: bool +) -> Tuple[List[Dict[str, Any]], int]: + """Decode a tabular array. + + Args: + lines: List of lines + start_idx: Starting line index (after header) + header_depth: Depth of header + fields: Field names + delimiter: Active delimiter + expected_length: Expected number of rows + strict: Strict mode flag + + Returns: + Tuple of (decoded array, next line index) + + Raises: + ToonDecodeError: If row width or count mismatch in strict mode + """ + result = [] + i = start_idx + row_depth = header_depth + 1 + + while i < len(lines): + line = lines[i] + + # Check for blank lines in array (error in strict mode) + if line.is_blank: + if strict: + raise ToonDecodeError("Blank lines not allowed inside arrays") + i += 1 + continue + + # Stop if dedented or different depth + if line.depth < row_depth: + break + if line.depth > row_depth: + # End of tabular rows (might be next key-value) + break + + content = line.content + + # Disambiguation: check if this is a row or a key-value line + # A row has no unquoted colon, or delimiter before colon + if is_row_line(content, delimiter): + # Parse as row + tokens = parse_delimited_values(content, delimiter) + values = [parse_primitive(token) for token in tokens] + + if strict and len(values) != len(fields): + 
raise ToonDecodeError( + f"Expected {len(fields)} values in row, but got {len(values)}" + ) + + obj = {fields[j]: values[j] for j in range(min(len(fields), len(values)))} + result.append(obj) + i += 1 + else: + # Not a row, end of tabular data + break + + if strict and len(result) != expected_length: + raise ToonDecodeError( + f"Expected {expected_length} rows, but got {len(result)}" + ) + + return result, i + + +def is_row_line(line: str, delimiter: str) -> bool: + """Check if a line is a tabular row (not a key-value line). + + Args: + line: Line content + delimiter: Active delimiter + + Returns: + True if it's a row line + """ + # Find first unquoted delimiter and first unquoted colon + first_delim_pos = None + first_colon_pos = None + in_quotes = False + i = 0 + + while i < len(line): + char = line[i] + + if char == DOUBLE_QUOTE: + in_quotes = not in_quotes + elif char == BACKSLASH and i + 1 < len(line) and in_quotes: + i += 1 + elif not in_quotes: + if char == delimiter and first_delim_pos is None: + first_delim_pos = i + if char == COLON and first_colon_pos is None: + first_colon_pos = i + + i += 1 + + # No unquoted colon -> row + if first_colon_pos is None: + return True + + # Both present: delimiter before colon -> row + if first_delim_pos is not None and first_delim_pos < first_colon_pos: + return True + + # Colon before delimiter or no delimiter -> key-value + return False + + +def decode_list_array( + lines: List[Line], + start_idx: int, + header_depth: int, + delimiter: str, + expected_length: int, + strict: bool +) -> Tuple[List[Any], int]: + """Decode a list-format array (mixed/non-uniform). 
+ + Args: + lines: List of lines + start_idx: Starting line index + header_depth: Header depth + delimiter: Active delimiter + expected_length: Expected number of items + strict: Strict mode flag + + Returns: + Tuple of (decoded array, next line index) + + Raises: + ToonDecodeError: If item count mismatch in strict mode + """ + result = [] + i = start_idx + item_depth = header_depth + 1 + + while i < len(lines): + line = lines[i] + + # Skip blank lines (error in strict mode) + if line.is_blank: + if strict: + raise ToonDecodeError("Blank lines not allowed inside arrays") + i += 1 + continue + + # Stop if dedented + if line.depth < item_depth: + break + + # Must start with "- " + content = line.content + if not content.startswith(LIST_ITEM_MARKER): + # Not a list item, end of array + break + + # Remove "- " prefix + item_content = content[len(LIST_ITEM_MARKER):].strip() + + # Check what kind of item this is + item_header = parse_header(item_content) + if item_header is not None: + # It's an array header: - [N]: ... or - key[N]: ... 
+ key, length, item_delim, fields = item_header + + if key is None: + # - [N]: inline array + colon_idx = item_content.find(COLON) + if colon_idx != -1: + inline_part = item_content[colon_idx + 1:].strip() + if inline_part: + # Inline primitive array + item_val = decode_inline_array(inline_part, item_delim, length, strict) + result.append(item_val) + i += 1 + continue + else: + # - key[N]: array field in object + # This is an object with an array as its first field + item_obj = {} + array_val, next_i = decode_array_from_header( + lines, i, line.depth, item_header, strict + ) + item_obj[key] = array_val + + # Continue reading remaining fields at depth +1 + i = next_i + while i < len(lines) and lines[i].depth == line.depth + 1: + field_line = lines[i] + if field_line.is_blank: + i += 1 + continue + + field_content = field_line.content + + # Check for array header + field_header = parse_header(field_content) + if field_header is not None and field_header[0] is not None: + field_key, field_length, field_delim, field_fields = field_header + field_val, next_i = decode_array_from_header( + lines, i, field_line.depth, field_header, strict + ) + item_obj[field_key] = field_val + i = next_i + continue + + try: + field_key_str, field_value_str = split_key_value(field_content) + field_key = parse_key(field_key_str) + + if not field_value_str: + # Nested object + item_obj[field_key] = decode_object( + lines, i + 1, field_line.depth, strict + ) + i += 1 + while i < len(lines) and lines[i].depth > field_line.depth: + i += 1 + else: + item_obj[field_key] = parse_primitive(field_value_str) + i += 1 + except ToonDecodeError: + break + + result.append(item_obj) + continue + + # Check if it's an object (has colon) + try: + key_str, value_str = split_key_value(item_content) + # It's an object item + item_obj = {} + + # First field + key = parse_key(key_str) + if not value_str: + # First field is nested object: fields at depth +2 + nested = decode_object(lines, i + 1, line.depth + 1, 
strict) + item_obj[key] = nested + # Skip nested content + i += 1 + while i < len(lines) and lines[i].depth > line.depth + 1: + i += 1 + else: + # First field is primitive + item_obj[key] = parse_primitive(value_str) + i += 1 + + # Remaining fields at depth +1 + while i < len(lines) and lines[i].depth == line.depth + 1: + field_line = lines[i] + if field_line.is_blank: + i += 1 + continue + + field_content = field_line.content + + # Check for array header + field_header = parse_header(field_content) + if field_header is not None and field_header[0] is not None: + field_key, field_length, field_delim, field_fields = field_header + field_val, next_i = decode_array_from_header( + lines, i, field_line.depth, field_header, strict + ) + item_obj[field_key] = field_val + i = next_i + continue + + try: + field_key_str, field_value_str = split_key_value(field_content) + field_key = parse_key(field_key_str) + + if not field_value_str: + # Nested object + item_obj[field_key] = decode_object(lines, i + 1, field_line.depth, strict) + i += 1 + while i < len(lines) and lines[i].depth > field_line.depth: + i += 1 + else: + item_obj[field_key] = parse_primitive(field_value_str) + i += 1 + except ToonDecodeError: + break + + result.append(item_obj) + except ToonDecodeError: + # Not an object, must be a primitive + result.append(parse_primitive(item_content)) + i += 1 + + if strict and len(result) != expected_length: + raise ToonDecodeError( + f"Expected {expected_length} items, but got {len(result)}" + ) + + return result, i diff --git a/src/toon_format/encoder.py b/src/toon_format/encoder.py index 8199fa2..df61140 100644 --- a/src/toon_format/encoder.py +++ b/src/toon_format/encoder.py @@ -1,34 +1,49 @@ -"""TOON encoder implementation.""" +"""Core TOON encoding functionality.""" -from typing import Any +from typing import Any, Optional -from toon_format.types import EncodeOptions +from .constants import DEFAULT_DELIMITER, DELIMITERS +from .encoders import encode_value +from 
.normalize import normalize_value +from .types import EncodeOptions, ResolvedEncodeOptions +from .writer import LineWriter -def encode(value: Any, options: EncodeOptions | None = None) -> str: - """Convert a value to TOON format. +def encode(value: Any, options: Optional[EncodeOptions] = None) -> str: + """Encode a value into TOON format. Args: - value: Any JSON-serializable value (object, array, primitive, or nested structure). - Non-JSON-serializable values (functions, undefined, non-finite numbers) are - converted to null. Dates are converted to ISO strings, and BigInts are emitted - as decimal integers. - options: Optional encoding options: - - indent: Number of spaces per indentation level (default: 2) - - delimiter: Delimiter for array values and tabular rows (default: ',') - - length_marker: Optional marker to prefix array lengths (default: False) + value: The value to encode (must be JSON-serializable) + options: Optional encoding options Returns: - A TOON-formatted string with no trailing newline or spaces. + TOON-formatted string + """ + normalized = normalize_value(value) + resolved_options = resolve_options(options) + writer = LineWriter(resolved_options.indent) + encode_value(normalized, resolved_options, writer, 0) + return writer.to_string() + - Examples: - >>> encode({"items": [{"sku": "A1", "qty": 2}, {"sku": "B2", "qty": 1}]}) - 'items[2]{sku,qty}:\\n A1,2\\n B2,1' +def resolve_options(options: Optional[EncodeOptions]) -> ResolvedEncodeOptions: + """Resolve encoding options with defaults. 
- >>> encode({"tags": ["foo", "bar"]}, {"delimiter": "\\t"}) - 'tags[2 ]: foo bar' + Args: + options: Optional user-provided options - >>> encode([1, 2, 3], {"length_marker": "#"}) - '[#3]: 1,2,3' + Returns: + Resolved options with defaults applied """ - raise NotImplementedError("TOON encoder is not yet implemented") + if options is None: + return ResolvedEncodeOptions() + + indent = options.get("indent", 2) + delimiter = options.get("delimiter", DEFAULT_DELIMITER) + length_marker = options.get("lengthMarker", False) + + # Resolve delimiter if it's a key + if delimiter in DELIMITERS: + delimiter = DELIMITERS[delimiter] + + return ResolvedEncodeOptions(indent=indent, delimiter=delimiter, length_marker=length_marker) diff --git a/src/toon_format/encoders.py b/src/toon_format/encoders.py new file mode 100644 index 0000000..1d67075 --- /dev/null +++ b/src/toon_format/encoders.py @@ -0,0 +1,295 @@ +"""Encoders for different value types.""" + +from typing import List, Optional + +from .constants import LIST_ITEM_PREFIX +from .normalize import ( + is_array_of_arrays, + is_array_of_objects, + is_array_of_primitives, + is_json_array, + is_json_object, + is_json_primitive, +) +from .primitives import encode_key, encode_primitive, format_header, join_encoded_values +from .types import Depth, JsonArray, JsonObject, JsonValue, ResolvedEncodeOptions +from .writer import LineWriter + + +def encode_value( + value: JsonValue, options: ResolvedEncodeOptions, writer: LineWriter, depth: Depth = 0 +) -> None: + """Encode a value to TOON format. 
def encode_object(
    obj: JsonObject,
    options: ResolvedEncodeOptions,
    writer: LineWriter,
    depth: Depth,
    key: Optional[str],
) -> None:
    """Encode an object to TOON format.

    With a key, a `key:` line is written at `depth` and the fields follow one
    level deeper; without a key (key is None) the fields are written at `depth`.

    Args:
        obj: Dictionary object
        options: Resolved encoding options
        writer: Line writer for output
        depth: Current indentation depth
        key: Optional key name; None means "no key"
    """
    # Compare against None explicitly: the empty string is a legal key, and a
    # truthiness test would drop it and flatten its fields into the parent.
    has_key = key is not None
    if has_key:
        writer.push(depth, f"{encode_key(key)}:")

    field_depth = depth + 1 if has_key else depth
    for obj_key, obj_value in obj.items():
        encode_key_value_pair(obj_key, obj_value, options, writer, field_depth)
+ + Args: + arr: List array + options: Resolved encoding options + writer: Line writer for output + depth: Current indentation depth + key: Optional key name + """ + # Handle empty array + if not arr: + header = format_header(key, 0, None, options.delimiter, options.lengthMarker) + writer.push(depth, header) + return + + # Check array type and encode accordingly + if is_array_of_primitives(arr): + encode_inline_primitive_array(arr, options, writer, depth, key) + elif is_array_of_arrays(arr): + encode_array_of_arrays(arr, options, writer, depth, key) + elif is_array_of_objects(arr): + tabular_header = detect_tabular_header(arr, options.delimiter) + if tabular_header: + encode_array_of_objects_as_tabular(arr, tabular_header, options, writer, depth, key) + else: + encode_mixed_array_as_list_items(arr, options, writer, depth, key) + else: + encode_mixed_array_as_list_items(arr, options, writer, depth, key) + + +def encode_inline_primitive_array( + arr: JsonArray, + options: ResolvedEncodeOptions, + writer: LineWriter, + depth: Depth, + key: Optional[str], +) -> None: + """Encode an array of primitives inline. + + Args: + arr: Array of primitives + options: Resolved encoding options + writer: Line writer for output + depth: Current indentation depth + key: Optional key name + """ + encoded_values = [encode_primitive(item, options.delimiter) for item in arr] + joined = join_encoded_values(encoded_values, options.delimiter) + header = format_header(key, len(arr), None, options.delimiter, options.lengthMarker) + writer.push(depth, f"{header} {joined}") + + +def encode_array_of_arrays( + arr: JsonArray, + options: ResolvedEncodeOptions, + writer: LineWriter, + depth: Depth, + key: Optional[str], +) -> None: + """Encode an array of arrays. 
+ + Args: + arr: Array of arrays + options: Resolved encoding options + writer: Line writer for output + depth: Current indentation depth + key: Optional key name + """ + header = format_header(key, len(arr), None, options.delimiter, options.lengthMarker) + writer.push(depth, header) + + for item in arr: + if is_array_of_primitives(item): + encoded_values = [encode_primitive(v, options.delimiter) for v in item] + joined = join_encoded_values(encoded_values, options.delimiter) + length_marker = options.lengthMarker if options.lengthMarker else "" + writer.push( + depth + 1, + f"{LIST_ITEM_PREFIX}[{length_marker}{len(item)}{options.delimiter}]: {joined}", + ) + else: + encode_array(item, options, writer, depth + 1, None) + + +def detect_tabular_header(arr: List[JsonObject], delimiter: str) -> Optional[List[str]]: + """Detect if array can use tabular format and return header keys. + + Args: + arr: Array of objects + delimiter: Delimiter character + + Returns: + List of keys if tabular, None otherwise + """ + if not arr: + return None + + # Get keys from first object + first_keys = list(arr[0].keys()) + + # Check all objects have same keys and all values are primitives + for obj in arr: + if list(obj.keys()) != first_keys: + return None + if not all(is_json_primitive(value) for value in obj.values()): + return None + + return first_keys + + +def is_tabular_array(arr: List[JsonObject], delimiter: str) -> bool: + """Check if array qualifies for tabular format. + + Args: + arr: Array to check + delimiter: Delimiter character + + Returns: + True if tabular format can be used + """ + return detect_tabular_header(arr, delimiter) is not None + + +def encode_array_of_objects_as_tabular( + arr: List[JsonObject], + fields: List[str], + options: ResolvedEncodeOptions, + writer: LineWriter, + depth: Depth, + key: Optional[str], +) -> None: + """Encode array of uniform objects in tabular format. 
+ + Args: + arr: Array of uniform objects + fields: Field names for header + options: Resolved encoding options + writer: Line writer for output + depth: Current indentation depth + key: Optional key name + """ + header = format_header(key, len(arr), fields, options.delimiter, options.lengthMarker) + writer.push(depth, header) + + for obj in arr: + row_values = [encode_primitive(obj[field], options.delimiter) for field in fields] + row = join_encoded_values(row_values, options.delimiter) + writer.push(depth + 1, row) + + +def encode_mixed_array_as_list_items( + arr: JsonArray, + options: ResolvedEncodeOptions, + writer: LineWriter, + depth: Depth, + key: Optional[str], +) -> None: + """Encode mixed array as list items. + + Args: + arr: Mixed array + options: Resolved encoding options + writer: Line writer for output + depth: Current indentation depth + key: Optional key name + """ + header = format_header(key, len(arr), None, options.delimiter, options.lengthMarker) + writer.push(depth, header) + + for item in arr: + if is_json_primitive(item): + writer.push(depth + 1, f"{LIST_ITEM_PREFIX}{encode_primitive(item, options.delimiter)}") + elif is_json_object(item): + encode_object_as_list_item(item, options, writer, depth + 1) + elif is_json_array(item): + encode_array(item, options, writer, depth + 1, None) + + +def encode_object_as_list_item( + obj: JsonObject, options: ResolvedEncodeOptions, writer: LineWriter, depth: Depth +) -> None: + """Encode object as a list item. 
+ + Args: + obj: Object to encode + options: Resolved encoding options + writer: Line writer for output + depth: Current indentation depth + """ + # Get all keys + keys = list(obj.items()) + if not keys: + writer.push(depth, LIST_ITEM_PREFIX.rstrip()) + return + + # First key-value pair goes on same line as the "-" + first_key, first_value = keys[0] + if is_json_primitive(first_value): + encoded_val = encode_primitive(first_value, options.delimiter) + writer.push(depth, f"{LIST_ITEM_PREFIX}{encode_key(first_key)}: {encoded_val}") + else: + # If first value is not primitive, put "-" alone then encode normally + writer.push(depth, LIST_ITEM_PREFIX.rstrip()) + encode_key_value_pair(first_key, first_value, options, writer, depth + 1) + + # Rest of the keys go normally indented + for key, value in keys[1:]: + encode_key_value_pair(key, value, options, writer, depth + 1) diff --git a/src/toon_format/normalize.py b/src/toon_format/normalize.py new file mode 100644 index 0000000..7c03637 --- /dev/null +++ b/src/toon_format/normalize.py @@ -0,0 +1,100 @@ +"""Value normalization for TOON encoding.""" + +import math +from datetime import date, datetime +from decimal import Decimal +from typing import Any, List + +from .types import JsonValue + + +def normalize_value(value: Any) -> JsonValue: + """Normalize a value to JSON-compatible type. 
+ + Args: + value: Input value + + Returns: + JSON-compatible value + """ + # Handle None and booleans + if value is None or isinstance(value, bool): + return value + + # Handle numbers + if isinstance(value, (int, float)): + # Convert -0 to 0 + if value == 0: + return 0 + # Convert NaN and Infinity to null + if math.isnan(value) or math.isinf(value): + return None + return value + + # Handle Decimal + if isinstance(value, Decimal): + if not value.is_finite(): + return None + return float(value) + + # Handle strings + if isinstance(value, str): + return value + + # Handle dates + if isinstance(value, (date, datetime)): + return value.isoformat() + + # Handle lists/tuples + if isinstance(value, (list, tuple)): + return [normalize_value(item) for item in value] + + # Handle sets + if isinstance(value, set): + return [normalize_value(item) for item in value] + + # Handle dicts + if isinstance(value, dict): + return {str(key): normalize_value(val) for key, val in value.items()} + + # Handle callables, undefined, symbols -> null + if callable(value): + return None + + # Try to convert to string, otherwise null + try: + if hasattr(value, "__dict__"): + return None + return str(value) + except Exception: + return None + + +def is_json_primitive(value: Any) -> bool: + """Check if value is a JSON primitive.""" + return value is None or isinstance(value, (bool, int, float, str)) + + +def is_json_array(value: Any) -> bool: + """Check if value is an array.""" + return isinstance(value, list) + + +def is_json_object(value: Any) -> bool: + """Check if value is an object (dict but not a list).""" + return isinstance(value, dict) and not isinstance(value, list) + + +def is_array_of_primitives(arr: List[Any]) -> bool: + """Check if all array elements are primitives.""" + return all(is_json_primitive(item) for item in arr) + + +def is_array_of_arrays(arr: List[Any]) -> bool: + """Check if all array elements are arrays.""" + return all(is_json_array(item) for item in arr) + + +def 
is_array_of_objects(arr: List[Any]) -> bool: + """Check if all array elements are objects.""" + return all(is_json_object(item) for item in arr) diff --git a/src/toon_format/primitives.py b/src/toon_format/primitives.py new file mode 100644 index 0000000..8d494d7 --- /dev/null +++ b/src/toon_format/primitives.py @@ -0,0 +1,205 @@ +"""Primitive encoding utilities.""" + +import re +from typing import List, Optional + +from .constants import ( + BACKSLASH, + CARRIAGE_RETURN, + CLOSE_BRACE, + CLOSE_BRACKET, + COLON, + COMMA, + DOUBLE_QUOTE, + FALSE_LITERAL, + LIST_ITEM_MARKER, + NEWLINE, + NULL_LITERAL, + OPEN_BRACE, + OPEN_BRACKET, + TAB, + TRUE_LITERAL, +) +from .types import Delimiter, JsonPrimitive + + +def encode_primitive(value: JsonPrimitive, delimiter: str = COMMA) -> str: + """Encode a primitive value. + + Args: + value: Primitive value + delimiter: Current delimiter being used + + Returns: + Encoded string + """ + if value is None: + return NULL_LITERAL + if isinstance(value, bool): + return TRUE_LITERAL if value else FALSE_LITERAL + if isinstance(value, (int, float)): + return str(value) + if isinstance(value, str): + return encode_string_literal(value, delimiter) + return str(value) + + +def escape_string(value: str) -> str: + """Escape special characters in a string. + + Args: + value: String to escape + + Returns: + Escaped string + """ + result = value + result = result.replace(BACKSLASH, BACKSLASH + BACKSLASH) + result = result.replace(DOUBLE_QUOTE, BACKSLASH + DOUBLE_QUOTE) + result = result.replace(NEWLINE, BACKSLASH + "n") + result = result.replace(CARRIAGE_RETURN, BACKSLASH + "r") + result = result.replace(TAB, BACKSLASH + "t") + return result + + +def is_safe_unquoted(value: str, delimiter: str = COMMA) -> bool: + """Check if a string can be safely unquoted. 
+ + Args: + value: String to check + delimiter: Current delimiter being used + + Returns: + True if string doesn't need quotes + """ + if not value: + return False + + # Check for leading/trailing whitespace + if value != value.strip(): + return False + + # Check for reserved literals + if value in (NULL_LITERAL, TRUE_LITERAL, FALSE_LITERAL): + return False + + # Check if it looks like a number + try: + float(value) + return False + except ValueError: + pass + + # Check if starts with list marker (hyphen) + if value.startswith(LIST_ITEM_MARKER): + return False + + # Check for structural characters (including current delimiter) + unsafe_chars = [ + COLON, + delimiter, # Current delimiter + OPEN_BRACKET, + CLOSE_BRACKET, + OPEN_BRACE, + CLOSE_BRACE, + DOUBLE_QUOTE, + BACKSLASH, + NEWLINE, + CARRIAGE_RETURN, + TAB, + ] + + if any(char in value for char in unsafe_chars): + return False + + return True + + +def encode_string_literal(value: str, delimiter: str = COMMA) -> str: + """Encode a string, quoting only if necessary. + + Args: + value: String value + delimiter: Current delimiter being used + + Returns: + Encoded string + """ + if is_safe_unquoted(value, delimiter): + return value + return f'{DOUBLE_QUOTE}{escape_string(value)}{DOUBLE_QUOTE}' + + +def encode_key(key: str) -> str: + """Encode an object key. + + Args: + key: Key string + + Returns: + Encoded key + """ + # Keys matching /^[A-Z_][\w.]*$/i don't require quotes + if re.match(r"^[A-Z_][\w.]*$", key, re.IGNORECASE): + return key + return f'{DOUBLE_QUOTE}{escape_string(key)}{DOUBLE_QUOTE}' + + +def join_encoded_values(values: List[str], delimiter: Delimiter) -> str: + """Join encoded primitive values with a delimiter. 
+ + Args: + values: List of encoded values + delimiter: Delimiter to use + + Returns: + Joined string + """ + return delimiter.join(values) + + +def format_header( + key: Optional[str], + length: int, + fields: Optional[List[str]], + delimiter: Delimiter, + length_marker: Optional[str], +) -> str: + """Format array/table header. + + Args: + key: Optional key name + length: Array length + fields: Optional field names for tabular format + delimiter: Delimiter character + length_marker: Optional length marker prefix + + Returns: + Formatted header string + """ + # Build length marker + marker_prefix = length_marker if length_marker else "" + + # Build fields if provided + fields_str = "" + if fields: + fields_str = f"{OPEN_BRACE}{delimiter.join(fields)}{CLOSE_BRACE}" + + # Build length string with delimiter when needed + # Rules: + # - WITH fields: always include delimiter in bracket: [N,] or [N|] or [N\t] + # - WITHOUT fields: only include if delimiter is not comma: [N] vs [N|] + if fields: + # Tabular format: always show delimiter after length + length_str = f"{OPEN_BRACKET}{marker_prefix}{length}{delimiter}{CLOSE_BRACKET}" + elif delimiter != COMMA: + # Primitive array with non-comma delimiter: show delimiter + length_str = f"{OPEN_BRACKET}{marker_prefix}{length}{delimiter}{CLOSE_BRACKET}" + else: + # Primitive array with comma delimiter: just [length] + length_str = f"{OPEN_BRACKET}{marker_prefix}{length}{CLOSE_BRACKET}" + + # Combine parts + if key: + return f"{encode_key(key)}{length_str}{fields_str}{COLON}" + return f"{length_str}{fields_str}{COLON}" diff --git a/src/toon_format/types.py b/src/toon_format/types.py index 58c0127..d279e90 100644 --- a/src/toon_format/types.py +++ b/src/toon_format/types.py @@ -1,37 +1,58 @@ -"""Type definitions for TOON encoder and decoder.""" +"""Type definitions for pytoon.""" -from __future__ import annotations - -from typing import Any, Literal, TypeAlias, TypedDict +from typing import Any, Dict, List, Literal, TypedDict, 
Union # JSON-compatible types -JsonPrimitive: TypeAlias = str | int | float | bool | None -JsonValue: TypeAlias = JsonPrimitive | dict[str, "JsonValue"] | list["JsonValue"] -JsonObject: TypeAlias = dict[str, JsonValue] -JsonArray: TypeAlias = list[JsonValue] +JsonPrimitive = Union[str, int, float, bool, None] +JsonObject = Dict[str, Any] +JsonArray = List[Any] +JsonValue = Union[JsonPrimitive, JsonArray, JsonObject] + +# Delimiter type +Delimiter = str +DelimiterKey = Literal["comma", "tab", "pipe"] class EncodeOptions(TypedDict, total=False): - """Options for encoding values to TOON format. + """Options for TOON encoding. Attributes: indent: Number of spaces per indentation level (default: 2) - delimiter: Delimiter for array values and tabular rows (default: ',') - length_marker: Optional marker to prefix array lengths (default: False) + delimiter: Delimiter character for arrays (default: comma) + lengthMarker: Optional marker to prefix array lengths (default: False) """ indent: int - delimiter: Literal[",", "\t", "|"] - length_marker: Literal["#", False] + delimiter: Delimiter + lengthMarker: Literal["#", False] + + +class ResolvedEncodeOptions: + """Resolved encoding options with defaults applied.""" + + def __init__( + self, + indent: int = 2, + delimiter: str = ",", + length_marker: Literal["#", False] = False, + ) -> None: + self.indent = indent + self.delimiter = delimiter + self.lengthMarker = length_marker -class DecodeOptions(TypedDict, total=False): - """Options for decoding TOON format to values. +class DecodeOptions: + """Options for TOON decoding. 
Attributes: - indent: Expected number of spaces per indentation level (default: 2) + indent: Number of spaces per indentation level (default: 2) strict: Enable strict validation (default: True) """ - indent: int - strict: bool + def __init__(self, indent: int = 2, strict: bool = True) -> None: + self.indent = indent + self.strict = strict + + +# Depth type for tracking indentation level +Depth = int diff --git a/src/toon_format/writer.py b/src/toon_format/writer.py new file mode 100644 index 0000000..7a6ff05 --- /dev/null +++ b/src/toon_format/writer.py @@ -0,0 +1,36 @@ +"""Line writer for managing indented output.""" + +from typing import List + +from .types import Depth + + +class LineWriter: + """Manages indented text output.""" + + def __init__(self, indent_size: int) -> None: + """Initialize the line writer. + + Args: + indent_size: Number of spaces per indentation level + """ + self._lines: List[str] = [] + self._indentation_string = " " * indent_size + + def push(self, depth: Depth, content: str) -> None: + """Add a line with appropriate indentation. + + Args: + depth: Indentation depth level + content: Content to add + """ + indent = self._indentation_string * depth + self._lines.append(f"{indent}{content}") + + def to_string(self) -> str: + """Return all lines joined with newlines. 
+ + Returns: + Complete output string + """ + return "\n".join(self._lines) diff --git a/tests/test_decoder.py b/tests/test_decoder.py index e3c1221..d409e72 100644 --- a/tests/test_decoder.py +++ b/tests/test_decoder.py @@ -1,67 +1,350 @@ -"""Tests for the TOON decoder.""" +"""Tests for TOON decoder.""" import pytest -from toon_format import decode - - -def test_decode_not_implemented(): - """Test that decode raises NotImplementedError.""" - with pytest.raises(NotImplementedError, match="not yet implemented"): - decode("key: value") - - -def test_decode_with_options_not_implemented(): - """Test that decode with options raises NotImplementedError.""" - with pytest.raises(NotImplementedError, match="not yet implemented"): - decode("[3]: 1,2,3", {"strict": False}) - - -# Placeholder tests for future implementation -@pytest.mark.skip(reason="Implementation pending") -def test_decode_simple_object(): - """Test decoding a simple object.""" - toon_data = "id: 123\nname: Ada\nactive: true" - result = decode(toon_data) - expected = {"id": 123, "name": "Ada", "active": True} - assert result == expected - - -@pytest.mark.skip(reason="Implementation pending") -def test_decode_array_of_objects(): - """Test decoding a tabular array.""" - toon_data = "items[2]{sku,qty,price}:\n A1,2,9.99\n B2,1,14.5" - result = decode(toon_data) - expected = { - "items": [ - {"sku": "A1", "qty": 2, "price": 9.99}, - {"sku": "B2", "qty": 1, "price": 14.5}, - ] - } - assert result == expected - - -@pytest.mark.skip(reason="Implementation pending") -def test_decode_primitive_array(): - """Test decoding a primitive array.""" - toon_data = "tags[3]: foo,bar,baz" - result = decode(toon_data) - expected = {"tags": ["foo", "bar", "baz"]} - assert result == expected - - -@pytest.mark.skip(reason="Implementation pending") -def test_decode_root_array(): - """Test decoding a root-level array.""" - toon_data = "[3]: 1,2,3" - result = decode(toon_data) - expected = [1, 2, 3] - assert result == expected - - 
-@pytest.mark.skip(reason="Implementation pending") -def test_decode_strict_mode(): - """Test that strict mode validates input.""" - invalid_toon = "items[3]{id,name}:\n 1,Alice\n 2,Bob" # Length mismatch - with pytest.raises(ValueError, match="length"): - decode(invalid_toon, {"strict": True}) +from toon_format import ToonDecodeError, decode +from toon_format.types import DecodeOptions + + +class TestBasicDecoding: + """Test basic decoding functionality.""" + + def test_decode_simple_object(self): + """Test decoding a simple object.""" + toon = """id: 123 +name: Ada +active: true""" + result = decode(toon) + assert result == {"id": 123, "name": "Ada", "active": True} + + def test_decode_nested_object(self): + """Test decoding a nested object.""" + toon = """user: + id: 123 + name: Ada""" + result = decode(toon) + assert result == {"user": {"id": 123, "name": "Ada"}} + + def test_decode_inline_primitive_array(self): + """Test decoding an inline primitive array.""" + toon = "tags[3]: reading,gaming,coding" + result = decode(toon) + assert result == {"tags": ["reading", "gaming", "coding"]} + + def test_decode_empty_array(self): + """Test decoding an empty array.""" + toon = "items[0]:" + result = decode(toon) + assert result == {"items": []} + + def test_decode_tabular_array(self): + """Test decoding a tabular array.""" + toon = """items[2]{sku,qty,price}: + A1,2,9.99 + B2,1,14.5""" + result = decode(toon) + assert result == { + "items": [ + {"sku": "A1", "qty": 2, "price": 9.99}, + {"sku": "B2", "qty": 1, "price": 14.5}, + ] + } + + def test_decode_list_array_with_objects(self): + """Test decoding a list array with objects.""" + toon = """items[2]: + - id: 1 + name: First + - id: 2 + name: Second""" + result = decode(toon) + assert result == { + "items": [ + {"id": 1, "name": "First"}, + {"id": 2, "name": "Second"}, + ] + } + + def test_decode_list_array_with_primitives(self): + """Test decoding a list array with primitives.""" + toon = """items[3]: + - 1 + - foo + 
- true""" + result = decode(toon) + assert result == {"items": [1, "foo", True]} + + def test_decode_root_array(self): + """Test decoding a root array.""" + toon = "[3]: a,b,c" + result = decode(toon) + assert result == ["a", "b", "c"] + + def test_decode_root_primitive(self): + """Test decoding a root primitive.""" + toon = "hello world" + result = decode(toon) + assert result == "hello world" + + def test_decode_quoted_strings(self): + """Test decoding quoted strings.""" + toon = 'name: "hello, world"' + result = decode(toon) + assert result == {"name": "hello, world"} + + def test_decode_escaped_strings(self): + """Test decoding escaped strings.""" + toon = r'text: "line1\nline2"' + result = decode(toon) + assert result == {"text": "line1\nline2"} + + def test_decode_booleans_and_null(self): + """Test decoding booleans and null.""" + toon = """active: true +inactive: false +missing: null""" + result = decode(toon) + assert result == {"active": True, "inactive": False, "missing": None} + + def test_decode_numbers(self): + """Test decoding various number formats.""" + toon = """int: 42 +negative: -10 +float: 3.14 +exponent: 1e-6""" + result = decode(toon) + assert result == { + "int": 42, + "negative": -10, + "float": 3.14, + "exponent": 1e-6, + } + + +class TestDelimiters: + """Test different delimiter types.""" + + def test_decode_tab_delimiter_primitive_array(self): + """Test tab-delimited primitive array.""" + toon = "tags[3\t]: reading\tgaming\tcoding" + result = decode(toon) + assert result == {"tags": ["reading", "gaming", "coding"]} + + def test_decode_tab_delimiter_tabular(self): + """Test tab-delimited tabular array.""" + toon = """items[2\t]{sku\tqty}: + A1\t5 + B2\t3""" + result = decode(toon) + assert result == { + "items": [ + {"sku": "A1", "qty": 5}, + {"sku": "B2", "qty": 3}, + ] + } + + def test_decode_pipe_delimiter_primitive_array(self): + """Test pipe-delimited primitive array.""" + toon = "tags[3|]: reading|gaming|coding" + result = 
decode(toon) + assert result == {"tags": ["reading", "gaming", "coding"]} + + def test_decode_pipe_delimiter_tabular(self): + """Test pipe-delimited tabular array.""" + toon = """items[2|]{sku|qty}: + A1|5 + B2|3""" + result = decode(toon) + assert result == { + "items": [ + {"sku": "A1", "qty": 5}, + {"sku": "B2", "qty": 3}, + ] + } + + +class TestLengthMarker: + """Test length marker support.""" + + def test_decode_with_length_marker(self): + """Test decoding with # length marker.""" + toon = "tags[#3]: a,b,c" + result = decode(toon) + assert result == {"tags": ["a", "b", "c"]} + + def test_decode_tabular_with_length_marker(self): + """Test tabular array with # length marker.""" + toon = """items[#2]{id,name}: + 1,Alice + 2,Bob""" + result = decode(toon) + assert result == { + "items": [ + {"id": 1, "name": "Alice"}, + {"id": 2, "name": "Bob"}, + ] + } + + +class TestStrictMode: + """Test strict mode validation.""" + + def test_strict_array_length_mismatch(self): + """Test that strict mode errors on length mismatch.""" + toon = "items[3]: a,b" # Declared 3, only 2 values + with pytest.raises(ToonDecodeError, match="Expected 3 values"): + decode(toon) + + def test_non_strict_array_length_mismatch(self): + """Test that non-strict mode allows length mismatch.""" + toon = "items[3]: a,b" + options = DecodeOptions(strict=False) + result = decode(toon, options) + assert result == {"items": ["a", "b"]} + + def test_strict_indentation_error(self): + """Test that strict mode errors on bad indentation.""" + toon = """user: + id: 1""" # 3 spaces instead of 2 + with pytest.raises(ToonDecodeError, match="exact multiple"): + decode(toon) + + def test_strict_tabular_row_width_mismatch(self): + """Test that strict mode errors on row width mismatch.""" + toon = """items[2]{a,b,c}: + 1,2,3 + 4,5""" # Second row has only 2 values instead of 3 + with pytest.raises(ToonDecodeError, match="Expected 3 values"): + decode(toon) + + +class TestEdgeCases: + """Test edge cases and error 
handling.""" + + def test_decode_empty_string_value(self): + """Test decoding empty string values.""" + toon = 'text: ""' + result = decode(toon) + assert result == {"text": ""} + + def test_decode_quoted_keywords(self): + """Test that quoted keywords remain strings.""" + toon = """items[3]: "true","false","null" """ + result = decode(toon) + assert result == {"items": ["true", "false", "null"]} + + def test_decode_quoted_numbers(self): + """Test that quoted numbers remain strings.""" + toon = """items[2]: "42","3.14" """ + result = decode(toon) + assert result == {"items": ["42", "3.14"]} + + def test_invalid_escape_sequence(self): + """Test that invalid escape sequences error.""" + toon = r'text: "invalid\x"' + with pytest.raises(ToonDecodeError, match="Invalid escape"): + decode(toon) + + def test_unterminated_string(self): + """Test that unterminated strings error.""" + toon = 'text: "unterminated' + with pytest.raises(ToonDecodeError, match="Unterminated"): + decode(toon) + + def test_missing_colon(self): + """Test that missing colon errors in strict mode.""" + toon = """key: value +invalid line without colon""" + with pytest.raises(ToonDecodeError, match="Missing colon"): + decode(toon) + + +class TestComplexStructures: + """Test complex nested structures.""" + + def test_nested_tabular_in_list(self): + """Test tabular array inside a list item.""" + toon = """items[1]: + - users[2]{id,name}: + 1,Alice + 2,Bob + status: active""" + result = decode(toon) + assert result == { + "items": [ + { + "users": [ + {"id": 1, "name": "Alice"}, + {"id": 2, "name": "Bob"}, + ], + "status": "active", + } + ] + } + + def test_array_of_arrays(self): + """Test array of arrays.""" + toon = """pairs[2]: + - [2]: 1,2 + - [2]: 3,4""" + result = decode(toon) + assert result == {"pairs": [[1, 2], [3, 4]]} + + def test_deeply_nested_objects(self): + """Test deeply nested object structures.""" + toon = """root: + level1: + level2: + level3: + value: deep""" + result = decode(toon) + 
assert result == { + "root": { + "level1": { + "level2": { + "level3": {"value": "deep"} + } + } + } + } + + +class TestRoundtrip: + """Test encoding and decoding roundtrip.""" + + def test_roundtrip_simple(self): + """Test simple roundtrip.""" + from toon_format import encode + + original = {"id": 123, "name": "Ada", "active": True} + toon = encode(original) + decoded = decode(toon) + assert decoded == original + + def test_roundtrip_tabular(self): + """Test tabular array roundtrip.""" + from toon_format import encode + + original = { + "items": [ + {"sku": "A1", "qty": 2, "price": 9.99}, + {"sku": "B2", "qty": 1, "price": 14.5}, + ] + } + toon = encode(original) + decoded = decode(toon) + assert decoded == original + + def test_roundtrip_nested(self): + """Test nested structure roundtrip.""" + from toon_format import encode + + original = { + "user": { + "id": 123, + "profile": {"name": "Ada", "tags": ["dev", "ops"]}, + } + } + toon = encode(original) + decoded = decode(toon) + assert decoded == original diff --git a/tests/test_encoder.py b/tests/test_encoder.py index e7411d6..9d0bca0 100644 --- a/tests/test_encoder.py +++ b/tests/test_encoder.py @@ -1,58 +1,294 @@ -"""Tests for the TOON encoder.""" - -import pytest +"""Tests for TOON encoder.""" from toon_format import encode -def test_encode_not_implemented(): - """Test that encode raises NotImplementedError.""" - with pytest.raises(NotImplementedError, match="not yet implemented"): - encode({"key": "value"}) +class TestPrimitives: + """Test encoding of primitive values.""" + + def test_null(self) -> None: + assert encode(None) == "null" + + def test_boolean_true(self) -> None: + assert encode(True) == "true" + + def test_boolean_false(self) -> None: + assert encode(False) == "false" + + def test_integer(self) -> None: + assert encode(42) == "42" + + def test_float(self) -> None: + result = encode(3.14) + assert result.startswith("3.14") + + def test_string_simple(self) -> None: + assert encode("hello") == 
"hello" + + def test_string_with_spaces(self) -> None: + # Spaces don't require quoting unless there are structural characters + assert encode("hello world") == "hello world" + + def test_string_empty(self) -> None: + assert encode("") == '""' + + def test_string_special_keywords(self) -> None: + assert encode("null") == '"null"' + assert encode("true") == '"true"' + assert encode("false") == '"false"' + + def test_string_with_hyphens(self) -> None: + # Strings starting with hyphen must be quoted (list marker conflict) + assert encode("-hello") == '"-hello"' + assert encode("-") == '"-"' + # Strings containing or ending with hyphen don't need quotes + assert encode("hello-world") == "hello-world" + assert encode("hello-") == "hello-" + + +class TestObjects: + """Test encoding of objects.""" + + def test_simple_object(self) -> None: + obj = {"name": "Alice", "age": 30} + result = encode(obj) + assert "name: Alice" in result + assert "age: 30" in result + + def test_nested_object(self) -> None: + obj = {"user": {"name": "Bob", "city": "NYC"}} + result = encode(obj) + assert "user:" in result + assert "name: Bob" in result + assert "city: NYC" in result + def test_object_with_null(self) -> None: + obj = {"value": None} + result = encode(obj) + assert "value: null" in result -def test_encode_with_options_not_implemented(): - """Test that encode with options raises NotImplementedError.""" - with pytest.raises(NotImplementedError, match="not yet implemented"): - encode([1, 2, 3], {"delimiter": "\t"}) + def test_empty_object(self) -> None: + result = encode({}) + assert result == "" -# Placeholder tests for future implementation -@pytest.mark.skip(reason="Implementation pending") -def test_encode_simple_object(): - """Test encoding a simple object.""" - result = encode({"id": 123, "name": "Ada", "active": True}) - expected = "id: 123\nname: Ada\nactive: true" - assert result == expected +class TestPrimitiveArrays: + """Test encoding of primitive arrays.""" + def 
test_number_array(self) -> None: + arr = [1, 2, 3, 4, 5] + result = encode(arr) + # Primitive arrays always include length marker + assert result == "[5]: 1,2,3,4,5" -@pytest.mark.skip(reason="Implementation pending") -def test_encode_array_of_objects(): - """Test encoding an array of uniform objects.""" - data = { - "items": [ - {"sku": "A1", "qty": 2, "price": 9.99}, - {"sku": "B2", "qty": 1, "price": 14.5}, + def test_string_array(self) -> None: + arr = ["apple", "banana", "cherry"] + result = encode(arr) + # Primitive arrays always include length marker + assert result == "[3]: apple,banana,cherry" + + def test_mixed_primitive_array(self) -> None: + arr = [1, "two", True, None] + result = encode(arr) + assert "1" in result + assert "two" in result + assert "true" in result + assert "null" in result + + def test_empty_array(self) -> None: + result = encode([]) + # Empty arrays show length marker with colon + assert result == "[0]:" + + +class TestTabularArrays: + """Test encoding of tabular (uniform object) arrays.""" + + def test_simple_tabular(self) -> None: + arr = [ + {"id": 1, "name": "Alice", "age": 30}, + {"id": 2, "name": "Bob", "age": 25}, + {"id": 3, "name": "Charlie", "age": 35}, + ] + result = encode(arr) + # Should have header with keys + assert "{id,name,age}" in result + # Should have data rows + assert "1,Alice,30" in result + assert "2,Bob,25" in result + assert "3,Charlie,35" in result + + def test_tabular_with_strings_needing_quotes(self) -> None: + arr = [ + {"name": "Alice Smith", "city": "New York"}, + {"name": "Bob Jones", "city": "Los Angeles"}, ] - } - result = encode(data) - expected = "items[2]{sku,qty,price}:\n A1,2,9.99\n B2,1,14.5" - assert result == expected - - -@pytest.mark.skip(reason="Implementation pending") -def test_encode_with_tab_delimiter(): - """Test encoding with tab delimiter.""" - data = {"tags": ["foo", "bar", "baz"]} - result = encode(data, {"delimiter": "\t"}) - expected = "tags[3\t]: foo\tbar\tbaz" - assert result 
== expected - - -@pytest.mark.skip(reason="Implementation pending") -def test_encode_with_length_marker(): - """Test encoding with length marker.""" - data = {"tags": ["foo", "bar"]} - result = encode(data, {"length_marker": "#"}) - expected = "tags[#2]: foo,bar" - assert result == expected + result = encode(arr) + # Spaces don't require quoting in tabular format + assert "Alice Smith" in result + assert "New York" in result + + def test_tabular_with_length_marker(self) -> None: + arr = [ + {"id": 1, "value": "a"}, + {"id": 2, "value": "b"}, + ] + result = encode(arr, {"lengthMarker": "#"}) + # lengthMarker adds # prefix before length + assert "[#2,]" in result + + +class TestMixedArrays: + """Test encoding of mixed/nested arrays.""" + + def test_array_of_mixed_types(self) -> None: + arr = [ + {"name": "Alice"}, + 42, + "hello", + ] + result = encode(arr) + # Should use list format with hyphens + assert "- " in result + assert "name: Alice" in result + + def test_nested_array(self) -> None: + arr = [ + [1, 2, 3], + [4, 5, 6], + ] + result = encode(arr) + # Nested arrays use list format with length markers + assert "[2]:" in result + assert "- " in result + assert "[3,]:" in result # Inner arrays show length with delimiter + + +class TestObjectsWithArrays: + """Test objects containing arrays.""" + + def test_object_with_primitive_array(self) -> None: + obj = {"numbers": [1, 2, 3]} + result = encode(obj) + # Primitive arrays always include length marker + assert "numbers[3]: 1,2,3" in result + + def test_object_with_tabular_array(self) -> None: + obj = { + "users": [ + {"id": 1, "name": "Alice"}, + {"id": 2, "name": "Bob"}, + ] + } + result = encode(obj) + # Tabular arrays include length with delimiter + assert "users[2,]{id,name}:" in result + assert "1,Alice" in result + + +class TestDelimiters: + """Test different delimiter options.""" + + def test_comma_delimiter(self) -> None: + arr = [1, 2, 3] + result = encode(arr, {"delimiter": ","}) + assert result == "[3]: 
1,2,3" + + def test_tab_delimiter(self) -> None: + arr = [1, 2, 3] + result = encode(arr, {"delimiter": "\t"}) + assert result == "[3\t]: 1\t2\t3" + + def test_pipe_delimiter(self) -> None: + arr = [1, 2, 3] + result = encode(arr, {"delimiter": "|"}) + assert result == "[3|]: 1|2|3" + + def test_tabular_with_pipe_delimiter(self) -> None: + arr = [ + {"a": 1, "b": 2}, + {"a": 3, "b": 4}, + ] + result = encode(arr, {"delimiter": "|"}) + assert "{a|b}" in result + assert "1|2" in result + + +class TestIndentation: + """Test indentation options.""" + + def test_default_indentation(self) -> None: + obj = {"parent": {"child": "value"}} + result = encode(obj) + lines = result.split("\n") + # Child should be indented by 2 spaces + assert lines[1].startswith(" ") + + def test_custom_indentation(self) -> None: + obj = {"parent": {"child": "value"}} + result = encode(obj, {"indent": 4}) + lines = result.split("\n") + # Child should be indented by 4 spaces + assert lines[1].startswith(" ") + + +class TestComplexStructures: + """Test complex nested structures.""" + + def test_deep_nesting(self) -> None: + obj = { + "level1": { + "level2": { + "level3": {"value": "deep"}, + } + } + } + result = encode(obj) + assert "level1:" in result + assert "level2:" in result + assert "level3:" in result + assert "value: deep" in result + + def test_mixed_structure(self) -> None: + obj = { + "metadata": {"version": 1, "author": "test"}, + "items": [ + {"id": 1, "name": "Item1"}, + {"id": 2, "name": "Item2"}, + ], + "tags": ["alpha", "beta", "gamma"], + } + result = encode(obj) + assert "metadata:" in result + assert "version: 1" in result + # Tabular arrays include length with delimiter + assert "items[2,]{id,name}:" in result + # Primitive arrays include length marker + assert "tags[3]: alpha,beta,gamma" in result + + +class TestEdgeCases: + """Test edge cases and special values.""" + + def test_infinity(self) -> None: + assert encode(float("inf")) == "null" + assert encode(float("-inf")) 
== "null" + + def test_nan(self) -> None: + assert encode(float("nan")) == "null" + + def test_callable(self) -> None: + def func() -> None: + pass + + assert encode(func) == "null" + + def test_none_in_object(self) -> None: + obj = {"key": None} + result = encode(obj) + assert "key: null" in result + + def test_empty_string_in_array(self) -> None: + arr = ["", "hello", ""] + result = encode(arr) + assert '""' in result