diff --git a/.idea/misc.xml b/.idea/misc.xml index 7c8801c..244e571 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -4,7 +4,7 @@ - + \ No newline at end of file diff --git a/README.md b/README.md index e76f6dc..b97aac2 100644 --- a/README.md +++ b/README.md @@ -38,357 +38,21 @@ users[2]{id,name,role}: -## Format Comparison - -Format familiarity matters as much as token count. - -- **CSV:** best for uniform tables. -- **JSON:** best for non-uniform data. -- **TOON:** best for uniform complex (but not deeply nested) objects. - -TOON switches to list format for non-uniform arrays. In those cases, JSON can be cheaper at scale. - -## Key Features - -- ๐Ÿ’ธ **Token-efficient:** typically 30โ€“60% fewer tokens than JSON -- ๐Ÿคฟ **LLM-friendly guardrails:** explicit lengths and field lists help models validate output -- ๐Ÿฑ **Minimal syntax:** removes redundant punctuation (braces, brackets, most quotes) -- ๐Ÿ“ **Indentation-based structure:** replaces braces with whitespace for better readability -- ๐Ÿงบ **Tabular arrays:** declare keys once, then stream rows without repetition - ## Benchmarks - - -### Token Efficiency - -``` -โญ GitHub Repositories โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ 8,745 tokens - vs JSON: 15,145 ๐Ÿ’ฐ 42.3% saved - vs XML: 17,095 ๐Ÿ’ฐ 48.8% saved - -๐Ÿ“ˆ Daily Analytics โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ 4,507 tokens - vs JSON: 10,977 ๐Ÿ’ฐ 58.9% saved - vs XML: 13,128 ๐Ÿ’ฐ 65.7% saved - -๐Ÿ›’ E-Commerce Order โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ 166 tokens - vs JSON: 257 ๐Ÿ’ฐ 35.4% saved - vs XML: 271 ๐Ÿ’ฐ 38.7% saved - -โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -Total โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ 13,418 tokens - vs 
JSON: 26,379 💰 49.1% saved - vs XML: 30,494 💰 56.0% saved -``` - -
-View detailed examples - -#### โญ GitHub Repositories - -**Configuration:** Top 100 GitHub repositories with stars, forks, and metadata - -**Savings:** 6,400 tokens (42.3% reduction vs JSON) - -**JSON** (15,145 tokens): - -```json -{ - "repositories": [ - { - "id": 28457823, - "name": "freeCodeCamp", - "repo": "freeCodeCamp/freeCodeCamp", - "description": "freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,โ€ฆ", - "createdAt": "2014-12-24T17:49:19Z", - "updatedAt": "2025-10-27T07:40:58Z", - "pushedAt": "2025-10-26T11:31:08Z", - "stars": 430828, - "watchers": 8582, - "forks": 42136, - "defaultBranch": "main" - }, - { - "id": 132750724, - "name": "build-your-own-x", - "repo": "codecrafters-io/build-your-own-x", - "description": "Master programming by recreating your favorite technologies from scratch.", - "createdAt": "2018-05-09T12:03:18Z", - "updatedAt": "2025-10-27T07:43:25Z", - "pushedAt": "2025-10-10T18:45:01Z", - "stars": 430102, - "watchers": 6322, - "forks": 40388, - "defaultBranch": "master" - }, - { - "id": 21737465, - "name": "awesome", - "repo": "sindresorhus/awesome", - "description": "๐Ÿ˜Ž Awesome lists about all kinds of interesting topics", - "createdAt": "2014-07-11T13:42:37Z", - "updatedAt": "2025-10-27T07:44:27Z", - "pushedAt": "2025-10-23T17:26:53Z", - "stars": 409760, - "watchers": 8016, - "forks": 32015, - "defaultBranch": "main" - } - ] -} -``` - -**TOON** (8,745 tokens): - -``` -repositories[3]{id,name,repo,description,createdAt,updatedAt,pushedAt,stars,watchers,forks,defaultBranch}: - 28457823,freeCodeCamp,freeCodeCamp/freeCodeCamp,"freeCodeCamp.org's open-source codebase and curriculum. 
Learn math, programming,โ€ฆ","2014-12-24T17:49:19Z","2025-10-27T07:40:58Z","2025-10-26T11:31:08Z",430828,8582,42136,main - 132750724,build-your-own-x,codecrafters-io/build-your-own-x,Master programming by recreating your favorite technologies from scratch.,"2018-05-09T12:03:18Z","2025-10-27T07:43:25Z","2025-10-10T18:45:01Z",430102,6322,40388,master - 21737465,awesome,sindresorhus/awesome,๐Ÿ˜Ž Awesome lists about all kinds of interesting topics,"2014-07-11T13:42:37Z","2025-10-27T07:44:27Z","2025-10-23T17:26:53Z",409760,8016,32015,main -``` - ---- - -#### ๐Ÿ“ˆ Daily Analytics - -**Configuration:** 180 days of web metrics (views, clicks, conversions, revenue) - -**Savings:** 6,470 tokens (58.9% reduction vs JSON) - -**JSON** (10,977 tokens): - -```json -{ - "metrics": [ - { - "date": "2025-01-01", - "views": 6890, - "clicks": 401, - "conversions": 23, - "revenue": 6015.59, - "bounceRate": 0.63 - }, - { - "date": "2025-01-02", - "views": 6940, - "clicks": 323, - "conversions": 37, - "revenue": 9086.44, - "bounceRate": 0.36 - }, - { - "date": "2025-01-03", - "views": 4390, - "clicks": 346, - "conversions": 26, - "revenue": 6360.75, - "bounceRate": 0.48 - }, - { - "date": "2025-01-04", - "views": 3429, - "clicks": 231, - "conversions": 13, - "revenue": 2360.96, - "bounceRate": 0.65 - }, - { - "date": "2025-01-05", - "views": 5804, - "clicks": 186, - "conversions": 22, - "revenue": 2535.96, - "bounceRate": 0.37 - } - ] -} -``` - -**TOON** (4,507 tokens): - -``` -metrics[5]{date,views,clicks,conversions,revenue,bounceRate}: - 2025-01-01,6890,401,23,6015.59,0.63 - 2025-01-02,6940,323,37,9086.44,0.36 - 2025-01-03,4390,346,26,6360.75,0.48 - 2025-01-04,3429,231,13,2360.96,0.65 - 2025-01-05,5804,186,22,2535.96,0.37 -``` - -
- - - -> [!NOTE] -> Measured with [`gpt-tokenizer`](https://github.com/niieani/gpt-tokenizer) using `o200k_base` encoding (used by GPT-5 and other modern models). Savings will vary across models and tokenizers. +> **Learn more:** For complete format specification, rules, and additional benchmarks, see [TOON-SPECIFICATION.md](TOON-SPECIFICATION.md). - +### Token Efficiency Example -### Retrieval Accuracy - -Tested across **3 LLMs** with data retrieval tasks: +TOON typically achieves **30–60% fewer tokens than JSON**. Here's a quick summary: ``` -gpt-5-nano - toon ████████████████████ 99.4% (158/159) - yaml ███████████████████░ 95.0% (151/159) - csv ██████████████████░░ 92.5% (147/159) - json ██████████████████░░ 92.5% (147/159) - xml ██████████████████░░ 91.2% (145/159) - -claude-haiku-4-5 - toon ███████████████░░░░░ 75.5% (120/159) - xml ███████████████░░░░░ 75.5% (120/159) - csv ███████████████░░░░░ 75.5% (120/159) - json ███████████████░░░░░ 75.5% (120/159) - yaml ███████████████░░░░░ 74.2% (118/159) - -gemini-2.5-flash - xml ██████████████████░░ 91.8% (146/159) - csv █████████████████░░░ 86.2% (137/159) - toon █████████████████░░░ 84.9% (135/159) - json ████████████████░░░░ 81.8% (130/159) - yaml ████████████████░░░░ 78.6% (125/159) +Total across 3 datasets ████████████░░░░░░░░░░░░░ 13,418 tokens + vs JSON: 26,379 💰 
49.1% saved + vs XML: 30,494 💰 56.0% saved ``` -**Advantage:** TOON achieves **86.6% accuracy** (vs JSON's 83.2%) while using **46.3% fewer tokens**. - -
-Performance by dataset and model - -#### Performance by Dataset - -##### Uniform employee records (TOON optimal format) - -| Format | Accuracy | Tokens | Correct/Total | -| ------ | -------- | ------ | ------------- | -| `toon` | 87.4% | 2.483 | 152/174 | -| `csv` | 82.8% | 2.337 | 144/174 | -| `yaml` | 83.9% | 4.969 | 146/174 | -| `json` | 83.9% | 6.347 | 146/174 | -| `xml` | 88.5% | 7.314 | 154/174 | - -##### E-commerce orders with nested structures - -| Format | Accuracy | Tokens | Correct/Total | -| ------ | -------- | ------ | ------------- | -| `toon` | 90.9% | 5.967 | 120/132 | -| `csv` | 93.9% | 6.735 | 124/132 | -| `yaml` | 87.1% | 7.328 | 115/132 | -| `json` | 87.9% | 9.694 | 116/132 | -| `xml` | 93.2% | 10.992 | 123/132 | - -##### Time-series analytics data - -| Format | Accuracy | Tokens | Correct/Total | -| ------ | -------- | ------ | ------------- | -| `csv` | 89.7% | 1.393 | 78/87 | -| `toon` | 88.5% | 1.515 | 77/87 | -| `yaml` | 83.9% | 2.938 | 73/87 | -| `json` | 88.5% | 3.665 | 77/87 | -| `xml` | 85.1% | 4.376 | 74/87 | - -##### Top 100 GitHub repositories - -| Format | Accuracy | Tokens | Correct/Total | -| ------ | -------- | ------ | ------------- | -| `toon` | 76.2% | 8.745 | 64/84 | -| `csv` | 69.0% | 8.513 | 58/84 | -| `yaml` | 71.4% | 13.129 | 60/84 | -| `json` | 69.0% | 15.145 | 58/84 | -| `xml` | 71.4% | 17.095 | 60/84 | - -#### Performance by Model - -##### gpt-5-nano - -| Format | Accuracy | Correct/Total | -| ------ | -------- | ------------- | -| `toon` | 99.4% | 158/159 | -| `yaml` | 95.0% | 151/159 | -| `csv` | 92.5% | 147/159 | -| `json` | 92.5% | 147/159 | -| `xml` | 91.2% | 145/159 | - -##### claude-haiku-4-5 - -| Format | Accuracy | Correct/Total | -| ------ | -------- | ------------- | -| `toon` | 75.5% | 120/159 | -| `xml` | 75.5% | 120/159 | -| `csv` | 75.5% | 120/159 | -| `json` | 75.5% | 120/159 | -| `yaml` | 74.2% | 118/159 | - -##### gemini-2.5-flash - -| Format | Accuracy | Correct/Total | -| ------ | -------- | 
------------- | -| `xml` | 91.8% | 146/159 | -| `csv` | 86.2% | 137/159 | -| `toon` | 84.9% | 135/159 | -| `json` | 81.8% | 130/159 | -| `yaml` | 78.6% | 125/159 | - -
- -
-How the benchmark works - -#### What's Being Measured - -This benchmark tests **LLM comprehension and data retrieval accuracy** when data is presented in different formats. Each LLM receives formatted data and must answer questions about it (this does NOT test LLM's ability to generate TOON output). - -#### Datasets Tested - -Four datasets designed to test different structural patterns: - -1. **Tabular** (100 employee records): Uniform objects with identical fields โ€“ optimal for TOON's tabular format. -2. **Nested** (50 e-commerce orders): Complex structures with nested customer objects and item arrays. -3. **Analytics** (60 days of metrics): Time-series data with dates and numeric values. -4. **GitHub** (100 repositories): Real-world data from top GitHub repos by stars. - -#### Question Types - -~160 questions are generated dynamically across three categories: - -- **Field retrieval (50%)**: Direct value lookups - - Example: "What is Alice's salary?" โ†’ `75000` - - Example: "What is the customer name for order ORD-0042?" โ†’ `John Doe` - -- **Aggregation (25%)**: Counting and summation tasks - - Example: "How many employees work in Engineering?" โ†’ `17` - - Example: "What is the total revenue across all orders?" โ†’ `45123.50` - -- **Filtering (25%)**: Conditional queries - - Example: "How many employees in Sales have salary > 80000?" โ†’ `5` - - Example: "How many orders have total > 400?" โ†’ `12` - -#### Evaluation Process - -1. **Format conversion**: Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML). -2. **Query LLM**: Each model receives formatted data + question in a prompt. -3. **LLM responds**: Model extracts the answer from the data. -4. **Validate with LLM-as-judge**: GPT-5-nano validates if the answer is semantically correct. 
- -#### Semantic Validation - -Answers are validated by an LLM judge (`gpt-5-nano`) using semantic equivalence, not exact string matching: - -- **Numeric formats**: `50000` = `$50,000` = `50000 dollars` โœ“ -- **Case insensitive**: `Engineering` = `engineering` = `ENGINEERING` โœ“ -- **Minor formatting**: `2025-01-01` = `January 1, 2025` โœ“ - -#### Models & Configuration - -- **Models tested**: `gpt-5-nano`, `claude-haiku-4-5`, `gemini-2.5-flash` -- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer) -- **Temperature**: 0 (for non-reasoning models) -- **Total evaluations**: 159 questions ร— 5 formats ร— 3 models = 2,385 LLM calls - -
- - +**See [TOON-SPECIFICATION.md](TOON-SPECIFICATION.md#benchmarks) for detailed benchmark results and LLM retrieval accuracy tests.** ## Installation @@ -434,14 +98,11 @@ You can also download the JAR directly from the [GitHub Releases](https://github import com.felipestanzani.jtoon.JToon; import java.util.*; -Map<String, Object> data = new LinkedHashMap<>(); -Map<String, Object> user = new LinkedHashMap<>(); -user.put("id", 123); -user.put("name", "Ada"); -user.put("tags", List.of("reading", "gaming")); -user.put("active", true); -user.put("preferences", List.of()); -data.put("user", user); +record User(int id, String name, List<String> tags, boolean active, List<String> preferences) {} +record Data(User user) {} + +User user = new User(123, "Ada", List.of("reading", "gaming"), true, List.of()); +Data data = new Data(user); System.out.println(JToon.encode(data)); ``` @@ -457,54 +118,20 @@ user: preferences[0]: ``` -## Canonical Formatting Rules - -TOON formatting is deterministic and minimal: +## TOON Format Basics -- **Indentation**: 2 spaces per nesting level. -- **Lines**: - - `key: value` for primitives (single space after colon). - - `key:` for nested/empty objects (no trailing space on that line). -- **Arrays**: - - Delimiter encoding: Comma delimiters are implicit in array headers (e.g., `tags[3]:`, `items[2]{id,name}:`). Tab and pipe delimiters are explicitly shown in array headers (e.g., `tags[3|]:`, `items[2 ]{id name}:`). - - Primitive arrays inline: `key[N]: v1,v2` (comma) or `key[N<delim>]: v1<delim>v2` (tab/pipe). - - Tabular arrays: `key[N]{f1,f2}: …` (comma) or `key[N<delim>]{f1<delim>f2}: …` (tab/pipe). - - List items: two spaces, hyphen, space (`"  - …"`). -- **Whitespace invariants**: - - No trailing spaces at end of any line. - - No trailing newline at end of output. +> **Complete specification:** For detailed formatting rules, quoting rules, and comprehensive examples, see [TOON-SPECIFICATION.md](TOON-SPECIFICATION.md). 
-## Format Overview +TOON uses indentation-based structure (like YAML) combined with efficient tabular format for uniform arrays (like CSV): -### Objects - -Simple objects with primitive values: - -```java -Map obj = new LinkedHashMap<>(); -obj.put("id", 123); -obj.put("name", "Ada"); -obj.put("active", true); -JToon.encode(obj); -``` +**Simple objects:** ``` id: 123 name: Ada -active: true ``` -Nested objects: - -```java -Map user = new LinkedHashMap<>(); -user.put("id", 123); -user.put("name", "Ada"); - -Map obj = new LinkedHashMap<>(); -obj.put("user", user); -JToon.encode(obj); -``` +**Nested objects:** ``` user: @@ -512,42 +139,13 @@ user: name: Ada ``` -### Arrays - -> [!TIP] -> TOON includes the array length in brackets (e.g., `items[3]`). When using comma delimiters (default), the delimiter is implicit. When using tab or pipe delimiters, the delimiter is explicitly shown in the header (e.g., `tags[2|]` or `[2 ]`). This encoding helps LLMs identify the delimiter and track the number of elements, reducing errors when generating or validating structured output. 
- -#### Primitive Arrays (Inline) - -```java -Map obj = new LinkedHashMap<>(); -obj.put("tags", List.of("admin", "ops", "dev")); -JToon.encode(obj); -``` +**Primitive arrays:** ``` tags[3]: admin,ops,dev ``` -#### Arrays of Objects (Tabular) - -When all objects share the same primitive fields, TOON uses an efficient **tabular format**: - -```java -Map item1 = new LinkedHashMap<>(); -item1.put("sku", "A1"); -item1.put("qty", 2); -item1.put("price", 9.99); - -Map item2 = new LinkedHashMap<>(); -item2.put("sku", "B2"); -item2.put("qty", 1); -item2.put("price", 14.5); - -Map obj = new LinkedHashMap<>(); -obj.put("items", List.of(item1, item2)); -JToon.encode(obj); -``` +**Tabular arrays** (uniform objects with same fields): ``` items[2]{sku,qty,price}: @@ -555,156 +153,6 @@ items[2]{sku,qty,price}: B2,1,14.5 ``` -**Tabular formatting applies recursively:** nested arrays of objects (whether as object properties or inside list items) also use tabular format if they meet the same requirements. 
- -```java -Map user1 = new LinkedHashMap<>(); -user1.put("id", 1); -user1.put("name", "Ada"); - -Map user2 = new LinkedHashMap<>(); -user2.put("id", 2); -user2.put("name", "Bob"); - -Map item = new LinkedHashMap<>(); -item.put("users", List.of(user1, user2)); -item.put("status", "active"); - -Map obj = new LinkedHashMap<>(); -obj.put("items", List.of(item)); -JToon.encode(obj); -``` - -``` -items[1]: - - users[2]{id,name}: - 1,Ada - 2,Bob - status: active -``` - -#### Mixed and Non-Uniform Arrays - -Arrays that don't meet the tabular requirements use list format: - -``` -items[3]: - - 1 - - a: 1 - - text -``` - -When objects appear in list format, the first field is placed on the hyphen line: - -``` -items[2]: - - id: 1 - name: First - - id: 2 - name: Second - extra: true -``` - -> [!NOTE] -> **Nested array indentation:** When the first field of a list item is an array (primitive, tabular, or nested), its contents are indented two spaces under the header line, and subsequent fields of the same object appear at that same indentation level. This remains unambiguous because list items begin with `"- "`, tabular arrays declare a fixed row count in their header, and object fields contain `":"`. - -#### Arrays of Arrays - -When you have arrays containing primitive inner arrays: - -```java -Map obj = new LinkedHashMap<>(); -obj.put("pairs", List.of( - List.of(1, 2), - List.of(3, 4) -)); -JToon.encode(obj); -``` - -``` -pairs[2]: - - [2]: 1,2 - - [2]: 3,4 -``` - -#### Empty Arrays and Objects - -Empty containers have special representations: - -```java -JToon.encode(Map.of("items", List.of())); // items[0]: -JToon.encode(List.of()); // [0]: -JToon.encode(Map.of()); // (empty output) -JToon.encode(Map.of("config", Map.of())); // config: -``` - -### Quoting Rules - -TOON quotes strings **only when necessary** to maximize token efficiency. Inner spaces are allowed; leading or trailing spaces force quotes. Unicode and emoji are safe unquoted. 
- -> [!NOTE] -> When using alternative delimiters (tab or pipe), the quoting rules adapt automatically. Strings containing the active delimiter will be quoted, while other delimiters remain safe. - -#### Keys - -Keys are quoted when any of the following is true: - -| Condition | Examples | -|---|---| -| Contains spaces, commas, colons, quotes, control chars | `"full name"`, `"a,b"`, `"order:id"`, `"tab\there"` | -| Contains brackets or braces | `"[index]"`, `"{key}"` | -| Leading hyphen | `"-lead"` | -| Numeric-only key | `"123"` | -| Empty key | `""` | - -**Notes:** - -- Quotes and control characters in keys are escaped (e.g., `"he said \"hi\""`, `"line\nbreak"`). - -#### String Values - -String values are quoted when any of the following is true: - -| Condition | Examples | -|---|---| -| Empty string | `""` | -| Contains active delimiter, colon, quote, backslash, or control chars | `"a,b"` (comma), `"a\tb"` (tab), `"a\|b"` (pipe), `"a:b"`, `"say \"hi\""`, `"C:\\Users"`, `"line1\\nline2"` | -| Leading or trailing spaces | `" padded "`, `" "` | -| Looks like boolean/number/null | `"true"`, `"false"`, `"null"`, `"42"`, `"-3.14"`, `"1e-6"`, `"05"` | -| Starts with `"- "` (list-like) | `"- item"` | -| Looks like structural token | `"[5]"`, `"{key}"`, `"[3]: x,y"` | - -> [!IMPORTANT] -> **Delimiter-aware quoting:** Unquoted strings never contain `:` or the active delimiter. This makes TOON reliably parseable with simple heuristics: split key/value on first `:`, and split array values on the delimiter declared in the array header. When using tab or pipe delimiters, commas don't need quoting โ€“ only the active delimiter triggers quoting for both array values and object values. 
- -#### Examples - -``` -note: "hello, world" -items[3]: foo,"true","- item" -hello ๐Ÿ‘‹ world // unquoted -" padded " // quoted -value: null // null value -name: "" // empty string (quoted) -text: "line1\nline2" // multi-line string (escaped) -``` - -### Tabular Format Requirements - -For arrays of objects to use the efficient tabular format, all of the following must be true: - -| Requirement | Detail | -|---|---| -| All elements are objects | No primitives in the array | -| Identical key sets | No missing or extra keys across rows | -| Primitive values only | No nested arrays or objects | -| Header delimiter | Comma is implicit in headers (`[N]{f1,f2}`); tab and pipe are explicit (`[N ]{f1 f2}`, `[N|]{f1|f2}`) | -| Header key order | Taken from the first object | -| Header key quoting | Same rules as object keys; keys containing the active delimiter must be quoted | -| Row value quoting | Same rules as string values; values containing the active delimiter must be quoted | - -If any condition fails, TOON falls back to list format. - ## Type Conversions Some Java-specific types are automatically normalized for LLM-safe output: @@ -761,17 +209,13 @@ A TOON-formatted string with no trailing newline or spaces. 
import com.felipestanzani.jtoon.JToon; import java.util.*; -Map<String, Object> item1 = new LinkedHashMap<>(); -item1.put("sku", "A1"); -item1.put("qty", 2); -item1.put("price", 9.99); +record Item(String sku, int qty, double price) {} +record Data(List<Item> items) {} -Map<String, Object> item2 = new LinkedHashMap<>(); -item2.put("sku", "B2"); -item2.put("qty", 1); -item2.put("price", 14.5); +Item item1 = new Item("A1", 2, 9.99); +Item item2 = new Item("B2", 1, 14.5); +Data data = new Data(List.of(item1, item2)); -Map<String, Object> data = Map.of("items", List.of(item1, item2)); System.out.println(JToon.encode(data)); ``` @@ -795,19 +239,13 @@ Using tab delimiters instead of commas can reduce token count further, especiall import com.felipestanzani.jtoon.*; import java.util.*; -Map<String, Object> item1 = new LinkedHashMap<>(); -item1.put("sku", "A1"); -item1.put("name", "Widget"); -item1.put("qty", 2); -item1.put("price", 9.99); +record Item(String sku, String name, int qty, double price) {} +record Data(List<Item> items) {} -Map<String, Object> item2 = new LinkedHashMap<>(); -item2.put("sku", "B2"); -item2.put("name", "Gadget"); -item2.put("qty", 1); -item2.put("price", 14.5); +Item item1 = new Item("A1", "Widget", 2, 9.99); +Item item2 = new Item("B2", "Gadget", 1, 14.5); +Data data = new Data(List.of(item1, item2)); -Map<String, Object> data = Map.of("items", List.of(item1, item2)); EncodeOptions options = new EncodeOptions(2, Delimiter.TAB, false); System.out.println(JToon.encode(data, options)); ``` @@ -836,6 +274,7 @@ items[2 ]{sku name qty price}: Pipe delimiters offer a middle ground between commas and tabs: ```java +// Using the same Item and Data records from above EncodeOptions options = new EncodeOptions(2, Delimiter.PIPE, false); System.out.println(JToon.encode(data, options)); ``` @@ -856,19 +295,12 @@ The `lengthMarker` option adds an optional hash (`#`) prefix to array lengths to import com.felipestanzani.jtoon.*; import java.util.*; -Map<String, Object> item1 = new LinkedHashMap<>(); -item1.put("sku", "A1"); -item1.put("qty", 2); -item1.put("price", 9.99); +record 
Item(String sku, int qty, double price) {} +record Data(List<String> tags, List<Item> items) {} -Map<String, Object> item2 = new LinkedHashMap<>(); -item2.put("sku", "B2"); -item2.put("qty", 1); -item2.put("price", 14.5); - -Map<String, Object> data = new LinkedHashMap<>(); -data.put("tags", List.of("reading", "gaming", "coding")); -data.put("items", List.of(item1, item2)); +Item item1 = new Item("A1", 2, 9.99); +Item item2 = new Item("B2", 1, 14.5); +Data data = new Data(List.of("reading", "gaming", "coding"), List.of(item1, item2)); System.out.println(JToon.encode(data, new EncodeOptions(2, Delimiter.COMMA, true))); // tags[#3]: reading,gaming,coding @@ -884,107 +316,22 @@ System.out.println(JToon.encode(data, new EncodeOptions(2, Delimiter.PIPE, true) // B2|1|14.5 ``` -## Using TOON in LLM Prompts - -TOON works best when you show the format instead of describing it. The structure is self-documenting – models parse it naturally once they see the pattern. - -### Sending TOON to LLMs (Input) - -Wrap your encoded data in a fenced code block (label it \`\`\`toon for clarity). The indentation and headers are usually enough – models treat it like familiar YAML or CSV. The explicit length markers (`[N]`) and field headers (`{field1,field2}`) help the model track structure, especially for large tables. - -### Generating TOON from LLMs (Output) - -For output, be more explicit. When you want the model to **generate** TOON: - -- **Show the expected header** (`users[N]{id,name,role}:`). The model fills rows instead of repeating keys, reducing generation errors. -- **State the rules**: 2-space indent, no trailing spaces, `[N]` matches row count. - -Here's a prompt that works for both reading and generating: - -``` -Data is in TOON format (2-space indent, arrays show length and fields). - -\`\`\`toon -users[3]{id,name,role,lastLogin}: - 1,Alice,admin,2025-01-15T10:30:00Z - 2,Bob,user,2025-01-14T15:22:00Z - 3,Charlie,user,2025-01-13T09:45:00Z -\`\`\` +## See Also -Task: Return only users with role "user" as TOON. 
Use the same header. Set [N] to match the row count. Output only the code block. -``` - -> [!TIP] -> For large uniform tables, use `JToon.encode(data, new EncodeOptions(2, Delimiter.TAB, false))` and tell the model "fields are tab-separated." Tabs often tokenize better than commas and reduce the need for quote-escaping. - -## Notes and Limitations - -- **Token counts vary by tokenizer and model.** Benchmarks use a GPT-style tokenizer (cl100k/o200k); actual savings will differ with other models (e.g., SentencePiece). -- **TOON is designed for LLM contexts** where human readability and token efficiency matter. It's **not** a drop-in replacement for JSON in APIs or storage. -- **Tabular arrays** require all objects to have exactly the same keys with primitive values only. Arrays with mixed types (primitives + objects/arrays), non-uniform objects, or nested structures will use a more verbose list format. -- **Object key order** is preserved from the input. In tabular arrays, header order follows the first object's keys. -- **Arrays mixing primitives and objects/arrays** always use list form: - - ``` - items[2]: - - a: 1 - - [2]: 1,2 - ``` - -- **Deterministic formatting:** 2-space indentation, stable key order, no trailing spaces/newline. 
- -## Quick Reference - -``` -// Object -{ id: 1, name: 'Ada' } โ†’ id: 1 - name: Ada - -// Nested object -{ user: { id: 1 } } โ†’ user: - id: 1 - -// Primitive array (inline) -{ tags: ['foo', 'bar'] } โ†’ tags[2]: foo,bar - -// Tabular array (uniform objects) -{ items: [ โ†’ items[2]{id,qty}: - { id: 1, qty: 5 }, 1,5 - { id: 2, qty: 3 } 2,3 -]} - -// Mixed / non-uniform (list) -{ items: [1, { a: 1 }, 'x'] } โ†’ items[3]: - - 1 - - a: 1 - - x - -// Array of arrays -{ pairs: [[1, 2], [3, 4]] } โ†’ pairs[2]: - - [2]: 1,2 - - [2]: 3,4 - -// Root array -['x', 'y'] โ†’ [2]: x,y - -// Empty containers -{} โ†’ (empty output) -{ items: [] } โ†’ items[0]: - -// Special quoting -{ note: 'hello, world' } โ†’ note: "hello, world" -{ items: ['true', true] } โ†’ items[2]: "true",true -``` +- **[TOON Format Specification](TOON-SPECIFICATION.md)** โ€“ Complete format rules, benchmarks, and examples ## Implementations in Other Languages - **TypeScript/JavaScript**: [@johannschopplich/toon](https://github.com/johannschopplich/toon) (original) -- **Elixir**: [toon_ex](https://github.com/kentaro/toon_ex) -- **PHP**: [toon-php](https://github.com/HelgeSverre/toon-php) -- **Python**: [pytoon](https://github.com/bpradana/pytoon) - - [python-toon](https://github.com/xaviviro/python-toon) - - [toon-python](https://gitlab.com/KanTakahiro/toon-python) -- **Ruby**: [toon-ruby](https://github.com/andrepcg/toon-ruby) +- **Elixir:** [toon_ex](https://github.com/kentaro/toon_ex) +- **PHP:** [toon-php](https://github.com/HelgeSverre/toon-php) +- **Python:** [python-toon](https://github.com/xaviviro/python-toon) or [pytoon](https://github.com/bpradana/pytoon) +- **Ruby:** [toon-ruby](https://github.com/andrepcg/toon-ruby) +- **Java:** [JToon](https://github.com/felipestanzani/JToon) +- **.NET:** [toon.NET](https://github.com/ghost1face/toon.NET) +- **Swift:** [TOONEncoder](https://github.com/mattt/TOONEncoder) +- **Go:** [gotoon](https://github.com/alpkeskin/gotoon) +- **Rust:** 
[toon-rs](https://github.com/JadJabbour/toon-rs) ## License diff --git a/TOON-SPECIFICATION.md b/TOON-SPECIFICATION.md new file mode 100644 index 0000000..64922b3 --- /dev/null +++ b/TOON-SPECIFICATION.md @@ -0,0 +1,743 @@ +# TOON Format Specification + +**Token-Oriented Object Notation (TOON)** is a compact, human-readable format designed for passing structured data to Large Language Models with significantly reduced token usage. + +TOON excels at **uniform complex objects** โ€“ multiple fields per row, same structure across items. It borrows YAML's indentation-based structure for nested objects and CSV's tabular format for uniform data rows, then optimizes both for token efficiency in LLM contexts. + +> **Note:** This document describes the TOON format itself. For Java implementation details, see the [JToon library README](README.md). + +## Why TOON? + +AI is becoming cheaper and more accessible, but larger context windows allow for larger data inputs as well. **LLM tokens still cost money** โ€“ and standard JSON is verbose and token-expensive: + +```json +{ + "users": [ + { "id": 1, "name": "Alice", "role": "admin" }, + { "id": 2, "name": "Bob", "role": "user" } + ] +} +``` + +TOON conveys the same information with **fewer tokens**: + +``` +users[2]{id,name,role}: + 1,Alice,admin + 2,Bob,user +``` + +## Format Comparison + +Format familiarity matters as much as token count. + +- **CSV:** best for uniform tables. +- **JSON:** best for non-uniform data. +- **TOON:** best for uniform complex (but not deeply nested) objects. + +TOON switches to list format for non-uniform arrays. In those cases, JSON can be cheaper at scale. 
+ +## Key Features + +- ๐Ÿ’ธ **Token-efficient:** typically 30โ€“60% fewer tokens than JSON +- ๐Ÿคฟ **LLM-friendly guardrails:** explicit lengths and field lists help models validate output +- ๐Ÿฑ **Minimal syntax:** removes redundant punctuation (braces, brackets, most quotes) +- ๐Ÿ“ **Indentation-based structure:** replaces braces with whitespace for better readability +- ๐Ÿงบ **Tabular arrays:** declare keys once, then stream rows without repetition + +## Benchmarks + + + +### Token Efficiency + +``` +โญ GitHub Repositories โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ 8,745 tokens + vs JSON: 15,145 ๐Ÿ’ฐ 42.3% saved + vs XML: 17,095 ๐Ÿ’ฐ 48.8% saved + +๐Ÿ“ˆ Daily Analytics โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ 4,507 tokens + vs JSON: 10,977 ๐Ÿ’ฐ 58.9% saved + vs XML: 13,128 ๐Ÿ’ฐ 65.7% saved + +๐Ÿ›’ E-Commerce Order โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ 166 tokens + vs JSON: 257 ๐Ÿ’ฐ 35.4% saved + vs XML: 271 ๐Ÿ’ฐ 38.7% saved + +โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +Total โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ 13,418 tokens + vs JSON: 26,379 ๐Ÿ’ฐ 49.1% saved + vs XML: 30,494 ๐Ÿ’ฐ 56.0% saved +``` + +
+View detailed examples + +#### โญ GitHub Repositories + +**Configuration:** Top 100 GitHub repositories with stars, forks, and metadata + +**Savings:** 6,400 tokens (42.3% reduction vs JSON) + +**JSON** (15,145 tokens): + +```json +{ + "repositories": [ + { + "id": 28457823, + "name": "freeCodeCamp", + "repo": "freeCodeCamp/freeCodeCamp", + "description": "freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,โ€ฆ", + "createdAt": "2014-12-24T17:49:19Z", + "updatedAt": "2025-10-27T07:40:58Z", + "pushedAt": "2025-10-26T11:31:08Z", + "stars": 430828, + "watchers": 8582, + "forks": 42136, + "defaultBranch": "main" + }, + { + "id": 132750724, + "name": "build-your-own-x", + "repo": "codecrafters-io/build-your-own-x", + "description": "Master programming by recreating your favorite technologies from scratch.", + "createdAt": "2018-05-09T12:03:18Z", + "updatedAt": "2025-10-27T07:43:25Z", + "pushedAt": "2025-10-10T18:45:01Z", + "stars": 430102, + "watchers": 6322, + "forks": 40388, + "defaultBranch": "master" + }, + { + "id": 21737465, + "name": "awesome", + "repo": "sindresorhus/awesome", + "description": "๐Ÿ˜Ž Awesome lists about all kinds of interesting topics", + "createdAt": "2014-07-11T13:42:37Z", + "updatedAt": "2025-10-27T07:44:27Z", + "pushedAt": "2025-10-23T17:26:53Z", + "stars": 409760, + "watchers": 8016, + "forks": 32015, + "defaultBranch": "main" + } + ] +} +``` + +**TOON** (8,745 tokens): + +``` +repositories[3]{id,name,repo,description,createdAt,updatedAt,pushedAt,stars,watchers,forks,defaultBranch}: + 28457823,freeCodeCamp,freeCodeCamp/freeCodeCamp,"freeCodeCamp.org's open-source codebase and curriculum. 
Learn math, programming,โ€ฆ","2014-12-24T17:49:19Z","2025-10-27T07:40:58Z","2025-10-26T11:31:08Z",430828,8582,42136,main + 132750724,build-your-own-x,codecrafters-io/build-your-own-x,Master programming by recreating your favorite technologies from scratch.,"2018-05-09T12:03:18Z","2025-10-27T07:43:25Z","2025-10-10T18:45:01Z",430102,6322,40388,master + 21737465,awesome,sindresorhus/awesome,๐Ÿ˜Ž Awesome lists about all kinds of interesting topics,"2014-07-11T13:42:37Z","2025-10-27T07:44:27Z","2025-10-23T17:26:53Z",409760,8016,32015,main +``` + +--- + +#### ๐Ÿ“ˆ Daily Analytics + +**Configuration:** 180 days of web metrics (views, clicks, conversions, revenue) + +**Savings:** 6,470 tokens (58.9% reduction vs JSON) + +**JSON** (10,977 tokens): + +```json +{ + "metrics": [ + { + "date": "2025-01-01", + "views": 6890, + "clicks": 401, + "conversions": 23, + "revenue": 6015.59, + "bounceRate": 0.63 + }, + { + "date": "2025-01-02", + "views": 6940, + "clicks": 323, + "conversions": 37, + "revenue": 9086.44, + "bounceRate": 0.36 + }, + { + "date": "2025-01-03", + "views": 4390, + "clicks": 346, + "conversions": 26, + "revenue": 6360.75, + "bounceRate": 0.48 + }, + { + "date": "2025-01-04", + "views": 3429, + "clicks": 231, + "conversions": 13, + "revenue": 2360.96, + "bounceRate": 0.65 + }, + { + "date": "2025-01-05", + "views": 5804, + "clicks": 186, + "conversions": 22, + "revenue": 2535.96, + "bounceRate": 0.37 + } + ] +} +``` + +**TOON** (4,507 tokens): + +``` +metrics[5]{date,views,clicks,conversions,revenue,bounceRate}: + 2025-01-01,6890,401,23,6015.59,0.63 + 2025-01-02,6940,323,37,9086.44,0.36 + 2025-01-03,4390,346,26,6360.75,0.48 + 2025-01-04,3429,231,13,2360.96,0.65 + 2025-01-05,5804,186,22,2535.96,0.37 +``` + +
+ + + +> [!NOTE] +> Measured with [`gpt-tokenizer`](https://github.com/niieani/gpt-tokenizer) using `o200k_base` encoding (used by GPT-5 and other modern models). Savings will vary across models and tokenizers. + + + +### Retrieval Accuracy + +Tested across **3 LLMs** with data retrieval tasks: + +``` +gpt-5-nano + toon โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ 99.4% (158/159) + yaml โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘ 95.0% (151/159) + csv โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘ 92.5% (147/159) + json โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘ 92.5% (147/159) + xml โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘ 91.2% (145/159) + +claude-haiku-4-5 + toon โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘ 75.5% (120/159) + xml โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘ 75.5% (120/159) + csv โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘ 75.5% (120/159) + json โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘ 75.5% (120/159) + yaml โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘ 74.2% (118/159) + +gemini-2.5-flash + xml โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘ 91.8% (146/159) + csv โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘ 86.2% (137/159) + toon โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘ 84.9% (135/159) + json โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘ 81.8% (130/159) + yaml โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘ 78.6% (125/159) +``` + +**Advantage:** TOON achieves **86.6% accuracy** (vs JSON's 83.2%) while using **46.3% fewer tokens**. + +
+Performance by dataset and model + +#### Performance by Dataset + +##### Uniform employee records (TOON optimal format) + +| Format | Accuracy | Tokens | Correct/Total | +| ------ | -------- | ------ | ------------- | +| `toon` | 87.4% | 2.483 | 152/174 | +| `csv` | 82.8% | 2.337 | 144/174 | +| `yaml` | 83.9% | 4.969 | 146/174 | +| `json` | 83.9% | 6.347 | 146/174 | +| `xml` | 88.5% | 7.314 | 154/174 | + +##### E-commerce orders with nested structures + +| Format | Accuracy | Tokens | Correct/Total | +| ------ | -------- | ------ | ------------- | +| `toon` | 90.9% | 5.967 | 120/132 | +| `csv` | 93.9% | 6.735 | 124/132 | +| `yaml` | 87.1% | 7.328 | 115/132 | +| `json` | 87.9% | 9.694 | 116/132 | +| `xml` | 93.2% | 10.992 | 123/132 | + +##### Time-series analytics data + +| Format | Accuracy | Tokens | Correct/Total | +| ------ | -------- | ------ | ------------- | +| `csv` | 89.7% | 1.393 | 78/87 | +| `toon` | 88.5% | 1.515 | 77/87 | +| `yaml` | 83.9% | 2.938 | 73/87 | +| `json` | 88.5% | 3.665 | 77/87 | +| `xml` | 85.1% | 4.376 | 74/87 | + +##### Top 100 GitHub repositories + +| Format | Accuracy | Tokens | Correct/Total | +| ------ | -------- | ------ | ------------- | +| `toon` | 76.2% | 8.745 | 64/84 | +| `csv` | 69.0% | 8.513 | 58/84 | +| `yaml` | 71.4% | 13.129 | 60/84 | +| `json` | 69.0% | 15.145 | 58/84 | +| `xml` | 71.4% | 17.095 | 60/84 | + +#### Performance by Model + +##### gpt-5-nano + +| Format | Accuracy | Correct/Total | +| ------ | -------- | ------------- | +| `toon` | 99.4% | 158/159 | +| `yaml` | 95.0% | 151/159 | +| `csv` | 92.5% | 147/159 | +| `json` | 92.5% | 147/159 | +| `xml` | 91.2% | 145/159 | + +##### claude-haiku-4-5 + +| Format | Accuracy | Correct/Total | +| ------ | -------- | ------------- | +| `toon` | 75.5% | 120/159 | +| `xml` | 75.5% | 120/159 | +| `csv` | 75.5% | 120/159 | +| `json` | 75.5% | 120/159 | +| `yaml` | 74.2% | 118/159 | + +##### gemini-2.5-flash + +| Format | Accuracy | Correct/Total | +| ------ | -------- | 
------------- | +| `xml` | 91.8% | 146/159 | +| `csv` | 86.2% | 137/159 | +| `toon` | 84.9% | 135/159 | +| `json` | 81.8% | 130/159 | +| `yaml` | 78.6% | 125/159 | + +
+ +
+How the benchmark works + +#### What's Being Measured + +This benchmark tests **LLM comprehension and data retrieval accuracy** when data is presented in different formats. Each LLM receives formatted data and must answer questions about it (this does NOT test LLM's ability to generate TOON output). + +#### Datasets Tested + +Four datasets designed to test different structural patterns: + +1. **Tabular** (100 employee records): Uniform objects with identical fields – optimal for TOON's tabular format. +2. **Nested** (50 e-commerce orders): Complex structures with nested customer objects and item arrays. +3. **Analytics** (60 days of metrics): Time-series data with dates and numeric values. +4. **GitHub** (100 repositories): Real-world data from top GitHub repos by stars. + +#### Question Types + +~160 questions are generated dynamically across three categories: + +- **Field retrieval (50%)**: Direct value lookups + - Example: "What is Alice's salary?" → `75000` + - Example: "What is the customer name for order ORD-0042?" → `John Doe` + +- **Aggregation (25%)**: Counting and summation tasks + - Example: "How many employees work in Engineering?" → `17` + - Example: "What is the total revenue across all orders?" → `45123.50` + +- **Filtering (25%)**: Conditional queries + - Example: "How many employees in Sales have salary > 80000?" → `5` + - Example: "How many orders have total > 400?" → `12` + +#### Evaluation Process + +1. **Format conversion**: Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML). +2. **Query LLM**: Each model receives formatted data + question in a prompt. +3. **LLM responds**: Model extracts the answer from the data. +4. **Validate with LLM-as-judge**: GPT-5-nano validates if the answer is semantically correct. 
+ +#### Semantic Validation + +Answers are validated by an LLM judge (`gpt-5-nano`) using semantic equivalence, not exact string matching: + +- **Numeric formats**: `50000` = `$50,000` = `50000 dollars` ✓ +- **Case insensitive**: `Engineering` = `engineering` = `ENGINEERING` ✓ +- **Minor formatting**: `2025-01-01` = `January 1, 2025` ✓ + +#### Models & Configuration + +- **Models tested**: `gpt-5-nano`, `claude-haiku-4-5`, `gemini-2.5-flash` +- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer) +- **Temperature**: 0 (for non-reasoning models) +- **Total evaluations**: 159 questions × 5 formats × 3 models = 2,385 LLM calls + +
+ + + +## Canonical Formatting Rules + +TOON formatting is deterministic and minimal: + +- **Indentation**: 2 spaces per nesting level. +- **Lines**: + - `key: value` for primitives (single space after colon). + - `key:` for nested/empty objects (no trailing space on that line). +- **Arrays**: + - Delimiter encoding: Comma delimiters are implicit in array headers (e.g., `tags[3]:`, `items[2]{id,name}:`). Tab and pipe delimiters are explicitly shown in array headers (e.g., `tags[3|]:`, `items[2 ]{id name}:`). + - Primitive arrays inline: `key[N]: v1,v2` (comma) or `key[N<delim>]: v1<delim>v2` (tab/pipe). + - Tabular arrays: `key[N]{f1,f2}: …` (comma) or `key[N<delim>]{f1<delim>f2}: …` (tab/pipe). + - List items: two spaces, hyphen, space (`" - …"`). +- **Whitespace invariants**: + - No trailing spaces at end of any line. + - No trailing newline at end of output. + +## Format Overview + +### Objects + +Simple objects with primitive values: + +**JSON:** + +```json +{ + "id": 123, + "name": "Ada", + "active": true +} +``` + +**TOON:** + +``` +id: 123 +name: Ada +active: true +``` + +Nested objects: + +**JSON:** + +```json +{ + "user": { + "id": 123, + "name": "Ada" + } +} +``` + +**TOON:** + +``` +user: + id: 123 + name: Ada +``` + +### Arrays + +> [!TIP] +> TOON includes the array length in brackets (e.g., `items[3]`). When using comma delimiters (default), the delimiter is implicit. When using tab or pipe delimiters, the delimiter is explicitly shown in the header (e.g., `tags[2|]` or `[2 ]`). This encoding helps LLMs identify the delimiter and track the number of elements, reducing errors when generating or validating structured output. 
+ +#### Primitive Arrays (Inline) + +**JSON:** + +```json +{ + "tags": ["admin", "ops", "dev"] +} +``` + +**TOON:** + +``` +tags[3]: admin,ops,dev +``` + +#### Arrays of Objects (Tabular) + +When all objects share the same primitive fields, TOON uses an efficient **tabular format**: + +**JSON:** + +```json +{ + "items": [ + { "sku": "A1", "qty": 2, "price": 9.99 }, + { "sku": "B2", "qty": 1, "price": 14.5 } + ] +} +``` + +**TOON:** + +``` +items[2]{sku,qty,price}: + A1,2,9.99 + B2,1,14.5 +``` + +**Tabular formatting applies recursively:** nested arrays of objects (whether as object properties or inside list items) also use tabular format if they meet the same requirements. + +**JSON:** + +```json +{ + "items": [ + { + "users": [ + { "id": 1, "name": "Ada" }, + { "id": 2, "name": "Bob" } + ], + "status": "active" + } + ] +} +``` + +**TOON:** + +``` +items[1]: + - users[2]{id,name}: + 1,Ada + 2,Bob + status: active +``` + +#### Mixed and Non-Uniform Arrays + +Arrays that don't meet the tabular requirements use list format: + +``` +items[3]: + - 1 + - a: 1 + - text +``` + +When objects appear in list format, the first field is placed on the hyphen line: + +``` +items[2]: + - id: 1 + name: First + - id: 2 + name: Second + extra: true +``` + +> [!NOTE] +> **Nested array indentation:** When the first field of a list item is an array (primitive, tabular, or nested), its contents are indented two spaces under the header line, and subsequent fields of the same object appear at that same indentation level. This remains unambiguous because list items begin with `"- "`, tabular arrays declare a fixed row count in their header, and object fields contain `":"`. 
+ +#### Arrays of Arrays + +When you have arrays containing primitive inner arrays: + +**JSON:** + +```json +{ + "pairs": [ + [1, 2], + [3, 4] + ] +} +``` + +**TOON:** + +``` +pairs[2]: + - [2]: 1,2 + - [2]: 3,4 +``` + +#### Empty Arrays and Objects + +Empty containers have special representations: + +``` +{} → (empty output) +{ "items": [] } → items[0]: +[] → [0]: +{ "config": {} } → config: +``` + +### Quoting Rules + +TOON quotes strings **only when necessary** to maximize token efficiency. Inner spaces are allowed; leading or trailing spaces force quotes. Unicode and emoji are safe unquoted. + +> [!NOTE] +> When using alternative delimiters (tab or pipe), the quoting rules adapt automatically. Strings containing the active delimiter will be quoted, while other delimiters remain safe. + +#### Keys + +Keys are quoted when any of the following is true: + +| Condition | Examples | +|---|---| +| Contains spaces, commas, colons, quotes, control chars | `"full name"`, `"a,b"`, `"order:id"`, `"tab\there"` | +| Contains brackets or braces | `"[index]"`, `"{key}"` | +| Leading hyphen | `"-lead"` | +| Numeric-only key | `"123"` | +| Empty key | `""` | + +**Notes:** + +- Quotes and control characters in keys are escaped (e.g., `"he said \"hi\""`, `"line\nbreak"`). 
+ +#### String Values + +String values are quoted when any of the following is true: + +| Condition | Examples | +|---|---| +| Empty string | `""` | +| Contains active delimiter, colon, quote, backslash, or control chars | `"a,b"` (comma), `"a\tb"` (tab), `"a\|b"` (pipe), `"a:b"`, `"say \"hi\""`, `"C:\\Users"`, `"line1\\nline2"` | +| Leading or trailing spaces | `" padded "`, `" "` | +| Looks like boolean/number/null | `"true"`, `"false"`, `"null"`, `"42"`, `"-3.14"`, `"1e-6"`, `"05"` | +| Starts with `"- "` (list-like) | `"- item"` | +| Looks like structural token | `"[5]"`, `"{key}"`, `"[3]: x,y"` | + +> [!IMPORTANT] +> **Delimiter-aware quoting:** Unquoted strings never contain `:` or the active delimiter. This makes TOON reliably parseable with simple heuristics: split key/value on first `:`, and split array values on the delimiter declared in the array header. When using tab or pipe delimiters, commas don't need quoting – only the active delimiter triggers quoting for both array values and object values. 
+ +#### Examples + +``` +note: "hello, world" +items[3]: foo,"true","- item" +hello 👋 world // unquoted +" padded " // quoted +value: null // null value +name: "" // empty string (quoted) +text: "line1\nline2" // multi-line string (escaped) +``` + +### Tabular Format Requirements + +For arrays of objects to use the efficient tabular format, all of the following must be true: + +| Requirement | Detail | +|---|---| +| All elements are objects | No primitives in the array | +| Identical key sets | No missing or extra keys across rows | +| Primitive values only | No nested arrays or objects | +| Header delimiter | Comma is implicit in headers (`[N]{f1,f2}`); tab and pipe are explicit (`[N ]{f1 f2}`, `[N|]{f1|f2}`) | +| Header key order | Taken from the first object | +| Header key quoting | Same rules as object keys; keys containing the active delimiter must be quoted | +| Row value quoting | Same rules as string values; values containing the active delimiter must be quoted | + +If any condition fails, TOON falls back to list format. + +## Using TOON in LLM Prompts + +TOON works best when you show the format instead of describing it. The structure is self-documenting – models parse it naturally once they see the pattern. + +### Sending TOON to LLMs (Input) + +Wrap your encoded data in a fenced code block (label it \`\`\`toon for clarity). The indentation and headers are usually enough – models treat it like familiar YAML or CSV. The explicit length markers (`[N]`) and field headers (`{field1,field2}`) help the model track structure, especially for large tables. + +### Generating TOON from LLMs (Output) + +For output, be more explicit. When you want the model to **generate** TOON: + +- **Show the expected header** (`users[N]{id,name,role}:`). The model fills rows instead of repeating keys, reducing generation errors. +- **State the rules**: 2-space indent, no trailing spaces, `[N]` matches row count. 
+ +Here's a prompt that works for both reading and generating: + +``` +Data is in TOON format (2-space indent, arrays show length and fields). + +\`\`\`toon +users[3]{id,name,role,lastLogin}: + 1,Alice,admin,2025-01-15T10:30:00Z + 2,Bob,user,2025-01-14T15:22:00Z + 3,Charlie,user,2025-01-13T09:45:00Z +\`\`\` + +Task: Return only users with role "user" as TOON. Use the same header. Set [N] to match the row count. Output only the code block. +``` + +> [!TIP] +> For large uniform tables, tab delimiters can tokenize better than commas and reduce the need for quote-escaping. Tell the model "fields are tab-separated." + +## Notes and Limitations + +- **Token counts vary by tokenizer and model.** Benchmarks use a GPT-style tokenizer (cl100k/o200k); actual savings will differ with other models (e.g., SentencePiece). +- **TOON is designed for LLM contexts** where human readability and token efficiency matter. It's **not** a drop-in replacement for JSON in APIs or storage. +- **Tabular arrays** require all objects to have exactly the same keys with primitive values only. Arrays with mixed types (primitives + objects/arrays), non-uniform objects, or nested structures will use a more verbose list format. +- **Object key order** is preserved from the input. In tabular arrays, header order follows the first object's keys. +- **Arrays mixing primitives and objects/arrays** always use list form: + + ``` + items[2]: + - a: 1 + - [2]: 1,2 + ``` + +- **Deterministic formatting:** 2-space indentation, stable key order, no trailing spaces/newline. 
+ +## Quick Reference + +``` +// Object +{ id: 1, name: 'Ada' } โ†’ id: 1 + name: Ada + +// Nested object +{ user: { id: 1 } } โ†’ user: + id: 1 + +// Primitive array (inline) +{ tags: ['foo', 'bar'] } โ†’ tags[2]: foo,bar + +// Tabular array (uniform objects) +{ items: [ โ†’ items[2]{id,qty}: + { id: 1, qty: 5 }, 1,5 + { id: 2, qty: 3 } 2,3 +]} + +// Mixed / non-uniform (list) +{ items: [1, { a: 1 }, 'x'] } โ†’ items[3]: + - 1 + - a: 1 + - x + +// Array of arrays +{ pairs: [[1, 2], [3, 4]] } โ†’ pairs[2]: + - [2]: 1,2 + - [2]: 3,4 + +// Root array +['x', 'y'] โ†’ [2]: x,y + +// Empty containers +{} โ†’ (empty output) +{ items: [] } โ†’ items[0]: + +// Special quoting +{ note: 'hello, world' } โ†’ note: "hello, world" +{ items: ['true', true] } โ†’ items[2]: "true",true +``` + +## Implementations in Other Languages + +- **Java**: [JToon](README.md) +- **TypeScript/JavaScript**: [@johannschopplich/toon](https://github.com/johannschopplich/toon) (original) +- **Elixir**: [toon_ex](https://github.com/kentaro/toon_ex) +- **PHP**: [toon-php](https://github.com/HelgeSverre/toon-php) +- **Python**: [pytoon](https://github.com/bpradana/pytoon) + - [python-toon](https://github.com/xaviviro/python-toon) + - [toon-python](https://gitlab.com/KanTakahiro/toon-python) +- **Ruby**: [toon-ruby](https://github.com/andrepcg/toon-ruby) diff --git a/build.gradle b/build.gradle index 296c631..7ff1980 100644 --- a/build.gradle +++ b/build.gradle @@ -9,8 +9,12 @@ version = '0.1.0' description = 'Token-Oriented Object Notation (TOON) - A compact, human-readable format for LLM contexts' java { - toolchain { - languageVersion = JavaLanguageVersion.of(21) + // Only enforce toolchain in CI to ensure consistent Java 21 builds + // Locally, use whatever JDK is installed + if (System.getenv('CI') == 'true') { + toolchain { + languageVersion = JavaLanguageVersion.of(21) + } } withSourcesJar() withJavadocJar() diff --git a/src/test/java/com/felipestanzani/jtoon/JToonTest.java 
b/src/test/java/com/felipestanzani/jtoon/JToonTest.java index 58f1b60..5d3fe16 100644 --- a/src/test/java/com/felipestanzani/jtoon/JToonTest.java +++ b/src/test/java/com/felipestanzani/jtoon/JToonTest.java @@ -2,6 +2,7 @@ import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import java.math.BigInteger; @@ -10,11 +11,13 @@ import java.util.List; import java.util.Map; +import static com.felipestanzani.jtoon.TestPojos.*; import static org.junit.jupiter.api.Assertions.*; /** * JUnit 5 test class for JToon encoder, converted from TypeScript vitest tests. */ +@Tag("unit") public class JToonTest { // Helper to create a LinkedHashMap for objects, preserving insertion order @@ -992,4 +995,200 @@ void defaultIsFalse() { assertEquals("tags[3]: reading,gaming,coding", encode(obj)); } } + + @Nested + @DisplayName("POJOs (Plain Old Java Objects)") + class Pojos { + + @Nested + @DisplayName("simple POJOs") + class SimplePOJOs { + + @Test + @DisplayName("encodes simple POJO with basic fields") + void encodesSimplePOJO() { + Person person = new Person("Ada", 30, true); + assertEquals("name: Ada\nage: 30\nactive: true", encode(person)); + } + + @Test + @DisplayName("encodes POJO with multiple field types") + void encodesMultipleFieldTypes() { + Product product = new Product(101, "Laptop", 999.99, true); + assertEquals("id: 101\nname: Laptop\nprice: 999.99\ninStock: true", encode(product)); + } + + @Test + @DisplayName("encodes POJO with null values") + void encodesNullValues() { + NullableData data = new NullableData("hello", null, null); + assertEquals("text: hello\ncount: null\nflag: null", encode(data)); + } + + @Test + @DisplayName("encodes POJO with all null values") + void encodesAllNulls() { + NullableData data = new NullableData(null, null, null); + assertEquals("text: null\ncount: null\nflag: null", encode(data)); + } + + @Test + @DisplayName("encodes POJO in object context") + 
void encodesPOJOInObject() { + Person person = new Person("Bob", 25, false); + Map obj = obj("user", person); + assertEquals("user:\n name: Bob\n age: 25\n active: false", encode(obj)); + } + } + + @Nested + @DisplayName("nested POJOs and collections") + class NestedAndCollections { + + @Test + @DisplayName("encodes POJO with nested POJO") + void encodesNestedPOJO() { + Address address = new Address("123 Main St", "Springfield", "12345"); + Employee employee = new Employee("Alice", 1001, address); + assertEquals( + """ + name: Alice + id: 1001 + address: + street: 123 Main St + city: Springfield + zipCode: "12345\"""", + encode(employee)); + } + + @Test + @DisplayName("encodes deeply nested POJOs") + void encodesDeeplyNested() { + Address address = new Address("456 Oak Ave", "Metropolis", "54321"); + Employee manager = new Employee("Carol", 2001, address); + Company company = new Company("TechCorp", manager); + assertEquals( + """ + name: TechCorp + manager: + name: Carol + id: 2001 + address: + street: 456 Oak Ave + city: Metropolis + zipCode: "54321\"""", + encode(company)); + } + + @Test + @DisplayName("encodes POJO with list of primitives") + void encodesListOfPrimitives() { + Skills skills = new Skills("Developer", List.of("Java", "Python", "JavaScript")); + assertEquals("owner: Developer\nskillList[3]: Java,Python,JavaScript", encode(skills)); + } + + @Test + @DisplayName("encodes POJO with list of POJOs in tabular format") + void encodesListOfPOJOs() { + Person person1 = new Person("Alice", 30, true); + Person person2 = new Person("Bob", 25, false); + Team team = new Team("DevTeam", List.of(person1, person2)); + assertEquals( + """ + name: DevTeam + members[2]{name,age,active}: + Alice,30,true + Bob,25,false""", + encode(team)); + } + + @Test + @DisplayName("encodes POJO with Map fields") + void encodesMapFields() { + Map settings = Map.of("debug", true, "timeout", 30, "mode", "production"); + Configuration config = new Configuration("AppConfig", settings); 
+ String result = encode(config); + assertTrue(result.startsWith("name: AppConfig\nsettings:")); + assertTrue(result.contains("debug: true")); + assertTrue(result.contains("timeout: 30")); + assertTrue(result.contains("mode: production")); + } + + @Test + @DisplayName("encodes POJO with empty collections") + void encodesEmptyCollections() { + EmptyCollections empty = new EmptyCollections(List.of(), Map.of()); + assertEquals("emptyList[0]:\nemptyMap:", encode(empty)); + } + + @Test + @DisplayName("encodes POJO with multiple collection fields") + void encodesMultipleCollections() { + MultiCollection multi = new MultiCollection( + List.of(1, 2, 3), + List.of("a", "b"), + Map.of("x", 10, "y", 20)); + String result = encode(multi); + assertTrue(result.contains("numbers[3]: 1,2,3")); + assertTrue(result.contains("tags[2]: a,b")); + assertTrue(result.contains("counts:")); + } + } + + @Nested + @DisplayName("POJOs with Jackson annotations") + class JacksonAnnotations { + + @Test + @DisplayName("encodes POJO with @JsonProperty annotation") + void encodesJsonProperty() { + AnnotatedProduct product = new AnnotatedProduct(501, "Mouse", 29.99); + assertEquals("product_id: 501\nproduct_name: Mouse\nprice: 29.99", encode(product)); + } + + @Test + @DisplayName("encodes POJO with @JsonIgnore annotation") + void encodesJsonIgnore() { + SecureData data = new SecureData("public info", "secret", 1); + assertEquals("publicField: public info\nversion: 1", encode(data)); + } + + @Test + @DisplayName("encodes POJO with multiple annotations") + void encodesMultipleAnnotations() { + ComplexAnnotated obj = new ComplexAnnotated(123, "Test", "internal data", true); + assertEquals("user_id: 123\nname: Test\nis_active: true", encode(obj)); + } + + @Test + @DisplayName("encodes nested POJO with annotations") + void encodesNestedWithAnnotations() { + Address address = new Address("789 Pine Rd", "Gotham", "99999"); + AnnotatedEmployee employee = new AnnotatedEmployee(3001, "Diana", address, 
"123-45-6789"); + assertEquals( + """ + emp_id: 3001 + full_name: Diana + address: + street: 789 Pine Rd + city: Gotham + zipCode: "99999\"""", + encode(employee)); + } + + @Test + @DisplayName("encodes list of annotated POJOs in tabular format") + void encodesListOfAnnotatedPOJOs() { + AnnotatedProduct p1 = new AnnotatedProduct(101, "Keyboard", 79.99); + AnnotatedProduct p2 = new AnnotatedProduct(102, "Monitor", 299.99); + Map obj = obj("products", List.of(p1, p2)); + assertEquals( + """ + products[2]{product_id,product_name,price}: + 101,Keyboard,79.99 + 102,Monitor,299.99""", + encode(obj)); + } + } + } } diff --git a/src/test/java/com/felipestanzani/jtoon/TestPojos.java b/src/test/java/com/felipestanzani/jtoon/TestPojos.java new file mode 100644 index 0000000..26a8b20 --- /dev/null +++ b/src/test/java/com/felipestanzani/jtoon/TestPojos.java @@ -0,0 +1,128 @@ +package com.felipestanzani.jtoon; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonProperty; + +import java.util.List; +import java.util.Map; + +/** + * Test POJOs (records) for JToon encoding tests. + * These records cover various scenarios including simple fields, nested structures, + * collections, and Jackson annotations. + */ +public class TestPojos { + + // ===== Simple Records ===== + + /** + * Simple person record with basic fields. + */ + public record Person(String name, int age, boolean active) { + } + + /** + * Simple product record with various numeric types. + */ + public record Product(int id, String name, double price, boolean inStock) { + } + + /** + * Record with nullable fields to test null handling. + */ + public record NullableData(String text, Integer count, Boolean flag) { + } + + // ===== Nested Records ===== + + /** + * Address record for nested structure tests. + */ + public record Address(String street, String city, String zipCode) { + } + + /** + * Employee record containing a nested Address. 
+ */ + public record Employee(String name, int id, Address address) { + } + + /** + * Deeply nested structure for testing multiple levels. + */ + public record Company(String name, Employee manager) { + } + + // ===== Collection Records ===== + + /** + * Record with a list of primitives. + */ + public record Skills(String owner, List skillList) { + } + + /** + * Record with a list of objects (for tabular format testing). + */ + public record Team(String name, List members) { + } + + /** + * Record with Map fields. + */ + public record Configuration(String name, Map settings) { + } + + /** + * Record with empty collections. + */ + public record EmptyCollections(List emptyList, Map emptyMap) { + } + + /** + * Record with multiple collection types. + */ + public record MultiCollection(List numbers, List tags, Map counts) { + } + + // ===== Annotated Records ===== + + /** + * Record with @JsonProperty annotation for field name mapping. + */ + public record AnnotatedProduct( + @JsonProperty("product_id") int id, + @JsonProperty("product_name") String name, + double price) { + } + + /** + * Record with @JsonIgnore annotation to exclude fields. + */ + public record SecureData( + String publicField, + @JsonIgnore String secretField, + int version) { + } + + /** + * Record with multiple Jackson annotations. + */ + public record ComplexAnnotated( + @JsonProperty("user_id") int id, + String name, + @JsonIgnore String internal, + @JsonProperty("is_active") boolean active) { + } + + /** + * Record combining nested structure with annotations. + */ + public record AnnotatedEmployee( + @JsonProperty("emp_id") int id, + @JsonProperty("full_name") String name, + Address address, + @JsonIgnore String ssn) { + } +} +