Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions BENCHMARKS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# TOON Nested Tables — Benchmark Results

## Token Count Comparison

| Dataset | TOON | TOON + Nested | JSON Compact | Nested Savings | Nested vs JSON |
|---------|------|---------------|-------------|----------------|---------------|
| tabular | 49,919 | 49,919 | 79,059 | 0.0% | 36.9% |
| nested | 73,126 | 73,126 | 69,459 | 0.0% | -5.3% |
| analytics | 9,115 | 9,115 | 14,211 | 0.0% | 35.9% |
| github | 8,744 | 8,744 | 11,454 | 0.0% | 23.7% |
| event-logs | 154,084 | 154,084 | 128,529 | 0.0% | -19.9% |
| nested-config | 620 | 591 | 558 | 4.7% | -5.9% |
| uniform-nested | 58,701 | 27,111 | 46,697 | 53.8% | 41.9% |

## Key Findings

- **Nested tables** save tokens when data contains uniform nested objects by flattening them into the tabular format instead of falling back to list items.
- For datasets without nested structures, output is identical to standard TOON.
- The feature is opt-in and backwards-compatible.
76 changes: 76 additions & 0 deletions benchmarks/scripts/nested-tables-benchmark.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/**
* Benchmark comparing token counts for TOON with and without nested tables.
*
* Usage: node benchmarks/scripts/nested-tables-benchmark.ts
*/
import * as fsp from 'node:fs/promises'
import * as path from 'node:path'
import { encode } from '../../packages/toon/src/index.ts'
import { TOKEN_EFFICIENCY_DATASETS } from '../src/datasets.ts'
import { tokenize } from '../src/utils.ts'

/** One row of the benchmark report: raw token counts plus pre-formatted percentage strings. */
interface Result {
  dataset: string
  toonTokens: number
  toonWithNestedTokens: number
  jsonCompactTokens: number
  nestedSavings: string
  nestedVsJson: string
}

/**
 * Format a token count with en-US thousands separators (e.g. `49,919`).
 *
 * The locale is pinned so the generated report does not depend on the machine
 * running the benchmark; with the host default, some locales print `49.919`,
 * which reads as a decimal fraction.
 */
function formatCount(count: number): string {
  return count.toLocaleString('en-US')
}

/**
 * Percentage by which `candidate` is smaller than `baseline`, fixed to one decimal.
 * Negative when `candidate` is larger. Returns `'0.0'` for a zero baseline so the
 * report never contains `NaN%` or `Infinity%`.
 */
function percentSaved(baseline: number, candidate: number): string {
  if (baseline === 0)
    return '0.0'
  return ((baseline - candidate) / baseline * 100).toFixed(1)
}

const results: Result[] = []

console.log('=== TOON Nested Tables — Token Benchmark ===\n')

for (const dataset of TOKEN_EFFICIENCY_DATASETS) {
  const data = dataset.data

  // Encode the same payload three ways: plain TOON, TOON with nested tables, compact JSON
  const toon = encode(data)
  const toonWithNested = encode(data, { nestedTables: true })
  const jsonCompact = JSON.stringify(data)

  const toonTokens = tokenize(toon)
  const toonNestedTokens = tokenize(toonWithNested)
  const jsonTokens = tokenize(jsonCompact)

  const nestedSavings = percentSaved(toonTokens, toonNestedTokens)
  const nestedVsJson = percentSaved(jsonTokens, toonNestedTokens)

  results.push({
    dataset: dataset.name,
    toonTokens,
    toonWithNestedTokens: toonNestedTokens,
    jsonCompactTokens: jsonTokens,
    nestedSavings: `${nestedSavings}%`,
    nestedVsJson: `${nestedVsJson}%`,
  })

  console.log(`📊 ${dataset.name}`)
  console.log(` TOON (baseline): ${formatCount(toonTokens)} tokens`)
  console.log(` TOON + nested tables: ${formatCount(toonNestedTokens)} tokens (${nestedSavings}% saved vs TOON)`)
  console.log(` JSON (compact): ${formatCount(jsonTokens)} tokens`)
  console.log(` Nested vs JSON: ${nestedVsJson}% fewer tokens`)
  console.log()
}

// One Markdown table row per dataset, in the order the datasets were benchmarked
const tableRows = results.map(r =>
  `| ${r.dataset} | ${formatCount(r.toonTokens)} | ${formatCount(r.toonWithNestedTokens)} | ${formatCount(r.jsonCompactTokens)} | ${r.nestedSavings} | ${r.nestedVsJson} |`,
).join('\n')

// Write results to BENCHMARKS.md
const md = `# TOON Nested Tables — Benchmark Results

## Token Count Comparison

| Dataset | TOON | TOON + Nested | JSON Compact | Nested Savings | Nested vs JSON |
|---------|------|---------------|-------------|----------------|---------------|
${tableRows}

## Key Findings

- **Nested tables** save tokens when data contains uniform nested objects by flattening them into the tabular format instead of falling back to list items.
- For datasets without nested structures, output is identical to standard TOON.
- The feature is opt-in and backwards-compatible.
`

// The report is written two directory levels above this script's directory
const benchmarkPath = path.resolve(import.meta.dirname, '..', '..', 'BENCHMARKS.md')
await fsp.writeFile(benchmarkPath, md, 'utf-8')
console.log(`Results written to BENCHMARKS.md`)
1 change: 1 addition & 0 deletions benchmarks/src/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ export const DATASET_NAMES = [
'github',
'event-logs',
'nested-config',
'uniform-nested',
'large-uniform',
'structural-validation-control',
'structural-validation-truncated',
Expand Down
56 changes: 56 additions & 0 deletions benchmarks/src/datasets.ts
Original file line number Diff line number Diff line change
Expand Up @@ -698,6 +698,51 @@ export const ACCURACY_DATASETS: Dataset[] = [
...structuralValidationDatasets, // 5 validation fixtures
]

/**
 * Build `count` synthetic shipment records wrapped in a `{ shipments }` object.
 *
 * Each record carries the same three nested objects (sender, receiver,
 * dimensions), and every leaf value is a primitive, so all rows share one
 * shape — the case nested table encoding targets.
 *
 * @param count - Number of shipment records to generate.
 * @returns An object whose `shipments` array holds the generated records.
 */
function generateShipments(count: number): { shipments: Array<{
  id: number
  sender: { name: string, city: string, country: string, zip: string }
  receiver: { name: string, city: string, country: string, zip: string }
  dimensions: { weight: number, length: number, width: number, height: number }
  carrier: string
  status: string
  cost: number
}> } {
  const carriers = ['FedEx', 'UPS', 'DHL', 'USPS'] as const
  const statuses = ['pending', 'in_transit', 'delivered', 'returned'] as const
  const countries = ['US', 'UK', 'DE', 'DK', 'FR', 'JP', 'AU', 'CA'] as const

  // Shared builder for sender/receiver; the country cycles through the fixed list
  // while name, city and zip come from faker (called in that order).
  const buildParty = (countryIndex: number) => ({
    name: faker.person.fullName(),
    city: faker.location.city(),
    country: countries[countryIndex % countries.length]!,
    zip: faker.location.zipCode(),
  })

  // Package measurements: weight has one decimal place, the rest are integers
  const buildDimensions = () => ({
    weight: faker.number.float({ min: 0.5, max: 50, fractionDigits: 1 }),
    length: faker.number.int({ min: 10, max: 120 }),
    width: faker.number.int({ min: 10, max: 80 }),
    height: faker.number.int({ min: 5, max: 60 }),
  })

  // Ids are 1-based; the receiver's country is offset by 3 from the sender's
  const buildShipment = (index: number) => ({
    id: index + 1,
    sender: buildParty(index),
    receiver: buildParty(index + 3),
    dimensions: buildDimensions(),
    carrier: carriers[index % carriers.length]!,
    status: statuses[index % statuses.length]!,
    cost: faker.number.float({ min: 5, max: 200, fractionDigits: 2 }),
  })

  const shipments = Array.from({ length: count }, (_unused, index) => buildShipment(index))
  return { shipments }
}

/**
* Datasets for token efficiency benchmarks (larger sizes to amplify token differences)
*/
Expand Down Expand Up @@ -750,4 +795,15 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
},
// Nested config: 1 config (same as accuracy)
nestedConfigDataset,
// Uniform nested: 500 shipments with sender/receiver/dimensions objects
{
name: 'uniform-nested',
description: 'Shipment records with uniform nested objects (sender, receiver, dimensions)',
data: generateShipments(500),
metadata: {
supportsCSV: false,
structureClass: 'nested',
tabularEligibility: 0, // Has nested objects, not tabular without nested table support
},
},
]
44 changes: 44 additions & 0 deletions packages/toon/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -811,6 +811,50 @@ const transformed = encode(data, {
> [!TIP]
> The `replacer` function provides fine-grained control over encoding, similar to `JSON.stringify`'s replacer but with path tracking. See the [API Reference](https://toonformat.dev/reference/api#replacer-function) for more examples.

**Nested tables (opt-in):**

Flatten uniform nested objects into the tabular format for better token efficiency:

```ts
import { encode } from '@toon-format/toon'

const data = {
orders: [
{ id: 1, customer: { name: 'Alice', country: 'DK' }, total: 99 },
{ id: 2, customer: { name: 'Bob', country: 'UK' }, total: 149 },
]
}

console.log(encode(data, { nestedTables: true }))
// orders[2]{id,customer{name,country},total}:
// 1,Alice,DK,99
// 2,Bob,UK,149
```

Nested tables require uniform nested objects (same keys in every row). Non-uniform structures fall back to list syntax automatically. Nesting depth is limited to 2 levels.

**Encoder options:**

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `indent` | `number` | `2` | Spaces per indentation level |
| `delimiter` | `Delimiter` | `','` | Delimiter for tabular rows and inline arrays |
| `keyFolding` | `'off' \| 'safe'` | `'off'` | Collapse single-key wrapper chains into dotted paths |
| `flattenDepth` | `number` | `Infinity` | Max segments to fold when keyFolding is enabled |
| `replacer` | `EncodeReplacer` | `undefined` | Transform or filter values during encoding |
| `nestedTables` | `boolean` | `false` | Flatten uniform nested objects in tabular arrays |

**Decoder options:**

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `indent` | `number` | `2` | Spaces per indentation level |
| `strict` | `boolean` | `true` | Enforce strict validation of array lengths and row counts |
| `expandPaths` | `'off' \| 'safe'` | `'off'` | Reconstruct dotted keys into nested objects |

> [!NOTE]
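> **Backwards compatibility:** `nestedTables` is opt-in. When disabled (default), encoder output is identical to previous versions. The decoder always handles nested table syntax regardless of options, so no decoder option is needed to read it. TOON with nested tables can be decoded by any parser that implements the nested table syntax; parsers without that support will not recognize headers such as `customer{name,country}`.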

## Playgrounds

Experiment with TOON format interactively using these tools for token comparison, format conversion, and validation.
Expand Down
32 changes: 29 additions & 3 deletions packages/toon/src/decode/decoders.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import type { ArrayHeaderInfo, DecodeStreamOptions, Depth, JsonPrimitive, JsonStreamEvent, ParsedLine } from '../types.ts'
import type { ArrayHeaderInfo, DecodeStreamOptions, Depth, FieldDescriptor, JsonPrimitive, JsonStreamEvent, ParsedLine } from '../types.ts'
import type { StreamingScanState } from './scanner.ts'
import { COLON, DEFAULT_DELIMITER, LIST_ITEM_MARKER, LIST_ITEM_PREFIX } from '../constants.ts'
import { findClosingQuote } from '../shared/string-utils.ts'
Expand Down Expand Up @@ -320,7 +320,7 @@ function* decodeTabularArraySync(
assertExpectedCount(values.length, header.fields!.length, 'tabular row values', options)

const primitives = mapRowValuesToPrimitives(values)
yield* yieldObjectFromFields(header.fields!, primitives)
yield* yieldObjectFromFields(header.fields!, primitives, header.fieldDescriptors)

rowCount++
}
Expand Down Expand Up @@ -706,7 +706,7 @@ async function* decodeTabularArrayAsync(
assertExpectedCount(values.length, header.fields!.length, 'tabular row values', options)

const primitives = mapRowValuesToPrimitives(values)
yield* yieldObjectFromFields(header.fields!, primitives)
yield* yieldObjectFromFields(header.fields!, primitives, header.fieldDescriptors)

rowCount++
}
Expand Down Expand Up @@ -887,7 +887,14 @@ async function* decodeListItemAsync(
function* yieldObjectFromFields(
fields: string[],
primitives: JsonPrimitive[],
fieldDescriptors?: FieldDescriptor[],
): Generator<JsonStreamEvent> {
// If we have nested field descriptors, use them to reconstruct nested objects
if (fieldDescriptors && fieldDescriptors.some(d => d.subfields)) {
yield* yieldObjectFromDescriptors(fieldDescriptors, primitives, { offset: 0 })
return
}

yield { type: 'startObject' }
for (let i = 0; i < fields.length; i++) {
yield { type: 'key', key: fields[i]! }
Expand All @@ -896,4 +903,23 @@ function* yieldObjectFromFields(
yield { type: 'endObject' }
}

/**
 * Emit the stream events for one object whose layout is given by `descriptors`,
 * pulling leaf values from a flat list of row primitives.
 *
 * A descriptor with a non-empty `subfields` list produces a nested object via
 * recursion; any other descriptor consumes the primitive at `cursor.offset`.
 * The same `cursor` object is passed to every recursive call, so nested levels
 * continue reading from where the enclosing level stopped.
 *
 * @param descriptors - Field layout for the object at this nesting level.
 * @param primitives - Flat row values, read by position.
 * @param cursor - Mutable read position into `primitives`; advanced by one per leaf.
 * @returns A generator of `startObject`, `key`, `primitive` and `endObject` events.
 */
function* yieldObjectFromDescriptors(
  descriptors: FieldDescriptor[],
  primitives: JsonPrimitive[],
  cursor: { offset: number },
): Generator<JsonStreamEvent> {
  yield { type: 'startObject' }

  for (const field of descriptors) {
    yield { type: 'key', key: field.name }

    const children = field.subfields
    if (!children || children.length === 0) {
      // Leaf field (an empty subfield list also counts as a leaf): take the next flat value
      yield { type: 'primitive', value: primitives[cursor.offset]! }
      cursor.offset += 1
      continue
    }

    // Nested field: its leaves occupy the next positions of the same flat row
    yield* yieldObjectFromDescriptors(children, primitives, cursor)
  }

  yield { type: 'endObject' }
}

// #endregion
Loading