diff --git a/BENCHMARKS.md b/BENCHMARKS.md new file mode 100644 index 00000000..089e9b7f --- /dev/null +++ b/BENCHMARKS.md @@ -0,0 +1,19 @@ +# TOON Nested Tables — Benchmark Results + +## Token Count Comparison + +| Dataset | TOON | TOON + Nested | JSON Compact | Nested Savings | Nested vs JSON | +|---------|------|---------------|-------------|----------------|---------------| +| tabular | 49,919 | 49,919 | 79,059 | 0.0% | 36.9% | +| nested | 73,126 | 73,126 | 69,459 | 0.0% | -5.3% | +| analytics | 9,115 | 9,115 | 14,211 | 0.0% | 35.9% | +| github | 8,744 | 8,744 | 11,454 | 0.0% | 23.7% | +| event-logs | 154,084 | 154,084 | 128,529 | 0.0% | -19.9% | +| nested-config | 620 | 591 | 558 | 4.7% | -5.9% | +| uniform-nested | 58,701 | 27,111 | 46,697 | 53.8% | 41.9% | + +## Key Findings + +- **Nested tables** save tokens when data contains uniform nested objects by flattening them into the tabular format instead of falling back to list items. +- For datasets without uniform nested objects, output is identical to standard TOON. +- The feature is opt-in and backwards-compatible. diff --git a/benchmarks/scripts/nested-tables-benchmark.ts b/benchmarks/scripts/nested-tables-benchmark.ts new file mode 100644 index 00000000..0fb85649 --- /dev/null +++ b/benchmarks/scripts/nested-tables-benchmark.ts @@ -0,0 +1,76 @@ +/** + * Benchmark comparing token counts for TOON with and without nested tables. 
/**
 * Benchmark comparing token counts for TOON with and without nested tables.
 *
 * Encodes every dataset in `TOKEN_EFFICIENCY_DATASETS` three ways (TOON, TOON
 * with `nestedTables: true`, compact JSON), prints the token counts, and
 * writes the comparison table to `BENCHMARKS.md` two directories above this
 * script.
 *
 * Usage: node benchmarks/scripts/nested-tables-benchmark.ts
 */
import * as fsp from 'node:fs/promises'
import * as path from 'node:path'
import { encode } from '../../packages/toon/src/index.ts'
import { TOKEN_EFFICIENCY_DATASETS } from '../src/datasets.ts'
import { tokenize } from '../src/utils.ts'

interface Result {
  dataset: string
  toonTokens: number
  toonWithNestedTokens: number
  jsonCompactTokens: number
  nestedSavings: string
  nestedVsJson: string
}

/**
 * Locale used for every printed token count. Pinning it keeps the generated
 * report identical on every machine; with the host locale a count such as
 * 49919 can be rendered as `49.919`, which reads like a decimal.
 */
const COUNT_LOCALE = 'en-US'

/** Format an integer token count with thousands separators. */
function formatCount(count: number): string {
  return count.toLocaleString(COUNT_LOCALE)
}

/**
 * Percentage of `baseline` saved by `candidate`, with one decimal place.
 * A zero baseline reports `0.0` instead of `NaN`.
 */
function percentSaved(baseline: number, candidate: number): string {
  if (baseline === 0)
    return '0.0'
  return ((baseline - candidate) / baseline * 100).toFixed(1)
}

const results: Result[] = []

console.log('=== TOON Nested Tables — Token Benchmark ===\n')

for (const dataset of TOKEN_EFFICIENCY_DATASETS) {
  const data = dataset.data

  const toon = encode(data)
  const toonWithNested = encode(data, { nestedTables: true })
  const jsonCompact = JSON.stringify(data)

  const toonTokens = tokenize(toon)
  const toonNestedTokens = tokenize(toonWithNested)
  const jsonTokens = tokenize(jsonCompact)

  const nestedSavings = percentSaved(toonTokens, toonNestedTokens)
  const nestedVsJson = percentSaved(jsonTokens, toonNestedTokens)

  results.push({
    dataset: dataset.name,
    toonTokens,
    toonWithNestedTokens: toonNestedTokens,
    jsonCompactTokens: jsonTokens,
    nestedSavings: `${nestedSavings}%`,
    nestedVsJson: `${nestedVsJson}%`,
  })

  console.log(`📊 ${dataset.name}`)
  console.log(` TOON (baseline): ${formatCount(toonTokens)} tokens`)
  console.log(` TOON + nested tables: ${formatCount(toonNestedTokens)} tokens (${nestedSavings}% saved vs TOON)`)
  console.log(` JSON (compact): ${formatCount(jsonTokens)} tokens`)
  console.log(` Nested vs JSON: ${nestedVsJson}% fewer tokens`)
  console.log()
}

// Write results to BENCHMARKS.md
const md = `# TOON Nested Tables — Benchmark Results

## Token Count Comparison

| Dataset | TOON | TOON + Nested | JSON Compact | Nested Savings | Nested vs JSON |
|---------|------|---------------|-------------|----------------|---------------|
${results.map(r =>
  `| ${r.dataset} | ${formatCount(r.toonTokens)} | ${formatCount(r.toonWithNestedTokens)} | ${formatCount(r.jsonCompactTokens)} | ${r.nestedSavings} | ${r.nestedVsJson} |`,
).join('\n')}

## Key Findings

- **Nested tables** save tokens when data contains uniform nested objects by flattening them into the tabular format instead of falling back to list items.
- For datasets without nested structures, output is identical to standard TOON.
- The feature is opt-in and backwards-compatible.
`

const benchmarkPath = path.resolve(import.meta.dirname, '..', '..', 'BENCHMARKS.md')
await fsp.writeFile(benchmarkPath, md, 'utf-8')
console.log(`Results written to BENCHMARKS.md`)
/**
 * Build `count` synthetic shipment records for the nested-table benchmarks.
 * Each record carries the same three nested objects (`sender`, `receiver`,
 * `dimensions`), and every nested value is a primitive.
 */
function generateShipments(count: number): { shipments: Array<{
  id: number
  sender: { name: string, city: string, country: string, zip: string }
  receiver: { name: string, city: string, country: string, zip: string }
  dimensions: { weight: number, length: number, width: number, height: number }
  carrier: string
  status: string
  cost: number
}> } {
  const carriers = ['FedEx', 'UPS', 'DHL', 'USPS'] as const
  const statuses = ['pending', 'in_transit', 'delivered', 'returned'] as const
  const countries = ['US', 'UK', 'DE', 'DK', 'FR', 'JP', 'AU', 'CA'] as const

  // One sender/receiver block. faker is consulted in the order name, city, zip.
  const buildParty = (countrySlot: number) => ({
    name: faker.person.fullName(),
    city: faker.location.city(),
    country: countries[countrySlot % countries.length]!,
    zip: faker.location.zipCode(),
  })

  // One shipment row. Values are produced in declaration order:
  // sender, receiver, dimensions, then cost.
  const buildShipment = (index: number) => {
    const sender = buildParty(index)
    // The receiver's country is offset by three slots from the sender's.
    const receiver = buildParty(index + 3)
    const dimensions = {
      weight: faker.number.float({ min: 0.5, max: 50, fractionDigits: 1 }),
      length: faker.number.int({ min: 10, max: 120 }),
      width: faker.number.int({ min: 10, max: 80 }),
      height: faker.number.int({ min: 5, max: 60 }),
    }

    return {
      id: index + 1,
      sender,
      receiver,
      dimensions,
      carrier: carriers[index % carriers.length]!,
      status: statuses[index % statuses.length]!,
      cost: faker.number.float({ min: 5, max: 200, fractionDigits: 2 }),
    }
  }

  return {
    shipments: Array.from({ length: count }, (_, index) => buildShipment(index)),
  }
}
without nested table support + }, + }, ] diff --git a/packages/toon/README.md b/packages/toon/README.md index 079ebd8f..d94b6b86 100644 --- a/packages/toon/README.md +++ b/packages/toon/README.md @@ -811,6 +811,50 @@ const transformed = encode(data, { > [!TIP] > The `replacer` function provides fine-grained control over encoding, similar to `JSON.stringify`'s replacer but with path tracking. See the [API Reference](https://toonformat.dev/reference/api#replacer-function) for more examples. +**Nested tables (opt-in):** + +Flatten uniform nested objects into the tabular format for better token efficiency: + +```ts +import { encode } from '@toon-format/toon' + +const data = { + orders: [ + { id: 1, customer: { name: 'Alice', country: 'DK' }, total: 99 }, + { id: 2, customer: { name: 'Bob', country: 'UK' }, total: 149 }, + ] +} + +console.log(encode(data, { nestedTables: true })) +// orders[2]{id,customer{name,country},total}: +// 1,Alice,DK,99 +// 2,Bob,UK,149 +``` + +Nested tables require uniform nested objects (same keys in every row). Non-uniform structures fall back to list syntax automatically. Nesting depth is limited to 2 levels: the row object plus one level of nested objects, whose values must all be primitives. 
+ +**Encoder options:** + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `indent` | `number` | `2` | Spaces per indentation level | +| `delimiter` | `Delimiter` | `','` | Delimiter for tabular rows and inline arrays | +| `keyFolding` | `'off' \| 'safe'` | `'off'` | Collapse single-key wrapper chains into dotted paths | +| `flattenDepth` | `number` | `Infinity` | Max segments to fold when keyFolding is enabled | +| `replacer` | `EncodeReplacer` | `undefined` | Transform or filter values during encoding | +| `nestedTables` | `boolean` | `false` | Flatten uniform nested objects in tabular arrays | + +**Decoder options:** + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `indent` | `number` | `2` | Spaces per indentation level | +| `strict` | `boolean` | `true` | Enforce strict validation of array lengths and row counts | +| `expandPaths` | `'off' \| 'safe'` | `'off'` | Reconstruct dotted keys into nested objects | + +> [!NOTE] +> **Backwards compatibility:** `nestedTables` is opt-in. When disabled (default), encoder output is identical to previous versions. The decoder always handles nested table syntax regardless of options. Output produced with `nestedTables: true` can only be read by decoders that understand the nested field syntax; parsers written for earlier versions of the format are not guaranteed to accept it. + ## Playgrounds Experiment with TOON format interactively using these tools for token comparison, format conversion, and validation. 
diff --git a/packages/toon/src/decode/decoders.ts b/packages/toon/src/decode/decoders.ts index add34cfb..7b379472 100644 --- a/packages/toon/src/decode/decoders.ts +++ b/packages/toon/src/decode/decoders.ts @@ -1,4 +1,4 @@ -import type { ArrayHeaderInfo, DecodeStreamOptions, Depth, JsonPrimitive, JsonStreamEvent, ParsedLine } from '../types.ts' +import type { ArrayHeaderInfo, DecodeStreamOptions, Depth, FieldDescriptor, JsonPrimitive, JsonStreamEvent, ParsedLine } from '../types.ts' import type { StreamingScanState } from './scanner.ts' import { COLON, DEFAULT_DELIMITER, LIST_ITEM_MARKER, LIST_ITEM_PREFIX } from '../constants.ts' import { findClosingQuote } from '../shared/string-utils.ts' @@ -320,7 +320,7 @@ function* decodeTabularArraySync( assertExpectedCount(values.length, header.fields!.length, 'tabular row values', options) const primitives = mapRowValuesToPrimitives(values) - yield* yieldObjectFromFields(header.fields!, primitives) + yield* yieldObjectFromFields(header.fields!, primitives, header.fieldDescriptors) rowCount++ } @@ -706,7 +706,7 @@ async function* decodeTabularArrayAsync( assertExpectedCount(values.length, header.fields!.length, 'tabular row values', options) const primitives = mapRowValuesToPrimitives(values) - yield* yieldObjectFromFields(header.fields!, primitives) + yield* yieldObjectFromFields(header.fields!, primitives, header.fieldDescriptors) rowCount++ } @@ -887,7 +887,14 @@ async function* decodeListItemAsync( function* yieldObjectFromFields( fields: string[], primitives: JsonPrimitive[], + fieldDescriptors?: FieldDescriptor[], ): Generator { + // If we have nested field descriptors, use them to reconstruct nested objects + if (fieldDescriptors && fieldDescriptors.some(d => d.subfields)) { + yield* yieldObjectFromDescriptors(fieldDescriptors, primitives, { offset: 0 }) + return + } + yield { type: 'startObject' } for (let i = 0; i < fields.length; i++) { yield { type: 'key', key: fields[i]! 
/**
 * Emit the stream events for one object whose shape is given by `descriptors`,
 * taking leaf values from the flat `primitives` row.
 *
 * Events are produced in descriptor order: `startObject`, then for each
 * descriptor a `key` event followed by either a nested object (descriptor has
 * `subfields`) or one `primitive` event, and finally `endObject`.
 *
 * @param descriptors Field descriptors for the object being emitted.
 * @param primitives Flat row values, one per leaf descriptor.
 * @param cursor Shared read position into `primitives`; advanced by one for
 *   every leaf emitted so nested calls continue where the parent stopped.
 */
function* yieldObjectFromDescriptors(
  descriptors: FieldDescriptor[],
  primitives: JsonPrimitive[],
  cursor: { offset: number },
): Generator<JsonStreamEvent> {
  yield { type: 'startObject' }
  for (const desc of descriptors) {
    yield { type: 'key', key: desc.name }
    // Any descriptor that declares subfields is an object, even when the list
    // is empty (`a{}`): it owns no column in the row, so it must not consume
    // a value. Treating it as a leaf would shift every following column.
    if (desc.subfields !== undefined) {
      yield* yieldObjectFromDescriptors(desc.subfields, primitives, cursor)
    }
    else {
      yield { type: 'primitive', value: primitives[cursor.offset]! }
      cursor.offset++
    }
  }
  yield { type: 'endObject' }
}
// #region Field descriptor parsing

/**
 * Find the `}` that closes the `{` at `openIndex`, accounting for nested
 * braces. Braces inside double-quoted sections are ignored, so a quoted field
 * name such as `"a}b"` cannot end the segment early.
 *
 * @param content Text to scan.
 * @param openIndex Index of the opening brace; scanning starts here, outside
 *   of any quoted section.
 * @returns Index of the matching closing brace, or `-1` when none is found.
 */
function findMatchingBrace(content: string, openIndex: number): number {
  let depth = 0
  let inQuotes = false

  for (let i = openIndex; i < content.length; i++) {
    const ch = content[i]

    if (inQuotes) {
      if (ch === BACKSLASH) {
        // Skip the escaped character so an escaped quote does not end the string.
        i++
      }
      else if (ch === DOUBLE_QUOTE) {
        inQuotes = false
      }
      continue
    }

    if (ch === DOUBLE_QUOTE) {
      inQuotes = true
    }
    else if (ch === OPEN_BRACE) {
      depth++
    }
    else if (ch === CLOSE_BRACE) {
      depth--
      if (depth === 0)
        return i
    }
  }

  return -1
}

/**
 * Index of the first `{` in `content` that is not inside a double-quoted
 * section, or `-1` when there is none.
 */
function findUnquotedOpenBrace(content: string): number {
  let inQuotes = false

  for (let i = 0; i < content.length; i++) {
    const ch = content[i]

    if (inQuotes) {
      if (ch === BACKSLASH)
        i++
      else if (ch === DOUBLE_QUOTE)
        inQuotes = false
    }
    else if (ch === DOUBLE_QUOTE) {
      inQuotes = true
    }
    else if (ch === OPEN_BRACE) {
      return i
    }
  }

  return -1
}

/**
 * Parse field descriptors from the content inside `{...}` of a header.
 * Handles nested fields (`field{sub1,sub2}`) and strips type hint suffixes
 * (`field:type`) for forward compatibility. Braces that appear inside a quoted
 * field name are part of the name, not nesting syntax.
 *
 * @param content Text between the header's outer braces.
 * @param delimiter Delimiter separating fields at every nesting level.
 * @returns `descriptors` mirrors the header structure; `flatNames` lists the
 *   leaf field names in row order (nested parents contribute their subfields,
 *   not their own name).
 */
export function parseFieldDescriptors(
  content: string,
  delimiter: Delimiter,
): { descriptors: FieldDescriptor[], flatNames: string[] } {
  const descriptors: FieldDescriptor[] = []
  const flatNames: string[] = []

  // Split top-level fields by delimiter, respecting nested braces and quotes
  for (const raw of splitTopLevel(content, delimiter)) {
    const trimmed = raw.trim()
    if (!trimmed)
      continue

    // Nested fields: fieldName{sub1,sub2}. Only an unquoted brace opens a nested list.
    const braceIdx = findUnquotedOpenBrace(trimmed)
    const matchEnd = braceIdx === -1 ? -1 : findMatchingBrace(trimmed, braceIdx)
    if (matchEnd !== -1) {
      const name = parseFieldName(trimmed.slice(0, braceIdx))
      const nested = parseFieldDescriptors(trimmed.slice(braceIdx + 1, matchEnd), delimiter)
      descriptors.push({ name, subfields: nested.descriptors })
      flatNames.push(...nested.flatNames)
      continue
    }

    // Plain field, possibly with a type hint: fieldName:type
    const { name } = parseFieldNameWithHint(trimmed)
    descriptors.push({ name })
    flatNames.push(name)
  }

  return { descriptors, flatNames }
}

/**
 * Split a string by delimiter at the top level only (not inside braces or quotes).
 * Empty input yields an empty list; a trailing delimiter yields a trailing
 * empty part.
 */
function splitTopLevel(content: string, delimiter: Delimiter): string[] {
  const parts: string[] = []
  let current = ''
  let braceDepth = 0
  let inQuotes = false

  for (let i = 0; i < content.length; i++) {
    const ch = content[i]!

    // Inside quotes a backslash escapes the next character; copy both verbatim.
    if (ch === BACKSLASH && inQuotes && i + 1 < content.length) {
      current += ch + content[i + 1]
      i++
      continue
    }

    if (ch === DOUBLE_QUOTE) {
      inQuotes = !inQuotes
      current += ch
      continue
    }

    if (!inQuotes) {
      if (ch === OPEN_BRACE) {
        braceDepth++
        current += ch
        continue
      }
      if (ch === CLOSE_BRACE) {
        braceDepth--
        current += ch
        continue
      }
      if (ch === delimiter && braceDepth === 0) {
        parts.push(current)
        current = ''
        continue
      }
    }

    current += ch
  }

  if (current || parts.length > 0) {
    parts.push(current)
  }

  return parts
}

/** Type hint suffixes that are recognized and stripped from unquoted field names. */
const TOON_TYPE_HINTS = new Set(['int', 'float', 'str', 'bool', 'enum', 'date', 'null'])

/**
 * Parse a field name, stripping any type hint suffix (e.g., `:int`, `:str`).
 * Type hints are recognized and stripped for forward compatibility but not stored.
 * A quoted name is returned as written between its quotes; anything after the
 * closing quote is ignored.
 */
function parseFieldNameWithHint(raw: string): { name: string } {
  const trimmed = raw.trim()

  // Handle quoted field names
  if (trimmed.startsWith(DOUBLE_QUOTE)) {
    const closingIdx = findClosingQuote(trimmed, 0)
    if (closingIdx !== -1) {
      const name = parseStringLiteral(trimmed.slice(0, closingIdx + 1))
      return { name }
    }
  }

  // Unquoted: look for :type suffix and strip it only when the suffix is a known hint
  const colonIdx = trimmed.lastIndexOf(COLON)
  if (colonIdx !== -1) {
    const possibleHint = trimmed.slice(colonIdx + 1).trim()
    if (TOON_TYPE_HINTS.has(possibleHint)) {
      return { name: trimmed.slice(0, colonIdx).trim() }
    }
  }

  return { name: parseStringLiteral(trimmed) }
}

/** Convenience wrapper returning only the name from {@link parseFieldNameWithHint}. */
function parseFieldName(raw: string): string {
  return parseFieldNameWithHint(raw).name
}

// #endregion
// #region Nested table encoding

/**
 * Work out whether `rows` can be written as a nested table.
 *
 * The first row's keys are used as the candidate header and handed to
 * `inferNestedFieldDescriptors`. Returns the descriptors together with the
 * flattened leaf names, or `undefined` when there are no rows, the first row
 * has no keys, or no descriptors could be inferred.
 */
function extractNestedTabularHeader(
  rows: readonly JsonObject[],
): { flatHeader: string[], descriptors: FieldDescriptor[] } | undefined {
  const topLevelKeys = rows.length > 0 ? Object.keys(rows[0]!) : []
  if (topLevelKeys.length === 0)
    return undefined

  const descriptors = inferNestedFieldDescriptors(rows, topLevelKeys)
  return descriptors
    ? { flatHeader: flattenDescriptorKeys(descriptors), descriptors }
    : undefined
}

/**
 * List the leaf field names of `descriptors` in order. A descriptor with a
 * non-empty `subfields` list contributes its subfields' leaves instead of its
 * own name.
 */
function flattenDescriptorKeys(descriptors: readonly FieldDescriptor[]): string[] {
  return descriptors.flatMap(descriptor =>
    descriptor.subfields && descriptor.subfields.length > 0
      ? flattenDescriptorKeys(descriptor.subfields)
      : [descriptor.name],
  )
}

/**
 * Yield the lines of a nested table: one header line at `depth`, then one
 * flattened row per object at `depth + 1`.
 *
 * `_flatHeader` is accepted but not read; the header is rendered from
 * `descriptors`.
 */
function* encodeNestedTabularLines(
  prefix: string | undefined,
  rows: readonly JsonObject[],
  _flatHeader: string[],
  descriptors: FieldDescriptor[],
  depth: Depth,
  options: ResolvedEncodeOptions,
): Generator<string> {
  const { delimiter, indent } = options

  const headerText = formatHeader(rows.length, { key: prefix, fieldDescriptors: descriptors, delimiter })
  yield indentedLine(depth, headerText, indent)

  for (const row of rows) {
    const cells = encodeAndJoinPrimitives(flattenRowValues(row, descriptors), delimiter)
    yield indentedLine(depth + 1, cells, indent)
  }
}

/**
 * Collect the leaf values of `row` in descriptor order, descending into the
 * nested object for every descriptor that has a non-empty `subfields` list.
 */
function flattenRowValues(row: JsonObject, descriptors: readonly FieldDescriptor[]): JsonPrimitive[] {
  return descriptors.flatMap((descriptor): JsonPrimitive[] =>
    descriptor.subfields && descriptor.subfields.length > 0
      ? flattenRowValues(row[descriptor.name] as JsonObject, descriptor.subfields)
      : [row[descriptor.name] as JsonPrimitive],
  )
}

// #endregion
/**
 * Detect uniform nested objects in a tabular array and build descriptors.
 *
 * A column becomes a nested descriptor when every row holds a non-array object
 * with the same non-empty set of own keys and only primitive values. All
 * remaining columns must hold primitives in every row.
 *
 * Returns `undefined`, so the caller can fall back to another encoding, when:
 * - `rows` is empty,
 * - any row's own keys differ from `header` (a key missing from the header
 *   would otherwise be dropped from the output),
 * - a column holds objects that cannot be flattened (empty, non-uniform, or
 *   containing non-primitive values) or any other non-primitive value,
 * - no column is nested at all.
 *
 * @param rows Objects to be written as table rows.
 * @param header Top-level field names, in output order.
 * @returns One descriptor per header field, or `undefined` as described above.
 */
export function inferNestedFieldDescriptors(
  rows: readonly JsonObject[],
  header: readonly string[],
): FieldDescriptor[] | undefined {
  if (rows.length === 0)
    return undefined

  // Own-property test: inherited names such as `toString` must not count as keys.
  const hasOwnKey = (obj: object, key: string): boolean =>
    Object.prototype.hasOwnProperty.call(obj, key)
  const isPlainObject = (value: unknown): value is JsonObject =>
    value !== null && typeof value === 'object' && !Array.isArray(value)

  // Same key count plus every header key present means identical key sets.
  const rowsMatchHeader = rows.every(
    row => Object.keys(row).length === header.length && header.every(key => hasOwnKey(row, key)),
  )
  if (!rowsMatchHeader)
    return undefined

  const descriptors: FieldDescriptor[] = []
  let hasNesting = false

  for (const fieldName of header) {
    const columnValues = rows.map(row => row[fieldName])

    if (columnValues.every(isPlainObject)) {
      const objects = columnValues as JsonObject[]
      const subKeys = Object.keys(objects[0]!)

      // An object column is only representable when it can be flattened.
      // Empty objects have no leaf to write, so they fall back as well.
      const flattenable = subKeys.length > 0 && objects.every(obj =>
        Object.keys(obj).length === subKeys.length
        && subKeys.every(key => hasOwnKey(obj, key) && isJsonPrimitive(obj[key])),
      )
      if (!flattenable)
        return undefined

      hasNesting = true
      descriptors.push({ name: fieldName, subfields: subKeys.map(name => ({ name })) })
      continue
    }

    // Simple field — must be all primitives for tabular encoding
    if (!columnValues.every(value => isJsonPrimitive(value)))
      return undefined
    descriptors.push({ name: fieldName })
  }

  return hasNesting ? descriptors : undefined
}
/**
 * Render one field descriptor for a tabular header.
 *
 * A descriptor without subfields (or with an empty list) renders as its
 * encoded name. Otherwise the subfields are rendered recursively, joined with
 * `delimiter`, and wrapped in braces after the name: `name{sub1,sub2}`.
 */
function formatFieldDescriptor(desc: FieldDescriptor, delimiter: string): string {
  const encodedName = encodeKey(desc.name)
  const children = desc.subfields ?? []

  if (children.length === 0)
    return encodedName

  const inner = children
    .map(child => formatFieldDescriptor(child, delimiter))
    .join(delimiter)
  return `${encodedName}{${inner}}`
}
+ * @default false + */ + nestedTables?: boolean } export type ResolvedEncodeOptions = Readonly>> & Pick @@ -140,6 +148,15 @@ export type JsonStreamEvent // #endregion +// #region Field descriptors (for nested tables) + +export interface FieldDescriptor { + name: string + subfields?: FieldDescriptor[] +} + +// #endregion + // #region Decoder parsing types export interface ArrayHeaderInfo { @@ -147,6 +164,7 @@ export interface ArrayHeaderInfo { length: number delimiter: Delimiter fields?: string[] + fieldDescriptors?: FieldDescriptor[] } export interface ParsedLine { diff --git a/packages/toon/test/nestedTables.test.ts b/packages/toon/test/nestedTables.test.ts new file mode 100644 index 00000000..7345745f --- /dev/null +++ b/packages/toon/test/nestedTables.test.ts @@ -0,0 +1,124 @@ +import { describe, expect, it } from 'vitest' +import { decode, encode } from '../src/index' + +describe('nested table syntax', () => { + describe('encoder', () => { + it('flattens uniform nested objects into inline syntax', () => { + const data = { + orders: [ + { id: 1, customer: { name: 'Alice', country: 'DK' }, total: 99.00 }, + { id: 2, customer: { name: 'Bob', country: 'UK' }, total: 149.00 }, + ], + } + + const result = encode(data, { nestedTables: true }) + expect(result).toContain('orders[2]{id,customer{name,country},total}:') + expect(result).toContain('1,Alice,DK,99') + expect(result).toContain('2,Bob,UK,149') + }) + + it('falls back for non-uniform nested objects', () => { + const data = { + orders: [ + { id: 1, customer: { name: 'Alice', country: 'DK' }, total: 99.00 }, + { id: 2, customer: { name: 'Bob', age: 30 }, total: 149.00 }, + ], + } + + const result = encode(data, { nestedTables: true }) + // Non-uniform nested objects should fall back to list format + expect(result).not.toContain('customer{') + expect(result).toContain('- id: 1') + }) + + it('does not use nested tables when option is disabled', () => { + const data = { + orders: [ + { id: 1, customer: { name: 
'Alice', country: 'DK' }, total: 99.00 }, + { id: 2, customer: { name: 'Bob', country: 'UK' }, total: 149.00 }, + ], + } + + const withoutNested = encode(data) + const withNestedFalse = encode(data, { nestedTables: false }) + expect(withoutNested).toBe(withNestedFalse) + expect(withoutNested).not.toContain('customer{') + }) + + it('handles multiple nested fields', () => { + const data = { + records: [ + { addr: { city: 'Copenhagen', zip: '2100' }, contact: { email: 'a@b.com', phone: '123' } }, + { addr: { city: 'London', zip: 'SW1A' }, contact: { email: 'c@d.com', phone: '456' } }, + ], + } + + const result = encode(data, { nestedTables: true }) + expect(result).toContain('addr{city,zip}') + expect(result).toContain('contact{email,phone}') + }) + }) + + describe('decoder', () => { + it('reconstructs nested objects from flattened rows', () => { + const toon = 'orders[2]{id,customer{name,country},total}:\n 1,Alice,DK,99\n 2,Bob,UK,149' + const result = decode(toon) + expect(result).toEqual({ + orders: [ + { id: 1, customer: { name: 'Alice', country: 'DK' }, total: 99 }, + { id: 2, customer: { name: 'Bob', country: 'UK' }, total: 149 }, + ], + }) + }) + + it('strips type hints in nested headers (forward compat)', () => { + const toon = 'orders[2]{id:int,customer{name:str,country:str},total:float}:\n 1,Alice,DK,99.5\n 2,Bob,UK,149.0' + const result = decode(toon) + expect(result).toEqual({ + orders: [ + { id: 1, customer: { name: 'Alice', country: 'DK' }, total: 99.5 }, + { id: 2, customer: { name: 'Bob', country: 'UK' }, total: 149 }, + ], + }) + }) + + it('handles multiple nested fields', () => { + const toon = 'data[1]{a{x,y},b{m,n}}:\n 1,2,3,4' + const result = decode(toon) + expect(result).toEqual({ + data: [ + { a: { x: 1, y: 2 }, b: { m: 3, n: 4 } }, + ], + }) + }) + }) + + describe('round-trip', () => { + it('jSON → TOON (nested) → JSON is lossless', () => { + const original = { + orders: [ + { id: 1, customer: { name: 'Alice', country: 'DK' }, total: 99 }, + { 
id: 2, customer: { name: 'Bob', country: 'UK' }, total: 149 }, + ], + } + + const toon = encode(original, { nestedTables: true }) + const decoded = decode(toon) + expect(decoded).toEqual(original) + }) + + it('falls back gracefully for non-uniform nested objects', () => { + const original = { + items: [ + { id: 1, meta: { tag: 'a' } }, + { id: 2, meta: { tag: 'b', extra: 'x' } }, + ], + } + + // With nestedTables on, non-uniform should fall back + const toon = encode(original, { nestedTables: true }) + const decoded = decode(toon) + expect(decoded).toEqual(original) + }) + }) +})