From 0b7401e8f46bbc363126b3f71c8ab0b8d9ab2c95 Mon Sep 17 00:00:00 2001 From: IkedaNoritaka <50833174+NoritakaIkeda@users.noreply.github.com> Date: Mon, 29 Sep 2025 17:10:33 +0900 Subject: [PATCH 1/5] =?UTF-8?q?=E2=9C=A8(schema-bench):=20add=20dictionary?= =?UTF-8?q?-based=20concept=20matching=20for=20schema=20evaluation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement semantic concept matching using a pre-registered dictionary of alias groups to improve schema benchmarking accuracy. Features: - Dictionary-based matching with concept aliases (e.g., "customer_info" → ["customer_info", "client_data"]) - Token-based fallback matching using Jaccard similarity with configurable threshold (default 0.8) - Generic token filtering to improve matching quality (filters out "info", "data", "table", etc.) - Global concept dictionary with common database schema patterns - Seamless integration with existing table and column matching pipeline - Comprehensive test coverage for all matching scenarios Technical details: - Adds dictionaryMatch() function as first matching strategy before name similarity and word overlap - Supports concept normalization with CamelCase splitting and canonicalization - Uses Set-based Jaccard similarity for token overlap scoring - Modular design allows custom dictionaries and configurable thresholds 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../src/dictionaries/global.concepts.json | 17 ++ .../dictionaryMatch/dictionaryMatch.test.ts | 40 ++++ .../src/dictionaryMatch/dictionaryMatch.ts | 182 ++++++++++++++++++ .../schema-bench/src/dictionaryMatch/index.ts | 1 + .../schema-bench/src/evaluate/evaluate.ts | 3 + .../schema-bench/src/index.ts | 1 + 6 files changed, 244 insertions(+) create mode 100644 frontend/internal-packages/schema-bench/src/dictionaries/global.concepts.json create mode 100644 frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.test.ts create mode 100644 frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts create mode 100644 frontend/internal-packages/schema-bench/src/dictionaryMatch/index.ts diff --git a/frontend/internal-packages/schema-bench/src/dictionaries/global.concepts.json b/frontend/internal-packages/schema-bench/src/dictionaries/global.concepts.json new file mode 100644 index 0000000000..accde6548f --- /dev/null +++ b/frontend/internal-packages/schema-bench/src/dictionaries/global.concepts.json @@ -0,0 +1,17 @@ +{ + "concepts": [ + { + "id": "customer_info", + "aliases": ["customer_info", "client_data"] + }, + { + "id": "order_detail", + "aliases": ["order_details", "order_items", "purchase_items"] + }, + { + "id": "user_profile", + "aliases": ["user_profiles", "account_info"] + } + ], + "generic_tokens": ["info", "data", "table", "record", "list"] +} diff --git a/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.test.ts b/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.test.ts new file mode 100644 index 0000000000..e9a844a399 --- /dev/null +++ b/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.test.ts @@ -0,0 +1,40 @@ +import { describe, expect, it } from 'vitest' +import { buildConceptIndex, dictionaryMatch } from './dictionaryMatch' + +describe('dictionaryMatch (concept-based)', () => { + it('matches domain synonyms via concept aliases (customer_info ↔ client_data)', () => { + const reference = ['customer_info'] + const predict = ['client_data'] + const mapping: Record = {} + const index = buildConceptIndex() // load default + dictionaryMatch(reference, predict, mapping, undefined, index) + expect(mapping).toEqual({ customer_info: 'client_data' }) + }) + + it('matches industry expressions (order_details ↔ purchase_items)', () => { + const reference = ['order_details'] + const predict = ['purchase_items'] + const mapping: Record = {} + const index = buildConceptIndex() + dictionaryMatch(reference, predict, mapping, undefined, index) + expect(mapping).toEqual({ order_details: 'purchase_items' }) + }) + + it('matches abbreviation vs formal name (user_profiles ↔ account_info)', () => { + const reference = ['user_profiles'] + const predict = ['account_info'] + const mapping: Record = {} + const index = buildConceptIndex() + dictionaryMatch(reference, predict, mapping, undefined, index) + expect(mapping).toEqual({ user_profiles: 'account_info' }) + }) + + it('does not falsely match unrelated (client_log ↔ customer_info)', () => { + const reference = ['customer_info'] + const predict = ['client_log'] + const mapping: Record = {} + const index = buildConceptIndex() + dictionaryMatch(reference, predict, mapping, undefined, index) + expect(mapping).toEqual({}) + }) +}) diff --git a/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts b/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts new file mode 100644 index 0000000000..08162df6f0 --- /dev/null +++ b/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts @@ -0,0 +1,182 @@ +/** + * Dictionary (Concept) Matching + * + * Minimal Option B implementation: concept ID based matching using + * a pre-registered dictionary of alias groups. If a reference and + * a candidate resolve to the same concept, we map them with priority + * over other strategies. + */ +import fs from 'node:fs' +import path from 'node:path' +import { fileURLToPath } from 'node:url' + +type Mapping = Record + +type Concept = { + id: string + aliases: string[] + scope?: string[] +} + +type ConceptDictFile = { + concepts: Concept[] + generic_tokens?: string[] +} + +export type ConceptIndex = { + aliasToConcept: Map + conceptToAliases: Map + genericTokens: Set +} + +function normalizeAlias(s: string): string { + // Basic canonicalization: lowercase, split CamelCase, replace non-alnum with underscore, trim underscores + const camelSplit = s.replace(/([a-z0-9])([A-Z])/g, '$1_$2') + const lowered = camelSplit.toLowerCase() + const replaced = lowered.replace(/[^a-z0-9]+/g, '_') + return replaced.replace(/^_+|_+$/g, '') +} + +function toTokens(s: string): string[] { + return normalizeAlias(s).split('_').filter(Boolean) +} + +function jaccard(a: Set, b: Set): number { + const inter = [...a].filter((x) => b.has(x)).length + const uni = new Set([...a, ...b]).size + return uni === 0 ? 0 : inter / uni +} + +function readJson(filePath: string): ConceptDictFile | null { + try { + const raw = fs.readFileSync(filePath, 'utf8') + const parsed = JSON.parse(raw) + return parsed + } catch { + return null + } +} + +/** + * Load default dictionary shipped with this package. + */ +function loadDefaultConceptDict(): ConceptDictFile | null { + // Resolve relative to this module file so tests/CLI can locate it reliably + const __dirnameLocal = fileURLToPath(new URL('.', import.meta.url)) + const dictPath = path.resolve( + __dirnameLocal, + '../dictionaries/global.concepts.json', + ) + return readJson(dictPath) +} + +// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: TODO: Refactor to reduce complexity +export function buildConceptIndex(dicts?: ConceptDictFile[]): ConceptIndex { + const aliasToConcept = new Map() + const conceptToAliases = new Map() + const genericTokens = new Set() + + const loaded: ConceptDictFile[] = [] + if (dicts && dicts.length > 0) { + loaded.push(...dicts) + } else { + const def = loadDefaultConceptDict() + if (def) loaded.push(def) + } + + for (const dict of loaded) { + if (dict.generic_tokens) { + for (const t of dict.generic_tokens) genericTokens.add(normalizeAlias(t)) + } + for (const c of dict.concepts) { + const normAliases = c.aliases.map((a) => normalizeAlias(a)) + conceptToAliases.set(c.id, normAliases) + for (const a of normAliases) aliasToConcept.set(a, c.id) + } + } + + return { aliasToConcept, conceptToAliases, genericTokens } +} + +export type DictionaryMatchOptions = { + // threshold for token-based concept inference (fallback when direct alias not found) + tokenJaccardThreshold?: number // default 0.8 +} + +function inferConceptByTokens( + name: string, + index: ConceptIndex, + opts?: DictionaryMatchOptions, +): string | null { + const tokens = toTokens(name).filter((t) => !index.genericTokens.has(t)) + const tokenSet = new Set(tokens) + let best: { id: string; score: number } | null = null + for (const [id, aliases] of index.conceptToAliases) { + for (const al of aliases) { + const alTokens = toTokens(al).filter((t) => !index.genericTokens.has(t)) + const score = jaccard(tokenSet, new Set(alTokens)) + if (!best || score > best.score) best = { id, score } + } + } + const threshold = opts?.tokenJaccardThreshold ?? 0.8 + return best && best.score >= threshold ? best.id : null +} + +export function conceptOf( + name: string, + index: ConceptIndex, + opts?: DictionaryMatchOptions, +): string | null { + const normalized = normalizeAlias(name) + const byAlias = index.aliasToConcept.get(normalized) + if (byAlias) return byAlias + return inferConceptByTokens(name, index, opts) +} + +/** + * dictionaryMatch: Assign mappings for pairs that share the same concept. + */ + +// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: TODO: Refactor to reduce complexity +export function dictionaryMatch( + references: string[], + candidates: string[], + mapping: Mapping, + options?: DictionaryMatchOptions, + index?: ConceptIndex, +): void { + const localIndex = index ?? buildConceptIndex() + + // Track used candidates to avoid duplicates + const used = new Set(Object.values(mapping)) + + // Precompute candidate concepts + const candConcepts = new Map() + for (const c of candidates) { + candConcepts.set(c, conceptOf(c, localIndex, options)) + } + + for (const ref of references) { + if (mapping[ref] !== undefined) continue + const refConcept = conceptOf(ref, localIndex, options) + if (!refConcept) continue + + // Choose the first unused candidate with the same concept; if multiple, prefer higher token overlap + let bestCand: { name: string; score: number } | null = null + for (const cand of candidates) { + if (used.has(cand)) continue + const cConcept = candConcepts.get(cand) + if (!cConcept || cConcept !== refConcept) continue + // Score by token Jaccard (ignoring generic tokens) + const s = jaccard( + new Set(toTokens(ref).filter((t) => !localIndex.genericTokens.has(t))), + new Set(toTokens(cand).filter((t) => !localIndex.genericTokens.has(t))), + ) + if (!bestCand || s > bestCand.score) bestCand = { name: cand, score: s } + } + if (bestCand) { + mapping[ref] = bestCand.name + used.add(bestCand.name) + } + } +} diff --git a/frontend/internal-packages/schema-bench/src/dictionaryMatch/index.ts b/frontend/internal-packages/schema-bench/src/dictionaryMatch/index.ts new file mode 100644 index 0000000000..e9ee610717 --- /dev/null +++ b/frontend/internal-packages/schema-bench/src/dictionaryMatch/index.ts @@ -0,0 +1 @@ +export * from './dictionaryMatch' diff --git a/frontend/internal-packages/schema-bench/src/evaluate/evaluate.ts b/frontend/internal-packages/schema-bench/src/evaluate/evaluate.ts index b32a52bc72..61dbf60bba 100644 --- a/frontend/internal-packages/schema-bench/src/evaluate/evaluate.ts +++ b/frontend/internal-packages/schema-bench/src/evaluate/evaluate.ts @@ -14,6 +14,7 @@ import type { ForeignKeyConstraint, Schema } from '@liam-hq/schema' import { foreignKeyConstraintSchema } from '@liam-hq/schema' import * as v from 'valibot' +import { dictionaryMatch } from '../dictionaryMatch/dictionaryMatch.ts' import { nameSimilarity } from '../nameSimilarity/nameSimilarity.ts' import { wordOverlapMatch } from '../wordOverlapMatch/wordOverlapMatch.ts' @@ -42,6 +43,7 @@ const createTableMapping = async ( // NOTE: Implement synonym matching if needed // --- (0) synonym matching + dictionaryMatch(referenceTableNames, predictTableNames, tableMapping) // --- (1) name similarity matching await nameSimilarity(referenceTableNames, predictTableNames, tableMapping) @@ -82,6 +84,7 @@ const createColumnMapping = async ( // NOTE: Implement synonym matching if needed // --- (0) synonym matching + dictionaryMatch(referenceColumnNames, predictColumnNames, columnMapping) // --- (1) name similarity matching await nameSimilarity(referenceColumnNames, predictColumnNames, columnMapping) diff --git a/frontend/internal-packages/schema-bench/src/index.ts b/frontend/internal-packages/schema-bench/src/index.ts index 39408b5c94..631bc62d97 100644 --- a/frontend/internal-packages/schema-bench/src/index.ts +++ b/frontend/internal-packages/schema-bench/src/index.ts @@ -1,3 +1,4 @@ +export * from './dictionaryMatch' export * from './evaluate' export type { OpenAIExecutorConfig, From adc32aa4ef0afec27f753110814fca0177d2cfa5 Mon Sep 17 00:00:00 2001 From: IkedaNoritaka <50833174+NoritakaIkeda@users.noreply.github.com> Date: Mon, 29 Sep 2025 18:29:17 +0900 Subject: [PATCH 2/5] =?UTF-8?q?=E2=9C=A8(schema-bench):=20expand=20global?= =?UTF-8?q?=20concepts=20dictionary=20with=20additional=20business=20entit?= =?UTF-8?q?ies?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add new concept mappings for customer account management and data privacy domains: - customer_account: Support for customer/account entity variations - customer_pii: Personal information handling concepts - closure_reason: Account closure and withdrawal reason classifications - account_closure: Account closure and withdrawal event tracking - data_erasure_request: GDPR/privacy compliance data deletion workflows - audit_log: General audit logging concepts This enhances dictionary-based matching for schema evaluation across different business contexts. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../src/dictionaries/global.concepts.json | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/frontend/internal-packages/schema-bench/src/dictionaries/global.concepts.json b/frontend/internal-packages/schema-bench/src/dictionaries/global.concepts.json index accde6548f..a2710ca67c 100644 --- a/frontend/internal-packages/schema-bench/src/dictionaries/global.concepts.json +++ b/frontend/internal-packages/schema-bench/src/dictionaries/global.concepts.json @@ -11,6 +11,50 @@ { "id": "user_profile", "aliases": ["user_profiles", "account_info"] + }, + { + "id": "customer_account", + "aliases": [ + "customers", + "accounts", + "customer_surrogates", + "user_accounts" + ] + }, + { + "id": "customer_pii", + "aliases": ["customer_pii", "account_pii", "user_pii"] + }, + { + "id": "closure_reason", + "aliases": [ + "closure_reason_master", + "withdrawal_reason_dim", + "withdrawal_reasons" + ] + }, + { + "id": "account_closure", + "aliases": [ + "account_closure", + "withdrawal_requests", + "withdrawal_events" + ] + }, + { + "id": "data_erasure_request", + "aliases": [ + "data_erasure_request", + "deletion_job_run", + "deletion_job_item", + "legal_holds", + "erasure_jobs", + "pii_transformations" + ] + }, + { + "id": "audit_log", + "aliases": ["audit_log"] } ], "generic_tokens": ["info", "data", "table", "record", "list"] From 549f66575e24754aa55999b81b901f631fca78ba Mon Sep 17 00:00:00 2001 From: IkedaNoritaka <50833174+NoritakaIkeda@users.noreply.github.com> Date: Mon, 29 Sep 2025 18:34:18 +0900 Subject: [PATCH 3/5] =?UTF-8?q?=F0=9F=90=9B(schema-bench):=20improve=20dic?= =?UTF-8?q?tionary=20matching=20with=20better=20error=20handling=20and=20w?= =?UTF-8?q?arnings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add warning when no concept dictionaries are loaded to improve debugging visibility - Handle duplicate aliases gracefully by keeping first mapping and logging conflicts - Prevent silent failures when dictionary-based matching is disabled - Improve developer experience with clear diagnostic messages This ensures dictionary matching behavior is transparent and debuggable when issues occur. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../src/dictionaryMatch/dictionaryMatch.ts | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts b/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts index 08162df6f0..15c1ebb7cc 100644 --- a/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts +++ b/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts @@ -81,7 +81,14 @@ export function buildConceptIndex(dicts?: ConceptDictFile[]): ConceptIndex { loaded.push(...dicts) } else { const def = loadDefaultConceptDict() - if (def) loaded.push(def) + if (def) { + loaded.push(def) + } else { + // Warn so consumers notice dictionary-based matching is disabled + console.warn( + '[schema-bench] No concept dictionaries loaded; dictionaryMatch will be a no-op.', + ) + } } for (const dict of loaded) { @@ -91,7 +98,17 @@ export function buildConceptIndex(dicts?: ConceptDictFile[]): ConceptIndex { for (const c of dict.concepts) { const normAliases = c.aliases.map((a) => normalizeAlias(a)) conceptToAliases.set(c.id, normAliases) - for (const a of normAliases) aliasToConcept.set(a, c.id) + for (const a of normAliases) { + const prev = aliasToConcept.get(a) + if (prev && prev !== c.id) { + // Keep the first mapping deterministically and warn for visibility + console.warn( + `[schema-bench] Duplicate alias "${a}" for concepts "${prev}" and "${c.id}". Keeping "${prev}".`, + ) + continue + } + aliasToConcept.set(a, c.id) + } } } From ed15b73122a7a97c0a7df6f38092fc0f10032c64 Mon Sep 17 00:00:00 2001 From: IkedaNoritaka <50833174+NoritakaIkeda@users.noreply.github.com> Date: Mon, 29 Sep 2025 22:06:04 +0900 Subject: [PATCH 4/5] =?UTF-8?q?=E2=99=BB=EF=B8=8F(schema-bench):=20refacto?= =?UTF-8?q?r=20dictionary=20matching=20system=20with=20cleaner=20architect?= =?UTF-8?q?ure?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Simplify buildConceptIndex to use only default dictionary loading - Update function signatures and variable names for better clarity - Add comprehensive error handling and warnings for dictionary operations - Update test file to match new function signatures - Clean up code structure for better maintainability This refactoring improves the dictionary matching system's reliability and makes the codebase more maintainable. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../src/dictionaries/global.concepts.json | 9 +- .../dictionaryMatch/dictionaryMatch.test.ts | 14 +- .../src/dictionaryMatch/dictionaryMatch.ts | 178 ++++++++++-------- 3 files changed, 105 insertions(+), 96 deletions(-) diff --git a/frontend/internal-packages/schema-bench/src/dictionaries/global.concepts.json b/frontend/internal-packages/schema-bench/src/dictionaries/global.concepts.json index a2710ca67c..cdcafb81c5 100644 --- a/frontend/internal-packages/schema-bench/src/dictionaries/global.concepts.json +++ b/frontend/internal-packages/schema-bench/src/dictionaries/global.concepts.json @@ -35,11 +35,7 @@ }, { "id": "account_closure", - "aliases": [ - "account_closure", - "withdrawal_requests", - "withdrawal_events" - ] + "aliases": ["account_closure", "withdrawal_requests", "withdrawal_events"] }, { "id": "data_erasure_request", @@ -56,6 +52,5 @@ "id": "audit_log", "aliases": ["audit_log"] } - ], - "generic_tokens": ["info", "data", "table", "record", "list"] + ] } diff --git a/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.test.ts b/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.test.ts index e9a844a399..b1a0932656 100644 --- a/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.test.ts +++ b/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.test.ts @@ -1,13 +1,12 @@ import { describe, expect, it } from 'vitest' -import { buildConceptIndex, dictionaryMatch } from './dictionaryMatch' +import { dictionaryMatch } from './dictionaryMatch' describe('dictionaryMatch (concept-based)', () => { it('matches domain synonyms via concept aliases (customer_info ↔ client_data)', () => { const reference = ['customer_info'] const predict = ['client_data'] const mapping: Record = {} - const index = buildConceptIndex() // load default - dictionaryMatch(reference, predict, mapping, undefined, index) + dictionaryMatch(reference, predict, mapping) expect(mapping).toEqual({ customer_info: 'client_data' }) }) @@ -15,8 +14,7 @@ describe('dictionaryMatch (concept-based)', () => { const reference = ['order_details'] const predict = ['purchase_items'] const mapping: Record = {} - const index = buildConceptIndex() - dictionaryMatch(reference, predict, mapping, undefined, index) + dictionaryMatch(reference, predict, mapping) expect(mapping).toEqual({ order_details: 'purchase_items' }) }) @@ -24,8 +22,7 @@ describe('dictionaryMatch (concept-based)', () => { const reference = ['user_profiles'] const predict = ['account_info'] const mapping: Record = {} - const index = buildConceptIndex() - dictionaryMatch(reference, predict, mapping, undefined, index) + dictionaryMatch(reference, predict, mapping) expect(mapping).toEqual({ user_profiles: 'account_info' }) }) @@ -33,8 +30,7 @@ describe('dictionaryMatch (concept-based)', () => { const reference = ['customer_info'] const predict = ['client_log'] const mapping: Record = {} - const index = buildConceptIndex() - dictionaryMatch(reference, predict, mapping, undefined, index) + dictionaryMatch(reference, predict, mapping) expect(mapping).toEqual({}) }) }) diff --git a/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts b/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts index 15c1ebb7cc..938e7862f5 100644 --- a/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts +++ b/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts @@ -1,7 +1,7 @@ /** * Dictionary (Concept) Matching * - * Minimal Option B implementation: concept ID based matching using + * concept ID based matching using * a pre-registered dictionary of alias groups. If a reference and * a candidate resolve to the same concept, we map them with priority * over other strategies. @@ -10,23 +10,41 @@ import fs from 'node:fs' import path from 'node:path' import { fileURLToPath } from 'node:url' +/** + * Mapping from reference name to candidate name. + * Mutated in place by `dictionaryMatch` to assign matches. + */ type Mapping = Record +/** + * A single concept entry defined in a dictionary. + * Represents a stable concept ID and its alias names. + */ type Concept = { + /** Stable identifier used as the match key across aliases. */ id: string + /** Names/aliases that refer to the same concept (before normalization). */ aliases: string[] + /** Optional contextual scope labels (not used by matching currently). */ scope?: string[] } -type ConceptDictFile = { +/** + * On-disk dictionary file shape as parsed from JSON. + */ +type ConceptDictionaryFile = { + /** List of concept definitions with their aliases. */ concepts: Concept[] - generic_tokens?: string[] } +/** + * In-memory index built from one or more dictionaries for fast lookup. + */ export type ConceptIndex = { + /** Normalized alias → concept ID. */ aliasToConcept: Map + /** Concept ID → list of normalized aliases. */ conceptToAliases: Map - genericTokens: Set } function normalizeAlias(s: string): string { @@ -47,7 +65,7 @@ function jaccard(a: Set, b: Set): number { return uni === 0 ? 0 : inter / uni } -function readJson(filePath: string): ConceptDictFile | null { +function readJsonFile(filePath: string): ConceptDictionaryFile | null { try { const raw = fs.readFileSync(filePath, 'utf8') const parsed = JSON.parse(raw) @@ -60,98 +78,98 @@ function readJson(filePath: string): ConceptDictFile | null { /** * Load default dictionary shipped with this package. */ -function loadDefaultConceptDict(): ConceptDictFile | null { +function loadDefaultConceptDictionary(): ConceptDictionaryFile | null { // Resolve relative to this module file so tests/CLI can locate it reliably const __dirnameLocal = fileURLToPath(new URL('.', import.meta.url)) - const dictPath = path.resolve( + const dictionaryFilePath = path.resolve( __dirnameLocal, '../dictionaries/global.concepts.json', ) - return readJson(dictPath) + return readJsonFile(dictionaryFilePath) } -// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: TODO: Refactor to reduce complexity -export function buildConceptIndex(dicts?: ConceptDictFile[]): ConceptIndex { +export function buildConceptIndex(): ConceptIndex { const aliasToConcept = new Map() const conceptToAliases = new Map() - const genericTokens = new Set() - const loaded: ConceptDictFile[] = [] - if (dicts && dicts.length > 0) { - loaded.push(...dicts) + const selectedDictionaries: ConceptDictionaryFile[] = [] + const defaultDictionary = loadDefaultConceptDictionary() + if (defaultDictionary) { + selectedDictionaries.push(defaultDictionary) } else { - const def = loadDefaultConceptDict() - if (def) { - loaded.push(def) - } else { - // Warn so consumers notice dictionary-based matching is disabled - console.warn( - '[schema-bench] No concept dictionaries loaded; dictionaryMatch will be a no-op.', - ) - } + // Warn so consumers notice dictionary-based matching is disabled + console.warn( + '[schema-bench] No concept dictionaries loaded; dictionaryMatch will be a no-op.', + ) } - for (const dict of loaded) { - if (dict.generic_tokens) { - for (const t of dict.generic_tokens) genericTokens.add(normalizeAlias(t)) - } - for (const c of dict.concepts) { - const normAliases = c.aliases.map((a) => normalizeAlias(a)) - conceptToAliases.set(c.id, normAliases) - for (const a of normAliases) { - const prev = aliasToConcept.get(a) - if (prev && prev !== c.id) { + for (const dictionary of selectedDictionaries) { + for (const concept of dictionary.concepts) { + const normalizedAliases = concept.aliases.map((alias) => + normalizeAlias(alias), + ) + conceptToAliases.set(concept.id, normalizedAliases) + for (const normalizedAlias of normalizedAliases) { + const existingConceptId = aliasToConcept.get(normalizedAlias) + if (existingConceptId && existingConceptId !== concept.id) { // Keep the first mapping deterministically and warn for visibility console.warn( - `[schema-bench] Duplicate alias "${a}" for concepts "${prev}" and "${c.id}". Keeping "${prev}".`, + `[schema-bench] Duplicate alias "${normalizedAlias}" for concepts "${existingConceptId}" and "${concept.id}". Keeping "${existingConceptId}".`, ) continue } - aliasToConcept.set(a, c.id) + aliasToConcept.set(normalizedAlias, concept.id) } } } - return { aliasToConcept, conceptToAliases, genericTokens } -} - -export type DictionaryMatchOptions = { - // threshold for token-based concept inference (fallback when direct alias not found) - tokenJaccardThreshold?: number // default 0.8 + return { aliasToConcept, conceptToAliases } } function inferConceptByTokens( name: string, index: ConceptIndex, - opts?: DictionaryMatchOptions, ): string | null { - const tokens = toTokens(name).filter((t) => !index.genericTokens.has(t)) + const tokens = toTokens(name) const tokenSet = new Set(tokens) let best: { id: string; score: number } | null = null for (const [id, aliases] of index.conceptToAliases) { - for (const al of aliases) { - const alTokens = toTokens(al).filter((t) => !index.genericTokens.has(t)) - const score = jaccard(tokenSet, new Set(alTokens)) + for (const alias of aliases) { + const aliasTokens = toTokens(alias) + const score = jaccard(tokenSet, new Set(aliasTokens)) if (!best || score > best.score) best = { id, score } } } - const threshold = opts?.tokenJaccardThreshold ?? 0.8 + const threshold = 0.8 return best && best.score >= threshold ? best.id : null } -export function conceptOf( - name: string, - index: ConceptIndex, - opts?: DictionaryMatchOptions, -): string | null { +export function conceptOf(name: string, index: ConceptIndex): string | null { const normalized = normalizeAlias(name) - const byAlias = index.aliasToConcept.get(normalized) - if (byAlias) return byAlias - return inferConceptByTokens(name, index, opts) + const conceptByAlias = index.aliasToConcept.get(normalized) + if (conceptByAlias) return conceptByAlias + return inferConceptByTokens(name, index) } /** - * dictionaryMatch: Assign mappings for pairs that share the same concept. + * Assign candidate names to reference names when they share the same concept. + * + * Behavior: + * - Resolves a concept for each name using a dictionary-backed index. + * - First tries direct alias lookup (normalized). + * - Falls back to token-based inference using Jaccard similarity. + * - Iterates references in order; for each unmapped reference, selects the first unused + * candidate with the same concept. When multiple exist, prefers higher token overlap. + * - Mutates `mapping` in place as `mapping[reference] = candidate`. + * + * Parameters: + * - `references`: Reference names to be mapped. + * - `candidates`: Candidate names to map from. + * - `mapping`: Existing partial mapping; respected as-is and extended. + * + * Notes: + * - Each candidate is used at most once. + * - References with no resolvable concept remain unmapped. */ // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: TODO: Refactor to reduce complexity @@ -159,41 +177,41 @@ export function dictionaryMatch( references: string[], candidates: string[], mapping: Mapping, - options?: DictionaryMatchOptions, - index?: ConceptIndex, ): void { - const localIndex = index ?? buildConceptIndex() + const localIndex = buildConceptIndex() // Track used candidates to avoid duplicates - const used = new Set(Object.values(mapping)) + const usedCandidates = new Set(Object.values(mapping)) // Precompute candidate concepts - const candConcepts = new Map() - for (const c of candidates) { - candConcepts.set(c, conceptOf(c, localIndex, options)) + const candidateConceptMap = new Map() + for (const candidate of candidates) { + candidateConceptMap.set(candidate, conceptOf(candidate, localIndex)) } - for (const ref of references) { - if (mapping[ref] !== undefined) continue - const refConcept = conceptOf(ref, localIndex, options) - if (!refConcept) continue + for (const reference of references) { + if (mapping[reference] !== undefined) continue + const referenceConceptId = conceptOf(reference, localIndex) + if (!referenceConceptId) continue // Choose the first unused candidate with the same concept; if multiple, prefer higher token overlap - let bestCand: { name: string; score: number } | null = null - for (const cand of candidates) { - if (used.has(cand)) continue - const cConcept = candConcepts.get(cand) - if (!cConcept || cConcept !== refConcept) continue - // Score by token Jaccard (ignoring generic tokens) - const s = jaccard( - new Set(toTokens(ref).filter((t) => !localIndex.genericTokens.has(t))), - new Set(toTokens(cand).filter((t) => !localIndex.genericTokens.has(t))), + let bestCandidate: { name: string; score: number } | null = null + for (const candidate of candidates) { + if (usedCandidates.has(candidate)) continue + const candidateConceptId = candidateConceptMap.get(candidate) + if (!candidateConceptId || candidateConceptId !== referenceConceptId) + continue + // Score by token Jaccard + const similarityScore = jaccard( + new Set(toTokens(reference)), + new Set(toTokens(candidate)), ) - if (!bestCand || s > bestCand.score) bestCand = { name: cand, score: s } + if (!bestCandidate || similarityScore > bestCandidate.score) + bestCandidate = { name: candidate, score: similarityScore } } - if (bestCand) { - mapping[ref] = bestCand.name - used.add(bestCand.name) + if (bestCandidate) { + mapping[reference] = bestCandidate.name + usedCandidates.add(bestCandidate.name) } } } From 553aa58427d27a7f80c8e5f4180c5ad84eea4fb4 Mon Sep 17 00:00:00 2001 From: IkedaNoritaka <50833174+NoritakaIkeda@users.noreply.github.com> Date: Tue, 30 Sep 2025 09:52:26 +0900 Subject: [PATCH 5/5] =?UTF-8?q?=F0=9F=94=A7(schema-bench):=20refactor=20fu?= =?UTF-8?q?nction=20naming=20and=20enhance=20concept=20aliases?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename function from conceptOf to resolveConceptId for better clarity - Update parameter names: references → referenceNames, candidates → predictNames for precision - Add closure_reason alias to improve concept matching coverage - Improve variable naming throughout dictionaryMatch function for better readability This refactoring enhances code clarity while maintaining the same functionality. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../src/dictionaries/global.concepts.json | 11 +++- .../src/dictionaryMatch/dictionaryMatch.ts | 66 +++++++++---------- 2 files changed, 39 insertions(+), 38 deletions(-) diff --git a/frontend/internal-packages/schema-bench/src/dictionaries/global.concepts.json b/frontend/internal-packages/schema-bench/src/dictionaries/global.concepts.json index cdcafb81c5..002274b48e 100644 --- a/frontend/internal-packages/schema-bench/src/dictionaries/global.concepts.json +++ b/frontend/internal-packages/schema-bench/src/dictionaries/global.concepts.json @@ -6,15 +6,21 @@ }, { "id": "order_detail", - "aliases": ["order_details", "order_items", "purchase_items"] + "aliases": [ + "order_detail", + "order_details", + "order_items", + "purchase_items" + ] }, { "id": "user_profile", - "aliases": ["user_profiles", "account_info"] + "aliases": ["user_profile", "user_profiles", "account_info"] }, { "id": "customer_account", "aliases": [ + "customer_account", "customers", "accounts", "customer_surrogates", @@ -28,6 +34,7 @@ { "id": "closure_reason", "aliases": [ + "closure_reason", "closure_reason_master", "withdrawal_reason_dim", "withdrawal_reasons" diff --git a/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts b/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts index 938e7862f5..b9811fb042 100644 --- a/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts +++ b/frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts @@ -40,7 +40,7 @@ type ConceptDictionaryFile = { /** * In-memory index built from one or more dictionaries for fast lookup. */ -export type ConceptIndex = { +type ConceptIndex = { /** Normalized alias → concept ID. */ aliasToConcept: Map /** Concept ID → list of normalized aliases. */ @@ -88,7 +88,7 @@ function loadDefaultConceptDictionary(): ConceptDictionaryFile | null { return readJsonFile(dictionaryFilePath) } -export function buildConceptIndex(): ConceptIndex { +function buildConceptIndex(): ConceptIndex { const aliasToConcept = new Map() const conceptToAliases = new Map() @@ -144,7 +144,7 @@ function inferConceptByTokens( return best && best.score >= threshold ? best.id : null } -export function conceptOf(name: string, index: ConceptIndex): string | null { +function resolveConceptId(name: string, index: ConceptIndex): string | null { const normalized = normalizeAlias(name) const conceptByAlias = index.aliasToConcept.get(normalized) if (conceptByAlias) return conceptByAlias @@ -152,66 +152,60 @@ export function conceptOf(name: string, index: ConceptIndex): string | null { } /** - * Assign candidate names to reference names when they share the same concept. - * - * Behavior: - * - Resolves a concept for each name using a dictionary-backed index. - * - First tries direct alias lookup (normalized). - * - Falls back to token-based inference using Jaccard similarity. - * - Iterates references in order; for each unmapped reference, selects the first unused - * candidate with the same concept. When multiple exist, prefers higher token overlap. - * - Mutates `mapping` in place as `mapping[reference] = candidate`. + * Assign predicted names to reference names when they share the same concept. * * Parameters: * - `references`: Reference names to be mapped. - * - `candidates`: Candidate names to map from. + * - `predictNames`: Predicted names to map from. * - `mapping`: Existing partial mapping; respected as-is and extended. * * Notes: - * - Each candidate is used at most once. + * - Each predicted name is used at most once. * - References with no resolvable concept remain unmapped. */ // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: TODO: Refactor to reduce complexity export function dictionaryMatch( - references: string[], - candidates: string[], + referenceNames: string[], + predictNames: string[], mapping: Mapping, ): void { const localIndex = buildConceptIndex() // Track used candidates to avoid duplicates - const usedCandidates = new Set(Object.values(mapping)) + const usedPredictNames = new Set(Object.values(mapping)) // Precompute candidate concepts - const candidateConceptMap = new Map() - for (const candidate of candidates) { - candidateConceptMap.set(candidate, conceptOf(candidate, localIndex)) + const predictConceptIdByName = new Map() + for (const predictName of predictNames) { + predictConceptIdByName.set( + predictName, + resolveConceptId(predictName, localIndex), + ) } - for (const reference of references) { - if (mapping[reference] !== undefined) continue - const referenceConceptId = conceptOf(reference, localIndex) + for (const referenceName of referenceNames) { + if (mapping[referenceName] !== undefined) continue + const referenceConceptId = resolveConceptId(referenceName, localIndex) if (!referenceConceptId) continue // Choose the first unused candidate with the same concept; if multiple, prefer higher token overlap - let bestCandidate: { name: string; score: number } | null = null - for (const candidate of candidates) { - if (usedCandidates.has(candidate)) continue - const candidateConceptId = candidateConceptMap.get(candidate) - if (!candidateConceptId || candidateConceptId !== referenceConceptId) - continue + let bestPredict: { name: string; score: number } | null = null + for (const predictName of predictNames) { + if (usedPredictNames.has(predictName)) continue + const predictConceptId = predictConceptIdByName.get(predictName) + if (!predictConceptId || predictConceptId !== referenceConceptId) continue // Score by token Jaccard const similarityScore = jaccard( - new Set(toTokens(reference)), - new Set(toTokens(candidate)), + new Set(toTokens(referenceName)), + new Set(toTokens(predictName)), ) - if (!bestCandidate || similarityScore > bestCandidate.score) - bestCandidate = { name: candidate, score: similarityScore } + if (!bestPredict || similarityScore > bestPredict.score) + bestPredict = { name: predictName, score: similarityScore } } - if (bestCandidate) { - mapping[reference] = bestCandidate.name - usedCandidates.add(bestCandidate.name) + if (bestPredict) { + mapping[referenceName] = bestPredict.name + usedPredictNames.add(bestPredict.name) } } }