Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
{
"concepts": [
{
"id": "customer_info",
"aliases": ["customer_info", "client_data"]
},
{
"id": "order_detail",
"aliases": [
"order_detail",
"order_details",
"order_items",
"purchase_items"
]
},
{
"id": "user_profile",
"aliases": ["user_profile", "user_profiles", "account_info"]
},
{
"id": "customer_account",
"aliases": [
"customer_account",
"customers",
"accounts",
"customer_surrogates",
"user_accounts"
]
},
{
"id": "customer_pii",
"aliases": ["customer_pii", "account_pii", "user_pii"]
},
{
"id": "closure_reason",
"aliases": [
"closure_reason",
"closure_reason_master",
"withdrawal_reason_dim",
"withdrawal_reasons"
]
},
{
"id": "account_closure",
"aliases": ["account_closure", "withdrawal_requests", "withdrawal_events"]
},
{
"id": "data_erasure_request",
"aliases": [
"data_erasure_request",
"deletion_job_run",
"deletion_job_item",
"legal_holds",
"erasure_jobs",
"pii_transformations"
]
},
{
"id": "audit_log",
"aliases": ["audit_log"]
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import { describe, expect, it } from 'vitest'
import { dictionaryMatch } from './dictionaryMatch'

describe('dictionaryMatch (concept-based)', () => {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

it('matches domain synonyms via concept aliases (customer_info ↔ client_data)', () => {
const reference = ['customer_info']
const predict = ['client_data']
const mapping: Record<string, string> = {}
dictionaryMatch(reference, predict, mapping)
expect(mapping).toEqual({ customer_info: 'client_data' })
})

it('matches industry expressions (order_details ↔ purchase_items)', () => {
const reference = ['order_details']
const predict = ['purchase_items']
const mapping: Record<string, string> = {}
dictionaryMatch(reference, predict, mapping)
expect(mapping).toEqual({ order_details: 'purchase_items' })
})

it('matches abbreviation vs formal name (user_profiles ↔ account_info)', () => {
const reference = ['user_profiles']
const predict = ['account_info']
const mapping: Record<string, string> = {}
dictionaryMatch(reference, predict, mapping)
expect(mapping).toEqual({ user_profiles: 'account_info' })
})

it('does not falsely match unrelated (client_log ↔ customer_info)', () => {
const reference = ['customer_info']
const predict = ['client_log']
const mapping: Record<string, string> = {}
dictionaryMatch(reference, predict, mapping)
expect(mapping).toEqual({})
})
})
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
/**
* Dictionary (Concept) Matching
*
* Concept-ID-based matching using a pre-registered dictionary of alias
* groups. If a reference name and a candidate name resolve to the same
* concept, they are mapped to each other here, ahead of the other
* matching strategies.
*/
import fs from 'node:fs'
import path from 'node:path'
import { fileURLToPath } from 'node:url'

/**
* Mapping from reference name to candidate (predicted) name.
* Mutated in place by `dictionaryMatch`: new matches are added as properties,
* while reference names that already have an entry are left untouched.
*/
type Mapping = Record<string, string>

/**
* A single concept entry defined in a dictionary.
* Represents a stable concept ID and its alias names.
*/
type Concept = {
/** Stable identifier used as the match key across aliases. */
id: string
/**
* Names/aliases that refer to the same concept, as written in the dictionary.
* Each one is passed through `normalizeAlias` when the index is built.
*/
aliases: string[]
/** Optional contextual scope labels (not used by matching currently). */
scope?: string[]
}

/**
* On-disk dictionary file shape as parsed from JSON
* (the bundled `global.concepts.json` is read into this shape).
*/
type ConceptDictionaryFile = {
/** List of concept definitions with their aliases. */
concepts: Concept[]
}

/**
* In-memory index built from one or more dictionaries for fast lookup.
*/
type ConceptIndex = {
/**
* Normalized alias → concept ID.
* When the same normalized alias is listed under several concepts, the
* concept registered first keeps it.
*/
aliasToConcept: Map<string, string>
/**
* Concept ID → list of normalized aliases.
* Iterated by the token-overlap fallback when no exact alias matches.
*/
conceptToAliases: Map<string, string[]>
}

/**
 * Canonicalize a name to lowercase snake_case made of `a-z`, `0-9` and `_`.
 *
 * @param s - Raw table/column/alias name.
 * @returns The canonical form; empty when `s` holds no ASCII alphanumerics.
 */
function normalizeAlias(s: string): string {
  return s
    // Mark camelCase boundaries: "userId" -> "user_Id"
    .replace(/([a-z0-9])([A-Z])/g, '$1_$2')
    .toLowerCase()
    // Every run of other characters collapses into a single underscore
    .replace(/[^a-z0-9]+/g, '_')
    // No leading or trailing underscores in the result
    .replace(/^_+|_+$/g, '')
}

/**
 * Split a name into its normalized, non-empty word tokens.
 *
 * @param s - Raw name; it is normalized with `normalizeAlias` first.
 * @returns Tokens in order of appearance (empty array when nothing remains).
 */
function toTokens(s: string): string[] {
  const tokens: string[] = []
  for (const piece of normalizeAlias(s).split('_')) {
    // Splitting an empty string yields [''], so skip empty pieces
    if (piece) tokens.push(piece)
  }
  return tokens
}

/**
 * Jaccard similarity of two token sets: |a ∩ b| / |a ∪ b|.
 *
 * @returns A value in [0, 1]; 0 when both sets are empty.
 */
function jaccard(a: Set<string>, b: Set<string>): number {
  let shared = 0
  for (const token of a) {
    if (b.has(token)) shared += 1
  }
  // Inclusion–exclusion gives the union size without building the union
  const unionSize = a.size + b.size - shared
  if (unionSize === 0) return 0
  return shared / unionSize
}

/**
 * Structural check that a parsed JSON value looks like a dictionary file:
 * an object whose `concepts` is an array of `{ id: string, aliases: string[] }`.
 */
function isConceptDictionaryFile(
  value: unknown,
): value is ConceptDictionaryFile {
  if (typeof value !== 'object' || value === null) return false
  const concepts: unknown = (value as { concepts?: unknown }).concepts
  if (!Array.isArray(concepts)) return false
  return concepts.every(
    (concept: unknown) =>
      typeof concept === 'object' &&
      concept !== null &&
      typeof (concept as { id?: unknown }).id === 'string' &&
      Array.isArray((concept as { aliases?: unknown }).aliases) &&
      (concept as { aliases: unknown[] }).aliases.every(
        (alias) => typeof alias === 'string',
      ),
  )
}

/**
 * Read and parse a concept dictionary from disk.
 *
 * @param filePath - Path of the JSON file to read.
 * @returns The parsed dictionary, or `null` when the file cannot be read,
 *   is not valid JSON, or does not have the expected dictionary shape.
 */
function readJsonFile(filePath: string): ConceptDictionaryFile | null {
  let parsed: unknown
  try {
    parsed = JSON.parse(fs.readFileSync(filePath, 'utf8'))
  } catch {
    // Best effort: a missing/unreadable/invalid file just means "no dictionary"
    return null
  }
  // Reject well-formed JSON of the wrong shape here, so index building
  // never iterates over a missing or malformed `concepts` list.
  return isConceptDictionaryFile(parsed) ? parsed : null
}

/**
 * Read the concept dictionary bundled with this package.
 *
 * @returns The result of `readJsonFile` for the bundled file, i.e. `null`
 *   when it could not be loaded.
 */
function loadDefaultConceptDictionary(): ConceptDictionaryFile | null {
  // Anchor on this module's own directory (not the process cwd) so tests
  // and the CLI find the same file
  const moduleDir = path.dirname(fileURLToPath(import.meta.url))
  return readJsonFile(
    path.resolve(moduleDir, '../dictionaries/global.concepts.json'),
  )
}

function buildConceptIndex(): ConceptIndex {
const aliasToConcept = new Map<string, string>()
const conceptToAliases = new Map<string, string[]>()

const selectedDictionaries: ConceptDictionaryFile[] = []
const defaultDictionary = loadDefaultConceptDictionary()
if (defaultDictionary) {
selectedDictionaries.push(defaultDictionary)
} else {
// Warn so consumers notice dictionary-based matching is disabled
console.warn(
'[schema-bench] No concept dictionaries loaded; dictionaryMatch will be a no-op.',
)
Comment on lines +101 to +103
Copy link

Copilot AI Sep 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] Using console.warn for library warnings may not be appropriate in all environments. Consider using a proper logging framework or making the warning behavior configurable.

Copilot uses AI. Check for mistakes.

}

for (const dictionary of selectedDictionaries) {
for (const concept of dictionary.concepts) {
const normalizedAliases = concept.aliases.map((alias) =>
normalizeAlias(alias),
)
conceptToAliases.set(concept.id, normalizedAliases)
for (const normalizedAlias of normalizedAliases) {
const existingConceptId = aliasToConcept.get(normalizedAlias)
if (existingConceptId && existingConceptId !== concept.id) {
// Keep the first mapping deterministically and warn for visibility
console.warn(
`[schema-bench] Duplicate alias "${normalizedAlias}" for concepts "${existingConceptId}" and "${concept.id}". Keeping "${existingConceptId}".`,
)
Comment on lines +116 to +118
Copy link

Copilot AI Sep 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] Using console.warn for duplicate alias warnings may not be appropriate in all environments. Consider using a proper logging framework or making the warning behavior configurable.

Copilot uses AI. Check for mistakes.

continue
}
aliasToConcept.set(normalizedAlias, concept.id)
}
}
}

return { aliasToConcept, conceptToAliases }
}

/**
 * Fallback concept lookup by token overlap.
 *
 * Scores `name` against every alias of every concept with token-set Jaccard
 * similarity and keeps the first alias that reaches the highest score.
 *
 * @param name - Name to classify.
 * @param index - Concept index to search.
 * @returns The best concept's ID when its score is at least 0.8, else `null`.
 */
function inferConceptByTokens(
  name: string,
  index: ConceptIndex,
): string | null {
  const nameTokens = new Set(toTokens(name))
  let topId: string | null = null
  // Jaccard scores are never negative, so -1 means "nothing scored yet"
  let topScore = -1
  for (const [conceptId, aliases] of index.conceptToAliases) {
    for (const alias of aliases) {
      const overlap = jaccard(nameTokens, new Set(toTokens(alias)))
      // Strictly greater: on ties the earlier concept/alias stays the winner
      if (overlap > topScore) {
        topScore = overlap
        topId = conceptId
      }
    }
  }
  if (topScore < 0.8) return null
  return topId
}
Comment on lines +129 to +145
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

Verify: Is generic token filtering intentionally omitted?

The PR objectives mention "Generic token filtering to remove common tokens (info, data, table, etc.)" as a core feature, but inferConceptByTokens uses all tokens from the input without filtering. Was this feature deferred, or should common tokens be filtered before computing Jaccard similarity?

Run this script to check if generic token filtering exists elsewhere:


🏁 Script executed:

#!/bin/bash
# Description: Search for generic token filtering implementation

# Search for filter/blacklist patterns related to common tokens
rg -n -C3 'info|data|table' --type ts -g 'schema-bench/**' | rg -i 'filter|blacklist|stopword|generic.*token'

# Search for any token filtering logic
ast-grep --pattern 'const $FILTER_TOKENS = [$$$]'

# Check if filtering is mentioned in tests
rg -n 'generic.*token|common.*token|filter.*token' --type ts -g '**/dictionaryMatch.test.ts'

Length of output: 69155


Implement generic token filtering or update PR objectives

Function inferConceptByTokens (frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts:129-145) currently includes all tokens when computing Jaccard similarity, without removing common tokens (e.g., info, data, table) as described in the PR objectives. Add filtering of these generic tokens prior to scoring, or remove “Generic token filtering” from the feature list.

🤖 Prompt for AI Agents
In
frontend/internal-packages/schema-bench/src/dictionaryMatch/dictionaryMatch.ts
around lines 129 to 145, inferConceptByTokens currently uses all tokens for
Jaccard scoring but must filter out generic tokens (e.g., "info", "data",
"table") as stated in the PR objectives; implement this by defining a small
set/array of lowercased generic tokens, filter the tokenSet and each aliasTokens
array to remove any tokens in that set (normalize to lower case and strip
empties) before computing the jaccard score, then continue the existing
scoring/threshold logic (or, if you prefer not to implement filtering, remove
"Generic token filtering" from the feature list in the PR instead).


/**
 * Resolve a name to a concept ID.
 *
 * An exact match on the normalized alias wins; otherwise the token-overlap
 * fallback (`inferConceptByTokens`) decides.
 *
 * @returns The concept ID, or `null` when neither strategy finds one.
 */
function resolveConceptId(name: string, index: ConceptIndex): string | null {
  const exactMatch = index.aliasToConcept.get(normalizeAlias(name))
  return exactMatch || inferConceptByTokens(name, index)
}

/**
* Assign predicted names to reference names when they share the same concept.
*
* Parameters:
* - `referenceNames`: Reference names to be mapped.
* - `predictNames`: Predicted names to map from.
* - `mapping`: Existing partial mapping; respected as-is and extended.
*
* Notes:
* - Each predicted name is used at most once.
* - References with no resolvable concept remain unmapped.
*/

// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: TODO: Refactor to reduce complexity
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally, I'd prefer not to have any new code created that violates the rules, but this time it's acceptable as is.

export function dictionaryMatch(
  referenceNames: string[],
  predictNames: string[],
  mapping: Mapping,
): void {
  // The index is built once and reused: building it reads and parses the
  // dictionary file synchronously and may log warnings, and this function
  // is called repeatedly (tables, then columns).
  const localIndex = getConceptIndex()

  // Candidates already assigned by an earlier strategy must not be reused
  const usedPredictNames = new Set(Object.values(mapping))

  // Resolve each candidate's concept once, up front
  const predictConceptIdByName = new Map<string, string | null>()
  for (const predictName of predictNames) {
    predictConceptIdByName.set(
      predictName,
      resolveConceptId(predictName, localIndex),
    )
  }

  for (const referenceName of referenceNames) {
    // Existing assignments are respected as-is
    if (mapping[referenceName] !== undefined) continue
    const referenceConceptId = resolveConceptId(referenceName, localIndex)
    if (!referenceConceptId) continue

    // Invariant across the candidate loop, so tokenize the reference once
    const referenceTokens = new Set(toTokens(referenceName))

    // Among unused candidates of the same concept, take the one with the
    // highest token overlap; on ties the earliest candidate wins
    let bestPredict: { name: string; score: number } | null = null
    for (const predictName of predictNames) {
      if (usedPredictNames.has(predictName)) continue
      const predictConceptId = predictConceptIdByName.get(predictName)
      if (!predictConceptId || predictConceptId !== referenceConceptId) continue
      const similarityScore = jaccard(
        referenceTokens,
        new Set(toTokens(predictName)),
      )
      if (!bestPredict || similarityScore > bestPredict.score)
        bestPredict = { name: predictName, score: similarityScore }
    }
    if (bestPredict) {
      mapping[referenceName] = bestPredict.name
      usedPredictNames.add(bestPredict.name)
    }
  }
}

/** Concept index shared by all `dictionaryMatch` calls; built on first use. */
let cachedConceptIndex: ConceptIndex | null = null

/**
 * Return the shared concept index, building it on the first call.
 */
function getConceptIndex(): ConceptIndex {
  if (cachedConceptIndex === null) {
    cachedConceptIndex = buildConceptIndex()
  }
  return cachedConceptIndex
}
Comment on lines +167 to +211
Copy link

Copilot AI Sep 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The TODO comment indicates acknowledged technical debt. Consider breaking down this function into smaller, more focused functions to improve maintainability and readability.

Suggested change
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: TODO: Refactor to reduce complexity
export function dictionaryMatch(
referenceNames: string[],
predictNames: string[],
mapping: Mapping,
): void {
const localIndex = buildConceptIndex()
// Track used candidates to avoid duplicates
const usedPredictNames = new Set(Object.values(mapping))
// Precompute candidate concepts
const predictConceptIdByName = new Map<string, string | null>()
for (const predictName of predictNames) {
predictConceptIdByName.set(
predictName,
resolveConceptId(predictName, localIndex),
)
}
for (const referenceName of referenceNames) {
if (mapping[referenceName] !== undefined) continue
const referenceConceptId = resolveConceptId(referenceName, localIndex)
if (!referenceConceptId) continue
// Choose the first unused candidate with the same concept; if multiple, prefer higher token overlap
let bestPredict: { name: string; score: number } | null = null
for (const predictName of predictNames) {
if (usedPredictNames.has(predictName)) continue
const predictConceptId = predictConceptIdByName.get(predictName)
if (!predictConceptId || predictConceptId !== referenceConceptId) continue
// Score by token Jaccard
const similarityScore = jaccard(
new Set(toTokens(referenceName)),
new Set(toTokens(predictName)),
)
if (!bestPredict || similarityScore > bestPredict.score)
bestPredict = { name: predictName, score: similarityScore }
}
if (bestPredict) {
mapping[referenceName] = bestPredict.name
usedPredictNames.add(bestPredict.name)
}
}
}
// Refactored to reduce complexity by extracting helper functions
export function dictionaryMatch(
referenceNames: string[],
predictNames: string[],
mapping: Mapping,
): void {
const localIndex = buildConceptIndex()
const usedPredictNames = new Set(Object.values(mapping))
const predictConceptIdByName = precomputePredictConceptIds(predictNames, localIndex)
for (const referenceName of referenceNames) {
if (mapping[referenceName] !== undefined) continue
const referenceConceptId = resolveConceptId(referenceName, localIndex)
if (!referenceConceptId) continue
const bestPredict = findBestPredict(
referenceName,
referenceConceptId,
predictNames,
predictConceptIdByName,
usedPredictNames,
)
if (bestPredict) {
mapping[referenceName] = bestPredict
usedPredictNames.add(bestPredict)
}
}
}
/**
* Precompute concept IDs for each predicted name.
*/
function precomputePredictConceptIds(
predictNames: string[],
index: ConceptIndex,
): Map<string, string | null> {
const predictConceptIdByName = new Map<string, string | null>()
for (const predictName of predictNames) {
predictConceptIdByName.set(
predictName,
resolveConceptId(predictName, index),
)
}
return predictConceptIdByName
}
/**
* Find the best unused predicted name for a reference, matching the concept ID and maximizing token overlap.
*/
function findBestPredict(
referenceName: string,
referenceConceptId: string,
predictNames: string[],
predictConceptIdByName: Map<string, string | null>,
usedPredictNames: Set<string>,
): string | null {
let bestPredict: { name: string; score: number } | null = null
for (const predictName of predictNames) {
if (usedPredictNames.has(predictName)) continue
const predictConceptId = predictConceptIdByName.get(predictName)
if (!predictConceptId || predictConceptId !== referenceConceptId) continue
const similarityScore = jaccard(
new Set(toTokens(referenceName)),
new Set(toTokens(predictName)),
)
if (!bestPredict || similarityScore > bestPredict.score)
bestPredict = { name: predictName, score: similarityScore }
}
return bestPredict ? bestPredict.name : null
}

Copilot uses AI. Check for mistakes.

Comment on lines +168 to +211
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Optimize: Cache or inject the concept index to avoid rebuilding.

Line 173 calls buildConceptIndex() on every invocation of dictionaryMatch, which rereads and reparses the dictionary file and rebuilds the maps. If this function is called multiple times during evaluation (e.g., for tables and columns separately), this becomes a significant performance bottleneck.

Consider one of these approaches:

Option 1: Module-level cache (simplest)

+let cachedIndex: ConceptIndex | null = null
+
 export function dictionaryMatch(
   referenceNames: string[],
   predictNames: string[],
   mapping: Mapping,
 ): void {
-  const localIndex = buildConceptIndex()
+  if (!cachedIndex) {
+    cachedIndex = buildConceptIndex()
+  }
+  const localIndex = cachedIndex

Option 2: Accept index as parameter (more flexible)

 export function dictionaryMatch(
   referenceNames: string[],
   predictNames: string[],
   mapping: Mapping,
+  index?: ConceptIndex,
 ): void {
-  const localIndex = buildConceptIndex()
+  const localIndex = index ?? buildConceptIndex()
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
export function dictionaryMatch(
referenceNames: string[],
predictNames: string[],
mapping: Mapping,
): void {
const localIndex = buildConceptIndex()
// Track used candidates to avoid duplicates
const usedPredictNames = new Set(Object.values(mapping))
// Precompute candidate concepts
const predictConceptIdByName = new Map<string, string | null>()
for (const predictName of predictNames) {
predictConceptIdByName.set(
predictName,
resolveConceptId(predictName, localIndex),
)
}
for (const referenceName of referenceNames) {
if (mapping[referenceName] !== undefined) continue
const referenceConceptId = resolveConceptId(referenceName, localIndex)
if (!referenceConceptId) continue
// Choose the first unused candidate with the same concept; if multiple, prefer higher token overlap
let bestPredict: { name: string; score: number } | null = null
for (const predictName of predictNames) {
if (usedPredictNames.has(predictName)) continue
const predictConceptId = predictConceptIdByName.get(predictName)
if (!predictConceptId || predictConceptId !== referenceConceptId) continue
// Score by token Jaccard
const similarityScore = jaccard(
new Set(toTokens(referenceName)),
new Set(toTokens(predictName)),
)
if (!bestPredict || similarityScore > bestPredict.score)
bestPredict = { name: predictName, score: similarityScore }
}
if (bestPredict) {
mapping[referenceName] = bestPredict.name
usedPredictNames.add(bestPredict.name)
}
}
}
// Add a one-time cache for the concept index
let cachedIndex: ConceptIndex | null = null
export function dictionaryMatch(
referenceNames: string[],
predictNames: string[],
mapping: Mapping,
): void {
// Reuse the cached index if available
if (!cachedIndex) {
cachedIndex = buildConceptIndex()
}
const localIndex = cachedIndex
// Track used candidates to avoid duplicates
const usedPredictNames = new Set(Object.values(mapping))
// Precompute candidate concepts
const predictConceptIdByName = new Map<string, string | null>()
for (const predictName of predictNames) {
predictConceptIdByName.set(
predictName,
resolveConceptId(predictName, localIndex),
)
}
for (const referenceName of referenceNames) {
if (mapping[referenceName] !== undefined) continue
const referenceConceptId = resolveConceptId(referenceName, localIndex)
if (!referenceConceptId) continue
// Choose the first unused candidate with the same concept; if multiple, prefer higher token overlap
let bestPredict: { name: string; score: number } | null = null
for (const predictName of predictNames) {
if (usedPredictNames.has(predictName)) continue
const predictConceptId = predictConceptIdByName.get(predictName)
if (!predictConceptId || predictConceptId !== referenceConceptId) continue
// Score by token Jaccard
const similarityScore = jaccard(
new Set(toTokens(referenceName)),
new Set(toTokens(predictName)),
)
if (!bestPredict || similarityScore > bestPredict.score)
bestPredict = { name: predictName, score: similarityScore }
}
if (bestPredict) {
mapping[referenceName] = bestPredict.name
usedPredictNames.add(bestPredict.name)
}
}
}

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
export * from './dictionaryMatch'
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import type { ForeignKeyConstraint, Schema } from '@liam-hq/schema'
import { foreignKeyConstraintSchema } from '@liam-hq/schema'
import * as v from 'valibot'
import { dictionaryMatch } from '../dictionaryMatch/dictionaryMatch.ts'
import { nameSimilarity } from '../nameSimilarity/nameSimilarity.ts'
import { wordOverlapMatch } from '../wordOverlapMatch/wordOverlapMatch.ts'

Expand Down Expand Up @@ -42,6 +43,7 @@ const createTableMapping = async (

// NOTE: Implement synonym matching if needed
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment should have been implemented this time. Shouldn't you delete it?

// --- (0) synonym matching
dictionaryMatch(referenceTableNames, predictTableNames, tableMapping)

// --- (1) name similarity matching
await nameSimilarity(referenceTableNames, predictTableNames, tableMapping)
Expand Down Expand Up @@ -82,6 +84,7 @@ const createColumnMapping = async (

// NOTE: Implement synonym matching if needed
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment should have been implemented this time. Shouldn't you delete it?

// --- (0) synonym matching
dictionaryMatch(referenceColumnNames, predictColumnNames, columnMapping)

// --- (1) name similarity matching
await nameSimilarity(referenceColumnNames, predictColumnNames, columnMapping)
Expand Down
1 change: 1 addition & 0 deletions frontend/internal-packages/schema-bench/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
export * from './dictionaryMatch'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this export necessary?

export * from './evaluate'
export type {
OpenAIExecutorConfig,
Expand Down