diff --git a/.changeset/fast-joins-redesign.md b/.changeset/fast-joins-redesign.md new file mode 100644 index 000000000..ef9e20e35 --- /dev/null +++ b/.changeset/fast-joins-redesign.md @@ -0,0 +1,5 @@ +--- +"@tanstack/db-ivm": patch +--- + +Redesign of the join operators with direct algorithms for major performance improvements, replacing composition-based joins (inner+anti) with an implementation that uses mass tracking. Delivers significant performance gains while maintaining full correctness for all join types (inner, left, right, full, anti). diff --git a/.changeset/odd-mangos-pick.md b/.changeset/odd-mangos-pick.md new file mode 100644 index 000000000..f9f839817 --- /dev/null +++ b/.changeset/odd-mangos-pick.md @@ -0,0 +1,5 @@ +--- +"@tanstack/db-ivm": patch +--- + +Change the ivm indexes to use a three-level `key->prefix->hash->value` structure, only falling back to structural hashing when there are multiple values for a single prefix. This removes all hashing during the initial run of a query, delivering a 2-3x speedup. diff --git a/packages/db-ivm/src/hashIndex.ts b/packages/db-ivm/src/hashIndex.ts deleted file mode 100644 index cc9df1b0c..000000000 --- a/packages/db-ivm/src/hashIndex.ts +++ /dev/null @@ -1,94 +0,0 @@ -import { DefaultMap } from "./utils.js" -import { hash } from "./hashing/index.js" -import type { Hash } from "./hashing/index.js" - -/** - * A map from a difference collection trace's keys -> (value, multiplicities) that changed. - * Used in operations like join and reduce where the operation needs to - * exploit the key-value structure of the data to run efficiently. 
- */ -export class HashIndex { - #inner: DefaultMap> - - constructor() { - this.#inner = new DefaultMap>( - () => new DefaultMap(() => [undefined as any as V, 0]) - ) - // #inner is as map of: - // { - // [key]: { - // [hash(value)]: [value, multiplicity] - // } - // } - } - - toString(indent = false): string { - return `HashIndex(${JSON.stringify( - [...this.#inner].map(([k, valueMap]) => [k, [...valueMap]]), - undefined, - indent ? 2 : undefined - )})` - } - - get(key: K): Array<[V, number]> { - const valueMap = this.#inner.get(key) - return [...valueMap.values()] - } - - getMultiplicity(key: K, value: V): number { - const valueMap = this.#inner.get(key) - const valueHash = hash(value) - const [, multiplicity] = valueMap.get(valueHash) - return multiplicity - } - - entries() { - return this.#inner.entries() - } - - *entriesIterator(): Generator<[K, [V, number]]> { - for (const [key, valueMap] of this.#inner.entries()) { - for (const [_valueHash, [value, multiplicity]] of valueMap.entries()) { - yield [key, [value, multiplicity]] - } - } - } - - has(key: K): boolean { - return this.#inner.has(key) - } - - delete(key: K): void { - this.#inner.delete(key) - } - - get size(): number { - return this.#inner.size - } - - /** - * Adds a value to the index and does not return anything - * except if the addition caused the value to be removed - * and the key to be left with only a single value. - * In that case, we return the single remaining value. 
- */ - addValue(key: K, value: [V, number]): [V, number] | void { - const [val, multiplicity] = value - const valueMap = this.#inner.get(key) - const valueHash = hash(val) - const [, existingMultiplicity] = valueMap.get(valueHash) - const newMultiplicity = existingMultiplicity + multiplicity - if (multiplicity !== 0) { - if (newMultiplicity === 0) { - valueMap.delete(valueHash) - if (valueMap.size === 1) { - // Signal that the key only has a single remaining value - return valueMap.entries().next().value![1] - } - } else { - valueMap.set(valueHash, [val, newMultiplicity]) - } - } - this.#inner.set(key, valueMap) - } -} diff --git a/packages/db-ivm/src/indexes.ts b/packages/db-ivm/src/indexes.ts index 27131fc29..3c52614eb 100644 --- a/packages/db-ivm/src/indexes.ts +++ b/packages/db-ivm/src/indexes.ts @@ -1,60 +1,238 @@ +/** + * # Optimized Index Data Structure + * + * Multi-level index that adapts storage strategy based on data patterns to minimize memory + * usage, eliminate wasteful lookups, and avoid hashing whenever possible. + * + * ## Storage Strategy + * + * **Single value**: `IndexMap['key'] → [value, multiplicity]` (no hashing needed) + * + * **Multiple unprefixed values**: Direct ValueMap (avoids NO_PREFIX lookup) + * ``` + * IndexMap['key'] → ValueMap { hash(value1) → [value1, mult1], ... } + * ``` + * + * **Values with prefixes**: PrefixMap uses prefix keys directly (no hashing) + * ``` + * IndexMap['key'] → PrefixMap { 'prefix1' → [value1, mult1], NO_PREFIX → ValueMap{...} } + * ``` + * + * **Multiple values per prefix**: ValueMap within PrefixMap (hash only suffixes) + * ``` + * PrefixMap['prefix'] → ValueMap { hash(suffix1) → [full_value1, mult1], ... 
} + * ``` + * + * ## Dynamic Evolution + * + * Structure automatically evolves as data is added: + * - Single → ValueMap (when both values unprefixed) + * - Single → PrefixMap (when at least one prefixed) + * - ValueMap → PrefixMap (adding prefixed value to unprefixed) + * + * Prefixes extracted from array values: `['prefix', 'suffix']` → prefix='prefix' + */ + import { MultiSet } from "./multiset.js" -import { HashIndex } from "./hashIndex.js" -import { ValueIndex } from "./valueIndex.js" -import { concatIterable, mapIterable } from "./utils.js" +import { hash } from "./hashing/index.js" +import type { Hash } from "./hashing/index.js" + +// We use a symbol to represent the absence of a prefix, unprefixed values are stored +// against this key. +const NO_PREFIX = Symbol(`NO_PREFIX`) +type NO_PREFIX = typeof NO_PREFIX + +// A single value is a tuple of the value and the multiplicity. +type SingleValue = [TValue, number] + +// Base map type for the index. Stores single values, prefix maps, or value maps against a key. +type IndexMap = Map< + TKey, + SingleValue | PrefixMap | ValueMap +> + +// Second level map type for the index, stores single values or value maps against a prefix. +class PrefixMap extends Map< + TPrefix | NO_PREFIX, + SingleValue | ValueMap +> { + /** + * Add a value to the PrefixMap. Returns true if the map becomes empty after the operation. 
+ */ + addValue(value: TValue, multiplicity: number): boolean { + if (multiplicity === 0) return this.size === 0 + + const prefix = getPrefix(value) + const valueMapOrSingleValue = this.get(prefix) + + if (isSingleValue(valueMapOrSingleValue)) { + const [currentValue, currentMultiplicity] = valueMapOrSingleValue + const currentPrefix = getPrefix(currentValue) + + if (currentPrefix !== prefix) { + throw new Error(`Mismatching prefixes, this should never happen`) + } + + if (currentValue === value || hash(currentValue) === hash(value)) { + // Same value, update multiplicity + const newMultiplicity = currentMultiplicity + multiplicity + if (newMultiplicity === 0) { + this.delete(prefix) + } else { + this.set(prefix, [value, newMultiplicity]) + } + } else { + // Different suffixes, need to create ValueMap + const valueMap = new ValueMap() + valueMap.set(hash(currentValue), valueMapOrSingleValue) + valueMap.set(hash(value), [value, multiplicity]) + this.set(prefix, valueMap) + } + } else if (valueMapOrSingleValue === undefined) { + // No existing value for this prefix + this.set(prefix, [value, multiplicity]) + } else { + // Existing ValueMap + const isEmpty = valueMapOrSingleValue.addValue(value, multiplicity) + if (isEmpty) { + this.delete(prefix) + } + } + + return this.size === 0 + } +} + +// Third level map type for the index, stores single values or value maps against a hash. +class ValueMap extends Map { + /** + * Add a value to the ValueMap. Returns true if the map becomes empty after the operation. 
+ * @param value - The full value to store + * @param multiplicity - The multiplicity to add + * @returns True if the map is empty after the operation, false otherwise. + */ + addValue(value: TValue, multiplicity: number): boolean { + if (multiplicity === 0) return this.size === 0 + + const key = hash(value) + const currentValue = this.get(key) + + if (currentValue) { + const [, currentMultiplicity] = currentValue + const newMultiplicity = currentMultiplicity + multiplicity + if (newMultiplicity === 0) { + this.delete(key) + } else { + this.set(key, [value, newMultiplicity]) + } + } else { + this.set(key, [value, multiplicity]) + } + + return this.size === 0 + } +} /** * A map from a difference collection trace's keys -> (value, multiplicities) that changed. * Used in operations like join and reduce where the operation needs to * exploit the key-value structure of the data to run efficiently. */ -export class Index { +export class Index { /* - * This is a hybrid Index that composes a ValueIndex and a HashIndex. - * Keys that have only one value are stored in the ValueIndex. - * Keys that have multiple values are stored in the HashIndex, the hash distinguishes between the values. - * This reduces the amount of hashes we need to compute since often times only a small portion of the keys are updated - * so we don't have to hash the keys that are never updated. - * - * Note: The `valueIndex` and `hashIndex` have disjoint keys. - * When a key that has only one value gets a new distinct value, - * it is added to the `hashIndex` and removed from the `valueIndex` and vice versa. 
+ * This index maintains a nested map of keys -> (value, multiplicities), where: + * - initially the values are stored against the key as a single value tuple + * - when a key gets additional values, the values are stored against the key in a + * prefix map (or directly in a value map when none of them has a prefix) + * - the prefix is extracted where possible from values that are structured as + * [rowPrimaryKey, rowValue], as they are in the TanStack DB query pipeline. + * - only when there are multiple values for a given prefix do we fall back to a + * hash to identify identical values, storing them in a third level value map. */ - #valueIndex: ValueIndex - #hashIndex: HashIndex + #inner: IndexMap constructor() { - this.#valueIndex = new ValueIndex() - this.#hashIndex = new HashIndex() + this.#inner = new Map() } + /** + * This method returns a string representation of the index. + * @param indent - Whether to indent the string representation. + * @returns A string representation of the index. + */ toString(indent = false): string { - return `Index(\n ${this.#valueIndex.toString(indent)},\n ${this.#hashIndex.toString(indent)}\n)` + return `Index(${JSON.stringify( + [...this.entries()], + undefined, + indent ? 2 : undefined + )})` } - get(key: K): Array<[V, number]> { - if (this.#valueIndex.has(key)) { - return [this.#valueIndex.get(key)!] - } - return this.#hashIndex.get(key) + /** + * The size of the index. + */ + get size(): number { + return this.#inner.size + } + + /** + * This method checks if the index has a given key. + * @param key - The key to check. + * @returns True if the index has the key, false otherwise. + */ + has(key: TKey): boolean { + return this.#inner.has(key) } - getMultiplicity(key: K, value: V): number { - if (this.#valueIndex.has(key)) { - return this.#valueIndex.getMultiplicity(key) + /** + * This method returns all values for a given key. + * @param key - The key to get the values for. + * @returns An array of value tuples [value, multiplicity]. 
+ */ + get(key: TKey): Array<[TValue, number]> { + return [...this.getIterator(key)] + } + + /** + * This method returns an iterator over all values for a given key. + * @param key - The key to get the values for. + * @returns An iterator of value tuples [value, multiplicity]. + */ + *getIterator(key: TKey): Iterable<[TValue, number]> { + const mapOrSingleValue = this.#inner.get(key) + if (isSingleValue(mapOrSingleValue)) { + yield mapOrSingleValue + } else if (mapOrSingleValue === undefined) { + return + } else if (mapOrSingleValue instanceof ValueMap) { + // Direct ValueMap - all values have NO_PREFIX + for (const valueTuple of mapOrSingleValue.values()) { + yield valueTuple + } + } else { + // PrefixMap - iterate through all prefixes + for (const singleValueOrValueMap of mapOrSingleValue.values()) { + if (isSingleValue(singleValueOrValueMap)) { + yield singleValueOrValueMap + } else { + for (const valueTuple of singleValueOrValueMap.values()) { + yield valueTuple + } + } + } } - return this.#hashIndex.getMultiplicity(key, value) } /** * This returns an iterator that iterates over all key-value pairs. * @returns An iterable of all key-value pairs (and their multiplicities) in the index. */ - #entries(): Iterable<[K, [V, number]]> { - return concatIterable( - this.#valueIndex.entries(), - this.#hashIndex.entriesIterator() - ) + *entries(): Iterable<[TKey, [TValue, number]]> { + for (const key of this.#inner.keys()) { + for (const valueTuple of this.getIterator(key)) { + yield [key, valueTuple] + } + } } /** @@ -63,90 +241,156 @@ export class Index { * It returns an iterator that you can use if you need to iterate over the values for a given key. * @returns An iterator of all *keys* in the index and their corresponding value iterator. 
*/ - *#entriesIterators(): Iterable<[K, Iterable<[V, number]>]> { - for (const [key, [value, multiplicity]] of this.#valueIndex.entries()) { - yield [key, new Map([[value, multiplicity]])] - } - for (const [key, valueMap] of this.#hashIndex.entries()) { - yield [ - key, - mapIterable(valueMap, ([_hash, [value, multiplicity]]) => [ - value, - multiplicity, - ]), - ] + *entriesIterators(): Iterable<[TKey, Iterable<[TValue, number]>]> { + for (const key of this.#inner.keys()) { + yield [key, this.getIterator(key)] } } - has(key: K): boolean { - return this.#valueIndex.has(key) || this.#hashIndex.has(key) - } + /** + * This method adds a value to the index. + * @param key - The key to add the value to. + * @param valueTuple - The value tuple [value, multiplicity] to add to the index. + */ + addValue(key: TKey, valueTuple: SingleValue) { + const [value, multiplicity] = valueTuple + // If the multiplicity is 0, do nothing + if (multiplicity === 0) return - get size(): number { - return this.#valueIndex.size + this.#hashIndex.size - } + const mapOrSingleValue = this.#inner.get(key) - addValue(key: K, value: [V, number]): void { - const containedInValueIndex = this.#valueIndex.has(key) - const containedInHashIndex = this.#hashIndex.has(key) + if (mapOrSingleValue === undefined) { + // First value for this key + this.#inner.set(key, valueTuple) + return + } - if (containedInHashIndex && containedInValueIndex) { - throw new Error( - `Key ${key} is contained in both the value index and the hash index. 
This should never happen because they should have disjoint keysets.` + if (isSingleValue(mapOrSingleValue)) { + // Handle transition from single value to map + this.#handleSingleValueTransition( + key, + mapOrSingleValue, + value, + multiplicity ) + return } - if (!containedInValueIndex && !containedInHashIndex) { - // This is the first time we see the key - // Add it to the value index - this.#valueIndex.addValue(key, value) - return + if (mapOrSingleValue instanceof ValueMap) { + // Handle existing ValueMap + const prefix = getPrefix(value) + if (prefix !== NO_PREFIX) { + // Convert ValueMap to PrefixMap since we have a prefixed value + const prefixMap = new PrefixMap() + prefixMap.set(NO_PREFIX, mapOrSingleValue) + prefixMap.set(prefix, valueTuple) + this.#inner.set(key, prefixMap) + } else { + // Add to existing ValueMap + const isEmpty = mapOrSingleValue.addValue(value, multiplicity) + if (isEmpty) { + this.#inner.delete(key) + } + } + } else { + // Handle existing PrefixMap + const isEmpty = mapOrSingleValue.addValue(value, multiplicity) + if (isEmpty) { + this.#inner.delete(key) + } } + } - if (containedInValueIndex) { - // This key is already in the value index - // It could be that it's the same value or a different one - // If it's a different value we will need to remove the key from the value index - // and add the key and its two values to the hash index - try { - this.#valueIndex.addValue(key, value) - } catch { - // This is a different value, need to move the key to the hash index - const existingValue = this.#valueIndex.get(key)! 
- this.#valueIndex.delete(key) - this.#hashIndex.addValue(key, existingValue) - this.#hashIndex.addValue(key, value) + /** + * Handle the transition from a single value to either a ValueMap or PrefixMap + */ + #handleSingleValueTransition( + key: TKey, + currentSingleValue: SingleValue, + newValue: TValue, + multiplicity: number + ) { + const [currentValue, currentMultiplicity] = currentSingleValue + + // Check for exact same value (reference equality) + if (currentValue === newValue) { + const newMultiplicity = currentMultiplicity + multiplicity + if (newMultiplicity === 0) { + this.#inner.delete(key) + } else { + this.#inner.set(key, [newValue, newMultiplicity]) } return } - if (containedInHashIndex) { - // This key is already in the hash index so it already has two or more values. - // However, this new value and multiplicity could cause an existing value to be removed - // and lead to the key having only a single value in which case we need to move it back to the value index - const singleRemainingValue = this.#hashIndex.addValue(key, value) - if (singleRemainingValue) { - // The key only has a single remaining value so we need to move it back to the value index - this.#hashIndex.delete(key) - this.#valueIndex.addValue(key, singleRemainingValue) + // Get prefixes for both values + const newPrefix = getPrefix(newValue) + const currentPrefix = getPrefix(currentValue) + + // Check if they're the same value by prefix/suffix comparison + if ( + currentPrefix === newPrefix && + (currentValue === newValue || hash(currentValue) === hash(newValue)) + ) { + const newMultiplicity = currentMultiplicity + multiplicity + if (newMultiplicity === 0) { + this.#inner.delete(key) + } else { + this.#inner.set(key, [newValue, newMultiplicity]) } return } + + // Different values - choose appropriate map type + if (currentPrefix === NO_PREFIX && newPrefix === NO_PREFIX) { + // Both have NO_PREFIX, use ValueMap directly + const valueMap = new ValueMap() + 
valueMap.set(hash(currentValue), currentSingleValue) + valueMap.set(hash(newValue), [newValue, multiplicity]) + this.#inner.set(key, valueMap) + } else { + // At least one has a prefix, use PrefixMap + const prefixMap = new PrefixMap() + + if (currentPrefix === newPrefix) { + // Same prefix, different suffixes - need ValueMap within PrefixMap + const valueMap = new ValueMap() + valueMap.set(hash(currentValue), currentSingleValue) + valueMap.set(hash(newValue), [newValue, multiplicity]) + prefixMap.set(currentPrefix, valueMap) + } else { + // Different prefixes - store as separate single values + prefixMap.set(currentPrefix, currentSingleValue) + prefixMap.set(newPrefix, [newValue, multiplicity]) + } + + this.#inner.set(key, prefixMap) + } } - append(other: Index): void { - for (const [key, value] of other.#entries()) { + /** + * This method appends another index to the current index. + * @param other - The index to append to the current index. + */ + append(other: Index): void { + for (const [key, value] of other.entries()) { this.addValue(key, value) } } - join(other: Index): MultiSet<[K, [V, V2]]> { - const result: Array<[[K, [V, V2]], number]> = [] - + /** + * This method joins two indexes. + * @param other - The index to join with the current index. + * @returns A multiset of the joined values. + */ + join( + other: Index + ): MultiSet<[TKey, [TValue, TValue2]]> { + const result: Array<[[TKey, [TValue, TValue2]], number]> = [] // We want to iterate over the smaller of the two indexes to reduce the // number of operations we need to do. 
if (this.size <= other.size) { - for (const [key, valueIt] of this.#entriesIterators()) { + for (const [key, valueIt] of this.entriesIterators()) { if (!other.has(key)) continue const otherValues = other.get(key) for (const [val1, mul1] of valueIt) { @@ -158,7 +402,7 @@ export class Index { } } } else { - for (const [key, otherValueIt] of other.#entriesIterators()) { + for (const [key, otherValueIt] of other.entriesIterators()) { if (!this.has(key)) continue const values = this.get(key) for (const [val2, mul2] of otherValueIt) { @@ -174,3 +418,34 @@ export class Index { return new MultiSet(result) } } + +/** + * This function extracts the prefix from a value. + * @param value - The value to extract the prefix from. + * @returns The prefix, or `NO_PREFIX` if the value has no usable prefix. + */ +function getPrefix(value: TValue): TPrefix | NO_PREFIX { + // If the value is an array and the first element is a string, number or bigint, then the + // first element is the prefix. This is used to distinguish between values without + // the need for hashing unless there are multiple values for the same prefix. + if ( + Array.isArray(value) && + (typeof value[0] === `string` || + typeof value[0] === `number` || + typeof value[0] === `bigint`) + ) { + return value[0] as TPrefix + } + return NO_PREFIX +} + +/** + * This function checks if a value is a single value. + * @param value - The value to check. + * @returns True if the value is a single value, false otherwise. 
+ */ +function isSingleValue( + value: SingleValue | unknown +): value is SingleValue { + return Array.isArray(value) +} diff --git a/packages/db-ivm/src/multiset.ts b/packages/db-ivm/src/multiset.ts index 1e793345e..44ba297ed 100644 --- a/packages/db-ivm/src/multiset.ts +++ b/packages/db-ivm/src/multiset.ts @@ -209,6 +209,12 @@ export class MultiSet { chunkedArrayPush(this.#inner, otherArray) } + add(item: T, multiplicity: number): void { + if (multiplicity !== 0) { + this.#inner.push([item, multiplicity]) + } + } + getInner(): MultiSetArray { return this.#inner } diff --git a/packages/db-ivm/src/operators/join.ts b/packages/db-ivm/src/operators/join.ts index 259cfbc05..3fe4ad041 100644 --- a/packages/db-ivm/src/operators/join.ts +++ b/packages/db-ivm/src/operators/join.ts @@ -1,10 +1,57 @@ +/** + * # Direct Join Algorithms for Incremental View Maintenance + * + * High-performance join operations implementing all join types (inner, left, right, full, anti) + * with minimal state and optimized performance. + * + * ## Algorithm + * + * For each tick, the algorithm processes incoming changes (deltas) and emits join results: + * + * 1. **Build deltas**: Extract new/changed/deleted rows from input messages + * 2. **Inner results**: Emit `ΔA⋈B_old + A_old⋈ΔB + ΔA⋈ΔB` (matched pairs) + * 3. **Outer results**: For unmatched rows, emit null-extended tuples: + * - New unmatched rows from deltas (when opposite side empty) + * - Presence transitions: when key goes `0→>0` (retract nulls) or `>0→0` (emit nulls) + * 4. **Update state**: Append deltas to indexes and update mass counters + * + * **Mass tracking** enables O(1) presence checks instead of scanning index buckets. 
+ * + * ## State + * + * **Indexes** store the actual data: + * - `indexA: Index` - all left-side rows accumulated over time + * - `indexB: Index` - all right-side rows accumulated over time + * + * **Mass maps** track presence efficiently: + * - `massA/massB: Map` - sum of multiplicities per key + * - Used for O(1) presence checks: `(mass.get(key) || 0) !== 0` means key exists (zero entries are pruned, so a missing entry counts as 0) + * - Avoids scanning entire index buckets just to check if key has any rows + * + * ## Join Types + * + * - **Inner**: Standard delta terms only + * - **Outer**: Inner results + null-extended unmatched rows with transition handling + * - **Anti**: Unmatched rows only (no inner results) + * + * ## Key Optimizations + * + * - **No temp copying**: Uses `(A⊎ΔA)⋈ΔB = A⋈ΔB ⊎ ΔA⋈ΔB` distributive property + * - **Early-out checks**: Skip phases when no deltas present + * - **Zero-entry pruning**: Keep maps compact, O(distinct keys) memory + * - **Final presence logic**: Avoid emit→retract churn within same tick + * + * ## Correctness + * + * - **Ordering**: Pre-append snapshots for emissions, post-emit state updates + * - **Presence**: Key matched iff mass ≠ 0, transitions trigger null handling + * - **Bag semantics**: Proper multiplicity handling including negatives + */ + import { BinaryOperator, DifferenceStreamWriter } from "../graph.js" import { StreamBuilder } from "../d2.js" import { MultiSet } from "../multiset.js" import { Index } from "../indexes.js" -import { negate } from "./negate.js" -import { map } from "./map.js" -import { concat } from "./concat.js" import type { DifferenceStreamReader } from "../graph.js" import type { IStreamBuilder, KeyValue, PipedOperator } from "../types.js" @@ -14,66 +61,204 @@ import type { IStreamBuilder, KeyValue, PipedOperator } from "../types.js" export type JoinType = `inner` | `left` | `right` | `full` | `anti` /** - * Operator that joins two input streams + * Helper to build delta index and mass map from messages + */ +function buildDelta( + messages: Array 
+): [Index, Map] { + const delta = new Index() + const deltaMass = new Map() + + for (const message of messages) { + const multiSetMessage = message as MultiSet<[K, V]> + for (const [item, multiplicity] of multiSetMessage.getInner()) { + const [key, value] = item + delta.addValue(key, [value, multiplicity]) + + // Keep deltaMass small by deleting zero entries + const next = (deltaMass.get(key) || 0) + multiplicity + if (next === 0) { + deltaMass.delete(key) + } else { + deltaMass.set(key, next) + } + } + } + + return [delta, deltaMass] +} + +/** + * Operator that joins two input streams using direct join algorithms */ export class JoinOperator extends BinaryOperator< - [K, V1] | [K, V2] | [K, [V1, V2]] + [K, V1] | [K, V2] | [K, [V1, V2]] | [K, [V1 | null, V2 | null]] > { #indexA = new Index() #indexB = new Index() + #massA = new Map() // sum of multiplicities per key on side A + #massB = new Map() // sum of multiplicities per key on side B + #mode: JoinType constructor( id: number, inputA: DifferenceStreamReader<[K, V1]>, inputB: DifferenceStreamReader<[K, V2]>, - output: DifferenceStreamWriter<[K, [V1, V2]]> + output: DifferenceStreamWriter, + mode: JoinType = `inner` ) { super(id, inputA, inputB, output) + this.#mode = mode } run(): void { - const deltaA = new Index() - const deltaB = new Index() - - // Process input A - process ALL messages, not just the first one - const messagesA = this.inputAMessages() - for (const message of messagesA) { - const multiSetMessage = message as unknown as MultiSet<[K, V1]> - for (const [item, multiplicity] of multiSetMessage.getInner()) { - const [key, value] = item - deltaA.addValue(key, [value, multiplicity]) - } + // 1) Ingest messages and build deltas (no state mutation yet) + const [deltaA, deltaMassA] = buildDelta(this.inputAMessages()) + const [deltaB, deltaMassB] = buildDelta(this.inputBMessages()) + + // Early-out checks + const hasDeltaA = deltaA.size > 0 + const hasDeltaB = deltaB.size > 0 + const hasDeltaMassA = 
deltaMassA.size > 0 + const hasDeltaMassB = deltaMassB.size > 0 + + // If nothing happened, bail early + if (!(hasDeltaA || hasDeltaB || hasDeltaMassA || hasDeltaMassB)) return + + // Precompute mode flags to avoid repeated string comparisons + const mode = this.#mode + const emitInner = + mode === `inner` || mode === `left` || mode === `right` || mode === `full` + const emitLeftNulls = mode === `left` || mode === `full` + const emitRightNulls = mode === `right` || mode === `full` + const emitAntiLeft = mode === `anti` + + const results = new MultiSet() + + // 2) INNER part (used by inner/left/right/full, but NOT anti) + if (emitInner && (hasDeltaA || hasDeltaB)) { + // Emit the three standard delta terms: DeltaA⋈B_old, A_old⋈DeltaB, DeltaA⋈DeltaB + // This avoids copying the entire left index each tick + if (hasDeltaA) results.extend(deltaA.join(this.#indexB)) + if (hasDeltaB) results.extend(this.#indexA.join(deltaB)) + if (hasDeltaA && hasDeltaB) results.extend(deltaA.join(deltaB)) } - // Process input B - process ALL messages, not just the first one - const messagesB = this.inputBMessages() - for (const message of messagesB) { - const multiSetMessage = message as unknown as MultiSet<[K, V2]> - for (const [item, multiplicity] of multiSetMessage.getInner()) { - const [key, value] = item - deltaB.addValue(key, [value, multiplicity]) + // 3) OUTER/ANTI specifics + + // LEFT side nulls or anti-left (depend only on B's presence) + if ((emitLeftNulls || emitAntiLeft) && (hasDeltaA || hasDeltaMassB)) { + // 3a) New/deleted left rows that are currently unmatched (only if DeltaA changed) + if (hasDeltaA) { + // For initial state, check final presence after applying deltaB + for (const [key, valueIterator] of deltaA.entriesIterators()) { + const finalMassB = + (this.#massB.get(key) || 0) + (deltaMassB.get(key) || 0) + if (finalMassB === 0) { + for (const [value, multiplicity] of valueIterator) { + if (multiplicity !== 0) { + results.add([key, [value, null]], multiplicity) 
+ } + } + } + } + } + + // 3b) Right-side presence transitions (only if some RHS masses changed) + if (hasDeltaMassB) { + for (const [key, deltaMass] of deltaMassB) { + const before = this.#massB.get(key) || 0 + if (deltaMass === 0) continue + const after = before + deltaMass + + // Skip if presence doesn't flip (0->0, >0->different>0) + if ((before === 0) === (after === 0)) continue + + const it = this.#indexA.getIterator(key) + const retract = before === 0 // 0->!0 => retract, else (>0->0) emit + for (const [value, multiplicity] of it) { + if (multiplicity !== 0) { + results.add( + [key, [value, null]], + retract ? -multiplicity : +multiplicity + ) + } + } + } } } - // Process results - const results = new MultiSet<[K, [V1, V2]]>() + // RIGHT side nulls (depend only on A's presence) + if (emitRightNulls && (hasDeltaB || hasDeltaMassA)) { + // 3a) New/deleted right rows that are currently unmatched (only if DeltaB changed) + if (hasDeltaB) { + // For initial state, check final presence after applying deltaA + for (const [key, valueIterator] of deltaB.entriesIterators()) { + const finalMassA = + (this.#massA.get(key) || 0) + (deltaMassA.get(key) || 0) + if (finalMassA === 0) { + for (const [value, multiplicity] of valueIterator) { + if (multiplicity !== 0) { + results.add([key, [null, value]], multiplicity) + } + } + } + } + } + + // 3b) Left-side presence transitions (only if some LHS masses changed) + if (hasDeltaMassA) { + for (const [key, deltaMass] of deltaMassA) { + const before = this.#massA.get(key) || 0 + if (deltaMass === 0) continue + const after = before + deltaMass - // Join deltaA with existing indexB - results.extend(deltaA.join(this.#indexB)) + // Skip if presence doesn't flip (0->0, >0->different>0) + if ((before === 0) === (after === 0)) continue - // Append deltaA to indexA + const it = this.#indexB.getIterator(key) + const retract = before === 0 // 0->!0 => retract, else (>0->0) emit + for (const [value, multiplicity] of it) { + if (multiplicity 
!== 0) { + results.add( + [key, [null, value]], + retract ? -multiplicity : +multiplicity + ) + } + } + } + } + } + + // 4) Commit — update state + // IMPORTANT: All emissions use pre-append snapshots of indexA/indexB. + // For unmatched-on-delta (3a), use final presence (mass + deltaMass) to avoid churn. + // Append deltas and update masses only after all emissions. this.#indexA.append(deltaA) + this.#indexB.append(deltaB) - // Join existing indexA with deltaB - results.extend(this.#indexA.join(deltaB)) + // Update masses and keep maps small by deleting zero entries + for (const [key, deltaMass] of deltaMassA) { + const next = (this.#massA.get(key) || 0) + deltaMass + if (next === 0) { + this.#massA.delete(key) + } else { + this.#massA.set(key, next) + } + } + for (const [key, deltaMass] of deltaMassB) { + const next = (this.#massB.get(key) || 0) + deltaMass + if (next === 0) { + this.#massB.delete(key) + } else { + this.#massB.set(key, next) + } + } // Send results if (results.getInner().length > 0) { this.output.sendData(results) } - - // Append deltaB to indexB - this.#indexB.append(deltaB) } } @@ -91,62 +276,22 @@ export function join< other: IStreamBuilder>, type: JoinType = `inner` ): PipedOperator> { - switch (type) { - case `inner`: - return innerJoin(other) as unknown as PipedOperator< - T, - KeyValue - > - case `anti`: - return antiJoin(other) as unknown as PipedOperator< - T, - KeyValue - > - case `left`: - return leftJoin(other) as unknown as PipedOperator< - T, - KeyValue - > - case `right`: - return rightJoin(other) as unknown as PipedOperator< - T, - KeyValue - > - case `full`: - return fullJoin(other) as unknown as PipedOperator< - T, - KeyValue - > - default: - throw new Error(`Join type ${type} is invalid`) - } -} - -/** - * Joins two input streams - * @param other - The other stream to join with - */ -export function innerJoin< - K, - V1 extends T extends KeyValue ? 
VT : never, - V2, - T, ->( - other: IStreamBuilder> -): PipedOperator> { - return (stream: IStreamBuilder): IStreamBuilder> => { + return ( + stream: IStreamBuilder + ): IStreamBuilder> => { if (stream.graph !== other.graph) { throw new Error(`Cannot join streams from different graphs`) } - const output = new StreamBuilder>( + const output = new StreamBuilder>( stream.graph, - new DifferenceStreamWriter>() + new DifferenceStreamWriter>() ) const operator = new JoinOperator( stream.graph.getNextOperatorId(), stream.connectReader() as DifferenceStreamReader>, other.connectReader(), - output.writer + output.writer, + type ) stream.graph.addOperator(operator) stream.graph.addStream(output.connectReader()) @@ -155,7 +300,25 @@ export function innerJoin< } /** - * Joins two input streams + * Joins two input streams (inner join) + * @param other - The other stream to join with + */ +export function innerJoin< + K, + V1 extends T extends KeyValue ? VT : never, + V2, + T, +>( + other: IStreamBuilder> +): PipedOperator> { + return join(other, `inner`) as unknown as PipedOperator< + T, + KeyValue + > +} + +/** + * Joins two input streams (anti join) * @param other - The other stream to join with */ export function antiJoin< @@ -166,24 +329,14 @@ export function antiJoin< >( other: IStreamBuilder> ): PipedOperator> { - return ( - stream: IStreamBuilder - ): IStreamBuilder> => { - const matchedLeft = stream.pipe( - innerJoin(other), - map(([key, [valueLeft, _valueRight]]) => [key, valueLeft]) - ) - const anti = stream.pipe( - concat(matchedLeft.pipe(negate())), - // @ts-ignore TODO: fix this - map(([key, value]) => [key, [value, null]]) - ) - return anti as IStreamBuilder> - } + return join(other, `anti`) as unknown as PipedOperator< + T, + KeyValue + > } /** - * Joins two input streams + * Joins two input streams (left join) * @param other - The other stream to join with */ export function leftJoin< @@ -194,21 +347,14 @@ export function leftJoin< >( other: IStreamBuilder> ): 
PipedOperator> { - return ( - stream: IStreamBuilder - ): IStreamBuilder> => { - const left = stream - const right = other - const inner = left.pipe(innerJoin(right)) - const anti = left.pipe(antiJoin(right)) - return inner.pipe(concat(anti)) as IStreamBuilder< - KeyValue - > - } + return join(other, `left`) as unknown as PipedOperator< + T, + KeyValue + > } /** - * Joins two input streams + * Joins two input streams (right join) * @param other - The other stream to join with */ export function rightJoin< @@ -219,24 +365,14 @@ export function rightJoin< >( other: IStreamBuilder> ): PipedOperator> { - return ( - stream: IStreamBuilder - ): IStreamBuilder> => { - const left = stream as IStreamBuilder> - const right = other - const inner = left.pipe(innerJoin(right)) - const anti = right.pipe( - antiJoin(left), - map(([key, [a, b]]) => [key, [b, a]]) - ) - return inner.pipe(concat(anti)) as IStreamBuilder< - KeyValue - > - } + return join(other, `right`) as unknown as PipedOperator< + T, + KeyValue + > } /** - * Joins two input streams + * Joins two input streams (full join) * @param other - The other stream to join with */ export function fullJoin< @@ -247,19 +383,8 @@ export function fullJoin< >( other: IStreamBuilder> ): PipedOperator> { - return ( - stream: IStreamBuilder - ): IStreamBuilder> => { - const left = stream as IStreamBuilder> - const right = other - const inner = left.pipe(innerJoin(right)) - const antiLeft = left.pipe(antiJoin(right)) - const antiRight = right.pipe( - antiJoin(left), - map(([key, [a, b]]) => [key, [b, a]]) - ) - return inner.pipe(concat(antiLeft), concat(antiRight)) as IStreamBuilder< - KeyValue - > - } + return join(other, `full`) as unknown as PipedOperator< + T, + KeyValue + > } diff --git a/packages/db-ivm/src/valueIndex.ts b/packages/db-ivm/src/valueIndex.ts deleted file mode 100644 index 2470e7aa8..000000000 --- a/packages/db-ivm/src/valueIndex.ts +++ /dev/null @@ -1,78 +0,0 @@ -import { hash } from "./hashing/index.js" - -/** - 
* A map from a difference collection trace's keys -> (value, multiplicities) that changed. - * Used in operations like join and reduce where the operation needs to - * exploit the key-value structure of the data to run efficiently. - */ -export class ValueIndex { - #inner: Map // Maps key to the value and its multiplicity - - constructor() { - this.#inner = new Map() - } - - toString(indent = false): string { - return `ValueIndex(${JSON.stringify( - [...this.#inner.entries()], - undefined, - indent ? 2 : undefined - )})` - } - - get(key: K): [V, number] | undefined { - return this.#inner.get(key) - } - - getMultiplicity(key: K): number { - return this.get(key)?.[1] ?? 0 - } - - entries() { - return this.#inner.entries() - } - - has(key: K): boolean { - return this.#inner.has(key) - } - - delete(key: K): void { - this.#inner.delete(key) - } - - get size(): number { - return this.#inner.size - } - - addValue(key: K, v: [V, number]): void { - const [value, multiplicity] = v - - if (multiplicity === 0) { - return - } - - if (this.has(key)) { - const [currValue, currMultiplicity] = this.get(key)! - if (hash(value) === hash(currValue)) { - // Update the multiplicity - this.#setMultiplicity(key, value, currMultiplicity + multiplicity) - return - } - // Different value, not allowed. - // ValueIndex only supports one value per key. 
- throw new Error( - `Cannot add value for key ${key} because it already exists in ValueIndex with a different value` - ) - } - - this.#inner.set(key, [value, multiplicity]) - } - - #setMultiplicity(key: K, value: V, multiplicity: number): void { - if (multiplicity === 0) { - this.#inner.delete(key) - } else { - this.#inner.set(key, [value, multiplicity]) - } - } -} diff --git a/packages/db/src/query/compiler/joins.ts b/packages/db/src/query/compiler/joins.ts index 961937ecd..8fc3ed2da 100644 --- a/packages/db/src/query/compiler/joins.ts +++ b/packages/db/src/query/compiler/joins.ts @@ -1,10 +1,4 @@ -import { - consolidate, - filter, - join as joinOperator, - map, - tap, -} from "@tanstack/db-ivm" +import { filter, join as joinOperator, map, tap } from "@tanstack/db-ivm" import { CollectionInputNotFoundError, InvalidJoinCondition, @@ -296,7 +290,6 @@ function processJoin( return mainPipeline.pipe( joinOperator(joinedPipeline, joinClause.type as JoinType), - consolidate(), processJoinResults(joinClause.type) ) }