Introduce Symbolic Constraint Solver for SQL-Driven Data Generation#564
Introduce Symbolic Constraint Solver for SQL-Driven Data Generation#564wmoustafa wants to merge 6 commits intolinkedin:masterfrom
Conversation
There was a problem hiding this comment.
Half way through the README.md. Will continue reading and then proceed to the code.
How does the system in general handle expressions where the values depend on each other.
Eg.
SELECT * FROM test.suitcase WHERE width + height + length < 25
Does this need a new domain type?
| /** | ||
| * Copyright 2025 LinkedIn Corporation. All rights reserved. | ||
| * Licensed under the BSD-2 Clause license. | ||
| * See LICENSE in the project root for license information. | ||
| */ | ||
| package com.linkedin.coral.datagen.domain; | ||
|
|
||
| import java.util.Arrays; | ||
| import java.util.List; | ||
|
|
||
| import org.testng.annotations.Test; | ||
|
|
||
|
|
||
| /** | ||
| * Tests for IntegerDomain class. | ||
| */ | ||
| public class IntegerDomainTest { | ||
|
|
||
| @Test | ||
| public void testSingleValue() { | ||
| System.out.println("\n=== Single Value Test ==="); | ||
| IntegerDomain domain = IntegerDomain.of(42); | ||
| System.out.println("Domain: " + domain); | ||
| System.out.println("Is empty: " + domain.isEmpty()); | ||
| System.out.println("Contains 42: " + domain.contains(42)); | ||
| System.out.println("Contains 43: " + domain.contains(43)); | ||
| System.out.println("Samples: " + domain.sampleValues(5)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testSingleInterval() { | ||
| System.out.println("\n=== Single Interval Test ==="); | ||
| IntegerDomain domain = IntegerDomain.of(10, 20); | ||
| System.out.println("Domain: " + domain); | ||
| System.out.println("Contains 10: " + domain.contains(10)); | ||
| System.out.println("Contains 15: " + domain.contains(15)); | ||
| System.out.println("Contains 20: " + domain.contains(20)); | ||
| System.out.println("Contains 21: " + domain.contains(21)); | ||
| System.out.println("Samples: " + domain.sampleValues(5)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testMultipleIntervals() { | ||
| System.out.println("\n=== Multiple Intervals Test ==="); | ||
| List<IntegerDomain.Interval> intervals = Arrays.asList(new IntegerDomain.Interval(1, 5), | ||
| new IntegerDomain.Interval(10, 15), new IntegerDomain.Interval(20, 30)); | ||
| IntegerDomain domain = IntegerDomain.of(intervals); | ||
| System.out.println("Domain: " + domain); | ||
| System.out.println("Contains 3: " + domain.contains(3)); | ||
| System.out.println("Contains 7: " + domain.contains(7)); | ||
| System.out.println("Contains 12: " + domain.contains(12)); | ||
| System.out.println("Contains 25: " + domain.contains(25)); | ||
| System.out.println("Samples: " + domain.sampleValues(10)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testIntersection() { | ||
| System.out.println("\n=== Intersection Test ==="); | ||
| IntegerDomain domain1 = IntegerDomain.of(1, 20); | ||
| IntegerDomain domain2 = IntegerDomain.of(10, 30); | ||
| IntegerDomain intersection = domain1.intersect(domain2); | ||
| System.out.println("Domain 1: " + domain1); | ||
| System.out.println("Domain 2: " + domain2); | ||
| System.out.println("Intersection: " + intersection); | ||
| System.out.println("Samples: " + intersection.sampleValues(5)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testUnion() { | ||
| System.out.println("\n=== Union Test ==="); | ||
| IntegerDomain domain1 = IntegerDomain.of(1, 10); | ||
| IntegerDomain domain2 = IntegerDomain.of(20, 30); | ||
| IntegerDomain union = domain1.union(domain2); | ||
| System.out.println("Domain 1: " + domain1); | ||
| System.out.println("Domain 2: " + domain2); | ||
| System.out.println("Union: " + union); | ||
| System.out.println("Samples: " + union.sampleValues(10)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testAddConstant() { | ||
| System.out.println("\n=== Add Constant Test ==="); | ||
| IntegerDomain domain = IntegerDomain.of(10, 20); | ||
| IntegerDomain shifted = domain.add(5); | ||
| System.out.println("Original domain: " + domain); | ||
| System.out.println("After adding 5: " + shifted); | ||
| System.out.println("Samples: " + shifted.sampleValues(5)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testMultiplyConstant() { | ||
| System.out.println("\n=== Multiply Constant Test ==="); | ||
| IntegerDomain domain = IntegerDomain.of(10, 20); | ||
| IntegerDomain scaled = domain.multiply(2); | ||
| System.out.println("Original domain: " + domain); | ||
| System.out.println("After multiplying by 2: " + scaled); | ||
| System.out.println("Samples: " + scaled.sampleValues(5)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testNegativeMultiply() { | ||
| System.out.println("\n=== Negative Multiply Test ==="); | ||
| IntegerDomain domain = IntegerDomain.of(10, 20); | ||
| IntegerDomain scaled = domain.multiply(-1); | ||
| System.out.println("Original domain: " + domain); | ||
| System.out.println("After multiplying by -1: " + scaled); | ||
| System.out.println("Samples: " + scaled.sampleValues(5)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testOverlappingIntervalsMerge() { | ||
| System.out.println("\n=== Overlapping Intervals Merge Test ==="); | ||
| List<IntegerDomain.Interval> intervals = Arrays.asList(new IntegerDomain.Interval(1, 10), | ||
| new IntegerDomain.Interval(5, 15), new IntegerDomain.Interval(20, 30)); | ||
| IntegerDomain domain = IntegerDomain.of(intervals); | ||
| System.out.println("Input intervals: [1, 10], [5, 15], [20, 30]"); | ||
| System.out.println("Merged domain: " + domain); | ||
| System.out.println("Samples: " + domain.sampleValues(10)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testAdjacentIntervalsMerge() { | ||
| System.out.println("\n=== Adjacent Intervals Merge Test ==="); | ||
| List<IntegerDomain.Interval> intervals = Arrays.asList(new IntegerDomain.Interval(1, 10), | ||
| new IntegerDomain.Interval(11, 20), new IntegerDomain.Interval(30, 40)); | ||
| IntegerDomain domain = IntegerDomain.of(intervals); | ||
| System.out.println("Input intervals: [1, 10], [11, 20], [30, 40]"); | ||
| System.out.println("Merged domain: " + domain); | ||
| System.out.println("Samples: " + domain.sampleValues(10)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testEmptyDomain() { | ||
| System.out.println("\n=== Empty Domain Test ==="); | ||
| IntegerDomain empty = IntegerDomain.empty(); | ||
| System.out.println("Empty domain: " + empty); | ||
| System.out.println("Is empty: " + empty.isEmpty()); | ||
| System.out.println("Samples: " + empty.sampleValues(5)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testIntersectionEmpty() { | ||
| System.out.println("\n=== Intersection Empty Test ==="); | ||
| IntegerDomain domain1 = IntegerDomain.of(1, 10); | ||
| IntegerDomain domain2 = IntegerDomain.of(20, 30); | ||
| IntegerDomain intersection = domain1.intersect(domain2); | ||
| System.out.println("Domain 1: " + domain1); | ||
| System.out.println("Domain 2: " + domain2); | ||
| System.out.println("Intersection: " + intersection); | ||
| System.out.println("Is empty: " + intersection.isEmpty()); | ||
| } | ||
|
|
||
| @Test | ||
| public void testComplexArithmetic() { | ||
| System.out.println("\n=== Complex Arithmetic Test ==="); | ||
| // Solve: 2*x + 5 = 25, where x in [0, 100] | ||
| // => 2*x = 20 | ||
| // => x = 10 | ||
| IntegerDomain output = IntegerDomain.of(25); | ||
| IntegerDomain afterSubtract = output.add(-5); // x = 20 | ||
| IntegerDomain solution = afterSubtract.multiply(1).intersect(IntegerDomain.of(0, 100)); | ||
|
|
||
| System.out.println("Equation: 2*x + 5 = 25"); | ||
| System.out.println("Output domain: " + output); | ||
| System.out.println("After subtracting 5: " + afterSubtract); | ||
| System.out.println("Solution (x must be in [0, 100]): " + solution); | ||
|
|
||
| // Verify | ||
| if (!solution.isEmpty()) { | ||
| long x = solution.sampleValues(1).get(0); | ||
| System.out.println("Sample x: " + x); | ||
| System.out.println("Verification: 2*" + x + " + 5 = " + (2 * x + 5)); | ||
| } | ||
| } | ||
|
|
||
| @Test | ||
| public void testMultiIntervalIntersection() { | ||
| System.out.println("\n=== Multi-Interval Intersection Test ==="); | ||
| List<IntegerDomain.Interval> intervals1 = | ||
| Arrays.asList(new IntegerDomain.Interval(1, 20), new IntegerDomain.Interval(30, 50)); | ||
| List<IntegerDomain.Interval> intervals2 = | ||
| Arrays.asList(new IntegerDomain.Interval(10, 35), new IntegerDomain.Interval(45, 60)); | ||
|
|
||
| IntegerDomain domain1 = IntegerDomain.of(intervals1); | ||
| IntegerDomain domain2 = IntegerDomain.of(intervals2); | ||
| IntegerDomain intersection = domain1.intersect(domain2); | ||
|
|
||
| System.out.println("Domain 1: " + domain1); | ||
| System.out.println("Domain 2: " + domain2); | ||
| System.out.println("Intersection: " + intersection); | ||
| System.out.println("Expected: [10, 20] ∪ [30, 35] ∪ [45, 50]"); | ||
| System.out.println("Samples: " + intersection.sampleValues(15)); | ||
| } | ||
| } |
There was a problem hiding this comment.
These tests don't have assertions. Some other files have tests like these too.
| @Test | ||
| public void testArithmeticExpression() { | ||
| testDomainInference("Arithmetic Expression Test", "SELECT * FROM test.T WHERE age * 2 + 5 = 25", inputDomain -> { | ||
| assertTrue(inputDomain instanceof IntegerDomain, "Should be IntegerDomain"); |
There was a problem hiding this comment.
When there is an error, a new test I'm adding still passes
@Test
public void testMultiVariateArithmeticExpression() {
testDomainInference("Arithmetic Expression Test", "SELECT * FROM test.suitcase WHERE width + height + length < 25", inputDomain -> {
assertTrue(inputDomain instanceof IntegerDomain, "Should be IntegerDomain");
IntegerDomain intDomain = (IntegerDomain) inputDomain;
System.out.println(intDomain);
assertTrue(intDomain.contains(10), "Should contain 10 (since 10 * 2 + 5 = 25)");
assertTrue(intDomain.contains(10), "Should contain 10 (since 10 * 2 + 5 = 25)");
assertTrue(intDomain.isSingleton(), "Should be singleton");
});
}There was a problem hiding this comment.
Good catch. The old testDomainInference helper used if guards (if (disjunct instanceof RexCall), if (operator == EQUALS)) that would silently skip the assertion lambda when the structure didn't match, making any test pass vacuously.
I've refactored the helper to replace those if guards with hard assertions (assertTrue(..., "disjunct should be a RexCall"), assertEquals(..., EQUALS, "operator should be EQUALS")), so a test like your testMultiVariateArithmeticExpression example would now fail explicitly at the operator check instead of passing silently.
Thanks for the review. This sounds like a type of "domain propagation" which is used for joins (e.g., when we resolve one variable, we resolve the other based on the relationship between them). However, this cases is a bit more complex than join because all expressions mutually depend on each other, and there is no obvious expression to start from and propagate to the rest. We will tackle this separately. |
| * - Single interval: [10, 20] | ||
| * - Multiple intervals: [1, 5] ∪ [10, 15] ∪ [20, 30] | ||
| */ | ||
| public class IntegerDomain extends Domain<Long, IntegerDomain> { |
There was a problem hiding this comment.
IntegerDomain.class and IntegerDomain$Interval.class has also committed, need to be removed.
There was a problem hiding this comment.
Good catch. Thanks. Removed.
| if (max == Long.MAX_VALUE || min == Long.MIN_VALUE) { | ||
| return Long.MAX_VALUE; // Unbounded | ||
| } | ||
| return max - min + 1; |
There was a problem hiding this comment.
Potential overflow: max - min + 1 can overflow for large intervals where neither bound is exactly Long.MIN_VALUE/Long.MAX_VALUE. For example, Interval(Long.MIN_VALUE + 1, Long.MAX_VALUE - 1) bypasses both guards but the arithmetic overflows.
There was a problem hiding this comment.
Now throws exception and handles gracefully.
| } | ||
|
|
||
| public boolean isAdjacent(Interval other) { | ||
| return this.max + 1 == other.min || other.max + 1 == this.min; |
There was a problem hiding this comment.
Potential overflow: this.max + 1 overflows when max == Long.MAX_VALUE (wraps to Long.MIN_VALUE), which could cause two non-adjacent intervals to incorrectly appear adjacent and get merged during normalization.
| { ':', '@' }, // colon to @ (common punctuation) | ||
| { '[', '`' }, // [ to backtick | ||
| { '{', '~' } // { to tilde | ||
| }; |
There was a problem hiding this comment.
Nit: ALPHABET_RANGES is hard to read — requires knowledge of ASCII table gaps. Consider deriving the alphabet programmatically from the printable ASCII range:
private static final String ALPHABET;
static {
char printableStart = ' '; // 0x20 — first printable ASCII character
char printableEnd = '~'; // 0x7E — last printable ASCII character
StringBuilder sb = new StringBuilder();
for (char c = printableStart; c <= printableEnd; c++) {
sb.append(c);
}
ALPHABET = sb.toString();
}This covers the same set of characters and makes the intent explicit.
There was a problem hiding this comment.
enumerateAllowedChars can then just refer to the above.
There was a problem hiding this comment.
I would keep it simple for now.
| */ | ||
| private RegexDomain(Automaton automaton) { | ||
| this.regex = automaton.toString(); | ||
| this.automaton = automaton; |
There was a problem hiding this comment.
dk.brics Automaton is mutable — methods like determinize() and reduce() can modify internal state. Since intersection() and union() return new instances this is low-risk today, but as a defensive measure consider cloning:
private RegexDomain(Automaton automaton) {
this.automaton = automaton.clone();
this.regex = this.automaton.toString();
}There was a problem hiding this comment.
I would keep it simple since the above pattern does not exist.
| * Creates a RegexDomain from an existing automaton. | ||
| */ | ||
| private RegexDomain(Automaton automaton) { | ||
| this.regex = automaton.toString(); |
There was a problem hiding this comment.
automaton.toString() returns a debug representation of states/transitions, not a valid regex pattern. This means this.regex here is not equivalent to the regex stored in the public constructor (line 41). This could be confusing if regex is used for display or debugging.
One option is to pass a descriptive string from the call sites, e.g.:
private RegexDomain(Automaton automaton, String regex) { ... }
// In intersect():
return new RegexDomain(intersection, "(" + this.regex + ")&(" + other.regex + ")");
// In union():
return new RegexDomain(union, "(" + this.regex + ")|(" + other.regex + ")");Up to you whether this is worth addressing now.
There was a problem hiding this comment.
This also affects LowerRegexTransformer: if a RegexDomain produced by intersect()/union() is passed as the output domain, getRegex() returns the debug string. isLiteral() will return false (since the debug string contains special characters), so the transformer silently skips the case-insensitive inversion and returns the domain unchanged. Fixing toString() here would fix that too.
There was a problem hiding this comment.
No longer applicable. The regex String field has been eliminated entirely. RegexDomain is now purely automaton-driven. isLiteral() uses automaton.getFiniteStrings(2) and getLiteralValue() uses automaton.getFiniteStrings(1).
| */ | ||
| @Override | ||
| public List<String> sample(int limit) { | ||
| return sampleStrings(limit, 100); |
There was a problem hiding this comment.
Nit: consider extracting 100 to a named constant (e.g. DEFAULT_MAX_SAMPLE_LENGTH) to make the intent clearer.
There was a problem hiding this comment.
Extracted to DEFAULT_MAX_SAMPLE_LENGTH = 100. sample() now throws IllegalStateException when the automaton is non-empty but DFS yields zero samples.
| */ | ||
| @Override | ||
| public List<String> sample(int limit) { | ||
| return sampleStrings(limit, 100); |
There was a problem hiding this comment.
If the regex only matches strings longer than maxLength (100), this silently returns an empty list. The caller can't distinguish "empty domain" from "domain exists but all valid strings exceed the length cap." Consider logging a warning or throwing when the domain is non-empty but no samples could be generated.
|
|
||
| // Initialize generic domain inference program with all transformers | ||
| program = new DomainInferenceProgram(Arrays.asList(new LowerRegexTransformer(), new SubstringRegexTransformer(), | ||
| new PlusRegexTransformer(), new TimesRegexTransformer(), new CastRegexTransformer())); |
There was a problem hiding this comment.
As new transformers are added, every caller that assembles this list needs to remember to include them — easy to miss silently.
Consider adding a factory method to DomainInferenceProgram as the single source of truth:
public static DomainInferenceProgram withDefaultTransformers() {
return new DomainInferenceProgram(List.of(
new LowerRegexTransformer(),
new SubstringRegexTransformer(),
new PlusRegexTransformer(),
new TimesRegexTransformer(),
new CastRegexTransformer()));
}You could also add a test that verifies all DomainTransformer implementations are included in the default list, so adding a new transformer without registering it fails a test.
There was a problem hiding this comment.
I was considering if we need a ServiceLoader but they seems like overkill.
There was a problem hiding this comment.
Agreed, ServiceLoader is overkill. Added DomainInferenceProgram.withDefaultTransformers() as the single source of truth for the default transformer list.
| * Convenience method for deriving IntegerDomain constraints. | ||
| * Throws if the result is not an IntegerDomain. | ||
| */ | ||
| public IntegerDomain deriveInputInteger(RexNode expr, IntegerDomain outputInteger) { |
There was a problem hiding this comment.
This method doesn't appear to be called anywhere. Consider removing it to avoid dead code, or adding test coverage if it's intended for future use.
|
|
||
| @Override | ||
| public boolean canHandle(RexNode expr) { | ||
| return expr instanceof RexCall && ((RexCall) expr).getOperator() == SqlStdOperatorTable.LOWER; |
There was a problem hiding this comment.
Should we also handle UPPER? The inversion logic is the same — both produce a case-insensitive regex. Consider generalizing this into a CaseRegexTransformer that handles both SqlStdOperatorTable.LOWER and SqlStdOperatorTable.UPPER, to avoid duplicating the class.
There was a problem hiding this comment.
Deferring to a follow-up. The logic is identical, but adding UPPER support can be done cleanly when it's actually needed.
| if (isStringType(targetTypeName) && !isStringType(sourceTypeName)) { | ||
| if (isDateType(sourceTypeName)) { | ||
| // Date to String | ||
| String dateFormatRegex = "^[0-9]{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|30)$"; |
There was a problem hiding this comment.
Date end should handle 31 too. Right now handling ends at 30.
| // For complex patterns, wrap with case-insensitive flag | ||
| // Note: Java regex doesn't have inline (?i) in automaton library, | ||
| // so we return the pattern as-is and rely on character-level matching | ||
| return outputRegex; |
There was a problem hiding this comment.
Consider adding a disabled test to track this known limitation, so it doesn't get silently forgotten:
@Test(enabled = false, description = "Non-literal LOWER patterns not yet case-insensitive inverted — see LowerRegexTransformer line 74")
public void testLowerWithComplexPattern() {
// LOWER(name) LIKE '%abc%' should produce .*[aA][bB][cC].*
}This way it shows up in test reports as skipped rather than being invisible.
There was a problem hiding this comment.
The non-literal LOWER fallback (returning as-is for complex patterns like .*abc.*) is a known feature gap. A proper fix would walk the automaton transitions and expand alphabetic ranges to include both cases. Deferring to a follow-up alongside UPPER support.
| return new Output(scans, remapped); | ||
| } | ||
|
|
||
| private static final Map<RexNode, RelNode> predicateOriginMap = new IdentityHashMap<>(); |
There was a problem hiding this comment.
P0 — Thread-safety issue: predicateOriginMap is a static mutable field shared across all invocations. extract() calls .clear() (line 42) then populates it during collection (line 81). If two threads call extract() concurrently, they corrupt each other's data.
Fix: Make it a local variable inside extract() and pass it through as a parameter, or make the class non-static with an instance field.
There was a problem hiding this comment.
Fixed. Made predicateOriginMap a local variable in extract() and passed it as a parameter to collectPredicates(). The class remains fully static with a private constructor — no API change since collectPredicates is private.
| } | ||
| // Remove escape sequences | ||
| return result.replaceAll("\\\\(.)", "$1"); | ||
| } |
There was a problem hiding this comment.
Dead code: unescapeLiteral is defined but never called within this class. Consider removing it.
| } | ||
| // Remove escape sequences | ||
| return result.replaceAll("\\\\(.)", "$1"); | ||
| } |
There was a problem hiding this comment.
Duplicated code: this identical unescapeLiteral method also appears in SubstringRegexTransformer (line 119) and CastRegexTransformer (line 253, dead there). Consider extracting to a shared utility (e.g., RegexUtils.unescapeLiteral).
There was a problem hiding this comment.
Leaving as-is. Duplication across 2 active files is tolerable; the method is small and stable.
| } | ||
| } | ||
|
|
||
| private static class Node { |
There was a problem hiding this comment.
Dead code: inner class Node is never referenced anywhere. Likely a remnant of an earlier BFS-based sampling approach that was replaced by dfsCollect. Consider removing it.
There was a problem hiding this comment.
Gone after refactor.
| return interval.min; | ||
| } | ||
|
|
||
| long range = interval.max - interval.min; |
There was a problem hiding this comment.
Potential overflow: interval.max - interval.min can overflow for ranges exceeding Long.MAX_VALUE. For example, Interval(-1, Long.MAX_VALUE) produces a negative range, which would cause the range < Integer.MAX_VALUE check to behave unexpectedly and lead to incorrect sampling.
| * x + 5 in [20, 30] | ||
| * produces: x in [15, 25] | ||
| */ | ||
| public class PlusRegexTransformer implements DomainTransformer { |
There was a problem hiding this comment.
Misleading name: PlusRegexTransformer operates on IntegerDomain and throws if given RegexDomain. The "Regex" prefix is a historical artifact. Consider renaming to PlusIntegerTransformer. Same applies to TimesRegexTransformer → TimesIntegerTransformer.
There was a problem hiding this comment.
Renamed. PlusRegexTransformer → PlusIntegerTransformer, TimesRegexTransformer → TimesIntegerTransformer.
| @@ -0,0 +1,659 @@ | |||
| /** | |||
There was a problem hiding this comment.
AI-Assisted Review Summary (In addition to the above comments)
Note: This is an automated review comment generated by AI (Claude). It is intended for consumption by both humans and AI agents. Author: please reply with a sub-comment indicating which issues should be fixed (e.g.,
fix: 1, 2, 5orfix: allorskip: all).
P1 — Should Fix
| # | File | Line | Issue |
|---|---|---|---|
| 1 | SubstringRegexTransformer.java |
33 | Operator name matching uses hardcoded "substr" but Hive's SUBSTRING may be registered as "substring" in Calcite. If mismatched, this transformer silently never fires. |
| 2 | RegexToIntegerDomainConverter.java |
462-465 | createIntegerDomainFromBounds computes a single [min, max] interval for large domains. Pattern ^(10|20|30)$ becomes [10, 30] including 20 false values. Should document the over-approximation explicitly. |
| 3 | RegexToIntegerDomainConverter.java |
109 | pattern.matches(".*[*+?].*") rejects *, +, ? anywhere — including inside character classes like [*+?] where they are literal. Could incorrectly reject valid patterns. |
| 4 | RegexToIntegerDomainConverter.java |
289 | No validation that quantifier min <= max. Pattern {5,2} is silently accepted; downstream loops iterate from min to max, producing no iterations rather than erroring. |
| 5 | CastRegexTransformer.java |
216-218 | break on first large interval (>100 values) loses remaining intervals. [10, 20] ∪ [1000, 2000] silently becomes -?[0-9]+, discarding the first interval's precision. |
| 6 | CanonicalPredicateExtractor.java |
84-89 | Join condition remapping applies a single base offset to all RexInputRefs. Correct only if Projects are already pulled up (stated in Javadoc precondition). No runtime validation — wrong results if called without pull-up. |
| 7 | ProjectPullUpRewriter.java |
241 | rightCount = join.getRowType().getFieldCount() - leftCount uses original join output type, but leftCount comes from leftProj.getRowType() (Project output, not input). Mismatch if the left Project changes field count. |
P2 — Consider
| # | File | Line | Issue |
|---|---|---|---|
| 8 | RegexToIntegerDomainConverter.java |
331-342 | estimateSize sums across repetition counts: [0-9]{2,3} gives 10^2 + 10^3 = 1100 but actual domain is 1000 values. Over-estimates and may unnecessarily trigger bounds path, losing precision. |
| 9 | CastRegexTransformer.java |
206-219 | convertIntegerDomainToRegex emits -?[0-9]+ (matches ANY integer) for ranges >100 values. [1000, 9999] loses all constraint info. Known limitation but severe. |
| 10 | RegexToIntegerDomainConverter.java |
551, 590 | computeMinString/computeMaxString for AlternationNode: if alt.alternatives is empty, returns null → NPE in caller's sb.append(). Parser shouldn't produce empty alternations but not validated. |
| 11 | build.gradle |
2, 5 | Uses deprecated compile/testCompile instead of implementation/testImplementation. |
| 12 | rel/ package |
— | No unit tests for ProjectPullUpRewriter, CanonicalPredicateExtractor, DnfRewriter. Only tested indirectly via integration tests. |
There was a problem hiding this comment.
break on first large interval (>100 values) loses remaining intervals. [10, 20] ∪ [1000, 2000] silently becomes -?[0-9]+, discarding the first interval's precision.
Fixed with IntegerRangeAutomaton — a ~80-line utility that builds a minimal automaton accepting exactly the decimal representations of integers in [lo, hi] using recursive digit-by-digit construction. All intervals are now processed precisely via parts.add(IntegerRangeAutomaton.build(min, max)). The break and the generic -?[0-9]+ fallback are both gone. Added 19 tests in IntegerRangeAutomatonTest covering single values, cross-digit boundaries, large ranges, negative/mixed ranges, and leading-zero rejection.
There was a problem hiding this comment.
Join condition remapping applies a single base offset to all RexInputRefs. Correct only if Projects are already pulled up (stated in Javadoc precondition). No runtime validation — wrong results if called without pull-up.
The Javadoc precondition is sufficient. The single production caller always applies ProjectPullUpController.applyUntilFixedPoint() before extract().
There was a problem hiding this comment.
rightCount = join.getRowType().getFieldCount() - leftCount uses original join output type, but leftCount comes from leftProj.getRowType() (Project output, not input). Mismatch if the left Project changes field count.
Fixed. The "only left had Project" and "only right had Project" branches now use newLeft.getRowType().getFieldCount() for pass-through RexInputRef offsets and newJoin.getRowType() for field types.
There was a problem hiding this comment.
Uses deprecated compile/testCompile instead of implementation/testImplementation.
Fixed. Replaced throughout.
There was a problem hiding this comment.
P1-2, P1-3, P1-4:
RegexToIntegerDomainConverterissues (over-approximation, naive[*+?]check, no quantifier validation).
No longer applicable. The class was completely rewritten to use dk.brics automaton directly. isConvertible() checks a.isFinite() && isDigitOnly(a), convert() uses getFiniteStrings(5000) for enumeration with DFS fallback. The custom regex parser, estimateSize(), checkForInvalidConstructs(), computeMinString/computeMaxString, and createIntegerDomainFromBounds are all gone.
There was a problem hiding this comment.
break on first large interval (>100 values) loses remaining intervals. [10, 20] ∪ [1000, 2000] silently becomes -?[0-9]+, discarding the first interval's precision.
Fixed via IntegerRangeAutomaton (see CastRegexTransformer.java:216-218 reply above).
There was a problem hiding this comment.
estimateSize sums across repetition counts: [0-9]{2,3} gives 10^2 + 10^3 = 1100 but actual domain is 1000 values. Over-estimates and may unnecessarily trigger bounds path, losing precision.
No longer applicable (class rewritten, estimateSize removed).
There was a problem hiding this comment.
convertIntegerDomainToRegex emits -?[0-9]+ (matches ANY integer) for ranges >100 values. [1000, 9999] loses all constraint info. Known limitation but severe.
Fixed via IntegerRangeAutomaton.
There was a problem hiding this comment.
computeMinString/computeMaxString for AlternationNode: if alt.alternatives is empty, returns null → NPE in caller's sb.append(). Parser shouldn't produce empty alternations but not validated.
No longer applicable (class rewritten, methods removed).
Bug Bash: Failing Test Cases for Already-Identified IssuesHere are test cases for bugs already flagged in the review. They all fail on the current code and can be copy-pasted directly. Tests to add to
|
|
Consider moving all transformer implementations ( |
| } | ||
|
|
||
| @Override | ||
| public Domain<?, ?> refineInputDomain(RexNode expr, Domain<?, ?> outputDomain) { |
There was a problem hiding this comment.
Suggestion: Refactor to avoid the growing if-else chain
This method dispatches on (sourceType, targetType, domainType) via a series of if blocks. As more types are supported (decimal, timestamp, etc.), this will grow unwieldy.
Option A — Handler registry (simplest):
@FunctionalInterface
interface CastHandler {
Domain<?, ?> handle(RexCall call, Domain<?, ?> outputDomain);
}
enum TypeCategory { STRING, INTEGER, DATE, NUMERIC_OTHER }
// Register handlers keyed by (source, target, domainClass)
private final Map<CastKey, CastHandler> handlers = new HashMap<>();
private void registerHandlers() {
handlers.put(key(STRING, INTEGER, IntegerDomain.class), this::castStringToIntegerWithIntegerDomain);
handlers.put(key(STRING, INTEGER, RegexDomain.class), this::castStringToIntegerWithRegexDomain);
handlers.put(key(INTEGER, STRING, RegexDomain.class), this::castIntegerToStringWithRegexDomain);
handlers.put(key(DATE, STRING, RegexDomain.class), this::castDateToStringWithRegexDomain);
// ...add new entries as needed
}Then refineInputDomain becomes:
public Domain<?, ?> refineInputDomain(RexNode expr, Domain<?, ?> outputDomain) {
// ... extract sourceType, targetType ...
if (sourceTypeName == targetTypeName) return outputDomain;
CastHandler handler = handlers.get(key(categorize(sourceTypeName), categorize(targetTypeName), outputDomain.getClass()));
if (handler != null) return handler.handle(call, outputDomain);
// fallback for numeric-to-numeric, unknown combos, etc.
return outputDomain;
}Option B — Two-level dispatch: Group by (sourceCategory, targetCategory) into strategy objects, each of which handles the domain type internally. Better if strategies within a type pair share logic.
Option A is probably the right starting point — adding support for a new type is just adding a map entry and a private method, with no risk of breaking existing branches.
There was a problem hiding this comment.
Deferring. The current 8-branch if-else chain is manageable. Will refactor to a handler registry when it reaches ~15+ branches as more types are supported.
There was a problem hiding this comment.
A handler registry sounds like a good approach as a follow up when more branches are neede.
| * | ||
| * @deprecated Use {@link DomainInferenceProgram} for full cross-domain support | ||
| */ | ||
| public class RegexDomainInferenceProgram { |
There was a problem hiding this comment.
Good catch. Removed.
| * <p>Usage pattern:</p> | ||
| * <pre> | ||
| * CanonicalPredicateExtractor.Output extracted = CanonicalPredicateExtractor.extract(rel); | ||
| * CanonicalPredicateDnf.Output dnf = CanonicalPredicateDnf.convert(extracted, rexBuilder); |
There was a problem hiding this comment.
Nit: Javadoc references CanonicalPredicateDnf.Output / CanonicalPredicateDnf.convert but the class is DnfRewriter.
| } | ||
|
|
||
| List<RelNode> newInputs = new ArrayList<>(); | ||
| boolean anyChanged = false; |
There was a problem hiding this comment.
Dead code: anyChanged is assigned at line 152 but never read — the method returns immediately after setting it.
|
|
||
| // Inline right Project if present | ||
| if (rightProj != null) { | ||
| int leftFieldCount = newLeft.getRowType().getFieldCount(); |
There was a problem hiding this comment.
Same category of bug as line 241: after newLeft = leftProj.getInput() (line 205), newLeft.getRowType().getFieldCount() gives the field count of the left project's input (child), not its output. If the left project changes field count (adds/drops columns), this offset will be wrong for right-side condition inlining.
There was a problem hiding this comment.
Good catch. Fixed. Captured leftFieldCount from leftProj.getRowType().getFieldCount() before newLeft is reassigned. Added a unit test (testJoinLeftProjectPullUp_FieldCountChanged) that confirms the fix.
|
Thanks for the Bug Bash tests. Added them. They pass after the above fixes. |
Moved to own package. |
- Move transformers to dedicated subpackage (domain/transformer/) - Rename PlusRegexTransformer → PlusIntegerTransformer, TimesRegexTransformer → TimesIntegerTransformer - Add DomainInferenceProgram.withDefaultTransformers() factory method - Rewrite RegexToIntegerDomainConverter with automaton-based approach and add IntegerRangeAutomaton for precise integer range constraints - Add unit tests for CanonicalPredicateExtractor, ProjectPullUpRewriter, and IntegerRangeAutomaton - Fix DnfRewriter Javadoc, extract RegexDomain magic number, make IntegerRangeAutomaton public for cross-package access - Remove dead code: deriveInputRegex(), deprecated sampleValues() - Normalize test conventions: setup method naming, section separators, camelCase method names, assertEquals parameter order, and IntegerDomain construction style
simbadzina
left a comment
There was a problem hiding this comment.
LGTM. Just merge conflicts need to be addressed.
Introduce Symbolic Constraint Solver for SQL-Driven Data Generation
Overview
This PR introduces coral-data-generation, a symbolic constraint solver that inverts SQL expressions to derive input domain constraints. Instead of forward evaluation (generate → test → reject), it solves backward from predicates to derive what inputs must satisfy, enabling efficient test data generation with guaranteed constraint satisfaction.
Motivation
Problem: Traditional test data generation uses rejection sampling—generate random values, evaluate SQL predicates, discard mismatches. This is inefficient for complex nested expressions and cannot detect unsatisfiable queries.
Solution: Symbolic inversion treats SQL expressions as mathematical transformations with inverse functions. Starting from output constraints (e.g.,
= '50'), the system walks expression trees inward, applying inverse operations to derive input domains.Examples
1. Nested String Operations
2. Cross-Domain Arithmetic
3. Date Extraction with Type Casting
4. Complex Nested Substring
5. Contradiction Detection
6. Date String Pattern Matching
Key Components
1. Domain System
2. Transformer Architecture
Pluggable symbolic inversion functions implementing DomainTransformer:
SUBSTRING(x, start, len)with positional constraintsLOWER(x)via case-insensitive regex generationx + c = value→x = value - cx * c = value→x = value / c3. Relational Preprocessing
Normalizes Calcite RelNode trees for symbolic analysis:
4. Solver
DomainInferenceProgram: Top-down expression tree traversal with domain refinement at each step, detecting contradictions via empty domain intersection.
Technical Approach
Symbolic Inversion: For nested expression
f(g(h(x))) = constant:f⁻¹→ intermediate domaing⁻¹→ refined domainh⁻¹→ input constraint onxContradiction Detection: Multiple predicates on same variable → domain intersection. Empty result = unsatisfiable query.
Extensibility: Architecture supports multi-table inference (join propagation), fixed-point iteration (recursive constraints), and arbitrary domain types (date, decimal, enum).
Testing
Integration Tests (RegexDomainInferenceProgramTest): 14+ test scenarios covering simple/nested transformations, cross-domain CAST operations, arithmetic inversion, and contradiction detection. All tests validate generated samples satisfy original SQL predicates.
Documentation
This module comes with aomprehensive README with conceptual model, examples, and API reference.
Future Extensibility
The architecture naturally extends to additional domains (DecimalDomain, DateDomain), more transformers (CONCAT, REGEXP_EXTRACT), multi-table inference (join constraint propagation), and aggregate support (cardinality constraints).