diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 06e6c2f14d864..780c53007233b 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -5599,6 +5599,18 @@ ], "sqlState" : "428EK" }, + "THETA_INVALID_INPUT_SKETCH_BUFFER" : { + "message" : [ + "Invalid call to <function>; only valid Theta sketch buffers are supported as inputs (such as those produced by the `theta_sketch_agg` function)." + ], + "sqlState" : "22546" + }, + "THETA_INVALID_LG_NOM_ENTRIES" : { + "message" : [ + "Invalid call to <function>; the `lgNomEntries` value must be between <min> and <max>, inclusive: <value>." + ], + "sqlState" : "22546" + }, "TRAILING_COMMA_IN_SELECT" : { "message" : [ "Trailing comma detected in SELECT clause. Remove the trailing comma before the FROM clause." diff --git a/python/docs/source/reference/pyspark.sql/functions.rst b/python/docs/source/reference/pyspark.sql/functions.rst index 7bec529407667..003fdc0a00b54 100644 --- a/python/docs/source/reference/pyspark.sql/functions.rst +++ b/python/docs/source/reference/pyspark.sql/functions.rst @@ -491,6 +491,9 @@ Aggregate Functions string_agg_distinct sum sum_distinct + theta_intersection_agg + theta_sketch_agg + theta_union_agg try_avg try_sum var_pop @@ -636,6 +639,10 @@ Misc Functions reflect session_user spark_partition_id + theta_difference + theta_intersection + theta_sketch_estimate + theta_union try_aes_decrypt try_reflect typeof diff --git a/python/pyspark/sql/connect/functions/builtin.py b/python/pyspark/sql/connect/functions/builtin.py index 0380b517e6e5e..ce85f2c37ffcf 100644 --- a/python/pyspark/sql/connect/functions/builtin.py +++ b/python/pyspark/sql/connect/functions/builtin.py @@ -4236,6 +4236,81 @@ def hll_union( hll_union.__doc__ = pysparkfuncs.hll_union.__doc__ +def theta_sketch_agg( + col: "ColumnOrName", + lgNomEntries: Optional[Union[int, Column]] = None, +) -> Column:
+ fn = "theta_sketch_agg" + if lgNomEntries is None: + return _invoke_function_over_columns(fn, col) + else: + return _invoke_function_over_columns(fn, col, lit(lgNomEntries)) + + +theta_sketch_agg.__doc__ = pysparkfuncs.theta_sketch_agg.__doc__ + + +def theta_union_agg( + col: "ColumnOrName", + lgNomEntries: Optional[Union[int, Column]] = None, +) -> Column: + fn = "theta_union_agg" + if lgNomEntries is None: + return _invoke_function_over_columns(fn, col) + else: + return _invoke_function_over_columns(fn, col, lit(lgNomEntries)) + + +theta_union_agg.__doc__ = pysparkfuncs.theta_union_agg.__doc__ + + +def theta_intersection_agg( + col: "ColumnOrName", +) -> Column: + fn = "theta_intersection_agg" + return _invoke_function_over_columns(fn, col) + + +theta_intersection_agg.__doc__ = pysparkfuncs.theta_intersection_agg.__doc__ + + +def theta_sketch_estimate(col: "ColumnOrName") -> Column: + fn = "theta_sketch_estimate" + return _invoke_function_over_columns(fn, col) + + +theta_sketch_estimate.__doc__ = pysparkfuncs.theta_sketch_estimate.__doc__ + + +def theta_union( + col1: "ColumnOrName", col2: "ColumnOrName", lgNomEntries: Optional[Union[int, Column]] = None +) -> Column: + fn = "theta_union" + if lgNomEntries is None: + return _invoke_function_over_columns(fn, col1, col2) + else: + return _invoke_function_over_columns(fn, col1, col2, lit(lgNomEntries)) + + +theta_union.__doc__ = pysparkfuncs.theta_union.__doc__ + + +def theta_intersection(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: + fn = "theta_intersection" + return _invoke_function_over_columns(fn, col1, col2) + + +theta_intersection.__doc__ = pysparkfuncs.theta_intersection.__doc__ + + +def theta_difference(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: + fn = "theta_difference" + return _invoke_function_over_columns(fn, col1, col2) + + +theta_difference.__doc__ = pysparkfuncs.theta_difference.__doc__ + + # Predicates Function diff --git a/python/pyspark/sql/functions/__init__.py 
b/python/pyspark/sql/functions/__init__.py index 0d1df20d13d6f..7039a0c79f692 100644 --- a/python/pyspark/sql/functions/__init__.py +++ b/python/pyspark/sql/functions/__init__.py @@ -404,6 +404,9 @@ "string_agg_distinct", "sum", "sum_distinct", + "theta_intersection_agg", + "theta_sketch_agg", + "theta_union_agg", "try_avg", "try_sum", "var_pop", @@ -495,6 +498,10 @@ "reflect", "session_user", "spark_partition_id", + "theta_difference", + "theta_intersection", + "theta_sketch_estimate", + "theta_union", "try_aes_decrypt", "try_reflect", "typeof", diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index b09713e0c289e..cc00f6abe068b 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -25744,6 +25744,344 @@ def hll_union( return _invoke_function("hll_union", _to_java_column(col1), _to_java_column(col2)) +@_try_remote_functions +def theta_sketch_agg( + col: "ColumnOrName", + lgNomEntries: Optional[Union[int, Column]] = None, +) -> Column: + """ + Aggregate function: returns the compact binary representation of the Datasketches + ThetaSketch with the values in the input column configured with lgNomEntries nominal entries. + + .. versionadded:: 4.1.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or column name + lgNomEntries : :class:`~pyspark.sql.Column` or int, optional + The log-base-2 of nominal entries, where nominal entries is the size of the sketch + (must be between 4 and 26, defaults to 12) + + Returns + ------- + :class:`~pyspark.sql.Column` + The binary representation of the ThetaSketch. 
+ + See Also + -------- + :meth:`pyspark.sql.functions.theta_union` + :meth:`pyspark.sql.functions.theta_intersection` + :meth:`pyspark.sql.functions.theta_difference` + :meth:`pyspark.sql.functions.theta_union_agg` + :meth:`pyspark.sql.functions.theta_intersection_agg` + :meth:`pyspark.sql.functions.theta_sketch_estimate` + + Examples + -------- + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([1,2,2,3], "INT") + >>> df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value"))).show() + +--------------------------------------------------+ + |theta_sketch_estimate(theta_sketch_agg(value, 12))| + +--------------------------------------------------+ + | 3| + +--------------------------------------------------+ + + >>> df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value", 15))).show() + +--------------------------------------------------+ + |theta_sketch_estimate(theta_sketch_agg(value, 15))| + +--------------------------------------------------+ + | 3| + +--------------------------------------------------+ + """ + fn = "theta_sketch_agg" + if lgNomEntries is None: + return _invoke_function_over_columns(fn, col) + else: + return _invoke_function_over_columns(fn, col, lit(lgNomEntries)) + + +@_try_remote_functions +def theta_union_agg( + col: "ColumnOrName", + lgNomEntries: Optional[Union[int, Column]] = None, +) -> Column: + """ + Aggregate function: returns the compact binary representation of the Datasketches + ThetaSketch that is the union of the Theta sketches in the input column. + + .. versionadded:: 4.1.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or column name + lgNomEntries : :class:`~pyspark.sql.Column` or int, optional + The log-base-2 of nominal entries for the union operation + (must be between 4 and 26, defaults to 12) + + Returns + ------- + :class:`~pyspark.sql.Column` + The binary representation of the merged ThetaSketch. 
+ + See Also + -------- + :meth:`pyspark.sql.functions.theta_union` + :meth:`pyspark.sql.functions.theta_sketch_agg` + :meth:`pyspark.sql.functions.theta_sketch_estimate` + + Examples + -------- + >>> from pyspark.sql import functions as sf + >>> df1 = spark.createDataFrame([1,2,2,3], "INT") + >>> df1 = df1.agg(sf.theta_sketch_agg("value").alias("sketch")) + >>> df2 = spark.createDataFrame([4,5,5,6], "INT") + >>> df2 = df2.agg(sf.theta_sketch_agg("value").alias("sketch")) + >>> df3 = df1.union(df2) + >>> df3.agg(sf.theta_sketch_estimate(sf.theta_union_agg("sketch"))).show() + +--------------------------------------------------+ + |theta_sketch_estimate(theta_union_agg(sketch, 12))| + +--------------------------------------------------+ + | 6| + +--------------------------------------------------+ + """ + fn = "theta_union_agg" + if lgNomEntries is None: + return _invoke_function_over_columns(fn, col) + else: + return _invoke_function_over_columns(fn, col, lit(lgNomEntries)) + + +@_try_remote_functions +def theta_intersection_agg(col: "ColumnOrName") -> Column: + """ + Aggregate function: returns the compact binary representation of the Datasketches + ThetaSketch that is the intersection of the Theta sketches in the input column + + .. versionadded:: 4.1.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or column name + + Returns + ------- + :class:`~pyspark.sql.Column` + The binary representation of the intersected ThetaSketch. 
+ + See Also + -------- + :meth:`pyspark.sql.functions.theta_intersection` + :meth:`pyspark.sql.functions.theta_sketch_agg` + :meth:`pyspark.sql.functions.theta_sketch_estimate` + + Examples + -------- + >>> from pyspark.sql import functions as sf + >>> df1 = spark.createDataFrame([1,2,2,3], "INT") + >>> df1 = df1.agg(sf.theta_sketch_agg("value").alias("sketch")) + >>> df2 = spark.createDataFrame([2,3,3,4], "INT") + >>> df2 = df2.agg(sf.theta_sketch_agg("value").alias("sketch")) + >>> df3 = df1.union(df2) + >>> df3.agg(sf.theta_sketch_estimate(sf.theta_intersection_agg("sketch"))).show() + +-----------------------------------------------------+ + |theta_sketch_estimate(theta_intersection_agg(sketch))| + +-----------------------------------------------------+ + | 2| + +-----------------------------------------------------+ + """ + fn = "theta_intersection_agg" + return _invoke_function_over_columns(fn, col) + + +@_try_remote_functions +def theta_sketch_estimate(col: "ColumnOrName") -> Column: + """ + Returns the estimated number of unique values given the binary representation + of a Datasketches ThetaSketch. + + .. versionadded:: 4.1.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or column name + + Returns + ------- + :class:`~pyspark.sql.Column` + The estimated number of unique values for the ThetaSketch. 
+ + See Also + -------- + :meth:`pyspark.sql.functions.theta_union` + :meth:`pyspark.sql.functions.theta_intersection` + :meth:`pyspark.sql.functions.theta_difference` + :meth:`pyspark.sql.functions.theta_union_agg` + :meth:`pyspark.sql.functions.theta_intersection_agg` + :meth:`pyspark.sql.functions.theta_sketch_agg` + + Examples + -------- + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([1,2,2,3], "INT") + >>> df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value"))).show() + +--------------------------------------------------+ + |theta_sketch_estimate(theta_sketch_agg(value, 12))| + +--------------------------------------------------+ + | 3| + +--------------------------------------------------+ + """ + + fn = "theta_sketch_estimate" + return _invoke_function_over_columns(fn, col) + + +@_try_remote_functions +def theta_union( + col1: "ColumnOrName", col2: "ColumnOrName", lgNomEntries: Optional[Union[int, Column]] = None +) -> Column: + """ + Merges two binary representations of Datasketches ThetaSketch objects, using a + Datasketches Union object. + + .. versionadded:: 4.1.0 + + Parameters + ---------- + col1 : :class:`~pyspark.sql.Column` or column name + col2 : :class:`~pyspark.sql.Column` or column name + lgNomEntries : :class:`~pyspark.sql.Column` or int, optional + The log-base-2 of nominal entries for the union operation + (must be between 4 and 26, defaults to 12) + + Returns + ------- + :class:`~pyspark.sql.Column` + The binary representation of the merged ThetaSketch. + + See Also + -------- + :meth:`pyspark.sql.functions.theta_union_agg` + :meth:`pyspark.sql.functions.theta_sketch_agg` + :meth:`pyspark.sql.functions.theta_sketch_estimate` + + Examples + -------- + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(1,4),(2,5),(2,5),(3,6)], "struct") + >>> df = df.agg( + ... sf.theta_sketch_agg("v1").alias("sketch1"), + ... sf.theta_sketch_agg("v2").alias("sketch2") + ... 
) + >>> df.select(sf.theta_sketch_estimate(sf.theta_union(df.sketch1, "sketch2"))).show() + +--------------------------------------------------------+ + |theta_sketch_estimate(theta_union(sketch1, sketch2, 12))| + +--------------------------------------------------------+ + | 6| + +--------------------------------------------------------+ + """ + + fn = "theta_union" + if lgNomEntries is not None: + return _invoke_function_over_columns( + fn, + col1, + col2, + lit(lgNomEntries), + ) + else: + return _invoke_function_over_columns(fn, col1, col2) + + +@_try_remote_functions +def theta_intersection(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: + """ + Returns the intersection of two binary representations of Datasketches ThetaSketch + objects, using a Datasketches Intersection object. + + .. versionadded:: 4.1.0 + + Parameters + ---------- + col1 : :class:`~pyspark.sql.Column` or column name + col2 : :class:`~pyspark.sql.Column` or column name + + Returns + ------- + :class:`~pyspark.sql.Column` + The binary representation of the intersected ThetaSketch. + + See Also + -------- + :meth:`pyspark.sql.functions.theta_intersection_agg` + :meth:`pyspark.sql.functions.theta_sketch_agg` + :meth:`pyspark.sql.functions.theta_sketch_estimate` + + Examples + -------- + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(1,1),(2,2),(3,2),(3,3)], "struct") + >>> df = df.agg( + ... sf.theta_sketch_agg("v1").alias("sketch1"), + ... sf.theta_sketch_agg("v2").alias("sketch2") + ... 
) + >>> df.select(sf.theta_sketch_estimate(sf.theta_intersection(df.sketch1, "sketch2"))).show() + +-----------------------------------------------------------+ + |theta_sketch_estimate(theta_intersection(sketch1, sketch2))| + +-----------------------------------------------------------+ + | 3| + +-----------------------------------------------------------+ + """ + + fn = "theta_intersection" + return _invoke_function_over_columns(fn, col1, col2) + + +@_try_remote_functions +def theta_difference(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: + """ + Returns the set difference of two binary representations of Datasketches ThetaSketch + objects (elements in first sketch but not in second), using a Datasketches ANotB object. + + .. versionadded:: 4.1.0 + + Parameters + ---------- + col1 : :class:`~pyspark.sql.Column` or column name + col2 : :class:`~pyspark.sql.Column` or column name + + Returns + ------- + :class:`~pyspark.sql.Column` + The binary representation of the difference ThetaSketch. + + See Also + -------- + :meth:`pyspark.sql.functions.theta_union` + :meth:`pyspark.sql.functions.theta_intersection` + :meth:`pyspark.sql.functions.theta_sketch_agg` + :meth:`pyspark.sql.functions.theta_sketch_estimate` + + Examples + -------- + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(1,4),(2,4),(3,5),(4,5)], "struct") + >>> df = df.agg( + ... sf.theta_sketch_agg("v1").alias("sketch1"), + ... sf.theta_sketch_agg("v2").alias("sketch2") + ... 
) + >>> df.select(sf.theta_sketch_estimate(sf.theta_difference(df.sketch1, "sketch2"))).show() + +---------------------------------------------------------+ + |theta_sketch_estimate(theta_difference(sketch1, sketch2))| + +---------------------------------------------------------+ + | 3| + +---------------------------------------------------------+ + """ + + fn = "theta_difference" + return _invoke_function_over_columns(fn, col1, col2) + + # ---------------------- Predicates functions ------------------------------ diff --git a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala index 0165d8d4cf708..2a26c0fad29a7 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala @@ -1165,6 +1165,143 @@ object functions { */ def sum_distinct(e: Column): Column = Column.fn("sum", isDistinct = true, e) + /** + * Aggregate function: returns the compact binary representation of the Datasketches + * ThetaSketch, generated by intersecting the Datasketches ThetaSketch instances in the input + * column via a Datasketches Intersection instance. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_intersection_agg(e: Column): Column = + Column.fn("theta_intersection_agg", e) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches + * ThetaSketch, generated by intersecting the Datasketches ThetaSketch instances in the input + * column via a Datasketches Intersection instance. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_intersection_agg(columnName: String): Column = + theta_intersection_agg(Column(columnName)) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches ThetaSketch + * built with the values in the input column and configured with the `lgNomEntries` nominal + * entries.
+ * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_sketch_agg(e: Column, lgNomEntries: Column): Column = + Column.fn("theta_sketch_agg", e, lgNomEntries) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches ThetaSketch + * built with the values in the input column and configured with the `lgNomEntries` nominal + * entries. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_sketch_agg(e: Column, lgNomEntries: Int): Column = + Column.fn("theta_sketch_agg", e, lit(lgNomEntries)) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches ThetaSketch + * built with the values in the input column and configured with the `lgNomEntries` nominal + * entries. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_sketch_agg(columnName: String, lgNomEntries: Int): Column = + theta_sketch_agg(Column(columnName), lgNomEntries) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches ThetaSketch + * built with the values in the input column and configured with the default value of 12 for + * `lgNomEntries`. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_sketch_agg(e: Column): Column = + Column.fn("theta_sketch_agg", e) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches ThetaSketch + * built with the values in the input column and configured with the default value of 12 for + * `lgNomEntries`. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_sketch_agg(columnName: String): Column = + theta_sketch_agg(Column(columnName)) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches + * ThetaSketch, generated by the union of Datasketches ThetaSketch instances in the input column + * via a Datasketches Union instance. It allows the configuration of `lgNomEntries` log nominal + * entries for the union buffer. 
+ * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_union_agg(e: Column, lgNomEntries: Column): Column = + Column.fn("theta_union_agg", e, lgNomEntries) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches + * ThetaSketch, generated by the union of Datasketches ThetaSketch instances in the input column + * via a Datasketches Union instance. It allows the configuration of `lgNomEntries` log nominal + * entries for the union buffer. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_union_agg(e: Column, lgNomEntries: Int): Column = + Column.fn("theta_union_agg", e, lit(lgNomEntries)) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches + * ThetaSketch, generated by the union of Datasketches ThetaSketch instances in the input column + * via a Datasketches Union instance. It allows the configuration of `lgNomEntries` log nominal + * entries for the union buffer. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_union_agg(columnName: String, lgNomEntries: Int): Column = + theta_union_agg(Column(columnName), lgNomEntries) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches + * ThetaSketch, generated by the union of Datasketches ThetaSketch instances in the input column + * via a Datasketches Union instance. It is configured with the default value of 12 for + * `lgNomEntries`. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_union_agg(e: Column): Column = + Column.fn("theta_union_agg", e) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches + * ThetaSketch, generated by the union of Datasketches ThetaSketch instances in the input column + * via a Datasketches Union instance. It is configured with the default value of 12 for + * `lgNomEntries`. 
+ * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_union_agg(columnName: String): Column = + theta_union_agg(Column(columnName)) + /** * Aggregate function: returns the concatenation of non-null input values. * @@ -3552,6 +3689,125 @@ object functions { hll_union(Column(columnName1), Column(columnName2), allowDifferentLgConfigK) } + /** + * Subtracts two binary representations of Datasketches ThetaSketch objects in the input columns + * using a Datasketches AnotB object + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_difference(c1: Column, c2: Column): Column = + Column.fn("theta_difference", c1, c2) + + /** + * Subtracts two binary representations of Datasketches ThetaSketch objects in the input columns + * using a Datasketches AnotB object + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_difference(columnName1: String, columnName2: String): Column = { + theta_difference(Column(columnName1), Column(columnName2)) + } + + /** + * Intersects two binary representations of Datasketches ThetaSketch objects in the input + * columns using a Datasketches Intersection object + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_intersection(c1: Column, c2: Column): Column = + Column.fn("theta_intersection", c1, c2) + + /** + * Intersects two binary representations of Datasketches ThetaSketch objects in the input + * columns using a Datasketches Intersection object + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_intersection(columnName1: String, columnName2: String): Column = { + theta_intersection(Column(columnName1), Column(columnName2)) + } + + /** + * Returns the estimated number of unique values given the binary representation of a + * Datasketches ThetaSketch. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_sketch_estimate(c: Column): Column = Column.fn("theta_sketch_estimate", c) + + /** + * Returns the estimated number of unique values given the binary representation of a + * Datasketches ThetaSketch. 
+ * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_sketch_estimate(columnName: String): Column = { + theta_sketch_estimate(Column(columnName)) + } + + /** + * Unions two binary representations of Datasketches ThetaSketch objects in the input columns + * using a Datasketches Union object. It is configured with the default value of 12 for + * `lgNomEntries`. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_union(c1: Column, c2: Column): Column = + Column.fn("theta_union", c1, c2) + + /** + * Unions two binary representations of Datasketches ThetaSketch objects in the input columns + * using a Datasketches Union object. It is configured with the default value of 12 for + * `lgNomEntries`. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_union(columnName1: String, columnName2: String): Column = { + theta_union(Column(columnName1), Column(columnName2)) + } + + /** + * Unions two binary representations of Datasketches ThetaSketch objects in the input columns + * using a Datasketches Union object. It allows the configuration of `lgNomEntries` log nominal + * entries for the union buffer. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_union(c1: Column, c2: Column, lgNomEntries: Int): Column = + Column.fn("theta_union", c1, c2, lit(lgNomEntries)) + + /** + * Unions two binary representations of Datasketches ThetaSketch objects in the input columns + * using a Datasketches Union object. It allows the configuration of `lgNomEntries` log nominal + * entries for the union buffer. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_union(columnName1: String, columnName2: String, lgNomEntries: Int): Column = { + theta_union(Column(columnName1), Column(columnName2), lgNomEntries) + } + + /** + * Unions two binary representations of Datasketches ThetaSketch objects in the input columns + * using a Datasketches Union object. It allows the configuration of `lgNomEntries` log nominal + * entries for the union buffer. 
+ * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_union(c1: Column, c2: Column, lgNomEntries: Column): Column = + Column.fn("theta_union", c1, c2, lgNomEntries) + /** * Returns the user name of current execution context. * diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index afa43e876b26d..9c3ace2471851 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -530,6 +530,9 @@ object FunctionRegistry { expression[HllSketchAgg]("hll_sketch_agg"), expression[HllUnionAgg]("hll_union_agg"), expression[ApproxTopK]("approx_top_k"), + expression[ThetaSketchAgg]("theta_sketch_agg"), + expression[ThetaUnionAgg]("theta_union_agg"), + expression[ThetaIntersectionAgg]("theta_intersection_agg"), expression[ApproxTopKAccumulate]("approx_top_k_accumulate"), // string functions @@ -791,6 +794,10 @@ object FunctionRegistry { expression[EqualNull]("equal_null"), expression[HllSketchEstimate]("hll_sketch_estimate"), expression[HllUnion]("hll_union"), + expression[ThetaSketchEstimate]("theta_sketch_estimate"), + expression[ThetaUnion]("theta_union"), + expression[ThetaDifference]("theta_difference"), + expression[ThetaIntersection]("theta_intersection"), expression[ApproxTopKEstimate]("approx_top_k_estimate"), // grouping sets diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/thetasketchesAggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/thetasketchesAggregates.scala new file mode 100644 index 0000000000000..7e55c006782cf --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/thetasketchesAggregates.scala @@ -0,0 +1,644 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions.aggregate + +import org.apache.datasketches.memory.Memory +import org.apache.datasketches.theta.{CompactSketch, Intersection, SetOperation, Sketch, Union, UpdateSketch, UpdateSketchBuilder} + +import org.apache.spark.SparkUnsupportedOperationException +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, ExpressionDescription, Literal} +import org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggregate +import org.apache.spark.sql.catalyst.trees.{BinaryLike, UnaryLike} +import org.apache.spark.sql.catalyst.util.{ArrayData, CollationFactory, ThetaSketchUtils} +import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.internal.types.StringTypeWithCollation +import org.apache.spark.sql.types.{AbstractDataType, ArrayType, BinaryType, DataType, DoubleType, FloatType, IntegerType, LongType, StringType, TypeCollection} +import org.apache.spark.unsafe.types.UTF8String + +sealed trait ThetaSketchState { + def serialize(): Array[Byte] + def eval(): Array[Byte] +} +case class UpdatableSketchBuffer(sketch: UpdateSketch) extends 
ThetaSketchState { + override def serialize(): Array[Byte] = sketch.rebuild.compact.toByteArrayCompressed + override def eval(): Array[Byte] = sketch.rebuild.compact.toByteArrayCompressed +} +case class UnionAggregationBuffer(union: Union) extends ThetaSketchState { + override def serialize(): Array[Byte] = union.getResult.toByteArrayCompressed + override def eval(): Array[Byte] = union.getResult.toByteArrayCompressed +} +case class IntersectionAggregationBuffer(intersection: Intersection) extends ThetaSketchState { + override def serialize(): Array[Byte] = intersection.getResult.toByteArrayCompressed + override def eval(): Array[Byte] = intersection.getResult.toByteArrayCompressed +} +case class FinalizedSketch(sketch: CompactSketch) extends ThetaSketchState { + override def serialize(): Array[Byte] = sketch.toByteArrayCompressed + override def eval(): Array[Byte] = sketch.toByteArrayCompressed +} + +/** + * The ThetaSketchAgg function utilizes a Datasketches ThetaSketch instance to count a + * probabilistic approximation of the number of unique values in a given column, and outputs the + * binary representation of the ThetaSketch. + * + * See [[https://datasketches.apache.org/docs/Theta/ThetaSketches.html]] for more information. + * + * @param left + * child expression against which unique counting will occur + * @param right + * the log-base-2 of nomEntries decides the number of buckets for the sketch + * @param mutableAggBufferOffset + * offset for mutable aggregation buffer + * @param inputAggBufferOffset + * offset for input aggregation buffer + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(expr, lgNomEntries) - Returns the ThetaSketch compact binary representation. + `lgNomEntries` (optional) is the log-base-2 of nominal entries, with nominal entries deciding + the number of buckets or slots for the ThetaSketch.
""", + examples = """ + Examples: + > SELECT theta_sketch_estimate(_FUNC_(col, 12)) FROM VALUES (1), (1), (2), (2), (3) tab(col); + 3 + """, + group = "agg_funcs", + since = "4.1.0") +// scalastyle:on line.size.limit +case class ThetaSketchAgg( + left: Expression, + right: Expression, + override val mutableAggBufferOffset: Int, + override val inputAggBufferOffset: Int) + extends TypedImperativeAggregate[ThetaSketchState] + with BinaryLike[Expression] + with ExpectsInputTypes { + + // ThetaSketch config - mark as lazy so that they're not evaluated during tree transformation. + + lazy val lgNomEntries: Int = { + val lgNomEntriesInput = right.eval().asInstanceOf[Int] + ThetaSketchUtils.checkLgNomLongs(lgNomEntriesInput, prettyName) + lgNomEntriesInput + } + + // Constructors + + def this(child: Expression) = { + this(child, Literal(ThetaSketchUtils.DEFAULT_LG_NOM_LONGS), 0, 0) + } + + def this(child: Expression, lgNomEntries: Expression) = { + this(child, lgNomEntries, 0, 0) + } + + def this(child: Expression, lgNomEntries: Int) = { + this(child, Literal(lgNomEntries), 0, 0) + } + + // Copy constructors required by ImperativeAggregate + + override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ThetaSketchAgg = + copy(mutableAggBufferOffset = newMutableAggBufferOffset) + + override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ThetaSketchAgg = + copy(inputAggBufferOffset = newInputAggBufferOffset) + + override protected def withNewChildrenInternal( + newLeft: Expression, + newRight: Expression): ThetaSketchAgg = + copy(left = newLeft, right = newRight) + + // Overrides for TypedImperativeAggregate + + override def prettyName: String = "theta_sketch_agg" + + override def inputTypes: Seq[AbstractDataType] = + Seq( + TypeCollection( + ArrayType(IntegerType), + ArrayType(LongType), + BinaryType, + DoubleType, + FloatType, + IntegerType, + LongType, + StringTypeWithCollation(supportsTrimCollation = true)), + IntegerType) + + override def 
dataType: DataType = BinaryType + + override def nullable: Boolean = false + + /** + * Instantiate an UpdateSketch instance using the lgNomEntries param. + * + * @return + * an UpdateSketch instance wrapped with UpdatableSketchBuffer + */ + override def createAggregationBuffer(): ThetaSketchState = { + val builder = new UpdateSketchBuilder + builder.setLogNominalEntries(lgNomEntries) + UpdatableSketchBuffer(builder.build) + } + + /** + * Evaluate the input row and update the UpdateSketch instance with the row's value. The update + * function only supports a subset of Spark SQL types, and an exception will be thrown for + * unsupported types. + * Notes: + * - Null values are ignored. + * - Empty byte arrays are ignored + * - Empty arrays of supported element types are ignored + * - Strings that are collation-equal to the empty string are ignored. + * + * @param updateBuffer + * A previously initialized UpdateSketch instance + * @param input + * An input row + */ + override def update(updateBuffer: ThetaSketchState, input: InternalRow): ThetaSketchState = { + // Return early for null values. + val v = left.eval(input) + if (v == null) return updateBuffer + + // Initialized buffer should be UpdatableSketchBuffer, else error out. + val sketch = updateBuffer match { + case UpdatableSketchBuffer(s) => s + case _ => throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName) + } + + // Handle the different data types for sketch updates. + left.dataType match { + case ArrayType(IntegerType, _) => + val arr = v.asInstanceOf[ArrayData].toIntArray() + sketch.update(arr) + case ArrayType(LongType, _) => + val arr = v.asInstanceOf[ArrayData].toLongArray() + sketch.update(arr) + case BinaryType => + val bytes = v.asInstanceOf[Array[Byte]] + sketch.update(bytes) + case DoubleType => + sketch.update(v.asInstanceOf[Double]) + case FloatType => + sketch.update(v.asInstanceOf[Float].toDouble) // Float is promoted to double. 
+ case IntegerType => + sketch.update(v.asInstanceOf[Int].toLong) // Int is promoted to Long. + case LongType => + sketch.update(v.asInstanceOf[Long]) + case st: StringType => + val collation = CollationFactory.fetchCollation(st.collationId) + val str = v.asInstanceOf[UTF8String] + if (!collation.equalsFunction(str, UTF8String.EMPTY_UTF8)) { + sketch.update(collation.sortKeyFunction.apply(str)) + } + case _ => + throw new SparkUnsupportedOperationException( + errorClass = "_LEGACY_ERROR_TEMP_3121", + messageParameters = Map("dataType" -> left.dataType.toString)) + } + + UpdatableSketchBuffer(sketch) + } + + /** + * Merges an input Compact sketch into the UpdateSketch which is acting as the aggregation + * buffer. + * + * @param updateBuffer + * The UpdateSketch or Union instance used to store the aggregation result + * @param input + * An input UpdateSketch, Union, or Compact sketch instance + */ + override def merge( + updateBuffer: ThetaSketchState, + input: ThetaSketchState): ThetaSketchState = { + // This is a helper function to create union only when needed. + def createUnionWith(sketch1: Sketch, sketch2: Sketch): UnionAggregationBuffer = { + val union = SetOperation.builder.setLogNominalEntries(lgNomEntries).buildUnion + union.union(sketch1) + union.union(sketch2) + UnionAggregationBuffer(union) + } + + (updateBuffer, input) match { + // Reuse the existing union in the next iteration. This is the most efficient path. + case (UnionAggregationBuffer(existingUnion), UpdatableSketchBuffer(sketch)) => + existingUnion.union(sketch.compact) + UnionAggregationBuffer(existingUnion) + case (UnionAggregationBuffer(existingUnion), FinalizedSketch(sketch)) => + existingUnion.union(sketch) + UnionAggregationBuffer(existingUnion) + case (UnionAggregationBuffer(union1), UnionAggregationBuffer(union2)) => + union1.union(union2.getResult) + UnionAggregationBuffer(union1) + // Create a new union only when necessary. 
+ case (UpdatableSketchBuffer(sketch1), UpdatableSketchBuffer(sketch2)) =>
+ createUnionWith(sketch1.compact, sketch2.compact)
+ case (UpdatableSketchBuffer(sketch1), FinalizedSketch(sketch2)) =>
+ createUnionWith(sketch1.compact, sketch2)
+ // The program should never make it here, the cases are for defensive programming.
+ case (FinalizedSketch(sketch1), UpdatableSketchBuffer(sketch2)) =>
+ createUnionWith(sketch1, sketch2.compact)
+ case (FinalizedSketch(sketch1), FinalizedSketch(sketch2)) =>
+ createUnionWith(sketch1, sketch2)
+ case _ => throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName)
+ }
+ }
+
+ /**
+ * Returns a Compact sketch derived from the input column or expression
+ *
+ * @param sketchState
+ * Union instance used as an aggregation buffer
+ * @return
+ * A Compact binary sketch
+ */
+ override def eval(sketchState: ThetaSketchState): Any = {
+ sketchState.eval()
+ }
+
+ /** Convert the underlying UpdateSketch/Union into a Compact byte array. */
+ override def serialize(sketchState: ThetaSketchState): Array[Byte] = {
+ sketchState.serialize()
+ }
+
+ /** Wrap the byte array into a Compact sketch instance. */
+ override def deserialize(buffer: Array[Byte]): ThetaSketchState = {
+ if (buffer.nonEmpty) {
+ FinalizedSketch(CompactSketch.heapify(Memory.wrap(buffer)))
+ } else {
+ this.createAggregationBuffer()
+ }
+ }
+}
+
+/**
+ * The ThetaUnionAgg function ingests and merges Datasketches ThetaSketch instances previously
+ * produced by the ThetaSketchAgg function and outputs the merged ThetaSketch.
+ *
+ * See [[https://datasketches.apache.org/docs/Theta/ThetaSketches.html]] for more information.
+ *
+ * @param left
+ * Child expression against which unique counting will occur
+ * @param right
+ * the log-base-2 of nomEntries decides the number of buckets for the sketch
+ * @param mutableAggBufferOffset
+ * offset for mutable aggregation buffer
+ * @param inputAggBufferOffset
+ * offset for input aggregation buffer
+ */
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = """
+ _FUNC_(expr, lgNomEntries) - Returns the ThetaSketch's Compact binary representation.
+ `lgNomEntries` (optional) the log-base-2 of Nominal Entries, with Nominal Entries deciding
+ the number of buckets or slots for the ThetaSketch.""",
+ examples = """
+ Examples:
+ > SELECT theta_sketch_estimate(_FUNC_(sketch)) FROM (SELECT theta_sketch_agg(col) as sketch FROM VALUES (1) tab(col) UNION ALL SELECT theta_sketch_agg(col, 20) as sketch FROM VALUES (1) tab(col));
+ 1
+ """,
+ group = "agg_funcs",
+ since = "4.1.0")
+// scalastyle:on line.size.limit
+case class ThetaUnionAgg(
+ left: Expression,
+ right: Expression,
+ override val mutableAggBufferOffset: Int,
+ override val inputAggBufferOffset: Int)
+ extends TypedImperativeAggregate[ThetaSketchState]
+ with BinaryLike[Expression]
+ with ExpectsInputTypes {
+
+ // ThetaSketch config - mark as lazy so that they're not evaluated during tree transformation.
+ + lazy val lgNomEntries: Int = { + val lgNomEntriesInput = right.eval().asInstanceOf[Int] + ThetaSketchUtils.checkLgNomLongs(lgNomEntriesInput, prettyName) + lgNomEntriesInput + } + + // Constructors + + def this(child: Expression) = { + this(child, Literal(ThetaSketchUtils.DEFAULT_LG_NOM_LONGS), 0, 0) + } + + def this(child: Expression, lgNomEntries: Expression) = { + this(child, lgNomEntries, 0, 0) + } + + def this(child: Expression, lgNomEntries: Int) = { + this(child, Literal(lgNomEntries), 0, 0) + } + + // Copy constructors required by ImperativeAggregate + + override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ThetaUnionAgg = + copy(mutableAggBufferOffset = newMutableAggBufferOffset) + + override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ThetaUnionAgg = + copy(inputAggBufferOffset = newInputAggBufferOffset) + + override protected def withNewChildrenInternal( + newLeft: Expression, + newRight: Expression): ThetaUnionAgg = + copy(left = newLeft, right = newRight) + + // Overrides for TypedImperativeAggregate + + override def prettyName: String = "theta_union_agg" + + override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType, IntegerType) + + override def dataType: DataType = BinaryType + + override def nullable: Boolean = false + + /** + * Instantiate a Union instance using the lgNomEntries param. + * + * @return + * a Union instance wrapped with UnionAggregationBuffer + */ + override def createAggregationBuffer(): ThetaSketchState = { + UnionAggregationBuffer( + SetOperation.builder + .setLogNominalEntries(lgNomEntries) + .buildUnion) + } + + /** + * Update the Union instance with the Compact sketch byte array obtained from the row. + * + * @param unionBuffer + * A previously initialized Union instance + * @param input + * An input row + */ + override def update(unionBuffer: ThetaSketchState, input: InternalRow): ThetaSketchState = { + // Return early for null input values. 
+ val v = left.eval(input) + if (v == null) return unionBuffer + + // Sketches must be in binary form to be aggregated, else error out. + left.dataType match { + case BinaryType => // Continue processing with a BinaryType. + case _ => throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName) + } + + val sketchBytes = v.asInstanceOf[Array[Byte]] + val inputSketch = ThetaSketchUtils.wrapCompactSketch(sketchBytes, prettyName) + + val union = unionBuffer match { + case UnionAggregationBuffer(existingUnionBuffer) => existingUnionBuffer + case _ => throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName) + } + union.union(inputSketch) + UnionAggregationBuffer(union) + } + + /** + * Merges an input Compact sketch into the Union which is acting as the aggregation buffer. + * + * @param unionBuffer + * The Union instance used to store the aggregation result + * @param input + * An input Union or Compact sketch instance + */ + override def merge(unionBuffer: ThetaSketchState, input: ThetaSketchState): ThetaSketchState = { + (unionBuffer, input) match { + // If both arguments are union objects, merge them directly. + case (UnionAggregationBuffer(unionLeft), UnionAggregationBuffer(unionRight)) => + unionLeft.union(unionRight.getResult) + UnionAggregationBuffer(unionLeft) + // The input was serialized then deserialized. + case (UnionAggregationBuffer(union), FinalizedSketch(sketch)) => + union.union(sketch) + UnionAggregationBuffer(union) + // The program should never make it here, the cases are for defensive programming. 
+ case (FinalizedSketch(sketch1), FinalizedSketch(sketch2)) =>
+ val union = SetOperation.builder.setLogNominalEntries(lgNomEntries).buildUnion
+ union.union(sketch1)
+ union.union(sketch2)
+ UnionAggregationBuffer(union)
+ case (FinalizedSketch(sketch), UnionAggregationBuffer(union)) =>
+ union.union(sketch)
+ UnionAggregationBuffer(union)
+ case _ => throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName)
+ }
+ }
+
+ /**
+ * Returns a Compact sketch derived from the merged sketches
+ *
+ * @param sketchState
+ * Union instance used as an aggregation buffer
+ * @return
+ * A Compact binary sketch
+ */
+ override def eval(sketchState: ThetaSketchState): Any = {
+ sketchState.eval()
+ }
+
+ /** Converts the underlying Union into a Compact byte array. */
+ override def serialize(sketchState: ThetaSketchState): Array[Byte] = {
+ sketchState.serialize()
+ }
+
+ /** Wrap the byte array into a Compact sketch instance. */
+ override def deserialize(buffer: Array[Byte]): ThetaSketchState = {
+ if (buffer.nonEmpty) {
+ FinalizedSketch(CompactSketch.heapify(Memory.wrap(buffer)))
+ } else {
+ this.createAggregationBuffer()
+ }
+ }
+}
+
+/**
+ * The ThetaIntersectionAgg function ingests and intersects Datasketches ThetaSketch instances
+ * previously produced by the ThetaSketchAgg function, and outputs the intersected ThetaSketch.
+ *
+ * See [[https://datasketches.apache.org/docs/Theta/ThetaSketches.html]] for more information.
+ *
+ * @param child
+ * Child expression against which unique counting will occur
+ * @param mutableAggBufferOffset
+ * offset for mutable aggregation buffer
+ * @param inputAggBufferOffset
+ * offset for input aggregation buffer
+ */
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = """
+ _FUNC_(expr) - Returns the ThetaSketch's Compact binary representation
+ by intersecting all the Theta sketches in the input column.""",
+ examples = """
+ Examples:
+ > SELECT theta_sketch_estimate(_FUNC_(sketch)) FROM (SELECT theta_sketch_agg(col) as sketch FROM VALUES (1) tab(col) UNION ALL SELECT theta_sketch_agg(col, 20) as sketch FROM VALUES (1) tab(col));
+ 1
+ """,
+ group = "agg_funcs",
+ since = "4.1.0")
+// scalastyle:on line.size.limit
+case class ThetaIntersectionAgg(
+ child: Expression,
+ override val mutableAggBufferOffset: Int,
+ override val inputAggBufferOffset: Int)
+ extends TypedImperativeAggregate[ThetaSketchState]
+ with UnaryLike[Expression]
+ with ExpectsInputTypes {
+
+ // Constructor
+
+ def this(child: Expression) = {
+ this(child, 0, 0)
+ }
+
+ // Copy constructors required by ImperativeAggregate
+
+ override def withNewMutableAggBufferOffset(
+ newMutableAggBufferOffset: Int): ThetaIntersectionAgg =
+ copy(mutableAggBufferOffset = newMutableAggBufferOffset)
+
+ override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ThetaIntersectionAgg =
+ copy(inputAggBufferOffset = newInputAggBufferOffset)
+
+ override protected def withNewChildInternal(newChild: Expression): ThetaIntersectionAgg =
+ copy(child = newChild)
+
+ // Overrides for TypedImperativeAggregate
+
+ override def prettyName: String = "theta_intersection_agg"
+
+ override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType)
+
+ override def dataType: DataType = BinaryType
+
+ override def nullable: Boolean = false
+
+ /**
+ * Instantiate an Intersection instance.
+ * + * @return + * an Intersection instance wrapped with IntersectionAggregationBuffer + */ + override def createAggregationBuffer(): ThetaSketchState = { + IntersectionAggregationBuffer(SetOperation.builder.buildIntersection) + } + + /** + * Update the Intersection instance with the Compact sketch byte array obtained from the row. + * + * @param intersectionBuffer + * A previously initialized Intersection instance + * @param input + * An input row + */ + override def update( + intersectionBuffer: ThetaSketchState, + input: InternalRow): ThetaSketchState = { + // Return early for null input values. + val v = child.eval(input) + if (v == null) return intersectionBuffer + + // Sketches must be in binary form to be aggregated, else error out. + child.dataType match { + case BinaryType => // Continue processing with a BinaryType. + case _ => throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName) + } + + val sketchBytes = v.asInstanceOf[Array[Byte]] + val inputSketch = ThetaSketchUtils.wrapCompactSketch(sketchBytes, prettyName) + + val intersection = intersectionBuffer match { + case IntersectionAggregationBuffer(existingIntersection) => existingIntersection + case _ => throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName) + } + intersection.intersect(inputSketch) + IntersectionAggregationBuffer(intersection) + } + + /** + * Merges an input Compact sketch into the Intersection which is acting as the aggregation + * buffer. + * + * @param intersectionBuffer + * The Intersection instance used to store the aggregation result + * @param input + * An input Intersection or Compact sketch instance + */ + override def merge( + intersectionBuffer: ThetaSketchState, + input: ThetaSketchState): ThetaSketchState = { + (intersectionBuffer, input) match { + // If both arguments are intersection objects, merge them directly. 
+ case (
+ IntersectionAggregationBuffer(intersectLeft),
+ IntersectionAggregationBuffer(intersectRight)) =>
+ intersectLeft.intersect(intersectRight.getResult)
+ IntersectionAggregationBuffer(intersectLeft)
+ // The input was serialized then deserialized.
+ case (IntersectionAggregationBuffer(intersection), FinalizedSketch(sketch)) =>
+ intersection.intersect(sketch)
+ IntersectionAggregationBuffer(intersection)
+ // The program should never make it here, the cases are for defensive programming.
+ case (FinalizedSketch(sketch1), FinalizedSketch(sketch2)) =>
+ val intersection =
+ SetOperation.builder.buildIntersection
+ intersection.intersect(sketch1)
+ intersection.intersect(sketch2)
+ IntersectionAggregationBuffer(intersection)
+ case (FinalizedSketch(sketch), IntersectionAggregationBuffer(intersection)) =>
+ intersection.intersect(sketch)
+ IntersectionAggregationBuffer(intersection)
+ case _ => throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName)
+ }
+ }
+
+ /**
+ * Returns a Compact sketch derived from the intersected sketches
+ *
+ * @param sketchState
+ * Intersection instance used as an aggregation buffer
+ * @return
+ * A Compact binary sketch
+ */
+ override def eval(sketchState: ThetaSketchState): Any = {
+ sketchState.eval()
+ }
+
+ /** Convert the underlying Intersection into a Compact byte array. */
+ override def serialize(sketchState: ThetaSketchState): Array[Byte] = {
+ sketchState.serialize()
+ }
+
+ /** Wrap the byte array into a Compact sketch.
*/ + override def deserialize(buffer: Array[Byte]): ThetaSketchState = { + if (buffer.nonEmpty) { + FinalizedSketch(CompactSketch.heapify(Memory.wrap(buffer))) + } else { + this.createAggregationBuffer() + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/thetasketchesExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/thetasketchesExpressions.scala new file mode 100644 index 0000000000000..f662f405297bb --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/thetasketchesExpressions.scala @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.datasketches.theta.SetOperation + +import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, ExpressionDescription, Literal} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.util.ThetaSketchUtils +import org.apache.spark.sql.types.{AbstractDataType, BinaryType, DataType, IntegerType, LongType} + +@ExpressionDescription( + usage = """ + _FUNC_(expr) - Returns the estimated number of unique values + given the binary representation of a Datasketches ThetaSketch. """, + examples = """ + Examples: + > SELECT _FUNC_(theta_sketch_agg(col)) FROM VALUES (1), (1), (2), (2), (3) tab(col); + 3 + """, + group = "misc_funcs", + since = "4.1.0") +case class ThetaSketchEstimate(child: Expression) + extends UnaryExpression + with CodegenFallback + with ExpectsInputTypes { + override def nullIntolerant: Boolean = true + + override protected def withNewChildInternal(newChild: Expression): ThetaSketchEstimate = + copy(child = newChild) + + override def prettyName: String = "theta_sketch_estimate" + + override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType) + + override def dataType: DataType = LongType + + override def nullSafeEval(input: Any): Any = { + val buffer = input.asInstanceOf[Array[Byte]] + + val sketch = ThetaSketchUtils.wrapCompactSketch(buffer, prettyName) + + Math.round(sketch.getEstimate) + } +} + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(first, second, lgNomEntries) - Merges two binary representations of + Datasketches ThetaSketch objects using a ThetaSketch Union object. Users can set + lgNomEntries to a value between 4 and 26 to find the union of sketches with different + union buffer size values (defaults to 12). 
""", + examples = """ + Examples: + > SELECT theta_sketch_estimate(_FUNC_(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (1, 4), (1, 4), (2, 5), (2, 5), (3, 6) tab(col1, col2); + 6 + """, + group = "misc_funcs", + since = "4.1.0") +// scalastyle:on line.size.limit +case class ThetaUnion(first: Expression, second: Expression, third: Expression) + extends TernaryExpression + with CodegenFallback + with ExpectsInputTypes { + override def nullIntolerant: Boolean = true + + def this(first: Expression, second: Expression) = { + this(first, second, Literal(ThetaSketchUtils.DEFAULT_LG_NOM_LONGS)) + } + + def this(first: Expression, second: Expression, third: Int) = { + this(first, second, Literal(third)) + } + + override protected def withNewChildrenInternal( + newFirst: Expression, + newSecond: Expression, + newThird: Expression): ThetaUnion = + copy(first = newFirst, second = newSecond, third = newThird) + + override def prettyName: String = "theta_union" + + override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType, BinaryType, IntegerType) + + override def dataType: DataType = BinaryType + + override def nullSafeEval(value1: Any, value2: Any, value3: Any): Any = { + val logNominalEntries = value3.asInstanceOf[Int] + ThetaSketchUtils.checkLgNomLongs(logNominalEntries, prettyName) + + val sketch1Bytes = value1.asInstanceOf[Array[Byte]] + val sketch1 = ThetaSketchUtils.wrapCompactSketch(sketch1Bytes, prettyName) + + val sketch2Bytes = value2.asInstanceOf[Array[Byte]] + val sketch2 = ThetaSketchUtils.wrapCompactSketch(sketch2Bytes, prettyName) + + val union = SetOperation.builder + .setLogNominalEntries(logNominalEntries) + .buildUnion + .union(sketch1, sketch2) + + union.toByteArrayCompressed + } +} + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(first, second) - Subtracts two binary representations of + Datasketches ThetaSketch objects from two input columns using a + ThetaSketch AnotB object. 
""", + examples = """ + Examples: + > SELECT theta_sketch_estimate(_FUNC_(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (5, 4), (1, 4), (2, 5), (2, 5), (3, 1) tab(col1, col2); + 2 + """, + group = "misc_funcs", + since = "4.1.0") +// scalastyle:on line.size.limit +case class ThetaDifference(first: Expression, second: Expression) + extends BinaryExpression + with CodegenFallback + with ExpectsInputTypes { + override def nullIntolerant: Boolean = true + + override def left: Expression = first + override def right: Expression = second + + override protected def withNewChildrenInternal( + newFirst: Expression, + newSecond: Expression): ThetaDifference = + copy(first = newFirst, second = newSecond) + + override def prettyName: String = "theta_difference" + + override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType, BinaryType) + + override def dataType: DataType = BinaryType + + override def nullSafeEval(value1: Any, value2: Any): Any = { + val sketch1Bytes = value1.asInstanceOf[Array[Byte]] + val sketch1 = ThetaSketchUtils.wrapCompactSketch(sketch1Bytes, prettyName) + + val sketch2Bytes = value2.asInstanceOf[Array[Byte]] + val sketch2 = ThetaSketchUtils.wrapCompactSketch(sketch2Bytes, prettyName) + + val difference = SetOperation.builder.buildANotB + .aNotB(sketch1, sketch2) + + difference.toByteArrayCompressed + } +} + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(first, second) - Intersects two binary representations of + Datasketches ThetaSketch objects from two input columns using a + ThetaSketch Intersect object. 
""", + examples = """ + Examples: + > SELECT theta_sketch_estimate(_FUNC_(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (5, 4), (1, 4), (2, 5), (2, 5), (3, 1) tab(col1, col2); + 2 + """, + group = "misc_funcs", + since = "4.1.0") +// scalastyle:on line.size.limit +case class ThetaIntersection(first: Expression, second: Expression) + extends BinaryExpression + with CodegenFallback + with ExpectsInputTypes { + override def nullIntolerant: Boolean = true + + override def left: Expression = first + override def right: Expression = second + + override protected def withNewChildrenInternal( + newFirst: Expression, + newSecond: Expression): ThetaIntersection = + copy(first = newFirst, second = newSecond) + + override def prettyName: String = "theta_intersection" + + override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType, BinaryType) + + override def dataType: DataType = BinaryType + + override def nullSafeEval(value1: Any, value2: Any): Any = { + val sketch1Bytes = value1.asInstanceOf[Array[Byte]] + val sketch1 = ThetaSketchUtils.wrapCompactSketch(sketch1Bytes, prettyName) + + val sketch2Bytes = value2.asInstanceOf[Array[Byte]] + val sketch2 = ThetaSketchUtils.wrapCompactSketch(sketch2Bytes, prettyName) + + val intersection = SetOperation.builder.buildIntersection + .intersect(sketch1, sketch2) + + intersection.toByteArrayCompressed + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtils.scala new file mode 100644 index 0000000000000..f9a651b5662db --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtils.scala @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.util + +import org.apache.datasketches.common.SketchesArgumentException +import org.apache.datasketches.memory.{Memory, MemoryBoundsException} +import org.apache.datasketches.theta.CompactSketch + +import org.apache.spark.sql.errors.QueryExecutionErrors + +object ThetaSketchUtils { + /* + * Bounds copied from DataSketches' ThetaUtil. These define the valid range for lgNomEntries, + * which is the log-base-2 of the nominal number of entries that determines the sketch size. + * The actual number of buckets in the sketch = 2^lgNomEntries. + * MIN_LG_NOM_LONGS = 4 means minimum 16 buckets (2^4), MAX_LG_NOM_LONGS = 26 means maximum + * ~67 million buckets (2^26). These bounds ensure reasonable memory usage while maintaining + * sketch accuracy for cardinality estimation. + */ + final val MIN_LG_NOM_LONGS = 4 + final val MAX_LG_NOM_LONGS = 26 + final val DEFAULT_LG_NOM_LONGS = 12 + + /** + * Validates the lgNomLongs parameter for Theta sketch size. Throws a Spark SQL exception if the + * value is out of bounds. 
+ * + * @param lgNomLongs + * Log2 of nominal entries + */ + def checkLgNomLongs(lgNomLongs: Int, prettyName: String): Unit = { + if (lgNomLongs < MIN_LG_NOM_LONGS || lgNomLongs > MAX_LG_NOM_LONGS) { + throw QueryExecutionErrors.thetaInvalidLgNomEntries( + function = prettyName, + min = MIN_LG_NOM_LONGS, + max = MAX_LG_NOM_LONGS, + value = lgNomLongs) + } + } + + /** + * Wraps a byte array into a DataSketches CompactSketch object. + * This method safely deserializes a compact Theta sketch from its binary representation, + * handling potential deserialization errors by throwing appropriate Spark SQL exceptions. + * + * @param bytes The binary representation of a compact theta sketch + * @param prettyName The display name of the function/expression for error messages + * @return A CompactSketch object wrapping the provided bytes + */ + def wrapCompactSketch(bytes: Array[Byte], prettyName: String): CompactSketch = { + val memory = try { + Memory.wrap(bytes) + } catch { + case _: NullPointerException | _: MemoryBoundsException => + throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName) + } + + try { + CompactSketch.wrap(memory) + } catch { + case _: SketchesArgumentException | _: MemoryBoundsException => + throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName) + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala index 67bb80403b9f1..18edffc4ec59e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala @@ -3112,4 +3112,20 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE ) ) } + + def thetaInvalidInputSketchBuffer(function: String): Throwable = { + new SparkRuntimeException( + errorClass = "THETA_INVALID_INPUT_SKETCH_BUFFER", + messageParameters 
= Map("function" -> toSQLId(function))) + } + + def thetaInvalidLgNomEntries(function: String, min: Int, max: Int, value: Int): Throwable = { + new SparkRuntimeException( + errorClass = "THETA_INVALID_LG_NOM_ENTRIES", + messageParameters = Map( + "function" -> toSQLId(function), + "min" -> toSQLValue(min, IntegerType), + "max" -> toSQLValue(max, IntegerType), + "value" -> toSQLValue(value, IntegerType))) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ThetasketchesAggSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ThetasketchesAggSuite.scala new file mode 100644 index 0000000000000..afb978b8204de --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ThetasketchesAggSuite.scala @@ -0,0 +1,175 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.expressions.aggregate + +import scala.collection.immutable.NumericRange +import scala.util.Random + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{BoundReference, ThetaSketchEstimate} +import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.types.{ArrayType, BinaryType, DataType, DoubleType, FloatType, IntegerType, LongType, StringType} +import org.apache.spark.unsafe.types.UTF8String + +class ThetasketchesAggSuite extends SparkFunSuite { + + def simulateUpdateMerge( + dataType: DataType, + input: Seq[Any], + numSketches: Integer = 5): (Long, NumericRange[Long]) = { + + // Create a map of the agg function instances. + val aggFunctionMap = Seq + .tabulate(numSketches)(index => { + val sketch = new ThetaSketchAgg(BoundReference(0, dataType, nullable = true)) + index -> (sketch, sketch.createAggregationBuffer()) + }) + .toMap + + // Randomly update agg function instances. + input.map(value => { + val (aggFunction, aggBuffer) = aggFunctionMap(Random.nextInt(numSketches)) + aggFunction.update(aggBuffer, InternalRow(value)) + }) + + def serializeDeserialize( + tuple: (ThetaSketchAgg, ThetaSketchState)): (ThetaSketchAgg, ThetaSketchState) = { + val (agg, buf) = tuple + val serialized = agg.serialize(buf) + (agg, agg.deserialize(serialized)) + } + + // Simulate serialization -> deserialization -> merge. 
+ val mapValues = aggFunctionMap.values + val (mergedAgg, UnionAggregationBuffer(mergedBuf)) = + mapValues.tail.foldLeft(mapValues.head)((prev, cur) => { + val (prevAgg, prevBuf) = serializeDeserialize(prev) + val (_, curBuf) = serializeDeserialize(cur) + + (prevAgg, prevAgg.merge(prevBuf, curBuf)) + }) + + val estimator = ThetaSketchEstimate(BoundReference(0, BinaryType, nullable = true)) + val estimate = + estimator.eval(InternalRow(mergedBuf.getResult.toByteArrayCompressed)).asInstanceOf[Long] + ( + estimate, + mergedBuf.getResult.getLowerBound(3).toLong to mergedBuf.getResult.getUpperBound(3).toLong) + } + + test("SPARK-52407: Test min/max values of supported datatypes") { + val intRange = Integer.MIN_VALUE to Integer.MAX_VALUE by 10000000 + val (intEstimate, intEstimateRange) = simulateUpdateMerge(IntegerType, intRange) + assert(intEstimate == intRange.size || intEstimateRange.contains(intRange.size.toLong)) + + val longRange = Long.MinValue to Long.MaxValue by 1000000000000000L + val (longEstimate, longEstimateRange) = simulateUpdateMerge(LongType, longRange) + assert(longEstimate == longRange.size || longEstimateRange.contains(longRange.size.toLong)) + + val stringRange = Seq.tabulate(1000)(i => UTF8String.fromString(Random.nextString(i + 1))) + val (stringEstimate, stringEstimateRange) = simulateUpdateMerge(StringType, stringRange) + assert( + stringEstimate == stringRange.size || + stringEstimateRange.contains(stringRange.size.toLong)) + + val binaryRange = + Seq.tabulate(1000)(i => UTF8String.fromString(Random.nextString(i + 1)).getBytes) + val (binaryEstimate, binaryEstimateRange) = simulateUpdateMerge(BinaryType, binaryRange) + assert( + binaryEstimate == binaryRange.size || + binaryEstimateRange.contains(binaryRange.size.toLong)) + + val floatRange = (1 to 1000).map(_.toFloat) + val (floatEstimate, floatRangeEst) = simulateUpdateMerge(FloatType, floatRange) + assert(floatEstimate == floatRange.size || floatRangeEst.contains(floatRange.size.toLong)) + + 
val doubleRange = (1 to 1000).map(_.toDouble) + val (doubleEstimate, doubleRangeEst) = simulateUpdateMerge(DoubleType, doubleRange) + assert(doubleEstimate == doubleRange.size || doubleRangeEst.contains(doubleRange.size.toLong)) + + val arrayIntRange = (1 to 500).map(i => ArrayData.toArrayData(Array(i, i + 1))) + val (arrayIntEstimate, arrayIntRangeEst) = + simulateUpdateMerge(ArrayType(IntegerType), arrayIntRange) + assert( + arrayIntEstimate == arrayIntRange.size || + arrayIntRangeEst.contains(arrayIntRange.size.toLong)) + + val arrayLongRange = + (1 to 500).map(i => ArrayData.toArrayData(Array(i.toLong, (i + 1).toLong))) + val (arrayLongEstimate, arrayLongRangeEst) = + simulateUpdateMerge(ArrayType(LongType), arrayLongRange) + assert( + arrayLongEstimate == arrayLongRange.size || + arrayLongRangeEst.contains(arrayLongRange.size.toLong)) + } + + test("SPARK-52407: Test lgNomEntries results in downsampling sketches during Union") { + // Create a sketch with larger configuration (more precise). + val aggFunc1 = new ThetaSketchAgg(BoundReference(0, IntegerType, nullable = true), 12) + val sketch1 = aggFunc1.createAggregationBuffer() + (0 to 100).map(i => aggFunc1.update(sketch1, InternalRow(i))) + val binary1 = aggFunc1.eval(sketch1) + + // Create a sketch with smaller configuration (less precise). + val aggFunc2 = new ThetaSketchAgg(BoundReference(0, IntegerType, nullable = true), 10) + val sketch2 = aggFunc2.createAggregationBuffer() + (0 to 100).map(i => aggFunc2.update(sketch2, InternalRow(i))) + val binary2 = aggFunc2.eval(sketch2) + + // Union the sketches. 
+ val unionAgg = new ThetaUnionAgg(BoundReference(0, BinaryType, nullable = true), 12) + val union = unionAgg.createAggregationBuffer() + unionAgg.update(union, InternalRow(binary1)) + unionAgg.update(union, InternalRow(binary2)) + val unionResult = unionAgg.eval(union) + + // Verify the estimate is still accurate despite the different configurations. + val estimate = ThetaSketchEstimate(BoundReference(0, BinaryType, nullable = true)) + .eval(InternalRow(unionResult)) + assert(estimate.asInstanceOf[Long] >= 95 && estimate.asInstanceOf[Long] <= 105) + } + + test("SPARK-52407: Test lgNomEntries results in downsampling sketches during intersection") { + // Create a sketch with a larger configuration (more precise). + val aggFunc1 = new ThetaSketchAgg(BoundReference(0, IntegerType, nullable = true), 12) + val sketch1 = aggFunc1.createAggregationBuffer() + (0 to 150).map(i => aggFunc1.update(sketch1, InternalRow(i))) + val binary1 = aggFunc1.eval(sketch1) + + // Create a sketch with a smaller configuration (less precise). + val aggFunc2 = new ThetaSketchAgg(BoundReference(0, IntegerType, nullable = true), 10) + val sketch2 = aggFunc2.createAggregationBuffer() + (50 to 200).map(i => aggFunc2.update(sketch2, InternalRow(i))) + val binary2 = aggFunc2.eval(sketch2) + + // Intersect the sketches. + val intersectionAgg = + new ThetaIntersectionAgg(BoundReference(0, BinaryType, nullable = true)) + val intersection = intersectionAgg.createAggregationBuffer() + intersectionAgg.update(intersection, InternalRow(binary1)) + intersectionAgg.update(intersection, InternalRow(binary2)) + val intersectionResult = intersectionAgg.eval(intersection) + + // Verify the estimate is still accurate despite the different configurations; + // it should be around 101 (overlap from 50 to 150).
+ val estimate = ThetaSketchEstimate(BoundReference(0, BinaryType, nullable = true)) + .eval(InternalRow(intersectionResult)) + assert(estimate.asInstanceOf[Long] >= 95 && estimate.asInstanceOf[Long] <= 105) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtilsSuite.scala new file mode 100644 index 0000000000000..1a21d7a4d6c87 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtilsSuite.scala @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.util + +import org.apache.datasketches.theta.UpdateSketch + +import org.apache.spark.{SparkFunSuite, SparkRuntimeException} +import org.apache.spark.sql.catalyst.plans.SQLHelper + +class ThetaSketchUtilsSuite extends SparkFunSuite with SQLHelper { + + test("checkLgNomLongs: accepts values within valid range") { + val validValues = + Seq(ThetaSketchUtils.MIN_LG_NOM_LONGS, 10, 20, ThetaSketchUtils.MAX_LG_NOM_LONGS) + validValues.foreach { value => + // There should be no error here. 
+ ThetaSketchUtils.checkLgNomLongs(value, "test_function") + } + } + + + test("checkLgNomLongs: throws exception for values below minimum") { + val invalidValues = Seq(ThetaSketchUtils.MIN_LG_NOM_LONGS - 1, 0, -5) + invalidValues.foreach { value => + checkError( + exception = intercept[SparkRuntimeException] { + ThetaSketchUtils.checkLgNomLongs(value, "test_function") + }, + condition = "THETA_INVALID_LG_NOM_ENTRIES", + parameters = Map( + "function" -> "`test_function`", + "min" -> ThetaSketchUtils.MIN_LG_NOM_LONGS.toString, + "max" -> ThetaSketchUtils.MAX_LG_NOM_LONGS.toString, + "value" -> value.toString + ) + ) + } + } + + test("checkLgNomLongs: throws exception for values above maximum") { + val invalidValues = Seq(ThetaSketchUtils.MAX_LG_NOM_LONGS + 1, 30, 100) + invalidValues.foreach { value => + checkError( + exception = intercept[SparkRuntimeException] { + ThetaSketchUtils.checkLgNomLongs(value, "test_function") + }, + condition = "THETA_INVALID_LG_NOM_ENTRIES", + parameters = Map( + "function" -> "`test_function`", + "min" -> ThetaSketchUtils.MIN_LG_NOM_LONGS.toString, + "max" -> ThetaSketchUtils.MAX_LG_NOM_LONGS.toString, + "value" -> value.toString + ) + ) + } + } + + test("wrapCompactSketch: successfully wraps valid sketch bytes") { + // Create a valid sketch and get its bytes. + val updateSketch = UpdateSketch.builder().build() + updateSketch.update("test1") + updateSketch.update("test2") + updateSketch.update("test3") + val compactSketch = updateSketch.compact + val validBytes = compactSketch.toByteArrayCompressed + + // Test that wrapCompactSketch can successfully wrap the valid bytes. 
+ val wrappedSketch = ThetaSketchUtils.wrapCompactSketch(validBytes, "test_function") + + assert(wrappedSketch != null) + assert(wrappedSketch.getEstimate == compactSketch.getEstimate) + assert(wrappedSketch.getRetainedEntries == compactSketch.getRetainedEntries) + } + + test("wrapCompactSketch: throws exception for null bytes") { + checkError( + exception = intercept[SparkRuntimeException] { + ThetaSketchUtils.wrapCompactSketch(null, "test_function") + }, + condition = "THETA_INVALID_INPUT_SKETCH_BUFFER", + parameters = Map("function" -> "`test_function`") + ) + } + + test("wrapCompactSketch: throws exception for empty bytes") { + checkError( + exception = intercept[SparkRuntimeException] { + ThetaSketchUtils.wrapCompactSketch(Array.empty[Byte], "test_function") + }, + condition = "THETA_INVALID_INPUT_SKETCH_BUFFER", + parameters = Map("function" -> "`test_function`") + ) + } + + test("wrapCompactSketch: throws exception for invalid bytes") { + val invalidBytes = Array[Byte](1, 2, 3, 4, 5) + checkError( + exception = intercept[SparkRuntimeException] { + ThetaSketchUtils.wrapCompactSketch(invalidBytes, "test_function") + }, + condition = "THETA_INVALID_INPUT_SKETCH_BUFFER", + parameters = Map("function" -> "`test_function`") + ) + } +} diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index 6e6d520efbac3..22c8a017ca596 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -342,6 +342,10 @@ | org.apache.spark.sql.catalyst.expressions.Subtract | - | SELECT 2 - 1 | struct<(2 - 1):int> | | org.apache.spark.sql.catalyst.expressions.Tan | tan | SELECT tan(0) | struct | | org.apache.spark.sql.catalyst.expressions.Tanh | tanh | SELECT tanh(0) | struct | +| org.apache.spark.sql.catalyst.expressions.ThetaDifference | theta_difference | SELECT 
theta_sketch_estimate(theta_difference(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (5, 4), (1, 4), (2, 5), (2, 5), (3, 1) tab(col1, col2) | struct | +| org.apache.spark.sql.catalyst.expressions.ThetaIntersection | theta_intersection | SELECT theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (5, 4), (1, 4), (2, 5), (2, 5), (3, 1) tab(col1, col2) | struct | +| org.apache.spark.sql.catalyst.expressions.ThetaSketchEstimate | theta_sketch_estimate | SELECT theta_sketch_estimate(theta_sketch_agg(col)) FROM VALUES (1), (1), (2), (2), (3) tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.ThetaUnion | theta_union | SELECT theta_sketch_estimate(theta_union(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (1, 4), (1, 4), (2, 5), (2, 5), (3, 6) tab(col1, col2) | struct | | org.apache.spark.sql.catalyst.expressions.TimeDiff | time_diff | SELECT time_diff('HOUR', TIME'20:30:29', TIME'21:30:28') | struct | | org.apache.spark.sql.catalyst.expressions.TimeTrunc | time_trunc | SELECT time_trunc('HOUR', TIME'09:32:05.359') | struct | | org.apache.spark.sql.catalyst.expressions.TimeWindow | window | SELECT a, window.start, window.end, count(*) as cnt FROM VALUES ('A1', '2021-01-01 00:00:00'), ('A1', '2021-01-01 00:04:30'), ('A1', '2021-01-01 00:06:00'), ('A2', '2021-01-01 00:01:00') AS tab(a, b) GROUP by a, window(b, '5 minutes') ORDER BY a, start | struct | @@ -463,6 +467,9 @@ | org.apache.spark.sql.catalyst.expressions.aggregate.StddevSamp | stddev | SELECT stddev(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | | org.apache.spark.sql.catalyst.expressions.aggregate.StddevSamp | stddev_samp | SELECT stddev_samp(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | | org.apache.spark.sql.catalyst.expressions.aggregate.Sum | sum | SELECT sum(col) FROM VALUES (5), (10), (15) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.ThetaIntersectionAgg | 
theta_intersection_agg | SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) FROM (SELECT theta_sketch_agg(col) as sketch FROM VALUES (1) tab(col) UNION ALL SELECT theta_sketch_agg(col, 20) as sketch FROM VALUES (1) tab(col)) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.ThetaSketchAgg | theta_sketch_agg | SELECT theta_sketch_estimate(theta_sketch_agg(col, 12)) FROM VALUES (1), (1), (2), (2), (3) tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.ThetaUnionAgg | theta_union_agg | SELECT theta_sketch_estimate(theta_union_agg(sketch)) FROM (SELECT theta_sketch_agg(col) as sketch FROM VALUES (1) tab(col) UNION ALL SELECT theta_sketch_agg(col, 20) as sketch FROM VALUES (1) tab(col)) | struct | | org.apache.spark.sql.catalyst.expressions.aggregate.TryAverageExpressionBuilder | try_avg | SELECT try_avg(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | | org.apache.spark.sql.catalyst.expressions.aggregate.TrySumExpressionBuilder | try_sum | SELECT try_sum(col) FROM VALUES (5), (10), (15) AS tab(col) | struct | | org.apache.spark.sql.catalyst.expressions.aggregate.VariancePop | var_pop | SELECT var_pop(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/thetasketch.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/thetasketch.sql.out new file mode 100644 index 0000000000000..323084223d4bc --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/thetasketch.sql.out @@ -0,0 +1,1323 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +DROP TABLE IF EXISTS t_int_1_5_through_7_11 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_int_1_5_through_7_11 + + +-- !query +CREATE TABLE t_int_1_5_through_7_11 AS +VALUES + (1, 5), (2, 6), (3, 7), (4, 8), (5, 9), (6, 10), (7, 11) AS tab(col1, col2) +-- !query analysis +CreateDataSourceTableAsSelectCommand 
`spark_catalog`.`default`.`t_int_1_5_through_7_11`, ErrorIfExists, [col1, col2] + +- SubqueryAlias tab + +- LocalRelation [col1#x, col2#x] + + +-- !query +DROP TABLE IF EXISTS t_long_1_5_through_7_11 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_long_1_5_through_7_11 + + +-- !query +CREATE TABLE t_long_1_5_through_7_11 AS +VALUES + (1L, 5L), (2L, 6L), (3L, 7L), (4L, 8L), (5L, 9L), (6L, 10L), (7L, 11L) AS tab(col1, col2) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t_long_1_5_through_7_11`, ErrorIfExists, [col1, col2] + +- SubqueryAlias tab + +- LocalRelation [col1#xL, col2#xL] + + +-- !query +DROP TABLE IF EXISTS t_double_1_1_1_4_through_1_5_1_8 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_double_1_1_1_4_through_1_5_1_8 + + +-- !query +CREATE TABLE t_double_1_1_1_4_through_1_5_1_8 AS +SELECT CAST(col1 AS DOUBLE) AS col1, CAST(col2 AS DOUBLE) AS col2 +FROM VALUES + (1.1, 1.4), (1.2, 1.5), (1.3, 1.6), (1.4, 1.7), (1.5, 1.8) AS tab(col1, col2) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t_double_1_1_1_4_through_1_5_1_8`, ErrorIfExists, [col1, col2] + +- Project [cast(col1#x as double) AS col1#x, cast(col2#x as double) AS col2#x] + +- SubqueryAlias tab + +- LocalRelation [col1#x, col2#x] + + +-- !query +DROP TABLE IF EXISTS t_float_1_1_1_4_through_1_5_1_8 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_float_1_1_1_4_through_1_5_1_8 + + +-- !query +CREATE TABLE t_float_1_1_1_4_through_1_5_1_8 AS +SELECT CAST(col1 AS FLOAT) col1, CAST(col2 AS FLOAT) col2 +FROM VALUES + (1.1, 1.4), (1.2, 1.5), (1.3, 1.6), (1.4, 1.7), (1.5, 1.8) AS tab(col1, col2) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t_float_1_1_1_4_through_1_5_1_8`, ErrorIfExists, [col1, col2] + +- Project 
[cast(col1#x as float) AS col1#x, cast(col2#x as float) AS col2#x] + +- SubqueryAlias tab + +- LocalRelation [col1#x, col2#x] + + +-- !query +DROP TABLE IF EXISTS t_string_a_d_through_e_h +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_string_a_d_through_e_h + + +-- !query +CREATE TABLE t_string_a_d_through_e_h AS +VALUES + ('a', 'd'), ('b', 'e'), ('c', 'f'), ('d', 'g'), ('e', 'h') AS tab(col1, col2) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t_string_a_d_through_e_h`, ErrorIfExists, [col1, col2] + +- SubqueryAlias tab + +- LocalRelation [col1#x, col2#x] + + +-- !query +DROP TABLE IF EXISTS t_binary_a_b_through_e_f +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_binary_a_b_through_e_f + + +-- !query +CREATE TABLE t_binary_a_b_through_e_f AS +VALUES + (X'A', X'B'), (X'B', X'C'), (X'C', X'D'), (X'D', X'E'), (X'E', X'F') AS tab(col1, col2) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t_binary_a_b_through_e_f`, ErrorIfExists, [col1, col2] + +- SubqueryAlias tab + +- LocalRelation [col1#x, col2#x] + + +-- !query +DROP TABLE IF EXISTS t_array_int_1_3_through_4_6 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_array_int_1_3_through_4_6 + + +-- !query +CREATE TABLE t_array_int_1_3_through_4_6 AS +VALUES + (ARRAY(1), ARRAY(3)), + (ARRAY(2), ARRAY(4)), + (ARRAY(3), ARRAY(5)), + (ARRAY(4), ARRAY(6)) AS tab(col1, col2) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t_array_int_1_3_through_4_6`, ErrorIfExists, [col1, col2] + +- SubqueryAlias tab + +- LocalRelation [col1#x, col2#x] + + +-- !query +DROP TABLE IF EXISTS t_array_long_1_3_through_4_6 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_array_long_1_3_through_4_6 + 
+ +-- !query +CREATE TABLE t_array_long_1_3_through_4_6 AS +VALUES + (ARRAY(1L), ARRAY(3L)), + (ARRAY(2L), ARRAY(4L)), + (ARRAY(3L), ARRAY(5L)), + (ARRAY(4L), ARRAY(6L)) AS tab(col1, col2) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t_array_long_1_3_through_4_6`, ErrorIfExists, [col1, col2] + +- SubqueryAlias tab + +- LocalRelation [col1#x, col2#x] + + +-- !query +DROP TABLE IF EXISTS t_string_collation +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_string_collation + + +-- !query +CREATE TABLE t_string_collation AS +VALUES + (''), (' '), (CAST(X'C1' AS STRING)), (CAST(X'80' AS STRING)), + ('\uFFFD'), ('Å'), ('å'), ('a\u030A'), ('Å '), ('å '), + ('a\u030A ') AS tab(col1) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t_string_collation`, ErrorIfExists, [col1] + +- SubqueryAlias tab + +- LocalRelation [col1#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) AS result FROM t_int_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col1#x, 12, 0, 0)) AS result#xL] ++- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_array_int_1_3_through_4_6 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col1#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col1, 12))#xL] ++- SubqueryAlias spark_catalog.default.t_array_int_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_int_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col2)) FROM t_array_long_1_3_through_4_6 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col2#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col2, 12))#xL] ++- SubqueryAlias 
spark_catalog.default.t_array_long_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_long_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_binary_a_b_through_e_f +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col1#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col1, 12))#xL] ++- SubqueryAlias spark_catalog.default.t_binary_a_b_through_e_f + +- Relation spark_catalog.default.t_binary_a_b_through_e_f[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_double_1_1_1_4_through_1_5_1_8 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col1#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col1, 12))#xL] ++- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col2)) FROM t_float_1_1_1_4_through_1_5_1_8 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col2#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col2, 12))#xL] ++- SubqueryAlias spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1, 22)) FROM t_int_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col1#x, 22, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col1, 22))#xL] ++- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_long_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col1#xL, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col1, 
12))#xL] ++- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 + +- Relation spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_string_a_d_through_e_h +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col1#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col1, 12))#xL] ++- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h + +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0), 12)) AS theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12), 12))#xL] ++- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1, 15), + theta_sketch_agg(col2))) FROM t_long_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union(theta_sketch_agg(col1#xL, 15, 0, 0), theta_sketch_agg(col2#xL, 12, 0, 0), 12)) AS theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 15), theta_sketch_agg(col2, 12), 12))#xL] ++- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 + +- Relation spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0), 12)) AS theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 12), 
theta_sketch_agg(col2, 12), 12))#xL] ++- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1, 6), + theta_sketch_agg(col2, 15), 15)) FROM t_float_1_1_1_4_through_1_5_1_8 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union(theta_sketch_agg(col1#x, 6, 0, 0), theta_sketch_agg(col2#x, 15, 0, 0), 15)) AS theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 6), theta_sketch_agg(col2, 15), 15))#xL] ++- SubqueryAlias spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0), 12)) AS theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12), 12))#xL] ++- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h + +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2), 20)) FROM t_binary_a_b_through_e_f +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0), 20)) AS theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12), 20))#xL] ++- SubqueryAlias spark_catalog.default.t_binary_a_b_through_e_f + +- Relation spark_catalog.default.t_binary_a_b_through_e_f[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6 
+-- !query analysis +Aggregate [theta_sketch_estimate(theta_union(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0), 12)) AS theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12), 12))#xL] ++- SubqueryAlias spark_catalog.default.t_array_int_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_int_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 13))) FROM t_array_long_1_3_through_4_6 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 13, 0, 0), 12)) AS theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 13), 12))#xL] ++- SubqueryAlias spark_catalog.default.t_array_long_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_long_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1, 5), + theta_sketch_agg(col2, 12))) FROM t_long_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1#xL, 5, 0, 0), theta_sketch_agg(col2#xL, 12, 0, 0))) AS theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 5), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 + +- Relation 
spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1, 5), + theta_sketch_agg(col2))) FROM t_float_1_1_1_4_through_1_5_1_8 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1#x, 5, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 5), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h + +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 22))) FROM t_binary_a_b_through_e_f +-- !query analysis +Aggregate 
[theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 22, 0, 0))) AS theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 22)))#xL] ++- SubqueryAlias spark_catalog.default.t_binary_a_b_through_e_f + +- Relation spark_catalog.default.t_binary_a_b_through_e_f[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_array_int_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_int_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 10))) FROM t_array_long_1_3_through_4_6 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 10, 0, 0))) AS theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 10)))#xL] ++- SubqueryAlias spark_catalog.default.t_array_long_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_long_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_difference(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + +- Relation 
spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 5))) FROM t_long_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_difference(theta_sketch_agg(col1#xL, 12, 0, 0), theta_sketch_agg(col2#xL, 5, 0, 0))) AS theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 5)))#xL] ++- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 + +- Relation spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_difference(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1, 12), + theta_sketch_agg(col2))) FROM t_float_1_1_1_4_through_1_5_1_8 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_difference(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h +-- !query analysis +Aggregate [theta_sketch_estimate(theta_difference(theta_sketch_agg(col1#x, 
12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h + +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1, 6), + theta_sketch_agg(col2, 8))) FROM t_binary_a_b_through_e_f +-- !query analysis +Aggregate [theta_sketch_estimate(theta_difference(theta_sketch_agg(col1#x, 6, 0, 0), theta_sketch_agg(col2#x, 8, 0, 0))) AS theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 6), theta_sketch_agg(col2, 8)))#xL] ++- SubqueryAlias spark_catalog.default.t_binary_a_b_through_e_f + +- Relation spark_catalog.default.t_binary_a_b_through_e_f[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_difference(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_array_int_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_int_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 4))) FROM t_array_long_1_3_through_4_6 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_difference(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 4, 0, 0))) AS theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 4)))#xL] ++- SubqueryAlias spark_catalog.default.t_array_long_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_long_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT 
theta_sketch_estimate(theta_union_agg(sketch, 15)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2, 20) as sketch FROM t_int_1_5_through_7_11) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union_agg(sketch#x, 15, 0, 0)) AS theta_sketch_estimate(theta_union_agg(sketch, 15))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + : +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 20, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 12)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_double_1_1_1_4_through_1_5_1_8) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union_agg(sketch#x, 12, 0, 0)) AS theta_sketch_estimate(theta_union_agg(sketch, 12))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 + : +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 14)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_string_a_d_through_e_h + UNION ALL + SELECT theta_sketch_agg(col2) as 
sketch FROM t_string_a_d_through_e_h) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union_agg(sketch#x, 14, 0, 0)) AS theta_sketch_estimate(theta_union_agg(sketch, 14))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h + : +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h + +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 10)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_long_1_5_through_7_11) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union_agg(sketch#x, 10, 0, 0)) AS theta_sketch_estimate(theta_union_agg(sketch, 10))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#xL, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 + : +- Relation spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet + +- Aggregate [theta_sketch_agg(col2#xL, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 + +- Relation spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 6)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_float_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_float_1_1_1_4_through_1_5_1_8) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union_agg(sketch#x, 6, 0, 0)) AS theta_sketch_estimate(theta_union_agg(sketch, 6))#xL] ++- SubqueryAlias 
__auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8 + : +- Relation spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_binary_a_b_through_e_f + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_binary_a_b_through_e_f) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union_agg(sketch#x, 12, 0, 0)) AS theta_sketch_estimate(theta_union_agg(sketch, 12))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_binary_a_b_through_e_f + : +- Relation spark_catalog.default.t_binary_a_b_through_e_f[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_binary_a_b_through_e_f + +- Relation spark_catalog.default.t_binary_a_b_through_e_f[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 12)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_int_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_int_1_3_through_4_6) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union_agg(sketch#x, 12, 0, 0)) AS theta_sketch_estimate(theta_union_agg(sketch, 12))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias 
spark_catalog.default.t_array_int_1_3_through_4_6 + : +- Relation spark_catalog.default.t_array_int_1_3_through_4_6[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_array_int_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_int_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 16)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_long_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_long_1_3_through_4_6) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union_agg(sketch#x, 16, 0, 0)) AS theta_sketch_estimate(theta_union_agg(sketch, 16))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_array_long_1_3_through_4_6 + : +- Relation spark_catalog.default.t_array_long_1_3_through_4_6[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_array_long_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_long_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_int_1_5_through_7_11) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection_agg(sketch#x, 0, 0)) AS theta_sketch_estimate(theta_intersection_agg(sketch))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + : +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 
0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_long_1_5_through_7_11) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection_agg(sketch#x, 0, 0)) AS theta_sketch_estimate(theta_intersection_agg(sketch))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#xL, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 + : +- Relation spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet + +- Aggregate [theta_sketch_agg(col2#xL, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 + +- Relation spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_float_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_float_1_1_1_4_through_1_5_1_8) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection_agg(sketch#x, 0, 0)) AS theta_sketch_estimate(theta_intersection_agg(sketch))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8 + : +- Relation spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8[col1#x,col2#x] 
parquet + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_double_1_1_1_4_through_1_5_1_8) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection_agg(sketch#x, 0, 0)) AS theta_sketch_estimate(theta_intersection_agg(sketch))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 + : +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_string_a_d_through_e_h + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_string_a_d_through_e_h) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection_agg(sketch#x, 0, 0)) AS theta_sketch_estimate(theta_intersection_agg(sketch))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h + : +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h + +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM 
t_binary_a_b_through_e_f + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_binary_a_b_through_e_f) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection_agg(sketch#x, 0, 0)) AS theta_sketch_estimate(theta_intersection_agg(sketch))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_binary_a_b_through_e_f + : +- Relation spark_catalog.default.t_binary_a_b_through_e_f[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_binary_a_b_through_e_f + +- Relation spark_catalog.default.t_binary_a_b_through_e_f[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_int_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_int_1_3_through_4_6) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection_agg(sketch#x, 0, 0)) AS theta_sketch_estimate(theta_intersection_agg(sketch))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_array_int_1_3_through_4_6 + : +- Relation spark_catalog.default.t_array_int_1_3_through_4_6[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_array_int_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_int_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_long_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_long_1_3_through_4_6) +-- !query analysis +Aggregate 
[theta_sketch_estimate(theta_intersection_agg(sketch#x, 0, 0)) AS theta_sketch_estimate(theta_intersection_agg(sketch))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_array_long_1_3_through_4_6 + : +- Relation spark_catalog.default.t_array_long_1_3_through_4_6[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_array_long_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_long_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (1), (null), (2), (null), (3) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES ('test'), (null), ('null'), (null) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (100L), (null), (200L), (null), (300L) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#xL, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#xL] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(CAST(col AS DOUBLE))) +FROM VALUES (1.1), (null), (2.2), (null), (3.3) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(cast(col#x as double), 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(CAST(col AS DOUBLE), 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT 
theta_sketch_estimate(theta_sketch_agg(CAST(col AS FLOAT))) +FROM VALUES (1.5), (null), (2.5), (null), (3.5) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(cast(col#x as float), 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(CAST(col AS FLOAT), 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (X'AA'), (null), (X'BB'), (null), (X'CC') tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(1, 2)), (null), (ARRAY(3, 4)), (null), (ARRAY(5, 6)) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(10L, 20L)), (null), (ARRAY(30L, 40L)), (null), (ARRAY(50L, 60L)) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(1, null)), (ARRAY(1)), (ARRAY(2, null, 3)), (ARRAY(4)) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(10L, null)), (ARRAY(10L)), (ARRAY(20L, null, 30L)), (ARRAY(40L)) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS 
theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY()), (ARRAY(1, 2)), (ARRAY()), (ARRAY(3, 4)) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (''), ('a'), (''), ('b'), ('c') tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (X''), (X'01'), (X'02'), (X'03'), (CAST(' ' AS BINARY)), (X'e280'), (X'c1'), (X'c120') tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) utf8_b FROM t_string_collation +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col1#x, 12, 0, 0)) AS utf8_b#xL] ++- SubqueryAlias spark_catalog.default.t_string_collation + +- Relation spark_catalog.default.t_string_collation[col1#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_LCASE)) utf8_lc FROM t_string_collation +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(collate(col1#x, UTF8_LCASE), 12, 0, 0)) AS utf8_lc#xL] ++- SubqueryAlias spark_catalog.default.t_string_collation + +- Relation spark_catalog.default.t_string_collation[col1#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE)) unicode FROM t_string_collation +-- !query analysis +Aggregate 
[theta_sketch_estimate(theta_sketch_agg(collate(col1#x, UNICODE), 12, 0, 0)) AS unicode#xL] ++- SubqueryAlias spark_catalog.default.t_string_collation + +- Relation spark_catalog.default.t_string_collation[col1#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_CI)) unicode_ci FROM t_string_collation +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(collate(col1#x, UNICODE_CI), 12, 0, 0)) AS unicode_ci#xL] ++- SubqueryAlias spark_catalog.default.t_string_collation + +- Relation spark_catalog.default.t_string_collation[col1#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_BINARY_RTRIM)) utf8_b_rt FROM t_string_collation +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(collate(col1#x, UTF8_BINARY_RTRIM), 12, 0, 0)) AS utf8_b_rt#xL] ++- SubqueryAlias spark_catalog.default.t_string_collation + +- Relation spark_catalog.default.t_string_collation[col1#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_LCASE_RTRIM)) utf8_lc_rt FROM t_string_collation +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(collate(col1#x, UTF8_LCASE_RTRIM), 12, 0, 0)) AS utf8_lc_rt#xL] ++- SubqueryAlias spark_catalog.default.t_string_collation + +- Relation spark_catalog.default.t_string_collation[col1#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_RTRIM)) unicode_rt FROM t_string_collation +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(collate(col1#x, UNICODE_RTRIM), 12, 0, 0)) AS unicode_rt#xL] ++- SubqueryAlias spark_catalog.default.t_string_collation + +- Relation spark_catalog.default.t_string_collation[col1#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_CI_RTRIM)) unicode_ci_rt FROM t_string_collation +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(collate(col1#x, 
UNICODE_CI_RTRIM), 12, 0, 0)) AS unicode_ci_rt#xL] ++- SubqueryAlias spark_catalog.default.t_string_collation + +- Relation spark_catalog.default.t_string_collation[col1#x] parquet + + +-- !query +WITH sketches AS ( + SELECT 'int_sketch' as sketch_type, theta_sketch_agg(col1, 12) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT 'long_sketch' as sketch_type, theta_sketch_agg(col1, 15) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT 'double_sketch' as sketch_type, theta_sketch_agg(col1, 10) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT 'string_sketch' as sketch_type, theta_sketch_agg(col1, 14) as sketch FROM t_string_a_d_through_e_h +), +union_result AS ( + SELECT theta_union_agg(sketch, 16) as union_sketch FROM sketches +), +individual_sketches AS ( + SELECT theta_sketch_agg(col1, 12) as sketch1, theta_sketch_agg(col2, 12) as sketch2 FROM t_int_1_5_through_7_11 +) +SELECT + theta_sketch_estimate((SELECT union_sketch FROM union_result)) as union_estimate, + theta_sketch_estimate(theta_union(sketch1, sketch2, 15)) as binary_union_estimate, + theta_sketch_estimate(theta_intersection(sketch1, sketch2)) as intersection_estimate, + theta_sketch_estimate(theta_difference(sketch1, sketch2)) as difference_estimate +FROM individual_sketches +-- !query analysis +WithCTE +:- CTERelationDef xxxx, false +: +- SubqueryAlias sketches +: +- Union false, false +: :- Union false, false +: : :- Union false, false +: : : :- Aggregate [int_sketch AS sketch_type#x, theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] +: : : : +- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 +: : : : +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet +: : : +- Aggregate [long_sketch AS sketch_type#x, theta_sketch_agg(col1#xL, 15, 0, 0) AS sketch#x] +: : : +- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 +: : : +- Relation spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet +: : +- Aggregate 
[double_sketch AS sketch_type#x, theta_sketch_agg(col1#x, 10, 0, 0) AS sketch#x] +: : +- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 +: : +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet +: +- Aggregate [string_sketch AS sketch_type#x, theta_sketch_agg(col1#x, 14, 0, 0) AS sketch#x] +: +- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h +: +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet +:- CTERelationDef xxxx, false +: +- SubqueryAlias union_result +: +- Aggregate [theta_union_agg(sketch#x, 16, 0, 0) AS union_sketch#x] +: +- SubqueryAlias sketches +: +- CTERelationRef xxxx, true, [sketch_type#x, sketch#x], false, false, 4 +:- CTERelationDef xxxx, false +: +- SubqueryAlias individual_sketches +: +- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch1#x, theta_sketch_agg(col2#x, 12, 0, 0) AS sketch2#x] +: +- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 +: +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet ++- Project [theta_sketch_estimate(scalar-subquery#x []) AS union_estimate#xL, theta_sketch_estimate(theta_union(sketch1#x, sketch2#x, 15)) AS binary_union_estimate#xL, theta_sketch_estimate(theta_intersection(sketch1#x, sketch2#x)) AS intersection_estimate#xL, theta_sketch_estimate(theta_difference(sketch1#x, sketch2#x)) AS difference_estimate#xL] + : +- Project [union_sketch#x] + : +- SubqueryAlias union_result + : +- CTERelationRef xxxx, true, [union_sketch#x], false, false, 1 + +- SubqueryAlias individual_sketches + +- CTERelationRef xxxx, true, [sketch1#x, sketch2#x], false, false, 1 + + +-- !query +SELECT theta_sketch_agg(col, 2) +FROM VALUES (50), (60), (60) tab(col) +-- !query analysis +Aggregate [theta_sketch_agg(col#x, 2, 0, 0) AS theta_sketch_agg(col, 2)#x] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_agg(col, 40) +FROM VALUES (50), (60), (60) tab(col) +-- 
!query analysis +Aggregate [theta_sketch_agg(col#x, 40, 0, 0) AS theta_sketch_agg(col, 40)#x] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_union_agg(sketch, 3) +FROM (SELECT theta_sketch_agg(col, 12) as sketch + FROM VALUES (1) AS tab(col) + UNION ALL + SELECT theta_sketch_agg(col, 20) as sketch + FROM VALUES (1) AS tab(col)) +-- !query analysis +Aggregate [theta_union_agg(sketch#x, 3, 0, 0) AS theta_union_agg(sketch, 3)#x] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias tab + : +- LocalRelation [col#x] + +- Aggregate [theta_sketch_agg(col#x, 20, 0, 0) AS sketch#x] + +- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_union_agg(sketch, 27) +FROM (SELECT theta_sketch_agg(col, 12) as sketch + FROM VALUES (1) AS tab(col) + UNION ALL + SELECT theta_sketch_agg(col, 20) as sketch + FROM VALUES (1) AS tab(col)) +-- !query analysis +Aggregate [theta_union_agg(sketch#x, 27, 0, 0) AS theta_union_agg(sketch, 27)#x] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias tab + : +- LocalRelation [col#x] + +- Aggregate [theta_sketch_agg(col#x, 20, 0, 0) AS sketch#x] + +- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_union(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"1\"", + "inputType" : "\"INT\"", + "paramIndex" : "first", + "requiredType" : "\"BINARY\"", + "sqlExpr" : "\"theta_union(1, 2, 12)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 24, + "fragment" : "theta_union(1, 2)" + } ] +} + + +-- 
!query +SELECT theta_intersection(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"1\"", + "inputType" : "\"INT\"", + "paramIndex" : "first", + "requiredType" : "\"BINARY\"", + "sqlExpr" : "\"theta_intersection(1, 2)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 31, + "fragment" : "theta_intersection(1, 2)" + } ] +} + + +-- !query +SELECT theta_difference(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"1\"", + "inputType" : "\"INT\"", + "paramIndex" : "first", + "requiredType" : "\"BINARY\"", + "sqlExpr" : "\"theta_difference(1, 2)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 29, + "fragment" : "theta_difference(1, 2)" + } ] +} + + +-- !query +SELECT theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2), 'invalid') + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"invalid\"", + "inputType" : "\"STRING\"", + "paramIndex" : "third", + "requiredType" : "\"INT\"", + "sqlExpr" : "\"theta_union(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12), invalid)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 86, + "fragment" : "theta_union(\n theta_sketch_agg(col1),\n theta_sketch_agg(col2), 'invalid')" + } ] +} + + 
+-- !query +SELECT theta_intersection( + theta_sketch_agg(col1), + 'invalid_sketch') + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"invalid_sketch\"", + "inputType" : "\"STRING\"", + "paramIndex" : "second", + "requiredType" : "\"BINARY\"", + "sqlExpr" : "\"theta_intersection(theta_sketch_agg(col1, 12), invalid_sketch)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 76, + "fragment" : "theta_intersection(\n theta_sketch_agg(col1),\n 'invalid_sketch')" + } ] +} + + +-- !query +SELECT theta_sketch_estimate(CAST('abc' AS BINARY)) +-- !query analysis +Project [theta_sketch_estimate(cast(abc as binary)) AS theta_sketch_estimate(CAST(abc AS BINARY))#xL] ++- OneRowRelation + + +-- !query +SELECT theta_union(CAST('abc' AS BINARY), CAST('def' AS BINARY)) +-- !query analysis +Project [theta_union(cast(abc as binary), cast(def as binary), 12) AS theta_union(CAST(abc AS BINARY), CAST(def AS BINARY), 12)#x] ++- OneRowRelation + + +-- !query +SELECT theta_intersection(CAST('abc' AS BINARY), CAST('def' AS BINARY)) +-- !query analysis +Project [theta_intersection(cast(abc as binary), cast(def as binary)) AS theta_intersection(CAST(abc AS BINARY), CAST(def AS BINARY))#x] ++- OneRowRelation + + +-- !query +SELECT theta_difference(CAST('abc' AS BINARY), CAST('def' AS BINARY)) +-- !query analysis +Project [theta_difference(cast(abc as binary), cast(def as binary)) AS theta_difference(CAST(abc AS BINARY), CAST(def AS BINARY))#x] ++- OneRowRelation + + +-- !query +SELECT theta_union_agg(buffer, 15) +FROM (SELECT CAST('abc' AS BINARY) AS buffer) +-- !query analysis +Aggregate [theta_union_agg(buffer#x, 15, 0, 0) AS theta_union_agg(buffer, 15)#x] ++- SubqueryAlias __auto_generated_subquery_name + +- Project 
[cast(abc as binary) AS buffer#x] + +- OneRowRelation + + +-- !query +SELECT theta_intersection_agg(buffer) +FROM (SELECT CAST('abc' AS BINARY) AS buffer) +-- !query analysis +Aggregate [theta_intersection_agg(buffer#x, 0, 0) AS theta_intersection_agg(buffer)#x] ++- SubqueryAlias __auto_generated_subquery_name + +- Project [cast(abc as binary) AS buffer#x] + +- OneRowRelation + + +-- !query +DROP TABLE IF EXISTS t_int_1_5_through_7_11 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_int_1_5_through_7_11 + + +-- !query +DROP TABLE IF EXISTS t_long_1_5_through_7_11 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_long_1_5_through_7_11 + + +-- !query +DROP TABLE IF EXISTS t_double_1_1_1_4_through_1_5_1_8 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_double_1_1_1_4_through_1_5_1_8 + + +-- !query +DROP TABLE IF EXISTS t_float_1_1_1_4_through_1_5_1_8 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_float_1_1_1_4_through_1_5_1_8 + + +-- !query +DROP TABLE IF EXISTS t_string_a_d_through_e_h +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_string_a_d_through_e_h + + +-- !query +DROP TABLE IF EXISTS t_binary_a_b_through_e_f +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_binary_a_b_through_e_f + + +-- !query +DROP TABLE IF EXISTS t_array_int_1_3_through_4_6 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_array_int_1_3_through_4_6 + + +-- !query +DROP TABLE IF EXISTS t_array_long_1_3_through_4_6 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_array_long_1_3_through_4_6 + + +-- !query +DROP TABLE IF 
EXISTS t_string_collation +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_string_collation diff --git a/sql/core/src/test/resources/sql-tests/inputs/thetasketch.sql b/sql/core/src/test/resources/sql-tests/inputs/thetasketch.sql new file mode 100644 index 0000000000000..d270442b50499 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/thetasketch.sql @@ -0,0 +1,528 @@ +-- Positive test cases +-- Create tables with two columns for each data type + +-- Integer table +DROP TABLE IF EXISTS t_int_1_5_through_7_11; +CREATE TABLE t_int_1_5_through_7_11 AS +VALUES + (1, 5), (2, 6), (3, 7), (4, 8), (5, 9), (6, 10), (7, 11) AS tab(col1, col2); + +-- Long table +DROP TABLE IF EXISTS t_long_1_5_through_7_11; +CREATE TABLE t_long_1_5_through_7_11 AS +VALUES + (1L, 5L), (2L, 6L), (3L, 7L), (4L, 8L), (5L, 9L), (6L, 10L), (7L, 11L) AS tab(col1, col2); + +-- Double table +DROP TABLE IF EXISTS t_double_1_1_1_4_through_1_5_1_8; +CREATE TABLE t_double_1_1_1_4_through_1_5_1_8 AS +SELECT CAST(col1 AS DOUBLE) AS col1, CAST(col2 AS DOUBLE) AS col2 +FROM VALUES + (1.1, 1.4), (1.2, 1.5), (1.3, 1.6), (1.4, 1.7), (1.5, 1.8) AS tab(col1, col2); + +-- Float table (must cast, otherwise Spark will store DOUBLEs) +DROP TABLE IF EXISTS t_float_1_1_1_4_through_1_5_1_8; +CREATE TABLE t_float_1_1_1_4_through_1_5_1_8 AS +SELECT CAST(col1 AS FLOAT) col1, CAST(col2 AS FLOAT) col2 +FROM VALUES + (1.1, 1.4), (1.2, 1.5), (1.3, 1.6), (1.4, 1.7), (1.5, 1.8) AS tab(col1, col2); + +-- String table +DROP TABLE IF EXISTS t_string_a_d_through_e_h; +CREATE TABLE t_string_a_d_through_e_h AS +VALUES + ('a', 'd'), ('b', 'e'), ('c', 'f'), ('d', 'g'), ('e', 'h') AS tab(col1, col2); + +-- Binary table +DROP TABLE IF EXISTS t_binary_a_b_through_e_f; +CREATE TABLE t_binary_a_b_through_e_f AS +VALUES + (X'A', X'B'), (X'B', X'C'), (X'C', X'D'), (X'D', X'E'), (X'E', X'F') AS tab(col1, col2); + +-- Array Integer table +DROP TABLE IF EXISTS 
t_array_int_1_3_through_4_6; +CREATE TABLE t_array_int_1_3_through_4_6 AS +VALUES + (ARRAY(1), ARRAY(3)), + (ARRAY(2), ARRAY(4)), + (ARRAY(3), ARRAY(5)), + (ARRAY(4), ARRAY(6)) AS tab(col1, col2); + +-- Array Long table +DROP TABLE IF EXISTS t_array_long_1_3_through_4_6; +CREATE TABLE t_array_long_1_3_through_4_6 AS +VALUES + (ARRAY(1L), ARRAY(3L)), + (ARRAY(2L), ARRAY(4L)), + (ARRAY(3L), ARRAY(5L)), + (ARRAY(4L), ARRAY(6L)) AS tab(col1, col2); + +DROP TABLE IF EXISTS t_string_collation; +-- `\u030A` is the "combining ring above" Unicode character: https://www.compart.com/en/unicode/U+030A +-- `\uFFFD is the Unicode replacement character +-- `\xC1` is an invalid Unicode byte. +-- `\x80` is a Unicode continuation byte, that is it cannot be the first byte of a multi-byte UTF8 character. +-- All strings are different based on the UTF8_BINARY collation. +-- The first and second strings are equal for any collation with the RTRIM modifier, and equal to the empty string. +-- The last three strings are respectively equal to the next last three strings for any collation with the RTRIM modifier. +-- The strings "\xC1", "\x80" and "\uFFFD" are equal for all collations except UTF8_BINARY. +-- The (sub)strings `å` and `a\u030A` are equal for the UNICODE family of collations. +-- `å` is the lowercase version of `Å`. 
+CREATE TABLE t_string_collation AS +VALUES + (''), (' '), (CAST(X'C1' AS STRING)), (CAST(X'80' AS STRING)), + ('\uFFFD'), ('Å'), ('å'), ('a\u030A'), ('Å '), ('å '), + ('a\u030A ') AS tab(col1); + +-- Test basic theta_sketch_agg with IntegerType from table +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) AS result FROM t_int_1_5_through_7_11; + +-- Test theta_sketch_agg with ArrayType(IntegerType) values from table +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_array_int_1_3_through_4_6; + +-- Test theta_sketch_agg with ArrayType(LongType) values from table +SELECT theta_sketch_estimate(theta_sketch_agg(col2)) FROM t_array_long_1_3_through_4_6; + +-- Test theta_sketch_agg with BinaryType values from table +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_binary_a_b_through_e_f; + +-- Test theta_sketch_agg with DoubleType values from table +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_double_1_1_1_4_through_1_5_1_8; + +-- Test theta_sketch_agg with FloatType values from table (promoted to Double internally) +SELECT theta_sketch_estimate(theta_sketch_agg(col2)) FROM t_float_1_1_1_4_through_1_5_1_8; + +-- Test theta_sketch_agg with IntegerType and explicit lgNomEntries parameter +SELECT theta_sketch_estimate(theta_sketch_agg(col1, 22)) FROM t_int_1_5_through_7_11; + +-- Test theta_sketch_agg with LongType values +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_long_1_5_through_7_11; + +-- Test theta_sketch_agg with StringType values +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_string_a_d_through_e_h; + +-- Test theta_union function with IntegerType sketches +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11; + +-- Test theta_union function with LongType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1, 15), + theta_sketch_agg(col2))) FROM 
t_long_1_5_through_7_11; + +-- Test theta_union function with DoubleType sketches +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8; + +-- Test theta_union function with FloatType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1, 6), + theta_sketch_agg(col2, 15), 15)) FROM t_float_1_1_1_4_through_1_5_1_8; + +-- Test theta_union function with StringType sketches +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h; + +-- Test theta_union function with BinaryType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2), 20)) FROM t_binary_a_b_through_e_f; + +-- Test theta_union function with ArrayType(IntegerType) sketches +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6; + +-- Test theta_union function with ArrayType(LongType) sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 13))) FROM t_array_long_1_3_through_4_6; + +-- Test theta_intersection function with IntegerType sketches +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11; + +-- Test theta_intersection function with LongType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1, 5), + theta_sketch_agg(col2, 12))) FROM t_long_1_5_through_7_11; + +-- Test theta_intersection function with DoubleType sketches +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8; + +-- Test theta_intersection 
function with FloatType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1, 5), + theta_sketch_agg(col2))) FROM t_float_1_1_1_4_through_1_5_1_8; + +-- Test theta_intersection function with StringType sketches +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h; + +-- Test theta_intersection function with BinaryType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 22))) FROM t_binary_a_b_through_e_f; + +-- Test theta_intersection function with ArrayType(IntegerType) sketches +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6; + +-- Test theta_intersection function with ArrayType(LongType) sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 10))) FROM t_array_long_1_3_through_4_6; + +-- Test theta_difference function with IntegerType sketches +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11; + +-- Test theta_difference function with LongType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 5))) FROM t_long_1_5_through_7_11; + +-- Test theta_difference function with DoubleType sketches +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8; + +-- Test theta_difference function with FloatType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1, 12), + theta_sketch_agg(col2))) FROM 
t_float_1_1_1_4_through_1_5_1_8; + +-- Test theta_difference function with StringType sketches +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h; + +-- Test theta_difference function with BinaryType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1, 6), + theta_sketch_agg(col2, 8))) FROM t_binary_a_b_through_e_f; + +-- Test theta_difference function with ArrayType(IntegerType) sketches +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6; + +-- Test theta_difference function with ArrayType(LongType) sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 4))) FROM t_array_long_1_3_through_4_6; + +-- Test theta_union_agg with IntegerType and explicit lgNomEntries parameter +SELECT theta_sketch_estimate(theta_union_agg(sketch, 15)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2, 20) as sketch FROM t_int_1_5_through_7_11); + +-- Test theta_union_agg with DoubleType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate(theta_union_agg(sketch, 12)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_double_1_1_1_4_through_1_5_1_8); + +-- Test theta_union_agg with StringType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate(theta_union_agg(sketch, 14)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_string_a_d_through_e_h + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_string_a_d_through_e_h); + +-- Test theta_union_agg with LongType sketches and explicit lgNomEntries parameter +SELECT 
theta_sketch_estimate(theta_union_agg(sketch, 10)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_long_1_5_through_7_11); + +-- Test theta_union_agg with FloatType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate(theta_union_agg(sketch, 6)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_float_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_float_1_1_1_4_through_1_5_1_8); + +-- Test theta_union_agg with BinaryType sketches +SELECT theta_sketch_estimate(theta_union_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_binary_a_b_through_e_f + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_binary_a_b_through_e_f); + +-- Test theta_union_agg with ArrayType(IntegerType) sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate(theta_union_agg(sketch, 12)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_int_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_int_1_3_through_4_6); + +-- Test theta_union_agg with ArrayType(LongType) sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate(theta_union_agg(sketch, 16)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_long_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_long_1_3_through_4_6); + +-- Test theta_intersection_agg with IntegerType sketches +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_int_1_5_through_7_11); + +-- Test theta_intersection_agg with LongType sketches +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch 
FROM t_long_1_5_through_7_11); + +-- Test theta_intersection_agg with FloatType sketches +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_float_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_float_1_1_1_4_through_1_5_1_8); + +-- Test theta_intersection_agg with DoubleType sketches +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_double_1_1_1_4_through_1_5_1_8); + +-- Test theta_intersection_agg with StringType sketches +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_string_a_d_through_e_h + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_string_a_d_through_e_h); + +-- Test theta_intersection_agg with BinaryType sketches +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_binary_a_b_through_e_f + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_binary_a_b_through_e_f); + +-- Test theta_intersection_agg with ArrayType(IntegerType) sketches +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_int_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_int_1_3_through_4_6); + +-- Test theta_intersection_agg with ArrayType(LongType) sketches +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_long_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_long_1_3_through_4_6); + +-- Test theta_sketch_agg with IntegerType and null values (nulls should be ignored) +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (1), (null), (2), (null), (3) tab(col); + 
+-- Test theta_sketch_agg with StringType and null values (nulls should be ignored) +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES ('test'), (null), ('null'), (null) tab(col); + +-- Test theta_sketch_agg with LongType and null values (nulls should be ignored) +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (100L), (null), (200L), (null), (300L) tab(col); + +-- Test theta_sketch_agg with DoubleType and null values (nulls should be ignored) +SELECT theta_sketch_estimate(theta_sketch_agg(CAST(col AS DOUBLE))) +FROM VALUES (1.1), (null), (2.2), (null), (3.3) tab(col); + +-- Test theta_sketch_agg with FloatType and null values (nulls should be ignored) +SELECT theta_sketch_estimate(theta_sketch_agg(CAST(col AS FLOAT))) +FROM VALUES (1.5), (null), (2.5), (null), (3.5) tab(col); + +-- Test theta_sketch_agg with BinaryType and null values (nulls should be ignored) +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (X'AA'), (null), (X'BB'), (null), (X'CC') tab(col); + +-- Test theta_sketch_agg with ArrayType(IntegerType) and null values (nulls should be ignored) +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(1, 2)), (null), (ARRAY(3, 4)), (null), (ARRAY(5, 6)) tab(col); + +-- Test theta_sketch_agg with ArrayType(LongType) and null values (nulls should be ignored) +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(10L, 20L)), (null), (ARRAY(30L, 40L)), (null), (ARRAY(50L, 60L)) tab(col); + +-- Test theta_sketch_agg with arrays containing null elements +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(1, null)), (ARRAY(1)), (ARRAY(2, null, 3)), (ARRAY(4)) tab(col); + +-- Test theta_sketch_agg with arrays containing null elements (LongType) +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(10L, null)), (ARRAY(10L)), (ARRAY(20L, null, 30L)), (ARRAY(40L)) tab(col); + +-- Test theta_sketch_agg with empty arrays +SELECT 
theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY()), (ARRAY(1, 2)), (ARRAY()), (ARRAY(3, 4)) tab(col); + +-- Test theta_sketch_agg with empty strings +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (''), ('a'), (''), ('b'), ('c') tab(col); + +-- Test theta_sketch_agg with empty binary data +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (X''), (X'01'), (X'02'), (X'03'), (CAST(' ' AS BINARY)), (X'e280'), (X'c1'), (X'c120') tab(col); + +-- Test theta_sketch_agg with collated string data +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) utf8_b FROM t_string_collation; +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_LCASE)) utf8_lc FROM t_string_collation; +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE)) unicode FROM t_string_collation; +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_CI)) unicode_ci FROM t_string_collation; +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_BINARY_RTRIM)) utf8_b_rt FROM t_string_collation; +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_LCASE_RTRIM)) utf8_lc_rt FROM t_string_collation; +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_RTRIM)) unicode_rt FROM t_string_collation; +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_CI_RTRIM)) unicode_ci_rt FROM t_string_collation; + +-- Comprehensive test using all ThetaSketch functions in a single query +-- This query demonstrates the full workflow: aggregation -> union -> intersection -> difference -> estimate +WITH sketches AS ( + SELECT 'int_sketch' as sketch_type, theta_sketch_agg(col1, 12) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT 'long_sketch' as sketch_type, theta_sketch_agg(col1, 15) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT 'double_sketch' as sketch_type, theta_sketch_agg(col1, 10) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT 'string_sketch' as 
sketch_type, theta_sketch_agg(col1, 14) as sketch FROM t_string_a_d_through_e_h +), +union_result AS ( + SELECT theta_union_agg(sketch, 16) as union_sketch FROM sketches +), +individual_sketches AS ( + SELECT theta_sketch_agg(col1, 12) as sketch1, theta_sketch_agg(col2, 12) as sketch2 FROM t_int_1_5_through_7_11 +) +SELECT + -- Basic estimate from union of all sketches + theta_sketch_estimate((SELECT union_sketch FROM union_result)) as union_estimate, + -- Union of two individual sketches + theta_sketch_estimate(theta_union(sketch1, sketch2, 15)) as binary_union_estimate, + -- Intersection of two individual sketches + theta_sketch_estimate(theta_intersection(sketch1, sketch2)) as intersection_estimate, + -- Difference of two individual sketches + theta_sketch_estimate(theta_difference(sketch1, sketch2)) as difference_estimate +FROM individual_sketches; + +-- Negative test cases + +-- Test theta_sketch_agg with lgNomEntries value of 2 (too low, minimum is 4) - should fail +SELECT theta_sketch_agg(col, 2) +FROM VALUES (50), (60), (60) tab(col); + +-- Test theta_sketch_agg with lgNomEntries value of 40 (too high, maximum is 26) - should fail +SELECT theta_sketch_agg(col, 40) +FROM VALUES (50), (60), (60) tab(col); + +-- Test theta_union_agg with lgNomEntries value of 3 (too low, minimum is 4) - should fail +SELECT theta_union_agg(sketch, 3) +FROM (SELECT theta_sketch_agg(col, 12) as sketch + FROM VALUES (1) AS tab(col) + UNION ALL + SELECT theta_sketch_agg(col, 20) as sketch + FROM VALUES (1) AS tab(col)); + +-- Test theta_union_agg with lgNomEntries value of 27 (too high, maximum is 26) - should fail +SELECT theta_union_agg(sketch, 27) +FROM (SELECT theta_sketch_agg(col, 12) as sketch + FROM VALUES (1) AS tab(col) + UNION ALL + SELECT theta_sketch_agg(col, 20) as sketch + FROM VALUES (1) AS tab(col)); + +-- Test theta_union with integers (1, 2) instead of binary sketch data - should fail +SELECT theta_union(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS 
tab(col1, col2); + +-- Test theta_intersection with integers (1, 2) instead of binary sketch data - should fail +SELECT theta_intersection(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2); + +-- Test theta_difference with integers (1, 2) instead of binary sketch data - should fail +SELECT theta_difference(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2); + +-- Test theta_union with string 'invalid' instead of integer for lgNomEntries parameter - should fail +SELECT theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2), 'invalid') + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2); + +-- Test theta_intersection with string 'invalid_sketch' instead of binary sketch data - should fail +SELECT theta_intersection( + theta_sketch_agg(col1), + 'invalid_sketch') + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2); + +-- Test theta_sketch_estimate with invalid binary data ('abc') that is not a valid theta sketch - should fail +SELECT theta_sketch_estimate(CAST('abc' AS BINARY)); + +-- Test theta_union with invalid binary data ('abc', 'def') that are not valid theta sketches - should fail +SELECT theta_union(CAST('abc' AS BINARY), CAST('def' AS BINARY)); + +-- Test theta_intersection with invalid binary data ('abc', 'def') that are not valid theta sketches - should fail +SELECT theta_intersection(CAST('abc' AS BINARY), CAST('def' AS BINARY)); + +-- Test theta_difference with invalid binary data ('abc', 'def') that are not valid theta sketches - should fail +SELECT theta_difference(CAST('abc' AS BINARY), CAST('def' AS BINARY)); + +-- Test theta_union_agg with invalid binary data ('abc') that is not a valid theta sketch - should fail +SELECT theta_union_agg(buffer, 15) +FROM (SELECT CAST('abc' AS BINARY) AS buffer); + +-- Test theta_intersection_agg with invalid binary data ('abc') that is not a valid theta sketch - should fail +SELECT theta_intersection_agg(buffer) +FROM (SELECT CAST('abc' AS BINARY) AS 
buffer); + +-- Clean up +DROP TABLE IF EXISTS t_int_1_5_through_7_11; +DROP TABLE IF EXISTS t_long_1_5_through_7_11; +DROP TABLE IF EXISTS t_double_1_1_1_4_through_1_5_1_8; +DROP TABLE IF EXISTS t_float_1_1_1_4_through_1_5_1_8; +DROP TABLE IF EXISTS t_string_a_d_through_e_h; +DROP TABLE IF EXISTS t_binary_a_b_through_e_f; +DROP TABLE IF EXISTS t_array_int_1_3_through_4_6; +DROP TABLE IF EXISTS t_array_long_1_3_through_4_6; +DROP TABLE IF EXISTS t_string_collation \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/results/thetasketch.sql.out b/sql/core/src/test/resources/sql-tests/results/thetasketch.sql.out new file mode 100644 index 0000000000000..95c6e28a8c426 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/thetasketch.sql.out @@ -0,0 +1,1294 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +DROP TABLE IF EXISTS t_int_1_5_through_7_11 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t_int_1_5_through_7_11 AS +VALUES + (1, 5), (2, 6), (3, 7), (4, 8), (5, 9), (6, 10), (7, 11) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_long_1_5_through_7_11 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t_long_1_5_through_7_11 AS +VALUES + (1L, 5L), (2L, 6L), (3L, 7L), (4L, 8L), (5L, 9L), (6L, 10L), (7L, 11L) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_double_1_1_1_4_through_1_5_1_8 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t_double_1_1_1_4_through_1_5_1_8 AS +SELECT CAST(col1 AS DOUBLE) AS col1, CAST(col2 AS DOUBLE) AS col2 +FROM VALUES + (1.1, 1.4), (1.2, 1.5), (1.3, 1.6), (1.4, 1.7), (1.5, 1.8) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_float_1_1_1_4_through_1_5_1_8 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE 
t_float_1_1_1_4_through_1_5_1_8 AS +SELECT CAST(col1 AS FLOAT) col1, CAST(col2 AS FLOAT) col2 +FROM VALUES + (1.1, 1.4), (1.2, 1.5), (1.3, 1.6), (1.4, 1.7), (1.5, 1.8) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_string_a_d_through_e_h +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t_string_a_d_through_e_h AS +VALUES + ('a', 'd'), ('b', 'e'), ('c', 'f'), ('d', 'g'), ('e', 'h') AS tab(col1, col2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_binary_a_b_through_e_f +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t_binary_a_b_through_e_f AS +VALUES + (X'A', X'B'), (X'B', X'C'), (X'C', X'D'), (X'D', X'E'), (X'E', X'F') AS tab(col1, col2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_array_int_1_3_through_4_6 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t_array_int_1_3_through_4_6 AS +VALUES + (ARRAY(1), ARRAY(3)), + (ARRAY(2), ARRAY(4)), + (ARRAY(3), ARRAY(5)), + (ARRAY(4), ARRAY(6)) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_array_long_1_3_through_4_6 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t_array_long_1_3_through_4_6 AS +VALUES + (ARRAY(1L), ARRAY(3L)), + (ARRAY(2L), ARRAY(4L)), + (ARRAY(3L), ARRAY(5L)), + (ARRAY(4L), ARRAY(6L)) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_string_collation +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t_string_collation AS +VALUES + (''), (' '), (CAST(X'C1' AS STRING)), (CAST(X'80' AS STRING)), + ('\uFFFD'), ('Å'), ('å'), ('a\u030A'), ('Å '), ('å '), + ('a\u030A ') AS tab(col1) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) AS result FROM 
t_int_1_5_through_7_11 +-- !query schema +struct +-- !query output +7 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_array_int_1_3_through_4_6 +-- !query schema +struct +-- !query output +4 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col2)) FROM t_array_long_1_3_through_4_6 +-- !query schema +struct +-- !query output +4 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_binary_a_b_through_e_f +-- !query schema +struct +-- !query output +5 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_double_1_1_1_4_through_1_5_1_8 +-- !query schema +struct +-- !query output +5 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col2)) FROM t_float_1_1_1_4_through_1_5_1_8 +-- !query schema +struct +-- !query output +5 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1, 22)) FROM t_int_1_5_through_7_11 +-- !query schema +struct +-- !query output +7 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_long_1_5_through_7_11 +-- !query schema +struct +-- !query output +7 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_string_a_d_through_e_h +-- !query schema +struct +-- !query output +5 + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11 +-- !query schema +struct +-- !query output +11 + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1, 15), + theta_sketch_agg(col2))) FROM t_long_1_5_through_7_11 +-- !query schema +struct +-- !query output +11 + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8 +-- !query schema +struct +-- !query output +8 + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1, 6), + theta_sketch_agg(col2, 15), 15)) FROM 
t_float_1_1_1_4_through_1_5_1_8 +-- !query schema +struct +-- !query output +8 + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h +-- !query schema +struct +-- !query output +8 + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2), 20)) FROM t_binary_a_b_through_e_f +-- !query schema +struct +-- !query output +6 + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6 +-- !query schema +struct +-- !query output +6 + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 13))) FROM t_array_long_1_3_through_4_6 +-- !query schema +struct +-- !query output +6 + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11 +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1, 5), + theta_sketch_agg(col2, 12))) FROM t_long_1_5_through_7_11 +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8 +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1, 5), + theta_sketch_agg(col2))) FROM t_float_1_1_1_4_through_1_5_1_8 +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 22))) FROM 
t_binary_a_b_through_e_f +-- !query schema +struct +-- !query output +4 + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6 +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 10))) FROM t_array_long_1_3_through_4_6 +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11 +-- !query schema +struct +-- !query output +4 + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 5))) FROM t_long_1_5_through_7_11 +-- !query schema +struct +-- !query output +4 + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8 +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1, 12), + theta_sketch_agg(col2))) FROM t_float_1_1_1_4_through_1_5_1_8 +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1, 6), + theta_sketch_agg(col2, 8))) FROM t_binary_a_b_through_e_f +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6 +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 
4))) FROM t_array_long_1_3_through_4_6 +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 15)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2, 20) as sketch FROM t_int_1_5_through_7_11) +-- !query schema +struct +-- !query output +11 + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 12)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_double_1_1_1_4_through_1_5_1_8) +-- !query schema +struct +-- !query output +8 + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 14)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_string_a_d_through_e_h + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_string_a_d_through_e_h) +-- !query schema +struct +-- !query output +8 + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 10)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_long_1_5_through_7_11) +-- !query schema +struct +-- !query output +11 + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 6)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_float_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_float_1_1_1_4_through_1_5_1_8) +-- !query schema +struct +-- !query output +8 + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_binary_a_b_through_e_f + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_binary_a_b_through_e_f) +-- !query schema +struct +-- !query output +6 + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 12)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_int_1_3_through_4_6 + UNION ALL + SELECT 
theta_sketch_agg(col2) as sketch FROM t_array_int_1_3_through_4_6) +-- !query schema +struct +-- !query output +6 + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 16)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_long_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_long_1_3_through_4_6) +-- !query schema +struct +-- !query output +6 + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_int_1_5_through_7_11) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_long_1_5_through_7_11) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_float_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_float_1_1_1_4_through_1_5_1_8) +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_double_1_1_1_4_through_1_5_1_8) +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_string_a_d_through_e_h + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_string_a_d_through_e_h) +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM 
t_binary_a_b_through_e_f + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_binary_a_b_through_e_f) +-- !query schema +struct +-- !query output +4 + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_int_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_int_1_3_through_4_6) +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_long_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_long_1_3_through_4_6) +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (1), (null), (2), (null), (3) tab(col) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES ('test'), (null), ('null'), (null) tab(col) +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (100L), (null), (200L), (null), (300L) tab(col) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(CAST(col AS DOUBLE))) +FROM VALUES (1.1), (null), (2.2), (null), (3.3) tab(col) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(CAST(col AS FLOAT))) +FROM VALUES (1.5), (null), (2.5), (null), (3.5) tab(col) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (X'AA'), (null), (X'BB'), (null), (X'CC') tab(col) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(1, 2)), (null), (ARRAY(3, 4)), (null), (ARRAY(5, 6)) tab(col) +-- !query schema +struct +-- 
!query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(10L, 20L)), (null), (ARRAY(30L, 40L)), (null), (ARRAY(50L, 60L)) tab(col) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(1, null)), (ARRAY(1)), (ARRAY(2, null, 3)), (ARRAY(4)) tab(col) +-- !query schema +struct +-- !query output +4 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(10L, null)), (ARRAY(10L)), (ARRAY(20L, null, 30L)), (ARRAY(40L)) tab(col) +-- !query schema +struct +-- !query output +4 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY()), (ARRAY(1, 2)), (ARRAY()), (ARRAY(3, 4)) tab(col) +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (''), ('a'), (''), ('b'), ('c') tab(col) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (X''), (X'01'), (X'02'), (X'03'), (CAST(' ' AS BINARY)), (X'e280'), (X'c1'), (X'c120') tab(col) +-- !query schema +struct +-- !query output +7 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) utf8_b FROM t_string_collation +-- !query schema +struct +-- !query output +10 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_LCASE)) utf8_lc FROM t_string_collation +-- !query schema +struct +-- !query output +7 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE)) unicode FROM t_string_collation +-- !query schema +struct +-- !query output +7 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_CI)) unicode_ci FROM t_string_collation +-- !query schema +struct +-- !query output +6 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_BINARY_RTRIM)) utf8_b_rt FROM t_string_collation +-- 
!query schema +struct +-- !query output +6 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_LCASE_RTRIM)) utf8_lc_rt FROM t_string_collation +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_RTRIM)) unicode_rt FROM t_string_collation +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_CI_RTRIM)) unicode_ci_rt FROM t_string_collation +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH sketches AS ( + SELECT 'int_sketch' as sketch_type, theta_sketch_agg(col1, 12) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT 'long_sketch' as sketch_type, theta_sketch_agg(col1, 15) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT 'double_sketch' as sketch_type, theta_sketch_agg(col1, 10) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT 'string_sketch' as sketch_type, theta_sketch_agg(col1, 14) as sketch FROM t_string_a_d_through_e_h +), +union_result AS ( + SELECT theta_union_agg(sketch, 16) as union_sketch FROM sketches +), +individual_sketches AS ( + SELECT theta_sketch_agg(col1, 12) as sketch1, theta_sketch_agg(col2, 12) as sketch2 FROM t_int_1_5_through_7_11 +) +SELECT + theta_sketch_estimate((SELECT union_sketch FROM union_result)) as union_estimate, + theta_sketch_estimate(theta_union(sketch1, sketch2, 15)) as binary_union_estimate, + theta_sketch_estimate(theta_intersection(sketch1, sketch2)) as intersection_estimate, + theta_sketch_estimate(theta_difference(sketch1, sketch2)) as difference_estimate +FROM individual_sketches +-- !query schema +struct +-- !query output +17 11 3 4 + + +-- !query +SELECT theta_sketch_agg(col, 2) +FROM VALUES (50), (60), (60) tab(col) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_LG_NOM_ENTRIES", + "sqlState" : "22546", + 
"messageParameters" : { + "function" : "`theta_sketch_agg`", + "max" : "26", + "min" : "4", + "value" : "2" + } +} + + +-- !query +SELECT theta_sketch_agg(col, 40) +FROM VALUES (50), (60), (60) tab(col) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_LG_NOM_ENTRIES", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_sketch_agg`", + "max" : "26", + "min" : "4", + "value" : "40" + } +} + + +-- !query +SELECT theta_union_agg(sketch, 3) +FROM (SELECT theta_sketch_agg(col, 12) as sketch + FROM VALUES (1) AS tab(col) + UNION ALL + SELECT theta_sketch_agg(col, 20) as sketch + FROM VALUES (1) AS tab(col)) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_LG_NOM_ENTRIES", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_union_agg`", + "max" : "26", + "min" : "4", + "value" : "3" + } +} + + +-- !query +SELECT theta_union_agg(sketch, 27) +FROM (SELECT theta_sketch_agg(col, 12) as sketch + FROM VALUES (1) AS tab(col) + UNION ALL + SELECT theta_sketch_agg(col, 20) as sketch + FROM VALUES (1) AS tab(col)) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_LG_NOM_ENTRIES", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_union_agg`", + "max" : "26", + "min" : "4", + "value" : "27" + } +} + + +-- !query +SELECT theta_union(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"1\"", + "inputType" : "\"INT\"", + "paramIndex" : "first", + "requiredType" : "\"BINARY\"", + "sqlExpr" : "\"theta_union(1, 2, 12)\"" + }, + "queryContext" : [ { + "objectType" : "", + 
"objectName" : "", + "startIndex" : 8, + "stopIndex" : 24, + "fragment" : "theta_union(1, 2)" + } ] +} + + +-- !query +SELECT theta_intersection(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"1\"", + "inputType" : "\"INT\"", + "paramIndex" : "first", + "requiredType" : "\"BINARY\"", + "sqlExpr" : "\"theta_intersection(1, 2)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 31, + "fragment" : "theta_intersection(1, 2)" + } ] +} + + +-- !query +SELECT theta_difference(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"1\"", + "inputType" : "\"INT\"", + "paramIndex" : "first", + "requiredType" : "\"BINARY\"", + "sqlExpr" : "\"theta_difference(1, 2)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 29, + "fragment" : "theta_difference(1, 2)" + } ] +} + + +-- !query +SELECT theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2), 'invalid') + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"invalid\"", + "inputType" : "\"STRING\"", + "paramIndex" : "third", + "requiredType" : "\"INT\"", + "sqlExpr" : "\"theta_union(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12), invalid)\"" + }, + "queryContext" : [ { 
+ "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 86, + "fragment" : "theta_union(\n theta_sketch_agg(col1),\n theta_sketch_agg(col2), 'invalid')" + } ] +} + + +-- !query +SELECT theta_intersection( + theta_sketch_agg(col1), + 'invalid_sketch') + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"invalid_sketch\"", + "inputType" : "\"STRING\"", + "paramIndex" : "second", + "requiredType" : "\"BINARY\"", + "sqlExpr" : "\"theta_intersection(theta_sketch_agg(col1, 12), invalid_sketch)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 76, + "fragment" : "theta_intersection(\n theta_sketch_agg(col1),\n 'invalid_sketch')" + } ] +} + + +-- !query +SELECT theta_sketch_estimate(CAST('abc' AS BINARY)) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_INPUT_SKETCH_BUFFER", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_sketch_estimate`" + } +} + + +-- !query +SELECT theta_union(CAST('abc' AS BINARY), CAST('def' AS BINARY)) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_INPUT_SKETCH_BUFFER", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_union`" + } +} + + +-- !query +SELECT theta_intersection(CAST('abc' AS BINARY), CAST('def' AS BINARY)) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_INPUT_SKETCH_BUFFER", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_intersection`" + } +} + + +-- !query +SELECT theta_difference(CAST('abc' AS BINARY), CAST('def' AS BINARY)) +-- 
!query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_INPUT_SKETCH_BUFFER", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_difference`" + } +} + + +-- !query +SELECT theta_union_agg(buffer, 15) +FROM (SELECT CAST('abc' AS BINARY) AS buffer) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_INPUT_SKETCH_BUFFER", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_union_agg`" + } +} + + +-- !query +SELECT theta_intersection_agg(buffer) +FROM (SELECT CAST('abc' AS BINARY) AS buffer) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_INPUT_SKETCH_BUFFER", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_intersection_agg`" + } +} + + +-- !query +DROP TABLE IF EXISTS t_int_1_5_through_7_11 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_long_1_5_through_7_11 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_double_1_1_1_4_through_1_5_1_8 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_float_1_1_1_4_through_1_5_1_8 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_string_a_d_through_e_h +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_binary_a_b_through_e_f +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_array_int_1_3_through_4_6 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_array_long_1_3_through_4_6 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_string_collation +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index 721d1c1deea9d..37614145fe83f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -2217,6 +2217,707 @@ class DataFrameAggregateSuite extends QueryTest ) } + test("SPARK-52407: theta_sketch_agg + theta_union_agg + theta_sketch_estimate positive tests") { + val df1 = Seq((1, "a"), (1, "a"), (1, "a"), (1, "b"), (1, "c"), (1, "c"), (1, "d")) + .toDF("id", "value") + df1.createOrReplaceTempView("df1") + + val df2 = Seq((1, "a"), (1, "c"), (1, "d"), (1, "d"), (1, "d"), (1, "e"), (1, "e"), (1, "f")) + .toDF("id", "value") + df2.createOrReplaceTempView("df2") + + // First test theta_sketch_agg, theta_sketch_estimate via dataframe + sql, + // with and without configs, via both DF and SQL implementations. + val res1 = df1 + .groupBy("id") + .agg( + count("value").as("count"), + theta_sketch_agg("value").as("sketch_1"), + theta_sketch_agg("value", 20).as("sketch_2")) + .withColumn("distinct_count_1", theta_sketch_estimate("sketch_1")) + .withColumn("distinct_count_2", theta_sketch_estimate("sketch_2")) + .drop("sketch_1", "sketch_2") + checkAnswer(res1, Row(1, 7, 4, 4)) + + val res2 = sql("""with sketches as ( + |select + | id, + | count(value) as count, + | theta_sketch_agg(value) as sketch_1, + | theta_sketch_agg(value, 20) as sketch_2 + |from df1 + |group by 1 + |) + | + |select + | id, + | count, + | theta_sketch_estimate(sketch_1) as distinct_count_1, + | theta_sketch_estimate(sketch_2) as distinct_count_2 + |from + | sketches + |""".stripMargin) + checkAnswer(res2, Row(1, 7, 4, 4)) + + // Now test theta_union_agg via dataframe + sql, with and without configs, + // unioning together sketches with default, non-default and different configurations + val df3 = df1 + .groupBy("id") + .agg( + count("value").as("count"), + theta_sketch_agg("value").as("thetasketch_1"), + 
theta_sketch_agg("value", 20).as("thetasketch_2"), + theta_sketch_agg("value").as("thetasketch_3")) + df3.createOrReplaceTempView("df3") + + val df4 = sql("""select + | id, + | count(value) as count, + | theta_sketch_agg(value) as thetasketch_1, + | theta_sketch_agg(value, 20) as thetasketch_2, + | theta_sketch_agg(value, 20) as thetasketch_3 + |from df2 + |group by 1 + |""".stripMargin) + df4.createOrReplaceTempView("df4") + + val res3 = df3 + .union(df4) + .groupBy("id") + .agg( + sum("count").as("count"), + theta_sketch_estimate(theta_union_agg("thetasketch_1")).as("distinct_count_1"), + theta_sketch_estimate(theta_union_agg("thetasketch_2")).as("distinct_count_2"), + theta_sketch_estimate(theta_union_agg("thetasketch_3", 15)).as("distinct_count_3")) + checkAnswer(res3, Row(1, 15, 6, 6, 6)) + + val res4 = sql("""select + | id, + | sum(count) as count, + | theta_sketch_estimate(theta_union_agg(thetasketch_1)) as distinct_count_1, + | theta_sketch_estimate(theta_union_agg(thetasketch_2)) as distinct_count_2, + | theta_sketch_estimate(theta_union_agg(thetasketch_3, 15)) as distinct_count_3 + |from (select * from df3 union all select * from df4) + |group by 1 + |""".stripMargin) + checkAnswer(res4, Row(1, 15, 6, 6, 6)) + + // add tests to ensure theta_union works via both DF and SQL too + val df5 = df3.drop("count") + df5.createOrReplaceTempView("df5") + + val df6 = df4 + .drop("count") + .withColumnRenamed("thetasketch_1", "thetasketch_4") + .withColumnRenamed("thetasketch_2", "thetasketch_5") + .withColumnRenamed("thetasketch_3", "thetasketch_6") + df6.createOrReplaceTempView("df6") + + val res5 = df5 + .join(df6, "id") + .withColumn( + "distinct_count_1", + theta_sketch_estimate(theta_union("thetasketch_1", "thetasketch_4"))) + .withColumn( + "distinct_count_2", + theta_sketch_estimate(theta_union("thetasketch_2", "thetasketch_5"))) + .withColumn( + "distinct_count_3", + theta_sketch_estimate(theta_union("thetasketch_3", "thetasketch_6", 15))) + .drop( + 
"thetasketch_1", + "thetasketch_2", + "thetasketch_3", + "thetasketch_4", + "thetasketch_5", + "thetasketch_6") + checkAnswer(res5, Row(1, 6, 6, 6)) + + val res6 = sql("""with joined as ( + | select + | l.id, + | l.thetasketch_1, + | l.thetasketch_2, + | l.thetasketch_3, + | r.thetasketch_4, + | r.thetasketch_5, + | r.thetasketch_6 + | from + | df5 l + | join + | df6 r + | on l.id = r.id + | ) + | + |select + | id, + | theta_sketch_estimate(theta_union(thetasketch_1, thetasketch_4)) as distinct_count_1, + | theta_sketch_estimate(theta_union(thetasketch_2, thetasketch_5)) as distinct_count_2, + | theta_sketch_estimate(theta_union(thetasketch_3, thetasketch_6, 20)) + | as distinct_count_3 + |from + | joined + |""".stripMargin) + checkAnswer(res6, Row(1, 6, 6, 6)) + + val df7 = + Seq((1, "a"), (1, "a"), (1, "a"), (1, "b"), (1, null), (2, null), (2, null), (2, null)) + .toDF("id", "value") + + // empty column test + val res7 = df7 + .where(expr("id = 2")) + .groupBy("id") + .agg(theta_sketch_estimate(theta_sketch_agg("value")).as("distinct_count")) + checkAnswer(res7, Row(2, 0)) + + // partial empty column test + val res8 = df7 + .groupBy("id") + .agg(theta_sketch_estimate(theta_sketch_agg("value")).as("distinct_count")) + checkAnswer(res8, Seq(Row(1, 2), Row(2, 0))) + } + + test("SPARK-52407: theta_sketch_agg + theta_union_agg + theta_union negative tests") { + val df1 = Seq((1, "a"), (1, "a"), (1, "a"), (1, "b"), (1, "c"), (1, "c"), (1, "d")) + .toDF("id", "value") + df1.createOrReplaceTempView("df1") + + val df2 = Seq((1, "a"), (1, "c"), (1, "d"), (1, "d"), (1, "d"), (1, "e"), (1, "e"), (1, "f")) + .toDF("id", "value") + df2.createOrReplaceTempView("df2") + + // Validate that the functions error out when lgNomEntries < 4 or > 26. 
+ checkError( + exception = intercept[SparkRuntimeException] { + df1 + .groupBy("id") + .agg(theta_sketch_agg("value", 1).as("thetasketch")) + .collect() + }, + condition = "THETA_INVALID_LG_NOM_ENTRIES", + parameters = Map( + "function" -> "`theta_sketch_agg`", + "min" -> "4", + "max" -> "26", + "value" -> "1" + ) + ) + + checkError( + exception = intercept[SparkRuntimeException] { + df1 + .groupBy("id") + .agg(theta_sketch_agg("value", 28).as("thetasketch")) + .collect() + }, + condition = "THETA_INVALID_LG_NOM_ENTRIES", + parameters = Map( + "function" -> "`theta_sketch_agg`", + "min" -> "4", + "max" -> "26", + "value" -> "28" + ) + ) + + // Validate that the functions error out when provided unexpected types. + checkError( + exception = intercept[AnalysisException] { + val res = sql(""" + |select + | id, + | theta_sketch_agg(value, 'text') + |from + | df1 + |group by 1 + |""".stripMargin) + checkAnswer(res, Nil) + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_sketch_agg(value, text)\"", + "paramIndex" -> "second", + "inputSql" -> "\"text\"", + "inputType" -> "\"STRING\"", + "requiredType" -> "\"INT\""), + context = + ExpectedContext(fragment = "theta_sketch_agg(value, 'text')", start = 14, stop = 44)) + + checkError( + exception = intercept[AnalysisException] { + val res = sql("""with sketch_cte as ( + |select + | id, + | theta_sketch_agg(value) as sketch + |from + | df1 + |group by 1 + |) + | + |select theta_union_agg(sketch, 'Theta_4') from sketch_cte + |""".stripMargin) + checkAnswer(res, Nil) + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_union_agg(sketch, Theta_4)\"", + "paramIndex" -> "second", + "inputSql" -> "\"Theta_4\"", + "inputType" -> "\"STRING\"", + "requiredType" -> "\"INT\""), + context = + ExpectedContext(fragment = "theta_union_agg(sketch, 'Theta_4')", start = 99, stop = 132)) + + // Test invalid parameter types for theta_union 
+ checkError( + exception = intercept[AnalysisException] { + sql("""with sketches as ( + |select id, theta_sketch_agg(value) as sketch from df1 group by 1 + |) + |select theta_union(sketch, 'invalid') from sketches + |""".stripMargin).collect() + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_union(sketch, invalid, 12)\"", + "paramIndex" -> "second", + "inputSql" -> "\"invalid\"", + "inputType" -> "\"STRING\"", + "requiredType" -> "\"BINARY\""), + context = + ExpectedContext(fragment = "theta_union(sketch, 'invalid')", start = 93, stop = 122)) + + // Test theta_union with non-sketch input. + checkError( + exception = intercept[AnalysisException] { + sql("select theta_union('not_a_sketch', 'also_not_a_sketch')").collect() + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_union(not_a_sketch, also_not_a_sketch, 12)\"", + "paramIndex" -> "first", + "inputSql" -> "\"not_a_sketch\"", + "inputType" -> "\"STRING\"", + "requiredType" -> "\"BINARY\""), + context = ExpectedContext( + fragment = "theta_union('not_a_sketch', 'also_not_a_sketch')", + start = 7, + stop = 54)) + + } + test( + "SPARK-52407: theta_difference + theta_intersection + theta_intersection_agg positive tests") { + val df1 = Seq((1, "a"), (1, "a"), (1, "a"), (1, "b"), (1, "c"), (1, "c"), (1, "d")) + .toDF("id", "value") + df1.createOrReplaceTempView("df1") + + val df2 = Seq((1, "a"), (1, "c"), (1, "d"), (1, "d"), (1, "d"), (1, "e"), (1, "e"), (1, "f")) + .toDF("id", "value") + df2.createOrReplaceTempView("df2") + + val df3 = Seq((1, "c"), (1, "d"), (1, "g"), (1, "g"), (1, "h")).toDF("id", "value") + df3.createOrReplaceTempView("df3") + + // Test theta_difference via DataFrame API. 
+ val sketches1 = df1 + .groupBy("id") + .agg( + theta_sketch_agg("value").as("sketch1"), + theta_sketch_agg("value", 20).as("sketch1_20")) + + val sketches2 = df2 + .groupBy("id") + .agg( + theta_sketch_agg("value").as("sketch2"), + theta_sketch_agg("value", 20).as("sketch2_20")) + + val res1 = sketches1 + .join(sketches2, "id") + .withColumn( + "difference_count_1", + theta_sketch_estimate(theta_difference("sketch1", "sketch2"))) + .withColumn( + "difference_count_2", + theta_sketch_estimate(theta_difference("sketch1_20", "sketch2_20"))) + .select("id", "difference_count_1", "difference_count_2") + + // df1 has {a,b,c,d}, df2 has {a,c,d,e,f}, so df1 - df2 should be approximately {b}. + checkAnswer(res1, Row(1, 1, 1)) + + // Test theta_difference via SQL. + val res2 = sql("""with sketches1 as ( + |select + | id, + | theta_sketch_agg(value) as sketch1, + | theta_sketch_agg(value, 20) as sketch1_20 + |from df1 + |group by 1 + |), + |sketches2 as ( + |select + | id, + | theta_sketch_agg(value) as sketch2, + | theta_sketch_agg(value, 20) as sketch2_20 + |from df2 + |group by 1 + |) + | + |select + | s1.id, + | theta_sketch_estimate(theta_difference(s1.sketch1, s2.sketch2)) as difference_count_1, + | theta_sketch_estimate(theta_difference(s1.sketch1_20, s2.sketch2_20)) as difference_count_2 + |from sketches1 s1 + |join sketches2 s2 on s1.id = s2.id + |""".stripMargin) + checkAnswer(res2, Row(1, 1, 1)) + + // Test theta_intersection via DataFrame API. + val res3 = sketches1 + .join(sketches2, "id") + .withColumn( + "intersection_count_1", + theta_sketch_estimate(theta_intersection("sketch1", "sketch2"))) + .withColumn( + "intersection_count_2", + theta_sketch_estimate(theta_intersection("sketch1_20", "sketch2_20"))) + .select("id", "intersection_count_1", "intersection_count_2") + + // df1 has {a,b,c,d}, df2 has {a,c,d,e,f}, so intersection should be approximately {a,c,d} = 3. + checkAnswer(res3, Row(1, 3, 3)) + + // Test theta_intersection via SQL. 
+ val res4 = sql("""with sketches1 as ( + |select + | id, + | theta_sketch_agg(value) as sketch1, + | theta_sketch_agg(value, 20) as sketch1_20 + |from df1 + |group by 1 + |), + |sketches2 as ( + |select + | id, + | theta_sketch_agg(value) as sketch2, + | theta_sketch_agg(value, 20) as sketch2_20 + |from df2 + |group by 1 + |) + | + |select + | s1.id, + | theta_sketch_estimate(theta_intersection(s1.sketch1, s2.sketch2)) as intersection_count_1, + | theta_sketch_estimate(theta_intersection(s1.sketch1_20, s2.sketch2_20)) + | as intersection_count_2 + |from sketches1 s1 + |join sketches2 s2 on s1.id = s2.id + |""".stripMargin) + checkAnswer(res4, Row(1, 3, 3)) + + // Test theta_intersection_agg via DataFrame API. + val all_sketches = df1 + .groupBy("id") + .agg(theta_sketch_agg("value").as("sketch")) + .withColumn("source", lit("df1")) + .union( + df2 + .groupBy("id") + .agg(theta_sketch_agg("value").as("sketch")) + .withColumn("source", lit("df2"))) + .union( + df3 + .groupBy("id") + .agg(theta_sketch_agg("value").as("sketch")) + .withColumn("source", lit("df3"))) + + val res5 = all_sketches + .groupBy("id") + .agg( + theta_sketch_estimate(theta_intersection_agg("sketch")).as("intersection_count_1") + ) + + // df1={a,b,c,d}, df2={a,c,d,e,f}, df3={c,d,g,h}, so intersection should be {c,d} = 2. + checkAnswer(res5, Row(1, 2)) + + // Test theta_intersection_agg via SQL. + val res6 = sql("""with all_sketches as ( + |select id, theta_sketch_agg(value) as sketch, 'df1' as source from df1 group by 1 + |union all + |select id, theta_sketch_agg(value) as sketch, 'df2' as source from df2 group by 1 + |union all + |select id, theta_sketch_agg(value) as sketch, 'df3' as source from df3 group by 1 + |) + | + |select + | id, + | theta_sketch_estimate(theta_intersection_agg(sketch)) as intersection_count_1 + |from all_sketches + |group by 1 + |""".stripMargin) + checkAnswer(res6, Row(1, 2)) + + // Test with different lgNomEntries parameters. 
+ val res7 = sql("""with sketches1 as ( + |select id, theta_sketch_agg(value, 12) as sketch1 from df1 group by 1 + |), + |sketches2 as ( + |select id, theta_sketch_agg(value, 18) as sketch2 from df2 group by 1 + |) + | + |select + | s1.id, + | theta_sketch_estimate(theta_difference(s1.sketch1, s2.sketch2)) as difference_count, + | theta_sketch_estimate(theta_intersection(s1.sketch1, s2.sketch2)) as intersection_count + |from sketches1 s1 + |join sketches2 s2 on s1.id = s2.id + |""".stripMargin) + checkAnswer(res7, Row(1, 1, 3)) + + // Test with null values. + val df_with_nulls = + Seq((1, "a"), (1, "b"), (1, null), (2, null), (2, null)).toDF("id", "value") + df_with_nulls.createOrReplaceTempView("df_with_nulls") + + val res8 = sql("""with sketch1 as ( + |select id, theta_sketch_agg(value) as sketch from df_with_nulls where id = 1 group by 1 + |), + |sketch2 as ( + |select id, theta_sketch_agg(value) as sketch from df_with_nulls where id = 2 group by 1 + |) + | + |select + | s1.id, + | theta_sketch_estimate(theta_difference(s1.sketch, s2.sketch)) as difference_count, + | theta_sketch_estimate(theta_intersection(s1.sketch, s2.sketch)) as intersection_count + |from sketch1 s1 + |cross join sketch2 s2 + |""".stripMargin) + // sketch1 has {a,b}, sketch2 is empty, so difference = 2 and intersection = 0. + checkAnswer(res8, Row(1, 2, 0)) + + // Test empty intersection. 
+ val df_disjoint1 = Seq((1, "a"), (1, "b")).toDF("id", "value") + val df_disjoint2 = Seq((1, "c"), (1, "d")).toDF("id", "value") + df_disjoint1.createOrReplaceTempView("df_disjoint1") + df_disjoint2.createOrReplaceTempView("df_disjoint2") + + val res9 = sql("""with sketch1 as ( + |select id, theta_sketch_agg(value) as sketch from df_disjoint1 group by 1 + |), + |sketch2 as ( + |select id, theta_sketch_agg(value) as sketch from df_disjoint2 group by 1 + |) + | + |select + | s1.id, + | theta_sketch_estimate(theta_intersection(s1.sketch, s2.sketch)) as intersection_count + |from sketch1 s1 + |join sketch2 s2 on s1.id = s2.id + |""".stripMargin) + checkAnswer(res9, Row(1, 0)) + } + + test( + "SPARK-52407: theta_difference + theta_intersection + theta_intersection_agg negative tests") { + val df1 = Seq((1, "a"), (1, "b"), (1, "c"), (1, "d")).toDF("id", "value") + df1.createOrReplaceTempView("df1") + + // Test invalid parameter types for theta_difference. + checkError( + exception = intercept[AnalysisException] { + sql("""with sketches as ( + |select id, theta_sketch_agg(value) as sketch from df1 group by 1 + |) + |select theta_difference(sketch, 'invalid') from sketches + |""".stripMargin).collect() + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_difference(sketch, invalid)\"", + "paramIndex" -> "second", + "inputSql" -> "\"invalid\"", + "inputType" -> "\"STRING\"", + "requiredType" -> "\"BINARY\""), + context = + ExpectedContext(fragment = "theta_difference(sketch, 'invalid')", start = 93, stop = 127)) + + // Test invalid parameter types for theta_intersection. 
+ checkError( + exception = intercept[AnalysisException] { + sql("""with sketches as ( + |select id, theta_sketch_agg(value) as sketch from df1 group by 1 + |) + |select theta_intersection(sketch, 123) from sketches + |""".stripMargin).collect() + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_intersection(sketch, 123)\"", + "paramIndex" -> "second", + "inputSql" -> "\"123\"", + "inputType" -> "\"INT\"", + "requiredType" -> "\"BINARY\""), + context = + ExpectedContext(fragment = "theta_intersection(sketch, 123)", start = 93, stop = 123)) + + // Test invalid parameter types for theta_intersection_agg. + checkError( + exception = intercept[AnalysisException] { + sql("""with sketches as ( + |select id, theta_sketch_agg(value) as sketch from df1 group by 1 + |) + |select theta_intersection_agg('invalid') from sketches + |""".stripMargin).collect() + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_intersection_agg(invalid)\"", + "paramIndex" -> "first", + "inputSql" -> "\"invalid\"", + "inputType" -> "\"STRING\"", + "requiredType" -> "\"BINARY\""), + context = ExpectedContext( + fragment = "theta_intersection_agg('invalid')", + start = 93, + stop = 125)) + + // Test theta_difference with non-sketch input. + checkError( + exception = intercept[AnalysisException] { + sql("select theta_difference('not_a_sketch', 'also_not_a_sketch')").collect() + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_difference(not_a_sketch, also_not_a_sketch)\"", + "paramIndex" -> "first", + "inputSql" -> "\"not_a_sketch\"", + "inputType" -> "\"STRING\"", + "requiredType" -> "\"BINARY\""), + context = ExpectedContext( + fragment = "theta_difference('not_a_sketch', 'also_not_a_sketch')", + start = 7, + stop = 59)) + + // Test theta_intersection with non-sketch input. 
+ checkError( + exception = intercept[AnalysisException] { + sql("select theta_intersection('not_a_sketch', 'also_not_a_sketch')").collect() + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_intersection(not_a_sketch, also_not_a_sketch)\"", + "paramIndex" -> "first", + "inputSql" -> "\"not_a_sketch\"", + "inputType" -> "\"STRING\"", + "requiredType" -> "\"BINARY\""), + context = ExpectedContext( + fragment = "theta_intersection('not_a_sketch', 'also_not_a_sketch')", + start = 7, + stop = 61)) + } + + test("SPARK-52407: theta_union") { + val df1 = Seq(1, 1, 2, 3).toDF("col") + val df2 = Seq(1, 3, 4, 5).toDF("col") + + val sketch1 = df1.selectExpr("theta_sketch_agg(col, 12) as sketch1") + val sketch2 = df2.selectExpr("theta_sketch_agg(col, 12) as sketch2") + + checkAnswer( + sketch1 + .crossJoin(sketch2) + .selectExpr("theta_sketch_estimate(theta_union(sketch1, sketch2))"), + Seq(Row(5)) // {1,2,3} ∪ {1,3,4,5} = {1,2,3,4,5} + ) + checkAnswer( + sketch1 + .crossJoin(sketch2) + .select(theta_sketch_estimate( + theta_union(col("sketch1"), col("sketch2")))), + Seq(Row(5))) + } + + test("SPARK-52407: theta_difference") { + val df1 = Seq(1, 1, 2, 3).toDF("col") + val df2 = Seq(1, 4, 5).toDF("col") + + val sketch1 = df1.selectExpr("theta_sketch_agg(col, 12) as sketch1") + val sketch2 = df2.selectExpr("theta_sketch_agg(col, 12) as sketch2") + + checkAnswer( + sketch1 + .crossJoin(sketch2) + .selectExpr("theta_sketch_estimate(theta_difference(sketch1, sketch2))"), + Seq(Row(2)) // {1,2,3} - {1,4,5} = {2,3} + ) + checkAnswer( + sketch1 + .crossJoin(sketch2) + .select( + theta_sketch_estimate(theta_difference(col("sketch1"), col("sketch2")))), + Seq(Row(2))) + } + + test("SPARK-52407: theta_intersection") { + val df1 = Seq(1, 1, 2, 3).toDF("col") + val df2 = Seq(1, 3, 4, 5).toDF("col") + + val sketch1 = df1.selectExpr("theta_sketch_agg(col, 12) as sketch1") + val sketch2 = df2.selectExpr("theta_sketch_agg(col, 12) as 
sketch2") + + checkAnswer( + sketch1 + .crossJoin(sketch2) + .selectExpr("theta_sketch_estimate(theta_intersection(sketch1, sketch2))"), + Seq(Row(2)) // {1,2,3} ∩ {1,3,4,5} = {1,3} + ) + checkAnswer( + sketch1 + .crossJoin(sketch2) + .select(theta_sketch_estimate( + theta_intersection(col("sketch1"), col("sketch2")))), + Seq(Row(2))) + } + + test("SPARK-52407: theta_intersection_agg") { + val df = Seq(1, 2).toDF("col") + + checkAnswer( + df.selectExpr("theta_sketch_agg(col) as sketch") + .unionAll(df.selectExpr("theta_sketch_agg(col, 20) as sketch")) + .unionAll(df.filter(col("col") === 1).selectExpr("theta_sketch_agg(col) as sketch")) + .selectExpr("theta_sketch_estimate(theta_intersection_agg(sketch))"), + Seq(Row(1)) // The intersection of {1,2}, {1,2}, {1} = {1}. + ) + checkAnswer( + df.select(theta_sketch_agg(col("col")).as("sketch")) + .unionAll(df.select(theta_sketch_agg(col("col"), lit(20)).as("sketch"))) + .unionAll(df.filter(col("col") === 1).select(theta_sketch_agg(col("col")).as("sketch"))) + .select(theta_sketch_estimate(theta_intersection_agg(col("sketch")))), + Seq(Row(1))) + } + + test("SPARK-52407: theta_sketch_agg") { + val df = Seq(1, 1, 2, 2, 3).toDF("col") + checkAnswer(df.selectExpr("theta_sketch_estimate(theta_sketch_agg(col, 12))"), Seq(Row(3))) + checkAnswer( + df.select(theta_sketch_estimate(theta_sketch_agg(col("col"), lit(12)))), + Seq(Row(3))) + } + + test("SPARK-52407: theta_union_agg") { + val df = Seq(1).toDF("col") + checkAnswer( + df.selectExpr("theta_sketch_agg(col) as sketch") + .unionAll(df.selectExpr("theta_sketch_agg(col, 20) as sketch")) + .selectExpr("theta_sketch_estimate(theta_union_agg(sketch, 15))"), + Seq(Row(1))) + checkAnswer( + df.select(theta_sketch_agg(col("col")).as("sketch")) + .unionAll(df.select(theta_sketch_agg(col("col"), lit(20)).as("sketch"))) + .select(theta_sketch_estimate(theta_union_agg(col("sketch"), lit(15)))), + Seq(Row(1))) + } + private def assertAggregateOnDataframe( df: => DataFrame, expected: 
Int): Unit = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala index 4044e5674191b..bcc0ce2633125 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala @@ -1885,6 +1885,26 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { }) } + test("theta sketch aggregate should respect collation") { + case class ThetaSketchAggTestCase[R](c: String, result: R) + val testCases = Seq( + ThetaSketchAggTestCase("UTF8_BINARY", 5), + ThetaSketchAggTestCase("UTF8_BINARY_RTRIM", 4), + ThetaSketchAggTestCase("UTF8_LCASE", 4), + ThetaSketchAggTestCase("UTF8_LCASE_RTRIM", 3), + ThetaSketchAggTestCase("UNICODE", 5), + ThetaSketchAggTestCase("UNICODE_RTRIM", 4), + ThetaSketchAggTestCase("UNICODE_CI", 4), + ThetaSketchAggTestCase("UNICODE_CI_RTRIM", 3) + ) + testCases.foreach(t => { + val q = s"SELECT theta_sketch_estimate(theta_sketch_agg(col collate ${t.c})) FROM " + + "VALUES ('a'), ('A'), ('b'), ('b'), ('c'), ('c ') tab(col)" + val df = sql(q) + checkAnswer(df, Seq(Row(t.result))) + }) + } + test("cache table with collated columns") { val collations = Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI", "SR_CI_AI") val lazyOptions = Seq(false, true)