diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 06e6c2f14d864..780c53007233b 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -5599,6 +5599,18 @@ ], "sqlState" : "428EK" }, + "THETA_INVALID_INPUT_SKETCH_BUFFER" : { + "message" : [ + "Invalid call to <function>; only valid Theta sketch buffers are supported as inputs (such as those produced by the `theta_sketch_agg` function)." + ], + "sqlState" : "22546" + }, + "THETA_INVALID_LG_NOM_ENTRIES" : { + "message" : [ + "Invalid call to <function>; the `lgNomEntries` value must be between <min> and <max>, inclusive: <value>." + ], + "sqlState" : "22546" + }, "TRAILING_COMMA_IN_SELECT" : { "message" : [ "Trailing comma detected in SELECT clause. Remove the trailing comma before the FROM clause." diff --git a/python/docs/source/reference/pyspark.sql/functions.rst b/python/docs/source/reference/pyspark.sql/functions.rst index 7bec529407667..003fdc0a00b54 100644 --- a/python/docs/source/reference/pyspark.sql/functions.rst +++ b/python/docs/source/reference/pyspark.sql/functions.rst @@ -491,6 +491,9 @@ Aggregate Functions string_agg_distinct sum sum_distinct + theta_intersection_agg + theta_sketch_agg + theta_union_agg try_avg try_sum var_pop @@ -636,6 +639,10 @@ Misc Functions reflect session_user spark_partition_id + theta_difference + theta_intersection + theta_sketch_estimate + theta_union try_aes_decrypt try_reflect typeof diff --git a/python/pyspark/sql/connect/functions/builtin.py b/python/pyspark/sql/connect/functions/builtin.py index 0380b517e6e5e..ce85f2c37ffcf 100644 --- a/python/pyspark/sql/connect/functions/builtin.py +++ b/python/pyspark/sql/connect/functions/builtin.py @@ -4236,6 +4236,81 @@ def hll_union( hll_union.__doc__ = pysparkfuncs.hll_union.__doc__ +def theta_sketch_agg( + col: "ColumnOrName", + lgNomEntries: Optional[Union[int, Column]] = None, +) -> Column:
+ fn = "theta_sketch_agg" + if lgNomEntries is None: + return _invoke_function_over_columns(fn, col) + else: + return _invoke_function_over_columns(fn, col, lit(lgNomEntries)) + + +theta_sketch_agg.__doc__ = pysparkfuncs.theta_sketch_agg.__doc__ + + +def theta_union_agg( + col: "ColumnOrName", + lgNomEntries: Optional[Union[int, Column]] = None, +) -> Column: + fn = "theta_union_agg" + if lgNomEntries is None: + return _invoke_function_over_columns(fn, col) + else: + return _invoke_function_over_columns(fn, col, lit(lgNomEntries)) + + +theta_union_agg.__doc__ = pysparkfuncs.theta_union_agg.__doc__ + + +def theta_intersection_agg( + col: "ColumnOrName", +) -> Column: + fn = "theta_intersection_agg" + return _invoke_function_over_columns(fn, col) + + +theta_intersection_agg.__doc__ = pysparkfuncs.theta_intersection_agg.__doc__ + + +def theta_sketch_estimate(col: "ColumnOrName") -> Column: + fn = "theta_sketch_estimate" + return _invoke_function_over_columns(fn, col) + + +theta_sketch_estimate.__doc__ = pysparkfuncs.theta_sketch_estimate.__doc__ + + +def theta_union( + col1: "ColumnOrName", col2: "ColumnOrName", lgNomEntries: Optional[Union[int, Column]] = None +) -> Column: + fn = "theta_union" + if lgNomEntries is None: + return _invoke_function_over_columns(fn, col1, col2) + else: + return _invoke_function_over_columns(fn, col1, col2, lit(lgNomEntries)) + + +theta_union.__doc__ = pysparkfuncs.theta_union.__doc__ + + +def theta_intersection(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: + fn = "theta_intersection" + return _invoke_function_over_columns(fn, col1, col2) + + +theta_intersection.__doc__ = pysparkfuncs.theta_intersection.__doc__ + + +def theta_difference(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: + fn = "theta_difference" + return _invoke_function_over_columns(fn, col1, col2) + + +theta_difference.__doc__ = pysparkfuncs.theta_difference.__doc__ + + # Predicates Function diff --git a/python/pyspark/sql/functions/__init__.py 
b/python/pyspark/sql/functions/__init__.py index 0d1df20d13d6f..7039a0c79f692 100644 --- a/python/pyspark/sql/functions/__init__.py +++ b/python/pyspark/sql/functions/__init__.py @@ -404,6 +404,9 @@ "string_agg_distinct", "sum", "sum_distinct", + "theta_intersection_agg", + "theta_sketch_agg", + "theta_union_agg", "try_avg", "try_sum", "var_pop", @@ -495,6 +498,10 @@ "reflect", "session_user", "spark_partition_id", + "theta_difference", + "theta_intersection", + "theta_sketch_estimate", + "theta_union", "try_aes_decrypt", "try_reflect", "typeof", diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index b09713e0c289e..cc00f6abe068b 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -25744,6 +25744,344 @@ def hll_union( return _invoke_function("hll_union", _to_java_column(col1), _to_java_column(col2)) +@_try_remote_functions +def theta_sketch_agg( + col: "ColumnOrName", + lgNomEntries: Optional[Union[int, Column]] = None, +) -> Column: + """ + Aggregate function: returns the compact binary representation of the Datasketches + ThetaSketch with the values in the input column configured with lgNomEntries nominal entries. + + .. versionadded:: 4.1.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or column name + lgNomEntries : :class:`~pyspark.sql.Column` or int, optional + The log-base-2 of nominal entries, where nominal entries is the size of the sketch + (must be between 4 and 26, defaults to 12) + + Returns + ------- + :class:`~pyspark.sql.Column` + The binary representation of the ThetaSketch. 
+ + See Also + -------- + :meth:`pyspark.sql.functions.theta_union` + :meth:`pyspark.sql.functions.theta_intersection` + :meth:`pyspark.sql.functions.theta_difference` + :meth:`pyspark.sql.functions.theta_union_agg` + :meth:`pyspark.sql.functions.theta_intersection_agg` + :meth:`pyspark.sql.functions.theta_sketch_estimate` + + Examples + -------- + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([1,2,2,3], "INT") + >>> df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value"))).show() + +--------------------------------------------------+ + |theta_sketch_estimate(theta_sketch_agg(value, 12))| + +--------------------------------------------------+ + | 3| + +--------------------------------------------------+ + + >>> df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value", 15))).show() + +--------------------------------------------------+ + |theta_sketch_estimate(theta_sketch_agg(value, 15))| + +--------------------------------------------------+ + | 3| + +--------------------------------------------------+ + """ + fn = "theta_sketch_agg" + if lgNomEntries is None: + return _invoke_function_over_columns(fn, col) + else: + return _invoke_function_over_columns(fn, col, lit(lgNomEntries)) + + +@_try_remote_functions +def theta_union_agg( + col: "ColumnOrName", + lgNomEntries: Optional[Union[int, Column]] = None, +) -> Column: + """ + Aggregate function: returns the compact binary representation of the Datasketches + ThetaSketch that is the union of the Theta sketches in the input column. + + .. versionadded:: 4.1.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or column name + lgNomEntries : :class:`~pyspark.sql.Column` or int, optional + The log-base-2 of nominal entries for the union operation + (must be between 4 and 26, defaults to 12) + + Returns + ------- + :class:`~pyspark.sql.Column` + The binary representation of the merged ThetaSketch. 
+ + See Also + -------- + :meth:`pyspark.sql.functions.theta_union` + :meth:`pyspark.sql.functions.theta_sketch_agg` + :meth:`pyspark.sql.functions.theta_sketch_estimate` + + Examples + -------- + >>> from pyspark.sql import functions as sf + >>> df1 = spark.createDataFrame([1,2,2,3], "INT") + >>> df1 = df1.agg(sf.theta_sketch_agg("value").alias("sketch")) + >>> df2 = spark.createDataFrame([4,5,5,6], "INT") + >>> df2 = df2.agg(sf.theta_sketch_agg("value").alias("sketch")) + >>> df3 = df1.union(df2) + >>> df3.agg(sf.theta_sketch_estimate(sf.theta_union_agg("sketch"))).show() + +--------------------------------------------------+ + |theta_sketch_estimate(theta_union_agg(sketch, 12))| + +--------------------------------------------------+ + | 6| + +--------------------------------------------------+ + """ + fn = "theta_union_agg" + if lgNomEntries is None: + return _invoke_function_over_columns(fn, col) + else: + return _invoke_function_over_columns(fn, col, lit(lgNomEntries)) + + +@_try_remote_functions +def theta_intersection_agg(col: "ColumnOrName") -> Column: + """ + Aggregate function: returns the compact binary representation of the Datasketches + ThetaSketch that is the intersection of the Theta sketches in the input column + + .. versionadded:: 4.1.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or column name + + Returns + ------- + :class:`~pyspark.sql.Column` + The binary representation of the intersected ThetaSketch. 
+ + See Also + -------- + :meth:`pyspark.sql.functions.theta_intersection` + :meth:`pyspark.sql.functions.theta_sketch_agg` + :meth:`pyspark.sql.functions.theta_sketch_estimate` + + Examples + -------- + >>> from pyspark.sql import functions as sf + >>> df1 = spark.createDataFrame([1,2,2,3], "INT") + >>> df1 = df1.agg(sf.theta_sketch_agg("value").alias("sketch")) + >>> df2 = spark.createDataFrame([2,3,3,4], "INT") + >>> df2 = df2.agg(sf.theta_sketch_agg("value").alias("sketch")) + >>> df3 = df1.union(df2) + >>> df3.agg(sf.theta_sketch_estimate(sf.theta_intersection_agg("sketch"))).show() + +-----------------------------------------------------+ + |theta_sketch_estimate(theta_intersection_agg(sketch))| + +-----------------------------------------------------+ + | 2| + +-----------------------------------------------------+ + """ + fn = "theta_intersection_agg" + return _invoke_function_over_columns(fn, col) + + +@_try_remote_functions +def theta_sketch_estimate(col: "ColumnOrName") -> Column: + """ + Returns the estimated number of unique values given the binary representation + of a Datasketches ThetaSketch. + + .. versionadded:: 4.1.0 + + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or column name + + Returns + ------- + :class:`~pyspark.sql.Column` + The estimated number of unique values for the ThetaSketch. 
+ + See Also + -------- + :meth:`pyspark.sql.functions.theta_union` + :meth:`pyspark.sql.functions.theta_intersection` + :meth:`pyspark.sql.functions.theta_difference` + :meth:`pyspark.sql.functions.theta_union_agg` + :meth:`pyspark.sql.functions.theta_intersection_agg` + :meth:`pyspark.sql.functions.theta_sketch_agg` + + Examples + -------- + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([1,2,2,3], "INT") + >>> df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value"))).show() + +--------------------------------------------------+ + |theta_sketch_estimate(theta_sketch_agg(value, 12))| + +--------------------------------------------------+ + | 3| + +--------------------------------------------------+ + """ + + fn = "theta_sketch_estimate" + return _invoke_function_over_columns(fn, col) + + +@_try_remote_functions +def theta_union( + col1: "ColumnOrName", col2: "ColumnOrName", lgNomEntries: Optional[Union[int, Column]] = None +) -> Column: + """ + Merges two binary representations of Datasketches ThetaSketch objects, using a + Datasketches Union object. + + .. versionadded:: 4.1.0 + + Parameters + ---------- + col1 : :class:`~pyspark.sql.Column` or column name + col2 : :class:`~pyspark.sql.Column` or column name + lgNomEntries : :class:`~pyspark.sql.Column` or int, optional + The log-base-2 of nominal entries for the union operation + (must be between 4 and 26, defaults to 12) + + Returns + ------- + :class:`~pyspark.sql.Column` + The binary representation of the merged ThetaSketch. + + See Also + -------- + :meth:`pyspark.sql.functions.theta_union_agg` + :meth:`pyspark.sql.functions.theta_sketch_agg` + :meth:`pyspark.sql.functions.theta_sketch_estimate` + + Examples + -------- + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(1,4),(2,5),(2,5),(3,6)], "struct") + >>> df = df.agg( + ... sf.theta_sketch_agg("v1").alias("sketch1"), + ... sf.theta_sketch_agg("v2").alias("sketch2") + ... 
) + >>> df.select(sf.theta_sketch_estimate(sf.theta_union(df.sketch1, "sketch2"))).show() + +--------------------------------------------------------+ + |theta_sketch_estimate(theta_union(sketch1, sketch2, 12))| + +--------------------------------------------------------+ + | 6| + +--------------------------------------------------------+ + """ + + fn = "theta_union" + if lgNomEntries is not None: + return _invoke_function_over_columns( + fn, + col1, + col2, + lit(lgNomEntries), + ) + else: + return _invoke_function_over_columns(fn, col1, col2) + + +@_try_remote_functions +def theta_intersection(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: + """ + Returns the intersection of two binary representations of Datasketches ThetaSketch + objects, using a Datasketches Intersection object. + + .. versionadded:: 4.1.0 + + Parameters + ---------- + col1 : :class:`~pyspark.sql.Column` or column name + col2 : :class:`~pyspark.sql.Column` or column name + + Returns + ------- + :class:`~pyspark.sql.Column` + The binary representation of the intersected ThetaSketch. + + See Also + -------- + :meth:`pyspark.sql.functions.theta_intersection_agg` + :meth:`pyspark.sql.functions.theta_sketch_agg` + :meth:`pyspark.sql.functions.theta_sketch_estimate` + + Examples + -------- + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(1,1),(2,2),(3,2),(3,3)], "struct") + >>> df = df.agg( + ... sf.theta_sketch_agg("v1").alias("sketch1"), + ... sf.theta_sketch_agg("v2").alias("sketch2") + ... 
) + >>> df.select(sf.theta_sketch_estimate(sf.theta_intersection(df.sketch1, "sketch2"))).show() + +-----------------------------------------------------------+ + |theta_sketch_estimate(theta_intersection(sketch1, sketch2))| + +-----------------------------------------------------------+ + | 3| + +-----------------------------------------------------------+ + """ + + fn = "theta_intersection" + return _invoke_function_over_columns(fn, col1, col2) + + +@_try_remote_functions +def theta_difference(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: + """ + Returns the set difference of two binary representations of Datasketches ThetaSketch + objects (elements in first sketch but not in second), using a Datasketches ANotB object. + + .. versionadded:: 4.1.0 + + Parameters + ---------- + col1 : :class:`~pyspark.sql.Column` or column name + col2 : :class:`~pyspark.sql.Column` or column name + + Returns + ------- + :class:`~pyspark.sql.Column` + The binary representation of the difference ThetaSketch. + + See Also + -------- + :meth:`pyspark.sql.functions.theta_union` + :meth:`pyspark.sql.functions.theta_intersection` + :meth:`pyspark.sql.functions.theta_sketch_agg` + :meth:`pyspark.sql.functions.theta_sketch_estimate` + + Examples + -------- + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(1,4),(2,4),(3,5),(4,5)], "struct") + >>> df = df.agg( + ... sf.theta_sketch_agg("v1").alias("sketch1"), + ... sf.theta_sketch_agg("v2").alias("sketch2") + ... 
) + >>> df.select(sf.theta_sketch_estimate(sf.theta_difference(df.sketch1, "sketch2"))).show() + +---------------------------------------------------------+ + |theta_sketch_estimate(theta_difference(sketch1, sketch2))| + +---------------------------------------------------------+ + | 3| + +---------------------------------------------------------+ + """ + + fn = "theta_difference" + return _invoke_function_over_columns(fn, col1, col2) + + # ---------------------- Predicates functions ------------------------------ diff --git a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala index 0165d8d4cf708..2a26c0fad29a7 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala @@ -1165,6 +1165,143 @@ object functions { */ def sum_distinct(e: Column): Column = Column.fn("sum", isDistinct = true, e) + /** + * Aggregate function: returns the compact binary representation of the Datasketches + * ThetaSketch, generated by intersecting the Datasketches ThetaSketch instances in the input + * column via a Datasketches Intersection instance. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_intersection_agg(e: Column): Column = + Column.fn("theta_intersection_agg", e) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches + * ThetaSketch, generated by intersecting the Datasketches ThetaSketch instances in the input + * column via a Datasketches Intersection instance. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_intersection_agg(columnName: String): Column = + theta_intersection_agg(Column(columnName)) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches ThetaSketch + * built with the values in the input column and configured with the `lgNomEntries` nominal + * entries.
+ * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_sketch_agg(e: Column, lgNomEntries: Column): Column = + Column.fn("theta_sketch_agg", e, lgNomEntries) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches ThetaSketch + * built with the values in the input column and configured with the `lgNomEntries` nominal + * entries. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_sketch_agg(e: Column, lgNomEntries: Int): Column = + Column.fn("theta_sketch_agg", e, lit(lgNomEntries)) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches ThetaSketch + * built with the values in the input column and configured with the `lgNomEntries` nominal + * entries. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_sketch_agg(columnName: String, lgNomEntries: Int): Column = + theta_sketch_agg(Column(columnName), lgNomEntries) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches ThetaSketch + * built with the values in the input column and configured with the default value of 12 for + * `lgNomEntries`. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_sketch_agg(e: Column): Column = + Column.fn("theta_sketch_agg", e) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches ThetaSketch + * built with the values in the input column and configured with the default value of 12 for + * `lgNomEntries`. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_sketch_agg(columnName: String): Column = + theta_sketch_agg(Column(columnName)) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches + * ThetaSketch, generated by the union of Datasketches ThetaSketch instances in the input column + * via a Datasketches Union instance. It allows the configuration of `lgNomEntries` log nominal + * entries for the union buffer. 
+ * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_union_agg(e: Column, lgNomEntries: Column): Column = + Column.fn("theta_union_agg", e, lgNomEntries) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches + * ThetaSketch, generated by the union of Datasketches ThetaSketch instances in the input column + * via a Datasketches Union instance. It allows the configuration of `lgNomEntries` log nominal + * entries for the union buffer. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_union_agg(e: Column, lgNomEntries: Int): Column = + Column.fn("theta_union_agg", e, lit(lgNomEntries)) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches + * ThetaSketch, generated by the union of Datasketches ThetaSketch instances in the input column + * via a Datasketches Union instance. It allows the configuration of `lgNomEntries` log nominal + * entries for the union buffer. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_union_agg(columnName: String, lgNomEntries: Int): Column = + theta_union_agg(Column(columnName), lgNomEntries) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches + * ThetaSketch, generated by the union of Datasketches ThetaSketch instances in the input column + * via a Datasketches Union instance. It is configured with the default value of 12 for + * `lgNomEntries`. + * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_union_agg(e: Column): Column = + Column.fn("theta_union_agg", e) + + /** + * Aggregate function: returns the compact binary representation of the Datasketches + * ThetaSketch, generated by the union of Datasketches ThetaSketch instances in the input column + * via a Datasketches Union instance. It is configured with the default value of 12 for + * `lgNomEntries`. 
+ * + * @group agg_funcs + * @since 4.1.0 + */ + def theta_union_agg(columnName: String): Column = + theta_union_agg(Column(columnName)) + /** * Aggregate function: returns the concatenation of non-null input values. * @@ -3552,6 +3689,125 @@ object functions { hll_union(Column(columnName1), Column(columnName2), allowDifferentLgConfigK) } + /** + * Subtracts two binary representations of Datasketches ThetaSketch objects in the input columns + * using a Datasketches AnotB object + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_difference(c1: Column, c2: Column): Column = + Column.fn("theta_difference", c1, c2) + + /** + * Subtracts two binary representations of Datasketches ThetaSketch objects in the input columns + * using a Datasketches AnotB object + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_difference(columnName1: String, columnName2: String): Column = { + theta_difference(Column(columnName1), Column(columnName2)) + } + + /** + * Intersects two binary representations of Datasketches ThetaSketch objects in the input + * columns using a Datasketches Intersection object + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_intersection(c1: Column, c2: Column): Column = + Column.fn("theta_intersection", c1, c2) + + /** + * Intersects two binary representations of Datasketches ThetaSketch objects in the input + * columns using a Datasketches Intersection object + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_intersection(columnName1: String, columnName2: String): Column = { + theta_intersection(Column(columnName1), Column(columnName2)) + } + + /** + * Returns the estimated number of unique values given the binary representation of a + * Datasketches ThetaSketch. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_sketch_estimate(c: Column): Column = Column.fn("theta_sketch_estimate", c) + + /** + * Returns the estimated number of unique values given the binary representation of a + * Datasketches ThetaSketch. 
+ * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_sketch_estimate(columnName: String): Column = { + theta_sketch_estimate(Column(columnName)) + } + + /** + * Unions two binary representations of Datasketches ThetaSketch objects in the input columns + * using a Datasketches Union object. It is configured with the default value of 12 for + * `lgNomEntries`. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_union(c1: Column, c2: Column): Column = + Column.fn("theta_union", c1, c2) + + /** + * Unions two binary representations of Datasketches ThetaSketch objects in the input columns + * using a Datasketches Union object. It is configured with the default value of 12 for + * `lgNomEntries`. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_union(columnName1: String, columnName2: String): Column = { + theta_union(Column(columnName1), Column(columnName2)) + } + + /** + * Unions two binary representations of Datasketches ThetaSketch objects in the input columns + * using a Datasketches Union object. It allows the configuration of `lgNomEntries` log nominal + * entries for the union buffer. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_union(c1: Column, c2: Column, lgNomEntries: Int): Column = + Column.fn("theta_union", c1, c2, lit(lgNomEntries)) + + /** + * Unions two binary representations of Datasketches ThetaSketch objects in the input columns + * using a Datasketches Union object. It allows the configuration of `lgNomEntries` log nominal + * entries for the union buffer. + * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_union(columnName1: String, columnName2: String, lgNomEntries: Int): Column = { + theta_union(Column(columnName1), Column(columnName2), lgNomEntries) + } + + /** + * Unions two binary representations of Datasketches ThetaSketch objects in the input columns + * using a Datasketches Union object. It allows the configuration of `lgNomEntries` log nominal + * entries for the union buffer. 
+ * + * @group misc_funcs + * @since 4.1.0 + */ + def theta_union(c1: Column, c2: Column, lgNomEntries: Column): Column = + Column.fn("theta_union", c1, c2, lgNomEntries) + /** * Returns the user name of current execution context. * diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index afa43e876b26d..9c3ace2471851 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -530,6 +530,9 @@ object FunctionRegistry { expression[HllSketchAgg]("hll_sketch_agg"), expression[HllUnionAgg]("hll_union_agg"), expression[ApproxTopK]("approx_top_k"), + expression[ThetaSketchAgg]("theta_sketch_agg"), + expression[ThetaUnionAgg]("theta_union_agg"), + expression[ThetaIntersectionAgg]("theta_intersection_agg"), expression[ApproxTopKAccumulate]("approx_top_k_accumulate"), // string functions @@ -791,6 +794,10 @@ object FunctionRegistry { expression[EqualNull]("equal_null"), expression[HllSketchEstimate]("hll_sketch_estimate"), expression[HllUnion]("hll_union"), + expression[ThetaSketchEstimate]("theta_sketch_estimate"), + expression[ThetaUnion]("theta_union"), + expression[ThetaDifference]("theta_difference"), + expression[ThetaIntersection]("theta_intersection"), expression[ApproxTopKEstimate]("approx_top_k_estimate"), // grouping sets diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/thetasketchesAggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/thetasketchesAggregates.scala new file mode 100644 index 0000000000000..7e55c006782cf --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/thetasketchesAggregates.scala @@ -0,0 +1,644 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions.aggregate + +import org.apache.datasketches.memory.Memory +import org.apache.datasketches.theta.{CompactSketch, Intersection, SetOperation, Sketch, Union, UpdateSketch, UpdateSketchBuilder} + +import org.apache.spark.SparkUnsupportedOperationException +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, ExpressionDescription, Literal} +import org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggregate +import org.apache.spark.sql.catalyst.trees.{BinaryLike, UnaryLike} +import org.apache.spark.sql.catalyst.util.{ArrayData, CollationFactory, ThetaSketchUtils} +import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.internal.types.StringTypeWithCollation +import org.apache.spark.sql.types.{AbstractDataType, ArrayType, BinaryType, DataType, DoubleType, FloatType, IntegerType, LongType, StringType, TypeCollection} +import org.apache.spark.unsafe.types.UTF8String + +sealed trait ThetaSketchState { + def serialize(): Array[Byte] + def eval(): Array[Byte] +} +case class UpdatableSketchBuffer(sketch: UpdateSketch) extends 
ThetaSketchState { + override def serialize(): Array[Byte] = sketch.rebuild.compact.toByteArrayCompressed + override def eval(): Array[Byte] = sketch.rebuild.compact.toByteArrayCompressed +} +case class UnionAggregationBuffer(union: Union) extends ThetaSketchState { + override def serialize(): Array[Byte] = union.getResult.toByteArrayCompressed + override def eval(): Array[Byte] = union.getResult.toByteArrayCompressed +} +case class IntersectionAggregationBuffer(intersection: Intersection) extends ThetaSketchState { + override def serialize(): Array[Byte] = intersection.getResult.toByteArrayCompressed + override def eval(): Array[Byte] = intersection.getResult.toByteArrayCompressed +} +case class FinalizedSketch(sketch: CompactSketch) extends ThetaSketchState { + override def serialize(): Array[Byte] = sketch.toByteArrayCompressed + override def eval(): Array[Byte] = sketch.toByteArrayCompressed +} + +/** + * The ThetaSketchAgg function utilizes a Datasketches ThetaSketch instance to count a + * probabilistic approximation of the number of unique values in a given column, and outputs the + * binary representation of the ThetaSketch. + * + * See [[https://datasketches.apache.org/docs/Theta/ThetaSketches.html]] for more information. + * + * @param left + * child expression against which unique counting will occur + * @param right + * the log-base-2 of nomEntries decides the number of buckets for the sketch + * @param mutableAggBufferOffset + * offset for mutable aggregation buffer + * @param inputAggBufferOffset + * offset for input aggregation buffer + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(expr, lgNomEntries) - Returns the ThetaSketch compact binary representation. + `lgNomEntries` (optional) is the log-base-2 of nominal entries, with nominal entries deciding + the number of buckets or slots for the ThetaSketch.
""", + examples = """ + Examples: + > SELECT theta_sketch_estimate(_FUNC_(col, 12)) FROM VALUES (1), (1), (2), (2), (3) tab(col); + 3 + """, + group = "agg_funcs", + since = "4.1.0") +// scalastyle:on line.size.limit +case class ThetaSketchAgg( + left: Expression, + right: Expression, + override val mutableAggBufferOffset: Int, + override val inputAggBufferOffset: Int) + extends TypedImperativeAggregate[ThetaSketchState] + with BinaryLike[Expression] + with ExpectsInputTypes { + + // ThetaSketch config - mark as lazy so that they're not evaluated during tree transformation. + + lazy val lgNomEntries: Int = { + val lgNomEntriesInput = right.eval().asInstanceOf[Int] + ThetaSketchUtils.checkLgNomLongs(lgNomEntriesInput, prettyName) + lgNomEntriesInput + } + + // Constructors + + def this(child: Expression) = { + this(child, Literal(ThetaSketchUtils.DEFAULT_LG_NOM_LONGS), 0, 0) + } + + def this(child: Expression, lgNomEntries: Expression) = { + this(child, lgNomEntries, 0, 0) + } + + def this(child: Expression, lgNomEntries: Int) = { + this(child, Literal(lgNomEntries), 0, 0) + } + + // Copy constructors required by ImperativeAggregate + + override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ThetaSketchAgg = + copy(mutableAggBufferOffset = newMutableAggBufferOffset) + + override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ThetaSketchAgg = + copy(inputAggBufferOffset = newInputAggBufferOffset) + + override protected def withNewChildrenInternal( + newLeft: Expression, + newRight: Expression): ThetaSketchAgg = + copy(left = newLeft, right = newRight) + + // Overrides for TypedImperativeAggregate + + override def prettyName: String = "theta_sketch_agg" + + override def inputTypes: Seq[AbstractDataType] = + Seq( + TypeCollection( + ArrayType(IntegerType), + ArrayType(LongType), + BinaryType, + DoubleType, + FloatType, + IntegerType, + LongType, + StringTypeWithCollation(supportsTrimCollation = true)), + IntegerType) + + override def 
dataType: DataType = BinaryType + + override def nullable: Boolean = false + + /** + * Instantiate an UpdateSketch instance using the lgNomEntries param. + * + * @return + * an UpdateSketch instance wrapped with UpdatableSketchBuffer + */ + override def createAggregationBuffer(): ThetaSketchState = { + val builder = new UpdateSketchBuilder + builder.setLogNominalEntries(lgNomEntries) + UpdatableSketchBuffer(builder.build) + } + + /** + * Evaluate the input row and update the UpdateSketch instance with the row's value. The update + * function only supports a subset of Spark SQL types, and an exception will be thrown for + * unsupported types. + * Notes: + * - Null values are ignored. + * - Empty byte arrays are ignored + * - Empty arrays of supported element types are ignored + * - Strings that are collation-equal to the empty string are ignored. + * + * @param updateBuffer + * A previously initialized UpdateSketch instance + * @param input + * An input row + */ + override def update(updateBuffer: ThetaSketchState, input: InternalRow): ThetaSketchState = { + // Return early for null values. + val v = left.eval(input) + if (v == null) return updateBuffer + + // Initialized buffer should be UpdatableSketchBuffer, else error out. + val sketch = updateBuffer match { + case UpdatableSketchBuffer(s) => s + case _ => throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName) + } + + // Handle the different data types for sketch updates. + left.dataType match { + case ArrayType(IntegerType, _) => + val arr = v.asInstanceOf[ArrayData].toIntArray() + sketch.update(arr) + case ArrayType(LongType, _) => + val arr = v.asInstanceOf[ArrayData].toLongArray() + sketch.update(arr) + case BinaryType => + val bytes = v.asInstanceOf[Array[Byte]] + sketch.update(bytes) + case DoubleType => + sketch.update(v.asInstanceOf[Double]) + case FloatType => + sketch.update(v.asInstanceOf[Float].toDouble) // Float is promoted to double. 
+ case IntegerType => + sketch.update(v.asInstanceOf[Int].toLong) // Int is promoted to Long. + case LongType => + sketch.update(v.asInstanceOf[Long]) + case st: StringType => + val collation = CollationFactory.fetchCollation(st.collationId) + val str = v.asInstanceOf[UTF8String] + if (!collation.equalsFunction(str, UTF8String.EMPTY_UTF8)) { + sketch.update(collation.sortKeyFunction.apply(str)) + } + case _ => + throw new SparkUnsupportedOperationException( + errorClass = "_LEGACY_ERROR_TEMP_3121", + messageParameters = Map("dataType" -> left.dataType.toString)) + } + + UpdatableSketchBuffer(sketch) + } + + /** + * Merges an input Compact sketch into the UpdateSketch which is acting as the aggregation + * buffer. + * + * @param updateBuffer + * The UpdateSketch or Union instance used to store the aggregation result + * @param input + * An input UpdateSketch, Union, or Compact sketch instance + */ + override def merge( + updateBuffer: ThetaSketchState, + input: ThetaSketchState): ThetaSketchState = { + // This is a helper function to create union only when needed. + def createUnionWith(sketch1: Sketch, sketch2: Sketch): UnionAggregationBuffer = { + val union = SetOperation.builder.setLogNominalEntries(lgNomEntries).buildUnion + union.union(sketch1) + union.union(sketch2) + UnionAggregationBuffer(union) + } + + (updateBuffer, input) match { + // Reuse the existing union in the next iteration. This is the most efficient path. + case (UnionAggregationBuffer(existingUnion), UpdatableSketchBuffer(sketch)) => + existingUnion.union(sketch.compact) + UnionAggregationBuffer(existingUnion) + case (UnionAggregationBuffer(existingUnion), FinalizedSketch(sketch)) => + existingUnion.union(sketch) + UnionAggregationBuffer(existingUnion) + case (UnionAggregationBuffer(union1), UnionAggregationBuffer(union2)) => + union1.union(union2.getResult) + UnionAggregationBuffer(union1) + // Create a new union only when necessary. 
+ case (UpdatableSketchBuffer(sketch1), UpdatableSketchBuffer(sketch2)) =>
+ createUnionWith(sketch1.compact, sketch2.compact)
+ case (UpdatableSketchBuffer(sketch1), FinalizedSketch(sketch2)) =>
+ createUnionWith(sketch1.compact, sketch2)
+ // The program should never make it here, the cases are for defensive programming.
+ case (FinalizedSketch(sketch1), UpdatableSketchBuffer(sketch2)) =>
+ createUnionWith(sketch1, sketch2.compact)
+ case (FinalizedSketch(sketch1), FinalizedSketch(sketch2)) =>
+ createUnionWith(sketch1, sketch2)
+ case _ => throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName)
+ }
+ }
+
+ /**
+ * Returns a Compact sketch derived from the input column or expression
+ *
+ * @param sketchState
+ * Union instance used as an aggregation buffer
+ * @return
+ * A Compact binary sketch
+ */
+ override def eval(sketchState: ThetaSketchState): Any = {
+ sketchState.eval()
+ }
+
+ /** Convert the underlying UpdateSketch/Union into a Compact byte array. */
+ override def serialize(sketchState: ThetaSketchState): Array[Byte] = {
+ sketchState.serialize()
+ }
+
+ /** Wrap the byte array into a Compact sketch instance. */
+ override def deserialize(buffer: Array[Byte]): ThetaSketchState = {
+ if (buffer.nonEmpty) {
+ FinalizedSketch(CompactSketch.heapify(Memory.wrap(buffer)))
+ } else {
+ this.createAggregationBuffer()
+ }
+ }
+}
+
+/**
+ * The ThetaUnionAgg function ingests and merges Datasketches ThetaSketch instances previously
+ * produced by the ThetaSketchAgg function and outputs the merged ThetaSketch.
+ *
+ * See [[https://datasketches.apache.org/docs/Theta/ThetaSketches.html]] for more information.
+ *
+ * @param left
+ * Child expression against which unique counting will occur
+ * @param right
+ * the log-base-2 of nomEntries decides the number of buckets for the sketch
+ * @param mutableAggBufferOffset
+ * offset for mutable aggregation buffer
+ * @param inputAggBufferOffset
+ * offset for input aggregation buffer
+ */
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = """
+ _FUNC_(expr, lgNomEntries) - Returns the ThetaSketch's Compact binary representation.
+ `lgNomEntries` (optional) the log-base-2 of Nominal Entries, with Nominal Entries deciding
+ the number of buckets or slots for the ThetaSketch.""",
+ examples = """
+ Examples:
+ > SELECT theta_sketch_estimate(_FUNC_(sketch)) FROM (SELECT theta_sketch_agg(col) as sketch FROM VALUES (1) tab(col) UNION ALL SELECT theta_sketch_agg(col, 20) as sketch FROM VALUES (1) tab(col));
+ 1
+ """,
+ group = "agg_funcs",
+ since = "4.1.0")
+// scalastyle:on line.size.limit
+case class ThetaUnionAgg(
+ left: Expression,
+ right: Expression,
+ override val mutableAggBufferOffset: Int,
+ override val inputAggBufferOffset: Int)
+ extends TypedImperativeAggregate[ThetaSketchState]
+ with BinaryLike[Expression]
+ with ExpectsInputTypes {
+
+ // ThetaSketch config - mark as lazy so that they're not evaluated during tree transformation.
+ + lazy val lgNomEntries: Int = { + val lgNomEntriesInput = right.eval().asInstanceOf[Int] + ThetaSketchUtils.checkLgNomLongs(lgNomEntriesInput, prettyName) + lgNomEntriesInput + } + + // Constructors + + def this(child: Expression) = { + this(child, Literal(ThetaSketchUtils.DEFAULT_LG_NOM_LONGS), 0, 0) + } + + def this(child: Expression, lgNomEntries: Expression) = { + this(child, lgNomEntries, 0, 0) + } + + def this(child: Expression, lgNomEntries: Int) = { + this(child, Literal(lgNomEntries), 0, 0) + } + + // Copy constructors required by ImperativeAggregate + + override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ThetaUnionAgg = + copy(mutableAggBufferOffset = newMutableAggBufferOffset) + + override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ThetaUnionAgg = + copy(inputAggBufferOffset = newInputAggBufferOffset) + + override protected def withNewChildrenInternal( + newLeft: Expression, + newRight: Expression): ThetaUnionAgg = + copy(left = newLeft, right = newRight) + + // Overrides for TypedImperativeAggregate + + override def prettyName: String = "theta_union_agg" + + override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType, IntegerType) + + override def dataType: DataType = BinaryType + + override def nullable: Boolean = false + + /** + * Instantiate a Union instance using the lgNomEntries param. + * + * @return + * a Union instance wrapped with UnionAggregationBuffer + */ + override def createAggregationBuffer(): ThetaSketchState = { + UnionAggregationBuffer( + SetOperation.builder + .setLogNominalEntries(lgNomEntries) + .buildUnion) + } + + /** + * Update the Union instance with the Compact sketch byte array obtained from the row. + * + * @param unionBuffer + * A previously initialized Union instance + * @param input + * An input row + */ + override def update(unionBuffer: ThetaSketchState, input: InternalRow): ThetaSketchState = { + // Return early for null input values. 
+ val v = left.eval(input) + if (v == null) return unionBuffer + + // Sketches must be in binary form to be aggregated, else error out. + left.dataType match { + case BinaryType => // Continue processing with a BinaryType. + case _ => throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName) + } + + val sketchBytes = v.asInstanceOf[Array[Byte]] + val inputSketch = ThetaSketchUtils.wrapCompactSketch(sketchBytes, prettyName) + + val union = unionBuffer match { + case UnionAggregationBuffer(existingUnionBuffer) => existingUnionBuffer + case _ => throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName) + } + union.union(inputSketch) + UnionAggregationBuffer(union) + } + + /** + * Merges an input Compact sketch into the Union which is acting as the aggregation buffer. + * + * @param unionBuffer + * The Union instance used to store the aggregation result + * @param input + * An input Union or Compact sketch instance + */ + override def merge(unionBuffer: ThetaSketchState, input: ThetaSketchState): ThetaSketchState = { + (unionBuffer, input) match { + // If both arguments are union objects, merge them directly. + case (UnionAggregationBuffer(unionLeft), UnionAggregationBuffer(unionRight)) => + unionLeft.union(unionRight.getResult) + UnionAggregationBuffer(unionLeft) + // The input was serialized then deserialized. + case (UnionAggregationBuffer(union), FinalizedSketch(sketch)) => + union.union(sketch) + UnionAggregationBuffer(union) + // The program should never make it here, the cases are for defensive programming. 
+ case (FinalizedSketch(sketch1), FinalizedSketch(sketch2)) =>
+ val union = SetOperation.builder.setLogNominalEntries(lgNomEntries).buildUnion
+ union.union(sketch1)
+ union.union(sketch2)
+ UnionAggregationBuffer(union)
+ case (FinalizedSketch(sketch), UnionAggregationBuffer(union)) =>
+ union.union(sketch)
+ UnionAggregationBuffer(union)
+ case _ => throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName)
+ }
+ }
+
+ /**
+ * Returns a Compact sketch derived from the merged sketches
+ *
+ * @param sketchState
+ * Union instance used as an aggregation buffer
+ * @return
+ * A Compact binary sketch
+ */
+ override def eval(sketchState: ThetaSketchState): Any = {
+ sketchState.eval()
+ }
+
+ /** Converts the underlying Union into a Compact byte array. */
+ override def serialize(sketchState: ThetaSketchState): Array[Byte] = {
+ sketchState.serialize()
+ }
+
+ /** Wrap the byte array into a Compact sketch instance. */
+ override def deserialize(buffer: Array[Byte]): ThetaSketchState = {
+ if (buffer.nonEmpty) {
+ FinalizedSketch(CompactSketch.heapify(Memory.wrap(buffer)))
+ } else {
+ this.createAggregationBuffer()
+ }
+ }
+}
+
+/**
+ * The ThetaIntersectionAgg function ingests and intersects Datasketches ThetaSketch instances
+ * previously produced by the ThetaSketchAgg function, and outputs the intersected ThetaSketch.
+ *
+ * See [[https://datasketches.apache.org/docs/Theta/ThetaSketches.html]] for more information.
+ *
+ * @param child
+ * Child expression against which unique counting will occur
+ * @param mutableAggBufferOffset
+ * offset for mutable aggregation buffer
+ * @param inputAggBufferOffset
+ * offset for input aggregation buffer
+ */
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = """
+ _FUNC_(expr) - Returns the ThetaSketch's Compact binary representation
+ by intersecting all the Theta sketches in the input column.""",
+ examples = """
+ Examples:
+ > SELECT theta_sketch_estimate(_FUNC_(sketch)) FROM (SELECT theta_sketch_agg(col) as sketch FROM VALUES (1) tab(col) UNION ALL SELECT theta_sketch_agg(col, 20) as sketch FROM VALUES (1) tab(col));
+ 1
+ """,
+ group = "agg_funcs",
+ since = "4.1.0")
+// scalastyle:on line.size.limit
+case class ThetaIntersectionAgg(
+ child: Expression,
+ override val mutableAggBufferOffset: Int,
+ override val inputAggBufferOffset: Int)
+ extends TypedImperativeAggregate[ThetaSketchState]
+ with UnaryLike[Expression]
+ with ExpectsInputTypes {
+
+ // Constructor
+
+ def this(child: Expression) = {
+ this(child, 0, 0)
+ }
+
+ // Copy constructors required by ImperativeAggregate
+
+ override def withNewMutableAggBufferOffset(
+ newMutableAggBufferOffset: Int): ThetaIntersectionAgg =
+ copy(mutableAggBufferOffset = newMutableAggBufferOffset)
+
+ override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ThetaIntersectionAgg =
+ copy(inputAggBufferOffset = newInputAggBufferOffset)
+
+ override protected def withNewChildInternal(newChild: Expression): ThetaIntersectionAgg =
+ copy(child = newChild)
+
+ // Overrides for TypedImperativeAggregate
+
+ override def prettyName: String = "theta_intersection_agg"
+
+ override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType)
+
+ override def dataType: DataType = BinaryType
+
+ override def nullable: Boolean = false
+
+ /**
+ * Instantiate an Intersection instance.
+ * + * @return + * an Intersection instance wrapped with IntersectionAggregationBuffer + */ + override def createAggregationBuffer(): ThetaSketchState = { + IntersectionAggregationBuffer(SetOperation.builder.buildIntersection) + } + + /** + * Update the Intersection instance with the Compact sketch byte array obtained from the row. + * + * @param intersectionBuffer + * A previously initialized Intersection instance + * @param input + * An input row + */ + override def update( + intersectionBuffer: ThetaSketchState, + input: InternalRow): ThetaSketchState = { + // Return early for null input values. + val v = child.eval(input) + if (v == null) return intersectionBuffer + + // Sketches must be in binary form to be aggregated, else error out. + child.dataType match { + case BinaryType => // Continue processing with a BinaryType. + case _ => throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName) + } + + val sketchBytes = v.asInstanceOf[Array[Byte]] + val inputSketch = ThetaSketchUtils.wrapCompactSketch(sketchBytes, prettyName) + + val intersection = intersectionBuffer match { + case IntersectionAggregationBuffer(existingIntersection) => existingIntersection + case _ => throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName) + } + intersection.intersect(inputSketch) + IntersectionAggregationBuffer(intersection) + } + + /** + * Merges an input Compact sketch into the Intersection which is acting as the aggregation + * buffer. + * + * @param intersectionBuffer + * The Intersection instance used to store the aggregation result + * @param input + * An input Intersection or Compact sketch instance + */ + override def merge( + intersectionBuffer: ThetaSketchState, + input: ThetaSketchState): ThetaSketchState = { + (intersectionBuffer, input) match { + // If both arguments are intersection objects, merge them directly. 
+ case (
+ IntersectionAggregationBuffer(intersectLeft),
+ IntersectionAggregationBuffer(intersectRight)) =>
+ intersectLeft.intersect(intersectRight.getResult)
+ IntersectionAggregationBuffer(intersectLeft)
+ // The input was serialized then deserialized.
+ case (IntersectionAggregationBuffer(intersection), FinalizedSketch(sketch)) =>
+ intersection.intersect(sketch)
+ IntersectionAggregationBuffer(intersection)
+ // The program should never make it here, the cases are for defensive programming.
+ case (FinalizedSketch(sketch1), FinalizedSketch(sketch2)) =>
+ val intersection =
+ SetOperation.builder.buildIntersection
+ intersection.intersect(sketch1)
+ intersection.intersect(sketch2)
+ IntersectionAggregationBuffer(intersection)
+ case (FinalizedSketch(sketch), IntersectionAggregationBuffer(intersection)) =>
+ intersection.intersect(sketch)
+ IntersectionAggregationBuffer(intersection)
+ case _ => throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName)
+ }
+ }
+
+ /**
+ * Returns a Compact sketch derived from the intersected sketches
+ *
+ * @param sketchState
+ * Intersection instance used as an aggregation buffer
+ * @return
+ * A Compact binary sketch
+ */
+ override def eval(sketchState: ThetaSketchState): Any = {
+ sketchState.eval()
+ }
+
+ /** Convert the underlying Intersection into a Compact byte array. */
+ override def serialize(sketchState: ThetaSketchState): Array[Byte] = {
+ sketchState.serialize()
+ }
+
+ /** Wrap the byte array into a Compact sketch.
*/ + override def deserialize(buffer: Array[Byte]): ThetaSketchState = { + if (buffer.nonEmpty) { + FinalizedSketch(CompactSketch.heapify(Memory.wrap(buffer))) + } else { + this.createAggregationBuffer() + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/thetasketchesExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/thetasketchesExpressions.scala new file mode 100644 index 0000000000000..f662f405297bb --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/thetasketchesExpressions.scala @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.datasketches.theta.SetOperation + +import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, ExpressionDescription, Literal} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.util.ThetaSketchUtils +import org.apache.spark.sql.types.{AbstractDataType, BinaryType, DataType, IntegerType, LongType} + +@ExpressionDescription( + usage = """ + _FUNC_(expr) - Returns the estimated number of unique values + given the binary representation of a Datasketches ThetaSketch. """, + examples = """ + Examples: + > SELECT _FUNC_(theta_sketch_agg(col)) FROM VALUES (1), (1), (2), (2), (3) tab(col); + 3 + """, + group = "misc_funcs", + since = "4.1.0") +case class ThetaSketchEstimate(child: Expression) + extends UnaryExpression + with CodegenFallback + with ExpectsInputTypes { + override def nullIntolerant: Boolean = true + + override protected def withNewChildInternal(newChild: Expression): ThetaSketchEstimate = + copy(child = newChild) + + override def prettyName: String = "theta_sketch_estimate" + + override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType) + + override def dataType: DataType = LongType + + override def nullSafeEval(input: Any): Any = { + val buffer = input.asInstanceOf[Array[Byte]] + + val sketch = ThetaSketchUtils.wrapCompactSketch(buffer, prettyName) + + Math.round(sketch.getEstimate) + } +} + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(first, second, lgNomEntries) - Merges two binary representations of + Datasketches ThetaSketch objects using a ThetaSketch Union object. Users can set + lgNomEntries to a value between 4 and 26 to find the union of sketches with different + union buffer size values (defaults to 12). 
""", + examples = """ + Examples: + > SELECT theta_sketch_estimate(_FUNC_(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (1, 4), (1, 4), (2, 5), (2, 5), (3, 6) tab(col1, col2); + 6 + """, + group = "misc_funcs", + since = "4.1.0") +// scalastyle:on line.size.limit +case class ThetaUnion(first: Expression, second: Expression, third: Expression) + extends TernaryExpression + with CodegenFallback + with ExpectsInputTypes { + override def nullIntolerant: Boolean = true + + def this(first: Expression, second: Expression) = { + this(first, second, Literal(ThetaSketchUtils.DEFAULT_LG_NOM_LONGS)) + } + + def this(first: Expression, second: Expression, third: Int) = { + this(first, second, Literal(third)) + } + + override protected def withNewChildrenInternal( + newFirst: Expression, + newSecond: Expression, + newThird: Expression): ThetaUnion = + copy(first = newFirst, second = newSecond, third = newThird) + + override def prettyName: String = "theta_union" + + override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType, BinaryType, IntegerType) + + override def dataType: DataType = BinaryType + + override def nullSafeEval(value1: Any, value2: Any, value3: Any): Any = { + val logNominalEntries = value3.asInstanceOf[Int] + ThetaSketchUtils.checkLgNomLongs(logNominalEntries, prettyName) + + val sketch1Bytes = value1.asInstanceOf[Array[Byte]] + val sketch1 = ThetaSketchUtils.wrapCompactSketch(sketch1Bytes, prettyName) + + val sketch2Bytes = value2.asInstanceOf[Array[Byte]] + val sketch2 = ThetaSketchUtils.wrapCompactSketch(sketch2Bytes, prettyName) + + val union = SetOperation.builder + .setLogNominalEntries(logNominalEntries) + .buildUnion + .union(sketch1, sketch2) + + union.toByteArrayCompressed + } +} + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(first, second) - Subtracts two binary representations of + Datasketches ThetaSketch objects from two input columns using a + ThetaSketch AnotB object. 
""", + examples = """ + Examples: + > SELECT theta_sketch_estimate(_FUNC_(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (5, 4), (1, 4), (2, 5), (2, 5), (3, 1) tab(col1, col2); + 2 + """, + group = "misc_funcs", + since = "4.1.0") +// scalastyle:on line.size.limit +case class ThetaDifference(first: Expression, second: Expression) + extends BinaryExpression + with CodegenFallback + with ExpectsInputTypes { + override def nullIntolerant: Boolean = true + + override def left: Expression = first + override def right: Expression = second + + override protected def withNewChildrenInternal( + newFirst: Expression, + newSecond: Expression): ThetaDifference = + copy(first = newFirst, second = newSecond) + + override def prettyName: String = "theta_difference" + + override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType, BinaryType) + + override def dataType: DataType = BinaryType + + override def nullSafeEval(value1: Any, value2: Any): Any = { + val sketch1Bytes = value1.asInstanceOf[Array[Byte]] + val sketch1 = ThetaSketchUtils.wrapCompactSketch(sketch1Bytes, prettyName) + + val sketch2Bytes = value2.asInstanceOf[Array[Byte]] + val sketch2 = ThetaSketchUtils.wrapCompactSketch(sketch2Bytes, prettyName) + + val difference = SetOperation.builder.buildANotB + .aNotB(sketch1, sketch2) + + difference.toByteArrayCompressed + } +} + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(first, second) - Intersects two binary representations of + Datasketches ThetaSketch objects from two input columns using a + ThetaSketch Intersect object. 
""", + examples = """ + Examples: + > SELECT theta_sketch_estimate(_FUNC_(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (5, 4), (1, 4), (2, 5), (2, 5), (3, 1) tab(col1, col2); + 2 + """, + group = "misc_funcs", + since = "4.1.0") +// scalastyle:on line.size.limit +case class ThetaIntersection(first: Expression, second: Expression) + extends BinaryExpression + with CodegenFallback + with ExpectsInputTypes { + override def nullIntolerant: Boolean = true + + override def left: Expression = first + override def right: Expression = second + + override protected def withNewChildrenInternal( + newFirst: Expression, + newSecond: Expression): ThetaIntersection = + copy(first = newFirst, second = newSecond) + + override def prettyName: String = "theta_intersection" + + override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType, BinaryType) + + override def dataType: DataType = BinaryType + + override def nullSafeEval(value1: Any, value2: Any): Any = { + val sketch1Bytes = value1.asInstanceOf[Array[Byte]] + val sketch1 = ThetaSketchUtils.wrapCompactSketch(sketch1Bytes, prettyName) + + val sketch2Bytes = value2.asInstanceOf[Array[Byte]] + val sketch2 = ThetaSketchUtils.wrapCompactSketch(sketch2Bytes, prettyName) + + val intersection = SetOperation.builder.buildIntersection + .intersect(sketch1, sketch2) + + intersection.toByteArrayCompressed + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtils.scala new file mode 100644 index 0000000000000..f9a651b5662db --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtils.scala @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.util + +import org.apache.datasketches.common.SketchesArgumentException +import org.apache.datasketches.memory.{Memory, MemoryBoundsException} +import org.apache.datasketches.theta.CompactSketch + +import org.apache.spark.sql.errors.QueryExecutionErrors + +object ThetaSketchUtils { + /* + * Bounds copied from DataSketches' ThetaUtil. These define the valid range for lgNomEntries, + * which is the log-base-2 of the nominal number of entries that determines the sketch size. + * The actual number of buckets in the sketch = 2^lgNomEntries. + * MIN_LG_NOM_LONGS = 4 means minimum 16 buckets (2^4), MAX_LG_NOM_LONGS = 26 means maximum + * ~67 million buckets (2^26). These bounds ensure reasonable memory usage while maintaining + * sketch accuracy for cardinality estimation. + */ + final val MIN_LG_NOM_LONGS = 4 + final val MAX_LG_NOM_LONGS = 26 + final val DEFAULT_LG_NOM_LONGS = 12 + + /** + * Validates the lgNomLongs parameter for Theta sketch size. Throws a Spark SQL exception if the + * value is out of bounds. 
+ * + * @param lgNomLongs + * Log2 of nominal entries + */ + def checkLgNomLongs(lgNomLongs: Int, prettyName: String): Unit = { + if (lgNomLongs < MIN_LG_NOM_LONGS || lgNomLongs > MAX_LG_NOM_LONGS) { + throw QueryExecutionErrors.thetaInvalidLgNomEntries( + function = prettyName, + min = MIN_LG_NOM_LONGS, + max = MAX_LG_NOM_LONGS, + value = lgNomLongs) + } + } + + /** + * Wraps a byte array into a DataSketches CompactSketch object. + * This method safely deserializes a compact Theta sketch from its binary representation, + * handling potential deserialization errors by throwing appropriate Spark SQL exceptions. + * + * @param bytes The binary representation of a compact theta sketch + * @param prettyName The display name of the function/expression for error messages + * @return A CompactSketch object wrapping the provided bytes + */ + def wrapCompactSketch(bytes: Array[Byte], prettyName: String): CompactSketch = { + val memory = try { + Memory.wrap(bytes) + } catch { + case _: NullPointerException | _: MemoryBoundsException => + throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName) + } + + try { + CompactSketch.wrap(memory) + } catch { + case _: SketchesArgumentException | _: MemoryBoundsException => + throw QueryExecutionErrors.thetaInvalidInputSketchBuffer(prettyName) + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala index 67bb80403b9f1..18edffc4ec59e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala @@ -3112,4 +3112,20 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE ) ) } + + def thetaInvalidInputSketchBuffer(function: String): Throwable = { + new SparkRuntimeException( + errorClass = "THETA_INVALID_INPUT_SKETCH_BUFFER", + messageParameters 
= Map("function" -> toSQLId(function))) + } + + def thetaInvalidLgNomEntries(function: String, min: Int, max: Int, value: Int): Throwable = { + new SparkRuntimeException( + errorClass = "THETA_INVALID_LG_NOM_ENTRIES", + messageParameters = Map( + "function" -> toSQLId(function), + "min" -> toSQLValue(min, IntegerType), + "max" -> toSQLValue(max, IntegerType), + "value" -> toSQLValue(value, IntegerType))) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ThetasketchesAggSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ThetasketchesAggSuite.scala new file mode 100644 index 0000000000000..afb978b8204de --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ThetasketchesAggSuite.scala @@ -0,0 +1,175 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.expressions.aggregate + +import scala.collection.immutable.NumericRange +import scala.util.Random + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{BoundReference, ThetaSketchEstimate} +import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.types.{ArrayType, BinaryType, DataType, DoubleType, FloatType, IntegerType, LongType, StringType} +import org.apache.spark.unsafe.types.UTF8String + +class ThetasketchesAggSuite extends SparkFunSuite { + + def simulateUpdateMerge( + dataType: DataType, + input: Seq[Any], + numSketches: Integer = 5): (Long, NumericRange[Long]) = { + + // Create a map of the agg function instances. + val aggFunctionMap = Seq + .tabulate(numSketches)(index => { + val sketch = new ThetaSketchAgg(BoundReference(0, dataType, nullable = true)) + index -> (sketch, sketch.createAggregationBuffer()) + }) + .toMap + + // Randomly update agg function instances. + input.map(value => { + val (aggFunction, aggBuffer) = aggFunctionMap(Random.nextInt(numSketches)) + aggFunction.update(aggBuffer, InternalRow(value)) + }) + + def serializeDeserialize( + tuple: (ThetaSketchAgg, ThetaSketchState)): (ThetaSketchAgg, ThetaSketchState) = { + val (agg, buf) = tuple + val serialized = agg.serialize(buf) + (agg, agg.deserialize(serialized)) + } + + // Simulate serialization -> deserialization -> merge. 
+ val mapValues = aggFunctionMap.values + val (mergedAgg, UnionAggregationBuffer(mergedBuf)) = + mapValues.tail.foldLeft(mapValues.head)((prev, cur) => { + val (prevAgg, prevBuf) = serializeDeserialize(prev) + val (_, curBuf) = serializeDeserialize(cur) + + (prevAgg, prevAgg.merge(prevBuf, curBuf)) + }) + + val estimator = ThetaSketchEstimate(BoundReference(0, BinaryType, nullable = true)) + val estimate = + estimator.eval(InternalRow(mergedBuf.getResult.toByteArrayCompressed)).asInstanceOf[Long] + ( + estimate, + mergedBuf.getResult.getLowerBound(3).toLong to mergedBuf.getResult.getUpperBound(3).toLong) + } + + test("SPARK-52407: Test min/max values of supported datatypes") { + val intRange = Integer.MIN_VALUE to Integer.MAX_VALUE by 10000000 + val (intEstimate, intEstimateRange) = simulateUpdateMerge(IntegerType, intRange) + assert(intEstimate == intRange.size || intEstimateRange.contains(intRange.size.toLong)) + + val longRange = Long.MinValue to Long.MaxValue by 1000000000000000L + val (longEstimate, longEstimateRange) = simulateUpdateMerge(LongType, longRange) + assert(longEstimate == longRange.size || longEstimateRange.contains(longRange.size.toLong)) + + val stringRange = Seq.tabulate(1000)(i => UTF8String.fromString(Random.nextString(i + 1))) + val (stringEstimate, stringEstimateRange) = simulateUpdateMerge(StringType, stringRange) + assert( + stringEstimate == stringRange.size || + stringEstimateRange.contains(stringRange.size.toLong)) + + val binaryRange = + Seq.tabulate(1000)(i => UTF8String.fromString(Random.nextString(i + 1)).getBytes) + val (binaryEstimate, binaryEstimateRange) = simulateUpdateMerge(BinaryType, binaryRange) + assert( + binaryEstimate == binaryRange.size || + binaryEstimateRange.contains(binaryRange.size.toLong)) + + val floatRange = (1 to 1000).map(_.toFloat) + val (floatEstimate, floatRangeEst) = simulateUpdateMerge(FloatType, floatRange) + assert(floatEstimate == floatRange.size || floatRangeEst.contains(floatRange.size.toLong)) + + 
val doubleRange = (1 to 1000).map(_.toDouble) + val (doubleEstimate, doubleRangeEst) = simulateUpdateMerge(DoubleType, doubleRange) + assert(doubleEstimate == doubleRange.size || doubleRangeEst.contains(doubleRange.size.toLong)) + + val arrayIntRange = (1 to 500).map(i => ArrayData.toArrayData(Array(i, i + 1))) + val (arrayIntEstimate, arrayIntRangeEst) = + simulateUpdateMerge(ArrayType(IntegerType), arrayIntRange) + assert( + arrayIntEstimate == arrayIntRange.size || + arrayIntRangeEst.contains(arrayIntRange.size.toLong)) + + val arrayLongRange = + (1 to 500).map(i => ArrayData.toArrayData(Array(i.toLong, (i + 1).toLong))) + val (arrayLongEstimate, arrayLongRangeEst) = + simulateUpdateMerge(ArrayType(LongType), arrayLongRange) + assert( + arrayLongEstimate == arrayLongRange.size || + arrayLongRangeEst.contains(arrayLongRange.size.toLong)) + } + + test("SPARK-52407: Test lgNomEntries results in downsampling sketches during Union") { + // Create a sketch with larger configuration (more precise). + val aggFunc1 = new ThetaSketchAgg(BoundReference(0, IntegerType, nullable = true), 12) + val sketch1 = aggFunc1.createAggregationBuffer() + (0 to 100).map(i => aggFunc1.update(sketch1, InternalRow(i))) + val binary1 = aggFunc1.eval(sketch1) + + // Create a sketch with smaller configuration (less precise). + val aggFunc2 = new ThetaSketchAgg(BoundReference(0, IntegerType, nullable = true), 10) + val sketch2 = aggFunc2.createAggregationBuffer() + (0 to 100).map(i => aggFunc2.update(sketch2, InternalRow(i))) + val binary2 = aggFunc2.eval(sketch2) + + // Union the sketches. 
+ val unionAgg = new ThetaUnionAgg(BoundReference(0, BinaryType, nullable = true), 12) + val union = unionAgg.createAggregationBuffer() + unionAgg.update(union, InternalRow(binary1)) + unionAgg.update(union, InternalRow(binary2)) + val unionResult = unionAgg.eval(union) + + // Verify the estimate is still accurate despite the different configurations. + val estimate = ThetaSketchEstimate(BoundReference(0, BinaryType, nullable = true)) + .eval(InternalRow(unionResult)) + assert(estimate.asInstanceOf[Long] >= 95 && estimate.asInstanceOf[Long] <= 105) + } + + test("SPARK-52407: Test lgNomEntries results in downsampling sketches during intersection") { + // Create a sketch with a larger configuration (more precise). + val aggFunc1 = new ThetaSketchAgg(BoundReference(0, IntegerType, nullable = true), 12) + val sketch1 = aggFunc1.createAggregationBuffer() + (0 to 150).map(i => aggFunc1.update(sketch1, InternalRow(i))) + val binary1 = aggFunc1.eval(sketch1) + + // Create a sketch with a smaller configuration (less precise). + val aggFunc2 = new ThetaSketchAgg(BoundReference(0, IntegerType, nullable = true), 10) + val sketch2 = aggFunc2.createAggregationBuffer() + (50 to 200).map(i => aggFunc2.update(sketch2, InternalRow(i))) + val binary2 = aggFunc2.eval(sketch2) + + // Intersect the sketches. + val intersectionAgg = + new ThetaIntersectionAgg(BoundReference(0, BinaryType, nullable = true)) + val intersection = intersectionAgg.createAggregationBuffer() + intersectionAgg.update(intersection, InternalRow(binary1)) + intersectionAgg.update(intersection, InternalRow(binary2)) + val intersectionResult = intersectionAgg.eval(intersection) + + // Verify the estimate is still accurate despite the different configurations; + // it should be around 101 (overlap from 50 to 150).
+ val estimate = ThetaSketchEstimate(BoundReference(0, BinaryType, nullable = true)) + .eval(InternalRow(intersectionResult)) + assert(estimate.asInstanceOf[Long] >= 95 && estimate.asInstanceOf[Long] <= 105) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtilsSuite.scala new file mode 100644 index 0000000000000..1a21d7a4d6c87 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ThetaSketchUtilsSuite.scala @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.util + +import org.apache.datasketches.theta.UpdateSketch + +import org.apache.spark.{SparkFunSuite, SparkRuntimeException} +import org.apache.spark.sql.catalyst.plans.SQLHelper + +class ThetaSketchUtilsSuite extends SparkFunSuite with SQLHelper { + + test("checkLgNomLongs: accepts values within valid range") { + val validValues = + Seq(ThetaSketchUtils.MIN_LG_NOM_LONGS, 10, 20, ThetaSketchUtils.MAX_LG_NOM_LONGS) + validValues.foreach { value => + // There should be no error here. 
+ ThetaSketchUtils.checkLgNomLongs(value, "test_function") + } + } + + + test("checkLgNomLongs: throws exception for values below minimum") { + val invalidValues = Seq(ThetaSketchUtils.MIN_LG_NOM_LONGS - 1, 0, -5) + invalidValues.foreach { value => + checkError( + exception = intercept[SparkRuntimeException] { + ThetaSketchUtils.checkLgNomLongs(value, "test_function") + }, + condition = "THETA_INVALID_LG_NOM_ENTRIES", + parameters = Map( + "function" -> "`test_function`", + "min" -> ThetaSketchUtils.MIN_LG_NOM_LONGS.toString, + "max" -> ThetaSketchUtils.MAX_LG_NOM_LONGS.toString, + "value" -> value.toString + ) + ) + } + } + + test("checkLgNomLongs: throws exception for values above maximum") { + val invalidValues = Seq(ThetaSketchUtils.MAX_LG_NOM_LONGS + 1, 30, 100) + invalidValues.foreach { value => + checkError( + exception = intercept[SparkRuntimeException] { + ThetaSketchUtils.checkLgNomLongs(value, "test_function") + }, + condition = "THETA_INVALID_LG_NOM_ENTRIES", + parameters = Map( + "function" -> "`test_function`", + "min" -> ThetaSketchUtils.MIN_LG_NOM_LONGS.toString, + "max" -> ThetaSketchUtils.MAX_LG_NOM_LONGS.toString, + "value" -> value.toString + ) + ) + } + } + + test("wrapCompactSketch: successfully wraps valid sketch bytes") { + // Create a valid sketch and get its bytes. + val updateSketch = UpdateSketch.builder().build() + updateSketch.update("test1") + updateSketch.update("test2") + updateSketch.update("test3") + val compactSketch = updateSketch.compact + val validBytes = compactSketch.toByteArrayCompressed + + // Test that wrapCompactSketch can successfully wrap the valid bytes. 
+ val wrappedSketch = ThetaSketchUtils.wrapCompactSketch(validBytes, "test_function") + + assert(wrappedSketch != null) + assert(wrappedSketch.getEstimate == compactSketch.getEstimate) + assert(wrappedSketch.getRetainedEntries == compactSketch.getRetainedEntries) + } + + test("wrapCompactSketch: throws exception for null bytes") { + checkError( + exception = intercept[SparkRuntimeException] { + ThetaSketchUtils.wrapCompactSketch(null, "test_function") + }, + condition = "THETA_INVALID_INPUT_SKETCH_BUFFER", + parameters = Map("function" -> "`test_function`") + ) + } + + test("wrapCompactSketch: throws exception for empty bytes") { + checkError( + exception = intercept[SparkRuntimeException] { + ThetaSketchUtils.wrapCompactSketch(Array.empty[Byte], "test_function") + }, + condition = "THETA_INVALID_INPUT_SKETCH_BUFFER", + parameters = Map("function" -> "`test_function`") + ) + } + + test("wrapCompactSketch: throws exception for invalid bytes") { + val invalidBytes = Array[Byte](1, 2, 3, 4, 5) + checkError( + exception = intercept[SparkRuntimeException] { + ThetaSketchUtils.wrapCompactSketch(invalidBytes, "test_function") + }, + condition = "THETA_INVALID_INPUT_SKETCH_BUFFER", + parameters = Map("function" -> "`test_function`") + ) + } +} diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index 6e6d520efbac3..22c8a017ca596 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -342,6 +342,10 @@ | org.apache.spark.sql.catalyst.expressions.Subtract | - | SELECT 2 - 1 | struct<(2 - 1):int> | | org.apache.spark.sql.catalyst.expressions.Tan | tan | SELECT tan(0) | struct | | org.apache.spark.sql.catalyst.expressions.Tanh | tanh | SELECT tanh(0) | struct | +| org.apache.spark.sql.catalyst.expressions.ThetaDifference | theta_difference | SELECT 
theta_sketch_estimate(theta_difference(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (5, 4), (1, 4), (2, 5), (2, 5), (3, 1) tab(col1, col2) | struct | +| org.apache.spark.sql.catalyst.expressions.ThetaIntersection | theta_intersection | SELECT theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (5, 4), (1, 4), (2, 5), (2, 5), (3, 1) tab(col1, col2) | struct | +| org.apache.spark.sql.catalyst.expressions.ThetaSketchEstimate | theta_sketch_estimate | SELECT theta_sketch_estimate(theta_sketch_agg(col)) FROM VALUES (1), (1), (2), (2), (3) tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.ThetaUnion | theta_union | SELECT theta_sketch_estimate(theta_union(theta_sketch_agg(col1), theta_sketch_agg(col2))) FROM VALUES (1, 4), (1, 4), (2, 5), (2, 5), (3, 6) tab(col1, col2) | struct | | org.apache.spark.sql.catalyst.expressions.TimeDiff | time_diff | SELECT time_diff('HOUR', TIME'20:30:29', TIME'21:30:28') | struct | | org.apache.spark.sql.catalyst.expressions.TimeTrunc | time_trunc | SELECT time_trunc('HOUR', TIME'09:32:05.359') | struct | | org.apache.spark.sql.catalyst.expressions.TimeWindow | window | SELECT a, window.start, window.end, count(*) as cnt FROM VALUES ('A1', '2021-01-01 00:00:00'), ('A1', '2021-01-01 00:04:30'), ('A1', '2021-01-01 00:06:00'), ('A2', '2021-01-01 00:01:00') AS tab(a, b) GROUP by a, window(b, '5 minutes') ORDER BY a, start | struct | @@ -463,6 +467,9 @@ | org.apache.spark.sql.catalyst.expressions.aggregate.StddevSamp | stddev | SELECT stddev(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | | org.apache.spark.sql.catalyst.expressions.aggregate.StddevSamp | stddev_samp | SELECT stddev_samp(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | | org.apache.spark.sql.catalyst.expressions.aggregate.Sum | sum | SELECT sum(col) FROM VALUES (5), (10), (15) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.ThetaIntersectionAgg | 
theta_intersection_agg | SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) FROM (SELECT theta_sketch_agg(col) as sketch FROM VALUES (1) tab(col) UNION ALL SELECT theta_sketch_agg(col, 20) as sketch FROM VALUES (1) tab(col)) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.ThetaSketchAgg | theta_sketch_agg | SELECT theta_sketch_estimate(theta_sketch_agg(col, 12)) FROM VALUES (1), (1), (2), (2), (3) tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.ThetaUnionAgg | theta_union_agg | SELECT theta_sketch_estimate(theta_union_agg(sketch)) FROM (SELECT theta_sketch_agg(col) as sketch FROM VALUES (1) tab(col) UNION ALL SELECT theta_sketch_agg(col, 20) as sketch FROM VALUES (1) tab(col)) | struct | | org.apache.spark.sql.catalyst.expressions.aggregate.TryAverageExpressionBuilder | try_avg | SELECT try_avg(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | | org.apache.spark.sql.catalyst.expressions.aggregate.TrySumExpressionBuilder | try_sum | SELECT try_sum(col) FROM VALUES (5), (10), (15) AS tab(col) | struct | | org.apache.spark.sql.catalyst.expressions.aggregate.VariancePop | var_pop | SELECT var_pop(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/thetasketch.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/thetasketch.sql.out new file mode 100644 index 0000000000000..323084223d4bc --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/thetasketch.sql.out @@ -0,0 +1,1323 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +DROP TABLE IF EXISTS t_int_1_5_through_7_11 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_int_1_5_through_7_11 + + +-- !query +CREATE TABLE t_int_1_5_through_7_11 AS +VALUES + (1, 5), (2, 6), (3, 7), (4, 8), (5, 9), (6, 10), (7, 11) AS tab(col1, col2) +-- !query analysis +CreateDataSourceTableAsSelectCommand 
`spark_catalog`.`default`.`t_int_1_5_through_7_11`, ErrorIfExists, [col1, col2] + +- SubqueryAlias tab + +- LocalRelation [col1#x, col2#x] + + +-- !query +DROP TABLE IF EXISTS t_long_1_5_through_7_11 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_long_1_5_through_7_11 + + +-- !query +CREATE TABLE t_long_1_5_through_7_11 AS +VALUES + (1L, 5L), (2L, 6L), (3L, 7L), (4L, 8L), (5L, 9L), (6L, 10L), (7L, 11L) AS tab(col1, col2) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t_long_1_5_through_7_11`, ErrorIfExists, [col1, col2] + +- SubqueryAlias tab + +- LocalRelation [col1#xL, col2#xL] + + +-- !query +DROP TABLE IF EXISTS t_double_1_1_1_4_through_1_5_1_8 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_double_1_1_1_4_through_1_5_1_8 + + +-- !query +CREATE TABLE t_double_1_1_1_4_through_1_5_1_8 AS +SELECT CAST(col1 AS DOUBLE) AS col1, CAST(col2 AS DOUBLE) AS col2 +FROM VALUES + (1.1, 1.4), (1.2, 1.5), (1.3, 1.6), (1.4, 1.7), (1.5, 1.8) AS tab(col1, col2) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t_double_1_1_1_4_through_1_5_1_8`, ErrorIfExists, [col1, col2] + +- Project [cast(col1#x as double) AS col1#x, cast(col2#x as double) AS col2#x] + +- SubqueryAlias tab + +- LocalRelation [col1#x, col2#x] + + +-- !query +DROP TABLE IF EXISTS t_float_1_1_1_4_through_1_5_1_8 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_float_1_1_1_4_through_1_5_1_8 + + +-- !query +CREATE TABLE t_float_1_1_1_4_through_1_5_1_8 AS +SELECT CAST(col1 AS FLOAT) col1, CAST(col2 AS FLOAT) col2 +FROM VALUES + (1.1, 1.4), (1.2, 1.5), (1.3, 1.6), (1.4, 1.7), (1.5, 1.8) AS tab(col1, col2) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t_float_1_1_1_4_through_1_5_1_8`, ErrorIfExists, [col1, col2] + +- Project 
[cast(col1#x as float) AS col1#x, cast(col2#x as float) AS col2#x] + +- SubqueryAlias tab + +- LocalRelation [col1#x, col2#x] + + +-- !query +DROP TABLE IF EXISTS t_string_a_d_through_e_h +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_string_a_d_through_e_h + + +-- !query +CREATE TABLE t_string_a_d_through_e_h AS +VALUES + ('a', 'd'), ('b', 'e'), ('c', 'f'), ('d', 'g'), ('e', 'h') AS tab(col1, col2) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t_string_a_d_through_e_h`, ErrorIfExists, [col1, col2] + +- SubqueryAlias tab + +- LocalRelation [col1#x, col2#x] + + +-- !query +DROP TABLE IF EXISTS t_binary_a_b_through_e_f +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_binary_a_b_through_e_f + + +-- !query +CREATE TABLE t_binary_a_b_through_e_f AS +VALUES + (X'A', X'B'), (X'B', X'C'), (X'C', X'D'), (X'D', X'E'), (X'E', X'F') AS tab(col1, col2) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t_binary_a_b_through_e_f`, ErrorIfExists, [col1, col2] + +- SubqueryAlias tab + +- LocalRelation [col1#x, col2#x] + + +-- !query +DROP TABLE IF EXISTS t_array_int_1_3_through_4_6 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_array_int_1_3_through_4_6 + + +-- !query +CREATE TABLE t_array_int_1_3_through_4_6 AS +VALUES + (ARRAY(1), ARRAY(3)), + (ARRAY(2), ARRAY(4)), + (ARRAY(3), ARRAY(5)), + (ARRAY(4), ARRAY(6)) AS tab(col1, col2) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t_array_int_1_3_through_4_6`, ErrorIfExists, [col1, col2] + +- SubqueryAlias tab + +- LocalRelation [col1#x, col2#x] + + +-- !query +DROP TABLE IF EXISTS t_array_long_1_3_through_4_6 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_array_long_1_3_through_4_6 + 
+ +-- !query +CREATE TABLE t_array_long_1_3_through_4_6 AS +VALUES + (ARRAY(1L), ARRAY(3L)), + (ARRAY(2L), ARRAY(4L)), + (ARRAY(3L), ARRAY(5L)), + (ARRAY(4L), ARRAY(6L)) AS tab(col1, col2) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t_array_long_1_3_through_4_6`, ErrorIfExists, [col1, col2] + +- SubqueryAlias tab + +- LocalRelation [col1#x, col2#x] + + +-- !query +DROP TABLE IF EXISTS t_string_collation +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_string_collation + + +-- !query +CREATE TABLE t_string_collation AS +VALUES + (''), (' '), (CAST(X'C1' AS STRING)), (CAST(X'80' AS STRING)), + ('\uFFFD'), ('Å'), ('å'), ('a\u030A'), ('Å '), ('å '), + ('a\u030A ') AS tab(col1) +-- !query analysis +CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t_string_collation`, ErrorIfExists, [col1] + +- SubqueryAlias tab + +- LocalRelation [col1#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) AS result FROM t_int_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col1#x, 12, 0, 0)) AS result#xL] ++- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_array_int_1_3_through_4_6 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col1#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col1, 12))#xL] ++- SubqueryAlias spark_catalog.default.t_array_int_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_int_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col2)) FROM t_array_long_1_3_through_4_6 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col2#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col2, 12))#xL] ++- SubqueryAlias 
spark_catalog.default.t_array_long_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_long_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_binary_a_b_through_e_f +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col1#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col1, 12))#xL] ++- SubqueryAlias spark_catalog.default.t_binary_a_b_through_e_f + +- Relation spark_catalog.default.t_binary_a_b_through_e_f[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_double_1_1_1_4_through_1_5_1_8 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col1#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col1, 12))#xL] ++- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col2)) FROM t_float_1_1_1_4_through_1_5_1_8 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col2#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col2, 12))#xL] ++- SubqueryAlias spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1, 22)) FROM t_int_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col1#x, 22, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col1, 22))#xL] ++- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_long_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col1#xL, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col1, 
12))#xL] ++- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 + +- Relation spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_string_a_d_through_e_h +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col1#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col1, 12))#xL] ++- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h + +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0), 12)) AS theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12), 12))#xL] ++- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1, 15), + theta_sketch_agg(col2))) FROM t_long_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union(theta_sketch_agg(col1#xL, 15, 0, 0), theta_sketch_agg(col2#xL, 12, 0, 0), 12)) AS theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 15), theta_sketch_agg(col2, 12), 12))#xL] ++- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 + +- Relation spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0), 12)) AS theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 12), 
theta_sketch_agg(col2, 12), 12))#xL] ++- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1, 6), + theta_sketch_agg(col2, 15), 15)) FROM t_float_1_1_1_4_through_1_5_1_8 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union(theta_sketch_agg(col1#x, 6, 0, 0), theta_sketch_agg(col2#x, 15, 0, 0), 15)) AS theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 6), theta_sketch_agg(col2, 15), 15))#xL] ++- SubqueryAlias spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0), 12)) AS theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12), 12))#xL] ++- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h + +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2), 20)) FROM t_binary_a_b_through_e_f +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0), 20)) AS theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12), 20))#xL] ++- SubqueryAlias spark_catalog.default.t_binary_a_b_through_e_f + +- Relation spark_catalog.default.t_binary_a_b_through_e_f[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6 
+-- !query analysis +Aggregate [theta_sketch_estimate(theta_union(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0), 12)) AS theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12), 12))#xL] ++- SubqueryAlias spark_catalog.default.t_array_int_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_int_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 13))) FROM t_array_long_1_3_through_4_6 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 13, 0, 0), 12)) AS theta_sketch_estimate(theta_union(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 13), 12))#xL] ++- SubqueryAlias spark_catalog.default.t_array_long_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_long_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1, 5), + theta_sketch_agg(col2, 12))) FROM t_long_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1#xL, 5, 0, 0), theta_sketch_agg(col2#xL, 12, 0, 0))) AS theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 5), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 + +- Relation 
spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1, 5), + theta_sketch_agg(col2))) FROM t_float_1_1_1_4_through_1_5_1_8 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1#x, 5, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 5), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h + +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 22))) FROM t_binary_a_b_through_e_f +-- !query analysis +Aggregate 
[theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 22, 0, 0))) AS theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 22)))#xL] ++- SubqueryAlias spark_catalog.default.t_binary_a_b_through_e_f + +- Relation spark_catalog.default.t_binary_a_b_through_e_f[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_array_int_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_int_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 10))) FROM t_array_long_1_3_through_4_6 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 10, 0, 0))) AS theta_sketch_estimate(theta_intersection(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 10)))#xL] ++- SubqueryAlias spark_catalog.default.t_array_long_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_long_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_difference(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + +- Relation 
spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 5))) FROM t_long_1_5_through_7_11 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_difference(theta_sketch_agg(col1#xL, 12, 0, 0), theta_sketch_agg(col2#xL, 5, 0, 0))) AS theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 5)))#xL] ++- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 + +- Relation spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_difference(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1, 12), + theta_sketch_agg(col2))) FROM t_float_1_1_1_4_through_1_5_1_8 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_difference(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h +-- !query analysis +Aggregate [theta_sketch_estimate(theta_difference(theta_sketch_agg(col1#x, 
12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h + +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1, 6), + theta_sketch_agg(col2, 8))) FROM t_binary_a_b_through_e_f +-- !query analysis +Aggregate [theta_sketch_estimate(theta_difference(theta_sketch_agg(col1#x, 6, 0, 0), theta_sketch_agg(col2#x, 8, 0, 0))) AS theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 6), theta_sketch_agg(col2, 8)))#xL] ++- SubqueryAlias spark_catalog.default.t_binary_a_b_through_e_f + +- Relation spark_catalog.default.t_binary_a_b_through_e_f[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_difference(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 12, 0, 0))) AS theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12)))#xL] ++- SubqueryAlias spark_catalog.default.t_array_int_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_int_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 4))) FROM t_array_long_1_3_through_4_6 +-- !query analysis +Aggregate [theta_sketch_estimate(theta_difference(theta_sketch_agg(col1#x, 12, 0, 0), theta_sketch_agg(col2#x, 4, 0, 0))) AS theta_sketch_estimate(theta_difference(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 4)))#xL] ++- SubqueryAlias spark_catalog.default.t_array_long_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_long_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT 
theta_sketch_estimate(theta_union_agg(sketch, 15)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2, 20) as sketch FROM t_int_1_5_through_7_11) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union_agg(sketch#x, 15, 0, 0)) AS theta_sketch_estimate(theta_union_agg(sketch, 15))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + : +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 20, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 12)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_double_1_1_1_4_through_1_5_1_8) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union_agg(sketch#x, 12, 0, 0)) AS theta_sketch_estimate(theta_union_agg(sketch, 12))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 + : +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 14)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_string_a_d_through_e_h + UNION ALL + SELECT theta_sketch_agg(col2) as 
sketch FROM t_string_a_d_through_e_h) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union_agg(sketch#x, 14, 0, 0)) AS theta_sketch_estimate(theta_union_agg(sketch, 14))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h + : +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h + +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 10)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_long_1_5_through_7_11) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union_agg(sketch#x, 10, 0, 0)) AS theta_sketch_estimate(theta_union_agg(sketch, 10))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#xL, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 + : +- Relation spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet + +- Aggregate [theta_sketch_agg(col2#xL, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 + +- Relation spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 6)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_float_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_float_1_1_1_4_through_1_5_1_8) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union_agg(sketch#x, 6, 0, 0)) AS theta_sketch_estimate(theta_union_agg(sketch, 6))#xL] ++- SubqueryAlias 
__auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8 + : +- Relation spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_binary_a_b_through_e_f + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_binary_a_b_through_e_f) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union_agg(sketch#x, 12, 0, 0)) AS theta_sketch_estimate(theta_union_agg(sketch, 12))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_binary_a_b_through_e_f + : +- Relation spark_catalog.default.t_binary_a_b_through_e_f[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_binary_a_b_through_e_f + +- Relation spark_catalog.default.t_binary_a_b_through_e_f[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 12)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_int_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_int_1_3_through_4_6) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union_agg(sketch#x, 12, 0, 0)) AS theta_sketch_estimate(theta_union_agg(sketch, 12))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias 
spark_catalog.default.t_array_int_1_3_through_4_6 + : +- Relation spark_catalog.default.t_array_int_1_3_through_4_6[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_array_int_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_int_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 16)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_long_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_long_1_3_through_4_6) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_union_agg(sketch#x, 16, 0, 0)) AS theta_sketch_estimate(theta_union_agg(sketch, 16))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_array_long_1_3_through_4_6 + : +- Relation spark_catalog.default.t_array_long_1_3_through_4_6[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_array_long_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_long_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_int_1_5_through_7_11) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection_agg(sketch#x, 0, 0)) AS theta_sketch_estimate(theta_intersection_agg(sketch))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + : +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 
0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 + +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_long_1_5_through_7_11) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection_agg(sketch#x, 0, 0)) AS theta_sketch_estimate(theta_intersection_agg(sketch))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#xL, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 + : +- Relation spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet + +- Aggregate [theta_sketch_agg(col2#xL, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 + +- Relation spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_float_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_float_1_1_1_4_through_1_5_1_8) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection_agg(sketch#x, 0, 0)) AS theta_sketch_estimate(theta_intersection_agg(sketch))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8 + : +- Relation spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_float_1_1_1_4_through_1_5_1_8[col1#x,col2#x] 
parquet + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_double_1_1_1_4_through_1_5_1_8) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection_agg(sketch#x, 0, 0)) AS theta_sketch_estimate(theta_intersection_agg(sketch))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 + : +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 + +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_string_a_d_through_e_h + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_string_a_d_through_e_h) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection_agg(sketch#x, 0, 0)) AS theta_sketch_estimate(theta_intersection_agg(sketch))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h + : +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h + +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM 
t_binary_a_b_through_e_f + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_binary_a_b_through_e_f) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection_agg(sketch#x, 0, 0)) AS theta_sketch_estimate(theta_intersection_agg(sketch))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_binary_a_b_through_e_f + : +- Relation spark_catalog.default.t_binary_a_b_through_e_f[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_binary_a_b_through_e_f + +- Relation spark_catalog.default.t_binary_a_b_through_e_f[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_int_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_int_1_3_through_4_6) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_intersection_agg(sketch#x, 0, 0)) AS theta_sketch_estimate(theta_intersection_agg(sketch))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_array_int_1_3_through_4_6 + : +- Relation spark_catalog.default.t_array_int_1_3_through_4_6[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_array_int_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_int_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_long_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_long_1_3_through_4_6) +-- !query analysis +Aggregate 
[theta_sketch_estimate(theta_intersection_agg(sketch#x, 0, 0)) AS theta_sketch_estimate(theta_intersection_agg(sketch))#xL] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias spark_catalog.default.t_array_long_1_3_through_4_6 + : +- Relation spark_catalog.default.t_array_long_1_3_through_4_6[col1#x,col2#x] parquet + +- Aggregate [theta_sketch_agg(col2#x, 12, 0, 0) AS sketch#x] + +- SubqueryAlias spark_catalog.default.t_array_long_1_3_through_4_6 + +- Relation spark_catalog.default.t_array_long_1_3_through_4_6[col1#x,col2#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (1), (null), (2), (null), (3) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES ('test'), (null), ('null'), (null) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (100L), (null), (200L), (null), (300L) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#xL, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#xL] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(CAST(col AS DOUBLE))) +FROM VALUES (1.1), (null), (2.2), (null), (3.3) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(cast(col#x as double), 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(CAST(col AS DOUBLE), 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT 
theta_sketch_estimate(theta_sketch_agg(CAST(col AS FLOAT))) +FROM VALUES (1.5), (null), (2.5), (null), (3.5) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(cast(col#x as float), 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(CAST(col AS FLOAT), 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (X'AA'), (null), (X'BB'), (null), (X'CC') tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(1, 2)), (null), (ARRAY(3, 4)), (null), (ARRAY(5, 6)) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(10L, 20L)), (null), (ARRAY(30L, 40L)), (null), (ARRAY(50L, 60L)) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(1, null)), (ARRAY(1)), (ARRAY(2, null, 3)), (ARRAY(4)) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(10L, null)), (ARRAY(10L)), (ARRAY(20L, null, 30L)), (ARRAY(40L)) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS 
theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY()), (ARRAY(1, 2)), (ARRAY()), (ARRAY(3, 4)) tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (''), ('a'), (''), ('b'), ('c') tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (X''), (X'01'), (X'02'), (X'03'), (CAST(' ' AS BINARY)), (X'e280'), (X'c1'), (X'c120') tab(col) +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col#x, 12, 0, 0)) AS theta_sketch_estimate(theta_sketch_agg(col, 12))#xL] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) utf8_b FROM t_string_collation +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(col1#x, 12, 0, 0)) AS utf8_b#xL] ++- SubqueryAlias spark_catalog.default.t_string_collation + +- Relation spark_catalog.default.t_string_collation[col1#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_LCASE)) utf8_lc FROM t_string_collation +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(collate(col1#x, UTF8_LCASE), 12, 0, 0)) AS utf8_lc#xL] ++- SubqueryAlias spark_catalog.default.t_string_collation + +- Relation spark_catalog.default.t_string_collation[col1#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE)) unicode FROM t_string_collation +-- !query analysis +Aggregate 
[theta_sketch_estimate(theta_sketch_agg(collate(col1#x, UNICODE), 12, 0, 0)) AS unicode#xL] ++- SubqueryAlias spark_catalog.default.t_string_collation + +- Relation spark_catalog.default.t_string_collation[col1#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_CI)) unicode_ci FROM t_string_collation +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(collate(col1#x, UNICODE_CI), 12, 0, 0)) AS unicode_ci#xL] ++- SubqueryAlias spark_catalog.default.t_string_collation + +- Relation spark_catalog.default.t_string_collation[col1#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_BINARY_RTRIM)) utf8_b_rt FROM t_string_collation +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(collate(col1#x, UTF8_BINARY_RTRIM), 12, 0, 0)) AS utf8_b_rt#xL] ++- SubqueryAlias spark_catalog.default.t_string_collation + +- Relation spark_catalog.default.t_string_collation[col1#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_LCASE_RTRIM)) utf8_lc_rt FROM t_string_collation +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(collate(col1#x, UTF8_LCASE_RTRIM), 12, 0, 0)) AS utf8_lc_rt#xL] ++- SubqueryAlias spark_catalog.default.t_string_collation + +- Relation spark_catalog.default.t_string_collation[col1#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_RTRIM)) unicode_rt FROM t_string_collation +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(collate(col1#x, UNICODE_RTRIM), 12, 0, 0)) AS unicode_rt#xL] ++- SubqueryAlias spark_catalog.default.t_string_collation + +- Relation spark_catalog.default.t_string_collation[col1#x] parquet + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_CI_RTRIM)) unicode_ci_rt FROM t_string_collation +-- !query analysis +Aggregate [theta_sketch_estimate(theta_sketch_agg(collate(col1#x, 
UNICODE_CI_RTRIM), 12, 0, 0)) AS unicode_ci_rt#xL] ++- SubqueryAlias spark_catalog.default.t_string_collation + +- Relation spark_catalog.default.t_string_collation[col1#x] parquet + + +-- !query +WITH sketches AS ( + SELECT 'int_sketch' as sketch_type, theta_sketch_agg(col1, 12) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT 'long_sketch' as sketch_type, theta_sketch_agg(col1, 15) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT 'double_sketch' as sketch_type, theta_sketch_agg(col1, 10) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT 'string_sketch' as sketch_type, theta_sketch_agg(col1, 14) as sketch FROM t_string_a_d_through_e_h +), +union_result AS ( + SELECT theta_union_agg(sketch, 16) as union_sketch FROM sketches +), +individual_sketches AS ( + SELECT theta_sketch_agg(col1, 12) as sketch1, theta_sketch_agg(col2, 12) as sketch2 FROM t_int_1_5_through_7_11 +) +SELECT + theta_sketch_estimate((SELECT union_sketch FROM union_result)) as union_estimate, + theta_sketch_estimate(theta_union(sketch1, sketch2, 15)) as binary_union_estimate, + theta_sketch_estimate(theta_intersection(sketch1, sketch2)) as intersection_estimate, + theta_sketch_estimate(theta_difference(sketch1, sketch2)) as difference_estimate +FROM individual_sketches +-- !query analysis +WithCTE +:- CTERelationDef xxxx, false +: +- SubqueryAlias sketches +: +- Union false, false +: :- Union false, false +: : :- Union false, false +: : : :- Aggregate [int_sketch AS sketch_type#x, theta_sketch_agg(col1#x, 12, 0, 0) AS sketch#x] +: : : : +- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 +: : : : +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet +: : : +- Aggregate [long_sketch AS sketch_type#x, theta_sketch_agg(col1#xL, 15, 0, 0) AS sketch#x] +: : : +- SubqueryAlias spark_catalog.default.t_long_1_5_through_7_11 +: : : +- Relation spark_catalog.default.t_long_1_5_through_7_11[col1#xL,col2#xL] parquet +: : +- Aggregate 
[double_sketch AS sketch_type#x, theta_sketch_agg(col1#x, 10, 0, 0) AS sketch#x] +: : +- SubqueryAlias spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8 +: : +- Relation spark_catalog.default.t_double_1_1_1_4_through_1_5_1_8[col1#x,col2#x] parquet +: +- Aggregate [string_sketch AS sketch_type#x, theta_sketch_agg(col1#x, 14, 0, 0) AS sketch#x] +: +- SubqueryAlias spark_catalog.default.t_string_a_d_through_e_h +: +- Relation spark_catalog.default.t_string_a_d_through_e_h[col1#x,col2#x] parquet +:- CTERelationDef xxxx, false +: +- SubqueryAlias union_result +: +- Aggregate [theta_union_agg(sketch#x, 16, 0, 0) AS union_sketch#x] +: +- SubqueryAlias sketches +: +- CTERelationRef xxxx, true, [sketch_type#x, sketch#x], false, false, 4 +:- CTERelationDef xxxx, false +: +- SubqueryAlias individual_sketches +: +- Aggregate [theta_sketch_agg(col1#x, 12, 0, 0) AS sketch1#x, theta_sketch_agg(col2#x, 12, 0, 0) AS sketch2#x] +: +- SubqueryAlias spark_catalog.default.t_int_1_5_through_7_11 +: +- Relation spark_catalog.default.t_int_1_5_through_7_11[col1#x,col2#x] parquet ++- Project [theta_sketch_estimate(scalar-subquery#x []) AS union_estimate#xL, theta_sketch_estimate(theta_union(sketch1#x, sketch2#x, 15)) AS binary_union_estimate#xL, theta_sketch_estimate(theta_intersection(sketch1#x, sketch2#x)) AS intersection_estimate#xL, theta_sketch_estimate(theta_difference(sketch1#x, sketch2#x)) AS difference_estimate#xL] + : +- Project [union_sketch#x] + : +- SubqueryAlias union_result + : +- CTERelationRef xxxx, true, [union_sketch#x], false, false, 1 + +- SubqueryAlias individual_sketches + +- CTERelationRef xxxx, true, [sketch1#x, sketch2#x], false, false, 1 + + +-- !query +SELECT theta_sketch_agg(col, 2) +FROM VALUES (50), (60), (60) tab(col) +-- !query analysis +Aggregate [theta_sketch_agg(col#x, 2, 0, 0) AS theta_sketch_agg(col, 2)#x] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_sketch_agg(col, 40) +FROM VALUES (50), (60), (60) tab(col) +-- 
!query analysis +Aggregate [theta_sketch_agg(col#x, 40, 0, 0) AS theta_sketch_agg(col, 40)#x] ++- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_union_agg(sketch, 3) +FROM (SELECT theta_sketch_agg(col, 12) as sketch + FROM VALUES (1) AS tab(col) + UNION ALL + SELECT theta_sketch_agg(col, 20) as sketch + FROM VALUES (1) AS tab(col)) +-- !query analysis +Aggregate [theta_union_agg(sketch#x, 3, 0, 0) AS theta_union_agg(sketch, 3)#x] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias tab + : +- LocalRelation [col#x] + +- Aggregate [theta_sketch_agg(col#x, 20, 0, 0) AS sketch#x] + +- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_union_agg(sketch, 27) +FROM (SELECT theta_sketch_agg(col, 12) as sketch + FROM VALUES (1) AS tab(col) + UNION ALL + SELECT theta_sketch_agg(col, 20) as sketch + FROM VALUES (1) AS tab(col)) +-- !query analysis +Aggregate [theta_union_agg(sketch#x, 27, 0, 0) AS theta_union_agg(sketch, 27)#x] ++- SubqueryAlias __auto_generated_subquery_name + +- Union false, false + :- Aggregate [theta_sketch_agg(col#x, 12, 0, 0) AS sketch#x] + : +- SubqueryAlias tab + : +- LocalRelation [col#x] + +- Aggregate [theta_sketch_agg(col#x, 20, 0, 0) AS sketch#x] + +- SubqueryAlias tab + +- LocalRelation [col#x] + + +-- !query +SELECT theta_union(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"1\"", + "inputType" : "\"INT\"", + "paramIndex" : "first", + "requiredType" : "\"BINARY\"", + "sqlExpr" : "\"theta_union(1, 2, 12)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 24, + "fragment" : "theta_union(1, 2)" + } ] +} + + +-- 
!query +SELECT theta_intersection(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"1\"", + "inputType" : "\"INT\"", + "paramIndex" : "first", + "requiredType" : "\"BINARY\"", + "sqlExpr" : "\"theta_intersection(1, 2)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 31, + "fragment" : "theta_intersection(1, 2)" + } ] +} + + +-- !query +SELECT theta_difference(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"1\"", + "inputType" : "\"INT\"", + "paramIndex" : "first", + "requiredType" : "\"BINARY\"", + "sqlExpr" : "\"theta_difference(1, 2)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 29, + "fragment" : "theta_difference(1, 2)" + } ] +} + + +-- !query +SELECT theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2), 'invalid') + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"invalid\"", + "inputType" : "\"STRING\"", + "paramIndex" : "third", + "requiredType" : "\"INT\"", + "sqlExpr" : "\"theta_union(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12), invalid)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 86, + "fragment" : "theta_union(\n theta_sketch_agg(col1),\n theta_sketch_agg(col2), 'invalid')" + } ] +} + + 
+-- !query +SELECT theta_intersection( + theta_sketch_agg(col1), + 'invalid_sketch') + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"invalid_sketch\"", + "inputType" : "\"STRING\"", + "paramIndex" : "second", + "requiredType" : "\"BINARY\"", + "sqlExpr" : "\"theta_intersection(theta_sketch_agg(col1, 12), invalid_sketch)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 76, + "fragment" : "theta_intersection(\n theta_sketch_agg(col1),\n 'invalid_sketch')" + } ] +} + + +-- !query +SELECT theta_sketch_estimate(CAST('abc' AS BINARY)) +-- !query analysis +Project [theta_sketch_estimate(cast(abc as binary)) AS theta_sketch_estimate(CAST(abc AS BINARY))#xL] ++- OneRowRelation + + +-- !query +SELECT theta_union(CAST('abc' AS BINARY), CAST('def' AS BINARY)) +-- !query analysis +Project [theta_union(cast(abc as binary), cast(def as binary), 12) AS theta_union(CAST(abc AS BINARY), CAST(def AS BINARY), 12)#x] ++- OneRowRelation + + +-- !query +SELECT theta_intersection(CAST('abc' AS BINARY), CAST('def' AS BINARY)) +-- !query analysis +Project [theta_intersection(cast(abc as binary), cast(def as binary)) AS theta_intersection(CAST(abc AS BINARY), CAST(def AS BINARY))#x] ++- OneRowRelation + + +-- !query +SELECT theta_difference(CAST('abc' AS BINARY), CAST('def' AS BINARY)) +-- !query analysis +Project [theta_difference(cast(abc as binary), cast(def as binary)) AS theta_difference(CAST(abc AS BINARY), CAST(def AS BINARY))#x] ++- OneRowRelation + + +-- !query +SELECT theta_union_agg(buffer, 15) +FROM (SELECT CAST('abc' AS BINARY) AS buffer) +-- !query analysis +Aggregate [theta_union_agg(buffer#x, 15, 0, 0) AS theta_union_agg(buffer, 15)#x] ++- SubqueryAlias __auto_generated_subquery_name + +- Project 
[cast(abc as binary) AS buffer#x] + +- OneRowRelation + + +-- !query +SELECT theta_intersection_agg(buffer) +FROM (SELECT CAST('abc' AS BINARY) AS buffer) +-- !query analysis +Aggregate [theta_intersection_agg(buffer#x, 0, 0) AS theta_intersection_agg(buffer)#x] ++- SubqueryAlias __auto_generated_subquery_name + +- Project [cast(abc as binary) AS buffer#x] + +- OneRowRelation + + +-- !query +DROP TABLE IF EXISTS t_int_1_5_through_7_11 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_int_1_5_through_7_11 + + +-- !query +DROP TABLE IF EXISTS t_long_1_5_through_7_11 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_long_1_5_through_7_11 + + +-- !query +DROP TABLE IF EXISTS t_double_1_1_1_4_through_1_5_1_8 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_double_1_1_1_4_through_1_5_1_8 + + +-- !query +DROP TABLE IF EXISTS t_float_1_1_1_4_through_1_5_1_8 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_float_1_1_1_4_through_1_5_1_8 + + +-- !query +DROP TABLE IF EXISTS t_string_a_d_through_e_h +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_string_a_d_through_e_h + + +-- !query +DROP TABLE IF EXISTS t_binary_a_b_through_e_f +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_binary_a_b_through_e_f + + +-- !query +DROP TABLE IF EXISTS t_array_int_1_3_through_4_6 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_array_int_1_3_through_4_6 + + +-- !query +DROP TABLE IF EXISTS t_array_long_1_3_through_4_6 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_array_long_1_3_through_4_6 + + +-- !query +DROP TABLE IF 
EXISTS t_string_collation +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t_string_collation diff --git a/sql/core/src/test/resources/sql-tests/inputs/thetasketch.sql b/sql/core/src/test/resources/sql-tests/inputs/thetasketch.sql new file mode 100644 index 0000000000000..d270442b50499 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/thetasketch.sql @@ -0,0 +1,528 @@ +-- Positive test cases +-- Create tables with two columns for each data type + +-- Integer table +DROP TABLE IF EXISTS t_int_1_5_through_7_11; +CREATE TABLE t_int_1_5_through_7_11 AS +VALUES + (1, 5), (2, 6), (3, 7), (4, 8), (5, 9), (6, 10), (7, 11) AS tab(col1, col2); + +-- Long table +DROP TABLE IF EXISTS t_long_1_5_through_7_11; +CREATE TABLE t_long_1_5_through_7_11 AS +VALUES + (1L, 5L), (2L, 6L), (3L, 7L), (4L, 8L), (5L, 9L), (6L, 10L), (7L, 11L) AS tab(col1, col2); + +-- Double table +DROP TABLE IF EXISTS t_double_1_1_1_4_through_1_5_1_8; +CREATE TABLE t_double_1_1_1_4_through_1_5_1_8 AS +SELECT CAST(col1 AS DOUBLE) AS col1, CAST(col2 AS DOUBLE) AS col2 +FROM VALUES + (1.1, 1.4), (1.2, 1.5), (1.3, 1.6), (1.4, 1.7), (1.5, 1.8) AS tab(col1, col2); + +-- Float table (must cast, otherwise Spark will store DOUBLEs) +DROP TABLE IF EXISTS t_float_1_1_1_4_through_1_5_1_8; +CREATE TABLE t_float_1_1_1_4_through_1_5_1_8 AS +SELECT CAST(col1 AS FLOAT) col1, CAST(col2 AS FLOAT) col2 +FROM VALUES + (1.1, 1.4), (1.2, 1.5), (1.3, 1.6), (1.4, 1.7), (1.5, 1.8) AS tab(col1, col2); + +-- String table +DROP TABLE IF EXISTS t_string_a_d_through_e_h; +CREATE TABLE t_string_a_d_through_e_h AS +VALUES + ('a', 'd'), ('b', 'e'), ('c', 'f'), ('d', 'g'), ('e', 'h') AS tab(col1, col2); + +-- Binary table +DROP TABLE IF EXISTS t_binary_a_b_through_e_f; +CREATE TABLE t_binary_a_b_through_e_f AS +VALUES + (X'A', X'B'), (X'B', X'C'), (X'C', X'D'), (X'D', X'E'), (X'E', X'F') AS tab(col1, col2); + +-- Array Integer table +DROP TABLE IF EXISTS 
t_array_int_1_3_through_4_6; +CREATE TABLE t_array_int_1_3_through_4_6 AS +VALUES + (ARRAY(1), ARRAY(3)), + (ARRAY(2), ARRAY(4)), + (ARRAY(3), ARRAY(5)), + (ARRAY(4), ARRAY(6)) AS tab(col1, col2); + +-- Array Long table +DROP TABLE IF EXISTS t_array_long_1_3_through_4_6; +CREATE TABLE t_array_long_1_3_through_4_6 AS +VALUES + (ARRAY(1L), ARRAY(3L)), + (ARRAY(2L), ARRAY(4L)), + (ARRAY(3L), ARRAY(5L)), + (ARRAY(4L), ARRAY(6L)) AS tab(col1, col2); + +DROP TABLE IF EXISTS t_string_collation; +-- `\u030A` is the "combining ring above" Unicode character: https://www.compart.com/en/unicode/U+030A +-- `\uFFFD is the Unicode replacement character +-- `\xC1` is an invalid Unicode byte. +-- `\x80` is a Unicode continuation byte, that is it cannot be the first byte of a multi-byte UTF8 character. +-- All strings are different based on the UTF8_BINARY collation. +-- The first and second strings are equal for any collation with the RTRIM modifier, and equal to the empty string. +-- The last three strings are respectively equal to the next last three strings for any collation with the RTRIM modifier. +-- The strings "\xC1", "\x80" and "\uFFFD" are equal for all collations except UTF8_BINARY. +-- The (sub)strings `å` and `a\u030A` are equal for the UNICODE family of collations. +-- `å` is the lowercase version of `Å`. 
+CREATE TABLE t_string_collation AS +VALUES + (''), (' '), (CAST(X'C1' AS STRING)), (CAST(X'80' AS STRING)), + ('\uFFFD'), ('Å'), ('å'), ('a\u030A'), ('Å '), ('å '), + ('a\u030A ') AS tab(col1); + +-- Test basic theta_sketch_agg with IntegerType from table +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) AS result FROM t_int_1_5_through_7_11; + +-- Test theta_sketch_agg with ArrayType(IntegerType) values from table +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_array_int_1_3_through_4_6; + +-- Test theta_sketch_agg with ArrayType(LongType) values from table +SELECT theta_sketch_estimate(theta_sketch_agg(col2)) FROM t_array_long_1_3_through_4_6; + +-- Test theta_sketch_agg with BinaryType values from table +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_binary_a_b_through_e_f; + +-- Test theta_sketch_agg with DoubleType values from table +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_double_1_1_1_4_through_1_5_1_8; + +-- Test theta_sketch_agg with FloatType values from table (promoted to Double internally) +SELECT theta_sketch_estimate(theta_sketch_agg(col2)) FROM t_float_1_1_1_4_through_1_5_1_8; + +-- Test theta_sketch_agg with IntegerType and explicit lgNomEntries parameter +SELECT theta_sketch_estimate(theta_sketch_agg(col1, 22)) FROM t_int_1_5_through_7_11; + +-- Test theta_sketch_agg with LongType values +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_long_1_5_through_7_11; + +-- Test theta_sketch_agg with StringType values +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_string_a_d_through_e_h; + +-- Test theta_union function with IntegerType sketches +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11; + +-- Test theta_union function with LongType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1, 15), + theta_sketch_agg(col2))) FROM 
t_long_1_5_through_7_11; + +-- Test theta_union function with DoubleType sketches +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8; + +-- Test theta_union function with FloatType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1, 6), + theta_sketch_agg(col2, 15), 15)) FROM t_float_1_1_1_4_through_1_5_1_8; + +-- Test theta_union function with StringType sketches +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h; + +-- Test theta_union function with BinaryType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2), 20)) FROM t_binary_a_b_through_e_f; + +-- Test theta_union function with ArrayType(IntegerType) sketches +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6; + +-- Test theta_union function with ArrayType(LongType) sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 13))) FROM t_array_long_1_3_through_4_6; + +-- Test theta_intersection function with IntegerType sketches +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11; + +-- Test theta_intersection function with LongType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1, 5), + theta_sketch_agg(col2, 12))) FROM t_long_1_5_through_7_11; + +-- Test theta_intersection function with DoubleType sketches +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8; + +-- Test theta_intersection 
function with FloatType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1, 5), + theta_sketch_agg(col2))) FROM t_float_1_1_1_4_through_1_5_1_8; + +-- Test theta_intersection function with StringType sketches +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h; + +-- Test theta_intersection function with BinaryType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 22))) FROM t_binary_a_b_through_e_f; + +-- Test theta_intersection function with ArrayType(IntegerType) sketches +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6; + +-- Test theta_intersection function with ArrayType(LongType) sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 10))) FROM t_array_long_1_3_through_4_6; + +-- Test theta_difference function with IntegerType sketches +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11; + +-- Test theta_difference function with LongType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 5))) FROM t_long_1_5_through_7_11; + +-- Test theta_difference function with DoubleType sketches +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8; + +-- Test theta_difference function with FloatType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1, 12), + theta_sketch_agg(col2))) FROM 
t_float_1_1_1_4_through_1_5_1_8; + +-- Test theta_difference function with StringType sketches +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h; + +-- Test theta_difference function with BinaryType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1, 6), + theta_sketch_agg(col2, 8))) FROM t_binary_a_b_through_e_f; + +-- Test theta_difference function with ArrayType(IntegerType) sketches +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6; + +-- Test theta_difference function with ArrayType(LongType) sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 4))) FROM t_array_long_1_3_through_4_6; + +-- Test theta_union_agg with IntegerType and explicit lgNomEntries parameter +SELECT theta_sketch_estimate(theta_union_agg(sketch, 15)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2, 20) as sketch FROM t_int_1_5_through_7_11); + +-- Test theta_union_agg with DoubleType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate(theta_union_agg(sketch, 12)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_double_1_1_1_4_through_1_5_1_8); + +-- Test theta_union_agg with StringType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate(theta_union_agg(sketch, 14)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_string_a_d_through_e_h + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_string_a_d_through_e_h); + +-- Test theta_union_agg with LongType sketches and explicit lgNomEntries parameter +SELECT 
theta_sketch_estimate(theta_union_agg(sketch, 10)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_long_1_5_through_7_11); + +-- Test theta_union_agg with FloatType sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate(theta_union_agg(sketch, 6)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_float_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_float_1_1_1_4_through_1_5_1_8); + +-- Test theta_union_agg with BinaryType sketches +SELECT theta_sketch_estimate(theta_union_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_binary_a_b_through_e_f + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_binary_a_b_through_e_f); + +-- Test theta_union_agg with ArrayType(IntegerType) sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate(theta_union_agg(sketch, 12)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_int_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_int_1_3_through_4_6); + +-- Test theta_union_agg with ArrayType(LongType) sketches and explicit lgNomEntries parameter +SELECT theta_sketch_estimate(theta_union_agg(sketch, 16)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_long_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_long_1_3_through_4_6); + +-- Test theta_intersection_agg with IntegerType sketches +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_int_1_5_through_7_11); + +-- Test theta_intersection_agg with LongType sketches +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch 
FROM t_long_1_5_through_7_11); + +-- Test theta_intersection_agg with FloatType sketches +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_float_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_float_1_1_1_4_through_1_5_1_8); + +-- Test theta_intersection_agg with DoubleType sketches +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_double_1_1_1_4_through_1_5_1_8); + +-- Test theta_intersection_agg with StringType sketches +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_string_a_d_through_e_h + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_string_a_d_through_e_h); + +-- Test theta_intersection_agg with BinaryType sketches +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_binary_a_b_through_e_f + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_binary_a_b_through_e_f); + +-- Test theta_intersection_agg with ArrayType(IntegerType) sketches +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_int_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_int_1_3_through_4_6); + +-- Test theta_intersection_agg with ArrayType(LongType) sketches +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_long_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_long_1_3_through_4_6); + +-- Test theta_sketch_agg with IntegerType and null values (nulls should be ignored) +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (1), (null), (2), (null), (3) tab(col); + 
+-- Test theta_sketch_agg with StringType and null values (nulls should be ignored) +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES ('test'), (null), ('null'), (null) tab(col); + +-- Test theta_sketch_agg with LongType and null values (nulls should be ignored) +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (100L), (null), (200L), (null), (300L) tab(col); + +-- Test theta_sketch_agg with DoubleType and null values (nulls should be ignored) +SELECT theta_sketch_estimate(theta_sketch_agg(CAST(col AS DOUBLE))) +FROM VALUES (1.1), (null), (2.2), (null), (3.3) tab(col); + +-- Test theta_sketch_agg with FloatType and null values (nulls should be ignored) +SELECT theta_sketch_estimate(theta_sketch_agg(CAST(col AS FLOAT))) +FROM VALUES (1.5), (null), (2.5), (null), (3.5) tab(col); + +-- Test theta_sketch_agg with BinaryType and null values (nulls should be ignored) +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (X'AA'), (null), (X'BB'), (null), (X'CC') tab(col); + +-- Test theta_sketch_agg with ArrayType(IntegerType) and null values (nulls should be ignored) +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(1, 2)), (null), (ARRAY(3, 4)), (null), (ARRAY(5, 6)) tab(col); + +-- Test theta_sketch_agg with ArrayType(LongType) and null values (nulls should be ignored) +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(10L, 20L)), (null), (ARRAY(30L, 40L)), (null), (ARRAY(50L, 60L)) tab(col); + +-- Test theta_sketch_agg with arrays containing null elements +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(1, null)), (ARRAY(1)), (ARRAY(2, null, 3)), (ARRAY(4)) tab(col); + +-- Test theta_sketch_agg with arrays containing null elements (LongType) +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(10L, null)), (ARRAY(10L)), (ARRAY(20L, null, 30L)), (ARRAY(40L)) tab(col); + +-- Test theta_sketch_agg with empty arrays +SELECT 
theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY()), (ARRAY(1, 2)), (ARRAY()), (ARRAY(3, 4)) tab(col); + +-- Test theta_sketch_agg with empty strings +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (''), ('a'), (''), ('b'), ('c') tab(col); + +-- Test theta_sketch_agg with empty binary data +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (X''), (X'01'), (X'02'), (X'03'), (CAST(' ' AS BINARY)), (X'e280'), (X'c1'), (X'c120') tab(col); + +-- Test theta_sketch_agg with collated string data +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) utf8_b FROM t_string_collation; +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_LCASE)) utf8_lc FROM t_string_collation; +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE)) unicode FROM t_string_collation; +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_CI)) unicode_ci FROM t_string_collation; +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_BINARY_RTRIM)) utf8_b_rt FROM t_string_collation; +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_LCASE_RTRIM)) utf8_lc_rt FROM t_string_collation; +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_RTRIM)) unicode_rt FROM t_string_collation; +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_CI_RTRIM)) unicode_ci_rt FROM t_string_collation; + +-- Comprehensive test using all ThetaSketch functions in a single query +-- This query demonstrates the full workflow: aggregation -> union -> intersection -> difference -> estimate +WITH sketches AS ( + SELECT 'int_sketch' as sketch_type, theta_sketch_agg(col1, 12) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT 'long_sketch' as sketch_type, theta_sketch_agg(col1, 15) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT 'double_sketch' as sketch_type, theta_sketch_agg(col1, 10) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT 'string_sketch' as 
sketch_type, theta_sketch_agg(col1, 14) as sketch FROM t_string_a_d_through_e_h +), +union_result AS ( + SELECT theta_union_agg(sketch, 16) as union_sketch FROM sketches +), +individual_sketches AS ( + SELECT theta_sketch_agg(col1, 12) as sketch1, theta_sketch_agg(col2, 12) as sketch2 FROM t_int_1_5_through_7_11 +) +SELECT + -- Basic estimate from union of all sketches + theta_sketch_estimate((SELECT union_sketch FROM union_result)) as union_estimate, + -- Union of two individual sketches + theta_sketch_estimate(theta_union(sketch1, sketch2, 15)) as binary_union_estimate, + -- Intersection of two individual sketches + theta_sketch_estimate(theta_intersection(sketch1, sketch2)) as intersection_estimate, + -- Difference of two individual sketches + theta_sketch_estimate(theta_difference(sketch1, sketch2)) as difference_estimate +FROM individual_sketches; + +-- Negative test cases + +-- Test theta_sketch_agg with lgNomEntries value of 2 (too low, minimum is 4) - should fail +SELECT theta_sketch_agg(col, 2) +FROM VALUES (50), (60), (60) tab(col); + +-- Test theta_sketch_agg with lgNomEntries value of 40 (too high, maximum is 26) - should fail +SELECT theta_sketch_agg(col, 40) +FROM VALUES (50), (60), (60) tab(col); + +-- Test theta_union_agg with lgNomEntries value of 3 (too low, minimum is 4) - should fail +SELECT theta_union_agg(sketch, 3) +FROM (SELECT theta_sketch_agg(col, 12) as sketch + FROM VALUES (1) AS tab(col) + UNION ALL + SELECT theta_sketch_agg(col, 20) as sketch + FROM VALUES (1) AS tab(col)); + +-- Test theta_union_agg with lgNomEntries value of 27 (too high, maximum is 26) - should fail +SELECT theta_union_agg(sketch, 27) +FROM (SELECT theta_sketch_agg(col, 12) as sketch + FROM VALUES (1) AS tab(col) + UNION ALL + SELECT theta_sketch_agg(col, 20) as sketch + FROM VALUES (1) AS tab(col)); + +-- Test theta_union with integers (1, 2) instead of binary sketch data - should fail +SELECT theta_union(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS 
tab(col1, col2); + +-- Test theta_intersection with integers (1, 2) instead of binary sketch data - should fail +SELECT theta_intersection(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2); + +-- Test theta_difference with integers (1, 2) instead of binary sketch data - should fail +SELECT theta_difference(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2); + +-- Test theta_union with string 'invalid' instead of integer for lgNomEntries parameter - should fail +SELECT theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2), 'invalid') + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2); + +-- Test theta_intersection with string 'invalid_sketch' instead of binary sketch data - should fail +SELECT theta_intersection( + theta_sketch_agg(col1), + 'invalid_sketch') + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2); + +-- Test theta_sketch_estimate with invalid binary data ('abc') that is not a valid theta sketch - should fail +SELECT theta_sketch_estimate(CAST('abc' AS BINARY)); + +-- Test theta_union with invalid binary data ('abc', 'def') that are not valid theta sketches - should fail +SELECT theta_union(CAST('abc' AS BINARY), CAST('def' AS BINARY)); + +-- Test theta_intersection with invalid binary data ('abc', 'def') that are not valid theta sketches - should fail +SELECT theta_intersection(CAST('abc' AS BINARY), CAST('def' AS BINARY)); + +-- Test theta_difference with invalid binary data ('abc', 'def') that are not valid theta sketches - should fail +SELECT theta_difference(CAST('abc' AS BINARY), CAST('def' AS BINARY)); + +-- Test theta_union_agg with invalid binary data ('abc') that is not a valid theta sketch - should fail +SELECT theta_union_agg(buffer, 15) +FROM (SELECT CAST('abc' AS BINARY) AS buffer); + +-- Test theta_intersection_agg with invalid binary data ('abc') that is not a valid theta sketch - should fail +SELECT theta_intersection_agg(buffer) +FROM (SELECT CAST('abc' AS BINARY) AS 
buffer); + +-- Clean up +DROP TABLE IF EXISTS t_int_1_5_through_7_11; +DROP TABLE IF EXISTS t_long_1_5_through_7_11; +DROP TABLE IF EXISTS t_double_1_1_1_4_through_1_5_1_8; +DROP TABLE IF EXISTS t_float_1_1_1_4_through_1_5_1_8; +DROP TABLE IF EXISTS t_string_a_d_through_e_h; +DROP TABLE IF EXISTS t_binary_a_b_through_e_f; +DROP TABLE IF EXISTS t_array_int_1_3_through_4_6; +DROP TABLE IF EXISTS t_array_long_1_3_through_4_6; +DROP TABLE IF EXISTS t_string_collation \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/results/thetasketch.sql.out b/sql/core/src/test/resources/sql-tests/results/thetasketch.sql.out new file mode 100644 index 0000000000000..95c6e28a8c426 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/thetasketch.sql.out @@ -0,0 +1,1294 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +DROP TABLE IF EXISTS t_int_1_5_through_7_11 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t_int_1_5_through_7_11 AS +VALUES + (1, 5), (2, 6), (3, 7), (4, 8), (5, 9), (6, 10), (7, 11) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_long_1_5_through_7_11 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t_long_1_5_through_7_11 AS +VALUES + (1L, 5L), (2L, 6L), (3L, 7L), (4L, 8L), (5L, 9L), (6L, 10L), (7L, 11L) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_double_1_1_1_4_through_1_5_1_8 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t_double_1_1_1_4_through_1_5_1_8 AS +SELECT CAST(col1 AS DOUBLE) AS col1, CAST(col2 AS DOUBLE) AS col2 +FROM VALUES + (1.1, 1.4), (1.2, 1.5), (1.3, 1.6), (1.4, 1.7), (1.5, 1.8) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_float_1_1_1_4_through_1_5_1_8 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE 
t_float_1_1_1_4_through_1_5_1_8 AS +SELECT CAST(col1 AS FLOAT) col1, CAST(col2 AS FLOAT) col2 +FROM VALUES + (1.1, 1.4), (1.2, 1.5), (1.3, 1.6), (1.4, 1.7), (1.5, 1.8) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_string_a_d_through_e_h +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t_string_a_d_through_e_h AS +VALUES + ('a', 'd'), ('b', 'e'), ('c', 'f'), ('d', 'g'), ('e', 'h') AS tab(col1, col2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_binary_a_b_through_e_f +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t_binary_a_b_through_e_f AS +VALUES + (X'A', X'B'), (X'B', X'C'), (X'C', X'D'), (X'D', X'E'), (X'E', X'F') AS tab(col1, col2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_array_int_1_3_through_4_6 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t_array_int_1_3_through_4_6 AS +VALUES + (ARRAY(1), ARRAY(3)), + (ARRAY(2), ARRAY(4)), + (ARRAY(3), ARRAY(5)), + (ARRAY(4), ARRAY(6)) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_array_long_1_3_through_4_6 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t_array_long_1_3_through_4_6 AS +VALUES + (ARRAY(1L), ARRAY(3L)), + (ARRAY(2L), ARRAY(4L)), + (ARRAY(3L), ARRAY(5L)), + (ARRAY(4L), ARRAY(6L)) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_string_collation +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t_string_collation AS +VALUES + (''), (' '), (CAST(X'C1' AS STRING)), (CAST(X'80' AS STRING)), + ('\uFFFD'), ('Å'), ('å'), ('a\u030A'), ('Å '), ('å '), + ('a\u030A ') AS tab(col1) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) AS result FROM 
t_int_1_5_through_7_11 +-- !query schema +struct +-- !query output +7 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_array_int_1_3_through_4_6 +-- !query schema +struct +-- !query output +4 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col2)) FROM t_array_long_1_3_through_4_6 +-- !query schema +struct +-- !query output +4 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_binary_a_b_through_e_f +-- !query schema +struct +-- !query output +5 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_double_1_1_1_4_through_1_5_1_8 +-- !query schema +struct +-- !query output +5 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col2)) FROM t_float_1_1_1_4_through_1_5_1_8 +-- !query schema +struct +-- !query output +5 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1, 22)) FROM t_int_1_5_through_7_11 +-- !query schema +struct +-- !query output +7 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_long_1_5_through_7_11 +-- !query schema +struct +-- !query output +7 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) FROM t_string_a_d_through_e_h +-- !query schema +struct +-- !query output +5 + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11 +-- !query schema +struct +-- !query output +11 + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1, 15), + theta_sketch_agg(col2))) FROM t_long_1_5_through_7_11 +-- !query schema +struct +-- !query output +11 + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8 +-- !query schema +struct +-- !query output +8 + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1, 6), + theta_sketch_agg(col2, 15), 15)) FROM 
t_float_1_1_1_4_through_1_5_1_8 +-- !query schema +struct +-- !query output +8 + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h +-- !query schema +struct +-- !query output +8 + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2), 20)) FROM t_binary_a_b_through_e_f +-- !query schema +struct +-- !query output +6 + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6 +-- !query schema +struct +-- !query output +6 + + +-- !query +SELECT theta_sketch_estimate( + theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 13))) FROM t_array_long_1_3_through_4_6 +-- !query schema +struct +-- !query output +6 + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11 +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1, 5), + theta_sketch_agg(col2, 12))) FROM t_long_1_5_through_7_11 +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8 +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1, 5), + theta_sketch_agg(col2))) FROM t_float_1_1_1_4_through_1_5_1_8 +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 22))) FROM 
t_binary_a_b_through_e_f +-- !query schema +struct +-- !query output +4 + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6 +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate( + theta_intersection( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 10))) FROM t_array_long_1_3_through_4_6 +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_int_1_5_through_7_11 +-- !query schema +struct +-- !query output +4 + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 5))) FROM t_long_1_5_through_7_11 +-- !query schema +struct +-- !query output +4 + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_double_1_1_1_4_through_1_5_1_8 +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1, 12), + theta_sketch_agg(col2))) FROM t_float_1_1_1_4_through_1_5_1_8 +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_string_a_d_through_e_h +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1, 6), + theta_sketch_agg(col2, 8))) FROM t_binary_a_b_through_e_f +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2))) FROM t_array_int_1_3_through_4_6 +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate( + theta_difference( + theta_sketch_agg(col1), + theta_sketch_agg(col2, 
4))) FROM t_array_long_1_3_through_4_6 +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 15)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2, 20) as sketch FROM t_int_1_5_through_7_11) +-- !query schema +struct +-- !query output +11 + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 12)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_double_1_1_1_4_through_1_5_1_8) +-- !query schema +struct +-- !query output +8 + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 14)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_string_a_d_through_e_h + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_string_a_d_through_e_h) +-- !query schema +struct +-- !query output +8 + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 10)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_long_1_5_through_7_11) +-- !query schema +struct +-- !query output +11 + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 6)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_float_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_float_1_1_1_4_through_1_5_1_8) +-- !query schema +struct +-- !query output +8 + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_binary_a_b_through_e_f + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_binary_a_b_through_e_f) +-- !query schema +struct +-- !query output +6 + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 12)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_int_1_3_through_4_6 + UNION ALL + SELECT 
theta_sketch_agg(col2) as sketch FROM t_array_int_1_3_through_4_6) +-- !query schema +struct +-- !query output +6 + + +-- !query +SELECT theta_sketch_estimate(theta_union_agg(sketch, 16)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_long_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_long_1_3_through_4_6) +-- !query schema +struct +-- !query output +6 + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_int_1_5_through_7_11) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_long_1_5_through_7_11) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_float_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_float_1_1_1_4_through_1_5_1_8) +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_double_1_1_1_4_through_1_5_1_8) +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_string_a_d_through_e_h + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_string_a_d_through_e_h) +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM 
t_binary_a_b_through_e_f + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_binary_a_b_through_e_f) +-- !query schema +struct +-- !query output +4 + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_int_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_int_1_3_through_4_6) +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate(theta_intersection_agg(sketch)) + FROM (SELECT theta_sketch_agg(col1) as sketch FROM t_array_long_1_3_through_4_6 + UNION ALL + SELECT theta_sketch_agg(col2) as sketch FROM t_array_long_1_3_through_4_6) +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (1), (null), (2), (null), (3) tab(col) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES ('test'), (null), ('null'), (null) tab(col) +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (100L), (null), (200L), (null), (300L) tab(col) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(CAST(col AS DOUBLE))) +FROM VALUES (1.1), (null), (2.2), (null), (3.3) tab(col) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(CAST(col AS FLOAT))) +FROM VALUES (1.5), (null), (2.5), (null), (3.5) tab(col) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (X'AA'), (null), (X'BB'), (null), (X'CC') tab(col) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(1, 2)), (null), (ARRAY(3, 4)), (null), (ARRAY(5, 6)) tab(col) +-- !query schema +struct +-- 
!query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(10L, 20L)), (null), (ARRAY(30L, 40L)), (null), (ARRAY(50L, 60L)) tab(col) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(1, null)), (ARRAY(1)), (ARRAY(2, null, 3)), (ARRAY(4)) tab(col) +-- !query schema +struct +-- !query output +4 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY(10L, null)), (ARRAY(10L)), (ARRAY(20L, null, 30L)), (ARRAY(40L)) tab(col) +-- !query schema +struct +-- !query output +4 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (ARRAY()), (ARRAY(1, 2)), (ARRAY()), (ARRAY(3, 4)) tab(col) +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (''), ('a'), (''), ('b'), ('c') tab(col) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col)) +FROM VALUES (X''), (X'01'), (X'02'), (X'03'), (CAST(' ' AS BINARY)), (X'e280'), (X'c1'), (X'c120') tab(col) +-- !query schema +struct +-- !query output +7 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1)) utf8_b FROM t_string_collation +-- !query schema +struct +-- !query output +10 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_LCASE)) utf8_lc FROM t_string_collation +-- !query schema +struct +-- !query output +7 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE)) unicode FROM t_string_collation +-- !query schema +struct +-- !query output +7 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_CI)) unicode_ci FROM t_string_collation +-- !query schema +struct +-- !query output +6 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_BINARY_RTRIM)) utf8_b_rt FROM t_string_collation +-- 
!query schema +struct +-- !query output +6 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UTF8_LCASE_RTRIM)) utf8_lc_rt FROM t_string_collation +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_RTRIM)) unicode_rt FROM t_string_collation +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT theta_sketch_estimate(theta_sketch_agg(col1 COLLATE UNICODE_CI_RTRIM)) unicode_ci_rt FROM t_string_collation +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH sketches AS ( + SELECT 'int_sketch' as sketch_type, theta_sketch_agg(col1, 12) as sketch FROM t_int_1_5_through_7_11 + UNION ALL + SELECT 'long_sketch' as sketch_type, theta_sketch_agg(col1, 15) as sketch FROM t_long_1_5_through_7_11 + UNION ALL + SELECT 'double_sketch' as sketch_type, theta_sketch_agg(col1, 10) as sketch FROM t_double_1_1_1_4_through_1_5_1_8 + UNION ALL + SELECT 'string_sketch' as sketch_type, theta_sketch_agg(col1, 14) as sketch FROM t_string_a_d_through_e_h +), +union_result AS ( + SELECT theta_union_agg(sketch, 16) as union_sketch FROM sketches +), +individual_sketches AS ( + SELECT theta_sketch_agg(col1, 12) as sketch1, theta_sketch_agg(col2, 12) as sketch2 FROM t_int_1_5_through_7_11 +) +SELECT + theta_sketch_estimate((SELECT union_sketch FROM union_result)) as union_estimate, + theta_sketch_estimate(theta_union(sketch1, sketch2, 15)) as binary_union_estimate, + theta_sketch_estimate(theta_intersection(sketch1, sketch2)) as intersection_estimate, + theta_sketch_estimate(theta_difference(sketch1, sketch2)) as difference_estimate +FROM individual_sketches +-- !query schema +struct +-- !query output +17 11 3 4 + + +-- !query +SELECT theta_sketch_agg(col, 2) +FROM VALUES (50), (60), (60) tab(col) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_LG_NOM_ENTRIES", + "sqlState" : "22546", + 
"messageParameters" : { + "function" : "`theta_sketch_agg`", + "max" : "26", + "min" : "4", + "value" : "2" + } +} + + +-- !query +SELECT theta_sketch_agg(col, 40) +FROM VALUES (50), (60), (60) tab(col) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_LG_NOM_ENTRIES", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_sketch_agg`", + "max" : "26", + "min" : "4", + "value" : "40" + } +} + + +-- !query +SELECT theta_union_agg(sketch, 3) +FROM (SELECT theta_sketch_agg(col, 12) as sketch + FROM VALUES (1) AS tab(col) + UNION ALL + SELECT theta_sketch_agg(col, 20) as sketch + FROM VALUES (1) AS tab(col)) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_LG_NOM_ENTRIES", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_union_agg`", + "max" : "26", + "min" : "4", + "value" : "3" + } +} + + +-- !query +SELECT theta_union_agg(sketch, 27) +FROM (SELECT theta_sketch_agg(col, 12) as sketch + FROM VALUES (1) AS tab(col) + UNION ALL + SELECT theta_sketch_agg(col, 20) as sketch + FROM VALUES (1) AS tab(col)) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_LG_NOM_ENTRIES", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_union_agg`", + "max" : "26", + "min" : "4", + "value" : "27" + } +} + + +-- !query +SELECT theta_union(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"1\"", + "inputType" : "\"INT\"", + "paramIndex" : "first", + "requiredType" : "\"BINARY\"", + "sqlExpr" : "\"theta_union(1, 2, 12)\"" + }, + "queryContext" : [ { + "objectType" : "", + 
"objectName" : "", + "startIndex" : 8, + "stopIndex" : 24, + "fragment" : "theta_union(1, 2)" + } ] +} + + +-- !query +SELECT theta_intersection(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"1\"", + "inputType" : "\"INT\"", + "paramIndex" : "first", + "requiredType" : "\"BINARY\"", + "sqlExpr" : "\"theta_intersection(1, 2)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 31, + "fragment" : "theta_intersection(1, 2)" + } ] +} + + +-- !query +SELECT theta_difference(1, 2) + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"1\"", + "inputType" : "\"INT\"", + "paramIndex" : "first", + "requiredType" : "\"BINARY\"", + "sqlExpr" : "\"theta_difference(1, 2)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 29, + "fragment" : "theta_difference(1, 2)" + } ] +} + + +-- !query +SELECT theta_union( + theta_sketch_agg(col1), + theta_sketch_agg(col2), 'invalid') + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"invalid\"", + "inputType" : "\"STRING\"", + "paramIndex" : "third", + "requiredType" : "\"INT\"", + "sqlExpr" : "\"theta_union(theta_sketch_agg(col1, 12), theta_sketch_agg(col2, 12), invalid)\"" + }, + "queryContext" : [ { 
+ "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 86, + "fragment" : "theta_union(\n theta_sketch_agg(col1),\n theta_sketch_agg(col2), 'invalid')" + } ] +} + + +-- !query +SELECT theta_intersection( + theta_sketch_agg(col1), + 'invalid_sketch') + FROM VALUES + (1, 4), + (2, 5), + (3, 6) AS tab(col1, col2) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + "sqlState" : "42K09", + "messageParameters" : { + "inputSql" : "\"invalid_sketch\"", + "inputType" : "\"STRING\"", + "paramIndex" : "second", + "requiredType" : "\"BINARY\"", + "sqlExpr" : "\"theta_intersection(theta_sketch_agg(col1, 12), invalid_sketch)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 8, + "stopIndex" : 76, + "fragment" : "theta_intersection(\n theta_sketch_agg(col1),\n 'invalid_sketch')" + } ] +} + + +-- !query +SELECT theta_sketch_estimate(CAST('abc' AS BINARY)) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_INPUT_SKETCH_BUFFER", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_sketch_estimate`" + } +} + + +-- !query +SELECT theta_union(CAST('abc' AS BINARY), CAST('def' AS BINARY)) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_INPUT_SKETCH_BUFFER", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_union`" + } +} + + +-- !query +SELECT theta_intersection(CAST('abc' AS BINARY), CAST('def' AS BINARY)) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_INPUT_SKETCH_BUFFER", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_intersection`" + } +} + + +-- !query +SELECT theta_difference(CAST('abc' AS BINARY), CAST('def' AS BINARY)) +-- 
!query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_INPUT_SKETCH_BUFFER", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_difference`" + } +} + + +-- !query +SELECT theta_union_agg(buffer, 15) +FROM (SELECT CAST('abc' AS BINARY) AS buffer) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_INPUT_SKETCH_BUFFER", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_union_agg`" + } +} + + +-- !query +SELECT theta_intersection_agg(buffer) +FROM (SELECT CAST('abc' AS BINARY) AS buffer) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "THETA_INVALID_INPUT_SKETCH_BUFFER", + "sqlState" : "22546", + "messageParameters" : { + "function" : "`theta_intersection_agg`" + } +} + + +-- !query +DROP TABLE IF EXISTS t_int_1_5_through_7_11 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_long_1_5_through_7_11 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_double_1_1_1_4_through_1_5_1_8 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_float_1_1_1_4_through_1_5_1_8 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_string_a_d_through_e_h +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_binary_a_b_through_e_f +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_array_int_1_3_through_4_6 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_array_long_1_3_through_4_6 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS t_string_collation +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index 721d1c1deea9d..37614145fe83f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -2217,6 +2217,707 @@ class DataFrameAggregateSuite extends QueryTest ) } + test("SPARK-52407: theta_sketch_agg + theta_union_agg + theta_sketch_estimate positive tests") { + val df1 = Seq((1, "a"), (1, "a"), (1, "a"), (1, "b"), (1, "c"), (1, "c"), (1, "d")) + .toDF("id", "value") + df1.createOrReplaceTempView("df1") + + val df2 = Seq((1, "a"), (1, "c"), (1, "d"), (1, "d"), (1, "d"), (1, "e"), (1, "e"), (1, "f")) + .toDF("id", "value") + df2.createOrReplaceTempView("df2") + + // First test theta_sketch_agg, theta_sketch_estimate via dataframe + sql, + // with and without configs, via both DF and SQL implementations. + val res1 = df1 + .groupBy("id") + .agg( + count("value").as("count"), + theta_sketch_agg("value").as("sketch_1"), + theta_sketch_agg("value", 20).as("sketch_2")) + .withColumn("distinct_count_1", theta_sketch_estimate("sketch_1")) + .withColumn("distinct_count_2", theta_sketch_estimate("sketch_2")) + .drop("sketch_1", "sketch_2") + checkAnswer(res1, Row(1, 7, 4, 4)) + + val res2 = sql("""with sketches as ( + |select + | id, + | count(value) as count, + | theta_sketch_agg(value) as sketch_1, + | theta_sketch_agg(value, 20) as sketch_2 + |from df1 + |group by 1 + |) + | + |select + | id, + | count, + | theta_sketch_estimate(sketch_1) as distinct_count_1, + | theta_sketch_estimate(sketch_2) as distinct_count_2 + |from + | sketches + |""".stripMargin) + checkAnswer(res2, Row(1, 7, 4, 4)) + + // Now test theta_union_agg via dataframe + sql, with and without configs, + // unioning together sketches with default, non-default and different configurations + val df3 = df1 + .groupBy("id") + .agg( + count("value").as("count"), + theta_sketch_agg("value").as("thetasketch_1"), + 
theta_sketch_agg("value", 20).as("thetasketch_2"), + theta_sketch_agg("value").as("thetasketch_3")) + df3.createOrReplaceTempView("df3") + + val df4 = sql("""select + | id, + | count(value) as count, + | theta_sketch_agg(value) as thetasketch_1, + | theta_sketch_agg(value, 20) as thetasketch_2, + | theta_sketch_agg(value, 20) as thetasketch_3 + |from df2 + |group by 1 + |""".stripMargin) + df4.createOrReplaceTempView("df4") + + val res3 = df3 + .union(df4) + .groupBy("id") + .agg( + sum("count").as("count"), + theta_sketch_estimate(theta_union_agg("thetasketch_1")).as("distinct_count_1"), + theta_sketch_estimate(theta_union_agg("thetasketch_2")).as("distinct_count_2"), + theta_sketch_estimate(theta_union_agg("thetasketch_3", 15)).as("distinct_count_3")) + checkAnswer(res3, Row(1, 15, 6, 6, 6)) + + val res4 = sql("""select + | id, + | sum(count) as count, + | theta_sketch_estimate(theta_union_agg(thetasketch_1)) as distinct_count_1, + | theta_sketch_estimate(theta_union_agg(thetasketch_2)) as distinct_count_2, + | theta_sketch_estimate(theta_union_agg(thetasketch_3, 15)) as distinct_count_3 + |from (select * from df3 union all select * from df4) + |group by 1 + |""".stripMargin) + checkAnswer(res4, Row(1, 15, 6, 6, 6)) + + // add tests to ensure theta_union works via both DF and SQL too + val df5 = df3.drop("count") + df5.createOrReplaceTempView("df5") + + val df6 = df4 + .drop("count") + .withColumnRenamed("thetasketch_1", "thetasketch_4") + .withColumnRenamed("thetasketch_2", "thetasketch_5") + .withColumnRenamed("thetasketch_3", "thetasketch_6") + df6.createOrReplaceTempView("df6") + + val res5 = df5 + .join(df6, "id") + .withColumn( + "distinct_count_1", + theta_sketch_estimate(theta_union("thetasketch_1", "thetasketch_4"))) + .withColumn( + "distinct_count_2", + theta_sketch_estimate(theta_union("thetasketch_2", "thetasketch_5"))) + .withColumn( + "distinct_count_3", + theta_sketch_estimate(theta_union("thetasketch_3", "thetasketch_6", 15))) + .drop( + 
"thetasketch_1", + "thetasketch_2", + "thetasketch_3", + "thetasketch_4", + "thetasketch_5", + "thetasketch_6") + checkAnswer(res5, Row(1, 6, 6, 6)) + + val res6 = sql("""with joined as ( + | select + | l.id, + | l.thetasketch_1, + | l.thetasketch_2, + | l.thetasketch_3, + | r.thetasketch_4, + | r.thetasketch_5, + | r.thetasketch_6 + | from + | df5 l + | join + | df6 r + | on l.id = r.id + | ) + | + |select + | id, + | theta_sketch_estimate(theta_union(thetasketch_1, thetasketch_4)) as distinct_count_1, + | theta_sketch_estimate(theta_union(thetasketch_2, thetasketch_5)) as distinct_count_2, + | theta_sketch_estimate(theta_union(thetasketch_3, thetasketch_6, 20)) + | as distinct_count_3 + |from + | joined + |""".stripMargin) + checkAnswer(res6, Row(1, 6, 6, 6)) + + val df7 = + Seq((1, "a"), (1, "a"), (1, "a"), (1, "b"), (1, null), (2, null), (2, null), (2, null)) + .toDF("id", "value") + + // empty column test + val res7 = df7 + .where(expr("id = 2")) + .groupBy("id") + .agg(theta_sketch_estimate(theta_sketch_agg("value")).as("distinct_count")) + checkAnswer(res7, Row(2, 0)) + + // partial empty column test + val res8 = df7 + .groupBy("id") + .agg(theta_sketch_estimate(theta_sketch_agg("value")).as("distinct_count")) + checkAnswer(res8, Seq(Row(1, 2), Row(2, 0))) + } + + test("SPARK-52407: theta_sketch_agg + theta_union_agg + theta_union negative tests") { + val df1 = Seq((1, "a"), (1, "a"), (1, "a"), (1, "b"), (1, "c"), (1, "c"), (1, "d")) + .toDF("id", "value") + df1.createOrReplaceTempView("df1") + + val df2 = Seq((1, "a"), (1, "c"), (1, "d"), (1, "d"), (1, "d"), (1, "e"), (1, "e"), (1, "f")) + .toDF("id", "value") + df2.createOrReplaceTempView("df2") + + // Validate that the functions error out when lgNomEntries < 4 or > 26. 
+ checkError( + exception = intercept[SparkRuntimeException] { + df1 + .groupBy("id") + .agg(theta_sketch_agg("value", 1).as("thetasketch")) + .collect() + }, + condition = "THETA_INVALID_LG_NOM_ENTRIES", + parameters = Map( + "function" -> "`theta_sketch_agg`", + "min" -> "4", + "max" -> "26", + "value" -> "1" + ) + ) + + checkError( + exception = intercept[SparkRuntimeException] { + df1 + .groupBy("id") + .agg(theta_sketch_agg("value", 28).as("thetasketch")) + .collect() + }, + condition = "THETA_INVALID_LG_NOM_ENTRIES", + parameters = Map( + "function" -> "`theta_sketch_agg`", + "min" -> "4", + "max" -> "26", + "value" -> "28" + ) + ) + + // Validate that the functions error out when provided unexpected types. + checkError( + exception = intercept[AnalysisException] { + val res = sql(""" + |select + | id, + | theta_sketch_agg(value, 'text') + |from + | df1 + |group by 1 + |""".stripMargin) + checkAnswer(res, Nil) + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_sketch_agg(value, text)\"", + "paramIndex" -> "second", + "inputSql" -> "\"text\"", + "inputType" -> "\"STRING\"", + "requiredType" -> "\"INT\""), + context = + ExpectedContext(fragment = "theta_sketch_agg(value, 'text')", start = 14, stop = 44)) + + checkError( + exception = intercept[AnalysisException] { + val res = sql("""with sketch_cte as ( + |select + | id, + | theta_sketch_agg(value) as sketch + |from + | df1 + |group by 1 + |) + | + |select theta_union_agg(sketch, 'Theta_4') from sketch_cte + |""".stripMargin) + checkAnswer(res, Nil) + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_union_agg(sketch, Theta_4)\"", + "paramIndex" -> "second", + "inputSql" -> "\"Theta_4\"", + "inputType" -> "\"STRING\"", + "requiredType" -> "\"INT\""), + context = + ExpectedContext(fragment = "theta_union_agg(sketch, 'Theta_4')", start = 99, stop = 132)) + + // Test invalid parameter types for theta_union 
+ checkError( + exception = intercept[AnalysisException] { + sql("""with sketches as ( + |select id, theta_sketch_agg(value) as sketch from df1 group by 1 + |) + |select theta_union(sketch, 'invalid') from sketches + |""".stripMargin).collect() + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_union(sketch, invalid, 12)\"", + "paramIndex" -> "second", + "inputSql" -> "\"invalid\"", + "inputType" -> "\"STRING\"", + "requiredType" -> "\"BINARY\""), + context = + ExpectedContext(fragment = "theta_union(sketch, 'invalid')", start = 93, stop = 122)) + + // Test theta_union with non-sketch input. + checkError( + exception = intercept[AnalysisException] { + sql("select theta_union('not_a_sketch', 'also_not_a_sketch')").collect() + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_union(not_a_sketch, also_not_a_sketch, 12)\"", + "paramIndex" -> "first", + "inputSql" -> "\"not_a_sketch\"", + "inputType" -> "\"STRING\"", + "requiredType" -> "\"BINARY\""), + context = ExpectedContext( + fragment = "theta_union('not_a_sketch', 'also_not_a_sketch')", + start = 7, + stop = 54)) + + } + test( + "SPARK-52407: theta_difference + theta_intersection + theta_intersection_agg positive tests") { + val df1 = Seq((1, "a"), (1, "a"), (1, "a"), (1, "b"), (1, "c"), (1, "c"), (1, "d")) + .toDF("id", "value") + df1.createOrReplaceTempView("df1") + + val df2 = Seq((1, "a"), (1, "c"), (1, "d"), (1, "d"), (1, "d"), (1, "e"), (1, "e"), (1, "f")) + .toDF("id", "value") + df2.createOrReplaceTempView("df2") + + val df3 = Seq((1, "c"), (1, "d"), (1, "g"), (1, "g"), (1, "h")).toDF("id", "value") + df3.createOrReplaceTempView("df3") + + // Test theta_difference via DataFrame API. 
+ val sketches1 = df1 + .groupBy("id") + .agg( + theta_sketch_agg("value").as("sketch1"), + theta_sketch_agg("value", 20).as("sketch1_20")) + + val sketches2 = df2 + .groupBy("id") + .agg( + theta_sketch_agg("value").as("sketch2"), + theta_sketch_agg("value", 20).as("sketch2_20")) + + val res1 = sketches1 + .join(sketches2, "id") + .withColumn( + "difference_count_1", + theta_sketch_estimate(theta_difference("sketch1", "sketch2"))) + .withColumn( + "difference_count_2", + theta_sketch_estimate(theta_difference("sketch1_20", "sketch2_20"))) + .select("id", "difference_count_1", "difference_count_2") + + // df1 has {a,b,c,d}, df2 has {a,c,d,e,f}, so df1 - df2 should be approximately {b}. + checkAnswer(res1, Row(1, 1, 1)) + + // Test theta_difference via SQL. + val res2 = sql("""with sketches1 as ( + |select + | id, + | theta_sketch_agg(value) as sketch1, + | theta_sketch_agg(value, 20) as sketch1_20 + |from df1 + |group by 1 + |), + |sketches2 as ( + |select + | id, + | theta_sketch_agg(value) as sketch2, + | theta_sketch_agg(value, 20) as sketch2_20 + |from df2 + |group by 1 + |) + | + |select + | s1.id, + | theta_sketch_estimate(theta_difference(s1.sketch1, s2.sketch2)) as difference_count_1, + | theta_sketch_estimate(theta_difference(s1.sketch1_20, s2.sketch2_20)) as difference_count_2 + |from sketches1 s1 + |join sketches2 s2 on s1.id = s2.id + |""".stripMargin) + checkAnswer(res2, Row(1, 1, 1)) + + // Test theta_intersection via DataFrame API. + val res3 = sketches1 + .join(sketches2, "id") + .withColumn( + "intersection_count_1", + theta_sketch_estimate(theta_intersection("sketch1", "sketch2"))) + .withColumn( + "intersection_count_2", + theta_sketch_estimate(theta_intersection("sketch1_20", "sketch2_20"))) + .select("id", "intersection_count_1", "intersection_count_2") + + // df1 has {a,b,c,d}, df2 has {a,c,d,e,f}, so intersection should be approximately {a,c,d} = 3. + checkAnswer(res3, Row(1, 3, 3)) + + // Test theta_intersection via SQL. 
+ val res4 = sql("""with sketches1 as ( + |select + | id, + | theta_sketch_agg(value) as sketch1, + | theta_sketch_agg(value, 20) as sketch1_20 + |from df1 + |group by 1 + |), + |sketches2 as ( + |select + | id, + | theta_sketch_agg(value) as sketch2, + | theta_sketch_agg(value, 20) as sketch2_20 + |from df2 + |group by 1 + |) + | + |select + | s1.id, + | theta_sketch_estimate(theta_intersection(s1.sketch1, s2.sketch2)) as intersection_count_1, + | theta_sketch_estimate(theta_intersection(s1.sketch1_20, s2.sketch2_20)) + | as intersection_count_2 + |from sketches1 s1 + |join sketches2 s2 on s1.id = s2.id + |""".stripMargin) + checkAnswer(res4, Row(1, 3, 3)) + + // Test theta_intersection_agg via DataFrame API. + val all_sketches = df1 + .groupBy("id") + .agg(theta_sketch_agg("value").as("sketch")) + .withColumn("source", lit("df1")) + .union( + df2 + .groupBy("id") + .agg(theta_sketch_agg("value").as("sketch")) + .withColumn("source", lit("df2"))) + .union( + df3 + .groupBy("id") + .agg(theta_sketch_agg("value").as("sketch")) + .withColumn("source", lit("df3"))) + + val res5 = all_sketches + .groupBy("id") + .agg( + theta_sketch_estimate(theta_intersection_agg("sketch")).as("intersection_count_1") + ) + + // df1={a,b,c,d}, df2={a,c,d,e,f}, df3={c,d,g,h}, so intersection should be {c,d} = 2. + checkAnswer(res5, Row(1, 2)) + + // Test theta_intersection_agg via SQL. + val res6 = sql("""with all_sketches as ( + |select id, theta_sketch_agg(value) as sketch, 'df1' as source from df1 group by 1 + |union all + |select id, theta_sketch_agg(value) as sketch, 'df2' as source from df2 group by 1 + |union all + |select id, theta_sketch_agg(value) as sketch, 'df3' as source from df3 group by 1 + |) + | + |select + | id, + | theta_sketch_estimate(theta_intersection_agg(sketch)) as intersection_count_1 + |from all_sketches + |group by 1 + |""".stripMargin) + checkAnswer(res6, Row(1, 2)) + + // Test with different lgNomEntries parameters. 
+ val res7 = sql("""with sketches1 as ( + |select id, theta_sketch_agg(value, 12) as sketch1 from df1 group by 1 + |), + |sketches2 as ( + |select id, theta_sketch_agg(value, 18) as sketch2 from df2 group by 1 + |) + | + |select + | s1.id, + | theta_sketch_estimate(theta_difference(s1.sketch1, s2.sketch2)) as difference_count, + | theta_sketch_estimate(theta_intersection(s1.sketch1, s2.sketch2)) as intersection_count + |from sketches1 s1 + |join sketches2 s2 on s1.id = s2.id + |""".stripMargin) + checkAnswer(res7, Row(1, 1, 3)) + + // Test with null values. + val df_with_nulls = + Seq((1, "a"), (1, "b"), (1, null), (2, null), (2, null)).toDF("id", "value") + df_with_nulls.createOrReplaceTempView("df_with_nulls") + + val res8 = sql("""with sketch1 as ( + |select id, theta_sketch_agg(value) as sketch from df_with_nulls where id = 1 group by 1 + |), + |sketch2 as ( + |select id, theta_sketch_agg(value) as sketch from df_with_nulls where id = 2 group by 1 + |) + | + |select + | s1.id, + | theta_sketch_estimate(theta_difference(s1.sketch, s2.sketch)) as difference_count, + | theta_sketch_estimate(theta_intersection(s1.sketch, s2.sketch)) as intersection_count + |from sketch1 s1 + |cross join sketch2 s2 + |""".stripMargin) + // sketch1 has {a,b}, sketch2 is empty, so difference = 2 and intersection = 0. + checkAnswer(res8, Row(1, 2, 0)) + + // Test empty intersection. 
+ val df_disjoint1 = Seq((1, "a"), (1, "b")).toDF("id", "value") + val df_disjoint2 = Seq((1, "c"), (1, "d")).toDF("id", "value") + df_disjoint1.createOrReplaceTempView("df_disjoint1") + df_disjoint2.createOrReplaceTempView("df_disjoint2") + + val res9 = sql("""with sketch1 as ( + |select id, theta_sketch_agg(value) as sketch from df_disjoint1 group by 1 + |), + |sketch2 as ( + |select id, theta_sketch_agg(value) as sketch from df_disjoint2 group by 1 + |) + | + |select + | s1.id, + | theta_sketch_estimate(theta_intersection(s1.sketch, s2.sketch)) as intersection_count + |from sketch1 s1 + |join sketch2 s2 on s1.id = s2.id + |""".stripMargin) + checkAnswer(res9, Row(1, 0)) + } + + test( + "SPARK-52407: theta_difference + theta_intersection + theta_intersection_agg negative tests") { + val df1 = Seq((1, "a"), (1, "b"), (1, "c"), (1, "d")).toDF("id", "value") + df1.createOrReplaceTempView("df1") + + // Test invalid parameter types for theta_difference. + checkError( + exception = intercept[AnalysisException] { + sql("""with sketches as ( + |select id, theta_sketch_agg(value) as sketch from df1 group by 1 + |) + |select theta_difference(sketch, 'invalid') from sketches + |""".stripMargin).collect() + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_difference(sketch, invalid)\"", + "paramIndex" -> "second", + "inputSql" -> "\"invalid\"", + "inputType" -> "\"STRING\"", + "requiredType" -> "\"BINARY\""), + context = + ExpectedContext(fragment = "theta_difference(sketch, 'invalid')", start = 93, stop = 127)) + + // Test invalid parameter types for theta_intersection. 
+ checkError( + exception = intercept[AnalysisException] { + sql("""with sketches as ( + |select id, theta_sketch_agg(value) as sketch from df1 group by 1 + |) + |select theta_intersection(sketch, 123) from sketches + |""".stripMargin).collect() + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_intersection(sketch, 123)\"", + "paramIndex" -> "second", + "inputSql" -> "\"123\"", + "inputType" -> "\"INT\"", + "requiredType" -> "\"BINARY\""), + context = + ExpectedContext(fragment = "theta_intersection(sketch, 123)", start = 93, stop = 123)) + + // Test invalid parameter types for theta_intersection_agg. + checkError( + exception = intercept[AnalysisException] { + sql("""with sketches as ( + |select id, theta_sketch_agg(value) as sketch from df1 group by 1 + |) + |select theta_intersection_agg('invalid') from sketches + |""".stripMargin).collect() + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_intersection_agg(invalid)\"", + "paramIndex" -> "first", + "inputSql" -> "\"invalid\"", + "inputType" -> "\"STRING\"", + "requiredType" -> "\"BINARY\""), + context = ExpectedContext( + fragment = "theta_intersection_agg('invalid')", + start = 93, + stop = 125)) + + // Test theta_difference with non-sketch input. + checkError( + exception = intercept[AnalysisException] { + sql("select theta_difference('not_a_sketch', 'also_not_a_sketch')").collect() + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_difference(not_a_sketch, also_not_a_sketch)\"", + "paramIndex" -> "first", + "inputSql" -> "\"not_a_sketch\"", + "inputType" -> "\"STRING\"", + "requiredType" -> "\"BINARY\""), + context = ExpectedContext( + fragment = "theta_difference('not_a_sketch', 'also_not_a_sketch')", + start = 7, + stop = 59)) + + // Test theta_intersection with non-sketch input. 
+ checkError( + exception = intercept[AnalysisException] { + sql("select theta_intersection('not_a_sketch', 'also_not_a_sketch')").collect() + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"theta_intersection(not_a_sketch, also_not_a_sketch)\"", + "paramIndex" -> "first", + "inputSql" -> "\"not_a_sketch\"", + "inputType" -> "\"STRING\"", + "requiredType" -> "\"BINARY\""), + context = ExpectedContext( + fragment = "theta_intersection('not_a_sketch', 'also_not_a_sketch')", + start = 7, + stop = 61)) + } + + test("SPARK-52407: theta_union") { + val df1 = Seq(1, 1, 2, 3).toDF("col") + val df2 = Seq(1, 3, 4, 5).toDF("col") + + val sketch1 = df1.selectExpr("theta_sketch_agg(col, 12) as sketch1") + val sketch2 = df2.selectExpr("theta_sketch_agg(col, 12) as sketch2") + + checkAnswer( + sketch1 + .crossJoin(sketch2) + .selectExpr("theta_sketch_estimate(theta_union(sketch1, sketch2))"), + Seq(Row(5)) // {1,2,3} ∪ {1,3,4,5} = {1,2,3,4,5} + ) + checkAnswer( + sketch1 + .crossJoin(sketch2) + .select(theta_sketch_estimate( + theta_union(col("sketch1"), col("sketch2")))), + Seq(Row(5))) + } + + test("SPARK-52407: theta_difference") { + val df1 = Seq(1, 1, 2, 3).toDF("col") + val df2 = Seq(1, 4, 5).toDF("col") + + val sketch1 = df1.selectExpr("theta_sketch_agg(col, 12) as sketch1") + val sketch2 = df2.selectExpr("theta_sketch_agg(col, 12) as sketch2") + + checkAnswer( + sketch1 + .crossJoin(sketch2) + .selectExpr("theta_sketch_estimate(theta_difference(sketch1, sketch2))"), + Seq(Row(2)) // {1,2,3} - {1,4,5} = {2,3} + ) + checkAnswer( + sketch1 + .crossJoin(sketch2) + .select( + theta_sketch_estimate(theta_difference(col("sketch1"), col("sketch2")))), + Seq(Row(2))) + } + + test("SPARK-52407: theta_intersection") { + val df1 = Seq(1, 1, 2, 3).toDF("col") + val df2 = Seq(1, 3, 4, 5).toDF("col") + + val sketch1 = df1.selectExpr("theta_sketch_agg(col, 12) as sketch1") + val sketch2 = df2.selectExpr("theta_sketch_agg(col, 12) as 
sketch2") + + checkAnswer( + sketch1 + .crossJoin(sketch2) + .selectExpr("theta_sketch_estimate(theta_intersection(sketch1, sketch2))"), + Seq(Row(2)) // {1,2,3} ∩ {1,3,4,5} = {1,3} + ) + checkAnswer( + sketch1 + .crossJoin(sketch2) + .select(theta_sketch_estimate( + theta_intersection(col("sketch1"), col("sketch2")))), + Seq(Row(2))) + } + + test("SPARK-52407: theta_intersection_agg") { + val df = Seq(1, 2).toDF("col") + + checkAnswer( + df.selectExpr("theta_sketch_agg(col) as sketch") + .unionAll(df.selectExpr("theta_sketch_agg(col, 20) as sketch")) + .unionAll(df.filter(col("col") === 1).selectExpr("theta_sketch_agg(col) as sketch")) + .selectExpr("theta_sketch_estimate(theta_intersection_agg(sketch))"), + Seq(Row(1)) // The intersection of {1,2}, {1,2}, {1} = {1}. + ) + checkAnswer( + df.select(theta_sketch_agg(col("col")).as("sketch")) + .unionAll(df.select(theta_sketch_agg(col("col"), lit(20)).as("sketch"))) + .unionAll(df.filter(col("col") === 1).select(theta_sketch_agg(col("col")).as("sketch"))) + .select(theta_sketch_estimate(theta_intersection_agg(col("sketch")))), + Seq(Row(1))) + } + + test("SPARK-52407: theta_sketch_agg") { + val df = Seq(1, 1, 2, 2, 3).toDF("col") + checkAnswer(df.selectExpr("theta_sketch_estimate(theta_sketch_agg(col, 12))"), Seq(Row(3))) + checkAnswer( + df.select(theta_sketch_estimate(theta_sketch_agg(col("col"), lit(12)))), + Seq(Row(3))) + } + + test("SPARK-52407: theta_union_agg") { + val df = Seq(1).toDF("col") + checkAnswer( + df.selectExpr("theta_sketch_agg(col) as sketch") + .unionAll(df.selectExpr("theta_sketch_agg(col, 20) as sketch")) + .selectExpr("theta_sketch_estimate(theta_union_agg(sketch, 15))"), + Seq(Row(1))) + checkAnswer( + df.select(theta_sketch_agg(col("col")).as("sketch")) + .unionAll(df.select(theta_sketch_agg(col("col"), lit(20)).as("sketch"))) + .select(theta_sketch_estimate(theta_union_agg(col("sketch"), lit(15)))), + Seq(Row(1))) + } + private def assertAggregateOnDataframe( df: => DataFrame, expected: 
Int): Unit = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala index 4044e5674191b..bcc0ce2633125 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala @@ -1885,6 +1885,26 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { }) } + test("theta sketch aggregate should respect collation") { + case class ThetaSketchAggTestCase[R](c: String, result: R) + val testCases = Seq( + ThetaSketchAggTestCase("UTF8_BINARY", 5), + ThetaSketchAggTestCase("UTF8_BINARY_RTRIM", 4), + ThetaSketchAggTestCase("UTF8_LCASE", 4), + ThetaSketchAggTestCase("UTF8_LCASE_RTRIM", 3), + ThetaSketchAggTestCase("UNICODE", 5), + ThetaSketchAggTestCase("UNICODE_RTRIM", 4), + ThetaSketchAggTestCase("UNICODE_CI", 4), + ThetaSketchAggTestCase("UNICODE_CI_RTRIM", 3) + ) + testCases.foreach(t => { + val q = s"SELECT theta_sketch_estimate(theta_sketch_agg(col collate ${t.c})) FROM " + + "VALUES ('a'), ('A'), ('b'), ('b'), ('c'), ('c ') tab(col)" + val df = sql(q) + checkAnswer(df, Seq(Row(t.result))) + }) + } + test("cache table with collated columns") { val collations = Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI", "SR_CI_AI") val lazyOptions = Seq(false, true)