Skip to content
Closed
Show file tree
Hide file tree
Changes from 39 commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
e9a2cbe
[SPARK-52407][SQL] Add support for Theta Sketch
Jun 27, 2025
5185c7f
Merge branch 'apache:master' into SPARK-52407_add_datasketches_thetas…
cboumalh Jun 27, 2025
a44a6be
added ExpressionDescription to ThetaIntersectionAgg, fixed some broke…
Jun 27, 2025
7c8e4f5
wrap array unit tests with array data to avoid casting errors test run
Jun 27, 2025
07a0c87
Update merge function parameter name in thesketchesAggregates
cboumalh Jun 27, 2025
1b3bfc8
Added Dataframes Suite, Theta Sketch Utils, sql expressions, error co…
Jul 12, 2025
edae21a
generate golden files
Jul 14, 2025
db75ca3
python functionality added
Jul 15, 2025
a892e0c
Merge branch 'master' into SPARK-52407_add_datasketches_thetasketch
cboumalh Jul 15, 2025
9e9886e
added missing builtin.py code for pyspark
Jul 15, 2025
b0d57a9
fix typos and formatting errors
Jul 15, 2025
5cca851
fix broken tests in ExpressionsSchemaSuite, PlanGenerationTestSuite, …
Jul 15, 2025
a00fdd6
add .explain files from ProtoToParsedPlanTestSuite
Jul 15, 2025
b3cb40a
typos, formatting, and naming fixes
Jul 16, 2025
2334aa1
changed theta sketch spark since version, added default sketch size t…
Jul 23, 2025
134dd29
remove unnecessary query from sql query tests
Jul 24, 2025
4928059
update to use toByteArrayCompressed, heapify in deserialization for s…
Jul 26, 2025
32a4d66
format
Jul 27, 2025
30fd361
optimize aggregating performance
Jul 27, 2025
3335453
test fix
Jul 27, 2025
5830fbc
test fix
Jul 27, 2025
41e3311
buffer empty check added
Jul 27, 2025
544079f
refactor
Jul 28, 2025
e79cf4e
Re-run checks
Jul 28, 2025
6ca7440
feature version back to 4.1.0
Jul 29, 2025
a996539
Merge branch 'master' into SPARK-52407_add_datasketches_thetasketch
cboumalh Jul 30, 2025
6548180
Merge branch 'master' into SPARK-52407_add_datasketches_thetasketch
cboumalh Jul 31, 2025
28a64a5
small refactor
Aug 8, 2025
3d00d23
small refactor
Aug 8, 2025
07da71c
fix
Aug 8, 2025
34ad644
fix
Aug 10, 2025
a80b7d4
comments fix
Aug 10, 2025
e91d702
comment fix
Aug 26, 2025
e792ac2
resolving comments
Aug 26, 2025
8905ea9
resolving comments
Aug 29, 2025
1075f6b
comment fix
Aug 29, 2025
ffd2b30
fix
Aug 29, 2025
49c88fc
fix
Aug 29, 2025
ae1f1bd
comments fix
Aug 29, 2025
8ef59e8
resolve comments
Aug 30, 2025
0ae8315
fix
Aug 30, 2025
526d791
fix
Aug 30, 2025
cc452e8
fix
Aug 30, 2025
c86c4bf
fix
Aug 30, 2025
f1669ec
fix
Sep 8, 2025
93f4a0a
fix
Sep 9, 2025
44fd9d4
fix
Sep 9, 2025
613ee05
fix
Sep 10, 2025
6c3b9bc
fix
Sep 11, 2025
b1cea44
Merge branch 'apache:master' into SPARK-52407_add_datasketches_thetas…
cboumalh Sep 14, 2025
78c6df8
fix
Sep 14, 2025
189dae6
fix
Sep 14, 2025
b1409d4
fix
Sep 14, 2025
a135d6a
fix
Sep 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions common/utils/src/main/resources/error/error-conditions.json
Original file line number Diff line number Diff line change
Expand Up @@ -5560,6 +5560,18 @@
],
"sqlState" : "428EK"
},
"THETA_INVALID_INPUT_SKETCH_BUFFER" : {
"message" : [
"Invalid call to <function>; only valid Theta sketch buffers are supported as inputs (such as those produced by the `theta_sketch_agg` function)."
],
"sqlState" : "22546"
},
"THETA_INVALID_LG_NOM_ENTRIES" : {
"message" : [
"Invalid Theta sketch call; the `lgNomEntries` value must be between <min> and <max>, inclusive: <value>."
],
"sqlState" : "22546"
},
"TRAILING_COMMA_IN_SELECT" : {
"message" : [
"Trailing comma detected in SELECT clause. Remove the trailing comma before the FROM clause."
Expand Down
7 changes: 7 additions & 0 deletions python/docs/source/reference/pyspark.sql/functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,9 @@ Aggregate Functions
string_agg_distinct
sum
sum_distinct
theta_intersection_agg
theta_sketch_agg
theta_union_agg
try_avg
try_sum
var_pop
Expand Down Expand Up @@ -635,6 +638,10 @@ Misc Functions
reflect
session_user
spark_partition_id
theta_difference
theta_intersection
theta_sketch_estimate
theta_union
try_aes_decrypt
try_reflect
typeof
Expand Down
84 changes: 84 additions & 0 deletions python/pyspark/sql/connect/functions/builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -4229,6 +4229,90 @@ def hll_union(
hll_union.__doc__ = pysparkfuncs.hll_union.__doc__


def theta_sketch_agg(
col: "ColumnOrName",
lgNomEntries: Optional[Union[int, Column]] = None,
) -> Column:
if lgNomEntries is None:
return _invoke_function_over_columns("theta_sketch_agg", col)
else:
return _invoke_function_over_columns("theta_sketch_agg", col, lit(lgNomEntries))


theta_sketch_agg.__doc__ = pysparkfuncs.theta_sketch_agg.__doc__


def theta_union_agg(
col: "ColumnOrName",
lgNomEntries: Optional[Union[int, Column]] = None,
) -> Column:
if lgNomEntries is None:
return _invoke_function_over_columns("theta_union_agg", col)
else:
return _invoke_function_over_columns("theta_union_agg", col, lit(lgNomEntries))


theta_union_agg.__doc__ = pysparkfuncs.theta_union_agg.__doc__


def theta_intersection_agg(
col: "ColumnOrName",
lgNomEntries: Optional[Union[int, Column]] = None,
) -> Column:
if lgNomEntries is None:
return _invoke_function_over_columns("theta_intersection_agg", col)
else:
return _invoke_function_over_columns("theta_intersection_agg", col, lit(lgNomEntries))


theta_intersection_agg.__doc__ = pysparkfuncs.theta_intersection_agg.__doc__


def theta_sketch_estimate(col: "ColumnOrName") -> Column:
return _invoke_function("theta_sketch_estimate", _to_col(col))


theta_sketch_estimate.__doc__ = pysparkfuncs.theta_sketch_estimate.__doc__


def theta_union(
col1: "ColumnOrName", col2: "ColumnOrName", lgNomEntries: Optional[int] = None
) -> Column:
if lgNomEntries is None:
return _invoke_function("theta_union", _to_col(col1), _to_col(col2))
else:
return _invoke_function("theta_union", _to_col(col1), _to_col(col2), lit(lgNomEntries))


theta_union.__doc__ = pysparkfuncs.theta_union.__doc__


def theta_intersection(
col1: "ColumnOrName", col2: "ColumnOrName", lgNomEntries: Optional[int] = None
) -> Column:
if lgNomEntries is None:
return _invoke_function("theta_intersection", _to_col(col1), _to_col(col2))
else:
return _invoke_function(
"theta_intersection", _to_col(col1), _to_col(col2), lit(lgNomEntries)
)


theta_intersection.__doc__ = pysparkfuncs.theta_intersection.__doc__


def theta_difference(
col1: "ColumnOrName", col2: "ColumnOrName", lgNomEntries: Optional[int] = None
) -> Column:
if lgNomEntries is None:
return _invoke_function("theta_difference", _to_col(col1), _to_col(col2))
else:
return _invoke_function("theta_difference", _to_col(col1), _to_col(col2), lit(lgNomEntries))


theta_difference.__doc__ = pysparkfuncs.theta_difference.__doc__


# Predicates Function


Expand Down
7 changes: 7 additions & 0 deletions python/pyspark/sql/functions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,9 @@
"string_agg_distinct",
"sum",
"sum_distinct",
"theta_intersection_agg",
"theta_sketch_agg",
"theta_union_agg",
"try_avg",
"try_sum",
"var_pop",
Expand Down Expand Up @@ -494,6 +497,10 @@
"reflect",
"session_user",
"spark_partition_id",
"theta_difference",
"theta_intersection",
"theta_sketch_estimate",
"theta_union",
"try_aes_decrypt",
"try_reflect",
"typeof",
Expand Down
Loading