Skip to content

Commit b9849f3

Browse files
committed
ORCA: Make partial agg use the ndistinct generated by segment to calculate the output rows
A new statistic kind, STATISTIC_KIND_NDV_BY_SEGMENTS (hereinafter referred to as NDVOnSeg), was added in the previous commit; the current commit applies it in ORCA. Because of the limited number of statskind slots, NDVOnSeg statistics are not available in all cases. If the current colstats has NDVOnSeg, it is used first; otherwise, ORCA falls back to calculating the NDV from the histogram (the result is close to the NDV in pg_statistic). In fact, I think this commit is quite safe, because I only changed the NDV value in `CColStats`, which means that the cost calculation becomes more accurate when the GbAgg operator is closer to the Scan operator.
1 parent 9d7820e commit b9849f3

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+544
-416
lines changed

contrib/pax_storage/src/test/regress/expected/bfv_aggregate_optimizer.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ set optimizer_force_multistage_agg = on;
188188
select count_operator('select count(*) from multi_stage_test group by b;','GroupAggregate');
189189
count_operator
190190
----------------
191-
2
191+
1
192192
(1 row)
193193

194194
set optimizer_force_multistage_agg = off;

contrib/pax_storage/src/test/regress/expected/incremental_analyze.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1949,7 +1949,7 @@ SELECT staattnum, stakind1, stakind2, stakind3, stakind4, stakind5,
19491949
FROM pg_statistic WHERE starelid = 'simple_table_no_hll'::regclass;
19501950
staattnum | stakind1 | stakind2 | stakind3 | stakind4 | stakind5 | stavalues1 | stavalues2 | stavalues3 | stavalues4 | stavalues5
19511951
-----------+----------+----------+----------+----------+----------+--------------+------------+------------+------------+------------
1952-
1 | 2 | 3 | 0 | 0 | 0 | {1,3,5,7,10} | | | |
1952+
1 | 2 | 3 | 8 | 0 | 0 | {1,3,5,7,10} | | {10} | |
19531953
(1 row)
19541954

19551955
-- Make sure analyze rootpartition option works in an option list

contrib/pax_storage/src/test/regress/expected/olap_plans_optimizer.out

Lines changed: 58 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -79,18 +79,20 @@ select a, b, c, sum(d) from olap_test group by a, b, c;
7979

8080
-- If it's not a superset, redistribution is needed.
8181
explain select a, sum(d) from olap_test group by a;
82-
QUERY PLAN
83-
-------------------------------------------------------------------------------------------------
82+
QUERY PLAN
83+
-------------------------------------------------------------------------------------------------------
8484
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.55 rows=3 width=12)
85-
-> Finalize HashAggregate (cost=0.00..431.55 rows=1 width=12)
85+
-> Finalize GroupAggregate (cost=0.00..431.55 rows=1 width=12)
8686
Group Key: a
87-
-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.55 rows=1 width=12)
88-
Hash Key: a
89-
-> Streaming Partial HashAggregate (cost=0.00..431.55 rows=1 width=12)
90-
Group Key: a
91-
-> Seq Scan on olap_test (cost=0.00..431.09 rows=3334 width=8)
87+
-> Sort (cost=0.00..431.55 rows=2 width=12)
88+
Sort Key: a
89+
-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.55 rows=2 width=12)
90+
Hash Key: a
91+
-> Streaming Partial HashAggregate (cost=0.00..431.55 rows=2 width=12)
92+
Group Key: a
93+
-> Seq Scan on olap_test (cost=0.00..431.09 rows=3334 width=8)
9294
Optimizer: GPORCA
93-
(9 rows)
95+
(11 rows)
9496

9597
select a, sum(d) from olap_test group by a;
9698
a | sum
@@ -185,8 +187,8 @@ set gp_motion_cost_per_row=1.0;
185187
-- If the query produces a relatively small number of groups in comparison to
186188
-- the number of input rows, two-stage aggregation will be picked.
187189
explain select a, b, c, sum(d) from olap_test group by grouping sets((a, b), (a), (b, c));
188-
QUERY PLAN
189-
--------------------------------------------------------------------------------------------------------------
190+
QUERY PLAN
191+
-------------------------------------------------------------------------------------------------------------------
190192
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1726.51 rows=152 width=20)
191193
-> Sequence (cost=0.00..1726.50 rows=51 width=20)
192194
-> Shared Scan (share slice:id 1:0) (cost=0.00..431.19 rows=3334 width=1)
@@ -199,18 +201,20 @@ explain select a, b, c, sum(d) from olap_test group by grouping sets((a, b), (a)
199201
-> Streaming Partial HashAggregate (cost=0.00..431.91 rows=44 width=16)
200202
Group Key: share0_ref2.b, share0_ref2.c
201203
-> Shared Scan (share slice:id 2:0) (cost=0.00..431.10 rows=3334 width=12)
202-
-> Finalize HashAggregate (cost=0.00..431.47 rows=1 width=12)
204+
-> Finalize GroupAggregate (cost=0.00..431.47 rows=1 width=12)
203205
Group Key: share0_ref3.a
204-
-> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.47 rows=1 width=12)
205-
Hash Key: share0_ref3.a
206-
-> Streaming Partial HashAggregate (cost=0.00..431.47 rows=1 width=12)
207-
Group Key: share0_ref3.a
208-
-> Shared Scan (share slice:id 3:0) (cost=0.00..431.06 rows=3334 width=8)
206+
-> Sort (cost=0.00..431.47 rows=2 width=12)
207+
Sort Key: share0_ref3.a
208+
-> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.47 rows=2 width=12)
209+
Hash Key: share0_ref3.a
210+
-> Streaming Partial HashAggregate (cost=0.00..431.47 rows=2 width=12)
211+
Group Key: share0_ref3.a
212+
-> Shared Scan (share slice:id 3:0) (cost=0.00..431.06 rows=3334 width=8)
209213
-> HashAggregate (cost=0.00..431.91 rows=7 width=16)
210214
Group Key: share0_ref4.a, share0_ref4.b
211215
-> Shared Scan (share slice:id 1:0) (cost=0.00..431.10 rows=3334 width=12)
212216
Optimizer: GPORCA
213-
(23 rows)
217+
(25 rows)
214218

215219
select a, b, c, sum(d) from olap_test group by grouping sets((a, b), (a), (b, c));
216220
a | b | c | sum
@@ -255,8 +259,8 @@ select a, b, c, sum(d) from olap_test group by grouping sets((a, b), (a), (b, c)
255259
-- If the query produces a relatively large number of groups in comparison to
256260
-- the number of input rows, one-stage aggregation will be picked.
257261
explain select a, b, d, sum(d) from olap_test group by grouping sets((a, b), (a), (b, d));
258-
QUERY PLAN
259-
---------------------------------------------------------------------------------------------------------------
262+
QUERY PLAN
263+
-------------------------------------------------------------------------------------------------------------------
260264
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1727.48 rows=10022 width=20)
261265
-> Sequence (cost=0.00..1726.74 rows=3341 width=20)
262266
-> Shared Scan (share slice:id 1:0) (cost=0.00..431.17 rows=3334 width=1)
@@ -268,18 +272,20 @@ explain select a, b, d, sum(d) from olap_test group by grouping sets((a, b), (a)
268272
Hash Key: share0_ref2.b, share0_ref2.d
269273
-> Result (cost=0.00..431.06 rows=3334 width=8)
270274
-> Shared Scan (share slice:id 2:0) (cost=0.00..431.06 rows=3334 width=8)
271-
-> Finalize HashAggregate (cost=0.00..431.47 rows=1 width=12)
275+
-> Finalize GroupAggregate (cost=0.00..431.47 rows=1 width=12)
272276
Group Key: share0_ref3.a
273-
-> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.47 rows=1 width=12)
274-
Hash Key: share0_ref3.a
275-
-> Streaming Partial HashAggregate (cost=0.00..431.47 rows=1 width=12)
276-
Group Key: share0_ref3.a
277-
-> Shared Scan (share slice:id 3:0) (cost=0.00..431.06 rows=3334 width=8)
277+
-> Sort (cost=0.00..431.47 rows=2 width=12)
278+
Sort Key: share0_ref3.a
279+
-> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.47 rows=2 width=12)
280+
Hash Key: share0_ref3.a
281+
-> Streaming Partial HashAggregate (cost=0.00..431.47 rows=2 width=12)
282+
Group Key: share0_ref3.a
283+
-> Shared Scan (share slice:id 3:0) (cost=0.00..431.06 rows=3334 width=8)
278284
-> HashAggregate (cost=0.00..431.91 rows=7 width=16)
279285
Group Key: share0_ref4.a, share0_ref4.b
280286
-> Shared Scan (share slice:id 1:0) (cost=0.00..431.10 rows=3334 width=12)
281287
Optimizer: GPORCA
282-
(22 rows)
288+
(24 rows)
283289

284290
-- do not execute this query as it would produce too many tuples.
285291
-- Test that when the second-stage Agg doesn't try to preserve the
@@ -292,8 +298,8 @@ explain select a, b, d, sum(d) from olap_test group by grouping sets((a, b), (a)
292298
-- from the Merge Key.
293299
set enable_hashagg=off;
294300
explain select a, b, c, sum(d) from olap_test group by grouping sets((a, b), (a), (b, c)) limit 200;
295-
QUERY PLAN
296-
--------------------------------------------------------------------------------------------------------------------
301+
QUERY PLAN
302+
-------------------------------------------------------------------------------------------------------------------------
297303
Limit (cost=0.00..1726.51 rows=152 width=20)
298304
-> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1726.51 rows=152 width=20)
299305
-> Sequence (cost=0.00..1726.50 rows=51 width=20)
@@ -307,18 +313,20 @@ explain select a, b, c, sum(d) from olap_test group by grouping sets((a, b), (a)
307313
-> Streaming Partial HashAggregate (cost=0.00..431.91 rows=44 width=16)
308314
Group Key: share0_ref2.b, share0_ref2.c
309315
-> Shared Scan (share slice:id 2:0) (cost=0.00..431.10 rows=3334 width=12)
310-
-> Finalize HashAggregate (cost=0.00..431.47 rows=1 width=12)
316+
-> Finalize GroupAggregate (cost=0.00..431.47 rows=1 width=12)
311317
Group Key: share0_ref3.a
312-
-> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.47 rows=1 width=12)
313-
Hash Key: share0_ref3.a
314-
-> Streaming Partial HashAggregate (cost=0.00..431.47 rows=1 width=12)
315-
Group Key: share0_ref3.a
316-
-> Shared Scan (share slice:id 3:0) (cost=0.00..431.06 rows=3334 width=8)
318+
-> Sort (cost=0.00..431.47 rows=2 width=12)
319+
Sort Key: share0_ref3.a
320+
-> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.47 rows=2 width=12)
321+
Hash Key: share0_ref3.a
322+
-> Streaming Partial HashAggregate (cost=0.00..431.47 rows=2 width=12)
323+
Group Key: share0_ref3.a
324+
-> Shared Scan (share slice:id 3:0) (cost=0.00..431.06 rows=3334 width=8)
317325
-> HashAggregate (cost=0.00..431.91 rows=7 width=16)
318326
Group Key: share0_ref4.a, share0_ref4.b
319327
-> Shared Scan (share slice:id 1:0) (cost=0.00..431.10 rows=3334 width=12)
320328
Optimizer: GPORCA
321-
(24 rows)
329+
(26 rows)
322330

323331
reset enable_hashagg;
324332
--
@@ -331,48 +339,40 @@ create table foo_ctas(a int, b int) distributed randomly;
331339
insert into foo_ctas select g%5, g%2 from generate_series(1, 100) g;
332340
analyze foo_ctas;
333341
explain create table bar_ctas as select * from foo_ctas group by a, b distributed by (b);
334-
QUERY PLAN
335-
------------------------------------------------------------------------------------------------------------
342+
QUERY PLAN
343+
-------------------------------------------------------------------------------------------------------------
336344
Result (cost=0.00..431.10 rows=6 width=8)
337345
-> Redistribute Motion 3:3 (slice1; segments: 3) (cost=0.00..431.01 rows=6 width=8)
338346
Hash Key: b
339347
-> GroupAggregate (cost=0.00..431.01 rows=2 width=8)
340348
Group Key: a, b
341-
-> Sort (cost=0.00..431.01 rows=2 width=8)
349+
-> Sort (cost=0.00..431.01 rows=34 width=8)
342350
Sort Key: a, b
343-
-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.01 rows=2 width=8)
351+
-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=34 width=8)
344352
Hash Key: a, b
345-
-> GroupAggregate (cost=0.00..431.01 rows=2 width=8)
346-
Group Key: a, b
347-
-> Sort (cost=0.00..431.01 rows=34 width=8)
348-
Sort Key: a, b
349-
-> Seq Scan on foo_ctas (cost=0.00..431.00 rows=34 width=8)
350-
Optimizer: Pivotal Optimizer (GPORCA)
351-
(15 rows)
353+
-> Seq Scan on foo_ctas (cost=0.00..431.00 rows=34 width=8)
354+
Optimizer: GPORCA
355+
(11 rows)
352356

353357
create table bar_ctas as select * from foo_ctas group by a, b distributed by (b);
354358
-- Currently, the planner misses this optimization with INSERT, so this
355359
-- needs an extra Redistribute Motion.
356360
explain insert into bar_ctas select * from foo_ctas group by a, b;
357-
QUERY PLAN
358-
------------------------------------------------------------------------------------------------------------------
361+
QUERY PLAN
362+
-------------------------------------------------------------------------------------------------------------------
359363
Insert on bar_ctas (cost=0.00..431.10 rows=2 width=8)
360364
-> Result (cost=0.00..431.01 rows=6 width=12)
361365
-> Redistribute Motion 3:3 (slice1; segments: 3) (cost=0.00..431.01 rows=6 width=8)
362366
Hash Key: foo_ctas.b
363367
-> GroupAggregate (cost=0.00..431.01 rows=2 width=8)
364368
Group Key: foo_ctas.a, foo_ctas.b
365-
-> Sort (cost=0.00..431.01 rows=2 width=8)
369+
-> Sort (cost=0.00..431.01 rows=34 width=8)
366370
Sort Key: foo_ctas.a, foo_ctas.b
367-
-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.01 rows=2 width=8)
371+
-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=34 width=8)
368372
Hash Key: foo_ctas.a, foo_ctas.b
369-
-> GroupAggregate (cost=0.00..431.01 rows=2 width=8)
370-
Group Key: foo_ctas.a, foo_ctas.b
371-
-> Sort (cost=0.00..431.01 rows=34 width=8)
372-
Sort Key: foo_ctas.a, foo_ctas.b
373-
-> Seq Scan on foo_ctas (cost=0.00..431.00 rows=34 width=8)
374-
Optimizer: Pivotal Optimizer (GPORCA)
375-
(16 rows)
373+
-> Seq Scan on foo_ctas (cost=0.00..431.00 rows=34 width=8)
374+
Optimizer: GPORCA
375+
(12 rows)
376376

377377
drop table foo_ctas;
378378
drop table bar_ctas;

0 commit comments

Comments
 (0)