performance: eliminate unnecessary null_counts calculations

gongxun0928 · gongxun0928 · commit cb135adb03cd · 2025-10-29T15:52:07.000+08:00
After calculating the null_counts array in advance, there is no need to call
GetColumnDatum to continue updating null_counts. We can read the datum directly.
diff --git a/contrib/pax_storage/src/cpp/storage/micro_partition_row_filter_reader.cc b/contrib/pax_storage/src/cpp/storage/micro_partition_row_filter_reader.cc
@@ -152,15 +152,15 @@ bool MicroPartitionRowFilterReader::ApplyFiltersWithSampling(
     return true;
   }
 
+  ctx->sample_rows++;
   bool all_pass = true;
+  // in the sampling phase, we need to evaluate all filter nodes, if any node
+  // fails, the tuple is rejected
   for (auto &node : ctx->filter_nodes) {
     if (!EvalFilterNode(ctx, group, desc, row_index, slot, node, true)) {
       all_pass = false;
-      break;
     }
   }
-  ctx->sample_rows++;
-  if (!all_pass) return false;
 
   if (ctx->sample_rows >= ctx->sample_target) {
     for (auto &node : ctx->filter_nodes) {
@@ -175,7 +175,7 @@ bool MicroPartitionRowFilterReader::ApplyFiltersWithSampling(
                      });
     ctx->sampling = false;
   }
-  return true;
+  return all_pass;
 }
 
 bool MicroPartitionRowFilterReader::ReadTuple(TupleTableSlot *slot) {
diff --git a/contrib/pax_storage/src/cpp/storage/orc/orc_group.cc b/contrib/pax_storage/src/cpp/storage/orc/orc_group.cc
@@ -293,15 +293,20 @@ std::pair<Datum, bool> OrcGroup::GetColumnValueNoMissing(size_t column_index,
     return {0, true};
   }
 
-  if (column->HasNull() && !nulls_shuffle_[column_index]) {
-    CalcNullShuffle(column, column_index);
-  }
+  if (column->HasNull()) {
+    const auto &bm = column->GetBitmap();
+    Assert(bm);
+    if (!bm->Test(row_index)) {
+      return {0, true};
+    }
 
-  if (nulls_shuffle_[column_index]) {
+    // if not null value, calculate the null offsets array for each row
+    if (!nulls_shuffle_[column_index]) {
+      CalcNullShuffle(column, column_index);
+    }
     null_counts = nulls_shuffle_[column_index][row_index];
   }
-
-  return GetColumnDatum(column, row_index, &null_counts);
+  return {column->GetDatum(row_index - null_counts), false};
 }
 
 void OrcGroup::CalcNullShuffle(PaxColumn *column, size_t column_index) {
diff --git a/contrib/pax_storage/src/test/regress/expected/gp_runtime_filter.out b/contrib/pax_storage/src/test/regress/expected/gp_runtime_filter.out
@@ -261,8 +261,8 @@ DROP TABLE IF EXISTS t1;
 NOTICE:  table "t1" does not exist, skipping
 DROP TABLE IF EXISTS t2;
 NOTICE:  table "t2" does not exist, skipping
-CREATE TABLE t1(c1 int, c2 int, c3 int, c4 int, c5 int) with (appendonly=true, orientation=column) distributed by (c1);
-CREATE TABLE t2(c1 int, c2 int, c3 int, c4 int, c5 int) with (appendonly=true, orientation=column) distributed REPLICATED;
+CREATE TABLE t1(c1 int, c2 int, c3 int, c4 int, c5 int) distributed by (c1);
+CREATE TABLE t2(c1 int, c2 int, c3 int, c4 int, c5 int) distributed REPLICATED;
 INSERT INTO t1 VALUES (5,5,5,5,5);
 INSERT INTO t2 VALUES (1,1,1,1,1), (2,2,2,2,2), (3,3,3,3,3), (4,4,4,4,4);
 INSERT INTO t1 SELECT * FROM t1;
diff --git a/contrib/pax_storage/src/test/regress/sql/gp_runtime_filter.sql b/contrib/pax_storage/src/test/regress/sql/gp_runtime_filter.sql
@@ -85,8 +85,8 @@ SET enable_parallel TO off;
 -- case 1: join on distribution table and replicated table.
 DROP TABLE IF EXISTS t1;
 DROP TABLE IF EXISTS t2;
-CREATE TABLE t1(c1 int, c2 int, c3 int, c4 int, c5 int) with (appendonly=true, orientation=column) distributed by (c1);
-CREATE TABLE t2(c1 int, c2 int, c3 int, c4 int, c5 int) with (appendonly=true, orientation=column) distributed REPLICATED;
+CREATE TABLE t1(c1 int, c2 int, c3 int, c4 int, c5 int) distributed by (c1);
+CREATE TABLE t2(c1 int, c2 int, c3 int, c4 int, c5 int) distributed REPLICATED;
 
 INSERT INTO t1 VALUES (5,5,5,5,5);
 INSERT INTO t2 VALUES (1,1,1,1,1), (2,2,2,2,2), (3,3,3,3,3), (4,4,4,4,4);

Original file line number	Diff line number	Diff line change
`@@ -152,15 +152,15 @@ bool MicroPartitionRowFilterReader::ApplyFiltersWithSampling(`
`152`	`152`	`return true;`
`153`	`153`	`}`
`154`	`154`
	`155`	`+ ctx->sample_rows++;`
`155`	`156`	`bool all_pass = true;`
	`157`	`+ // in the sampling phase, we need to evaluate all filter nodes, if any node`
	`158`	`+ // fails, the tuple is rejected`
`156`	`159`	`for (auto &node : ctx->filter_nodes) {`
`157`	`160`	`if (!EvalFilterNode(ctx, group, desc, row_index, slot, node, true)) {`
`158`	`161`	`all_pass = false;`
`159`		`- break;`
`160`	`162`	`}`
`161`	`163`	`}`
`162`		`- ctx->sample_rows++;`
`163`		`- if (!all_pass) return false;`
`164`	`164`
`165`	`165`	`if (ctx->sample_rows >= ctx->sample_target) {`
`166`	`166`	`for (auto &node : ctx->filter_nodes) {`
`@@ -175,7 +175,7 @@ bool MicroPartitionRowFilterReader::ApplyFiltersWithSampling(`
`175`	`175`	`});`
`176`	`176`	`ctx->sampling = false;`
`177`	`177`	`}`
`178`		`- return true;`
	`178`	`+ return all_pass;`
`179`	`179`	`}`
`180`	`180`
`181`	`181`	`bool MicroPartitionRowFilterReader::ReadTuple(TupleTableSlot *slot) {`