This is an automated email from the ASF dual-hosted git repository.

zclllyybb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 5d97b29034a [improvement](be) Optimize count on nullable column (#64166)
5d97b29034a is described below

commit 5d97b29034ab07f32ccff6f8c9d14efdba3190ec
Author: zclllyybb <[email protected]>
AuthorDate: Mon Jun 8 11:52:56 2026 +0800

    [improvement](be) Optimize count on nullable column (#64166)
    
    Count aggregation without GROUP BY reaches
    AggFnEvaluator::execute_single_add(), which calls
    add_batch_single_place(). AggregateFunctionCount and
    AggregateFunctionCountNotNullUnary previously inherited the row-by-row
    helper there, so count(*) and count(nullable_expr) paid per-row
    add/is_null_at costs even when all rows were aggregated into one state.
    
    This patch adds batch implementations: count(*) increments the state
    once by batch_size, while unary count(nullable_expr) checks the nullable
    null map once and fast-paths the no-NULL case to count += batch_size.
    When NULLs exist it uses simd::count_zero_num() over the null map to
    count non-NULL rows. The AggregateFunctionCountNotNullUnary class name is
    kept because SQL count(expr) counts non-NULL values, not NULL values.
    
    Performance:
    Tested with the following SQL queries:
    ```sql
    select count(nullable(number)) from numbers("number"="1000000000");
    
    select count(nullable(if(number >= 0, null, number))) from numbers("number"="1000000000");
    
    select count(nullable(if(number % 2 = 0, number, null))) from numbers("number"="1000000000");
    ```
    Results (before vs. after this patch):
    ```
     Scenario     before median / mean    after median / mean    median diff
    ━━━━━━━━━━━  ━━━━━━━━━━━━━━━━━━━━━━  ━━━━━━━━━━━━━━━━━━━━━  ━━━━━━━━━━━━━
     non NULL           645 / 648.6 ms         555 / 556.4 ms         -14.0%
    ───────────  ──────────────────────  ─────────────────────  ─────────────
     all NULL         1541 / 1539.6 ms       1448 / 1450.6 ms          -6.0%
    ───────────  ──────────────────────  ─────────────────────  ─────────────
     half NULL        4256 / 4261.2 ms       4192 / 4232.2 ms          -1.5%
    ```
---
 be/src/exprs/aggregate/aggregate_function_count.h | 23 +++++++++++++++++++++--
 be/test/exprs/aggregate/agg_count_test.cpp        | 21 +++++++++++++++++++++
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/be/src/exprs/aggregate/aggregate_function_count.h 
b/be/src/exprs/aggregate/aggregate_function_count.h
index bc6e57251c0..2ae2692ccb5 100644
--- a/be/src/exprs/aggregate/aggregate_function_count.h
+++ b/be/src/exprs/aggregate/aggregate_function_count.h
@@ -37,6 +37,7 @@
 #include "core/data_type/data_type_number.h"
 #include "core/types.h"
 #include "exprs/aggregate/aggregate_function.h"
+#include "util/simd/bits.h"
 
 namespace doris {
 class Arena;
@@ -67,6 +68,11 @@ public:
         ++data(place).count;
     }
 
+    void add_batch_single_place(size_t batch_size, AggregateDataPtr place, 
const IColumn**,
+                                Arena&) const override {
+        data(place).count += batch_size;
+    }
+
     void reset(AggregateDataPtr place) const override {
         AggregateFunctionCount::data(place).count = 0;
     }
@@ -180,8 +186,7 @@ public:
     }
 };
 
-// TODO: Maybe AggregateFunctionCountNotNullUnary should be a subclass of 
AggregateFunctionCount
-// Simply count number of not-NULL values.
+// Used for unary count(nullable_expr). SQL count(expr) counts non-NULL values.
 class AggregateFunctionCountNotNullUnary final
         : public IAggregateFunctionDataHelper<AggregateFunctionCountData,
                                               
AggregateFunctionCountNotNullUnary> {
@@ -202,6 +207,20 @@ public:
                          .is_null_at(row_num);
     }
 
+    void add_batch_single_place(size_t batch_size, AggregateDataPtr place, 
const IColumn** columns,
+                                Arena&) const override {
+        const auto& nullable_column =
+                assert_cast<const ColumnNullable&, 
TypeCheckOnRelease::DISABLE>(*columns[0]);
+        const auto& null_map = nullable_column.get_null_map_data();
+        DCHECK_LE(batch_size, null_map.size());
+        if (!nullable_column.has_null(0, batch_size)) {
+            data(place).count += batch_size;
+            return;
+        }
+        data(place).count +=
+                simd::count_zero_num(reinterpret_cast<const 
int8_t*>(null_map.data()), batch_size);
+    }
+
     void reset(AggregateDataPtr place) const override { data(place).count = 0; 
}
 
     void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs,
diff --git a/be/test/exprs/aggregate/agg_count_test.cpp 
b/be/test/exprs/aggregate/agg_count_test.cpp
index 7b86cf5324f..21162c97022 100644
--- a/be/test/exprs/aggregate/agg_count_test.cpp
+++ b/be/test/exprs/aggregate/agg_count_test.cpp
@@ -17,6 +17,7 @@
 
 #include <gtest/gtest.h>
 
+#include "core/data_type/data_type_nullable.h"
 #include "core/data_type/data_type_number.h"
 #include "exprs/aggregate/agg_function_test.h"
 
@@ -31,4 +32,24 @@ TEST_F(AggregateFunctionCountTest, test_int64) {
     execute(Block({ColumnHelper::create_column_with_name<DataTypeInt64>({1, 2, 
3})}),
             ColumnHelper::create_column_with_name<DataTypeInt64>({3}));
 }
+
+TEST_F(AggregateFunctionCountTest, test_nullable_int64_without_null) {
+    create_agg("count", false,
+               
{std::make_shared<DataTypeNullable>(std::make_shared<DataTypeInt64>())},
+               std::make_shared<DataTypeInt64>());
+
+    
execute(Block({ColumnHelper::create_nullable_column_with_name<DataTypeInt64>({1,
 2, 3, 4},
+                                                                               
  {0, 0, 0, 0})}),
+            ColumnHelper::create_column_with_name<DataTypeInt64>({4}));
+}
+
+TEST_F(AggregateFunctionCountTest, test_nullable_int64_with_null) {
+    create_agg("count", false,
+               
{std::make_shared<DataTypeNullable>(std::make_shared<DataTypeInt64>())},
+               std::make_shared<DataTypeInt64>());
+
+    
execute(Block({ColumnHelper::create_nullable_column_with_name<DataTypeInt64>({1,
 2, 3, 4, 5},
+                                                                               
  {0, 1, 0, 1, 0})}),
+            ColumnHelper::create_column_with_name<DataTypeInt64>({3}));
+}
 } // namespace doris


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to