Copilot commented on code in PR #61566:
URL: https://github.com/apache/doris/pull/61566#discussion_r2964749057


##########
be/src/exprs/aggregate/aggregate_function_window_funnel_v2.h:
##########
@@ -0,0 +1,441 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+#include "common/cast_set.h"
+#include "common/exception.h"
+#include "common/status.h"
+#include "core/assert_cast.h"
+#include "core/column/column_string.h"
+#include "core/data_type/data_type_number.h"
+#include "core/types.h"
+#include "core/value/vdatetime_value.h"
+#include "exprs/aggregate/aggregate_function.h"
+#include "exprs/aggregate/aggregate_function_window_funnel.h" // for 
WindowFunnelMode, string_to_window_funnel_mode
+#include "util/var_int.h"
+
+namespace doris {
+#include "common/compile_check_begin.h"
+class Arena;
+class BufferReadable;
+class BufferWritable;
+class IColumn;
+} // namespace doris
+
+namespace doris {
+
+/// Merge two event lists, utilizing sorted flags to optimize.
+/// After merge, all events are in `events_list` and it is sorted.
+template <typename T>
+void merge_events_list(T& events_list, size_t prefix_size, bool prefix_sorted, 
bool suffix_sorted) {
+    if (!prefix_sorted && !suffix_sorted) {
+        std::stable_sort(std::begin(events_list), std::end(events_list));
+    } else {
+        const auto begin = std::begin(events_list);
+        const auto middle = std::next(begin, prefix_size);
+        const auto end = std::end(events_list);
+
+        if (!prefix_sorted) {
+            std::stable_sort(begin, middle);
+        }
+        if (!suffix_sorted) {
+            std::stable_sort(middle, end);
+        }
+        std::inplace_merge(begin, middle, end);
+    }
+}
+
+/// V2 state: stores only matched events as (timestamp, event_index) pairs.
+/// Compared to V1 which stores all rows with N boolean columns, V2 only stores
+/// events that actually match at least one condition, dramatically reducing 
memory.
+struct WindowFunnelStateV2 {
+    /// (timestamp_int_val, 1-based event_index)
+    /// event_index 0 is unused in normal modes.
+    using TimestampEvent = std::pair<UInt64, UInt8>;
+
+    int event_count = 0;
+    int64_t window = 0;
+    WindowFunnelMode window_funnel_mode = WindowFunnelMode::INVALID;
+    bool sorted = true;
+    std::vector<TimestampEvent> events_list;
+
+    WindowFunnelStateV2() = default;
+    WindowFunnelStateV2(int arg_event_count) : event_count(arg_event_count) {}
+
+    void reset() {
+        events_list.clear();
+        sorted = true;
+    }
+
+    void add(const IColumn** arg_columns, ssize_t row_num, int64_t win, 
WindowFunnelMode mode) {
+        window = win;
+        window_funnel_mode = mode;
+
+        // get_data() returns DateV2Value<DateTimeV2ValueType>; convert to 
packed UInt64
+        auto timestamp = assert_cast<const 
ColumnVector<TYPE_DATETIMEV2>&>(*arg_columns[2])
+                                 .get_data()[row_num]
+                                 .to_date_int_val();
+
+        // Iterate from last event to first (reverse order).
+        // This ensures that after stable_sort, events with the same timestamp
+        // appear in descending event_index order, which is important for 
correct
+        // matching when one row satisfies multiple conditions.
+        for (int i = event_count - 1; i >= 0; --i) {
+            auto event_val =
+                    assert_cast<const ColumnUInt8&>(*arg_columns[3 + 
i]).get_data()[row_num];
+            if (event_val) {
+                TimestampEvent new_event {timestamp, cast_set<UInt8>(i + 1)};
+                if (sorted && !events_list.empty()) {
+                    sorted = events_list.back() <= new_event;
+                }
+                events_list.emplace_back(new_event);
+            }
+        }
+    }
+
+    void sort() {
+        if (!sorted) {
+            std::stable_sort(std::begin(events_list), std::end(events_list));
+            sorted = true;
+        }

Review Comment:
   `WindowFunnelStateV2::sort()` uses `std::stable_sort` on `std::pair<UInt64, 
UInt8>` which orders by `(timestamp, event_index)` ascending. This defeats the 
intended “same timestamp in descending event_index order” (see the comment in 
`add()`), and can let a single input row that matches multiple conditions 
advance multiple funnel levels at the same timestamp (different from V1 
row-based semantics). Use an explicit comparator that sorts by timestamp 
ascending and event_index *descending* (and update the `sorted` monotonicity 
check to use the same ordering).



##########
be/src/exprs/aggregate/aggregate_function_window_funnel_v2.h:
##########
@@ -0,0 +1,441 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+#include "common/cast_set.h"
+#include "common/exception.h"
+#include "common/status.h"
+#include "core/assert_cast.h"
+#include "core/column/column_string.h"
+#include "core/data_type/data_type_number.h"
+#include "core/types.h"
+#include "core/value/vdatetime_value.h"
+#include "exprs/aggregate/aggregate_function.h"
+#include "exprs/aggregate/aggregate_function_window_funnel.h" // for 
WindowFunnelMode, string_to_window_funnel_mode
+#include "util/var_int.h"
+
+namespace doris {
+#include "common/compile_check_begin.h"
+class Arena;
+class BufferReadable;
+class BufferWritable;
+class IColumn;
+} // namespace doris
+
+namespace doris {
+
+/// Merge two event lists, utilizing sorted flags to optimize.
+/// After merge, all events are in `events_list` and it is sorted.
+template <typename T>
+void merge_events_list(T& events_list, size_t prefix_size, bool prefix_sorted, 
bool suffix_sorted) {
+    if (!prefix_sorted && !suffix_sorted) {
+        std::stable_sort(std::begin(events_list), std::end(events_list));
+    } else {
+        const auto begin = std::begin(events_list);
+        const auto middle = std::next(begin, prefix_size);
+        const auto end = std::end(events_list);
+
+        if (!prefix_sorted) {
+            std::stable_sort(begin, middle);
+        }
+        if (!suffix_sorted) {
+            std::stable_sort(middle, end);
+        }
+        std::inplace_merge(begin, middle, end);
+    }
+}
+
+/// V2 state: stores only matched events as (timestamp, event_index) pairs.
+/// Compared to V1 which stores all rows with N boolean columns, V2 only stores
+/// events that actually match at least one condition, dramatically reducing 
memory.
+struct WindowFunnelStateV2 {
+    /// (timestamp_int_val, 1-based event_index)
+    /// event_index 0 is unused in normal modes.
+    using TimestampEvent = std::pair<UInt64, UInt8>;
+
+    int event_count = 0;
+    int64_t window = 0;
+    WindowFunnelMode window_funnel_mode = WindowFunnelMode::INVALID;
+    bool sorted = true;
+    std::vector<TimestampEvent> events_list;
+
+    WindowFunnelStateV2() = default;
+    WindowFunnelStateV2(int arg_event_count) : event_count(arg_event_count) {}
+
+    void reset() {
+        events_list.clear();
+        sorted = true;
+    }
+
+    void add(const IColumn** arg_columns, ssize_t row_num, int64_t win, 
WindowFunnelMode mode) {
+        window = win;
+        window_funnel_mode = mode;
+
+        // get_data() returns DateV2Value<DateTimeV2ValueType>; convert to 
packed UInt64
+        auto timestamp = assert_cast<const 
ColumnVector<TYPE_DATETIMEV2>&>(*arg_columns[2])
+                                 .get_data()[row_num]
+                                 .to_date_int_val();
+
+        // Iterate from last event to first (reverse order).
+        // This ensures that after stable_sort, events with the same timestamp
+        // appear in descending event_index order, which is important for 
correct
+        // matching when one row satisfies multiple conditions.
+        for (int i = event_count - 1; i >= 0; --i) {
+            auto event_val =
+                    assert_cast<const ColumnUInt8&>(*arg_columns[3 + 
i]).get_data()[row_num];
+            if (event_val) {
+                TimestampEvent new_event {timestamp, cast_set<UInt8>(i + 1)};
+                if (sorted && !events_list.empty()) {
+                    sorted = events_list.back() <= new_event;
+                }
+                events_list.emplace_back(new_event);
+            }
+        }
+    }
+
+    void sort() {
+        if (!sorted) {
+            std::stable_sort(std::begin(events_list), std::end(events_list));
+            sorted = true;
+        }
+    }
+
+    void merge(const WindowFunnelStateV2& other) {
+        if (other.events_list.empty()) {
+            return;
+        }
+
+        if (events_list.empty()) {
+            events_list = other.events_list;
+            sorted = other.sorted;
+        } else {
+            const auto prefix_size = events_list.size();
+            events_list.insert(std::end(events_list), 
std::begin(other.events_list),
+                               std::end(other.events_list));
+            merge_events_list(events_list, prefix_size, sorted, other.sorted);
+            sorted = true;
+        }
+
+        event_count = event_count > 0 ? event_count : other.event_count;
+        window = window > 0 ? window : other.window;
+        window_funnel_mode = window_funnel_mode == WindowFunnelMode::INVALID
+                                     ? other.window_funnel_mode
+                                     : window_funnel_mode;

Review Comment:
   `WindowFunnelStateV2::merge()` treats `window` as “unset” unless it’s > 0 
(`window = window > 0 ? window : other.window`). A window value of 0 is valid 
(meaning events must occur at the same timestamp) but will be overwritten 
during merges, producing incorrect results for distributed/partial aggregation. 
Track whether `window` is initialized separately (or use a sentinel like 
`window = -1` for unset) so that 0 is preserved.



##########
fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinAggregateFunctions.java:
##########
@@ -193,7 +194,8 @@ private BuiltinAggregateFunctions() {
                 agg(TopNWeighted.class, "topn_weighted"),
                 agg(Variance.class, "var_pop", "variance_pop", "variance"),
                 agg(VarianceSamp.class, "var_samp", "variance_samp"),
-                agg(WindowFunnel.class, "window_funnel")
+                agg(WindowFunnel.class, "window_funnel_v1"),
+                agg(WindowFunnelV2.class, "window_funnel_v2", "window_funnel")

Review Comment:
   By remapping SQL name `window_funnel` to V2 in Nereids, the existing 
`nereids_p0/aggregate/window_funnel` regression suite now effectively tests V2 
semantics and no longer provides coverage for V1 behavior. Consider adding 
regression coverage that explicitly calls `window_funnel_v1` (at least for 
DEFAULT/DEDUP/FIXED/INCREASE) to ensure V1 remains stable during the transition.



##########
be/src/exprs/aggregate/aggregate_function_window_funnel_v2.cpp:
##########
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "exprs/aggregate/aggregate_function_window_funnel_v2.h"
+
+#include <string>
+
+#include "common/logging.h"
+#include "core/data_type/data_type.h"
+#include "core/types.h"
+#include "exprs/aggregate/aggregate_function_simple_factory.h"
+#include "exprs/aggregate/helpers.h"
+
+namespace doris {
+#include "common/compile_check_begin.h"
+
+AggregateFunctionPtr create_aggregate_function_window_funnel_v2(const 
std::string& name,
+                                                                const 
DataTypes& argument_types,
+                                                                const 
DataTypePtr& result_type,
+                                                                const bool 
result_is_nullable,
+                                                                const 
AggregateFunctionAttr& attr) {
+    if (argument_types.size() < 3) {
+        LOG(WARNING) << "window_funnel_v2's argument less than 3.";
+        return nullptr;
+    }
+    if (argument_types[2]->get_primitive_type() == TYPE_DATETIMEV2) {
+        return creator_without_type::create<AggregateFunctionWindowFunnelV2>(
+                argument_types, result_is_nullable, attr);
+    } else {
+        LOG(WARNING) << "Only support DateTime type as window argument!";
+        return nullptr;

Review Comment:
   `create_aggregate_function_window_funnel_v2` allows `argument_types.size() 
== 3`, but the function requires at least 4 arguments (window, mode, timestamp, 
and >=1 boolean condition). With 3 args, `event_count` becomes 0 and the 
function silently returns 0 instead of rejecting invalid SQL. Update the arity 
check (and warning message) to require >=4 arguments.



##########
fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinAggregateFunctions.java:
##########
@@ -193,7 +194,8 @@ private BuiltinAggregateFunctions() {
                 agg(TopNWeighted.class, "topn_weighted"),
                 agg(Variance.class, "var_pop", "variance_pop", "variance"),
                 agg(VarianceSamp.class, "var_samp", "variance_samp"),
-                agg(WindowFunnel.class, "window_funnel")
+                agg(WindowFunnel.class, "window_funnel_v1"),
+                agg(WindowFunnelV2.class, "window_funnel_v2", "window_funnel")

Review Comment:
   This change makes `window_funnel` resolve to `window_funnel_v2` in Nereids 
(`agg(WindowFunnelV2.class, ..., "window_funnel")`) while keeping 
`window_funnel_v1` available. Since V2 has different semantics (regression 
outputs change), this is a user-visible behavior change for existing queries 
using `window_funnel`. Consider gating the alias switch behind a session 
variable / config, or keeping `window_funnel` mapped to V1 until a deprecation 
window is completed.
   ```suggestion
                   agg(WindowFunnel.class, "window_funnel_v1", "window_funnel"),
                   agg(WindowFunnelV2.class, "window_funnel_v2")
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to