github-actions[bot] commented on code in PR #63196:
URL: https://github.com/apache/doris/pull/63196#discussion_r3254653038


##########
be/src/exprs/function/function_hash.cpp:
##########
@@ -133,6 +133,107 @@ using FunctionMurmurHash3_64_V2 =
 using FunctionMurmurHash3U64V2 =
         FunctionVariadicArgumentsBase<DataTypeInt128, 
MurmurHash3Impl<TYPE_LARGEINT, true>>;
 
+struct MurmurHash3128Impl {
+    static constexpr auto name = "murmur_hash3_128";
+
+    static Status empty_apply(IColumn& icolumn, size_t input_rows_count) {
+        ColumnVector<TYPE_LARGEINT>& vec_to = 
assert_cast<ColumnVector<TYPE_LARGEINT>&>(icolumn);
+        vec_to.get_data().assign(input_rows_count, pack_hash(emtpy_value, 
emtpy_value));
+        return Status::OK();
+    }
+
+    static Status first_apply(const IDataType* type, const IColumn* column, 
size_t input_rows_count,
+                              IColumn& icolumn) {
+        return execute<true>(type, column, input_rows_count, icolumn);
+    }
+
+    static Status combine_apply(const IDataType* type, const IColumn* column,
+                                size_t input_rows_count, IColumn& icolumn) {
+        return execute<false>(type, column, input_rows_count, icolumn);
+    }
+
+    template <bool first>
+    static Status execute(const IDataType* type, const IColumn* column, size_t 
input_rows_count,
+                          IColumn& col_to) {
+        auto& to_column = assert_cast<ColumnVector<TYPE_LARGEINT>&>(col_to);
+        if constexpr (first) {
+            // The first argument initializes one 128-bit hash state per row. 
Later arguments reuse
+            // the same result column and update the saved state in place.
+            to_column.insert_many_defaults(input_rows_count);
+        }
+        auto& col_to_data = to_column.get_data();
+        if (const auto* col_from = check_and_get_column<ColumnString>(column)) 
{
+            const typename ColumnString::Chars& data = col_from->get_chars();
+            const typename ColumnString::Offsets& offsets = 
col_from->get_offsets();
+            size_t size = offsets.size();
+            ColumnString::Offset current_offset = 0;
+            for (size_t i = 0; i < size; ++i) {
+                if constexpr (first) {
+                    init_hash(col_to_data[i], reinterpret_cast<const 
char*>(&data[current_offset]),
+                              offsets[i] - current_offset);
+                } else {
+                    update_hash(col_to_data[i],
+                                reinterpret_cast<const 
char*>(&data[current_offset]),
+                                offsets[i] - current_offset);
+                }
+                current_offset = offsets[i];
+            }
+        } else if (const ColumnConst* col_from_const =
+                           
check_and_get_column_const_string_or_fixedstring(column)) {
+            auto value = col_from_const->get_value<TYPE_STRING>();
+            for (size_t i = 0; i < input_rows_count; ++i) {
+                if constexpr (first) {
+                    init_hash(col_to_data[i], value.data(), value.size());
+                } else {
+                    update_hash(col_to_data[i], value.data(), value.size());
+                }
+            }
+        } else {
+            DCHECK(false);
+            return Status::NotSupported("Illegal column {} of argument of 
function {}",
+                                        column->get_name(), name);
+        }
+        return Status::OK();
+    }
+
+private:
+    static __int128_t pack_hash(uint64_t h1, uint64_t h2) {
+        static_assert(sizeof(__int128_t) == sizeof(uint64_t) * 2);
+        // Store the two MurmurHash3 x64 128-bit lanes in a single LARGEINT 
value. Keep h1 in the
+        // low 64 bits and h2 in the high 64 bits to match 
murmur_hash3_x64_128's out[0]/out[1].
+        const auto value =
+                (static_cast<unsigned __int128>(h2) << 64) | 
static_cast<unsigned __int128>(h1);
+        return static_cast<__int128_t>(value);
+    }
+
+    static void unpack_hash(__int128_t value, uint64_t& h1, uint64_t& h2) {
+        static_assert(sizeof(__int128_t) == sizeof(uint64_t) * 2);
+        const auto unsigned_value = static_cast<unsigned __int128>(value);
+        h1 = static_cast<uint64_t>(unsigned_value);
+        h2 = static_cast<uint64_t>(unsigned_value >> 64);
+    }
+
+    static void init_hash(__int128_t& value, const void* data, size_t size) {
+        uint64_t hash[2] = {0, 0};
+        // The first SQL argument starts from seed 0, so it can use the 
existing 128-bit primitive
+        // directly. Later arguments must use update_hash() to continue from 
the saved (h1, h2).
+        murmur_hash3_x64_128(data, static_cast<int>(size), 0, hash);

Review Comment:
   This narrows a SQL string length from `size_t` to `int` before entering 
`murmur_hash3_x64_128` / `murmur_hash3_x64_process`. Doris string offsets are 
not limited to `INT_MAX` (`StringLengthType` is `uint32_t`), so a large STRING 
value can wrap to a negative `len`; `murmur_hash3_x64_process` then computes a 
negative `nblocks` and derives `tail = data + nblocks * 16`, which can point 
before the buffer and read out of bounds. Please either make the 128-bit Murmur 
primitive accept a non-narrowing length type (as other hash wrappers do at 
their public boundary) or reject/guard values larger than `INT_MAX` before 
calling it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to