github-actions[bot] commented on code in PR #63196:
URL: https://github.com/apache/doris/pull/63196#discussion_r3254653038
##########
be/src/exprs/function/function_hash.cpp:
##########
@@ -133,6 +133,107 @@ using FunctionMurmurHash3_64_V2 =
using FunctionMurmurHash3U64V2 =
FunctionVariadicArgumentsBase<DataTypeInt128,
MurmurHash3Impl<TYPE_LARGEINT, true>>;
+struct MurmurHash3128Impl {
+ static constexpr auto name = "murmur_hash3_128";
+
+ static Status empty_apply(IColumn& icolumn, size_t input_rows_count) {
+ ColumnVector<TYPE_LARGEINT>& vec_to =
assert_cast<ColumnVector<TYPE_LARGEINT>&>(icolumn);
+ vec_to.get_data().assign(input_rows_count, pack_hash(emtpy_value,
emtpy_value));
+ return Status::OK();
+ }
+
+ static Status first_apply(const IDataType* type, const IColumn* column,
size_t input_rows_count,
+ IColumn& icolumn) {
+ return execute<true>(type, column, input_rows_count, icolumn);
+ }
+
+ static Status combine_apply(const IDataType* type, const IColumn* column,
+ size_t input_rows_count, IColumn& icolumn) {
+ return execute<false>(type, column, input_rows_count, icolumn);
+ }
+
+ template <bool first>
+ static Status execute(const IDataType* type, const IColumn* column, size_t
input_rows_count,
+ IColumn& col_to) {
+ auto& to_column = assert_cast<ColumnVector<TYPE_LARGEINT>&>(col_to);
+ if constexpr (first) {
+ // The first argument initializes one 128-bit hash state per row.
Later arguments reuse
+ // the same result column and update the saved state in place.
+ to_column.insert_many_defaults(input_rows_count);
+ }
+ auto& col_to_data = to_column.get_data();
+ if (const auto* col_from = check_and_get_column<ColumnString>(column))
{
+ const typename ColumnString::Chars& data = col_from->get_chars();
+ const typename ColumnString::Offsets& offsets =
col_from->get_offsets();
+ size_t size = offsets.size();
+ ColumnString::Offset current_offset = 0;
+ for (size_t i = 0; i < size; ++i) {
+ if constexpr (first) {
+ init_hash(col_to_data[i], reinterpret_cast<const
char*>(&data[current_offset]),
+ offsets[i] - current_offset);
+ } else {
+ update_hash(col_to_data[i],
+ reinterpret_cast<const
char*>(&data[current_offset]),
+ offsets[i] - current_offset);
+ }
+ current_offset = offsets[i];
+ }
+ } else if (const ColumnConst* col_from_const =
+
check_and_get_column_const_string_or_fixedstring(column)) {
+ auto value = col_from_const->get_value<TYPE_STRING>();
+ for (size_t i = 0; i < input_rows_count; ++i) {
+ if constexpr (first) {
+ init_hash(col_to_data[i], value.data(), value.size());
+ } else {
+ update_hash(col_to_data[i], value.data(), value.size());
+ }
+ }
+ } else {
+ DCHECK(false);
+ return Status::NotSupported("Illegal column {} of argument of
function {}",
+ column->get_name(), name);
+ }
+ return Status::OK();
+ }
+
+private:
+ static __int128_t pack_hash(uint64_t h1, uint64_t h2) {
+ static_assert(sizeof(__int128_t) == sizeof(uint64_t) * 2);
+ // Store the two MurmurHash3 x64 128-bit lanes in a single LARGEINT
value. Keep h1 in the
+ // low 64 bits and h2 in the high 64 bits to match
murmur_hash3_x64_128's out[0]/out[1].
+ const auto value =
+ (static_cast<unsigned __int128>(h2) << 64) |
static_cast<unsigned __int128>(h1);
+ return static_cast<__int128_t>(value);
+ }
+
+ static void unpack_hash(__int128_t value, uint64_t& h1, uint64_t& h2) {
+ static_assert(sizeof(__int128_t) == sizeof(uint64_t) * 2);
+ const auto unsigned_value = static_cast<unsigned __int128>(value);
+ h1 = static_cast<uint64_t>(unsigned_value);
+ h2 = static_cast<uint64_t>(unsigned_value >> 64);
+ }
+
+ static void init_hash(__int128_t& value, const void* data, size_t size) {
+ uint64_t hash[2] = {0, 0};
+ // The first SQL argument starts from seed 0, so it can use the
existing 128-bit primitive
+ // directly. Later arguments must use update_hash() to continue from
the saved (h1, h2).
+ murmur_hash3_x64_128(data, static_cast<int>(size), 0, hash);
Review Comment:
This narrows a SQL string length from `size_t` to `int` before entering
`murmur_hash3_x64_128` / `murmur_hash3_x64_process`. Doris string offsets are
not limited to `INT_MAX` (`StringLengthType` is `uint32_t`), so a large STRING
value can wrap to a negative `len`; `murmur_hash3_x64_process` then computes a
negative `nblocks` and derives `tail = data + nblocks * 16`, which can point
before the buffer and read out of bounds. Please either make the 128-bit Murmur
primitive accept a non-narrowing length type (as other hash wrappers do at
their public boundary) or reject/guard values larger than `INT_MAX` before
calling it.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]