This is an automated email from the ASF dual-hosted git repository.
zouxinyi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new db3a3ec49d2 [env](compile)open compile check in columns class (#44425)
db3a3ec49d2 is described below
commit db3a3ec49d20019b1cb871c11aa10d13b974ec22
Author: Xinyi Zou <[email protected]>
AuthorDate: Wed Nov 27 22:13:44 2024 +0800
[env](compile)open compile check in columns class (#44425)
### What problem does this PR solve?
Problem Summary:
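Enable the compile check (`common/compile_check_begin.h` / `common/compile_check_end.h`) in the column classes, and make the integer and floating-point conversions in them explicit.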
---
be/src/util/hash_util.hpp | 16 ++++-----
be/src/vec/columns/column_const.cpp | 5 ++-
be/src/vec/columns/column_const.h | 5 ++-
be/src/vec/columns/column_decimal.h | 4 ++-
be/src/vec/columns/column_dictionary.h | 22 +++++++-----
be/src/vec/columns/column_nullable.cpp | 5 +--
be/src/vec/columns/column_nullable.h | 7 ++--
be/src/vec/columns/column_object.cpp | 19 +++++++----
be/src/vec/columns/column_object.h | 6 ++--
be/src/vec/columns/column_string.cpp | 61 +++++++++++++++++++---------------
be/src/vec/columns/column_string.h | 29 +++++++++++-----
be/src/vec/columns/column_vector.cpp | 3 +-
be/src/vec/columns/column_vector.h | 9 ++---
be/src/vec/core/field.h | 11 ++++--
14 files changed, 125 insertions(+), 77 deletions(-)
diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp
index e9ac72c5ccd..d444daa8c68 100644
--- a/be/src/util/hash_util.hpp
+++ b/be/src/util/hash_util.hpp
@@ -46,7 +46,7 @@ public:
return std::hash<T>()(value);
}
- static uint32_t zlib_crc_hash(const void* data, int32_t bytes, uint32_t
hash) {
+ static uint32_t zlib_crc_hash(const void* data, uint32_t bytes, uint32_t
hash) {
return crc32(hash, (const unsigned char*)data, bytes);
}
@@ -66,7 +66,7 @@ public:
// NOTE: Any changes made to this function need to be reflected in
Codegen::GetHashFn.
// TODO: crc32 hashes with different seeds do not result in different hash
functions.
// The resulting hashes are correlated.
- static uint32_t crc_hash(const void* data, int32_t bytes, uint32_t hash) {
+ static uint32_t crc_hash(const void* data, uint32_t bytes, uint32_t hash) {
if (!CpuInfo::is_supported(CpuInfo::SSE4_2)) {
return zlib_crc_hash(data, bytes, hash);
}
@@ -93,7 +93,7 @@ public:
return hash;
}
- static uint64_t crc_hash64(const void* data, int32_t bytes, uint64_t hash)
{
+ static uint64_t crc_hash64(const void* data, uint32_t bytes, uint64_t
hash) {
uint32_t words = bytes / sizeof(uint32_t);
bytes = bytes % sizeof(uint32_t);
@@ -125,7 +125,7 @@ public:
return converter.u64;
}
#else
- static uint32_t crc_hash(const void* data, int32_t bytes, uint32_t hash) {
+ static uint32_t crc_hash(const void* data, uint32_t bytes, uint32_t hash) {
return zlib_crc_hash(data, bytes, hash);
}
#endif
@@ -202,7 +202,7 @@ public:
// For example, if the data is <1000, 2000, 3000, 4000, ..> and then the
mod of 1000
// is taken on the hash, all values will collide to the same bucket.
// For string values, Fnv is slightly faster than boost.
- static uint32_t fnv_hash(const void* data, int32_t bytes, uint32_t hash) {
+ static uint32_t fnv_hash(const void* data, uint32_t bytes, uint32_t hash) {
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);
while (bytes--) {
@@ -213,7 +213,7 @@ public:
return hash;
}
- static uint64_t fnv_hash64(const void* data, int32_t bytes, uint64_t hash)
{
+ static uint64_t fnv_hash64(const void* data, uint32_t bytes, uint64_t
hash) {
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);
while (bytes--) {
@@ -291,7 +291,7 @@ public:
// depending on hardware capabilities.
// Seed values for different steps of the query execution should use
different seeds
// to prevent accidental key collisions. (See IMPALA-219 for more details).
- static uint32_t hash(const void* data, int32_t bytes, uint32_t seed) {
+ static uint32_t hash(const void* data, uint32_t bytes, uint32_t seed) {
#ifdef __SSE4_2__
if (LIKELY(CpuInfo::is_supported(CpuInfo::SSE4_2))) {
@@ -305,7 +305,7 @@ public:
#endif
}
- static uint64_t hash64(const void* data, int32_t bytes, uint64_t seed) {
+ static uint64_t hash64(const void* data, uint32_t bytes, uint64_t seed) {
#ifdef _SSE4_2_
if (LIKELY(CpuInfo::is_supported(CpuInfo::SSE4_2))) {
return crc_hash64(data, bytes, seed);
diff --git a/be/src/vec/columns/column_const.cpp
b/be/src/vec/columns/column_const.cpp
index f751f1d8d3e..fd05127f6d7 100644
--- a/be/src/vec/columns/column_const.cpp
+++ b/be/src/vec/columns/column_const.cpp
@@ -35,6 +35,7 @@
#include "vec/core/column_with_type_and_name.h"
namespace doris::vectorized {
+#include "common/compile_check_begin.h"
ColumnConst::ColumnConst(const ColumnPtr& data_, size_t s_) : data(data_),
s(s_) {
/// Squash Const of Const.
@@ -66,7 +67,9 @@ ColumnConst::ColumnConst(const ColumnPtr& data_, size_t s_,
bool create_with_emp
}
ColumnPtr ColumnConst::convert_to_full_column() const {
- return data->replicate(Offsets(1, s));
+ // Assuming the number of replicate rows will not exceed Offset(UInt32),
+ // currently Column::replicate only supports Uint32 Offsets
+ return data->replicate(Offsets(1, cast_set<Offset>(s)));
}
ColumnPtr ColumnConst::remove_low_cardinality() const {
diff --git a/be/src/vec/columns/column_const.h
b/be/src/vec/columns/column_const.h
index ee3860f0635..0fa22ca23bf 100644
--- a/be/src/vec/columns/column_const.h
+++ b/be/src/vec/columns/column_const.h
@@ -48,6 +48,7 @@
class SipHash;
namespace doris::vectorized {
+#include "common/compile_check_begin.h"
class Arena;
class Block;
@@ -267,7 +268,8 @@ public:
template <typename T>
T get_value() const {
- return get_field().safe_get<NearestFieldType<T>>();
+ // Here the cast is correct, relevant code is rather tricky.
+ return static_cast<T>(get_field().safe_get<NearestFieldType<T>>());
}
void replace_column_data(const IColumn& rhs, size_t row, size_t self_row =
0) override {
@@ -276,3 +278,4 @@ public:
}
};
} // namespace doris::vectorized
+#include "common/compile_check_end.h"
diff --git a/be/src/vec/columns/column_decimal.h
b/be/src/vec/columns/column_decimal.h
index 4c2f69d5ef3..946b268436e 100644
--- a/be/src/vec/columns/column_decimal.h
+++ b/be/src/vec/columns/column_decimal.h
@@ -53,6 +53,7 @@ class ColumnSorter;
} // namespace doris
namespace doris::vectorized {
+#include "common/compile_check_begin.h"
/// PaddedPODArray extended by Decimal scale
template <typename T>
@@ -261,7 +262,7 @@ protected:
for (U i = 0; i < s; ++i) res[i] = i;
auto sort_end = res.end();
- if (limit && limit < s / 8.0) {
+ if (limit && limit < static_cast<double>(s) / 8.0L) {
sort_end = res.begin() + limit;
if (reverse)
std::partial_sort(res.begin(), sort_end, res.end(),
@@ -305,3 +306,4 @@ template <typename T>
using ColumnVectorOrDecimal = typename ColumnVectorOrDecimalT<T,
IsDecimalNumber<T>>::Col;
} // namespace doris::vectorized
+#include "common/compile_check_end.h"
diff --git a/be/src/vec/columns/column_dictionary.h
b/be/src/vec/columns/column_dictionary.h
index ae7d001a31d..c4b1f3e27e0 100644
--- a/be/src/vec/columns/column_dictionary.h
+++ b/be/src/vec/columns/column_dictionary.h
@@ -29,6 +29,7 @@
#include "vec/core/types.h"
namespace doris::vectorized {
+#include "common/compile_check_begin.h"
/**
* For low cardinality string columns, using ColumnDictionary can reduce memory
@@ -265,9 +266,9 @@ public:
}
}
- int32_t find_code(const StringRef& value) const { return
_dict.find_code(value); }
+ T find_code(const StringRef& value) const { return _dict.find_code(value);
}
- int32_t find_code_by_bound(const StringRef& value, bool greater, bool eq)
const {
+ T find_code_by_bound(const StringRef& value, bool greater, bool eq) const {
return _dict.find_code_by_bound(value, greater, eq);
}
@@ -346,8 +347,9 @@ public:
_total_str_len += value.size;
}
- int32_t find_code(const StringRef& value) const {
- for (size_t i = 0; i < _dict_data->size(); i++) {
+ T find_code(const StringRef& value) const {
+ // _dict_data->size will not exceed the range of T.
+ for (T i = 0; i < _dict_data->size(); i++) {
if ((*_dict_data)[i] == value) {
return i;
}
@@ -384,11 +386,11 @@ public:
// For dictionary data of char type, sv.size is the schema
length,
// so use strnlen to remove the 0 at the end to get the actual
length.
- int32_t len = sv.size;
+ size_t len = sv.size;
if (type == FieldType::OLAP_FIELD_TYPE_CHAR) {
len = strnlen(sv.data, sv.size);
}
- uint32_t hash_val = HashUtil::crc_hash(sv.data, len, 0);
+ uint32_t hash_val = HashUtil::crc_hash(sv.data,
static_cast<uint32_t>(len), 0);
_hash_values[code] = hash_val;
_compute_hash_value_flags[code] = 1;
return _hash_values[code];
@@ -412,13 +414,14 @@ public:
// so upper_bound is the code 0 of b, then evaluate code < 0 and
returns empty
// If the predicate is col <= 'a' and upper_bound-1 is -1,
// then evaluate code <= -1 and returns empty
- int32_t find_code_by_bound(const StringRef& value, bool greater, bool
eq) const {
+ T find_code_by_bound(const StringRef& value, bool greater, bool eq)
const {
auto code = find_code(value);
if (code >= 0) {
return code;
}
- auto bound = std::upper_bound(_dict_data->begin(),
_dict_data->end(), value) -
- _dict_data->begin();
+ auto bound =
+ static_cast<T>(std::upper_bound(_dict_data->begin(),
_dict_data->end(), value) -
+ _dict_data->begin());
return greater ? bound - greater + eq : bound - eq;
}
@@ -536,3 +539,4 @@ template class ColumnDictionary<int32_t>;
using ColumnDictI32 = vectorized::ColumnDictionary<doris::vectorized::Int32>;
} // namespace doris::vectorized
+#include "common/compile_check_end.h"
diff --git a/be/src/vec/columns/column_nullable.cpp
b/be/src/vec/columns/column_nullable.cpp
index 5e34ad4d8d4..c58c78f5611 100644
--- a/be/src/vec/columns/column_nullable.cpp
+++ b/be/src/vec/columns/column_nullable.cpp
@@ -29,6 +29,7 @@
#include "vec/utils/util.hpp"
namespace doris::vectorized {
+#include "common/compile_check_begin.h"
ColumnNullable::ColumnNullable(MutableColumnPtr&& nested_column_,
MutableColumnPtr&& null_map_)
: NullMapProvider(std::move(null_map_)),
nested_column(std::move(nested_column_)) {
@@ -62,7 +63,7 @@ void ColumnNullable::update_xxHash_with_value(size_t start,
size_t end, uint64_t
} else {
const auto* __restrict real_null_data =
assert_cast<const
ColumnUInt8&>(get_null_map_column()).get_data().data();
- for (int i = start; i < end; ++i) {
+ for (size_t i = start; i < end; ++i) {
if (real_null_data[i] != 0) {
hash = HashUtil::xxHash64NullWithSeed(hash);
}
@@ -78,7 +79,7 @@ void ColumnNullable::update_crc_with_value(size_t start,
size_t end, uint32_t& h
} else {
const auto* __restrict real_null_data =
assert_cast<const
ColumnUInt8&>(get_null_map_column()).get_data().data();
- for (int i = start; i < end; ++i) {
+ for (size_t i = start; i < end; ++i) {
if (real_null_data[i] != 0) {
hash = HashUtil::zlib_crc_hash_null(hash);
}
diff --git a/be/src/vec/columns/column_nullable.h
b/be/src/vec/columns/column_nullable.h
index 252144fbc5f..83d5e6af35a 100644
--- a/be/src/vec/columns/column_nullable.h
+++ b/be/src/vec/columns/column_nullable.h
@@ -43,6 +43,7 @@
class SipHash;
namespace doris::vectorized {
+#include "common/compile_check_begin.h"
class Arena;
class ColumnSorter;
@@ -404,7 +405,8 @@ public:
}
static constexpr auto MAX_NUMBER_OF_ROWS_FOR_FULL_SEARCH = 1000;
size_t num_rows = size();
- size_t num_sampled_rows = std::min(static_cast<size_t>(num_rows *
sample_ratio), num_rows);
+ size_t num_sampled_rows = std::min(
+ static_cast<size_t>(static_cast<double>(num_rows) *
sample_ratio), num_rows);
size_t num_checked_rows = 0;
size_t res = 0;
if (num_sampled_rows == num_rows || num_rows <=
MAX_NUMBER_OF_ROWS_FOR_FULL_SEARCH) {
@@ -423,7 +425,7 @@ public:
if (num_checked_rows == 0) {
return 0.0;
}
- return static_cast<double>(res) / num_checked_rows;
+ return static_cast<double>(res) /
static_cast<double>(num_checked_rows);
}
void convert_dict_codes_if_necessary() override {
@@ -460,3 +462,4 @@ private:
ColumnPtr make_nullable(const ColumnPtr& column, bool is_nullable = false);
ColumnPtr remove_nullable(const ColumnPtr& column);
} // namespace doris::vectorized
+#include "common/compile_check_end.h"
diff --git a/be/src/vec/columns/column_object.cpp
b/be/src/vec/columns/column_object.cpp
index d5e52d07bcf..3e8d3722305 100644
--- a/be/src/vec/columns/column_object.cpp
+++ b/be/src/vec/columns/column_object.cpp
@@ -82,6 +82,7 @@
#endif
namespace doris::vectorized {
+#include "common/compile_check_begin.h"
namespace {
DataTypePtr create_array_of_type(TypeIndex type, size_t num_dimensions, bool
is_nullable) {
@@ -653,7 +654,7 @@ bool ColumnObject::Subcolumn::check_if_sparse_column(size_t
num_rows) {
defaults_ratio.push_back(data[i]->get_ratio_of_default_rows());
}
double default_ratio = std::accumulate(defaults_ratio.begin(),
defaults_ratio.end(), 0.0) /
- defaults_ratio.size();
+ static_cast<double>(defaults_ratio.size());
return default_ratio >= config::variant_ratio_of_defaults_as_sparse_column;
}
@@ -1294,7 +1295,11 @@ rapidjson::Value*
find_leaf_node_by_path(rapidjson::Value& json, const PathInDat
if (!json.IsObject()) {
return nullptr;
}
- rapidjson::Value name(current_key.data(), current_key.size());
+ /*! RapidJSON uses 32-bit array/string indices even on 64-bit platforms,
+ instead of using \c size_t. Users may override the SizeType by defining
+ \ref RAPIDJSON_NO_SIZETYPEDEFINE.
+ */
+ rapidjson::Value name(current_key.data(),
cast_set<unsigned>(current_key.size()));
auto it = json.FindMember(name);
if (it == json.MemberEnd()) {
return nullptr;
@@ -1312,7 +1317,7 @@ rapidjson::Value*
find_leaf_node_by_path(rapidjson::Value& json, const PathInDat
// 3. empty root jsonb value(not null)
// 4. type is nothing
bool skip_empty_json(const ColumnNullable* nullable, const DataTypePtr& type,
- TypeIndex base_type_id, int row, const PathInData& path) {
+ TypeIndex base_type_id, size_t row, const PathInData&
path) {
// skip nulls
if (nullable && nullable->is_null_at(row)) {
return true;
@@ -1348,7 +1353,7 @@ Status find_and_set_leave_value(const IColumn* column,
const PathInData& path,
const DataTypeSerDeSPtr& type_serde, const
DataTypePtr& type,
TypeIndex base_type_index, rapidjson::Value&
root,
rapidjson::Document::AllocatorType& allocator,
Arena& mem_pool,
- int row) {
+ size_t row) {
#ifndef NDEBUG
// sanitize type and column
if (column->get_name() != type->create_column()->get_name()) {
@@ -1416,7 +1421,7 @@ void get_json_by_column_tree(rapidjson::Value& root,
rapidjson::Document::Alloca
}
}
-Status ColumnObject::serialize_one_row_to_string(int64_t row, std::string*
output) const {
+Status ColumnObject::serialize_one_row_to_string(size_t row, std::string*
output) const {
if (!is_finalized()) {
const_cast<ColumnObject*>(this)->finalize(FinalizeMode::READ_MODE);
}
@@ -1432,7 +1437,7 @@ Status ColumnObject::serialize_one_row_to_string(int64_t
row, std::string* outpu
return Status::OK();
}
-Status ColumnObject::serialize_one_row_to_string(int64_t row, BufferWritable&
output) const {
+Status ColumnObject::serialize_one_row_to_string(size_t row, BufferWritable&
output) const {
if (!is_finalized()) {
const_cast<ColumnObject*>(this)->finalize(FinalizeMode::READ_MODE);
}
@@ -1447,7 +1452,7 @@ Status ColumnObject::serialize_one_row_to_string(int64_t
row, BufferWritable& ou
return Status::OK();
}
-Status ColumnObject::serialize_one_row_to_json_format(int64_t row,
rapidjson::StringBuffer* output,
+Status ColumnObject::serialize_one_row_to_json_format(size_t row,
rapidjson::StringBuffer* output,
bool* is_null) const {
CHECK(is_finalized());
if (subcolumns.empty()) {
diff --git a/be/src/vec/columns/column_object.h
b/be/src/vec/columns/column_object.h
index 21bb4469115..e4127197a22 100644
--- a/be/src/vec/columns/column_object.h
+++ b/be/src/vec/columns/column_object.h
@@ -272,12 +272,12 @@ public:
return
subcolumns.get_mutable_root()->data.get_finalized_column_ptr()->assume_mutable();
}
- Status serialize_one_row_to_string(int64_t row, std::string* output) const;
+ Status serialize_one_row_to_string(size_t row, std::string* output) const;
- Status serialize_one_row_to_string(int64_t row, BufferWritable& output)
const;
+ Status serialize_one_row_to_string(size_t row, BufferWritable& output)
const;
// serialize one row to json format
- Status serialize_one_row_to_json_format(int64_t row,
rapidjson::StringBuffer* output,
+ Status serialize_one_row_to_json_format(size_t row,
rapidjson::StringBuffer* output,
bool* is_null) const;
// merge multiple sub sparse columns into root
diff --git a/be/src/vec/columns/column_string.cpp
b/be/src/vec/columns/column_string.cpp
index 3caa194551b..cb83a29bbad 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -34,6 +34,7 @@
#include "vec/core/sort_block.h"
namespace doris::vectorized {
+#include "common/compile_check_begin.h"
template <typename T>
void ColumnStr<T>::sanity_check() const {
@@ -74,8 +75,8 @@ MutableColumnPtr ColumnStr<T>::clone_resized(size_t to_size)
const {
res->offsets.assign(offsets.begin(), offsets.end());
res->chars.assign(chars.begin(), chars.end());
}
-
- res->offsets.resize_fill(to_size, chars.size());
+ // If offset is uint32, size will not exceed, check the size when
inserting data into ColumnStr<T>.
+ res->offsets.resize_fill(to_size, static_cast<T>(chars.size()));
}
return res;
@@ -92,14 +93,14 @@ void ColumnStr<T>::shrink_padding_chars() {
// deal the 0-th element. no need to move.
auto next_start = offset[0];
- offset[0] = strnlen(data, size_at(0));
+ offset[0] = static_cast<T>(strnlen(data, size_at(0)));
for (size_t i = 1; i < size; i++) {
// get the i-th length and whole move it to cover the last's trailing
void
auto length = strnlen(data + next_start, offset[i] - next_start);
memmove(data + offset[i - 1], data + next_start, length);
// offset i will be changed. so save the old value for (i+1)-th to get
its length.
next_start = offset[i];
- offset[i] = offset[i - 1] + length;
+ offset[i] = offset[i - 1] + static_cast<T>(length);
}
chars.resize_fill(offsets.back()); // just call it to shrink memory here.
no possible to expand.
}
@@ -125,8 +126,8 @@ void ColumnStr<T>::insert_range_from_ignore_overflow(const
doris::vectorized::IC
"Parameter out of bound in IColumnStr<T>::insert_range_from
method.");
}
- size_t nested_offset = src_concrete.offset_at(start);
- size_t nested_length = src_concrete.offsets[start + length - 1] -
nested_offset;
+ auto nested_offset = src_concrete.offset_at(start);
+ auto nested_length = src_concrete.offsets[start + length - 1] -
nested_offset;
size_t old_chars_size = chars.size();
chars.resize(old_chars_size + nested_length);
@@ -136,7 +137,7 @@ void ColumnStr<T>::insert_range_from_ignore_overflow(const
doris::vectorized::IC
offsets.assign(src_concrete.offsets.begin(),
src_concrete.offsets.begin() + length);
} else {
size_t old_size = offsets.size();
- size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see
PaddedPODArray
+ auto prev_max_offset = offsets.back(); /// -1th index is Ok, see
PaddedPODArray
offsets.resize(old_size + length);
for (size_t i = 0; i < length; ++i) {
@@ -161,8 +162,8 @@ void ColumnStr<T>::insert_range_from(const IColumn& src,
size_t start, size_t le
doris::ErrorCode::INTERNAL_ERROR,
"Parameter out of bound in
IColumnStr<T>::insert_range_from method.");
}
- size_t nested_offset = src_offsets[static_cast<ssize_t>(start) - 1];
- size_t nested_length = src_offsets[start + length - 1] - nested_offset;
+ auto nested_offset = src_offsets[static_cast<ssize_t>(start) - 1];
+ auto nested_length = src_offsets[start + length - 1] - nested_offset;
size_t old_chars_size = chars.size();
check_chars_length(old_chars_size + nested_length, offsets.size() +
length);
@@ -174,11 +175,13 @@ void ColumnStr<T>::insert_range_from(const IColumn& src,
size_t start, size_t le
offsets.assign(src_offsets.begin(), src_offsets.begin() + length);
} else {
size_t old_size = offsets.size();
- size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see
PaddedPODArray
+ auto prev_max_offset = offsets.back(); /// -1th index is Ok, see
PaddedPODArray
offsets.resize(old_size + length);
for (size_t i = 0; i < length; ++i) {
- offsets[old_size + i] = src_offsets[start + i] - nested_offset
+ prev_max_offset;
+ // if Offsets is uint32, size will not exceed range of uint32,
cast is OK.
+ offsets[old_size + i] =
+ static_cast<T>(src_offsets[start + i] - nested_offset)
+ prev_max_offset;
}
}
};
@@ -208,7 +211,7 @@ void ColumnStr<T>::insert_many_from(const IColumn& src,
size_t position, size_t
auto prev_pos = old_chars_size;
for (; start_pos < end_pos; ++start_pos) {
memcpy(&chars[prev_pos], data_val, data_length);
- offsets[start_pos] = prev_pos + data_length;
+ offsets[start_pos] = static_cast<T>(prev_pos + data_length);
prev_pos = prev_pos + data_length;
}
}
@@ -229,7 +232,8 @@ void ColumnStr<T>::insert_indices_from(const IColumn& src,
const uint32_t* indic
for (const auto* x = indices_begin; x != indices_end; ++x) {
int64_t src_offset = *x;
total_chars_size += src_offset_data[src_offset] -
src_offset_data[src_offset - 1];
- dst_offsets_data[dst_offsets_pos++] = total_chars_size;
+ // if Offsets is uint32, size will not exceed range of uint32,
cast is OK.
+ dst_offsets_data[dst_offsets_pos++] =
static_cast<T>(total_chars_size);
}
check_chars_length(total_chars_size, offsets.size());
@@ -267,13 +271,16 @@ void ColumnStr<T>::update_crcs_with_value(uint32_t*
__restrict hashes, doris::Pr
if (null_data == nullptr) {
for (size_t i = 0; i < s; i++) {
auto data_ref = get_data_at(i);
- hashes[i] = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size,
hashes[i]);
+ // If offset is uint32, size will not exceed, check the size when
inserting data into ColumnStr<T>.
+ hashes[i] = HashUtil::zlib_crc_hash(data_ref.data,
static_cast<uint32_t>(data_ref.size),
+ hashes[i]);
}
} else {
for (size_t i = 0; i < s; i++) {
if (null_data[i] == 0) {
auto data_ref = get_data_at(i);
- hashes[i] = HashUtil::zlib_crc_hash(data_ref.data,
data_ref.size, hashes[i]);
+ hashes[i] = HashUtil::zlib_crc_hash(
+ data_ref.data, static_cast<uint32_t>(data_ref.size),
hashes[i]);
}
}
}
@@ -391,8 +398,9 @@ ColumnPtr ColumnStr<T>::permute(const IColumn::Permutation&
perm, size_t limit)
template <typename T>
StringRef ColumnStr<T>::serialize_value_into_arena(size_t n, Arena& arena,
char const*& begin) const {
- uint32_t string_size(size_at(n));
- uint32_t offset(offset_at(n));
+ // Use uint32 instead of size_t to reduce agg key's length.
+ auto string_size(static_cast<uint32_t>(size_at(n)));
+ auto offset(static_cast<uint32_t>(offset_at(n)));
StringRef res;
res.size = sizeof(string_size) + string_size;
@@ -421,7 +429,7 @@ const char*
ColumnStr<T>::deserialize_and_insert_from_arena(const char* pos) {
template <typename T>
size_t ColumnStr<T>::get_max_row_byte_size() const {
- size_t max_size = 0;
+ T max_size = 0;
size_t num_rows = offsets.size();
for (size_t i = 0; i < num_rows; ++i) {
max_size = std::max(max_size, size_at(i));
@@ -434,8 +442,9 @@ template <typename T>
void ColumnStr<T>::serialize_vec(std::vector<StringRef>& keys, size_t num_rows,
size_t max_row_byte_size) const {
for (size_t i = 0; i < num_rows; ++i) {
- uint32_t offset(offset_at(i));
- uint32_t string_size(size_at(i));
+ // Use uint32 instead of size_t to reduce agg key's length.
+ auto offset(static_cast<uint32_t>(offset_at(i)));
+ auto string_size(static_cast<uint32_t>(size_at(i)));
auto* ptr = const_cast<char*>(keys[i].data + keys[i].size);
memcpy_fixed<uint32_t>(ptr, (char*)&string_size);
@@ -458,8 +467,8 @@ void
ColumnStr<T>::serialize_vec_with_null_map(std::vector<StringRef>& keys, siz
memcpy(dest, null_map + i, sizeof(uint8_t));
if (null_map[i] == 0) {
- UInt32 offset(offset_at(i));
- UInt32 string_size(size_at(i));
+ auto offset(offset_at(i));
+ auto string_size(size_at(i));
memcpy_fixed<UInt32>(dest + 1, (char*)&string_size);
memcpy(dest + 1 + sizeof(string_size), &chars[offset],
string_size);
@@ -475,8 +484,8 @@ void
ColumnStr<T>::serialize_vec_with_null_map(std::vector<StringRef>& keys, siz
// serialize null first
memcpy(dest, null_map + i, sizeof(uint8_t));
- UInt32 offset(offset_at(i));
- UInt32 string_size(size_at(i));
+ auto offset(offset_at(i));
+ auto string_size(size_at(i));
memcpy_fixed<UInt32>(dest + 1, (char*)&string_size);
memcpy(dest + 1 + sizeof(string_size), &chars[offset],
string_size);
@@ -559,8 +568,8 @@ ColumnPtr ColumnStr<T>::replicate(const IColumn::Offsets&
replicate_offsets) con
T current_new_offset = 0;
for (size_t i = 0; i < col_size; ++i) {
- size_t size_to_replicate = replicate_offsets[i] -
prev_replicate_offset;
- size_t string_size = offsets[i] - prev_string_offset;
+ T size_to_replicate = replicate_offsets[i] - prev_replicate_offset;
+ T string_size = offsets[i] - prev_string_offset;
for (size_t j = 0; j < size_to_replicate; ++j) {
current_new_offset += string_size;
diff --git a/be/src/vec/columns/column_string.h
b/be/src/vec/columns/column_string.h
index f116d4ce1f1..1674fd90933 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -47,6 +47,7 @@
#include "vec/core/types.h"
namespace doris::vectorized {
+#include "common/compile_check_begin.h"
class Arena;
class ColumnSorter;
@@ -86,10 +87,10 @@ private:
Chars chars;
// Start position of i-th element.
- size_t ALWAYS_INLINE offset_at(ssize_t i) const { return offsets[i - 1]; }
+ T ALWAYS_INLINE offset_at(ssize_t i) const { return offsets[i - 1]; }
/// Size of i-th element, including terminating zero.
- size_t ALWAYS_INLINE size_at(ssize_t i) const { return offsets[i] -
offsets[i - 1]; }
+ T ALWAYS_INLINE size_at(ssize_t i) const { return offsets[i] - offsets[i -
1]; }
template <bool positive>
struct less;
@@ -220,7 +221,7 @@ public:
const char* ptr = strings[0].data;
for (size_t i = 0; i != num; i++) {
- uint32_t len = strings[i].size;
+ size_t len = strings[i].size;
length += len;
offset += len;
offsets.push_back(offset);
@@ -282,7 +283,7 @@ public:
Char* data = chars.data();
size_t offset = old_size;
for (size_t i = 0; i < num; i++) {
- uint32_t len = strings[i].size;
+ size_t len = strings[i].size;
if (len) {
memcpy(data + offset, strings[i].data, len);
offset += len;
@@ -305,7 +306,7 @@ public:
Char* data = chars.data();
size_t offset = old_size;
for (size_t i = 0; i < num; i++) {
- uint32_t len = strings[i].size;
+ size_t len = strings[i].size;
if (len) {
memcpy(data + offset, strings[i].data, copy_length);
offset += len;
@@ -342,9 +343,15 @@ public:
for (size_t i = 0; i < num; i++) {
int32_t codeword = data_array[i + start_index];
new_size += dict[codeword].size;
- offsets[offset_size + i] = new_size;
+ offsets[offset_size + i] = static_cast<T>(new_size);
}
+ if (new_size > std::numeric_limits<T>::max()) {
+ throw doris::Exception(ErrorCode::INTERNAL_ERROR,
+ "ColumnString insert size out of range type
{} [{},{}]",
+ typeid(T).name(),
std::numeric_limits<T>::min(),
+ std::numeric_limits<T>::max());
+ }
check_chars_length(new_size, offsets.size());
chars.resize(new_size);
@@ -406,13 +413,16 @@ public:
for (size_t i = start; i < end; ++i) {
if (null_data[i] == 0) {
auto data_ref = get_data_at(i);
- hash = HashUtil::zlib_crc_hash(data_ref.data,
data_ref.size, hash);
+ // If offset is uint32, size will not exceed, check the
size when inserting data into ColumnStr<T>.
+ hash = HashUtil::zlib_crc_hash(data_ref.data,
+
static_cast<uint32_t>(data_ref.size), hash);
}
}
} else {
for (size_t i = start; i < end; ++i) {
auto data_ref = get_data_at(i);
- hash = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size,
hash);
+ hash = HashUtil::zlib_crc_hash(data_ref.data,
static_cast<uint32_t>(data_ref.size),
+ hash);
}
}
}
@@ -473,7 +483,7 @@ public:
void insert_default() override { offsets.push_back(chars.size()); }
void insert_many_defaults(size_t length) override {
- offsets.resize_fill(offsets.size() + length, chars.size());
+ offsets.resize_fill(offsets.size() + length,
static_cast<T>(chars.size()));
}
int compare_at(size_t n, size_t m, const IColumn& rhs_,
@@ -525,3 +535,4 @@ public:
using ColumnString = ColumnStr<UInt32>;
using ColumnString64 = ColumnStr<UInt64>;
} // namespace doris::vectorized
+#include "common/compile_check_end.h"
diff --git a/be/src/vec/columns/column_vector.cpp
b/be/src/vec/columns/column_vector.cpp
index 0e24446a5cd..f0f0bec8b99 100644
--- a/be/src/vec/columns/column_vector.cpp
+++ b/be/src/vec/columns/column_vector.cpp
@@ -42,6 +42,7 @@
#include "vec/data_types/data_type.h"
namespace doris::vectorized {
+#include "common/compile_check_begin.h"
template <typename T>
StringRef ColumnVector<T>::serialize_value_into_arena(size_t n, Arena& arena,
@@ -242,7 +243,7 @@ void ColumnVector<T>::get_permutation(bool reverse, size_t
limit, int nan_direct
if (s == 0) return;
// std::partial_sort need limit << s can get performance benefit
- if (limit > (s / 8.0)) limit = 0;
+ if (limit > (s / 8.0L)) limit = 0;
if (limit) {
for (size_t i = 0; i < s; ++i) res[i] = i;
diff --git a/be/src/vec/columns/column_vector.h
b/be/src/vec/columns/column_vector.h
index 2cb320b6992..970997a9186 100644
--- a/be/src/vec/columns/column_vector.h
+++ b/be/src/vec/columns/column_vector.h
@@ -61,6 +61,7 @@ class ColumnSorter;
} // namespace doris
namespace doris::vectorized {
+#include "common/compile_check_begin.h"
/** Stuff for comparing numbers.
* Integer values are compared as usual.
@@ -178,10 +179,9 @@ public:
void insert_range_of_integer(T begin, T end) {
if constexpr (std::is_integral_v<T>) {
auto old_size = data.size();
- data.resize(old_size + (end - begin));
- for (int i = 0; i < end - begin; i++) {
- data[old_size + i] = begin + i;
- }
+ auto new_size = old_size + static_cast<size_t>(end - begin);
+ data.resize(new_size);
+ std::iota(data.begin() + old_size, data.begin() + new_size, begin);
} else {
throw doris::Exception(ErrorCode::INTERNAL_ERROR,
"double column not support
insert_range_of_integer");
@@ -409,3 +409,4 @@ protected:
};
} // namespace doris::vectorized
+#include "common/compile_check_end.h"
diff --git a/be/src/vec/core/field.h b/be/src/vec/core/field.h
index 8113dc602fb..341f65e075e 100644
--- a/be/src/vec/core/field.h
+++ b/be/src/vec/core/field.h
@@ -165,7 +165,7 @@ class JsonbField {
public:
JsonbField() = default;
- JsonbField(const char* ptr, uint32_t len) : size(len) {
+ JsonbField(const char* ptr, size_t len) : size(len) {
data = new char[size];
if (!data) {
LOG(FATAL) << "new data buffer failed, size: " << size;
@@ -213,7 +213,7 @@ public:
}
const char* get_value() const { return data; }
- uint32_t get_size() const { return size; }
+ size_t get_size() const { return size; }
bool operator<(const JsonbField& r) const {
LOG(FATAL) << "comparing between JsonbField is not supported";
@@ -252,7 +252,7 @@ public:
private:
char* data = nullptr;
- uint32_t size = 0;
+ size_t size = 0;
};
template <typename T>
@@ -498,6 +498,9 @@ public:
bool is_null() const { return which == Types::Null; }
+ // The template parameter T needs to be consistent with `which`.
+ // If not, use NearestFieldType<> externally.
+ // Maybe modify this in the future, reference:
https://github.com/ClickHouse/ClickHouse/pull/22003
template <typename T>
T& get() {
using TWithoutRef = std::remove_reference_t<T>;
@@ -520,6 +523,8 @@ public:
return true;
}
+ // The template parameter T needs to be consistent with `which`.
+ // If not, use NearestFieldType<> externally.
template <typename T>
bool try_get(T& result) const {
const Types::Which requested = TypeToEnum<std::decay_t<T>>::value;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]