This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new 38e529cd296 [cherry-pick](branch-2.1) support decimal256 for parquet reader (#42241)
38e529cd296 is described below
commit 38e529cd2969dc9088bc2a5798ada93f008b1ae8
Author: Socrates <[email protected]>
AuthorDate: Tue Oct 22 19:42:09 2024 +0800
[cherry-pick](branch-2.1) support decimal256 for parquet reader (#42241)
## Proposed changes
Cherry-pick of PR https://github.com/apache/doris/pull/41526 onto branch-2.1.
---
be/src/gutil/endian.h | 13 +++++++++++--
be/src/util/bit_util.h | 9 ++++++++-
be/src/vec/core/wide_integer.h | 1 +
.../exec/format/parquet/parquet_column_convert.cpp | 5 ++++-
.../vec/exec/format/parquet/parquet_column_convert.h | 19 ++++++++++++++++++-
be/test/util/bit_util_test.cpp | 19 ++++++++++++++++++-
.../hdfs_tvf/test_parquet_decimal256.parquet | Bin 0 -> 1320 bytes
.../external_table_p0/tvf/test_hdfs_tvf.groovy | 8 ++++++++
8 files changed, 68 insertions(+), 6 deletions(-)
diff --git a/be/src/gutil/endian.h b/be/src/gutil/endian.h
index f1a9cf2a1a2..6af893ea7a3 100644
--- a/be/src/gutil/endian.h
+++ b/be/src/gutil/endian.h
@@ -61,8 +61,8 @@ inline unsigned __int128 gbswap_128(unsigned __int128
host_int) {
}
inline wide::UInt256 gbswap_256(wide::UInt256 host_int) {
- wide::UInt256 result{gbswap_64(host_int.items[3]),
gbswap_64(host_int.items[2]),
- gbswap_64(host_int.items[1]),
gbswap_64(host_int.items[0])};
+ wide::UInt256 result {gbswap_64(host_int.items[3]),
gbswap_64(host_int.items[2]),
+ gbswap_64(host_int.items[1]),
gbswap_64(host_int.items[0])};
return result;
}
@@ -137,6 +137,9 @@ public:
static unsigned __int128 FromHost128(unsigned __int128 x) { return x; }
static unsigned __int128 ToHost128(unsigned __int128 x) { return x; }
+ static wide::UInt256 FromHost256(wide::UInt256 x) { return x; }
+ static wide::UInt256 ToHost256(wide::UInt256 x) { return x; }
+
static bool IsLittleEndian() { return true; }
#elif defined IS_BIG_ENDIAN
@@ -150,6 +153,12 @@ public:
static uint64 FromHost64(uint64 x) { return gbswap_64(x); }
static uint64 ToHost64(uint64 x) { return gbswap_64(x); }
+ static unsigned __int128 FromHost128(unsigned __int128 x) { return
gbswap_128(x); }
+ static unsigned __int128 ToHost128(unsigned __int128 x) { return
gbswap_128(x); }
+
+ static wide::UInt256 FromHost256(wide::UInt256 x) { return gbswap_256(x); }
+ static wide::UInt256 ToHost256(wide::UInt256 x) { return gbswap_256(x); }
+
static bool IsLittleEndian() { return false; }
#endif /* ENDIAN */
diff --git a/be/src/util/bit_util.h b/be/src/util/bit_util.h
index 6934f45ef3e..6b7385c0613 100644
--- a/be/src/util/bit_util.h
+++ b/be/src/util/bit_util.h
@@ -20,6 +20,9 @@
#pragma once
+#include <type_traits>
+
+#include "vec/core/wide_integer.h"
#ifndef __APPLE__
#include <endian.h>
#endif
@@ -209,7 +212,11 @@ public:
template <typename T>
static T big_endian_to_host(T value) {
- if constexpr (std::is_same_v<T, __int128>) {
+ if constexpr (std::is_same_v<T, wide::Int256>) {
+ return BigEndian::ToHost256(value);
+ } else if constexpr (std::is_same_v<T, wide::UInt256>) {
+ return BigEndian::ToHost256(value);
+ } else if constexpr (std::is_same_v<T, __int128>) {
return BigEndian::ToHost128(value);
} else if constexpr (std::is_same_v<T, unsigned __int128>) {
return BigEndian::ToHost128(value);
diff --git a/be/src/vec/core/wide_integer.h b/be/src/vec/core/wide_integer.h
index e7902e414a8..261a41d16b9 100644
--- a/be/src/vec/core/wide_integer.h
+++ b/be/src/vec/core/wide_integer.h
@@ -40,6 +40,7 @@
// and modified by Doris
#pragma once
+#include <cstddef>
#include <cstdint>
#include <initializer_list>
#include <limits>
diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
index 2fb0afea82a..0a5ef2913dd 100644
--- a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
+++ b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
@@ -19,6 +19,7 @@
#include <cctz/time_zone.h>
+#include "runtime/define_primitive_type.h"
#include "vec/columns/column_nullable.h"
namespace doris::vectorized::parquet {
const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone();
@@ -27,7 +28,8 @@ const cctz::time_zone ConvertParams::utc0 =
cctz::utc_time_zone();
M(TYPE_DECIMALV2) \
M(TYPE_DECIMAL32) \
M(TYPE_DECIMAL64) \
- M(TYPE_DECIMAL128I)
+ M(TYPE_DECIMAL128I) \
+ M(TYPE_DECIMAL256)
bool PhysicalToLogicalConverter::is_parquet_native_type(PrimitiveType type) {
switch (type) {
@@ -50,6 +52,7 @@ bool
PhysicalToLogicalConverter::is_decimal_type(doris::PrimitiveType type) {
case TYPE_DECIMAL32:
case TYPE_DECIMAL64:
case TYPE_DECIMAL128I:
+ case TYPE_DECIMAL256:
case TYPE_DECIMALV2:
return true;
default:
diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h
b/be/src/vec/exec/format/parquet/parquet_column_convert.h
index 91b81121aa4..cf6f8aa13fa 100644
--- a/be/src/vec/exec/format/parquet/parquet_column_convert.h
+++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h
@@ -20,6 +20,7 @@
#include <gen_cpp/parquet_types.h>
#include "vec/core/types.h"
+#include "vec/core/wide_integer.h"
#include "vec/data_types/data_type_factory.hpp"
#include "vec/exec/format/column_type_convert.h"
#include "vec/exec/format/format_common.h"
@@ -401,7 +402,23 @@ public:
M(13, int128_t) \
M(14, int128_t) \
M(15, int128_t) \
- M(16, int128_t)
+ M(16, int128_t) \
+ M(17, wide::Int256) \
+ M(18, wide::Int256) \
+ M(19, wide::Int256) \
+ M(20, wide::Int256) \
+ M(21, wide::Int256) \
+ M(22, wide::Int256) \
+ M(23, wide::Int256) \
+ M(24, wide::Int256) \
+ M(25, wide::Int256) \
+ M(26, wide::Int256) \
+ M(27, wide::Int256) \
+ M(28, wide::Int256) \
+ M(29, wide::Int256) \
+ M(30, wide::Int256) \
+ M(31, wide::Int256) \
+ M(32, wide::Int256)
switch (_type_length) {
APPLY_FOR_DECIMALS()
diff --git a/be/test/util/bit_util_test.cpp b/be/test/util/bit_util_test.cpp
index 514daafa604..fd3bee01432 100644
--- a/be/test/util/bit_util_test.cpp
+++ b/be/test/util/bit_util_test.cpp
@@ -21,7 +21,6 @@
#include <gtest/gtest-test-part.h>
#include <boost/utility/binary.hpp>
-#include <memory>
#include "gtest/gtest_pred_impl.h"
@@ -48,4 +47,22 @@ TEST(BitUtil, Popcount) {
EXPECT_EQ(BitUtil::popcount_no_hw(0), 0);
}
+TEST(BitUtil, BigEndianToHost) {
+ uint16_t v16 = 0x1234;
+ uint32_t v32 = 0x12345678;
+ uint64_t v64 = 0x123456789abcdef0;
+ unsigned __int128 v128 = ((__int128)0x123456789abcdef0LL << 64) |
0x123456789abcdef0LL;
+ wide::UInt256 v256 =
+ wide::UInt256(0x123456789abcdef0) << 192 |
wide::UInt256(0x123456789abcdef0) << 128 |
+ wide::UInt256(0x123456789abcdef0) << 64 |
wide::UInt256(0x123456789abcdef0);
+ EXPECT_EQ(BitUtil::big_endian_to_host(v16), 0x3412);
+ EXPECT_EQ(BitUtil::big_endian_to_host(v32), 0x78563412);
+ EXPECT_EQ(BitUtil::big_endian_to_host(v64), 0xf0debc9a78563412);
+ EXPECT_EQ(BitUtil::big_endian_to_host(v128),
+ ((__int128)0xf0debc9a78563412LL << 64) | 0xf0debc9a78563412LL);
+ EXPECT_EQ(BitUtil::big_endian_to_host(v256),
+ wide::UInt256(0xf0debc9a78563412) << 192 |
wide::UInt256(0xf0debc9a78563412) << 128 |
+ wide::UInt256(0xf0debc9a78563412) << 64 |
wide::UInt256(0xf0debc9a78563412));
+}
+
} // namespace doris
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet
new file mode 100644
index 00000000000..323ded32160
Binary files /dev/null and
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet
differ
diff --git a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
index be27e213f6b..764c4842229 100644
--- a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
+++ b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
@@ -116,6 +116,14 @@ suite("test_hdfs_tvf","external,hive,tvf,external_docker")
{
"hadoop.username" = "${hdfsUserName}",
"format" = "${format}") order by s_suppkey limit
20; """
+ // test parquet decimal256
+ uri = "${defaultFS}" +
"/user/doris/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet"
+ format = "parquet"
+ qt_parquet_decimal256 """ select * from HDFS(
+ "uri" = "${uri}",
+ "hadoop.username" = "${hdfsUserName}",
+ "format" = "${format}") order by id; """
+
// test orc
uri = "${defaultFS}" +
"/user/doris/preinstalled_data/hdfs_tvf/test_orc.snappy.orc"
format = "orc"
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]