Vishwanatha-HD commented on code in PR #48207:
URL: https://github.com/apache/arrow/pull/48207#discussion_r2564839938
##########
cpp/src/parquet/statistics.cc:
##########
@@ -925,22 +926,94 @@ void TypedStatisticsImpl<DType>::UpdateSpaced(const T*
values, const uint8_t* va
template <typename DType>
void TypedStatisticsImpl<DType>::PlainEncode(const T& src, std::string* dst)
const {
+#if ARROW_LITTLE_ENDIAN
auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_,
pool_);
encoder->Put(&src, 1);
auto buffer = encoder->FlushValues();
auto ptr = reinterpret_cast<const char*>(buffer->data());
dst->assign(ptr, static_cast<size_t>(buffer->size()));
+#else
+ // For fixed-width numeric types, write explicit little-endian bytes per spec
+ if constexpr (std::is_same_v<DType, Int32Type>) {
+ uint32_t u;
+ std::memcpy(&u, &src, sizeof(u));
+ u = ::arrow::bit_util::ToLittleEndian(u);
+ dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+ return;
+ } else if constexpr (std::is_same_v<DType, Int64Type>) {
+ uint64_t u;
+ std::memcpy(&u, &src, sizeof(u));
+ u = ::arrow::bit_util::ToLittleEndian(u);
+ dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+ return;
+ } else if constexpr (std::is_same_v<DType, FloatType>) {
+ uint32_t u;
+ static_assert(sizeof(u) == sizeof(float), "size");
+ std::memcpy(&u, &src, sizeof(u));
+ u = ::arrow::bit_util::ToLittleEndian(u);
+ dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+ return;
+ } else if constexpr (std::is_same_v<DType, DoubleType>) {
+ uint64_t u;
+ static_assert(sizeof(u) == sizeof(double), "size");
+ std::memcpy(&u, &src, sizeof(u));
+ u = ::arrow::bit_util::ToLittleEndian(u);
+ dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+ return;
+ }
+ // Fallback: use encoder for other types
+ auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_,
pool_);
+ encoder->Put(&src, 1);
+ auto buffer = encoder->FlushValues();
+ dst->assign(reinterpret_cast<const char*>(buffer->data()),
+ static_cast<size_t>(buffer->size()));
Review Comment:
@kou.. Thanks for the comments.. I have modified in order to reuse the above
implementation, both in PlainEncode and PlainDecode logic.. I have pushed the
changes..
##########
cpp/src/parquet/statistics.cc:
##########
@@ -925,22 +926,94 @@ void TypedStatisticsImpl<DType>::UpdateSpaced(const T*
values, const uint8_t* va
template <typename DType>
void TypedStatisticsImpl<DType>::PlainEncode(const T& src, std::string* dst)
const {
+#if ARROW_LITTLE_ENDIAN
auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_,
pool_);
encoder->Put(&src, 1);
auto buffer = encoder->FlushValues();
auto ptr = reinterpret_cast<const char*>(buffer->data());
dst->assign(ptr, static_cast<size_t>(buffer->size()));
+#else
+ // For fixed-width numeric types, write explicit little-endian bytes per spec
+ if constexpr (std::is_same_v<DType, Int32Type>) {
+ uint32_t u;
+ std::memcpy(&u, &src, sizeof(u));
+ u = ::arrow::bit_util::ToLittleEndian(u);
+ dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+ return;
+ } else if constexpr (std::is_same_v<DType, Int64Type>) {
+ uint64_t u;
+ std::memcpy(&u, &src, sizeof(u));
+ u = ::arrow::bit_util::ToLittleEndian(u);
+ dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+ return;
+ } else if constexpr (std::is_same_v<DType, FloatType>) {
+ uint32_t u;
+ static_assert(sizeof(u) == sizeof(float), "size");
+ std::memcpy(&u, &src, sizeof(u));
+ u = ::arrow::bit_util::ToLittleEndian(u);
+ dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+ return;
+ } else if constexpr (std::is_same_v<DType, DoubleType>) {
+ uint64_t u;
+ static_assert(sizeof(u) == sizeof(double), "size");
+ std::memcpy(&u, &src, sizeof(u));
+ u = ::arrow::bit_util::ToLittleEndian(u);
+ dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+ return;
+ }
+ // Fallback: use encoder for other types
+ auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_,
pool_);
+ encoder->Put(&src, 1);
+ auto buffer = encoder->FlushValues();
+ dst->assign(reinterpret_cast<const char*>(buffer->data()),
+ static_cast<size_t>(buffer->size()));
+#endif
}
template <typename DType>
void TypedStatisticsImpl<DType>::PlainDecode(const std::string& src, T* dst)
const {
+#if ARROW_LITTLE_ENDIAN
Review Comment:
@pitrou.. I have explained in detailed above as to why this change is
required. If you are fine, then this code can be optimized as it is shown for
PlainEncode type.
##########
cpp/src/parquet/statistics.cc:
##########
@@ -925,22 +926,94 @@ void TypedStatisticsImpl<DType>::UpdateSpaced(const T*
values, const uint8_t* va
template <typename DType>
void TypedStatisticsImpl<DType>::PlainEncode(const T& src, std::string* dst)
const {
+#if ARROW_LITTLE_ENDIAN
auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_,
pool_);
encoder->Put(&src, 1);
Review Comment:
Hi @pitrou & @kiszk
As you both know, the expectation from the PlainEncode is that
Convert the value src into a sequence of bytes and make sure those bytes are
little-endian (required by Parquet). Because different numeric types have
different sizes, alignment requirements and representations, and Parquet
requires that each one be written in strict little-endian format,
byte-for-byte, we need to handle the individual types seperately..
If you want I can come up with an optimized version of PlainEncode as shown
below. This will simplify the main function since the helper function is
handling most of the logic..
PlainEncoder:
template <typename T>
T ToLittleEndianValue(const T& value) {
#if ARROW_LITTLE_ENDIAN
return value;
#else
// Integer → swap using Arrow helpers
if constexpr (std::is_integral_v<T>) {
return arrow::bit_util::ToLittleEndian(value);
}
// Floating point → reinterpret as integer, swap, reinterpret back.
else if constexpr (std::is_floating_point_v<T>) {
using UInt =
std::conditional_t<sizeof(T) == 4, uint32_t,
std::conditional_t<sizeof(T) == 8, uint64_t, void>>;
UInt u;
std::memcpy(&u, &value, sizeof(u));
u = arrow::bit_util::ToLittleEndian(u);
T out;
std::memcpy(&out, &u, sizeof(out));
return out;
}
// Otherwise: return as-is (variable-length types handled by encoder)
else {
return value;
}
#endif
}
Simplified main PlainEncode function >>>>>
template <typename DType>
void TypedStatisticsImpl<DType>::PlainEncode(const T& src, std::string* dst)
const {
using CType = typename DType::c_type;
if constexpr (std::is_arithmetic_v<CType>) {
// fixed-width numeric fast path (works for LE and BE)
CType le_value = ToLittleEndianValue(src);
dst->assign(reinterpret_cast<const char*>(&le_value), sizeof(le_value));
return;
}
// Fallback for non-numeric types
auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_,
pool_);
encoder->Put(&src, 1);
auto buffer = encoder->FlushValues();
dst->assign(reinterpret_cast<const char*>(buffer->data()),
static_cast<size_t>(buffer->size()));
}
Similarly, I can write a generic FromLittleEndian helper function and
simplify the PlainDecode function.. Please let me know.. Thanks..
##########
cpp/src/parquet/statistics.cc:
##########
@@ -925,22 +926,94 @@ void TypedStatisticsImpl<DType>::UpdateSpaced(const T*
values, const uint8_t* va
template <typename DType>
void TypedStatisticsImpl<DType>::PlainEncode(const T& src, std::string* dst)
const {
+#if ARROW_LITTLE_ENDIAN
auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_,
pool_);
encoder->Put(&src, 1);
auto buffer = encoder->FlushValues();
auto ptr = reinterpret_cast<const char*>(buffer->data());
dst->assign(ptr, static_cast<size_t>(buffer->size()));
+#else
+ // For fixed-width numeric types, write explicit little-endian bytes per spec
+ if constexpr (std::is_same_v<DType, Int32Type>) {
+ uint32_t u;
+ std::memcpy(&u, &src, sizeof(u));
+ u = ::arrow::bit_util::ToLittleEndian(u);
+ dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+ return;
+ } else if constexpr (std::is_same_v<DType, Int64Type>) {
+ uint64_t u;
+ std::memcpy(&u, &src, sizeof(u));
+ u = ::arrow::bit_util::ToLittleEndian(u);
+ dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+ return;
+ } else if constexpr (std::is_same_v<DType, FloatType>) {
+ uint32_t u;
+ static_assert(sizeof(u) == sizeof(float), "size");
+ std::memcpy(&u, &src, sizeof(u));
+ u = ::arrow::bit_util::ToLittleEndian(u);
+ dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+ return;
+ } else if constexpr (std::is_same_v<DType, DoubleType>) {
+ uint64_t u;
+ static_assert(sizeof(u) == sizeof(double), "size");
+ std::memcpy(&u, &src, sizeof(u));
+ u = ::arrow::bit_util::ToLittleEndian(u);
+ dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+ return;
+ }
Review Comment:
@kou.. I tried modifying the XXXEncoder::Put() function to have this
functionality.. Unfortunately, it didnt work.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]