Re: [PR] GH-48206: [C++][Parquet] Fix statistics logic on s390x [arrow]

via GitHub Wed, 26 Nov 2025 04:38:02 -0800


Vishwanatha-HD commented on code in PR #48207:
URL: https://github.com/apache/arrow/pull/48207#discussion_r2564839938



##########
cpp/src/parquet/statistics.cc:
##########
@@ -925,22 +926,94 @@ void TypedStatisticsImpl<DType>::UpdateSpaced(const T* 
values, const uint8_t* va
 
 template <typename DType>
 void TypedStatisticsImpl<DType>::PlainEncode(const T& src, std::string* dst) 
const {
+#if ARROW_LITTLE_ENDIAN
   auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_, 
pool_);
   encoder->Put(&src, 1);
   auto buffer = encoder->FlushValues();
   auto ptr = reinterpret_cast<const char*>(buffer->data());
   dst->assign(ptr, static_cast<size_t>(buffer->size()));
+#else
+  // For fixed-width numeric types, write explicit little-endian bytes per spec
+  if constexpr (std::is_same_v<DType, Int32Type>) {
+    uint32_t u;
+    std::memcpy(&u, &src, sizeof(u));
+    u = ::arrow::bit_util::ToLittleEndian(u);
+    dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+    return;
+  } else if constexpr (std::is_same_v<DType, Int64Type>) {
+    uint64_t u;
+    std::memcpy(&u, &src, sizeof(u));
+    u = ::arrow::bit_util::ToLittleEndian(u);
+    dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+    return;
+  } else if constexpr (std::is_same_v<DType, FloatType>) {
+    uint32_t u;
+    static_assert(sizeof(u) == sizeof(float), "size");
+    std::memcpy(&u, &src, sizeof(u));
+    u = ::arrow::bit_util::ToLittleEndian(u);
+    dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+    return;
+  } else if constexpr (std::is_same_v<DType, DoubleType>) {
+    uint64_t u;
+    static_assert(sizeof(u) == sizeof(double), "size");
+    std::memcpy(&u, &src, sizeof(u));
+    u = ::arrow::bit_util::ToLittleEndian(u);
+    dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+    return;
+  }
+  // Fallback: use encoder for other types
+  auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_, 
pool_);
+  encoder->Put(&src, 1);
+  auto buffer = encoder->FlushValues();
+  dst->assign(reinterpret_cast<const char*>(buffer->data()),
+              static_cast<size_t>(buffer->size()));

Review Comment:
   @kou.. Thanks for the comments.. I have modified in order to reuse the above 
implementation, both in PlainEncode and PlainDecode logic.. I have pushed the 
changes.. 



##########
cpp/src/parquet/statistics.cc:
##########
@@ -925,22 +926,94 @@ void TypedStatisticsImpl<DType>::UpdateSpaced(const T* 
values, const uint8_t* va
 
 template <typename DType>
 void TypedStatisticsImpl<DType>::PlainEncode(const T& src, std::string* dst) 
const {
+#if ARROW_LITTLE_ENDIAN
   auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_, 
pool_);
   encoder->Put(&src, 1);
   auto buffer = encoder->FlushValues();
   auto ptr = reinterpret_cast<const char*>(buffer->data());
   dst->assign(ptr, static_cast<size_t>(buffer->size()));
+#else
+  // For fixed-width numeric types, write explicit little-endian bytes per spec
+  if constexpr (std::is_same_v<DType, Int32Type>) {
+    uint32_t u;
+    std::memcpy(&u, &src, sizeof(u));
+    u = ::arrow::bit_util::ToLittleEndian(u);
+    dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+    return;
+  } else if constexpr (std::is_same_v<DType, Int64Type>) {
+    uint64_t u;
+    std::memcpy(&u, &src, sizeof(u));
+    u = ::arrow::bit_util::ToLittleEndian(u);
+    dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+    return;
+  } else if constexpr (std::is_same_v<DType, FloatType>) {
+    uint32_t u;
+    static_assert(sizeof(u) == sizeof(float), "size");
+    std::memcpy(&u, &src, sizeof(u));
+    u = ::arrow::bit_util::ToLittleEndian(u);
+    dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+    return;
+  } else if constexpr (std::is_same_v<DType, DoubleType>) {
+    uint64_t u;
+    static_assert(sizeof(u) == sizeof(double), "size");
+    std::memcpy(&u, &src, sizeof(u));
+    u = ::arrow::bit_util::ToLittleEndian(u);
+    dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+    return;
+  }
+  // Fallback: use encoder for other types
+  auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_, 
pool_);
+  encoder->Put(&src, 1);
+  auto buffer = encoder->FlushValues();
+  dst->assign(reinterpret_cast<const char*>(buffer->data()),
+              static_cast<size_t>(buffer->size()));
+#endif
 }
 
 template <typename DType>
 void TypedStatisticsImpl<DType>::PlainDecode(const std::string& src, T* dst) 
const {
+#if ARROW_LITTLE_ENDIAN

Review Comment:
   @pitrou.. I have explained in detailed above as to why this change is 
required. If you are fine, then this code can be optimized as it is shown for 
PlainEncode type.



##########
cpp/src/parquet/statistics.cc:
##########
@@ -925,22 +926,94 @@ void TypedStatisticsImpl<DType>::UpdateSpaced(const T* 
values, const uint8_t* va
 
 template <typename DType>
 void TypedStatisticsImpl<DType>::PlainEncode(const T& src, std::string* dst) 
const {
+#if ARROW_LITTLE_ENDIAN
   auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_, 
pool_);
   encoder->Put(&src, 1);

Review Comment:
   Hi @pitrou & @kiszk 
   As you both know, the expectation from the PlainEncode is that 
   
   Convert the value src into a sequence of bytes and make sure those bytes are 
little-endian (required by Parquet). Because different numeric types have 
different sizes, alignment requirements and representations, and Parquet 
requires that each one be written in strict little-endian format, 
byte-for-byte, we need to handle the individual types seperately.. 
   
   If you want I can come up with an optimized version of PlainEncode as shown 
below. This will simplify the main function since the helper function is 
handling most of the logic..
   
   PlainEncoder:
   template <typename T>
   T ToLittleEndianValue(const T& value) {
   #if ARROW_LITTLE_ENDIAN
     return value;
   #else
     // Integer → swap using Arrow helpers
     if constexpr (std::is_integral_v<T>) {
       return arrow::bit_util::ToLittleEndian(value);
     }
   
     // Floating point → reinterpret as integer, swap, reinterpret back.
     else if constexpr (std::is_floating_point_v<T>) {
       using UInt =
           std::conditional_t<sizeof(T) == 4, uint32_t,
           std::conditional_t<sizeof(T) == 8, uint64_t, void>>;
   
       UInt u;
       std::memcpy(&u, &value, sizeof(u));
       u = arrow::bit_util::ToLittleEndian(u);
   
       T out;
       std::memcpy(&out, &u, sizeof(out));
       return out;
     }
   
     // Otherwise: return as-is (variable-length types handled by encoder)
     else {
       return value;
     }
   #endif
   }
   
   Simplified main PlainEncode function >>>>>
   template <typename DType>
   void TypedStatisticsImpl<DType>::PlainEncode(const T& src, std::string* dst) 
const {
     using CType = typename DType::c_type;
   
     if constexpr (std::is_arithmetic_v<CType>) {
       // fixed-width numeric fast path (works for LE and BE)
       CType le_value = ToLittleEndianValue(src);
       dst->assign(reinterpret_cast<const char*>(&le_value), sizeof(le_value));
       return;
     }
   
     // Fallback for non-numeric types
     auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_, 
pool_);
     encoder->Put(&src, 1);
     auto buffer = encoder->FlushValues();
     dst->assign(reinterpret_cast<const char*>(buffer->data()),
                 static_cast<size_t>(buffer->size()));
   }
   
   Similarly, I can write a generic FromLittleEndian helper function and 
simplify the PlainDecode function.. Please let me know.. Thanks.. 



##########
cpp/src/parquet/statistics.cc:
##########
@@ -925,22 +926,94 @@ void TypedStatisticsImpl<DType>::UpdateSpaced(const T* 
values, const uint8_t* va
 
 template <typename DType>
 void TypedStatisticsImpl<DType>::PlainEncode(const T& src, std::string* dst) 
const {
+#if ARROW_LITTLE_ENDIAN
   auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_, 
pool_);
   encoder->Put(&src, 1);
   auto buffer = encoder->FlushValues();
   auto ptr = reinterpret_cast<const char*>(buffer->data());
   dst->assign(ptr, static_cast<size_t>(buffer->size()));
+#else
+  // For fixed-width numeric types, write explicit little-endian bytes per spec
+  if constexpr (std::is_same_v<DType, Int32Type>) {
+    uint32_t u;
+    std::memcpy(&u, &src, sizeof(u));
+    u = ::arrow::bit_util::ToLittleEndian(u);
+    dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+    return;
+  } else if constexpr (std::is_same_v<DType, Int64Type>) {
+    uint64_t u;
+    std::memcpy(&u, &src, sizeof(u));
+    u = ::arrow::bit_util::ToLittleEndian(u);
+    dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+    return;
+  } else if constexpr (std::is_same_v<DType, FloatType>) {
+    uint32_t u;
+    static_assert(sizeof(u) == sizeof(float), "size");
+    std::memcpy(&u, &src, sizeof(u));
+    u = ::arrow::bit_util::ToLittleEndian(u);
+    dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+    return;
+  } else if constexpr (std::is_same_v<DType, DoubleType>) {
+    uint64_t u;
+    static_assert(sizeof(u) == sizeof(double), "size");
+    std::memcpy(&u, &src, sizeof(u));
+    u = ::arrow::bit_util::ToLittleEndian(u);
+    dst->assign(reinterpret_cast<const char*>(&u), sizeof(u));
+    return;
+  }

Review Comment:
   @kou.. I tried modifying the XXXEncoder::Put() function to have this 
functionality.. Unfortunately, it didnt work.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] GH-48206: [C++][Parquet] Fix statistics logic on s390x [arrow]

Reply via email to