This is an automated email from the ASF dual-hosted git repository.

pitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 546d42c71a GH-50184: [C++][Parquet] Avoid reading past truncated 
statistics values in FormatStatValue (#50025)
546d42c71a is described below

commit 546d42c71a467c3741df2d6ececad966fc66bf4c
Author: jmestwa-coder <[email protected]>
AuthorDate: Wed Jun 17 13:50:02 2026 +0530

    GH-50184: [C++][Parquet] Avoid reading past truncated statistics values in 
FormatStatValue (#50025)
    
    ### Rationale for this change
    
    `FormatStatValue` in `cpp/src/parquet/types.cc` does fixed-width 
`memcpy`/byte loads on a statistics value that comes straight out of the file's 
Thrift-encoded metadata, so its length is attacker controlled. The Parquet spec 
[allows truncated 
statistics](https://github.com/apache/parquet-format/blob/325b9f4a70663c88485d4d59fdc1a121d7497943/src/main/thrift/parquet.thrift#L295-L301),
 so a `min_value`/`max_value` shorter than the column's physical type (for 
example a zero-byte stat on an [...]
    
    ### What changes are included in this PR?
    
    Each fixed-width read in `FormatStatValue` and the helpers it delegates to 
now clamps its copy to `val.size()` instead of assuming the full type width is 
present: the `BOOLEAN`/`INT32`/`INT64`/`INT96`/`FLOAT`/`DOUBLE` numeric loads, 
the int32/int64-backed decimal loads, and the float16 `FIXED_LEN_BYTE_ARRAY` 
load. A truncated value is zero-padded rather than over-read, and no exception 
is raised since truncated statistics are spec-legal. The variable-length 
`FIXED_LEN_BYTE_ARRAY` (dec [...]
    
    ### Are these changes tested?
    
    Yes. `TypePrinter.StatisticsTypesShortValue` in `types_test.cc` formats a 
too-short value for each path and checks it returns without reading past the 
buffer (caught under ASan).
    
    ### Are there any user-facing changes?
    
    No API change. A truncated statistics value is formatted from the bytes 
that are present instead of reading out of bounds.
    
    **This PR contains a "Critical Fix".** It stops an out-of-bounds read on a 
buffer whose length is controlled by the input Parquet file.
    
    * GitHub Issue: #50184
    
    Authored-by: jmestwa-coder <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/types.cc      | 25 ++++++++++++++++++-------
 cpp/src/parquet/types_test.cc | 18 ++++++++++++++++++
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc
index ff5bd4eafb..9d7604faec 100644
--- a/cpp/src/parquet/types.cc
+++ b/cpp/src/parquet/types.cc
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <algorithm>
 #include <array>
 #include <cmath>
 #include <cstdint>
@@ -110,8 +111,11 @@ template <typename T>
 std::enable_if_t<std::is_arithmetic_v<T>, std::string> FormatNumericValue(
     ::std::string_view val) {
   std::stringstream result;
+  // The statistics value comes from the file's Thrift metadata, so its length
+  // is attacker controlled and the spec allows it to be truncated. Only copy
+  // the bytes that are actually present and leave the rest zero-padded.
   T value{};
-  std::memcpy(&value, val.data(), sizeof(T));
+  std::memcpy(&value, val.data(), std::min(sizeof(T), val.size()));
   result << value;
   return result.str();
 }
@@ -128,14 +132,14 @@ std::string FormatDecimalValue(Type::type parquet_type, 
::std::string_view val,
   switch (parquet_type) {
     case Type::INT32: {
       int32_t int_value{};
-      std::memcpy(&int_value, val.data(), sizeof(int32_t));
+      std::memcpy(&int_value, val.data(), std::min(sizeof(int32_t), 
val.size()));
       ::arrow::Decimal128 decimal_value(int_value);
       result << decimal_value.ToString(scale);
       break;
     }
     case Type::INT64: {
       int64_t long_value{};
-      std::memcpy(&long_value, val.data(), sizeof(int64_t));
+      std::memcpy(&long_value, val.data(), std::min(sizeof(int64_t), 
val.size()));
       ::arrow::Decimal128 decimal_value(long_value);
       result << decimal_value.ToString(scale);
       break;
@@ -174,8 +178,11 @@ std::string FormatNonUTF8Value(::std::string_view val) {
 
 std::string FormatFloat16Value(::std::string_view val) {
   std::stringstream result;
-  auto float16 = ::arrow::util::Float16::FromLittleEndian(
-      reinterpret_cast<const uint8_t*>(val.data()));
+  // FromLittleEndian reads two bytes unconditionally; a truncated stat value
+  // may be shorter, so copy into a zero-padded buffer first.
+  std::array<uint8_t, 2> bytes{};
+  std::memcpy(bytes.data(), val.data(), std::min(bytes.size(), val.size()));
+  auto float16 = ::arrow::util::Float16::FromLittleEndian(bytes.data());
   result << float16.ToFloat();
   return result.str();
 }
@@ -184,12 +191,16 @@ std::string FormatFloat16Value(::std::string_view val) {
 
 std::string FormatStatValue(Type::type parquet_type, ::std::string_view val,
                             const std::shared_ptr<const LogicalType>& 
logical_type) {
+  // Statistics values come straight from the file's Thrift metadata, so their
+  // length is attacker controlled and the spec allows them to be truncated.
+  // Every fixed-width read below clamps its copy to val.size() so a too-short
+  // value cannot drive a load past the end of the buffer.
   std::stringstream result;
   const char* bytes = val.data();
   switch (parquet_type) {
     case Type::BOOLEAN: {
       bool value{};
-      std::memcpy(&value, bytes, sizeof(bool));
+      std::memcpy(&value, bytes, std::min(sizeof(bool), val.size()));
       result << value;
       break;
     }
@@ -213,7 +224,7 @@ std::string FormatStatValue(Type::type parquet_type, 
::std::string_view val,
     }
     case Type::INT96: {
       std::array<int32_t, 3> values{};
-      std::memcpy(values.data(), bytes, 3 * sizeof(int32_t));
+      std::memcpy(values.data(), bytes, std::min(sizeof(values), val.size()));
       result << values[0] << " " << values[1] << " " << values[2];
       break;
     }
diff --git a/cpp/src/parquet/types_test.cc b/cpp/src/parquet/types_test.cc
index 6c77662d58..14814a18c4 100644
--- a/cpp/src/parquet/types_test.cc
+++ b/cpp/src/parquet/types_test.cc
@@ -179,6 +179,24 @@ TEST(TypePrinter, StatisticsTypes) {
             FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin, 
LogicalType::Float16()));
 }
 
+TEST(TypePrinter, StatisticsTypesShortValue) {
+  // The Parquet spec allows truncated statistics, so a min/max may be shorter
+  // than the column's physical type. FormatStatValue must format it without
+  // reading past the end of the value buffer (caught here under ASan).
+  ASSERT_NO_THROW(FormatStatValue(Type::BOOLEAN, std::string()));
+  ASSERT_NO_THROW(FormatStatValue(Type::INT32, std::string("abc")));
+  ASSERT_NO_THROW(FormatStatValue(Type::INT64, std::string("abcdefg")));
+  ASSERT_NO_THROW(FormatStatValue(Type::FLOAT, std::string()));
+  ASSERT_NO_THROW(FormatStatValue(Type::DOUBLE, std::string("1234567")));
+  ASSERT_NO_THROW(FormatStatValue(Type::INT96, std::string("12345678901")));
+  ASSERT_NO_THROW(
+      FormatStatValue(Type::INT32, std::string("abc"), LogicalType::Decimal(6, 
2)));
+  ASSERT_NO_THROW(
+      FormatStatValue(Type::INT64, std::string("abcdefg"), 
LogicalType::Decimal(18, 4)));
+  ASSERT_NO_THROW(
+      FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, std::string("a"), 
LogicalType::Float16()));
+}
+
 TEST(TestInt96Timestamp, Decoding) {
   auto check = [](int32_t julian_day, uint64_t nanoseconds) {
 #if ARROW_LITTLE_ENDIAN

Reply via email to