This is an automated email from the ASF dual-hosted git repository.
pitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 546d42c71a GH-50184: [C++][Parquet] Avoid reading past truncated
statistics values in FormatStatValue (#50025)
546d42c71a is described below
commit 546d42c71a467c3741df2d6ececad966fc66bf4c
Author: jmestwa-coder <[email protected]>
AuthorDate: Wed Jun 17 13:50:02 2026 +0530
GH-50184: [C++][Parquet] Avoid reading past truncated statistics values in
FormatStatValue (#50025)
### Rationale for this change
`FormatStatValue` in `cpp/src/parquet/types.cc` does fixed-width
`memcpy`/byte loads on a statistics value that comes straight out of the file's
Thrift-encoded metadata, so its length is attacker controlled. The Parquet spec
[allows truncated
statistics](https://github.com/apache/parquet-format/blob/325b9f4a70663c88485d4d59fdc1a121d7497943/src/main/thrift/parquet.thrift#L295-L301),
so a `min_value`/`max_value` shorter than the column's physical type (for
example a zero-byte stat on an [...]
### What changes are included in this PR?
Each fixed-width read in `FormatStatValue` and the helpers it delegates to
now clamps its copy to `val.size()` instead of assuming the full type width is
present: the `BOOLEAN`/`INT32`/`INT64`/`INT96`/`FLOAT`/`DOUBLE` numeric loads,
the int32/int64-backed decimal loads, and the float16 `FIXED_LEN_BYTE_ARRAY`
load. A truncated value is zero-padded rather than over-read, and no exception
is raised since truncated statistics are spec-legal. The variable-length
`FIXED_LEN_BYTE_ARRAY` (dec [...]
### Are these changes tested?
Yes. `TypePrinter.StatisticsTypesShortValue` in `types_test.cc` formats a
too-short value for each path and checks it returns without reading past the
buffer (caught under ASan).
### Are there any user-facing changes?
No API change. A truncated statistics value is formatted from the bytes
that are present instead of reading out of bounds.
**This PR contains a "Critical Fix".** It stops an out-of-bounds read on a
buffer whose length is controlled by the input Parquet file.
* GitHub Issue: #50184
Authored-by: jmestwa-coder <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/types.cc | 25 ++++++++++++++++++-------
cpp/src/parquet/types_test.cc | 18 ++++++++++++++++++
2 files changed, 36 insertions(+), 7 deletions(-)
diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc
index ff5bd4eafb..9d7604faec 100644
--- a/cpp/src/parquet/types.cc
+++ b/cpp/src/parquet/types.cc
@@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.
+#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
@@ -110,8 +111,11 @@ template <typename T>
std::enable_if_t<std::is_arithmetic_v<T>, std::string> FormatNumericValue(
::std::string_view val) {
std::stringstream result;
+ // The statistics value comes from the file's Thrift metadata, so its length
+ // is attacker controlled and the spec allows it to be truncated. Only copy
+ // the bytes that are actually present and leave the rest zero-padded.
T value{};
- std::memcpy(&value, val.data(), sizeof(T));
+ std::memcpy(&value, val.data(), std::min(sizeof(T), val.size()));
result << value;
return result.str();
}
@@ -128,14 +132,14 @@ std::string FormatDecimalValue(Type::type parquet_type,
::std::string_view val,
switch (parquet_type) {
case Type::INT32: {
int32_t int_value{};
- std::memcpy(&int_value, val.data(), sizeof(int32_t));
+ std::memcpy(&int_value, val.data(), std::min(sizeof(int32_t),
val.size()));
::arrow::Decimal128 decimal_value(int_value);
result << decimal_value.ToString(scale);
break;
}
case Type::INT64: {
int64_t long_value{};
- std::memcpy(&long_value, val.data(), sizeof(int64_t));
+ std::memcpy(&long_value, val.data(), std::min(sizeof(int64_t),
val.size()));
::arrow::Decimal128 decimal_value(long_value);
result << decimal_value.ToString(scale);
break;
@@ -174,8 +178,11 @@ std::string FormatNonUTF8Value(::std::string_view val) {
std::string FormatFloat16Value(::std::string_view val) {
std::stringstream result;
- auto float16 = ::arrow::util::Float16::FromLittleEndian(
- reinterpret_cast<const uint8_t*>(val.data()));
+ // FromLittleEndian reads two bytes unconditionally; a truncated stat value
+ // may be shorter, so copy into a zero-padded buffer first.
+ std::array<uint8_t, 2> bytes{};
+ std::memcpy(bytes.data(), val.data(), std::min(bytes.size(), val.size()));
+ auto float16 = ::arrow::util::Float16::FromLittleEndian(bytes.data());
result << float16.ToFloat();
return result.str();
}
@@ -184,12 +191,16 @@ std::string FormatFloat16Value(::std::string_view val) {
std::string FormatStatValue(Type::type parquet_type, ::std::string_view val,
const std::shared_ptr<const LogicalType>&
logical_type) {
+ // Statistics values come straight from the file's Thrift metadata, so their
+ // length is attacker controlled and the spec allows them to be truncated.
+ // Every fixed-width read below clamps its copy to val.size() so a too-short
+ // value cannot drive a load past the end of the buffer.
std::stringstream result;
const char* bytes = val.data();
switch (parquet_type) {
case Type::BOOLEAN: {
bool value{};
- std::memcpy(&value, bytes, sizeof(bool));
+ std::memcpy(&value, bytes, std::min(sizeof(bool), val.size()));
result << value;
break;
}
@@ -213,7 +224,7 @@ std::string FormatStatValue(Type::type parquet_type,
::std::string_view val,
}
case Type::INT96: {
std::array<int32_t, 3> values{};
- std::memcpy(values.data(), bytes, 3 * sizeof(int32_t));
+ std::memcpy(values.data(), bytes, std::min(sizeof(values), val.size()));
result << values[0] << " " << values[1] << " " << values[2];
break;
}
diff --git a/cpp/src/parquet/types_test.cc b/cpp/src/parquet/types_test.cc
index 6c77662d58..14814a18c4 100644
--- a/cpp/src/parquet/types_test.cc
+++ b/cpp/src/parquet/types_test.cc
@@ -179,6 +179,24 @@ TEST(TypePrinter, StatisticsTypes) {
FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin,
LogicalType::Float16()));
}
+TEST(TypePrinter, StatisticsTypesShortValue) {
+ // The Parquet spec allows truncated statistics, so a min/max may be shorter
+ // than the column's physical type. FormatStatValue must format it without
+ // reading past the end of the value buffer (caught here under ASan).
+ ASSERT_NO_THROW(FormatStatValue(Type::BOOLEAN, std::string()));
+ ASSERT_NO_THROW(FormatStatValue(Type::INT32, std::string("abc")));
+ ASSERT_NO_THROW(FormatStatValue(Type::INT64, std::string("abcdefg")));
+ ASSERT_NO_THROW(FormatStatValue(Type::FLOAT, std::string()));
+ ASSERT_NO_THROW(FormatStatValue(Type::DOUBLE, std::string("1234567")));
+ ASSERT_NO_THROW(FormatStatValue(Type::INT96, std::string("12345678901")));
+ ASSERT_NO_THROW(
+ FormatStatValue(Type::INT32, std::string("abc"), LogicalType::Decimal(6,
2)));
+ ASSERT_NO_THROW(
+ FormatStatValue(Type::INT64, std::string("abcdefg"),
LogicalType::Decimal(18, 4)));
+ ASSERT_NO_THROW(
+ FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, std::string("a"),
LogicalType::Float16()));
+}
+
TEST(TestInt96Timestamp, Decoding) {
auto check = [](int32_t julian_day, uint64_t nanoseconds) {
#if ARROW_LITTLE_ENDIAN