This is an automated email from the ASF dual-hosted git repository.
wgtmac pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new b3dcb6d63b GH-47657: [C++][Parquet] Check for integer overflow when
coercing timestamps (#49615)
b3dcb6d63b is described below
commit b3dcb6d63b4ea698e6a8fcc00f3fd73d3e25071b
Author: Arnav Balyan <[email protected]>
AuthorDate: Wed Apr 29 12:36:45 2026 +0530
GH-47657: [C++][Parquet] Check for integer overflow when coercing
timestamps (#49615)
### Rationale for this change
- When writing Arrow timestamps to Parquet, timestamp values are
multiplied to convert units
- This multiplication was performed without overflow checking which can
silently write corrupt data.
### What changes are included in this PR?
- Use MultiplyWithOverflowGeneric to handle/detect overflow
### Are these changes tested?
- Yes
### Are there any user-facing changes?
- Yes
* GitHub Issue: #47657
Authored-by: Arnav Balyan <[email protected]>
Signed-off-by: Gang Wu <[email protected]>
---
cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 35 +++++++++++++++++++++++
cpp/src/parquet/column_writer.cc | 9 +++++-
2 files changed, 43 insertions(+), 1 deletion(-)
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index e2384972cf..d29458bf22 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -2178,6 +2178,41 @@ TEST(TestArrowReadWrite,
ImplicitSecondToMillisecondTimestampCoercion) {
ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*tx, *to));
}
+TEST(TestArrowReadWrite, TimestampCoercionOverflow) {
+ using ::arrow::ArrayFromVector;
+ using ::arrow::field;
+ using ::arrow::schema;
+ auto t_s = ::arrow::timestamp(TimeUnit::SECOND);
+ std::vector<int64_t> overflow_values = {9223372036854776LL};
+ std::vector<bool> is_valid = {true};
+ std::shared_ptr<Array> a_s;
+ ArrayFromVector<::arrow::TimestampType, int64_t>(t_s, is_valid,
overflow_values, &a_s);
+
+ auto s = schema({field("timestamp", t_s)});
+ auto table = Table::Make(s, {a_s});
+
+ for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO,
TimeUnit::NANO}) {
+ if (unit == TimeUnit::SECOND) {
+ ASSERT_RAISES(Invalid, WriteTable(*table, default_memory_pool(),
+ CreateOutputStream(),
table->num_rows()));
+ } else {
+ auto coerce_props =
+ ArrowWriterProperties::Builder().coerce_timestamps(unit)->build();
+ ASSERT_RAISES(Invalid, WriteTable(*table, default_memory_pool(),
+ CreateOutputStream(),
table->num_rows(),
+ default_writer_properties(),
coerce_props));
+ }
+ }
+
+ std::vector<bool> null_valid = {false};
+ std::shared_ptr<Array> a_null;
+ ArrayFromVector<::arrow::TimestampType, int64_t>(t_s, null_valid,
overflow_values,
+ &a_null);
+ auto null_table = Table::Make(s, {a_null});
+ ASSERT_OK_NO_THROW(WriteTable(*null_table, default_memory_pool(),
CreateOutputStream(),
+ null_table->num_rows()));
+}
+
TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) {
using ::arrow::ArrayFromVector;
using ::arrow::field;
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 61507c2ba1..b3ed46ee2d 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -41,6 +41,7 @@
#include "arrow/util/crc32.h"
#include "arrow/util/endian.h"
#include "arrow/util/float16.h"
+#include "arrow/util/int_util_overflow.h"
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging_internal.h"
#include "arrow/util/rle_encoding_internal.h"
@@ -2352,7 +2353,13 @@ struct SerializeFunctor<Int64Type,
::arrow::TimestampType> {
auto MultiplyBy = [&](const int64_t factor) {
for (int64_t i = 0; i < array.length(); i++) {
- out[i] = values[i] * factor;
+ if (array.IsValid(i) &&
+ ARROW_PREDICT_FALSE(::arrow::internal::MultiplyWithOverflowGeneric(
+ values[i], factor, &out[i]))) {
+ return Status::Invalid("Integer overflow when casting timestamp
value ",
+ values[i], " from ", source_type.ToString(),
" to ",
+ target_type->ToString());
+ }
}
return Status::OK();
};