This is an automated email from the ASF dual-hosted git repository.

wgtmac pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new b3dcb6d63b GH-47657: [C++][Parquet] Check for integer overflow when 
coercing timestamps (#49615)
b3dcb6d63b is described below

commit b3dcb6d63b4ea698e6a8fcc00f3fd73d3e25071b
Author: Arnav Balyan <[email protected]>
AuthorDate: Wed Apr 29 12:36:45 2026 +0530

    GH-47657: [C++][Parquet] Check for integer overflow when coercing 
timestamps (#49615)
    
    ### Rationale for this change
     - When writing Arrow timestamps to Parquet, timestamp values are 
multiplied to convert units
     - This multiplication was performed without overflow checking which can 
silently write corrupt data.
    
    ### What changes are included in this PR?
     - Use MultiplyWithOverflowGeneric to handle/detect overflow
    
    ### Are these changes tested?
     - Yes
    ### Are there any user-facing changes?
     - Yes
    
    * GitHub Issue: #47657
    
    Authored-by: Arnav Balyan <[email protected]>
    Signed-off-by: Gang Wu <[email protected]>
---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 35 +++++++++++++++++++++++
 cpp/src/parquet/column_writer.cc                  |  9 +++++-
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc 
b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index e2384972cf..d29458bf22 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -2178,6 +2178,41 @@ TEST(TestArrowReadWrite, 
ImplicitSecondToMillisecondTimestampCoercion) {
   ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*tx, *to));
 }
 
+TEST(TestArrowReadWrite, TimestampCoercionOverflow) {
+  using ::arrow::ArrayFromVector;
+  using ::arrow::field;
+  using ::arrow::schema;
+  auto t_s = ::arrow::timestamp(TimeUnit::SECOND);
+  std::vector<int64_t> overflow_values = {9223372036854776LL};
+  std::vector<bool> is_valid = {true};
+  std::shared_ptr<Array> a_s;
+  ArrayFromVector<::arrow::TimestampType, int64_t>(t_s, is_valid, 
overflow_values, &a_s);
+
+  auto s = schema({field("timestamp", t_s)});
+  auto table = Table::Make(s, {a_s});
+
+  for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, 
TimeUnit::NANO}) {
+    if (unit == TimeUnit::SECOND) {
+      ASSERT_RAISES(Invalid, WriteTable(*table, default_memory_pool(),
+                                        CreateOutputStream(), 
table->num_rows()));
+    } else {
+      auto coerce_props =
+          ArrowWriterProperties::Builder().coerce_timestamps(unit)->build();
+      ASSERT_RAISES(Invalid, WriteTable(*table, default_memory_pool(),
+                                        CreateOutputStream(), 
table->num_rows(),
+                                        default_writer_properties(), 
coerce_props));
+    }
+  }
+
+  std::vector<bool> null_valid = {false};
+  std::shared_ptr<Array> a_null;
+  ArrayFromVector<::arrow::TimestampType, int64_t>(t_s, null_valid, 
overflow_values,
+                                                   &a_null);
+  auto null_table = Table::Make(s, {a_null});
+  ASSERT_OK_NO_THROW(WriteTable(*null_table, default_memory_pool(), 
CreateOutputStream(),
+                                null_table->num_rows()));
+}
+
 TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) {
   using ::arrow::ArrayFromVector;
   using ::arrow::field;
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 61507c2ba1..b3ed46ee2d 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -41,6 +41,7 @@
 #include "arrow/util/crc32.h"
 #include "arrow/util/endian.h"
 #include "arrow/util/float16.h"
+#include "arrow/util/int_util_overflow.h"
 #include "arrow/util/key_value_metadata.h"
 #include "arrow/util/logging_internal.h"
 #include "arrow/util/rle_encoding_internal.h"
@@ -2352,7 +2353,13 @@ struct SerializeFunctor<Int64Type, 
::arrow::TimestampType> {
 
     auto MultiplyBy = [&](const int64_t factor) {
       for (int64_t i = 0; i < array.length(); i++) {
-        out[i] = values[i] * factor;
+        if (array.IsValid(i) &&
+            ARROW_PREDICT_FALSE(::arrow::internal::MultiplyWithOverflowGeneric(
+                values[i], factor, &out[i]))) {
+          return Status::Invalid("Integer overflow when casting timestamp 
value ",
+                                 values[i], " from ", source_type.ToString(), 
" to ",
+                                 target_type->ToString());
+        }
       }
       return Status::OK();
     };

Reply via email to