This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new e980b7ef70 GH-50078: [C++][ORC] Avoid signed overflow when converting 
timestamps (#50035)
e980b7ef70 is described below

commit e980b7ef70f66c2d5b674bd637a070333e10a3fc
Author: jmestwa-coder <[email protected]>
AuthorDate: Sat Jun 6 02:19:18 2026 +0530

    GH-50078: [C++][ORC] Avoid signed overflow when converting timestamps 
(#50035)
    
    ### Rationale for this change
    
    A far-future ORC timestamp (after ~2262) makes `AppendTimestampBatch` in 
`cpp/src/arrow/adapters/orc/util.cc` overflow int64 nanoseconds in `seconds * 
kOneSecondNanos + nanos`. Reducing the multiply under 
`-fsanitize=signed-integer-overflow`:
    
    ```
    runtime error: signed integer overflow:
      10000000000 * 1000000000 cannot be represented in type 'int64_t'
    ```
    
    ### What changes are included in this PR?
    
    Detect the overflow with `MultiplyWithOverflow`/`AddWithOverflow` and 
return `Status::Invalid` for out-of-range values instead of computing the 
product with undefined behavior. Null slots are skipped.
    
    ### Are these changes tested?
    
    Verified the offending expression with a standalone 
`-fsanitize=signed-integer-overflow` reproducer and confirmed the patched path 
returns an error rather than overflowing.
    
    ### Are there any user-facing changes?
    
    No.
    
    * GitHub Issue: #50078
    
    Lead-authored-by: jmestwa-coder <[email protected]>
    Co-authored-by: Sutou Kouhei <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 cpp/src/arrow/adapters/orc/adapter_test.cc | 25 +++++++++++++++++++++
 cpp/src/arrow/adapters/orc/util.cc         | 36 ++++++++++++++++--------------
 2 files changed, 44 insertions(+), 17 deletions(-)

diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc 
b/cpp/src/arrow/adapters/orc/adapter_test.cc
index 714e61b22b..3cbb6d7828 100644
--- a/cpp/src/arrow/adapters/orc/adapter_test.cc
+++ b/cpp/src/arrow/adapters/orc/adapter_test.cc
@@ -1114,6 +1114,31 @@ TEST(TestWriteReadORCBatch, DenseUnionConversion) {
   TestUnionConversion(std::move(array));
 }
 
+TEST(TestWriteReadORCBatch, TimestampOutOfRangeIsRejected) {
+  // A timestamp far past year 2262 does not fit in int64 nanoseconds, so 
scaling
+  // seconds to nanoseconds overflows. The conversion must report it rather 
than
+  // produce a garbage value.
+  auto orc_type = liborc::Type::buildTypeFromString("struct<ts:timestamp>");
+
+  MemoryOutputStream mem_stream(kDefaultSmallMemStreamSize);
+  auto writer = CreateWriter(/*stripe_size=*/1024, *orc_type, &mem_stream);
+  auto orc_batch = writer->createRowBatch(1);
+
+  auto struct_batch = 
internal::checked_cast<liborc::StructVectorBatch*>(orc_batch.get());
+  auto ts_batch =
+      
internal::checked_cast<liborc::TimestampVectorBatch*>(struct_batch->fields[0]);
+  ts_batch->data[0] = 10000000000;  // ~year 2286, overflows once scaled to 
nanos
+  ts_batch->nanoseconds[0] = 0;
+  ts_batch->numElements = 1;
+  ts_batch->hasNulls = false;
+  struct_batch->numElements = 1;
+
+  ASSIGN_OR_ABORT(auto builder, MakeBuilder(timestamp(TimeUnit::NANO)));
+  ASSERT_RAISES(Invalid, adapters::orc::AppendBatch(
+                             orc_type->getSubtype(0), struct_batch->fields[0],
+                             /*offset=*/0, /*length=*/1, builder.get()));
+}
+
 class TestORCWriterMultipleWrite : public ::testing::Test {
  public:
   TestORCWriterMultipleWrite() : rand(kRandomSeed) {}
diff --git a/cpp/src/arrow/adapters/orc/util.cc 
b/cpp/src/arrow/adapters/orc/util.cc
index 6974faae59..91260302ab 100644
--- a/cpp/src/arrow/adapters/orc/util.cc
+++ b/cpp/src/arrow/adapters/orc/util.cc
@@ -31,6 +31,7 @@
 #include "arrow/util/bitmap_ops.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/decimal.h"
+#include "arrow/util/int_util_overflow.h"
 #include "arrow/util/key_value_metadata.h"
 #include "arrow/util/range.h"
 #include "arrow/util/string.h"
@@ -200,26 +201,27 @@ Status AppendTimestampBatch(liborc::ColumnVectorBatch* 
column_vector_batch,
   auto builder = checked_cast<TimestampBuilder*>(abuilder);
   auto batch = 
checked_cast<liborc::TimestampVectorBatch*>(column_vector_batch);
 
-  if (length == 0) {
-    return Status::OK();
-  }
-
-  const uint8_t* valid_bytes = nullptr;
-  if (batch->hasNulls) {
-    valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + 
offset;
-  }
-
   const int64_t* seconds = batch->data.data() + offset;
   const int64_t* nanos = batch->nanoseconds.data() + offset;
 
-  auto transform_timestamp = [seconds, nanos](int64_t index) {
-    return seconds[index] * kOneSecondNanos + nanos[index];
-  };
-
-  auto transform_range = internal::MakeLazyRange(transform_timestamp, length);
-
-  RETURN_NOT_OK(
-      builder->AppendValues(transform_range.begin(), transform_range.end(), 
valid_bytes));
+  const bool has_nulls = batch->hasNulls;
+  RETURN_NOT_OK(builder->Reserve(length));
+  for (int64_t i = 0; i < length; i++) {
+    if (has_nulls && !batch->notNull[offset + i]) {
+      builder->UnsafeAppendNull();
+      continue;
+    }
+    // A timestamp past ~year 2262 does not fit in int64 nanoseconds; computing
+    // it with a bare `seconds * kOneSecondNanos` is signed overflow.
+    int64_t value;
+    if (ARROW_PREDICT_FALSE(
+            internal::MultiplyWithOverflow(seconds[i], kOneSecondNanos, 
&value) ||
+            internal::AddWithOverflow(value, nanos[i], &value))) {
+      return Status::Invalid("ORC timestamp (", seconds[i], "s + ", nanos[i],
+                             "ns) is out of range for nanosecond resolution");
+    }
+    builder->UnsafeAppend(value);
+  }
   return Status::OK();
 }
 

Reply via email to