This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new e980b7ef70 GH-50078: [C++][ORC] Avoid signed overflow when converting
timestamps (#50035)
e980b7ef70 is described below
commit e980b7ef70f66c2d5b674bd637a070333e10a3fc
Author: jmestwa-coder <[email protected]>
AuthorDate: Sat Jun 6 02:19:18 2026 +0530
GH-50078: [C++][ORC] Avoid signed overflow when converting timestamps
(#50035)
### Rationale for this change
A far-future ORC timestamp (after ~2262) makes `AppendTimestampBatch` in
`cpp/src/arrow/adapters/orc/util.cc` overflow int64 nanoseconds in `seconds *
kOneSecondNanos + nanos`. Reducing the multiply under
`-fsanitize=signed-integer-overflow`:
```
runtime error: signed integer overflow:
10000000000 * 1000000000 cannot be represented in type 'int64_t'
```
### What changes are included in this PR?
Detect the overflow with `MultiplyWithOverflow`/`AddWithOverflow` and
return `Status::Invalid` for out-of-range values instead of computing the
product with undefined behavior. Null slots are skipped.
### Are these changes tested?
Verified the offending expression with a standalone
`-fsanitize=signed-integer-overflow` reproducer and confirmed the patched path
returns an error rather than overflowing.
### Are there any user-facing changes?
No.
* GitHub Issue: #50078
Lead-authored-by: jmestwa-coder <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/src/arrow/adapters/orc/adapter_test.cc | 25 +++++++++++++++++++++
cpp/src/arrow/adapters/orc/util.cc | 36 ++++++++++++++++--------------
2 files changed, 44 insertions(+), 17 deletions(-)
diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc
b/cpp/src/arrow/adapters/orc/adapter_test.cc
index 714e61b22b..3cbb6d7828 100644
--- a/cpp/src/arrow/adapters/orc/adapter_test.cc
+++ b/cpp/src/arrow/adapters/orc/adapter_test.cc
@@ -1114,6 +1114,31 @@ TEST(TestWriteReadORCBatch, DenseUnionConversion) {
TestUnionConversion(std::move(array));
}
+TEST(TestWriteReadORCBatch, TimestampOutOfRangeIsRejected) {
+ // A timestamp far past year 2262 does not fit in int64 nanoseconds, so
scaling
+ // seconds to nanoseconds overflows. The conversion must report it rather
than
+ // produce a garbage value.
+ auto orc_type = liborc::Type::buildTypeFromString("struct<ts:timestamp>");
+
+ MemoryOutputStream mem_stream(kDefaultSmallMemStreamSize);
+ auto writer = CreateWriter(/*stripe_size=*/1024, *orc_type, &mem_stream);
+ auto orc_batch = writer->createRowBatch(1);
+
+ auto struct_batch =
internal::checked_cast<liborc::StructVectorBatch*>(orc_batch.get());
+ auto ts_batch =
+
internal::checked_cast<liborc::TimestampVectorBatch*>(struct_batch->fields[0]);
+ ts_batch->data[0] = 10000000000; // ~year 2286, overflows once scaled to
nanos
+ ts_batch->nanoseconds[0] = 0;
+ ts_batch->numElements = 1;
+ ts_batch->hasNulls = false;
+ struct_batch->numElements = 1;
+
+ ASSIGN_OR_ABORT(auto builder, MakeBuilder(timestamp(TimeUnit::NANO)));
+ ASSERT_RAISES(Invalid, adapters::orc::AppendBatch(
+ orc_type->getSubtype(0), struct_batch->fields[0],
+ /*offset=*/0, /*length=*/1, builder.get()));
+}
+
class TestORCWriterMultipleWrite : public ::testing::Test {
public:
TestORCWriterMultipleWrite() : rand(kRandomSeed) {}
diff --git a/cpp/src/arrow/adapters/orc/util.cc
b/cpp/src/arrow/adapters/orc/util.cc
index 6974faae59..91260302ab 100644
--- a/cpp/src/arrow/adapters/orc/util.cc
+++ b/cpp/src/arrow/adapters/orc/util.cc
@@ -31,6 +31,7 @@
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
+#include "arrow/util/int_util_overflow.h"
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/range.h"
#include "arrow/util/string.h"
@@ -200,26 +201,27 @@ Status AppendTimestampBatch(liborc::ColumnVectorBatch*
column_vector_batch,
auto builder = checked_cast<TimestampBuilder*>(abuilder);
auto batch =
checked_cast<liborc::TimestampVectorBatch*>(column_vector_batch);
- if (length == 0) {
- return Status::OK();
- }
-
- const uint8_t* valid_bytes = nullptr;
- if (batch->hasNulls) {
- valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) +
offset;
- }
-
const int64_t* seconds = batch->data.data() + offset;
const int64_t* nanos = batch->nanoseconds.data() + offset;
- auto transform_timestamp = [seconds, nanos](int64_t index) {
- return seconds[index] * kOneSecondNanos + nanos[index];
- };
-
- auto transform_range = internal::MakeLazyRange(transform_timestamp, length);
-
- RETURN_NOT_OK(
- builder->AppendValues(transform_range.begin(), transform_range.end(),
valid_bytes));
+ const bool has_nulls = batch->hasNulls;
+ RETURN_NOT_OK(builder->Reserve(length));
+ for (int64_t i = 0; i < length; i++) {
+ if (has_nulls && !batch->notNull[offset + i]) {
+ builder->UnsafeAppendNull();
+ continue;
+ }
+ // A timestamp past ~year 2262 does not fit in int64 nanoseconds; computing
+ // it with a bare `seconds * kOneSecondNanos` is signed overflow.
+ int64_t value;
+ if (ARROW_PREDICT_FALSE(
+ internal::MultiplyWithOverflow(seconds[i], kOneSecondNanos,
&value) ||
+ internal::AddWithOverflow(value, nanos[i], &value))) {
+ return Status::Invalid("ORC timestamp (", seconds[i], "s + ", nanos[i],
+ "ns) is out of range for nanosecond resolution");
+ }
+ builder->UnsafeAppend(value);
+ }
return Status::OK();
}