This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new ddedb9665 ORC-1952: [C++] Fix the issue where the value of
headerThirdByte exceeds the valid byte range
ddedb9665 is described below
commit ddedb9665b93d2b90c314d937a7edb51e086e9af
Author: luffy-zh <[email protected]>
AuthorDate: Tue Jul 15 09:08:34 2025 -0700
ORC-1952: [C++] Fix the issue where the value of headerThirdByte exceeds
the valid byte range
### What changes were proposed in this pull request?
Fall back to DIRECT encoding instead of PATCHED_BASE when the absolute value
of the minimum value (`option.min`) is 2^56 (`BASE_VALUE_LIMIT`) or greater.
### Why are the changes needed?
In Patched Base encoding, the value of headerThirdByte can exceed the valid
byte range when the base value is too large.
### How was this patch tested?
Add the RleV2_value_limit_test to the RleTest suite.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #2324 from luffy-zh/ORC-1952.
Lead-authored-by: luffy-zh <[email protected]>
Co-authored-by: ffacs <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
(cherry picked from commit 3d6159488d12291fd09eaf774396df4710480aa5)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
c++/src/RLEv2.hh | 1 +
c++/src/RleEncoderV2.cc | 2 +-
c++/test/TestRleEncoder.cc | 38 ++++++++++++++++++++++++++++++++++++++
3 files changed, 40 insertions(+), 1 deletion(-)
diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh
index 8ceb7f125..c2ce5aa85 100644
--- a/c++/src/RLEv2.hh
+++ b/c++/src/RLEv2.hh
@@ -123,6 +123,7 @@ namespace orc {
int64_t* zigzagLiterals_;
int64_t* baseRedLiterals_;
int64_t* adjDeltas_;
+ static constexpr int64_t BASE_VALUE_LIMIT = int64_t(1) << 56;
uint32_t getOpCode(EncodingType encoding);
int64_t* prepareForDirectOrPatchedBase(EncodingOption& option);
diff --git a/c++/src/RleEncoderV2.cc b/c++/src/RleEncoderV2.cc
index 1cda9ee91..91383bb56 100644
--- a/c++/src/RleEncoderV2.cc
+++ b/c++/src/RleEncoderV2.cc
@@ -423,7 +423,7 @@ namespace orc {
// fallback to DIRECT encoding.
// The decision to use patched base was based on zigzag values, but the
// actual patching is done on base reduced literals.
- if ((option.brBits100p - option.brBits95p) != 0) {
+ if ((option.brBits100p - option.brBits95p) != 0 && std::abs(option.min) < BASE_VALUE_LIMIT) {
option.encoding = PATCHED_BASE;
preparePatchedBlob(option);
return;
diff --git a/c++/test/TestRleEncoder.cc b/c++/test/TestRleEncoder.cc
index d458236cb..c69fc9cab 100644
--- a/c++/test/TestRleEncoder.cc
+++ b/c++/test/TestRleEncoder.cc
@@ -285,5 +285,43 @@ namespace orc {
runExampleTest(data, 9, expectedEncoded, 13);
}
+ TEST_P(RleTest, RleV2_value_limit_test) {
+ std::vector<int64_t> inputData = {-9007199254740992l,
+ -8725724278030337l,
+ -1125762467889153l,
+ -1l,
+ -9007199254740992l,
+ -9007199254740992l,
+ -497l,
+ 127l,
+ -1l,
+ -72057594037927936l,
+ -4194304l,
+ -9007199254740992l,
+ -4503599593816065l,
+ -4194304l,
+ -8936830510563329l,
+ -9007199254740992l,
+ -1l,
+ -70334384439312l,
+ -4063233l,
+ -6755399441973249l};
+ int numValues = inputData.size();
+
+ // Invoke the encoder.
+ const bool isSigned = true;
+ MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+
+ std::unique_ptr<RleEncoder> encoder = getEncoder(RleVersion_2, memStream, isSigned);
+ encoder->add(inputData.data(), numValues, nullptr);
+ encoder->finishEncode();
+
+ encoder->add(inputData.data(), numValues, nullptr);
+ encoder->flush();
+
+ // Decode and verify.
+ decodeAndVerify(RleVersion_2, memStream, inputData.data(), numValues, nullptr, isSigned);
+ }
+
INSTANTIATE_TEST_SUITE_P(OrcTest, RleTest, Values(true, false));
} // namespace orc