This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
     new 3d6159488 ORC-1952: [C++] Fix the issue where the value of headerThirdByte exceeds the valid byte range
3d6159488 is described below

commit 3d6159488d12291fd09eaf774396df4710480aa5
Author: luffy-zh <zhn...@outlook.com>
AuthorDate: Tue Jul 15 09:08:34 2025 -0700

    ORC-1952: [C++] Fix the issue where the value of headerThirdByte exceeds the valid byte range

    ### What changes were proposed in this pull request?

    Ensure DIRECT encoding is employed when the input value exceeds the valid byte range.

    ### Why are the changes needed?

    In Patched Base Encoding, the value of headerThirdByte exceeds the valid byte range.

    ### How was this patch tested?

    Add the RleV2_value_limit_test to the RleTest suite.

    ### Was this patch authored or co-authored using generative AI tooling?

    No.

    Closes #2324 from luffy-zh/ORC-1952.

    Lead-authored-by: luffy-zh <zhn...@outlook.com>
    Co-authored-by: ffacs <ffacs...@gmail.com>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 c++/src/RLEv2.hh           |  1 +
 c++/src/RleEncoderV2.cc    |  2 +-
 c++/test/TestRleEncoder.cc | 38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh
index 8ceb7f125..c2ce5aa85 100644
--- a/c++/src/RLEv2.hh
+++ b/c++/src/RLEv2.hh
@@ -123,6 +123,7 @@ namespace orc {
     int64_t* zigzagLiterals_;
     int64_t* baseRedLiterals_;
     int64_t* adjDeltas_;
+    static constexpr int64_t BASE_VALUE_LIMIT = int64_t(1) << 56;
 
     uint32_t getOpCode(EncodingType encoding);
     int64_t* prepareForDirectOrPatchedBase(EncodingOption& option);
diff --git a/c++/src/RleEncoderV2.cc b/c++/src/RleEncoderV2.cc
index 1cda9ee91..91383bb56 100644
--- a/c++/src/RleEncoderV2.cc
+++ b/c++/src/RleEncoderV2.cc
@@ -423,7 +423,7 @@ namespace orc {
       // fallback to DIRECT encoding.
       // The decision to use patched base was based on zigzag values, but the
       // actual patching is done on base reduced literals.
-      if ((option.brBits100p - option.brBits95p) != 0) {
+      if ((option.brBits100p - option.brBits95p) != 0 && std::abs(option.min) < BASE_VALUE_LIMIT) {
         option.encoding = PATCHED_BASE;
         preparePatchedBlob(option);
         return;
diff --git a/c++/test/TestRleEncoder.cc b/c++/test/TestRleEncoder.cc
index d458236cb..c69fc9cab 100644
--- a/c++/test/TestRleEncoder.cc
+++ b/c++/test/TestRleEncoder.cc
@@ -285,5 +285,43 @@ namespace orc {
     runExampleTest(data, 9, expectedEncoded, 13);
   }
 
+  TEST_P(RleTest, RleV2_value_limit_test) {
+    std::vector<int64_t> inputData = {-9007199254740992l,
+                                      -8725724278030337l,
+                                      -1125762467889153l,
+                                      -1l,
+                                      -9007199254740992l,
+                                      -9007199254740992l,
+                                      -497l,
+                                      127l,
+                                      -1l,
+                                      -72057594037927936l,
+                                      -4194304l,
+                                      -9007199254740992l,
+                                      -4503599593816065l,
+                                      -4194304l,
+                                      -8936830510563329l,
+                                      -9007199254740992l,
+                                      -1l,
+                                      -70334384439312l,
+                                      -4063233l,
+                                      -6755399441973249l};
+    int numValues = inputData.size();
+
+    // Invoke the encoder.
+    const bool isSigned = true;
+    MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+
+    std::unique_ptr<RleEncoder> encoder = getEncoder(RleVersion_2, memStream, isSigned);
+    encoder->add(inputData.data(), numValues, nullptr);
+    encoder->finishEncode();
+
+    encoder->add(inputData.data(), numValues, nullptr);
+    encoder->flush();
+
+    // Decode and verify.
+    decodeAndVerify(RleVersion_2, memStream, inputData.data(), numValues, nullptr, isSigned);
+  }
+
   INSTANTIATE_TEST_SUITE_P(OrcTest, RleTest, Values(true, false));
 } // namespace orc