This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new 860c54d90 ORC-1841: [C++][CI] Add UBSAN support to Cmake
860c54d90 is described below
commit 860c54d9001b5e8658babd95c5b212ed742f55f7
Author: luffy-zh <[email protected]>
AuthorDate: Tue Jun 17 16:07:46 2025 -0700
ORC-1841: [C++][CI] Add UBSAN support to Cmake
### What changes were proposed in this pull request?
Add support for UBASN in CMake options and enable it by default in the CI
workflow.
### Why are the changes needed?
Detect undefined behavior issues with the help of the undefined sanitizer.
### How was this patch tested?
Pass all CIs.
### Was this patch authored or co-authored using generative AI tooling?
NO.
Closes #2117 from luffy-zh/ORC-UBSAN.
Authored-by: luffy-zh <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.github/workflows/asan_test.yml | 13 +++++++++----
CMakeLists.txt | 15 +++++++++++++++
c++/include/orc/Int128.hh | 34 ++--------------------------------
c++/src/Adaptor.hh.in | 6 ++++++
c++/src/BloomFilter.cc | 3 ++-
c++/src/BloomFilter.hh | 1 +
c++/src/ColumnReader.cc | 7 ++++++-
c++/src/ConvertColumnReader.cc | 4 ++--
c++/src/Int128.cc | 35 +++++++++++++++++++++++++++++++++++
c++/src/RLE.cc | 1 +
c++/src/RLE.hh | 1 +
11 files changed, 80 insertions(+), 40 deletions(-)
diff --git a/.github/workflows/asan_test.yml b/.github/workflows/asan_test.yml
index f4a31525f..6e7ac64fb 100644
--- a/.github/workflows/asan_test.yml
+++ b/.github/workflows/asan_test.yml
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
-name: Address Sanitizer Tests
+name: Address/Undefined Sanitizer Tests
on:
pull_request:
@@ -47,18 +47,23 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
- - name: Configure and Build with ASAN
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y build-essential cmake libpthread-stubs0-dev
+ - name: Configure and Build with ASAN and UBSAN
env:
CC: ${{ matrix.cc }}
CXX: ${{ matrix.cxx }}
run: |
- mkdir build && cd build
- cmake .. -DCMAKE_BUILD_TYPE=Debug -DENABLE_ASAN=ON -DBUILD_JAVA=OFF
+ mkdir -p build && cd build
+ cmake .. -DCMAKE_BUILD_TYPE=Debug -DENABLE_ASAN=ON -DENABLE_UBSAN=ON
-DBUILD_ENABLE_AVX512=ON -DBUILD_CPP_ENABLE_METRICS=ON -DBUILD_JAVA=OFF
make
- name: Run Tests
working-directory: build
env:
ASAN_OPTIONS:
detect_leaks=1:symbolize=1:strict_string_checks=1:halt_on_error=0:detect_container_overflow=0
LSAN_OPTIONS: suppressions=${{ github.workspace
}}/.github/lsan-suppressions.txt
+ UBSAN_OPTIONS: print_stacktrace=1
run: |
ctest --output-on-failure
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e6ce4fde0..cc4ee9adb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,6 +89,10 @@ option(ORC_ENABLE_CLANG_TOOLS
"Enable Clang tools"
OFF)
+option(ENABLE_UBSAN
+ "Enable Undefined Behavior Sanitizer"
+ OFF)
+
# Make sure that a build type is selected
if (NOT CMAKE_BUILD_TYPE)
message(STATUS "No build type selected, default to ReleaseWithDebugInfo")
@@ -171,6 +175,17 @@ if (ENABLE_ASAN)
endif()
endif()
+# Configure Undefined Behavior Sanitizer if enabled
+if (ENABLE_UBSAN)
+ if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined
-fno-sanitize=alignment,vptr,function -fno-sanitize-recover=all")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined
-fno-sanitize=alignment,vptr,function -fno-sanitize-recover=all")
+ message(STATUS "Undefined Behavior Sanitizer enabled")
+ else()
+ message(WARNING "Undefined Behavior Sanitizer is only supported for GCC
and Clang compilers")
+ endif()
+endif()
+
enable_testing()
INCLUDE(GNUInstallDirs) # Put it before ThirdpartyToolchain to make
CMAKE_INSTALL_LIBDIR available.
diff --git a/c++/include/orc/Int128.hh b/c++/include/orc/Int128.hh
index 6954c771c..e728e70e7 100644
--- a/c++/include/orc/Int128.hh
+++ b/c++/include/orc/Int128.hh
@@ -193,43 +193,13 @@ namespace orc {
* Shift left by the given number of bits.
* Values larger than 2**127 will shift into the sign bit.
*/
- Int128& operator<<=(uint32_t bits) {
- if (bits != 0) {
- if (bits < 64) {
- highbits_ <<= bits;
- highbits_ |= (lowbits_ >> (64 - bits));
- lowbits_ <<= bits;
- } else if (bits < 128) {
- highbits_ = static_cast<int64_t>(lowbits_) << (bits - 64);
- lowbits_ = 0;
- } else {
- highbits_ = 0;
- lowbits_ = 0;
- }
- }
- return *this;
- }
+ Int128& operator<<=(uint32_t bits);
/**
* Shift right by the given number of bits. Negative values will
* sign extend and fill with one bits.
*/
- Int128& operator>>=(uint32_t bits) {
- if (bits != 0) {
- if (bits < 64) {
- lowbits_ >>= bits;
- lowbits_ |= static_cast<uint64_t>(highbits_ << (64 - bits));
- highbits_ = static_cast<int64_t>(static_cast<uint64_t>(highbits_) >>
bits);
- } else if (bits < 128) {
- lowbits_ = static_cast<uint64_t>(highbits_ >> (bits - 64));
- highbits_ = highbits_ >= 0 ? 0 : -1l;
- } else {
- highbits_ = highbits_ >= 0 ? 0 : -1l;
- lowbits_ = static_cast<uint64_t>(highbits_);
- }
- }
- return *this;
- }
+ Int128& operator>>=(uint32_t bits);
bool operator==(const Int128& right) const {
return highbits_ == right.highbits_ && lowbits_ == right.lowbits_;
diff --git a/c++/src/Adaptor.hh.in b/c++/src/Adaptor.hh.in
index 2cce8158e..f3ed763eb 100644
--- a/c++/src/Adaptor.hh.in
+++ b/c++/src/Adaptor.hh.in
@@ -49,6 +49,12 @@ typedef SSIZE_T ssize_t;
ssize_t pread(int fd, void* buf, size_t count, off_t offset);
#endif
+#if defined(__GNUC__) || defined(__clang__)
+ #define NO_SANITIZE_ATTR
__attribute__((no_sanitize("signed-integer-overflow", "shift")))
+#else
+ #define NO_SANITIZE_ATTR
+#endif
+
#ifdef HAS_DIAGNOSTIC_PUSH
#ifdef __clang__
#define DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
diff --git a/c++/src/BloomFilter.cc b/c++/src/BloomFilter.cc
index 887637223..025bdd8a0 100644
--- a/c++/src/BloomFilter.cc
+++ b/c++/src/BloomFilter.cc
@@ -208,7 +208,7 @@ namespace orc {
}
DIAGNOSTIC_POP
-
+ NO_SANITIZE_ATTR
void BloomFilterImpl::addHash(int64_t hash64) {
int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff);
// In Java codes, we use "hash64 >>> 32" which is an unsigned shift op.
@@ -226,6 +226,7 @@ namespace orc {
}
}
+ NO_SANITIZE_ATTR
bool BloomFilterImpl::testHash(int64_t hash64) const {
int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff);
// In Java codes, we use "hash64 >>> 32" which is an unsigned shift op.
diff --git a/c++/src/BloomFilter.hh b/c++/src/BloomFilter.hh
index ebc4a5ee0..75fb02a02 100644
--- a/c++/src/BloomFilter.hh
+++ b/c++/src/BloomFilter.hh
@@ -194,6 +194,7 @@ namespace orc {
// Thomas Wang's integer hash function
//
http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
// Put this in header file so tests can use it as well.
+ NO_SANITIZE_ATTR
inline int64_t getLongHash(int64_t key) {
key = (~key) + (key << 21); // key = (key << 21) - key - 1;
key = key ^ (key >> 24);
diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index af434c37c..0fd17de1b 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -726,6 +726,9 @@ namespace orc {
if (totalBytes <= lastBufferLength_) {
// subtract the needed bytes from the ones left over
lastBufferLength_ -= totalBytes;
+ if (lastBuffer_ == nullptr) {
+ throw ParseError("StringDirectColumnReader::skip: lastBuffer_ is
null");
+ }
lastBuffer_ += totalBytes;
} else {
// move the stream forward after accounting for the buffered bytes
@@ -780,7 +783,9 @@ namespace orc {
byteBatch.blob.resize(totalLength);
char* ptr = byteBatch.blob.data();
while (bytesBuffered + lastBufferLength_ < totalLength) {
- memcpy(ptr + bytesBuffered, lastBuffer_, lastBufferLength_);
+ if (lastBuffer_ != nullptr) {
+ memcpy(ptr + bytesBuffered, lastBuffer_, lastBufferLength_);
+ }
bytesBuffered += lastBufferLength_;
const void* readBuffer;
int readLength;
diff --git a/c++/src/ConvertColumnReader.cc b/c++/src/ConvertColumnReader.cc
index c0f88246e..7db5b8895 100644
--- a/c++/src/ConvertColumnReader.cc
+++ b/c++/src/ConvertColumnReader.cc
@@ -126,13 +126,13 @@ namespace orc {
bool shouldThrow) {
constexpr bool
isFileTypeFloatingPoint(std::is_floating_point<FileType>::value);
constexpr bool
isReadTypeFloatingPoint(std::is_floating_point<ReadType>::value);
- int64_t longValue = static_cast<int64_t>(srcValue);
+
if (isFileTypeFloatingPoint) {
if (isReadTypeFloatingPoint) {
destValue = static_cast<ReadType>(srcValue);
} else {
if (!canFitInLong(static_cast<double>(srcValue)) ||
- !downCastToInteger(destValue, longValue)) {
+ !downCastToInteger(destValue, static_cast<int64_t>(srcValue))) {
handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
}
}
diff --git a/c++/src/Int128.cc b/c++/src/Int128.cc
index 1e059fd4e..0d4da78b5 100644
--- a/c++/src/Int128.cc
+++ b/c++/src/Int128.cc
@@ -25,6 +25,41 @@
#include <sstream>
namespace orc {
+ NO_SANITIZE_ATTR
+ Int128& Int128::operator<<=(uint32_t bits) {
+ if (bits != 0) {
+ if (bits < 64) {
+ highbits_ <<= bits;
+ highbits_ |= (lowbits_ >> (64 - bits));
+ lowbits_ <<= bits;
+ } else if (bits < 128) {
+ highbits_ = static_cast<int64_t>(lowbits_) << (bits - 64);
+ lowbits_ = 0;
+ } else {
+ highbits_ = 0;
+ lowbits_ = 0;
+ }
+ }
+ return *this;
+ }
+
+ NO_SANITIZE_ATTR
+ Int128& Int128::operator>>=(uint32_t bits) {
+ if (bits != 0) {
+ if (bits < 64) {
+ lowbits_ >>= bits;
+ lowbits_ |= static_cast<uint64_t>(highbits_ << (64 - bits));
+ highbits_ = static_cast<int64_t>(static_cast<uint64_t>(highbits_) >>
bits);
+ } else if (bits < 128) {
+ lowbits_ = static_cast<uint64_t>(highbits_ >> (bits - 64));
+ highbits_ = highbits_ >= 0 ? 0 : -1l;
+ } else {
+ highbits_ = highbits_ >= 0 ? 0 : -1l;
+ lowbits_ = static_cast<uint64_t>(highbits_);
+ }
+ }
+ return *this;
+ }
Int128 Int128::maximumValue() {
return Int128(0x7fffffffffffffff, 0xffffffffffffffff);
diff --git a/c++/src/RLE.cc b/c++/src/RLE.cc
index cb831c80f..19ca558fc 100644
--- a/c++/src/RLE.cc
+++ b/c++/src/RLE.cc
@@ -77,6 +77,7 @@ namespace orc {
add<int16_t>(data, numValues, notNull);
}
+ NO_SANITIZE_ATTR
void RleEncoder::writeVslong(int64_t val) {
writeVulong((val << 1) ^ (val >> 63));
}
diff --git a/c++/src/RLE.hh b/c++/src/RLE.hh
index e46504e88..3ad93e3dc 100644
--- a/c++/src/RLE.hh
+++ b/c++/src/RLE.hh
@@ -26,6 +26,7 @@
namespace orc {
+ NO_SANITIZE_ATTR
inline int64_t zigZag(int64_t value) {
return (value << 1) ^ (value >> 63);
}