This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 860c54d90 ORC-1841: [C++][CI] Add UBSAN support to Cmake
860c54d90 is described below

commit 860c54d9001b5e8658babd95c5b212ed742f55f7
Author: luffy-zh <[email protected]>
AuthorDate: Tue Jun 17 16:07:46 2025 -0700

    ORC-1841: [C++][CI] Add UBSAN support to Cmake
    
    ### What changes were proposed in this pull request?
    
    Add support for UBASN in CMake options and enable it by default in the CI 
workflow.
    
    ### Why are the changes needed?
    
    Detect undefined behavior issues with the help of the undefined sanitizer.
    
    ### How was this patch tested?
    
    Pass all CIs.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    NO.
    
    Closes #2117 from luffy-zh/ORC-UBSAN.
    
    Authored-by: luffy-zh <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 .github/workflows/asan_test.yml | 13 +++++++++----
 CMakeLists.txt                  | 15 +++++++++++++++
 c++/include/orc/Int128.hh       | 34 ++--------------------------------
 c++/src/Adaptor.hh.in           |  6 ++++++
 c++/src/BloomFilter.cc          |  3 ++-
 c++/src/BloomFilter.hh          |  1 +
 c++/src/ColumnReader.cc         |  7 ++++++-
 c++/src/ConvertColumnReader.cc  |  4 ++--
 c++/src/Int128.cc               | 35 +++++++++++++++++++++++++++++++++++
 c++/src/RLE.cc                  |  1 +
 c++/src/RLE.hh                  |  1 +
 11 files changed, 80 insertions(+), 40 deletions(-)

diff --git a/.github/workflows/asan_test.yml b/.github/workflows/asan_test.yml
index f4a31525f..6e7ac64fb 100644
--- a/.github/workflows/asan_test.yml
+++ b/.github/workflows/asan_test.yml
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-name: Address Sanitizer Tests
+name: Address/Undefined Sanitizer Tests
 
 on:
   pull_request:
@@ -47,18 +47,23 @@ jobs:
     steps:
     - name: Checkout
       uses: actions/checkout@v4
-    - name: Configure and Build with ASAN
+    - name: Install dependencies
+    run: |
+      sudo apt-get update
+      sudo apt-get install -y build-essential cmake libpthread-stubs0-dev
+    - name: Configure and Build with ASAN and UBSAN
       env:
         CC: ${{ matrix.cc }}
         CXX: ${{ matrix.cxx }}
       run: |
-        mkdir build && cd build
-        cmake .. -DCMAKE_BUILD_TYPE=Debug -DENABLE_ASAN=ON -DBUILD_JAVA=OFF
+        mkdir -p build && cd build
+        cmake .. -DCMAKE_BUILD_TYPE=Debug -DENABLE_ASAN=ON -DENABLE_UBSAN=ON 
-DBUILD_ENABLE_AVX512=ON -DBUILD_CPP_ENABLE_METRICS=ON -DBUILD_JAVA=OFF
         make
     - name: Run Tests
       working-directory: build
       env:
         ASAN_OPTIONS: 
detect_leaks=1:symbolize=1:strict_string_checks=1:halt_on_error=0:detect_container_overflow=0
         LSAN_OPTIONS: suppressions=${{ github.workspace 
}}/.github/lsan-suppressions.txt
+        UBSAN_OPTIONS: print_stacktrace=1
       run: |
         ctest --output-on-failure
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e6ce4fde0..cc4ee9adb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,6 +89,10 @@ option(ORC_ENABLE_CLANG_TOOLS
     "Enable Clang tools"
     OFF)
 
+option(ENABLE_UBSAN
+    "Enable Undefined Behavior Sanitizer"
+    OFF)
+
 # Make sure that a build type is selected
 if (NOT CMAKE_BUILD_TYPE)
   message(STATUS "No build type selected, default to ReleaseWithDebugInfo")
@@ -171,6 +175,17 @@ if (ENABLE_ASAN)
   endif()
 endif()
 
+# Configure Undefined Behavior Sanitizer if enabled
+if (ENABLE_UBSAN)
+  if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined 
-fno-sanitize=alignment,vptr,function  -fno-sanitize-recover=all")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined 
-fno-sanitize=alignment,vptr,function  -fno-sanitize-recover=all")
+    message(STATUS "Undefined Behavior Sanitizer enabled")
+  else()
+    message(WARNING "Undefined Behavior Sanitizer is only supported for GCC 
and Clang compilers")
+  endif()
+endif()
+
 enable_testing()
 
 INCLUDE(GNUInstallDirs)  # Put it before ThirdpartyToolchain to make 
CMAKE_INSTALL_LIBDIR available.
diff --git a/c++/include/orc/Int128.hh b/c++/include/orc/Int128.hh
index 6954c771c..e728e70e7 100644
--- a/c++/include/orc/Int128.hh
+++ b/c++/include/orc/Int128.hh
@@ -193,43 +193,13 @@ namespace orc {
      * Shift left by the given number of bits.
      * Values larger than 2**127 will shift into the sign bit.
      */
-    Int128& operator<<=(uint32_t bits) {
-      if (bits != 0) {
-        if (bits < 64) {
-          highbits_ <<= bits;
-          highbits_ |= (lowbits_ >> (64 - bits));
-          lowbits_ <<= bits;
-        } else if (bits < 128) {
-          highbits_ = static_cast<int64_t>(lowbits_) << (bits - 64);
-          lowbits_ = 0;
-        } else {
-          highbits_ = 0;
-          lowbits_ = 0;
-        }
-      }
-      return *this;
-    }
+    Int128& operator<<=(uint32_t bits);
 
     /**
      * Shift right by the given number of bits. Negative values will
      * sign extend and fill with one bits.
      */
-    Int128& operator>>=(uint32_t bits) {
-      if (bits != 0) {
-        if (bits < 64) {
-          lowbits_ >>= bits;
-          lowbits_ |= static_cast<uint64_t>(highbits_ << (64 - bits));
-          highbits_ = static_cast<int64_t>(static_cast<uint64_t>(highbits_) >> 
bits);
-        } else if (bits < 128) {
-          lowbits_ = static_cast<uint64_t>(highbits_ >> (bits - 64));
-          highbits_ = highbits_ >= 0 ? 0 : -1l;
-        } else {
-          highbits_ = highbits_ >= 0 ? 0 : -1l;
-          lowbits_ = static_cast<uint64_t>(highbits_);
-        }
-      }
-      return *this;
-    }
+    Int128& operator>>=(uint32_t bits);
 
     bool operator==(const Int128& right) const {
       return highbits_ == right.highbits_ && lowbits_ == right.lowbits_;
diff --git a/c++/src/Adaptor.hh.in b/c++/src/Adaptor.hh.in
index 2cce8158e..f3ed763eb 100644
--- a/c++/src/Adaptor.hh.in
+++ b/c++/src/Adaptor.hh.in
@@ -49,6 +49,12 @@ typedef SSIZE_T ssize_t;
   ssize_t pread(int fd, void* buf, size_t count, off_t offset);
 #endif
 
+#if defined(__GNUC__) || defined(__clang__)
+  #define NO_SANITIZE_ATTR 
__attribute__((no_sanitize("signed-integer-overflow", "shift")))
+#else
+  #define NO_SANITIZE_ATTR
+#endif
+
 #ifdef HAS_DIAGNOSTIC_PUSH
   #ifdef __clang__
     #define DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
diff --git a/c++/src/BloomFilter.cc b/c++/src/BloomFilter.cc
index 887637223..025bdd8a0 100644
--- a/c++/src/BloomFilter.cc
+++ b/c++/src/BloomFilter.cc
@@ -208,7 +208,7 @@ namespace orc {
   }
 
   DIAGNOSTIC_POP
-
+  NO_SANITIZE_ATTR
   void BloomFilterImpl::addHash(int64_t hash64) {
     int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff);
     // In Java codes, we use "hash64 >>> 32" which is an unsigned shift op.
@@ -226,6 +226,7 @@ namespace orc {
     }
   }
 
+  NO_SANITIZE_ATTR
   bool BloomFilterImpl::testHash(int64_t hash64) const {
     int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff);
     // In Java codes, we use "hash64 >>> 32" which is an unsigned shift op.
diff --git a/c++/src/BloomFilter.hh b/c++/src/BloomFilter.hh
index ebc4a5ee0..75fb02a02 100644
--- a/c++/src/BloomFilter.hh
+++ b/c++/src/BloomFilter.hh
@@ -194,6 +194,7 @@ namespace orc {
   // Thomas Wang's integer hash function
   // 
http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
   // Put this in header file so tests can use it as well.
+  NO_SANITIZE_ATTR
   inline int64_t getLongHash(int64_t key) {
     key = (~key) + (key << 21);  // key = (key << 21) - key - 1;
     key = key ^ (key >> 24);
diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index af434c37c..0fd17de1b 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -726,6 +726,9 @@ namespace orc {
     if (totalBytes <= lastBufferLength_) {
       // subtract the needed bytes from the ones left over
       lastBufferLength_ -= totalBytes;
+      if (lastBuffer_ == nullptr) {
+        throw ParseError("StringDirectColumnReader::skip: lastBuffer_ is 
null");
+      }
       lastBuffer_ += totalBytes;
     } else {
       // move the stream forward after accounting for the buffered bytes
@@ -780,7 +783,9 @@ namespace orc {
     byteBatch.blob.resize(totalLength);
     char* ptr = byteBatch.blob.data();
     while (bytesBuffered + lastBufferLength_ < totalLength) {
-      memcpy(ptr + bytesBuffered, lastBuffer_, lastBufferLength_);
+      if (lastBuffer_ != nullptr) {
+        memcpy(ptr + bytesBuffered, lastBuffer_, lastBufferLength_);
+      }
       bytesBuffered += lastBufferLength_;
       const void* readBuffer;
       int readLength;
diff --git a/c++/src/ConvertColumnReader.cc b/c++/src/ConvertColumnReader.cc
index c0f88246e..7db5b8895 100644
--- a/c++/src/ConvertColumnReader.cc
+++ b/c++/src/ConvertColumnReader.cc
@@ -126,13 +126,13 @@ namespace orc {
                                            bool shouldThrow) {
     constexpr bool 
isFileTypeFloatingPoint(std::is_floating_point<FileType>::value);
     constexpr bool 
isReadTypeFloatingPoint(std::is_floating_point<ReadType>::value);
-    int64_t longValue = static_cast<int64_t>(srcValue);
+
     if (isFileTypeFloatingPoint) {
       if (isReadTypeFloatingPoint) {
         destValue = static_cast<ReadType>(srcValue);
       } else {
         if (!canFitInLong(static_cast<double>(srcValue)) ||
-            !downCastToInteger(destValue, longValue)) {
+            !downCastToInteger(destValue, static_cast<int64_t>(srcValue))) {
           handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
         }
       }
diff --git a/c++/src/Int128.cc b/c++/src/Int128.cc
index 1e059fd4e..0d4da78b5 100644
--- a/c++/src/Int128.cc
+++ b/c++/src/Int128.cc
@@ -25,6 +25,41 @@
 #include <sstream>
 
 namespace orc {
+  NO_SANITIZE_ATTR
+  Int128& Int128::operator<<=(uint32_t bits) {
+    if (bits != 0) {
+      if (bits < 64) {
+        highbits_ <<= bits;
+        highbits_ |= (lowbits_ >> (64 - bits));
+        lowbits_ <<= bits;
+      } else if (bits < 128) {
+        highbits_ = static_cast<int64_t>(lowbits_) << (bits - 64);
+        lowbits_ = 0;
+      } else {
+        highbits_ = 0;
+        lowbits_ = 0;
+      }
+    }
+    return *this;
+  }
+
+  NO_SANITIZE_ATTR
+  Int128& Int128::operator>>=(uint32_t bits) {
+    if (bits != 0) {
+      if (bits < 64) {
+        lowbits_ >>= bits;
+        lowbits_ |= static_cast<uint64_t>(highbits_ << (64 - bits));
+        highbits_ = static_cast<int64_t>(static_cast<uint64_t>(highbits_) >> 
bits);
+      } else if (bits < 128) {
+        lowbits_ = static_cast<uint64_t>(highbits_ >> (bits - 64));
+        highbits_ = highbits_ >= 0 ? 0 : -1l;
+      } else {
+        highbits_ = highbits_ >= 0 ? 0 : -1l;
+        lowbits_ = static_cast<uint64_t>(highbits_);
+      }
+    }
+    return *this;
+  }
 
   Int128 Int128::maximumValue() {
     return Int128(0x7fffffffffffffff, 0xffffffffffffffff);
diff --git a/c++/src/RLE.cc b/c++/src/RLE.cc
index cb831c80f..19ca558fc 100644
--- a/c++/src/RLE.cc
+++ b/c++/src/RLE.cc
@@ -77,6 +77,7 @@ namespace orc {
     add<int16_t>(data, numValues, notNull);
   }
 
+  NO_SANITIZE_ATTR
   void RleEncoder::writeVslong(int64_t val) {
     writeVulong((val << 1) ^ (val >> 63));
   }
diff --git a/c++/src/RLE.hh b/c++/src/RLE.hh
index e46504e88..3ad93e3dc 100644
--- a/c++/src/RLE.hh
+++ b/c++/src/RLE.hh
@@ -26,6 +26,7 @@
 
 namespace orc {
 
+  NO_SANITIZE_ATTR
   inline int64_t zigZag(int64_t value) {
     return (value << 1) ^ (value >> 63);
   }

Reply via email to