This is an automated email from the ASF dual-hosted git repository.

pitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new d1a8cf4133 GH-47769: [C++] SVE dynamic dispatch (#49756)
d1a8cf4133 is described below

commit d1a8cf4133e94941850d1a709f5263d82d1f45ed
Author: Antoine Prouvost <[email protected]>
AuthorDate: Tue Apr 28 12:44:57 2026 +0200

    GH-47769: [C++] SVE dynamic dispatch (#49756)
    
    ### Rationale for this change
    Just like we dynamically dispatch to AVX2 on x86 CPUs, we want to 
dynamically dispatch to more advanced SIMD extensions on ARM64 chips.
    
    ### What changes are included in this PR?
    - A new macro to enable selecting the runtime SVE version
    - Detection of the ARM64 CPU features available at runtime
    - Adding SVE to the dynamic dispatch for bit unpacking algorithms.
    
    ### Are these changes tested?
    
    ### Are there any user-facing changes?
    No.
    
    * GitHub Issue: #47769
    
    Lead-authored-by: Antoine Prouvost <[email protected]>
    Co-authored-by: AntoinePrv <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/cmake_modules/DefineOptions.cmake              |   3 +
 cpp/cmake_modules/SetupCxxFlags.cmake              |  32 ++-
 cpp/src/arrow/CMakeLists.txt                       |  28 ++-
 cpp/src/arrow/meson.build                          |   2 +-
 cpp/src/arrow/util/bpacking.cc                     |  39 +++-
 cpp/src/arrow/util/bpacking_benchmark.cc           | 166 +++++---------
 cpp/src/arrow/util/bpacking_dispatch_internal.h    |  44 ++--
 ...acking_simd_default.cc => bpacking_simd_128.cc} |   6 +-
 ...cking_simd_avx2.cc => bpacking_simd_128_alt.cc} |  30 ++-
 ...acking_simd_default.cc => bpacking_simd_256.cc} |  28 ++-
 cpp/src/arrow/util/bpacking_simd_internal.h        |  64 ++++--
 cpp/src/arrow/util/bpacking_simd_kernel_internal.h |  98 +++++----
 cpp/src/arrow/util/bpacking_test.cc                | 244 +++++++++------------
 cpp/src/arrow/util/cpu_info.cc                     |  62 +++++-
 cpp/src/arrow/util/cpu_info.h                      |   4 +
 cpp/src/arrow/util/dispatch_internal.h             |  17 +-
 cpp/src/arrow/util/macros.h                        |   2 +-
 cpp/src/arrow/util/ubsan.h                         |  18 +-
 18 files changed, 498 insertions(+), 389 deletions(-)

diff --git a/cpp/cmake_modules/DefineOptions.cmake 
b/cpp/cmake_modules/DefineOptions.cmake
index 017a5a6efb..51b9fc8b2e 100644
--- a/cpp/cmake_modules/DefineOptions.cmake
+++ b/cpp/cmake_modules/DefineOptions.cmake
@@ -191,6 +191,9 @@ takes precedence over ccache if a storage backend is 
configured" ON)
                        "SSE4_2"
                        "AVX2"
                        "AVX512"
+                       "SVE128" # fixed size SVE
+                       "SVE256" # "
+                       "SVE512" # "
                        "MAX")
 
   define_option(ARROW_ALTIVEC "Build with Altivec if compiler has support" ON)
diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake 
b/cpp/cmake_modules/SetupCxxFlags.cmake
index c35fc6a6fe..7c02d53e62 100644
--- a/cpp/cmake_modules/SetupCxxFlags.cmake
+++ b/cpp/cmake_modules/SetupCxxFlags.cmake
@@ -134,7 +134,31 @@ elseif(ARROW_CPU_FLAG STREQUAL "ppc")
 elseif(ARROW_CPU_FLAG STREQUAL "aarch64")
   # Arm64 compiler flags, gcc/clang only
   set(ARROW_ARMV8_MARCH "armv8-a")
-  check_cxx_compiler_flag("-march=${ARROW_ARMV8_MARCH}+sve" CXX_SUPPORTS_SVE)
+  set(ARROW_SVE_FLAGS "-march=${ARROW_ARMV8_MARCH}+sve")
+  set(ARROW_SVE128_FLAGS "${ARROW_SVE_FLAGS}" "-msve-vector-bits=128")
+  set(ARROW_SVE256_FLAGS "${ARROW_SVE_FLAGS}" "-msve-vector-bits=256")
+  set(ARROW_SVE512_FLAGS "${ARROW_SVE_FLAGS}" "-msve-vector-bits=512")
+  # We only have a way to do SVE dynamic dispatch on Linux (BSD may be possible
+  # but is currently not implemented).
+  # We still support explicitly setting runtime SIMD level to some SVE values
+  # on these platforms as this can be useful in development for building SVE
+  # code locally. The compiler supports it but the code won't run.
+  if((APPLE OR WIN32) AND ARROW_RUNTIME_SIMD_LEVEL STREQUAL "MAX")
+    set(ARROW_RUNTIME_SIMD_LEVEL "NONE")
+  endif()
+  check_cxx_compiler_flag("${ARROW_SVE_FLAGS}" CXX_SUPPORTS_SVE)
+  if(CXX_SUPPORTS_SVE AND ARROW_RUNTIME_SIMD_LEVEL MATCHES 
"^(SVE128|SVE256|SVE512|MAX)$")
+    set(ARROW_HAVE_RUNTIME_SVE128 ON)
+    add_definitions(-DARROW_HAVE_RUNTIME_SVE128)
+  endif()
+  if(CXX_SUPPORTS_SVE AND ARROW_RUNTIME_SIMD_LEVEL MATCHES 
"^(SVE256|SVE512|MAX)$")
+    set(ARROW_HAVE_RUNTIME_SVE256 ON)
+    add_definitions(-DARROW_HAVE_RUNTIME_SVE256)
+  endif()
+  if(CXX_SUPPORTS_SVE AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(SVE512|MAX)$")
+    set(ARROW_HAVE_RUNTIME_SVE512 ON)
+    add_definitions(-DARROW_HAVE_RUNTIME_SVE512)
+  endif()
   if(ARROW_SIMD_LEVEL STREQUAL "DEFAULT")
     set(ARROW_SIMD_LEVEL "NEON")
   endif()
@@ -528,8 +552,7 @@ if(ARROW_CPU_FLAG STREQUAL "aarch64")
       if(NOT CXX_SUPPORTS_SVE)
         message(FATAL_ERROR "SVE required but compiler doesn't support it.")
       endif()
-      # -march=armv8-a+sve
-      set(ARROW_ARMV8_MARCH "${ARROW_ARMV8_MARCH}+sve")
+      set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_SVE_FLAGS}")
       string(REGEX MATCH "[0-9]+" SVE_VECTOR_BITS ${ARROW_SIMD_LEVEL})
       if(SVE_VECTOR_BITS)
         set(ARROW_HAVE_SVE${SVE_VECTOR_BITS} ON)
@@ -540,8 +563,9 @@ if(ARROW_CPU_FLAG STREQUAL "aarch64")
         set(ARROW_HAVE_SVE_SIZELESS ON)
         add_definitions(-DARROW_HAVE_SVE_SIZELESS)
       endif()
+    else() # ARM v8 without SVE
+      set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=${ARROW_ARMV8_MARCH}")
     endif()
-    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=${ARROW_ARMV8_MARCH}")
   elseif(NOT ARROW_SIMD_LEVEL STREQUAL "NONE")
     message(WARNING "ARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL} not supported by 
Arm.")
   endif()
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index eee63b11ca..0839cc1fa8 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -343,6 +343,27 @@ macro(append_runtime_avx512_src SRCS SRC)
   endif()
 endmacro()
 
+macro(append_runtime_sve128_src SRCS SRC)
+  if(ARROW_HAVE_RUNTIME_SVE128)
+    list(APPEND ${SRCS} ${SRC})
+    set_source_files_properties(${SRC} PROPERTIES COMPILE_OPTIONS 
"${ARROW_SVE128_FLAGS}")
+  endif()
+endmacro()
+
+macro(append_runtime_sve256_src SRCS SRC)
+  if(ARROW_HAVE_RUNTIME_SVE256)
+    list(APPEND ${SRCS} ${SRC})
+    set_source_files_properties(${SRC} PROPERTIES COMPILE_OPTIONS 
"${ARROW_SVE256_FLAGS}")
+  endif()
+endmacro()
+
+macro(append_runtime_sve512_src SRCS SRC)
+  if(ARROW_HAVE_RUNTIME_SVE512)
+    list(APPEND ${SRCS} ${SRC})
+    set_source_files_properties(${SRC} PROPERTIES COMPILE_OPTIONS 
"${ARROW_SVE512_FLAGS}")
+  endif()
+endmacro()
+
 # Write out compile-time configuration constants
 string(REPLACE "${CMAKE_SOURCE_DIR}" "<CMAKE_SOURCE_DIR>" REDACTED_CXX_FLAGS
                ${CMAKE_CXX_FLAGS})
@@ -498,7 +519,7 @@ set(ARROW_UTIL_SRCS
     util/bitmap_ops.cc
     util/bpacking.cc
     util/bpacking_scalar.cc
-    util/bpacking_simd_default.cc
+    util/bpacking_simd_128.cc
     util/byte_size.cc
     util/byte_stream_split_internal.cc
     util/cancel.cc
@@ -543,9 +564,12 @@ set(ARROW_UTIL_SRCS
 
 append_runtime_avx2_src(ARROW_UTIL_SRCS 
util/byte_stream_split_internal_avx2.cc)
 
-append_runtime_avx2_src(ARROW_UTIL_SRCS util/bpacking_simd_avx2.cc)
+append_runtime_avx2_src(ARROW_UTIL_SRCS util/bpacking_simd_256.cc)
 append_runtime_avx512_src(ARROW_UTIL_SRCS util/bpacking_simd_avx512.cc)
 
+append_runtime_sve128_src(ARROW_UTIL_SRCS util/bpacking_simd_128_alt.cc)
+append_runtime_sve256_src(ARROW_UTIL_SRCS util/bpacking_simd_256.cc)
+
 if(ARROW_WITH_BROTLI)
   list(APPEND ARROW_UTIL_SRCS util/compression_brotli.cc)
 endif()
diff --git a/cpp/src/arrow/meson.build b/cpp/src/arrow/meson.build
index fe7f11af6f..4b8faebecf 100644
--- a/cpp/src/arrow/meson.build
+++ b/cpp/src/arrow/meson.build
@@ -177,7 +177,7 @@ arrow_util_srcs = [
     'util/bitmap_ops.cc',
     'util/bpacking.cc',
     'util/bpacking_scalar.cc',
-    'util/bpacking_simd_default.cc',
+    'util/bpacking_simd_128.cc',
     'util/byte_size.cc',
     'util/byte_stream_split_internal.cc',
     'util/cancel.cc',
diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc
index e959a9f9c4..fb7fd19ae3 100644
--- a/cpp/src/arrow/util/bpacking.cc
+++ b/cpp/src/arrow/util/bpacking.cc
@@ -33,16 +33,29 @@ struct UnpackDynamicFunction {
 
   static constexpr auto implementations() {
     return std::array{
+    // x86 implementations
 #if defined(ARROW_HAVE_SSE4_2)
         Implementation{DispatchLevel::NONE, &bpacking::unpack_sse4_2<Uint>},
-#else
-        Implementation{DispatchLevel::NONE, &bpacking::unpack_scalar<Uint>},
-#endif
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
+#  if defined(ARROW_HAVE_RUNTIME_AVX2)
         Implementation{DispatchLevel::AVX2, &bpacking::unpack_avx2<Uint>},
-#endif
-#if defined(ARROW_HAVE_RUNTIME_AVX512)
+#  endif
+#  if defined(ARROW_HAVE_RUNTIME_AVX512)
         Implementation{DispatchLevel::AVX512, &bpacking::unpack_avx512<Uint>},
+#  endif
+
+    // ARM implementations
+#elif defined(ARROW_HAVE_NEON)
+        Implementation{DispatchLevel::NONE, &bpacking::unpack_neon<Uint>},
+#  if defined(ARROW_HAVE_RUNTIME_SVE128)
+        Implementation{DispatchLevel::SVE128, &bpacking::unpack_sve128<Uint>},
+#  endif
+#  if defined(ARROW_HAVE_RUNTIME_SVE256)
+        Implementation{DispatchLevel::SVE256, &bpacking::unpack_sve256<Uint>},
+#  endif
+
+    // Other implementations
+#else
+        Implementation{DispatchLevel::NONE, &bpacking::unpack_scalar<Uint>},
 #endif
     };
   }
@@ -52,12 +65,14 @@ struct UnpackDynamicFunction {
 
 template <typename Uint>
 void unpack(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
-#if defined(ARROW_HAVE_NEON)
-  return bpacking::unpack_neon(in, out, opts);
-#else
-  static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch;
-  return dispatch.func(in, out, opts);
-#endif
+  auto constexpr kImplementations = 
UnpackDynamicFunction<Uint>::implementations();
+  if constexpr (kImplementations.size() == 1) {
+    constexpr auto func = kImplementations.front().second;
+    func(in, out, opts);
+  } else {
+    static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch;
+    return dispatch.func(in, out, opts);
+  }
 }
 
 template void unpack<bool>(const uint8_t*, bool*, const UnpackOptions&);
diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc 
b/cpp/src/arrow/util/bpacking_benchmark.cc
index 354bc76ace..93d7cdf165 100644
--- a/cpp/src/arrow/util/bpacking_benchmark.cc
+++ b/cpp/src/arrow/util/bpacking_benchmark.cc
@@ -26,7 +26,7 @@
 #include "arrow/util/bpacking_scalar_internal.h"
 #include "arrow/util/bpacking_simd_internal.h"
 
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_SVE128)
 #  include "arrow/util/cpu_info.h"
 #endif
 
@@ -107,10 +107,10 @@ void BM_Unpack(benchmark::State& state, bool aligned, 
UnpackFunc<Int> unpack, bo
 // will not emit runs larger than 512 (though other implementation might), so 
we biased
 // the benchmarks towards a rather small scale.
 static const auto kNumValuesRange = benchmark::CreateRange(32, 512, 2);
-constexpr std::initializer_list<int64_t> kBitWidths8 = {1, 2, 8};
-constexpr std::initializer_list<int64_t> kBitWidths16 = {1, 2, 8, 13};
-constexpr std::initializer_list<int64_t> kBitWidths32 = {1, 2, 8, 20};
-constexpr std::initializer_list<int64_t> kBitWidths64 = {1, 2, 8, 20, 47};
+constexpr auto kBitWidths8 = std::initializer_list<int64_t>{1, 2, 8};
+constexpr auto kBitWidths16 = std::initializer_list<int64_t>{1, 2, 8, 13};
+constexpr auto kBitWidths32 = std::initializer_list<int64_t>{1, 2, 8, 20};
+constexpr auto kBitWidths64 = std::initializer_list<int64_t>{1, 2, 8, 20, 47};
 
 static const std::vector<std::vector<int64_t>> kBitWidthsNumValuesBool = {
     {0, 1},
@@ -159,125 +159,69 @@ void BM_UnpackUint64(benchmark::State& state, bool 
aligned, UnpackFunc<uint64_t>
   return BM_Unpack<uint64_t>(state, aligned, unpack, skip, 
std::move(skip_msg));
 }
 
-BENCHMARK_CAPTURE(BM_UnpackBool, ScalarUnaligned, false, 
&bpacking::unpack_scalar<bool>)
-    ->ArgsProduct(kBitWidthsNumValuesBool);
-BENCHMARK_CAPTURE(BM_UnpackUint8, ScalarUnaligned, false,
-                  &bpacking::unpack_scalar<uint8_t>)
-    ->ArgsProduct(kBitWidthsNumValues8);
-BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false,
-                  &bpacking::unpack_scalar<uint16_t>)
-    ->ArgsProduct(kBitWidthsNumValues16);
-BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false,
-                  &bpacking::unpack_scalar<uint32_t>)
-    ->ArgsProduct(kBitWidthsNumValues32);
-BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false,
-                  &bpacking::unpack_scalar<uint64_t>)
-    ->ArgsProduct(kBitWidthsNumValues64);
+// Register BM_Unpack{Bool,Uint8,Uint16,Uint32,Uint64} benchmarks for a given
+// UNPACK_FUNC templated on each of those types, with explicit skip args.
+#define BENCHMARK_UNPACK_ALL_TYPES_SKIP(LABEL, ALIGNED, UNPACK_FUNC, SKIP, 
SKIP_MSG)   \
+  BENCHMARK_CAPTURE(BM_UnpackBool, LABEL, ALIGNED, &UNPACK_FUNC<bool>, SKIP, 
SKIP_MSG) \
+      ->ArgsProduct(kBitWidthsNumValuesBool);                                  
        \
+  BENCHMARK_CAPTURE(BM_UnpackUint8, LABEL, ALIGNED, &UNPACK_FUNC<uint8_t>, 
SKIP,       \
+                    SKIP_MSG)                                                  
        \
+      ->ArgsProduct(kBitWidthsNumValues8);                                     
        \
+  BENCHMARK_CAPTURE(BM_UnpackUint16, LABEL, ALIGNED, &UNPACK_FUNC<uint16_t>, 
SKIP,     \
+                    SKIP_MSG)                                                  
        \
+      ->ArgsProduct(kBitWidthsNumValues16);                                    
        \
+  BENCHMARK_CAPTURE(BM_UnpackUint32, LABEL, ALIGNED, &UNPACK_FUNC<uint32_t>, 
SKIP,     \
+                    SKIP_MSG)                                                  
        \
+      ->ArgsProduct(kBitWidthsNumValues32);                                    
        \
+  BENCHMARK_CAPTURE(BM_UnpackUint64, LABEL, ALIGNED, &UNPACK_FUNC<uint64_t>, 
SKIP,     \
+                    SKIP_MSG)                                                  
        \
+      ->ArgsProduct(kBitWidthsNumValues64)
+
+#define BENCHMARK_UNPACK_ALL_TYPES(LABEL, ALIGNED, UNPACK_FUNC) \
+  BENCHMARK_UNPACK_ALL_TYPES_SKIP(LABEL, ALIGNED, UNPACK_FUNC, false, "")
+
+#define BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(LABEL, ALIGNED, UNPACK_FUNC, 
CPU_FEATURE, \
+                                           SKIP_MSG)                           
      \
+  BENCHMARK_UNPACK_ALL_TYPES_SKIP(                                             
      \
+      LABEL, ALIGNED, UNPACK_FUNC,                                             
      \
+      !CpuInfo::GetInstance()->IsSupported(CpuInfo::CPU_FEATURE), SKIP_MSG)
+
+BENCHMARK_UNPACK_ALL_TYPES(ScalarUnaligned, false, bpacking::unpack_scalar);
 
 #if defined(ARROW_HAVE_SSE4_2)
-BENCHMARK_CAPTURE(BM_UnpackBool, Sse42Unaligned, false, 
&bpacking::unpack_sse4_2<bool>)
-    ->ArgsProduct(kBitWidthsNumValuesBool);
-BENCHMARK_CAPTURE(BM_UnpackUint8, Sse42Unaligned, false,
-                  &bpacking::unpack_sse4_2<uint8_t>)
-    ->ArgsProduct(kBitWidthsNumValues8);
-BENCHMARK_CAPTURE(BM_UnpackUint16, Sse42Unaligned, false,
-                  &bpacking::unpack_sse4_2<uint16_t>)
-    ->ArgsProduct(kBitWidthsNumValues16);
-BENCHMARK_CAPTURE(BM_UnpackUint32, Sse42Unaligned, false,
-                  &bpacking::unpack_sse4_2<uint32_t>)
-    ->ArgsProduct(kBitWidthsNumValues32);
-BENCHMARK_CAPTURE(BM_UnpackUint64, Sse42Unaligned, false,
-                  &bpacking::unpack_sse4_2<uint64_t>)
-    ->ArgsProduct(kBitWidthsNumValues64);
+BENCHMARK_UNPACK_ALL_TYPES(Sse42Unaligned, false, bpacking::unpack_sse4_2);
 #endif
 
 #if defined(ARROW_HAVE_RUNTIME_AVX2)
-BENCHMARK_CAPTURE(BM_UnpackBool, Avx2Unaligned, false, 
&bpacking::unpack_avx2<bool>,
-                  !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
-                  "Avx2 not available")
-    ->ArgsProduct(kBitWidthsNumValuesBool);
-BENCHMARK_CAPTURE(BM_UnpackUint8, Avx2Unaligned, false, 
&bpacking::unpack_avx2<uint8_t>,
-                  !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
-                  "Avx2 not available")
-    ->ArgsProduct(kBitWidthsNumValues8);
-BENCHMARK_CAPTURE(BM_UnpackUint16, Avx2Unaligned, false, 
&bpacking::unpack_avx2<uint16_t>,
-                  !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
-                  "Avx2 not available")
-    ->ArgsProduct(kBitWidthsNumValues16);
-BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, 
&bpacking::unpack_avx2<uint32_t>,
-                  !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
-                  "Avx2 not available")
-    ->ArgsProduct(kBitWidthsNumValues32);
-BENCHMARK_CAPTURE(BM_UnpackUint64, Avx2Unaligned, false, 
&bpacking::unpack_avx2<uint64_t>,
-                  !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
-                  "Avx2 not available")
-    ->ArgsProduct(kBitWidthsNumValues64);
+BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Avx2Unaligned, false, 
bpacking::unpack_avx2, AVX2,
+                                   "Avx2 not available");
 #endif
 
 #if defined(ARROW_HAVE_RUNTIME_AVX512)
-BENCHMARK_CAPTURE(BM_UnpackBool, Avx512Unaligned, false, 
&bpacking::unpack_avx512<bool>,
-                  !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
-                  "Avx512 not available")
-    ->ArgsProduct(kBitWidthsNumValuesBool);
-BENCHMARK_CAPTURE(BM_UnpackUint8, Avx512Unaligned, false,
-                  &bpacking::unpack_avx512<uint8_t>,
-                  !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
-                  "Avx512 not available")
-    ->ArgsProduct(kBitWidthsNumValues8);
-BENCHMARK_CAPTURE(BM_UnpackUint16, Avx512Unaligned, false,
-                  &bpacking::unpack_avx512<uint16_t>,
-                  !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
-                  "Avx512 not available")
-    ->ArgsProduct(kBitWidthsNumValues16);
-BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false,
-                  &bpacking::unpack_avx512<uint32_t>,
-                  !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
-                  "Avx512 not available")
-    ->ArgsProduct(kBitWidthsNumValues32);
-BENCHMARK_CAPTURE(BM_UnpackUint64, Avx512Unaligned, false,
-                  &bpacking::unpack_avx512<uint64_t>,
-                  !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
-                  "Avx512 not available")
-    ->ArgsProduct(kBitWidthsNumValues64);
+BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Avx512Unaligned, false, 
bpacking::unpack_avx512,
+                                   AVX512, "Avx512 not available");
 #endif
 
 #if defined(ARROW_HAVE_NEON)
-BENCHMARK_CAPTURE(BM_UnpackBool, NeonUnaligned, false, 
&bpacking::unpack_neon<bool>)
-    ->ArgsProduct(kBitWidthsNumValuesBool);
-BENCHMARK_CAPTURE(BM_UnpackUint8, NeonUnaligned, false, 
&bpacking::unpack_neon<uint8_t>)
-    ->ArgsProduct(kBitWidthsNumValues8);
-BENCHMARK_CAPTURE(BM_UnpackUint16, NeonUnaligned, false, 
&bpacking::unpack_neon<uint16_t>)
-    ->ArgsProduct(kBitWidthsNumValues16);
-BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, 
&bpacking::unpack_neon<uint32_t>)
-    ->ArgsProduct(kBitWidthsNumValues32);
-BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, 
&bpacking::unpack_neon<uint64_t>)
-    ->ArgsProduct(kBitWidthsNumValues64);
+BENCHMARK_UNPACK_ALL_TYPES(NeonUnaligned, false, bpacking::unpack_neon);
+#endif
+
+#if defined(ARROW_HAVE_RUNTIME_SVE128)
+BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Sve128Unaligned, false, 
bpacking::unpack_sve128,
+                                   SVE128, "Sve128 not available");
 #endif
 
-BENCHMARK_CAPTURE(BM_UnpackBool, DynamicAligned, true, &unpack<bool>)
-    ->ArgsProduct(kBitWidthsNumValuesBool);
-BENCHMARK_CAPTURE(BM_UnpackBool, DynamicUnaligned, false, &unpack<bool>)
-    ->ArgsProduct(kBitWidthsNumValuesBool);
-
-BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicAligned, true, &unpack<uint8_t>)
-    ->ArgsProduct(kBitWidthsNumValues8);
-BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicUnaligned, false, &unpack<uint8_t>)
-    ->ArgsProduct(kBitWidthsNumValues8);
-
-BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicAligned, true, &unpack<uint16_t>)
-    ->ArgsProduct(kBitWidthsNumValues16);
-BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicUnaligned, false, &unpack<uint16_t>)
-    ->ArgsProduct(kBitWidthsNumValues16);
-
-BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicAligned, true, &unpack<uint32_t>)
-    ->ArgsProduct(kBitWidthsNumValues32);
-BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicUnaligned, false, &unpack<uint32_t>)
-    ->ArgsProduct(kBitWidthsNumValues32);
-
-BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicAligned, true, &unpack<uint64_t>)
-    ->ArgsProduct(kBitWidthsNumValues64);
-BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicUnaligned, false, &unpack<uint64_t>)
-    ->ArgsProduct(kBitWidthsNumValues64);
+#if defined(ARROW_HAVE_RUNTIME_SVE256)
+BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Sve256Unaligned, false, 
bpacking::unpack_sve256,
+                                   SVE256, "Sve256 not available");
+#endif
+
+BENCHMARK_UNPACK_ALL_TYPES(DynamicAligned, true, unpack);
+BENCHMARK_UNPACK_ALL_TYPES(DynamicUnaligned, false, unpack);
+
+#undef BENCHMARK_UNPACK_ALL_TYPES_RUNTIME
+#undef BENCHMARK_UNPACK_ALL_TYPES
+#undef BENCHMARK_UNPACK_ALL_TYPES_SKIP
 
 }  // namespace
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h 
b/cpp/src/arrow/util/bpacking_dispatch_internal.h
index 561bbbe7b9..6ea6adee18 100644
--- a/cpp/src/arrow/util/bpacking_dispatch_internal.h
+++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h
@@ -32,14 +32,14 @@ namespace arrow::internal::bpacking {
 
 /// Unpack a zero bit packed array.
 template <typename Uint>
-void unpack_null(const uint8_t* in, Uint* out, int batch_size) {
+ARROW_FORCE_INLINE void unpack_null(const uint8_t* in, Uint* out, int 
batch_size) {
   std::memset(out, 0, batch_size * sizeof(Uint));
 }
 
 /// Unpack a packed array where packed and unpacked values have exactly the 
same number of
 /// bits.
 template <typename Uint>
-void unpack_full(const uint8_t* in, Uint* out, int batch_size) {
+ARROW_FORCE_INLINE void unpack_full(const uint8_t* in, Uint* out, int 
batch_size) {
   if constexpr (ARROW_LITTLE_ENDIAN == 1) {
     std::memcpy(out, in, batch_size * sizeof(Uint));
   } else {
@@ -60,7 +60,7 @@ void unpack_full(const uint8_t* in, Uint* out, int 
batch_size) {
 /// will be split on the first byte boundary (hence having a spread of two 
bytes) while
 /// four bit integer will be well behaved and never spread over byte boundary 
(hence
 /// having a spread of one).
-constexpr int PackedMaxSpreadBytes(int width, int bit_offset) {
+ARROW_FORCE_INLINE constexpr int PackedMaxSpreadBytes(int width, int 
bit_offset) {
   int max = static_cast<int>(bit_util::BytesForBits(width));
   int start = bit_offset;
   do {
@@ -75,7 +75,7 @@ constexpr int PackedMaxSpreadBytes(int width, int bit_offset) 
{
 
 /// Compute the maximum spread in bytes that a packed integer can cover across 
all bit
 /// offsets.
-constexpr int PackedMaxSpreadBytes(int width) {
+ARROW_FORCE_INLINE constexpr int PackedMaxSpreadBytes(int width) {
   int max = 0;
   for (int offset = 0; offset < 8; ++offset) {
     const int spread = PackedMaxSpreadBytes(width, offset);
@@ -97,7 +97,8 @@ using SpreadBufferUint = std::conditional_t<
 /// In prolog mode, instead of unpacking all required element, the function 
will
 /// stop if it finds a byte aligned value start.
 template <int kPackedBitWidth, bool kIsProlog, typename Uint>
-int unpack_exact(const uint8_t* in, Uint* out, int batch_size, int bit_offset) 
{
+ARROW_FORCE_INLINE int unpack_exact(const uint8_t* in, const uint8_t* in_end, 
Uint* out,
+                                    int batch_size, int bit_offset) {
   static_assert(kPackedBitWidth > 0);
 
   // For the epilog we adapt the max spread since better alignment give 
shorter spreads
@@ -127,15 +128,28 @@ int unpack_exact(const uint8_t* in, Uint* out, int 
batch_size, int bit_offset) {
     ARROW_COMPILER_ASSUME(spread_bytes <= kMaxSpreadBytes);
 
     // Reading the bytes for the current value.
-    // Must be careful not to read out of input bounds.
     buffer_uint buffer = 0;
-    if constexpr (kLarge) {
-      // We read the max possible bytes in the first pass and handle the rest 
after.
-      // Even though the worst spread does not happen on all iterations we can 
still read
-      // all bytes because we will mask them.
-      std::memcpy(&buffer, in + start_byte, std::min(kBufferSize, 
spread_bytes));
+    if (ARROW_PREDICT_TRUE(in + start_byte + kBufferSize < in_end)) {
+      // Fast path: we read the whole buffer. In all but the last few reads 
(plural!) we will
+      // always have enough bytes left in the buffer to avoid out-of-bounds 
reads. On top
+      // of this, in Arrow, `unpack` is always called with `max_read_bytes` 
set, meaning
+      // this will often be the *only* path taken.
+      // We added this special case because `std::memcpy` without a compile 
time constant
+      // was not inlined and optimized properly by the compiler, resulting in 
a function
+      // call on each iteration.
+      // This also handles the `kLarge` case detailed below.
+      std::memcpy(&buffer, in + start_byte, kBufferSize);
     } else {
-      std::memcpy(&buffer, in + start_byte, spread_bytes);
+      // Slow path, we need to read exactly the correct number of bytes to 
avoid
+      // out-of-bounds reads.
+      if constexpr (kLarge) {
+        // We read the max possible bytes in the first pass and handle the 
rest after.
+        // Even though the worst spread does not happen on all iterations we 
can still
+        // read all bytes because we will mask them.
+        std::memcpy(&buffer, in + start_byte, std::min(kBufferSize, 
spread_bytes));
+      } else {
+        std::memcpy(&buffer, in + start_byte, spread_bytes);
+      }
     }
 
     buffer = bit_util::FromLittleEndian(buffer);
@@ -192,7 +206,8 @@ void unpack_width(const uint8_t* in, UnpackedUInt* out, int 
batch_size, int bit_
     const uint8_t* in_end = in + (max_read_bytes >= 0 ? max_read_bytes : 
bytes_batch);
 
     // In case of misalignment, we need to run the prolog until aligned.
-    int extracted = unpack_exact<kPackedBitWidth, true>(in, out, batch_size, 
bit_offset);
+    int extracted =
+        unpack_exact<kPackedBitWidth, true>(in, in_end, out, batch_size, 
bit_offset);
     // We either extracted everything or found a alignment
     const int start_bit = extracted * kPackedBitWidth + bit_offset;
     ARROW_DCHECK((extracted == batch_size) || ((start_bit) % 8 == 0));
@@ -230,7 +245,8 @@ void unpack_width(const uint8_t* in, UnpackedUInt* out, int 
batch_size, int bit_
       // Running the epilog for the remaining values that don't fit in a kernel
       ARROW_DCHECK_GE(batch_size, 0);
       ARROW_COMPILER_ASSUME(batch_size >= 0);
-      unpack_exact<kPackedBitWidth, false>(in, out, batch_size, /* bit_offset= 
*/ 0);
+      unpack_exact<kPackedBitWidth, false>(in, in_end, out, batch_size,
+                                           /* bit_offset= */ 0);
     }
   }
 }
diff --git a/cpp/src/arrow/util/bpacking_simd_default.cc 
b/cpp/src/arrow/util/bpacking_simd_128.cc
similarity index 89%
copy from cpp/src/arrow/util/bpacking_simd_default.cc
copy to cpp/src/arrow/util/bpacking_simd_128.cc
index 61adee52a3..1bc756b2aa 100644
--- a/cpp/src/arrow/util/bpacking_simd_default.cc
+++ b/cpp/src/arrow/util/bpacking_simd_128.cc
@@ -17,8 +17,10 @@
 
 #if defined(ARROW_HAVE_NEON)
 #  define UNPACK_PLATFORM unpack_neon
+#  define KERNEL_PLATFORM KernelNeon
 #elif defined(ARROW_HAVE_SSE4_2)
 #  define UNPACK_PLATFORM unpack_sse4_2
+#  define KERNEL_PLATFORM KernelSse42
 #endif
 
 #if defined(UNPACK_PLATFORM)
@@ -30,11 +32,11 @@
 namespace arrow::internal::bpacking {
 
 template <typename UnpackedUint, int kPackedBitSize>
-using Simd128Kernel = Kernel<UnpackedUint, kPackedBitSize, 128>;
+using KERNEL_PLATFORM = Kernel<UnpackedUint, kPackedBitSize, 
xsimd::default_arch>;
 
 template <typename Uint>
 void UNPACK_PLATFORM(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
-  return unpack_jump<Simd128Kernel>(in, out, opts);
+  return unpack_jump<KERNEL_PLATFORM>(in, out, opts);
 }
 
 template void UNPACK_PLATFORM<bool>(const uint8_t*, bool*, const 
UnpackOptions&);
diff --git a/cpp/src/arrow/util/bpacking_simd_avx2.cc 
b/cpp/src/arrow/util/bpacking_simd_128_alt.cc
similarity index 54%
rename from cpp/src/arrow/util/bpacking_simd_avx2.cc
rename to cpp/src/arrow/util/bpacking_simd_128_alt.cc
index de1f228aec..bd4799d3cd 100644
--- a/cpp/src/arrow/util/bpacking_simd_avx2.cc
+++ b/cpp/src/arrow/util/bpacking_simd_128_alt.cc
@@ -15,25 +15,37 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#if defined(ARROW_HAVE_RUNTIME_SVE128)
+#  define UNPACK_PLATFORM unpack_sve128
+#  define KERNEL_PLATFORM KernelSve128
+#endif
+
+#if !defined(UNPACK_PLATFORM)
+#  error "This file must be compiled with a known SIMD micro architecture"
+#endif
+
+#include <xsimd/xsimd.hpp>
+
 #include "arrow/util/bpacking_dispatch_internal.h"
-#include "arrow/util/bpacking_internal.h"
 #include "arrow/util/bpacking_simd_internal.h"
 #include "arrow/util/bpacking_simd_kernel_internal.h"
 
 namespace arrow::internal::bpacking {
 
 template <typename UnpackedUint, int kPackedBitSize>
-using Simd256Kernel = Kernel<UnpackedUint, kPackedBitSize, 256>;
+using KERNEL_PLATFORM = Kernel<UnpackedUint, kPackedBitSize, 
xsimd::detail::sve<128>>;
 
 template <typename Uint>
-void unpack_avx2(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
-  return unpack_jump<Simd256Kernel>(in, out, opts);
+void UNPACK_PLATFORM(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
+  return unpack_jump<KERNEL_PLATFORM>(in, out, opts);
 }
 
-template void unpack_avx2<bool>(const uint8_t*, bool*, const UnpackOptions&);
-template void unpack_avx2<uint8_t>(const uint8_t*, uint8_t*, const 
UnpackOptions&);
-template void unpack_avx2<uint16_t>(const uint8_t*, uint16_t*, const 
UnpackOptions&);
-template void unpack_avx2<uint32_t>(const uint8_t*, uint32_t*, const 
UnpackOptions&);
-template void unpack_avx2<uint64_t>(const uint8_t*, uint64_t*, const 
UnpackOptions&);
+template void UNPACK_PLATFORM<bool>(const uint8_t*, bool*, const 
UnpackOptions&);
+template void UNPACK_PLATFORM<uint8_t>(const uint8_t*, uint8_t*, const 
UnpackOptions&);
+template void UNPACK_PLATFORM<uint16_t>(const uint8_t*, uint16_t*, const 
UnpackOptions&);
+template void UNPACK_PLATFORM<uint32_t>(const uint8_t*, uint32_t*, const 
UnpackOptions&);
+template void UNPACK_PLATFORM<uint64_t>(const uint8_t*, uint64_t*, const 
UnpackOptions&);
 
 }  // namespace arrow::internal::bpacking
+
+#undef UNPACK_PLATFORM
diff --git a/cpp/src/arrow/util/bpacking_simd_default.cc 
b/cpp/src/arrow/util/bpacking_simd_256.cc
similarity index 67%
rename from cpp/src/arrow/util/bpacking_simd_default.cc
rename to cpp/src/arrow/util/bpacking_simd_256.cc
index 61adee52a3..8c5ce4dc7d 100644
--- a/cpp/src/arrow/util/bpacking_simd_default.cc
+++ b/cpp/src/arrow/util/bpacking_simd_256.cc
@@ -15,26 +15,31 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#if defined(ARROW_HAVE_NEON)
-#  define UNPACK_PLATFORM unpack_neon
-#elif defined(ARROW_HAVE_SSE4_2)
-#  define UNPACK_PLATFORM unpack_sse4_2
+#if defined(ARROW_HAVE_SVE256) || defined(ARROW_HAVE_RUNTIME_SVE256)
+#  define UNPACK_PLATFORM unpack_sve256
+#  define KERNEL_PLATFORM KernelSve256
+#elif defined(ARROW_HAVE_RUNTIME_AVX2)
+#  define UNPACK_PLATFORM unpack_avx2
+#  define KERNEL_PLATFORM KernelAvx2
 #endif
 
-#if defined(UNPACK_PLATFORM)
+#if !defined(UNPACK_PLATFORM)
+#  error "This file must be compiled with a known SIMD micro architecture"
+#endif
 
-#  include "arrow/util/bpacking_dispatch_internal.h"
-#  include "arrow/util/bpacking_simd_internal.h"
-#  include "arrow/util/bpacking_simd_kernel_internal.h"
+#include "arrow/util/bpacking_dispatch_internal.h"
+#include "arrow/util/bpacking_internal.h"
+#include "arrow/util/bpacking_simd_internal.h"
+#include "arrow/util/bpacking_simd_kernel_internal.h"
 
 namespace arrow::internal::bpacking {
 
 template <typename UnpackedUint, int kPackedBitSize>
-using Simd128Kernel = Kernel<UnpackedUint, kPackedBitSize, 128>;
+using KERNEL_PLATFORM = Kernel<UnpackedUint, kPackedBitSize, 
xsimd::default_arch>;
 
 template <typename Uint>
 void UNPACK_PLATFORM(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
-  return unpack_jump<Simd128Kernel>(in, out, opts);
+  return unpack_jump<KERNEL_PLATFORM>(in, out, opts);
 }
 
 template void UNPACK_PLATFORM<bool>(const uint8_t*, bool*, const 
UnpackOptions&);
@@ -45,5 +50,4 @@ template void UNPACK_PLATFORM<uint64_t>(const uint8_t*, 
uint64_t*, const UnpackO
 
 }  // namespace arrow::internal::bpacking
 
-#  undef UNPACK_PLATFORM
-#endif  // UNPACK_PLATFORM
+#undef UNPACK_PLATFORM
diff --git a/cpp/src/arrow/util/bpacking_simd_internal.h 
b/cpp/src/arrow/util/bpacking_simd_internal.h
index 44ad4b0f86..d5a81baaec 100644
--- a/cpp/src/arrow/util/bpacking_simd_internal.h
+++ b/cpp/src/arrow/util/bpacking_simd_internal.h
@@ -25,68 +25,90 @@
 namespace arrow::internal::bpacking {
 
 #if defined(ARROW_HAVE_NEON)
+#  define UNPACK_ARCH128 unpack_neon
+#elif defined(ARROW_HAVE_SSE4_2)
+#  define UNPACK_ARCH128 unpack_sse4_2
+#endif
+
+#if defined(UNPACK_ARCH128)
 
 template <typename Uint>
-ARROW_EXPORT void unpack_neon(const uint8_t* in, Uint* out, const 
UnpackOptions& opts);
+ARROW_EXPORT void UNPACK_ARCH128(const uint8_t* in, Uint* out, const 
UnpackOptions& opts);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_neon<bool>(  //
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128<bool>(  //
     const uint8_t* in, bool* out, const UnpackOptions& opts);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint8_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128<uint8_t>(
     const uint8_t* in, uint8_t* out, const UnpackOptions& opts);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint16_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128<uint16_t>(
     const uint8_t* in, uint16_t* out, const UnpackOptions& opts);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint32_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128<uint32_t>(
     const uint8_t* in, uint32_t* out, const UnpackOptions& opts);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint64_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128<uint64_t>(
     const uint8_t* in, uint64_t* out, const UnpackOptions& opts);
 
-#elif defined(ARROW_HAVE_SSE4_2)
+#endif  // UNPACK_ARCH128
+#undef UNPACK_ARCH128
+
+#if defined(ARROW_HAVE_RUNTIME_SVE128)
+#  define UNPACK_ARCH128_ALT unpack_sve128
+#endif
+
+#if defined(UNPACK_ARCH128_ALT)
 
 template <typename Uint>
-ARROW_EXPORT void unpack_sse4_2(const uint8_t* in, Uint* out, const 
UnpackOptions& opts);
+ARROW_EXPORT void UNPACK_ARCH128_ALT(const uint8_t* in, Uint* out,
+                                     const UnpackOptions& opts);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2<bool>(  //
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128_ALT<bool>(  //
     const uint8_t* in, bool* out, const UnpackOptions& opts);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2<uint8_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128_ALT<uint8_t>(
     const uint8_t* in, uint8_t* out, const UnpackOptions& opts);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2<uint16_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128_ALT<uint16_t>(
     const uint8_t* in, uint16_t* out, const UnpackOptions& opts);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2<uint32_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128_ALT<uint32_t>(
     const uint8_t* in, uint32_t* out, const UnpackOptions& opts);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2<uint64_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128_ALT<uint64_t>(
     const uint8_t* in, uint64_t* out, const UnpackOptions& opts);
 
+#endif  // UNPACK_ARCH128_ALT
+#undef UNPACK_ARCH128_ALT
+
+#if defined(ARROW_HAVE_SVE256) || defined(ARROW_HAVE_RUNTIME_SVE256)
+#  define UNPACK_ARCH256 unpack_sve256
+#elif defined(ARROW_HAVE_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX2)
+#  define UNPACK_ARCH256 unpack_avx2
 #endif
 
-#if defined(ARROW_HAVE_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX2)
+#if defined(UNPACK_ARCH256)
 
 template <typename Uint>
-ARROW_EXPORT void unpack_avx2(const uint8_t* in, Uint* out, const 
UnpackOptions& opts);
+ARROW_EXPORT void UNPACK_ARCH256(const uint8_t* in, Uint* out, const 
UnpackOptions& opts);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<bool>(  //
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256<bool>(  //
     const uint8_t* in, bool* out, const UnpackOptions& opts);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint8_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256<uint8_t>(
     const uint8_t* in, uint8_t* out, const UnpackOptions& opts);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint16_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256<uint16_t>(
     const uint8_t* in, uint16_t* out, const UnpackOptions& opts);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint32_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256<uint32_t>(
     const uint8_t* in, uint32_t* out, const UnpackOptions& opts);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint64_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256<uint64_t>(
     const uint8_t* in, uint64_t* out, const UnpackOptions& opts);
 
-#endif
+#endif  // UNPACK_ARCH256
+#undef UNPACK_ARCH256
 
 #if defined(ARROW_HAVE_AVX512) || defined(ARROW_HAVE_RUNTIME_AVX512)
 
diff --git a/cpp/src/arrow/util/bpacking_simd_kernel_internal.h 
b/cpp/src/arrow/util/bpacking_simd_kernel_internal.h
index 318f348b4a..fe879bb5b0 100644
--- a/cpp/src/arrow/util/bpacking_simd_kernel_internal.h
+++ b/cpp/src/arrow/util/bpacking_simd_kernel_internal.h
@@ -37,12 +37,13 @@
 
 #include "arrow/util/bit_util.h"
 #include "arrow/util/bpacking_dispatch_internal.h"
+#include "arrow/util/macros.h"
 #include "arrow/util/type_traits.h"
 
 namespace arrow::internal::bpacking {
 
 template <typename T, std::size_t N>
-constexpr std::array<T, N> BuildConstantArray(T val) {
+ARROW_FORCE_INLINE constexpr std::array<T, N> BuildConstantArray(T val) {
   std::array<T, N> out = {};
   for (auto& v : out) {
     v = val;
@@ -51,7 +52,7 @@ constexpr std::array<T, N> BuildConstantArray(T val) {
 }
 
 template <typename Arr>
-constexpr Arr BuildConstantArrayLike(typename Arr::value_type val) {
+ARROW_FORCE_INLINE constexpr Arr BuildConstantArrayLike(typename 
Arr::value_type val) {
   return BuildConstantArray<typename Arr::value_type, 
std::tuple_size_v<Arr>>(val);
 }
 
@@ -61,7 +62,7 @@ constexpr Arr BuildConstantArrayLike(typename Arr::value_type 
val) {
 
 /// Simple constexpr maximum element suited for non empty arrays.
 template <typename T, std::size_t N>
-constexpr T max_value(const std::array<T, N>& arr) {
+ARROW_FORCE_INLINE constexpr T max_value(const std::array<T, N>& arr) {
   static_assert(N > 0);
   T out = 0;
   for (const T& v : arr) {
@@ -73,7 +74,8 @@ constexpr T max_value(const std::array<T, N>& arr) {
 }
 
 template <std::array kArr, typename Arch, std::size_t... Is>
-constexpr auto array_to_batch_constant_impl(std::index_sequence<Is...>) {
+ARROW_FORCE_INLINE constexpr auto array_to_batch_constant_impl(
+    std::index_sequence<Is...>) {
   using Array = std::decay_t<decltype(kArr)>;
   using value_type = typename Array::value_type;
 
@@ -82,20 +84,20 @@ constexpr auto 
array_to_batch_constant_impl(std::index_sequence<Is...>) {
 
 /// Make a ``xsimd::batch_constant`` from a static constexpr array.
 template <std::array kArr, typename Arch>
-constexpr auto array_to_batch_constant() {
+ARROW_FORCE_INLINE constexpr auto array_to_batch_constant() {
   return array_to_batch_constant_impl<kArr, Arch>(
       std::make_index_sequence<kArr.size()>());
 }
 
 template <typename Uint, typename Arch>
-xsimd::batch<uint8_t, Arch> load_val_as(const uint8_t* in) {
+ARROW_FORCE_INLINE xsimd::batch<uint8_t, Arch> load_val_as(const uint8_t* in) {
   const Uint val = util::SafeLoadAs<Uint>(in);
   const auto batch = xsimd::batch<Uint, Arch>(val);
   return xsimd::bitwise_cast<uint8_t>(batch);
 }
 
 template <int kBytes, typename Arch>
-xsimd::batch<uint8_t, Arch> safe_load_bytes(const uint8_t* in) {
+ARROW_FORCE_INLINE xsimd::batch<uint8_t, Arch> safe_load_bytes(const uint8_t* 
in) {
   if constexpr (kBytes <= sizeof(uint64_t)) {
     return load_val_as<SizedUint<kBytes>, Arch>(in);
   }
@@ -104,7 +106,7 @@ xsimd::batch<uint8_t, Arch> safe_load_bytes(const uint8_t* 
in) {
 }
 
 template <typename Int, int kOffset, int kLength, typename Arr>
-constexpr auto select_stride_impl(Arr shifts) {
+ARROW_FORCE_INLINE constexpr auto select_stride_impl(Arr shifts) {
   std::array<Int, shifts.size() / kLength> out{};
   for (std::size_t i = 0; i < out.size(); ++i) {
     out[i] = shifts[kLength * i + kOffset];
@@ -131,7 +133,8 @@ constexpr auto select_stride_impl(Arr shifts) {
 /// while an offset of 1 would return the values:
 ///         |1|3|5|7|
 template <typename ToInt, int kOffset, typename Int, typename Arch, Int... 
kShifts>
-constexpr auto select_stride(xsimd::batch_constant<Int, Arch, kShifts...>) {
+ARROW_FORCE_INLINE constexpr auto select_stride(
+    xsimd::batch_constant<Int, Arch, kShifts...>) {
   static_assert(kOffset < sizeof(ToInt) / sizeof(Int));
   constexpr auto kStridesArr =
       select_stride_impl<ToInt, kOffset, sizeof(ToInt) / sizeof(Int)>(
@@ -163,8 +166,8 @@ constexpr bool IsNeon = std::is_base_of_v<xsimd::neon, 
Arch>;
 /// TODO(xsimd) Tracking in https://github.com/xtensor-stack/xsimd/pull/1220
 /// When migrating, be sure to use batch_constant overload, and not the batch 
one.
 template <typename Arch, typename Int, Int... kShifts>
-auto left_shift(const xsimd::batch<Int, Arch>& batch,
-                xsimd::batch_constant<Int, Arch, kShifts...> shifts)
+ARROW_FORCE_INLINE auto left_shift(const xsimd::batch<Int, Arch>& batch,
+                                   xsimd::batch_constant<Int, Arch, 
kShifts...> shifts)
     -> xsimd::batch<Int, Arch> {
   constexpr bool kIsSse2 = IsSse2<Arch>;
   constexpr bool kIsAvx2 = IsAvx2<Arch>;
@@ -227,8 +230,9 @@ auto left_shift(const xsimd::batch<Int, Arch>& batch,
 /// integers per second through vectorization, Software Practice & Experience 
45 (1),
 /// 2015. http://arxiv.org/abs/1209.2137
 template <typename Arch, typename Int, Int... kShifts>
-auto right_shift_by_excess(const xsimd::batch<Int, Arch>& batch,
-                           xsimd::batch_constant<Int, Arch, kShifts...> 
shifts) {
+ARROW_FORCE_INLINE auto right_shift_by_excess(
+    const xsimd::batch<Int, Arch>& batch,
+    xsimd::batch_constant<Int, Arch, kShifts...> shifts) {
   constexpr bool kIsSse2 = IsSse2<Arch>;
   constexpr bool kIsAvx2 = IsAvx2<Arch>;
   static_assert(
@@ -297,8 +301,9 @@ auto right_shift_by_excess(const xsimd::batch<Int, Arch>& 
batch,
 ///
 /// @see KernelShape
 /// @see PackedMaxSpreadBytes
-constexpr bool PackedIsOversizedForSimd(int simd_bit_size, int 
unpacked_bit_size,
-                                        int packed_bit_size) {
+ARROW_FORCE_INLINE constexpr bool PackedIsOversizedForSimd(int simd_bit_size,
+                                                           int 
unpacked_bit_size,
+                                                           int 
packed_bit_size) {
   const int unpacked_per_simd = simd_bit_size / unpacked_bit_size;
 
   const auto packed_per_read_for_offset = [&](int bit_offset) -> int {
@@ -382,10 +387,18 @@ struct KernelShape {
 };
 
 /// Packing all useful and derived information about a kernel in a single type.
-template <typename UnpackedUint, int kPackedBitSize, int kSimdBitSize>
+template <typename UnpackedUint, int kPackedBitSize, typename Arch>
 struct KernelTraits {
+  using unpacked_type = UnpackedUint;
+  /// The integer type to work with, `unpacked_type` or an appropriate type 
for bool.
+  using uint_type = std::conditional_t<std::is_same_v<unpacked_type, bool>,
+                                       SizedUint<sizeof(bool)>, unpacked_type>;
+  using arch_type = Arch;
+  using simd_batch = xsimd::batch<uint_type, arch_type>;
+  using simd_bytes = xsimd::batch<uint8_t, arch_type>;
+
   static constexpr KernelShape kShape = {
-      .simd_bit_size_ = kSimdBitSize,
+      .simd_bit_size_ = 8 * simd_bytes ::size,
       .unpacked_bit_size_ = 8 * sizeof(UnpackedUint),
       .packed_bit_size_ = kPackedBitSize,
   };
@@ -393,20 +406,12 @@ struct KernelTraits {
   static_assert(kShape.simd_bit_size() % kShape.unpacked_bit_size() == 0);
   static_assert(0 < kShape.packed_bit_size());
   static_assert(kShape.packed_bit_size() < kShape.simd_bit_size());
-
-  using unpacked_type = UnpackedUint;
-  /// The integer type to work with, `unpacked_type` or an appropriate type 
for bool.
-  using uint_type = std::conditional_t<std::is_same_v<unpacked_type, bool>,
-                                       SizedUint<sizeof(bool)>, unpacked_type>;
-  using simd_batch = xsimd::make_sized_batch_t<uint_type, 
kShape.unpacked_per_simd()>;
-  using simd_bytes = xsimd::make_sized_batch_t<uint8_t, 
kShape.simd_byte_size()>;
-  using arch_type = typename simd_batch::arch_type;
 };
 
 /// Return similar kernel traits but with a different integer unpacking type.
 template <typename KerTraits, typename Uint>
 using KernelTraitsWithUnpackUint = KernelTraits<Uint, 
KerTraits::kShape.packed_bit_size(),
-                                                
KerTraits::kShape.simd_bit_size()>;
+                                                typename KerTraits::arch_type>;
 
 /******************
  *  MediumKernel  *
@@ -506,7 +511,8 @@ constexpr MediumKernelPlanSize MediumKernelPlanSize::Build(
 /// function advise kernel plans to read only read 64 bits.
 /// This limits restrictions set by the plan on the input memory reads built 
to avoid
 /// reading overflow.
-constexpr int adjust_bytes_per_read(int bits_per_read, int simd_byte_size) {
+ARROW_FORCE_INLINE constexpr int adjust_bytes_per_read(int bits_per_read,
+                                                       int simd_byte_size) {
   if (bits_per_read <= static_cast<int>(8 * sizeof(uint32_t))) {
     return sizeof(uint32_t);
   } else if (bits_per_read <= static_cast<int>(8 * sizeof(uint64_t))) {
@@ -718,7 +724,8 @@ struct MediumKernel {
   static constexpr int kBytesRead = kPlan.total_bytes_read();
 
   template <int kReadIdx, int kSwizzleIdx, int kShiftIdx>
-  static void unpack_one_shift_impl(const simd_batch& words, unpacked_type* 
out) {
+  ARROW_FORCE_INLINE static void unpack_one_shift_impl(const simd_batch& words,
+                                                       unpacked_type* out) {
     constexpr auto kRightShiftsArr =
         kPlan.shifts.at(kReadIdx).at(kSwizzleIdx).at(kShiftIdx);
     constexpr auto kRightShifts = array_to_batch_constant<kRightShiftsArr, 
arch_type>();
@@ -741,8 +748,9 @@ struct MediumKernel {
   }
 
   template <int kReadIdx, int kSwizzleIdx, int... kShiftIds>
-  static void unpack_one_swizzle_impl(const simd_bytes& bytes, unpacked_type* 
out,
-                                      std::integer_sequence<int, 
kShiftIds...>) {
+  ARROW_FORCE_INLINE static void unpack_one_swizzle_impl(
+      const simd_bytes& bytes, unpacked_type* out,
+      std::integer_sequence<int, kShiftIds...>) {
     constexpr auto kSwizzlesArr = kPlan.swizzles.at(kReadIdx).at(kSwizzleIdx);
     constexpr auto kSwizzles = array_to_batch_constant<kSwizzlesArr, 
arch_type>();
 
@@ -752,8 +760,8 @@ struct MediumKernel {
   }
 
   template <int kReadIdx, int... kSwizzleIds>
-  static void unpack_one_read_impl(const uint8_t* in, unpacked_type* out,
-                                   std::integer_sequence<int, kSwizzleIds...>) 
{
+  ARROW_FORCE_INLINE static void unpack_one_read_impl(
+      const uint8_t* in, unpacked_type* out, std::integer_sequence<int, 
kSwizzleIds...>) {
     using ShiftSeq = std::make_integer_sequence<int, 
kPlanSize.shifts_per_swizzle()>;
     const auto bytes =
         safe_load_bytes<kPlan.bytes_per_read(), arch_type>(in + 
kPlan.reads.at(kReadIdx));
@@ -761,13 +769,13 @@ struct MediumKernel {
   }
 
   template <int... kReadIds>
-  static void unpack_all_impl(const uint8_t* in, unpacked_type* out,
-                              std::integer_sequence<int, kReadIds...>) {
+  ARROW_FORCE_INLINE static void unpack_all_impl(
+      const uint8_t* in, unpacked_type* out, std::integer_sequence<int, 
kReadIds...>) {
     using SwizzleSeq = std::make_integer_sequence<int, 
kPlanSize.swizzles_per_read()>;
     (unpack_one_read_impl<kReadIds>(in, out, SwizzleSeq{}), ...);
   }
 
-  static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) {
+  ARROW_FORCE_INLINE static const uint8_t* unpack(const uint8_t* in, 
unpacked_type* out) {
     using ReadSeq = std::make_integer_sequence<int, 
kPlanSize.reads_per_kernel()>;
     unpack_all_impl(in, out, ReadSeq{});
     return in + (kPlan.unpacked_per_kernel() * kShape.packed_bit_size()) / 8;
@@ -1005,7 +1013,8 @@ struct LargeKernel {
   static constexpr int kBytesRead = kPlan.total_bytes_read();
 
   template <int kReadIdx>
-  static void unpack_one_read_impl(const uint8_t* in, unpacked_type* out) {
+  ARROW_FORCE_INLINE static void unpack_one_read_impl(const uint8_t* in,
+                                                      unpacked_type* out) {
     constexpr auto kLowSwizzles =
         array_to_batch_constant<kPlan.low_swizzles.at(kReadIdx), arch_type>();
     constexpr auto kLowRShifts =
@@ -1040,12 +1049,12 @@ struct LargeKernel {
   }
 
   template <int... kReadIds>
-  static void unpack_all_impl(const uint8_t* in, unpacked_type* out,
-                              std::integer_sequence<int, kReadIds...>) {
+  ARROW_FORCE_INLINE static void unpack_all_impl(
+      const uint8_t* in, unpacked_type* out, std::integer_sequence<int, 
kReadIds...>) {
     (unpack_one_read_impl<kReadIds>(in, out), ...);
   }
 
-  static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) {
+  ARROW_FORCE_INLINE static const uint8_t* unpack(const uint8_t* in, 
unpacked_type* out) {
     using ReadSeq = std::make_integer_sequence<int, 
kPlanSize.reads_per_kernel()>;
     unpack_all_impl(in, out, ReadSeq{});
     return in + (kPlan.kPlanSize.unpacked_per_kernel() * 
kShape.packed_bit_size()) / 8;
@@ -1064,7 +1073,9 @@ struct NoOpKernel {
   static constexpr int kValuesUnpacked = 0;
   static constexpr int kBytesRead = 0;
 
-  static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) { return 
in; }
+  ARROW_FORCE_INLINE static const uint8_t* unpack(const uint8_t* in, 
unpacked_type* out) {
+    return in;
+  }
 };
 
 template <typename KernelTraits, typename WorkingKernel>
@@ -1073,7 +1084,7 @@ struct CastingKernel : WorkingKernel {
 
   static constexpr int kValuesUnpacked = WorkingKernel::kValuesUnpacked;
 
-  static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) {
+  ARROW_FORCE_INLINE static const uint8_t* unpack(const uint8_t* in, 
unpacked_type* out) {
     using working_type = typename WorkingKernel::unpacked_type;
 
     working_type buffer[kValuesUnpacked] = {};
@@ -1131,8 +1142,7 @@ template <typename Traits>
 using KernelDispatch = decltype(KernelDispatchImpl<Traits>());
 
 /// The public kernel exposed for any size.
-template <typename UnpackedUint, int kPackedBitSize, int kSimdBitSize>
-struct Kernel : KernelDispatch<KernelTraits<UnpackedUint, kPackedBitSize, 
kSimdBitSize>> {
-};
+template <typename UnpackedUint, int kPackedBitSize, typename Arch>
+struct Kernel : KernelDispatch<KernelTraits<UnpackedUint, kPackedBitSize, 
Arch>> {};
 
 }  // namespace arrow::internal::bpacking
diff --git a/cpp/src/arrow/util/bpacking_test.cc 
b/cpp/src/arrow/util/bpacking_test.cc
index 9072c0b8d1..d4d588228e 100644
--- a/cpp/src/arrow/util/bpacking_test.cc
+++ b/cpp/src/arrow/util/bpacking_test.cc
@@ -27,7 +27,8 @@
 #include "arrow/util/bpacking_scalar_internal.h"
 #include "arrow/util/bpacking_simd_internal.h"
 
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX512) || \
+    defined(ARROW_HAVE_RUNTIME_SVE128) || defined(ARROW_HAVE_RUNTIME_SVE256)
 #  include "arrow/util/cpu_info.h"
 #endif
 
@@ -104,9 +105,9 @@ std::vector<uint8_t> PackValues(const std::vector<Int>& 
values, int num_values,
   return out;
 }
 
-class TestUnpack : public ::testing::TestWithParam<int> {
+template <typename Int>
+class TestUnpack : public ::testing::Test {
  protected:
-  template <typename Int>
   void TestRoundtripAlignment(UnpackFunc<Int> unpack, const UnpackOptions& 
opts) {
     const auto original =
         GenerateRandomValuesForPacking<Int>(opts.batch_size, opts.bit_width);
@@ -129,7 +130,6 @@ class TestUnpack : public ::testing::TestWithParam<int> {
                                   << val_original << " but unpacked " << 
val_unpacked;
   }
 
-  template <typename Int>
   void TestUnpackZeros(UnpackFunc<Int> unpack, const UnpackOptions& opts) {
     const auto num_bytes = GetNumBytes(opts.batch_size, opts.bit_width, 
opts.bit_offset);
 
@@ -140,7 +140,6 @@ class TestUnpack : public ::testing::TestWithParam<int> {
     EXPECT_EQ(unpacked, expected);
   }
 
-  template <typename Int>
   void TestUnpackOnes(UnpackFunc<Int> unpack, const UnpackOptions& opts) {
     const auto num_bytes = GetNumBytes(opts.batch_size, opts.bit_width, 
opts.bit_offset);
 
@@ -161,7 +160,6 @@ class TestUnpack : public ::testing::TestWithParam<int> {
     EXPECT_EQ(unpacked, expected);
   }
 
-  template <typename Int>
   void TestUnpackAlternating(UnpackFunc<Int> unpack, const UnpackOptions& 
opts) {
     const auto num_bytes = GetNumBytes(opts.batch_size, opts.bit_width, 
opts.bit_offset);
 
@@ -191,168 +189,136 @@ class TestUnpack : public ::testing::TestWithParam<int> 
{
     EXPECT_EQ(unpacked, expected);
   }
 
-  template <typename Int>
   void TestAll(UnpackFunc<Int> unpack) {
-    const int num_values_base = GetParam();
-
-    constexpr int kMaxBitWidth = std::is_same_v<Int, bool> ? 1 : 8 * 
sizeof(Int);
-
-    // Given how many edge cases there are in unpacking integers, it is best 
to test all
-    // sizes
-    for (int bit_width = 0; bit_width <= kMaxBitWidth; ++bit_width) {
-      SCOPED_TRACE(::testing::Message() << "Testing bit_width=" << bit_width);
-
-      // We test all bit offset within a byte / misalignments to change how the
-      // prolog.
-      for (int bit_offset = 0; bit_offset < 8; ++bit_offset) {
-        SCOPED_TRACE(::testing::Message() << "Testing bit_offset=" << 
bit_offset);
-
-        const UnpackOptions opts{
-            .batch_size = num_values_base,
-            .bit_width = bit_width,
-            .bit_offset = bit_offset,
-            .max_read_bytes = -1,  // No over-reading in testing (strict ASAN)
-        };
-
-        // Known values
-        TestUnpackZeros(unpack, opts);
-        TestUnpackOnes(unpack, opts);
-        TestUnpackAlternating(unpack, opts);
-
-        // Roundtrips
-        TestRoundtripAlignment(unpack, opts);
-
-        if (testing::Test::HasFailure()) return;
+    // There are actually many differences across the different sizes.
+    // It is best to test them all.
+    for (int num_values_base : {64, 128, 2048}) {
+      SCOPED_TRACE(::testing::Message() << "Testing num_values=" << 
num_values_base);
+
+      constexpr int kMaxBitWidth = std::is_same_v<Int, bool> ? 1 : 8 * 
sizeof(Int);
+
+      // Given how many edge cases there are in unpacking integers, it is best 
to test all
+      // bit widths.
+      for (int bit_width = 0; bit_width <= kMaxBitWidth; ++bit_width) {
+        SCOPED_TRACE(::testing::Message() << "Testing bit_width=" << 
bit_width);
+
+          // We test all bit offsets within a byte (misalignments) to change how the
+          // prolog behaves.
+        for (int bit_offset = 0; bit_offset < 8; ++bit_offset) {
+          SCOPED_TRACE(::testing::Message() << "Testing bit_offset=" << 
bit_offset);
+
+          const UnpackOptions opts{
+              .batch_size = num_values_base,
+              .bit_width = bit_width,
+              .bit_offset = bit_offset,
+              .max_read_bytes = -1,  // No over-reading in testing (strict 
ASAN)
+          };
+
+          // Known values
+          TestUnpackZeros(unpack, opts);
+          TestUnpackOnes(unpack, opts);
+          TestUnpackAlternating(unpack, opts);
+
+          // Roundtrips
+          TestRoundtripAlignment(unpack, opts);
+
+          if (testing::Test::HasFailure()) return;
+        }
+
+        // Similarly, we test all epilog sizes, that is, extra values that could fall
+        // outside of a SIMD register.
+        for (int epilogue_size = 0; epilogue_size <= kMaxBitWidth; 
++epilogue_size) {
+          SCOPED_TRACE(::testing::Message() << "Testing epilog_size=" << 
epilogue_size);
+
+          const int num_values = num_values_base + epilogue_size;
+
+          const UnpackOptions opts{
+              .batch_size = num_values,
+              .bit_width = bit_width,
+              .bit_offset = 0,
+              .max_read_bytes = -1,  // No over-reading in testing (strict 
ASAN)
+          };
+
+          // Known values
+          TestUnpackZeros(unpack, opts);
+          TestUnpackOnes(unpack, opts);
+          TestUnpackAlternating(unpack, opts);
+
+          // Roundtrips
+          TestRoundtripAlignment(unpack, opts);
+
+          if (testing::Test::HasFailure()) return;
+        }
       }
+    }
+  }
+};
 
-      // Similarly, we test all epilog sizes. That is extra values that could 
make it
-      // fall outside of an SIMD register
-      for (int epilogue_size = 0; epilogue_size <= kMaxBitWidth; 
++epilogue_size) {
-        SCOPED_TRACE(::testing::Message() << "Testing epilog_size=" << 
epilogue_size);
-
-        const int num_values = num_values_base + epilogue_size;
-
-        const UnpackOptions opts{
-            .batch_size = num_values,
-            .bit_width = bit_width,
-            .bit_offset = 0,
-            .max_read_bytes = -1,  // No over-reading in testing (strict ASAN)
-        };
-
-        // Known values
-        TestUnpackZeros(unpack, opts);
-        TestUnpackOnes(unpack, opts);
-        TestUnpackAlternating(unpack, opts);
-
-        // Roundtrips
-        TestRoundtripAlignment(unpack, opts);
+using UnpackTypes = ::testing::Types<bool, uint8_t, uint16_t, uint32_t, 
uint64_t>;
 
-        if (testing::Test::HasFailure()) return;
-      }
-    }
+struct UnpackTypeNames {
+  template <typename T>
+  static std::string GetName(int) {
+    if constexpr (std::is_same_v<T, bool>) return "bool";
+    if constexpr (std::is_same_v<T, uint8_t>) return "uint8_t";
+    if constexpr (std::is_same_v<T, uint16_t>) return "uint16_t";
+    if constexpr (std::is_same_v<T, uint32_t>) return "uint32_t";
+    if constexpr (std::is_same_v<T, uint64_t>) return "uint64_t";
   }
 };
 
-// There are actually many differences across the different sizes.
-// It is best to test them all.
-INSTANTIATE_TEST_SUITE_P(UnpackMultiplesOf64Values, TestUnpack,
-                         ::testing::Values(64, 128, 2048),
-                         [](const 
::testing::TestParamInfo<TestUnpack::ParamType>& info) {
-                           return "Length" + std::to_string(info.param);
-                         });
+TYPED_TEST_SUITE(TestUnpack, UnpackTypes, UnpackTypeNames);
 
-TEST_P(TestUnpack, UnpackBoolScalar) { 
this->TestAll(&bpacking::unpack_scalar<bool>); }
-TEST_P(TestUnpack, Unpack8Scalar) { 
this->TestAll(&bpacking::unpack_scalar<uint8_t>); }
-TEST_P(TestUnpack, Unpack16Scalar) { 
this->TestAll(&bpacking::unpack_scalar<uint16_t>); }
-TEST_P(TestUnpack, Unpack32Scalar) { 
this->TestAll(&bpacking::unpack_scalar<uint32_t>); }
-TEST_P(TestUnpack, Unpack64Scalar) { 
this->TestAll(&bpacking::unpack_scalar<uint64_t>); }
+TYPED_TEST(TestUnpack, UnpackScalar) {
+  this->TestAll(&bpacking::unpack_scalar<TypeParam>);
+}
 
 #if defined(ARROW_HAVE_SSE4_2)
-TEST_P(TestUnpack, UnpackBoolSse4_2) { 
this->TestAll(&bpacking::unpack_sse4_2<bool>); }
-TEST_P(TestUnpack, Unpack8Sse4_2) { 
this->TestAll(&bpacking::unpack_sse4_2<uint8_t>); }
-TEST_P(TestUnpack, Unpack16Sse4_2) { 
this->TestAll(&bpacking::unpack_sse4_2<uint16_t>); }
-TEST_P(TestUnpack, Unpack32Sse4_2) { 
this->TestAll(&bpacking::unpack_sse4_2<uint32_t>); }
-TEST_P(TestUnpack, Unpack64Sse4_2) { 
this->TestAll(&bpacking::unpack_sse4_2<uint64_t>); }
+TYPED_TEST(TestUnpack, UnpackSse4_2) {
+  this->TestAll(&bpacking::unpack_sse4_2<TypeParam>);
+}
 #endif
 
 #if defined(ARROW_HAVE_RUNTIME_AVX2)
-TEST_P(TestUnpack, UnpackBoolAvx2) {
-  if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2)) {
-    GTEST_SKIP() << "Test requires AVX2";
-  }
-  this->TestAll(&bpacking::unpack_avx2<bool>);
-}
-TEST_P(TestUnpack, Unpack8Avx2) {
-  if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2)) {
-    GTEST_SKIP() << "Test requires AVX2";
-  }
-  this->TestAll(&bpacking::unpack_avx2<uint8_t>);
-}
-TEST_P(TestUnpack, Unpack16Avx2) {
-  if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2)) {
-    GTEST_SKIP() << "Test requires AVX2";
-  }
-  this->TestAll(&bpacking::unpack_avx2<uint16_t>);
-}
-TEST_P(TestUnpack, Unpack32Avx2) {
-  if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2)) {
-    GTEST_SKIP() << "Test requires AVX2";
-  }
-  this->TestAll(&bpacking::unpack_avx2<uint32_t>);
-}
-TEST_P(TestUnpack, Unpack64Avx2) {
+TYPED_TEST(TestUnpack, UnpackAvx2) {
   if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2)) {
     GTEST_SKIP() << "Test requires AVX2";
   }
-  this->TestAll(&bpacking::unpack_avx2<uint64_t>);
+  this->TestAll(&bpacking::unpack_avx2<TypeParam>);
 }
 #endif
 
 #if defined(ARROW_HAVE_RUNTIME_AVX512)
-TEST_P(TestUnpack, UnpackBoolAvx512) {
-  if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512)) {
-    GTEST_SKIP() << "Test requires AVX512";
-  }
-  this->TestAll(&bpacking::unpack_avx512<bool>);
-}
-TEST_P(TestUnpack, Unpack8Avx512) {
-  if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512)) {
-    GTEST_SKIP() << "Test requires AVX512";
-  }
-  this->TestAll(&bpacking::unpack_avx512<uint8_t>);
-}
-TEST_P(TestUnpack, Unpack16Avx512) {
+TYPED_TEST(TestUnpack, UnpackAvx512) {
   if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512)) {
     GTEST_SKIP() << "Test requires AVX512";
   }
-  this->TestAll(&bpacking::unpack_avx512<uint16_t>);
+  this->TestAll(&bpacking::unpack_avx512<TypeParam>);
 }
-TEST_P(TestUnpack, Unpack32Avx512) {
-  if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512)) {
-    GTEST_SKIP() << "Test requires AVX512";
-  }
-  this->TestAll(&bpacking::unpack_avx512<uint32_t>);
-}
-TEST_P(TestUnpack, Unpack64Avx512) {
-  if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512)) {
-    GTEST_SKIP() << "Test requires AVX512";
+#endif
+
+#if defined(ARROW_HAVE_NEON)
+TYPED_TEST(TestUnpack, UnpackNeon) { 
this->TestAll(&bpacking::unpack_neon<TypeParam>); }
+#endif
+
+#if defined(ARROW_HAVE_RUNTIME_SVE128)
+TYPED_TEST(TestUnpack, UnpackSve128) {
+  if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128)) {
+    GTEST_SKIP() << "Test requires SVE128";
   }
-  this->TestAll(&bpacking::unpack_avx512<uint64_t>);
+  this->TestAll(&bpacking::unpack_sve128<TypeParam>);
 }
 #endif
 
-#if defined(ARROW_HAVE_NEON)
-TEST_P(TestUnpack, UnpackBoolNeon) { 
this->TestAll(&bpacking::unpack_neon<bool>); }
-TEST_P(TestUnpack, Unpack8Neon) { 
this->TestAll(&bpacking::unpack_neon<uint8_t>); }
-TEST_P(TestUnpack, Unpack16Neon) { 
this->TestAll(&bpacking::unpack_neon<uint16_t>); }
-TEST_P(TestUnpack, Unpack32Neon) { 
this->TestAll(&bpacking::unpack_neon<uint32_t>); }
-TEST_P(TestUnpack, Unpack64Neon) { 
this->TestAll(&bpacking::unpack_neon<uint64_t>); }
+#if defined(ARROW_HAVE_RUNTIME_SVE256)
+TYPED_TEST(TestUnpack, UnpackSve256) {
+  if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256)) {
+    GTEST_SKIP() << "Test requires SVE256";
+  }
+  this->TestAll(&bpacking::unpack_sve256<TypeParam>);
+}
 #endif
 
-TEST_P(TestUnpack, UnpackBool) { this->TestAll(&unpack<bool>); }
-TEST_P(TestUnpack, Unpack8) { this->TestAll(&unpack<uint8_t>); }
-TEST_P(TestUnpack, Unpack16) { this->TestAll(&unpack<uint16_t>); }
-TEST_P(TestUnpack, Unpack32) { this->TestAll(&unpack<uint32_t>); }
-TEST_P(TestUnpack, Unpack64) { this->TestAll(&unpack<uint64_t>); }
+TYPED_TEST(TestUnpack, Unpack) { this->TestAll(&unpack<TypeParam>); }
 
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/cpu_info.cc b/cpp/src/arrow/util/cpu_info.cc
index e24a3bbfe2..2cc1ac802b 100644
--- a/cpp/src/arrow/util/cpu_info.cc
+++ b/cpp/src/arrow/util/cpu_info.cc
@@ -297,6 +297,12 @@ void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor,
 
 #else
 //------------------------------ LINUX ------------------------------//
+#  if defined(CPUINFO_ARCH_ARM)
+#    include <asm/hwcap.h>
+#    include <sys/auxv.h>
+#    include <sys/prctl.h>
+#  endif
+
 // Get cache size, return 0 on error
 int64_t LinuxGetCacheSize(int level) {
   // get cache size by sysconf()
@@ -413,8 +419,30 @@ void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor,
       }
     }
   }
+
+#  if defined(CPUINFO_ARCH_ARM)
+  // Detect SVE and vector length via getauxval/prctl (more reliable than /proc/cpuinfo)
+#    ifdef HWCAP_SVE
+  const auto hwcap = getauxval(AT_HWCAP);
+  if (hwcap & HWCAP_SVE) {
+    *hardware_flags |= CpuInfo::SVE;
+#      ifdef PR_SVE_GET_VL
+    const int vl = prctl(PR_SVE_GET_VL);
+    assert(vl >= 0);
+    // prctl returns vector length in bytes; mask off status flags
+    const int vl_bytes = vl & PR_SVE_VL_LEN_MASK;
+    // Running SVE128 on a SVE256 machine is more tricky than the x86 equivalent of
+    // running SSE code on an AVX machine and requires to explicitly change the
+    // vector length using `prctl` (per thread setting).
+    if (vl_bytes == 16) *hardware_flags |= CpuInfo::SVE128;  // 128 bits
+    if (vl_bytes == 32) *hardware_flags |= CpuInfo::SVE256;  // 256 bits
+    if (vl_bytes == 64) *hardware_flags |= CpuInfo::SVE512;  // 512 bits
+#      endif  // PR_SVE_GET_VL
+  }
+#    endif    // HWCAP_SVE
+#  endif      // CPUINFO_ARCH_ARM
 }
-#endif  // WINDOWS, MACOS, LINUX
+#endif        // WINDOWS, MACOS, LINUX
 
 //============================== Arch Dependent ==============================//
 
@@ -473,11 +501,35 @@ void ArchVerifyCpuRequirements(const CpuInfo* ci) {
 #elif defined(CPUINFO_ARCH_ARM)
 //------------------------------ AARCH64 ------------------------------//
 bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) {
-  if (simd_level == "NONE") {
-    *hardware_flags &= ~CpuInfo::ASIMD;
-    return true;
+  enum {
+    USER_SIMD_NONE,
+    USER_SIMD_SVE,
+    USER_SIMD_SVE128,
+    USER_SIMD_SVE256,
+    USER_SIMD_SVE512,
+    USER_SIMD_MAX,
+  };
+
+  int level = USER_SIMD_MAX;
+  if (simd_level == "SVE") {
+    level = USER_SIMD_SVE;
+  } else if (simd_level == "SVE128") {
+    level = USER_SIMD_SVE128;
+  } else if (simd_level == "SVE256") {
+    level = USER_SIMD_SVE256;
+  } else if (simd_level == "SVE512") {
+    level = USER_SIMD_SVE512;
+  } else if (simd_level == "NONE") {
+    level = USER_SIMD_NONE;
+  } else {
+    return false;
   }
-  return false;
+
+  if (level < USER_SIMD_SVE512) *hardware_flags &= ~CpuInfo::SVE512;
+  if (level < USER_SIMD_SVE256) *hardware_flags &= ~CpuInfo::SVE256;
+  if (level < USER_SIMD_SVE128) *hardware_flags &= ~CpuInfo::SVE128;
+  if (level < USER_SIMD_SVE) *hardware_flags &= ~CpuInfo::SVE;
+  return true;
 }
 
 void ArchVerifyCpuRequirements(const CpuInfo* ci) {
diff --git a/cpp/src/arrow/util/cpu_info.h b/cpp/src/arrow/util/cpu_info.h
index 949719b97e..de0ef13cc5 100644
--- a/cpp/src/arrow/util/cpu_info.h
+++ b/cpp/src/arrow/util/cpu_info.h
@@ -56,6 +56,10 @@ class ARROW_EXPORT CpuInfo {
 
   /// Arm features
   static constexpr int64_t ASIMD = (1LL << 32);
+  static constexpr int64_t SVE = (1LL << 33);
+  static constexpr int64_t SVE128 = (1LL << 36);
+  static constexpr int64_t SVE256 = (1LL << 34);
+  static constexpr int64_t SVE512 = (1LL << 35);
 
   /// Cache enums for L1 (data), L2 and L3
   enum class CacheLevel { L1 = 0, L2, L3, Last = L3 };
diff --git a/cpp/src/arrow/util/dispatch_internal.h b/cpp/src/arrow/util/dispatch_internal.h
index 7ac19b0b24..5fa071cb19 100644
--- a/cpp/src/arrow/util/dispatch_internal.h
+++ b/cpp/src/arrow/util/dispatch_internal.h
@@ -23,8 +23,7 @@
 #include "arrow/status.h"
 #include "arrow/util/cpu_info.h"
 
-namespace arrow {
-namespace internal {
+namespace arrow::internal {
 
 enum class DispatchLevel : int {
   // These dispatch levels, corresponding to instruction set features,
@@ -34,6 +33,9 @@ enum class DispatchLevel : int {
   AVX2,
   AVX512,
   NEON,
+  SVE128,
+  SVE256,
+  SVE512,
   MAX
 };
 
@@ -106,11 +108,18 @@ class DynamicDispatch {
         return cpu_info->IsSupported(CpuInfo::AVX2);
       case DispatchLevel::AVX512:
         return cpu_info->IsSupported(CpuInfo::AVX512);
+      case DispatchLevel::NEON:
+        return cpu_info->IsSupported(CpuInfo::ASIMD);
+      case DispatchLevel::SVE128:
+        return cpu_info->IsSupported(CpuInfo::SVE128);
+      case DispatchLevel::SVE256:
+        return cpu_info->IsSupported(CpuInfo::SVE256);
+      case DispatchLevel::SVE512:
+        return cpu_info->IsSupported(CpuInfo::SVE512);
       default:
         return false;
     }
   }
 };
 
-}  // namespace internal
-}  // namespace arrow
+}  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h
index 832b686b31..2da3933f50 100644
--- a/cpp/src/arrow/util/macros.h
+++ b/cpp/src/arrow/util/macros.h
@@ -75,7 +75,7 @@
 #if defined(__GNUC__)  // GCC and compatible compilers (clang, Intel ICC)
 #  define ARROW_NORETURN __attribute__((noreturn))
 #  define ARROW_NOINLINE __attribute__((noinline))
-#  define ARROW_FORCE_INLINE __attribute__((always_inline))
+#  define ARROW_FORCE_INLINE __attribute__((always_inline)) inline
 #  define ARROW_PREDICT_FALSE(x) (__builtin_expect(!!(x), 0))
 #  define ARROW_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
 #  define ARROW_RESTRICT __restrict
diff --git a/cpp/src/arrow/util/ubsan.h b/cpp/src/arrow/util/ubsan.h
index 5c6a8f419b..6c8a9812af 100644
--- a/cpp/src/arrow/util/ubsan.h
+++ b/cpp/src/arrow/util/ubsan.h
@@ -54,7 +54,7 @@ inline T* MakeNonNull(T* maybe_null = NULLPTR) {
 }
 
 template <typename T>
-inline std::enable_if_t<std::is_trivially_copyable_v<T>, T> SafeLoadAs(
+ARROW_FORCE_INLINE std::enable_if_t<std::is_trivially_copyable_v<T>, T> SafeLoadAs(
     const uint8_t* unaligned) {
   using Type = std::remove_const_t<T>;
   arrow::internal::AlignedStorage<Type> raw_data;
@@ -65,7 +65,8 @@ inline std::enable_if_t<std::is_trivially_copyable_v<T>, T> SafeLoadAs(
 }
 
 template <typename T>
-inline std::enable_if_t<std::is_trivially_copyable_v<T>, T> SafeLoad(const T* unaligned) {
+ARROW_FORCE_INLINE std::enable_if_t<std::is_trivially_copyable_v<T>, T> SafeLoad(
+    const T* unaligned) {
   using Type = std::remove_const_t<T>;
   arrow::internal::AlignedStorage<Type> raw_data;
   std::memcpy(raw_data.get(), static_cast<const void*>(unaligned), sizeof(T));
@@ -75,10 +76,11 @@ inline std::enable_if_t<std::is_trivially_copyable_v<T>, T> SafeLoad(const T* un
 }
 
 template <typename U, typename T>
-inline std::enable_if_t<std::is_trivially_copyable_v<T> &&
-                            std::is_trivially_copyable_v<U> && sizeof(T) == sizeof(U),
-                        U>
-SafeCopy(T value) {
+ARROW_FORCE_INLINE
+    std::enable_if_t<std::is_trivially_copyable_v<T> && std::is_trivially_copyable_v<U> &&
+                         sizeof(T) == sizeof(U),
+                     U>
+    SafeCopy(T value) {
   using TypeU = std::remove_const_t<U>;
   arrow::internal::AlignedStorage<TypeU> raw_data;
   std::memcpy(raw_data.get(), static_cast<const void*>(&value), sizeof(T));
@@ -88,8 +90,8 @@ SafeCopy(T value) {
 }
 
 template <typename T>
-inline std::enable_if_t<std::is_trivially_copyable_v<T>, void> SafeStore(void* unaligned,
-                                                                         T value) {
+ARROW_FORCE_INLINE std::enable_if_t<std::is_trivially_copyable_v<T>, void> SafeStore(
+    void* unaligned, T value) {
   std::memcpy(unaligned, &value, sizeof(T));
 }
 

Reply via email to