This is an automated email from the ASF dual-hosted git repository.
pitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new d1a8cf4133 GH-47769: [C++] SVE dynamic dispatch (#49756)
d1a8cf4133 is described below
commit d1a8cf4133e94941850d1a709f5263d82d1f45ed
Author: Antoine Prouvost <[email protected]>
AuthorDate: Tue Apr 28 12:44:57 2026 +0200
GH-47769: [C++] SVE dynamic dispatch (#49756)
### Rationale for this change
Just like we dynamically dispatch to AVX2 on x86 CPUs, we want to
dynamically dispatch to more advanced SIMD extensions on ARM64 chips.
### What changes are included in this PR?
- A new macro to enable selecting the runtime SVE version
- Detection of the ARM64 CPU features available at runtime
- Adding SVE to the dynamic dispatch for bit unpacking algorithms.
### Are these changes tested?
### Are there any user-facing changes?
No.
* GitHub Issue: #47769
Lead-authored-by: Antoine Prouvost <[email protected]>
Co-authored-by: AntoinePrv <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/cmake_modules/DefineOptions.cmake | 3 +
cpp/cmake_modules/SetupCxxFlags.cmake | 32 ++-
cpp/src/arrow/CMakeLists.txt | 28 ++-
cpp/src/arrow/meson.build | 2 +-
cpp/src/arrow/util/bpacking.cc | 39 +++-
cpp/src/arrow/util/bpacking_benchmark.cc | 166 +++++---------
cpp/src/arrow/util/bpacking_dispatch_internal.h | 44 ++--
...acking_simd_default.cc => bpacking_simd_128.cc} | 6 +-
...cking_simd_avx2.cc => bpacking_simd_128_alt.cc} | 30 ++-
...acking_simd_default.cc => bpacking_simd_256.cc} | 28 ++-
cpp/src/arrow/util/bpacking_simd_internal.h | 64 ++++--
cpp/src/arrow/util/bpacking_simd_kernel_internal.h | 98 +++++----
cpp/src/arrow/util/bpacking_test.cc | 244 +++++++++------------
cpp/src/arrow/util/cpu_info.cc | 62 +++++-
cpp/src/arrow/util/cpu_info.h | 4 +
cpp/src/arrow/util/dispatch_internal.h | 17 +-
cpp/src/arrow/util/macros.h | 2 +-
cpp/src/arrow/util/ubsan.h | 18 +-
18 files changed, 498 insertions(+), 389 deletions(-)
diff --git a/cpp/cmake_modules/DefineOptions.cmake
b/cpp/cmake_modules/DefineOptions.cmake
index 017a5a6efb..51b9fc8b2e 100644
--- a/cpp/cmake_modules/DefineOptions.cmake
+++ b/cpp/cmake_modules/DefineOptions.cmake
@@ -191,6 +191,9 @@ takes precedence over ccache if a storage backend is
configured" ON)
"SSE4_2"
"AVX2"
"AVX512"
+ "SVE128" # fixed size SVE
+ "SVE256" # "
+ "SVE512" # "
"MAX")
define_option(ARROW_ALTIVEC "Build with Altivec if compiler has support" ON)
diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake
b/cpp/cmake_modules/SetupCxxFlags.cmake
index c35fc6a6fe..7c02d53e62 100644
--- a/cpp/cmake_modules/SetupCxxFlags.cmake
+++ b/cpp/cmake_modules/SetupCxxFlags.cmake
@@ -134,7 +134,31 @@ elseif(ARROW_CPU_FLAG STREQUAL "ppc")
elseif(ARROW_CPU_FLAG STREQUAL "aarch64")
# Arm64 compiler flags, gcc/clang only
set(ARROW_ARMV8_MARCH "armv8-a")
- check_cxx_compiler_flag("-march=${ARROW_ARMV8_MARCH}+sve" CXX_SUPPORTS_SVE)
+ set(ARROW_SVE_FLAGS "-march=${ARROW_ARMV8_MARCH}+sve")
+ set(ARROW_SVE128_FLAGS "${ARROW_SVE_FLAGS}" "-msve-vector-bits=128")
+ set(ARROW_SVE256_FLAGS "${ARROW_SVE_FLAGS}" "-msve-vector-bits=256")
+ set(ARROW_SVE512_FLAGS "${ARROW_SVE_FLAGS}" "-msve-vector-bits=512")
+ # We only have a way to do SVE dynamic dispatch on Linux (BSD may be possible
+ # but is currently not implemented).
+ # We still support explicitly setting runtime SIMD level to some SVE values
+ # on these platforms as this can be useful in development for building SVE
+ # code locally. The compiler supports it but the code won't run.
+ if((APPLE OR WIN32) AND ARROW_RUNTIME_SIMD_LEVEL STREQUAL "MAX")
+ set(ARROW_RUNTIME_SIMD_LEVEL "NONE")
+ endif()
+ check_cxx_compiler_flag("${ARROW_SVE_FLAGS}" CXX_SUPPORTS_SVE)
+ if(CXX_SUPPORTS_SVE AND ARROW_RUNTIME_SIMD_LEVEL MATCHES
"^(SVE128|SVE256|SVE512|MAX)$")
+ set(ARROW_HAVE_RUNTIME_SVE128 ON)
+ add_definitions(-DARROW_HAVE_RUNTIME_SVE128)
+ endif()
+ if(CXX_SUPPORTS_SVE AND ARROW_RUNTIME_SIMD_LEVEL MATCHES
"^(SVE256|SVE512|MAX)$")
+ set(ARROW_HAVE_RUNTIME_SVE256 ON)
+ add_definitions(-DARROW_HAVE_RUNTIME_SVE256)
+ endif()
+ if(CXX_SUPPORTS_SVE AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(SVE512|MAX)$")
+ set(ARROW_HAVE_RUNTIME_SVE512 ON)
+ add_definitions(-DARROW_HAVE_RUNTIME_SVE512)
+ endif()
if(ARROW_SIMD_LEVEL STREQUAL "DEFAULT")
set(ARROW_SIMD_LEVEL "NEON")
endif()
@@ -528,8 +552,7 @@ if(ARROW_CPU_FLAG STREQUAL "aarch64")
if(NOT CXX_SUPPORTS_SVE)
message(FATAL_ERROR "SVE required but compiler doesn't support it.")
endif()
- # -march=armv8-a+sve
- set(ARROW_ARMV8_MARCH "${ARROW_ARMV8_MARCH}+sve")
+ set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_SVE_FLAGS}")
string(REGEX MATCH "[0-9]+" SVE_VECTOR_BITS ${ARROW_SIMD_LEVEL})
if(SVE_VECTOR_BITS)
set(ARROW_HAVE_SVE${SVE_VECTOR_BITS} ON)
@@ -540,8 +563,9 @@ if(ARROW_CPU_FLAG STREQUAL "aarch64")
set(ARROW_HAVE_SVE_SIZELESS ON)
add_definitions(-DARROW_HAVE_SVE_SIZELESS)
endif()
+ else() # ARM v8 without SVE
+ set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=${ARROW_ARMV8_MARCH}")
endif()
- set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=${ARROW_ARMV8_MARCH}")
elseif(NOT ARROW_SIMD_LEVEL STREQUAL "NONE")
message(WARNING "ARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL} not supported by
Arm.")
endif()
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index eee63b11ca..0839cc1fa8 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -343,6 +343,27 @@ macro(append_runtime_avx512_src SRCS SRC)
endif()
endmacro()
+macro(append_runtime_sve128_src SRCS SRC)
+ if(ARROW_HAVE_RUNTIME_SVE128)
+ list(APPEND ${SRCS} ${SRC})
+ set_source_files_properties(${SRC} PROPERTIES COMPILE_OPTIONS
"${ARROW_SVE128_FLAGS}")
+ endif()
+endmacro()
+
+macro(append_runtime_sve256_src SRCS SRC)
+ if(ARROW_HAVE_RUNTIME_SVE256)
+ list(APPEND ${SRCS} ${SRC})
+ set_source_files_properties(${SRC} PROPERTIES COMPILE_OPTIONS
"${ARROW_SVE256_FLAGS}")
+ endif()
+endmacro()
+
+macro(append_runtime_sve512_src SRCS SRC)
+ if(ARROW_HAVE_RUNTIME_SVE512)
+ list(APPEND ${SRCS} ${SRC})
+ set_source_files_properties(${SRC} PROPERTIES COMPILE_OPTIONS
"${ARROW_SVE512_FLAGS}")
+ endif()
+endmacro()
+
# Write out compile-time configuration constants
string(REPLACE "${CMAKE_SOURCE_DIR}" "<CMAKE_SOURCE_DIR>" REDACTED_CXX_FLAGS
${CMAKE_CXX_FLAGS})
@@ -498,7 +519,7 @@ set(ARROW_UTIL_SRCS
util/bitmap_ops.cc
util/bpacking.cc
util/bpacking_scalar.cc
- util/bpacking_simd_default.cc
+ util/bpacking_simd_128.cc
util/byte_size.cc
util/byte_stream_split_internal.cc
util/cancel.cc
@@ -543,9 +564,12 @@ set(ARROW_UTIL_SRCS
append_runtime_avx2_src(ARROW_UTIL_SRCS
util/byte_stream_split_internal_avx2.cc)
-append_runtime_avx2_src(ARROW_UTIL_SRCS util/bpacking_simd_avx2.cc)
+append_runtime_avx2_src(ARROW_UTIL_SRCS util/bpacking_simd_256.cc)
append_runtime_avx512_src(ARROW_UTIL_SRCS util/bpacking_simd_avx512.cc)
+append_runtime_sve128_src(ARROW_UTIL_SRCS util/bpacking_simd_128_alt.cc)
+append_runtime_sve256_src(ARROW_UTIL_SRCS util/bpacking_simd_256.cc)
+
if(ARROW_WITH_BROTLI)
list(APPEND ARROW_UTIL_SRCS util/compression_brotli.cc)
endif()
diff --git a/cpp/src/arrow/meson.build b/cpp/src/arrow/meson.build
index fe7f11af6f..4b8faebecf 100644
--- a/cpp/src/arrow/meson.build
+++ b/cpp/src/arrow/meson.build
@@ -177,7 +177,7 @@ arrow_util_srcs = [
'util/bitmap_ops.cc',
'util/bpacking.cc',
'util/bpacking_scalar.cc',
- 'util/bpacking_simd_default.cc',
+ 'util/bpacking_simd_128.cc',
'util/byte_size.cc',
'util/byte_stream_split_internal.cc',
'util/cancel.cc',
diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc
index e959a9f9c4..fb7fd19ae3 100644
--- a/cpp/src/arrow/util/bpacking.cc
+++ b/cpp/src/arrow/util/bpacking.cc
@@ -33,16 +33,29 @@ struct UnpackDynamicFunction {
static constexpr auto implementations() {
return std::array{
+ // x86 implementations
#if defined(ARROW_HAVE_SSE4_2)
Implementation{DispatchLevel::NONE, &bpacking::unpack_sse4_2<Uint>},
-#else
- Implementation{DispatchLevel::NONE, &bpacking::unpack_scalar<Uint>},
-#endif
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
+# if defined(ARROW_HAVE_RUNTIME_AVX2)
Implementation{DispatchLevel::AVX2, &bpacking::unpack_avx2<Uint>},
-#endif
-#if defined(ARROW_HAVE_RUNTIME_AVX512)
+# endif
+# if defined(ARROW_HAVE_RUNTIME_AVX512)
Implementation{DispatchLevel::AVX512, &bpacking::unpack_avx512<Uint>},
+# endif
+
+ // ARM implementations
+#elif defined(ARROW_HAVE_NEON)
+ Implementation{DispatchLevel::NONE, &bpacking::unpack_neon<Uint>},
+# if defined(ARROW_HAVE_RUNTIME_SVE128)
+ Implementation{DispatchLevel::SVE128, &bpacking::unpack_sve128<Uint>},
+# endif
+# if defined(ARROW_HAVE_RUNTIME_SVE256)
+ Implementation{DispatchLevel::SVE256, &bpacking::unpack_sve256<Uint>},
+# endif
+
+ // Other implementations
+#else
+ Implementation{DispatchLevel::NONE, &bpacking::unpack_scalar<Uint>},
#endif
};
}
@@ -52,12 +65,14 @@ struct UnpackDynamicFunction {
template <typename Uint>
void unpack(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
-#if defined(ARROW_HAVE_NEON)
- return bpacking::unpack_neon(in, out, opts);
-#else
- static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch;
- return dispatch.func(in, out, opts);
-#endif
+ auto constexpr kImplementations =
UnpackDynamicFunction<Uint>::implementations();
+ if constexpr (kImplementations.size() == 1) {
+ constexpr auto func = kImplementations.front().second;
+ func(in, out, opts);
+ } else {
+ static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch;
+ return dispatch.func(in, out, opts);
+ }
}
template void unpack<bool>(const uint8_t*, bool*, const UnpackOptions&);
diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc
b/cpp/src/arrow/util/bpacking_benchmark.cc
index 354bc76ace..93d7cdf165 100644
--- a/cpp/src/arrow/util/bpacking_benchmark.cc
+++ b/cpp/src/arrow/util/bpacking_benchmark.cc
@@ -26,7 +26,7 @@
#include "arrow/util/bpacking_scalar_internal.h"
#include "arrow/util/bpacking_simd_internal.h"
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_SVE128)
# include "arrow/util/cpu_info.h"
#endif
@@ -107,10 +107,10 @@ void BM_Unpack(benchmark::State& state, bool aligned,
UnpackFunc<Int> unpack, bo
// will not emit runs larger than 512 (though other implementation might), so
we biased
// the benchmarks towards a rather small scale.
static const auto kNumValuesRange = benchmark::CreateRange(32, 512, 2);
-constexpr std::initializer_list<int64_t> kBitWidths8 = {1, 2, 8};
-constexpr std::initializer_list<int64_t> kBitWidths16 = {1, 2, 8, 13};
-constexpr std::initializer_list<int64_t> kBitWidths32 = {1, 2, 8, 20};
-constexpr std::initializer_list<int64_t> kBitWidths64 = {1, 2, 8, 20, 47};
+constexpr auto kBitWidths8 = std::initializer_list<int64_t>{1, 2, 8};
+constexpr auto kBitWidths16 = std::initializer_list<int64_t>{1, 2, 8, 13};
+constexpr auto kBitWidths32 = std::initializer_list<int64_t>{1, 2, 8, 20};
+constexpr auto kBitWidths64 = std::initializer_list<int64_t>{1, 2, 8, 20, 47};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValuesBool = {
{0, 1},
@@ -159,125 +159,69 @@ void BM_UnpackUint64(benchmark::State& state, bool
aligned, UnpackFunc<uint64_t>
return BM_Unpack<uint64_t>(state, aligned, unpack, skip,
std::move(skip_msg));
}
-BENCHMARK_CAPTURE(BM_UnpackBool, ScalarUnaligned, false,
&bpacking::unpack_scalar<bool>)
- ->ArgsProduct(kBitWidthsNumValuesBool);
-BENCHMARK_CAPTURE(BM_UnpackUint8, ScalarUnaligned, false,
- &bpacking::unpack_scalar<uint8_t>)
- ->ArgsProduct(kBitWidthsNumValues8);
-BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false,
- &bpacking::unpack_scalar<uint16_t>)
- ->ArgsProduct(kBitWidthsNumValues16);
-BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false,
- &bpacking::unpack_scalar<uint32_t>)
- ->ArgsProduct(kBitWidthsNumValues32);
-BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false,
- &bpacking::unpack_scalar<uint64_t>)
- ->ArgsProduct(kBitWidthsNumValues64);
+// Register BM_Unpack{Bool,Uint8,Uint16,Uint32,Uint64} benchmarks for a given
+// UNPACK_FUNC templated on each of those types, with explicit skip args.
+#define BENCHMARK_UNPACK_ALL_TYPES_SKIP(LABEL, ALIGNED, UNPACK_FUNC, SKIP,
SKIP_MSG) \
+ BENCHMARK_CAPTURE(BM_UnpackBool, LABEL, ALIGNED, &UNPACK_FUNC<bool>, SKIP,
SKIP_MSG) \
+ ->ArgsProduct(kBitWidthsNumValuesBool);
\
+ BENCHMARK_CAPTURE(BM_UnpackUint8, LABEL, ALIGNED, &UNPACK_FUNC<uint8_t>,
SKIP, \
+ SKIP_MSG)
\
+ ->ArgsProduct(kBitWidthsNumValues8);
\
+ BENCHMARK_CAPTURE(BM_UnpackUint16, LABEL, ALIGNED, &UNPACK_FUNC<uint16_t>,
SKIP, \
+ SKIP_MSG)
\
+ ->ArgsProduct(kBitWidthsNumValues16);
\
+ BENCHMARK_CAPTURE(BM_UnpackUint32, LABEL, ALIGNED, &UNPACK_FUNC<uint32_t>,
SKIP, \
+ SKIP_MSG)
\
+ ->ArgsProduct(kBitWidthsNumValues32);
\
+ BENCHMARK_CAPTURE(BM_UnpackUint64, LABEL, ALIGNED, &UNPACK_FUNC<uint64_t>,
SKIP, \
+ SKIP_MSG)
\
+ ->ArgsProduct(kBitWidthsNumValues64)
+
+#define BENCHMARK_UNPACK_ALL_TYPES(LABEL, ALIGNED, UNPACK_FUNC) \
+ BENCHMARK_UNPACK_ALL_TYPES_SKIP(LABEL, ALIGNED, UNPACK_FUNC, false, "")
+
+#define BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(LABEL, ALIGNED, UNPACK_FUNC,
CPU_FEATURE, \
+ SKIP_MSG)
\
+ BENCHMARK_UNPACK_ALL_TYPES_SKIP(
\
+ LABEL, ALIGNED, UNPACK_FUNC,
\
+ !CpuInfo::GetInstance()->IsSupported(CpuInfo::CPU_FEATURE), SKIP_MSG)
+
+BENCHMARK_UNPACK_ALL_TYPES(ScalarUnaligned, false, bpacking::unpack_scalar);
#if defined(ARROW_HAVE_SSE4_2)
-BENCHMARK_CAPTURE(BM_UnpackBool, Sse42Unaligned, false,
&bpacking::unpack_sse4_2<bool>)
- ->ArgsProduct(kBitWidthsNumValuesBool);
-BENCHMARK_CAPTURE(BM_UnpackUint8, Sse42Unaligned, false,
- &bpacking::unpack_sse4_2<uint8_t>)
- ->ArgsProduct(kBitWidthsNumValues8);
-BENCHMARK_CAPTURE(BM_UnpackUint16, Sse42Unaligned, false,
- &bpacking::unpack_sse4_2<uint16_t>)
- ->ArgsProduct(kBitWidthsNumValues16);
-BENCHMARK_CAPTURE(BM_UnpackUint32, Sse42Unaligned, false,
- &bpacking::unpack_sse4_2<uint32_t>)
- ->ArgsProduct(kBitWidthsNumValues32);
-BENCHMARK_CAPTURE(BM_UnpackUint64, Sse42Unaligned, false,
- &bpacking::unpack_sse4_2<uint64_t>)
- ->ArgsProduct(kBitWidthsNumValues64);
+BENCHMARK_UNPACK_ALL_TYPES(Sse42Unaligned, false, bpacking::unpack_sse4_2);
#endif
#if defined(ARROW_HAVE_RUNTIME_AVX2)
-BENCHMARK_CAPTURE(BM_UnpackBool, Avx2Unaligned, false,
&bpacking::unpack_avx2<bool>,
- !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
- "Avx2 not available")
- ->ArgsProduct(kBitWidthsNumValuesBool);
-BENCHMARK_CAPTURE(BM_UnpackUint8, Avx2Unaligned, false,
&bpacking::unpack_avx2<uint8_t>,
- !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
- "Avx2 not available")
- ->ArgsProduct(kBitWidthsNumValues8);
-BENCHMARK_CAPTURE(BM_UnpackUint16, Avx2Unaligned, false,
&bpacking::unpack_avx2<uint16_t>,
- !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
- "Avx2 not available")
- ->ArgsProduct(kBitWidthsNumValues16);
-BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false,
&bpacking::unpack_avx2<uint32_t>,
- !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
- "Avx2 not available")
- ->ArgsProduct(kBitWidthsNumValues32);
-BENCHMARK_CAPTURE(BM_UnpackUint64, Avx2Unaligned, false,
&bpacking::unpack_avx2<uint64_t>,
- !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
- "Avx2 not available")
- ->ArgsProduct(kBitWidthsNumValues64);
+BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Avx2Unaligned, false,
bpacking::unpack_avx2, AVX2,
+ "Avx2 not available");
#endif
#if defined(ARROW_HAVE_RUNTIME_AVX512)
-BENCHMARK_CAPTURE(BM_UnpackBool, Avx512Unaligned, false,
&bpacking::unpack_avx512<bool>,
- !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
- "Avx512 not available")
- ->ArgsProduct(kBitWidthsNumValuesBool);
-BENCHMARK_CAPTURE(BM_UnpackUint8, Avx512Unaligned, false,
- &bpacking::unpack_avx512<uint8_t>,
- !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
- "Avx512 not available")
- ->ArgsProduct(kBitWidthsNumValues8);
-BENCHMARK_CAPTURE(BM_UnpackUint16, Avx512Unaligned, false,
- &bpacking::unpack_avx512<uint16_t>,
- !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
- "Avx512 not available")
- ->ArgsProduct(kBitWidthsNumValues16);
-BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false,
- &bpacking::unpack_avx512<uint32_t>,
- !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
- "Avx512 not available")
- ->ArgsProduct(kBitWidthsNumValues32);
-BENCHMARK_CAPTURE(BM_UnpackUint64, Avx512Unaligned, false,
- &bpacking::unpack_avx512<uint64_t>,
- !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
- "Avx512 not available")
- ->ArgsProduct(kBitWidthsNumValues64);
+BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Avx512Unaligned, false,
bpacking::unpack_avx512,
+ AVX512, "Avx512 not available");
#endif
#if defined(ARROW_HAVE_NEON)
-BENCHMARK_CAPTURE(BM_UnpackBool, NeonUnaligned, false,
&bpacking::unpack_neon<bool>)
- ->ArgsProduct(kBitWidthsNumValuesBool);
-BENCHMARK_CAPTURE(BM_UnpackUint8, NeonUnaligned, false,
&bpacking::unpack_neon<uint8_t>)
- ->ArgsProduct(kBitWidthsNumValues8);
-BENCHMARK_CAPTURE(BM_UnpackUint16, NeonUnaligned, false,
&bpacking::unpack_neon<uint16_t>)
- ->ArgsProduct(kBitWidthsNumValues16);
-BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false,
&bpacking::unpack_neon<uint32_t>)
- ->ArgsProduct(kBitWidthsNumValues32);
-BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false,
&bpacking::unpack_neon<uint64_t>)
- ->ArgsProduct(kBitWidthsNumValues64);
+BENCHMARK_UNPACK_ALL_TYPES(NeonUnaligned, false, bpacking::unpack_neon);
+#endif
+
+#if defined(ARROW_HAVE_RUNTIME_SVE128)
+BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Sve128Unaligned, false,
bpacking::unpack_sve128,
+ SVE128, "Sve128 not available");
#endif
-BENCHMARK_CAPTURE(BM_UnpackBool, DynamicAligned, true, &unpack<bool>)
- ->ArgsProduct(kBitWidthsNumValuesBool);
-BENCHMARK_CAPTURE(BM_UnpackBool, DynamicUnaligned, false, &unpack<bool>)
- ->ArgsProduct(kBitWidthsNumValuesBool);
-
-BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicAligned, true, &unpack<uint8_t>)
- ->ArgsProduct(kBitWidthsNumValues8);
-BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicUnaligned, false, &unpack<uint8_t>)
- ->ArgsProduct(kBitWidthsNumValues8);
-
-BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicAligned, true, &unpack<uint16_t>)
- ->ArgsProduct(kBitWidthsNumValues16);
-BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicUnaligned, false, &unpack<uint16_t>)
- ->ArgsProduct(kBitWidthsNumValues16);
-
-BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicAligned, true, &unpack<uint32_t>)
- ->ArgsProduct(kBitWidthsNumValues32);
-BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicUnaligned, false, &unpack<uint32_t>)
- ->ArgsProduct(kBitWidthsNumValues32);
-
-BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicAligned, true, &unpack<uint64_t>)
- ->ArgsProduct(kBitWidthsNumValues64);
-BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicUnaligned, false, &unpack<uint64_t>)
- ->ArgsProduct(kBitWidthsNumValues64);
+#if defined(ARROW_HAVE_RUNTIME_SVE256)
+BENCHMARK_UNPACK_ALL_TYPES_RUNTIME(Sve256Unaligned, false,
bpacking::unpack_sve256,
+ SVE256, "Sve256 not available");
+#endif
+
+BENCHMARK_UNPACK_ALL_TYPES(DynamicAligned, true, unpack);
+BENCHMARK_UNPACK_ALL_TYPES(DynamicUnaligned, false, unpack);
+
+#undef BENCHMARK_UNPACK_ALL_TYPES_RUNTIME
+#undef BENCHMARK_UNPACK_ALL_TYPES
+#undef BENCHMARK_UNPACK_ALL_TYPES_SKIP
} // namespace
} // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h
b/cpp/src/arrow/util/bpacking_dispatch_internal.h
index 561bbbe7b9..6ea6adee18 100644
--- a/cpp/src/arrow/util/bpacking_dispatch_internal.h
+++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h
@@ -32,14 +32,14 @@ namespace arrow::internal::bpacking {
/// Unpack a zero bit packed array.
template <typename Uint>
-void unpack_null(const uint8_t* in, Uint* out, int batch_size) {
+ARROW_FORCE_INLINE void unpack_null(const uint8_t* in, Uint* out, int
batch_size) {
std::memset(out, 0, batch_size * sizeof(Uint));
}
/// Unpack a packed array where packed and unpacked values have exactly the
same number of
/// bits.
template <typename Uint>
-void unpack_full(const uint8_t* in, Uint* out, int batch_size) {
+ARROW_FORCE_INLINE void unpack_full(const uint8_t* in, Uint* out, int
batch_size) {
if constexpr (ARROW_LITTLE_ENDIAN == 1) {
std::memcpy(out, in, batch_size * sizeof(Uint));
} else {
@@ -60,7 +60,7 @@ void unpack_full(const uint8_t* in, Uint* out, int
batch_size) {
/// will be split on the first byte boundary (hence having a spread of two
bytes) while
/// four bit integer will be well behaved and never spread over byte boundary
(hence
/// having a spread of one).
-constexpr int PackedMaxSpreadBytes(int width, int bit_offset) {
+ARROW_FORCE_INLINE constexpr int PackedMaxSpreadBytes(int width, int
bit_offset) {
int max = static_cast<int>(bit_util::BytesForBits(width));
int start = bit_offset;
do {
@@ -75,7 +75,7 @@ constexpr int PackedMaxSpreadBytes(int width, int bit_offset)
{
/// Compute the maximum spread in bytes that a packed integer can cover across
all bit
/// offsets.
-constexpr int PackedMaxSpreadBytes(int width) {
+ARROW_FORCE_INLINE constexpr int PackedMaxSpreadBytes(int width) {
int max = 0;
for (int offset = 0; offset < 8; ++offset) {
const int spread = PackedMaxSpreadBytes(width, offset);
@@ -97,7 +97,8 @@ using SpreadBufferUint = std::conditional_t<
/// In prolog mode, instead of unpacking all required element, the function
will
/// stop if it finds a byte aligned value start.
template <int kPackedBitWidth, bool kIsProlog, typename Uint>
-int unpack_exact(const uint8_t* in, Uint* out, int batch_size, int bit_offset)
{
+ARROW_FORCE_INLINE int unpack_exact(const uint8_t* in, const uint8_t* in_end,
Uint* out,
+ int batch_size, int bit_offset) {
static_assert(kPackedBitWidth > 0);
// For the epilog we adapt the max spread since better alignment give
shorter spreads
@@ -127,15 +128,28 @@ int unpack_exact(const uint8_t* in, Uint* out, int
batch_size, int bit_offset) {
ARROW_COMPILER_ASSUME(spread_bytes <= kMaxSpreadBytes);
// Reading the bytes for the current value.
- // Must be careful not to read out of input bounds.
buffer_uint buffer = 0;
- if constexpr (kLarge) {
- // We read the max possible bytes in the first pass and handle the rest
after.
- // Even though the worst spread does not happen on all iterations we can
still read
- // all bytes because we will mask them.
- std::memcpy(&buffer, in + start_byte, std::min(kBufferSize,
spread_bytes));
+ if (ARROW_PREDICT_TRUE(in + start_byte + kBufferSize < in_end)) {
+ // Fast path we read the whole buffer. In all but few last reads
(plural!) we will
+ // always have enough bytes left in the buffer to avoid out-of-bounds
reads. On top
+ // of this, in Arrow, `unpack` is always called with `max_read_bytes`
set, meaning
+ // this will often be the *only* path taken.
+ // We added this special case because `std::memcpy` without a compile
time constant
+ // was not inlined and optimized properly by the compiler, resulting to
a function
+ // call on each iteration.
+ // This also handles the `kLarge` case detailed below.
+ std::memcpy(&buffer, in + start_byte, kBufferSize);
} else {
- std::memcpy(&buffer, in + start_byte, spread_bytes);
+ // Slow path, we need to read exactly the correct number of bytes to
avoid
+ // out-of-bounds reads.
+ if constexpr (kLarge) {
+ // We read the max possible bytes in the first pass and handle the
rest after.
+ // Even though the worst spread does not happen on all iterations we
can still
+ // read all bytes because we will mask them.
+ std::memcpy(&buffer, in + start_byte, std::min(kBufferSize,
spread_bytes));
+ } else {
+ std::memcpy(&buffer, in + start_byte, spread_bytes);
+ }
}
buffer = bit_util::FromLittleEndian(buffer);
@@ -192,7 +206,8 @@ void unpack_width(const uint8_t* in, UnpackedUInt* out, int
batch_size, int bit_
const uint8_t* in_end = in + (max_read_bytes >= 0 ? max_read_bytes :
bytes_batch);
// In case of misalignment, we need to run the prolog until aligned.
- int extracted = unpack_exact<kPackedBitWidth, true>(in, out, batch_size,
bit_offset);
+ int extracted =
+ unpack_exact<kPackedBitWidth, true>(in, in_end, out, batch_size,
bit_offset);
// We either extracted everything or found a alignment
const int start_bit = extracted * kPackedBitWidth + bit_offset;
ARROW_DCHECK((extracted == batch_size) || ((start_bit) % 8 == 0));
@@ -230,7 +245,8 @@ void unpack_width(const uint8_t* in, UnpackedUInt* out, int
batch_size, int bit_
// Running the epilog for the remaining values that don't fit in a kernel
ARROW_DCHECK_GE(batch_size, 0);
ARROW_COMPILER_ASSUME(batch_size >= 0);
- unpack_exact<kPackedBitWidth, false>(in, out, batch_size, /* bit_offset=
*/ 0);
+ unpack_exact<kPackedBitWidth, false>(in, in_end, out, batch_size,
+ /* bit_offset= */ 0);
}
}
}
diff --git a/cpp/src/arrow/util/bpacking_simd_default.cc
b/cpp/src/arrow/util/bpacking_simd_128.cc
similarity index 89%
copy from cpp/src/arrow/util/bpacking_simd_default.cc
copy to cpp/src/arrow/util/bpacking_simd_128.cc
index 61adee52a3..1bc756b2aa 100644
--- a/cpp/src/arrow/util/bpacking_simd_default.cc
+++ b/cpp/src/arrow/util/bpacking_simd_128.cc
@@ -17,8 +17,10 @@
#if defined(ARROW_HAVE_NEON)
# define UNPACK_PLATFORM unpack_neon
+# define KERNEL_PLATFORM KernelNeon
#elif defined(ARROW_HAVE_SSE4_2)
# define UNPACK_PLATFORM unpack_sse4_2
+# define KERNEL_PLATFORM KernelSse42
#endif
#if defined(UNPACK_PLATFORM)
@@ -30,11 +32,11 @@
namespace arrow::internal::bpacking {
template <typename UnpackedUint, int kPackedBitSize>
-using Simd128Kernel = Kernel<UnpackedUint, kPackedBitSize, 128>;
+using KERNEL_PLATFORM = Kernel<UnpackedUint, kPackedBitSize,
xsimd::default_arch>;
template <typename Uint>
void UNPACK_PLATFORM(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
- return unpack_jump<Simd128Kernel>(in, out, opts);
+ return unpack_jump<KERNEL_PLATFORM>(in, out, opts);
}
template void UNPACK_PLATFORM<bool>(const uint8_t*, bool*, const
UnpackOptions&);
diff --git a/cpp/src/arrow/util/bpacking_simd_avx2.cc
b/cpp/src/arrow/util/bpacking_simd_128_alt.cc
similarity index 54%
rename from cpp/src/arrow/util/bpacking_simd_avx2.cc
rename to cpp/src/arrow/util/bpacking_simd_128_alt.cc
index de1f228aec..bd4799d3cd 100644
--- a/cpp/src/arrow/util/bpacking_simd_avx2.cc
+++ b/cpp/src/arrow/util/bpacking_simd_128_alt.cc
@@ -15,25 +15,37 @@
// specific language governing permissions and limitations
// under the License.
+#if defined(ARROW_HAVE_RUNTIME_SVE128)
+# define UNPACK_PLATFORM unpack_sve128
+# define KERNEL_PLATFORM KernelSve128
+#endif
+
+#if !defined(UNPACK_PLATFORM)
+# error "This file must be compiled with a known SIMD micro architecture"
+#endif
+
+#include <xsimd/xsimd.hpp>
+
#include "arrow/util/bpacking_dispatch_internal.h"
-#include "arrow/util/bpacking_internal.h"
#include "arrow/util/bpacking_simd_internal.h"
#include "arrow/util/bpacking_simd_kernel_internal.h"
namespace arrow::internal::bpacking {
template <typename UnpackedUint, int kPackedBitSize>
-using Simd256Kernel = Kernel<UnpackedUint, kPackedBitSize, 256>;
+using KERNEL_PLATFORM = Kernel<UnpackedUint, kPackedBitSize,
xsimd::detail::sve<128>>;
template <typename Uint>
-void unpack_avx2(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
- return unpack_jump<Simd256Kernel>(in, out, opts);
+void UNPACK_PLATFORM(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
+ return unpack_jump<KERNEL_PLATFORM>(in, out, opts);
}
-template void unpack_avx2<bool>(const uint8_t*, bool*, const UnpackOptions&);
-template void unpack_avx2<uint8_t>(const uint8_t*, uint8_t*, const
UnpackOptions&);
-template void unpack_avx2<uint16_t>(const uint8_t*, uint16_t*, const
UnpackOptions&);
-template void unpack_avx2<uint32_t>(const uint8_t*, uint32_t*, const
UnpackOptions&);
-template void unpack_avx2<uint64_t>(const uint8_t*, uint64_t*, const
UnpackOptions&);
+template void UNPACK_PLATFORM<bool>(const uint8_t*, bool*, const
UnpackOptions&);
+template void UNPACK_PLATFORM<uint8_t>(const uint8_t*, uint8_t*, const
UnpackOptions&);
+template void UNPACK_PLATFORM<uint16_t>(const uint8_t*, uint16_t*, const
UnpackOptions&);
+template void UNPACK_PLATFORM<uint32_t>(const uint8_t*, uint32_t*, const
UnpackOptions&);
+template void UNPACK_PLATFORM<uint64_t>(const uint8_t*, uint64_t*, const
UnpackOptions&);
} // namespace arrow::internal::bpacking
+
+#undef UNPACK_PLATFORM
diff --git a/cpp/src/arrow/util/bpacking_simd_default.cc
b/cpp/src/arrow/util/bpacking_simd_256.cc
similarity index 67%
rename from cpp/src/arrow/util/bpacking_simd_default.cc
rename to cpp/src/arrow/util/bpacking_simd_256.cc
index 61adee52a3..8c5ce4dc7d 100644
--- a/cpp/src/arrow/util/bpacking_simd_default.cc
+++ b/cpp/src/arrow/util/bpacking_simd_256.cc
@@ -15,26 +15,31 @@
// specific language governing permissions and limitations
// under the License.
-#if defined(ARROW_HAVE_NEON)
-# define UNPACK_PLATFORM unpack_neon
-#elif defined(ARROW_HAVE_SSE4_2)
-# define UNPACK_PLATFORM unpack_sse4_2
+#if defined(ARROW_HAVE_SVE256) || defined(ARROW_HAVE_RUNTIME_SVE256)
+# define UNPACK_PLATFORM unpack_sve256
+# define KERNEL_PLATFORM KernelSve256
+#elif defined(ARROW_HAVE_RUNTIME_AVX2)
+# define UNPACK_PLATFORM unpack_avx2
+# define KERNEL_PLATFORM KernelAvx2
#endif
-#if defined(UNPACK_PLATFORM)
+#if !defined(UNPACK_PLATFORM)
+# error "This file must be compiled with a known SIMD micro architecture"
+#endif
-# include "arrow/util/bpacking_dispatch_internal.h"
-# include "arrow/util/bpacking_simd_internal.h"
-# include "arrow/util/bpacking_simd_kernel_internal.h"
+#include "arrow/util/bpacking_dispatch_internal.h"
+#include "arrow/util/bpacking_internal.h"
+#include "arrow/util/bpacking_simd_internal.h"
+#include "arrow/util/bpacking_simd_kernel_internal.h"
namespace arrow::internal::bpacking {
template <typename UnpackedUint, int kPackedBitSize>
-using Simd128Kernel = Kernel<UnpackedUint, kPackedBitSize, 128>;
+using KERNEL_PLATFORM = Kernel<UnpackedUint, kPackedBitSize,
xsimd::default_arch>;
template <typename Uint>
void UNPACK_PLATFORM(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
- return unpack_jump<Simd128Kernel>(in, out, opts);
+ return unpack_jump<KERNEL_PLATFORM>(in, out, opts);
}
template void UNPACK_PLATFORM<bool>(const uint8_t*, bool*, const
UnpackOptions&);
@@ -45,5 +50,4 @@ template void UNPACK_PLATFORM<uint64_t>(const uint8_t*,
uint64_t*, const UnpackO
} // namespace arrow::internal::bpacking
-# undef UNPACK_PLATFORM
-#endif // UNPACK_PLATFORM
+#undef UNPACK_PLATFORM
diff --git a/cpp/src/arrow/util/bpacking_simd_internal.h
b/cpp/src/arrow/util/bpacking_simd_internal.h
index 44ad4b0f86..d5a81baaec 100644
--- a/cpp/src/arrow/util/bpacking_simd_internal.h
+++ b/cpp/src/arrow/util/bpacking_simd_internal.h
@@ -25,68 +25,90 @@
namespace arrow::internal::bpacking {
#if defined(ARROW_HAVE_NEON)
+# define UNPACK_ARCH128 unpack_neon
+#elif defined(ARROW_HAVE_SSE4_2)
+# define UNPACK_ARCH128 unpack_sse4_2
+#endif
+
+#if defined(UNPACK_ARCH128)
template <typename Uint>
-ARROW_EXPORT void unpack_neon(const uint8_t* in, Uint* out, const
UnpackOptions& opts);
+ARROW_EXPORT void UNPACK_ARCH128(const uint8_t* in, Uint* out, const
UnpackOptions& opts);
-extern template ARROW_TEMPLATE_EXPORT void unpack_neon<bool>( //
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128<bool>( //
const uint8_t* in, bool* out, const UnpackOptions& opts);
-extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint8_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128<uint8_t>(
const uint8_t* in, uint8_t* out, const UnpackOptions& opts);
-extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint16_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128<uint16_t>(
const uint8_t* in, uint16_t* out, const UnpackOptions& opts);
-extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint32_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128<uint32_t>(
const uint8_t* in, uint32_t* out, const UnpackOptions& opts);
-extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint64_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128<uint64_t>(
const uint8_t* in, uint64_t* out, const UnpackOptions& opts);
-#elif defined(ARROW_HAVE_SSE4_2)
+#endif // UNPACK_ARCH128
+#undef UNPACK_ARCH128
+
+#if defined(ARROW_HAVE_RUNTIME_SVE128)
+# define UNPACK_ARCH128_ALT unpack_sve128
+#endif
+
+#if defined(UNPACK_ARCH128_ALT)
template <typename Uint>
-ARROW_EXPORT void unpack_sse4_2(const uint8_t* in, Uint* out, const
UnpackOptions& opts);
+ARROW_EXPORT void UNPACK_ARCH128_ALT(const uint8_t* in, Uint* out,
+ const UnpackOptions& opts);
-extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2<bool>( //
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128_ALT<bool>( //
const uint8_t* in, bool* out, const UnpackOptions& opts);
-extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2<uint8_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128_ALT<uint8_t>(
const uint8_t* in, uint8_t* out, const UnpackOptions& opts);
-extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2<uint16_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128_ALT<uint16_t>(
const uint8_t* in, uint16_t* out, const UnpackOptions& opts);
-extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2<uint32_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128_ALT<uint32_t>(
const uint8_t* in, uint32_t* out, const UnpackOptions& opts);
-extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2<uint64_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH128_ALT<uint64_t>(
const uint8_t* in, uint64_t* out, const UnpackOptions& opts);
+#endif // UNPACK_ARCH128_ALT
+#undef UNPACK_ARCH128_ALT
+
+#if defined(ARROW_HAVE_SVE256) || defined(ARROW_HAVE_RUNTIME_SVE256)
+# define UNPACK_ARCH256 unpack_sve256
+#elif defined(ARROW_HAVE_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX2)
+# define UNPACK_ARCH256 unpack_avx2
 #endif
-#if defined(ARROW_HAVE_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX2)
+#if defined(UNPACK_ARCH256)
template <typename Uint>
-ARROW_EXPORT void unpack_avx2(const uint8_t* in, Uint* out, const
UnpackOptions& opts);
+ARROW_EXPORT void UNPACK_ARCH256(const uint8_t* in, Uint* out, const
UnpackOptions& opts);
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<bool>( //
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256<bool>( //
const uint8_t* in, bool* out, const UnpackOptions& opts);
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint8_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256<uint8_t>(
const uint8_t* in, uint8_t* out, const UnpackOptions& opts);
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint16_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256<uint16_t>(
const uint8_t* in, uint16_t* out, const UnpackOptions& opts);
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint32_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256<uint32_t>(
const uint8_t* in, uint32_t* out, const UnpackOptions& opts);
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint64_t>(
+extern template ARROW_TEMPLATE_EXPORT void UNPACK_ARCH256<uint64_t>(
const uint8_t* in, uint64_t* out, const UnpackOptions& opts);
-#endif
+#endif // UNPACK_ARCH256
+#undef UNPACK_ARCH256
#if defined(ARROW_HAVE_AVX512) || defined(ARROW_HAVE_RUNTIME_AVX512)
diff --git a/cpp/src/arrow/util/bpacking_simd_kernel_internal.h
b/cpp/src/arrow/util/bpacking_simd_kernel_internal.h
index 318f348b4a..fe879bb5b0 100644
--- a/cpp/src/arrow/util/bpacking_simd_kernel_internal.h
+++ b/cpp/src/arrow/util/bpacking_simd_kernel_internal.h
@@ -37,12 +37,13 @@
#include "arrow/util/bit_util.h"
#include "arrow/util/bpacking_dispatch_internal.h"
+#include "arrow/util/macros.h"
#include "arrow/util/type_traits.h"
namespace arrow::internal::bpacking {
template <typename T, std::size_t N>
-constexpr std::array<T, N> BuildConstantArray(T val) {
+ARROW_FORCE_INLINE constexpr std::array<T, N> BuildConstantArray(T val) {
std::array<T, N> out = {};
for (auto& v : out) {
v = val;
@@ -51,7 +52,7 @@ constexpr std::array<T, N> BuildConstantArray(T val) {
}
template <typename Arr>
-constexpr Arr BuildConstantArrayLike(typename Arr::value_type val) {
+ARROW_FORCE_INLINE constexpr Arr BuildConstantArrayLike(typename
Arr::value_type val) {
return BuildConstantArray<typename Arr::value_type,
std::tuple_size_v<Arr>>(val);
}
@@ -61,7 +62,7 @@ constexpr Arr BuildConstantArrayLike(typename Arr::value_type
val) {
/// Simple constexpr maximum element suited for non empty arrays.
template <typename T, std::size_t N>
-constexpr T max_value(const std::array<T, N>& arr) {
+ARROW_FORCE_INLINE constexpr T max_value(const std::array<T, N>& arr) {
static_assert(N > 0);
T out = 0;
for (const T& v : arr) {
@@ -73,7 +74,8 @@ constexpr T max_value(const std::array<T, N>& arr) {
}
template <std::array kArr, typename Arch, std::size_t... Is>
-constexpr auto array_to_batch_constant_impl(std::index_sequence<Is...>) {
+ARROW_FORCE_INLINE constexpr auto array_to_batch_constant_impl(
+ std::index_sequence<Is...>) {
using Array = std::decay_t<decltype(kArr)>;
using value_type = typename Array::value_type;
@@ -82,20 +84,20 @@ constexpr auto
array_to_batch_constant_impl(std::index_sequence<Is...>) {
/// Make a ``xsimd::batch_constant`` from a static constexpr array.
template <std::array kArr, typename Arch>
-constexpr auto array_to_batch_constant() {
+ARROW_FORCE_INLINE constexpr auto array_to_batch_constant() {
return array_to_batch_constant_impl<kArr, Arch>(
std::make_index_sequence<kArr.size()>());
}
template <typename Uint, typename Arch>
-xsimd::batch<uint8_t, Arch> load_val_as(const uint8_t* in) {
+ARROW_FORCE_INLINE xsimd::batch<uint8_t, Arch> load_val_as(const uint8_t* in) {
const Uint val = util::SafeLoadAs<Uint>(in);
const auto batch = xsimd::batch<Uint, Arch>(val);
return xsimd::bitwise_cast<uint8_t>(batch);
}
template <int kBytes, typename Arch>
-xsimd::batch<uint8_t, Arch> safe_load_bytes(const uint8_t* in) {
+ARROW_FORCE_INLINE xsimd::batch<uint8_t, Arch> safe_load_bytes(const uint8_t*
in) {
if constexpr (kBytes <= sizeof(uint64_t)) {
return load_val_as<SizedUint<kBytes>, Arch>(in);
}
@@ -104,7 +106,7 @@ xsimd::batch<uint8_t, Arch> safe_load_bytes(const uint8_t*
in) {
}
template <typename Int, int kOffset, int kLength, typename Arr>
-constexpr auto select_stride_impl(Arr shifts) {
+ARROW_FORCE_INLINE constexpr auto select_stride_impl(Arr shifts) {
std::array<Int, shifts.size() / kLength> out{};
for (std::size_t i = 0; i < out.size(); ++i) {
out[i] = shifts[kLength * i + kOffset];
@@ -131,7 +133,8 @@ constexpr auto select_stride_impl(Arr shifts) {
/// while an offset of 1 would return the values:
/// |1|3|5|7|
template <typename ToInt, int kOffset, typename Int, typename Arch, Int...
kShifts>
-constexpr auto select_stride(xsimd::batch_constant<Int, Arch, kShifts...>) {
+ARROW_FORCE_INLINE constexpr auto select_stride(
+ xsimd::batch_constant<Int, Arch, kShifts...>) {
static_assert(kOffset < sizeof(ToInt) / sizeof(Int));
constexpr auto kStridesArr =
select_stride_impl<ToInt, kOffset, sizeof(ToInt) / sizeof(Int)>(
@@ -163,8 +166,8 @@ constexpr bool IsNeon = std::is_base_of_v<xsimd::neon,
Arch>;
/// TODO(xsimd) Tracking in https://github.com/xtensor-stack/xsimd/pull/1220
/// When migrating, be sure to use batch_constant overload, and not the batch
one.
template <typename Arch, typename Int, Int... kShifts>
-auto left_shift(const xsimd::batch<Int, Arch>& batch,
- xsimd::batch_constant<Int, Arch, kShifts...> shifts)
+ARROW_FORCE_INLINE auto left_shift(const xsimd::batch<Int, Arch>& batch,
+ xsimd::batch_constant<Int, Arch,
kShifts...> shifts)
-> xsimd::batch<Int, Arch> {
constexpr bool kIsSse2 = IsSse2<Arch>;
constexpr bool kIsAvx2 = IsAvx2<Arch>;
@@ -227,8 +230,9 @@ auto left_shift(const xsimd::batch<Int, Arch>& batch,
/// integers per second through vectorization, Software Practice & Experience
45 (1),
/// 2015. http://arxiv.org/abs/1209.2137
template <typename Arch, typename Int, Int... kShifts>
-auto right_shift_by_excess(const xsimd::batch<Int, Arch>& batch,
- xsimd::batch_constant<Int, Arch, kShifts...>
shifts) {
+ARROW_FORCE_INLINE auto right_shift_by_excess(
+ const xsimd::batch<Int, Arch>& batch,
+ xsimd::batch_constant<Int, Arch, kShifts...> shifts) {
constexpr bool kIsSse2 = IsSse2<Arch>;
constexpr bool kIsAvx2 = IsAvx2<Arch>;
static_assert(
@@ -297,8 +301,9 @@ auto right_shift_by_excess(const xsimd::batch<Int, Arch>&
batch,
///
/// @see KernelShape
/// @see PackedMaxSpreadBytes
-constexpr bool PackedIsOversizedForSimd(int simd_bit_size, int
unpacked_bit_size,
- int packed_bit_size) {
+ARROW_FORCE_INLINE constexpr bool PackedIsOversizedForSimd(int simd_bit_size,
+ int
unpacked_bit_size,
+ int
packed_bit_size) {
const int unpacked_per_simd = simd_bit_size / unpacked_bit_size;
const auto packed_per_read_for_offset = [&](int bit_offset) -> int {
@@ -382,10 +387,18 @@ struct KernelShape {
};
/// Packing all useful and derived information about a kernel in a single type.
-template <typename UnpackedUint, int kPackedBitSize, int kSimdBitSize>
+template <typename UnpackedUint, int kPackedBitSize, typename Arch>
struct KernelTraits {
+ using unpacked_type = UnpackedUint;
+ /// The integer type to work with, `unpacked_type` or an appropriate type
for bool.
+ using uint_type = std::conditional_t<std::is_same_v<unpacked_type, bool>,
+ SizedUint<sizeof(bool)>, unpacked_type>;
+ using arch_type = Arch;
+ using simd_batch = xsimd::batch<uint_type, arch_type>;
+ using simd_bytes = xsimd::batch<uint8_t, arch_type>;
+
static constexpr KernelShape kShape = {
- .simd_bit_size_ = kSimdBitSize,
+ .simd_bit_size_ = 8 * simd_bytes ::size,
.unpacked_bit_size_ = 8 * sizeof(UnpackedUint),
.packed_bit_size_ = kPackedBitSize,
};
@@ -393,20 +406,12 @@ struct KernelTraits {
static_assert(kShape.simd_bit_size() % kShape.unpacked_bit_size() == 0);
static_assert(0 < kShape.packed_bit_size());
static_assert(kShape.packed_bit_size() < kShape.simd_bit_size());
-
- using unpacked_type = UnpackedUint;
- /// The integer type to work with, `unpacked_type` or an appropriate type
for bool.
- using uint_type = std::conditional_t<std::is_same_v<unpacked_type, bool>,
- SizedUint<sizeof(bool)>, unpacked_type>;
- using simd_batch = xsimd::make_sized_batch_t<uint_type,
kShape.unpacked_per_simd()>;
- using simd_bytes = xsimd::make_sized_batch_t<uint8_t,
kShape.simd_byte_size()>;
- using arch_type = typename simd_batch::arch_type;
};
/// Return similar kernel traits but with a different integer unpacking type.
template <typename KerTraits, typename Uint>
using KernelTraitsWithUnpackUint = KernelTraits<Uint,
KerTraits::kShape.packed_bit_size(),
-
KerTraits::kShape.simd_bit_size()>;
+ typename KerTraits::arch_type>;
/******************
* MediumKernel *
@@ -506,7 +511,8 @@ constexpr MediumKernelPlanSize MediumKernelPlanSize::Build(
/// function advise kernel plans to read only read 64 bits.
/// This limits restrictions set by the plan on the input memory reads built
to avoid
/// reading overflow.
-constexpr int adjust_bytes_per_read(int bits_per_read, int simd_byte_size) {
+ARROW_FORCE_INLINE constexpr int adjust_bytes_per_read(int bits_per_read,
+ int simd_byte_size) {
if (bits_per_read <= static_cast<int>(8 * sizeof(uint32_t))) {
return sizeof(uint32_t);
} else if (bits_per_read <= static_cast<int>(8 * sizeof(uint64_t))) {
@@ -718,7 +724,8 @@ struct MediumKernel {
static constexpr int kBytesRead = kPlan.total_bytes_read();
template <int kReadIdx, int kSwizzleIdx, int kShiftIdx>
- static void unpack_one_shift_impl(const simd_batch& words, unpacked_type*
out) {
+ ARROW_FORCE_INLINE static void unpack_one_shift_impl(const simd_batch& words,
+ unpacked_type* out) {
constexpr auto kRightShiftsArr =
kPlan.shifts.at(kReadIdx).at(kSwizzleIdx).at(kShiftIdx);
constexpr auto kRightShifts = array_to_batch_constant<kRightShiftsArr,
arch_type>();
@@ -741,8 +748,9 @@ struct MediumKernel {
}
template <int kReadIdx, int kSwizzleIdx, int... kShiftIds>
- static void unpack_one_swizzle_impl(const simd_bytes& bytes, unpacked_type*
out,
- std::integer_sequence<int,
kShiftIds...>) {
+ ARROW_FORCE_INLINE static void unpack_one_swizzle_impl(
+ const simd_bytes& bytes, unpacked_type* out,
+ std::integer_sequence<int, kShiftIds...>) {
constexpr auto kSwizzlesArr = kPlan.swizzles.at(kReadIdx).at(kSwizzleIdx);
constexpr auto kSwizzles = array_to_batch_constant<kSwizzlesArr,
arch_type>();
@@ -752,8 +760,8 @@ struct MediumKernel {
}
template <int kReadIdx, int... kSwizzleIds>
- static void unpack_one_read_impl(const uint8_t* in, unpacked_type* out,
- std::integer_sequence<int, kSwizzleIds...>)
{
+ ARROW_FORCE_INLINE static void unpack_one_read_impl(
+ const uint8_t* in, unpacked_type* out, std::integer_sequence<int,
kSwizzleIds...>) {
using ShiftSeq = std::make_integer_sequence<int,
kPlanSize.shifts_per_swizzle()>;
const auto bytes =
safe_load_bytes<kPlan.bytes_per_read(), arch_type>(in +
kPlan.reads.at(kReadIdx));
@@ -761,13 +769,13 @@ struct MediumKernel {
}
template <int... kReadIds>
- static void unpack_all_impl(const uint8_t* in, unpacked_type* out,
- std::integer_sequence<int, kReadIds...>) {
+ ARROW_FORCE_INLINE static void unpack_all_impl(
+ const uint8_t* in, unpacked_type* out, std::integer_sequence<int,
kReadIds...>) {
using SwizzleSeq = std::make_integer_sequence<int,
kPlanSize.swizzles_per_read()>;
(unpack_one_read_impl<kReadIds>(in, out, SwizzleSeq{}), ...);
}
- static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) {
+ ARROW_FORCE_INLINE static const uint8_t* unpack(const uint8_t* in,
unpacked_type* out) {
using ReadSeq = std::make_integer_sequence<int,
kPlanSize.reads_per_kernel()>;
unpack_all_impl(in, out, ReadSeq{});
return in + (kPlan.unpacked_per_kernel() * kShape.packed_bit_size()) / 8;
@@ -1005,7 +1013,8 @@ struct LargeKernel {
static constexpr int kBytesRead = kPlan.total_bytes_read();
template <int kReadIdx>
- static void unpack_one_read_impl(const uint8_t* in, unpacked_type* out) {
+ ARROW_FORCE_INLINE static void unpack_one_read_impl(const uint8_t* in,
+ unpacked_type* out) {
constexpr auto kLowSwizzles =
array_to_batch_constant<kPlan.low_swizzles.at(kReadIdx), arch_type>();
constexpr auto kLowRShifts =
@@ -1040,12 +1049,12 @@ struct LargeKernel {
}
template <int... kReadIds>
- static void unpack_all_impl(const uint8_t* in, unpacked_type* out,
- std::integer_sequence<int, kReadIds...>) {
+ ARROW_FORCE_INLINE static void unpack_all_impl(
+ const uint8_t* in, unpacked_type* out, std::integer_sequence<int,
kReadIds...>) {
(unpack_one_read_impl<kReadIds>(in, out), ...);
}
- static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) {
+ ARROW_FORCE_INLINE static const uint8_t* unpack(const uint8_t* in,
unpacked_type* out) {
using ReadSeq = std::make_integer_sequence<int,
kPlanSize.reads_per_kernel()>;
unpack_all_impl(in, out, ReadSeq{});
return in + (kPlan.kPlanSize.unpacked_per_kernel() *
kShape.packed_bit_size()) / 8;
@@ -1064,7 +1073,9 @@ struct NoOpKernel {
static constexpr int kValuesUnpacked = 0;
static constexpr int kBytesRead = 0;
- static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) { return
in; }
+ ARROW_FORCE_INLINE static const uint8_t* unpack(const uint8_t* in,
unpacked_type* out) {
+ return in;
+ }
};
template <typename KernelTraits, typename WorkingKernel>
@@ -1073,7 +1084,7 @@ struct CastingKernel : WorkingKernel {
static constexpr int kValuesUnpacked = WorkingKernel::kValuesUnpacked;
- static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) {
+ ARROW_FORCE_INLINE static const uint8_t* unpack(const uint8_t* in,
unpacked_type* out) {
using working_type = typename WorkingKernel::unpacked_type;
working_type buffer[kValuesUnpacked] = {};
@@ -1131,8 +1142,7 @@ template <typename Traits>
using KernelDispatch = decltype(KernelDispatchImpl<Traits>());
/// The public kernel exposed for any size.
-template <typename UnpackedUint, int kPackedBitSize, int kSimdBitSize>
-struct Kernel : KernelDispatch<KernelTraits<UnpackedUint, kPackedBitSize,
kSimdBitSize>> {
-};
+template <typename UnpackedUint, int kPackedBitSize, typename Arch>
+struct Kernel : KernelDispatch<KernelTraits<UnpackedUint, kPackedBitSize,
Arch>> {};
} // namespace arrow::internal::bpacking
diff --git a/cpp/src/arrow/util/bpacking_test.cc
b/cpp/src/arrow/util/bpacking_test.cc
index 9072c0b8d1..d4d588228e 100644
--- a/cpp/src/arrow/util/bpacking_test.cc
+++ b/cpp/src/arrow/util/bpacking_test.cc
@@ -27,7 +27,8 @@
#include "arrow/util/bpacking_scalar_internal.h"
#include "arrow/util/bpacking_simd_internal.h"
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX512) || \
+ defined(ARROW_HAVE_RUNTIME_SVE128) || defined(ARROW_HAVE_RUNTIME_SVE256)
# include "arrow/util/cpu_info.h"
#endif
@@ -104,9 +105,9 @@ std::vector<uint8_t> PackValues(const std::vector<Int>&
values, int num_values,
return out;
}
-class TestUnpack : public ::testing::TestWithParam<int> {
+template <typename Int>
+class TestUnpack : public ::testing::Test {
protected:
- template <typename Int>
void TestRoundtripAlignment(UnpackFunc<Int> unpack, const UnpackOptions&
opts) {
const auto original =
GenerateRandomValuesForPacking<Int>(opts.batch_size, opts.bit_width);
@@ -129,7 +130,6 @@ class TestUnpack : public ::testing::TestWithParam<int> {
<< val_original << " but unpacked " <<
val_unpacked;
}
- template <typename Int>
void TestUnpackZeros(UnpackFunc<Int> unpack, const UnpackOptions& opts) {
const auto num_bytes = GetNumBytes(opts.batch_size, opts.bit_width,
opts.bit_offset);
@@ -140,7 +140,6 @@ class TestUnpack : public ::testing::TestWithParam<int> {
EXPECT_EQ(unpacked, expected);
}
- template <typename Int>
void TestUnpackOnes(UnpackFunc<Int> unpack, const UnpackOptions& opts) {
const auto num_bytes = GetNumBytes(opts.batch_size, opts.bit_width,
opts.bit_offset);
@@ -161,7 +160,6 @@ class TestUnpack : public ::testing::TestWithParam<int> {
EXPECT_EQ(unpacked, expected);
}
- template <typename Int>
void TestUnpackAlternating(UnpackFunc<Int> unpack, const UnpackOptions&
opts) {
const auto num_bytes = GetNumBytes(opts.batch_size, opts.bit_width,
opts.bit_offset);
@@ -191,168 +189,136 @@ class TestUnpack : public ::testing::TestWithParam<int>
{
EXPECT_EQ(unpacked, expected);
}
- template <typename Int>
void TestAll(UnpackFunc<Int> unpack) {
- const int num_values_base = GetParam();
-
- constexpr int kMaxBitWidth = std::is_same_v<Int, bool> ? 1 : 8 *
sizeof(Int);
-
- // Given how many edge cases there are in unpacking integers, it is best
to test all
- // sizes
- for (int bit_width = 0; bit_width <= kMaxBitWidth; ++bit_width) {
- SCOPED_TRACE(::testing::Message() << "Testing bit_width=" << bit_width);
-
- // We test all bit offset within a byte / misalignments to change how the
- // prolog.
- for (int bit_offset = 0; bit_offset < 8; ++bit_offset) {
- SCOPED_TRACE(::testing::Message() << "Testing bit_offset=" <<
bit_offset);
-
- const UnpackOptions opts{
- .batch_size = num_values_base,
- .bit_width = bit_width,
- .bit_offset = bit_offset,
- .max_read_bytes = -1, // No over-reading in testing (strict ASAN)
- };
-
- // Known values
- TestUnpackZeros(unpack, opts);
- TestUnpackOnes(unpack, opts);
- TestUnpackAlternating(unpack, opts);
-
- // Roundtrips
- TestRoundtripAlignment(unpack, opts);
-
- if (testing::Test::HasFailure()) return;
+ // There are actually many differences across the different sizes.
+ // It is best to test them all.
+ for (int num_values_base : {64, 128, 2048}) {
+ SCOPED_TRACE(::testing::Message() << "Testing num_values=" <<
num_values_base);
+
+ constexpr int kMaxBitWidth = std::is_same_v<Int, bool> ? 1 : 8 *
sizeof(Int);
+
+ // Given how many edge cases there are in unpacking integers, it is best
to test all
+ // bit widths.
+ for (int bit_width = 0; bit_width <= kMaxBitWidth; ++bit_width) {
+ SCOPED_TRACE(::testing::Message() << "Testing bit_width=" <<
bit_width);
+
+ // We test all bit offsets within a byte (misalignments) to vary how the
+ // prolog is handled.
+ for (int bit_offset = 0; bit_offset < 8; ++bit_offset) {
+ SCOPED_TRACE(::testing::Message() << "Testing bit_offset=" <<
bit_offset);
+
+ const UnpackOptions opts{
+ .batch_size = num_values_base,
+ .bit_width = bit_width,
+ .bit_offset = bit_offset,
+ .max_read_bytes = -1, // No over-reading in testing (strict
ASAN)
+ };
+
+ // Known values
+ TestUnpackZeros(unpack, opts);
+ TestUnpackOnes(unpack, opts);
+ TestUnpackAlternating(unpack, opts);
+
+ // Roundtrips
+ TestRoundtripAlignment(unpack, opts);
+
+ if (testing::Test::HasFailure()) return;
+ }
+
+ // Similarly, we test all epilog sizes. That is extra values that
could make it
+ // fall outside of an SIMD register
+ for (int epilogue_size = 0; epilogue_size <= kMaxBitWidth;
++epilogue_size) {
+ SCOPED_TRACE(::testing::Message() << "Testing epilog_size=" <<
epilogue_size);
+
+ const int num_values = num_values_base + epilogue_size;
+
+ const UnpackOptions opts{
+ .batch_size = num_values,
+ .bit_width = bit_width,
+ .bit_offset = 0,
+ .max_read_bytes = -1, // No over-reading in testing (strict
ASAN)
+ };
+
+ // Known values
+ TestUnpackZeros(unpack, opts);
+ TestUnpackOnes(unpack, opts);
+ TestUnpackAlternating(unpack, opts);
+
+ // Roundtrips
+ TestRoundtripAlignment(unpack, opts);
+
+ if (testing::Test::HasFailure()) return;
+ }
}
+ }
+ }
+};
- // Similarly, we test all epilog sizes. That is extra values that could
make it
- // fall outside of an SIMD register
- for (int epilogue_size = 0; epilogue_size <= kMaxBitWidth;
++epilogue_size) {
- SCOPED_TRACE(::testing::Message() << "Testing epilog_size=" <<
epilogue_size);
-
- const int num_values = num_values_base + epilogue_size;
-
- const UnpackOptions opts{
- .batch_size = num_values,
- .bit_width = bit_width,
- .bit_offset = 0,
- .max_read_bytes = -1, // No over-reading in testing (strict ASAN)
- };
-
- // Known values
- TestUnpackZeros(unpack, opts);
- TestUnpackOnes(unpack, opts);
- TestUnpackAlternating(unpack, opts);
-
- // Roundtrips
- TestRoundtripAlignment(unpack, opts);
+using UnpackTypes = ::testing::Types<bool, uint8_t, uint16_t, uint32_t,
uint64_t>;
- if (testing::Test::HasFailure()) return;
- }
- }
+struct UnpackTypeNames {
+ template <typename T>
+ static std::string GetName(int) {
+ if constexpr (std::is_same_v<T, bool>) return "bool";
+ if constexpr (std::is_same_v<T, uint8_t>) return "uint8_t";
+ if constexpr (std::is_same_v<T, uint16_t>) return "uint16_t";
+ if constexpr (std::is_same_v<T, uint32_t>) return "uint32_t";
+ if constexpr (std::is_same_v<T, uint64_t>) return "uint64_t";
}
};
-// There are actually many differences across the different sizes.
-// It is best to test them all.
-INSTANTIATE_TEST_SUITE_P(UnpackMultiplesOf64Values, TestUnpack,
- ::testing::Values(64, 128, 2048),
- [](const
::testing::TestParamInfo<TestUnpack::ParamType>& info) {
- return "Length" + std::to_string(info.param);
- });
+TYPED_TEST_SUITE(TestUnpack, UnpackTypes, UnpackTypeNames);
-TEST_P(TestUnpack, UnpackBoolScalar) {
this->TestAll(&bpacking::unpack_scalar<bool>); }
-TEST_P(TestUnpack, Unpack8Scalar) {
this->TestAll(&bpacking::unpack_scalar<uint8_t>); }
-TEST_P(TestUnpack, Unpack16Scalar) {
this->TestAll(&bpacking::unpack_scalar<uint16_t>); }
-TEST_P(TestUnpack, Unpack32Scalar) {
this->TestAll(&bpacking::unpack_scalar<uint32_t>); }
-TEST_P(TestUnpack, Unpack64Scalar) {
this->TestAll(&bpacking::unpack_scalar<uint64_t>); }
+TYPED_TEST(TestUnpack, UnpackScalar) {
+ this->TestAll(&bpacking::unpack_scalar<TypeParam>);
+}
#if defined(ARROW_HAVE_SSE4_2)
-TEST_P(TestUnpack, UnpackBoolSse4_2) {
this->TestAll(&bpacking::unpack_sse4_2<bool>); }
-TEST_P(TestUnpack, Unpack8Sse4_2) {
this->TestAll(&bpacking::unpack_sse4_2<uint8_t>); }
-TEST_P(TestUnpack, Unpack16Sse4_2) {
this->TestAll(&bpacking::unpack_sse4_2<uint16_t>); }
-TEST_P(TestUnpack, Unpack32Sse4_2) {
this->TestAll(&bpacking::unpack_sse4_2<uint32_t>); }
-TEST_P(TestUnpack, Unpack64Sse4_2) {
this->TestAll(&bpacking::unpack_sse4_2<uint64_t>); }
+TYPED_TEST(TestUnpack, UnpackSse4_2) {
+ this->TestAll(&bpacking::unpack_sse4_2<TypeParam>);
+}
#endif
#if defined(ARROW_HAVE_RUNTIME_AVX2)
-TEST_P(TestUnpack, UnpackBoolAvx2) {
- if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2)) {
- GTEST_SKIP() << "Test requires AVX2";
- }
- this->TestAll(&bpacking::unpack_avx2<bool>);
-}
-TEST_P(TestUnpack, Unpack8Avx2) {
- if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2)) {
- GTEST_SKIP() << "Test requires AVX2";
- }
- this->TestAll(&bpacking::unpack_avx2<uint8_t>);
-}
-TEST_P(TestUnpack, Unpack16Avx2) {
- if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2)) {
- GTEST_SKIP() << "Test requires AVX2";
- }
- this->TestAll(&bpacking::unpack_avx2<uint16_t>);
-}
-TEST_P(TestUnpack, Unpack32Avx2) {
- if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2)) {
- GTEST_SKIP() << "Test requires AVX2";
- }
- this->TestAll(&bpacking::unpack_avx2<uint32_t>);
-}
-TEST_P(TestUnpack, Unpack64Avx2) {
+TYPED_TEST(TestUnpack, UnpackAvx2) {
if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2)) {
GTEST_SKIP() << "Test requires AVX2";
}
- this->TestAll(&bpacking::unpack_avx2<uint64_t>);
+ this->TestAll(&bpacking::unpack_avx2<TypeParam>);
}
#endif
#if defined(ARROW_HAVE_RUNTIME_AVX512)
-TEST_P(TestUnpack, UnpackBoolAvx512) {
- if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512)) {
- GTEST_SKIP() << "Test requires AVX512";
- }
- this->TestAll(&bpacking::unpack_avx512<bool>);
-}
-TEST_P(TestUnpack, Unpack8Avx512) {
- if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512)) {
- GTEST_SKIP() << "Test requires AVX512";
- }
- this->TestAll(&bpacking::unpack_avx512<uint8_t>);
-}
-TEST_P(TestUnpack, Unpack16Avx512) {
+TYPED_TEST(TestUnpack, UnpackAvx512) {
if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512)) {
GTEST_SKIP() << "Test requires AVX512";
}
- this->TestAll(&bpacking::unpack_avx512<uint16_t>);
+ this->TestAll(&bpacking::unpack_avx512<TypeParam>);
}
-TEST_P(TestUnpack, Unpack32Avx512) {
- if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512)) {
- GTEST_SKIP() << "Test requires AVX512";
- }
- this->TestAll(&bpacking::unpack_avx512<uint32_t>);
-}
-TEST_P(TestUnpack, Unpack64Avx512) {
- if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512)) {
- GTEST_SKIP() << "Test requires AVX512";
+#endif
+
+#if defined(ARROW_HAVE_NEON)
+TYPED_TEST(TestUnpack, UnpackNeon) {
this->TestAll(&bpacking::unpack_neon<TypeParam>); }
+#endif
+
+#if defined(ARROW_HAVE_RUNTIME_SVE128)
+TYPED_TEST(TestUnpack, UnpackSve128) {
+ if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE128)) {
+ GTEST_SKIP() << "Test requires SVE128";
}
- this->TestAll(&bpacking::unpack_avx512<uint64_t>);
+ this->TestAll(&bpacking::unpack_sve128<TypeParam>);
}
#endif
-#if defined(ARROW_HAVE_NEON)
-TEST_P(TestUnpack, UnpackBoolNeon) {
this->TestAll(&bpacking::unpack_neon<bool>); }
-TEST_P(TestUnpack, Unpack8Neon) {
this->TestAll(&bpacking::unpack_neon<uint8_t>); }
-TEST_P(TestUnpack, Unpack16Neon) {
this->TestAll(&bpacking::unpack_neon<uint16_t>); }
-TEST_P(TestUnpack, Unpack32Neon) {
this->TestAll(&bpacking::unpack_neon<uint32_t>); }
-TEST_P(TestUnpack, Unpack64Neon) {
this->TestAll(&bpacking::unpack_neon<uint64_t>); }
+#if defined(ARROW_HAVE_RUNTIME_SVE256)
+TYPED_TEST(TestUnpack, UnpackSve256) {
+ if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::SVE256)) {
+ GTEST_SKIP() << "Test requires SVE256";
+ }
+ this->TestAll(&bpacking::unpack_sve256<TypeParam>);
+}
#endif
-TEST_P(TestUnpack, UnpackBool) { this->TestAll(&unpack<bool>); }
-TEST_P(TestUnpack, Unpack8) { this->TestAll(&unpack<uint8_t>); }
-TEST_P(TestUnpack, Unpack16) { this->TestAll(&unpack<uint16_t>); }
-TEST_P(TestUnpack, Unpack32) { this->TestAll(&unpack<uint32_t>); }
-TEST_P(TestUnpack, Unpack64) { this->TestAll(&unpack<uint64_t>); }
+TYPED_TEST(TestUnpack, Unpack) { this->TestAll(&unpack<TypeParam>); }
} // namespace arrow::internal
diff --git a/cpp/src/arrow/util/cpu_info.cc b/cpp/src/arrow/util/cpu_info.cc
index e24a3bbfe2..2cc1ac802b 100644
--- a/cpp/src/arrow/util/cpu_info.cc
+++ b/cpp/src/arrow/util/cpu_info.cc
@@ -297,6 +297,12 @@ void OsRetrieveCpuInfo(int64_t* hardware_flags,
CpuInfo::Vendor* vendor,
#else
//------------------------------ LINUX ------------------------------//
+# if defined(CPUINFO_ARCH_ARM)
+# include <asm/hwcap.h>
+# include <sys/auxv.h>
+# include <sys/prctl.h>
+# endif
+
// Get cache size, return 0 on error
int64_t LinuxGetCacheSize(int level) {
// get cache size by sysconf()
@@ -413,8 +419,30 @@ void OsRetrieveCpuInfo(int64_t* hardware_flags,
CpuInfo::Vendor* vendor,
}
}
}
+
+# if defined(CPUINFO_ARCH_ARM)
+ // Detect SVE and vector length via getauxval/prctl (more reliable than
/proc/cpuinfo)
+# ifdef HWCAP_SVE
+ const auto hwcap = getauxval(AT_HWCAP);
+ if (hwcap & HWCAP_SVE) {
+ *hardware_flags |= CpuInfo::SVE;
+# ifdef PR_SVE_GET_VL
+ const int vl = prctl(PR_SVE_GET_VL);
+ // prctl returns -1 on failure (e.g. blocked by a seccomp filter): report no
+ // known vector length instead of asserting; else mask off the status flags.
+ const int vl_bytes = vl >= 0 ? (vl & PR_SVE_VL_LEN_MASK) : 0;
+ // Running SVE128 on a SVE256 machine is more tricky than the x86
equivalent of
+ // running SSE code on an AVX machine and requires to explicitly change the
+ // vector length using `prctl` (per thread setting).
+ if (vl_bytes == 16) *hardware_flags |= CpuInfo::SVE128; // 128 bits
+ if (vl_bytes == 32) *hardware_flags |= CpuInfo::SVE256; // 256 bits
+ if (vl_bytes == 64) *hardware_flags |= CpuInfo::SVE512; // 512 bits
+# endif // PR_SVE_GET_VL
+ }
+# endif // HWCAP_SVE
+# endif // CPUINFO_ARCH_ARM
}
-#endif // WINDOWS, MACOS, LINUX
+#endif // WINDOWS, MACOS, LINUX
//============================== Arch Dependent
==============================//
@@ -473,11 +501,35 @@ void ArchVerifyCpuRequirements(const CpuInfo* ci) {
#elif defined(CPUINFO_ARCH_ARM)
//------------------------------ AARCH64 ------------------------------//
bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t*
hardware_flags) {
- if (simd_level == "NONE") {
- *hardware_flags &= ~CpuInfo::ASIMD;
- return true;
+ enum {
+ USER_SIMD_NONE,
+ USER_SIMD_SVE,
+ USER_SIMD_SVE128,
+ USER_SIMD_SVE256,
+ USER_SIMD_SVE512,
+ USER_SIMD_MAX,
+ };
+
+ int level = USER_SIMD_MAX;
+ if (simd_level == "SVE") {
+ level = USER_SIMD_SVE;
+ } else if (simd_level == "SVE128") {
+ level = USER_SIMD_SVE128;
+ } else if (simd_level == "SVE256") {
+ level = USER_SIMD_SVE256;
+ } else if (simd_level == "SVE512") {
+ level = USER_SIMD_SVE512;
+ } else if (simd_level == "NONE") {
+ level = USER_SIMD_NONE;
+ } else {
+ return false;
}
- return false;
+ // Drop every feature above the requested level; "NONE" also disables ASIMD.
+ if (level < USER_SIMD_SVE512) *hardware_flags &= ~CpuInfo::SVE512;
+ if (level < USER_SIMD_SVE256) *hardware_flags &= ~CpuInfo::SVE256;
+ if (level < USER_SIMD_SVE128) *hardware_flags &= ~CpuInfo::SVE128;
+ if (level < USER_SIMD_SVE) *hardware_flags &= ~(CpuInfo::SVE | CpuInfo::ASIMD);
+ return true;
}
void ArchVerifyCpuRequirements(const CpuInfo* ci) {
diff --git a/cpp/src/arrow/util/cpu_info.h b/cpp/src/arrow/util/cpu_info.h
index 949719b97e..de0ef13cc5 100644
--- a/cpp/src/arrow/util/cpu_info.h
+++ b/cpp/src/arrow/util/cpu_info.h
@@ -56,6 +56,10 @@ class ARROW_EXPORT CpuInfo {
/// Arm features
static constexpr int64_t ASIMD = (1LL << 32);
+ static constexpr int64_t SVE = (1LL << 33);
+ static constexpr int64_t SVE128 = (1LL << 36);
+ static constexpr int64_t SVE256 = (1LL << 34);
+ static constexpr int64_t SVE512 = (1LL << 35);
/// Cache enums for L1 (data), L2 and L3
enum class CacheLevel { L1 = 0, L2, L3, Last = L3 };
diff --git a/cpp/src/arrow/util/dispatch_internal.h
b/cpp/src/arrow/util/dispatch_internal.h
index 7ac19b0b24..5fa071cb19 100644
--- a/cpp/src/arrow/util/dispatch_internal.h
+++ b/cpp/src/arrow/util/dispatch_internal.h
@@ -23,8 +23,7 @@
#include "arrow/status.h"
#include "arrow/util/cpu_info.h"
-namespace arrow {
-namespace internal {
+namespace arrow::internal {
enum class DispatchLevel : int {
// These dispatch levels, corresponding to instruction set features,
@@ -34,6 +33,9 @@ enum class DispatchLevel : int {
AVX2,
AVX512,
NEON,
+ SVE128,
+ SVE256,
+ SVE512,
MAX
};
@@ -106,11 +108,18 @@ class DynamicDispatch {
return cpu_info->IsSupported(CpuInfo::AVX2);
case DispatchLevel::AVX512:
return cpu_info->IsSupported(CpuInfo::AVX512);
+ case DispatchLevel::NEON:
+ return cpu_info->IsSupported(CpuInfo::ASIMD);
+ case DispatchLevel::SVE128:
+ return cpu_info->IsSupported(CpuInfo::SVE128);
+ case DispatchLevel::SVE256:
+ return cpu_info->IsSupported(CpuInfo::SVE256);
+ case DispatchLevel::SVE512:
+ return cpu_info->IsSupported(CpuInfo::SVE512);
default:
return false;
}
}
};
-} // namespace internal
-} // namespace arrow
+} // namespace arrow::internal
diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h
index 832b686b31..2da3933f50 100644
--- a/cpp/src/arrow/util/macros.h
+++ b/cpp/src/arrow/util/macros.h
@@ -75,7 +75,7 @@
#if defined(__GNUC__) // GCC and compatible compilers (clang, Intel ICC)
# define ARROW_NORETURN __attribute__((noreturn))
# define ARROW_NOINLINE __attribute__((noinline))
-# define ARROW_FORCE_INLINE __attribute__((always_inline))
+# define ARROW_FORCE_INLINE __attribute__((always_inline)) inline
# define ARROW_PREDICT_FALSE(x) (__builtin_expect(!!(x), 0))
# define ARROW_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
# define ARROW_RESTRICT __restrict
diff --git a/cpp/src/arrow/util/ubsan.h b/cpp/src/arrow/util/ubsan.h
index 5c6a8f419b..6c8a9812af 100644
--- a/cpp/src/arrow/util/ubsan.h
+++ b/cpp/src/arrow/util/ubsan.h
@@ -54,7 +54,7 @@ inline T* MakeNonNull(T* maybe_null = NULLPTR) {
}
template <typename T>
-inline std::enable_if_t<std::is_trivially_copyable_v<T>, T> SafeLoadAs(
+ARROW_FORCE_INLINE std::enable_if_t<std::is_trivially_copyable_v<T>, T>
SafeLoadAs(
const uint8_t* unaligned) {
using Type = std::remove_const_t<T>;
arrow::internal::AlignedStorage<Type> raw_data;
@@ -65,7 +65,8 @@ inline std::enable_if_t<std::is_trivially_copyable_v<T>, T> SafeLoadAs(
}
template <typename T>
-inline std::enable_if_t<std::is_trivially_copyable_v<T>, T> SafeLoad(const T*
unaligned) {
+ARROW_FORCE_INLINE std::enable_if_t<std::is_trivially_copyable_v<T>, T>
SafeLoad(
+ const T* unaligned) {
using Type = std::remove_const_t<T>;
arrow::internal::AlignedStorage<Type> raw_data;
std::memcpy(raw_data.get(), static_cast<const void*>(unaligned), sizeof(T));
@@ -75,10 +76,11 @@ inline std::enable_if_t<std::is_trivially_copyable_v<T>, T> SafeLoad(const T* un
}
template <typename U, typename T>
-inline std::enable_if_t<std::is_trivially_copyable_v<T> &&
- std::is_trivially_copyable_v<U> && sizeof(T) ==
sizeof(U),
- U>
-SafeCopy(T value) {
+ARROW_FORCE_INLINE
+ std::enable_if_t<std::is_trivially_copyable_v<T> &&
std::is_trivially_copyable_v<U> &&
+ sizeof(T) == sizeof(U),
+ U>
+ SafeCopy(T value) {
using TypeU = std::remove_const_t<U>;
arrow::internal::AlignedStorage<TypeU> raw_data;
std::memcpy(raw_data.get(), static_cast<const void*>(&value), sizeof(T));
@@ -88,8 +90,8 @@ SafeCopy(T value) {
}
template <typename T>
-inline std::enable_if_t<std::is_trivially_copyable_v<T>, void> SafeStore(void*
unaligned,
- T
value) {
+ARROW_FORCE_INLINE std::enable_if_t<std::is_trivially_copyable_v<T>, void>
SafeStore(
+ void* unaligned, T value) {
std::memcpy(unaligned, &value, sizeof(T));
}