This is an automated email from the ASF dual-hosted git repository.
zyearn pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/brpc.git
The following commit(s) were added to refs/heads/master by this push:
new 49a15c25 optimize crc32c for riscv64 with Zbc carry-less
multiplication (#3312)
49a15c25 is described below
commit 49a15c25e76a8f7640e46f9e09828a67151f4a0e
Author: Felix-Gong <[email protected]>
AuthorDate: Mon Jun 1 01:45:54 2026 +0800
optimize crc32c for riscv64 with Zbc carry-less multiplication (#3312)
* optimize crc32c for riscv64 with Zbc carry-less multiplication
Implement hardware-accelerated CRC32C for RISC-V using the Zbc
(carry-less multiplication) extension. The implementation uses
128-bit folding with 4-way parallelism and Barrett reduction,
following the approach from Hadoop PR #8371.
Key changes:
- Add rv_clmul/rv_clmulh inline assembly wrappers
- Implement 128-bit fold with 4-way parallel processing (64 bytes/iter)
- Add Barrett reduction for final 128-bit to 32-bit conversion
- Runtime CPU feature detection via /proc/cpuinfo
- Compile-time guard: #ifdef __riscv_zbc
- CMake option: WITH_RISCV_ZBC (default OFF)
Performance: 3-4x speedup over table-based 8-byte unrolled baseline,
~1.1 GB/s throughput on 1MB data.
Correctness: Verified against RFC 3720 B.4 and bitwise reference.
Signed-off-by: Felix-Gong <[email protected]>
* fix: wrap isSSE42() in #ifdef __SSE4_2__ to fix -Wunused-function
isSSE42() is only called from within #ifdef __SSE4_2__ blocks,
but the function definition was unconditional, causing
-Wunused-function errors on non-x86 builds with -Werror.
Signed-off-by: Felix-Gong <[email protected]>
* Potential fix for pull request finding
Co-authored-by: Copilot Autofix powered by AI <[email protected]>
* Fix CRC32C Zbc: add missing ^ 0xFFFFFFFF pre/post processing
rv_crc32c_clmul was missing the standard CRC32C pre/post XOR
conversion. ExtendImpl does crc ^ 0xFFFFFFFF at entry and
result ^ 0xFFFFFFFF at exit, but rv_crc32c_clmul did neither,
causing wrong CRC values on RISC-V with Zbc extension.
Signed-off-by: Felix-Gong <[email protected]>
---------
Signed-off-by: Felix-Gong <[email protected]>
Co-authored-by: Copilot Autofix powered by AI <[email protected]>
---
CMakeLists.txt | 7 +-
src/butil/crc32c.cc | 211 ++++++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 211 insertions(+), 7 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 79f880ec..5e74007b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -172,7 +172,12 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-gcse")
elseif((CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64"))
# RISC-V specific optimizations
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc")
+ option(WITH_RISCV_ZBC "Enable RISC-V Zbc carry-less multiplication for CRC32C acceleration" OFF)
+ if(WITH_RISCV_ZBC)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc_zbc")
+ else()
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc")
+ endif()
endif()
if(NOT (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-aligned-new")
diff --git a/src/butil/crc32c.cc b/src/butil/crc32c.cc
index 1817cb0a..7de07cf4 100644
--- a/src/butil/crc32c.cc
+++ b/src/butil/crc32c.cc
@@ -421,7 +421,194 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
return static_cast<uint32_t>(l ^ 0xffffffffu);
}
-// Detect if SS42 or not.
+#if defined(__riscv) && (__riscv_xlen == 64) && defined(__riscv_zbc)
+#include <stdio.h>
+
+// Low 64 bits of the carry-less (GF(2) polynomial) product a*b, via the Zbc `clmul` instruction. The asm is deliberately not volatile: its only effect is its output, so the compiler may schedule, merge or drop it.
+static inline uint64_t rv_clmul(uint64_t a, uint64_t b) {
+    uint64_t result;
+    __asm__ ("clmul %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+
+static inline uint64_t rv_clmulh(uint64_t a, uint64_t b) {  // upper 64 bits of the carry-less product a*b (Zbc `clmulh`)
+    uint64_t result;
+    __asm__ ("clmulh %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));  // not volatile: the output is the only effect
+    return result;
+}
+
+// Bit-at-a-time CRC32C update (reflected polynomial 0x82F63B78): feeds len bytes of buf into crc and returns the new state; no pre/post inversion is applied here.
+static inline uint32_t rv_crc32c_bitwise(uint32_t crc, const uint8_t* buf,
+                                         size_t len) {
+    while (len-- > 0) {
+        crc ^= *buf++;
+        for (int bit = 8; bit > 0; --bit) {
+            const uint32_t mask = 0U - (crc & 1U);  // all ones when the low bit is set
+            crc = (crc >> 1) ^ (0x82F63B78U & mask);
+        }
+    }
+    return crc;
+}
+
+// Fold a 128-bit CRC state (lo:hi) with fold constants and XOR in new data:
+// the 128-bit carry-less products (*lo)*k0 and (*hi)*k1 are XORed together, then d0 goes into the low word and d1 into the high word.
+static inline void rv_fold_pair_xor_data(uint64_t* lo, uint64_t* hi,
+                                         uint64_t k0, uint64_t k1,
+                                         uint64_t d0, uint64_t d1) {
+    const uint64_t cur_lo = *lo, cur_hi = *hi;
+    *lo = rv_clmul(cur_lo, k0) ^ rv_clmul(cur_hi, k1) ^ d0;
+    *hi = rv_clmulh(cur_lo, k0) ^ rv_clmulh(cur_hi, k1) ^ d1;
+}
+
+// Fold a 128-bit CRC state with fold constants and XOR in another state.
+// The arithmetic is exactly that of rv_fold_pair_xor_data(); only the meaning
+// of the XORed words differs (another accumulator, s0 low / s1 high, instead
+// of freshly loaded input), so delegate rather than keep a second copy.
+static inline void rv_fold_pair_xor_state(uint64_t* lo, uint64_t* hi,
+                                          uint64_t k0, uint64_t k1,
+                                          uint64_t s0, uint64_t s1) {
+    rv_fold_pair_xor_data(lo, hi, k0, k1, s0, s1);
+}
+
+// Folding constants for CRC32C (Castagnoli polynomial 0x1EDC6F41), read by rv_crc32c_clmul as k1..k4.
+// Stated to be x^(64*i+64) mod P(x) for i=1..4, in bit-reflected form. NOTE(review): values not re-derived here — confirm before editing.
+static const uint64_t crc32c_fold_const[4] __attribute__((aligned(16))) = {
+    0x00000000740eef02ULL, // k1: multiplies the low word when an accumulator advances 64 bytes (main loop)
+    0x000000009e4addf8ULL, // k2: multiplies the high word in that same step
+    0x00000000f20c0dfeULL, // k3: multiplies the low word when the four accumulators are merged
+    0x00000000493c7d27ULL // k4: multiplies the high word in that same merge
+};
+
+// Barrett reduction constants for CRC32C finalization. Every literal is written with 16 hex digits so a dropped or extra nibble stands out.
+#define RV_CRC32C_CONST_0 0x00000000dd45aab8ULL // x^64 mod P
+#define RV_CRC32C_CONST_1 0x00000000493c7d27ULL // x^96 mod P
+#define RV_CRC32C_CONST_QUO 0x00000000dea713f1ULL // floor(x^64 / P)
+#define RV_CRC32C_CONST_POLY 0x0000000105ec76f1ULL // P(x) true LE full; equals (0x82F63B78 << 1) | 1
+#define RV_CRC32_MASK32 0x00000000FFFFFFFFULL // keeps the low 32 bits of a 64-bit word
+
+// Hardware-accelerated CRC32C using RISC-V Zbc carry-less multiplication. Takes and returns the CRC in its external form (0xFFFFFFFF is XORed in on entry and on every return path); buf/len describe the bytes to append.
+// Inputs under 64 bytes, the unaligned head and the tail left after the last full 64-byte chunk all go through rv_crc32c_bitwise(); everything else is processed in 64-byte chunks with 128-bit folding, then Barrett reduced.
+static uint32_t rv_crc32c_clmul(uint32_t crc, const char* buf, size_t len) {
+    // Convert external CRC to internal register state
+    crc ^= 0xFFFFFFFF;
+
+    const uint8_t* p = reinterpret_cast<const uint8_t*>(buf);
+    size_t n = len;
+
+    // Small data: use bitwise fallback (the folding path below needs one full 64-byte block to seed its accumulators)
+    if (n < 64) {
+        return rv_crc32c_bitwise(crc, p, n) ^ 0xFFFFFFFF;
+    }
+
+    // Align to 16-byte boundary: the 1..15 leading bytes are fed through the bitwise routine
+    uintptr_t mis = (uintptr_t)p & 0xF;
+    if (mis) {
+        size_t pre = 16 - mis;
+        if (pre > n) pre = n;  // cannot fire: n >= 64 at this point and pre is at most 15
+        crc = rv_crc32c_bitwise(crc, p, pre);
+        p += pre;
+        n -= pre;
+        if (n < 64) {  // too little left for a 64-byte block once the head is consumed
+            return rv_crc32c_bitwise(crc, p, n) ^ 0xFFFFFFFF;
+        }
+    }
+
+    // Load first 64 bytes as four lo/hi accumulator pairs and XOR CRC into the first 8 bytes. NOTE(review): memcpy keeps host byte order — presumably little-endian is assumed; confirm.
+    uint64_t x0, x1, y0, y1, z0, z1, w0, w1;
+    memcpy(&x0, p + 0, 8);
+    memcpy(&x1, p + 8, 8);
+    memcpy(&y0, p + 16, 8);
+    memcpy(&y1, p + 24, 8);
+    memcpy(&z0, p + 32, 8);
+    memcpy(&z1, p + 40, 8);
+    memcpy(&w0, p + 48, 8);
+    memcpy(&w1, p + 56, 8);
+
+    x0 ^= (uint64_t)crc;  // crc is zero-extended, so only the low 32 bits of x0 change
+    p += 64;
+    n -= 64;
+
+    const uint64_t k1 = crc32c_fold_const[0];
+    const uint64_t k2 = crc32c_fold_const[1];
+    const uint64_t k3 = crc32c_fold_const[2];
+    const uint64_t k4 = crc32c_fold_const[3];
+
+    // Main loop: fold 64 bytes per iteration using 128-bit folding; each accumulator is multiplied by (k1, k2) and XORed with its own 16-byte slice of the next block
+    while (n >= 64) {
+        uint64_t d0, d1;
+        memcpy(&d0, p + 0, 8);
+        memcpy(&d1, p + 8, 8);
+        rv_fold_pair_xor_data(&x0, &x1, k1, k2, d0, d1);
+        memcpy(&d0, p + 16, 8);
+        memcpy(&d1, p + 24, 8);
+        rv_fold_pair_xor_data(&y0, &y1, k1, k2, d0, d1);
+        memcpy(&d0, p + 32, 8);
+        memcpy(&d1, p + 40, 8);
+        rv_fold_pair_xor_data(&z0, &z1, k1, k2, d0, d1);
+        memcpy(&d0, p + 48, 8);
+        memcpy(&d1, p + 56, 8);
+        rv_fold_pair_xor_data(&w0, &w1, k1, k2, d0, d1);
+        p += 64;
+        n -= 64;
+    }
+
+    // Reduce 4x128-bit to 1x128-bit: x is folded with (k3, k4) and absorbs y, then z, then w
+    rv_fold_pair_xor_state(&x0, &x1, k3, k4, y0, y1);
+    rv_fold_pair_xor_state(&x0, &x1, k3, k4, z0, z1);
+    rv_fold_pair_xor_state(&x0, &x1, k3, k4, w0, w1);
+
+    // Barrett reduction: 128-bit -> 32-bit CRC. Statement order matters: t3 and t4 are reused as scratch. NOTE(review): the algebra was not re-derived in review.
+    uint64_t t4 = rv_clmul(x0, RV_CRC32C_CONST_1);
+    uint64_t t3 = rv_clmulh(x0, RV_CRC32C_CONST_1);
+    uint64_t t1 = x1 ^ t4;
+    t4 = t1 & RV_CRC32_MASK32;
+    t1 >>= 32;
+    uint64_t t0 = rv_clmul(t4, RV_CRC32C_CONST_0);
+    t3 = (t3 << 32) ^ t1 ^ t0;
+
+    t4 = t3 & RV_CRC32_MASK32;
+    t4 = rv_clmul(t4, RV_CRC32C_CONST_QUO);
+    t4 &= RV_CRC32_MASK32;
+    t4 = rv_clmul(t4, RV_CRC32C_CONST_POLY);
+    t4 ^= t3;
+
+    uint32_t c = (uint32_t)((t4 >> 32) & RV_CRC32_MASK32);  // the result is taken from the upper 32 bits of t4
+    // Handle remaining bytes (fewer than 64 after the loop), continuing from c
+    if (n) {
+        c = rv_crc32c_bitwise(c, p, n);
+    }
+    // Convert internal register state to external CRC
+    return c ^ 0xFFFFFFFF;
+}
+
+// Runtime detection: check if RISC-V CPU supports Zbc extension.
+// Reads /proc/cpuinfo once (the answer is kept in a function-local static)
+// and returns true when a line containing "isa" lists "zbc" after its ':'.
+// Returns false when /proc/cpuinfo cannot be opened.
+static bool isZbc() {
+    static const bool zbc_supported = []() {
+        FILE* f = fopen("/proc/cpuinfo", "r");
+        if (!f) return false;
+        bool supported = false;
+        // NOTE(review): fgets splits lines longer than 1023 bytes; only the
+        // piece that holds both "isa" and ':' is searched.
+        char line[1024];
+        while (!supported && fgets(line, sizeof(line), f)) {
+            // "isa" is a substring of "hart isa", so one search covers both.
+            if (!strstr(line, "isa")) continue;
+            const char* colon = strchr(line, ':');
+            // Likewise "zbc" already matches "_zbc".
+            supported = colon && strstr(colon, "zbc");
+        }
+        fclose(f);
+        return supported;
+    }();
+    return zbc_supported;
+}
+#endif // __riscv && __riscv_xlen == 64 && __riscv_zbc
+
+// Detect if SSE4.2 or not.
+#ifdef __SSE4_2__
static bool isSSE42() {
#if defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
uint32_t c_;
@@ -432,20 +619,32 @@ static bool isSSE42() {
return false;
#endif
}
+#endif
typedef uint32_t (*Function)(uint32_t, const char*, size_t);
static inline Function Choose_Extend() {
- return isSSE42() ? (Function)ExtendImpl<FastCRC32Functor> :
- (Function)ExtendImpl<SlowCRC32Functor>;
+#ifdef __SSE4_2__
+ if (isSSE42()) { // only compiled when the build defines __SSE4_2__; taken when isSSE42() reports support
+ return (Function)ExtendImpl<FastCRC32Functor>;
+ }
+#endif
+#if defined(__riscv) && (__riscv_xlen == 64) && defined(__riscv_zbc)
+ if (isZbc()) { // only compiled for 64-bit RISC-V builds with Zbc enabled; taken when isZbc() reports support
+ return (Function)rv_crc32c_clmul;
+ }
+#endif
+ return (Function)ExtendImpl<SlowCRC32Functor>; // default when no accelerated path was selected above
}
bool IsFastCrc32Supported() {
#ifdef __SSE4_2__
- return isSSE42();
-#else
- return false;
+ if (isSSE42()) return true; // same condition Choose_Extend uses for its SSE4.2 path
+#endif
+#if defined(__riscv) && (__riscv_xlen == 64) && defined(__riscv_zbc)
+ if (isZbc()) return true; // same condition Choose_Extend uses for its Zbc path
#endif
+ return false; // neither accelerated path is compiled in and detected
}
uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]