Attached is a simple reproducer. It passes with clang v16 -O0, but fails with
17 and 18 only when built with -O0.
Build command: clang main.c -O0
Hope this helps.
Raghuveer
> -----Original Message-----
> From: John Naylor <[email protected]>
> Sent: Sunday, June 15, 2025 7:39 PM
> To: Andy Fan <[email protected]>
> Cc: Jesper Pedersen <[email protected]>; Nathan Bossart
> <[email protected]>; Tomas Vondra <[email protected]>; Devulapalli,
> Raghuveer <[email protected]>; pgsql-
> [email protected]; Shankaran, Akash <[email protected]>
> Subject: Re: Improve CRC32C performance on SSE4.2
>
> On Sun, Jun 15, 2025 at 8:32 AM Andy Fan <[email protected]> wrote:
> >
> > Jesper Pedersen <[email protected]> writes:
> >
> > Hi,
> >
> > Thank you Nathan, Tomas and Jesper for the answers. The patch at [0]
> > works for me and I could work with master smoothly now.
>
> Pushed, thanks for testing! I'll do some more testing to see what
> versions/levels
> are affected and file a bug report, but it'll be a few days before I get to
> it.
>
> --
> John Naylor
> Amazon Web Services
>
#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>
__attribute__((target("sse4.2")))
static uint32_t
crc32c_scalar(const unsigned char *data, ssize_t len, uint32_t crc)
{
const unsigned char *p = data;
const unsigned char *pend = p + len;
while (p + 8 <= pend)
{
crc = (uint32_t) _mm_crc32_u64(crc, *((const uint64_t *) p));
p += 8;
}
/* Process remaining full four bytes if any */
if (p + 4 <= pend)
{
crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
p += 4;
}
/* go byte by byte: */
while (p < pend)
{
crc = _mm_crc32_u8(crc, *p);
p++;
}
return crc;
}
#define clmul_lo_(a, b) (_mm512_clmulepi64_epi128((a), (b), 0))
#define clmul_hi_(a, b) (_mm512_clmulepi64_epi128((a), (b), 17))
__attribute__((target("avx512vl,vpclmulqdq")))
static uint32_t
crc32c_avx512(const unsigned char* data, ssize_t length, uint32_t crc)
{
/* adjust names to match generated code */
uint32_t crc0 = crc;
size_t len = length;
const unsigned char *buf = data;
if (len >= 64)
{
const unsigned char *end = buf + len;
const unsigned char *limit = buf + len - 64;
__m128i z0;
/* First vector chunk. */
__m512i x0 = _mm512_loadu_si512((const void *) buf),
y0;
__m512i k;
k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0,
0x9e4addf8, 0));
x0 =
_mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0);
buf += 64;
/* Main loop. */
while (buf <= limit)
{
y0 = clmul_lo_(x0, k), x0 = clmul_hi_(x0, k);
x0 = _mm512_ternarylogic_epi64(x0, y0,
_mm512_loadu_si512((const void *) buf), 0x96);
buf += 64;
}
/* Reduce 512 bits to 128 bits. */
k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0, 0x3da6d0cb,
0, 0xba4fc28e, 0, 0xf20c0dfe, 0, 0x493c7d27, 0, 0, 0, 0, 0);
y0 = clmul_lo_(x0, k), k = clmul_hi_(x0, k);
y0 = _mm512_xor_si512(y0, k);
z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0),
_mm512_extracti32x4_epi32(y0, 1), _mm512_extracti32x4_epi32(y0, 2), 0x96);
z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3));
/* Reduce 128 bits to 32 bits, and multiply by x^32. */
crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0));
crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1));
len = end - buf;
}
return crc0;
}
static uint8_t randomval()
{
return (rand() % 255);
}
int main() {
const int size = 64;
/* Initialize to random values */
unsigned char arr[size];
srand(42);
for (size_t ii = 0; ii < size; ++ii) {
arr[ii] = randomval();
}
/* Compute crc32c using simple scalar methods and SIMD method */
uint32_t avxcrc = crc32c_avx512(arr, size, 0xFFFFFFFF);
uint32_t scalar_crc = crc32c_scalar(arr, size, 0xFFFFFFFF);
/* ASSERT values are the same */
if (scalar_crc != avxcrc) {
printf("Failed! ");
}
else {
printf("Success! ");
}
printf("0x%x, 0x%x\n", scalar_crc, avxcrc);
return 0;
}