Attached is a simple reproducer. It passes with clang v16 -O0, but fails with 
17 and 18 only when built with -O0. 

Build command: clang main.c -O0

Hope this helps. 
Raghuveer

> -----Original Message-----
> From: John Naylor <johncnaylo...@gmail.com>
> Sent: Sunday, June 15, 2025 7:39 PM
> To: Andy Fan <zhihuifan1...@163.com>
> Cc: Jesper Pedersen <jesperpedersen...@gmail.com>; Nathan Bossart
> <nathandboss...@gmail.com>; Tomas Vondra <to...@vondra.me>; Devulapalli,
> Raghuveer <raghuveer.devulapa...@intel.com>; pgsql-
> hack...@lists.postgresql.org; Shankaran, Akash <akash.shanka...@intel.com>
> Subject: Re: Improve CRC32C performance on SSE4.2
> 
> On Sun, Jun 15, 2025 at 8:32 AM Andy Fan <zhihuifan1...@163.com> wrote:
> >
> > Jesper Pedersen <jesperpedersen...@gmail.com> writes:
> >
> > Hi,
> >
> > Thank you Nathan, Tomas and Jesper for the answers. The patch at [0]
> > works for me and I could work with master smoothly now.
> 
> Pushed, thanks for testing! I'll do some more testing to see what 
> versions/levels
> are affected and file a bug report, but it'll be a few days before I get to 
> it.
> 
> --
> John Naylor
> Amazon Web Services
> 

#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>

__attribute__((target("sse4.2")))
static uint32_t
crc32c_scalar(const unsigned char *data, ssize_t len, uint32_t crc)
{
    const unsigned char *p = data;
    const unsigned char *pend = p + len;

    while (p + 8 <= pend)
    {
        crc = (uint32_t) _mm_crc32_u64(crc, *((const uint64_t *) p));
        p += 8;
    }

    /* Process remaining full four bytes if any */
    if (p + 4 <= pend)
    {
        crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
        p += 4;
    }

    /* go byte by byte: */
    while (p < pend)
    {
        crc = _mm_crc32_u8(crc, *p);
        p++;
    }

    return crc;
}

#define clmul_lo_(a, b) (_mm512_clmulepi64_epi128((a), (b), 0))
#define clmul_hi_(a, b) (_mm512_clmulepi64_epi128((a), (b), 17))

__attribute__((target("avx512vl,vpclmulqdq")))
static uint32_t
crc32c_avx512(const unsigned char* data, ssize_t length, uint32_t crc)
{
        /* adjust names to match generated code */
        uint32_t crc0 = crc;
        size_t          len = length;
        const unsigned char *buf = data;

        if (len >= 64)
        {
                const unsigned char *end = buf + len;
                const unsigned char *limit = buf + len - 64;
                __m128i         z0;

                /* First vector chunk. */
                __m512i         x0 = _mm512_loadu_si512((const void *) buf),
                                        y0;
                __m512i         k;

                k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 
0x9e4addf8, 0));
                x0 = 
_mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0);
                buf += 64;

                /* Main loop. */
                while (buf <= limit)
                {
                        y0 = clmul_lo_(x0, k), x0 = clmul_hi_(x0, k);
                        x0 = _mm512_ternarylogic_epi64(x0, y0, 
_mm512_loadu_si512((const void *) buf), 0x96);
                        buf += 64;
                }

                /* Reduce 512 bits to 128 bits. */
                k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0, 0x3da6d0cb, 
0, 0xba4fc28e, 0, 0xf20c0dfe, 0, 0x493c7d27, 0, 0, 0, 0, 0);
                y0 = clmul_lo_(x0, k), k = clmul_hi_(x0, k);
                y0 = _mm512_xor_si512(y0, k);
                z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0), 
_mm512_extracti32x4_epi32(y0, 1), _mm512_extracti32x4_epi32(y0, 2), 0x96);
                z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3));

                /* Reduce 128 bits to 32 bits, and multiply by x^32. */
                crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0));
                crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1));
                len = end - buf;
        }

        return crc0;
}


static uint8_t randomval()
{
    return (rand() % 255);
}

int main() {
    const int size = 64;
    /* Initialize to random values */
    unsigned char arr[size];
    srand(42);
    for (size_t ii = 0; ii < size; ++ii) {
        arr[ii] = randomval();
    }

    /* Compute crc32c using simple scalar methods and SIMD method */
    uint32_t avxcrc = crc32c_avx512(arr, size, 0xFFFFFFFF);
    uint32_t scalar_crc = crc32c_scalar(arr, size, 0xFFFFFFFF);

    /* ASSERT values are the same */
    if (scalar_crc != avxcrc) {
        printf("Failed! ");
    }
    else {
        printf("Success! ");
    }
    printf("0x%x, 0x%x\n", scalar_crc, avxcrc);
    return 0;
}

Reply via email to