[Bug c/113247] New: RISC-V: Performance bug in SHA256

juzhe.zhong at rivai dot ai via Gcc-bugs Fri, 05 Jan 2024 16:03:06 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113247


            Bug ID: 113247
           Summary: RISC-V: Performance bug in SHA256
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: juzhe.zhong at rivai dot ai
  Target Milestone: ---

We found we have a performance bug while testing with various benchmarks.

This is the case from coremark-pro SHA256:

Tested on spike vector vs scalar in case of dynamic instruction count:

897210195 (vector) vs 418451694 (scalar)

Obviously vector dynamic instruction count as twice as scalar.

We tested on our hardware board and Thead C908 (K230),

Both vector performance drop about 60%+ in the real hardware in case of vector
vs scalar.

After investigation, the performance bug issue happens in the following case:

https://compiler-explorer.com/z/GcsnK7edn

#include <stdint.h>

#define Ch(x,y,z)   (z ^ (x & (y ^ z)))
#define Maj(x,y,z)  ((x & y) | (z & (x | y)))

#define SHR(x, n)    (x >> n)
#define ROTR(x,n)    (SHR(x,n) | (x << (32 - n)))
#define S1(x)        (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25))
#define S0(x)        (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22))

#define s1(x)        (ROTR(x,17) ^ ROTR(x,19) ^  SHR(x,10))
#define s0(x)        (ROTR(x, 7) ^ ROTR(x,18) ^  SHR(x, 3))

#define SHA256_STEP(a,b,c,d,e,f,g,h,x,K)                 \
{                                                        \
    tmp1 = h + S1(e) + Ch(e,f,g) + K + x;                \
    tmp2 = S0(a) + Maj(a,b,c);                           \
    h  = tmp1 + tmp2;                                    \
    d += tmp1;                                           \
}

#define BE_LOAD32(n,b,i) (n) = byteswap(*(uint32_t *)(b + i))

static uint32_t byteswap(uint32_t x)
{
    x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16;
    x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8;  

    return x;
}

void sha256 (const uint8_t *in, uint32_t out[8])
{
    uint32_t tmp1, tmp2, a, b, c, d, e, f, g, h;
    uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14,
w15;

    tmp1 = tmp2 = 0;
    w0 = w1 = w2 = w3 = w4 = w5 = w6 = w7 = w8 = w9 = w10 = w11 = w12 = w13 =
w14 = w15 = 0;

    BE_LOAD32 (  w0, in,  0 );
    BE_LOAD32 (  w1, in,  4 );
    BE_LOAD32 (  w2, in,  8 );
    BE_LOAD32 (  w3, in, 12 );
    BE_LOAD32 (  w4, in, 16 );
    BE_LOAD32 (  w5, in, 20 );
    BE_LOAD32 (  w6, in, 24 );
    BE_LOAD32 (  w7, in, 28 );
    BE_LOAD32 (  w8, in, 32 );
    BE_LOAD32 (  w9, in, 36 );
    BE_LOAD32 ( w10, in, 40 );
    BE_LOAD32 ( w11, in, 44 );
    BE_LOAD32 ( w12, in, 48 );
    BE_LOAD32 ( w13, in, 52 );
    BE_LOAD32 ( w14, in, 56 );
    BE_LOAD32 ( w15, in, 60 );

    a = out[0];
    b = out[1];
    c = out[2];
    d = out[3];
    e = out[4];
    f = out[5];
    g = out[6];
    h = out[7];

    SHA256_STEP(a, b, c, d, e, f, g, h,  w0, 0x428a2f98);
    SHA256_STEP(h, a, b, c, d, e, f, g,  w1, 0x71374491);
    SHA256_STEP(g, h, a, b, c, d, e, f,  w2, 0xb5c0fbcf);
    SHA256_STEP(f, g, h, a, b, c, d, e,  w3, 0xe9b5dba5);
    SHA256_STEP(e, f, g, h, a, b, c, d,  w4, 0x3956c25b);
    SHA256_STEP(d, e, f, g, h, a, b, c,  w5, 0x59f111f1);
    SHA256_STEP(c, d, e, f, g, h, a, b,  w6, 0x923f82a4);
    SHA256_STEP(b, c, d, e, f, g, h, a,  w7, 0xab1c5ed5);
    SHA256_STEP(a, b, c, d, e, f, g, h,  w8, 0xd807aa98);
    SHA256_STEP(h, a, b, c, d, e, f, g,  w9, 0x12835b01);
    SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0x243185be);
    SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0x550c7dc3);
    SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0x72be5d74);
    SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0x80deb1fe);
    SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0x9bdc06a7);
    SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0xc19bf174);

    w0 = s1(w14) + w9 + s0(w1) + w0;
    SHA256_STEP(a, b, c, d, e, f, g, h,  w0, 0xe49b69c1);
    w1 = s1(w15) + w10 + s0(w2) + w1;
    SHA256_STEP(h, a, b, c, d, e, f, g,  w1, 0xefbe4786);
    w2 = s1(w0) + w11 + s0(w3) + w2;
    SHA256_STEP(g, h, a, b, c, d, e, f,  w2, 0x0fc19dc6);
    w3 = s1(w1) + w12 + s0(w4) + w3;
    SHA256_STEP(f, g, h, a, b, c, d, e,  w3, 0x240ca1cc);
    w4 = s1(w2) + w13 + s0(w5) + w4;
    SHA256_STEP(e, f, g, h, a, b, c, d,  w4, 0x2de92c6f);
    w5 = s1(w3) + w14 + s0(w6) + w5;
    SHA256_STEP(d, e, f, g, h, a, b, c,  w5, 0x4a7484aa);
    w6 = s1(w4) + w15 + s0(w7) + w6;
    SHA256_STEP(c, d, e, f, g, h, a, b,  w6, 0x5cb0a9dc);
    w7 = s1(w5) + w0 + s0(w8) + w7;
    SHA256_STEP(b, c, d, e, f, g, h, a,  w7, 0x76f988da);
    w8 = s1(w6) + w1 + s0(w9) + w8;
    SHA256_STEP(a, b, c, d, e, f, g, h,  w8, 0x983e5152);
    w9 = s1(w7) + w2 + s0(w10) + w9;
    SHA256_STEP(h, a, b, c, d, e, f, g,  w9, 0xa831c66d);
    w10 = s1(w8) + w3 + s0(w11) + w10;
    SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0xb00327c8);
    w11 = s1(w9) + w4 + s0(w12) + w11;
    SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0xbf597fc7);
    w12 = s1(w10) + w5 + s0(w13) + w12;
    SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0xc6e00bf3);
    w13 = s1(w11) + w6 + s0(w14) + w13;
    SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xd5a79147);
    w14 = s1(w12) + w7 + s0(w15) + w14;
    SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0x06ca6351);
    w15 = s1(w13) + w8 + s0(w0) + w15;
    SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0x14292967);

    w0 = s1(w14) + w9 + s0(w1) + w0;
    SHA256_STEP(a, b, c, d, e, f, g, h,  w0, 0x27b70a85);
    w1 = s1(w15) + w10 + s0(w2) + w1;
    SHA256_STEP(h, a, b, c, d, e, f, g,  w1, 0x2e1b2138);
    w2 = s1(w0) + w11 + s0(w3) + w2;
    SHA256_STEP(g, h, a, b, c, d, e, f,  w2, 0x4d2c6dfc);
    w3 = s1(w1) + w12 + s0(w4) + w3;
    SHA256_STEP(f, g, h, a, b, c, d, e,  w3, 0x53380d13);
    w4 = s1(w2) + w13 + s0(w5) + w4;
    SHA256_STEP(e, f, g, h, a, b, c, d,  w4, 0x650a7354);
    w5 = s1(w3) + w14 + s0(w6) + w5;
    SHA256_STEP(d, e, f, g, h, a, b, c,  w5, 0x766a0abb);
    w6 = s1(w4) + w15 + s0(w7) + w6;
    SHA256_STEP(c, d, e, f, g, h, a, b,  w6, 0x81c2c92e);
    w7 = s1(w5) + w0 + s0(w8) + w7;
    SHA256_STEP(b, c, d, e, f, g, h, a,  w7, 0x92722c85);
    w8 = s1(w6) + w1 + s0(w9) + w8;
    SHA256_STEP(a, b, c, d, e, f, g, h,  w8, 0xa2bfe8a1);
    w9 = s1(w7) + w2 + s0(w10) + w9;
    SHA256_STEP(h, a, b, c, d, e, f, g,  w9, 0xa81a664b);
    w10 = s1(w8) + w3 + s0(w11) + w10;
    SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0xc24b8b70);
    w11 = s1(w9) + w4 + s0(w12) + w11;
    SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0xc76c51a3);
    w12 = s1(w10) + w5 + s0(w13) + w12;
    SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0xd192e819);
    w13 = s1(w11) + w6 + s0(w14) + w13;
    SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xd6990624);
    w14 = s1(w12) + w7 + s0(w15) + w14;
    SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0xf40e3585);
    w15 = s1(w13) + w8 + s0(w0) + w15;
    SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0x106aa070);

    w0 = s1(w14) + w9 + s0(w1) + w0;
    SHA256_STEP(a, b, c, d, e, f, g, h,  w0, 0x19a4c116);
    w1 = s1(w15) + w10 + s0(w2) + w1;
    SHA256_STEP(h, a, b, c, d, e, f, g,  w1, 0x1e376c08);
    w2 = s1(w0) + w11 + s0(w3) + w2;
    SHA256_STEP(g, h, a, b, c, d, e, f,  w2, 0x2748774c);
    w3 = s1(w1) + w12 + s0(w4) + w3;
    SHA256_STEP(f, g, h, a, b, c, d, e,  w3, 0x34b0bcb5);
    w4 = s1(w2) + w13 + s0(w5) + w4;
    SHA256_STEP(e, f, g, h, a, b, c, d,  w4, 0x391c0cb3);
    w5 = s1(w3) + w14 + s0(w6) + w5;
    SHA256_STEP(d, e, f, g, h, a, b, c,  w5, 0x4ed8aa4a);
    w6 = s1(w4) + w15 + s0(w7) + w6;
    SHA256_STEP(c, d, e, f, g, h, a, b,  w6, 0x5b9cca4f);
    w7 = s1(w5) + w0 + s0(w8) + w7;
    SHA256_STEP(b, c, d, e, f, g, h, a,  w7, 0x682e6ff3);
    w8 = s1(w6) + w1 + s0(w9) + w8;
    SHA256_STEP(a, b, c, d, e, f, g, h,  w8, 0x748f82ee);
    w9 = s1(w7) + w2 + s0(w10) + w9;
    SHA256_STEP(h, a, b, c, d, e, f, g,  w9, 0x78a5636f);
    w10 = s1(w8) + w3 + s0(w11) + w10;
    SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0x84c87814);
    w11 = s1(w9) + w4 + s0(w12) + w11;
    SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0x8cc70208);
    w12 = s1(w10) + w5 + s0(w13) + w12;
    SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0x90befffa);
    w13 = s1(w11) + w6 + s0(w14) + w13;
    SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xa4506ceb);
    w14 = s1(w12) + w7 + s0(w15) + w14;
    SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0xbef9a3f7);
    w15 = s1(w13) + w8 + s0(w0) + w15;
    SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0xc67178f2);

    out[0] += a;
    out[1] += b;
    out[2] += c;
    out[3] += d;
    out[4] += e;
    out[5] += f;
    out[6] += g;
    out[7] += h;
}

The assembly is quite staigth-forward. RVV GCC vectorizes codes which is
profitable to be vectorized.


Confirm both ARM SVE GCC and RISC-V Clang don't vectorize this code.

So I believe we should teach cost model not vectorize it.

[Bug c/113247] New: RISC-V: Performance bug in SHA256

Reply via email to