[PATCH 01/17] cipher: Put the original Dilithium implementation.

NIIBE Yutaka via Gcrypt-devel Sat, 28 Jun 2025 03:47:25 -0700

* cipher/dilithium.c: Add the Dilithium reference implementation into
one file.


--

At dilithium/ref directory of the reference implementation,
it is generated by:

    for f in ntt.c packing.c poly.c polyvec.c reduce.c rounding.c \
             sign.c symmetric-shake.c
      do echo "/*************** dilithium/ref/$f */"
      grep -v '^#include' $f
    done > /tmp/dilithium.c

And added the top comment manually.

GnuPG-bug-id: 7640
Signed-off-by: NIIBE Yutaka <[email protected]>
---
 cipher/dilithium.c | 2264 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 2264 insertions(+)
 create mode 100644 cipher/dilithium.c

diff --git a/cipher/dilithium.c b/cipher/dilithium.c
new file mode 100644
index 00000000..b4d49a67
--- /dev/null
+++ b/cipher/dilithium.c
@@ -0,0 +1,2264 @@
+/*
+  Original code from:
+
+  Repository: https://github.com/pq-crystals/dilithium.git
+  Branch: master
+  Commit: 444cdcc84eb36b66fe27b3a2529ee48f6d8150c2
+
+  Licence:
+  Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/);
+  or Apache 2.0 License (https://www.apache.org/licenses/LICENSE-2.0.html).
+
+  Authors:
+        LÃ©o Ducas
+        Eike Kiltz
+        TancrÃ¨de Lepoint
+        Vadim Lyubashevsky
+        Gregor Seiler
+        Peter Schwabe
+        Damien StehlÃ©
+
+  Dilithium Home: https://github.com/pq-crystals/dilithium.git
+ */
+/*************** dilithium/ref/ntt.c */
+
+static const int32_t zetas[N] = {
+         0,    25847, -2608894,  -518909,   237124,  -777960,  -876248,   466468,
+   1826347,  2353451,  -359251, -2091905,  3119733, -2884855,  3111497,  2680103,
+   2725464,  1024112, -1079900,  3585928,  -549488, -1119584,  2619752, -2108549,
+  -2118186, -3859737, -1399561, -3277672,  1757237,   -19422,  4010497,   280005,
+   2706023,    95776,  3077325,  3530437, -1661693, -3592148, -2537516,  3915439,
+  -3861115, -3043716,  3574422, -2867647,  3539968,  -300467,  2348700,  -539299,
+  -1699267, -1643818,  3505694, -3821735,  3507263, -2140649, -1600420,  3699596,
+    811944,   531354,   954230,  3881043,  3900724, -2556880,  2071892, -2797779,
+  -3930395, -1528703, -3677745, -3041255, -1452451,  3475950,  2176455, -1585221,
+  -1257611,  1939314, -4083598, -1000202, -3190144, -3157330, -3632928,   126922,
+   3412210,  -983419,  2147896,  2715295, -2967645, -3693493,  -411027, -2477047,
+   -671102, -1228525,   -22981, -1308169,  -381987,  1349076,  1852771, -1430430,
+  -3343383,   264944,   508951,  3097992,    44288, -1100098,   904516,  3958618,
+  -3724342,    -8578,  1653064, -3249728,  2389356,  -210977,   759969, -1316856,
+    189548, -3553272,  3159746, -1851402, -2409325,  -177440,  1315589,  1341330,
+   1285669, -1584928,  -812732, -1439742, -3019102, -3881060, -3628969,  3839961,
+   2091667,  3407706,  2316500,  3817976, -3342478,  2244091, -2446433, -3562462,
+    266997,  2434439, -1235728,  3513181, -3520352, -3759364, -1197226, -3193378,
+    900702,  1859098,   909542,   819034,   495491, -1613174,   -43260,  -522500,
+   -655327, -3122442,  2031748,  3207046, -3556995,  -525098,  -768622, -3595838,
+    342297,   286988, -2437823,  4108315,  3437287, -3342277,  1735879,   203044,
+   2842341,  2691481, -2590150,  1265009,  4055324,  1247620,  2486353,  1595974,
+  -3767016,  1250494,  2635921, -3548272, -2994039,  1869119,  1903435, -1050970,
+  -1333058,  1237275, -3318210, -1430225,  -451100,  1312455,  3306115, -1962642,
+  -1279661,  1917081, -2546312, -1374803,  1500165,   777191,  2235880,  3406031,
+   -542412, -2831860, -1671176, -1846953, -2584293, -3724270,   594136, -3776993,
+  -2013608,  2432395,  2454455,  -164721,  1957272,  3369112,   185531, -1207385,
+  -3183426,   162844,  1616392,  3014001,   810149,  1652634, -3694233, -1799107,
+  -3038916,  3523897,  3866901,   269760,  2213111,  -975884,  1717735,   472078,
+   -426683,  1723600, -1803090,  1910376, -1667432, -1104333,  -260646, -3833893,
+  -2939036, -2235985,  -420899, -2286327,   183443,  -976891,  1612842, -3545687,
+   -554416,  3919660,   -48306, -1362209,  3937738,  1400424,  -846154,  1976782
+};
+
+/*************************************************
+* Name:        ntt
+*
+* Description: Forward NTT, in-place. No modular reduction is performed after
+*              additions or subtractions. Output vector is in bitreversed order.
+*
+* Arguments:   - uint32_t p[N]: input/output coefficient array
+**************************************************/
+void ntt(int32_t a[N]) {
+  unsigned int len, start, j, k;
+  int32_t zeta, t;
+
+  k = 0;
+  for(len = 128; len > 0; len >>= 1) {
+    for(start = 0; start < N; start = j + len) {
+      zeta = zetas[++k];
+      for(j = start; j < start + len; ++j) {
+        t = montgomery_reduce((int64_t)zeta * a[j + len]);
+        a[j + len] = a[j] - t;
+        a[j] = a[j] + t;
+      }
+    }
+  }
+}
+
+/*************************************************
+* Name:        invntt_tomont
+*
+* Description: Inverse NTT and multiplication by Montgomery factor 2^32.
+*              In-place. No modular reductions after additions or
+*              subtractions; input coefficients need to be smaller than
+*              Q in absolute value. Output coefficient are smaller than Q in
+*              absolute value.
+*
+* Arguments:   - uint32_t p[N]: input/output coefficient array
+**************************************************/
+void invntt_tomont(int32_t a[N]) {
+  unsigned int start, len, j, k;
+  int32_t t, zeta;
+  const int32_t f = 41978; // mont^2/256
+
+  k = 256;
+  for(len = 1; len < N; len <<= 1) {
+    for(start = 0; start < N; start = j + len) {
+      zeta = -zetas[--k];
+      for(j = start; j < start + len; ++j) {
+        t = a[j];
+        a[j] = t + a[j + len];
+        a[j + len] = t - a[j + len];
+        a[j + len] = montgomery_reduce((int64_t)zeta * a[j + len]);
+      }
+    }
+  }
+
+  for(j = 0; j < N; ++j) {
+    a[j] = montgomery_reduce((int64_t)f * a[j]);
+  }
+}
+/*************** dilithium/ref/packing.c */
+
+/*************************************************
+* Name:        pack_pk
+*
+* Description: Bit-pack public key pk = (rho, t1).
+*
+* Arguments:   - uint8_t pk[]: output byte array
+*              - const uint8_t rho[]: byte array containing rho
+*              - const polyveck *t1: pointer to vector t1
+**************************************************/
+void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES],
+             const uint8_t rho[SEEDBYTES],
+             const polyveck *t1)
+{
+  unsigned int i;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    pk[i] = rho[i];
+  pk += SEEDBYTES;
+
+  for(i = 0; i < K; ++i)
+    polyt1_pack(pk + i*POLYT1_PACKEDBYTES, &t1->vec[i]);
+}
+
+/*************************************************
+* Name:        unpack_pk
+*
+* Description: Unpack public key pk = (rho, t1).
+*
+* Arguments:   - const uint8_t rho[]: output byte array for rho
+*              - const polyveck *t1: pointer to output vector t1
+*              - uint8_t pk[]: byte array containing bit-packed pk
+**************************************************/
+void unpack_pk(uint8_t rho[SEEDBYTES],
+               polyveck *t1,
+               const uint8_t pk[CRYPTO_PUBLICKEYBYTES])
+{
+  unsigned int i;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    rho[i] = pk[i];
+  pk += SEEDBYTES;
+
+  for(i = 0; i < K; ++i)
+    polyt1_unpack(&t1->vec[i], pk + i*POLYT1_PACKEDBYTES);
+}
+
+/*************************************************
+* Name:        pack_sk
+*
+* Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2).
+*
+* Arguments:   - uint8_t sk[]: output byte array
+*              - const uint8_t rho[]: byte array containing rho
+*              - const uint8_t tr[]: byte array containing tr
+*              - const uint8_t key[]: byte array containing key
+*              - const polyveck *t0: pointer to vector t0
+*              - const polyvecl *s1: pointer to vector s1
+*              - const polyveck *s2: pointer to vector s2
+**************************************************/
+void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES],
+             const uint8_t rho[SEEDBYTES],
+             const uint8_t tr[TRBYTES],
+             const uint8_t key[SEEDBYTES],
+             const polyveck *t0,
+             const polyvecl *s1,
+             const polyveck *s2)
+{
+  unsigned int i;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    sk[i] = rho[i];
+  sk += SEEDBYTES;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    sk[i] = key[i];
+  sk += SEEDBYTES;
+
+  for(i = 0; i < TRBYTES; ++i)
+    sk[i] = tr[i];
+  sk += TRBYTES;
+
+  for(i = 0; i < L; ++i)
+    polyeta_pack(sk + i*POLYETA_PACKEDBYTES, &s1->vec[i]);
+  sk += L*POLYETA_PACKEDBYTES;
+
+  for(i = 0; i < K; ++i)
+    polyeta_pack(sk + i*POLYETA_PACKEDBYTES, &s2->vec[i]);
+  sk += K*POLYETA_PACKEDBYTES;
+
+  for(i = 0; i < K; ++i)
+    polyt0_pack(sk + i*POLYT0_PACKEDBYTES, &t0->vec[i]);
+}
+
+/*************************************************
+* Name:        unpack_sk
+*
+* Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2).
+*
+* Arguments:   - const uint8_t rho[]: output byte array for rho
+*              - const uint8_t tr[]: output byte array for tr
+*              - const uint8_t key[]: output byte array for key
+*              - const polyveck *t0: pointer to output vector t0
+*              - const polyvecl *s1: pointer to output vector s1
+*              - const polyveck *s2: pointer to output vector s2
+*              - uint8_t sk[]: byte array containing bit-packed sk
+**************************************************/
+void unpack_sk(uint8_t rho[SEEDBYTES],
+               uint8_t tr[TRBYTES],
+               uint8_t key[SEEDBYTES],
+               polyveck *t0,
+               polyvecl *s1,
+               polyveck *s2,
+               const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+{
+  unsigned int i;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    rho[i] = sk[i];
+  sk += SEEDBYTES;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    key[i] = sk[i];
+  sk += SEEDBYTES;
+
+  for(i = 0; i < TRBYTES; ++i)
+    tr[i] = sk[i];
+  sk += TRBYTES;
+
+  for(i=0; i < L; ++i)
+    polyeta_unpack(&s1->vec[i], sk + i*POLYETA_PACKEDBYTES);
+  sk += L*POLYETA_PACKEDBYTES;
+
+  for(i=0; i < K; ++i)
+    polyeta_unpack(&s2->vec[i], sk + i*POLYETA_PACKEDBYTES);
+  sk += K*POLYETA_PACKEDBYTES;
+
+  for(i=0; i < K; ++i)
+    polyt0_unpack(&t0->vec[i], sk + i*POLYT0_PACKEDBYTES);
+}
+
+/*************************************************
+* Name:        pack_sig
+*
+* Description: Bit-pack signature sig = (c, z, h).
+*
+* Arguments:   - uint8_t sig[]: output byte array
+*              - const uint8_t *c: pointer to challenge hash length SEEDBYTES
+*              - const polyvecl *z: pointer to vector z
+*              - const polyveck *h: pointer to hint vector h
+**************************************************/
+void pack_sig(uint8_t sig[CRYPTO_BYTES],
+              const uint8_t c[CTILDEBYTES],
+              const polyvecl *z,
+              const polyveck *h)
+{
+  unsigned int i, j, k;
+
+  for(i=0; i < CTILDEBYTES; ++i)
+    sig[i] = c[i];
+  sig += CTILDEBYTES;
+
+  for(i = 0; i < L; ++i)
+    polyz_pack(sig + i*POLYZ_PACKEDBYTES, &z->vec[i]);
+  sig += L*POLYZ_PACKEDBYTES;
+
+  /* Encode h */
+  for(i = 0; i < OMEGA + K; ++i)
+    sig[i] = 0;
+
+  k = 0;
+  for(i = 0; i < K; ++i) {
+    for(j = 0; j < N; ++j)
+      if(h->vec[i].coeffs[j] != 0)
+        sig[k++] = j;
+
+    sig[OMEGA + i] = k;
+  }
+}
+
+/*************************************************
+* Name:        unpack_sig
+*
+* Description: Unpack signature sig = (c, z, h).
+*
+* Arguments:   - uint8_t *c: pointer to output challenge hash
+*              - polyvecl *z: pointer to output vector z
+*              - polyveck *h: pointer to output hint vector h
+*              - const uint8_t sig[]: byte array containing
+*                bit-packed signature
+*
+* Returns 1 in case of malformed signature; otherwise 0.
+**************************************************/
+int unpack_sig(uint8_t c[CTILDEBYTES],
+               polyvecl *z,
+               polyveck *h,
+               const uint8_t sig[CRYPTO_BYTES])
+{
+  unsigned int i, j, k;
+
+  for(i = 0; i < CTILDEBYTES; ++i)
+    c[i] = sig[i];
+  sig += CTILDEBYTES;
+
+  for(i = 0; i < L; ++i)
+    polyz_unpack(&z->vec[i], sig + i*POLYZ_PACKEDBYTES);
+  sig += L*POLYZ_PACKEDBYTES;
+
+  /* Decode h */
+  k = 0;
+  for(i = 0; i < K; ++i) {
+    for(j = 0; j < N; ++j)
+      h->vec[i].coeffs[j] = 0;
+
+    if(sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA)
+      return 1;
+
+    for(j = k; j < sig[OMEGA + i]; ++j) {
+      /* Coefficients are ordered for strong unforgeability */
+      if(j > k && sig[j] <= sig[j-1]) return 1;
+      h->vec[i].coeffs[sig[j]] = 1;
+    }
+
+    k = sig[OMEGA + i];
+  }
+
+  /* Extra indices are zero for strong unforgeability */
+  for(j = k; j < OMEGA; ++j)
+    if(sig[j])
+      return 1;
+
+  return 0;
+}
+/*************** dilithium/ref/poly.c */
+
+#ifdef DBENCH
+extern const uint64_t timing_overhead;
+extern uint64_t *tred, *tadd, *tmul, *tround, *tsample, *tpack;
+#define DBENCH_START() uint64_t time = cpucycles()
+#define DBENCH_STOP(t) t += cpucycles() - time - timing_overhead
+#else
+#define DBENCH_START()
+#define DBENCH_STOP(t)
+#endif
+
+/*************************************************
+* Name:        poly_reduce
+*
+* Description: Inplace reduction of all coefficients of polynomial to
+*              representative in [-6283008,6283008].
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_reduce(poly *a) {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i)
+    a->coeffs[i] = reduce32(a->coeffs[i]);
+
+  DBENCH_STOP(*tred);
+}
+
+/*************************************************
+* Name:        poly_caddq
+*
+* Description: For all coefficients of in/out polynomial add Q if
+*              coefficient is negative.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_caddq(poly *a) {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i)
+    a->coeffs[i] = caddq(a->coeffs[i]);
+
+  DBENCH_STOP(*tred);
+}
+
+/*************************************************
+* Name:        poly_add
+*
+* Description: Add polynomials. No modular reduction is performed.
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const poly *a: pointer to first summand
+*              - const poly *b: pointer to second summand
+**************************************************/
+void poly_add(poly *c, const poly *a, const poly *b)  {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i)
+    c->coeffs[i] = a->coeffs[i] + b->coeffs[i];
+
+  DBENCH_STOP(*tadd);
+}
+
+/*************************************************
+* Name:        poly_sub
+*
+* Description: Subtract polynomials. No modular reduction is
+*              performed.
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial to be
+*                               subtraced from first input polynomial
+**************************************************/
+void poly_sub(poly *c, const poly *a, const poly *b) {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i)
+    c->coeffs[i] = a->coeffs[i] - b->coeffs[i];
+
+  DBENCH_STOP(*tadd);
+}
+
+/*************************************************
+* Name:        poly_shiftl
+*
+* Description: Multiply polynomial by 2^D without modular reduction. Assumes
+*              input coefficients to be less than 2^{31-D} in absolute value.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_shiftl(poly *a) {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i)
+    a->coeffs[i] <<= D;
+
+  DBENCH_STOP(*tmul);
+}
+
+/*************************************************
+* Name:        poly_ntt
+*
+* Description: Inplace forward NTT. Coefficients can grow by
+*              8*Q in absolute value.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_ntt(poly *a) {
+  DBENCH_START();
+
+  ntt(a->coeffs);
+
+  DBENCH_STOP(*tmul);
+}
+
+/*************************************************
+* Name:        poly_invntt_tomont
+*
+* Description: Inplace inverse NTT and multiplication by 2^{32}.
+*              Input coefficients need to be less than Q in absolute
+*              value and output coefficients are again bounded by Q.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_invntt_tomont(poly *a) {
+  DBENCH_START();
+
+  invntt_tomont(a->coeffs);
+
+  DBENCH_STOP(*tmul);
+}
+
+/*************************************************
+* Name:        poly_pointwise_montgomery
+*
+* Description: Pointwise multiplication of polynomials in NTT domain
+*              representation and multiplication of resulting polynomial
+*              by 2^{-32}.
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i)
+    c->coeffs[i] = montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]);
+
+  DBENCH_STOP(*tmul);
+}
+
+/*************************************************
+* Name:        poly_power2round
+*
+* Description: For all coefficients c of the input polynomial,
+*              compute c0, c1 such that c mod Q = c1*2^D + c0
+*              with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be
+*              standard representatives.
+*
+* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
+*              - poly *a0: pointer to output polynomial with coefficients c0
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void poly_power2round(poly *a1, poly *a0, const poly *a) {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i)
+    a1->coeffs[i] = power2round(&a0->coeffs[i], a->coeffs[i]);
+
+  DBENCH_STOP(*tround);
+}
+
+/*************************************************
+* Name:        poly_decompose
+*
+* Description: For all coefficients c of the input polynomial,
+*              compute high and low bits c0, c1 such c mod Q = c1*ALPHA + c0
+*              with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we
+*              set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0.
+*              Assumes coefficients to be standard representatives.
+*
+* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
+*              - poly *a0: pointer to output polynomial with coefficients c0
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void poly_decompose(poly *a1, poly *a0, const poly *a) {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i)
+    a1->coeffs[i] = decompose(&a0->coeffs[i], a->coeffs[i]);
+
+  DBENCH_STOP(*tround);
+}
+
+/*************************************************
+* Name:        poly_make_hint
+*
+* Description: Compute hint polynomial. The coefficients of which indicate
+*              whether the low bits of the corresponding coefficient of
+*              the input polynomial overflow into the high bits.
+*
+* Arguments:   - poly *h: pointer to output hint polynomial
+*              - const poly *a0: pointer to low part of input polynomial
+*              - const poly *a1: pointer to high part of input polynomial
+*
+* Returns number of 1 bits.
+**************************************************/
+unsigned int poly_make_hint(poly *h, const poly *a0, const poly *a1) {
+  unsigned int i, s = 0;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i) {
+    h->coeffs[i] = make_hint(a0->coeffs[i], a1->coeffs[i]);
+    s += h->coeffs[i];
+  }
+
+  DBENCH_STOP(*tround);
+  return s;
+}
+
+/*************************************************
+* Name:        poly_use_hint
+*
+* Description: Use hint polynomial to correct the high bits of a polynomial.
+*
+* Arguments:   - poly *b: pointer to output polynomial with corrected high bits
+*              - const poly *a: pointer to input polynomial
+*              - const poly *h: pointer to input hint polynomial
+**************************************************/
+void poly_use_hint(poly *b, const poly *a, const poly *h) {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i)
+    b->coeffs[i] = use_hint(a->coeffs[i], h->coeffs[i]);
+
+  DBENCH_STOP(*tround);
+}
+
+/*************************************************
+* Name:        poly_chknorm
+*
+* Description: Check infinity norm of polynomial against given bound.
+*              Assumes input coefficients were reduced by reduce32().
+*
+* Arguments:   - const poly *a: pointer to polynomial
+*              - int32_t B: norm bound
+*
+* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise.
+**************************************************/
+int poly_chknorm(const poly *a, int32_t B) {
+  unsigned int i;
+  int32_t t;
+  DBENCH_START();
+
+  if(B > (Q-1)/8)
+    return 1;
+
+  /* It is ok to leak which coefficient violates the bound since
+     the probability for each coefficient is independent of secret
+     data but we must not leak the sign of the centralized representative. */
+  for(i = 0; i < N; ++i) {
+    /* Absolute value */
+    t = a->coeffs[i] >> 31;
+    t = a->coeffs[i] - (t & 2*a->coeffs[i]);
+
+    if(t >= B) {
+      DBENCH_STOP(*tsample);
+      return 1;
+    }
+  }
+
+  DBENCH_STOP(*tsample);
+  return 0;
+}
+
+/*************************************************
+* Name:        rej_uniform
+*
+* Description: Sample uniformly random coefficients in [0, Q-1] by
+*              performing rejection sampling on array of random bytes.
+*
+* Arguments:   - int32_t *a: pointer to output array (allocated)
+*              - unsigned int len: number of coefficients to be sampled
+*              - const uint8_t *buf: array of random bytes
+*              - unsigned int buflen: length of array of random bytes
+*
+* Returns number of sampled coefficients. Can be smaller than len if not enough
+* random bytes were given.
+**************************************************/
+static unsigned int rej_uniform(int32_t *a,
+                                unsigned int len,
+                                const uint8_t *buf,
+                                unsigned int buflen)
+{
+  unsigned int ctr, pos;
+  uint32_t t;
+  DBENCH_START();
+
+  ctr = pos = 0;
+  while(ctr < len && pos + 3 <= buflen) {
+    t  = buf[pos++];
+    t |= (uint32_t)buf[pos++] << 8;
+    t |= (uint32_t)buf[pos++] << 16;
+    t &= 0x7FFFFF;
+
+    if(t < Q)
+      a[ctr++] = t;
+  }
+
+  DBENCH_STOP(*tsample);
+  return ctr;
+}
+
+/*************************************************
+* Name:        poly_uniform
+*
+* Description: Sample polynomial with uniformly random coefficients
+*              in [0,Q-1] by performing rejection sampling on the
+*              output stream of SHAKE128(seed|nonce)
+*
+* Arguments:   - poly *a: pointer to output polynomial
+*              - const uint8_t seed[]: byte array with seed of length SEEDBYTES
+*              - uint16_t nonce: 2-byte nonce
+**************************************************/
+#define POLY_UNIFORM_NBLOCKS ((768 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES)
+void poly_uniform(poly *a,
+                  const uint8_t seed[SEEDBYTES],
+                  uint16_t nonce)
+{
+  unsigned int i, ctr, off;
+  unsigned int buflen = POLY_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES;
+  uint8_t buf[POLY_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES + 2];
+  stream128_state state;
+
+  stream128_init(&state, seed, nonce);
+  stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state);
+
+  ctr = rej_uniform(a->coeffs, N, buf, buflen);
+
+  while(ctr < N) {
+    off = buflen % 3;
+    for(i = 0; i < off; ++i)
+      buf[i] = buf[buflen - off + i];
+
+    stream128_squeezeblocks(buf + off, 1, &state);
+    buflen = STREAM128_BLOCKBYTES + off;
+    ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf, buflen);
+  }
+}
+
+/*************************************************
+* Name:        rej_eta
+*
+* Description: Sample uniformly random coefficients in [-ETA, ETA] by
+*              performing rejection sampling on array of random bytes.
+*
+* Arguments:   - int32_t *a: pointer to output array (allocated)
+*              - unsigned int len: number of coefficients to be sampled
+*              - const uint8_t *buf: array of random bytes
+*              - unsigned int buflen: length of array of random bytes
+*
+* Returns number of sampled coefficients. Can be smaller than len if not enough
+* random bytes were given.
+**************************************************/
+static unsigned int rej_eta(int32_t *a,
+                            unsigned int len,
+                            const uint8_t *buf,
+                            unsigned int buflen)
+{
+  unsigned int ctr, pos;
+  uint32_t t0, t1;
+  DBENCH_START();
+
+  ctr = pos = 0;
+  while(ctr < len && pos < buflen) {
+    t0 = buf[pos] & 0x0F;
+    t1 = buf[pos++] >> 4;
+
+#if ETA == 2
+    if(t0 < 15) {
+      t0 = t0 - (205*t0 >> 10)*5;
+      a[ctr++] = 2 - t0;
+    }
+    if(t1 < 15 && ctr < len) {
+      t1 = t1 - (205*t1 >> 10)*5;
+      a[ctr++] = 2 - t1;
+    }
+#elif ETA == 4
+    if(t0 < 9)
+      a[ctr++] = 4 - t0;
+    if(t1 < 9 && ctr < len)
+      a[ctr++] = 4 - t1;
+#endif
+  }
+
+  DBENCH_STOP(*tsample);
+  return ctr;
+}
+
+/*************************************************
+* Name:        poly_uniform_eta
+*
+* Description: Sample polynomial with uniformly random coefficients
+*              in [-ETA,ETA] by performing rejection sampling on the
+*              output stream from SHAKE256(seed|nonce)
+*
+* Arguments:   - poly *a: pointer to output polynomial
+*              - const uint8_t seed[]: byte array with seed of length CRHBYTES
+*              - uint16_t nonce: 2-byte nonce
+**************************************************/
+#if ETA == 2
+#define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
+#elif ETA == 4
+#define POLY_UNIFORM_ETA_NBLOCKS ((227 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
+#endif
+void poly_uniform_eta(poly *a,
+                      const uint8_t seed[CRHBYTES],
+                      uint16_t nonce)
+{
+  unsigned int ctr;
+  unsigned int buflen = POLY_UNIFORM_ETA_NBLOCKS*STREAM256_BLOCKBYTES;
+  uint8_t buf[POLY_UNIFORM_ETA_NBLOCKS*STREAM256_BLOCKBYTES];
+  stream256_state state;
+
+  stream256_init(&state, seed, nonce);
+  stream256_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state);
+
+  ctr = rej_eta(a->coeffs, N, buf, buflen);
+
+  while(ctr < N) {
+    stream256_squeezeblocks(buf, 1, &state);
+    ctr += rej_eta(a->coeffs + ctr, N - ctr, buf, STREAM256_BLOCKBYTES);
+  }
+}
+
+/*************************************************
+* Name:        poly_uniform_gamma1m1
+*
+* Description: Sample polynomial with uniformly random coefficients
+*              in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream
+*              of SHAKE256(seed|nonce)
+*
+* Arguments:   - poly *a: pointer to output polynomial
+*              - const uint8_t seed[]: byte array with seed of length CRHBYTES
+*              - uint16_t nonce: 16-bit nonce
+**************************************************/
+#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
+void poly_uniform_gamma1(poly *a,
+                         const uint8_t seed[CRHBYTES],
+                         uint16_t nonce)
+{
+  uint8_t buf[POLY_UNIFORM_GAMMA1_NBLOCKS*STREAM256_BLOCKBYTES];
+  stream256_state state;
+
+  stream256_init(&state, seed, nonce);
+  stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1_NBLOCKS, &state);
+  polyz_unpack(a, buf);
+}
+
+/*************************************************
+* Name:        challenge
+*
+* Description: Implementation of H. Samples polynomial with TAU nonzero
+*              coefficients in {-1,1} using the output stream of
+*              SHAKE256(seed).
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const uint8_t mu[]: byte array containing seed of length CTILDEBYTES
+**************************************************/
+void poly_challenge(poly *c, const uint8_t seed[CTILDEBYTES]) {
+  unsigned int i, b, pos;
+  uint64_t signs;
+  uint8_t buf[SHAKE256_RATE];
+  keccak_state state;
+
+  shake256_init(&state);
+  shake256_absorb(&state, seed, CTILDEBYTES);
+  shake256_finalize(&state);
+  shake256_squeezeblocks(buf, 1, &state);
+
+  signs = 0;
+  for(i = 0; i < 8; ++i)
+    signs |= (uint64_t)buf[i] << 8*i;
+  pos = 8;
+
+  for(i = 0; i < N; ++i)
+    c->coeffs[i] = 0;
+  for(i = N-TAU; i < N; ++i) {
+    do {
+      if(pos >= SHAKE256_RATE) {
+        shake256_squeezeblocks(buf, 1, &state);
+        pos = 0;
+      }
+
+      b = buf[pos++];
+    } while(b > i);
+
+    c->coeffs[i] = c->coeffs[b];
+    c->coeffs[b] = 1 - 2*(signs & 1);
+    signs >>= 1;
+  }
+}
+
+/*************************************************
+* Name:        polyeta_pack
+*
+* Description: Bit-pack polynomial with coefficients in [-ETA,ETA].
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYETA_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyeta_pack(uint8_t *r, const poly *a) {
+  unsigned int i;
+  uint8_t t[8];
+  DBENCH_START();
+
+#if ETA == 2
+  for(i = 0; i < N/8; ++i) {
+    t[0] = ETA - a->coeffs[8*i+0];
+    t[1] = ETA - a->coeffs[8*i+1];
+    t[2] = ETA - a->coeffs[8*i+2];
+    t[3] = ETA - a->coeffs[8*i+3];
+    t[4] = ETA - a->coeffs[8*i+4];
+    t[5] = ETA - a->coeffs[8*i+5];
+    t[6] = ETA - a->coeffs[8*i+6];
+    t[7] = ETA - a->coeffs[8*i+7];
+
+    r[3*i+0]  = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6);
+    r[3*i+1]  = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7);
+    r[3*i+2]  = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5);
+  }
+#elif ETA == 4
+  for(i = 0; i < N/2; ++i) {
+    t[0] = ETA - a->coeffs[2*i+0];
+    t[1] = ETA - a->coeffs[2*i+1];
+    r[i] = t[0] | (t[1] << 4);
+  }
+#endif
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyeta_unpack
+*
+* Description: Unpack polynomial with coefficients in [-ETA,ETA].
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *a: byte array with bit-packed polynomial
+**************************************************/
+void polyeta_unpack(poly *r, const uint8_t *a) {
+  unsigned int i;
+  DBENCH_START();
+
+#if ETA == 2
+  for(i = 0; i < N/8; ++i) {
+    r->coeffs[8*i+0] =  (a[3*i+0] >> 0) & 7;
+    r->coeffs[8*i+1] =  (a[3*i+0] >> 3) & 7;
+    r->coeffs[8*i+2] = ((a[3*i+0] >> 6) | (a[3*i+1] << 2)) & 7;
+    r->coeffs[8*i+3] =  (a[3*i+1] >> 1) & 7;
+    r->coeffs[8*i+4] =  (a[3*i+1] >> 4) & 7;
+    r->coeffs[8*i+5] = ((a[3*i+1] >> 7) | (a[3*i+2] << 1)) & 7;
+    r->coeffs[8*i+6] =  (a[3*i+2] >> 2) & 7;
+    r->coeffs[8*i+7] =  (a[3*i+2] >> 5) & 7;
+
+    r->coeffs[8*i+0] = ETA - r->coeffs[8*i+0];
+    r->coeffs[8*i+1] = ETA - r->coeffs[8*i+1];
+    r->coeffs[8*i+2] = ETA - r->coeffs[8*i+2];
+    r->coeffs[8*i+3] = ETA - r->coeffs[8*i+3];
+    r->coeffs[8*i+4] = ETA - r->coeffs[8*i+4];
+    r->coeffs[8*i+5] = ETA - r->coeffs[8*i+5];
+    r->coeffs[8*i+6] = ETA - r->coeffs[8*i+6];
+    r->coeffs[8*i+7] = ETA - r->coeffs[8*i+7];
+  }
+#elif ETA == 4
+  for(i = 0; i < N/2; ++i) {
+    r->coeffs[2*i+0] = a[i] & 0x0F;
+    r->coeffs[2*i+1] = a[i] >> 4;
+    r->coeffs[2*i+0] = ETA - r->coeffs[2*i+0];
+    r->coeffs[2*i+1] = ETA - r->coeffs[2*i+1];
+  }
+#endif
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyt1_pack
+*
+* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits.
+*              Input coefficients are assumed to be standard representatives.
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYT1_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyt1_pack(uint8_t *r, const poly *a) {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N/4; ++i) {
+    r[5*i+0] = (a->coeffs[4*i+0] >> 0);
+    r[5*i+1] = (a->coeffs[4*i+0] >> 8) | (a->coeffs[4*i+1] << 2);
+    r[5*i+2] = (a->coeffs[4*i+1] >> 6) | (a->coeffs[4*i+2] << 4);
+    r[5*i+3] = (a->coeffs[4*i+2] >> 4) | (a->coeffs[4*i+3] << 6);
+    r[5*i+4] = (a->coeffs[4*i+3] >> 2);
+  }
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyt1_unpack
+*
+* Description: Unpack polynomial t1 with 10-bit coefficients.
+*              Output coefficients are standard representatives.
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *a: byte array with bit-packed polynomial
+**************************************************/
+void polyt1_unpack(poly *r, const uint8_t *a) {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N/4; ++i) {
+    r->coeffs[4*i+0] = ((a[5*i+0] >> 0) | ((uint32_t)a[5*i+1] << 8)) & 0x3FF;
+    r->coeffs[4*i+1] = ((a[5*i+1] >> 2) | ((uint32_t)a[5*i+2] << 6)) & 0x3FF;
+    r->coeffs[4*i+2] = ((a[5*i+2] >> 4) | ((uint32_t)a[5*i+3] << 4)) & 0x3FF;
+    r->coeffs[4*i+3] = ((a[5*i+3] >> 6) | ((uint32_t)a[5*i+4] << 2)) & 0x3FF;
+  }
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyt0_pack
+*
+* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYT0_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyt0_pack(uint8_t *r, const poly *a) {
+  unsigned int i;
+  uint32_t t[8];
+  DBENCH_START();
+
+  for(i = 0; i < N/8; ++i) {
+    t[0] = (1 << (D-1)) - a->coeffs[8*i+0];
+    t[1] = (1 << (D-1)) - a->coeffs[8*i+1];
+    t[2] = (1 << (D-1)) - a->coeffs[8*i+2];
+    t[3] = (1 << (D-1)) - a->coeffs[8*i+3];
+    t[4] = (1 << (D-1)) - a->coeffs[8*i+4];
+    t[5] = (1 << (D-1)) - a->coeffs[8*i+5];
+    t[6] = (1 << (D-1)) - a->coeffs[8*i+6];
+    t[7] = (1 << (D-1)) - a->coeffs[8*i+7];
+
+    r[13*i+ 0]  =  t[0];
+    r[13*i+ 1]  =  t[0] >>  8;
+    r[13*i+ 1] |=  t[1] <<  5;
+    r[13*i+ 2]  =  t[1] >>  3;
+    r[13*i+ 3]  =  t[1] >> 11;
+    r[13*i+ 3] |=  t[2] <<  2;
+    r[13*i+ 4]  =  t[2] >>  6;
+    r[13*i+ 4] |=  t[3] <<  7;
+    r[13*i+ 5]  =  t[3] >>  1;
+    r[13*i+ 6]  =  t[3] >>  9;
+    r[13*i+ 6] |=  t[4] <<  4;
+    r[13*i+ 7]  =  t[4] >>  4;
+    r[13*i+ 8]  =  t[4] >> 12;
+    r[13*i+ 8] |=  t[5] <<  1;
+    r[13*i+ 9]  =  t[5] >>  7;
+    r[13*i+ 9] |=  t[6] <<  6;
+    r[13*i+10]  =  t[6] >>  2;
+    r[13*i+11]  =  t[6] >> 10;
+    r[13*i+11] |=  t[7] <<  3;
+    r[13*i+12]  =  t[7] >>  5;
+  }
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyt0_unpack
+*
+* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *a: byte array with bit-packed polynomial
+**************************************************/
+void polyt0_unpack(poly *r, const uint8_t *a) {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N/8; ++i) {
+    r->coeffs[8*i+0]  = a[13*i+0];
+    r->coeffs[8*i+0] |= (uint32_t)a[13*i+1] << 8;
+    r->coeffs[8*i+0] &= 0x1FFF;
+
+    r->coeffs[8*i+1]  = a[13*i+1] >> 5;
+    r->coeffs[8*i+1] |= (uint32_t)a[13*i+2] << 3;
+    r->coeffs[8*i+1] |= (uint32_t)a[13*i+3] << 11;
+    r->coeffs[8*i+1] &= 0x1FFF;
+
+    r->coeffs[8*i+2]  = a[13*i+3] >> 2;
+    r->coeffs[8*i+2] |= (uint32_t)a[13*i+4] << 6;
+    r->coeffs[8*i+2] &= 0x1FFF;
+
+    r->coeffs[8*i+3]  = a[13*i+4] >> 7;
+    r->coeffs[8*i+3] |= (uint32_t)a[13*i+5] << 1;
+    r->coeffs[8*i+3] |= (uint32_t)a[13*i+6] << 9;
+    r->coeffs[8*i+3] &= 0x1FFF;
+
+    r->coeffs[8*i+4]  = a[13*i+6] >> 4;
+    r->coeffs[8*i+4] |= (uint32_t)a[13*i+7] << 4;
+    r->coeffs[8*i+4] |= (uint32_t)a[13*i+8] << 12;
+    r->coeffs[8*i+4] &= 0x1FFF;
+
+    r->coeffs[8*i+5]  = a[13*i+8] >> 1;
+    r->coeffs[8*i+5] |= (uint32_t)a[13*i+9] << 7;
+    r->coeffs[8*i+5] &= 0x1FFF;
+
+    r->coeffs[8*i+6]  = a[13*i+9] >> 6;
+    r->coeffs[8*i+6] |= (uint32_t)a[13*i+10] << 2;
+    r->coeffs[8*i+6] |= (uint32_t)a[13*i+11] << 10;
+    r->coeffs[8*i+6] &= 0x1FFF;
+
+    r->coeffs[8*i+7]  = a[13*i+11] >> 3;
+    r->coeffs[8*i+7] |= (uint32_t)a[13*i+12] << 5;
+    r->coeffs[8*i+7] &= 0x1FFF;
+
+    r->coeffs[8*i+0] = (1 << (D-1)) - r->coeffs[8*i+0];
+    r->coeffs[8*i+1] = (1 << (D-1)) - r->coeffs[8*i+1];
+    r->coeffs[8*i+2] = (1 << (D-1)) - r->coeffs[8*i+2];
+    r->coeffs[8*i+3] = (1 << (D-1)) - r->coeffs[8*i+3];
+    r->coeffs[8*i+4] = (1 << (D-1)) - r->coeffs[8*i+4];
+    r->coeffs[8*i+5] = (1 << (D-1)) - r->coeffs[8*i+5];
+    r->coeffs[8*i+6] = (1 << (D-1)) - r->coeffs[8*i+6];
+    r->coeffs[8*i+7] = (1 << (D-1)) - r->coeffs[8*i+7];
+  }
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyz_pack
+*
+* Description: Bit-pack polynomial with coefficients
+*              in [-(GAMMA1 - 1), GAMMA1].
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYZ_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyz_pack(uint8_t *r, const poly *a) {
+  unsigned int i;
+  uint32_t t[4];
+  DBENCH_START();
+
+#if GAMMA1 == (1 << 17)
+  for(i = 0; i < N/4; ++i) {
+    t[0] = GAMMA1 - a->coeffs[4*i+0];
+    t[1] = GAMMA1 - a->coeffs[4*i+1];
+    t[2] = GAMMA1 - a->coeffs[4*i+2];
+    t[3] = GAMMA1 - a->coeffs[4*i+3];
+
+    r[9*i+0]  = t[0];
+    r[9*i+1]  = t[0] >> 8;
+    r[9*i+2]  = t[0] >> 16;
+    r[9*i+2] |= t[1] << 2;
+    r[9*i+3]  = t[1] >> 6;
+    r[9*i+4]  = t[1] >> 14;
+    r[9*i+4] |= t[2] << 4;
+    r[9*i+5]  = t[2] >> 4;
+    r[9*i+6]  = t[2] >> 12;
+    r[9*i+6] |= t[3] << 6;
+    r[9*i+7]  = t[3] >> 2;
+    r[9*i+8]  = t[3] >> 10;
+  }
+#elif GAMMA1 == (1 << 19)
+  for(i = 0; i < N/2; ++i) {
+    t[0] = GAMMA1 - a->coeffs[2*i+0];
+    t[1] = GAMMA1 - a->coeffs[2*i+1];
+
+    r[5*i+0]  = t[0];
+    r[5*i+1]  = t[0] >> 8;
+    r[5*i+2]  = t[0] >> 16;
+    r[5*i+2] |= t[1] << 4;
+    r[5*i+3]  = t[1] >> 4;
+    r[5*i+4]  = t[1] >> 12;
+  }
+#endif
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyz_unpack
+*
+* Description: Unpack polynomial z with coefficients
+*              in [-(GAMMA1 - 1), GAMMA1].
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *a: byte array with bit-packed polynomial
+**************************************************/
+void polyz_unpack(poly *r, const uint8_t *a) {
+  unsigned int i;
+  DBENCH_START();
+
+#if GAMMA1 == (1 << 17)
+  for(i = 0; i < N/4; ++i) {
+    r->coeffs[4*i+0]  = a[9*i+0];
+    r->coeffs[4*i+0] |= (uint32_t)a[9*i+1] << 8;
+    r->coeffs[4*i+0] |= (uint32_t)a[9*i+2] << 16;
+    r->coeffs[4*i+0] &= 0x3FFFF;
+
+    r->coeffs[4*i+1]  = a[9*i+2] >> 2;
+    r->coeffs[4*i+1] |= (uint32_t)a[9*i+3] << 6;
+    r->coeffs[4*i+1] |= (uint32_t)a[9*i+4] << 14;
+    r->coeffs[4*i+1] &= 0x3FFFF;
+
+    r->coeffs[4*i+2]  = a[9*i+4] >> 4;
+    r->coeffs[4*i+2] |= (uint32_t)a[9*i+5] << 4;
+    r->coeffs[4*i+2] |= (uint32_t)a[9*i+6] << 12;
+    r->coeffs[4*i+2] &= 0x3FFFF;
+
+    r->coeffs[4*i+3]  = a[9*i+6] >> 6;
+    r->coeffs[4*i+3] |= (uint32_t)a[9*i+7] << 2;
+    r->coeffs[4*i+3] |= (uint32_t)a[9*i+8] << 10;
+    r->coeffs[4*i+3] &= 0x3FFFF;
+
+    r->coeffs[4*i+0] = GAMMA1 - r->coeffs[4*i+0];
+    r->coeffs[4*i+1] = GAMMA1 - r->coeffs[4*i+1];
+    r->coeffs[4*i+2] = GAMMA1 - r->coeffs[4*i+2];
+    r->coeffs[4*i+3] = GAMMA1 - r->coeffs[4*i+3];
+  }
+#elif GAMMA1 == (1 << 19)
+  for(i = 0; i < N/2; ++i) {
+    r->coeffs[2*i+0]  = a[5*i+0];
+    r->coeffs[2*i+0] |= (uint32_t)a[5*i+1] << 8;
+    r->coeffs[2*i+0] |= (uint32_t)a[5*i+2] << 16;
+    r->coeffs[2*i+0] &= 0xFFFFF;
+
+    r->coeffs[2*i+1]  = a[5*i+2] >> 4;
+    r->coeffs[2*i+1] |= (uint32_t)a[5*i+3] << 4;
+    r->coeffs[2*i+1] |= (uint32_t)a[5*i+4] << 12;
+    /* r->coeffs[2*i+1] &= 0xFFFFF; */ /* No effect, since we're anyway at 20 bits */
+
+    r->coeffs[2*i+0] = GAMMA1 - r->coeffs[2*i+0];
+    r->coeffs[2*i+1] = GAMMA1 - r->coeffs[2*i+1];
+  }
+#endif
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyw1_pack
+*
+* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43].
+*              Input coefficients are assumed to be standard representatives.
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYW1_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyw1_pack(uint8_t *r, const poly *a) {
+  unsigned int i;
+  DBENCH_START();
+
+#if GAMMA2 == (Q-1)/88
+  for(i = 0; i < N/4; ++i) {
+    r[3*i+0]  = a->coeffs[4*i+0];
+    r[3*i+0] |= a->coeffs[4*i+1] << 6;
+    r[3*i+1]  = a->coeffs[4*i+1] >> 2;
+    r[3*i+1] |= a->coeffs[4*i+2] << 4;
+    r[3*i+2]  = a->coeffs[4*i+2] >> 4;
+    r[3*i+2] |= a->coeffs[4*i+3] << 2;
+  }
+#elif GAMMA2 == (Q-1)/32
+  for(i = 0; i < N/2; ++i)
+    r[i] = a->coeffs[2*i+0] | (a->coeffs[2*i+1] << 4);
+#endif
+
+  DBENCH_STOP(*tpack);
+}
+/*************** dilithium/ref/polyvec.c */
+
+/*************************************************
+* Name:        expand_mat
+*
+* Description: Implementation of ExpandA. Generates matrix A with uniformly
+*              random coefficients a_{i,j} by performing rejection
+*              sampling on the output stream of SHAKE128(rho|j|i)
+*
+* Arguments:   - polyvecl mat[K]: output matrix
+*              - const uint8_t rho[]: byte array containing seed rho
+**************************************************/
+void polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) {
+  unsigned int i, j;
+
+  for(i = 0; i < K; ++i)
+    for(j = 0; j < L; ++j)
+      poly_uniform(&mat[i].vec[j], rho, (i << 8) + j);
+}
+
+void polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
+  unsigned int i;
+
+  for(i = 0; i < K; ++i)
+    polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v);
+}
+
+/**************************************************************/
+/************ Vectors of polynomials of length L **************/
+/**************************************************************/
+
+void polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) {
+  unsigned int i;
+
+  for(i = 0; i < L; ++i)
+    poly_uniform_eta(&v->vec[i], seed, nonce++);
+}
+
+void polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) {
+  unsigned int i;
+
+  for(i = 0; i < L; ++i)
+    poly_uniform_gamma1(&v->vec[i], seed, L*nonce + i);
+}
+
+void polyvecl_reduce(polyvecl *v) {
+  unsigned int i;
+
+  for(i = 0; i < L; ++i)
+    poly_reduce(&v->vec[i]);
+}
+
+/*************************************************
+* Name:        polyvecl_add
+*
+* Description: Add vectors of polynomials of length L.
+*              No modular reduction is performed.
+*
+* Arguments:   - polyvecl *w: pointer to output vector
+*              - const polyvecl *u: pointer to first summand
+*              - const polyvecl *v: pointer to second summand
+**************************************************/
+void polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
+  unsigned int i;
+
+  for(i = 0; i < L; ++i)
+    poly_add(&w->vec[i], &u->vec[i], &v->vec[i]);
+}
+
+/*************************************************
+* Name:        polyvecl_ntt
+*
+* Description: Forward NTT of all polynomials in vector of length L. Output
+*              coefficients can be up to 16*Q larger than input coefficients.
+*
+* Arguments:   - polyvecl *v: pointer to input/output vector
+**************************************************/
+void polyvecl_ntt(polyvecl *v) {
+  unsigned int i;
+
+  for(i = 0; i < L; ++i)
+    poly_ntt(&v->vec[i]);
+}
+
+void polyvecl_invntt_tomont(polyvecl *v) {
+  unsigned int i;
+
+  for(i = 0; i < L; ++i)
+    poly_invntt_tomont(&v->vec[i]);
+}
+
+void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
+  unsigned int i;
+
+  for(i = 0; i < L; ++i)
+    poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]);
+}
+
+/*************************************************
+* Name:        polyvecl_pointwise_acc_montgomery
+*
+* Description: Pointwise multiply vectors of polynomials of length L, multiply
+*              resulting vector by 2^{-32} and add (accumulate) polynomials
+*              in it. Input/output vectors are in NTT domain representation.
+*
+* Arguments:   - poly *w: output polynomial
+*              - const polyvecl *u: pointer to first input vector
+*              - const polyvecl *v: pointer to second input vector
+**************************************************/
+void polyvecl_pointwise_acc_montgomery(poly *w,
+                                       const polyvecl *u,
+                                       const polyvecl *v)
+{
+  unsigned int i;
+  poly t;
+
+  poly_pointwise_montgomery(w, &u->vec[0], &v->vec[0]);
+  for(i = 1; i < L; ++i) {
+    poly_pointwise_montgomery(&t, &u->vec[i], &v->vec[i]);
+    poly_add(w, w, &t);
+  }
+}
+
+/*************************************************
+* Name:        polyvecl_chknorm
+*
+* Description: Check infinity norm of polynomials in vector of length L.
+*              Assumes input polyvecl to be reduced by polyvecl_reduce().
+*
+* Arguments:   - const polyvecl *v: pointer to vector
+*              - int32_t B: norm bound
+*
+* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8
+* and 1 otherwise.
+**************************************************/
+int polyvecl_chknorm(const polyvecl *v, int32_t bound)  {
+  unsigned int i;
+
+  for(i = 0; i < L; ++i)
+    if(poly_chknorm(&v->vec[i], bound))
+      return 1;
+
+  return 0;
+}
+
+/**************************************************************/
+/************ Vectors of polynomials of length K **************/
+/**************************************************************/
+
+void polyveck_uniform_eta(polyveck *v, const uint8_t seed[CRHBYTES], uint16_t nonce) {
+  unsigned int i;
+
+  for(i = 0; i < K; ++i)
+    poly_uniform_eta(&v->vec[i], seed, nonce++);
+}
+
+/*************************************************
+* Name:        polyveck_reduce
+*
+* Description: Reduce coefficients of polynomials in vector of length K
+*              to representatives in [-6283008,6283008].
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void polyveck_reduce(polyveck *v) {
+  unsigned int i;
+
+  for(i = 0; i < K; ++i)
+    poly_reduce(&v->vec[i]);
+}
+
+/*************************************************
+* Name:        polyveck_caddq
+*
+* Description: For all coefficients of polynomials in vector of length K
+*              add Q if coefficient is negative.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void polyveck_caddq(polyveck *v) {
+  unsigned int i;
+
+  for(i = 0; i < K; ++i)
+    poly_caddq(&v->vec[i]);
+}
+
+/*************************************************
+* Name:        polyveck_add
+*
+* Description: Add vectors of polynomials of length K.
+*              No modular reduction is performed.
+*
+* Arguments:   - polyveck *w: pointer to output vector
+*              - const polyveck *u: pointer to first summand
+*              - const polyveck *v: pointer to second summand
+**************************************************/
+void polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
+  unsigned int i;
+
+  for(i = 0; i < K; ++i)
+    poly_add(&w->vec[i], &u->vec[i], &v->vec[i]);
+}
+
+/*************************************************
+* Name:        polyveck_sub
+*
+* Description: Subtract vectors of polynomials of length K.
+*              No modular reduction is performed.
+*
+* Arguments:   - polyveck *w: pointer to output vector
+*              - const polyveck *u: pointer to first input vector
+*              - const polyveck *v: pointer to second input vector to be
+*                                   subtracted from first input vector
+**************************************************/
+void polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
+  unsigned int i;
+
+  for(i = 0; i < K; ++i)
+    poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]);
+}
+
+/*************************************************
+* Name:        polyveck_shiftl
+*
+* Description: Multiply vector of polynomials of Length K by 2^D without modular
+*              reduction. Assumes input coefficients to be less than 2^{31-D}.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void polyveck_shiftl(polyveck *v) {
+  unsigned int i;
+
+  for(i = 0; i < K; ++i)
+    poly_shiftl(&v->vec[i]);
+}
+
+/*************************************************
+* Name:        polyveck_ntt
+*
+* Description: Forward NTT of all polynomials in vector of length K. Output
+*              coefficients can be up to 16*Q larger than input coefficients.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void polyveck_ntt(polyveck *v) {
+  unsigned int i;
+
+  for(i = 0; i < K; ++i)
+    poly_ntt(&v->vec[i]);
+}
+
+/*************************************************
+* Name:        polyveck_invntt_tomont
+*
+* Description: Inverse NTT and multiplication by 2^{32} of polynomials
+*              in vector of length K. Input coefficients need to be less
+*              than 2*Q.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void polyveck_invntt_tomont(polyveck *v) {
+  unsigned int i;
+
+  for(i = 0; i < K; ++i)
+    poly_invntt_tomont(&v->vec[i]);
+}
+
+void polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
+  unsigned int i;
+
+  for(i = 0; i < K; ++i)
+    poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]);
+}
+
+
+/*************************************************
+* Name:        polyveck_chknorm
+*
+* Description: Check infinity norm of polynomials in vector of length K.
+*              Assumes input polyveck to be reduced by polyveck_reduce().
+*
+* Arguments:   - const polyveck *v: pointer to vector
+*              - int32_t B: norm bound
+*
+* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8
+* and 1 otherwise.
+**************************************************/
+int polyveck_chknorm(const polyveck *v, int32_t bound) {
+  unsigned int i;
+
+  for(i = 0; i < K; ++i)
+    if(poly_chknorm(&v->vec[i], bound))
+      return 1;
+
+  return 0;
+}
+
+/*************************************************
+* Name:        polyveck_power2round
+*
+* Description: For all coefficients a of polynomials in vector of length K,
+*              compute a0, a1 such that a mod^+ Q = a1*2^D + a0
+*              with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be
+*              standard representatives.
+*
+* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
+*                              coefficients a1
+*              - polyveck *v0: pointer to output vector of polynomials with
+*                              coefficients a0
+*              - const polyveck *v: pointer to input vector
+**************************************************/
+void polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
+  unsigned int i;
+
+  for(i = 0; i < K; ++i)
+    poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]);
+}
+
+/*************************************************
+* Name:        polyveck_decompose
+*
+* Description: For all coefficients a of polynomials in vector of length K,
+*              compute high and low bits a0, a1 such a mod^+ Q = a1*ALPHA + a0
+*              with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we
+*              set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0.
+*              Assumes coefficients to be standard representatives.
+*
+* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
+*                              coefficients a1
+*              - polyveck *v0: pointer to output vector of polynomials with
+*                              coefficients a0
+*              - const polyveck *v: pointer to input vector
+**************************************************/
+void polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
+  unsigned int i;
+
+  for(i = 0; i < K; ++i)
+    poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]);
+}
+
+/*************************************************
+* Name:        polyveck_make_hint
+*
+* Description: Compute hint vector.
+*
+* Arguments:   - polyveck *h: pointer to output vector
+*              - const polyveck *v0: pointer to low part of input vector
+*              - const polyveck *v1: pointer to high part of input vector
+*
+* Returns number of 1 bits.
+**************************************************/
+unsigned int polyveck_make_hint(polyveck *h,
+                                const polyveck *v0,
+                                const polyveck *v1)
+{
+  unsigned int i, s = 0;
+
+  for(i = 0; i < K; ++i)
+    s += poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]);
+
+  return s;
+}
+
+/*************************************************
+* Name:        polyveck_use_hint
+*
+* Description: Use hint vector to correct the high bits of input vector.
+*
+* Arguments:   - polyveck *w: pointer to output vector of polynomials with
+*                             corrected high bits
+*              - const polyveck *u: pointer to input vector
+*              - const polyveck *h: pointer to input hint vector
+**************************************************/
+void polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) {
+  unsigned int i;
+
+  for(i = 0; i < K; ++i)
+    poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]);
+}
+
+void polyveck_pack_w1(uint8_t r[K*POLYW1_PACKEDBYTES], const polyveck *w1) {
+  unsigned int i;
+
+  for(i = 0; i < K; ++i)
+    polyw1_pack(&r[i*POLYW1_PACKEDBYTES], &w1->vec[i]);
+}
+/*************** dilithium/ref/reduce.c */
+
+/*************************************************
+* Name:        montgomery_reduce
+*
+* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31,
+*              compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q.
+*
+* Arguments:   - int64_t: finite field element a
+*
+* Returns r.
+**************************************************/
+int32_t montgomery_reduce(int64_t a) {
+  int32_t t;
+
+  t = (int64_t)(int32_t)a*QINV;
+  t = (a - (int64_t)t*Q) >> 32;
+  return t;
+}
+
+/*************************************************
+* Name:        reduce32
+*
+* Description: For finite field element a with a <= 2^{31} - 2^{22} - 1,
+*              compute r \equiv a (mod Q) such that -6283008 <= r <= 6283008.
+*
+* Arguments:   - int32_t: finite field element a
+*
+* Returns r.
+**************************************************/
+int32_t reduce32(int32_t a) {
+  int32_t t;
+
+  t = (a + (1 << 22)) >> 23;
+  t = a - t*Q;
+  return t;
+}
+
+/*************************************************
+* Name:        caddq
+*
+* Description: Add Q if input coefficient is negative.
+*
+* Arguments:   - int32_t: finite field element a
+*
+* Returns r.
+**************************************************/
+int32_t caddq(int32_t a) {
+  a += (a >> 31) & Q;
+  return a;
+}
+
+/*************************************************
+* Name:        freeze
+*
+* Description: For finite field element a, compute standard
+*              representative r = a mod^+ Q.
+*
+* Arguments:   - int32_t: finite field element a
+*
+* Returns r.
+**************************************************/
+int32_t freeze(int32_t a) {
+  a = reduce32(a);
+  a = caddq(a);
+  return a;
+}
+/*************** dilithium/ref/rounding.c */
+
+/*************************************************
+* Name:        power2round
+*
+* Description: For finite field element a, compute a0, a1 such that
+*              a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
+*              Assumes a to be standard representative.
+*
+* Arguments:   - int32_t a: input element
+*              - int32_t *a0: pointer to output element a0
+*
+* Returns a1.
+**************************************************/
+int32_t power2round(int32_t *a0, int32_t a)  {
+  int32_t a1;
+
+  a1 = (a + (1 << (D-1)) - 1) >> D;
+  *a0 = a - (a1 << D);
+  return a1;
+}
+
+/*************************************************
+* Name:        decompose
+*
+* Description: For finite field element a, compute high and low bits a0, a1 such
+*              that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except
+*              if a1 = (Q-1)/ALPHA where we set a1 = 0 and
+*              -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. Assumes a to be standard
+*              representative.
+*
+* Arguments:   - int32_t a: input element
+*              - int32_t *a0: pointer to output element a0
+*
+* Returns a1.
+**************************************************/
+int32_t decompose(int32_t *a0, int32_t a) {
+  int32_t a1;
+
+  a1  = (a + 127) >> 7;
+#if GAMMA2 == (Q-1)/32
+  a1  = (a1*1025 + (1 << 21)) >> 22;
+  a1 &= 15;
+#elif GAMMA2 == (Q-1)/88
+  a1  = (a1*11275 + (1 << 23)) >> 24;
+  a1 ^= ((43 - a1) >> 31) & a1;
+#endif
+
+  *a0  = a - a1*2*GAMMA2;
+  *a0 -= (((Q-1)/2 - *a0) >> 31) & Q;
+  return a1;
+}
+
+/*************************************************
+* Name:        make_hint
+*
+* Description: Compute hint bit indicating whether the low bits of the
+*              input element overflow into the high bits.
+*
+* Arguments:   - int32_t a0: low bits of input element
+*              - int32_t a1: high bits of input element
+*
+* Returns 1 if overflow.
+**************************************************/
+unsigned int make_hint(int32_t a0, int32_t a1) {
+  if(a0 > GAMMA2 || a0 < -GAMMA2 || (a0 == -GAMMA2 && a1 != 0))
+    return 1;
+
+  return 0;
+}
+
+/*************************************************
+* Name:        use_hint
+*
+* Description: Correct high bits according to hint.
+*
+* Arguments:   - int32_t a: input element
+*              - unsigned int hint: hint bit
+*
+* Returns corrected high bits.
+**************************************************/
+int32_t use_hint(int32_t a, unsigned int hint) {
+  int32_t a0, a1;
+
+  a1 = decompose(&a0, a);
+  if(hint == 0)
+    return a1;
+
+#if GAMMA2 == (Q-1)/32
+  if(a0 > 0)
+    return (a1 + 1) & 15;
+  else
+    return (a1 - 1) & 15;
+#elif GAMMA2 == (Q-1)/88
+  if(a0 > 0)
+    return (a1 == 43) ?  0 : a1 + 1;
+  else
+    return (a1 ==  0) ? 43 : a1 - 1;
+#endif
+}
+/*************** dilithium/ref/sign.c */
+
+/*************************************************
+* Name:        crypto_sign_keypair
+*
+* Description: Generates public and private key.
+*
+* Arguments:   - uint8_t *pk: pointer to output public key (allocated
+*                             array of CRYPTO_PUBLICKEYBYTES bytes)
+*              - uint8_t *sk: pointer to output private key (allocated
+*                             array of CRYPTO_SECRETKEYBYTES bytes)
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
+  uint8_t seedbuf[2*SEEDBYTES + CRHBYTES];
+  uint8_t tr[TRBYTES];
+  const uint8_t *rho, *rhoprime, *key;
+  polyvecl mat[K];
+  polyvecl s1, s1hat;
+  polyveck s2, t1, t0;
+
+  /* Get randomness for rho, rhoprime and key */
+  randombytes(seedbuf, SEEDBYTES);
+  seedbuf[SEEDBYTES+0] = K;
+  seedbuf[SEEDBYTES+1] = L;
+  shake256(seedbuf, 2*SEEDBYTES + CRHBYTES, seedbuf, SEEDBYTES+2);
+  rho = seedbuf;
+  rhoprime = rho + SEEDBYTES;
+  key = rhoprime + CRHBYTES;
+
+  /* Expand matrix */
+  polyvec_matrix_expand(mat, rho);
+
+  /* Sample short vectors s1 and s2 */
+  polyvecl_uniform_eta(&s1, rhoprime, 0);
+  polyveck_uniform_eta(&s2, rhoprime, L);
+
+  /* Matrix-vector multiplication */
+  s1hat = s1;
+  polyvecl_ntt(&s1hat);
+  polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat);
+  polyveck_reduce(&t1);
+  polyveck_invntt_tomont(&t1);
+
+  /* Add error vector s2 */
+  polyveck_add(&t1, &t1, &s2);
+
+  /* Extract t1 and write public key */
+  polyveck_caddq(&t1);
+  polyveck_power2round(&t1, &t0, &t1);
+  pack_pk(pk, rho, &t1);
+
+  /* Compute H(rho, t1) and write secret key */
+  shake256(tr, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES);
+  pack_sk(sk, rho, tr, key, &t0, &s1, &s2);
+
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_sign_signature_internal
+*
+* Description: Computes signature. Internal API.
+*
+* Arguments:   - uint8_t *sig:   pointer to output signature (of length CRYPTO_BYTES)
+*              - size_t *siglen: pointer to output length of signature
+*              - uint8_t *m:     pointer to message to be signed
+*              - size_t mlen:    length of message
+*              - uint8_t *pre:   pointer to prefix string
+*              - size_t prelen:  length of prefix string
+*              - uint8_t *rnd:   pointer to random seed
+*              - uint8_t *sk:    pointer to bit-packed secret key
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_sign_signature_internal(uint8_t *sig,
+                                   size_t *siglen,
+                                   const uint8_t *m,
+                                   size_t mlen,
+                                   const uint8_t *pre,
+                                   size_t prelen,
+                                   const uint8_t rnd[RNDBYTES],
+                                   const uint8_t *sk)
+{
+  unsigned int n;
+  uint8_t seedbuf[2*SEEDBYTES + TRBYTES + 2*CRHBYTES];
+  uint8_t *rho, *tr, *key, *mu, *rhoprime;
+  uint16_t nonce = 0;
+  polyvecl mat[K], s1, y, z;
+  polyveck t0, s2, w1, w0, h;
+  poly cp;
+  keccak_state state;
+
+  rho = seedbuf;
+  tr = rho + SEEDBYTES;
+  key = tr + TRBYTES;
+  mu = key + SEEDBYTES;
+  rhoprime = mu + CRHBYTES;
+  unpack_sk(rho, tr, key, &t0, &s1, &s2, sk);
+
+  /* Compute mu = CRH(tr, pre, msg) */
+  shake256_init(&state);
+  shake256_absorb(&state, tr, TRBYTES);
+  shake256_absorb(&state, pre, prelen);
+  shake256_absorb(&state, m, mlen);
+  shake256_finalize(&state);
+  shake256_squeeze(mu, CRHBYTES, &state);
+
+  /* Compute rhoprime = CRH(key, rnd, mu) */
+  shake256_init(&state);
+  shake256_absorb(&state, key, SEEDBYTES);
+  shake256_absorb(&state, rnd, RNDBYTES);
+  shake256_absorb(&state, mu, CRHBYTES);
+  shake256_finalize(&state);
+  shake256_squeeze(rhoprime, CRHBYTES, &state);
+
+  /* Expand matrix and transform vectors */
+  polyvec_matrix_expand(mat, rho);
+  polyvecl_ntt(&s1);
+  polyveck_ntt(&s2);
+  polyveck_ntt(&t0);
+
+rej:
+  /* Sample intermediate vector y */
+  polyvecl_uniform_gamma1(&y, rhoprime, nonce++);
+
+  /* Matrix-vector multiplication */
+  z = y;
+  polyvecl_ntt(&z);
+  polyvec_matrix_pointwise_montgomery(&w1, mat, &z);
+  polyveck_reduce(&w1);
+  polyveck_invntt_tomont(&w1);
+
+  /* Decompose w and call the random oracle */
+  polyveck_caddq(&w1);
+  polyveck_decompose(&w1, &w0, &w1);
+  polyveck_pack_w1(sig, &w1);
+
+  shake256_init(&state);
+  shake256_absorb(&state, mu, CRHBYTES);
+  shake256_absorb(&state, sig, K*POLYW1_PACKEDBYTES);
+  shake256_finalize(&state);
+  shake256_squeeze(sig, CTILDEBYTES, &state);
+  poly_challenge(&cp, sig);
+  poly_ntt(&cp);
+
+  /* Compute z, reject if it reveals secret */
+  polyvecl_pointwise_poly_montgomery(&z, &cp, &s1);
+  polyvecl_invntt_tomont(&z);
+  polyvecl_add(&z, &z, &y);
+  polyvecl_reduce(&z);
+  if(polyvecl_chknorm(&z, GAMMA1 - BETA))
+    goto rej;
+
+  /* Check that subtracting cs2 does not change high bits of w and low bits
+   * do not reveal secret information */
+  polyveck_pointwise_poly_montgomery(&h, &cp, &s2);
+  polyveck_invntt_tomont(&h);
+  polyveck_sub(&w0, &w0, &h);
+  polyveck_reduce(&w0);
+  if(polyveck_chknorm(&w0, GAMMA2 - BETA))
+    goto rej;
+
+  /* Compute hints for w1 */
+  polyveck_pointwise_poly_montgomery(&h, &cp, &t0);
+  polyveck_invntt_tomont(&h);
+  polyveck_reduce(&h);
+  if(polyveck_chknorm(&h, GAMMA2))
+    goto rej;
+
+  polyveck_add(&w0, &w0, &h);
+  n = polyveck_make_hint(&h, &w0, &w1);
+  if(n > OMEGA)
+    goto rej;
+
+  /* Write signature */
+  pack_sig(sig, sig, &z, &h);
+  *siglen = CRYPTO_BYTES;
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_sign_signature
+*
+* Description: Computes signature.
+*
+* Arguments:   - uint8_t *sig:   pointer to output signature (of length CRYPTO_BYTES)
+*              - size_t *siglen: pointer to output length of signature
+*              - uint8_t *m:     pointer to message to be signed
+*              - size_t mlen:    length of message
+*              - uint8_t *ctx:   pointer to contex string
+*              - size_t ctxlen:  length of contex string
+*              - uint8_t *sk:    pointer to bit-packed secret key
+*
+* Returns 0 (success) or -1 (context string too long)
+**************************************************/
+int crypto_sign_signature(uint8_t *sig,
+                          size_t *siglen,
+                          const uint8_t *m,
+                          size_t mlen,
+                          const uint8_t *ctx,
+                          size_t ctxlen,
+                          const uint8_t *sk)
+{
+  size_t i;
+  uint8_t pre[257];
+  uint8_t rnd[RNDBYTES];
+
+  if(ctxlen > 255)
+    return -1;
+
+  /* Prepare pre = (0, ctxlen, ctx) */
+  pre[0] = 0;
+  pre[1] = ctxlen;
+  for(i = 0; i < ctxlen; i++)
+    pre[2 + i] = ctx[i];
+
+#ifdef DILITHIUM_RANDOMIZED_SIGNING
+  randombytes(rnd, RNDBYTES);
+#else
+  for(i=0;i<RNDBYTES;i++)
+    rnd[i] = 0;
+#endif
+
+  crypto_sign_signature_internal(sig,siglen,m,mlen,pre,2+ctxlen,rnd,sk);
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_sign
+*
+* Description: Compute signed message.
+*
+* Arguments:   - uint8_t *sm: pointer to output signed message (allocated
+*                             array with CRYPTO_BYTES + mlen bytes),
+*                             can be equal to m
+*              - size_t *smlen: pointer to output length of signed
+*                               message
+*              - const uint8_t *m: pointer to message to be signed
+*              - size_t mlen: length of message
+*              - const uint8_t *ctx: pointer to context string
+*              - size_t ctxlen: length of context string
+*              - const uint8_t *sk: pointer to bit-packed secret key
+*
+* Returns 0 (success) or -1 (context string too long)
+**************************************************/
+int crypto_sign(uint8_t *sm,
+                size_t *smlen,
+                const uint8_t *m,
+                size_t mlen,
+                const uint8_t *ctx,
+                size_t ctxlen,
+                const uint8_t *sk)
+{
+  int ret;
+  size_t i;
+
+  for(i = 0; i < mlen; ++i)
+    sm[CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i];
+  ret = crypto_sign_signature(sm, smlen, sm + CRYPTO_BYTES, mlen, ctx, ctxlen, sk);
+  *smlen += mlen;
+  return ret;
+}
+
+/*************************************************
+* Name:        crypto_sign_verify_internal
+*
+* Description: Verifies signature. Internal API.
+*
+* Arguments:   - uint8_t *m: pointer to input signature
+*              - size_t siglen: length of signature
+*              - const uint8_t *m: pointer to message
+*              - size_t mlen: length of message
+*              - const uint8_t *pre: pointer to prefix string
+*              - size_t prelen: length of prefix string
+*              - const uint8_t *pk: pointer to bit-packed public key
+*
+* Returns 0 if signature could be verified correctly and -1 otherwise
+**************************************************/
+int crypto_sign_verify_internal(const uint8_t *sig,
+                                size_t siglen,
+                                const uint8_t *m,
+                                size_t mlen,
+                                const uint8_t *pre,
+                                size_t prelen,
+                                const uint8_t *pk)
+{
+  unsigned int i;
+  uint8_t buf[K*POLYW1_PACKEDBYTES];
+  uint8_t rho[SEEDBYTES];
+  uint8_t mu[CRHBYTES];
+  uint8_t c[CTILDEBYTES];
+  uint8_t c2[CTILDEBYTES];
+  poly cp;
+  polyvecl mat[K], z;
+  polyveck t1, w1, h;
+  keccak_state state;
+
+  if(siglen != CRYPTO_BYTES)
+    return -1;
+
+  unpack_pk(rho, &t1, pk);
+  if(unpack_sig(c, &z, &h, sig))
+    return -1;
+  if(polyvecl_chknorm(&z, GAMMA1 - BETA))
+    return -1;
+
+  /* Compute CRH(H(rho, t1), pre, msg) */
+  shake256(mu, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES);
+  shake256_init(&state);
+  shake256_absorb(&state, mu, TRBYTES);
+  shake256_absorb(&state, pre, prelen);
+  shake256_absorb(&state, m, mlen);
+  shake256_finalize(&state);
+  shake256_squeeze(mu, CRHBYTES, &state);
+
+  /* Matrix-vector multiplication; compute Az - c2^dt1 */
+  poly_challenge(&cp, c);
+  polyvec_matrix_expand(mat, rho);
+
+  polyvecl_ntt(&z);
+  polyvec_matrix_pointwise_montgomery(&w1, mat, &z);
+
+  poly_ntt(&cp);
+  polyveck_shiftl(&t1);
+  polyveck_ntt(&t1);
+  polyveck_pointwise_poly_montgomery(&t1, &cp, &t1);
+
+  polyveck_sub(&w1, &w1, &t1);
+  polyveck_reduce(&w1);
+  polyveck_invntt_tomont(&w1);
+
+  /* Reconstruct w1 */
+  polyveck_caddq(&w1);
+  polyveck_use_hint(&w1, &w1, &h);
+  polyveck_pack_w1(buf, &w1);
+
+  /* Call random oracle and verify challenge */
+  shake256_init(&state);
+  shake256_absorb(&state, mu, CRHBYTES);
+  shake256_absorb(&state, buf, K*POLYW1_PACKEDBYTES);
+  shake256_finalize(&state);
+  shake256_squeeze(c2, CTILDEBYTES, &state);
+  for(i = 0; i < CTILDEBYTES; ++i)
+    if(c[i] != c2[i])
+      return -1;
+
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_sign_verify
+*
+* Description: Verifies signature.
+*
+* Arguments:   - uint8_t *m: pointer to input signature
+*              - size_t siglen: length of signature
+*              - const uint8_t *m: pointer to message
+*              - size_t mlen: length of message
+*              - const uint8_t *ctx: pointer to context string
+*              - size_t ctxlen: length of context string
+*              - const uint8_t *pk: pointer to bit-packed public key
+*
+* Returns 0 if signature could be verified correctly and -1 otherwise
+**************************************************/
+int crypto_sign_verify(const uint8_t *sig,
+                       size_t siglen,
+                       const uint8_t *m,
+                       size_t mlen,
+                       const uint8_t *ctx,
+                       size_t ctxlen,
+                       const uint8_t *pk)
+{
+  size_t i;
+  uint8_t pre[257];
+
+  if(ctxlen > 255)
+    return -1;
+
+  pre[0] = 0;
+  pre[1] = ctxlen;
+  for(i = 0; i < ctxlen; i++)
+    pre[2 + i] = ctx[i];
+
+  return crypto_sign_verify_internal(sig,siglen,m,mlen,pre,2+ctxlen,pk);
+}
+
+/*************************************************
+* Name:        crypto_sign_open
+*
+* Description: Verify signed message.
+*
+* Arguments:   - uint8_t *m: pointer to output message (allocated
+*                            array with smlen bytes), can be equal to sm
+*              - size_t *mlen: pointer to output length of message
+*              - const uint8_t *sm: pointer to signed message
+*              - size_t smlen: length of signed message
+*              - const uint8_t *ctx: pointer to context tring
+*              - size_t ctxlen: length of context string
+*              - const uint8_t *pk: pointer to bit-packed public key
+*
+* Returns 0 if signed message could be verified correctly and -1 otherwise
+**************************************************/
+int crypto_sign_open(uint8_t *m,
+                     size_t *mlen,
+                     const uint8_t *sm,
+                     size_t smlen,
+                     const uint8_t *ctx,
+                     size_t ctxlen,
+                     const uint8_t *pk)
+{
+  size_t i;
+
+  if(smlen < CRYPTO_BYTES)
+    goto badsig;
+
+  *mlen = smlen - CRYPTO_BYTES;
+  if(crypto_sign_verify(sm, CRYPTO_BYTES, sm + CRYPTO_BYTES, *mlen, ctx, ctxlen, pk))
+    goto badsig;
+  else {
+    /* All good, copy msg, return 0 */
+    for(i = 0; i < *mlen; ++i)
+      m[i] = sm[CRYPTO_BYTES + i];
+    return 0;
+  }
+
+badsig:
+  /* Signature verification failed */
+  *mlen = 0;
+  for(i = 0; i < smlen; ++i)
+    m[i] = 0;
+
+  return -1;
+}
+/*************** dilithium/ref/symmetric-shake.c */
+
+void dilithium_shake128_stream_init(keccak_state *state, const uint8_t seed[SEEDBYTES], uint16_t nonce)
+{
+  uint8_t t[2];
+  t[0] = nonce;
+  t[1] = nonce >> 8;
+
+  shake128_init(state);
+  shake128_absorb(state, seed, SEEDBYTES);
+  shake128_absorb(state, t, 2);
+  shake128_finalize(state);
+}
+
+void dilithium_shake256_stream_init(keccak_state *state, const uint8_t seed[CRHBYTES], uint16_t nonce)
+{
+  uint8_t t[2];
+  t[0] = nonce;
+  t[1] = nonce >> 8;
+
+  shake256_init(state);
+  shake256_absorb(state, seed, CRHBYTES);
+  shake256_absorb(state, t, 2);
+  shake256_finalize(state);
+}

_______________________________________________
Gcrypt-devel mailing list
[email protected]
https://lists.gnupg.org/mailman/listinfo/gcrypt-devel

[PATCH 01/17] cipher: Put the original Dilithium implementation.

Reply via email to