Re: ppc64: v3: AES/GCM Performance improvement with stitched implementation

2023-12-18 Thread Danny Tsen
Hi Niels,

Here is another revised patch, tested with NETTLE_FAT_OVERRIDE.  Same 
performance as the last version.  I also added a new 917-byte test vector for the 
AES/GCM tests, to exercise multiple blocks plus a partial block.  Attached are the 
patch and the AES benchmark.

Thanks.
-Danny


On Dec 12, 2023, at 9:01 AM, Danny Tsen  wrote:



On Dec 11, 2023, at 10:32 AM, Niels Möller  wrote:

Danny Tsen  writes:

Here is version 2 of the AES/GCM stitched patch. The stitched code is
all in assembly, and m4 macros are used. The overall performance
improved by around 110% and 120% for encrypt and decrypt respectively.
Please see the attached patch and AES benchmark.

Thanks, comments below.

--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -226,6 +231,8 @@ fat_init (void)
_nettle_ghash_update_arm64() */
 _nettle_ghash_set_key_vec = _nettle_ghash_set_key_ppc64;
 _nettle_ghash_update_vec = _nettle_ghash_update_ppc64;
+  _nettle_ppc_gcm_aes_encrypt_vec = _nettle_ppc_gcm_aes_encrypt_ppc64;
+  _nettle_ppc_gcm_aes_decrypt_vec = _nettle_ppc_gcm_aes_decrypt_ppc64;
   }
 else
   {

Fat setup is a bit tricky; here it looks like
_nettle_ppc_gcm_aes_decrypt_vec is left undefined by the else clause. I
would suspect that breaks when the extensions aren't available. You can
test that with NETTLE_FAT_OVERRIDE=none.
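
A minimal sketch of the corresponding else branch, mirroring how the ghash
pointers are already handled there; the two *_c stubs for the stitched
functions are hypothetical and would have to be added (e.g. as fallbacks that
report having processed nothing):

else
  {
    _nettle_ghash_set_key_vec = _nettle_ghash_set_key_c;
    _nettle_ghash_update_vec = _nettle_ghash_update_c;
    /* Hypothetical plain-C fallbacks, so nothing is left undefined
       when the crypto extensions are unavailable. */
    _nettle_ppc_gcm_aes_encrypt_vec = _nettle_ppc_gcm_aes_encrypt_c;
    _nettle_ppc_gcm_aes_decrypt_vec = _nettle_ppc_gcm_aes_decrypt_c;
  }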

Sure.  I’ll test that and fix it.


gcm_aes128_encrypt(struct gcm_aes128_ctx *ctx,
size_t length, uint8_t *dst, const uint8_t *src)
{
+#if defined(HAVE_NATIVE_AES_GCM_STITCH)
+  if (length >= 128) {
+    PPC_GCM_CRYPT(1, _AES128_ROUNDS, ctx, length, dst, src);
+    if (length == 0) {
+      return;
+    }
+  }
+#endif /* HAVE_NATIVE_AES_GCM_STITCH */
+
 GCM_ENCRYPT(ctx, aes128_encrypt, length, dst, src);
}

In a non-fat build, it's fine to use a compile-time #if to select whether the
optimized code should be called. But in a fat build, we'd need a valid
function in all cases, doing different things depending on the
runtime fat initialization. One could do that with two versions of
gcm_aes128_encrypt (which is likely preferable if we do something
similar for other archs that have separate assembly for aes128, aes192,
etc). Or we would need to call some function unconditionally, which
would be a nop if the extensions are not available. E.g., do something
like

#if HAVE_NATIVE_fat_aes_gcm_encrypt
void
gcm_aes128_encrypt(struct gcm_aes128_ctx *ctx,
 size_t length, uint8_t *dst, const uint8_t *src)
{
  size_t done = _gcm_aes_encrypt (&ctx->key, &ctx->gcm.x, &ctx->gcm.ctr,
                                  _AES128_ROUNDS, &ctx->cipher.keys,
                                  length, dst, src);
  ctx->gcm.data_size += done;
  length -= done;
  if (length > 0)
    {
      src += done;
      dst += done;
      GCM_ENCRYPT(ctx, aes128_encrypt, length, dst, src);
    }
}
#endif

where the C-implementation of _gcm_aes_encrypt could just return 0.
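
A sketch of that fallback, with the argument list only guessed from the call
above (not a settled signature):

/* Generic C fallback: report that nothing was processed, so the caller
   handles the whole buffer with the plain GCM_ENCRYPT path. */
size_t
_gcm_aes_encrypt (const struct gcm_key *key, union nettle_block16 *x,
                  union nettle_block16 *ctr, unsigned rounds,
                  const uint32_t *keys,
                  size_t length, uint8_t *dst, const uint8_t *src)
{
  return 0;
}

Then fat and non-fat builds can share the same gcm_aes128_encrypt body, and an
arch-specific implementation just returns however many bytes it chose to
handle.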

And it's preferable that the same interface could be used on other archs,
even if they don't do 8 blocks at a time like your ppc code.

--- a/gcm.h
+++ b/gcm.h
@@ -195,6 +195,47 @@ gcm_digest(struct gcm_ctx *ctx, const struct gcm_key *key,
  (nettle_cipher_func *) (encrypt), \
  (length), (digest)))

+#if defined(HAVE_NATIVE_AES_GCM_STITCH)
+#define _ppc_gcm_aes_encrypt _nettle_ppc_gcm_aes_encrypt
+#define _ppc_gcm_aes_decrypt _nettle_ppc_gcm_aes_decrypt
+void
+_ppc_gcm_aes_encrypt (void *ctx, size_t rounds, uint8_t *ctr,
+  size_t len, uint8_t *dst, const uint8_t *src);
+void
+_ppc_gcm_aes_decrypt (void *ctx, size_t rounds, uint8_t *ctr,
+  size_t len, uint8_t *dst, const uint8_t *src);
+struct ppc_gcm_aes_context {
+  uint8_t *x;
+  uint8_t *htable;
+  struct aes128_ctx *rkeys;
+};
+#define GET_PPC_CTX(gcm_aes_ctx, ctx, key, cipher) \
+  { \
+    (gcm_aes_ctx)->x = (uint8_t *) &(ctx)->x; \
+    (gcm_aes_ctx)->htable = (uint8_t *) (key); \
+    (gcm_aes_ctx)->rkeys = (struct aes128_ctx *) (cipher)->keys; \
+  }
+
+#define PPC_GCM_CRYPT(encrypt, rounds, ctx, length, dst, src) \
+  { \
+    size_t rem_len = 0; \
+    struct ppc_gcm_aes_context c; \
+    struct gcm_ctx *gctx = &(ctx)->gcm; \
+    GET_PPC_CTX(&c, gctx, &(ctx)->key, &(ctx)->cipher); \
+    if ((encrypt)) { \
+      _ppc_gcm_aes_encrypt(&c, (rounds), (&(ctx)->gcm)->ctr.b, (length), (dst), (src)); \
+    } else { \
+      _ppc_gcm_aes_decrypt(&c, (rounds), (&(ctx)->gcm)->ctr.b, (length), (dst), (src)); \
+    } \
+    rem_len = (length) % (GCM_BLOCK_SIZE * 8); \
+    (length) -= rem_len; \
+    gctx->data_size += (length); \
+    (dst) += (length); \
+    (src) += (length); \
+    (length) = rem_len; \
+  }
+#endif /* HAVE_NATIVE_AES_GCM_STITCH */

This looks a little awkward. I think it would be better to pass the
various pointers needed by the assembly implementation as separate
(register) arguments. Or pass the pointer to the struct gcm_aesxxx_ctx
directly (with the disadvantage that assembly code needs to know
corresponding offsets).
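
As a rough illustration of the first option, the prototype might look
something like this (the argument list is only a guess at what the assembly
needs; with the ELFv2 ABI, eight integer/pointer arguments all still arrive in
registers r3-r10):

/* Hypothetical "separate arguments" prototype: the assembly receives every
   pointer it needs explicitly and never has to know a context struct layout. */
void
_nettle_ppc_gcm_aes_encrypt (const uint32_t *rkeys, unsigned rounds,
                             const struct gcm_key *htable,
                             union nettle_block16 *x, union nettle_block16 *ctr,
                             size_t length, uint8_t *dst, const uint8_t *src);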


I’ll see which is the better option.

--- a/powerpc64/machine.m4
+++ b/powerpc64/machine.m4
@@ -63,3 +63,40 @@ C INC_VR(VR, INC)

Re: ppc64: v2, AES/GCM Performance improvement with stitched implementation

2023-12-18 Thread Danny Tsen
Correction: 719-byte test vector.


On Dec 12, 2023, at 9:01 AM, Danny Tsen  wrote:



On Dec 11, 2023, at 10:32 AM, Niels Möller  wrote:

Danny Tsen  writes:

Here is version 2 of the AES/GCM stitched patch. The stitched code is
all in assembly, and m4 macros are used. The overall performance
improved by around 110% and 120% for encrypt and decrypt respectively.
Please see the attached patch and AES benchmark.

Thanks, comments below.

--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -226,6 +231,8 @@ fat_init (void)
_nettle_ghash_update_arm64() */
 _nettle_ghash_set_key_vec = _nettle_ghash_set_key_ppc64;
 _nettle_ghash_update_vec = _nettle_ghash_update_ppc64;
+  _nettle_ppc_gcm_aes_encrypt_vec = _nettle_ppc_gcm_aes_encrypt_ppc64;
+  _nettle_ppc_gcm_aes_decrypt_vec = _nettle_ppc_gcm_aes_decrypt_ppc64;
   }
 else
   {

Fat setup is a bit tricky; here it looks like
_nettle_ppc_gcm_aes_decrypt_vec is left undefined by the else clause. I
would suspect that breaks when the extensions aren't available. You can
test that with NETTLE_FAT_OVERRIDE=none.

Sure.  I’ll test that and fix it.


gcm_aes128_encrypt(struct gcm_aes128_ctx *ctx,
size_t length, uint8_t *dst, const uint8_t *src)
{
+#if defined(HAVE_NATIVE_AES_GCM_STITCH)
+  if (length >= 128) {
+    PPC_GCM_CRYPT(1, _AES128_ROUNDS, ctx, length, dst, src);
+    if (length == 0) {
+      return;
+    }
+  }
+#endif /* HAVE_NATIVE_AES_GCM_STITCH */
+
 GCM_ENCRYPT(ctx, aes128_encrypt, length, dst, src);
}

In a non-fat build, it's fine to use a compile-time #if to select whether the
optimized code should be called. But in a fat build, we'd need a valid
function in all cases, doing different things depending on the
runtime fat initialization. One could do that with two versions of
gcm_aes128_encrypt (which is likely preferable if we do something
similar for other archs that have separate assembly for aes128, aes192,
etc). Or we would need to call some function unconditionally, which
would be a nop if the extensions are not available. E.g., do something
like

#if HAVE_NATIVE_fat_aes_gcm_encrypt
void
gcm_aes128_encrypt(struct gcm_aes128_ctx *ctx,
 size_t length, uint8_t *dst, const uint8_t *src)
{
  size_t done = _gcm_aes_encrypt (&ctx->key, &ctx->gcm.x, &ctx->gcm.ctr,
                                  _AES128_ROUNDS, &ctx->cipher.keys,
                                  length, dst, src);
  ctx->gcm.data_size += done;
  length -= done;
  if (length > 0)
    {
      src += done;
      dst += done;
      GCM_ENCRYPT(ctx, aes128_encrypt, length, dst, src);
    }
}
#endif

where the C-implementation of _gcm_aes_encrypt could just return 0.

And it's preferable that the same interface could be used on other archs,
even if they don't do 8 blocks at a time like your ppc code.

--- a/gcm.h
+++ b/gcm.h
@@ -195,6 +195,47 @@ gcm_digest(struct gcm_ctx *ctx, const struct gcm_key *key,
  (nettle_cipher_func *) (encrypt), \
  (length), (digest)))

+#if defined(HAVE_NATIVE_AES_GCM_STITCH)
+#define _ppc_gcm_aes_encrypt _nettle_ppc_gcm_aes_encrypt
+#define _ppc_gcm_aes_decrypt _nettle_ppc_gcm_aes_decrypt
+void
+_ppc_gcm_aes_encrypt (void *ctx, size_t rounds, uint8_t *ctr,
+  size_t len, uint8_t *dst, const uint8_t *src);
+void
+_ppc_gcm_aes_decrypt (void *ctx, size_t rounds, uint8_t *ctr,
+  size_t len, uint8_t *dst, const uint8_t *src);
+struct ppc_gcm_aes_context {
+  uint8_t *x;
+  uint8_t *htable;
+  struct aes128_ctx *rkeys;
+};
+#define GET_PPC_CTX(gcm_aes_ctx, ctx, key, cipher) \
+  { \
+    (gcm_aes_ctx)->x = (uint8_t *) &(ctx)->x; \
+    (gcm_aes_ctx)->htable = (uint8_t *) (key); \
+    (gcm_aes_ctx)->rkeys = (struct aes128_ctx *) (cipher)->keys; \
+  }
+
+#define PPC_GCM_CRYPT(encrypt, rounds, ctx, length, dst, src) \
+  { \
+    size_t rem_len = 0; \
+    struct ppc_gcm_aes_context c; \
+    struct gcm_ctx *gctx = &(ctx)->gcm; \
+    GET_PPC_CTX(&c, gctx, &(ctx)->key, &(ctx)->cipher); \
+    if ((encrypt)) { \
+      _ppc_gcm_aes_encrypt(&c, (rounds), (&(ctx)->gcm)->ctr.b, (length), (dst), (src)); \
+    } else { \
+      _ppc_gcm_aes_decrypt(&c, (rounds), (&(ctx)->gcm)->ctr.b, (length), (dst), (src)); \
+    } \
+    rem_len = (length) % (GCM_BLOCK_SIZE * 8); \
+    (length) -= rem_len; \
+    gctx->data_size += (length); \
+    (dst) += (length); \
+    (src) += (length); \
+    (length) = rem_len; \
+  }
+#endif /* HAVE_NATIVE_AES_GCM_STITCH */

This looks a little awkward. I think it would be better to pass the
various pointers needed by the assembly implementation as separate
(register) arguments. Or pass the pointer to the struct gcm_aesxxx_ctx
directly (with the disadvantage that assembly code needs to know
corresponding offsets).


I’ll see which is the better option.

--- a/powerpc64/machine.m4
+++ b/powerpc64/machine.m4
@@ -63,3 +63,40 @@ C INC_VR(VR, INC)
define(`INC_VR',`ifelse(substr($1,0,1),`v',
``v'eval($2+substr($1,1,len($1)))',
`eval($2+$1)')')
+
+C Adding state and round key 0
+C XOR_4RK0(state, state, rkey0)
+define(`XOR_4RK0',
+  `vxor $1, $1, $5
+   vxor $2, $2, $5
+   vxor $3, $3, $5
+   vxor $4,