[dm-devel] [PATCH v4 4/4] crypto: lrw - Do not use auxiliary buffer

2018-09-14 Thread Ondrej Mosnacek
This patch simplifies the LRW template to recompute the LRW tweaks from
scratch in the second pass and thus also removes the need to allocate a
dynamic buffer using kmalloc().

As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt.
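
The shape of the change can be illustrated with a minimal userspace sketch
(this is not the kernel code: the real per-block tweak is I x Key2 in
GF(2^128) and the middle step is the child ecb(aes) request, so a toy tweak
function and an identity "cipher" stand in here). The point is that the
tweak stream is cheap to regenerate, so nothing has to be cached between
the two passes:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLOCK 16

/* Toy stand-in for the per-block LRW tweak; the real code derives it from
 * the block index and key2 via a GF(2^128) multiplication. */
static void toy_tweak(uint64_t index, uint8_t t[BLOCK])
{
    memset(t, 0x5a, BLOCK);
    memcpy(t, &index, sizeof(index));
}

/* One xor-tweak pass: XOR the tweak of each block into the buffer.  Called
 * once before and once after the (stubbed) ECB step, recomputing the tweaks
 * from scratch instead of storing them in a kmalloc'd buffer. */
static void xor_tweak(uint8_t *buf, size_t len, uint64_t start_index)
{
    uint8_t t[BLOCK];
    size_t off;
    int i;

    for (off = 0; off < len; off += BLOCK) {
        toy_tweak(start_index + off / BLOCK, t);
        for (i = 0; i < BLOCK; i++)
            buf[off + i] ^= t[i];
    }
}

int main(void)
{
    uint8_t msg[3 * BLOCK], orig[3 * BLOCK];
    size_t i;

    for (i = 0; i < sizeof(msg); i++)
        msg[i] = (uint8_t)i;
    memcpy(orig, msg, sizeof(msg));

    xor_tweak(msg, sizeof(msg), 1);  /* first pass:  PP <- P xor T */
    /* ... the child ecb(aes) request would run here (identity stub) ... */
    xor_tweak(msg, sizeof(msg), 1);  /* second pass: C <- CC xor T */

    /* with the identity "cipher" the two passes cancel out exactly */
    printf("round trip %s\n", memcmp(msg, orig, sizeof(msg)) ? "FAILED" : "ok");
    return 0;
}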

PERFORMANCE MEASUREMENTS (x86_64)
Performed using: https://gitlab.com/omos/linux-crypto-bench
Crypto driver used: lrw(ecb-aes-aesni)

The results show that the new code has about the same performance as the
old code. For 512-byte messages it even seems to be slightly faster, but
that might be just noise.

Before:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
lrw(aes)   256      64        200            203
lrw(aes)   320      64        202            204
lrw(aes)   384      64        204            205
lrw(aes)   256      512       415            415
lrw(aes)   320      512       432            440
lrw(aes)   384      512       449            451
lrw(aes)   256      4096      1838           1995
lrw(aes)   320      4096      2123           1980
lrw(aes)   384      4096      2100           2119
lrw(aes)   256      16384     7183           6954
lrw(aes)   320      16384     7844           7631
lrw(aes)   384      16384     8256           8126
lrw(aes)   256      32768     14772          14484
lrw(aes)   320      32768     15281          15431
lrw(aes)   384      32768     16469          16293

After:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
lrw(aes)   256      64        197            196
lrw(aes)   320      64        200            197
lrw(aes)   384      64        203            199
lrw(aes)   256      512       385            380
lrw(aes)   320      512       401            395
lrw(aes)   384      512       415            415
lrw(aes)   256      4096      1869           1846
lrw(aes)   320      4096      2080           1981
lrw(aes)   384      4096      2160           2109
lrw(aes)   256      16384     7077           7127
lrw(aes)   320      16384     7807           7766
lrw(aes)   384      16384     8108           8357
lrw(aes)   256      32768     14111          14454
lrw(aes)   320      32768     15268          15082
lrw(aes)   384      32768     16581          16250

[1] https://lkml.org/lkml/2018/8/23/1315

Signed-off-by: Ondrej Mosnacek 
---
 crypto/lrw.c | 280 ++-
 1 file changed, 51 insertions(+), 229 deletions(-)

diff --git a/crypto/lrw.c b/crypto/lrw.c
index 7377b5b486fd..6fcf0d431185 100644
--- a/crypto/lrw.c
+++ b/crypto/lrw.c
@@ -29,8 +29,6 @@
 #include 
 #include 
 
-#define LRW_BUFFER_SIZE 128u
-
 #define LRW_BLOCK_SIZE 16
 
 struct priv {
@@ -56,19 +54,7 @@ struct priv {
 };
 
 struct rctx {
-   be128 buf[LRW_BUFFER_SIZE / sizeof(be128)];
-
be128 t;
-
-   be128 *ext;
-
-   struct scatterlist srcbuf[2];
-   struct scatterlist dstbuf[2];
-   struct scatterlist *src;
-   struct scatterlist *dst;
-
-   unsigned int left;
-
struct skcipher_request subreq;
 };
 
@@ -152,86 +138,31 @@ static int next_index(u32 *counter)
return 127;
 }
 
-static int post_crypt(struct skcipher_request *req)
+/*
+ * We compute the tweak masks twice (both before and after the ECB encryption or
+ * decryption) to avoid having to allocate a temporary buffer and/or make
+ * multiple calls to the 'ecb(..)' instance, which usually would be slower than
+ * just doing the next_index() calls again.
+ */
+static int xor_tweak(struct skcipher_request *req, bool second_pass)
 {
-   struct rctx *rctx = skcipher_request_ctx(req);
-   be128 *buf = rctx->ext ?: rctx->buf;
-   struct skcipher_request *subreq;
const int bs = LRW_BLOCK_SIZE;
-   struct skcipher_walk w;
-   struct scatterlist *sg;
-   unsigned offset;
-   int err;
-
-   subreq = &rctx->subreq;
-   err = skcipher_walk_virt(&w, subreq, false);
-
-   while (w.nbytes) {
-   unsigned int avail = w.nbytes;
-   be128 *wdst;
-
-   wdst = w.dst.virt.addr;
-
-   do {
-   be128_xor(wdst, buf++, wdst);
-   wdst++;
-   } while ((avail -= bs) >= bs);
-
-   err = skcipher_walk_done(&w, avail);
- 

[dm-devel] [PATCH v4 2/4] crypto: testmgr - Add test for LRW counter wrap-around

2018-09-14 Thread Ondrej Mosnacek
This patch adds a test vector for lrw(aes) that triggers wrap-around of
the counter, which is a tricky corner case.
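
Why 48 bytes with an all-ones IV is the tricky case (a sketch, assuming the
LRW block index is the 128-bit big-endian value of the IV and is incremented
once per 16-byte block): the index wraps from all ones to all zeros between
the first and the second block of the message, which is exactly the path
fixed by patch 1/4.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* 128-bit block index kept as four 32-bit words, least significant first,
 * mirroring the counter layout used by next_index() in patch 3/4. */
static void inc128(uint32_t c[4])
{
    int i;

    for (i = 0; i < 4; i++)
        if (++c[i] != 0)
            break;
}

int main(void)
{
    /* IV ff..ff  =>  block index 2^128 - 1 for the first block */
    uint32_t c[4] = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff };
    int blk;

    for (blk = 0; blk < 3; blk++) {  /* 48-byte message = 3 blocks */
        printf("block %d: index %08" PRIx32 "%08" PRIx32 "%08" PRIx32
               "%08" PRIx32 "\n", blk, c[3], c[2], c[1], c[0]);
        inc128(c);               /* wraps to all zeros after block 0 */
    }
    return 0;
}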

Suggested-by: Eric Biggers 
Signed-off-by: Ondrej Mosnacek 
---
 crypto/testmgr.h | 21 +
 1 file changed, 21 insertions(+)

diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 0b3d7cadbe93..47629cb1efd3 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -13145,6 +13145,27 @@ static const struct cipher_testvec aes_lrw_tv_template[] = {
.ctext  = "\x5b\x90\x8e\xc1\xab\xdd\x67\x5f"
  "\x3d\x69\x8a\x95\x53\xc8\x9c\xe5",
.len= 16,
+   }, { /* Test counter wrap-around, modified from LRW-32-AES 1 */
+   .key= "\x45\x62\xac\x25\xf8\x28\x17\x6d"
+ "\x4c\x26\x84\x14\xb5\x68\x01\x85"
+ "\x25\x8e\x2a\x05\xe7\x3e\x9d\x03"
+ "\xee\x5a\x83\x0c\xcc\x09\x4c\x87",
+   .klen   = 32,
+   .iv = "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff",
+   .ptext  = "\x30\x31\x32\x33\x34\x35\x36\x37"
+ "\x38\x39\x41\x42\x43\x44\x45\x46"
+ "\x30\x31\x32\x33\x34\x35\x36\x37"
+ "\x38\x39\x41\x42\x43\x44\x45\x46"
+ "\x30\x31\x32\x33\x34\x35\x36\x37"
+ "\x38\x39\x41\x42\x43\x44\x45\x46",
+   .ctext  = "\x47\x90\x50\xf6\xf4\x8d\x5c\x7f"
+ "\x84\xc7\x83\x95\x2d\xa2\x02\xc0"
+ "\xda\x7f\xa3\xc0\x88\x2a\x0a\x50"
+ "\xfb\xc1\x78\x03\x39\xfe\x1d\xe5"
+ "\xf1\xb2\x73\xcd\x65\xa3\xdf\x5f"
+ "\xe9\x5d\x48\x92\x54\x63\x4e\xb8",
+   .len= 48,
}, {
 /* http://www.mail-archive.com/stds-p1619@listserv.ieee.org/msg00173.html */
.key= "\xf8\xd4\x76\xff\xd6\x46\xee\x6c"
-- 
2.17.1



[dm-devel] [PATCH v4 3/4] crypto: lrw - Optimize tweak computation

2018-09-14 Thread Ondrej Mosnacek
This patch rewrites the tweak computation to a slightly simpler method
that performs fewer bswaps. Based on performance measurements, the new
code seems to provide slightly better performance than the old one.
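
For reference, the new helper can be exercised as a standalone userspace
program (ffz() is a kernel bitops helper, so it is open-coded below with a
GCC/Clang builtin); it reproduces the example given in the comment that this
patch adds:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* find first zero bit, i.e. the kernel's ffz(); valid only if ~x != 0 */
static int ffz32(uint32_t x)
{
    return __builtin_ctz(~x);
}

/* Same logic as the patch: return the number of trailing '1' bits of the
 * 128-bit counter (4 x u32, least significant word first) while
 * incrementing it by one. */
static int next_index(uint32_t *counter)
{
    int i, res = 0;

    for (i = 0; i < 4; i++) {
        if (counter[i] + 1 != 0) {
            res += ffz32(counter[i]++);
            break;
        }
        counter[i] = 0;
        res += 32;
    }
    return res;
}

int main(void)
{
    uint32_t counter[4] = { 0xffffffff, 0x1, 0x0, 0x0 };
    int i = next_index(counter);

    /* expected: i == 33, counter == { 0x0, 0x2, 0x0, 0x0 } */
    printf("i = %d, counter = { 0x%" PRIx32 ", 0x%" PRIx32 ", 0x%" PRIx32
           ", 0x%" PRIx32 " }\n", i, counter[0], counter[1], counter[2],
           counter[3]);
    return 0;
}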

PERFORMANCE MEASUREMENTS (x86_64)
Performed using: https://gitlab.com/omos/linux-crypto-bench
Crypto driver used: lrw(ecb-aes-aesni)

Before:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
lrw(aes)   256      64        204            286
lrw(aes)   320      64        227            203
lrw(aes)   384      64        208            204
lrw(aes)   256      512       441            439
lrw(aes)   320      512       456            455
lrw(aes)   384      512       469            483
lrw(aes)   256      4096      2136           2190
lrw(aes)   320      4096      2161           2213
lrw(aes)   384      4096      2295           2369
lrw(aes)   256      16384     7692           7868
lrw(aes)   320      16384     8230           8691
lrw(aes)   384      16384     8971           8813
lrw(aes)   256      32768     15336          15560
lrw(aes)   320      32768     16410          16346
lrw(aes)   384      32768     18023          17465

After:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
lrw(aes)   256      64        200            203
lrw(aes)   320      64        202            204
lrw(aes)   384      64        204            205
lrw(aes)   256      512       415            415
lrw(aes)   320      512       432            440
lrw(aes)   384      512       449            451
lrw(aes)   256      4096      1838           1995
lrw(aes)   320      4096      2123           1980
lrw(aes)   384      4096      2100           2119
lrw(aes)   256      16384     7183           6954
lrw(aes)   320      16384     7844           7631
lrw(aes)   384      16384     8256           8126
lrw(aes)   256      32768     14772          14484
lrw(aes)   320      32768     15281          15431
lrw(aes)   384      32768     16469          16293

Signed-off-by: Ondrej Mosnacek 
---
 crypto/lrw.c | 61 +++-
 1 file changed, 37 insertions(+), 24 deletions(-)

diff --git a/crypto/lrw.c b/crypto/lrw.c
index 5504d1325a56..7377b5b486fd 100644
--- a/crypto/lrw.c
+++ b/crypto/lrw.c
@@ -120,27 +120,28 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key,
return 0;
 }
 
-static inline void inc(be128 *iv)
-{
-   be64_add_cpu(&iv->b, 1);
-   if (!iv->b)
-   be64_add_cpu(&iv->a, 1);
-}
-
-/* this returns the number of consequative 1 bits starting
- * from the right, get_index128(00 00 00 00 00 00 ... 00 00 10 FB) = 2 */
-static inline int get_index128(be128 *block)
+/*
+ * Returns the number of trailing '1' bits in the words of the counter, which is
+ * represented by 4 32-bit words, arranged from least to most significant.
+ * At the same time, increments the counter by one.
+ *
+ * For example:
+ *
+ * u32 counter[4] = { 0xFFFFFFFF, 0x1, 0x0, 0x0 };
+ * int i = next_index(&counter);
+ * // i == 33, counter == { 0x0, 0x2, 0x0, 0x0 }
+ */
+static int next_index(u32 *counter)
 {
-   int x;
-   __be32 *p = (__be32 *) block;
+   int i, res = 0;
 
-   for (p += 3, x = 0; x < 128; p--, x += 32) {
-   u32 val = be32_to_cpup(p);
-
-   if (!~val)
-   continue;
-
-   return x + ffz(val);
+   for (i = 0; i < 4; i++) {
+   if (counter[i] + 1 != 0) {
+   res += ffz(counter[i]++);
+   break;
+   }
+   counter[i] = 0;
+   res += 32;
}
 
/*
@@ -214,8 +215,9 @@ static int pre_crypt(struct skcipher_request *req)
struct scatterlist *sg;
unsigned cryptlen;
unsigned offset;
-   be128 *iv;
bool more;
+   __be32 *iv;
+   u32 counter[4];
int err;
 
subreq = &rctx->subreq;
@@ -230,7 +232,12 @@ static int pre_crypt(struct skcipher_request *req)
   cryptlen, req->iv);
 
err = skcipher_walk_virt(&w, subreq, false);
-   iv = w.iv;
+   iv = (__be32 *)w.iv;
+
+   counter[0] = be32_to_cpu(iv[3]);
+   counter[1] = be32

[dm-devel] [PATCH v4 0/4] crypto: lrw - Fixes and improvements

2018-09-14 Thread Ondrej Mosnacek
This patchset contains a corner-case fix and several improvements for
the LRW template.

The first patch fixes an out-of-bounds array access (and subsequently
incorrect cipher output) when the LRW counter goes from all ones to all
zeros. This patch should be applied to the crypto-2.6 tree and also go
to stable.

The second patch adds a test vector for lrw(aes) that covers the above
bug.

The third patch is a small optimization of the LRW tweak computation.

The fourth patch is a follow-up to a similar patch for XTS (it
simplifies away the use of a dynamically allocated auxiliary buffer to
cache the computed tweak values):
https://patchwork.kernel.org/patch/10588775/

Patches 2-4 should be applied only to cryptodev-2.6, but they all depend
on the first patch.

Changes in v4:
  - applied various corrections/suggestions from Eric Biggers
  - added a fix for buggy behavior on counter wrap-around (+ test vector)

v3: https://www.spinics.net/lists/linux-crypto/msg34946.html
Changes in v3:
  - fix a copy-paste error

v2: https://www.spinics.net/lists/linux-crypto/msg34890.html
Changes in v2:
  - small cleanup suggested by Eric Biggers

v1: https://www.spinics.net/lists/linux-crypto/msg34871.html

Ondrej Mosnacek (4):
  crypto: lrw - Fix out-of bounds access on counter overflow
  crypto: testmgr - Add test for LRW counter wrap-around
  crypto: lrw - Optimize tweak computation
  crypto: lrw - Do not use auxiliary buffer

 crypto/lrw.c | 342 +--
 crypto/testmgr.h |  21 +++
 2 files changed, 112 insertions(+), 251 deletions(-)

-- 
2.17.1



[dm-devel] [PATCH v4 1/4] crypto: lrw - Fix out-of bounds access on counter overflow

2018-09-14 Thread Ondrej Mosnacek
When the LRW block counter overflows, the current implementation returns
128 as the index to the precomputed multiplication table, which has 128
entries. This patch fixes it to return the correct value (127).
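
A standalone illustration of the off-by-one (modeled on get_index128(), not
the kernel function itself; word order and bit helpers are simplified): when
every bit of the 128-bit counter is set there is no zero bit to find, the
loop falls through with x == 128, and that value was then used to index the
128-entry multiplication table.

#include <stdint.h>
#include <stdio.h>

#define TABLE_ENTRIES 128  /* the precomputed table has indices 0..127 */

/* Old behaviour: scan the counter for the first zero bit; if there is
 * none, fall through and return 128. */
static int old_get_index128(const uint32_t block[4])
{
    int i, x = 0;

    for (i = 0; i < 4; i++, x += 32)
        if (~block[i])  /* this word contains a zero bit */
            return x + __builtin_ctz(~block[i]);
    return x;           /* x == 128 here! */
}

int main(void)
{
    uint32_t all_ones[4] = { ~0u, ~0u, ~0u, ~0u };
    int idx = old_get_index128(all_ones);

    printf("old index = %d (valid range 0..%d) -> %s\n", idx,
           TABLE_ENTRIES - 1,
           idx >= TABLE_ENTRIES ? "out-of-bounds access" : "ok");
    /* the fix clamps this case to 127, the entry for key2 * { 1,...,1 } */
    return 0;
}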

Fixes: 64470f1b8510 ("[CRYPTO] lrw: Liskov Rivest Wagner, a tweakable narrow block cipher mode")
Cc:  # 2.6.20+
Reported-by: Eric Biggers 
Signed-off-by: Ondrej Mosnacek 
---
 crypto/lrw.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/crypto/lrw.c b/crypto/lrw.c
index 393a782679c7..5504d1325a56 100644
--- a/crypto/lrw.c
+++ b/crypto/lrw.c
@@ -143,7 +143,12 @@ static inline int get_index128(be128 *block)
return x + ffz(val);
}
 
-   return x;
+   /*
+* If we get here, then x == 128 and we are incrementing the counter
+* from all ones to all zeros. This means we must return index 127, i.e.
+* the one corresponding to key2*{ 1,...,1 }.
+*/
+   return 127;
 }
 
 static int post_crypt(struct skcipher_request *req)
-- 
2.17.1



Re: [dm-devel] [PATCH v3 2/2] crypto: lrw - Do not use auxiliary buffer

2018-09-13 Thread Ondrej Mosnacek
On Wed, Sep 12, 2018 at 8:51 AM Eric Biggers  wrote:
> On Tue, Sep 11, 2018 at 09:42:39AM +0200, Ondrej Mosnacek wrote:
> > This patch simplifies the LRW template to recompute the LRW tweaks from
> > scratch in the second pass and thus also removes the need to allocate a
> > dynamic buffer using kmalloc().
> >
> > As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt.
> >
> > PERFORMANCE MEASUREMENTS (x86_64)
> > Performed using: https://gitlab.com/omos/linux-crypto-bench
> > Crypto driver used: lrw(ecb-aes-aesni)
> >
> > The results show that the new code has about the same performance as the
> > old code. For 512-byte message it seems to be even slightly faster, but
> > that might be just noise.
> >
> > Before:
> > ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
> > lrw(aes)   256      64        200            203
> > lrw(aes)   320      64        202            204
> > lrw(aes)   384      64        204            205
> > lrw(aes)   256      512       415            415
> > lrw(aes)   320      512       432            440
> > lrw(aes)   384      512       449            451
> > lrw(aes)   256      4096      1838           1995
> > lrw(aes)   320      4096      2123           1980
> > lrw(aes)   384      4096      2100           2119
> > lrw(aes)   256      16384     7183           6954
> > lrw(aes)   320      16384     7844           7631
> > lrw(aes)   384      16384     8256           8126
> > lrw(aes)   256      32768     14772          14484
> > lrw(aes)   320      32768     15281          15431
> > lrw(aes)   384      32768     16469          16293
> >
> > After:
> > ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
> > lrw(aes)   256      64        197            196
> > lrw(aes)   320      64        200            197
> > lrw(aes)   384      64        203            199
> > lrw(aes)   256      512       385            380
> > lrw(aes)   320      512       401            395
> > lrw(aes)   384      512       415            415
> > lrw(aes)   256      4096      1869           1846
> > lrw(aes)   320      4096      2080           1981
> > lrw(aes)   384      4096      2160           2109
> > lrw(aes)   256      16384     7077           7127
> > lrw(aes)   320      16384     7807           7766
> > lrw(aes)   384      16384     8108           8357
> > lrw(aes)   256      32768     14111          14454
> > lrw(aes)   320      32768     15268          15082
> > lrw(aes)   384      32768     16581          16250
> >
> > [1] https://lkml.org/lkml/2018/8/23/1315
> >
> > Signed-off-by: Ondrej Mosnacek 
> > ---
> >  crypto/lrw.c | 280 ++-
> >  1 file changed, 51 insertions(+), 229 deletions(-)
> >
> > diff --git a/crypto/lrw.c b/crypto/lrw.c
> > index b4f30b6f16d6..d5d2fba9af59 100644
> > --- a/crypto/lrw.c
> > +++ b/crypto/lrw.c
> > @@ -29,8 +29,6 @@
> >  #include 
> >  #include 
> >
> > -#define LRW_BUFFER_SIZE 128u
> > -
> >  #define LRW_BLOCK_SIZE 16
> >
> >  struct priv {
> > @@ -56,19 +54,7 @@ struct priv {
> >  };
> >
> >  struct rctx {
> > - be128 buf[LRW_BUFFER_SIZE / sizeof(be128)];
> > -
> > - be128 t;
> > -
> > - be128 *ext;
> > -
> > - struct scatterlist srcbuf[2];
> > - struct scatterlist dstbuf[2];
> > - struct scatterlist *src;
> > - struct scatterlist *dst;
> > -
> > - unsigned int left;
> > -
> > + be128 t, orig_iv;
> >   struct skcipher_request subreq;
> >  };
> >
> > @@ -135,86 +121,31 @@ static int next_index(u32 *counter)
> >   return res;
> >  }
> >
> > -static int post_crypt(struct skcipher_request *req)
> > +/*
> > + * We compute the tweak masks twice (both bef

Re: [dm-devel] [PATCH v3 1/2] crypto: lrw - Optimize tweak computation

2018-09-13 Thread Ondrej Mosnacek
On Wed, Sep 12, 2018 at 8:28 AM Eric Biggers  wrote:
> Hi Ondrej,
>
> On Tue, Sep 11, 2018 at 09:42:38AM +0200, Ondrej Mosnacek wrote:
> > This patch rewrites the tweak computation to a slightly simpler method
> > that performs less bswaps. Based on performance measurements the new
> > code seems to provide slightly better performance than the old one.
> >
> > PERFORMANCE MEASUREMENTS (x86_64)
> > Performed using: https://gitlab.com/omos/linux-crypto-bench
> > Crypto driver used: lrw(ecb-aes-aesni)
> >
> > Before:
> > ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
> > lrw(aes)   256      64        204            286
> > lrw(aes)   320      64        227            203
> > lrw(aes)   384      64        208            204
> > lrw(aes)   256      512       441            439
> > lrw(aes)   320      512       456            455
> > lrw(aes)   384      512       469            483
> > lrw(aes)   256      4096      2136           2190
> > lrw(aes)   320      4096      2161           2213
> > lrw(aes)   384      4096      2295           2369
> > lrw(aes)   256      16384     7692           7868
> > lrw(aes)   320      16384     8230           8691
> > lrw(aes)   384      16384     8971           8813
> > lrw(aes)   256      32768     15336          15560
> > lrw(aes)   320      32768     16410          16346
> > lrw(aes)   384      32768     18023          17465
> >
> > After:
> > ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
> > lrw(aes)   256      64        200            203
> > lrw(aes)   320      64        202            204
> > lrw(aes)   384      64        204            205
> > lrw(aes)   256      512       415            415
> > lrw(aes)   320      512       432            440
> > lrw(aes)   384      512       449            451
> > lrw(aes)   256      4096      1838           1995
> > lrw(aes)   320      4096      2123           1980
> > lrw(aes)   384      4096      2100           2119
> > lrw(aes)   256      16384     7183           6954
> > lrw(aes)   320      16384     7844           7631
> > lrw(aes)   384      16384     8256           8126
> > lrw(aes)   256      32768     14772          14484
> > lrw(aes)   320      32768     15281          15431
> > lrw(aes)   384      32768     16469          16293
> >
> > Signed-off-by: Ondrej Mosnacek 
> > ---
> >  crypto/lrw.c | 49 +
> >  1 file changed, 25 insertions(+), 24 deletions(-)
> >
> > diff --git a/crypto/lrw.c b/crypto/lrw.c
> > index 393a782679c7..b4f30b6f16d6 100644
> > --- a/crypto/lrw.c
> > +++ b/crypto/lrw.c
> > @@ -120,30 +120,19 @@ static int setkey(struct crypto_skcipher *parent, 
> > const u8 *key,
> >   return 0;
> >  }
> >
> > -static inline void inc(be128 *iv)
> > +static int next_index(u32 *counter)
> >  {
> > - be64_add_cpu(&iv->b, 1);
> > - if (!iv->b)
> > - be64_add_cpu(&iv->a, 1);
> > -}
> > -
> > -/* this returns the number of consequative 1 bits starting
> > - * from the right, get_index128(00 00 00 00 00 00 ... 00 00 10 FB) = 2 */
> > -static inline int get_index128(be128 *block)
> > -{
> > - int x;
> > - __be32 *p = (__be32 *) block;
> > -
> > - for (p += 3, x = 0; x < 128; p--, x += 32) {
> > - u32 val = be32_to_cpup(p);
> > -
> > - if (!~val)
> > - continue;
> > + int i, res = 0;
> >
> > - return x + ffz(val);
> > + for (i = 0; i < 4; i++) {
> > + if (counter[i] + 1 != 0) {
> > + res += ffz(counter[i]++);
> > + break;
> > + }
> > + counter[i] = 0;
> > + res += 32;
> >   }
> > -
> >

[dm-devel] [PATCH v3 2/2] crypto: lrw - Do not use auxiliary buffer

2018-09-11 Thread Ondrej Mosnacek
This patch simplifies the LRW template to recompute the LRW tweaks from
scratch in the second pass and thus also removes the need to allocate a
dynamic buffer using kmalloc().

As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt.

PERFORMANCE MEASUREMENTS (x86_64)
Performed using: https://gitlab.com/omos/linux-crypto-bench
Crypto driver used: lrw(ecb-aes-aesni)

The results show that the new code has about the same performance as the
old code. For 512-byte messages it even seems to be slightly faster, but
that might be just noise.

Before:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
lrw(aes)   256      64        200            203
lrw(aes)   320      64        202            204
lrw(aes)   384      64        204            205
lrw(aes)   256      512       415            415
lrw(aes)   320      512       432            440
lrw(aes)   384      512       449            451
lrw(aes)   256      4096      1838           1995
lrw(aes)   320      4096      2123           1980
lrw(aes)   384      4096      2100           2119
lrw(aes)   256      16384     7183           6954
lrw(aes)   320      16384     7844           7631
lrw(aes)   384      16384     8256           8126
lrw(aes)   256      32768     14772          14484
lrw(aes)   320      32768     15281          15431
lrw(aes)   384      32768     16469          16293

After:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
lrw(aes)   256      64        197            196
lrw(aes)   320      64        200            197
lrw(aes)   384      64        203            199
lrw(aes)   256      512       385            380
lrw(aes)   320      512       401            395
lrw(aes)   384      512       415            415
lrw(aes)   256      4096      1869           1846
lrw(aes)   320      4096      2080           1981
lrw(aes)   384      4096      2160           2109
lrw(aes)   256      16384     7077           7127
lrw(aes)   320      16384     7807           7766
lrw(aes)   384      16384     8108           8357
lrw(aes)   256      32768     14111          14454
lrw(aes)   320      32768     15268          15082
lrw(aes)   384      32768     16581          16250

[1] https://lkml.org/lkml/2018/8/23/1315

Signed-off-by: Ondrej Mosnacek 
---
 crypto/lrw.c | 280 ++-
 1 file changed, 51 insertions(+), 229 deletions(-)

diff --git a/crypto/lrw.c b/crypto/lrw.c
index b4f30b6f16d6..d5d2fba9af59 100644
--- a/crypto/lrw.c
+++ b/crypto/lrw.c
@@ -29,8 +29,6 @@
 #include 
 #include 
 
-#define LRW_BUFFER_SIZE 128u
-
 #define LRW_BLOCK_SIZE 16
 
 struct priv {
@@ -56,19 +54,7 @@ struct priv {
 };
 
 struct rctx {
-   be128 buf[LRW_BUFFER_SIZE / sizeof(be128)];
-
-   be128 t;
-
-   be128 *ext;
-
-   struct scatterlist srcbuf[2];
-   struct scatterlist dstbuf[2];
-   struct scatterlist *src;
-   struct scatterlist *dst;
-
-   unsigned int left;
-
+   be128 t, orig_iv;
struct skcipher_request subreq;
 };
 
@@ -135,86 +121,31 @@ static int next_index(u32 *counter)
return res;
 }
 
-static int post_crypt(struct skcipher_request *req)
+/*
+ * We compute the tweak masks twice (both before and after the ECB encryption or
+ * decryption) to avoid having to allocate a temporary buffer and/or make
+ * multiple calls to the 'ecb(..)' instance, which usually would be slower than
+ * just doing the gf128mul_x_ble() calls again.
+ */
+static int xor_tweak(struct skcipher_request *req, bool second_pass)
 {
-   struct rctx *rctx = skcipher_request_ctx(req);
-   be128 *buf = rctx->ext ?: rctx->buf;
-   struct skcipher_request *subreq;
const int bs = LRW_BLOCK_SIZE;
-   struct skcipher_walk w;
-   struct scatterlist *sg;
-   unsigned offset;
-   int err;
-
-   subreq = &rctx->subreq;
-   err = skcipher_walk_virt(&w, subreq, false);
-
-   while (w.nbytes) {
-   unsigned int avail = w.nbytes;
-   be128 *wdst;
-
-   wdst = w.dst.virt.addr;
-
-   do {
-   be128_xor(wdst, buf++, wdst);
-   wdst++;
-   } while ((avail -= bs) >= bs);
-
-   err 

[dm-devel] [PATCH v5] crypto: xts - Drop use of auxiliary buffer

2018-09-11 Thread Ondrej Mosnacek
Since commit acb9b159c784 ("crypto: gf128mul - define gf128mul_x_* in
gf128mul.h"), the gf128mul_x_*() functions are very fast and therefore
caching the computed XTS tweaks has only negligible advantage over
computing them twice.

In fact, since the current caching implementation limits the size of
the calls to the child ecb(...) algorithm to PAGE_SIZE (usually 4096 B),
it is often actually slower than the simple recomputing implementation.

This patch simplifies the XTS template to recompute the XTS tweaks from
scratch in the second pass and thus also removes the need to allocate a
dynamic buffer using kmalloc().

As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt.
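
A minimal userspace sketch of the resulting structure (not the kernel code:
the doubling function mimics gf128mul_x_ble() on a little-endian 128-bit
tweak and the child ecb(aes) step is stubbed out). Because the tweak chain
t, t*x, t*x^2, ... is cheap to regenerate from the initial tweak, the second
pass recomputes it instead of reading it back from a kmalloc'd cache:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLOCK 16

/* Multiply the 128-bit tweak by x in GF(2^128), XTS little-endian
 * convention (t[0] is the low half), mimicking gf128mul_x_ble(). */
static void gf128_double(uint64_t t[2])
{
    uint64_t carry = t[1] >> 63;

    t[1] = (t[1] << 1) | (t[0] >> 63);
    t[0] = (t[0] << 1) ^ (carry * 0x87);
}

/* One xor-tweak pass over the whole buffer.  The tweak chain is regenerated
 * from the initial tweak each time, so no cached tweak buffer is needed
 * between the two passes. */
static void xor_tweak(uint8_t *buf, size_t len, const uint64_t t0[2])
{
    uint64_t t[2] = { t0[0], t0[1] };
    size_t off;
    int i;

    for (off = 0; off < len; off += BLOCK) {
        /* byte view of the tweak (little-endian host assumed) */
        for (i = 0; i < BLOCK; i++)
            buf[off + i] ^= ((const uint8_t *)t)[i];
        gf128_double(t);
    }
}

int main(void)
{
    uint8_t data[4 * BLOCK], orig[4 * BLOCK];
    uint64_t t0[2] = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
    size_t i;

    for (i = 0; i < sizeof(data); i++)
        data[i] = (uint8_t)i;
    memcpy(orig, data, sizeof(data));

    xor_tweak(data, sizeof(data), t0);  /* pass 1, before the ecb step */
    /* ... the child ecb(aes) request would be issued here (stub) ... */
    xor_tweak(data, sizeof(data), t0);  /* pass 2, tweaks recomputed   */

    printf("recomputed tweaks cancel: %s\n",
           memcmp(data, orig, sizeof(data)) ? "FAILED" : "ok");
    return 0;
}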

PERFORMANCE RESULTS
I measured time to encrypt/decrypt a memory buffer of varying sizes with
xts(ecb-aes-aesni) using a tool I wrote ([2]) and the results suggest
that after this patch the performance is either better or comparable for
both small and large buffers. Note that there is a lot of noise in the
measurements, but the overall difference is easy to see.

Old code:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
xts(aes)   256      64        331            328
xts(aes)   384      64        332            333
xts(aes)   512      64        338            348
xts(aes)   256      512       889            920
xts(aes)   384      512       1019           993
xts(aes)   512      512       1032           990
xts(aes)   256      4096      2152           2292
xts(aes)   384      4096      2453           2597
xts(aes)   512      4096      3041           2641
xts(aes)   256      16384     9443           8027
xts(aes)   384      16384     8536           8925
xts(aes)   512      16384     9232           9417
xts(aes)   256      32768     16383          14897
xts(aes)   384      32768     17527          16102
xts(aes)   512      32768     18483          17322

New code:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
xts(aes)   256      64        328            324
xts(aes)   384      64        324            319
xts(aes)   512      64        320            322
xts(aes)   256      512       476            473
xts(aes)   384      512       509            492
xts(aes)   512      512       531            514
xts(aes)   256      4096      2132           1829
xts(aes)   384      4096      2357           2055
xts(aes)   512      4096      2178           2027
xts(aes)   256      16384     6920           6983
xts(aes)   384      16384     8597           7505
xts(aes)   512      16384     7841           8164
xts(aes)   256      32768     13468          12307
xts(aes)   384      32768     14808          13402
xts(aes)   512      32768     15753          14636

[1] https://lkml.org/lkml/2018/8/23/1315
[2] https://gitlab.com/omos/linux-crypto-bench

Signed-off-by: Ondrej Mosnacek 
---
 crypto/xts.c | 269 +--
 1 file changed, 46 insertions(+), 223 deletions(-)

Changes in v5:
  - fix dumb mistakes

v4: https://www.spinics.net/lists/linux-crypto/msg34889.html
Changes in v4:
  - small cleanup suggested by Eric Biggers

v3: https://www.spinics.net/lists/linux-crypto/msg34823.html
Changes in v3:
  - add comment explaining the new approach as suggested by Eric
  - ensure correct alignment in second xor_tweak() pass
  - align performance results table header to the right

v2: https://www.spinics.net/lists/linux-crypto/msg34799.html
Changes in v2:
  - rebase to latest cryptodev tree

v1: https://www.spinics.net/lists/linux-crypto/msg34776.html

diff --git a/crypto/xts.c b/crypto/xts.c
index ccf55fbb8bc2..847f54f76789 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -26,8 +26,6 @@
 #include 
 #include 
 
-#define XTS_BUFFER_SIZE 128u
-
 struct priv {
struct crypto_skcipher *child;
struct crypto_cipher *tweak;
@@ -39,19 +37,7 @@ struct xts_instance_ctx {
 };
 
 struct rctx {
-   le128 buf[XTS_BUFFER_SIZE / sizeof(le128)];
-
le128 t;
-
-   le128 *ext;
-
-   struct scatterlist srcbuf[2];
-   struct scatterlist dstbuf[2];
-   struct scatterlist *src;
-   struct scatterlist *dst;
-
-   unsigned int left;
-
struct skcipher_request subreq;
 };
 
@@ -96,81 +82,27 @@ static int setkey(struct crypto_skcipher *par

[dm-devel] [PATCH v3 0/2] crypto: lrw - Simplify and optimize the LRW template

2018-09-11 Thread Ondrej Mosnacek
This patchset is a follow-up to a similar patch for XTS:
https://patchwork.kernel.org/patch/10588775/

The first patch applies a small optimization to the tweak computation
and the second simplifies away the use of the auxiliary buffer used to
cache the computed tweaks.

Changes in v3:
  - fix a copy-paste error

v2: https://www.spinics.net/lists/linux-crypto/msg34890.html
Changes in v2:
  - small cleanup suggested by Eric Biggers

v1: https://www.spinics.net/lists/linux-crypto/msg34871.html

Ondrej Mosnacek (2):
  crypto: lrw - Optimize tweak computation
  crypto: lrw - Do not use auxiliary buffer

 crypto/lrw.c | 327 ---
 1 file changed, 75 insertions(+), 252 deletions(-)

-- 
2.17.1



[dm-devel] [PATCH v3 1/2] crypto: lrw - Optimize tweak computation

2018-09-11 Thread Ondrej Mosnacek
This patch rewrites the tweak computation to a slightly simpler method
that performs fewer bswaps. Based on performance measurements, the new
code seems to provide slightly better performance than the old one.

PERFORMANCE MEASUREMENTS (x86_64)
Performed using: https://gitlab.com/omos/linux-crypto-bench
Crypto driver used: lrw(ecb-aes-aesni)

Before:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
lrw(aes)   256      64        204            286
lrw(aes)   320      64        227            203
lrw(aes)   384      64        208            204
lrw(aes)   256      512       441            439
lrw(aes)   320      512       456            455
lrw(aes)   384      512       469            483
lrw(aes)   256      4096      2136           2190
lrw(aes)   320      4096      2161           2213
lrw(aes)   384      4096      2295           2369
lrw(aes)   256      16384     7692           7868
lrw(aes)   320      16384     8230           8691
lrw(aes)   384      16384     8971           8813
lrw(aes)   256      32768     15336          15560
lrw(aes)   320      32768     16410          16346
lrw(aes)   384      32768     18023          17465

After:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
lrw(aes)   256      64        200            203
lrw(aes)   320      64        202            204
lrw(aes)   384      64        204            205
lrw(aes)   256      512       415            415
lrw(aes)   320      512       432            440
lrw(aes)   384      512       449            451
lrw(aes)   256      4096      1838           1995
lrw(aes)   320      4096      2123           1980
lrw(aes)   384      4096      2100           2119
lrw(aes)   256      16384     7183           6954
lrw(aes)   320      16384     7844           7631
lrw(aes)   384      16384     8256           8126
lrw(aes)   256      32768     14772          14484
lrw(aes)   320      32768     15281          15431
lrw(aes)   384      32768     16469          16293

Signed-off-by: Ondrej Mosnacek 
---
 crypto/lrw.c | 49 +
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/crypto/lrw.c b/crypto/lrw.c
index 393a782679c7..b4f30b6f16d6 100644
--- a/crypto/lrw.c
+++ b/crypto/lrw.c
@@ -120,30 +120,19 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key,
return 0;
 }
 
-static inline void inc(be128 *iv)
+static int next_index(u32 *counter)
 {
-   be64_add_cpu(&iv->b, 1);
-   if (!iv->b)
-   be64_add_cpu(&iv->a, 1);
-}
-
-/* this returns the number of consequative 1 bits starting
- * from the right, get_index128(00 00 00 00 00 00 ... 00 00 10 FB) = 2 */
-static inline int get_index128(be128 *block)
-{
-   int x;
-   __be32 *p = (__be32 *) block;
-
-   for (p += 3, x = 0; x < 128; p--, x += 32) {
-   u32 val = be32_to_cpup(p);
-
-   if (!~val)
-   continue;
+   int i, res = 0;
 
-   return x + ffz(val);
+   for (i = 0; i < 4; i++) {
+   if (counter[i] + 1 != 0) {
+   res += ffz(counter[i]++);
+   break;
+   }
+   counter[i] = 0;
+   res += 32;
}
-
-   return x;
+   return res;
 }
 
 static int post_crypt(struct skcipher_request *req)
@@ -209,8 +198,9 @@ static int pre_crypt(struct skcipher_request *req)
struct scatterlist *sg;
unsigned cryptlen;
unsigned offset;
-   be128 *iv;
bool more;
+   __u32 *iv;
+   u32 counter[4];
int err;
 
subreq = &rctx->subreq;
@@ -227,6 +217,11 @@ static int pre_crypt(struct skcipher_request *req)
err = skcipher_walk_virt(&w, subreq, false);
iv = w.iv;
 
+   counter[0] = be32_to_cpu(iv[3]);
+   counter[1] = be32_to_cpu(iv[2]);
+   counter[2] = be32_to_cpu(iv[1]);
+   counter[3] = be32_to_cpu(iv[0]);
+
while (w.nbytes) {
unsigned int avail = w.nbytes;
be128 *wsrc;
@@ -242,10 +237,16 @@ static int pre_crypt(struct skcipher_request *req)
/* T <- I*Key2, using the optimization
 * discussed in the specification

Re: [dm-devel] [PATCH v4] crypto: xts - Drop use of auxiliary buffer

2018-09-11 Thread Ondrej Mosnacek
On Mon, Sep 10, 2018 at 6:17 PM Eric Biggers  wrote:
> Hi Ondrej,
>
> On Mon, Sep 10, 2018 at 01:28:41PM +0200, Ondrej Mosnacek wrote:
> >
> > -static int init_crypt(struct skcipher_request *req, crypto_completion_t done)
> > +static int xor_tweak_pre(struct skcipher_request *req)
> >  {
> > - struct priv *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
> > - struct rctx *rctx = skcipher_request_ctx(req);
> > - struct skcipher_request *subreq;
> > - gfp_t gfp;
> > -
> > - subreq = &rctx->subreq;
> > - skcipher_request_set_tfm(subreq, ctx->child);
> > - skcipher_request_set_callback(subreq, req->base.flags, done, req);
> > -
> > - gfp = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL :
> > -GFP_ATOMIC;
> > - rctx->ext = NULL;
> > -
> > - subreq->cryptlen = XTS_BUFFER_SIZE;
> > - if (req->cryptlen > XTS_BUFFER_SIZE) {
> > - unsigned int n = min(req->cryptlen, (unsigned int)PAGE_SIZE);
> > -
> > - rctx->ext = kmalloc(n, gfp);
> > - if (rctx->ext)
> > - subreq->cryptlen = n;
> > - }
> > -
> > - rctx->src = req->src;
> > - rctx->dst = req->dst;
> > - rctx->left = req->cryptlen;
> > -
> > - /* calculate first value of T */
> > - crypto_cipher_encrypt_one(ctx->tweak, (u8 *)&rctx->t, req->iv);
> > -
> > - return 0;
> > + return xor_tweak(req, false);
> >  }
> >
> > -static void exit_crypt(struct skcipher_request *req)
> > +static int xor_tweak_post(struct skcipher_request *req)
> >  {
> > - struct rctx *rctx = skcipher_request_ctx(req);
> > -
> > - rctx->left = 0;
> > -
> > - if (rctx->ext)
> > - kzfree(rctx->ext);
> > + return xor_tweak(req, false);
> >  }
>
> I think you meant 'xor_tweak(req, true);' here?

Ouch, yes I sent the patches out too quickly... Will fix it all and resubmit.
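
For reference, the corrected wrappers only differ in the flag they pass; a
tiny standalone sketch (with the request type reduced to a string stand-in):

#include <stdbool.h>
#include <stdio.h>

/* stand-in for the real xor_tweak(req, second_pass) */
static int xor_tweak(const char *req, bool second_pass)
{
    printf("%s: %s pass\n", req, second_pass ? "second" : "first");
    return 0;
}

/* thin aliases, as in the patch -- the v3 bug was that xor_tweak_post()
 * also passed 'false'; the fixed version passes 'true' */
static int xor_tweak_pre(const char *req)  { return xor_tweak(req, false); }
static int xor_tweak_post(const char *req) { return xor_tweak(req, true); }

int main(void)
{
    xor_tweak_pre("req");   /* before the child ecb(aes) request */
    xor_tweak_post("req");  /* after it completes */
    return 0;
}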

>
> > +static void crypt_done(struct crypto_async_request *areq, int err)
> >  {
> >   struct skcipher_request *req = areq->data;
> > - struct skcipher_request *subreq;
> > - struct rctx *rctx;
> > -
> > - rctx = skcipher_request_ctx(req);
> > -
> > - if (err == -EINPROGRESS) {
> > - if (rctx->left != req->cryptlen)
> > - return;
> > - goto out;
> > - }
> > -
> > - subreq = &rctx->subreq;
> > - subreq->base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG;
> >
> > - err = do_encrypt(req, err ?: post_crypt(req));
> > - if (rctx->left)
> > - return;
> > + if (!err)
> > + err = xor_tweak(req, true);
>
> Use xor_tweak_post()?
>
> Note that you could also change the bool to
>
> enum {
> FIRST_PASS,
> SECOND_PASS,
> }
>
> if you find the bool to be unclear.

Yes, that would work, too. I slightly prefer aliasing the functions,
but I guess it is a matter of taste...

Thanks for the review,

-- 
Ondrej Mosnacek 
Associate Software Engineer, Security Technologies
Red Hat, Inc.



[dm-devel] [PATCH v2 2/2] crypto: lrw - Do not use auxiliary buffer

2018-09-10 Thread Ondrej Mosnacek
This patch simplifies the LRW template to recompute the LRW tweaks from
scratch in the second pass and thus also removes the need to allocate a
dynamic buffer using kmalloc().

As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt.

PERFORMANCE MEASUREMENTS (x86_64)
Performed using: https://gitlab.com/omos/linux-crypto-bench
Crypto driver used: lrw(ecb-aes-aesni)

The results show that the new code has about the same performance as the
old code. For 512-byte messages it even seems to be slightly faster, but
that might be just noise.

Before:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
lrw(aes)   256      64        200            203
lrw(aes)   320      64        202            204
lrw(aes)   384      64        204            205
lrw(aes)   256      512       415            415
lrw(aes)   320      512       432            440
lrw(aes)   384      512       449            451
lrw(aes)   256      4096      1838           1995
lrw(aes)   320      4096      2123           1980
lrw(aes)   384      4096      2100           2119
lrw(aes)   256      16384     7183           6954
lrw(aes)   320      16384     7844           7631
lrw(aes)   384      16384     8256           8126
lrw(aes)   256      32768     14772          14484
lrw(aes)   320      32768     15281          15431
lrw(aes)   384      32768     16469          16293

After:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
lrw(aes)   256      64        197            196
lrw(aes)   320      64        200            197
lrw(aes)   384      64        203            199
lrw(aes)   256      512       385            380
lrw(aes)   320      512       401            395
lrw(aes)   384      512       415            415
lrw(aes)   256      4096      1869           1846
lrw(aes)   320      4096      2080           1981
lrw(aes)   384      4096      2160           2109
lrw(aes)   256      16384     7077           7127
lrw(aes)   320      16384     7807           7766
lrw(aes)   384      16384     8108           8357
lrw(aes)   256      32768     14111          14454
lrw(aes)   320      32768     15268          15082
lrw(aes)   384      32768     16581          16250

[1] https://lkml.org/lkml/2018/8/23/1315

Signed-off-by: Ondrej Mosnacek 
---
 crypto/lrw.c | 280 ++-
 1 file changed, 51 insertions(+), 229 deletions(-)

diff --git a/crypto/lrw.c b/crypto/lrw.c
index b4f30b6f16d6..6315a6a59589 100644
--- a/crypto/lrw.c
+++ b/crypto/lrw.c
@@ -29,8 +29,6 @@
 #include 
 #include 
 
-#define LRW_BUFFER_SIZE 128u
-
 #define LRW_BLOCK_SIZE 16
 
 struct priv {
@@ -56,19 +54,7 @@ struct priv {
 };
 
 struct rctx {
-   be128 buf[LRW_BUFFER_SIZE / sizeof(be128)];
-
-   be128 t;
-
-   be128 *ext;
-
-   struct scatterlist srcbuf[2];
-   struct scatterlist dstbuf[2];
-   struct scatterlist *src;
-   struct scatterlist *dst;
-
-   unsigned int left;
-
+   be128 t, orig_iv;
struct skcipher_request subreq;
 };
 
@@ -135,86 +121,31 @@ static int next_index(u32 *counter)
return res;
 }
 
-static int post_crypt(struct skcipher_request *req)
+/*
+ * We compute the tweak masks twice (both before and after the ECB encryption or
+ * decryption) to avoid having to allocate a temporary buffer and/or make
+ * multiple calls to the 'ecb(..)' instance, which usually would be slower than
+ * just doing the gf128mul_x_ble() calls again.
+ */
+static int xor_tweak(struct skcipher_request *req, bool second_pass)
 {
-   struct rctx *rctx = skcipher_request_ctx(req);
-   be128 *buf = rctx->ext ?: rctx->buf;
-   struct skcipher_request *subreq;
const int bs = LRW_BLOCK_SIZE;
-   struct skcipher_walk w;
-   struct scatterlist *sg;
-   unsigned offset;
-   int err;
-
-   subreq = &rctx->subreq;
-   err = skcipher_walk_virt(&w, subreq, false);
-
-   while (w.nbytes) {
-   unsigned int avail = w.nbytes;
-   be128 *wdst;
-
-   wdst = w.dst.virt.addr;
-
-   do {
-   be128_xor(wdst, buf++, wdst);
-   wdst++;
-   } while ((avail -= bs) >= bs);
-
-   err 

[dm-devel] [PATCH v2 1/2] crypto: lrw - Optimize tweak computation

2018-09-10 Thread Ondrej Mosnacek
This patch rewrites the tweak computation to a slightly simpler method
that performs fewer bswaps. Based on performance measurements, the new
code seems to provide slightly better performance than the old one.

PERFORMANCE MEASUREMENTS (x86_64)
Performed using: https://gitlab.com/omos/linux-crypto-bench
Crypto driver used: lrw(ecb-aes-aesni)

Before:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
lrw(aes)   256      64        204            286
lrw(aes)   320      64        227            203
lrw(aes)   384      64        208            204
lrw(aes)   256      512       441            439
lrw(aes)   320      512       456            455
lrw(aes)   384      512       469            483
lrw(aes)   256      4096      2136           2190
lrw(aes)   320      4096      2161           2213
lrw(aes)   384      4096      2295           2369
lrw(aes)   256      16384     7692           7868
lrw(aes)   320      16384     8230           8691
lrw(aes)   384      16384     8971           8813
lrw(aes)   256      32768     15336          15560
lrw(aes)   320      32768     16410          16346
lrw(aes)   384      32768     18023          17465

After:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
lrw(aes)   256      64        200            203
lrw(aes)   320      64        202            204
lrw(aes)   384      64        204            205
lrw(aes)   256      512       415            415
lrw(aes)   320      512       432            440
lrw(aes)   384      512       449            451
lrw(aes)   256      4096      1838           1995
lrw(aes)   320      4096      2123           1980
lrw(aes)   384      4096      2100           2119
lrw(aes)   256      16384     7183           6954
lrw(aes)   320      16384     7844           7631
lrw(aes)   384      16384     8256           8126
lrw(aes)   256      32768     14772          14484
lrw(aes)   320      32768     15281          15431
lrw(aes)   384      32768     16469          16293

Signed-off-by: Ondrej Mosnacek 
---
 crypto/lrw.c | 49 +
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/crypto/lrw.c b/crypto/lrw.c
index 393a782679c7..b4f30b6f16d6 100644
--- a/crypto/lrw.c
+++ b/crypto/lrw.c
@@ -120,30 +120,19 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key,
return 0;
 }
 
-static inline void inc(be128 *iv)
+static int next_index(u32 *counter)
 {
-   be64_add_cpu(&iv->b, 1);
-   if (!iv->b)
-   be64_add_cpu(&iv->a, 1);
-}
-
-/* this returns the number of consequative 1 bits starting
- * from the right, get_index128(00 00 00 00 00 00 ... 00 00 10 FB) = 2 */
-static inline int get_index128(be128 *block)
-{
-   int x;
-   __be32 *p = (__be32 *) block;
-
-   for (p += 3, x = 0; x < 128; p--, x += 32) {
-   u32 val = be32_to_cpup(p);
-
-   if (!~val)
-   continue;
+   int i, res = 0;
 
-   return x + ffz(val);
+   for (i = 0; i < 4; i++) {
+   if (counter[i] + 1 != 0) {
+   res += ffz(counter[i]++);
+   break;
+   }
+   counter[i] = 0;
+   res += 32;
}
-
-   return x;
+   return res;
 }
 
 static int post_crypt(struct skcipher_request *req)
@@ -209,8 +198,9 @@ static int pre_crypt(struct skcipher_request *req)
struct scatterlist *sg;
unsigned cryptlen;
unsigned offset;
-   be128 *iv;
bool more;
+   __u32 *iv;
+   u32 counter[4];
int err;
 
subreq = &rctx->subreq;
@@ -227,6 +217,11 @@ static int pre_crypt(struct skcipher_request *req)
err = skcipher_walk_virt(&w, subreq, false);
iv = w.iv;
 
+   counter[0] = be32_to_cpu(iv[3]);
+   counter[1] = be32_to_cpu(iv[2]);
+   counter[2] = be32_to_cpu(iv[1]);
+   counter[3] = be32_to_cpu(iv[0]);
+
while (w.nbytes) {
unsigned int avail = w.nbytes;
be128 *wsrc;
@@ -242,10 +237,16 @@ static int pre_crypt(struct skcipher_request *req)
/* T <- I*Key2, using the optimization
 * discussed in the specification

[dm-devel] [PATCH v2 0/2] crypto: lrw - Simplify and optimize the LRW template

2018-09-10 Thread Ondrej Mosnacek
This patchset is a follow-up to a similar patch for XTS:
https://patchwork.kernel.org/patch/10588775/

The first patch applies a small optimization to the tweak computation
and the second simplifies away the use of the auxiliary buffer used to
cache the computed tweaks.

Changes in v2:
  - small cleanup suggested by Eric Biggers

v1: https://www.spinics.net/lists/linux-crypto/msg34871.html

Ondrej Mosnacek (2):
  crypto: lrw - Optimize tweak computation
  crypto: lrw - Do not use auxiliary buffer

 crypto/lrw.c | 327 ---
 1 file changed, 75 insertions(+), 252 deletions(-)

-- 
2.17.1



[dm-devel] [PATCH v4] crypto: xts - Drop use of auxiliary buffer

2018-09-10 Thread Ondrej Mosnacek
Since commit acb9b159c784 ("crypto: gf128mul - define gf128mul_x_* in
gf128mul.h"), the gf128mul_x_*() functions are very fast and therefore
caching the computed XTS tweaks has only negligible advantage over
computing them twice.

In fact, since the current caching implementation limits the size of
the calls to the child ecb(...) algorithm to PAGE_SIZE (usually 4096 B),
it is often actually slower than the simple recomputing implementation.

This patch simplifies the XTS template to recompute the XTS tweaks from
scratch in the second pass and thus also removes the need to allocate a
dynamic buffer using kmalloc().

As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt.

PERFORMANCE RESULTS
I measured time to encrypt/decrypt a memory buffer of varying sizes with
xts(ecb-aes-aesni) using a tool I wrote ([2]) and the results suggest
that after this patch the performance is either better or comparable for
both small and large buffers. Note that there is a lot of noise in the
measurements, but the overall difference is easy to see.

Old code:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
xts(aes)   256      64        331            328
xts(aes)   384      64        332            333
xts(aes)   512      64        338            348
xts(aes)   256      512       889            920
xts(aes)   384      512       1019           993
xts(aes)   512      512       1032           990
xts(aes)   256      4096      2152           2292
xts(aes)   384      4096      2453           2597
xts(aes)   512      4096      3041           2641
xts(aes)   256      16384     9443           8027
xts(aes)   384      16384     8536           8925
xts(aes)   512      16384     9232           9417
xts(aes)   256      32768     16383          14897
xts(aes)   384      32768     17527          16102
xts(aes)   512      32768     18483          17322

New code:
   ALGORITHM KEY (b)DATA (B)   TIME ENC (ns)   TIME DEC (ns)
xts(aes) 256  64 328 324
xts(aes) 384  64 324 319
xts(aes) 512  64 320 322
xts(aes) 256 512 476 473
xts(aes) 384 512 509 492
xts(aes) 512 512 531 514
xts(aes) 256409621321829
xts(aes) 384409623572055
xts(aes) 512409621782027
xts(aes) 256   1638469206983
xts(aes) 384   1638485977505
xts(aes) 512   1638478418164
xts(aes) 256   32768   13468   12307
xts(aes) 384   32768   14808   13402
xts(aes) 512   32768   15753   14636

[1] https://lkml.org/lkml/2018/8/23/1315
[2] https://gitlab.com/omos/linux-crypto-bench

Signed-off-by: Ondrej Mosnacek 
---
 crypto/xts.c | 269 +--
 1 file changed, 46 insertions(+), 223 deletions(-)

Changes in v4:
  - small cleanup suggested by Eric Biggers

v3: https://www.spinics.net/lists/linux-crypto/msg34823.html
Changes in v3:
  - add comment explaining the new approach as suggested by Eric
  - ensure correct alignment in second xor_tweak() pass
  - align performance results table header to the right

v2: https://www.spinics.net/lists/linux-crypto/msg34799.html
Changes in v2:
  - rebase to latest cryptodev tree

v1: https://www.spinics.net/lists/linux-crypto/msg34776.html

diff --git a/crypto/xts.c b/crypto/xts.c
index ccf55fbb8bc2..daa57a8c266d 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -26,8 +26,6 @@
 #include 
 #include 
 
-#define XTS_BUFFER_SIZE 128u
-
 struct priv {
struct crypto_skcipher *child;
struct crypto_cipher *tweak;
@@ -39,19 +37,7 @@ struct xts_instance_ctx {
 };
 
 struct rctx {
-   le128 buf[XTS_BUFFER_SIZE / sizeof(le128)];
-
le128 t;
-
-   le128 *ext;
-
-   struct scatterlist srcbuf[2];
-   struct scatterlist dstbuf[2];
-   struct scatterlist *src;
-   struct scatterlist *dst;
-
-   unsigned int left;
-
struct skcipher_request subreq;
 };
 
@@ -96,81 +82,27 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key,
return err;
 }
 
-static int post_crypt(struct skcipher_request *req)

Re: [dm-devel] [PATCH v3] crypto: xts - Drop use of auxiliary buffer

2018-09-10 Thread Ondrej Mosnacek
On Sat, Sep 8, 2018 at 3:35 AM Eric Biggers  wrote:
> Hi Ondrej,
>
> On Wed, Sep 05, 2018 at 01:30:39PM +0200, Ondrej Mosnacek wrote:
> > Since commit acb9b159c784 ("crypto: gf128mul - define gf128mul_x_* in
> > gf128mul.h"), the gf128mul_x_*() functions are very fast and therefore
> > caching the computed XTS tweaks has only negligible advantage over
> > computing them twice.
> >
> > In fact, since the current caching implementation limits the size of
> > the calls to the child ecb(...) algorithm to PAGE_SIZE (usually 4096 B),
> > it is often actually slower than the simple recomputing implementation.
> >
> > This patch simplifies the XTS template to recompute the XTS tweaks from
> > scratch in the second pass and thus also removes the need to allocate a
> > dynamic buffer using kmalloc().
> >
> > As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt.
> >
> > PERFORMANCE RESULTS
> > I measured time to encrypt/decrypt a memory buffer of varying sizes with
> > xts(ecb-aes-aesni) using a tool I wrote ([2]) and the results suggest
> > that after this patch the performance is either better or comparable for
> > both small and large buffers. Note that there is a lot of noise in the
> > measurements, but the overall difference is easy to see.
> >
> > Old code:
> > ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
> > xts(aes)   256      64        331            328
> > xts(aes)   384      64        332            333
> > xts(aes)   512      64        338            348
> > xts(aes)   256      512       889            920
> > xts(aes)   384      512       1019           993
> > xts(aes)   512      512       1032           990
> > xts(aes)   256      4096      2152           2292
> > xts(aes)   384      4096      2453           2597
> > xts(aes)   512      4096      3041           2641
> > xts(aes)   256      16384     9443           8027
> > xts(aes)   384      16384     8536           8925
> > xts(aes)   512      16384     9232           9417
> > xts(aes)   256      32768     16383          14897
> > xts(aes)   384      32768     17527          16102
> > xts(aes)   512      32768     18483          17322
> >
> > New code:
> > ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
> > xts(aes)   256      64        328            324
> > xts(aes)   384      64        324            319
> > xts(aes)   512      64        320            322
> > xts(aes)   256      512       476            473
> > xts(aes)   384      512       509            492
> > xts(aes)   512      512       531            514
> > xts(aes)   256      4096      2132           1829
> > xts(aes)   384      4096      2357           2055
> > xts(aes)   512      4096      2178           2027
> > xts(aes)   256      16384     6920           6983
> > xts(aes)   384      16384     8597           7505
> > xts(aes)   512      16384     7841           8164
> > xts(aes)   256      32768     13468          12307
> > xts(aes)   384      32768     14808          13402
> > xts(aes)   512      32768     15753          14636
> >
> > [1] https://lkml.org/lkml/2018/8/23/1315
> > [2] https://gitlab.com/omos/linux-crypto-bench
> >
> > Signed-off-by: Ondrej Mosnacek 
> > ---
> >  crypto/xts.c | 265 ---
> >  1 file changed, 39 insertions(+), 226 deletions(-)
> >
> > Changes in v3:
> >   - add comment explaining the new approach as suggested by Eric
> >   - ensure correct alignment in second xor_tweak() pass
> >   - align performance results table header to the right
> >
> > v2: https://www.spinics.net/lists/linux-crypto/msg34799.html
> > Changes in v2:
> >   - rebase to latest cryptodev tree
> >
> > v1: https://www.spinics.net/lists/linux-crypto/msg34776.html
> >
> > diff --git a/crypto/xts.

[dm-devel] [PATCH 0/2] crypto: lrw - Simplify and optimize the LRW template

2018-09-07 Thread Ondrej Mosnacek
This patchset is a follow-up to a similar patch for XTS:
https://patchwork.kernel.org/patch/10588775/

The first patch applies a small optimization to the tweak computation
and the second simplifies away the use of the auxiliary buffer used to
cache the computed tweaks.

Ondrej Mosnacek (2):
  crypto: lrw - Optimize tweak computation
  crypto: lrw - Do not use auxiliary buffer

 crypto/lrw.c | 321 +++
 1 file changed, 66 insertions(+), 255 deletions(-)

-- 
2.17.1



[dm-devel] [PATCH 2/2] crypto: lrw - Do not use auxiliary buffer

2018-09-07 Thread Ondrej Mosnacek
This patch simplifies the LRW template to recompute the LRW tweaks from
scratch in the second pass and thus also removes the need to allocate a
dynamic buffer using kmalloc().

As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt.

PERFORMANCE MEASUREMENTS (x86_64)
Performed using: https://gitlab.com/omos/linux-crypto-bench
Crypto driver used: lrw(ecb-aes-aesni)

The results show that the new code has about the same performance as the
old code. For 512-byte messages it even seems to be slightly faster, but
that might be just noise.

Before:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
lrw(aes)   256      64        200            203
lrw(aes)   320      64        202            204
lrw(aes)   384      64        204            205
lrw(aes)   256      512       415            415
lrw(aes)   320      512       432            440
lrw(aes)   384      512       449            451
lrw(aes)   256      4096      1838           1995
lrw(aes)   320      4096      2123           1980
lrw(aes)   384      4096      2100           2119
lrw(aes)   256      16384     7183           6954
lrw(aes)   320      16384     7844           7631
lrw(aes)   384      16384     8256           8126
lrw(aes)   256      32768     14772          14484
lrw(aes)   320      32768     15281          15431
lrw(aes)   384      32768     16469          16293

After:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
lrw(aes)   256      64        197            196
lrw(aes)   320      64        200            197
lrw(aes)   384      64        203            199
lrw(aes)   256      512       385            380
lrw(aes)   320      512       401            395
lrw(aes)   384      512       415            415
lrw(aes)   256      4096      1869           1846
lrw(aes)   320      4096      2080           1981
lrw(aes)   384      4096      2160           2109
lrw(aes)   256      16384     7077           7127
lrw(aes)   320      16384     7807           7766
lrw(aes)   384      16384     8108           8357
lrw(aes)   256      32768     14111          14454
lrw(aes)   320      32768     15268          15082
lrw(aes)   384      32768     16581          16250

[1] https://lkml.org/lkml/2018/8/23/1315

Signed-off-by: Ondrej Mosnacek 
---
 crypto/lrw.c | 274 ---
 1 file changed, 42 insertions(+), 232 deletions(-)

diff --git a/crypto/lrw.c b/crypto/lrw.c
index b4f30b6f16d6..990eb0f2f51e 100644
--- a/crypto/lrw.c
+++ b/crypto/lrw.c
@@ -29,8 +29,6 @@
 #include 
 #include 
 
-#define LRW_BUFFER_SIZE 128u
-
 #define LRW_BLOCK_SIZE 16
 
 struct priv {
@@ -56,19 +54,7 @@ struct priv {
 };
 
 struct rctx {
-   be128 buf[LRW_BUFFER_SIZE / sizeof(be128)];
-
-   be128 t;
-
-   be128 *ext;
-
-   struct scatterlist srcbuf[2];
-   struct scatterlist dstbuf[2];
-   struct scatterlist *src;
-   struct scatterlist *dst;
-
-   unsigned int left;
-
+   be128 t, orig_iv;
struct skcipher_request subreq;
 };
 
@@ -135,85 +121,26 @@ static int next_index(u32 *counter)
return res;
 }
 
-static int post_crypt(struct skcipher_request *req)
+/*
+ * We compute the tweak masks twice (both before and after the ECB encryption or
+ * decryption) to avoid having to allocate a temporary buffer and/or make
+ * multiple calls to the 'ecb(..)' instance, which usually would be slower than
+ * just doing the gf128mul_x_ble() calls again.
+ */
+static int xor_tweak(struct skcipher_request *req, struct skcipher_request *subreq)
 {
-   struct rctx *rctx = skcipher_request_ctx(req);
-   be128 *buf = rctx->ext ?: rctx->buf;
-   struct skcipher_request *subreq;
const int bs = LRW_BLOCK_SIZE;
-   struct skcipher_walk w;
-   struct scatterlist *sg;
-   unsigned offset;
-   int err;
-
-   subreq = &rctx->subreq;
-   err = skcipher_walk_virt(&w, subreq, false);
-
-   while (w.nbytes) {
-   unsigned int avail = w.nbytes;
-   be128 *wdst;
-
-   wdst = w.dst.virt.addr;
-
-   do {
-   be128_xor(wdst, buf++, wdst);
-   wdst++;
-   } while ((avail -= bs) >= bs);
-
-   err 

[dm-devel] [PATCH 1/2] crypto: lrw - Optimize tweak computation

2018-09-07 Thread Ondrej Mosnacek
This patch rewrites the tweak computation to a slightly simpler method
that performs fewer bswaps. Based on performance measurements, the new
code seems to provide slightly better performance than the old one.

PERFORMANCE MEASUREMENTS (x86_64)
Performed using: https://gitlab.com/omos/linux-crypto-bench
Crypto driver used: lrw(ecb-aes-aesni)

Before:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
lrw(aes)   256      64        204            286
lrw(aes)   320      64        227            203
lrw(aes)   384      64        208            204
lrw(aes)   256      512       441            439
lrw(aes)   320      512       456            455
lrw(aes)   384      512       469            483
lrw(aes)   256      4096      2136           2190
lrw(aes)   320      4096      2161           2213
lrw(aes)   384      4096      2295           2369
lrw(aes)   256      16384     7692           7868
lrw(aes)   320      16384     8230           8691
lrw(aes)   384      16384     8971           8813
lrw(aes)   256      32768     15336          15560
lrw(aes)   320      32768     16410          16346
lrw(aes)   384      32768     18023          17465

After:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
lrw(aes)   256      64        200            203
lrw(aes)   320      64        202            204
lrw(aes)   384      64        204            205
lrw(aes)   256      512       415            415
lrw(aes)   320      512       432            440
lrw(aes)   384      512       449            451
lrw(aes)   256      4096      1838           1995
lrw(aes)   320      4096      2123           1980
lrw(aes)   384      4096      2100           2119
lrw(aes)   256      16384     7183           6954
lrw(aes)   320      16384     7844           7631
lrw(aes)   384      16384     8256           8126
lrw(aes)   256      32768     14772          14484
lrw(aes)   320      32768     15281          15431
lrw(aes)   384      32768     16469          16293

Signed-off-by: Ondrej Mosnacek 
---
 crypto/lrw.c | 49 +
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/crypto/lrw.c b/crypto/lrw.c
index 393a782679c7..b4f30b6f16d6 100644
--- a/crypto/lrw.c
+++ b/crypto/lrw.c
@@ -120,30 +120,19 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key,
return 0;
 }
 
-static inline void inc(be128 *iv)
+static int next_index(u32 *counter)
 {
-   be64_add_cpu(&iv->b, 1);
-   if (!iv->b)
-   be64_add_cpu(&iv->a, 1);
-}
-
-/* this returns the number of consequative 1 bits starting
- * from the right, get_index128(00 00 00 00 00 00 ... 00 00 10 FB) = 2 */
-static inline int get_index128(be128 *block)
-{
-   int x;
-   __be32 *p = (__be32 *) block;
-
-   for (p += 3, x = 0; x < 128; p--, x += 32) {
-   u32 val = be32_to_cpup(p);
-
-   if (!~val)
-   continue;
+   int i, res = 0;
 
-   return x + ffz(val);
+   for (i = 0; i < 4; i++) {
+   if (counter[i] + 1 != 0) {
+   res += ffz(counter[i]++);
+   break;
+   }
+   counter[i] = 0;
+   res += 32;
}
-
-   return x;
+   return res;
 }
 
 static int post_crypt(struct skcipher_request *req)
@@ -209,8 +198,9 @@ static int pre_crypt(struct skcipher_request *req)
struct scatterlist *sg;
unsigned cryptlen;
unsigned offset;
-   be128 *iv;
bool more;
+   __u32 *iv;
+   u32 counter[4];
int err;
 
subreq = &rctx->subreq;
@@ -227,6 +217,11 @@ static int pre_crypt(struct skcipher_request *req)
err = skcipher_walk_virt(&w, subreq, false);
iv = w.iv;
 
+   counter[0] = be32_to_cpu(iv[3]);
+   counter[1] = be32_to_cpu(iv[2]);
+   counter[2] = be32_to_cpu(iv[1]);
+   counter[3] = be32_to_cpu(iv[0]);
+
while (w.nbytes) {
unsigned int avail = w.nbytes;
be128 *wsrc;
@@ -242,10 +237,16 @@ static int pre_crypt(struct skcipher_request *req)
/* T <- I*Key2, using the optimization
 * discussed in the specification

[dm-devel] [PATCH v3] crypto: xts - Drop use of auxiliary buffer

2018-09-06 Thread Ondrej Mosnacek
Since commit acb9b159c784 ("crypto: gf128mul - define gf128mul_x_* in
gf128mul.h"), the gf128mul_x_*() functions are very fast and therefore
caching the computed XTS tweaks has only negligible advantage over
computing them twice.

In fact, since the current caching implementation limits the size of
the calls to the child ecb(...) algorithm to PAGE_SIZE (usually 4096 B),
it is often actually slower than the simple recomputing implementation.

This patch simplifies the XTS template to recompute the XTS tweaks from
scratch in the second pass and thus also removes the need to allocate a
dynamic buffer using kmalloc().

As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt.

PERFORMANCE RESULTS
I measured time to encrypt/decrypt a memory buffer of varying sizes with
xts(ecb-aes-aesni) using a tool I wrote ([2]) and the results suggest
that after this patch the performance is either better or comparable for
both small and large buffers. Note that there is a lot of noise in the
measurements, but the overall difference is easy to see.

Old code:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
xts(aes)   256      64        331            328
xts(aes)   384      64        332            333
xts(aes)   512      64        338            348
xts(aes)   256      512       889            920
xts(aes)   384      512       1019           993
xts(aes)   512      512       1032           990
xts(aes)   256      4096      2152           2292
xts(aes)   384      4096      2453           2597
xts(aes)   512      4096      3041           2641
xts(aes)   256      16384     9443           8027
xts(aes)   384      16384     8536           8925
xts(aes)   512      16384     9232           9417
xts(aes)   256      32768     16383          14897
xts(aes)   384      32768     17527          16102
xts(aes)   512      32768     18483          17322

New code:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
xts(aes)   256      64        328            324
xts(aes)   384      64        324            319
xts(aes)   512      64        320            322
xts(aes)   256      512       476            473
xts(aes)   384      512       509            492
xts(aes)   512      512       531            514
xts(aes)   256      4096      2132           1829
xts(aes)   384      4096      2357           2055
xts(aes)   512      4096      2178           2027
xts(aes)   256      16384     6920           6983
xts(aes)   384      16384     8597           7505
xts(aes)   512      16384     7841           8164
xts(aes)   256      32768     13468          12307
xts(aes)   384      32768     14808          13402
xts(aes)   512      32768     15753          14636

[1] https://lkml.org/lkml/2018/8/23/1315
[2] https://gitlab.com/omos/linux-crypto-bench

Signed-off-by: Ondrej Mosnacek 
---
 crypto/xts.c | 265 ---
 1 file changed, 39 insertions(+), 226 deletions(-)

Changes in v3:
  - add comment explaining the new approach as suggested by Eric
  - ensure correct alignment in second xor_tweak() pass
  - align performance results table header to the right

v2: https://www.spinics.net/lists/linux-crypto/msg34799.html
Changes in v2:
  - rebase to latest cryptodev tree

v1: https://www.spinics.net/lists/linux-crypto/msg34776.html

diff --git a/crypto/xts.c b/crypto/xts.c
index ccf55fbb8bc2..24cfecdec565 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -26,8 +26,6 @@
 #include 
 #include 
 
-#define XTS_BUFFER_SIZE 128u
-
 struct priv {
struct crypto_skcipher *child;
struct crypto_cipher *tweak;
@@ -39,19 +37,7 @@ struct xts_instance_ctx {
 };
 
 struct rctx {
-   le128 buf[XTS_BUFFER_SIZE / sizeof(le128)];
-
le128 t;
-
-   le128 *ext;
-
-   struct scatterlist srcbuf[2];
-   struct scatterlist dstbuf[2];
-   struct scatterlist *src;
-   struct scatterlist *dst;
-
-   unsigned int left;
-
struct skcipher_request subreq;
 };
 
@@ -96,265 +82,92 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key,
return err;
 }
 
-static int post_crypt(struct skcipher_request *req)
+/*
+ * We compute the tweak masks twice (both before and after the ECB encryption or
+ * decryption) to avoid having to 

Re: [dm-devel] [PATCH v2] crypto: xts - Drop use of auxiliary buffer

2018-09-05 Thread Ondrej Mosnacek
Hi Eric,

On Wed, Sep 5, 2018 at 8:32 AM Eric Biggers  wrote:
> Hi Ondrej,
>
> On Tue, Sep 04, 2018 at 10:06:42AM +0200, Ondrej Mosnacek wrote:
> > Since commit acb9b159c784 ("crypto: gf128mul - define gf128mul_x_* in
> > gf128mul.h"), the gf128mul_x_*() functions are very fast and therefore
> > caching the computed XTS tweaks has only negligible advantage over
> > computing them twice.
> >
> > In fact, since the current caching implementation limits the size of
> > the calls to the child ecb(...) algorithm to PAGE_SIZE (usually 4096 B),
> > it is often actually slower than the simple recomputing implementation.
> >
> > This patch simplifies the XTS template to recompute the XTS tweaks from
> > scratch in the second pass and thus also removes the need to allocate a
> > dynamic buffer using kmalloc().
> >
> > As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt.
> >
> > PERFORMANCE RESULTS
> > I measured time to encrypt/decrypt a memory buffer of varying sizes with
> > xts(ecb-aes-aesni) using a tool I wrote ([2]) and the results suggest
> > that after this patch the performance is either better or comparable for
> > both small and large buffers. Note that there is a lot of noise in the
> > measurements, but the overall difference is easy to see.
> >
> > Old code:
> > ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
> >  xts(aes)      256        64            331            328
> >  xts(aes)      384        64            332            333
> >  xts(aes)      512        64            338            348
> >  xts(aes)      256       512            889            920
> >  xts(aes)      384       512           1019            993
> >  xts(aes)      512       512           1032            990
> >  xts(aes)      256      4096           2152           2292
> >  xts(aes)      384      4096           2453           2597
> >  xts(aes)      512      4096           3041           2641
> >  xts(aes)      256     16384           9443           8027
> >  xts(aes)      384     16384           8536           8925
> >  xts(aes)      512     16384           9232           9417
> >  xts(aes)      256     32768          16383          14897
> >  xts(aes)      384     32768          17527          16102
> >  xts(aes)      512     32768          18483          17322
> >
> > New code:
> > ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
> >  xts(aes)      256        64            328            324
> >  xts(aes)      384        64            324            319
> >  xts(aes)      512        64            320            322
> >  xts(aes)      256       512            476            473
> >  xts(aes)      384       512            509            492
> >  xts(aes)      512       512            531            514
> >  xts(aes)      256      4096           2132           1829
> >  xts(aes)      384      4096           2357           2055
> >  xts(aes)      512      4096           2178           2027
> >  xts(aes)      256     16384           6920           6983
> >  xts(aes)      384     16384           8597           7505
> >  xts(aes)      512     16384           7841           8164
> >  xts(aes)      256     32768          13468          12307
> >  xts(aes)      384     32768          14808          13402
> >  xts(aes)      512     32768          15753          14636
>
> Can you align the headers of these tables?

Sure.

>
> > +static int xor_tweak(struct rctx *rctx, struct skcipher_request *req)
> >  {
> > - struct rctx *rctx = skcipher_request_ctx(req);
> > - le128 *buf = rctx->ext ?: rctx->buf;
> > - struct skcipher_request *subreq;
> >   const int bs = XTS_BLOCK_SIZE;
> >   struct skcipher_walk w;
> > - struct scatterlist *sg;
> > - unsigned offset;
> > + le128 t = rctx->t;
> >   int err;
>
> Maybe you could add a brief comment above xor_tweak() explaining the design
> choice for posterity, e.g.:
>
> /*
>  * We compute the tweak masks twice (both before and after the ECB encryption or
>  * decryption) to avoid having to allocate a temporary buffer, which u

[dm-devel] [PATCH v2] crypto: xts - Drop use of auxiliary buffer

2018-09-04 Thread Ondrej Mosnacek
Since commit acb9b159c784 ("crypto: gf128mul - define gf128mul_x_* in
gf128mul.h"), the gf128mul_x_*() functions are very fast and therefore
caching the computed XTS tweaks has only negligible advantage over
computing them twice.

In fact, since the current caching implementation limits the size of
the calls to the child ecb(...) algorithm to PAGE_SIZE (usually 4096 B),
it is often actually slower than the simple recomputing implementation.

This patch simplifies the XTS template to recompute the XTS tweaks from
scratch in the second pass and thus also removes the need to allocate a
dynamic buffer using kmalloc().

As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt.

PERFORMANCE RESULTS
I measured time to encrypt/decrypt a memory buffer of varying sizes with
xts(ecb-aes-aesni) using a tool I wrote ([2]) and the results suggest
that after this patch the performance is either better or comparable for
both small and large buffers. Note that there is a lot of noise in the
measurements, but the overall difference is easy to see.

Old code:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
 xts(aes)      256        64            331            328
 xts(aes)      384        64            332            333
 xts(aes)      512        64            338            348
 xts(aes)      256       512            889            920
 xts(aes)      384       512           1019            993
 xts(aes)      512       512           1032            990
 xts(aes)      256      4096           2152           2292
 xts(aes)      384      4096           2453           2597
 xts(aes)      512      4096           3041           2641
 xts(aes)      256     16384           9443           8027
 xts(aes)      384     16384           8536           8925
 xts(aes)      512     16384           9232           9417
 xts(aes)      256     32768          16383          14897
 xts(aes)      384     32768          17527          16102
 xts(aes)      512     32768          18483          17322

New code:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
 xts(aes)      256        64            328            324
 xts(aes)      384        64            324            319
 xts(aes)      512        64            320            322
 xts(aes)      256       512            476            473
 xts(aes)      384       512            509            492
 xts(aes)      512       512            531            514
 xts(aes)      256      4096           2132           1829
 xts(aes)      384      4096           2357           2055
 xts(aes)      512      4096           2178           2027
 xts(aes)      256     16384           6920           6983
 xts(aes)      384     16384           8597           7505
 xts(aes)      512     16384           7841           8164
 xts(aes)      256     32768          13468          12307
 xts(aes)      384     32768          14808          13402
 xts(aes)      512     32768          15753          14636

[1] https://lkml.org/lkml/2018/8/23/1315
[2] https://gitlab.com/omos/linux-crypto-bench

Cc: Mikulas Patocka 
Signed-off-by: Ondrej Mosnacek 
---
 crypto/xts.c | 258 ++-
 1 file changed, 30 insertions(+), 228 deletions(-)

v1: https://www.spinics.net/lists/linux-crypto/msg34776.html
Changes in v2:
  - rebase to latest cryptodev tree

diff --git a/crypto/xts.c b/crypto/xts.c
index ccf55fbb8bc2..6c49e76df8ca 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -26,8 +26,6 @@
 #include 
 #include 
 
-#define XTS_BUFFER_SIZE 128u
-
 struct priv {
struct crypto_skcipher *child;
struct crypto_cipher *tweak;
@@ -39,19 +37,7 @@ struct xts_instance_ctx {
 };
 
 struct rctx {
-   le128 buf[XTS_BUFFER_SIZE / sizeof(le128)];
-
le128 t;
-
-   le128 *ext;
-
-   struct scatterlist srcbuf[2];
-   struct scatterlist dstbuf[2];
-   struct scatterlist *src;
-   struct scatterlist *dst;
-
-   unsigned int left;
-
struct skcipher_request subreq;
 };
 
@@ -96,265 +82,81 @@ static int setkey(struct crypto_skcipher *parent, const u8 
*key,
return err;
 }
 
-static int post_crypt(struct skcipher_request *req)
+static int xor_tweak(struct rctx *rctx, struct skcipher_request *req)
 {
-   struct rctx *rctx = skcipher_request_ctx(req);
-   le128 *buf = rctx->ext ?: rctx->buf;
-   struct skcipher_request *subreq;
const int bs = XTS_BLOCK_SIZE;
struct skcipher_walk w;
-   struct scatterlist *sg;
-   unsigned offset;
+   l

[dm-devel] [PATCH] crypto: xts - Drop use of auxiliary buffer

2018-09-03 Thread Ondrej Mosnacek
Hi Herbert, Mikulas,

I noticed the discussion about using kmalloc() inside crypto code and it
made me wonder if the code in xts.c can actually be simplified to not
require kmalloc() at all, while not badly affecting performance. I played
around with it a little and it turns out we can drop the whole caching
of tweak blocks, reducing code size by ~200 lines and not only preserve,
but even improve the performance in some cases. See the full patch
below.

Obviously, this doesn't solve the general issue of using kmalloc() in
crypto API routines, but it does resolve the original reported problem
and also makes the XTS code easier to maintain.

Regards,
Ondrej

->8-
Since commit acb9b159c784 ("crypto: gf128mul - define gf128mul_x_* in
gf128mul.h"), the gf128mul_x_*() functions are very fast and therefore
caching the computed XTS tweaks has only negligible advantage over
computing them twice.

In fact, since the current caching implementation limits the size of
the calls to the child ecb(...) algorithm to PAGE_SIZE (usually 4096 B),
it is often actually slower than the simple recomputing implementation.

This patch simplifies the XTS template to recompute the XTS tweaks from
scratch in the second pass and thus also removes the need to allocate a
dynamic buffer using kmalloc().

As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt.

PERFORMANCE RESULTS
I measured time to encrypt/decrypt a memory buffer of varying sizes with
xts(ecb-aes-aesni) using a tool I wrote ([2]) and the results suggest
that after this patch the performance is either better or comparable for
both small and large buffers. Note that there is a lot of noise in the
measurements, but the overall difference is easy to see.

Old code:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
 xts(aes)      256        64            331            328
 xts(aes)      384        64            332            333
 xts(aes)      512        64            338            348
 xts(aes)      256       512            889            920
 xts(aes)      384       512           1019            993
 xts(aes)      512       512           1032            990
 xts(aes)      256      4096           2152           2292
 xts(aes)      384      4096           2453           2597
 xts(aes)      512      4096           3041           2641
 xts(aes)      256     16384           9443           8027
 xts(aes)      384     16384           8536           8925
 xts(aes)      512     16384           9232           9417
 xts(aes)      256     32768          16383          14897
 xts(aes)      384     32768          17527          16102
 xts(aes)      512     32768          18483          17322

New code:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
 xts(aes)      256        64            328            324
 xts(aes)      384        64            324            319
 xts(aes)      512        64            320            322
 xts(aes)      256       512            476            473
 xts(aes)      384       512            509            492
 xts(aes)      512       512            531            514
 xts(aes)      256      4096           2132           1829
 xts(aes)      384      4096           2357           2055
 xts(aes)      512      4096           2178           2027
 xts(aes)      256     16384           6920           6983
 xts(aes)      384     16384           8597           7505
 xts(aes)      512     16384           7841           8164
 xts(aes)      256     32768          13468          12307
 xts(aes)      384     32768          14808          13402
 xts(aes)      512     32768          15753          14636

[1] https://lkml.org/lkml/2018/8/23/1315
[2] https://gitlab.com/omos/linux-crypto-bench

Cc: Mikulas Patocka 
Signed-off-by: Ondrej Mosnacek 
---
 crypto/xts.c | 258 ++-
 1 file changed, 30 insertions(+), 228 deletions(-)

diff --git a/crypto/xts.c b/crypto/xts.c
index 12284183bd20..6c49e76df8ca 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -26,8 +26,6 @@
 #include 
 #include 
 
-#define XTS_BUFFER_SIZE 128u
-
 struct priv {
struct crypto_skcipher *child;
struct crypto_cipher *tweak;
@@ -39,19 +37,7 @@ struct xts_instance_ctx {
 };
 
 struct rctx {
-   le128 buf[XTS_BUFFER_SIZE / sizeof(le128)];
-
le128 t;
-
-   le128 *ext;
-
-   struct scatterlist srcbuf[2];
-   struct scatterlist dstbuf[2];
-   struct scatterlist *src;
-   struct scatterlist *dst;
-
-   unsigned int left

[dm-devel] [PATCH] dm integrity: Reject mappings too large for device

2017-06-06 Thread Ondrej Mosnacek
Before this patch, dm-integrity would successfully create mappings whose
number of sectors was greater than the provided data sector count.
Attempts to read sectors of this mapping that were beyond the provided
data sector count would then yield run-time messages of the form
"device-mapper: integrity: Too big sector number: ...".

This patch fixes this by emitting an error when the requested mapping
size is bigger than the provided data sector count.

Cc: Mikulas Patocka 
Cc: Milan Broz 
Signed-off-by: Ondrej Mosnacek 
---
 drivers/md/dm-integrity.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 7910bfe50da4..4ab10cf718c9 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -3040,6 +3040,11 @@ static int dm_integrity_ctr(struct dm_target *ti, 
unsigned argc, char **argv)
ti->error = "The device is too small";
goto bad;
}
+   if (ti->len > ic->provided_data_sectors) {
+   r = -EINVAL;
+   ti->error = "Not enough provided sectors for requested mapping 
size";
+   goto bad;
+   }
 
if (!buffer_sectors)
buffer_sectors = 1;
-- 
2.11.0



Re: [dm-devel] dm-crypt IV generation (summary)

2017-05-18 Thread Ondrej Mosnacek
2017-04-07 8:12 GMT+02:00 Herbert Xu :
> On Fri, Mar 10, 2017 at 02:44:26PM +0100, Ondrej Mosnacek wrote:
>>
>> ISSUES:
>> a) The 'keycount' parameter.
>> In order to support multi-key modes from Loop-AES,
>> dm-crypt accepts a keycount parameter which, if it != 1, causes
>> consecutive sectors to be encrypted with a different key. This
>> parameter can be specified with any of the cipher modes, which makes
>> porting the whole scale of modes supported by dm-crypt to crypto API
>> rather messy, since a lot of dm-crypt specific stuff needs to be moved
>> into the crypto drivers.
>
> Actually I think this one can probably easily handled in the crypto
> layer.  All we need is to add a multikey template that sits on top
> of an underlying IV generator.  The multikey instance can then accept
> a key length that is a multiple of the underlying key length.

I thought of that, too, but unfortunately with TCW the key structure is:

| KEY 1 | KEY 2 | ... | KEY n | IV SEED (size = IV size) | WHITENING (size = 16 bytes) |

So part of the key needs to be processed before it is split into multiple keys.

Also with the LMK mode, there is a similar optional key appendix,
which is of the same size as the other subkeys.
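
Just to illustrate the ordering problem, carving up such a combined key blob
would have to look roughly like this (struct and function names below are
made up for illustration, this is not dm-crypt code):

/* assumes <linux/types.h> and <linux/errno.h> */
struct tcw_key_layout {			/* hypothetical */
	const u8 *subkeys;		/* keycount subkeys, laid out back to back */
	const u8 *iv_seed;		/* iv_size bytes, shared by all subkeys */
	const u8 *whitening;		/* 16 bytes, shared by all subkeys */
};

static int tcw_split_key(struct tcw_key_layout *out, const u8 *key,
			 unsigned int keylen, unsigned int keycount,
			 unsigned int iv_size)
{
	unsigned int tail = iv_size + 16;	/* IV seed + whitening */

	if (keylen <= tail || (keylen - tail) % keycount)
		return -EINVAL;

	out->subkeys   = key;			/* subkey i starts at i * subkey_size */
	out->iv_seed   = key + keylen - tail;
	out->whitening = key + keylen - 16;
	return 0;
}

So the tail has to be stripped first, and only then can a per-sector subkey
be selected, which is exactly what a naive multikey template splitting the
key into equal parts could not do.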

>> b) New AEAD functionality; random IV generator.
>> The soon-to-be-added AEAD functionality in dm-crypt
>> introduces a new IV generator that generates IVs randomly and stores
>> them as sector metadata. This means IV generation cannot be handled
>> solely in the driver. Also, additional AEAD implementation of IV
>> generators would be eventually needed.
>
> Again I don't see the problem here.  IV generators are supposed
> to return the IV to the caller so that it can be transmitted.
>
> For example, the IV generated in IPsec is explicitly transmitted.
> Here we just need to store the IV.

My point is that it doesn't make much sense to have a crypto API alg
that calls get_random_bytes() as part of its implementation. IMHO,
that might tempt HW drivers to replace it with some crappy
alternatives, which wouldn't be good... Also, how would we test such
alg with testmgr?

I would say the best solution in this case would be to treat the
'random' IV mode as a special case in dm-crypt: the IVs would be
generated in dm-crypt and passed to a special template (which would
have a similar interface as the other IV generation templates) that
would just pass the generated IVs to underlying skciphers instead of
generating them from the sequence number. This template would need to
provide some way to pass the IVs, e.g. by prepending them to the
plaintext/ciphertext.
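
To make that idea a bit more concrete, the encrypt path of such a
pass-through template could look roughly like this (all names and the
request-context layout are hypothetical, and I am glossing over completion
handling; rctx is assumed to hold a subrequest, two scatterlist[2] arrays
for scatterwalk_ffwd() and an IV buffer of ivsize bytes):

static int ptiv_encrypt(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct ptiv_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);	/* holds ctx->child */
	struct ptiv_req_ctx *rctx = skcipher_request_ctx(req);
	unsigned int ivsize = crypto_skcipher_ivsize(ctx->child);
	struct scatterlist *src, *dst;

	/* peel off the IV that dm-crypt prepended to the sector data */
	scatterwalk_map_and_copy(rctx->iv, req->src, 0, ivsize, 0);
	src = scatterwalk_ffwd(rctx->srcbuf, req->src, ivsize);
	dst = scatterwalk_ffwd(rctx->dstbuf, req->dst, ivsize);

	skcipher_request_set_tfm(&rctx->subreq, ctx->child);
	skcipher_request_set_callback(&rctx->subreq, req->base.flags,
				      req->base.complete, req->base.data);
	skcipher_request_set_crypt(&rctx->subreq, src, dst,
				   req->cryptlen - ivsize, rctx->iv);

	return crypto_skcipher_encrypt(&rctx->subreq);
}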

Cheers,
Ondrej



[dm-devel] dm-crypt IV generation (summary)

2017-03-13 Thread Ondrej Mosnacek
Hi all,

I was tasked to post a summary the whole dm-crypt IV generation
problem and all the suggested solutions along with their drawbacks, so
here it goes...

PROBLEM STATEMENT:
Currently, dm-crypt uses a fixed 512-byte sector size and handles
en-/decrypting of a bio by submitting a separate request to the crypto
API for each sector. This causes a performance problem with some
crypto drivers that have a high processing overhead for small
requests, i.e. most crypto accelerators [1][2] and also e.g. the
AES-NI driver [3].

POSSIBLE SOLUTIONS:
1. Move IV generator algorithms to crypto API and allow them to
process the whole bio at once. ([5] or something equivalent)
ISSUES:
a) The 'keycount' parameter.
In order to support multi-key modes from Loop-AES,
dm-crypt accepts a keycount parameter which, if it != 1, causes
consecutive sectors to be encrypted with a different key. This
parameter can be specified with any of the cipher modes, which makes
porting the whole scale of modes supported by dm-crypt to crypto API
rather messy, since a lot of dm-crypt specific stuff needs to be moved
into the crypto drivers.
b) New AEAD functionality; random IV generator.
The soon-to-be-added AEAD functionality in dm-crypt
introduces a new IV generator that generates IVs randomly and stores
them as sector metadata. This means IV generation cannot be handled
solely in the driver. Also, additional AEAD implementation of IV
generators would be eventually needed.

2. Restrict the keycount parameter to allow only reasonable
combinations and then move IV generators to crypto API.
In Loop-AES, the multi-key mode always uses exactly 64 keys and
there is no reasonable scenario where someone would like to use
keycount != 1 with an IV mode other than LMK. Thus it might be acceptable
to only work with multiple keys in LMK (and only allow keycount == 64
for it) and work with just one key in all other modes (and only allow
keycount == 1 for them). I.e., the cipher format would remain the
same, just with more restricted values.

Note that this would be basically a breaking change (the keycount
parameter is documented [4], so there may be users somewhere that use
it in some unusual combination...). Still, such combinations would
have to be used explicitly, as they are never used with common
LUKS/Loop-AES/TrueCrypt partitions (something like cryptsetup --type
plain ... or dmsetup would have to be used directly).

Applying this change would allow implementing the IV generators as
simple crypto algorithms that honor the usual interface (without
setkey hacks and passing of special structures via IV).

ISSUES:
a) Backward incompatibility (see above).
b) Again need to somehow handle the new 'random' IV generator.

3. Extend crypto API to allow passing multiple requests at once (each
with a separate IV) to crypto drivers.
(see [3])
ISSUES:
a) HW accelerators would have to specifically support this
scheme (unless they are somehow programmable).
I.e. accelerators that already have the plain64 IV
generator hard-coded could not fully benefit from this (I don't know
how many are already out there). However, future ones could implement
this 'universal' IV mode and enjoy basically the same benefit.

4. Implement support of 4KB sectors (or other sizes, too) in dm-crypt.
This would partially help, since on devices with 4K sectors the
size of each crypto request would then be 8x bigger and overhead would
be reduced without the need to mess with the dm-crypt IV generators.

ISSUES:
a) Only 4KB would be processed at once, not the whole bio (but
it may suffice).
b) Partitions encrypted as 4KB sectors could not be safely
moved to a 512B-sector device (writes would not be atomic).

QUESTIONS:
@Mike/dm-devel: Do you think it would be feasible to apply solution 2
(thus introducing the small incompatibility)?

REFERENCES:
[1] 
https://nelenkov.blogspot.com/2015/05/hardware-accelerated-disk-encryption-in.html
[2] https://lkml.org/lkml/2017/3/2/274
[3] https://www.mail-archive.com/linux-crypto@vger.kernel.org/msg23007.html
[4] https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt
[5] https://lkml.org/lkml/2017/2/7/182

Cheers,
Ondrej



Re: [dm-devel] [RFC PATCH v4] IV Generation algorithms for dm-crypt

2017-03-01 Thread Ondrej Mosnacek
2017-03-01 13:42 GMT+01:00 Gilad Ben-Yossef :
> It really is an observation about overhead of context switches between
> dm-crypt and
> whatever/wherever you handle crypto - be it an off CPU hardware engine
> or a bunch
> of parallel kernel threads running on other cores. You really want to
> burst as much as
> possible.

[...]

>> For XTS you need just simple linear IV. No problem with that, implementation
>> in crypto API and hw is trivial.
>>
>> But for compatible IV (that provides compatibility with loopAES and very old 
>> TrueCrypt),
>> these should be never ever implemented again anywhere.
>
>>
>> Specifically "tcw" is broken, insecure and provided here just to help people 
>> to migrate
>> from old Truecrypt containers. Even Truecrypt followers removed it from the 
>> codebase.
>> (It is basically combination of IV and slight modification of CBC mode. All
>> recent version switched to XTS and plain IV.)
>>
>> So building abstraction over something known to be broken and that is now 
>> intentionally
>> isolated inside dmcrypt is, in my opinion, really not a good idea.
>>
>
> I don't think anyone is interested in these modes. How do you support
> XTS and essiv in
> a generic way without supporting this broken modes is not something
> I'm clear on though.

Wouldn't adopting a bulk request API (something like what I tried to
do here [1]) that allows users to supply multiple messages, each with
their own IV, fulfill this purpose? That way, we wouldn't need to
introduce any new modes into Crypto API and the drivers/accelerators
would only need to provide bulk implementations of common modes
(xts(aes), cbc(aes), ...) to provide better performance for dm-crypt
(and possibly other users, too).

I'm not sure how exactly these crypto accelerators work, but wouldn't
it help if the drivers simply get more messages (in our case sectors)
in a single call? I wonder, would (efficiently) supporting such a
scheme require changes in the HW itself or could it be achieved just
by modifying driver code (let's say specifically for your CryptoCell
accelerator)?

Thanks,
Ondrej

[1] https://www.mail-archive.com/linux-crypto@vger.kernel.org/msg23007.html

>
> Thanks,
> Gilad
>
>
>
> --
> Gilad Ben-Yossef
> Chief Coffee Drinker
>
> "If you take a class in large-scale robotics, can you end up in a
> situation where the homework eats your dog?"
>  -- Jean-Baptiste Queru



[dm-devel] [RFC PATCH 4/6] crypto: simd - Add bulk request support

2017-01-12 Thread Ondrej Mosnacek
This patch adds proper support for the new bulk requests to the SIMD helpers.

Signed-off-by: Ondrej Mosnacek 
---
 crypto/simd.c | 61 +++
 1 file changed, 61 insertions(+)

diff --git a/crypto/simd.c b/crypto/simd.c
index 8820337..2ae5930 100644
--- a/crypto/simd.c
+++ b/crypto/simd.c
@@ -100,6 +100,64 @@ static int simd_skcipher_decrypt(struct skcipher_request 
*req)
return crypto_skcipher_decrypt(subreq);
 }
 
+static int simd_skcipher_encrypt_bulk(struct skcipher_bulk_request *req)
+{
+   struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req);
+   struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
+   struct skcipher_bulk_request *subreq;
+   struct crypto_skcipher *child;
+
+   subreq = skcipher_bulk_request_ctx(req);
+   *subreq = *req;
+
+   if (!may_use_simd() ||
+   (in_atomic() && cryptd_skcipher_queued(ctx->cryptd_tfm)))
+   child = &ctx->cryptd_tfm->base;
+   else
+   child = cryptd_skcipher_child(ctx->cryptd_tfm);
+
+   skcipher_bulk_request_set_tfm(subreq, child);
+
+   return crypto_skcipher_encrypt_bulk(subreq);
+}
+
+static int simd_skcipher_decrypt_bulk(struct skcipher_bulk_request *req)
+{
+   struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req);
+   struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
+   struct skcipher_bulk_request *subreq;
+   struct crypto_skcipher *child;
+
+   subreq = skcipher_bulk_request_ctx(req);
+   *subreq = *req;
+
+   if (!may_use_simd() ||
+   (in_atomic() && cryptd_skcipher_queued(ctx->cryptd_tfm)))
+   child = &ctx->cryptd_tfm->base;
+   else
+   child = cryptd_skcipher_child(ctx->cryptd_tfm);
+
+   skcipher_bulk_request_set_tfm(subreq, child);
+
+   return crypto_skcipher_decrypt_bulk(subreq);
+}
+
+static unsigned int simd_skcipher_reqsize_bulk(struct crypto_skcipher *tfm,
+  unsigned int maxmsgs)
+{
+   struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
+   struct crypto_skcipher *tfm_cryptd, *tfm_child;
+   unsigned int reqsize_cryptd, reqsize_child;
+
+   tfm_cryptd = &ctx->cryptd_tfm->base;
+   tfm_child = cryptd_skcipher_child(ctx->cryptd_tfm);
+
+   reqsize_cryptd = crypto_skcipher_bulk_reqsize(tfm_cryptd, maxmsgs);
+   reqsize_child = crypto_skcipher_bulk_reqsize(tfm_child, maxmsgs);
+   return sizeof(struct skcipher_bulk_request) +
+   max(reqsize_cryptd, reqsize_child);
+}
+
 static void simd_skcipher_exit(struct crypto_skcipher *tfm)
 {
struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
@@ -187,6 +245,9 @@ struct simd_skcipher_alg *simd_skcipher_create_compat(const 
char *algname,
alg->setkey = simd_skcipher_setkey;
alg->encrypt = simd_skcipher_encrypt;
alg->decrypt = simd_skcipher_decrypt;
+   alg->encrypt_bulk = simd_skcipher_encrypt_bulk;
+   alg->decrypt_bulk = simd_skcipher_decrypt_bulk;
+   alg->reqsize_bulk = simd_skcipher_reqsize_bulk;
 
err = crypto_register_skcipher(alg);
if (err)
-- 
2.9.3



[dm-devel] [RFC PATCH 6/6] dm-crypt: Add bulk crypto processing support

2017-01-12 Thread Ondrej Mosnacek
This patch converts dm-crypt to use bulk requests when invoking skcipher
operations, allowing the crypto drivers to process multiple sectors at once,
while reducing the overhead caused by the small sector size.

The new code detects if multiple sectors from a bio are contiguously stored
within a single page (which should almost always be the case), and in that case
processes all these sectors via a single bulk request.

Note that the bio can also consist of several (likely consecutive) pages, which
could all be bundled in a single request. However, since we need to specify an
upper bound on how many sectors we are going to send at once (and this bound
may affect the amount of memory allocated per single request), it is best to
just limit the request bundling to a single page.

Note that if the 'keycount' parameter of the cipher specification is set to a
value other than 1, dm-crypt still sends only one sector in each request, since
in such case the neighboring sectors are encrypted with different keys.

This change causes a detectable read/write speedup (about 5-10%) on a ramdisk
when AES-NI accelerated ciphers are used.
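
The per-page bound boils down to something like the following rough sketch
(not the patch code below; SECTOR_SHIFT is 9 as in device-mapper.h, and a
bio_vec is assumed not to cross a page boundary):

/* assumes <linux/bio.h> and <linux/kernel.h> */
#define MAX_CONSEC_SECTORS (1 << (PAGE_SHIFT - SECTOR_SHIFT))

static unsigned int sectors_in_current_bvec(struct bio *bio,
					    struct bvec_iter iter)
{
	struct bio_vec bv = bio_iter_iovec(bio, iter);

	/* the bio_vec stays within one page, so its length in sectors is
	 * the number of contiguous in-page sectors we can bundle; cap it
	 * at what a single bulk request is sized for */
	return min_t(unsigned int, bv.bv_len >> SECTOR_SHIFT,
		     MAX_CONSEC_SECTORS);
}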

Signed-off-by: Ondrej Mosnacek 
---
 drivers/md/dm-crypt.c | 254 --
 1 file changed, 165 insertions(+), 89 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 7c6c572..d3f69e1 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -37,6 +37,9 @@
 
 #define DM_MSG_PREFIX "crypt"
 
+/* for now, we only bundle consecutive sectors within a single page */
+#define MAX_CONSEC_SECTORS (1 << (PAGE_SHIFT - SECTOR_SHIFT))
+
 /*
  * context holding the current state of a multi-part conversion
  */
@@ -48,7 +51,7 @@ struct convert_context {
struct bvec_iter iter_out;
sector_t cc_sector;
atomic_t cc_pending;
-   struct skcipher_request *req;
+   struct skcipher_bulk_request *req;
 };
 
 /*
@@ -73,6 +76,7 @@ struct dm_crypt_request {
struct scatterlist sg_in;
struct scatterlist sg_out;
sector_t iv_sector;
+   sector_t sector_count;
 };
 
 struct crypt_config;
@@ -83,9 +87,9 @@ struct crypt_iv_operations {
void (*dtr)(struct crypt_config *cc);
int (*init)(struct crypt_config *cc);
int (*wipe)(struct crypt_config *cc);
-   int (*generator)(struct crypt_config *cc, u8 *iv,
+   int (*generator)(struct crypt_config *cc, u8 *iv, unsigned int sector,
 struct dm_crypt_request *dmreq);
-   int (*post)(struct crypt_config *cc, u8 *iv,
+   int (*post)(struct crypt_config *cc, u8 *iv, unsigned int sector,
struct dm_crypt_request *dmreq);
 };
 
@@ -163,14 +167,14 @@ struct crypt_config {
/*
 * Layout of each crypto request:
 *
-*   struct skcipher_request
+*   struct skcipher_bulk_request
 *  context
 *  padding
 *   struct dm_crypt_request
 *  padding
-*   IV
+*   IVs
 *
-* The padding is added so that dm_crypt_request and the IV are
+* The padding is added so that dm_crypt_request and the IVs are
 * correctly aligned.
 */
unsigned int dmreq_start;
@@ -245,20 +249,24 @@ static struct crypto_skcipher *any_tfm(struct 
crypt_config *cc)
  * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
  */
 
-static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
+static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *ivs, unsigned int i,
  struct dm_crypt_request *dmreq)
 {
+   u8 *iv = ivs + i * cc->iv_size;
+
memset(iv, 0, cc->iv_size);
-   *(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
+   *(__le32 *)iv = cpu_to_le32((dmreq->iv_sector + i) & 0xffffffff);
 
return 0;
 }
 
-static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
-   struct dm_crypt_request *dmreq)
+static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *ivs,
+   unsigned int i, struct dm_crypt_request *dmreq)
 {
+   u8 *iv = ivs + i * cc->iv_size;
+
memset(iv, 0, cc->iv_size);
-   *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
+   *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector + i);
 
return 0;
 }
@@ -410,13 +418,14 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, 
struct dm_target *ti,
return err;
 }
 
-static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
+static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *ivs, unsigned int i,
  struct dm_crypt_request *dmreq)
 {
struct crypto_cipher *essiv_tfm = cc->iv_private;
+   u8 *iv = ivs + i * cc->iv_size;
 
memset(iv, 0, cc->iv_size);
-   *(__le64 *)iv = cpu_to_le64(dmreq-&g

[dm-devel] [RFC PATCH 5/6] crypto: aesni-intel - Add bulk request support

2017-01-12 Thread Ondrej Mosnacek
This patch implements bulk request handling in the AES-NI crypto drivers.
The major advantage of this is that with bulk requests, the kernel_fpu_*
functions (which are usually quite slow) are now called only once for the whole
request.

Signed-off-by: Ondrej Mosnacek 
---
 arch/x86/crypto/aesni-intel_glue.c| 267 +++---
 arch/x86/crypto/glue_helper.c |  23 ++-
 arch/x86/include/asm/crypto/glue_helper.h |   2 +-
 3 files changed, 221 insertions(+), 71 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_glue.c 
b/arch/x86/crypto/aesni-intel_glue.c
index 36ca150..5f67afc 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -364,70 +364,116 @@ static int aesni_skcipher_setkey(struct crypto_skcipher 
*tfm, const u8 *key,
  crypto_skcipher_ctx(tfm), key, len);
 }
 
-static int ecb_encrypt(struct skcipher_request *req)
+typedef void (*aesni_crypt_t)(struct crypto_aes_ctx *ctx,
+ u8 *out, const u8 *in, unsigned int len);
+
+typedef void (*aesni_ivcrypt_t)(struct crypto_aes_ctx *ctx,
+   u8 *out, const u8 *in, unsigned int len,
+   u8 *iv);
+
+static int ecb_crypt(struct crypto_aes_ctx *ctx, struct skcipher_walk *walk,
+aesni_crypt_t crypt)
 {
-   struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-   struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
-   struct skcipher_walk walk;
unsigned int nbytes;
int err;
 
-   err = skcipher_walk_virt(&walk, req, true);
-
kernel_fpu_begin();
-   while ((nbytes = walk.nbytes)) {
-   aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes & AES_BLOCK_MASK);
+   while ((nbytes = walk->nbytes)) {
+   crypt(ctx, walk->dst.virt.addr, walk->src.virt.addr,
+ nbytes & AES_BLOCK_MASK);
nbytes &= AES_BLOCK_SIZE - 1;
-   err = skcipher_walk_done(&walk, nbytes);
+   err = skcipher_walk_done(walk, nbytes);
}
kernel_fpu_end();
 
return err;
 }
 
-static int ecb_decrypt(struct skcipher_request *req)
+static int cbc_crypt(struct crypto_aes_ctx *ctx, struct skcipher_walk *walk,
+aesni_ivcrypt_t crypt)
 {
-   struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-   struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
-   struct skcipher_walk walk;
unsigned int nbytes;
int err;
 
-   err = skcipher_walk_virt(&walk, req, true);
-
kernel_fpu_begin();
-   while ((nbytes = walk.nbytes)) {
-   aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes & AES_BLOCK_MASK);
+   while ((nbytes = walk->nbytes)) {
+   crypt(ctx, walk->dst.virt.addr, walk->src.virt.addr,
+ nbytes & AES_BLOCK_MASK, walk->iv);
nbytes &= AES_BLOCK_SIZE - 1;
-   err = skcipher_walk_done(&walk, nbytes);
+   err = skcipher_walk_done(walk, nbytes);
}
kernel_fpu_end();
 
return err;
 }
 
-static int cbc_encrypt(struct skcipher_request *req)
+static int ecb_encrypt(struct skcipher_request *req)
 {
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
struct skcipher_walk walk;
-   unsigned int nbytes;
int err;
 
err = skcipher_walk_virt(&walk, req, true);
+   if (err)
+   return err;
 
-   kernel_fpu_begin();
-   while ((nbytes = walk.nbytes)) {
-   aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes & AES_BLOCK_MASK, walk.iv);
-   nbytes &= AES_BLOCK_SIZE - 1;
-   err = skcipher_walk_done(&walk, nbytes);
-   }
-   kernel_fpu_end();
+   return ecb_crypt(ctx, &walk, aesni_ecb_enc);
+}
 
-   return err;
+static int ecb_decrypt(struct skcipher_request *req)
+{
+   struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+   struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+   struct skcipher_walk walk;
+   int err;
+
+   err = skcipher_walk_virt(&walk, req, true);
+   if (err)
+   return err;
+
+   return ecb_crypt(ctx, &walk, aesni_ecb_dec);
+}
+
+static int ecb_encrypt_bulk(struct skcipher_bulk_request *req)
+{
+   struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req);
+   struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+   struct skcipher_walk walk;
+   int err;
+
+   err = skcipher_walk_virt_bulk(&walk, req, true);
+   if (err)
+   return 

[dm-devel] [RFC PATCH 2/6] crypto: skcipher - Add bulk request support to walk

2017-01-12 Thread Ondrej Mosnacek
This patch tweaks skcipher_walk so it can be used with the new bulk requests.

The new skcipher_walk can be initialized either from a skcipher_request (in
which case its behavior is equivalent to the old code) or from a
skcipher_bulk_request, in which case the usage is almost identical, the most
significant exception being that skciphers which somehow tweak the IV
(e.g. XTS) must check the new nextmsg flag before processing each chunk and
re-tweak the IV if it is set. For other ciphers skcipher_walk automatically
switches to the next IV at message boundaries.

Signed-off-by: Ondrej Mosnacek 
---
 crypto/skcipher.c  | 192 +++--
 include/crypto/internal/skcipher.h |  10 +-
 2 files changed, 153 insertions(+), 49 deletions(-)

diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 8b6d684..b810e90 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -33,6 +33,7 @@ enum {
SKCIPHER_WALK_COPY = 1 << 2,
SKCIPHER_WALK_DIFF = 1 << 3,
SKCIPHER_WALK_SLEEP = 1 << 4,
+   SKCIPHER_WALK_HETEROGENOUS = 1 << 5,
 };
 
 struct skcipher_walk_buffer {
@@ -94,6 +95,41 @@ static inline u8 *skcipher_get_spot(u8 *start, unsigned int 
len)
return max(start, end_page);
 }
 
+static int skcipher_copy_iv(struct skcipher_walk *walk)
+{
+   unsigned a = crypto_tfm_ctx_alignment() - 1;
+   unsigned alignmask = walk->alignmask;
+   unsigned ivsize = walk->ivsize;
+   unsigned bs = walk->stride;
+   unsigned aligned_bs;
+   unsigned size;
+   u8 *iv;
+
+   aligned_bs = ALIGN(bs, alignmask);
+
+   /* Minimum size to align buffer by alignmask. */
+   size = alignmask & ~a;
+
+   if (walk->flags & SKCIPHER_WALK_PHYS)
+   size += ivsize;
+   else {
+   size += aligned_bs + ivsize;
+
+   /* Minimum size to ensure buffer does not straddle a page. */
+   size += (bs - 1) & ~(alignmask | a);
+   }
+
+   walk->buffer = kmalloc(size, skcipher_walk_gfp(walk));
+   if (!walk->buffer)
+   return -ENOMEM;
+
+   iv = PTR_ALIGN(walk->buffer, alignmask + 1);
+   iv = skcipher_get_spot(iv, bs) + aligned_bs;
+
+   walk->iv = memcpy(iv, walk->iv, walk->ivsize);
+   return 0;
+}
+
 static int skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize)
 {
u8 *addr;
@@ -108,9 +144,12 @@ static int skcipher_done_slow(struct skcipher_walk *walk, 
unsigned int bsize)
 int skcipher_walk_done(struct skcipher_walk *walk, int err)
 {
unsigned int n = walk->nbytes - err;
-   unsigned int nbytes;
+   unsigned int nbytes, nbytes_msg;
+
+   walk->nextmsg = false; /* reset the nextmsg flag */
 
nbytes = walk->total - n;
+   nbytes_msg = walk->total_msg - n;
 
if (unlikely(err < 0)) {
nbytes = 0;
@@ -139,8 +178,31 @@ int skcipher_walk_done(struct skcipher_walk *walk, int err)
if (err > 0)
err = 0;
 
+   if (nbytes && !nbytes_msg) {
+   walk->nextmsg = true;
+
+   /* write the output IV: */
+   if (walk->iv != walk->oiv)
+   memcpy(walk->oiv, walk->iv, walk->ivsize);
+
+   /* advance to the IV of next message: */
+   walk->oiv += walk->ivsize;
+   walk->iv = walk->oiv;
+
+   if (unlikely(((unsigned long)walk->iv & walk->alignmask))) {
+   err = skcipher_copy_iv(walk);
+   if (err)
+   return err;
+   }
+
+   nbytes_msg = *walk->nextmsgsize;
+   if (walk->flags & SKCIPHER_WALK_HETEROGENOUS)
+   ++walk->nextmsgsize;
+   }
+
+   walk->nbytes = nbytes_msg;
+   walk->total_msg = nbytes_msg;
walk->total = nbytes;
-   walk->nbytes = nbytes;
 
scatterwalk_advance(&walk->in, n);
scatterwalk_advance(&walk->out, n);
@@ -343,13 +405,13 @@ static int skcipher_walk_next(struct skcipher_walk *walk)
walk->flags &= ~(SKCIPHER_WALK_SLOW | SKCIPHER_WALK_COPY |
 SKCIPHER_WALK_DIFF);
 
-   n = walk->total;
+   n = walk->total_msg;
bsize = min(walk->stride, max(n, walk->blocksize));
n = scatterwalk_clamp(&walk->in, n);
n = scatterwalk_clamp(&walk->out, n);
 
if (unlikely(n < bsize)) {
-   if (unlikely(walk->total < walk->blocksize))
+   if (unlikely(walk->total_msg < walk->blocksize))
return skcipher_walk_done(walk, -EINVAL);
 
 slow_path:
@@ -388,41 +450,6 @@ static int skcipher_walk_next(struct skcipher_walk *walk)
 }
 EXPORT_SYMBOL_GPL(skcipher_walk_next);
 
-static 

[dm-devel] [RFC PATCH 0/6] Add bulk skcipher requests to crypto API and dm-crypt

2017-01-12 Thread Ondrej Mosnacek
Hi,

the goal of this patchset is to allow those skcipher API users that need to
process batches of small messages (especially dm-crypt) to do so efficiently.

The first patch introduces a new request type (and corresponding encrypt/decrypt
functions) to the skcipher API. The new API can be used to submit multiple
messages at once, thus enabling the drivers to reduce overhead as opposed to
processing each message separately.
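
From the caller's side, submitting a batch of equally-sized messages looks
roughly like this (a sketch based on the helpers introduced in patch 1/6;
the callback names are placeholders and the NULL msgsizes argument meaning
"all messages are msgsize bytes long" is how I intend the fixed-size case
to be used):

/* src_sg/dst_sg cover all messages back to back; ivs holds nmsgs IVs,
 * also stored back to back */
SKCIPHER_BULK_REQUEST_ON_STACK(breq, nmsgs, tfm);

skcipher_bulk_request_set_tfm(breq, tfm);
skcipher_bulk_request_set_callback(breq, CRYPTO_TFM_REQ_MAY_SLEEP,
				   my_complete, my_data);	/* placeholder callback */
skcipher_bulk_request_set_crypt(breq, src_sg, dst_sg, nmsgs,
				512, NULL, ivs);	/* 512 B per message */
err = crypto_skcipher_encrypt_bulk(breq);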

The skcipher drivers can provide support for the new request type by setting the
corresponding fields of their skcipher_alg structure. If 'native' support is not
provided by a driver (i.e. the fields are left NULL), the crypto API
transparently provides a generic fallback implementation, which simply processes
the bulk request as a set of standard requests on the same tfm.

The second patch extends skcipher_walk so it can be used for processing the new
bulk requests, while preserving equivalent functionality when used with standard
requests.

The third and fourth patches add native bulk request support to the cryptd and
SIMD helper wrappers, respectively.

The fifth patch adds bulk request support to the AES-NI skcipher drivers, in
order to provide an example for both implementing the bulk request processing
and the usage of the extended skcipher_walk in such implementation. Also, this
patch provides a slight optimization, since the kernel_fpu_* functions are
called just once per the whole bulk request. Note that both the standard and
bulk implementation mostly use the same code under the hood.

The last patch converts dm-crypt to use bulk requests and makes it submit
multiple sectors at once, whenever they are stored sequentially within a single
page.

With all the patches applied, I was able to measure a small speedup (~5-10%)
with AES-NI ciphers and a dm-crypt device mapped over a ramdisk.

To-be-done:
testing the bulk API in testmgr.c
documentation update

Ondrej Mosnacek (6):
  crypto: skcipher - Add bulk request processing API
  crypto: skcipher - Add bulk request support to walk
  crypto: cryptd - Add skcipher bulk request support
  crypto: simd - Add bulk request support
  crypto: aesni-intel - Add bulk request support
  dm-crypt: Add bulk crypto processing support

 arch/x86/crypto/aesni-intel_glue.c| 267 +++--
 arch/x86/crypto/glue_helper.c |  23 +--
 arch/x86/include/asm/crypto/glue_helper.h |   2 +-
 crypto/Makefile   |   1 +
 crypto/cryptd.c   | 111 +++
 crypto/simd.c |  61 ++
 crypto/skcipher.c | 207 +++-
 crypto/skcipher_bulk.c| 312 ++
 drivers/md/dm-crypt.c | 254 +++-
 include/crypto/internal/skcipher.h|  42 +++-
 include/crypto/skcipher.h | 299 +++-
 11 files changed, 1369 insertions(+), 210 deletions(-)
 create mode 100644 crypto/skcipher_bulk.c

-- 
2.9.3



[dm-devel] [RFC PATCH 1/6] crypto: skcipher - Add bulk request processing API

2017-01-12 Thread Ondrej Mosnacek
This patch adds bulk request processing to the skcipher interface.
Specifically, it adds a new type of request ('skcipher_bulk_request'), which
allows passing multiple independent messages to the skcipher driver.

The buffers for the message data are passed via just two sg lists (one for src
buffer, one for dst buffer). The IVs are passed via a single buffer, where they
are stored sequentially. The interface allows specifying either a fixed length
for all messages or a pointer to an array of message lengths.

A skcipher implementation that wants to provide support for bulk requests may
set the appropriate fields of its skcipher_alg struct. If these fields are not
provided (or the skcipher is created from an (a)blkcipher), the crypto API
automatically sets these fields to a fallback implementation, which just splits
the bulk request into a series of regular skcipher requests on the same tfm.

This means that the new type of request can be used with all skciphers, even if
they do not support bulk requests natively.

Note that when allocating a skcipher_bulk_request, the user must specify the
maximum number of messages that they are going to submit via the request. This
is necessary for the fallback implementation, which has to allocate space for
the appropriate number of subrequests so that they can be processed in
parallel. If the skcipher is synchronous, then the fallback implementation
only allocates space for one subrequest and processes the partial requests
sequentially.

Signed-off-by: Ondrej Mosnacek 
---
 crypto/Makefile|   1 +
 crypto/skcipher.c  |  15 ++
 crypto/skcipher_bulk.c | 312 +
 include/crypto/internal/skcipher.h |  32 
 include/crypto/skcipher.h  | 299 ++-
 5 files changed, 658 insertions(+), 1 deletion(-)
 create mode 100644 crypto/skcipher_bulk.c

diff --git a/crypto/Makefile b/crypto/Makefile
index b8f0e3e..cd1cf57 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_CRYPTO_AEAD2) += aead.o
 crypto_blkcipher-y := ablkcipher.o
 crypto_blkcipher-y += blkcipher.o
 crypto_blkcipher-y += skcipher.o
+crypto_blkcipher-y += skcipher_bulk.o
 obj-$(CONFIG_CRYPTO_BLKCIPHER2) += crypto_blkcipher.o
 obj-$(CONFIG_CRYPTO_SEQIV) += seqiv.o
 obj-$(CONFIG_CRYPTO_ECHAINIV) += echainiv.o
diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 6ee6a15..8b6d684 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -667,6 +667,8 @@ static int crypto_init_skcipher_ops_blkcipher(struct 
crypto_tfm *tfm)
skcipher->ivsize = crypto_blkcipher_ivsize(blkcipher);
skcipher->keysize = calg->cra_blkcipher.max_keysize;
 
+   crypto_skcipher_bulk_set_fallback(skcipher);
+
return 0;
 }
 
@@ -760,6 +762,8 @@ static int crypto_init_skcipher_ops_ablkcipher(struct 
crypto_tfm *tfm)
sizeof(struct ablkcipher_request);
skcipher->keysize = calg->cra_ablkcipher.max_keysize;
 
+   crypto_skcipher_bulk_set_fallback(skcipher);
+
return 0;
 }
 
@@ -789,6 +793,14 @@ static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm)
skcipher->ivsize = alg->ivsize;
skcipher->keysize = alg->max_keysize;
 
+   if (!alg->encrypt_bulk || !alg->decrypt_bulk || !alg->reqsize_bulk)
+   crypto_skcipher_bulk_set_fallback(skcipher);
+   else {
+   skcipher->encrypt_bulk = alg->encrypt_bulk;
+   skcipher->decrypt_bulk = alg->decrypt_bulk;
+   skcipher->reqsize_bulk = alg->reqsize_bulk;
+   }
+
if (alg->exit)
skcipher->base.exit = crypto_skcipher_exit_tfm;
 
@@ -822,6 +834,9 @@ static void crypto_skcipher_show(struct seq_file *m, struct 
crypto_alg *alg)
seq_printf(m, "ivsize   : %u\n", skcipher->ivsize);
seq_printf(m, "chunksize: %u\n", skcipher->chunksize);
seq_printf(m, "walksize : %u\n", skcipher->walksize);
+   seq_printf(m, "bulk : %s\n",
+  (skcipher->encrypt_bulk && skcipher->decrypt_bulk &&
+   skcipher->reqsize_bulk) ?  "yes" : "no");
 }
 
 #ifdef CONFIG_NET
diff --git a/crypto/skcipher_bulk.c b/crypto/skcipher_bulk.c
new file mode 100644
index 000..9630122
--- /dev/null
+++ b/crypto/skcipher_bulk.c
@@ -0,0 +1,312 @@
+/*
+ * Bulk IV fallback for skcipher.
+ *
+ * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved.
+ * Copyright (c) 2016-2017 Ondrej Mosnacek 
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include 
+#include 
+#include 
+#include 

[dm-devel] [RFC PATCH 3/6] crypto: cryptd - Add skcipher bulk request support

2017-01-12 Thread Ondrej Mosnacek
This patch adds proper support for the new bulk requests to cryptd.

Signed-off-by: Ondrej Mosnacek 
---
 crypto/cryptd.c | 111 
 1 file changed, 111 insertions(+)

diff --git a/crypto/cryptd.c b/crypto/cryptd.c
index 0508c48..b7d6e13 100644
--- a/crypto/cryptd.c
+++ b/crypto/cryptd.c
@@ -555,6 +555,114 @@ static int cryptd_skcipher_decrypt_enqueue(struct 
skcipher_request *req)
return cryptd_skcipher_enqueue(req, cryptd_skcipher_decrypt);
 }
 
+static void cryptd_skcipher_bulk_complete(struct skcipher_bulk_request *req,
+ int err)
+{
+   struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req);
+   struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
+   struct cryptd_skcipher_request_ctx *rctx =
+   skcipher_bulk_request_ctx(req);
+   int refcnt = atomic_read(&ctx->refcnt);
+
+   local_bh_disable();
+   rctx->complete(&req->base, err);
+   local_bh_enable();
+
+   if (err != -EINPROGRESS && refcnt && atomic_dec_and_test(&ctx->refcnt))
+   crypto_free_skcipher(tfm);
+}
+
+static void cryptd_skcipher_bulk_encrypt(struct crypto_async_request *base,
+int err)
+{
+   struct skcipher_bulk_request *req = skcipher_bulk_request_cast(base);
+   struct cryptd_skcipher_request_ctx *rctx =
+   skcipher_bulk_request_ctx(req);
+   struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req);
+   struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
+   struct crypto_skcipher *child = ctx->child;
+   SKCIPHER_BULK_REQUEST_ON_STACK(subreq, req->maxmsgs, child);
+
+   if (unlikely(err == -EINPROGRESS))
+   goto out;
+
+   skcipher_bulk_request_set_tfm(subreq, child);
+   skcipher_bulk_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP,
+  NULL, NULL);
+   skcipher_bulk_request_set_crypt(subreq, req->src, req->dst, req->nmsgs,
+   req->msgsize, req->msgsizes, req->ivs);
+
+   err = crypto_skcipher_encrypt_bulk(subreq);
+   skcipher_bulk_request_zero(subreq);
+
+   req->base.complete = rctx->complete;
+
+out:
+   cryptd_skcipher_bulk_complete(req, err);
+}
+
+static void cryptd_skcipher_bulk_decrypt(struct crypto_async_request *base,
+int err)
+{
+   struct skcipher_bulk_request *req = skcipher_bulk_request_cast(base);
+   struct cryptd_skcipher_request_ctx *rctx =
+   skcipher_bulk_request_ctx(req);
+   struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req);
+   struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
+   struct crypto_skcipher *child = ctx->child;
+   SKCIPHER_BULK_REQUEST_ON_STACK(subreq, req->maxmsgs, child);
+
+   if (unlikely(err == -EINPROGRESS))
+   goto out;
+
+   skcipher_bulk_request_set_tfm(subreq, child);
+   skcipher_bulk_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP,
+  NULL, NULL);
+   skcipher_bulk_request_set_crypt(subreq, req->src, req->dst, req->nmsgs,
+   req->msgsize, req->msgsizes, req->ivs);
+
+   err = crypto_skcipher_decrypt_bulk(subreq);
+   skcipher_bulk_request_zero(subreq);
+
+   req->base.complete = rctx->complete;
+
+out:
+   cryptd_skcipher_bulk_complete(req, err);
+}
+
+static int cryptd_skcipher_bulk_enqueue(struct skcipher_bulk_request *req,
+   crypto_completion_t compl)
+{
+   struct cryptd_skcipher_request_ctx *rctx =
+   skcipher_bulk_request_ctx(req);
+   struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req);
+   struct cryptd_queue *queue;
+
+   queue = cryptd_get_queue(crypto_skcipher_tfm(tfm));
+   rctx->complete = req->base.complete;
+   req->base.complete = compl;
+
+   return cryptd_enqueue_request(queue, &req->base);
+}
+
+static int cryptd_skcipher_bulk_encrypt_enqueue(
+   struct skcipher_bulk_request *req)
+{
+   return cryptd_skcipher_bulk_enqueue(req, cryptd_skcipher_bulk_encrypt);
+}
+
+static int cryptd_skcipher_bulk_decrypt_enqueue(
+   struct skcipher_bulk_request *req)
+{
+   return cryptd_skcipher_bulk_enqueue(req, cryptd_skcipher_bulk_decrypt);
+}
+
+static unsigned int cryptd_skcipher_bulk_reqsize(struct crypto_skcipher *tfm,
+unsigned int maxmsgs)
+{
+   return sizeof(struct cryptd_skcipher_request_ctx);
+}
+
 static int cryptd_skcipher_init_tfm(struct crypto_skcipher *tfm)
 {
struct skcipher_instance *inst = skcipher_alg_instance