[dm-devel] [PATCH v4 4/4] crypto: lrw - Do not use auxiliary buffer
This patch simplifies the LRW template to recompute the LRW tweaks from
scratch in the second pass and thus also removes the need to allocate a
dynamic buffer using kmalloc().

As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt.

PERFORMANCE MEASUREMENTS (x86_64)
Performed using: https://gitlab.com/omos/linux-crypto-bench
Crypto driver used: lrw(ecb-aes-aesni)

The results show that the new code has about the same performance as the
old code. For 512-byte message it seems to be even slightly faster, but
that might be just noise.

Before:
       ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
        lrw(aes)      256        64            200            203
        lrw(aes)      320        64            202            204
        lrw(aes)      384        64            204            205
        lrw(aes)      256       512            415            415
        lrw(aes)      320       512            432            440
        lrw(aes)      384       512            449            451
        lrw(aes)      256      4096           1838           1995
        lrw(aes)      320      4096           2123           1980
        lrw(aes)      384      4096           2100           2119
        lrw(aes)      256     16384           7183           6954
        lrw(aes)      320     16384           7844           7631
        lrw(aes)      384     16384           8256           8126
        lrw(aes)      256     32768          14772          14484
        lrw(aes)      320     32768          15281          15431
        lrw(aes)      384     32768          16469          16293

After:
       ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
        lrw(aes)      256        64            197            196
        lrw(aes)      320        64            200            197
        lrw(aes)      384        64            203            199
        lrw(aes)      256       512            385            380
        lrw(aes)      320       512            401            395
        lrw(aes)      384       512            415            415
        lrw(aes)      256      4096           1869           1846
        lrw(aes)      320      4096           2080           1981
        lrw(aes)      384      4096           2160           2109
        lrw(aes)      256     16384           7077           7127
        lrw(aes)      320     16384           7807           7766
        lrw(aes)      384     16384           8108           8357
        lrw(aes)      256     32768          14111          14454
        lrw(aes)      320     32768          15268          15082
        lrw(aes)      384     32768          16581          16250

[1] https://lkml.org/lkml/2018/8/23/1315

Signed-off-by: Ondrej Mosnacek
---
 crypto/lrw.c | 280 ++-
 1 file changed, 51 insertions(+), 229 deletions(-)

diff --git a/crypto/lrw.c b/crypto/lrw.c
index 7377b5b486fd..6fcf0d431185 100644
--- a/crypto/lrw.c
+++ b/crypto/lrw.c
@@ -29,8 +29,6 @@
 #include 
 #include 

-#define LRW_BUFFER_SIZE 128u
-
 #define LRW_BLOCK_SIZE 16

 struct priv {
@@ -56,19 +54,7 @@ struct priv {
 };

 struct rctx {
-        be128 buf[LRW_BUFFER_SIZE / sizeof(be128)];
-        be128 t;
-
-        be128 *ext;
-
-        struct scatterlist srcbuf[2];
-        struct scatterlist dstbuf[2];
-        struct scatterlist *src;
-        struct scatterlist *dst;
-
-        unsigned int left;
-
         struct skcipher_request subreq;
 };

@@ -152,86 +138,31 @@ static int next_index(u32 *counter)
         return 127;
 }

-static int post_crypt(struct skcipher_request *req)
+/*
+ * We compute the tweak masks twice (both before and after the ECB encryption or
+ * decryption) to avoid having to allocate a temporary buffer and/or make
+ * multiple calls to the 'ecb(..)' instance, which usually would be slower than
+ * just doing the next_index() calls again.
+ */
+static int xor_tweak(struct skcipher_request *req, bool second_pass)
 {
-        struct rctx *rctx = skcipher_request_ctx(req);
-        be128 *buf = rctx->ext ?: rctx->buf;
-        struct skcipher_request *subreq;
         const int bs = LRW_BLOCK_SIZE;
-        struct skcipher_walk w;
-        struct scatterlist *sg;
-        unsigned offset;
-        int err;
-
-        subreq = &rctx->subreq;
-        err = skcipher_walk_virt(&w, subreq, false);
-
-        while (w.nbytes) {
-                unsigned int avail = w.nbytes;
-                be128 *wdst;
-
-                wdst = w.dst.virt.addr;
-
-                do {
-                        be128_xor(wdst, buf++, wdst);
-                        wdst++;
-                } while ((avail -= bs) >= bs);
-
-                err = skcipher_walk_done(&w, avail);
-
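To make the two-pass structure above concrete, here is a minimal userspace sketch of the call pattern only, not the kernel code itself: the child ecb(aes) request between the passes is omitted, and advance_tweak() is a dummy placeholder for the real update, which XORs in a precomputed multiple of key2 selected by next_index(). The point it illustrates is that both passes regenerate the identical tweak sequence from the saved starting tweak, so no per-block tweak buffer has to be allocated.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define LRW_BLOCK_SIZE 16

/* Dummy stand-in: the kernel instead XORs in a precomputed multiple of key2
 * chosen by next_index(counter); any deterministic update shows the idea. */
static void advance_tweak(uint8_t t[LRW_BLOCK_SIZE], uint32_t counter[4])
{
        t[0] ^= (uint8_t)++counter[0];
}

/* One pass of tweak XOR; the same function serves as both the "pre" and the
 * "post" pass because the tweak sequence is regenerated from t0 each time. */
static void xor_tweak(uint8_t *data, size_t len, const uint8_t t0[LRW_BLOCK_SIZE])
{
        uint8_t t[LRW_BLOCK_SIZE];
        uint32_t counter[4] = { 0 };
        size_t off;
        int i;

        memcpy(t, t0, LRW_BLOCK_SIZE);
        for (off = 0; off < len; off += LRW_BLOCK_SIZE) {
                for (i = 0; i < LRW_BLOCK_SIZE; i++)
                        data[off + i] ^= t[i];
                advance_tweak(t, counter);
        }
}

int main(void)
{
        uint8_t t0[LRW_BLOCK_SIZE] = { 0x42 };  /* stand-in for T = I * key2 */
        uint8_t buf[4 * LRW_BLOCK_SIZE] = { 0 };
        uint8_t zero[4 * LRW_BLOCK_SIZE] = { 0 };

        xor_tweak(buf, sizeof(buf), t0);        /* first pass */
        /* the child ecb(aes) request would be submitted here */
        xor_tweak(buf, sizeof(buf), t0);        /* second pass, tweaks recomputed */

        printf("both passes produced the same masks: %s\n",
               memcmp(buf, zero, sizeof(buf)) == 0 ? "yes" : "no");
        return 0;
}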
[dm-devel] [PATCH v4 2/4] crypto: testmgr - Add test for LRW counter wrap-around
This patch adds a test vector for lrw(aes) that triggers wrap-around of
the counter, which is a tricky corner case.

Suggested-by: Eric Biggers
Signed-off-by: Ondrej Mosnacek
---
 crypto/testmgr.h | 21 +
 1 file changed, 21 insertions(+)

diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 0b3d7cadbe93..47629cb1efd3 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -13145,6 +13145,27 @@ static const struct cipher_testvec aes_lrw_tv_template[] = {
                 .ctext  = "\x5b\x90\x8e\xc1\xab\xdd\x67\x5f"
                           "\x3d\x69\x8a\x95\x53\xc8\x9c\xe5",
                 .len    = 16,
+        }, { /* Test counter wrap-around, modified from LRW-32-AES 1 */
+                .key    = "\x45\x62\xac\x25\xf8\x28\x17\x6d"
+                          "\x4c\x26\x84\x14\xb5\x68\x01\x85"
+                          "\x25\x8e\x2a\x05\xe7\x3e\x9d\x03"
+                          "\xee\x5a\x83\x0c\xcc\x09\x4c\x87",
+                .klen   = 32,
+                .iv     = "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff",
+                .ptext  = "\x30\x31\x32\x33\x34\x35\x36\x37"
+                          "\x38\x39\x41\x42\x43\x44\x45\x46"
+                          "\x30\x31\x32\x33\x34\x35\x36\x37"
+                          "\x38\x39\x41\x42\x43\x44\x45\x46"
+                          "\x30\x31\x32\x33\x34\x35\x36\x37"
+                          "\x38\x39\x41\x42\x43\x44\x45\x46",
+                .ctext  = "\x47\x90\x50\xf6\xf4\x8d\x5c\x7f"
+                          "\x84\xc7\x83\x95\x2d\xa2\x02\xc0"
+                          "\xda\x7f\xa3\xc0\x88\x2a\x0a\x50"
+                          "\xfb\xc1\x78\x03\x39\xfe\x1d\xe5"
+                          "\xf1\xb2\x73\xcd\x65\xa3\xdf\x5f"
+                          "\xe9\x5d\x48\x92\x54\x63\x4e\xb8",
+                .len    = 48,
         }, { /* http://www.mail-archive.com/stds-p1619@listserv.ieee.org/msg00173.html */
                 .key    = "\xf8\xd4\x76\xff\xd6\x46\xee\x6c"
--
2.17.1

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
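For context on why this particular vector exercises the corner case: the IV of sixteen 0xff bytes makes the very first block use an all-ones counter, and the 48-byte (three-block) message then forces the counter to wrap to zero for the second block. Below is a tiny sketch of the counter progression, using the same least-significant-word-first layout that the optimized code loads from the big-endian IV; it is illustration only, no encryption involved.

#include <stdio.h>
#include <stdint.h>

/* Increment a 128-bit counter held as four u32 words, least significant first. */
static void counter_inc(uint32_t c[4])
{
        int i;

        for (i = 0; i < 4; i++)
                if (++c[i] != 0)
                        break;
}

int main(void)
{
        /* The test vector's IV is sixteen 0xff bytes: a counter of all ones. */
        uint32_t c[4] = { ~0u, ~0u, ~0u, ~0u };
        int block;

        /* 48-byte ptext = 3 LRW blocks; the counter wraps after the first one. */
        for (block = 1; block <= 3; block++) {
                printf("block %d: counter = %08x%08x%08x%08x\n",
                       block, c[3], c[2], c[1], c[0]);
                counter_inc(c);
        }
        return 0;
}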
[dm-devel] [PATCH v4 3/4] crypto: lrw - Optimize tweak computation
This patch rewrites the tweak computation to a slightly simpler method
that performs fewer bswaps. Based on performance measurements the new
code seems to provide slightly better performance than the old one.

PERFORMANCE MEASUREMENTS (x86_64)
Performed using: https://gitlab.com/omos/linux-crypto-bench
Crypto driver used: lrw(ecb-aes-aesni)

Before:
       ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
        lrw(aes)      256        64            204            286
        lrw(aes)      320        64            227            203
        lrw(aes)      384        64            208            204
        lrw(aes)      256       512            441            439
        lrw(aes)      320       512            456            455
        lrw(aes)      384       512            469            483
        lrw(aes)      256      4096           2136           2190
        lrw(aes)      320      4096           2161           2213
        lrw(aes)      384      4096           2295           2369
        lrw(aes)      256     16384           7692           7868
        lrw(aes)      320     16384           8230           8691
        lrw(aes)      384     16384           8971           8813
        lrw(aes)      256     32768          15336          15560
        lrw(aes)      320     32768          16410          16346
        lrw(aes)      384     32768          18023          17465

After:
       ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
        lrw(aes)      256        64            200            203
        lrw(aes)      320        64            202            204
        lrw(aes)      384        64            204            205
        lrw(aes)      256       512            415            415
        lrw(aes)      320       512            432            440
        lrw(aes)      384       512            449            451
        lrw(aes)      256      4096           1838           1995
        lrw(aes)      320      4096           2123           1980
        lrw(aes)      384      4096           2100           2119
        lrw(aes)      256     16384           7183           6954
        lrw(aes)      320     16384           7844           7631
        lrw(aes)      384     16384           8256           8126
        lrw(aes)      256     32768          14772          14484
        lrw(aes)      320     32768          15281          15431
        lrw(aes)      384     32768          16469          16293

Signed-off-by: Ondrej Mosnacek
---
 crypto/lrw.c | 61 +++-
 1 file changed, 37 insertions(+), 24 deletions(-)

diff --git a/crypto/lrw.c b/crypto/lrw.c
index 5504d1325a56..7377b5b486fd 100644
--- a/crypto/lrw.c
+++ b/crypto/lrw.c
@@ -120,27 +120,28 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key,
         return 0;
 }

-static inline void inc(be128 *iv)
-{
-        be64_add_cpu(&iv->b, 1);
-        if (!iv->b)
-                be64_add_cpu(&iv->a, 1);
-}
-
-/* this returns the number of consequative 1 bits starting
- * from the right, get_index128(00 00 00 00 00 00 ... 00 00 10 FB) = 2 */
-static inline int get_index128(be128 *block)
+/*
+ * Returns the number of trailing '1' bits in the words of the counter, which is
+ * represented by 4 32-bit words, arranged from least to most significant.
+ * At the same time, increments the counter by one.
+ *
+ * For example:
+ *
+ * u32 counter[4] = { 0xFFFFFFFF, 0x1, 0x0, 0x0 };
+ * int i = next_index(&counter);
+ * // i == 33, counter == { 0x0, 0x2, 0x0, 0x0 }
+ */
+static int next_index(u32 *counter)
 {
-        int x;
-        __be32 *p = (__be32 *) block;
+        int i, res = 0;

-        for (p += 3, x = 0; x < 128; p--, x += 32) {
-                u32 val = be32_to_cpup(p);
-
-                if (!~val)
-                        continue;
-
-                return x + ffz(val);
+        for (i = 0; i < 4; i++) {
+                if (counter[i] + 1 != 0) {
+                        res += ffz(counter[i]++);
+                        break;
+                }
+                counter[i] = 0;
+                res += 32;
         }

         /*
@@ -214,8 +215,9 @@ static int pre_crypt(struct skcipher_request *req)
         struct scatterlist *sg;
         unsigned cryptlen;
         unsigned offset;
-        be128 *iv;
         bool more;
+        __be32 *iv;
+        u32 counter[4];
         int err;

         subreq = &rctx->subreq;
@@ -230,7 +232,12 @@ static int pre_crypt(struct skcipher_request *req)
                                    cryptlen, req->iv);

         err = skcipher_walk_virt(&w, subreq, false);
-        iv = w.iv;
+        iv = (__be32 *)w.iv;
+
+        counter[0] = be32_to_cpu(iv[3]);
+        counter[1] = be32
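The example in the new comment can be checked outside the kernel. The sketch below mirrors next_index() in plain C, with a local ffz32() standing in for the kernel's ffz(); it is an illustration only, and the all-ones wrap-around case handled by patch 1/4 is left out.

#include <stdio.h>
#include <stdint.h>

/* Userspace stand-in for the kernel's ffz(): index of the first zero bit. */
static int ffz32(uint32_t x)
{
        int i = 0;

        while (x & 1) {
                x >>= 1;
                i++;
        }
        return i;
}

/* Mirrors the patch: counts trailing '1' bits and increments the counter. */
static int next_index(uint32_t *counter)
{
        int i, res = 0;

        for (i = 0; i < 4; i++) {
                if (counter[i] + 1 != 0) {
                        res += ffz32(counter[i]++);
                        break;
                }
                counter[i] = 0;
                res += 32;
        }
        /* the all-ones counter (full wrap-around) is special-cased in the
         * kernel, see patch 1/4; not handled in this sketch */
        return res;
}

int main(void)
{
        /* The example from the patch comment. */
        uint32_t counter[4] = { 0xFFFFFFFF, 0x1, 0x0, 0x0 };
        int i = next_index(counter);

        /* Prints: i=33, counter={ 0x0, 0x2, 0x0, 0x0 } */
        printf("i=%d, counter={ 0x%x, 0x%x, 0x%x, 0x%x }\n",
               i, counter[0], counter[1], counter[2], counter[3]);
        return 0;
}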
[dm-devel] [PATCH v4 0/4] crypto: lrw - Fixes and improvements
This patchset contains a corner-case fix and several improvements for the
LRW template.

The first patch fixes an out-of-bounds array access (and subsequently
incorrect cipher output) when the LRW counter goes from all ones to all
zeros. This patch should be applied to the crypto-2.6 tree and also go
to stable.

The second patch adds a test vector for lrw(aes) that covers the above
bug.

The third patch is a small optimization of the LRW tweak computation.

The fourth patch is a follow-up to a similar patch for XTS (it simplifies
away the use of dynamically allocated auxiliary buffer to cache the
computed tweak values):
https://patchwork.kernel.org/patch/10588775/

Patches 2-4 should be applied only to cryptodev-2.6, but they all depend
on the first patch.

Changes in v4:
  - applied various corrections/suggestions from Eric Biggers
  - added a fix for buggy behavior on counter wrap-around (+ test vector)

v3: https://www.spinics.net/lists/linux-crypto/msg34946.html
Changes in v3:
  - fix a copy-paste error

v2: https://www.spinics.net/lists/linux-crypto/msg34890.html
Changes in v2:
  - small cleanup suggested by Eric Biggers

v1: https://www.spinics.net/lists/linux-crypto/msg34871.html

Ondrej Mosnacek (4):
  crypto: lrw - Fix out-of-bounds access on counter overflow
  crypto: testmgr - Add test for LRW counter wrap-around
  crypto: lrw - Optimize tweak computation
  crypto: lrw - Do not use auxiliary buffer

 crypto/lrw.c     | 342 +--
 crypto/testmgr.h |  21 +++
 2 files changed, 112 insertions(+), 251 deletions(-)

--
2.17.1

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
[dm-devel] [PATCH v4 1/4] crypto: lrw - Fix out-of-bounds access on counter overflow
When the LRW block counter overflows, the current implementation returns
128 as the index to the precomputed multiplication table, which has 128
entries. This patch fixes it to return the correct value (127).

Fixes: 64470f1b8510 ("[CRYPTO] lrw: Liskov Rivest Wagner, a tweakable narrow block cipher mode")
Cc:  # 2.6.20+
Reported-by: Eric Biggers
Signed-off-by: Ondrej Mosnacek
---
 crypto/lrw.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/crypto/lrw.c b/crypto/lrw.c
index 393a782679c7..5504d1325a56 100644
--- a/crypto/lrw.c
+++ b/crypto/lrw.c
@@ -143,7 +143,12 @@ static inline int get_index128(be128 *block)
                 return x + ffz(val);
         }

-        return x;
+        /*
+         * If we get here, then x == 128 and we are incrementing the counter
+         * from all ones to all zeros. This means we must return index 127, i.e.
+         * the one corresponding to key2*{ 1,...,1 }.
+         */
+        return 127;
 }

 static int post_crypt(struct skcipher_request *req)
--
2.17.1

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
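To see the off-by-one, consider an all-ones counter in the old get_index128() logic. The userspace model below simplifies the word order (which does not matter when every word is all ones) and shows the old code falling through to 128, one past the last valid entry of the 128-entry multiplication table, while the fix returns 127.

#include <stdio.h>
#include <stdint.h>

/* Model of the old loop: find the lowest zero bit of a 128-bit counter. */
static int old_get_index128(const uint32_t w[4])
{
        int x, bit;

        for (x = 0; x < 128; x += 32) {
                uint32_t val = w[x / 32];

                if (val == 0xFFFFFFFFu)  /* all ones: no zero bit in this word */
                        continue;
                for (bit = 0; val & (1u << bit); bit++)
                        ;
                return x + bit;
        }
        return x;        /* all 128 bits set: returns 128, past the last entry */
}

int main(void)
{
        uint32_t all_ones[4] = { ~0u, ~0u, ~0u, ~0u };
        int idx = old_get_index128(all_ones);

        /* The table has entries 0..127, so idx == 128 is read out of bounds. */
        printf("old index for all-ones counter: %d\n", idx);
        printf("fixed index: %d\n", idx == 128 ? 127 : idx);
        return 0;
}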
Re: [dm-devel] [PATCH v3 2/2] crypto: lrw - Do not use auxiliary buffer
On Wed, Sep 12, 2018 at 8:51 AM Eric Biggers wrote: > On Tue, Sep 11, 2018 at 09:42:39AM +0200, Ondrej Mosnacek wrote: > > This patch simplifies the LRW template to recompute the LRW tweaks from > > scratch in the second pass and thus also removes the need to allocate a > > dynamic buffer using kmalloc(). > > > > As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt. > > > > PERFORMANCE MEASUREMENTS (x86_64) > > Performed using: https://gitlab.com/omos/linux-crypto-bench > > Crypto driver used: lrw(ecb-aes-aesni) > > > > The results show that the new code has about the same performance as the > > old code. For 512-byte message it seems to be even slightly faster, but > > that might be just noise. > > > > Before: > >ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) > > lrw(aes) 256 64 200 203 > > lrw(aes) 320 64 202 204 > > lrw(aes) 384 64 204 205 > > lrw(aes) 256 512 415 415 > > lrw(aes) 320 512 432 440 > > lrw(aes) 384 512 449 451 > > lrw(aes) 256409618381995 > > lrw(aes) 320409621231980 > > lrw(aes) 384409621002119 > > lrw(aes) 256 1638471836954 > > lrw(aes) 320 1638478447631 > > lrw(aes) 384 1638482568126 > > lrw(aes) 256 32768 14772 14484 > > lrw(aes) 320 32768 15281 15431 > > lrw(aes) 384 32768 16469 16293 > > > > After: > >ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) > > lrw(aes) 256 64 197 196 > > lrw(aes) 320 64 200 197 > > lrw(aes) 384 64 203 199 > > lrw(aes) 256 512 385 380 > > lrw(aes) 320 512 401 395 > > lrw(aes) 384 512 415 415 > > lrw(aes) 256409618691846 > > lrw(aes) 320409620801981 > > lrw(aes) 384409621602109 > > lrw(aes) 256 1638470777127 > > lrw(aes) 320 1638478077766 > > lrw(aes) 384 1638481088357 > > lrw(aes) 256 32768 14111 14454 > > lrw(aes) 320 32768 15268 15082 > > lrw(aes) 384 32768 16581 16250 > > > > [1] https://lkml.org/lkml/2018/8/23/1315 > > > > Signed-off-by: Ondrej Mosnacek > > --- > > crypto/lrw.c | 280 ++- > > 1 file changed, 51 insertions(+), 229 deletions(-) > > > > diff --git a/crypto/lrw.c b/crypto/lrw.c > > index b4f30b6f16d6..d5d2fba9af59 100644 > > --- a/crypto/lrw.c > > +++ b/crypto/lrw.c > > @@ -29,8 +29,6 @@ > > #include > > #include > > > > -#define LRW_BUFFER_SIZE 128u > > - > > #define LRW_BLOCK_SIZE 16 > > > > struct priv { > > @@ -56,19 +54,7 @@ struct priv { > > }; > > > > struct rctx { > > - be128 buf[LRW_BUFFER_SIZE / sizeof(be128)]; > > - > > - be128 t; > > - > > - be128 *ext; > > - > > - struct scatterlist srcbuf[2]; > > - struct scatterlist dstbuf[2]; > > - struct scatterlist *src; > > - struct scatterlist *dst; > > - > > - unsigned int left; > > - > > + be128 t, orig_iv; > > struct skcipher_request subreq; > > }; > > > > @@ -135,86 +121,31 @@ static int next_index(u32 *counter) > > return res; > > } > > > > -static int post_crypt(struct skcipher_request *req) > > +/* > > + * We compute the tweak masks twice (both bef
Re: [dm-devel] [PATCH v3 1/2] crypto: lrw - Optimize tweak computation
On Wed, Sep 12, 2018 at 8:28 AM Eric Biggers wrote: > Hi Ondrej, > > On Tue, Sep 11, 2018 at 09:42:38AM +0200, Ondrej Mosnacek wrote: > > This patch rewrites the tweak computation to a slightly simpler method > > that performs less bswaps. Based on performance measurements the new > > code seems to provide slightly better performance than the old one. > > > > PERFORMANCE MEASUREMENTS (x86_64) > > Performed using: https://gitlab.com/omos/linux-crypto-bench > > Crypto driver used: lrw(ecb-aes-aesni) > > > > Before: > >ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) > > lrw(aes) 256 64 204 286 > > lrw(aes) 320 64 227 203 > > lrw(aes) 384 64 208 204 > > lrw(aes) 256 512 441 439 > > lrw(aes) 320 512 456 455 > > lrw(aes) 384 512 469 483 > > lrw(aes) 256409621362190 > > lrw(aes) 320409621612213 > > lrw(aes) 384409622952369 > > lrw(aes) 256 1638476927868 > > lrw(aes) 320 1638482308691 > > lrw(aes) 384 1638489718813 > > lrw(aes) 256 32768 15336 15560 > > lrw(aes) 320 32768 16410 16346 > > lrw(aes) 384 32768 18023 17465 > > > > After: > >ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) > > lrw(aes) 256 64 200 203 > > lrw(aes) 320 64 202 204 > > lrw(aes) 384 64 204 205 > > lrw(aes) 256 512 415 415 > > lrw(aes) 320 512 432 440 > > lrw(aes) 384 512 449 451 > > lrw(aes) 256409618381995 > > lrw(aes) 320409621231980 > > lrw(aes) 384409621002119 > > lrw(aes) 256 1638471836954 > > lrw(aes) 320 1638478447631 > > lrw(aes) 384 1638482568126 > > lrw(aes) 256 32768 14772 14484 > > lrw(aes) 320 32768 15281 15431 > > lrw(aes) 384 32768 16469 16293 > > > > Signed-off-by: Ondrej Mosnacek > > --- > > crypto/lrw.c | 49 + > > 1 file changed, 25 insertions(+), 24 deletions(-) > > > > diff --git a/crypto/lrw.c b/crypto/lrw.c > > index 393a782679c7..b4f30b6f16d6 100644 > > --- a/crypto/lrw.c > > +++ b/crypto/lrw.c > > @@ -120,30 +120,19 @@ static int setkey(struct crypto_skcipher *parent, > > const u8 *key, > > return 0; > > } > > > > -static inline void inc(be128 *iv) > > +static int next_index(u32 *counter) > > { > > - be64_add_cpu(&iv->b, 1); > > - if (!iv->b) > > - be64_add_cpu(&iv->a, 1); > > -} > > - > > -/* this returns the number of consequative 1 bits starting > > - * from the right, get_index128(00 00 00 00 00 00 ... 00 00 10 FB) = 2 */ > > -static inline int get_index128(be128 *block) > > -{ > > - int x; > > - __be32 *p = (__be32 *) block; > > - > > - for (p += 3, x = 0; x < 128; p--, x += 32) { > > - u32 val = be32_to_cpup(p); > > - > > - if (!~val) > > - continue; > > + int i, res = 0; > > > > - return x + ffz(val); > > + for (i = 0; i < 4; i++) { > > + if (counter[i] + 1 != 0) { > > + res += ffz(counter[i]++); > > + break; > > + } > > + counter[i] = 0; > > + res += 32; > > } > > - > >
[dm-devel] [PATCH v3 2/2] crypto: lrw - Do not use auxiliary buffer
This patch simplifies the LRW template to recompute the LRW tweaks from scratch in the second pass and thus also removes the need to allocate a dynamic buffer using kmalloc(). As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt. PERFORMANCE MEASUREMENTS (x86_64) Performed using: https://gitlab.com/omos/linux-crypto-bench Crypto driver used: lrw(ecb-aes-aesni) The results show that the new code has about the same performance as the old code. For 512-byte message it seems to be even slightly faster, but that might be just noise. Before: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) lrw(aes) 256 64 200 203 lrw(aes) 320 64 202 204 lrw(aes) 384 64 204 205 lrw(aes) 256 512 415 415 lrw(aes) 320 512 432 440 lrw(aes) 384 512 449 451 lrw(aes) 256409618381995 lrw(aes) 320409621231980 lrw(aes) 384409621002119 lrw(aes) 256 1638471836954 lrw(aes) 320 1638478447631 lrw(aes) 384 1638482568126 lrw(aes) 256 32768 14772 14484 lrw(aes) 320 32768 15281 15431 lrw(aes) 384 32768 16469 16293 After: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) lrw(aes) 256 64 197 196 lrw(aes) 320 64 200 197 lrw(aes) 384 64 203 199 lrw(aes) 256 512 385 380 lrw(aes) 320 512 401 395 lrw(aes) 384 512 415 415 lrw(aes) 256409618691846 lrw(aes) 320409620801981 lrw(aes) 384409621602109 lrw(aes) 256 1638470777127 lrw(aes) 320 1638478077766 lrw(aes) 384 1638481088357 lrw(aes) 256 32768 14111 14454 lrw(aes) 320 32768 15268 15082 lrw(aes) 384 32768 16581 16250 [1] https://lkml.org/lkml/2018/8/23/1315 Signed-off-by: Ondrej Mosnacek --- crypto/lrw.c | 280 ++- 1 file changed, 51 insertions(+), 229 deletions(-) diff --git a/crypto/lrw.c b/crypto/lrw.c index b4f30b6f16d6..d5d2fba9af59 100644 --- a/crypto/lrw.c +++ b/crypto/lrw.c @@ -29,8 +29,6 @@ #include #include -#define LRW_BUFFER_SIZE 128u - #define LRW_BLOCK_SIZE 16 struct priv { @@ -56,19 +54,7 @@ struct priv { }; struct rctx { - be128 buf[LRW_BUFFER_SIZE / sizeof(be128)]; - - be128 t; - - be128 *ext; - - struct scatterlist srcbuf[2]; - struct scatterlist dstbuf[2]; - struct scatterlist *src; - struct scatterlist *dst; - - unsigned int left; - + be128 t, orig_iv; struct skcipher_request subreq; }; @@ -135,86 +121,31 @@ static int next_index(u32 *counter) return res; } -static int post_crypt(struct skcipher_request *req) +/* + * We compute the tweak masks twice (both before and after the ECB encryption or + * decryption) to avoid having to allocate a temporary buffer and/or make + * mutliple calls to the 'ecb(..)' instance, which usually would be slower than + * just doing the gf128mul_x_ble() calls again. + */ +static int xor_tweak(struct skcipher_request *req, bool second_pass) { - struct rctx *rctx = skcipher_request_ctx(req); - be128 *buf = rctx->ext ?: rctx->buf; - struct skcipher_request *subreq; const int bs = LRW_BLOCK_SIZE; - struct skcipher_walk w; - struct scatterlist *sg; - unsigned offset; - int err; - - subreq = &rctx->subreq; - err = skcipher_walk_virt(&w, subreq, false); - - while (w.nbytes) { - unsigned int avail = w.nbytes; - be128 *wdst; - - wdst = w.dst.virt.addr; - - do { - be128_xor(wdst, buf++, wdst); - wdst++; - } while ((avail -= bs) >= bs); - - err
[dm-devel] [PATCH v5] crypto: xts - Drop use of auxiliary buffer
Since commit acb9b159c784 ("crypto: gf128mul - define gf128mul_x_* in gf128mul.h"), the gf128mul_x_*() functions are very fast and therefore caching the computed XTS tweaks has only negligible advantage over computing them twice. In fact, since the current caching implementation limits the size of the calls to the child ecb(...) algorithm to PAGE_SIZE (usually 4096 B), it is often actually slower than the simple recomputing implementation. This patch simplifies the XTS template to recompute the XTS tweaks from scratch in the second pass and thus also removes the need to allocate a dynamic buffer using kmalloc(). As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt. PERFORMANCE RESULTS I measured time to encrypt/decrypt a memory buffer of varying sizes with xts(ecb-aes-aesni) using a tool I wrote ([2]) and the results suggest that after this patch the performance is either better or comparable for both small and large buffers. Note that there is a lot of noise in the measurements, but the overall difference is easy to see. Old code: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) xts(aes) 256 64 331 328 xts(aes) 384 64 332 333 xts(aes) 512 64 338 348 xts(aes) 256 512 889 920 xts(aes) 384 5121019 993 xts(aes) 512 5121032 990 xts(aes) 256409621522292 xts(aes) 384409624532597 xts(aes) 512409630412641 xts(aes) 256 1638494438027 xts(aes) 384 1638485368925 xts(aes) 512 1638492329417 xts(aes) 256 32768 16383 14897 xts(aes) 384 32768 17527 16102 xts(aes) 512 32768 18483 17322 New code: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) xts(aes) 256 64 328 324 xts(aes) 384 64 324 319 xts(aes) 512 64 320 322 xts(aes) 256 512 476 473 xts(aes) 384 512 509 492 xts(aes) 512 512 531 514 xts(aes) 256409621321829 xts(aes) 384409623572055 xts(aes) 512409621782027 xts(aes) 256 1638469206983 xts(aes) 384 1638485977505 xts(aes) 512 1638478418164 xts(aes) 256 32768 13468 12307 xts(aes) 384 32768 14808 13402 xts(aes) 512 32768 15753 14636 [1] https://lkml.org/lkml/2018/8/23/1315 [2] https://gitlab.com/omos/linux-crypto-bench Signed-off-by: Ondrej Mosnacek --- crypto/xts.c | 269 +-- 1 file changed, 46 insertions(+), 223 deletions(-) Changes in v5: - fix dumb mistakes v4: https://www.spinics.net/lists/linux-crypto/msg34889.html Changes in v4: - small cleanup suggested by Eric Biggers v3: https://www.spinics.net/lists/linux-crypto/msg34823.html Changes in v3: - add comment explaining the new approach as suggested by Eric - ensure correct alignment in second xor_tweak() pass - align performance results table header to the right v2: https://www.spinics.net/lists/linux-crypto/msg34799.html Changes in v2: - rebase to latest cryptodev tree v1: https://www.spinics.net/lists/linux-crypto/msg34776.html diff --git a/crypto/xts.c b/crypto/xts.c index ccf55fbb8bc2..847f54f76789 100644 --- a/crypto/xts.c +++ b/crypto/xts.c @@ -26,8 +26,6 @@ #include #include -#define XTS_BUFFER_SIZE 128u - struct priv { struct crypto_skcipher *child; struct crypto_cipher *tweak; @@ -39,19 +37,7 @@ struct xts_instance_ctx { }; struct rctx { - le128 buf[XTS_BUFFER_SIZE / sizeof(le128)]; - le128 t; - - le128 *ext; - - struct scatterlist srcbuf[2]; - struct scatterlist dstbuf[2]; - struct scatterlist *src; - struct scatterlist *dst; - - unsigned int left; - struct skcipher_request subreq; }; @@ -96,81 +82,27 @@ static int setkey(struct crypto_skcipher *par
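The performance argument above rests on gf128mul_x_ble() being cheap enough that simply running it again in the second pass beats caching its results. The sketch below reimplements the same little-endian doubling over a plain byte array (reduction by x^128 + x^7 + x^2 + x + 1) and prints the first few per-block tweaks derived from an initial tweak; the initial tweak here is an arbitrary constant standing in for the encrypted IV, and this is an illustration rather than the kernel code.

#include <stdint.h>
#include <stdio.h>

#define XTS_BLOCK_SIZE 16

/* Multiply the tweak by x in GF(2^128), little-endian convention: the same
 * operation the kernel's gf128mul_x_ble() performs between XTS blocks. */
static void gf128mul_x_ble(uint8_t t[XTS_BLOCK_SIZE])
{
        int carry = t[XTS_BLOCK_SIZE - 1] >> 7;  /* bit 127 about to shift out */
        int i;

        for (i = XTS_BLOCK_SIZE - 1; i > 0; i--)
                t[i] = (uint8_t)((t[i] << 1) | (t[i - 1] >> 7));
        t[0] = (uint8_t)(t[0] << 1);
        if (carry)
                t[0] ^= 0x87;   /* reduce by x^128 + x^7 + x^2 + x + 1 */
}

int main(void)
{
        /* Arbitrary stand-in for the initial tweak; real code encrypts the IV. */
        uint8_t t[XTS_BLOCK_SIZE] = { [15] = 0x80 };
        int block, i;

        /* Recomputing the per-block tweaks in the second pass is just this
         * loop run again from the saved initial tweak. */
        for (block = 0; block < 3; block++) {
                printf("T%d = ", block);
                for (i = 0; i < XTS_BLOCK_SIZE; i++)
                        printf("%02x", t[i]);
                printf("\n");
                gf128mul_x_ble(t);
        }
        return 0;
}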
[dm-devel] [PATCH v3 0/2] crypto: lrw - Simplify and optimize the LRW template
This patchset is a follow-up to a similar patch for XTS: https://patchwork.kernel.org/patch/10588775/ The first patch applies a small optimization to the tweak computation and the second simplifes away the use of auxiliary buffer to cache computed tweaks. Changes in v3: - fix a copy-paste error v2: https://www.spinics.net/lists/linux-crypto/msg34890.html Changes in v2: - small cleanup suggested by Eric Biggers v1: https://www.spinics.net/lists/linux-crypto/msg34871.html Ondrej Mosnacek (2): crypto: lrw - Optimize tweak computation crypto: lrw - Do not use auxiliary buffer crypto/lrw.c | 327 --- 1 file changed, 75 insertions(+), 252 deletions(-) -- 2.17.1 -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
[dm-devel] [PATCH v3 1/2] crypto: lrw - Optimize tweak computation
This patch rewrites the tweak computation to a slightly simpler method that performs less bswaps. Based on performance measurements the new code seems to provide slightly better performance than the old one. PERFORMANCE MEASUREMENTS (x86_64) Performed using: https://gitlab.com/omos/linux-crypto-bench Crypto driver used: lrw(ecb-aes-aesni) Before: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) lrw(aes) 256 64 204 286 lrw(aes) 320 64 227 203 lrw(aes) 384 64 208 204 lrw(aes) 256 512 441 439 lrw(aes) 320 512 456 455 lrw(aes) 384 512 469 483 lrw(aes) 256409621362190 lrw(aes) 320409621612213 lrw(aes) 384409622952369 lrw(aes) 256 1638476927868 lrw(aes) 320 1638482308691 lrw(aes) 384 1638489718813 lrw(aes) 256 32768 15336 15560 lrw(aes) 320 32768 16410 16346 lrw(aes) 384 32768 18023 17465 After: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) lrw(aes) 256 64 200 203 lrw(aes) 320 64 202 204 lrw(aes) 384 64 204 205 lrw(aes) 256 512 415 415 lrw(aes) 320 512 432 440 lrw(aes) 384 512 449 451 lrw(aes) 256409618381995 lrw(aes) 320409621231980 lrw(aes) 384409621002119 lrw(aes) 256 1638471836954 lrw(aes) 320 1638478447631 lrw(aes) 384 1638482568126 lrw(aes) 256 32768 14772 14484 lrw(aes) 320 32768 15281 15431 lrw(aes) 384 32768 16469 16293 Signed-off-by: Ondrej Mosnacek --- crypto/lrw.c | 49 + 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/crypto/lrw.c b/crypto/lrw.c index 393a782679c7..b4f30b6f16d6 100644 --- a/crypto/lrw.c +++ b/crypto/lrw.c @@ -120,30 +120,19 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key, return 0; } -static inline void inc(be128 *iv) +static int next_index(u32 *counter) { - be64_add_cpu(&iv->b, 1); - if (!iv->b) - be64_add_cpu(&iv->a, 1); -} - -/* this returns the number of consequative 1 bits starting - * from the right, get_index128(00 00 00 00 00 00 ... 00 00 10 FB) = 2 */ -static inline int get_index128(be128 *block) -{ - int x; - __be32 *p = (__be32 *) block; - - for (p += 3, x = 0; x < 128; p--, x += 32) { - u32 val = be32_to_cpup(p); - - if (!~val) - continue; + int i, res = 0; - return x + ffz(val); + for (i = 0; i < 4; i++) { + if (counter[i] + 1 != 0) { + res += ffz(counter[i]++); + break; + } + counter[i] = 0; + res += 32; } - - return x; + return res; } static int post_crypt(struct skcipher_request *req) @@ -209,8 +198,9 @@ static int pre_crypt(struct skcipher_request *req) struct scatterlist *sg; unsigned cryptlen; unsigned offset; - be128 *iv; bool more; + __u32 *iv; + u32 counter[4]; int err; subreq = &rctx->subreq; @@ -227,6 +217,11 @@ static int pre_crypt(struct skcipher_request *req) err = skcipher_walk_virt(&w, subreq, false); iv = w.iv; + counter[0] = be32_to_cpu(iv[3]); + counter[1] = be32_to_cpu(iv[2]); + counter[2] = be32_to_cpu(iv[1]); + counter[3] = be32_to_cpu(iv[0]); + while (w.nbytes) { unsigned int avail = w.nbytes; be128 *wsrc; @@ -242,10 +237,16 @@ static int pre_crypt(struct skcipher_request *req) /* T <- I*Key2, using the optimization * discussed in the specification
Re: [dm-devel] [PATCH v4] crypto: xts - Drop use of auxiliary buffer
On Mon, Sep 10, 2018 at 6:17 PM Eric Biggers wrote: > Hi Ondrej, > > On Mon, Sep 10, 2018 at 01:28:41PM +0200, Ondrej Mosnacek wrote: > > > > -static int init_crypt(struct skcipher_request *req, crypto_completion_t > > done) > > +static int xor_tweak_pre(struct skcipher_request *req) > > { > > - struct priv *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); > > - struct rctx *rctx = skcipher_request_ctx(req); > > - struct skcipher_request *subreq; > > - gfp_t gfp; > > - > > - subreq = &rctx->subreq; > > - skcipher_request_set_tfm(subreq, ctx->child); > > - skcipher_request_set_callback(subreq, req->base.flags, done, req); > > - > > - gfp = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL : > > -GFP_ATOMIC; > > - rctx->ext = NULL; > > - > > - subreq->cryptlen = XTS_BUFFER_SIZE; > > - if (req->cryptlen > XTS_BUFFER_SIZE) { > > - unsigned int n = min(req->cryptlen, (unsigned int)PAGE_SIZE); > > - > > - rctx->ext = kmalloc(n, gfp); > > - if (rctx->ext) > > - subreq->cryptlen = n; > > - } > > - > > - rctx->src = req->src; > > - rctx->dst = req->dst; > > - rctx->left = req->cryptlen; > > - > > - /* calculate first value of T */ > > - crypto_cipher_encrypt_one(ctx->tweak, (u8 *)&rctx->t, req->iv); > > - > > - return 0; > > + return xor_tweak(req, false); > > } > > > > -static void exit_crypt(struct skcipher_request *req) > > +static int xor_tweak_post(struct skcipher_request *req) > > { > > - struct rctx *rctx = skcipher_request_ctx(req); > > - > > - rctx->left = 0; > > - > > - if (rctx->ext) > > - kzfree(rctx->ext); > > + return xor_tweak(req, false); > > } > > I think you meant 'xor_tweak(req, true);' here? Ouch, yes I sent the patches out too quickly... Will fix it all and resubmit. > > > +static void crypt_done(struct crypto_async_request *areq, int err) > > { > > struct skcipher_request *req = areq->data; > > - struct skcipher_request *subreq; > > - struct rctx *rctx; > > - > > - rctx = skcipher_request_ctx(req); > > - > > - if (err == -EINPROGRESS) { > > - if (rctx->left != req->cryptlen) > > - return; > > - goto out; > > - } > > - > > - subreq = &rctx->subreq; > > - subreq->base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG; > > > > - err = do_encrypt(req, err ?: post_crypt(req)); > > - if (rctx->left) > > - return; > > + if (!err) > > + err = xor_tweak(req, true); > > Use xor_tweak_post()? > > Note that you could also change the bool to > > enum { > FIRST_PASS, > SECOND_PASS, > } > > if you find the bool to be unclear. Yes, that would work, too. I slightly prefer aliasing the functions, but I guess it is a matter of taste... Thanks for the review, -- Ondrej Mosnacek Associate Software Engineer, Security Technologies Red Hat, Inc. -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
[dm-devel] [PATCH v2 2/2] crypto: lrw - Do not use auxiliary buffer
This patch simplifies the LRW template to recompute the LRW tweaks from scratch in the second pass and thus also removes the need to allocate a dynamic buffer using kmalloc(). As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt. PERFORMANCE MEASUREMENTS (x86_64) Performed using: https://gitlab.com/omos/linux-crypto-bench Crypto driver used: lrw(ecb-aes-aesni) The results show that the new code has about the same performance as the old code. For 512-byte message it seems to be even slightly faster, but that might be just noise. Before: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) lrw(aes) 256 64 200 203 lrw(aes) 320 64 202 204 lrw(aes) 384 64 204 205 lrw(aes) 256 512 415 415 lrw(aes) 320 512 432 440 lrw(aes) 384 512 449 451 lrw(aes) 256409618381995 lrw(aes) 320409621231980 lrw(aes) 384409621002119 lrw(aes) 256 1638471836954 lrw(aes) 320 1638478447631 lrw(aes) 384 1638482568126 lrw(aes) 256 32768 14772 14484 lrw(aes) 320 32768 15281 15431 lrw(aes) 384 32768 16469 16293 After: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) lrw(aes) 256 64 197 196 lrw(aes) 320 64 200 197 lrw(aes) 384 64 203 199 lrw(aes) 256 512 385 380 lrw(aes) 320 512 401 395 lrw(aes) 384 512 415 415 lrw(aes) 256409618691846 lrw(aes) 320409620801981 lrw(aes) 384409621602109 lrw(aes) 256 1638470777127 lrw(aes) 320 1638478077766 lrw(aes) 384 1638481088357 lrw(aes) 256 32768 14111 14454 lrw(aes) 320 32768 15268 15082 lrw(aes) 384 32768 16581 16250 [1] https://lkml.org/lkml/2018/8/23/1315 Signed-off-by: Ondrej Mosnacek --- crypto/lrw.c | 280 ++- 1 file changed, 51 insertions(+), 229 deletions(-) diff --git a/crypto/lrw.c b/crypto/lrw.c index b4f30b6f16d6..6315a6a59589 100644 --- a/crypto/lrw.c +++ b/crypto/lrw.c @@ -29,8 +29,6 @@ #include #include -#define LRW_BUFFER_SIZE 128u - #define LRW_BLOCK_SIZE 16 struct priv { @@ -56,19 +54,7 @@ struct priv { }; struct rctx { - be128 buf[LRW_BUFFER_SIZE / sizeof(be128)]; - - be128 t; - - be128 *ext; - - struct scatterlist srcbuf[2]; - struct scatterlist dstbuf[2]; - struct scatterlist *src; - struct scatterlist *dst; - - unsigned int left; - + be128 t, orig_iv; struct skcipher_request subreq; }; @@ -135,86 +121,31 @@ static int next_index(u32 *counter) return res; } -static int post_crypt(struct skcipher_request *req) +/* + * We compute the tweak masks twice (both before and after the ECB encryption or + * decryption) to avoid having to allocate a temporary buffer and/or make + * mutliple calls to the 'ecb(..)' instance, which usually would be slower than + * just doing the gf128mul_x_ble() calls again. + */ +static int xor_tweak(struct skcipher_request *req, bool second_pass) { - struct rctx *rctx = skcipher_request_ctx(req); - be128 *buf = rctx->ext ?: rctx->buf; - struct skcipher_request *subreq; const int bs = LRW_BLOCK_SIZE; - struct skcipher_walk w; - struct scatterlist *sg; - unsigned offset; - int err; - - subreq = &rctx->subreq; - err = skcipher_walk_virt(&w, subreq, false); - - while (w.nbytes) { - unsigned int avail = w.nbytes; - be128 *wdst; - - wdst = w.dst.virt.addr; - - do { - be128_xor(wdst, buf++, wdst); - wdst++; - } while ((avail -= bs) >= bs); - - err
[dm-devel] [PATCH v2 1/2] crypto: lrw - Optimize tweak computation
This patch rewrites the tweak computation to a slightly simpler method that performs less bswaps. Based on performance measurements the new code seems to provide slightly better performance than the old one. PERFORMANCE MEASUREMENTS (x86_64) Performed using: https://gitlab.com/omos/linux-crypto-bench Crypto driver used: lrw(ecb-aes-aesni) Before: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) lrw(aes) 256 64 204 286 lrw(aes) 320 64 227 203 lrw(aes) 384 64 208 204 lrw(aes) 256 512 441 439 lrw(aes) 320 512 456 455 lrw(aes) 384 512 469 483 lrw(aes) 256409621362190 lrw(aes) 320409621612213 lrw(aes) 384409622952369 lrw(aes) 256 1638476927868 lrw(aes) 320 1638482308691 lrw(aes) 384 1638489718813 lrw(aes) 256 32768 15336 15560 lrw(aes) 320 32768 16410 16346 lrw(aes) 384 32768 18023 17465 After: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) lrw(aes) 256 64 200 203 lrw(aes) 320 64 202 204 lrw(aes) 384 64 204 205 lrw(aes) 256 512 415 415 lrw(aes) 320 512 432 440 lrw(aes) 384 512 449 451 lrw(aes) 256409618381995 lrw(aes) 320409621231980 lrw(aes) 384409621002119 lrw(aes) 256 1638471836954 lrw(aes) 320 1638478447631 lrw(aes) 384 1638482568126 lrw(aes) 256 32768 14772 14484 lrw(aes) 320 32768 15281 15431 lrw(aes) 384 32768 16469 16293 Signed-off-by: Ondrej Mosnacek --- crypto/lrw.c | 49 + 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/crypto/lrw.c b/crypto/lrw.c index 393a782679c7..b4f30b6f16d6 100644 --- a/crypto/lrw.c +++ b/crypto/lrw.c @@ -120,30 +120,19 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key, return 0; } -static inline void inc(be128 *iv) +static int next_index(u32 *counter) { - be64_add_cpu(&iv->b, 1); - if (!iv->b) - be64_add_cpu(&iv->a, 1); -} - -/* this returns the number of consequative 1 bits starting - * from the right, get_index128(00 00 00 00 00 00 ... 00 00 10 FB) = 2 */ -static inline int get_index128(be128 *block) -{ - int x; - __be32 *p = (__be32 *) block; - - for (p += 3, x = 0; x < 128; p--, x += 32) { - u32 val = be32_to_cpup(p); - - if (!~val) - continue; + int i, res = 0; - return x + ffz(val); + for (i = 0; i < 4; i++) { + if (counter[i] + 1 != 0) { + res += ffz(counter[i]++); + break; + } + counter[i] = 0; + res += 32; } - - return x; + return res; } static int post_crypt(struct skcipher_request *req) @@ -209,8 +198,9 @@ static int pre_crypt(struct skcipher_request *req) struct scatterlist *sg; unsigned cryptlen; unsigned offset; - be128 *iv; bool more; + __u32 *iv; + u32 counter[4]; int err; subreq = &rctx->subreq; @@ -227,6 +217,11 @@ static int pre_crypt(struct skcipher_request *req) err = skcipher_walk_virt(&w, subreq, false); iv = w.iv; + counter[0] = be32_to_cpu(iv[3]); + counter[1] = be32_to_cpu(iv[2]); + counter[2] = be32_to_cpu(iv[1]); + counter[3] = be32_to_cpu(iv[0]); + while (w.nbytes) { unsigned int avail = w.nbytes; be128 *wsrc; @@ -242,10 +237,16 @@ static int pre_crypt(struct skcipher_request *req) /* T <- I*Key2, using the optimization * discussed in the specification
[dm-devel] [PATCH v2 0/2] crypto: lrw - Simplify and optimize the LRW template
This patchset is a follow-up to a similar patch for XTS: https://patchwork.kernel.org/patch/10588775/ The first patch applies a small optimization to the tweak computation and the second simplifes away the use of auxiliary buffer to cache computed tweaks. Changes in v2: - small cleanup suggested by Eric Biggers v1: https://www.spinics.net/lists/linux-crypto/msg34871.html Ondrej Mosnacek (2): crypto: lrw - Optimize tweak computation crypto: lrw - Do not use auxiliary buffer crypto/lrw.c | 327 --- 1 file changed, 75 insertions(+), 252 deletions(-) -- 2.17.1 -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
[dm-devel] [PATCH v4] crypto: xts - Drop use of auxiliary buffer
Since commit acb9b159c784 ("crypto: gf128mul - define gf128mul_x_* in gf128mul.h"), the gf128mul_x_*() functions are very fast and therefore caching the computed XTS tweaks has only negligible advantage over computing them twice. In fact, since the current caching implementation limits the size of the calls to the child ecb(...) algorithm to PAGE_SIZE (usually 4096 B), it is often actually slower than the simple recomputing implementation. This patch simplifies the XTS template to recompute the XTS tweaks from scratch in the second pass and thus also removes the need to allocate a dynamic buffer using kmalloc(). As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt. PERFORMANCE RESULTS I measured time to encrypt/decrypt a memory buffer of varying sizes with xts(ecb-aes-aesni) using a tool I wrote ([2]) and the results suggest that after this patch the performance is either better or comparable for both small and large buffers. Note that there is a lot of noise in the measurements, but the overall difference is easy to see. Old code: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) xts(aes) 256 64 331 328 xts(aes) 384 64 332 333 xts(aes) 512 64 338 348 xts(aes) 256 512 889 920 xts(aes) 384 5121019 993 xts(aes) 512 5121032 990 xts(aes) 256409621522292 xts(aes) 384409624532597 xts(aes) 512409630412641 xts(aes) 256 1638494438027 xts(aes) 384 1638485368925 xts(aes) 512 1638492329417 xts(aes) 256 32768 16383 14897 xts(aes) 384 32768 17527 16102 xts(aes) 512 32768 18483 17322 New code: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) xts(aes) 256 64 328 324 xts(aes) 384 64 324 319 xts(aes) 512 64 320 322 xts(aes) 256 512 476 473 xts(aes) 384 512 509 492 xts(aes) 512 512 531 514 xts(aes) 256409621321829 xts(aes) 384409623572055 xts(aes) 512409621782027 xts(aes) 256 1638469206983 xts(aes) 384 1638485977505 xts(aes) 512 1638478418164 xts(aes) 256 32768 13468 12307 xts(aes) 384 32768 14808 13402 xts(aes) 512 32768 15753 14636 [1] https://lkml.org/lkml/2018/8/23/1315 [2] https://gitlab.com/omos/linux-crypto-bench Signed-off-by: Ondrej Mosnacek --- crypto/xts.c | 269 +-- 1 file changed, 46 insertions(+), 223 deletions(-) Changes in v4: - small cleanup suggested by Eric Biggers v3: https://www.spinics.net/lists/linux-crypto/msg34823.html Changes in v3: - add comment explaining the new approach as suggested by Eric - ensure correct alignment in second xor_tweak() pass - align performance results table header to the right v2: https://www.spinics.net/lists/linux-crypto/msg34799.html Changes in v2: - rebase to latest cryptodev tree v1: https://www.spinics.net/lists/linux-crypto/msg34776.html diff --git a/crypto/xts.c b/crypto/xts.c index ccf55fbb8bc2..daa57a8c266d 100644 --- a/crypto/xts.c +++ b/crypto/xts.c @@ -26,8 +26,6 @@ #include #include -#define XTS_BUFFER_SIZE 128u - struct priv { struct crypto_skcipher *child; struct crypto_cipher *tweak; @@ -39,19 +37,7 @@ struct xts_instance_ctx { }; struct rctx { - le128 buf[XTS_BUFFER_SIZE / sizeof(le128)]; - le128 t; - - le128 *ext; - - struct scatterlist srcbuf[2]; - struct scatterlist dstbuf[2]; - struct scatterlist *src; - struct scatterlist *dst; - - unsigned int left; - struct skcipher_request subreq; }; @@ -96,81 +82,27 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key, return err; } -static int post_crypt(struct skcipher_request *req)
Re: [dm-devel] [PATCH v3] crypto: xts - Drop use of auxiliary buffer
On Sat, Sep 8, 2018 at 3:35 AM Eric Biggers wrote: > Hi Ondrej, > > On Wed, Sep 05, 2018 at 01:30:39PM +0200, Ondrej Mosnacek wrote: > > Since commit acb9b159c784 ("crypto: gf128mul - define gf128mul_x_* in > > gf128mul.h"), the gf128mul_x_*() functions are very fast and therefore > > caching the computed XTS tweaks has only negligible advantage over > > computing them twice. > > > > In fact, since the current caching implementation limits the size of > > the calls to the child ecb(...) algorithm to PAGE_SIZE (usually 4096 B), > > it is often actually slower than the simple recomputing implementation. > > > > This patch simplifies the XTS template to recompute the XTS tweaks from > > scratch in the second pass and thus also removes the need to allocate a > > dynamic buffer using kmalloc(). > > > > As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt. > > > > PERFORMANCE RESULTS > > I measured time to encrypt/decrypt a memory buffer of varying sizes with > > xts(ecb-aes-aesni) using a tool I wrote ([2]) and the results suggest > > that after this patch the performance is either better or comparable for > > both small and large buffers. Note that there is a lot of noise in the > > measurements, but the overall difference is easy to see. > > > > Old code: > >ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) > > xts(aes) 256 64 331 328 > > xts(aes) 384 64 332 333 > > xts(aes) 512 64 338 348 > > xts(aes) 256 512 889 920 > > xts(aes) 384 5121019 993 > > xts(aes) 512 5121032 990 > > xts(aes) 256409621522292 > > xts(aes) 384409624532597 > > xts(aes) 512409630412641 > > xts(aes) 256 1638494438027 > > xts(aes) 384 1638485368925 > > xts(aes) 512 1638492329417 > > xts(aes) 256 32768 16383 14897 > > xts(aes) 384 32768 17527 16102 > > xts(aes) 512 32768 18483 17322 > > > > New code: > >ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) > > xts(aes) 256 64 328 324 > > xts(aes) 384 64 324 319 > > xts(aes) 512 64 320 322 > > xts(aes) 256 512 476 473 > > xts(aes) 384 512 509 492 > > xts(aes) 512 512 531 514 > > xts(aes) 256409621321829 > > xts(aes) 384409623572055 > > xts(aes) 512409621782027 > > xts(aes) 256 1638469206983 > > xts(aes) 384 1638485977505 > > xts(aes) 512 1638478418164 > > xts(aes) 256 32768 13468 12307 > > xts(aes) 384 32768 14808 13402 > > xts(aes) 512 32768 15753 14636 > > > > [1] https://lkml.org/lkml/2018/8/23/1315 > > [2] https://gitlab.com/omos/linux-crypto-bench > > > > Signed-off-by: Ondrej Mosnacek > > --- > > crypto/xts.c | 265 --- > > 1 file changed, 39 insertions(+), 226 deletions(-) > > > > Changes in v3: > > - add comment explaining the new approach as suggested by Eric > > - ensure correct alignment in second xor_tweak() pass > > - align performance results table header to the right > > > > v2: https://www.spinics.net/lists/linux-crypto/msg34799.html > > Changes in v2: > > - rebase to latest cryptodev tree > > > > v1: https://www.spinics.net/lists/linux-crypto/msg34776.html > > > > diff --git a/crypto/xts.
[dm-devel] [PATCH 0/2] crypto: lrw - Simplify and optimize the LRW template
This patchset is a follow-up to a similar patch for XTS: https://patchwork.kernel.org/patch/10588775/ The first patch applies a small optimization to the tweak computation and the second simplifes away the use of auxiliary buffer to cache computed tweaks. Ondrej Mosnacek (2): crypto: lrw - Optimize tweak computation crypto: lrw - Do not use auxiliary buffer crypto/lrw.c | 321 +++ 1 file changed, 66 insertions(+), 255 deletions(-) -- 2.17.1 -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
[dm-devel] [PATCH 2/2] crypto: lrw - Do not use auxiliary buffer
This patch simplifies the LRW template to recompute the LRW tweaks from scratch in the second pass and thus also removes the need to allocate a dynamic buffer using kmalloc(). As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt. PERFORMANCE MEASUREMENTS (x86_64) Performed using: https://gitlab.com/omos/linux-crypto-bench Crypto driver used: lrw(ecb-aes-aesni) The results show that the new code has about the same performance as the old code. For 512-byte message it seems to be even slightly faster, but that might be just noise. Before: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) lrw(aes) 256 64 200 203 lrw(aes) 320 64 202 204 lrw(aes) 384 64 204 205 lrw(aes) 256 512 415 415 lrw(aes) 320 512 432 440 lrw(aes) 384 512 449 451 lrw(aes) 256409618381995 lrw(aes) 320409621231980 lrw(aes) 384409621002119 lrw(aes) 256 1638471836954 lrw(aes) 320 1638478447631 lrw(aes) 384 1638482568126 lrw(aes) 256 32768 14772 14484 lrw(aes) 320 32768 15281 15431 lrw(aes) 384 32768 16469 16293 After: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) lrw(aes) 256 64 197 196 lrw(aes) 320 64 200 197 lrw(aes) 384 64 203 199 lrw(aes) 256 512 385 380 lrw(aes) 320 512 401 395 lrw(aes) 384 512 415 415 lrw(aes) 256409618691846 lrw(aes) 320409620801981 lrw(aes) 384409621602109 lrw(aes) 256 1638470777127 lrw(aes) 320 1638478077766 lrw(aes) 384 1638481088357 lrw(aes) 256 32768 14111 14454 lrw(aes) 320 32768 15268 15082 lrw(aes) 384 32768 16581 16250 [1] https://lkml.org/lkml/2018/8/23/1315 Signed-off-by: Ondrej Mosnacek --- crypto/lrw.c | 274 --- 1 file changed, 42 insertions(+), 232 deletions(-) diff --git a/crypto/lrw.c b/crypto/lrw.c index b4f30b6f16d6..990eb0f2f51e 100644 --- a/crypto/lrw.c +++ b/crypto/lrw.c @@ -29,8 +29,6 @@ #include #include -#define LRW_BUFFER_SIZE 128u - #define LRW_BLOCK_SIZE 16 struct priv { @@ -56,19 +54,7 @@ struct priv { }; struct rctx { - be128 buf[LRW_BUFFER_SIZE / sizeof(be128)]; - - be128 t; - - be128 *ext; - - struct scatterlist srcbuf[2]; - struct scatterlist dstbuf[2]; - struct scatterlist *src; - struct scatterlist *dst; - - unsigned int left; - + be128 t, orig_iv; struct skcipher_request subreq; }; @@ -135,85 +121,26 @@ static int next_index(u32 *counter) return res; } -static int post_crypt(struct skcipher_request *req) +/* + * We compute the tweak masks twice (both before and after the ECB encryption or + * decryption) to avoid having to allocate a temporary buffer and/or make + * mutliple calls to the 'ecb(..)' instance, which usually would be slower than + * just doing the gf128mul_x_ble() calls again. + */ +static int xor_tweak(struct skcipher_request *req, struct skcipher_request *subreq) { - struct rctx *rctx = skcipher_request_ctx(req); - be128 *buf = rctx->ext ?: rctx->buf; - struct skcipher_request *subreq; const int bs = LRW_BLOCK_SIZE; - struct skcipher_walk w; - struct scatterlist *sg; - unsigned offset; - int err; - - subreq = &rctx->subreq; - err = skcipher_walk_virt(&w, subreq, false); - - while (w.nbytes) { - unsigned int avail = w.nbytes; - be128 *wdst; - - wdst = w.dst.virt.addr; - - do { - be128_xor(wdst, buf++, wdst); - wdst++; - } while ((avail -= bs) >= bs); - - err
[dm-devel] [PATCH 1/2] crypto: lrw - Optimize tweak computation
This patch rewrites the tweak computation to a slightly simpler method that performs less bswaps. Based on performance measurements the new code seems to provide slightly better performance than the old one. PERFORMANCE MEASUREMENTS (x86_64) Performed using: https://gitlab.com/omos/linux-crypto-bench Crypto driver used: lrw(ecb-aes-aesni) Before: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) lrw(aes) 256 64 204 286 lrw(aes) 320 64 227 203 lrw(aes) 384 64 208 204 lrw(aes) 256 512 441 439 lrw(aes) 320 512 456 455 lrw(aes) 384 512 469 483 lrw(aes) 256409621362190 lrw(aes) 320409621612213 lrw(aes) 384409622952369 lrw(aes) 256 1638476927868 lrw(aes) 320 1638482308691 lrw(aes) 384 1638489718813 lrw(aes) 256 32768 15336 15560 lrw(aes) 320 32768 16410 16346 lrw(aes) 384 32768 18023 17465 After: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) lrw(aes) 256 64 200 203 lrw(aes) 320 64 202 204 lrw(aes) 384 64 204 205 lrw(aes) 256 512 415 415 lrw(aes) 320 512 432 440 lrw(aes) 384 512 449 451 lrw(aes) 256409618381995 lrw(aes) 320409621231980 lrw(aes) 384409621002119 lrw(aes) 256 1638471836954 lrw(aes) 320 1638478447631 lrw(aes) 384 1638482568126 lrw(aes) 256 32768 14772 14484 lrw(aes) 320 32768 15281 15431 lrw(aes) 384 32768 16469 16293 Signed-off-by: Ondrej Mosnacek --- crypto/lrw.c | 49 + 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/crypto/lrw.c b/crypto/lrw.c index 393a782679c7..b4f30b6f16d6 100644 --- a/crypto/lrw.c +++ b/crypto/lrw.c @@ -120,30 +120,19 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key, return 0; } -static inline void inc(be128 *iv) +static int next_index(u32 *counter) { - be64_add_cpu(&iv->b, 1); - if (!iv->b) - be64_add_cpu(&iv->a, 1); -} - -/* this returns the number of consequative 1 bits starting - * from the right, get_index128(00 00 00 00 00 00 ... 00 00 10 FB) = 2 */ -static inline int get_index128(be128 *block) -{ - int x; - __be32 *p = (__be32 *) block; - - for (p += 3, x = 0; x < 128; p--, x += 32) { - u32 val = be32_to_cpup(p); - - if (!~val) - continue; + int i, res = 0; - return x + ffz(val); + for (i = 0; i < 4; i++) { + if (counter[i] + 1 != 0) { + res += ffz(counter[i]++); + break; + } + counter[i] = 0; + res += 32; } - - return x; + return res; } static int post_crypt(struct skcipher_request *req) @@ -209,8 +198,9 @@ static int pre_crypt(struct skcipher_request *req) struct scatterlist *sg; unsigned cryptlen; unsigned offset; - be128 *iv; bool more; + __u32 *iv; + u32 counter[4]; int err; subreq = &rctx->subreq; @@ -227,6 +217,11 @@ static int pre_crypt(struct skcipher_request *req) err = skcipher_walk_virt(&w, subreq, false); iv = w.iv; + counter[0] = be32_to_cpu(iv[3]); + counter[1] = be32_to_cpu(iv[2]); + counter[2] = be32_to_cpu(iv[1]); + counter[3] = be32_to_cpu(iv[0]); + while (w.nbytes) { unsigned int avail = w.nbytes; be128 *wsrc; @@ -242,10 +237,16 @@ static int pre_crypt(struct skcipher_request *req) /* T <- I*Key2, using the optimization * discussed in the specification
[dm-devel] [PATCH v3] crypto: xts - Drop use of auxiliary buffer
Since commit acb9b159c784 ("crypto: gf128mul - define gf128mul_x_* in gf128mul.h"), the gf128mul_x_*() functions are very fast and therefore caching the computed XTS tweaks has only negligible advantage over computing them twice. In fact, since the current caching implementation limits the size of the calls to the child ecb(...) algorithm to PAGE_SIZE (usually 4096 B), it is often actually slower than the simple recomputing implementation. This patch simplifies the XTS template to recompute the XTS tweaks from scratch in the second pass and thus also removes the need to allocate a dynamic buffer using kmalloc(). As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt. PERFORMANCE RESULTS I measured time to encrypt/decrypt a memory buffer of varying sizes with xts(ecb-aes-aesni) using a tool I wrote ([2]) and the results suggest that after this patch the performance is either better or comparable for both small and large buffers. Note that there is a lot of noise in the measurements, but the overall difference is easy to see. Old code: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) xts(aes) 256 64 331 328 xts(aes) 384 64 332 333 xts(aes) 512 64 338 348 xts(aes) 256 512 889 920 xts(aes) 384 5121019 993 xts(aes) 512 5121032 990 xts(aes) 256409621522292 xts(aes) 384409624532597 xts(aes) 512409630412641 xts(aes) 256 1638494438027 xts(aes) 384 1638485368925 xts(aes) 512 1638492329417 xts(aes) 256 32768 16383 14897 xts(aes) 384 32768 17527 16102 xts(aes) 512 32768 18483 17322 New code: ALGORITHM KEY (b)DATA (B) TIME ENC (ns) TIME DEC (ns) xts(aes) 256 64 328 324 xts(aes) 384 64 324 319 xts(aes) 512 64 320 322 xts(aes) 256 512 476 473 xts(aes) 384 512 509 492 xts(aes) 512 512 531 514 xts(aes) 256409621321829 xts(aes) 384409623572055 xts(aes) 512409621782027 xts(aes) 256 1638469206983 xts(aes) 384 1638485977505 xts(aes) 512 1638478418164 xts(aes) 256 32768 13468 12307 xts(aes) 384 32768 14808 13402 xts(aes) 512 32768 15753 14636 [1] https://lkml.org/lkml/2018/8/23/1315 [2] https://gitlab.com/omos/linux-crypto-bench Signed-off-by: Ondrej Mosnacek --- crypto/xts.c | 265 --- 1 file changed, 39 insertions(+), 226 deletions(-) Changes in v3: - add comment explaining the new approach as suggested by Eric - ensure correct alignment in second xor_tweak() pass - align performance results table header to the right v2: https://www.spinics.net/lists/linux-crypto/msg34799.html Changes in v2: - rebase to latest cryptodev tree v1: https://www.spinics.net/lists/linux-crypto/msg34776.html diff --git a/crypto/xts.c b/crypto/xts.c index ccf55fbb8bc2..24cfecdec565 100644 --- a/crypto/xts.c +++ b/crypto/xts.c @@ -26,8 +26,6 @@ #include #include -#define XTS_BUFFER_SIZE 128u - struct priv { struct crypto_skcipher *child; struct crypto_cipher *tweak; @@ -39,19 +37,7 @@ struct xts_instance_ctx { }; struct rctx { - le128 buf[XTS_BUFFER_SIZE / sizeof(le128)]; - le128 t; - - le128 *ext; - - struct scatterlist srcbuf[2]; - struct scatterlist dstbuf[2]; - struct scatterlist *src; - struct scatterlist *dst; - - unsigned int left; - struct skcipher_request subreq; }; @@ -96,265 +82,92 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key, return err; } -static int post_crypt(struct skcipher_request *req) +/* + * We compute the tweak masks twice (both before and after the ECB encryption or + * decryption) to avoid having to
Re: [dm-devel] [PATCH v2] crypto: xts - Drop use of auxiliary buffer
Hi Eric, On Wed, Sep 5, 2018 at 8:32 AM Eric Biggers wrote: > Hi Ondrej, > > On Tue, Sep 04, 2018 at 10:06:42AM +0200, Ondrej Mosnacek wrote: > > Since commit acb9b159c784 ("crypto: gf128mul - define gf128mul_x_* in > > gf128mul.h"), the gf128mul_x_*() functions are very fast and therefore > > caching the computed XTS tweaks has only negligible advantage over > > computing them twice. > > > > In fact, since the current caching implementation limits the size of > > the calls to the child ecb(...) algorithm to PAGE_SIZE (usually 4096 B), > > it is often actually slower than the simple recomputing implementation. > > > > This patch simplifies the XTS template to recompute the XTS tweaks from > > scratch in the second pass and thus also removes the need to allocate a > > dynamic buffer using kmalloc(). > > > > As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt. > > > > PERFORMANCE RESULTS > > I measured time to encrypt/decrypt a memory buffer of varying sizes with > > xts(ecb-aes-aesni) using a tool I wrote ([2]) and the results suggest > > that after this patch the performance is either better or comparable for > > both small and large buffers. Note that there is a lot of noise in the > > measurements, but the overall difference is easy to see. > > > > Old code: > > ALGORITHM KEY (b) DATA (B)TIME ENC (ns) TIME DEC (ns) > > xts(aes) 256 64 331 328 > > xts(aes) 384 64 332 333 > > xts(aes) 512 64 338 348 > > xts(aes) 256 512 889 920 > > xts(aes) 384 5121019 993 > > xts(aes) 512 5121032 990 > > xts(aes) 256409621522292 > > xts(aes) 384409624532597 > > xts(aes) 512409630412641 > > xts(aes) 256 1638494438027 > > xts(aes) 384 1638485368925 > > xts(aes) 512 1638492329417 > > xts(aes) 256 32768 16383 14897 > > xts(aes) 384 32768 17527 16102 > > xts(aes) 512 32768 18483 17322 > > > > New code: > > ALGORITHM KEY (b) DATA (B)TIME ENC (ns) TIME DEC (ns) > > xts(aes) 256 64 328 324 > > xts(aes) 384 64 324 319 > > xts(aes) 512 64 320 322 > > xts(aes) 256 512 476 473 > > xts(aes) 384 512 509 492 > > xts(aes) 512 512 531 514 > > xts(aes) 256409621321829 > > xts(aes) 384409623572055 > > xts(aes) 512409621782027 > > xts(aes) 256 1638469206983 > > xts(aes) 384 1638485977505 > > xts(aes) 512 1638478418164 > > xts(aes) 256 32768 13468 12307 > > xts(aes) 384 32768 14808 13402 > > xts(aes) 512 32768 15753 14636 > > Can you align the headers of these tables? Sure. > > > +static int xor_tweak(struct rctx *rctx, struct skcipher_request *req) > > { > > - struct rctx *rctx = skcipher_request_ctx(req); > > - le128 *buf = rctx->ext ?: rctx->buf; > > - struct skcipher_request *subreq; > > const int bs = XTS_BLOCK_SIZE; > > struct skcipher_walk w; > > - struct scatterlist *sg; > > - unsigned offset; > > + le128 t = rctx->t; > > int err; > > Maybe you could add a brief comment above xor_tweak() explaining the design > choice for posterity, e.g.: > > /* > * We compute the tweak masks twice (both before and after the ECB encryption > or > * decryption) to avoid having to allocate a temporary buffer, which u
[dm-devel] [PATCH v2] crypto: xts - Drop use of auxiliary buffer
Since commit acb9b159c784 ("crypto: gf128mul - define gf128mul_x_* in
gf128mul.h"), the gf128mul_x_*() functions are very fast and therefore
caching the computed XTS tweaks has only negligible advantage over
computing them twice.

In fact, since the current caching implementation limits the size of the
calls to the child ecb(...) algorithm to PAGE_SIZE (usually 4096 B), it is
often actually slower than the simple recomputing implementation.

This patch simplifies the XTS template to recompute the XTS tweaks from
scratch in the second pass and thus also removes the need to allocate a
dynamic buffer using kmalloc().

As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt.

PERFORMANCE RESULTS

I measured time to encrypt/decrypt a memory buffer of varying sizes with
xts(ecb-aes-aesni) using a tool I wrote ([2]) and the results suggest that
after this patch the performance is either better or comparable for both
small and large buffers. Note that there is a lot of noise in the
measurements, but the overall difference is easy to see.

Old code:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
xts(aes)       256        64            331            328
xts(aes)       384        64            332            333
xts(aes)       512        64            338            348
xts(aes)       256       512            889            920
xts(aes)       384       512           1019            993
xts(aes)       512       512           1032            990
xts(aes)       256      4096           2152           2292
xts(aes)       384      4096           2453           2597
xts(aes)       512      4096           3041           2641
xts(aes)       256     16384           9443           8027
xts(aes)       384     16384           8536           8925
xts(aes)       512     16384           9232           9417
xts(aes)       256     32768          16383          14897
xts(aes)       384     32768          17527          16102
xts(aes)       512     32768          18483          17322

New code:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
xts(aes)       256        64            328            324
xts(aes)       384        64            324            319
xts(aes)       512        64            320            322
xts(aes)       256       512            476            473
xts(aes)       384       512            509            492
xts(aes)       512       512            531            514
xts(aes)       256      4096           2132           1829
xts(aes)       384      4096           2357           2055
xts(aes)       512      4096           2178           2027
xts(aes)       256     16384           6920           6983
xts(aes)       384     16384           8597           7505
xts(aes)       512     16384           7841           8164
xts(aes)       256     32768          13468          12307
xts(aes)       384     32768          14808          13402
xts(aes)       512     32768          15753          14636

[1] https://lkml.org/lkml/2018/8/23/1315
[2] https://gitlab.com/omos/linux-crypto-bench

Cc: Mikulas Patocka
Signed-off-by: Ondrej Mosnacek
---
 crypto/xts.c | 258 ++-
 1 file changed, 30 insertions(+), 228 deletions(-)

v1: https://www.spinics.net/lists/linux-crypto/msg34776.html
Changes in v2:
 - rebase to latest cryptodev tree

diff --git a/crypto/xts.c b/crypto/xts.c index ccf55fbb8bc2..6c49e76df8ca 100644 --- a/crypto/xts.c +++ b/crypto/xts.c @@ -26,8 +26,6 @@ #include #include -#define XTS_BUFFER_SIZE 128u - struct priv { struct crypto_skcipher *child; struct crypto_cipher *tweak; @@ -39,19 +37,7 @@ struct xts_instance_ctx { }; struct rctx { - le128 buf[XTS_BUFFER_SIZE / sizeof(le128)]; - le128 t; - - le128 *ext; - - struct scatterlist srcbuf[2]; - struct scatterlist dstbuf[2]; - struct scatterlist *src; - struct scatterlist *dst; - - unsigned int left; - struct skcipher_request subreq; }; @@ -96,265 +82,81 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key, return err; } -static int post_crypt(struct skcipher_request *req) +static int xor_tweak(struct rctx *rctx, struct skcipher_request *req) { - struct rctx *rctx = skcipher_request_ctx(req); - le128 *buf = rctx->ext ?: rctx->buf; - struct skcipher_request *subreq; const int bs = XTS_BLOCK_SIZE; struct skcipher_walk w; - struct scatterlist *sg; - unsigned offset; + l
[dm-devel] [PATCH] crypto: xts - Drop use of auxiliary buffer
Hi Herbert, Mikulas,

I noticed the discussion about using kmalloc() inside crypto code and it made
me wonder if the code in xts.c can actually be simplified to not require
kmalloc() at all, while not badly affecting performance. I played around with
it a little and it turns out we can drop the whole caching of tweak blocks,
reducing code size by ~200 lines and not only preserve, but even improve the
performance in some cases. See the full patch below.

Obviously, this doesn't solve the general issue of using kmalloc() in crypto
API routines, but it does resolve the original reported problem and also
makes the XTS code easier to maintain.

Regards,
Ondrej

->8-

Since commit acb9b159c784 ("crypto: gf128mul - define gf128mul_x_* in
gf128mul.h"), the gf128mul_x_*() functions are very fast and therefore
caching the computed XTS tweaks has only negligible advantage over
computing them twice.

In fact, since the current caching implementation limits the size of the
calls to the child ecb(...) algorithm to PAGE_SIZE (usually 4096 B), it is
often actually slower than the simple recomputing implementation.

This patch simplifies the XTS template to recompute the XTS tweaks from
scratch in the second pass and thus also removes the need to allocate a
dynamic buffer using kmalloc().

As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt.

PERFORMANCE RESULTS

I measured time to encrypt/decrypt a memory buffer of varying sizes with
xts(ecb-aes-aesni) using a tool I wrote ([2]) and the results suggest that
after this patch the performance is either better or comparable for both
small and large buffers. Note that there is a lot of noise in the
measurements, but the overall difference is easy to see.

Old code:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
xts(aes)       256        64            331            328
xts(aes)       384        64            332            333
xts(aes)       512        64            338            348
xts(aes)       256       512            889            920
xts(aes)       384       512           1019            993
xts(aes)       512       512           1032            990
xts(aes)       256      4096           2152           2292
xts(aes)       384      4096           2453           2597
xts(aes)       512      4096           3041           2641
xts(aes)       256     16384           9443           8027
xts(aes)       384     16384           8536           8925
xts(aes)       512     16384           9232           9417
xts(aes)       256     32768          16383          14897
xts(aes)       384     32768          17527          16102
xts(aes)       512     32768          18483          17322

New code:
ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
xts(aes)       256        64            328            324
xts(aes)       384        64            324            319
xts(aes)       512        64            320            322
xts(aes)       256       512            476            473
xts(aes)       384       512            509            492
xts(aes)       512       512            531            514
xts(aes)       256      4096           2132           1829
xts(aes)       384      4096           2357           2055
xts(aes)       512      4096           2178           2027
xts(aes)       256     16384           6920           6983
xts(aes)       384     16384           8597           7505
xts(aes)       512     16384           7841           8164
xts(aes)       256     32768          13468          12307
xts(aes)       384     32768          14808          13402
xts(aes)       512     32768          15753          14636

[1] https://lkml.org/lkml/2018/8/23/1315
[2] https://gitlab.com/omos/linux-crypto-bench

Cc: Mikulas Patocka
Signed-off-by: Ondrej Mosnacek
---
 crypto/xts.c | 258 ++-
 1 file changed, 30 insertions(+), 228 deletions(-)

diff --git a/crypto/xts.c b/crypto/xts.c index 12284183bd20..6c49e76df8ca 100644 --- a/crypto/xts.c +++ b/crypto/xts.c @@ -26,8 +26,6 @@ #include #include -#define XTS_BUFFER_SIZE 128u - struct priv { struct crypto_skcipher *child; struct crypto_cipher *tweak; @@ -39,19 +37,7 @@ struct xts_instance_ctx { }; struct rctx { - le128 buf[XTS_BUFFER_SIZE / sizeof(le128)]; - le128 t; - - le128 *ext; - - struct scatterlist srcbuf[2]; - struct scatterlist dstbuf[2]; - struct scatterlist *src; - struct scatterlist *dst; - - unsigned int left
[dm-devel] [PATCH] dm integrity: Reject mappings too large for device
Before this patch, dm-integrity would successfully create mappings with the number of sectors greater than the provided data sector count. Attempts to read sectors of this mapping that were beyond the provided data sector count would then yield run-time messages of the form "device-mapper: integrity: Too big sector number: ...". This patch fixes this by emitting an error when the requested mapping size is bigger than the provided data sector count. Cc: Mikulas Patocka Cc: Milan Broz Signed-off-by: Ondrej Mosnacek --- drivers/md/dm-integrity.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 7910bfe50da4..4ab10cf718c9 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3040,6 +3040,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->error = "The device is too small"; goto bad; } + if (ti->len > ic->provided_data_sectors) { + r = -EINVAL; + ti->error = "Not enough provided sectors for requested mapping size"; + goto bad; + } if (!buffer_sectors) buffer_sectors = 1; -- 2.11.0 -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
Re: [dm-devel] dm-crypt IV generation (summary)
2017-04-07 8:12 GMT+02:00 Herbert Xu : > On Fri, Mar 10, 2017 at 02:44:26PM +0100, Ondrej Mosnacek wrote: >> >> ISSUES: >> a) The 'keycount' parameter. >> In order to support multi-key modes from Loop-AES, >> dm-crypt accepts a keycount parameter which, if it != 1, causes >> consecutive sectors to be encrypted with a different key. This >> parameter can be specified with any of the cipher modes, which makes >> porting the whole scale of modes supported by dm-crypt to crypto API >> rather messy, since a lot of dm-crypt specific stuff needs to be moved >> into the crypto drivers. > > Actually I think this one can probably easily handled in the crypto > layer. All we need is to add a multikey template that sits on top > of an underlying IV generator. The multikey instance can then accept > a key length that is a multiple of the underlying key length. I thought of that, too, but unfortunately with TCW the key structure is: | KEY 1 | KEY 2 | ... | KEY n | IV SEED (size = IV size) | WHITENING (size = 16 bytes) | So part of the key needs to be processed before it is split into multiple keys. Also with the LMK mode, there is a similar optional key appendix, which is of the same size as the other subkeys. >> b) New AEAD functionality; random IV generator. >> The soon-to-be-added AEAD functionality in dm-crypt >> introduces a new IV generator that generates IVs randomly and stores >> them as sector metadata. This means IV generation cannot be handled >> solely in the driver. Also, additional AEAD implementation of IV >> generators would be eventually needed. > > Again I don't see the problem here. IV generators are supposed > to return the IV to the caller so that it can be transmitted. > > For example, the IV generated in IPsec is explicitly transmitted. > Here we just need to store the IV. My point is that it doesn't make much sense to have a crypto API alg that calls get_random_bytes() as part of its implementation. IMHO, that might tempt HW drivers to replace it with some crappy alternatives, which wouldn't be good... Also, how would we test such alg with testmgr? I would say the best solution in this case would be to treat the 'random' IV mode as a special case in dm-crypt: the IVs would be generated in dm-crypt and passed to a special template (which would have a similar interface as the other IV generation templates) that would just pass the generated IVs to underlying skciphers instead of generating them from the sequence number. This template would need to provide some way to pass the IVs, e.g. by prepending them to the plaintext/ciphertext. Cheers, Ondrej -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
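The key layout quoted above is the crux of the 'keycount' objection: the per-sector keys cannot simply be split off until the shared IV-seed and whitening bytes at the end are accounted for. A purely illustrative helper showing that arithmetic (the function name is invented here and the sizes follow the description in the mail, this is not dm-crypt code) would look roughly like:

	/* Illustrative only: per-key size once the shared tail is stripped. */
	static unsigned int tcw_subkey_size(unsigned int total_key_size,
					    unsigned int key_count,
					    unsigned int iv_size)
	{
		unsigned int tail = iv_size + 16;	/* IV seed + 16 B whitening */

		return (total_key_size - tail) / key_count;
	}

This is why a generic "multikey" template that only knows how to cut the key into equal parts would not be enough for the tcw (and, with its optional extra subkey, lmk) formats.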
[dm-devel] dm-crypt IV generation (summary)
Hi all,

I was tasked to post a summary of the whole dm-crypt IV generation problem and
all the suggested solutions along with their drawbacks, so here it goes...

PROBLEM STATEMENT:
Currently, dm-crypt uses a fixed 512-byte sector size and handles
en-/decrypting of a bio by submitting a separate request to the crypto API
for each sector. This causes a performance problem with some crypto drivers
that have a high processing overhead for small requests, i.e. most crypto
accelerators [1][2] and also e.g. the AES-NI driver [3].

POSSIBLE SOLUTIONS:

1. Move IV generator algorithms to crypto API and allow them to process the
   whole bio at once. ([5] or something equivalent)

   ISSUES:
   a) The 'keycount' parameter.
      In order to support multi-key modes from Loop-AES, dm-crypt accepts a
      keycount parameter which, if it != 1, causes consecutive sectors to be
      encrypted with a different key. This parameter can be specified with
      any of the cipher modes, which makes porting the whole scale of modes
      supported by dm-crypt to crypto API rather messy, since a lot of
      dm-crypt specific stuff needs to be moved into the crypto drivers.
   b) New AEAD functionality; random IV generator.
      The soon-to-be-added AEAD functionality in dm-crypt introduces a new
      IV generator that generates IVs randomly and stores them as sector
      metadata. This means IV generation cannot be handled solely in the
      driver. Also, additional AEAD implementation of IV generators would be
      eventually needed.

2. Restrict the keycount parameter to allow only reasonable combinations and
   then move IV generators to crypto API.
   In Loop-AES, the multi-key mode always uses exactly 64 keys and there is
   no reasonable scenario where someone would like to use keycount != 1 with
   other IV mode than LMK. Thus it might be acceptable to only work with
   multiple keys in LMK (and only allow keycount == 64 for it) and work with
   just one key in all other modes (and only allow keycount == 1 for them).
   I.e., the cipher format would remain the same, just with more restricted
   values.
   Note that this would be basically a breaking change (the keycount
   parameter is documented [4], so there may be users somewhere that use it
   in some unusual combination...). Still, such combinations would have to be
   used explicitly, as they are never used with common LUKS/Loop-AES/TrueCrypt
   partitions (something like cryptsetup --type plain ... or dmsetup would
   have to be used directly). Applying this change would allow implementing
   the IV generators as simple crypto algorithms that honor the usual
   interface (without setkey hacks and passing of special structures via IV).

   ISSUES:
   a) Backward incompatibility (see above).
   b) Again need to somehow handle the new 'random' IV generator.

3. Extend crypto API to allow passing multiple requests at once (each with a
   separate IV) to crypto drivers. (see [3])

   ISSUES:
   a) HW accelerators would have to specifically support this scheme (unless
      they are somehow programmable). I.e. accelerators that already have the
      plain64 IV generator hard-coded could not fully benefit from this (I
      don't know how many are already out there). However, future ones could
      implement this 'universal' IV mode and enjoy basically the same benefit.

4. Implement support of 4KB sectors (or other sizes, too) in dm-crypt.
   This would partially help, since on devices with 4K sectors the size of
   each crypto request would then be 8x bigger and overhead would be reduced
   without the need to mess with the dm-crypt IV generators.

   ISSUES:
   a) Only 4KB would be processed at once, not the whole bio (but it may
      suffice).
   b) Partitions encrypted as 4KB sectors could not be safely moved to a
      512B-sector device (writes would not be atomic).

QUESTIONS:
@Mike/dm-devel: Do you think it would be feasible to apply solution 2 (thus
introducing the small incompatibility)?

REFERENCES:
[1] https://nelenkov.blogspot.com/2015/05/hardware-accelerated-disk-encryption-in.html
[2] https://lkml.org/lkml/2017/3/2/274
[3] https://www.mail-archive.com/linux-crypto@vger.kernel.org/msg23007.html
[4] https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt
[5] https://lkml.org/lkml/2017/2/7/182

Cheers,
Ondrej

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
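For solution 2 above, the proposed restriction boils down to a single validity check while parsing the cipher specification. The snippet below is only a sketch of that rule; the variable names are invented for the example and it is not taken from dm-crypt:

	/* Sketch of the restriction proposed in solution 2. */
	if (key_count != 1 &&
	    !(key_count == 64 && strcmp(iv_mode, "lmk") == 0))
		return -EINVAL;	/* only LMK may use the 64-key Loop-AES scheme */

Everything else about the mapping table format would stay as it is today; only these previously-allowed but practically unused combinations would start being rejected.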
Re: [dm-devel] [RFC PATCH v4] IV Generation algorithms for dm-crypt
2017-03-01 13:42 GMT+01:00 Gilad Ben-Yossef : > It really is an observation about overhead of context switches between > dm-crypt and > whatever/wherever you handle crypto - be it an off CPU hardware engine > or a bunch > of parallel kernel threads running on other cores. You really want to > burst as much as > possible. [...] >> For XTS you need just simple linear IV. No problem with that, implementation >> in crypto API and hw is trivial. >> >> But for compatible IV (that provides compatibility with loopAES and very old >> TrueCrypt), >> these should be never ever implemented again anywhere. > >> >> Specifically "tcw" is broken, insecure and provided here just to help people >> to migrate >> from old Truecrypt containers. Even Truecrypt followers removed it from the >> codebase. >> (It is basically combination of IV and slight modification of CBC mode. All >> recent version switched to XTS and plain IV.) >> >> So building abstraction over something known to be broken and that is now >> intentionally >> isolated inside dmcrypt is, in my opinion, really not a good idea. >> > > I don't think anyone is interested in these modes. How do you support > XTS and essiv in > a generic way without supporting this broken modes is not something > I'm clear on though. Wouldn't adopting a bulk request API (something like what I tried to do here [1]) that allows users to supply multiple messages, each with their own IV, fulfill this purpose? That way, we wouldn't need to introduce any new modes into Crypto API and the drivers/accelerators would only need to provide bulk implementations of common modes (xts(aes), cbc(aes), ...) to provide better performance for dm-crypt (and possibly other users, too). I'm not sure how exactly these crypto accelerators work, but wouldn't it help if the drivers simply get more messages (in our case sectors) in a single call? I wonder, would (efficiently) supporting such a scheme require changes in the HW itself or could it be achieved just by modifying driver code (let's say specifically for your CryptoCell accelerator)? Thanks, Ondrej [1] https://www.mail-archive.com/linux-crypto@vger.kernel.org/msg23007.html > > Thanks, > Gilad > > > > -- > Gilad Ben-Yossef > Chief Coffee Drinker > > "If you take a class in large-scale robotics, can you end up in a > situation where the homework eats your dog?" > -- Jean-Baptiste Queru -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
[dm-devel] [RFC PATCH 4/6] crypto: simd - Add bulk request support
This patch adds proper support for the new bulk requests to the SIMD helpers. Signed-off-by: Ondrej Mosnacek --- crypto/simd.c | 61 +++ 1 file changed, 61 insertions(+) diff --git a/crypto/simd.c b/crypto/simd.c index 8820337..2ae5930 100644 --- a/crypto/simd.c +++ b/crypto/simd.c @@ -100,6 +100,64 @@ static int simd_skcipher_decrypt(struct skcipher_request *req) return crypto_skcipher_decrypt(subreq); } +static int simd_skcipher_encrypt_bulk(struct skcipher_bulk_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req); + struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_bulk_request *subreq; + struct crypto_skcipher *child; + + subreq = skcipher_bulk_request_ctx(req); + *subreq = *req; + + if (!may_use_simd() || + (in_atomic() && cryptd_skcipher_queued(ctx->cryptd_tfm))) + child = &ctx->cryptd_tfm->base; + else + child = cryptd_skcipher_child(ctx->cryptd_tfm); + + skcipher_bulk_request_set_tfm(subreq, child); + + return crypto_skcipher_encrypt_bulk(subreq); +} + +static int simd_skcipher_decrypt_bulk(struct skcipher_bulk_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req); + struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_bulk_request *subreq; + struct crypto_skcipher *child; + + subreq = skcipher_bulk_request_ctx(req); + *subreq = *req; + + if (!may_use_simd() || + (in_atomic() && cryptd_skcipher_queued(ctx->cryptd_tfm))) + child = &ctx->cryptd_tfm->base; + else + child = cryptd_skcipher_child(ctx->cryptd_tfm); + + skcipher_bulk_request_set_tfm(subreq, child); + + return crypto_skcipher_decrypt_bulk(subreq); +} + +static unsigned int simd_skcipher_reqsize_bulk(struct crypto_skcipher *tfm, + unsigned int maxmsgs) +{ + struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_skcipher *tfm_cryptd, *tfm_child; + unsigned int reqsize_cryptd, reqsize_child; + + tfm_cryptd = &ctx->cryptd_tfm->base; + tfm_child = cryptd_skcipher_child(ctx->cryptd_tfm); + + reqsize_cryptd = crypto_skcipher_bulk_reqsize(tfm_cryptd, maxmsgs); + reqsize_child = crypto_skcipher_bulk_reqsize(tfm_child, maxmsgs); + return sizeof(struct skcipher_bulk_request) + + max(reqsize_cryptd, reqsize_child); +} + static void simd_skcipher_exit(struct crypto_skcipher *tfm) { struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); @@ -187,6 +245,9 @@ struct simd_skcipher_alg *simd_skcipher_create_compat(const char *algname, alg->setkey = simd_skcipher_setkey; alg->encrypt = simd_skcipher_encrypt; alg->decrypt = simd_skcipher_decrypt; + alg->encrypt_bulk = simd_skcipher_encrypt_bulk; + alg->decrypt_bulk = simd_skcipher_decrypt_bulk; + alg->reqsize_bulk = simd_skcipher_reqsize_bulk; err = crypto_register_skcipher(alg); if (err) -- 2.9.3 -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
[dm-devel] [RFC PATCH 6/6] dm-crypt: Add bulk crypto processing support
This patch converts dm-crypt to use bulk requests when invoking skcipher
operations, allowing the crypto drivers to process multiple sectors at once,
while reducing the overhead caused by the small sector size.

The new code detects if multiple sectors from a bio are contiguously stored
within a single page (which should almost always be the case), and in such
case processes all these sectors via a single bulk request.

Note that the bio can also consist of several (likely consecutive) pages,
which could be all bundled in a single request. However, since we need to
specify an upper bound on how many sectors we are going to send at once (and
this bound may affect the amount of memory allocated per single request), it
is best to just limit the request bundling to a single page.

Note that if the 'keycount' parameter of the cipher specification is set to a
value other than 1, dm-crypt still sends only one sector in each request,
since in such case the neighboring sectors are encrypted with different keys.

This change causes a detectable read/write speedup (about 5-10%) on a ramdisk
when AES-NI accelerated ciphers are used.

Signed-off-by: Ondrej Mosnacek
---
 drivers/md/dm-crypt.c | 254 --
 1 file changed, 165 insertions(+), 89 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 7c6c572..d3f69e1 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -37,6 +37,9 @@ #define DM_MSG_PREFIX "crypt" +/* for now, we only bundle consecutive sectors within a single page */ +#define MAX_CONSEC_SECTORS (1 << (PAGE_SHIFT - SECTOR_SHIFT)) + /* * context holding the current state of a multi-part conversion */ @@ -48,7 +51,7 @@ struct convert_context { struct bvec_iter iter_out; sector_t cc_sector; atomic_t cc_pending; - struct skcipher_request *req; + struct skcipher_bulk_request *req; }; /* @@ -73,6 +76,7 @@ struct dm_crypt_request { struct scatterlist sg_in; struct scatterlist sg_out; sector_t iv_sector; + sector_t sector_count; }; struct crypt_config; @@ -83,9 +87,9 @@ struct crypt_iv_operations { void (*dtr)(struct crypt_config *cc); int (*init)(struct crypt_config *cc); int (*wipe)(struct crypt_config *cc); - int (*generator)(struct crypt_config *cc, u8 *iv, + int (*generator)(struct crypt_config *cc, u8 *iv, unsigned int sector, struct dm_crypt_request *dmreq); - int (*post)(struct crypt_config *cc, u8 *iv, + int (*post)(struct crypt_config *cc, u8 *iv, unsigned int sector, struct dm_crypt_request *dmreq); }; @@ -163,14 +167,14 @@ struct crypt_config { /* * Layout of each crypto request: * -* struct skcipher_request +* struct skcipher_bulk_request * context * padding * struct dm_crypt_request * padding -* IV +* IVs * -* The padding is added so that dm_crypt_request and the IV are +* The padding is added so that dm_crypt_request and the IVs are * correctly aligned. 
*/ unsigned int dmreq_start; @@ -245,20 +249,24 @@ static struct crypto_skcipher *any_tfm(struct crypt_config *cc) * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 */ -static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, +static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *ivs, unsigned int i, struct dm_crypt_request *dmreq) { + u8 *iv = ivs + i * cc->iv_size; + memset(iv, 0, cc->iv_size); - *(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0x); + *(__le32 *)iv = cpu_to_le32((dmreq->iv_sector + i) & 0x); return 0; } -static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq) +static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *ivs, + unsigned int i, struct dm_crypt_request *dmreq) { + u8 *iv = ivs + i * cc->iv_size; + memset(iv, 0, cc->iv_size); - *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); + *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector + i); return 0; } @@ -410,13 +418,14 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, return err; } -static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, +static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *ivs, unsigned int i, struct dm_crypt_request *dmreq) { struct crypto_cipher *essiv_tfm = cc->iv_private; + u8 *iv = ivs + i * cc->iv_size; memset(iv, 0, cc->iv_size); - *(__le64 *)iv = cpu_to_le64(dmreq-&g
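The "single page" bound in the changelog above is where MAX_CONSEC_SECTORS comes from: with 4 KiB pages and 512 B sectors it evaluates to 1 << (12 - 9) = 8, so at most eight consecutive sectors are batched into one bulk request. A hypothetical helper (not part of the patch; the name is made up) that computes how many sectors of one bio segment can be bundled would be as simple as:

	/* Hypothetical illustration, not from the patch. */
	static unsigned int sectors_to_bundle(const struct bio_vec *bv)
	{
		/* whole sectors available in this segment, capped at one page */
		return min_t(unsigned int, bv->bv_len >> SECTOR_SHIFT,
			     MAX_CONSEC_SECTORS);
	}

Capping at a page keeps the per-request memory bound small and predictable while still covering the common case where a bio's data for consecutive sectors sits in the same page.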
[dm-devel] [RFC PATCH 5/6] crypto: aesni-intel - Add bulk request support
This patch implements bulk request handling in the AES-NI crypto drivers. The major advantage of this is that with bulk requests, the kernel_fpu_* functions (which are usually quite slow) are now called only once for the whole request. Signed-off-by: Ondrej Mosnacek --- arch/x86/crypto/aesni-intel_glue.c| 267 +++--- arch/x86/crypto/glue_helper.c | 23 ++- arch/x86/include/asm/crypto/glue_helper.h | 2 +- 3 files changed, 221 insertions(+), 71 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 36ca150..5f67afc 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -364,70 +364,116 @@ static int aesni_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, crypto_skcipher_ctx(tfm), key, len); } -static int ecb_encrypt(struct skcipher_request *req) +typedef void (*aesni_crypt_t)(struct crypto_aes_ctx *ctx, + u8 *out, const u8 *in, unsigned int len); + +typedef void (*aesni_ivcrypt_t)(struct crypto_aes_ctx *ctx, + u8 *out, const u8 *in, unsigned int len, + u8 *iv); + +static int ecb_crypt(struct crypto_aes_ctx *ctx, struct skcipher_walk *walk, +aesni_crypt_t crypt) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); - struct skcipher_walk walk; unsigned int nbytes; int err; - err = skcipher_walk_virt(&walk, req, true); - kernel_fpu_begin(); - while ((nbytes = walk.nbytes)) { - aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, - nbytes & AES_BLOCK_MASK); + while ((nbytes = walk->nbytes)) { + crypt(ctx, walk->dst.virt.addr, walk->src.virt.addr, + nbytes & AES_BLOCK_MASK); nbytes &= AES_BLOCK_SIZE - 1; - err = skcipher_walk_done(&walk, nbytes); + err = skcipher_walk_done(walk, nbytes); } kernel_fpu_end(); return err; } -static int ecb_decrypt(struct skcipher_request *req) +static int cbc_crypt(struct crypto_aes_ctx *ctx, struct skcipher_walk *walk, +aesni_ivcrypt_t crypt) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); - struct skcipher_walk walk; unsigned int nbytes; int err; - err = skcipher_walk_virt(&walk, req, true); - kernel_fpu_begin(); - while ((nbytes = walk.nbytes)) { - aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, - nbytes & AES_BLOCK_MASK); + while ((nbytes = walk->nbytes)) { + crypt(ctx, walk->dst.virt.addr, walk->src.virt.addr, + nbytes & AES_BLOCK_MASK, walk->iv); nbytes &= AES_BLOCK_SIZE - 1; - err = skcipher_walk_done(&walk, nbytes); + err = skcipher_walk_done(walk, nbytes); } kernel_fpu_end(); return err; } -static int cbc_encrypt(struct skcipher_request *req) +static int ecb_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); struct skcipher_walk walk; - unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; - kernel_fpu_begin(); - while ((nbytes = walk.nbytes)) { - aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, - nbytes & AES_BLOCK_MASK, walk.iv); - nbytes &= AES_BLOCK_SIZE - 1; - err = skcipher_walk_done(&walk, nbytes); - } - kernel_fpu_end(); + return ecb_crypt(ctx, &walk, aesni_ecb_enc); +} - return err; +static int ecb_decrypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + int err; + + err = 
skcipher_walk_virt(&walk, req, true); + if (err) + return err; + + return ecb_crypt(ctx, &walk, aesni_ecb_dec); +} + +static int ecb_encrypt_bulk(struct skcipher_bulk_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + int err; + + err = skcipher_walk_virt_bulk(&walk, req, true); + if (err) + return
[dm-devel] [RFC PATCH 2/6] crypto: skcipher - Add bulk request support to walk
This patch tweaks skcipher_walk so it can be used with the new bulk requests.

The new skcipher_walk can be initialized either from a skcipher_request (in
which case its behavior is equivalent to the old code) or from a
skcipher_bulk_request, in which case the usage is almost identical, the most
significant exception being that skciphers which somehow tweak the IV
(e.g. XTS) must check the new nextmsg flag before processing each chunk and
re-tweak the IV if it is set. For other ciphers skcipher_walk automatically
switches to the next IV at message boundaries.

Signed-off-by: Ondrej Mosnacek
---
 crypto/skcipher.c | 192 +++--
 include/crypto/internal/skcipher.h | 10 +-
 2 files changed, 153 insertions(+), 49 deletions(-)

diff --git a/crypto/skcipher.c b/crypto/skcipher.c index 8b6d684..b810e90 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -33,6 +33,7 @@ enum { SKCIPHER_WALK_COPY = 1 << 2, SKCIPHER_WALK_DIFF = 1 << 3, SKCIPHER_WALK_SLEEP = 1 << 4, + SKCIPHER_WALK_HETEROGENOUS = 1 << 5, }; struct skcipher_walk_buffer { @@ -94,6 +95,41 @@ static inline u8 *skcipher_get_spot(u8 *start, unsigned int len) return max(start, end_page); } +static int skcipher_copy_iv(struct skcipher_walk *walk) +{ + unsigned a = crypto_tfm_ctx_alignment() - 1; + unsigned alignmask = walk->alignmask; + unsigned ivsize = walk->ivsize; + unsigned bs = walk->stride; + unsigned aligned_bs; + unsigned size; + u8 *iv; + + aligned_bs = ALIGN(bs, alignmask); + + /* Minimum size to align buffer by alignmask. */ + size = alignmask & ~a; + + if (walk->flags & SKCIPHER_WALK_PHYS) + size += ivsize; + else { + size += aligned_bs + ivsize; + + /* Minimum size to ensure buffer does not straddle a page. */ + size += (bs - 1) & ~(alignmask | a); + } + + walk->buffer = kmalloc(size, skcipher_walk_gfp(walk)); + if (!walk->buffer) + return -ENOMEM; + + iv = PTR_ALIGN(walk->buffer, alignmask + 1); + iv = skcipher_get_spot(iv, bs) + aligned_bs; + + walk->iv = memcpy(iv, walk->iv, walk->ivsize); + return 0; +} + static int skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize) { u8 *addr; @@ -108,9 +144,12 @@ static int skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize) int skcipher_walk_done(struct skcipher_walk *walk, int err) { unsigned int n = walk->nbytes - err; - unsigned int nbytes; + unsigned int nbytes, nbytes_msg; + + walk->nextmsg = false; /* reset the nextmsg flag */ nbytes = walk->total - n; + nbytes_msg = walk->total_msg - n; if (unlikely(err < 0)) { nbytes = 0; @@ -139,8 +178,31 @@ int skcipher_walk_done(struct skcipher_walk *walk, int err) if (err > 0) err = 0; + if (nbytes && !nbytes_msg) { + walk->nextmsg = true; + + /* write the output IV: */ + if (walk->iv != walk->oiv) + memcpy(walk->oiv, walk->iv, walk->ivsize); + + /* advance to the IV of next message: */ + walk->oiv += walk->ivsize; + walk->iv = walk->oiv; + + if (unlikely(((unsigned long)walk->iv & walk->alignmask))) { + err = skcipher_copy_iv(walk); + if (err) + return err; + } + + nbytes_msg = *walk->nextmsgsize; + if (walk->flags & SKCIPHER_WALK_HETEROGENOUS) + ++walk->nextmsgsize; + } + + walk->nbytes = nbytes_msg; + walk->total_msg = nbytes_msg; walk->total = nbytes; - walk->nbytes = nbytes; scatterwalk_advance(&walk->in, n); scatterwalk_advance(&walk->out, n); @@ -343,13 +405,13 @@ static int skcipher_walk_next(struct skcipher_walk *walk) walk->flags &= ~(SKCIPHER_WALK_SLOW | SKCIPHER_WALK_COPY | SKCIPHER_WALK_DIFF); - n = walk->total; + n = walk->total_msg; bsize = min(walk->stride, max(n, walk->blocksize)); n = 
scatterwalk_clamp(&walk->in, n); n = scatterwalk_clamp(&walk->out, n); if (unlikely(n < bsize)) { - if (unlikely(walk->total < walk->blocksize)) + if (unlikely(walk->total_msg < walk->blocksize)) return skcipher_walk_done(walk, -EINVAL); slow_path: @@ -388,41 +450,6 @@ static int skcipher_walk_next(struct skcipher_walk *walk) } EXPORT_SYMBOL_GPL(skcipher_walk_next); -static
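For a driver author, the practical consequence of the nextmsg flag is one extra check in the walk loop. The outline below is illustrative only: compute_tweak() and crypt_blocks() are placeholders for driver-specific helpers (they are not functions from this series), while the walk fields and skcipher_walk_virt_bulk()/skcipher_walk_done() are the ones used in these patches.

	/* Illustrative walk loop for a tweaked mode such as XTS. */
	err = skcipher_walk_virt_bulk(&walk, req, false);
	while (!err && walk.nbytes) {
		if (walk.nextmsg)
			compute_tweak(ctx, &tweak, walk.iv);	/* placeholder */

		crypt_blocks(ctx, walk.dst.virt.addr, walk.src.virt.addr,
			     walk.nbytes & ~(bs - 1), &tweak);	/* placeholder */

		err = skcipher_walk_done(&walk, walk.nbytes & (bs - 1));
	}

Ciphers that take the IV straight from walk.iv for every chunk need no change at all, since the walk switches to the next message's IV on its own at message boundaries.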
[dm-devel] [RFC PATCH 0/6] Add bulk skcipher requests to crypto API and dm-crypt
Hi, the goal of this patchset is to allow those skcipher API users that need to process batches of small messages (especially dm-crypt) to do so efficiently. The first patch introduces a new request type (and corresponding encrypt/decrypt functions) to the skcipher API. The new API can be used to submit multiple messages at once, thus enabling the drivers to reduce overhead as opposed to processing each message separately. The skcipher drivers can provide support for the new request type by setting the corresponding fields of their skcipher_alg structure. If 'native' support is not provided by a driver (i.e. the fields are left NULL), the crypto API transparently provides a generic fallback implementation, which simply processes the bulk request as a set of standard requests on the same tfm. The second patch extends skcipher_walk so it can be used for processing the new bulk requests, while preserving equivalent functionality when used with standard requests. The third and fourth patches add native bulk request support to the cryptd and SIMD helper wrappers, respectively. The fifth patch adds bulk request support to the AES-NI skcipher drivers, in order to provide an example for both implementing the bulk request processing and the usage of the extended skcipher_walk in such implementation. Also, this patch provides a slight optimization, since the kernel_fpu_* functions are called just once per the whole bulk request. Note that both the standard and bulk implementation mostly use the same code under the hood. The last patch converts dm-crypt to use bulk requests and makes it submit multiple sectors at once, whenever they are stored sequentially within a single page. With all the patches applied, I was able to measure a small speedup (~5-10%) with AES-NI ciphers and dm-crypt device mapped over a ramdisk. To-be-done: testing the bulk API in testmgr.c documentation update Ondrej Mosnacek (6): crypto: skcipher - Add bulk request processing API crypto: skcipher - Add bulk request support to walk crypto: cryptd - Add skcipher bulk request support crypto: simd - Add bulk request support crypto: aesni-intel - Add bulk request support dm-crypt: Add bulk crypto processing support arch/x86/crypto/aesni-intel_glue.c| 267 +++-- arch/x86/crypto/glue_helper.c | 23 +-- arch/x86/include/asm/crypto/glue_helper.h | 2 +- crypto/Makefile | 1 + crypto/cryptd.c | 111 +++ crypto/simd.c | 61 ++ crypto/skcipher.c | 207 +++- crypto/skcipher_bulk.c| 312 ++ drivers/md/dm-crypt.c | 254 +++- include/crypto/internal/skcipher.h| 42 +++- include/crypto/skcipher.h | 299 +++- 11 files changed, 1369 insertions(+), 210 deletions(-) create mode 100644 crypto/skcipher_bulk.c -- 2.9.3 -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel
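From the caller's point of view the new interface mirrors the existing skcipher one. The sketch below is assembled from the helpers visible in patches 1/6 and 3/6 (SKCIPHER_BULK_REQUEST_ON_STACK, skcipher_bulk_request_set_tfm/_callback/_crypt, crypto_skcipher_encrypt_bulk, skcipher_bulk_request_zero); passing NULL for the per-message size array when all messages share one size is an assumption based on the cover text, so treat this strictly as an illustration rather than documented usage.

	/* Illustration only: encrypt eight 512-byte messages in one call. */
	SKCIPHER_BULK_REQUEST_ON_STACK(breq, 8, tfm);

	skcipher_bulk_request_set_tfm(breq, tfm);
	skcipher_bulk_request_set_callback(breq, CRYPTO_TFM_REQ_MAY_SLEEP,
					   NULL, NULL);
	/* 8 messages, 512 B each; the IVs are stored back to back in 'ivs' */
	skcipher_bulk_request_set_crypt(breq, sg_src, sg_dst, 8, 512, NULL, ivs);

	err = crypto_skcipher_encrypt_bulk(breq);
	skcipher_bulk_request_zero(breq);

The maximum message count passed to the ON_STACK macro matters because, as the cover letter explains, the generic fallback sizes its per-request context to hold that many subrequests.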
[dm-devel] [RFC PATCH 1/6] crypto: skcipher - Add bulk request processing API
This patch adds bulk request processing to the skcipher interface.
Specifically, it adds a new type of request ('skcipher_bulk_request'), which
allows passing multiple independent messages to the skcipher driver. The
buffers for the message data are passed via just two sg lists (one for src
buffer, one for dst buffer). The IVs are passed via a single buffer, where
they are stored sequentially. The interface allows specifying either a fixed
length for all messages or a pointer to an array of message lengths.

A skcipher implementation that wants to provide support for bulk requests may
set the appropriate fields of its skcipher_alg struct. If these fields are
not provided (or the skcipher is created from an (a)blkcipher), the crypto
API automatically sets these fields to a fallback implementation, which just
splits the bulk request into a series of regular skcipher requests on the
same tfm. This means that the new type of request can be used with all
skciphers, even if they do not support bulk requests natively.

Note that when allocating a skcipher_bulk_request, the user must specify the
maximum number of messages that they are going to submit via the request.
This is necessary for the fallback implementation, which has to allocate
space for the appropriate number of subrequests so that they can be processed
in parallel. If the skcipher is synchronous, then the fallback implementation
only allocates space for one subrequest and processes the partial requests
sequentially.

Signed-off-by: Ondrej Mosnacek
---
 crypto/Makefile| 1 +
 crypto/skcipher.c | 15 ++
 crypto/skcipher_bulk.c | 312 +
 include/crypto/internal/skcipher.h | 32
 include/crypto/skcipher.h | 299 ++-
 5 files changed, 658 insertions(+), 1 deletion(-)
 create mode 100644 crypto/skcipher_bulk.c

diff --git a/crypto/Makefile b/crypto/Makefile index b8f0e3e..cd1cf57 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_CRYPTO_AEAD2) += aead.o crypto_blkcipher-y := ablkcipher.o crypto_blkcipher-y += blkcipher.o crypto_blkcipher-y += skcipher.o +crypto_blkcipher-y += skcipher_bulk.o obj-$(CONFIG_CRYPTO_BLKCIPHER2) += crypto_blkcipher.o obj-$(CONFIG_CRYPTO_SEQIV) += seqiv.o obj-$(CONFIG_CRYPTO_ECHAINIV) += echainiv.o diff --git a/crypto/skcipher.c b/crypto/skcipher.c index 6ee6a15..8b6d684 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -667,6 +667,8 @@ static int crypto_init_skcipher_ops_blkcipher(struct crypto_tfm *tfm) skcipher->ivsize = crypto_blkcipher_ivsize(blkcipher); skcipher->keysize = calg->cra_blkcipher.max_keysize; + crypto_skcipher_bulk_set_fallback(skcipher); + return 0; } @@ -760,6 +762,8 @@ static int crypto_init_skcipher_ops_ablkcipher(struct crypto_tfm *tfm) sizeof(struct ablkcipher_request); skcipher->keysize = calg->cra_ablkcipher.max_keysize; + crypto_skcipher_bulk_set_fallback(skcipher); + return 0; } @@ -789,6 +793,14 @@ static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm) skcipher->ivsize = alg->ivsize; skcipher->keysize = alg->max_keysize; + if (!alg->encrypt_bulk || !alg->decrypt_bulk || !alg->reqsize_bulk) + crypto_skcipher_bulk_set_fallback(skcipher); + else { + skcipher->encrypt_bulk = alg->encrypt_bulk; + skcipher->decrypt_bulk = alg->decrypt_bulk; + skcipher->reqsize_bulk = alg->reqsize_bulk; + } + if (alg->exit) skcipher->base.exit = crypto_skcipher_exit_tfm; @@ -822,6 +834,9 @@ static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg) seq_printf(m, "ivsize : %u\n", skcipher->ivsize); seq_printf(m, "chunksize: %u\n", skcipher->chunksize); 
seq_printf(m, "walksize : %u\n", skcipher->walksize); + seq_printf(m, "bulk : %s\n", + (skcipher->encrypt_bulk && skcipher->decrypt_bulk && + skcipher->reqsize_bulk) ? "yes" : "no"); } #ifdef CONFIG_NET diff --git a/crypto/skcipher_bulk.c b/crypto/skcipher_bulk.c new file mode 100644 index 000..9630122 --- /dev/null +++ b/crypto/skcipher_bulk.c @@ -0,0 +1,312 @@ +/* + * Bulk IV fallback for skcipher. + * + * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved. + * Copyright (c) 2016-2017 Ondrej Mosnacek + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include +#include +#include +#include
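On the provider side, opting in to native bulk handling means filling in the three new skcipher_alg callbacks introduced by this patch (encrypt_bulk, decrypt_bulk, reqsize_bulk); if any of them is left NULL, crypto_skcipher_init_tfm() installs the generic splitting fallback shown above. The fragment below is a hypothetical driver definition, shown only to illustrate where the fields go; all example_* names are invented, and the callback signatures are copied from the SIMD wrapper in patch 4/6.

	#include <crypto/internal/skcipher.h>

	static int example_setkey(struct crypto_skcipher *tfm, const u8 *key,
				  unsigned int keylen);
	static int example_encrypt(struct skcipher_request *req);
	static int example_decrypt(struct skcipher_request *req);
	static int example_encrypt_bulk(struct skcipher_bulk_request *req);
	static int example_decrypt_bulk(struct skcipher_bulk_request *req);
	static unsigned int example_reqsize_bulk(struct crypto_skcipher *tfm,
						 unsigned int maxmsgs);

	/* Hypothetical driver advertising native bulk support. */
	static struct skcipher_alg example_alg = {
		.base.cra_name		= "cbc(aes)",
		.base.cra_driver_name	= "cbc-aes-example",
		.base.cra_blocksize	= 16,

		.setkey		= example_setkey,
		.encrypt	= example_encrypt,
		.decrypt	= example_decrypt,

		/* new fields from this patch */
		.encrypt_bulk	= example_encrypt_bulk,
		.decrypt_bulk	= example_decrypt_bulk,
		.reqsize_bulk	= example_reqsize_bulk,	/* ctx size for maxmsgs messages */
	};

Because reqsize_bulk is a function of the maximum message count rather than a constant, drivers (and wrappers such as cryptd/simd) can size their per-request context to exactly what a given bulk request needs.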
[dm-devel] [RFC PATCH 3/6] crypto: cryptd - Add skcipher bulk request support
This patch adds proper support for the new bulk requests to cryptd. Signed-off-by: Ondrej Mosnacek --- crypto/cryptd.c | 111 1 file changed, 111 insertions(+) diff --git a/crypto/cryptd.c b/crypto/cryptd.c index 0508c48..b7d6e13 100644 --- a/crypto/cryptd.c +++ b/crypto/cryptd.c @@ -555,6 +555,114 @@ static int cryptd_skcipher_decrypt_enqueue(struct skcipher_request *req) return cryptd_skcipher_enqueue(req, cryptd_skcipher_decrypt); } +static void cryptd_skcipher_bulk_complete(struct skcipher_bulk_request *req, + int err) +{ + struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req); + struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct cryptd_skcipher_request_ctx *rctx = + skcipher_bulk_request_ctx(req); + int refcnt = atomic_read(&ctx->refcnt); + + local_bh_disable(); + rctx->complete(&req->base, err); + local_bh_enable(); + + if (err != -EINPROGRESS && refcnt && atomic_dec_and_test(&ctx->refcnt)) + crypto_free_skcipher(tfm); +} + +static void cryptd_skcipher_bulk_encrypt(struct crypto_async_request *base, +int err) +{ + struct skcipher_bulk_request *req = skcipher_bulk_request_cast(base); + struct cryptd_skcipher_request_ctx *rctx = + skcipher_bulk_request_ctx(req); + struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req); + struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_skcipher *child = ctx->child; + SKCIPHER_BULK_REQUEST_ON_STACK(subreq, req->maxmsgs, child); + + if (unlikely(err == -EINPROGRESS)) + goto out; + + skcipher_bulk_request_set_tfm(subreq, child); + skcipher_bulk_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + skcipher_bulk_request_set_crypt(subreq, req->src, req->dst, req->nmsgs, + req->msgsize, req->msgsizes, req->ivs); + + err = crypto_skcipher_encrypt_bulk(subreq); + skcipher_bulk_request_zero(subreq); + + req->base.complete = rctx->complete; + +out: + cryptd_skcipher_bulk_complete(req, err); +} + +static void cryptd_skcipher_bulk_decrypt(struct crypto_async_request *base, +int err) +{ + struct skcipher_bulk_request *req = skcipher_bulk_request_cast(base); + struct cryptd_skcipher_request_ctx *rctx = + skcipher_bulk_request_ctx(req); + struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req); + struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_skcipher *child = ctx->child; + SKCIPHER_BULK_REQUEST_ON_STACK(subreq, req->maxmsgs, child); + + if (unlikely(err == -EINPROGRESS)) + goto out; + + skcipher_bulk_request_set_tfm(subreq, child); + skcipher_bulk_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + skcipher_bulk_request_set_crypt(subreq, req->src, req->dst, req->nmsgs, + req->msgsize, req->msgsizes, req->ivs); + + err = crypto_skcipher_decrypt_bulk(subreq); + skcipher_bulk_request_zero(subreq); + + req->base.complete = rctx->complete; + +out: + cryptd_skcipher_bulk_complete(req, err); +} + +static int cryptd_skcipher_bulk_enqueue(struct skcipher_bulk_request *req, + crypto_completion_t compl) +{ + struct cryptd_skcipher_request_ctx *rctx = + skcipher_bulk_request_ctx(req); + struct crypto_skcipher *tfm = crypto_skcipher_bulk_reqtfm(req); + struct cryptd_queue *queue; + + queue = cryptd_get_queue(crypto_skcipher_tfm(tfm)); + rctx->complete = req->base.complete; + req->base.complete = compl; + + return cryptd_enqueue_request(queue, &req->base); +} + +static int cryptd_skcipher_bulk_encrypt_enqueue( + struct skcipher_bulk_request *req) +{ + return cryptd_skcipher_bulk_enqueue(req, cryptd_skcipher_bulk_encrypt); +} + +static int 
cryptd_skcipher_bulk_decrypt_enqueue( + struct skcipher_bulk_request *req) +{ + return cryptd_skcipher_bulk_enqueue(req, cryptd_skcipher_bulk_decrypt); +} + +static unsigned int cryptd_skcipher_bulk_reqsize(struct crypto_skcipher *tfm, +unsigned int maxmsgs) +{ + return sizeof(struct cryptd_skcipher_request_ctx); +} + static int cryptd_skcipher_init_tfm(struct crypto_skcipher *tfm) { struct skcipher_instance *inst = skcipher_alg_instance