crypto: arm/aes-ce - implement ciphertext stealing for XTS

Update the AES-XTS implementation based on AES instructions so that it
can deal with inputs whose size is not a multiple of the cipher block
size. This is part of the original XTS specification, but was never
implemented before in the Linux kernel.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
commit c61b1607ed
parent 67cfa5d3b7
Author:    Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date:      2019-09-03 09:43:35 -07:00
Committer: Herbert Xu <herbert@gondor.apana.org.au>

2 changed files with 208 additions and 23 deletions
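Background for the change: XTS ciphertext stealing (per the IEEE P1619 XTS specification) avoids padding by encrypting the last full plaintext block, emitting the head of that ciphertext as the short final block, and re-encrypting the spliced remainder under the next tweak. A minimal byte-level sketch of the encrypt direction in C, where xts_enc_block() and next_tweak() are assumed helpers standing in for a single AES-XTS block operation and the GF(2^128) tweak update (they are not kernel APIs):

#include <stdint.h>
#include <string.h>

#define BLK 16

/* Assumed helpers, not kernel APIs: one XTS block op and the tweak update. */
void xts_enc_block(uint8_t out[BLK], const uint8_t in[BLK],
                   const uint8_t tweak[BLK]);
void next_tweak(uint8_t tweak[BLK]);    /* multiply tweak by x in GF(2^128) */

/* len must be at least BLK; tweak holds T_1 on entry. */
void xts_encrypt_cts(uint8_t *dst, const uint8_t *src, size_t len,
                     uint8_t tweak[BLK])
{
        size_t tail = len % BLK;
        uint8_t cc[BLK];

        while (len > BLK + tail) {              /* bulk: ordinary XTS */
                xts_enc_block(dst, src, tweak);
                next_tweak(tweak);
                src += BLK;
                dst += BLK;
                len -= BLK;
        }

        if (!tail) {                            /* aligned: nothing to steal */
                xts_enc_block(dst, src, tweak);
                return;
        }

        xts_enc_block(cc, src, tweak);          /* CC = E(P_{n-1}, T_{n-1}) */
        next_tweak(tweak);

        memcpy(dst + BLK, cc, tail);            /* C_n = first tail bytes of CC */
        memcpy(cc, src + BLK, tail);            /* splice: PP = P_n || rest of CC */
        xts_enc_block(dst, cc, tweak);          /* C_{n-1} = E(PP, T_n) */
}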

diff --git a/arch/arm/crypto/aes-ce-core.S b/arch/arm/crypto/aes-ce-core.S

@@ -369,9 +369,9 @@ ENDPROC(ce_aes_ctr_encrypt)
 
 	/*
 	 * aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
-	 *		   int blocks, u8 iv[], u32 const rk2[], int first)
+	 *		   int bytes, u8 iv[], u32 const rk2[], int first)
 	 * aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
-	 *		   int blocks, u8 iv[], u32 const rk2[], int first)
+	 *		   int bytes, u8 iv[], u32 const rk2[], int first)
 	 */
 
 	.macro		next_tweak, out, in, const, tmp
@@ -414,7 +414,7 @@ ENTRY(ce_aes_xts_encrypt)
 .Lxtsencloop4x:
 	next_tweak	q4, q4, q15, q10
 .Lxtsenc4x:
-	subs		r4, r4, #4
+	subs		r4, r4, #64
 	bmi		.Lxtsenc1x
 	vld1.8		{q0-q1}, [r1]!		@ get 4 pt blocks
 	vld1.8		{q2-q3}, [r1]!
@@ -434,24 +434,58 @@ ENTRY(ce_aes_xts_encrypt)
 	vst1.8		{q2-q3}, [r0]!
 	vmov		q4, q7
 	teq		r4, #0
-	beq		.Lxtsencout
+	beq		.Lxtsencret
 	b		.Lxtsencloop4x
 .Lxtsenc1x:
-	adds		r4, r4, #4
+	adds		r4, r4, #64
 	beq		.Lxtsencout
+	subs		r4, r4, #16
+	bmi		.LxtsencctsNx
 .Lxtsencloop:
 	vld1.8		{q0}, [r1]!
+.Lxtsencctsout:
 	veor		q0, q0, q4
 	bl		aes_encrypt
 	veor		q0, q0, q4
-	vst1.8		{q0}, [r0]!
-	subs		r4, r4, #1
+	teq		r4, #0
 	beq		.Lxtsencout
+	subs		r4, r4, #16
 	next_tweak	q4, q4, q15, q6
+	bmi		.Lxtsenccts
+	vst1.8		{q0}, [r0]!
 	b		.Lxtsencloop
 .Lxtsencout:
+	vst1.8		{q0}, [r0]
+.Lxtsencret:
 	vst1.8		{q4}, [r5]
 	pop		{r4-r6, pc}
+
+.LxtsencctsNx:
+	vmov		q0, q3
+	sub		r0, r0, #16
+.Lxtsenccts:
+	movw		ip, :lower16:.Lcts_permute_table
+	movt		ip, :upper16:.Lcts_permute_table
+
+	add		r1, r1, r4		@ rewind input pointer
+	add		r4, r4, #16		@ # bytes in final block
+	add		lr, ip, #32
+	add		ip, ip, r4
+	sub		lr, lr, r4
+	add		r4, r0, r4		@ output address of final block
+
+	vld1.8		{q1}, [r1]		@ load final partial block
+	vld1.8		{q2}, [ip]
+	vld1.8		{q3}, [lr]
+
+	vtbl.8		d4, {d0-d1}, d4
+	vtbl.8		d5, {d0-d1}, d5
+	vtbx.8		d0, {d2-d3}, d6
+	vtbx.8		d1, {d2-d3}, d7
+
+	vst1.8		{q2}, [r4]		@ overlapping stores
+	mov		r4, #0
+	b		.Lxtsencctsout
 ENDPROC(ce_aes_xts_encrypt)
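Two things are worth spelling out about the .Lxtsenccts path above. First, r4 now counts bytes rather than blocks (hence the #64/#16 arithmetic), so the assembly can see the partial tail directly. Second, the tail handling is loop-free: two overlapping 16-byte windows into .Lcts_permute_table (defined at the bottom of this file) serve as vtbl/vtbx index vectors. A rough C model of what the two shuffles compute, with an illustrative helper name and register roles inferred from the code above:

#include <stdint.h>

/* cc       = q0, the just-encrypted last full block E(P_{n-1})
 * last16   = q1, the 16 input bytes ending at the end of the message
 * out_tail = q2 after the shuffle, stored at out + tail (r4 above)
 * tail     = length of the partial final block, 1..15
 */
static void cts_shuffle(uint8_t cc[16], const uint8_t last16[16],
                        uint8_t out_tail[16], int tail)
{
        int i;

        /* vtbl.8: a 0xff index selects zero */
        for (i = 0; i < 16; i++)
                out_tail[i] = i < 16 - tail ? 0 : cc[i - (16 - tail)];

        /* vtbx.8: a 0xff index leaves the destination byte unchanged */
        for (i = 0; i < tail; i++)
                cc[i] = last16[16 - tail + i];  /* splice in P_n */
}

Storing out_tail at out + tail puts its last tail bytes exactly in the short final block; the zero bytes it writes below that are then overwritten when the re-encrypted cc is stored at out, which is what the "overlapping stores" comment refers to.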
@@ -462,13 +496,17 @@ ENTRY(ce_aes_xts_decrypt)
 	prepare_key	r2, r3
 	vmov		q4, q0
 
+	/* subtract 16 bytes if we are doing CTS */
+	tst		r4, #0xf
+	subne		r4, r4, #0x10
+
 	teq		r6, #0			@ start of a block?
 	bne		.Lxtsdec4x
 
 .Lxtsdecloop4x:
 	next_tweak	q4, q4, q15, q10
 .Lxtsdec4x:
-	subs		r4, r4, #4
+	subs		r4, r4, #64
 	bmi		.Lxtsdec1x
 	vld1.8		{q0-q1}, [r1]!		@ get 4 ct blocks
 	vld1.8		{q2-q3}, [r1]!
@@ -491,22 +529,55 @@ ENTRY(ce_aes_xts_decrypt)
 	beq		.Lxtsdecout
 	b		.Lxtsdecloop4x
 .Lxtsdec1x:
-	adds		r4, r4, #4
+	adds		r4, r4, #64
 	beq		.Lxtsdecout
+	subs		r4, r4, #16
 .Lxtsdecloop:
 	vld1.8		{q0}, [r1]!
+	bmi		.Lxtsdeccts
+.Lxtsdecctsout:
 	veor		q0, q0, q4
-	add		ip, r2, #32		@ 3rd round key
 	bl		aes_decrypt
 	veor		q0, q0, q4
 	vst1.8		{q0}, [r0]!
-	subs		r4, r4, #1
+	teq		r4, #0
 	beq		.Lxtsdecout
+	subs		r4, r4, #16
 	next_tweak	q4, q4, q15, q6
 	b		.Lxtsdecloop
 .Lxtsdecout:
 	vst1.8		{q4}, [r5]
 	pop		{r4-r6, pc}
+
+.Lxtsdeccts:
+	movw		ip, :lower16:.Lcts_permute_table
+	movt		ip, :upper16:.Lcts_permute_table
+
+	add		r1, r1, r4		@ rewind input pointer
+	add		r4, r4, #16		@ # bytes in final block
+	add		lr, ip, #32
+	add		ip, ip, r4
+	sub		lr, lr, r4
+	add		r4, r0, r4		@ output address of final block
+
+	next_tweak	q5, q4, q15, q6
+
+	vld1.8		{q1}, [r1]		@ load final partial block
+	vld1.8		{q2}, [ip]
+	vld1.8		{q3}, [lr]
+
+	veor		q0, q0, q5
+	bl		aes_decrypt
+	veor		q0, q0, q5
+
+	vtbl.8		d4, {d0-d1}, d4
+	vtbl.8		d5, {d0-d1}, d5
+	vtbx.8		d0, {d2-d3}, d6
+	vtbx.8		d1, {d2-d3}, d7
+
+	vst1.8		{q2}, [r4]		@ overlapping stores
+	mov		r4, #0
+	b		.Lxtsdecctsout
 ENDPROC(ce_aes_xts_decrypt)
 
 	/*
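The decrypt path has one extra subtlety: XTS ciphertext stealing processes the last two blocks with their tweaks swapped, so .Lxtsdeccts derives the next tweak into q5 and decrypts the last full ciphertext block with it, before the recombined block is decrypted with the original tweak still in q4. This is also why the function entry subtracts 16 from the byte count when the length is misaligned: the last full block has to be held back for this pair. A sketch of the tail handling in the same illustrative style as the earlier snippet, reusing its assumed next_tweak() helper plus a decrypt analogue xts_dec_block():

/* src points at C_{n-1} followed by the tail bytes of C_n;
 * tweak holds T_{n-1} on entry (q4 in the assembly above).
 * Builds on the helpers declared in the first sketch. */
void xts_decrypt_cts_tail(uint8_t *dst, const uint8_t *src, size_t tail,
                          uint8_t tweak[16])
{
        uint8_t t_next[16], cc[16];

        memcpy(t_next, tweak, 16);
        next_tweak(t_next);                     /* T_n, the q5 above */

        xts_dec_block(cc, src, t_next);         /* CC = D(C_{n-1}, T_n) */
        memcpy(dst + 16, cc, tail);             /* P_n = first tail bytes of CC */
        memcpy(cc, src + 16, tail);             /* reassemble: C_n || rest of CC */
        xts_dec_block(dst, cc, tweak);          /* P_{n-1} = D(.., T_{n-1}) */
}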
@@ -532,3 +603,13 @@ ENTRY(ce_aes_invert)
 	vst1.32		{q0}, [r0]
 	bx		lr
 ENDPROC(ce_aes_invert)
+
+	.section	".rodata", "a"
+	.align		6
+.Lcts_permute_table:
+	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
+	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
+	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
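The table layout makes both index vectors fall out of simple pointer arithmetic: a 16-byte load from .Lcts_permute_table + tail right-aligns the first tail bytes of a source register, while a load from + 32 - tail left-aligns its last tail bytes. A throwaway C program to print the two windows for, say, tail = 5:

#include <stdio.h>

int main(void)
{
        unsigned char tbl[48];
        int i, tail = 5;

        for (i = 0; i < 48; i++)        /* mirror of .Lcts_permute_table */
                tbl[i] = (i >= 16 && i < 32) ? i - 16 : 0xff;

        for (i = 0; i < 16; i++)        /* window at table + tail */
                printf("%02x ", tbl[tail + i]);
        putchar('\n');                  /* eleven ff, then 00..04 */

        for (i = 0; i < 16; i++)        /* window at table + 32 - tail */
                printf("%02x ", tbl[32 - tail + i]);
        putchar('\n');                  /* 0b..0f, then eleven ff */
        return 0;
}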

diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c

@@ -13,6 +13,7 @@
 #include <crypto/ctr.h>
 #include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
 #include <linux/cpufeature.h>
 #include <linux/module.h>
 #include <crypto/xts.h>
@@ -39,10 +40,10 @@ asmlinkage void ce_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
 				   int rounds, int blocks, u8 ctr[]);
 
 asmlinkage void ce_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[],
-				   int rounds, int blocks, u8 iv[],
+				   int rounds, int bytes, u8 iv[],
 				   u32 const rk2[], int first);
 asmlinkage void ce_aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[],
-				   int rounds, int blocks, u8 iv[],
+				   int rounds, int bytes, u8 iv[],
 				   u32 const rk2[], int first);
 
 struct aes_block {
@@ -317,20 +318,71 @@ static int xts_encrypt(struct skcipher_request *req)
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
 	int err, first, rounds = num_rounds(&ctx->key1);
+	int tail = req->cryptlen % AES_BLOCK_SIZE;
+	struct scatterlist sg_src[2], sg_dst[2];
+	struct skcipher_request subreq;
+	struct scatterlist *src, *dst;
 	struct skcipher_walk walk;
-	unsigned int blocks;
+
+	if (req->cryptlen < AES_BLOCK_SIZE)
+		return -EINVAL;
 
 	err = skcipher_walk_virt(&walk, req, false);
 
-	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+	if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
+		int xts_blocks = DIV_ROUND_UP(req->cryptlen,
+					      AES_BLOCK_SIZE) - 2;
+
+		skcipher_walk_abort(&walk);
+		skcipher_request_set_tfm(&subreq, tfm);
+		skcipher_request_set_callback(&subreq,
+					      skcipher_request_flags(req),
+					      NULL, NULL);
+		skcipher_request_set_crypt(&subreq, req->src, req->dst,
+					   xts_blocks * AES_BLOCK_SIZE,
+					   req->iv);
+		req = &subreq;
+		err = skcipher_walk_virt(&walk, req, false);
+	} else {
+		tail = 0;
+	}
+
+	for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) {
+		int nbytes = walk.nbytes;
+
+		if (walk.nbytes < walk.total)
+			nbytes &= ~(AES_BLOCK_SIZE - 1);
+
 		kernel_neon_begin();
 		ce_aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
-				   ctx->key1.key_enc, rounds, blocks, walk.iv,
+				   ctx->key1.key_enc, rounds, nbytes, walk.iv,
 				   ctx->key2.key_enc, first);
 		kernel_neon_end();
-		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
 	}
 
-	return err;
+	if (err || likely(!tail))
+		return err;
+
+	dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
+	if (req->dst != req->src)
+		dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
+
+	skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
+				   req->iv);
+
+	err = skcipher_walk_virt(&walk, req, false);
+	if (err)
+		return err;
+
+	kernel_neon_begin();
+	ce_aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+			   ctx->key1.key_enc, rounds, walk.nbytes, walk.iv,
+			   ctx->key2.key_enc, first);
+	kernel_neon_end();
+
+	return skcipher_walk_done(&walk, 0);
 }
 
 static int xts_decrypt(struct skcipher_request *req)
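To put numbers on the splitting logic above: for a hypothetical 100-byte request, tail = 4 and DIV_ROUND_UP(100, 16) - 2 = 5, so the subrequest covers 80 bytes of ordinary XTS; the scatterwalk_ffwd() pass then hands the remaining 20 bytes (the last full block plus the 4-byte tail, which must travel together as the CTS pair, hence the minus two rather than minus one) to the assembly in a single call. The arithmetic, with illustrative values:

	int cryptlen = 100;				/* hypothetical request */
	int tail = cryptlen % 16;			/* 4                    */
	int xts_blocks = (cryptlen + 15) / 16 - 2;	/* 7 - 2 = 5            */
	int bulk = xts_blocks * 16;			/* 80 bytes, plain XTS  */
	int last = 16 + tail;				/* 20 bytes, CTS pair   */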
@@ -338,20 +390,71 @@ static int xts_decrypt(struct skcipher_request *req)
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
 	int err, first, rounds = num_rounds(&ctx->key1);
+	int tail = req->cryptlen % AES_BLOCK_SIZE;
+	struct scatterlist sg_src[2], sg_dst[2];
+	struct skcipher_request subreq;
+	struct scatterlist *src, *dst;
 	struct skcipher_walk walk;
-	unsigned int blocks;
+
+	if (req->cryptlen < AES_BLOCK_SIZE)
+		return -EINVAL;
 
 	err = skcipher_walk_virt(&walk, req, false);
 
-	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+	if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
+		int xts_blocks = DIV_ROUND_UP(req->cryptlen,
+					      AES_BLOCK_SIZE) - 2;
+
+		skcipher_walk_abort(&walk);
+		skcipher_request_set_tfm(&subreq, tfm);
+		skcipher_request_set_callback(&subreq,
+					      skcipher_request_flags(req),
+					      NULL, NULL);
+		skcipher_request_set_crypt(&subreq, req->src, req->dst,
+					   xts_blocks * AES_BLOCK_SIZE,
+					   req->iv);
+		req = &subreq;
+		err = skcipher_walk_virt(&walk, req, false);
+	} else {
+		tail = 0;
+	}
+
+	for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) {
+		int nbytes = walk.nbytes;
+
+		if (walk.nbytes < walk.total)
+			nbytes &= ~(AES_BLOCK_SIZE - 1);
+
 		kernel_neon_begin();
 		ce_aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
-				   ctx->key1.key_dec, rounds, blocks, walk.iv,
+				   ctx->key1.key_dec, rounds, nbytes, walk.iv,
 				   ctx->key2.key_enc, first);
 		kernel_neon_end();
-		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
 	}
 
-	return err;
+	if (err || likely(!tail))
+		return err;
+
+	dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
+	if (req->dst != req->src)
+		dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
+
+	skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
+				   req->iv);
+
+	err = skcipher_walk_virt(&walk, req, false);
+	if (err)
+		return err;
+
+	kernel_neon_begin();
+	ce_aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+			   ctx->key1.key_dec, rounds, walk.nbytes, walk.iv,
+			   ctx->key2.key_enc, first);
+	kernel_neon_end();
+
+	return skcipher_walk_done(&walk, 0);
 }
 
 static struct skcipher_alg aes_algs[] = { {
@@ -426,6 +529,7 @@ static struct skcipher_alg aes_algs[] = { {
 	.min_keysize	= 2 * AES_MIN_KEY_SIZE,
 	.max_keysize	= 2 * AES_MAX_KEY_SIZE,
 	.ivsize		= AES_BLOCK_SIZE,
+	.walksize	= 2 * AES_BLOCK_SIZE,
 	.setkey		= xts_set_key,
 	.encrypt	= xts_encrypt,
 	.decrypt	= xts_decrypt,
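Two easy-to-miss details in the glue changes: xts_decrypt() still passes ctx->key2.key_enc alongside ctx->key1.key_dec, because the XTS tweak chain always starts from an encryption of the IV under the second key in both directions; and the new .walksize of two blocks asks the skcipher walk layer to keep at least two blocks together per step, so the final full block and the partial tail are never split across walk iterations.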