crypto: arm64/aes-blk - remove configurable interleave

The AES block mode implementation using Crypto Extensions or plain NEON
was written before real hardware existed, and so its interleave factor
was made build-time configurable (along with an option to instantiate
all interleaved sequences inline rather than as subroutines).

We ended up using INTERLEAVE=4 with inlining disabled for both flavors
of the core AES routines, so let's stick with that, and remove the option
to configure this at build time. This makes the code easier to modify,
which is nice now that we're adding yield support.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Ard Biesheuvel 2018-03-10 15:21:51 +00:00 committed by Herbert Xu
parent 4bf7e7a19d
commit 55868b45cf
2 changed files with 40 additions and 200 deletions

View File

@ -62,9 +62,6 @@ aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o
aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
AFLAGS_aes-ce.o := -DINTERLEAVE=4
AFLAGS_aes-neon.o := -DINTERLEAVE=4
CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS
$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE

View File

@ -13,44 +13,6 @@
.text .text
.align 4 .align 4
/*
* There are several ways to instantiate this code:
* - no interleave, all inline
* - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
* - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
* - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
* - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
*
* Macros imported by this code:
* - enc_prepare - setup NEON registers for encryption
* - dec_prepare - setup NEON registers for decryption
* - enc_switch_key - change to new key after having prepared for encryption
* - encrypt_block - encrypt a single block
* - decrypt_block - decrypt a single block
* - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
* - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
* - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
* - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
*/
#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
#define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp
#define FRAME_POP ldp x29, x30, [sp],#16
#if INTERLEAVE == 2
aes_encrypt_block2x:
encrypt_block2x v0, v1, w3, x2, x8, w7
ret
ENDPROC(aes_encrypt_block2x)
aes_decrypt_block2x:
decrypt_block2x v0, v1, w3, x2, x8, w7
ret
ENDPROC(aes_decrypt_block2x)
#elif INTERLEAVE == 4
aes_encrypt_block4x: aes_encrypt_block4x:
encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret ret
@ -61,48 +23,6 @@ aes_decrypt_block4x:
ret ret
ENDPROC(aes_decrypt_block4x) ENDPROC(aes_decrypt_block4x)
#else
#error INTERLEAVE should equal 2 or 4
#endif
.macro do_encrypt_block2x
bl aes_encrypt_block2x
.endm
.macro do_decrypt_block2x
bl aes_decrypt_block2x
.endm
.macro do_encrypt_block4x
bl aes_encrypt_block4x
.endm
.macro do_decrypt_block4x
bl aes_decrypt_block4x
.endm
#else
#define FRAME_PUSH
#define FRAME_POP
.macro do_encrypt_block2x
encrypt_block2x v0, v1, w3, x2, x8, w7
.endm
.macro do_decrypt_block2x
decrypt_block2x v0, v1, w3, x2, x8, w7
.endm
.macro do_encrypt_block4x
encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
.endm
.macro do_decrypt_block4x
decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
.endm
#endif
/* /*
* aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks) * int blocks)
@ -111,28 +31,21 @@ ENDPROC(aes_decrypt_block4x)
*/ */
AES_ENTRY(aes_ecb_encrypt) AES_ENTRY(aes_ecb_encrypt)
FRAME_PUSH stp x29, x30, [sp, #-16]!
mov x29, sp
enc_prepare w3, x2, x5 enc_prepare w3, x2, x5
.LecbencloopNx: .LecbencloopNx:
#if INTERLEAVE >= 2 subs w4, w4, #4
subs w4, w4, #INTERLEAVE
bmi .Lecbenc1x bmi .Lecbenc1x
#if INTERLEAVE == 2
ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
do_encrypt_block2x
st1 {v0.16b-v1.16b}, [x0], #32
#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
do_encrypt_block4x bl aes_encrypt_block4x
st1 {v0.16b-v3.16b}, [x0], #64 st1 {v0.16b-v3.16b}, [x0], #64
#endif
b .LecbencloopNx b .LecbencloopNx
.Lecbenc1x: .Lecbenc1x:
adds w4, w4, #INTERLEAVE adds w4, w4, #4
beq .Lecbencout beq .Lecbencout
#endif
.Lecbencloop: .Lecbencloop:
ld1 {v0.16b}, [x1], #16 /* get next pt block */ ld1 {v0.16b}, [x1], #16 /* get next pt block */
encrypt_block v0, w3, x2, x5, w6 encrypt_block v0, w3, x2, x5, w6
@ -140,34 +53,27 @@ AES_ENTRY(aes_ecb_encrypt)
subs w4, w4, #1 subs w4, w4, #1
bne .Lecbencloop bne .Lecbencloop
.Lecbencout: .Lecbencout:
FRAME_POP ldp x29, x30, [sp], #16
ret ret
AES_ENDPROC(aes_ecb_encrypt) AES_ENDPROC(aes_ecb_encrypt)
AES_ENTRY(aes_ecb_decrypt) AES_ENTRY(aes_ecb_decrypt)
FRAME_PUSH stp x29, x30, [sp, #-16]!
mov x29, sp
dec_prepare w3, x2, x5 dec_prepare w3, x2, x5
.LecbdecloopNx: .LecbdecloopNx:
#if INTERLEAVE >= 2 subs w4, w4, #4
subs w4, w4, #INTERLEAVE
bmi .Lecbdec1x bmi .Lecbdec1x
#if INTERLEAVE == 2
ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
do_decrypt_block2x
st1 {v0.16b-v1.16b}, [x0], #32
#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
do_decrypt_block4x bl aes_decrypt_block4x
st1 {v0.16b-v3.16b}, [x0], #64 st1 {v0.16b-v3.16b}, [x0], #64
#endif
b .LecbdecloopNx b .LecbdecloopNx
.Lecbdec1x: .Lecbdec1x:
adds w4, w4, #INTERLEAVE adds w4, w4, #4
beq .Lecbdecout beq .Lecbdecout
#endif
.Lecbdecloop: .Lecbdecloop:
ld1 {v0.16b}, [x1], #16 /* get next ct block */ ld1 {v0.16b}, [x1], #16 /* get next ct block */
decrypt_block v0, w3, x2, x5, w6 decrypt_block v0, w3, x2, x5, w6
@ -175,7 +81,7 @@ AES_ENTRY(aes_ecb_decrypt)
subs w4, w4, #1 subs w4, w4, #1
bne .Lecbdecloop bne .Lecbdecloop
.Lecbdecout: .Lecbdecout:
FRAME_POP ldp x29, x30, [sp], #16
ret ret
AES_ENDPROC(aes_ecb_decrypt) AES_ENDPROC(aes_ecb_decrypt)
@ -204,30 +110,20 @@ AES_ENDPROC(aes_cbc_encrypt)
AES_ENTRY(aes_cbc_decrypt) AES_ENTRY(aes_cbc_decrypt)
FRAME_PUSH stp x29, x30, [sp, #-16]!
mov x29, sp
ld1 {v7.16b}, [x5] /* get iv */ ld1 {v7.16b}, [x5] /* get iv */
dec_prepare w3, x2, x6 dec_prepare w3, x2, x6
.LcbcdecloopNx: .LcbcdecloopNx:
#if INTERLEAVE >= 2 subs w4, w4, #4
subs w4, w4, #INTERLEAVE
bmi .Lcbcdec1x bmi .Lcbcdec1x
#if INTERLEAVE == 2
ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
mov v2.16b, v0.16b
mov v3.16b, v1.16b
do_decrypt_block2x
eor v0.16b, v0.16b, v7.16b
eor v1.16b, v1.16b, v2.16b
mov v7.16b, v3.16b
st1 {v0.16b-v1.16b}, [x0], #32
#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
mov v4.16b, v0.16b mov v4.16b, v0.16b
mov v5.16b, v1.16b mov v5.16b, v1.16b
mov v6.16b, v2.16b mov v6.16b, v2.16b
do_decrypt_block4x bl aes_decrypt_block4x
sub x1, x1, #16 sub x1, x1, #16
eor v0.16b, v0.16b, v7.16b eor v0.16b, v0.16b, v7.16b
eor v1.16b, v1.16b, v4.16b eor v1.16b, v1.16b, v4.16b
@ -235,12 +131,10 @@ AES_ENTRY(aes_cbc_decrypt)
eor v2.16b, v2.16b, v5.16b eor v2.16b, v2.16b, v5.16b
eor v3.16b, v3.16b, v6.16b eor v3.16b, v3.16b, v6.16b
st1 {v0.16b-v3.16b}, [x0], #64 st1 {v0.16b-v3.16b}, [x0], #64
#endif
b .LcbcdecloopNx b .LcbcdecloopNx
.Lcbcdec1x: .Lcbcdec1x:
adds w4, w4, #INTERLEAVE adds w4, w4, #4
beq .Lcbcdecout beq .Lcbcdecout
#endif
.Lcbcdecloop: .Lcbcdecloop:
ld1 {v1.16b}, [x1], #16 /* get next ct block */ ld1 {v1.16b}, [x1], #16 /* get next ct block */
mov v0.16b, v1.16b /* ...and copy to v0 */ mov v0.16b, v1.16b /* ...and copy to v0 */
@ -251,8 +145,8 @@ AES_ENTRY(aes_cbc_decrypt)
subs w4, w4, #1 subs w4, w4, #1
bne .Lcbcdecloop bne .Lcbcdecloop
.Lcbcdecout: .Lcbcdecout:
FRAME_POP
st1 {v7.16b}, [x5] /* return iv */ st1 {v7.16b}, [x5] /* return iv */
ldp x29, x30, [sp], #16
ret ret
AES_ENDPROC(aes_cbc_decrypt) AES_ENDPROC(aes_cbc_decrypt)
@ -263,34 +157,19 @@ AES_ENDPROC(aes_cbc_decrypt)
*/ */
AES_ENTRY(aes_ctr_encrypt) AES_ENTRY(aes_ctr_encrypt)
FRAME_PUSH stp x29, x30, [sp, #-16]!
mov x29, sp
enc_prepare w3, x2, x6 enc_prepare w3, x2, x6
ld1 {v4.16b}, [x5] ld1 {v4.16b}, [x5]
umov x6, v4.d[1] /* keep swabbed ctr in reg */ umov x6, v4.d[1] /* keep swabbed ctr in reg */
rev x6, x6 rev x6, x6
#if INTERLEAVE >= 2
cmn w6, w4 /* 32 bit overflow? */ cmn w6, w4 /* 32 bit overflow? */
bcs .Lctrloop bcs .Lctrloop
.LctrloopNx: .LctrloopNx:
subs w4, w4, #INTERLEAVE subs w4, w4, #4
bmi .Lctr1x bmi .Lctr1x
#if INTERLEAVE == 2
mov v0.8b, v4.8b
mov v1.8b, v4.8b
rev x7, x6
add x6, x6, #1
ins v0.d[1], x7
rev x7, x6
add x6, x6, #1
ins v1.d[1], x7
ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */
do_encrypt_block2x
eor v0.16b, v0.16b, v2.16b
eor v1.16b, v1.16b, v3.16b
st1 {v0.16b-v1.16b}, [x0], #32
#else
ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */ ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */
dup v7.4s, w6 dup v7.4s, w6
mov v0.16b, v4.16b mov v0.16b, v4.16b
@ -303,23 +182,21 @@ AES_ENTRY(aes_ctr_encrypt)
mov v2.s[3], v8.s[1] mov v2.s[3], v8.s[1]
mov v3.s[3], v8.s[2] mov v3.s[3], v8.s[2]
ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */ ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
do_encrypt_block4x bl aes_encrypt_block4x
eor v0.16b, v5.16b, v0.16b eor v0.16b, v5.16b, v0.16b
ld1 {v5.16b}, [x1], #16 /* get 1 input block */ ld1 {v5.16b}, [x1], #16 /* get 1 input block */
eor v1.16b, v6.16b, v1.16b eor v1.16b, v6.16b, v1.16b
eor v2.16b, v7.16b, v2.16b eor v2.16b, v7.16b, v2.16b
eor v3.16b, v5.16b, v3.16b eor v3.16b, v5.16b, v3.16b
st1 {v0.16b-v3.16b}, [x0], #64 st1 {v0.16b-v3.16b}, [x0], #64
add x6, x6, #INTERLEAVE add x6, x6, #4
#endif
rev x7, x6 rev x7, x6
ins v4.d[1], x7 ins v4.d[1], x7
cbz w4, .Lctrout cbz w4, .Lctrout
b .LctrloopNx b .LctrloopNx
.Lctr1x: .Lctr1x:
adds w4, w4, #INTERLEAVE adds w4, w4, #4
beq .Lctrout beq .Lctrout
#endif
.Lctrloop: .Lctrloop:
mov v0.16b, v4.16b mov v0.16b, v4.16b
encrypt_block v0, w3, x2, x8, w7 encrypt_block v0, w3, x2, x8, w7
@ -339,12 +216,12 @@ AES_ENTRY(aes_ctr_encrypt)
.Lctrout: .Lctrout:
st1 {v4.16b}, [x5] /* return next CTR value */ st1 {v4.16b}, [x5] /* return next CTR value */
FRAME_POP ldp x29, x30, [sp], #16
ret ret
.Lctrtailblock: .Lctrtailblock:
st1 {v0.16b}, [x0] st1 {v0.16b}, [x0]
FRAME_POP ldp x29, x30, [sp], #16
ret ret
.Lctrcarry: .Lctrcarry:
@ -378,7 +255,9 @@ CPU_LE( .quad 1, 0x87 )
CPU_BE( .quad 0x87, 1 ) CPU_BE( .quad 0x87, 1 )
AES_ENTRY(aes_xts_encrypt) AES_ENTRY(aes_xts_encrypt)
FRAME_PUSH stp x29, x30, [sp, #-16]!
mov x29, sp
ld1 {v4.16b}, [x6] ld1 {v4.16b}, [x6]
cbz w7, .Lxtsencnotfirst cbz w7, .Lxtsencnotfirst
@ -394,25 +273,8 @@ AES_ENTRY(aes_xts_encrypt)
ldr q7, .Lxts_mul_x ldr q7, .Lxts_mul_x
next_tweak v4, v4, v7, v8 next_tweak v4, v4, v7, v8
.LxtsencNx: .LxtsencNx:
#if INTERLEAVE >= 2 subs w4, w4, #4
subs w4, w4, #INTERLEAVE
bmi .Lxtsenc1x bmi .Lxtsenc1x
#if INTERLEAVE == 2
ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
next_tweak v5, v4, v7, v8
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
do_encrypt_block2x
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
st1 {v0.16b-v1.16b}, [x0], #32
cbz w4, .LxtsencoutNx
next_tweak v4, v5, v7, v8
b .LxtsencNx
.LxtsencoutNx:
mov v4.16b, v5.16b
b .Lxtsencout
#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
next_tweak v5, v4, v7, v8 next_tweak v5, v4, v7, v8
eor v0.16b, v0.16b, v4.16b eor v0.16b, v0.16b, v4.16b
@ -421,7 +283,7 @@ AES_ENTRY(aes_xts_encrypt)
eor v2.16b, v2.16b, v6.16b eor v2.16b, v2.16b, v6.16b
next_tweak v7, v6, v7, v8 next_tweak v7, v6, v7, v8
eor v3.16b, v3.16b, v7.16b eor v3.16b, v3.16b, v7.16b
do_encrypt_block4x bl aes_encrypt_block4x
eor v3.16b, v3.16b, v7.16b eor v3.16b, v3.16b, v7.16b
eor v0.16b, v0.16b, v4.16b eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b eor v1.16b, v1.16b, v5.16b
@ -430,11 +292,9 @@ AES_ENTRY(aes_xts_encrypt)
mov v4.16b, v7.16b mov v4.16b, v7.16b
cbz w4, .Lxtsencout cbz w4, .Lxtsencout
b .LxtsencloopNx b .LxtsencloopNx
#endif
.Lxtsenc1x: .Lxtsenc1x:
adds w4, w4, #INTERLEAVE adds w4, w4, #4
beq .Lxtsencout beq .Lxtsencout
#endif
.Lxtsencloop: .Lxtsencloop:
ld1 {v1.16b}, [x1], #16 ld1 {v1.16b}, [x1], #16
eor v0.16b, v1.16b, v4.16b eor v0.16b, v1.16b, v4.16b
@ -447,13 +307,15 @@ AES_ENTRY(aes_xts_encrypt)
b .Lxtsencloop b .Lxtsencloop
.Lxtsencout: .Lxtsencout:
st1 {v4.16b}, [x6] st1 {v4.16b}, [x6]
FRAME_POP ldp x29, x30, [sp], #16
ret ret
AES_ENDPROC(aes_xts_encrypt) AES_ENDPROC(aes_xts_encrypt)
AES_ENTRY(aes_xts_decrypt) AES_ENTRY(aes_xts_decrypt)
FRAME_PUSH stp x29, x30, [sp, #-16]!
mov x29, sp
ld1 {v4.16b}, [x6] ld1 {v4.16b}, [x6]
cbz w7, .Lxtsdecnotfirst cbz w7, .Lxtsdecnotfirst
@ -469,25 +331,8 @@ AES_ENTRY(aes_xts_decrypt)
ldr q7, .Lxts_mul_x ldr q7, .Lxts_mul_x
next_tweak v4, v4, v7, v8 next_tweak v4, v4, v7, v8
.LxtsdecNx: .LxtsdecNx:
#if INTERLEAVE >= 2 subs w4, w4, #4
subs w4, w4, #INTERLEAVE
bmi .Lxtsdec1x bmi .Lxtsdec1x
#if INTERLEAVE == 2
ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
next_tweak v5, v4, v7, v8
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
do_decrypt_block2x
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
st1 {v0.16b-v1.16b}, [x0], #32
cbz w4, .LxtsdecoutNx
next_tweak v4, v5, v7, v8
b .LxtsdecNx
.LxtsdecoutNx:
mov v4.16b, v5.16b
b .Lxtsdecout
#else
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
next_tweak v5, v4, v7, v8 next_tweak v5, v4, v7, v8
eor v0.16b, v0.16b, v4.16b eor v0.16b, v0.16b, v4.16b
@ -496,7 +341,7 @@ AES_ENTRY(aes_xts_decrypt)
eor v2.16b, v2.16b, v6.16b eor v2.16b, v2.16b, v6.16b
next_tweak v7, v6, v7, v8 next_tweak v7, v6, v7, v8
eor v3.16b, v3.16b, v7.16b eor v3.16b, v3.16b, v7.16b
do_decrypt_block4x bl aes_decrypt_block4x
eor v3.16b, v3.16b, v7.16b eor v3.16b, v3.16b, v7.16b
eor v0.16b, v0.16b, v4.16b eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b eor v1.16b, v1.16b, v5.16b
@ -505,11 +350,9 @@ AES_ENTRY(aes_xts_decrypt)
mov v4.16b, v7.16b mov v4.16b, v7.16b
cbz w4, .Lxtsdecout cbz w4, .Lxtsdecout
b .LxtsdecloopNx b .LxtsdecloopNx
#endif
.Lxtsdec1x: .Lxtsdec1x:
adds w4, w4, #INTERLEAVE adds w4, w4, #4
beq .Lxtsdecout beq .Lxtsdecout
#endif
.Lxtsdecloop: .Lxtsdecloop:
ld1 {v1.16b}, [x1], #16 ld1 {v1.16b}, [x1], #16
eor v0.16b, v1.16b, v4.16b eor v0.16b, v1.16b, v4.16b
@ -522,7 +365,7 @@ AES_ENTRY(aes_xts_decrypt)
b .Lxtsdecloop b .Lxtsdecloop
.Lxtsdecout: .Lxtsdecout:
st1 {v4.16b}, [x6] st1 {v4.16b}, [x6]
FRAME_POP ldp x29, x30, [sp], #16
ret ret
AES_ENDPROC(aes_xts_decrypt) AES_ENDPROC(aes_xts_decrypt)