crypto: arm64/aes-ccm - yield NEON after every block of input

Avoid excessive scheduling delays under a preemptible kernel by
conditionally yielding the NEON unit after every block of input. The MAC is
stored before a yield and reloaded afterwards.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Ard Biesheuvel 2018-04-30 18:18:23 +02:00 committed by Herbert Xu
parent d82f37ab5e
commit 7b67ae4d5c
1 changed file with 94 additions and 54 deletions

View File

@ -19,24 +19,33 @@
* u32 *macp, u8 const rk[], u32 rounds); * u32 *macp, u8 const rk[], u32 rounds);
*/ */
ENTRY(ce_aes_ccm_auth_data) ENTRY(ce_aes_ccm_auth_data)
ldr w8, [x3] /* leftover from prev round? */ frame_push 7
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x5
ldr w25, [x22] /* leftover from prev round? */
ld1 {v0.16b}, [x0] /* load mac */ ld1 {v0.16b}, [x0] /* load mac */
cbz w8, 1f cbz w25, 1f
sub w8, w8, #16 sub w25, w25, #16
eor v1.16b, v1.16b, v1.16b eor v1.16b, v1.16b, v1.16b
0: ldrb w7, [x1], #1 /* get 1 byte of input */ 0: ldrb w7, [x20], #1 /* get 1 byte of input */
subs w2, w2, #1 subs w21, w21, #1
add w8, w8, #1 add w25, w25, #1
ins v1.b[0], w7 ins v1.b[0], w7
ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */ ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */
beq 8f /* out of input? */ beq 8f /* out of input? */
cbnz w8, 0b cbnz w25, 0b
eor v0.16b, v0.16b, v1.16b eor v0.16b, v0.16b, v1.16b
1: ld1 {v3.4s}, [x4] /* load first round key */ 1: ld1 {v3.4s}, [x23] /* load first round key */
prfm pldl1strm, [x1] prfm pldl1strm, [x20]
cmp w5, #12 /* which key size? */ cmp w24, #12 /* which key size? */
add x6, x4, #16 add x6, x23, #16
sub w7, w5, #2 /* modified # of rounds */ sub w7, w24, #2 /* modified # of rounds */
bmi 2f bmi 2f
bne 5f bne 5f
mov v5.16b, v3.16b mov v5.16b, v3.16b
@ -55,33 +64,43 @@ ENTRY(ce_aes_ccm_auth_data)
ld1 {v5.4s}, [x6], #16 /* load next round key */ ld1 {v5.4s}, [x6], #16 /* load next round key */
bpl 3b bpl 3b
aese v0.16b, v4.16b aese v0.16b, v4.16b
subs w2, w2, #16 /* last data? */ subs w21, w21, #16 /* last data? */
eor v0.16b, v0.16b, v5.16b /* final round */ eor v0.16b, v0.16b, v5.16b /* final round */
bmi 6f bmi 6f
ld1 {v1.16b}, [x1], #16 /* load next input block */ ld1 {v1.16b}, [x20], #16 /* load next input block */
eor v0.16b, v0.16b, v1.16b /* xor with mac */ eor v0.16b, v0.16b, v1.16b /* xor with mac */
bne 1b beq 6f
6: st1 {v0.16b}, [x0] /* store mac */
if_will_cond_yield_neon
st1 {v0.16b}, [x19] /* store mac */
do_cond_yield_neon
ld1 {v0.16b}, [x19] /* reload mac */
endif_yield_neon
b 1b
6: st1 {v0.16b}, [x19] /* store mac */
beq 10f beq 10f
adds w2, w2, #16 adds w21, w21, #16
beq 10f beq 10f
mov w8, w2 mov w25, w21
7: ldrb w7, [x1], #1 7: ldrb w7, [x20], #1
umov w6, v0.b[0] umov w6, v0.b[0]
eor w6, w6, w7 eor w6, w6, w7
strb w6, [x0], #1 strb w6, [x19], #1
subs w2, w2, #1 subs w21, w21, #1
beq 10f beq 10f
ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */ ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */
b 7b b 7b
8: mov w7, w8 8: mov w7, w25
add w8, w8, #16 add w25, w25, #16
9: ext v1.16b, v1.16b, v1.16b, #1 9: ext v1.16b, v1.16b, v1.16b, #1
adds w7, w7, #1 adds w7, w7, #1
bne 9b bne 9b
eor v0.16b, v0.16b, v1.16b eor v0.16b, v0.16b, v1.16b
st1 {v0.16b}, [x0] st1 {v0.16b}, [x19]
10: str w8, [x3] 10: str w25, [x22]
frame_pop
ret ret
ENDPROC(ce_aes_ccm_auth_data) ENDPROC(ce_aes_ccm_auth_data)
@ -126,19 +145,29 @@ ENTRY(ce_aes_ccm_final)
ENDPROC(ce_aes_ccm_final) ENDPROC(ce_aes_ccm_final)
.macro aes_ccm_do_crypt,enc .macro aes_ccm_do_crypt,enc
ldr x8, [x6, #8] /* load lower ctr */ frame_push 8
ld1 {v0.16b}, [x5] /* load mac */
CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */ mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x5
mov x25, x6
ldr x26, [x25, #8] /* load lower ctr */
ld1 {v0.16b}, [x24] /* load mac */
CPU_LE( rev x26, x26 ) /* keep swabbed ctr in reg */
0: /* outer loop */ 0: /* outer loop */
ld1 {v1.8b}, [x6] /* load upper ctr */ ld1 {v1.8b}, [x25] /* load upper ctr */
prfm pldl1strm, [x1] prfm pldl1strm, [x20]
add x8, x8, #1 add x26, x26, #1
rev x9, x8 rev x9, x26
cmp w4, #12 /* which key size? */ cmp w23, #12 /* which key size? */
sub w7, w4, #2 /* get modified # of rounds */ sub w7, w23, #2 /* get modified # of rounds */
ins v1.d[1], x9 /* no carry in lower ctr */ ins v1.d[1], x9 /* no carry in lower ctr */
ld1 {v3.4s}, [x3] /* load first round key */ ld1 {v3.4s}, [x22] /* load first round key */
add x10, x3, #16 add x10, x22, #16
bmi 1f bmi 1f
bne 4f bne 4f
mov v5.16b, v3.16b mov v5.16b, v3.16b
@ -165,9 +194,9 @@ CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */
bpl 2b bpl 2b
aese v0.16b, v4.16b aese v0.16b, v4.16b
aese v1.16b, v4.16b aese v1.16b, v4.16b
subs w2, w2, #16 subs w21, w21, #16
bmi 6f /* partial block? */ bmi 7f /* partial block? */
ld1 {v2.16b}, [x1], #16 /* load next input block */ ld1 {v2.16b}, [x20], #16 /* load next input block */
.if \enc == 1 .if \enc == 1
eor v2.16b, v2.16b, v5.16b /* final round enc+mac */ eor v2.16b, v2.16b, v5.16b /* final round enc+mac */
eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */ eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */
@ -176,18 +205,29 @@ CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */
eor v1.16b, v2.16b, v5.16b /* final round enc */ eor v1.16b, v2.16b, v5.16b /* final round enc */
.endif .endif
eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */ eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */
st1 {v1.16b}, [x0], #16 /* write output block */ st1 {v1.16b}, [x19], #16 /* write output block */
bne 0b beq 5f
CPU_LE( rev x8, x8 )
st1 {v0.16b}, [x5] /* store mac */
str x8, [x6, #8] /* store lsb end of ctr (BE) */
5: ret
6: eor v0.16b, v0.16b, v5.16b /* final round mac */ if_will_cond_yield_neon
st1 {v0.16b}, [x24] /* store mac */
do_cond_yield_neon
ld1 {v0.16b}, [x24] /* reload mac */
endif_yield_neon
b 0b
5:
CPU_LE( rev x26, x26 )
st1 {v0.16b}, [x24] /* store mac */
str x26, [x25, #8] /* store lsb end of ctr (BE) */
6: frame_pop
ret
7: eor v0.16b, v0.16b, v5.16b /* final round mac */
eor v1.16b, v1.16b, v5.16b /* final round enc */ eor v1.16b, v1.16b, v5.16b /* final round enc */
st1 {v0.16b}, [x5] /* store mac */ st1 {v0.16b}, [x24] /* store mac */
add w2, w2, #16 /* process partial tail block */ add w21, w21, #16 /* process partial tail block */
7: ldrb w9, [x1], #1 /* get 1 byte of input */ 8: ldrb w9, [x20], #1 /* get 1 byte of input */
umov w6, v1.b[0] /* get top crypted ctr byte */ umov w6, v1.b[0] /* get top crypted ctr byte */
umov w7, v0.b[0] /* get top mac byte */ umov w7, v0.b[0] /* get top mac byte */
.if \enc == 1 .if \enc == 1
@ -197,13 +237,13 @@ CPU_LE( rev x8, x8 )
eor w9, w9, w6 eor w9, w9, w6
eor w7, w7, w9 eor w7, w7, w9
.endif .endif
strb w9, [x0], #1 /* store out byte */ strb w9, [x19], #1 /* store out byte */
strb w7, [x5], #1 /* store mac byte */ strb w7, [x24], #1 /* store mac byte */
subs w2, w2, #1 subs w21, w21, #1
beq 5b beq 6b
ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */ ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */
ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */ ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */
b 7b b 8b
.endm .endm
/* /*