2017-01-12 00:41:52 +08:00
|
|
|
/*
|
|
|
|
* Scalar AES core transform
|
|
|
|
*
|
|
|
|
* Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License version 2 as
|
|
|
|
* published by the Free Software Foundation.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/linkage.h>
|
|
|
|
#include <asm/assembler.h>
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
#include <asm/cache.h>
|
2017-01-12 00:41:52 +08:00
|
|
|
|
|
|
|
.text
|
|
|
|
|
|
|
|
rk .req x0
|
|
|
|
out .req x1
|
|
|
|
in .req x2
|
|
|
|
rounds .req x3
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
tt .req x2
|
2017-01-12 00:41:52 +08:00
|
|
|
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
.macro __pair1, sz, op, reg0, reg1, in0, in1e, in1d, shift
|
|
|
|
.ifc \op\shift, b0
|
|
|
|
ubfiz \reg0, \in0, #2, #8
|
|
|
|
ubfiz \reg1, \in1e, #2, #8
|
|
|
|
.else
|
2017-01-29 07:25:37 +08:00
|
|
|
ubfx \reg0, \in0, #\shift, #8
|
|
|
|
ubfx \reg1, \in1e, #\shift, #8
|
2017-01-12 00:41:52 +08:00
|
|
|
.endif
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* AArch64 cannot do byte size indexed loads from a table containing
|
|
|
|
* 32-bit quantities, i.e., 'ldrb w12, [tt, w12, uxtw #2]' is not a
|
|
|
|
* valid instruction. So perform the shift explicitly first for the
|
|
|
|
* high bytes (the low byte is shifted implicitly by using ubfiz rather
|
|
|
|
* than ubfx above)
|
|
|
|
*/
|
|
|
|
.ifnc \op, b
|
2017-01-29 07:25:37 +08:00
|
|
|
ldr \reg0, [tt, \reg0, uxtw #2]
|
|
|
|
ldr \reg1, [tt, \reg1, uxtw #2]
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
.else
|
|
|
|
.if \shift > 0
|
|
|
|
lsl \reg0, \reg0, #2
|
|
|
|
lsl \reg1, \reg1, #2
|
|
|
|
.endif
|
|
|
|
ldrb \reg0, [tt, \reg0, uxtw]
|
|
|
|
ldrb \reg1, [tt, \reg1, uxtw]
|
|
|
|
.endif
|
2017-01-29 07:25:37 +08:00
|
|
|
.endm
|
2017-01-12 00:41:52 +08:00
|
|
|
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
.macro __pair0, sz, op, reg0, reg1, in0, in1e, in1d, shift
|
|
|
|
ubfx \reg0, \in0, #\shift, #8
|
|
|
|
ubfx \reg1, \in1d, #\shift, #8
|
|
|
|
ldr\op \reg0, [tt, \reg0, uxtw #\sz]
|
|
|
|
ldr\op \reg1, [tt, \reg1, uxtw #\sz]
|
|
|
|
.endm
|
|
|
|
|
|
|
|
.macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc, sz, op
|
2017-01-29 07:25:37 +08:00
|
|
|
ldp \out0, \out1, [rk], #8
|
2017-01-12 00:41:52 +08:00
|
|
|
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
__pair\enc \sz, \op, w12, w13, \in0, \in1, \in3, 0
|
|
|
|
__pair\enc \sz, \op, w14, w15, \in1, \in2, \in0, 8
|
|
|
|
__pair\enc \sz, \op, w16, w17, \in2, \in3, \in1, 16
|
|
|
|
__pair\enc \sz, \op, \t0, \t1, \in3, \in0, \in2, 24
|
|
|
|
|
|
|
|
eor \out0, \out0, w12
|
|
|
|
eor \out1, \out1, w13
|
|
|
|
eor \out0, \out0, w14, ror #24
|
|
|
|
eor \out1, \out1, w15, ror #24
|
|
|
|
eor \out0, \out0, w16, ror #16
|
|
|
|
eor \out1, \out1, w17, ror #16
|
2017-01-29 07:25:37 +08:00
|
|
|
eor \out0, \out0, \t0, ror #8
|
2017-01-12 00:41:52 +08:00
|
|
|
eor \out1, \out1, \t1, ror #8
|
|
|
|
.endm
|
|
|
|
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
.macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
|
|
|
|
__hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
|
|
|
|
__hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op
|
2017-01-12 00:41:52 +08:00
|
|
|
.endm
|
|
|
|
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
.macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
|
|
|
|
__hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
|
|
|
|
__hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op
|
2017-01-12 00:41:52 +08:00
|
|
|
.endm
|
|
|
|
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
.macro do_crypt, round, ttab, ltab, bsz
|
|
|
|
ldp w4, w5, [in]
|
|
|
|
ldp w6, w7, [in, #8]
|
|
|
|
ldp w8, w9, [rk], #16
|
|
|
|
ldp w10, w11, [rk, #-8]
|
2017-01-12 00:41:52 +08:00
|
|
|
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
CPU_BE( rev w4, w4 )
|
2017-01-12 00:41:52 +08:00
|
|
|
CPU_BE( rev w5, w5 )
|
|
|
|
CPU_BE( rev w6, w6 )
|
|
|
|
CPU_BE( rev w7, w7 )
|
|
|
|
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
eor w4, w4, w8
|
2017-01-12 00:41:52 +08:00
|
|
|
eor w5, w5, w9
|
|
|
|
eor w6, w6, w10
|
|
|
|
eor w7, w7, w11
|
|
|
|
|
2017-01-29 07:25:36 +08:00
|
|
|
adr_l tt, \ttab
|
2017-01-12 00:41:52 +08:00
|
|
|
|
|
|
|
tbnz rounds, #1, 1f
|
|
|
|
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
0: \round w8, w9, w10, w11, w4, w5, w6, w7
|
|
|
|
\round w4, w5, w6, w7, w8, w9, w10, w11
|
2017-01-12 00:41:52 +08:00
|
|
|
|
|
|
|
1: subs rounds, rounds, #4
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
\round w8, w9, w10, w11, w4, w5, w6, w7
|
|
|
|
b.ls 3f
|
|
|
|
2: \round w4, w5, w6, w7, w8, w9, w10, w11
|
|
|
|
b 0b
|
|
|
|
3: adr_l tt, \ltab
|
|
|
|
\round w4, w5, w6, w7, w8, w9, w10, w11, \bsz, b
|
|
|
|
|
|
|
|
CPU_BE( rev w4, w4 )
|
2017-01-12 00:41:52 +08:00
|
|
|
CPU_BE( rev w5, w5 )
|
|
|
|
CPU_BE( rev w6, w6 )
|
|
|
|
CPU_BE( rev w7, w7 )
|
|
|
|
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
stp w4, w5, [out]
|
|
|
|
stp w6, w7, [out, #8]
|
2017-01-12 00:41:52 +08:00
|
|
|
ret
|
|
|
|
.endm
|
|
|
|
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
.align L1_CACHE_SHIFT
|
|
|
|
.type __aes_arm64_inverse_sbox, %object
|
|
|
|
__aes_arm64_inverse_sbox:
|
|
|
|
.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
|
|
|
|
.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
|
|
|
|
.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
|
|
|
|
.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
|
|
|
|
.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
|
|
|
|
.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
|
|
|
|
.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
|
|
|
|
.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
|
|
|
|
.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
|
|
|
|
.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
|
|
|
|
.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
|
|
|
|
.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
|
|
|
|
.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
|
|
|
|
.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
|
|
|
|
.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
|
|
|
|
.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
|
|
|
|
.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
|
|
|
|
.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
|
|
|
|
.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
|
|
|
|
.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
|
|
|
|
.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
|
|
|
|
.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
|
|
|
|
.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
|
|
|
|
.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
|
|
|
|
.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
|
|
|
|
.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
|
|
|
|
.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
|
|
|
|
.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
|
|
|
|
.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
|
|
|
|
.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
|
|
|
|
.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
|
|
|
|
.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
|
|
|
|
.size __aes_arm64_inverse_sbox, . - __aes_arm64_inverse_sbox
|
|
|
|
|
2017-01-12 00:41:52 +08:00
|
|
|
ENTRY(__aes_arm64_encrypt)
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2
|
2017-01-12 00:41:52 +08:00
|
|
|
ENDPROC(__aes_arm64_encrypt)
|
|
|
|
|
|
|
|
.align 5
|
|
|
|
ENTRY(__aes_arm64_decrypt)
|
crypto: arm64/aes - avoid expanded lookup tables in the final round
For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.
This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, which and so avoiding it improves
shareability of this code.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-07-24 18:28:20 +08:00
|
|
|
do_crypt iround, crypto_it_tab, __aes_arm64_inverse_sbox, 0
|
2017-01-12 00:41:52 +08:00
|
|
|
ENDPROC(__aes_arm64_decrypt)
|