linux_old1/arch/x86/crypto/glue_helper-asm-avx.S

/*
 * Shared glue code for 128bit block ciphers, AVX assembler macros
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */

/*
 * load_8way - read eight consecutive 16-byte blocks into vector registers.
 *
 * src:      register holding the address of the first block
 * b0 .. b7: destination vector registers; bN receives the 16 bytes found at
 *           byte offset N*16 from src
 *
 * vmovdqu is used for every load, so src does not have to be 16-byte aligned.
 */
#define load_8way(src, b0, b1, b2, b3, b4, b5, b6, b7) \
	vmovdqu (0*16)(src), b0; \
	vmovdqu (1*16)(src), b1; \
	vmovdqu (2*16)(src), b2; \
	vmovdqu (3*16)(src), b3; \
	vmovdqu (4*16)(src), b4; \
	vmovdqu (5*16)(src), b5; \
	vmovdqu (6*16)(src), b6; \
	vmovdqu (7*16)(src), b7;

/*
 * store_8way - write eight vector registers out as consecutive 16-byte blocks.
 *
 * dst:      register holding the address of the first output block
 * b0 .. b7: source vector registers; bN is written to byte offset N*16
 *           from dst
 *
 * vmovdqu is used for every store, so dst does not have to be 16-byte aligned.
 * The registers themselves are left unmodified.
 */
#define store_8way(dst, b0, b1, b2, b3, b4, b5, b6, b7) \
	vmovdqu b0, (0*16)(dst); \
	vmovdqu b1, (1*16)(dst); \
	vmovdqu b2, (2*16)(dst); \
	vmovdqu b3, (3*16)(dst); \
	vmovdqu b4, (4*16)(dst); \
	vmovdqu b5, (5*16)(dst); \
	vmovdqu b6, (6*16)(dst); \
	vmovdqu b7, (7*16)(dst);

/*
 * store_cbc_8way - xor blocks 1..7 with the preceding source block and store
 * all eight blocks.
 *
 * src:      register holding the address of the eight 16-byte input blocks
 * dst:      register holding the address of the eight 16-byte output blocks
 * b0 .. b7: vector registers holding the eight blocks to be written
 *
 * For N = 1..7, bN is xored in place with the 16 bytes at offset (N-1)*16 of
 * src. b0 is stored without any xor here; presumably the caller applies the
 * IV to the first block - confirm at the call site.
 */
#define store_cbc_8way(src, dst, b0, b1, b2, b3, b4, b5, b6, b7) \
	vpxor (0*16)(src), b1, b1; \
	vpxor (1*16)(src), b2, b2; \
	vpxor (2*16)(src), b3, b3; \
	vpxor (3*16)(src), b4, b4; \
	vpxor (4*16)(src), b5, b5; \
	vpxor (5*16)(src), b6, b6; \
	vpxor (6*16)(src), b7, b7; \
	store_8way(dst, b0, b1, b2, b3, b4, b5, b6, b7);

/*
 * inc_le128 - add one to the 128-bit integer held in x as two 64-bit halves
 * (low qword = least significant half), propagating the carry by hand because
 * the vector unit only offers independent 64-bit lane arithmetic.
 *
 * x:         vector register, updated in place
 * minus_one: vector register expected to hold -1 in the low qword and 0 in
 *            the high qword (this is how load_ctr_8way sets it up); it is
 *            not modified
 * tmp:       scratch vector register, clobbered
 */
#define inc_le128(x, minus_one, tmp) \
	/* tmp.lo = all ones iff x.lo == -1, i.e. the low half is about to wrap */ \
	vpcmpeqq minus_one, x, tmp; \
	/* x.lo -= -1 (adds one); x.hi -= 0 (unchanged) */ \
	vpsubq minus_one, x, x; \
	/* move the wrap flag into the high qword; the x.hi compare result is shifted out */ \
	vpslldq $8, tmp, tmp; \
	/* x.hi -= -1 when the low half wrapped, otherwise x.hi -= 0 */ \
	vpsubq tmp, x, x;

/*
 * load_ctr_8way - produce eight consecutive counter blocks from the 16-byte
 * counter stored at (iv) and advance that stored counter by eight.
 *
 * iv:       register holding the address of the counter; it is read with an
 *           unaligned load at the start and overwritten with counter + 8 at
 *           the end
 * bswap:    operand used as the vpshufb control for every output block. It is
 *           fetched with vmovdqa, so a memory operand must be 16-byte aligned.
 *           Presumably a byte-reversal mask - confirm at the caller.
 * x0 .. x7: output vector registers; xN = vpshufb(counter + N, bswap)
 * t0 .. t2: scratch vector registers, all clobbered
 *
 * The running counter is kept un-shuffled (in x7, later t2) and stepped with
 * inc_le128; only the copies handed out in x0..x7 go through the shuffle.
 */
#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
	/* t0 = all ones */ \
	vpcmpeqd t0, t0, t0; \
	vpsrldq $8, t0, t0; /* low: -1, high: 0 (the constant inc_le128 expects) */ \
	vmovdqa bswap, t1; \
	\
	/* load the counter; x0 is the shuffled copy of the starting value */ \
	vmovdqu (iv), x7; \
	vpshufb t1, x7, x0; \
	\
	/* step the un-shuffled counter in x7 and emit a shuffled copy each time */ \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x1; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x2; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x3; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x4; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x5; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x6; \
	inc_le128(x7, t0, t2); \
	/* save un-shuffled counter + 7 in t2 before x7 is replaced by its shuffled form */ \
	vmovdqa x7, t2; \
	vpshufb t1, x7, x7; \
	/* the shuffle mask in t1 is no longer needed, so t1 serves as scratch here */ \
	inc_le128(t2, t0, t1); \
	/* write counter + 8 back for the next invocation */ \
	vmovdqu t2, (iv);

/*
 * store_ctr_8way - xor eight blocks with the corresponding source blocks and
 * store the results.
 *
 * src:      register holding the address of the eight 16-byte input blocks
 * dst:      register holding the address of the eight 16-byte output blocks
 * b0 .. b7: vector registers; bN is xored in place with the 16 bytes at
 *           offset N*16 of src and then written to offset N*16 of dst
 */
#define store_ctr_8way(src, dst, b0, b1, b2, b3, b4, b5, b6, b7) \
	vpxor (0*16)(src), b0, b0; \
	vpxor (1*16)(src), b1, b1; \
	vpxor (2*16)(src), b2, b2; \
	vpxor (3*16)(src), b3, b3; \
	vpxor (4*16)(src), b4, b4; \
	vpxor (5*16)(src), b5, b5; \
	vpxor (6*16)(src), b6, b6; \
	vpxor (7*16)(src), b7, b7; \
	store_8way(dst, b0, b1, b2, b3, b4, b5, b6, b7);

/*
 * gf128mul_x_ble - shift the 128-bit value in iv left by one bit and xor in
 * mask-selected constants for the two bits that fall off the 64-bit lanes.
 *
 * iv:   vector register, updated in place
 * mask: vector operand and-ed with the per-dword carry masks; not modified.
 *       NOTE(review): its contents are defined outside this file. For the
 *       sequence to be the multiply-by-x in GF(2^128) that the name suggests,
 *       it presumably holds the reduction constant in dword 0, the value 1 in
 *       dword 2 and zero in dwords 1 and 3 - confirm at the definition.
 * tmp:  scratch vector register, clobbered
 */
#define gf128mul_x_ble(iv, mask, tmp) \
	/* every dword becomes 0 or all ones from its own top bit (taken before the shift) */ \
	vpsrad $31, iv, tmp; \
	/* double each qword: a 1-bit left shift per 64-bit lane, bits 63 and 127 drop out */ \
	vpaddq iv, iv, iv; \
	/* dword0 <- flag of bit 127, dword2 <- flag of bit 63; dwords 1 and 3 <- flag of bit 31 */ \
	vpshufd $0x13, tmp, tmp; \
	/* keep only the mask bits in the lanes whose flag is set */ \
	vpand mask, tmp, tmp; \
	/* fold the selected constants into the shifted value */ \
	vpxor tmp, iv, iv;

/*
 * load_xts_8way - xor eight source blocks with eight successive tweak values,
 * park each tweak in the output buffer, and advance the stored tweak.
 *
 * iv:       register holding the address of the 16-byte tweak; read at the
 *           start, overwritten at the end with the tweak that follows the
 *           eighth block
 * src:      register holding the address of the eight 16-byte input blocks
 * dst:      register holding the address of the eight 16-byte output blocks;
 *           on exit offset N*16 holds the tweak that was used for block N
 *           (store_xts_8way later xors it back in from there)
 * x0 .. x7: output vector registers; xN = src block N xor tweak N
 * tiv:      scratch vector register carrying the running tweak, clobbered
 * t0, t1:   scratch vector registers, clobbered (t0 holds the mask, t1 is the
 *           temporary of gf128mul_x_ble)
 * xts_gf128mul_and_shl1_mask:
 *           mask operand passed on to gf128mul_x_ble; fetched with vmovdqa,
 *           so a memory operand must be 16-byte aligned
 *
 * For every block the source is read before the tweak is written to the same
 * offset of dst; that order matters if src and dst refer to the same buffer
 * (whether callers do that is not visible here).
 */
#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \
		      t1, xts_gf128mul_and_shl1_mask) \
	vmovdqa xts_gf128mul_and_shl1_mask, t0; \
	\
	/* load the starting tweak; block 0 uses it unchanged */ \
	vmovdqu (iv), tiv; \
	vpxor (0*16)(src), tiv, x0; \
	vmovdqu tiv, (0*16)(dst); \
	\
	/* per block: step the tweak, xor it with the source, save it in dst */ \
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (1*16)(src), tiv, x1; \
	vmovdqu tiv, (1*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (2*16)(src), tiv, x2; \
	vmovdqu tiv, (2*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (3*16)(src), tiv, x3; \
	vmovdqu tiv, (3*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (4*16)(src), tiv, x4; \
	vmovdqu tiv, (4*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (5*16)(src), tiv, x5; \
	vmovdqu tiv, (5*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (6*16)(src), tiv, x6; \
	vmovdqu tiv, (6*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (7*16)(src), tiv, x7; \
	vmovdqu tiv, (7*16)(dst); \
	\
	/* one more step gives the tweak for the next batch; write it back */ \
	gf128mul_x_ble(tiv, t0, t1); \
	vmovdqu tiv, (iv);

/*
 * store_xts_8way - xor eight blocks with the values already sitting in the
 * output buffer and store the results over them.
 *
 * dst:      register holding the address of the eight 16-byte output blocks;
 *           offset N*16 is first read as the xor operand for block N and
 *           then overwritten with the result (load_xts_8way in this file
 *           leaves the per-block tweaks at exactly these offsets)
 * b0 .. b7: vector registers holding the eight blocks; each is xored in
 *           place before being stored
 */
#define store_xts_8way(dst, b0, b1, b2, b3, b4, b5, b6, b7) \
	vpxor (0*16)(dst), b0, b0; \
	vpxor (1*16)(dst), b1, b1; \
	vpxor (2*16)(dst), b2, b2; \
	vpxor (3*16)(dst), b3, b3; \
	vpxor (4*16)(dst), b4, b4; \
	vpxor (5*16)(dst), b5, b5; \
	vpxor (6*16)(dst), b6, b6; \
	vpxor (7*16)(dst), b7, b7; \
	store_8way(dst, b0, b1, b2, b3, b4, b5, b6, b7);
crypto: cast6/avx - avoid using temporary stack buffers Introduce new assembler functions to avoid use temporary stack buffers in glue code. This also allows use of vector instructions for xoring output in CTR and CBC modes and construction of IVs for CTR mode. ECB mode sees ~0.5% decrease in speed because added one extra function call. CBC mode decryption and CTR mode benefit from vector operations and gain ~2%. Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2012-10-20 20:06:41 +08:00			`/*`
			`* Shared glue code for 128bit block ciphers, AVX assembler macros`
			`*`
crypto: x86 - add more optimized XTS-mode for serpent-avx This patch adds AVX optimized XTS-mode helper functions/macros and converts serpent-avx to use the new facilities. Benefits are slightly improved speed and reduced stack usage as use of temporary IV-array is avoided. tcrypt results, with Intel i5-2450M: enc dec 16B 1.00x 1.00x 64B 1.00x 1.00x 256B 1.04x 1.06x 1024B 1.09x 1.09x 8192B 1.10x 1.09x Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2013-04-09 02:50:55 +08:00			`* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>`
crypto: cast6/avx - avoid using temporary stack buffers Introduce new assembler functions to avoid use temporary stack buffers in glue code. This also allows use of vector instructions for xoring output in CTR and CBC modes and construction of IVs for CTR mode. ECB mode sees ~0.5% decrease in speed because added one extra function call. CBC mode decryption and CTR mode benefit from vector operations and gain ~2%. Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2012-10-20 20:06:41 +08:00			`*`
			`* This program is free software; you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation; either version 2 of the License, or`
			`* (at your option) any later version.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`*/`

			`#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \`
			`vmovdqu (0*16)(src), x0; \`
			`vmovdqu (1*16)(src), x1; \`
			`vmovdqu (2*16)(src), x2; \`
			`vmovdqu (3*16)(src), x3; \`
			`vmovdqu (4*16)(src), x4; \`
			`vmovdqu (5*16)(src), x5; \`
			`vmovdqu (6*16)(src), x6; \`
			`vmovdqu (7*16)(src), x7;`

			`#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \`
			`vmovdqu x0, (0*16)(dst); \`
			`vmovdqu x1, (1*16)(dst); \`
			`vmovdqu x2, (2*16)(dst); \`
			`vmovdqu x3, (3*16)(dst); \`
			`vmovdqu x4, (4*16)(dst); \`
			`vmovdqu x5, (5*16)(dst); \`
			`vmovdqu x6, (6*16)(dst); \`
			`vmovdqu x7, (7*16)(dst);`

			`#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \`
			`vpxor (0*16)(src), x1, x1; \`
			`vpxor (1*16)(src), x2, x2; \`
			`vpxor (2*16)(src), x3, x3; \`
			`vpxor (3*16)(src), x4, x4; \`
			`vpxor (4*16)(src), x5, x5; \`
			`vpxor (5*16)(src), x6, x6; \`
			`vpxor (6*16)(src), x7, x7; \`
			`store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);`

			`#define inc_le128(x, minus_one, tmp) \`
			`vpcmpeqq minus_one, x, tmp; \`
			`vpsubq minus_one, x, x; \`
			`vpslldq $8, tmp, tmp; \`
			`vpsubq tmp, x, x;`

			`#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \`
			`vpcmpeqd t0, t0, t0; \`
			`vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \`
			`vmovdqa bswap, t1; \`
			`\`
			`/* load IV and byteswap */ \`
			`vmovdqu (iv), x7; \`
			`vpshufb t1, x7, x0; \`
			`\`
			`/* construct IVs */ \`
			`inc_le128(x7, t0, t2); \`
			`vpshufb t1, x7, x1; \`
			`inc_le128(x7, t0, t2); \`
			`vpshufb t1, x7, x2; \`
			`inc_le128(x7, t0, t2); \`
			`vpshufb t1, x7, x3; \`
			`inc_le128(x7, t0, t2); \`
			`vpshufb t1, x7, x4; \`
			`inc_le128(x7, t0, t2); \`
			`vpshufb t1, x7, x5; \`
			`inc_le128(x7, t0, t2); \`
			`vpshufb t1, x7, x6; \`
			`inc_le128(x7, t0, t2); \`
			`vmovdqa x7, t2; \`
			`vpshufb t1, x7, x7; \`
			`inc_le128(t2, t0, t1); \`
			`vmovdqu t2, (iv);`

			`#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \`
			`vpxor (0*16)(src), x0, x0; \`
			`vpxor (1*16)(src), x1, x1; \`
			`vpxor (2*16)(src), x2, x2; \`
			`vpxor (3*16)(src), x3, x3; \`
			`vpxor (4*16)(src), x4, x4; \`
			`vpxor (5*16)(src), x5, x5; \`
			`vpxor (6*16)(src), x6, x6; \`
			`vpxor (7*16)(src), x7, x7; \`
			`store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);`
crypto: x86 - add more optimized XTS-mode for serpent-avx This patch adds AVX optimized XTS-mode helper functions/macros and converts serpent-avx to use the new facilities. Benefits are slightly improved speed and reduced stack usage as use of temporary IV-array is avoided. tcrypt results, with Intel i5-2450M: enc dec 16B 1.00x 1.00x 64B 1.00x 1.00x 256B 1.04x 1.06x 1024B 1.09x 1.09x 8192B 1.10x 1.09x Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2013-04-09 02:50:55 +08:00
			`#define gf128mul_x_ble(iv, mask, tmp) \`
			`vpsrad $31, iv, tmp; \`
			`vpaddq iv, iv, iv; \`
			`vpshufd $0x13, tmp, tmp; \`
			`vpand mask, tmp, tmp; \`
			`vpxor tmp, iv, iv;`

			`#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \`
			`t1, xts_gf128mul_and_shl1_mask) \`
			`vmovdqa xts_gf128mul_and_shl1_mask, t0; \`
			`\`
			`/* load IV */ \`
			`vmovdqu (iv), tiv; \`
			`vpxor (0*16)(src), tiv, x0; \`
			`vmovdqu tiv, (0*16)(dst); \`
			`\`
			`/* construct and store IVs, also xor with source */ \`
			`gf128mul_x_ble(tiv, t0, t1); \`
			`vpxor (1*16)(src), tiv, x1; \`
			`vmovdqu tiv, (1*16)(dst); \`
			`\`
			`gf128mul_x_ble(tiv, t0, t1); \`
			`vpxor (2*16)(src), tiv, x2; \`
			`vmovdqu tiv, (2*16)(dst); \`
			`\`
			`gf128mul_x_ble(tiv, t0, t1); \`
			`vpxor (3*16)(src), tiv, x3; \`
			`vmovdqu tiv, (3*16)(dst); \`
			`\`
			`gf128mul_x_ble(tiv, t0, t1); \`
			`vpxor (4*16)(src), tiv, x4; \`
			`vmovdqu tiv, (4*16)(dst); \`
			`\`
			`gf128mul_x_ble(tiv, t0, t1); \`
			`vpxor (5*16)(src), tiv, x5; \`
			`vmovdqu tiv, (5*16)(dst); \`
			`\`
			`gf128mul_x_ble(tiv, t0, t1); \`
			`vpxor (6*16)(src), tiv, x6; \`
			`vmovdqu tiv, (6*16)(dst); \`
			`\`
			`gf128mul_x_ble(tiv, t0, t1); \`
			`vpxor (7*16)(src), tiv, x7; \`
			`vmovdqu tiv, (7*16)(dst); \`
			`\`
			`gf128mul_x_ble(tiv, t0, t1); \`
			`vmovdqu tiv, (iv);`

			`#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \`
			`vpxor (0*16)(dst), x0, x0; \`
			`vpxor (1*16)(dst), x1, x1; \`
			`vpxor (2*16)(dst), x2, x2; \`
			`vpxor (3*16)(dst), x3, x3; \`
			`vpxor (4*16)(dst), x4, x4; \`
			`vpxor (5*16)(dst), x5, x5; \`
			`vpxor (6*16)(dst), x6, x6; \`
			`vpxor (7*16)(dst), x7, x7; \`
			`store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);`