linux/arch/powerpc/lib/xor_vmx.c

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */

/*
 * Sparse (as at v0.5.0) gets very, very confused by this file.
 * Make it a bit simpler for it: regular builds pull in <altivec.h>, while
 * sparse (__CHECKER__) builds get minimal stand-ins for the two AltiVec
 * names this file relies on (vec_xor and vector).
 */
#if !defined(__CHECKER__)
#include <altivec.h>
#else
/*
 * Parenthesise the arguments and the whole expansion so the stub behaves
 * like a single operand wherever it is used, whatever operators surround it.
 */
#define vec_xor(a, b) ((a) ^ (b))
#define vector __attribute__((vector_size(16)))
#endif

#include "xor_vmx.h"

/*
 * The vector type that every load, XOR and store in this file works on.
 * "vector" comes from <altivec.h>, or from the stub above under sparse.
 */
typedef vector signed char unative_t;

/*
 * DEFINE(V) - declare the working state for one buffer.
 *
 * Expects a pointer named V_in to be in scope (the function parameter).
 * Declares V, a unative_t pointer to the same memory as V_in, plus four
 * vector temporaries V_0..V_3 that LOAD/XOR/STORE operate on.  The
 * expansion is a pair of declarations, so use it where declarations are
 * allowed and terminate it with a semicolon.
 */
#define DEFINE(V)				\
	unative_t *V = (unative_t *)V##_in;	\
	unative_t V##_0, V##_1, V##_2, V##_3

/*
 * LOAD(V) - read the four consecutive vectors V[0]..V[3] into the
 * temporaries V_0..V_3 declared by DEFINE(V).
 */
#define LOAD(V)			\
	do {			\
		V##_0 = V[0];	\
		V##_1 = V[1];	\
		V##_2 = V[2];	\
		V##_3 = V[3];	\
	} while (0)

/*
 * STORE(V) - write the temporaries V_0..V_3 back to V[0]..V[3].
 */
#define STORE(V)		\
	do {			\
		V[0] = V##_0;	\
		V[1] = V##_1;	\
		V[2] = V##_2;	\
		V[3] = V##_3;	\
	} while (0)

/*
 * XOR(V1, V2) - V1_n = V1_n ^ V2_n for n = 0..3.
 *
 * Works purely on the temporaries: only V1's temporaries are modified and
 * no memory is read or written here.
 */
#define XOR(V1, V2)					\
	do {						\
		V1##_0 = vec_xor(V1##_0, V2##_0);	\
		V1##_1 = vec_xor(V1##_1, V2##_1);	\
		V1##_2 = vec_xor(V1##_2, V2##_2);	\
		V1##_3 = vec_xor(V1##_3, V2##_3);	\
	} while (0)

/*
 * __xor_altivec_2 - XOR one source buffer into the destination buffer.
 *
 * bytes:  number of bytes to process.  Work is done in "lines" of four
 *         vectors (4 * sizeof(unative_t) bytes); a trailing partial line
 *         is not processed.
 * v1_in:  destination, also the first operand (updated in place).
 * v2_in:  second operand (only read).
 *
 * Result: v1 = v1 ^ v2 over every whole line.
 *
 * NOTE(review): the buffers are accessed through unative_t pointers, so they
 * presumably need to be aligned for that type, and the caller presumably has
 * to have made AltiVec usable beforehand - confirm against xor_vmx_glue.c.
 */
void __xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
		     unsigned long *v2_in)
{
	DEFINE(v1);
	DEFINE(v2);
	unsigned long lines = bytes / (sizeof(unative_t)) / 4;

	/*
	 * The loop is do/while: entering it with lines == 0 would process one
	 * line anyway and then wrap the counter to ULONG_MAX, running far
	 * past the end of the buffers.
	 */
	if (lines == 0)
		return;

	do {
		LOAD(v1);
		LOAD(v2);
		XOR(v1, v2);
		STORE(v1);

		v1 += 4;
		v2 += 4;
	} while (--lines > 0);
}

/*
 * __xor_altivec_3 - XOR two source buffers into the destination buffer.
 *
 * bytes:  number of bytes to process.  Work is done in "lines" of four
 *         vectors (4 * sizeof(unative_t) bytes); a trailing partial line
 *         is not processed.
 * v1_in:  destination, also the first operand (updated in place).
 * v2_in:  second operand (only read).
 * v3_in:  third operand (only read).
 *
 * Result: v1 = v1 ^ v2 ^ v3 over every whole line.
 *
 * NOTE(review): the buffers are accessed through unative_t pointers, so they
 * presumably need to be aligned for that type, and the caller presumably has
 * to have made AltiVec usable beforehand - confirm against xor_vmx_glue.c.
 */
void __xor_altivec_3(unsigned long bytes, unsigned long *v1_in,
		     unsigned long *v2_in, unsigned long *v3_in)
{
	DEFINE(v1);
	DEFINE(v2);
	DEFINE(v3);
	unsigned long lines = bytes / (sizeof(unative_t)) / 4;

	/*
	 * The loop is do/while: entering it with lines == 0 would process one
	 * line anyway and then wrap the counter to ULONG_MAX, running far
	 * past the end of the buffers.
	 */
	if (lines == 0)
		return;

	do {
		LOAD(v1);
		LOAD(v2);
		LOAD(v3);
		XOR(v1, v2);
		XOR(v1, v3);
		STORE(v1);

		v1 += 4;
		v2 += 4;
		v3 += 4;
	} while (--lines > 0);
}

/*
 * __xor_altivec_4 - XOR three source buffers into the destination buffer.
 *
 * bytes:  number of bytes to process.  Work is done in "lines" of four
 *         vectors (4 * sizeof(unative_t) bytes); a trailing partial line
 *         is not processed.
 * v1_in:  destination, also the first operand (updated in place).
 * v2_in:  second operand (only read).
 * v3_in:  third operand (only read).
 * v4_in:  fourth operand (only read).
 *
 * Result: v1 = v1 ^ v2 ^ v3 ^ v4 over every whole line.  The v3 ^ v4
 * partial result lives only in v3's temporaries; v3's memory is never
 * written.
 *
 * NOTE(review): the buffers are accessed through unative_t pointers, so they
 * presumably need to be aligned for that type, and the caller presumably has
 * to have made AltiVec usable beforehand - confirm against xor_vmx_glue.c.
 */
void __xor_altivec_4(unsigned long bytes, unsigned long *v1_in,
		     unsigned long *v2_in, unsigned long *v3_in,
		     unsigned long *v4_in)
{
	DEFINE(v1);
	DEFINE(v2);
	DEFINE(v3);
	DEFINE(v4);
	unsigned long lines = bytes / (sizeof(unative_t)) / 4;

	/*
	 * The loop is do/while: entering it with lines == 0 would process one
	 * line anyway and then wrap the counter to ULONG_MAX, running far
	 * past the end of the buffers.
	 */
	if (lines == 0)
		return;

	do {
		LOAD(v1);
		LOAD(v2);
		LOAD(v3);
		LOAD(v4);
		XOR(v1, v2);
		XOR(v3, v4);
		XOR(v1, v3);
		STORE(v1);

		v1 += 4;
		v2 += 4;
		v3 += 4;
		v4 += 4;
	} while (--lines > 0);
}

/*
 * __xor_altivec_5 - XOR four source buffers into the destination buffer.
 *
 * bytes:  number of bytes to process.  Work is done in "lines" of four
 *         vectors (4 * sizeof(unative_t) bytes); a trailing partial line
 *         is not processed.
 * v1_in:  destination, also the first operand (updated in place).
 * v2_in:  second operand (only read).
 * v3_in:  third operand (only read).
 * v4_in:  fourth operand (only read).
 * v5_in:  fifth operand (only read).
 *
 * Result: v1 = v1 ^ v2 ^ v3 ^ v4 ^ v5 over every whole line.  The v3 ^ v4
 * partial result lives only in v3's temporaries; v3's memory is never
 * written.
 *
 * NOTE(review): the buffers are accessed through unative_t pointers, so they
 * presumably need to be aligned for that type, and the caller presumably has
 * to have made AltiVec usable beforehand - confirm against xor_vmx_glue.c.
 */
void __xor_altivec_5(unsigned long bytes, unsigned long *v1_in,
		     unsigned long *v2_in, unsigned long *v3_in,
		     unsigned long *v4_in, unsigned long *v5_in)
{
	DEFINE(v1);
	DEFINE(v2);
	DEFINE(v3);
	DEFINE(v4);
	DEFINE(v5);
	unsigned long lines = bytes / (sizeof(unative_t)) / 4;

	/*
	 * The loop is do/while: entering it with lines == 0 would process one
	 * line anyway and then wrap the counter to ULONG_MAX, running far
	 * past the end of the buffers.
	 */
	if (lines == 0)
		return;

	do {
		LOAD(v1);
		LOAD(v2);
		LOAD(v3);
		LOAD(v4);
		LOAD(v5);
		XOR(v1, v2);
		XOR(v3, v4);
		XOR(v1, v5);
		XOR(v1, v3);
		STORE(v1);

		v1 += 4;
		v2 += 4;
		v3 += 4;
		v4 += 4;
		v5 += 4;
	} while (--lines > 0);
}
powerpc: Add VMX optimised xor for RAID5 Add a VMX optimised xor, used primarily for RAID5. On a POWER7 blade this is a decent win: 32regs : 17932.800 MB/sec altivec : 19724.800 MB/sec The bigger gain is when the same test is run in SMT4 mode, as it would if there was a lot of work going on: 8regs : 8377.600 MB/sec altivec : 15801.600 MB/sec I tested this against an array created without the patch, and also verified it worked as expected on a little endian kernel. [ Fix !CONFIG_ALTIVEC build -- BenH ] Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2013-10-14 18:03:58 +08:00			`/*`
			`* This program is free software; you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation; either version 2 of the License, or`
			`* (at your option) any later version.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program; if not, write to the Free Software`
			`* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.`
			`*`
			`* Copyright (C) IBM Corporation, 2012`
			`*`
			`* Author: Anton Blanchard <anton@au.ibm.com>`
			`*/`
powerpc: rework sparse for lib/xor_vmx.c Sparse doesn't seem to be passing -maltivec around properly, leading to lots of errors: .../include/altivec.h:34:2: error: Use the "-maltivec" flag to enable PowerPC AltiVec support arch/powerpc/lib/xor_vmx.c:27:16: error: Expected ; at end of declaration arch/powerpc/lib/xor_vmx.c:27:16: error: got signed arch/powerpc/lib/xor_vmx.c:60:9: error: No right hand side of '*'-expression arch/powerpc/lib/xor_vmx.c:60:9: error: Expected ; at end of statement arch/powerpc/lib/xor_vmx.c:60:9: error: got v1_in ... arch/powerpc/lib/xor_vmx.c:87:9: error: too many errors Only include the altivec.h header for non-__CHECKER__ builds. For builds with __CHECKER__, make up some stubs instead, as suggested by Balbir. (The vector size of 16 is arbitrary.) Suggested-by: Balbir Singh <bsingharora@gmail.com> Signed-off-by: Daniel Axtens <dja@axtens.net> Tested-by: Balbir Singh <bsingharora@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> 2016-04-26 21:49:09 +08:00
			`/*`
			`* Sparse (as at v0.5.0) gets very, very confused by this file.`
			`* Make it a bit simpler for it.`
			`*/`
			`#if !defined(__CHECKER__)`
powerpc: Add VMX optimised xor for RAID5 Add a VMX optimised xor, used primarily for RAID5. On a POWER7 blade this is a decent win: 32regs : 17932.800 MB/sec altivec : 19724.800 MB/sec The bigger gain is when the same test is run in SMT4 mode, as it would if there was a lot of work going on: 8regs : 8377.600 MB/sec altivec : 15801.600 MB/sec I tested this against an array created without the patch, and also verified it worked as expected on a little endian kernel. [ Fix !CONFIG_ALTIVEC build -- BenH ] Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2013-10-14 18:03:58 +08:00			`#include <altivec.h>`
powerpc: rework sparse for lib/xor_vmx.c Sparse doesn't seem to be passing -maltivec around properly, leading to lots of errors: .../include/altivec.h:34:2: error: Use the "-maltivec" flag to enable PowerPC AltiVec support arch/powerpc/lib/xor_vmx.c:27:16: error: Expected ; at end of declaration arch/powerpc/lib/xor_vmx.c:27:16: error: got signed arch/powerpc/lib/xor_vmx.c:60:9: error: No right hand side of '*'-expression arch/powerpc/lib/xor_vmx.c:60:9: error: Expected ; at end of statement arch/powerpc/lib/xor_vmx.c:60:9: error: got v1_in ... arch/powerpc/lib/xor_vmx.c:87:9: error: too many errors Only include the altivec.h header for non-__CHECKER__ builds. For builds with __CHECKER__, make up some stubs instead, as suggested by Balbir. (The vector size of 16 is arbitrary.) Suggested-by: Balbir Singh <bsingharora@gmail.com> Signed-off-by: Daniel Axtens <dja@axtens.net> Tested-by: Balbir Singh <bsingharora@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> 2016-04-26 21:49:09 +08:00			`#else`
			`#define vec_xor(a, b) a ^ b`
			`#define vector __attribute__((vector_size(16)))`
			`#endif`
powerpc: Add VMX optimised xor for RAID5 Add a VMX optimised xor, used primarily for RAID5. On a POWER7 blade this is a decent win: 32regs : 17932.800 MB/sec altivec : 19724.800 MB/sec The bigger gain is when the same test is run in SMT4 mode, as it would if there was a lot of work going on: 8regs : 8377.600 MB/sec altivec : 15801.600 MB/sec I tested this against an array created without the patch, and also verified it worked as expected on a little endian kernel. [ Fix !CONFIG_ALTIVEC build -- BenH ] Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2013-10-14 18:03:58 +08:00
powerpc/lib/xor_vmx: Ensure no altivec code executes before enable_kernel_altivec() The xor_vmx.c file is used for the RAID5 xor operations. In these functions altivec is enabled to run the operation and then disabled. The code uses enable_kernel_altivec() around the core of the algorithm, however the whole file is built with -maltivec, so the compiler is within its rights to generate altivec code anywhere. This has been seen at least once in the wild: 0:mon> di $xor_altivec_2 c0000000000b97d0 3c4c01d9 addis r2,r12,473 c0000000000b97d4 3842db30 addi r2,r2,-9424 c0000000000b97d8 7c0802a6 mflr r0 c0000000000b97dc f8010010 std r0,16(r1) c0000000000b97e0 60000000 nop c0000000000b97e4 7c0802a6 mflr r0 c0000000000b97e8 faa1ffa8 std r21,-88(r1) ... c0000000000b981c f821ff41 stdu r1,-192(r1) c0000000000b9820 7f8101ce stvx v28,r1,r0 <-- POP c0000000000b9824 38000030 li r0,48 c0000000000b9828 7fa101ce stvx v29,r1,r0 ... c0000000000b984c 4bf6a06d bl c0000000000238b8 # enable_kernel_altivec This patch splits the non-altivec code into xor_vmx_glue.c which calls the altivec functions in xor_vmx.c. By compiling xor_vmx_glue.c without -maltivec we can guarantee that altivec instruction will not be executed outside of the enable/disable block. Signed-off-by: Matt Brown <matthew.brown.dev@gmail.com> [mpe: Rework change log and include disassembly] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> 2017-05-24 07:45:59 +08:00			`#include "xor_vmx.h"`
powerpc: Add VMX optimised xor for RAID5 Add a VMX optimised xor, used primarily for RAID5. On a POWER7 blade this is a decent win: 32regs : 17932.800 MB/sec altivec : 19724.800 MB/sec The bigger gain is when the same test is run in SMT4 mode, as it would if there was a lot of work going on: 8regs : 8377.600 MB/sec altivec : 15801.600 MB/sec I tested this against an array created without the patch, and also verified it worked as expected on a little endian kernel. [ Fix !CONFIG_ALTIVEC build -- BenH ] Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2013-10-14 18:03:58 +08:00
			`typedef vector signed char unative_t;`

			`#define DEFINE(V) \`
			`unative_t *V = (unative_t *)V##_in; \`
			`unative_t V##_0, V##_1, V##_2, V##_3`

			`#define LOAD(V) \`
			`do { \`
			`V##_0 = V[0]; \`
			`V##_1 = V[1]; \`
			`V##_2 = V[2]; \`
			`V##_3 = V[3]; \`
			`} while (0)`

			`#define STORE(V) \`
			`do { \`
			`V[0] = V##_0; \`
			`V[1] = V##_1; \`
			`V[2] = V##_2; \`
			`V[3] = V##_3; \`
			`} while (0)`

			`#define XOR(V1, V2) \`
			`do { \`
			`V1##_0 = vec_xor(V1##_0, V2##_0); \`
			`V1##_1 = vec_xor(V1##_1, V2##_1); \`
			`V1##_2 = vec_xor(V1##_2, V2##_2); \`
			`V1##_3 = vec_xor(V1##_3, V2##_3); \`
			`} while (0)`

powerpc/lib/xor_vmx: Ensure no altivec code executes before enable_kernel_altivec() The xor_vmx.c file is used for the RAID5 xor operations. In these functions altivec is enabled to run the operation and then disabled. The code uses enable_kernel_altivec() around the core of the algorithm, however the whole file is built with -maltivec, so the compiler is within its rights to generate altivec code anywhere. This has been seen at least once in the wild: 0:mon> di $xor_altivec_2 c0000000000b97d0 3c4c01d9 addis r2,r12,473 c0000000000b97d4 3842db30 addi r2,r2,-9424 c0000000000b97d8 7c0802a6 mflr r0 c0000000000b97dc f8010010 std r0,16(r1) c0000000000b97e0 60000000 nop c0000000000b97e4 7c0802a6 mflr r0 c0000000000b97e8 faa1ffa8 std r21,-88(r1) ... c0000000000b981c f821ff41 stdu r1,-192(r1) c0000000000b9820 7f8101ce stvx v28,r1,r0 <-- POP c0000000000b9824 38000030 li r0,48 c0000000000b9828 7fa101ce stvx v29,r1,r0 ... c0000000000b984c 4bf6a06d bl c0000000000238b8 # enable_kernel_altivec This patch splits the non-altivec code into xor_vmx_glue.c which calls the altivec functions in xor_vmx.c. By compiling xor_vmx_glue.c without -maltivec we can guarantee that altivec instruction will not be executed outside of the enable/disable block. Signed-off-by: Matt Brown <matthew.brown.dev@gmail.com> [mpe: Rework change log and include disassembly] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> 2017-05-24 07:45:59 +08:00			`void __xor_altivec_2(unsigned long bytes, unsigned long *v1_in,`
			`unsigned long *v2_in)`
powerpc: Add VMX optimised xor for RAID5 Add a VMX optimised xor, used primarily for RAID5. On a POWER7 blade this is a decent win: 32regs : 17932.800 MB/sec altivec : 19724.800 MB/sec The bigger gain is when the same test is run in SMT4 mode, as it would if there was a lot of work going on: 8regs : 8377.600 MB/sec altivec : 15801.600 MB/sec I tested this against an array created without the patch, and also verified it worked as expected on a little endian kernel. [ Fix !CONFIG_ALTIVEC build -- BenH ] Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2013-10-14 18:03:58 +08:00			`{`
			`DEFINE(v1);`
			`DEFINE(v2);`
			`unsigned long lines = bytes / (sizeof(unative_t)) / 4;`

			`do {`
			`LOAD(v1);`
			`LOAD(v2);`
			`XOR(v1, v2);`
			`STORE(v1);`

			`v1 += 4;`
			`v2 += 4;`
			`} while (--lines > 0);`
			`}`

powerpc/lib/xor_vmx: Ensure no altivec code executes before enable_kernel_altivec() The xor_vmx.c file is used for the RAID5 xor operations. In these functions altivec is enabled to run the operation and then disabled. The code uses enable_kernel_altivec() around the core of the algorithm, however the whole file is built with -maltivec, so the compiler is within its rights to generate altivec code anywhere. This has been seen at least once in the wild: 0:mon> di $xor_altivec_2 c0000000000b97d0 3c4c01d9 addis r2,r12,473 c0000000000b97d4 3842db30 addi r2,r2,-9424 c0000000000b97d8 7c0802a6 mflr r0 c0000000000b97dc f8010010 std r0,16(r1) c0000000000b97e0 60000000 nop c0000000000b97e4 7c0802a6 mflr r0 c0000000000b97e8 faa1ffa8 std r21,-88(r1) ... c0000000000b981c f821ff41 stdu r1,-192(r1) c0000000000b9820 7f8101ce stvx v28,r1,r0 <-- POP c0000000000b9824 38000030 li r0,48 c0000000000b9828 7fa101ce stvx v29,r1,r0 ... c0000000000b984c 4bf6a06d bl c0000000000238b8 # enable_kernel_altivec This patch splits the non-altivec code into xor_vmx_glue.c which calls the altivec functions in xor_vmx.c. By compiling xor_vmx_glue.c without -maltivec we can guarantee that altivec instruction will not be executed outside of the enable/disable block. Signed-off-by: Matt Brown <matthew.brown.dev@gmail.com> [mpe: Rework change log and include disassembly] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> 2017-05-24 07:45:59 +08:00			`void __xor_altivec_3(unsigned long bytes, unsigned long *v1_in,`
			`unsigned long *v2_in, unsigned long *v3_in)`
powerpc: Add VMX optimised xor for RAID5 Add a VMX optimised xor, used primarily for RAID5. On a POWER7 blade this is a decent win: 32regs : 17932.800 MB/sec altivec : 19724.800 MB/sec The bigger gain is when the same test is run in SMT4 mode, as it would if there was a lot of work going on: 8regs : 8377.600 MB/sec altivec : 15801.600 MB/sec I tested this against an array created without the patch, and also verified it worked as expected on a little endian kernel. [ Fix !CONFIG_ALTIVEC build -- BenH ] Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2013-10-14 18:03:58 +08:00			`{`
			`DEFINE(v1);`
			`DEFINE(v2);`
			`DEFINE(v3);`
			`unsigned long lines = bytes / (sizeof(unative_t)) / 4;`

			`do {`
			`LOAD(v1);`
			`LOAD(v2);`
			`LOAD(v3);`
			`XOR(v1, v2);`
			`XOR(v1, v3);`
			`STORE(v1);`

			`v1 += 4;`
			`v2 += 4;`
			`v3 += 4;`
			`} while (--lines > 0);`
			`}`

powerpc/lib/xor_vmx: Ensure no altivec code executes before enable_kernel_altivec() The xor_vmx.c file is used for the RAID5 xor operations. In these functions altivec is enabled to run the operation and then disabled. The code uses enable_kernel_altivec() around the core of the algorithm, however the whole file is built with -maltivec, so the compiler is within its rights to generate altivec code anywhere. This has been seen at least once in the wild: 0:mon> di $xor_altivec_2 c0000000000b97d0 3c4c01d9 addis r2,r12,473 c0000000000b97d4 3842db30 addi r2,r2,-9424 c0000000000b97d8 7c0802a6 mflr r0 c0000000000b97dc f8010010 std r0,16(r1) c0000000000b97e0 60000000 nop c0000000000b97e4 7c0802a6 mflr r0 c0000000000b97e8 faa1ffa8 std r21,-88(r1) ... c0000000000b981c f821ff41 stdu r1,-192(r1) c0000000000b9820 7f8101ce stvx v28,r1,r0 <-- POP c0000000000b9824 38000030 li r0,48 c0000000000b9828 7fa101ce stvx v29,r1,r0 ... c0000000000b984c 4bf6a06d bl c0000000000238b8 # enable_kernel_altivec This patch splits the non-altivec code into xor_vmx_glue.c which calls the altivec functions in xor_vmx.c. By compiling xor_vmx_glue.c without -maltivec we can guarantee that altivec instruction will not be executed outside of the enable/disable block. Signed-off-by: Matt Brown <matthew.brown.dev@gmail.com> [mpe: Rework change log and include disassembly] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> 2017-05-24 07:45:59 +08:00			`void __xor_altivec_4(unsigned long bytes, unsigned long *v1_in,`
			`unsigned long *v2_in, unsigned long *v3_in,`
			`unsigned long *v4_in)`
powerpc: Add VMX optimised xor for RAID5 Add a VMX optimised xor, used primarily for RAID5. On a POWER7 blade this is a decent win: 32regs : 17932.800 MB/sec altivec : 19724.800 MB/sec The bigger gain is when the same test is run in SMT4 mode, as it would if there was a lot of work going on: 8regs : 8377.600 MB/sec altivec : 15801.600 MB/sec I tested this against an array created without the patch, and also verified it worked as expected on a little endian kernel. [ Fix !CONFIG_ALTIVEC build -- BenH ] Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2013-10-14 18:03:58 +08:00			`{`
			`DEFINE(v1);`
			`DEFINE(v2);`
			`DEFINE(v3);`
			`DEFINE(v4);`
			`unsigned long lines = bytes / (sizeof(unative_t)) / 4;`

			`do {`
			`LOAD(v1);`
			`LOAD(v2);`
			`LOAD(v3);`
			`LOAD(v4);`
			`XOR(v1, v2);`
			`XOR(v3, v4);`
			`XOR(v1, v3);`
			`STORE(v1);`

			`v1 += 4;`
			`v2 += 4;`
			`v3 += 4;`
			`v4 += 4;`
			`} while (--lines > 0);`
			`}`

powerpc/lib/xor_vmx: Ensure no altivec code executes before enable_kernel_altivec() The xor_vmx.c file is used for the RAID5 xor operations. In these functions altivec is enabled to run the operation and then disabled. The code uses enable_kernel_altivec() around the core of the algorithm, however the whole file is built with -maltivec, so the compiler is within its rights to generate altivec code anywhere. This has been seen at least once in the wild: 0:mon> di $xor_altivec_2 c0000000000b97d0 3c4c01d9 addis r2,r12,473 c0000000000b97d4 3842db30 addi r2,r2,-9424 c0000000000b97d8 7c0802a6 mflr r0 c0000000000b97dc f8010010 std r0,16(r1) c0000000000b97e0 60000000 nop c0000000000b97e4 7c0802a6 mflr r0 c0000000000b97e8 faa1ffa8 std r21,-88(r1) ... c0000000000b981c f821ff41 stdu r1,-192(r1) c0000000000b9820 7f8101ce stvx v28,r1,r0 <-- POP c0000000000b9824 38000030 li r0,48 c0000000000b9828 7fa101ce stvx v29,r1,r0 ... c0000000000b984c 4bf6a06d bl c0000000000238b8 # enable_kernel_altivec This patch splits the non-altivec code into xor_vmx_glue.c which calls the altivec functions in xor_vmx.c. By compiling xor_vmx_glue.c without -maltivec we can guarantee that altivec instruction will not be executed outside of the enable/disable block. Signed-off-by: Matt Brown <matthew.brown.dev@gmail.com> [mpe: Rework change log and include disassembly] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> 2017-05-24 07:45:59 +08:00			`void __xor_altivec_5(unsigned long bytes, unsigned long *v1_in,`
			`unsigned long *v2_in, unsigned long *v3_in,`
			`unsigned long *v4_in, unsigned long *v5_in)`
powerpc: Add VMX optimised xor for RAID5 Add a VMX optimised xor, used primarily for RAID5. On a POWER7 blade this is a decent win: 32regs : 17932.800 MB/sec altivec : 19724.800 MB/sec The bigger gain is when the same test is run in SMT4 mode, as it would if there was a lot of work going on: 8regs : 8377.600 MB/sec altivec : 15801.600 MB/sec I tested this against an array created without the patch, and also verified it worked as expected on a little endian kernel. [ Fix !CONFIG_ALTIVEC build -- BenH ] Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2013-10-14 18:03:58 +08:00			`{`
			`DEFINE(v1);`
			`DEFINE(v2);`
			`DEFINE(v3);`
			`DEFINE(v4);`
			`DEFINE(v5);`
			`unsigned long lines = bytes / (sizeof(unative_t)) / 4;`

			`do {`
			`LOAD(v1);`
			`LOAD(v2);`
			`LOAD(v3);`
			`LOAD(v4);`
			`LOAD(v5);`
			`XOR(v1, v2);`
			`XOR(v3, v4);`
			`XOR(v1, v5);`
			`XOR(v1, v3);`
			`STORE(v1);`

			`v1 += 4;`
			`v2 += 4;`
			`v3 += 4;`
			`v4 += 4;`
			`v5 += 4;`
			`} while (--lines > 0);`
			`}`