linux/arch/x86/lib/memcpy_64.S

/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:	
 * rdi destination
 * rsi source
 * rdx count
 * 
 * Output:
 * rax original destination
 */	

	/*
	 * memcpy_c - memcpy variant built on the string copy instructions.
	 *
	 * Input:
	 * rdi destination
	 * rsi source
	 * rdx count (full 64-bit value is honoured)
	 *
	 * Output:
	 * rax original destination
	 *
	 * Clobbers rcx, rdx, rsi, rdi and flags.
	 *
	 * The alternatives entry in this file reaches this routine from
	 * memcpy with a jmp <disp8>, so it has to stay within short-jump
	 * range of memcpy.
	 */
	ALIGN
memcpy_c:
	CFI_STARTPROC
	movq %rdi,%rax		/* return value: original destination */

	/*
	 * Derive the quadword count from all 64 bits of rdx. Using only
	 * edx here would silently truncate copies of 4GB and above.
	 */
	movq %rdx,%rcx
	shrq $3,%rcx		/* rcx = count / 8 */
	andl $7,%edx		/* edx = count % 8, the trailing bytes */
	rep movsq		/* copy rcx quadwords, advancing rsi/rdi */

	movl %edx,%ecx		/* 32-bit write zero-extends into rcx */
	rep movsb		/* copy the remaining 0..7 bytes */
	ret
	CFI_ENDPROC
ENDPROC(memcpy_c)

/*
 * memcpy / __memcpy - unrolled copy loop (both symbols label the same code).
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count (full 64-bit value is honoured)
 *
 * Output:
 * rax original destination
 *
 * Clobbers rcx, rsi, rdi, r8-r11 and flags.
 *
 * The copy runs in three phases: 64-byte blocks, then 8-byte words, then
 * single bytes. Each loop decrements its counter before the moves and
 * branches on the resulting ZF afterwards; this works because the movq
 * and leaq instructions in between leave the flags untouched.
 *
 * The alternatives entry in this file patches the first two bytes at
 * memcpy, so the copy loops must not start there.
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC
	movq %rdi,%rax		/* return value: original destination */

	/*
	 * Number of 64-byte blocks, taken from all 64 bits of rdx. Using
	 * only edx here would silently truncate copies of 4GB and above.
	 */
	movq %rdx,%rcx
	shrq $6,%rcx
	jz .Lhandle_tail	/* fewer than 64 bytes: skip the block loop */

	.p2align 4
.Lloop_64:
	decq %rcx		/* sets ZF for the jnz at the loop bottom */

	movq (%rsi),%r11
	movq 8(%rsi),%r8

	movq %r11,(%rdi)
	movq %r8,1*8(%rdi)

	movq 2*8(%rsi),%r9
	movq 3*8(%rsi),%r10

	movq %r9,2*8(%rdi)
	movq %r10,3*8(%rdi)

	movq 4*8(%rsi),%r11
	movq 5*8(%rsi),%r8

	movq %r11,4*8(%rdi)
	movq %r8,5*8(%rdi)

	movq 6*8(%rsi),%r9
	movq 7*8(%rsi),%r10

	movq %r9,6*8(%rdi)
	movq %r10,7*8(%rdi)

	/* leaq instead of addq: advance the pointers without touching ZF */
	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	jnz  .Lloop_64

.Lhandle_tail:
	/* Only bits 0..5 of the count matter from here on, so edx suffices. */
	movl %edx,%ecx
	andl $63,%ecx
	shrl $3,%ecx		/* ecx = remaining whole quadwords (0..7) */
	jz   .Lhandle_7
	.p2align 4
.Lloop_8:
	decl %ecx		/* sets ZF for the jnz at the loop bottom */
	movq (%rsi),%r8
	movq %r8,(%rdi)
	leaq 8(%rdi),%rdi
	leaq 8(%rsi),%rsi
	jnz  .Lloop_8

.Lhandle_7:
	movl %edx,%ecx
	andl $7,%ecx		/* ecx = remaining bytes (0..7) */
	jz .Lende
	.p2align 4
.Lloop_1:
	movb (%rsi),%r8b
	movb %r8b,(%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lende:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/*
	 * Some CPUs run faster using the string copy instructions.
	 * It is also a lot simpler. Use this when possible.
	 *
	 * The two records below describe a patch for the start of memcpy:
	 * a two-byte short jump to memcpy_c, gated on X86_FEATURE_REP_GOOD.
	 */

	.section .altinstr_replacement,"ax"
	/*
	 * Replacement bytes, hand-encoded. The displacement is computed as
	 * if the jump were located at memcpy (where it will be copied to),
	 * not at its position in this section: target minus patch site,
	 * minus the length of the jump itself (2f - 1b).
	 */
1:	.byte 0xeb				/* jmp <disp8> */
	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
2:
	.previous
	.section .altinstructions,"a"
	.align 8
	/*
	 * NOTE(review): field meanings below are inferred from the values
	 * emitted here; confirm against the alternatives record definition.
	 */
	.quad memcpy				/* code to be patched */
	.quad 1b				/* replacement bytes above */
	.byte X86_FEATURE_REP_GOOD		/* feature bit selecting the patch */
	/*
	 * Replace only the beginning: memcpy itself is used to apply
	 * alternatives, so it must not overwrite its own body with nops -
	 * a reboot would be the only outcome. Both lengths are therefore
	 * just the size of the short jump.
	 */
	.byte 2b - 1b				/* presumably length of original code to replace */
	.byte 2b - 1b				/* presumably length of replacement */
	.previous
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-17 06:20:36 +08:00			`/* Copyright 2002 Andi Kleen */`
Remove all inclusions of <linux/config.h> kbuild explicitly includes this at build time. Signed-off-by: Dave Jones <davej@redhat.com> 2006-10-04 15:38:54 +08:00
[PATCH] annotate arch/x86_64/lib/.S Add unwind annotations to arch/x86_64/lib/.S, and also use the macros provided by linux/linkage.h where-ever possible. Some of the alternative instructions handling needed to be adjusted so that the replacement code would also have valid unwind information. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Andi Kleen <ak@suse.de> 2006-09-26 16:52:32 +08:00			`#include <linux/linkage.h>`
			`#include <asm/dwarf2.h>`
			`#include <asm/cpufeature.h>`

Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-17 06:20:36 +08:00			`/*`
			`* memcpy - Copy a memory block.`
			`*`
			`* Input:`
			`* rdi destination`
			`* rsi source`
			`* rdx count`
			`*`
			`* Output:`
			`* rax original destination`
			`*/`

[PATCH] annotate arch/x86_64/lib/.S Add unwind annotations to arch/x86_64/lib/.S, and also use the macros provided by linux/linkage.h where-ever possible. Some of the alternative instructions handling needed to be adjusted so that the replacement code would also have valid unwind information. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Andi Kleen <ak@suse.de> 2006-09-26 16:52:32 +08:00			`ALIGN`
			`memcpy_c:`
			`CFI_STARTPROC`
			`movq %rdi,%rax`
			`movl %edx,%ecx`
			`shrl $3,%ecx`
			`andl $7,%edx`
			`rep movsq`
			`movl %edx,%ecx`
			`rep movsb`
			`ret`
			`CFI_ENDPROC`
			`ENDPROC(memcpy_c)`

			`ENTRY(__memcpy)`
			`ENTRY(memcpy)`
			`CFI_STARTPROC`
[PATCH] x86_64: Undo the earlier changes to remove unrolled copy/memset functions They cause quite bad performance regressions on Netburst This is temporary until we can get new optimized functions for these CPUs. This undoes changes that were done in 2.6.15 and in 2.6.16-rc1, essentially bringing the code back to 2.6.14 level. Only change is I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD and fixed the check for the flag and also fixed some comments. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2006-02-04 04:51:02 +08:00			`movq %rdi,%rax`

			`movl %edx,%ecx`
			`shrl $6,%ecx`
			`jz .Lhandle_tail`

			`.p2align 4`
			`.Lloop_64:`
			`decl %ecx`

			`movq (%rsi),%r11`
			`movq 8(%rsi),%r8`

			`movq %r11,(%rdi)`
			`movq %r8,1*8(%rdi)`

			`movq 2*8(%rsi),%r9`
			`movq 3*8(%rsi),%r10`

			`movq %r9,2*8(%rdi)`
			`movq %r10,3*8(%rdi)`

			`movq 4*8(%rsi),%r11`
			`movq 5*8(%rsi),%r8`

			`movq %r11,4*8(%rdi)`
			`movq %r8,5*8(%rdi)`

			`movq 6*8(%rsi),%r9`
			`movq 7*8(%rsi),%r10`

			`movq %r9,6*8(%rdi)`
			`movq %r10,7*8(%rdi)`

			`leaq 64(%rsi),%rsi`
			`leaq 64(%rdi),%rdi`
			`jnz .Lloop_64`

			`.Lhandle_tail:`
			`movl %edx,%ecx`
			`andl $63,%ecx`
			`shrl $3,%ecx`
			`jz .Lhandle_7`
			`.p2align 4`
			`.Lloop_8:`
			`decl %ecx`
			`movq (%rsi),%r8`
			`movq %r8,(%rdi)`
			`leaq 8(%rdi),%rdi`
			`leaq 8(%rsi),%rsi`
			`jnz .Lloop_8`

			`.Lhandle_7:`
			`movl %edx,%ecx`
			`andl $7,%ecx`
			`jz .Lende`
			`.p2align 4`
			`.Lloop_1:`
			`movb (%rsi),%r8b`
			`movb %r8b,(%rdi)`
			`incq %rdi`
			`incq %rsi`
			`decl %ecx`
			`jnz .Lloop_1`

			`.Lende:`
			`ret`
[PATCH] annotate arch/x86_64/lib/.S Add unwind annotations to arch/x86_64/lib/.S, and also use the macros provided by linux/linkage.h where-ever possible. Some of the alternative instructions handling needed to be adjusted so that the replacement code would also have valid unwind information. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Andi Kleen <ak@suse.de> 2006-09-26 16:52:32 +08:00			`CFI_ENDPROC`
			`ENDPROC(memcpy)`
			`ENDPROC(__memcpy)`
[PATCH] x86_64: Undo the earlier changes to remove unrolled copy/memset functions They cause quite bad performance regressions on Netburst This is temporary until we can get new optimized functions for these CPUs. This undoes changes that were done in 2.6.15 and in 2.6.16-rc1, essentially bringing the code back to 2.6.14 level. Only change is I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD and fixed the check for the flag and also fixed some comments. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2006-02-04 04:51:02 +08:00
			`/* Some CPUs run faster using the string copy instructions.`
			`It is also a lot simpler. Use this when possible */`

[PATCH] annotate arch/x86_64/lib/.S Add unwind annotations to arch/x86_64/lib/.S, and also use the macros provided by linux/linkage.h where-ever possible. Some of the alternative instructions handling needed to be adjusted so that the replacement code would also have valid unwind information. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Andi Kleen <ak@suse.de> 2006-09-26 16:52:32 +08:00			`.section .altinstr_replacement,"ax"`
			`1: .byte 0xeb /* jmp <disp8> */`
			`.byte (memcpy_c - memcpy) - (2f - 1b) /* offset */`
			`2:`
			`.previous`
[PATCH] x86_64: Undo the earlier changes to remove unrolled copy/memset functions They cause quite bad performance regressions on Netburst This is temporary until we can get new optimized functions for these CPUs. This undoes changes that were done in 2.6.15 and in 2.6.16-rc1, essentially bringing the code back to 2.6.14 level. Only change is I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD and fixed the check for the flag and also fixed some comments. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2006-02-04 04:51:02 +08:00			`.section .altinstructions,"a"`
			`.align 8`
[PATCH] annotate arch/x86_64/lib/.S Add unwind annotations to arch/x86_64/lib/.S, and also use the macros provided by linux/linkage.h where-ever possible. Some of the alternative instructions handling needed to be adjusted so that the replacement code would also have valid unwind information. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Andi Kleen <ak@suse.de> 2006-09-26 16:52:32 +08:00			`.quad memcpy`
			`.quad 1b`
			`.byte X86_FEATURE_REP_GOOD`
Do not replace whole memcpy in apply alternatives apply_alternatives uses memcpy() to apply alternatives. Which has the unfortunate effect that while applying memcpy alternative to memcpy itself it tries to overwrite itself with nops - which causes #UD fault as it overwrites half of an instruction in copy loop, and from this point on only possible outcome is triplefault and reboot. So let's overwrite only first two instructions of memcpy - as long as the main memcpy loop is not in first two bytes it will work fine. Signed-off-by: Petr Vandrovec <petr@vandrovec.name> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2007-08-12 16:12:52 +08:00			`/* Replace only beginning, memcpy is used to apply alternatives, so it`
			`* is silly to overwrite itself with nops - reboot is only outcome... */`
			`.byte 2b - 1b`
[PATCH] annotate arch/x86_64/lib/.S Add unwind annotations to arch/x86_64/lib/.S, and also use the macros provided by linux/linkage.h where-ever possible. Some of the alternative instructions handling needed to be adjusted so that the replacement code would also have valid unwind information. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Andi Kleen <ak@suse.de> 2006-09-26 16:52:32 +08:00			`.byte 2b - 1b`
[PATCH] x86_64: Undo the earlier changes to remove unrolled copy/memset functions They cause quite bad performance regressions on Netburst This is temporary until we can get new optimized functions for these CPUs. This undoes changes that were done in 2.6.15 and in 2.6.16-rc1, essentially bringing the code back to 2.6.14 level. Only change is I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD and fixed the check for the flag and also fixed some comments. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2006-02-04 04:51:02 +08:00			`.previous`