diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 10c067694af4..ad5441ed1b57 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -1,30 +1,38 @@ /* Copyright 2002 Andi Kleen */ #include -#include + #include +#include /* * memcpy - Copy a memory block. * - * Input: - * rdi destination - * rsi source - * rdx count - * + * Input: + * rdi destination + * rsi source + * rdx count + * * Output: * rax original destination - */ + */ +/* + * memcpy_c() - fast string ops (REP MOVSQ) based variant. + * + * Calls to this get patched into the kernel image via the + * alternative instructions framework: + */ ALIGN memcpy_c: CFI_STARTPROC - movq %rdi,%rax - movl %edx,%ecx - shrl $3,%ecx - andl $7,%edx + movq %rdi, %rax + + movl %edx, %ecx + shrl $3, %ecx + andl $7, %edx rep movsq - movl %edx,%ecx + movl %edx, %ecx rep movsb ret CFI_ENDPROC @@ -33,92 +41,110 @@ ENDPROC(memcpy_c) ENTRY(__memcpy) ENTRY(memcpy) CFI_STARTPROC - movq %rdi,%rax - movl %edx,%ecx - shrl $6,%ecx + /* + * Put the number of full 64-byte blocks into %ecx. + * Tail portion is handled at the end: + */ + movq %rdi, %rax + movl %edx, %ecx + shrl $6, %ecx jz .Lhandle_tail .p2align 4 .Lloop_64: + /* + * We decrement the loop index here - and the zero-flag is + * checked at the end of the loop (instructions inbetween do + * not change the zero flag): + */ decl %ecx - movq (%rsi),%r11 - movq 8(%rsi),%r8 + /* + * Move in blocks of 4x16 bytes: + */ + movq 0*8(%rsi), %r11 + movq 1*8(%rsi), %r8 + movq %r11, 0*8(%rdi) + movq %r8, 1*8(%rdi) - movq %r11,(%rdi) - movq %r8,1*8(%rdi) + movq 2*8(%rsi), %r9 + movq 3*8(%rsi), %r10 + movq %r9, 2*8(%rdi) + movq %r10, 3*8(%rdi) - movq 2*8(%rsi),%r9 - movq 3*8(%rsi),%r10 + movq 4*8(%rsi), %r11 + movq 5*8(%rsi), %r8 + movq %r11, 4*8(%rdi) + movq %r8, 5*8(%rdi) - movq %r9,2*8(%rdi) - movq %r10,3*8(%rdi) + movq 6*8(%rsi), %r9 + movq 7*8(%rsi), %r10 + movq %r9, 6*8(%rdi) + movq %r10, 7*8(%rdi) - movq 4*8(%rsi),%r11 - movq 5*8(%rsi),%r8 + leaq 64(%rsi), %rsi + leaq 64(%rdi), %rdi - movq %r11,4*8(%rdi) - movq %r8,5*8(%rdi) - - movq 6*8(%rsi),%r9 - movq 7*8(%rsi),%r10 - - movq %r9,6*8(%rdi) - movq %r10,7*8(%rdi) - - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi jnz .Lloop_64 .Lhandle_tail: - movl %edx,%ecx - andl $63,%ecx - shrl $3,%ecx + movl %edx, %ecx + andl $63, %ecx + shrl $3, %ecx jz .Lhandle_7 + .p2align 4 .Lloop_8: decl %ecx - movq (%rsi),%r8 - movq %r8,(%rdi) - leaq 8(%rdi),%rdi - leaq 8(%rsi),%rsi + movq (%rsi), %r8 + movq %r8, (%rdi) + leaq 8(%rdi), %rdi + leaq 8(%rsi), %rsi jnz .Lloop_8 .Lhandle_7: - movl %edx,%ecx - andl $7,%ecx - jz .Lende + movl %edx, %ecx + andl $7, %ecx + jz .Lend + .p2align 4 .Lloop_1: - movb (%rsi),%r8b - movb %r8b,(%rdi) + movb (%rsi), %r8b + movb %r8b, (%rdi) incq %rdi incq %rsi decl %ecx jnz .Lloop_1 -.Lende: +.Lend: ret CFI_ENDPROC ENDPROC(memcpy) ENDPROC(__memcpy) - /* Some CPUs run faster using the string copy instructions. - It is also a lot simpler. Use this when possible */ + /* + * Some CPUs run faster using the string copy instructions. + * It is also a lot simpler. Use this when possible: + */ - .section .altinstr_replacement,"ax" + .section .altinstr_replacement, "ax" 1: .byte 0xeb /* jmp */ .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ 2: .previous - .section .altinstructions,"a" + + .section .altinstructions, "a" .align 8 .quad memcpy .quad 1b .byte X86_FEATURE_REP_GOOD - /* Replace only beginning, memcpy is used to apply alternatives, so it - * is silly to overwrite itself with nops - reboot is only outcome... */ + + /* + * Replace only beginning, memcpy is used to apply alternatives, + * so it is silly to overwrite itself with nops - reboot is the + * only outcome... + */ .byte 2b - 1b .byte 2b - 1b .previous