From f4cb1cc18f364d761d5614eb6293cccc6647f259 Mon Sep 17 00:00:00 2001
From: Fenghua Yu
Date: Sat, 16 Nov 2013 12:37:01 -0800
Subject: [PATCH 1/4] x86-64, copy_user: Remove zero byte check before copy
 user buffer.

The rep movsb instruction handles a zero-byte copy on its own. As
pointed out by Linus, there is no need to check for a zero size in the
kernel. Removing this redundant check saves a few cycles in the copy
user functions.

Reported-by: Linus Torvalds
Signed-off-by: Fenghua Yu
Link: http://lkml.kernel.org/r/1384634221-6006-1-git-send-email-fenghua.yu@intel.com
Signed-off-by: H. Peter Anvin
---
 arch/x86/lib/copy_user_64.S | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index a30ca15be21c..ffe4eb9f09eb 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -236,8 +236,6 @@ ENDPROC(copy_user_generic_unrolled)
 ENTRY(copy_user_generic_string)
 	CFI_STARTPROC
 	ASM_STAC
-	andl %edx,%edx
-	jz 4f
 	cmpl $8,%edx
 	jb 2f		/* less than 8 bytes, go to byte copy loop */
 	ALIGN_DESTINATION
@@ -249,7 +247,7 @@ ENTRY(copy_user_generic_string)
 2:	movl %edx,%ecx
 3:	rep
 	movsb
-4:	xorl %eax,%eax
+	xorl %eax,%eax
 	ASM_CLAC
 	ret
 
@@ -279,12 +277,10 @@ ENDPROC(copy_user_generic_string)
 ENTRY(copy_user_enhanced_fast_string)
 	CFI_STARTPROC
 	ASM_STAC
-	andl %edx,%edx
-	jz 2f
 	movl %edx,%ecx
1:	rep
 	movsb
-2:	xorl %eax,%eax
+	xorl %eax,%eax
 	ASM_CLAC
 	ret
 

From 661c80192d21269c7fc566f1d547510b0c867677 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin"
Date: Wed, 20 Nov 2013 12:50:51 -0800
Subject: [PATCH 2/4] x86-64, copy_user: Use leal to produce 32-bit results

When we are using lea to produce a 32-bit result, we can use the leal
form rather than using leaq and worrying about truncation elsewhere.

Make the leal explicit, both to be more obvious and because that is
what gcc generates, so it is less likely to trigger obscure gas bugs.

Cc: Fenghua Yu
Cc: Linus Torvalds
Link: http://lkml.kernel.org/r/1384634221-6006-1-git-send-email-fenghua.yu@intel.com
Signed-off-by: H. Peter Anvin
---
 arch/x86/lib/copy_user_64.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index ffe4eb9f09eb..dee945d55594 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -186,7 +186,7 @@ ENTRY(copy_user_generic_unrolled)
 30:	shll $6,%ecx
 	addl %ecx,%edx
 	jmp 60f
-40:	lea (%rdx,%rcx,8),%rdx
+40:	leal (%rdx,%rcx,8),%edx
 	jmp 60f
 50:	movl %ecx,%edx
 60:	jmp copy_user_handle_tail /* ecx is zerorest also */
@@ -252,7 +252,7 @@ ENTRY(copy_user_generic_string)
 	ret
 
 	.section .fixup,"ax"
-11:	lea (%rdx,%rcx,8),%rcx
+11:	leal (%rdx,%rcx,8),%ecx
 12:	movl %ecx,%edx		/* ecx is zerorest also */
 	jmp copy_user_handle_tail
 	.previous
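As a quick user-space illustration of why the removed test was redundant
(a sketch in plain GNU C, not kernel code; the rep_movsb() wrapper name is
invented for the example): rep movsb copies exactly the byte count held in
%rcx, so a zero count executes no iterations at all.

	#include <stdio.h>

	/* Sketch only: rep movsb moves exactly %rcx bytes, so a zero
	   count is already a no-op and needs no separate check. */
	static void rep_movsb(void *dst, const void *src, unsigned long len)
	{
		asm volatile("rep movsb"
			     : "+D" (dst), "+S" (src), "+c" (len)
			     : : "memory");
	}

	int main(void)
	{
		char dst[8] = "AAAAAAA";

		rep_movsb(dst, "BBBBBBB", 0);	/* zero length: dst untouched */
		printf("%s\n", dst);		/* still prints AAAAAAA */
		return 0;
	}

With the checks gone, a zero-length copy simply falls through to the
xorl %eax,%eax / ret exit path in both functions.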
From c5fe5d80680e2949ffe102180f5fc6cefc0d145f Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Fri, 27 Dec 2013 15:30:58 -0800
Subject: [PATCH 3/4] x86: Replace assembly access_ok() with a C variant

It turns out that the assembly variant doesn't actually produce that
good code, presumably partly because it creates a long dependency
chain with no scheduling, and partly because we cannot get a flags
result out of gcc (which could be fixed with asm goto, but it turns
out not to be worth it).

The C code allows gcc to schedule and generate multiple (easily
predictable) branches, and as a side benefit we can really optimize
the case where the size is constant.

Link: http://lkml.kernel.org/r/CA%2B55aFzPBdbfKovMT8Edr4SmE2_=%2BOKJFac9XW2awegogTkVTA@mail.gmail.com
Signed-off-by: H. Peter Anvin
---
 arch/x86/include/asm/uaccess.h | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 8ec57c07b125..84ecf1df2ac6 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -40,22 +40,28 @@
 /*
  * Test whether a block of memory is a valid user space address.
  * Returns 0 if the range is valid, nonzero otherwise.
- *
- * This is equivalent to the following test:
- * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64)
- *
- * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
  */
+static inline int __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit)
+{
+	/*
+	 * If we have used "sizeof()" for the size,
+	 * we know it won't overflow the limit (but
+	 * it might overflow the 'addr', so it's
+	 * important to subtract the size from the
+	 * limit, not add it to the address).
+	 */
+	if (__builtin_constant_p(size))
+		return addr > limit - size;
+
+	/* Arbitrary sizes? Be careful about overflow */
+	addr += size;
+	return (addr < size) || (addr > limit);
+}
 #define __range_not_ok(addr, size, limit)				\
 ({									\
-	unsigned long flag, roksum;					\
 	__chk_user_ptr(addr);						\
-	asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0"		\
-	    : "=&r" (flag), "=r" (roksum)				\
-	    : "1" (addr), "g" ((long)(size)),				\
-	      "rm" (limit));						\
-	flag;								\
+	__chk_range_not_ok((unsigned long __force)(addr), size, limit); \
 })
 
 /**
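As a user-space illustration of the overflow handling in the new C check
(a sketch, not kernel code; range_not_ok() and the limit value here are
invented for the example): if addr + size wraps around, the wrapped sum is
necessarily smaller than size, so the (addr < size) test catches exactly
the overflow case, while the __builtin_constant_p() path subtracts from
the limit instead so that addr itself can never overflow.

	#include <assert.h>
	#include <stdbool.h>

	/* Illustrative re-implementation of the variable-size path. */
	static bool range_not_ok(unsigned long addr, unsigned long size,
				 unsigned long limit)
	{
		addr += size;			/* may wrap around zero */
		return (addr < size) ||		/* wrapped: always invalid */
		       (addr > limit);		/* ends past the limit */
	}

	int main(void)
	{
		unsigned long limit = 0x7ffffffff000UL;	/* made-up limit */

		assert(!range_not_ok(0x1000, 0x1000, limit));	/* fits */
		assert(range_not_ok(limit, 0x1000, limit));	/* too far */
		assert(range_not_ok(-1UL, 0x1000, limit));	/* wraps */
		return 0;
	}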
From a740576a4abf933de8f50787f24f24456cebd761 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin"
Date: Fri, 27 Dec 2013 16:52:47 -0800
Subject: [PATCH 4/4] x86: Slightly tweak the access_ok() C variant for
 better code

gcc can, under very specific circumstances, realize that the code
sequence:

	foo += bar;
	if (foo < bar) ...

... is equivalent to a carry out from the addition. Tweak the
implementation of access_ok() (specifically __chk_range_not_ok()) to
make it more likely that gcc will make that connection. It isn't
foolproof (sometimes gcc seems to think it can make better code with
lea, and ends up with a second comparison), but it does seem to make
the connection more frequently this way.

Cc: Linus Torvalds
Link: http://lkml.kernel.org/r/CA%2B55aFzPBdbfKovMT8Edr4SmE2_=%2BOKJFac9XW2awegogTkVTA@mail.gmail.com
Signed-off-by: H. Peter Anvin
---
 arch/x86/include/asm/uaccess.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 84ecf1df2ac6..6f1bb74d547b 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -41,7 +41,7 @@
  * Test whether a block of memory is a valid user space address.
  * Returns 0 if the range is valid, nonzero otherwise.
  */
-static inline int __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit)
+static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit)
 {
 	/*
 	 * If we have used "sizeof()" for the size,
@@ -55,7 +55,9 @@ static inline int __chk_range_not_ok(unsigned long addr, unsigned long size, uns
 		return addr > limit - size;
 
 	/* Arbitrary sizes? Be careful about overflow */
 	addr += size;
-	return (addr < size) || (addr > limit);
+	if (addr < size)
+		return true;
+	return addr > limit;
 }
 
 #define __range_not_ok(addr, size, limit)				\
@@ -84,7 +86,7 @@ static inline int __chk_range_not_ok(unsigned long addr, unsigned long size, uns
  * this function, memory access functions may still return -EFAULT.
  */
 #define access_ok(type, addr, size)					\
-	(likely(__range_not_ok(addr, size, user_addr_max()) == 0))
+	likely(!__range_not_ok(addr, size, user_addr_max()))
 
 /*
  * The exception table consists of pairs of addresses relative to the
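A user-space sketch of the carry-out idiom this tweak nudges gcc toward
(the add_overflows() helper is invented for the example): after a += b,
the test (a < b) is true precisely when the addition wrapped, so gcc can
emit an add followed by a flags test (setc/jc) instead of a separate
comparison.

	#include <stdbool.h>
	#include <stdio.h>

	/* True iff a + b carries out of an unsigned long. */
	static bool add_overflows(unsigned long a, unsigned long b)
	{
		a += b;
		return a < b;	/* wrapped sum is smaller than an addend */
	}

	int main(void)
	{
		printf("%d\n", add_overflows(~0UL, 1));	/* 1: wraps */
		printf("%d\n", add_overflows(1, 2));	/* 0: no wrap */
		return 0;
	}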