x86/asm: Optimize clear_page()
Currently, we CALL clear_page() which then JMPs to the proper function chosen by the alternatives. What we should do instead is CALL the proper function directly. (This was something Ingo suggested a while ago). So let's do that. Measuring our favourite kernel build workload shows that there are no significant changes in performance. AMD === -- /tmp/before 2017-02-09 18:01:46.451961188 +0100 ++ /tmp/after 2017-02-09 18:01:54.883961175 +0100 @@ -1,15 +1,15 @@ Performance counter stats for 'system wide' (5 runs): - 1028960.373643 cpu-clock (msec) # 6.000 CPUs utilized ( +- 1.41% ) + 1023086.018961 cpu-clock (msec) # 6.000 CPUs utilized ( +- 1.20% ) - 518,744 context-switches # 0.504 K/sec ( +- 1.04% ) + 518,254 context-switches # 0.507 K/sec ( +- 1.01% ) - 38,112 cpu-migrations # 0.037 K/sec ( +- 1.95% ) + 37,917 cpu-migrations # 0.037 K/sec ( +- 1.02% ) - 20,874,266 page-faults # 0.020 M/sec ( +- 0.07% ) + 20,918,897 page-faults # 0.020 M/sec ( +- 0.18% ) - 2,043,646,230,667 cycles # 1.986 GHz ( +- 0.14% ) (66.67%) + 2,045,305,584,032 cycles # 1.999 GHz ( +- 0.16% ) (66.67%) - 553,698,855,431 stalled-cycles-frontend # 27.09% frontend cycles idle ( +- 0.07% ) (66.67%) + 555,099,401,413 stalled-cycles-frontend # 27.14% frontend cycles idle ( +- 0.13% ) (66.67%) - 621,544,286,390 stalled-cycles-backend # 30.41% backend cycles idle ( +- 0.39% ) (66.67%) + 621,371,430,254 stalled-cycles-backend # 30.38% backend cycles idle ( +- 0.32% ) (66.67%) - 1,738,364,431,659 instructions # 0.85 insn per cycle + 1,739,895,771,901 instructions # 0.85 insn per cycle - # 0.36 stalled cycles per insn ( +- 0.11% ) (66.67%) + # 0.36 stalled cycles per insn ( +- 0.13% ) (66.67%) - 391,170,943,850 branches # 380.161 M/sec ( +- 0.13% ) (66.67%) + 391,398,551,757 branches # 382.567 M/sec ( +- 0.13% ) (66.67%) - 22,567,810,411 branch-misses # 5.77% of all branches ( +- 0.11% ) (66.67%) + 22,574,726,683 branch-misses # 5.77% of all branches ( +- 0.13% ) (66.67%) - 171.480741921 seconds time elapsed ( +- 1.41% ) + 170.509229451 seconds time elapsed ( +- 1.20% ) Intel ===== -- /tmp/before 2017-02-09 20:36:19.851947473 +0100 ++ /tmp/after 2017-02-09 20:36:30.151947458 +0100 @@ -1,15 +1,15 @@ Performance counter stats for 'system wide' (5 runs): - 2207248.598126 cpu-clock (msec) # 8.000 CPUs utilized ( +- 0.69% ) + 2213300.106631 cpu-clock (msec) # 8.000 CPUs utilized ( +- 0.73% ) - 899,342 context-switches # 0.407 K/sec ( +- 0.68% ) + 898,381 context-switches # 0.406 K/sec ( +- 0.79% ) - 80,553 cpu-migrations # 0.036 K/sec ( +- 1.13% ) + 80,979 cpu-migrations # 0.037 K/sec ( +- 1.11% ) - 36,171,148 page-faults # 0.016 M/sec ( +- 0.02% ) + 36,179,791 page-faults # 0.016 M/sec ( +- 0.02% ) - 6,665,288,826,484 cycles # 3.020 GHz ( +- 0.07% ) (83.33%) + 6,671,638,410,799 cycles # 3.014 GHz ( +- 0.06% ) (83.33%) - 5,065,975,115,197 stalled-cycles-frontend # 76.01% frontend cycles idle ( +- 0.11% ) (83.33%) + 5,076,835,183,223 stalled-cycles-frontend # 76.10% frontend cycles idle ( +- 0.11% ) (83.33%) - 3,841,556,350,614 stalled-cycles-backend # 57.64% backend cycles idle ( +- 0.13% ) (66.67%) + 3,852,823,974,333 stalled-cycles-backend # 57.75% backend cycles idle ( +- 0.12% ) (66.67%) - 4,148,398,171,079 instructions # 0.62 insn per cycle + 4,148,997,156,059 instructions # 0.62 insn per cycle - # 1.22 stalled cycles per insn ( +- 0.10% ) (83.33%) + # 1.22 stalled cycles per insn ( +- 0.11% ) (83.33%) - 887,187,118,591 branches # 401.943 M/sec ( +- 0.09% ) (83.33%) + 887,271,341,121 branches # 400.882 M/sec ( +- 0.11% ) (83.33%) - 30,139,439,034 branch-misses # 3.40% of all branches ( +- 0.09% ) (83.33%) + 30,134,864,997 branch-misses # 3.40% of all branches ( +- 0.06% ) (83.33%) - 275.904405540 seconds time elapsed ( +- 0.69% ) + 276.660352016 seconds time elapsed ( +- 0.73% ) allmodconfig vmlinux size grows by a ~1Kb but that's fine - we optimize our calling of the clear_page variants. text data bss dec hex filename 9051979 23067670 27009024 59128673 3863b61 vmlinux 9053000 23067670 27009024 59129694 3863f5e vmlinux.clear_page Reported-by: kernel test robot <fengguang.wu@intel.com> Tested-by: Fengguang Wu <fengguang.wu@intel.com> Signed-off-by: Borislav Petkov <bp@suse.de> Cc: Andy Lutomirski <luto@kernel.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/20170215111927.emdgxf2pide3kwro@pd.tnic Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
parent
2140a9942b
commit
f25d384755
|
@ -4,6 +4,7 @@
|
||||||
#include <asm/page_64_types.h>
|
#include <asm/page_64_types.h>
|
||||||
|
|
||||||
#ifndef __ASSEMBLY__
|
#ifndef __ASSEMBLY__
|
||||||
|
#include <asm/alternative.h>
|
||||||
|
|
||||||
/* duplicated to the one in bootmem.h */
|
/* duplicated to the one in bootmem.h */
|
||||||
extern unsigned long max_pfn;
|
extern unsigned long max_pfn;
|
||||||
|
@ -34,7 +35,20 @@ extern unsigned long __phys_addr_symbol(unsigned long);
|
||||||
#define pfn_valid(pfn) ((pfn) < max_pfn)
|
#define pfn_valid(pfn) ((pfn) < max_pfn)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void clear_page(void *page);
|
void clear_page_orig(void *page);
|
||||||
|
void clear_page_rep(void *page);
|
||||||
|
void clear_page_erms(void *page);
|
||||||
|
|
||||||
|
static inline void clear_page(void *page)
|
||||||
|
{
|
||||||
|
alternative_call_2(clear_page_orig,
|
||||||
|
clear_page_rep, X86_FEATURE_REP_GOOD,
|
||||||
|
clear_page_erms, X86_FEATURE_ERMS,
|
||||||
|
"=D" (page),
|
||||||
|
"0" (page)
|
||||||
|
: "memory", "rax", "rcx");
|
||||||
|
}
|
||||||
|
|
||||||
void copy_page(void *to, void *from);
|
void copy_page(void *to, void *from);
|
||||||
|
|
||||||
#endif /* !__ASSEMBLY__ */
|
#endif /* !__ASSEMBLY__ */
|
||||||
|
|
|
@ -14,20 +14,15 @@
|
||||||
* Zero a page.
|
* Zero a page.
|
||||||
* %rdi - page
|
* %rdi - page
|
||||||
*/
|
*/
|
||||||
ENTRY(clear_page)
|
ENTRY(clear_page_rep)
|
||||||
|
|
||||||
ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
|
|
||||||
"jmp clear_page_c_e", X86_FEATURE_ERMS
|
|
||||||
|
|
||||||
movl $4096/8,%ecx
|
movl $4096/8,%ecx
|
||||||
xorl %eax,%eax
|
xorl %eax,%eax
|
||||||
rep stosq
|
rep stosq
|
||||||
ret
|
ret
|
||||||
ENDPROC(clear_page)
|
ENDPROC(clear_page_rep)
|
||||||
EXPORT_SYMBOL(clear_page)
|
EXPORT_SYMBOL_GPL(clear_page_rep)
|
||||||
|
|
||||||
ENTRY(clear_page_orig)
|
ENTRY(clear_page_orig)
|
||||||
|
|
||||||
xorl %eax,%eax
|
xorl %eax,%eax
|
||||||
movl $4096/64,%ecx
|
movl $4096/64,%ecx
|
||||||
.p2align 4
|
.p2align 4
|
||||||
|
@ -47,10 +42,12 @@ ENTRY(clear_page_orig)
|
||||||
nop
|
nop
|
||||||
ret
|
ret
|
||||||
ENDPROC(clear_page_orig)
|
ENDPROC(clear_page_orig)
|
||||||
|
EXPORT_SYMBOL_GPL(clear_page_orig)
|
||||||
|
|
||||||
ENTRY(clear_page_c_e)
|
ENTRY(clear_page_erms)
|
||||||
movl $4096,%ecx
|
movl $4096,%ecx
|
||||||
xorl %eax,%eax
|
xorl %eax,%eax
|
||||||
rep stosb
|
rep stosb
|
||||||
ret
|
ret
|
||||||
ENDPROC(clear_page_c_e)
|
ENDPROC(clear_page_erms)
|
||||||
|
EXPORT_SYMBOL_GPL(clear_page_erms)
|
||||||
|
|
Loading…
Reference in New Issue