2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Access to the shared data page by the vDSO & syscall map
|
|
|
|
*
|
|
|
|
* Copyright (C) 2004 Benjamin Herrenschmidt (benh@kernel.crashing.org), IBM Corp.
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <asm/processor.h>
|
|
|
|
#include <asm/ppc_asm.h>
|
2005-09-10 02:57:26 +08:00
|
|
|
#include <asm/asm-offsets.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <asm/unistd.h>
|
|
|
|
#include <asm/vdso.h>
|
|
|
|
|
|
|
|
.text
|
powerpc/vdso: Avoid link stack corruption in __get_datapage()
powerpc has a link register (lr) used for calling functions. We "bl
<func>" to call a function, and "blr" to return back to the call site.
The lr is only a single register, so if we call another function from
inside this function (ie. nested calls), software must save away the
lr on the software stack before calling the new function. Before
returning (ie. before the "blr"), the lr is restored by software from
the software stack.
This makes branch prediction quite difficult for the processor as it
will only know the branch target just before the "blr".
To help with this, modern powerpc processors keep a (non-architected)
hardware stack of lr called a "link stack". When a "bl <func>" is
run, the lr is pushed onto this stack. When a "blr" is called, the
branch predictor pops the lr value from the top of the link stack, and
uses it to predict the branch target. Hence the processor pipeline
knows a lot earlier the branch target.
This works great but there are some cases where you call "bl" but
without a matching "blr". One such case is when trying to determine
the program counter (which can't be read directly). Here you "bl+4;
mflr" to get the program counter. If you do this, the link stack will
get out of sync with reality, causing the branch predictor to
mis-predict subsequent function returns.
To avoid this, modern micro-architectures have a special case of bl.
Using the form "bcl 20,31,+4", ensures the processor doesn't push to
the link stack.
The 32 and 64 bit variants of __get_datapage() use a "bl; mflr" to
determine the loaded address of the VDSO. The current versions of
these attempt to use this special bl variant.
Unfortunately they use +8 rather than the required +4. Hence the
current code results in the link stack getting out of sync with
reality and hence the resulting performance degradation.
This patch moves it to bcl+4 by moving __kernel_datapage_offset out of
__get_datapage().
With this patch, running a gettimeofday() (which uses
__get_datapage()) microbenchmark we get a decent bump in performance
on POWER7/8.
For the benchmark in tools/testing/selftests/powerpc/benchmarks/gettimeofday.c
POWER8:
64bit gets ~4% improvement
32bit gets ~9% improvement
POWER7:
64bit gets ~7% improvement
Signed-off-by: Michael Neuling <mikey@neuling.org>
Reported-by: Aaron Sawdey <sawdey@us.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2015-09-25 12:01:40 +08:00
|
|
|
/*
 * 32-bit word read by __get_datapage, which adds the value stored here
 * to this word's own run-time address.  It assembles as zero;
 * NOTE(review): presumably patched with the real offset when the vDSO
 * image is set up -- confirm against the code that installs the vDSO.
 * It is kept next to the code so __get_datapage can reach it with a
 * small PC-relative displacement.
 */
	.globl	__kernel_datapage_offset
__kernel_datapage_offset:
	.long	0
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * __get_datapage
 *
 * Out:   r3 = run-time address of __kernel_datapage_offset plus the
 *             32-bit value stored in that word
 * Clobb: r0 (lr is saved in r0 and restored before returning)
 */
V_FUNCTION_BEGIN(__get_datapage)
  .cfi_startproc
	/*
	 * Keep this symbol unexported and non-overridable: other objects
	 * must be able to reach it with a plain "bl".
	 */
	.protected __get_datapage
	.hidden __get_datapage

	mflr	r0			/* keep the caller's return address */
  .cfi_register lr,r0
	/*
	 * Find our own address: branch-and-link to the very next
	 * instruction.  The "bcl 20,31" form with a +4 target is used so
	 * that the processor does not push onto its link stack.
	 */
	bcl	20,31,data_page_branch
data_page_branch:
	mflr	r3			/* r3 = address of data_page_branch */
	mtlr	r0			/* put the caller's lr back */
	addi	r3, r3, __kernel_datapage_offset-data_page_branch
					/* r3 = &__kernel_datapage_offset */
	lwz	r0,0(r3)		/* r0 = stored offset */
	add	r3,r3,r0		/* r3 = &__kernel_datapage_offset + offset */
	blr
  .cfi_endproc
V_FUNCTION_END(__get_datapage)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
|
|
|
|
*
|
|
|
|
* returns a pointer to the syscall map. the map is agnostic to the
|
|
|
|
* size of "long", unlike kernel bitops, it stores bits from top to
|
|
|
|
* bottom so that memory actually contains a linear bitmap
|
|
|
|
* check for syscall N by testing bit (0x80000000 >> (N & 0x1f)) of
|
|
|
|
* 32 bits int at N >> 5.
|
|
|
|
*/
|
|
|
|
/*
 * In:    r3 = syscall_count pointer (may be NULL)
 * Out:   r3 = value returned by __get_datapage + CFG_SYSCALL_MAP32
 *        *syscall_count = __NR_syscalls when the pointer is non-NULL
 *        cr0.so cleared on every return path
 * Clobb: r0, r4, r12, cr0
 */
V_FUNCTION_BEGIN(__kernel_get_syscall_map)
  .cfi_startproc
	mflr	r12			/* the bl below overwrites lr */
  .cfi_register lr,r12
	mr	r4,r3			/* r3 is overwritten by __get_datapage */
	bl	__get_datapage@local
	mtlr	r12
	addi	r3,r3,CFG_SYSCALL_MAP32
	cmpli	cr0,r4,0		/* NULL syscall_count? */
	/*
	 * The compare copies XER[SO] into cr0.so, so clear it before the
	 * early return as well; previously only the non-NULL path did.
	 * crclr leaves cr0.eq untouched, so beqlr still sees the compare.
	 */
	crclr	cr0*4+so
	beqlr				/* nothing to store, return the map */
	li	r0,__NR_syscalls
	stw	r0,0(r4)
	blr
  .cfi_endproc
V_FUNCTION_END(__kernel_get_syscall_map)
|
2005-11-11 18:15:21 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* unsigned long long __kernel_get_tbfreq(void);
|
|
|
|
*
|
|
|
|
* returns the timebase frequency in HZ
|
|
|
|
*/
|
|
|
|
/*
 * Out:   r3 = word at CFG_TB_TICKS_PER_SEC, r4 = word at
 *        CFG_TB_TICKS_PER_SEC + 4, both relative to the address
 *        returned by __get_datapage
 *        cr0.so cleared
 * Clobb: r0 (via __get_datapage), r12
 */
V_FUNCTION_BEGIN(__kernel_get_tbfreq)
  .cfi_startproc
	mflr	r12			/* the bl below overwrites lr */
  .cfi_register lr,r12
	bl	__get_datapage@local
	mtlr	r12
	/* r3 is the base register, so it must be loaded last */
	lwz	r4,(CFG_TB_TICKS_PER_SEC + 4)(r3)
	lwz	r3,CFG_TB_TICKS_PER_SEC(r3)
	crclr	cr0*4+so
	blr
  .cfi_endproc
V_FUNCTION_END(__kernel_get_tbfreq)
|