Merge branch 'x86-entry-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 vsyscall updates from Thomas Gleixner:

  "Further hardening of the legacy vsyscall by providing support for
   execute only mode and switching the default to it.

   This prevents a certain class of attacks which rely on the vsyscall
   page being accessible at a fixed address in the canonical kernel
   address space."

* 'x86-entry-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  - selftests/x86: Add a test for process_vm_readv() on the vsyscall page
  - x86/vsyscall: Add __ro_after_init to global variables
  - x86/vsyscall: Change the default vsyscall mode to xonly
  - selftests/x86/vsyscall: Verify that vsyscall=none blocks execution
  - x86/vsyscall: Document odd SIGSEGV error code for vsyscalls
  - x86/vsyscall: Show something useful on a read fault
  - x86/vsyscall: Add a new vsyscall=xonly mode
  - Documentation/admin: Remove the vsyscall=native documentation
This commit is contained in:
commit
0d37dde706
|
@ -5100,13 +5100,12 @@
|
|||
targets for exploits that can control RIP.
|
||||
|
||||
emulate [default] Vsyscalls turn into traps and are
|
||||
emulated reasonably safely.
|
||||
emulated reasonably safely. The vsyscall
|
||||
page is readable.
|
||||
|
||||
native Vsyscalls are native syscall instructions.
|
||||
This is a little bit faster than trapping
|
||||
and makes a few dynamic recompilers work
|
||||
better than they would in emulation mode.
|
||||
It also makes exploits much easier to write.
|
||||
xonly Vsyscalls turn into traps and are
|
||||
emulated reasonably safely. The vsyscall
|
||||
page is not readable.
|
||||
|
||||
none Vsyscalls don't work at all. This makes
|
||||
them quite hard to use for exploits but
|
||||
|
|
|
@ -2288,7 +2288,7 @@ config COMPAT_VDSO
|
|||
choice
|
||||
prompt "vsyscall table for legacy applications"
|
||||
depends on X86_64
|
||||
default LEGACY_VSYSCALL_EMULATE
|
||||
default LEGACY_VSYSCALL_XONLY
|
||||
help
|
||||
Legacy user code that does not know how to find the vDSO expects
|
||||
to be able to issue three syscalls by calling fixed addresses in
|
||||
|
@ -2296,23 +2296,38 @@ choice
|
|||
it can be used to assist security vulnerability exploitation.
|
||||
|
||||
This setting can be changed at boot time via the kernel command
|
||||
line parameter vsyscall=[emulate|none].
|
||||
line parameter vsyscall=[emulate|xonly|none].
|
||||
|
||||
On a system with recent enough glibc (2.14 or newer) and no
|
||||
static binaries, you can say None without a performance penalty
|
||||
to improve security.
|
||||
|
||||
If unsure, select "Emulate".
|
||||
If unsure, select "Emulate execution only".
|
||||
|
||||
config LEGACY_VSYSCALL_EMULATE
|
||||
bool "Emulate"
|
||||
bool "Full emulation"
|
||||
help
|
||||
The kernel traps and emulates calls into the fixed
|
||||
vsyscall address mapping. This makes the mapping
|
||||
non-executable, but it still contains known contents,
|
||||
which could be used in certain rare security vulnerability
|
||||
exploits. This configuration is recommended when userspace
|
||||
still uses the vsyscall area.
|
||||
The kernel traps and emulates calls into the fixed vsyscall
|
||||
address mapping. This makes the mapping non-executable, but
|
||||
it still contains readable known contents, which could be
|
||||
used in certain rare security vulnerability exploits. This
|
||||
configuration is recommended when using legacy userspace
|
||||
that still uses vsyscalls along with legacy binary
|
||||
instrumentation tools that require code to be readable.
|
||||
|
||||
An example of this type of legacy userspace is running
|
||||
Pin on an old binary that still uses vsyscalls.
|
||||
|
||||
config LEGACY_VSYSCALL_XONLY
|
||||
bool "Emulate execution only"
|
||||
help
|
||||
The kernel traps and emulates calls into the fixed vsyscall
|
||||
address mapping and does not allow reads. This
|
||||
configuration is recommended when userspace might use the
|
||||
legacy vsyscall area but support for legacy binary
|
||||
instrumentation of legacy code is not needed. It mitigates
|
||||
certain uses of the vsyscall area as an ASLR-bypassing
|
||||
buffer.
|
||||
|
||||
config LEGACY_VSYSCALL_NONE
|
||||
bool "None"
|
||||
|
|
|
@ -42,9 +42,11 @@
|
|||
#define CREATE_TRACE_POINTS
|
||||
#include "vsyscall_trace.h"
|
||||
|
||||
static enum { EMULATE, NONE } vsyscall_mode =
|
||||
static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
|
||||
#ifdef CONFIG_LEGACY_VSYSCALL_NONE
|
||||
NONE;
|
||||
#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
|
||||
XONLY;
|
||||
#else
|
||||
EMULATE;
|
||||
#endif
|
||||
|
@ -54,6 +56,8 @@ static int __init vsyscall_setup(char *str)
|
|||
if (str) {
|
||||
if (!strcmp("emulate", str))
|
||||
vsyscall_mode = EMULATE;
|
||||
else if (!strcmp("xonly", str))
|
||||
vsyscall_mode = XONLY;
|
||||
else if (!strcmp("none", str))
|
||||
vsyscall_mode = NONE;
|
||||
else
|
||||
|
@ -113,7 +117,8 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
|
|||
}
|
||||
}
|
||||
|
||||
bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
|
||||
bool emulate_vsyscall(unsigned long error_code,
|
||||
struct pt_regs *regs, unsigned long address)
|
||||
{
|
||||
struct task_struct *tsk;
|
||||
unsigned long caller;
|
||||
|
@ -122,6 +127,22 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
|
|||
long ret;
|
||||
unsigned long orig_dx;
|
||||
|
||||
/* Write faults or kernel-privilege faults never get fixed up. */
|
||||
if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
|
||||
return false;
|
||||
|
||||
if (!(error_code & X86_PF_INSTR)) {
|
||||
/* Failed vsyscall read */
|
||||
if (vsyscall_mode == EMULATE)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* User code tried and failed to read the vsyscall page.
|
||||
*/
|
||||
warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* No point in checking CS -- the only way to get here is a user mode
|
||||
* trap to a high address, which means that we're in 64-bit user code.
|
||||
|
@ -284,7 +305,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma)
|
|||
static const struct vm_operations_struct gate_vma_ops = {
|
||||
.name = gate_vma_name,
|
||||
};
|
||||
static struct vm_area_struct gate_vma = {
|
||||
static struct vm_area_struct gate_vma __ro_after_init = {
|
||||
.vm_start = VSYSCALL_ADDR,
|
||||
.vm_end = VSYSCALL_ADDR + PAGE_SIZE,
|
||||
.vm_page_prot = PAGE_READONLY_EXEC,
|
||||
|
@ -357,12 +378,20 @@ void __init map_vsyscall(void)
|
|||
extern char __vsyscall_page;
|
||||
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
|
||||
|
||||
if (vsyscall_mode != NONE) {
|
||||
/*
|
||||
* For full emulation, the page needs to exist for real. In
|
||||
* execute-only mode, there is no PTE at all backing the vsyscall
|
||||
* page.
|
||||
*/
|
||||
if (vsyscall_mode == EMULATE) {
|
||||
__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
|
||||
PAGE_KERNEL_VVAR);
|
||||
set_vsyscall_pgtable_user_bits(swapper_pg_dir);
|
||||
}
|
||||
|
||||
if (vsyscall_mode == XONLY)
|
||||
gate_vma.vm_flags = VM_EXEC;
|
||||
|
||||
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
|
||||
(unsigned long)VSYSCALL_ADDR);
|
||||
}
|
||||
|
|
|
@ -13,10 +13,12 @@ extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
|
|||
* Called on instruction fetch fault in vsyscall page.
|
||||
* Returns true if handled.
|
||||
*/
|
||||
extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);
|
||||
extern bool emulate_vsyscall(unsigned long error_code,
|
||||
struct pt_regs *regs, unsigned long address);
|
||||
#else
|
||||
static inline void map_vsyscall(void) {}
|
||||
static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
|
||||
static inline bool emulate_vsyscall(unsigned long error_code,
|
||||
struct pt_regs *regs, unsigned long address)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -710,6 +710,10 @@ static void set_signal_archinfo(unsigned long address,
|
|||
* To avoid leaking information about the kernel page
|
||||
* table layout, pretend that user-mode accesses to
|
||||
* kernel addresses are always protection faults.
|
||||
*
|
||||
* NB: This means that failed vsyscalls with vsyscall=none
|
||||
* will have the PROT bit. This doesn't leak any
|
||||
* information and does not appear to cause any problems.
|
||||
*/
|
||||
if (address >= TASK_SIZE_MAX)
|
||||
error_code |= X86_PF_PROT;
|
||||
|
@ -1369,16 +1373,18 @@ void do_user_addr_fault(struct pt_regs *regs,
|
|||
|
||||
#ifdef CONFIG_X86_64
|
||||
/*
|
||||
* Instruction fetch faults in the vsyscall page might need
|
||||
* emulation. The vsyscall page is at a high address
|
||||
* (>PAGE_OFFSET), but is considered to be part of the user
|
||||
* address space.
|
||||
* Faults in the vsyscall page might need emulation. The
|
||||
* vsyscall page is at a high address (>PAGE_OFFSET), but is
|
||||
* considered to be part of the user address space.
|
||||
*
|
||||
* The vsyscall page does not have a "real" VMA, so do this
|
||||
* emulation before we go searching for VMAs.
|
||||
*
|
||||
* PKRU never rejects instruction fetches, so we don't need
|
||||
* to consider the PF_PK bit.
|
||||
*/
|
||||
if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
|
||||
if (emulate_vsyscall(regs, address))
|
||||
if (is_vsyscall_vaddr(address)) {
|
||||
if (emulate_vsyscall(hw_error_code, regs, address))
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include <sched.h>
|
||||
#include <stdbool.h>
|
||||
#include <setjmp.h>
|
||||
#include <sys/uio.h>
|
||||
|
||||
#ifdef __x86_64__
|
||||
# define VSYS(x) (x)
|
||||
|
@ -49,21 +50,21 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
|
|||
}
|
||||
|
||||
/* vsyscalls and vDSO */
|
||||
bool should_read_vsyscall = false;
|
||||
bool vsyscall_map_r = false, vsyscall_map_x = false;
|
||||
|
||||
typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
|
||||
gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000);
|
||||
const gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000);
|
||||
gtod_t vdso_gtod;
|
||||
|
||||
typedef int (*vgettime_t)(clockid_t, struct timespec *);
|
||||
vgettime_t vdso_gettime;
|
||||
|
||||
typedef long (*time_func_t)(time_t *t);
|
||||
time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400);
|
||||
const time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400);
|
||||
time_func_t vdso_time;
|
||||
|
||||
typedef long (*getcpu_t)(unsigned *, unsigned *, void *);
|
||||
getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800);
|
||||
const getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800);
|
||||
getcpu_t vdso_getcpu;
|
||||
|
||||
static void init_vdso(void)
|
||||
|
@ -107,7 +108,7 @@ static int init_vsys(void)
|
|||
maps = fopen("/proc/self/maps", "r");
|
||||
if (!maps) {
|
||||
printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n");
|
||||
should_read_vsyscall = true;
|
||||
vsyscall_map_r = true;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -133,12 +134,8 @@ static int init_vsys(void)
|
|||
}
|
||||
|
||||
printf("\tvsyscall permissions are %c-%c\n", r, x);
|
||||
should_read_vsyscall = (r == 'r');
|
||||
if (x != 'x') {
|
||||
vgtod = NULL;
|
||||
vtime = NULL;
|
||||
vgetcpu = NULL;
|
||||
}
|
||||
vsyscall_map_r = (r == 'r');
|
||||
vsyscall_map_x = (x == 'x');
|
||||
|
||||
found = true;
|
||||
break;
|
||||
|
@ -148,10 +145,8 @@ static int init_vsys(void)
|
|||
|
||||
if (!found) {
|
||||
printf("\tno vsyscall map in /proc/self/maps\n");
|
||||
should_read_vsyscall = false;
|
||||
vgtod = NULL;
|
||||
vtime = NULL;
|
||||
vgetcpu = NULL;
|
||||
vsyscall_map_r = false;
|
||||
vsyscall_map_x = false;
|
||||
}
|
||||
|
||||
return nerrs;
|
||||
|
@ -183,9 +178,13 @@ static inline long sys_getcpu(unsigned * cpu, unsigned * node,
|
|||
}
|
||||
|
||||
static jmp_buf jmpbuf;
|
||||
static volatile unsigned long segv_err;
|
||||
|
||||
/*
 * SIGSEGV handler used by the fault-probing tests.
 *
 * Saves the REG_ERR value from the signal's machine context into segv_err
 * so the interrupted test can inspect it, then jumps back to the matching
 * sigsetjmp(jmpbuf, ...) call site with a return value of 1.
 */
static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
{
	ucontext_t *uc = ctx_void;

	segv_err = uc->uc_mcontext.gregs[REG_ERR];
	siglongjmp(jmpbuf, 1);
}
|
||||
|
||||
|
@ -238,7 +237,7 @@ static int test_gtod(void)
|
|||
err(1, "syscall gettimeofday");
|
||||
if (vdso_gtod)
|
||||
ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso);
|
||||
if (vgtod)
|
||||
if (vsyscall_map_x)
|
||||
ret_vsys = vgtod(&tv_vsys, &tz_vsys);
|
||||
if (sys_gtod(&tv_sys2, &tz_sys) != 0)
|
||||
err(1, "syscall gettimeofday");
|
||||
|
@ -252,7 +251,7 @@ static int test_gtod(void)
|
|||
}
|
||||
}
|
||||
|
||||
if (vgtod) {
|
||||
if (vsyscall_map_x) {
|
||||
if (ret_vsys == 0) {
|
||||
nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys);
|
||||
} else {
|
||||
|
@ -273,7 +272,7 @@ static int test_time(void) {
|
|||
t_sys1 = sys_time(&t2_sys1);
|
||||
if (vdso_time)
|
||||
t_vdso = vdso_time(&t2_vdso);
|
||||
if (vtime)
|
||||
if (vsyscall_map_x)
|
||||
t_vsys = vtime(&t2_vsys);
|
||||
t_sys2 = sys_time(&t2_sys2);
|
||||
if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) {
|
||||
|
@ -294,7 +293,7 @@ static int test_time(void) {
|
|||
}
|
||||
}
|
||||
|
||||
if (vtime) {
|
||||
if (vsyscall_map_x) {
|
||||
if (t_vsys < 0 || t_vsys != t2_vsys) {
|
||||
printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys);
|
||||
nerrs++;
|
||||
|
@ -330,7 +329,7 @@ static int test_getcpu(int cpu)
|
|||
ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0);
|
||||
if (vdso_getcpu)
|
||||
ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0);
|
||||
if (vgetcpu)
|
||||
if (vsyscall_map_x)
|
||||
ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0);
|
||||
|
||||
if (ret_sys == 0) {
|
||||
|
@ -369,7 +368,7 @@ static int test_getcpu(int cpu)
|
|||
}
|
||||
}
|
||||
|
||||
if (vgetcpu) {
|
||||
if (vsyscall_map_x) {
|
||||
if (ret_vsys) {
|
||||
printf("[FAIL]\tvsyscall getcpu() failed\n");
|
||||
nerrs++;
|
||||
|
@ -410,20 +409,88 @@ static int test_vsys_r(void)
|
|||
can_read = false;
|
||||
}
|
||||
|
||||
if (can_read && !should_read_vsyscall) {
|
||||
if (can_read && !vsyscall_map_r) {
|
||||
printf("[FAIL]\tWe have read access, but we shouldn't\n");
|
||||
return 1;
|
||||
} else if (!can_read && should_read_vsyscall) {
|
||||
} else if (!can_read && vsyscall_map_r) {
|
||||
printf("[FAIL]\tWe don't have read access, but we should\n");
|
||||
return 1;
|
||||
} else if (can_read) {
|
||||
printf("[OK]\tWe have read access\n");
|
||||
} else {
|
||||
printf("[OK]\tgot expected result\n");
|
||||
printf("[OK]\tWe do not have read access: #PF(0x%lx)\n",
|
||||
segv_err);
|
||||
}
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int test_vsys_x(void)
|
||||
{
|
||||
#ifdef __x86_64__
|
||||
if (vsyscall_map_x) {
|
||||
/* We already tested this adequately. */
|
||||
return 0;
|
||||
}
|
||||
|
||||
printf("[RUN]\tMake sure that vsyscalls really page fault\n");
|
||||
|
||||
bool can_exec;
|
||||
if (sigsetjmp(jmpbuf, 1) == 0) {
|
||||
vgtod(NULL, NULL);
|
||||
can_exec = true;
|
||||
} else {
|
||||
can_exec = false;
|
||||
}
|
||||
|
||||
if (can_exec) {
|
||||
printf("[FAIL]\tExecuting the vsyscall did not page fault\n");
|
||||
return 1;
|
||||
} else if (segv_err & (1 << 4)) { /* INSTR */
|
||||
printf("[OK]\tExecuting the vsyscall page failed: #PF(0x%lx)\n",
|
||||
segv_err);
|
||||
} else {
|
||||
printf("[FAILT]\tExecution failed with the wrong error: #PF(0x%lx)\n",
|
||||
segv_err);
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int test_process_vm_readv(void)
|
||||
{
|
||||
#ifdef __x86_64__
|
||||
char buf[4096];
|
||||
struct iovec local, remote;
|
||||
int ret;
|
||||
|
||||
printf("[RUN]\tprocess_vm_readv() from vsyscall page\n");
|
||||
|
||||
local.iov_base = buf;
|
||||
local.iov_len = 4096;
|
||||
remote.iov_base = (void *)0xffffffffff600000;
|
||||
remote.iov_len = 4096;
|
||||
ret = process_vm_readv(getpid(), &local, 1, &remote, 1, 0);
|
||||
if (ret != 4096) {
|
||||
printf("[OK]\tprocess_vm_readv() failed (ret = %d, errno = %d)\n", ret, errno);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (vsyscall_map_r) {
|
||||
if (!memcmp(buf, (const void *)0xffffffffff600000, 4096)) {
|
||||
printf("[OK]\tIt worked and read correct data\n");
|
||||
} else {
|
||||
printf("[FAIL]\tIt worked but returned incorrect data\n");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
#define X86_EFLAGS_TF (1UL << 8)
|
||||
|
@ -455,7 +522,7 @@ static int test_emulation(void)
|
|||
time_t tmp;
|
||||
bool is_native;
|
||||
|
||||
if (!vtime)
|
||||
if (!vsyscall_map_x)
|
||||
return 0;
|
||||
|
||||
printf("[RUN]\tchecking that vsyscalls are emulated\n");
|
||||
|
@ -497,6 +564,9 @@ int main(int argc, char **argv)
|
|||
|
||||
sethandler(SIGSEGV, sigsegv, 0);
|
||||
nerrs += test_vsys_r();
|
||||
nerrs += test_vsys_x();
|
||||
|
||||
nerrs += test_process_vm_readv();
|
||||
|
||||
#ifdef __x86_64__
|
||||
nerrs += test_emulation();
|
||||
|
|
Loading…
Reference in New Issue