mirror of https://gitee.com/openkylin/linux.git
syscalls/x86: Use 'struct pt_regs' based syscall calling convention for 64-bit syscalls
Let's make use of ARCH_HAS_SYSCALL_WRAPPER=y on pure 64-bit x86-64 systems: Each syscall defines a stub which takes struct pt_regs as its only argument. It decodes just those parameters it needs, e.g: asmlinkage long sys_xyzzy(const struct pt_regs *regs) { return SyS_xyzzy(regs->di, regs->si, regs->dx); } This approach avoids leaking random user-provided register content down the call chain. For example, for sys_recv() which is a 4-parameter syscall, the assembly now is (in slightly reordered fashion): <sys_recv>: callq <__fentry__> /* decode regs->di, ->si, ->dx and ->r10 */ mov 0x70(%rdi),%rdi mov 0x68(%rdi),%rsi mov 0x60(%rdi),%rdx mov 0x38(%rdi),%rcx [ SyS_recv() is automatically inlined by the compiler, as it is not [yet] used anywhere else ] /* clear %r9 and %r8, the 5th and 6th args */ xor %r9d,%r9d xor %r8d,%r8d /* do the actual work */ callq __sys_recvfrom /* cleanup and return */ cltq retq The only valid place in an x86-64 kernel which rightfully calls a syscall function on its own -- vsyscall -- needs to be modified to pass struct pt_regs onwards as well. To keep the syscall table generation working independent of SYSCALL_PTREGS being enabled, the stubs are named the same as the "original" syscall stubs, i.e. sys_*(). This patch is based on an original proof-of-concept | From: Linus Torvalds <torvalds@linux-foundation.org> | Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> and was split up and heavily modified by me, in particular to base it on ARCH_HAS_SYSCALL_WRAPPER, to limit it to 64-bit-only for the time being, and to update the vsyscall to the new calling convention. Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. 
Peter Anvin <hpa@zytor.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/20180405095307.3730-4-linux@dominikbrodowski.net Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
parent
1bd21c6c21
commit
fa697140f9
|
@ -2954,3 +2954,8 @@ source "crypto/Kconfig"
|
|||
source "arch/x86/kvm/Kconfig"
|
||||
|
||||
source "lib/Kconfig"
|
||||
|
||||
config SYSCALL_PTREGS
|
||||
def_bool y
|
||||
depends on X86_64 && !COMPAT
|
||||
select ARCH_HAS_SYSCALL_WRAPPER
|
||||
|
|
|
@ -284,9 +284,13 @@ __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
|
|||
nr &= __SYSCALL_MASK;
|
||||
if (likely(nr < NR_syscalls)) {
|
||||
nr = array_index_nospec(nr, NR_syscalls);
|
||||
#ifdef CONFIG_SYSCALL_PTREGS
|
||||
regs->ax = sys_call_table[nr](regs);
|
||||
#else
|
||||
regs->ax = sys_call_table[nr](
|
||||
regs->di, regs->si, regs->dx,
|
||||
regs->r10, regs->r8, regs->r9);
|
||||
#endif
|
||||
}
|
||||
|
||||
syscall_return_slowpath(regs);
|
||||
|
|
|
@ -7,14 +7,19 @@
|
|||
#include <asm/asm-offsets.h>
|
||||
#include <asm/syscall.h>
|
||||
|
||||
#ifdef CONFIG_SYSCALL_PTREGS
|
||||
/* this is a lie, but it does not hurt as sys_ni_syscall just returns -ENOSYS */
|
||||
extern asmlinkage long sys_ni_syscall(const struct pt_regs *);
|
||||
#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *);
|
||||
#else /* CONFIG_SYSCALL_PTREGS */
|
||||
extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
|
||||
#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
|
||||
#endif /* CONFIG_SYSCALL_PTREGS */
|
||||
#include <asm/syscalls_64.h>
|
||||
#undef __SYSCALL_64
|
||||
|
||||
#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
|
||||
|
||||
extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
|
||||
|
||||
asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
|
||||
/*
|
||||
* Smells like a compiler bug -- it doesn't work
|
||||
|
|
|
@ -127,6 +127,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
|
|||
int vsyscall_nr, syscall_nr, tmp;
|
||||
int prev_sig_on_uaccess_err;
|
||||
long ret;
|
||||
#ifdef CONFIG_SYSCALL_PTREGS
|
||||
unsigned long orig_dx;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* No point in checking CS -- the only way to get here is a user mode
|
||||
|
@ -227,19 +230,38 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
|
|||
ret = -EFAULT;
|
||||
switch (vsyscall_nr) {
|
||||
case 0:
|
||||
#ifdef CONFIG_SYSCALL_PTREGS
|
||||
/* this decodes regs->di and regs->si on its own */
|
||||
ret = sys_gettimeofday(regs);
|
||||
#else
|
||||
ret = sys_gettimeofday(
|
||||
(struct timeval __user *)regs->di,
|
||||
(struct timezone __user *)regs->si);
|
||||
#endif /* CONFIG_SYSCALL_PTREGS */
|
||||
break;
|
||||
|
||||
case 1:
|
||||
#ifdef CONFIG_SYSCALL_PTREGS
|
||||
/* this decodes regs->di on its own */
|
||||
ret = sys_time(regs);
|
||||
#else
|
||||
ret = sys_time((time_t __user *)regs->di);
|
||||
#endif /* CONFIG_SYSCALL_PTREGS */
|
||||
break;
|
||||
|
||||
case 2:
|
||||
#ifdef CONFIG_SYSCALL_PTREGS
|
||||
/* while we could clobber regs->dx, we didn't in the past... */
|
||||
orig_dx = regs->dx;
|
||||
regs->dx = 0;
|
||||
/* this decodes regs->di, regs->si and regs->dx on its own */
|
||||
ret = sys_getcpu(regs);
|
||||
regs->dx = orig_dx;
|
||||
#else
|
||||
ret = sys_getcpu((unsigned __user *)regs->di,
|
||||
(unsigned __user *)regs->si,
|
||||
NULL);
|
||||
#endif /* CONFIG_SYSCALL_PTREGS */
|
||||
break;
|
||||
}
|
||||
|
||||
|
|
|
@ -20,9 +20,13 @@
|
|||
#include <asm/thread_info.h> /* for TS_COMPAT */
|
||||
#include <asm/unistd.h>
|
||||
|
||||
#ifdef CONFIG_SYSCALL_PTREGS
|
||||
typedef asmlinkage long (*sys_call_ptr_t)(const struct pt_regs *);
|
||||
#else
|
||||
typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long,
|
||||
unsigned long, unsigned long,
|
||||
unsigned long, unsigned long);
|
||||
#endif /* CONFIG_SYSCALL_PTREGS */
|
||||
extern const sys_call_ptr_t sys_call_table[];
|
||||
|
||||
#if defined(CONFIG_X86_32)
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* syscall_wrapper.h - x86 specific wrappers to syscall definitions
|
||||
*/
|
||||
|
||||
#ifndef _ASM_X86_SYSCALL_WRAPPER_H
|
||||
#define _ASM_X86_SYSCALL_WRAPPER_H
|
||||
|
||||
/*
|
||||
* Instead of the generic __SYSCALL_DEFINEx() definition, this macro takes
|
||||
* struct pt_regs *regs as the only argument of the syscall stub named
|
||||
* sys_*(). It decodes just the registers it needs and passes them on to
|
||||
* the SyS_*() wrapper and then to the SYSC_*() function doing the actual job.
|
||||
* These wrappers and functions are inlined, meaning that the assembly looks
|
||||
* as follows (slightly re-ordered):
|
||||
*
|
||||
* <sys_recv>: <-- syscall with 4 parameters
|
||||
* callq <__fentry__>
|
||||
*
|
||||
* mov 0x70(%rdi),%rdi <-- decode regs->di
|
||||
* mov 0x68(%rdi),%rsi <-- decode regs->si
|
||||
* mov 0x60(%rdi),%rdx <-- decode regs->dx
|
||||
* mov 0x38(%rdi),%rcx <-- decode regs->r10
|
||||
*
|
||||
* xor %r9d,%r9d <-- clear %r9
|
||||
* xor %r8d,%r8d <-- clear %r8
|
||||
*
|
||||
* callq __sys_recvfrom <-- do the actual work in __sys_recvfrom()
|
||||
* which takes 6 arguments
|
||||
*
|
||||
* cltq <-- extend return value to 64-bit
|
||||
* retq <-- return
|
||||
*
|
||||
* This approach avoids leaking random user-provided register content down
|
||||
* the call chain.
|
||||
*
|
||||
* As the generic SYSCALL_DEFINE0() macro does not decode any parameters for
|
||||
* obvious reasons, and passing struct pt_regs *regs to it in %rdi does not
|
||||
* hurt, there is no need to override it.
|
||||
*/
|
||||
#define __SYSCALL_DEFINEx(x, name, ...) \
|
||||
asmlinkage long sys##name(const struct pt_regs *regs); \
|
||||
ALLOW_ERROR_INJECTION(sys##name, ERRNO); \
|
||||
static long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
|
||||
static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
|
||||
asmlinkage long sys##name(const struct pt_regs *regs) \
|
||||
{ \
|
||||
return SyS##name(__MAP(x,__SC_ARGS \
|
||||
,,regs->di,,regs->si,,regs->dx \
|
||||
,,regs->r10,,regs->r8,,regs->r9)); \
|
||||
} \
|
||||
static long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
|
||||
{ \
|
||||
long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
|
||||
__MAP(x,__SC_TEST,__VA_ARGS__); \
|
||||
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
|
||||
return ret; \
|
||||
} \
|
||||
static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
|
||||
|
||||
/*
|
||||
* For VSYSCALLS, we need to declare these three syscalls with the new
|
||||
* pt_regs-based calling convention for in-kernel use.
|
||||
*/
|
||||
struct pt_regs;
|
||||
asmlinkage long sys_getcpu(const struct pt_regs *regs); /* di,si,dx */
|
||||
asmlinkage long sys_gettimeofday(const struct pt_regs *regs); /* di,si */
|
||||
asmlinkage long sys_time(const struct pt_regs *regs); /* di */
|
||||
|
||||
#endif /* _ASM_X86_SYSCALL_WRAPPER_H */
|
|
@ -18,6 +18,12 @@
|
|||
/* Common in X86_32 and X86_64 */
|
||||
/* kernel/ioport.c */
|
||||
long ksys_ioperm(unsigned long from, unsigned long num, int turn_on);
|
||||
|
||||
#ifndef CONFIG_SYSCALL_PTREGS
|
||||
/*
|
||||
* If CONFIG_SYSCALL_PTREGS is enabled, a different syscall calling convention
|
||||
* is used. The prototypes below are invalid in that case, so do not declare them.
|
||||
*/
|
||||
asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
|
||||
asmlinkage long sys_iopl(unsigned int);
|
||||
|
||||
|
@ -53,4 +59,5 @@ asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
|
|||
unsigned long, unsigned long, unsigned long);
|
||||
|
||||
#endif /* CONFIG_X86_32 */
|
||||
#endif /* CONFIG_SYSCALL_PTREGS */
|
||||
#endif /* _ASM_X86_SYSCALLS_H */
|
||||
|
|
|
@ -102,7 +102,7 @@ union bpf_attr;
|
|||
* for SYSCALL_DEFINE<n>/COMPAT_SYSCALL_DEFINE<n>
|
||||
*/
|
||||
#define __MAP0(m,...)
|
||||
#define __MAP1(m,t,a) m(t,a)
|
||||
#define __MAP1(m,t,a,...) m(t,a)
|
||||
#define __MAP2(m,t,a,...) m(t,a), __MAP1(m,__VA_ARGS__)
|
||||
#define __MAP3(m,t,a,...) m(t,a), __MAP2(m,__VA_ARGS__)
|
||||
#define __MAP4(m,t,a,...) m(t,a), __MAP3(m,__VA_ARGS__)
|
||||
|
|
Loading…
Reference in New Issue