x86/entry: Add STACKLEAK erasing the kernel stack at the end of syscalls
The STACKLEAK feature (initially developed by PaX Team) has the following benefits: 1. Reduces the information that can be revealed through kernel stack leak bugs. The idea of erasing the thread stack at the end of syscalls is similar to CONFIG_PAGE_POISONING and memzero_explicit() in kernel crypto, which all comply with FDP_RIP.2 (Full Residual Information Protection) of the Common Criteria standard. 2. Blocks some uninitialized stack variable attacks (e.g. CVE-2017-17712, CVE-2010-2963). That kind of bug should be killed by improving C compilers in the future, which might take a long time. This commit introduces the code filling the used part of the kernel stack with a poison value before returning to userspace. The full STACKLEAK feature also contains the gcc plugin which comes in a separate commit. The STACKLEAK feature is ported from grsecurity/PaX. More information at: https://grsecurity.net/ https://pax.grsecurity.net/ This code is modified from Brad Spengler/PaX Team's code in the last public patch of grsecurity/PaX based on our understanding of the code. Changes or omissions from the original code are ours and don't reflect the original grsecurity/PaX code. Performance impact: Hardware: Intel Core i7-4770, 16 GB RAM Test #1: building the Linux kernel on a single core 0.91% slowdown Test #2: hackbench -s 4096 -l 2000 -g 15 -f 25 -P 4.2% slowdown So the STACKLEAK description in Kconfig includes: "The tradeoff is the performance impact: on a single CPU system kernel compilation sees a 1% slowdown, other systems and workloads may vary and you are advised to test this feature on your expected workload before deploying it". Signed-off-by: Alexander Popov <alex.popov@linux.com> Acked-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com> Acked-by: Ingo Molnar <mingo@kernel.org> Signed-off-by: Kees Cook <keescook@chromium.org>
This commit is contained in:
parent
57361846b5
commit
afaef01c00
|
@ -24,6 +24,7 @@ ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
|
|||
[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
|
||||
ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
|
||||
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
|
||||
STACKLEAK_POISON value in this last hole: ffffffffffff4111
|
||||
|
||||
Virtual memory map with 5 level page tables:
|
||||
|
||||
|
@ -50,6 +51,7 @@ ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
|
|||
[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
|
||||
ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
|
||||
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
|
||||
STACKLEAK_POISON value in this last hole: ffffffffffff4111
|
||||
|
||||
Architecture defines a 64-bit virtual address. Implementations can support
|
||||
less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
|
||||
|
|
|
@ -419,6 +419,13 @@ config SECCOMP_FILTER
|
|||
|
||||
See Documentation/userspace-api/seccomp_filter.rst for details.
|
||||
|
||||
config HAVE_ARCH_STACKLEAK
|
||||
bool
|
||||
help
|
||||
An architecture should select this if it has the code which
|
||||
fills the used part of the kernel stack with the STACKLEAK_POISON
|
||||
value before returning from system calls.
|
||||
|
||||
config HAVE_STACKPROTECTOR
|
||||
bool
|
||||
help
|
||||
|
|
|
@ -127,6 +127,7 @@ config X86
|
|||
select HAVE_ARCH_PREL32_RELOCATIONS
|
||||
select HAVE_ARCH_SECCOMP_FILTER
|
||||
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
|
||||
select HAVE_ARCH_STACKLEAK
|
||||
select HAVE_ARCH_TRACEHOOK
|
||||
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
|
||||
select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
|
||||
|
|
|
@ -329,8 +329,22 @@ For 32-bit we have the following conventions - kernel is built with
|
|||
|
||||
#endif
|
||||
|
||||
/*
 * STACKLEAK_ERASE_NOCLOBBER: call stackleak_erase with the call bracketed
 * by PUSH_AND_CLEAR_REGS / POP_REGS.
 *
 * NOTE(review): the name suggests the register state is meant to survive
 * the call -- confirm against the PUSH_AND_CLEAR_REGS / POP_REGS
 * definitions.
 *
 * Expands to nothing unless CONFIG_GCC_PLUGIN_STACKLEAK is set.
 */
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
.macro STACKLEAK_ERASE_NOCLOBBER
	PUSH_AND_CLEAR_REGS
	call	stackleak_erase
	POP_REGS
.endm
#else /* !CONFIG_GCC_PLUGIN_STACKLEAK */
.macro STACKLEAK_ERASE_NOCLOBBER
.endm
#endif /* CONFIG_GCC_PLUGIN_STACKLEAK */
|
||||
|
||||
#endif /* CONFIG_X86_64 */
|
||||
|
||||
/*
 * STACKLEAK_ERASE: plain call to stackleak_erase, with no register
 * save/restore around it.
 *
 * Expands to nothing unless CONFIG_GCC_PLUGIN_STACKLEAK is set.
 */
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
.macro STACKLEAK_ERASE
	call	stackleak_erase
.endm
#else /* !CONFIG_GCC_PLUGIN_STACKLEAK */
.macro STACKLEAK_ERASE
.endm
#endif /* CONFIG_GCC_PLUGIN_STACKLEAK */
|
||||
|
||||
/*
|
||||
* This does 'call enter_from_user_mode' unless we can avoid it based on
|
||||
* kernel config or using the static jump infrastructure.
|
||||
|
|
|
@ -46,6 +46,8 @@
|
|||
#include <asm/frame.h>
|
||||
#include <asm/nospec-branch.h>
|
||||
|
||||
#include "calling.h"
|
||||
|
||||
.section .entry.text, "ax"
|
||||
|
||||
/*
|
||||
|
@ -711,6 +713,7 @@ ENTRY(ret_from_fork)
|
|||
/* When we fork, we trace the syscall return in the child, too. */
|
||||
movl %esp, %eax
|
||||
call syscall_return_slowpath
|
||||
STACKLEAK_ERASE
|
||||
jmp restore_all
|
||||
|
||||
/* kernel thread */
|
||||
|
@ -885,6 +888,8 @@ ENTRY(entry_SYSENTER_32)
|
|||
ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
|
||||
"jmp .Lsyscall_32_done", X86_FEATURE_XENPV
|
||||
|
||||
STACKLEAK_ERASE
|
||||
|
||||
/* Opportunistic SYSEXIT */
|
||||
TRACE_IRQS_ON /* User mode traces as IRQs on. */
|
||||
|
||||
|
@ -996,6 +1001,8 @@ ENTRY(entry_INT80_32)
|
|||
call do_int80_syscall_32
|
||||
.Lsyscall_32_done:
|
||||
|
||||
STACKLEAK_ERASE
|
||||
|
||||
restore_all:
|
||||
TRACE_IRQS_IRET
|
||||
SWITCH_TO_ENTRY_STACK
|
||||
|
|
|
@ -329,6 +329,8 @@ syscall_return_via_sysret:
|
|||
* We are on the trampoline stack. All regs except RDI are live.
|
||||
* We can do future final exit work right here.
|
||||
*/
|
||||
STACKLEAK_ERASE_NOCLOBBER
|
||||
|
||||
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
|
||||
|
||||
popq %rdi
|
||||
|
@ -688,6 +690,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
|
|||
* We are on the trampoline stack. All regs except RDI are live.
|
||||
* We can do future final exit work right here.
|
||||
*/
|
||||
STACKLEAK_ERASE_NOCLOBBER
|
||||
|
||||
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
|
||||
|
||||
|
|
|
@ -261,6 +261,11 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
|
|||
|
||||
/* Opportunistic SYSRET */
|
||||
sysret32_from_system_call:
|
||||
/*
|
||||
* We are not going to return to userspace from the trampoline
|
||||
* stack. So let's erase the thread stack right now.
|
||||
*/
|
||||
STACKLEAK_ERASE
|
||||
TRACE_IRQS_ON /* User mode traces as IRQs on. */
|
||||
movq RBX(%rsp), %rbx /* pt_regs->rbx */
|
||||
movq RBP(%rsp), %rbp /* pt_regs->rbp */
|
||||
|
|
|
@ -1192,6 +1192,10 @@ struct task_struct {
|
|||
void *security;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
|
||||
unsigned long lowest_stack;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* New fields for task_struct should be added above here, so that
|
||||
* they are included in the randomized portion of task_struct.
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _LINUX_STACKLEAK_H
|
||||
#define _LINUX_STACKLEAK_H
|
||||
|
||||
#include <linux/sched.h>
|
||||
#include <linux/sched/task_stack.h>
|
||||
|
||||
/*
|
||||
* Check that the poison value points to the unused hole in the
|
||||
* virtual memory map for your platform.
|
||||
*/
|
||||
#define STACKLEAK_POISON -0xBEEF
|
||||
#define STACKLEAK_SEARCH_DEPTH 128
|
||||
|
||||
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
#include <asm/stacktrace.h>

/*
 * stackleak_task_init() - set the initial 'lowest_stack' value of a task.
 * @t: task whose 'lowest_stack' field is initialised.
 *
 * The field is set to the address one 'unsigned long' above
 * end_of_stack(@t).
 */
static inline void stackleak_task_init(struct task_struct *t)
{
	unsigned long stack_bottom = (unsigned long)end_of_stack(t);

	t->lowest_stack = stack_bottom + sizeof(unsigned long);
}
#else /* !CONFIG_GCC_PLUGIN_STACKLEAK */
/* STACKLEAK is not built in: there is nothing to initialise. */
static inline void stackleak_task_init(struct task_struct *t)
{
}
#endif /* CONFIG_GCC_PLUGIN_STACKLEAK */
|
||||
|
||||
#endif
|
|
@ -117,6 +117,10 @@ obj-$(CONFIG_HAS_IOMEM) += iomem.o
|
|||
obj-$(CONFIG_ZONE_DEVICE) += memremap.o
|
||||
obj-$(CONFIG_RSEQ) += rseq.o
|
||||
|
||||
obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
|
||||
KASAN_SANITIZE_stackleak.o := n
|
||||
KCOV_INSTRUMENT_stackleak.o := n
|
||||
|
||||
$(obj)/configs.o: $(obj)/config_data.h
|
||||
|
||||
targets += config_data.gz
|
||||
|
|
|
@ -91,6 +91,7 @@
|
|||
#include <linux/kcov.h>
|
||||
#include <linux/livepatch.h>
|
||||
#include <linux/thread_info.h>
|
||||
#include <linux/stackleak.h>
|
||||
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/pgalloc.h>
|
||||
|
@ -1880,6 +1881,8 @@ static __latent_entropy struct task_struct *copy_process(
|
|||
if (retval)
|
||||
goto bad_fork_cleanup_io;
|
||||
|
||||
stackleak_task_init(p);
|
||||
|
||||
if (pid != &init_struct_pid) {
|
||||
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
|
||||
if (IS_ERR(pid)) {
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* This code fills the used part of the kernel stack with a poison value
|
||||
* before returning to userspace. It's part of the STACKLEAK feature
|
||||
* ported from grsecurity/PaX.
|
||||
*
|
||||
* Author: Alexander Popov <alex.popov@linux.com>
|
||||
*
|
||||
* STACKLEAK reduces the information which kernel stack leak bugs can
|
||||
* reveal and blocks some uninitialized stack variable attacks.
|
||||
*/
|
||||
|
||||
#include <linux/stackleak.h>
|
||||
|
||||
asmlinkage void stackleak_erase(void)
|
||||
{
|
||||
/* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */
|
||||
unsigned long kstack_ptr = current->lowest_stack;
|
||||
unsigned long boundary = (unsigned long)end_of_stack(current);
|
||||
unsigned int poison_count = 0;
|
||||
const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long);
|
||||
|
||||
/* Check that 'lowest_stack' value is sane */
|
||||
if (unlikely(kstack_ptr - boundary >= THREAD_SIZE))
|
||||
kstack_ptr = boundary;
|
||||
|
||||
/* Search for the poison value in the kernel stack */
|
||||
while (kstack_ptr > boundary && poison_count <= depth) {
|
||||
if (*(unsigned long *)kstack_ptr == STACKLEAK_POISON)
|
||||
poison_count++;
|
||||
else
|
||||
poison_count = 0;
|
||||
|
||||
kstack_ptr -= sizeof(unsigned long);
|
||||
}
|
||||
|
||||
/*
|
||||
* One 'long int' at the bottom of the thread stack is reserved and
|
||||
* should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK=y).
|
||||
*/
|
||||
if (kstack_ptr == boundary)
|
||||
kstack_ptr += sizeof(unsigned long);
|
||||
|
||||
/*
|
||||
* Now write the poison value to the kernel stack. Start from
|
||||
* 'kstack_ptr' and move up till the new 'boundary'. We assume that
|
||||
* the stack pointer doesn't change when we write poison.
|
||||
*/
|
||||
if (on_thread_stack())
|
||||
boundary = current_stack_pointer;
|
||||
else
|
||||
boundary = current_top_of_stack();
|
||||
|
||||
while (kstack_ptr < boundary) {
|
||||
*(unsigned long *)kstack_ptr = STACKLEAK_POISON;
|
||||
kstack_ptr += sizeof(unsigned long);
|
||||
}
|
||||
|
||||
/* Reset the 'lowest_stack' value for the next syscall */
|
||||
current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64;
|
||||
}
|
||||
|
|
@ -139,4 +139,23 @@ config GCC_PLUGIN_RANDSTRUCT_PERFORMANCE
|
|||
in structures. This reduces the performance hit of RANDSTRUCT
|
||||
at the cost of weakened randomization.
|
||||
|
||||
config GCC_PLUGIN_STACKLEAK
|
||||
bool "Erase the kernel stack before returning from syscalls"
|
||||
depends on GCC_PLUGINS
|
||||
depends on HAVE_ARCH_STACKLEAK
|
||||
help
|
||||
This option makes the kernel erase the kernel stack before
|
||||
returning from system calls. That reduces the information which
|
||||
kernel stack leak bugs can reveal and blocks some uninitialized
|
||||
stack variable attacks.
|
||||
|
||||
The tradeoff is the performance impact: on a single CPU system kernel
|
||||
compilation sees a 1% slowdown, other systems and workloads may vary
|
||||
and you are advised to test this feature on your expected workload
|
||||
before deploying it.
|
||||
|
||||
This plugin was ported from grsecurity/PaX. More information at:
|
||||
* https://grsecurity.net/
|
||||
* https://pax.grsecurity.net/
|
||||
|
||||
endif
|
||||
|
|
Loading…
Reference in New Issue