seccomp updates for v5.11-rc1
- Improve seccomp performance via constant-action bitmaps (YiFei Zhu & Kees Cook) - Fix bogus __user annotations (Jann Horn) - Add missed CONFIG for improved selftest coverage (Mickaël Salaün) -----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEEpcP2jyKd1g9yPm4TiXL039xtwCYFAl/ZG5IACgkQiXL039xt wCbhuw/+P77jwT/p1DRnKp5vG7TXTqqXrdhQZYNyBUxRaKSGCEMydvJn/h3KscyW 4eEy9vZKTAhIQg5oI5OXZ9jxzFdpxEg8lMPSKReNEga3d0//H9gOJHYc782D/bf1 +6x6I4qWv+LMM/52P60gznBH+3WFVtyM5Jw+LF5igOCEVSERoZ3ChsmdSZgkALG0 DJXKL+Dy1Wj9ESeBtuh1UsKoh4ADTAoPC+LvfGuxn2T+VtnxX/sOSDkkrpHfX+2J UKkIgWJHeNmq74nwWjpNuDz24ARTiVWOVQX01nOHRohtu39TZcpU774Pdp4Dsj2W oDDwOzIWp4/27aQxkOKv6NXMwd29XbrpH1gweyuvQh9cohSbzx6qZlXujqyd9izs 6Nh74mvC3cns6sQWSWz5ddU4dMQ4rNjpD2CK1P8A7ZVTfH+5baaPmF8CRp126E6f /MAUk7Rfbe6YfYdfMwhXXhTvus0e5yenGFXr46gasJDfGnyy4cLS/MO7AZ+mR0CB d9DnrsIJVggL5cZ2LZmivIng18JWnbkgnenmHSXahdLstmYVkdpo4ckBl1G/dXK0 lDmi9j9FoTxB6OrztEKA0RZB+C1e6q7X7euwsHjgF9XKgD5S+DdeYwqd2lypjyvb d9VNLFdngD0CRY7wcJZKRma+yPemlPNurdMjF9LrqaAu232G1UA= =jJwG -----END PGP SIGNATURE----- Merge tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux Pull seccomp updates from Kees Cook: "The major change here is finally gaining seccomp constant-action bitmaps, which internally reduces the seccomp overhead for many real-world syscall filters to O(1), as discussed at Plumbers this year. 
- Improve seccomp performance via constant-action bitmaps (YiFei Zhu & Kees Cook) - Fix bogus __user annotations (Jann Horn) - Add missed CONFIG for improved selftest coverage (Mickaël Salaün)" * tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux: selftests/seccomp: Update kernel config seccomp: Remove bogus __user annotations seccomp/cache: Report cache data through /proc/pid/seccomp_cache xtensa: Enable seccomp architecture tracking sh: Enable seccomp architecture tracking s390: Enable seccomp architecture tracking riscv: Enable seccomp architecture tracking powerpc: Enable seccomp architecture tracking parisc: Enable seccomp architecture tracking csky: Enable seccomp architecture tracking arm: Enable seccomp architecture tracking arm64: Enable seccomp architecture tracking selftests/seccomp: Compare bitmap vs filter overhead x86: Enable seccomp architecture tracking seccomp/cache: Add "emulator" to check if filter is constant allow seccomp/cache: Lookup syscall allowlist bitmap for fast path
This commit is contained in:
commit
e994cc240a
17
arch/Kconfig
17
arch/Kconfig
|
@ -486,6 +486,9 @@ config HAVE_ARCH_SECCOMP_FILTER
|
|||
- secure_computing return value is checked and a return value of -1
|
||||
results in the system call being skipped immediately.
|
||||
- seccomp syscall wired up
|
||||
- if !HAVE_SPARSE_SYSCALL_NR, have SECCOMP_ARCH_NATIVE,
|
||||
SECCOMP_ARCH_NATIVE_NR, SECCOMP_ARCH_NATIVE_NAME defined. If
|
||||
COMPAT is supported, have the SECCOMP_ARCH_COMPAT* defines too.
|
||||
|
||||
config SECCOMP
|
||||
prompt "Enable seccomp to safely execute untrusted bytecode"
|
||||
|
@ -514,6 +517,20 @@ config SECCOMP_FILTER
|
|||
|
||||
See Documentation/userspace-api/seccomp_filter.rst for details.
|
||||
|
||||
config SECCOMP_CACHE_DEBUG
|
||||
bool "Show seccomp filter cache status in /proc/pid/seccomp_cache"
|
||||
depends on SECCOMP_FILTER && !HAVE_SPARSE_SYSCALL_NR
|
||||
depends on PROC_FS
|
||||
help
|
||||
This enables the /proc/pid/seccomp_cache interface to monitor
|
||||
seccomp cache data. The file format is subject to change. Reading
|
||||
the file requires CAP_SYS_ADMIN.
|
||||
|
||||
This option is for debugging only. Enabling presents the risk that
|
||||
an adversary may be able to infer the seccomp filter logic.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config HAVE_ARCH_STACKLEAK
|
||||
bool
|
||||
help
|
||||
|
|
|
@ -4,7 +4,6 @@ generic-y += extable.h
|
|||
generic-y += flat.h
|
||||
generic-y += local64.h
|
||||
generic-y += parport.h
|
||||
generic-y += seccomp.h
|
||||
|
||||
generated-y += mach-types.h
|
||||
generated-y += unistd-nr.h
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
#ifndef _ASM_SECCOMP_H
|
||||
#define _ASM_SECCOMP_H
|
||||
|
||||
#include <asm-generic/seccomp.h>
|
||||
|
||||
#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_ARM
|
||||
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
|
||||
#define SECCOMP_ARCH_NATIVE_NAME "arm"
|
||||
|
||||
#endif /* _ASM_SECCOMP_H */
|
|
@ -19,4 +19,13 @@
|
|||
|
||||
#include <asm-generic/seccomp.h>
|
||||
|
||||
#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_AARCH64
|
||||
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
|
||||
#define SECCOMP_ARCH_NATIVE_NAME "aarch64"
|
||||
#ifdef CONFIG_COMPAT
|
||||
# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_ARM
|
||||
# define SECCOMP_ARCH_COMPAT_NR __NR_compat_syscalls
|
||||
# define SECCOMP_ARCH_COMPAT_NAME "arm"
|
||||
#endif
|
||||
|
||||
#endif /* _ASM_SECCOMP_H */
|
||||
|
|
|
@ -4,6 +4,5 @@ generic-y += gpio.h
|
|||
generic-y += kvm_para.h
|
||||
generic-y += local64.h
|
||||
generic-y += qrwlock.h
|
||||
generic-y += seccomp.h
|
||||
generic-y += user.h
|
||||
generic-y += vmlinux.lds.h
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
#ifndef _ASM_SECCOMP_H
|
||||
#define _ASM_SECCOMP_H
|
||||
|
||||
#include <asm-generic/seccomp.h>
|
||||
|
||||
#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_CSKY
|
||||
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
|
||||
#define SECCOMP_ARCH_NATIVE_NAME "csky"
|
||||
|
||||
#endif /* _ASM_SECCOMP_H */
|
|
@ -5,5 +5,4 @@ generated-y += syscall_table_c32.h
|
|||
generic-y += kvm_para.h
|
||||
generic-y += local64.h
|
||||
generic-y += mcs_spinlock.h
|
||||
generic-y += seccomp.h
|
||||
generic-y += user.h
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
#ifndef _ASM_SECCOMP_H
|
||||
#define _ASM_SECCOMP_H
|
||||
|
||||
#include <asm-generic/seccomp.h>
|
||||
|
||||
#ifdef CONFIG_64BIT
|
||||
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_PARISC64
|
||||
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
|
||||
# define SECCOMP_ARCH_NATIVE_NAME "parisc64"
|
||||
# ifdef CONFIG_COMPAT
|
||||
# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_PARISC
|
||||
# define SECCOMP_ARCH_COMPAT_NR NR_syscalls
|
||||
# define SECCOMP_ARCH_COMPAT_NAME "parisc"
|
||||
# endif
|
||||
#else /* !CONFIG_64BIT */
|
||||
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_PARISC
|
||||
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
|
||||
# define SECCOMP_ARCH_NATIVE_NAME "parisc"
|
||||
#endif
|
||||
|
||||
#endif /* _ASM_SECCOMP_H */
|
|
@ -8,4 +8,27 @@
|
|||
|
||||
#include <asm-generic/seccomp.h>
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
#define __SECCOMP_ARCH_LE __AUDIT_ARCH_LE
|
||||
#define __SECCOMP_ARCH_LE_NAME "le"
|
||||
#else
|
||||
#define __SECCOMP_ARCH_LE 0
|
||||
#define __SECCOMP_ARCH_LE_NAME
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PPC64
|
||||
# define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_PPC64 | __SECCOMP_ARCH_LE)
|
||||
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
|
||||
# define SECCOMP_ARCH_NATIVE_NAME "ppc64" __SECCOMP_ARCH_LE_NAME
|
||||
# ifdef CONFIG_COMPAT
|
||||
# define SECCOMP_ARCH_COMPAT (AUDIT_ARCH_PPC | __SECCOMP_ARCH_LE)
|
||||
# define SECCOMP_ARCH_COMPAT_NR NR_syscalls
|
||||
# define SECCOMP_ARCH_COMPAT_NAME "ppc" __SECCOMP_ARCH_LE_NAME
|
||||
# endif
|
||||
#else /* !CONFIG_PPC64 */
|
||||
# define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_PPC | __SECCOMP_ARCH_LE)
|
||||
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
|
||||
# define SECCOMP_ARCH_NATIVE_NAME "ppc" __SECCOMP_ARCH_LE_NAME
|
||||
#endif
|
||||
|
||||
#endif /* _ASM_POWERPC_SECCOMP_H */
|
||||
|
|
|
@ -7,4 +7,14 @@
|
|||
|
||||
#include <asm-generic/seccomp.h>
|
||||
|
||||
#ifdef CONFIG_64BIT
|
||||
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_RISCV64
|
||||
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
|
||||
# define SECCOMP_ARCH_NATIVE_NAME "riscv64"
|
||||
#else /* !CONFIG_64BIT */
|
||||
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_RISCV32
|
||||
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
|
||||
# define SECCOMP_ARCH_NATIVE_NAME "riscv32"
|
||||
#endif
|
||||
|
||||
#endif /* _ASM_SECCOMP_H */
|
||||
|
|
|
@ -16,4 +16,13 @@
|
|||
|
||||
#include <asm-generic/seccomp.h>
|
||||
|
||||
#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_S390X
|
||||
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
|
||||
#define SECCOMP_ARCH_NATIVE_NAME "s390x"
|
||||
#ifdef CONFIG_COMPAT
|
||||
# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_S390
|
||||
# define SECCOMP_ARCH_COMPAT_NR NR_syscalls
|
||||
# define SECCOMP_ARCH_COMPAT_NAME "s390"
|
||||
#endif
|
||||
|
||||
#endif /* _ASM_S390_SECCOMP_H */
|
||||
|
|
|
@ -8,4 +8,14 @@
|
|||
#define __NR_seccomp_exit __NR_exit
|
||||
#define __NR_seccomp_sigreturn __NR_rt_sigreturn
|
||||
|
||||
#ifdef CONFIG_CPU_LITTLE_ENDIAN
|
||||
#define __SECCOMP_ARCH_LE __AUDIT_ARCH_LE
|
||||
#else
|
||||
#define __SECCOMP_ARCH_LE 0
|
||||
#endif
|
||||
|
||||
#define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_SH | __SECCOMP_ARCH_LE)
|
||||
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
|
||||
#define SECCOMP_ARCH_NATIVE_NAME "sh"
|
||||
|
||||
#endif /* __ASM_SECCOMP_H */
|
||||
|
|
|
@ -16,6 +16,26 @@
|
|||
#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_X86_64
|
||||
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
|
||||
# define SECCOMP_ARCH_NATIVE_NAME "x86_64"
|
||||
# ifdef CONFIG_COMPAT
|
||||
# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_I386
|
||||
# define SECCOMP_ARCH_COMPAT_NR IA32_NR_syscalls
|
||||
# define SECCOMP_ARCH_COMPAT_NAME "ia32"
|
||||
# endif
|
||||
/*
|
||||
* x32 will have __X32_SYSCALL_BIT set in syscall number. We don't support
|
||||
* caching them and they are treated as out of range syscalls, which will
|
||||
* always pass through the BPF filter.
|
||||
*/
|
||||
#else /* !CONFIG_X86_64 */
|
||||
# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_I386
|
||||
# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
|
||||
# define SECCOMP_ARCH_NATIVE_NAME "ia32"
|
||||
#endif
|
||||
|
||||
#include <asm-generic/seccomp.h>
|
||||
|
||||
#endif /* _ASM_X86_SECCOMP_H */
|
||||
|
|
|
@ -7,5 +7,4 @@ generic-y += mcs_spinlock.h
|
|||
generic-y += param.h
|
||||
generic-y += qrwlock.h
|
||||
generic-y += qspinlock.h
|
||||
generic-y += seccomp.h
|
||||
generic-y += user.h
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
#ifndef _ASM_SECCOMP_H
|
||||
#define _ASM_SECCOMP_H
|
||||
|
||||
#include <asm-generic/seccomp.h>
|
||||
|
||||
#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_XTENSA
|
||||
#define SECCOMP_ARCH_NATIVE_NR NR_syscalls
|
||||
#define SECCOMP_ARCH_NATIVE_NAME "xtensa"
|
||||
|
||||
#endif /* _ASM_SECCOMP_H */
|
|
@ -3263,6 +3263,9 @@ static const struct pid_entry tgid_base_stuff[] = {
|
|||
#ifdef CONFIG_PROC_PID_ARCH_STATUS
|
||||
ONE("arch_status", S_IRUGO, proc_pid_arch_status),
|
||||
#endif
|
||||
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
|
||||
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
|
||||
#endif
|
||||
};
|
||||
|
||||
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
|
||||
|
@ -3592,6 +3595,9 @@ static const struct pid_entry tid_base_stuff[] = {
|
|||
#ifdef CONFIG_PROC_PID_ARCH_STATUS
|
||||
ONE("arch_status", S_IRUGO, proc_pid_arch_status),
|
||||
#endif
|
||||
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
|
||||
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
|
||||
#endif
|
||||
};
|
||||
|
||||
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
|
||||
|
|
|
@ -121,4 +121,11 @@ static inline long seccomp_get_metadata(struct task_struct *task,
|
|||
return -EINVAL;
|
||||
}
|
||||
#endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */
|
||||
|
||||
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
|
||||
struct seq_file;
|
||||
|
||||
int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
|
||||
struct pid *pid, struct task_struct *task);
|
||||
#endif
|
||||
#endif /* _LINUX_SECCOMP_H */
|
||||
|
|
296
kernel/seccomp.c
296
kernel/seccomp.c
|
@ -143,6 +143,38 @@ struct notification {
|
|||
struct list_head notifications;
|
||||
};
|
||||
|
||||
#ifdef SECCOMP_ARCH_NATIVE
|
||||
/**
|
||||
* struct action_cache - per-filter cache of seccomp actions per
|
||||
* arch/syscall pair
|
||||
*
|
||||
* @allow_native: A bitmap where each bit represents whether the
|
||||
* filter will always allow the syscall, for the
|
||||
* native architecture.
|
||||
* @allow_compat: A bitmap where each bit represents whether the
|
||||
* filter will always allow the syscall, for the
|
||||
* compat architecture.
|
||||
*/
|
||||
struct action_cache {
|
||||
DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
|
||||
#ifdef SECCOMP_ARCH_COMPAT
|
||||
DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
|
||||
#endif
|
||||
};
|
||||
#else
|
||||
struct action_cache { };
|
||||
|
||||
static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
|
||||
const struct seccomp_data *sd)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
|
||||
{
|
||||
}
|
||||
#endif /* SECCOMP_ARCH_NATIVE */
|
||||
|
||||
/**
|
||||
* struct seccomp_filter - container for seccomp BPF programs
|
||||
*
|
||||
|
@ -159,6 +191,7 @@ struct notification {
|
|||
* this filter after reaching 0. The @users count is always smaller
|
||||
* or equal to @refs. Hence, reaching 0 for @users does not mean
|
||||
* the filter can be freed.
|
||||
* @cache: cache of arch/syscall mappings to actions
|
||||
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
|
||||
* @prev: points to a previously installed, or inherited, filter
|
||||
* @prog: the BPF program to evaluate
|
||||
|
@ -180,6 +213,7 @@ struct seccomp_filter {
|
|||
refcount_t refs;
|
||||
refcount_t users;
|
||||
bool log;
|
||||
struct action_cache cache;
|
||||
struct seccomp_filter *prev;
|
||||
struct bpf_prog *prog;
|
||||
struct notification *notif;
|
||||
|
@ -298,6 +332,52 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
|
|||
return 0;
|
||||
}
|
||||
|
||||
#ifdef SECCOMP_ARCH_NATIVE
|
||||
static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
|
||||
size_t bitmap_size,
|
||||
int syscall_nr)
|
||||
{
|
||||
if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size))
|
||||
return false;
|
||||
syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
|
||||
|
||||
return test_bit(syscall_nr, bitmap);
|
||||
}
|
||||
|
||||
/**
|
||||
* seccomp_cache_check_allow - lookup seccomp cache
|
||||
* @sfilter: The seccomp filter
|
||||
* @sd: The seccomp data to lookup the cache with
|
||||
*
|
||||
* Returns true if the seccomp_data is cached and allowed.
|
||||
*/
|
||||
static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
|
||||
const struct seccomp_data *sd)
|
||||
{
|
||||
int syscall_nr = sd->nr;
|
||||
const struct action_cache *cache = &sfilter->cache;
|
||||
|
||||
#ifndef SECCOMP_ARCH_COMPAT
|
||||
/* A native-only architecture doesn't need to check sd->arch. */
|
||||
return seccomp_cache_check_allow_bitmap(cache->allow_native,
|
||||
SECCOMP_ARCH_NATIVE_NR,
|
||||
syscall_nr);
|
||||
#else
|
||||
if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
|
||||
return seccomp_cache_check_allow_bitmap(cache->allow_native,
|
||||
SECCOMP_ARCH_NATIVE_NR,
|
||||
syscall_nr);
|
||||
if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
|
||||
return seccomp_cache_check_allow_bitmap(cache->allow_compat,
|
||||
SECCOMP_ARCH_COMPAT_NR,
|
||||
syscall_nr);
|
||||
#endif /* SECCOMP_ARCH_COMPAT */
|
||||
|
||||
WARN_ON_ONCE(true);
|
||||
return false;
|
||||
}
|
||||
#endif /* SECCOMP_ARCH_NATIVE */
|
||||
|
||||
/**
|
||||
* seccomp_run_filters - evaluates all seccomp filters against @sd
|
||||
* @sd: optional seccomp data to be passed to filters
|
||||
|
@ -320,6 +400,9 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
|
|||
if (WARN_ON(f == NULL))
|
||||
return SECCOMP_RET_KILL_PROCESS;
|
||||
|
||||
if (seccomp_cache_check_allow(f, sd))
|
||||
return SECCOMP_RET_ALLOW;
|
||||
|
||||
/*
|
||||
* All filters in the list are evaluated and the lowest BPF return
|
||||
* value always takes priority (ignoring the DATA).
|
||||
|
@ -470,6 +553,9 @@ void seccomp_filter_release(struct task_struct *tsk)
|
|||
{
|
||||
struct seccomp_filter *orig = tsk->seccomp.filter;
|
||||
|
||||
/* We are effectively holding the siglock by not having any sighand. */
|
||||
WARN_ON(tsk->sighand != NULL);
|
||||
|
||||
/* Detach task from its filter tree. */
|
||||
tsk->seccomp.filter = NULL;
|
||||
__seccomp_filter_release(orig);
|
||||
|
@ -544,7 +630,12 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
|
|||
{
|
||||
struct seccomp_filter *sfilter;
|
||||
int ret;
|
||||
const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
|
||||
const bool save_orig =
|
||||
#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE)
|
||||
true;
|
||||
#else
|
||||
false;
|
||||
#endif
|
||||
|
||||
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
@ -609,6 +700,148 @@ seccomp_prepare_user_filter(const char __user *user_filter)
|
|||
return filter;
|
||||
}
|
||||
|
||||
#ifdef SECCOMP_ARCH_NATIVE
|
||||
/**
|
||||
* seccomp_is_const_allow - check if filter is constant allow with given data
|
||||
* @fprog: The BPF programs
|
||||
* @sd: The seccomp data to check against, only syscall number and arch
|
||||
* number are considered constant.
|
||||
*/
|
||||
static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
|
||||
struct seccomp_data *sd)
|
||||
{
|
||||
unsigned int reg_value = 0;
|
||||
unsigned int pc;
|
||||
bool op_res;
|
||||
|
||||
if (WARN_ON_ONCE(!fprog))
|
||||
return false;
|
||||
|
||||
for (pc = 0; pc < fprog->len; pc++) {
|
||||
struct sock_filter *insn = &fprog->filter[pc];
|
||||
u16 code = insn->code;
|
||||
u32 k = insn->k;
|
||||
|
||||
switch (code) {
|
||||
case BPF_LD | BPF_W | BPF_ABS:
|
||||
switch (k) {
|
||||
case offsetof(struct seccomp_data, nr):
|
||||
reg_value = sd->nr;
|
||||
break;
|
||||
case offsetof(struct seccomp_data, arch):
|
||||
reg_value = sd->arch;
|
||||
break;
|
||||
default:
|
||||
/* can't optimize (non-constant value load) */
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case BPF_RET | BPF_K:
|
||||
/* reached return with constant values only, check allow */
|
||||
return k == SECCOMP_RET_ALLOW;
|
||||
case BPF_JMP | BPF_JA:
|
||||
pc += insn->k;
|
||||
break;
|
||||
case BPF_JMP | BPF_JEQ | BPF_K:
|
||||
case BPF_JMP | BPF_JGE | BPF_K:
|
||||
case BPF_JMP | BPF_JGT | BPF_K:
|
||||
case BPF_JMP | BPF_JSET | BPF_K:
|
||||
switch (BPF_OP(code)) {
|
||||
case BPF_JEQ:
|
||||
op_res = reg_value == k;
|
||||
break;
|
||||
case BPF_JGE:
|
||||
op_res = reg_value >= k;
|
||||
break;
|
||||
case BPF_JGT:
|
||||
op_res = reg_value > k;
|
||||
break;
|
||||
case BPF_JSET:
|
||||
op_res = !!(reg_value & k);
|
||||
break;
|
||||
default:
|
||||
/* can't optimize (unknown jump) */
|
||||
return false;
|
||||
}
|
||||
|
||||
pc += op_res ? insn->jt : insn->jf;
|
||||
break;
|
||||
case BPF_ALU | BPF_AND | BPF_K:
|
||||
reg_value &= k;
|
||||
break;
|
||||
default:
|
||||
/* can't optimize (unknown insn) */
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/* ran off the end of the filter?! */
|
||||
WARN_ON(1);
|
||||
return false;
|
||||
}
|
||||
|
||||
static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
|
||||
void *bitmap, const void *bitmap_prev,
|
||||
size_t bitmap_size, int arch)
|
||||
{
|
||||
struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
|
||||
struct seccomp_data sd;
|
||||
int nr;
|
||||
|
||||
if (bitmap_prev) {
|
||||
/* The new filter must be as restrictive as the last. */
|
||||
bitmap_copy(bitmap, bitmap_prev, bitmap_size);
|
||||
} else {
|
||||
/* Before any filters, all syscalls are always allowed. */
|
||||
bitmap_fill(bitmap, bitmap_size);
|
||||
}
|
||||
|
||||
for (nr = 0; nr < bitmap_size; nr++) {
|
||||
/* No bitmap change: not a cacheable action. */
|
||||
if (!test_bit(nr, bitmap))
|
||||
continue;
|
||||
|
||||
sd.nr = nr;
|
||||
sd.arch = arch;
|
||||
|
||||
/* No bitmap change: continue to always allow. */
|
||||
if (seccomp_is_const_allow(fprog, &sd))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Not a cacheable action: always run filters.
|
||||
* atomic clear_bit() not needed, filter not visible yet.
|
||||
*/
|
||||
__clear_bit(nr, bitmap);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* seccomp_cache_prepare - emulate the filter to find cachable syscalls
|
||||
* @sfilter: The seccomp filter
|
||||
*
|
||||
* Returns 0 if successful or -errno if error occurred.
|
||||
*/
|
||||
static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
|
||||
{
|
||||
struct action_cache *cache = &sfilter->cache;
|
||||
const struct action_cache *cache_prev =
|
||||
sfilter->prev ? &sfilter->prev->cache : NULL;
|
||||
|
||||
seccomp_cache_prepare_bitmap(sfilter, cache->allow_native,
|
||||
cache_prev ? cache_prev->allow_native : NULL,
|
||||
SECCOMP_ARCH_NATIVE_NR,
|
||||
SECCOMP_ARCH_NATIVE);
|
||||
|
||||
#ifdef SECCOMP_ARCH_COMPAT
|
||||
seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat,
|
||||
cache_prev ? cache_prev->allow_compat : NULL,
|
||||
SECCOMP_ARCH_COMPAT_NR,
|
||||
SECCOMP_ARCH_COMPAT);
|
||||
#endif /* SECCOMP_ARCH_COMPAT */
|
||||
}
|
||||
#endif /* SECCOMP_ARCH_NATIVE */
|
||||
|
||||
/**
|
||||
* seccomp_attach_filter: validate and attach filter
|
||||
* @flags: flags to change filter behavior
|
||||
|
@ -658,6 +891,7 @@ static long seccomp_attach_filter(unsigned int flags,
|
|||
* task reference.
|
||||
*/
|
||||
filter->prev = current->seccomp.filter;
|
||||
seccomp_cache_prepare(filter);
|
||||
current->seccomp.filter = filter;
|
||||
atomic_inc(&current->seccomp.filter_count);
|
||||
|
||||
|
@ -1967,7 +2201,7 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
|
|||
return true;
|
||||
}
|
||||
|
||||
static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
|
||||
static int read_actions_logged(struct ctl_table *ro_table, void *buffer,
|
||||
size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
char names[sizeof(seccomp_actions_avail)];
|
||||
|
@ -1985,7 +2219,7 @@ static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
|
|||
return proc_dostring(&table, 0, buffer, lenp, ppos);
|
||||
}
|
||||
|
||||
static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer,
|
||||
static int write_actions_logged(struct ctl_table *ro_table, void *buffer,
|
||||
size_t *lenp, loff_t *ppos, u32 *actions_logged)
|
||||
{
|
||||
char names[sizeof(seccomp_actions_avail)];
|
||||
|
@ -2103,3 +2337,59 @@ static int __init seccomp_sysctl_init(void)
|
|||
device_initcall(seccomp_sysctl_init)
|
||||
|
||||
#endif /* CONFIG_SYSCTL */
|
||||
|
||||
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
|
||||
/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */
|
||||
/*
 * Emit one "<name> <nr> <ALLOW|FILTER>" line per syscall number in
 * [0, bitmap_size): ALLOW when the bit is set in @bitmap, FILTER otherwise.
 */
static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
					const void *bitmap, size_t bitmap_size)
{
	int nr;

	for (nr = 0; nr < bitmap_size; nr++) {
		bool cached = test_bit(nr, bitmap);
		/* Points at string literals, so keep it const. */
		const char *status = cached ? "ALLOW" : "FILTER";

		seq_printf(m, "%s %d %s\n", name, nr, status);
	}
}
|
||||
|
||||
int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
|
||||
struct pid *pid, struct task_struct *task)
|
||||
{
|
||||
struct seccomp_filter *f;
|
||||
unsigned long flags;
|
||||
|
||||
/*
|
||||
* We don't want some sandboxed process to know what their seccomp
|
||||
* filters consist of.
|
||||
*/
|
||||
if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
|
||||
return -EACCES;
|
||||
|
||||
if (!lock_task_sighand(task, &flags))
|
||||
return -ESRCH;
|
||||
|
||||
f = READ_ONCE(task->seccomp.filter);
|
||||
if (!f) {
|
||||
unlock_task_sighand(task, &flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* prevent filter from being freed while we are printing it */
|
||||
__get_seccomp_filter(f);
|
||||
unlock_task_sighand(task, &flags);
|
||||
|
||||
proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
|
||||
f->cache.allow_native,
|
||||
SECCOMP_ARCH_NATIVE_NR);
|
||||
|
||||
#ifdef SECCOMP_ARCH_COMPAT
|
||||
proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
|
||||
f->cache.allow_compat,
|
||||
SECCOMP_ARCH_COMPAT_NR);
|
||||
#endif /* SECCOMP_ARCH_COMPAT */
|
||||
|
||||
__put_seccomp_filter(f);
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_SECCOMP_CACHE_DEBUG */
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
CONFIG_PID_NS=y
|
||||
CONFIG_SECCOMP=y
|
||||
CONFIG_SECCOMP_FILTER=y
|
||||
CONFIG_USER_NS=y
|
||||
|
|
|
@ -4,12 +4,16 @@
|
|||
*/
|
||||
#define _GNU_SOURCE
|
||||
#include <assert.h>
|
||||
#include <limits.h>
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <linux/filter.h>
|
||||
#include <linux/seccomp.h>
|
||||
#include <sys/param.h>
|
||||
#include <sys/prctl.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/types.h>
|
||||
|
@ -70,18 +74,74 @@ unsigned long long calibrate(void)
|
|||
return samples * seconds;
|
||||
}
|
||||
|
||||
/*
 * Report whether two measurements are approximately equal: identical, or
 * the larger one exceeds the smaller by no more than 1% of the smaller
 * value or by 2, whichever tolerance is greater.
 */
bool approx(int i_one, int i_two)
{
	/* Order the pair once so a single comparison covers every case. */
	double lo = i_one < i_two ? i_one : i_two;
	double hi = i_one < i_two ? i_two : i_one;

	return hi <= lo + MAX(lo * 0.01, 2.0);
}
|
||||
|
||||
/* Return true when i_one is less than or equal to i_two. */
bool le(int i_one, int i_two)
{
	return i_one <= i_two;
}
|
||||
|
||||
/*
 * Print one expectation line,
 *   "\t<name_one> <name_eval> <name_two> (<one> <name_eval> <two>): ",
 * then evaluate it with eval() and print a pass/fail mark.
 *
 * eval() takes ints, so a measurement above INT_MAX cannot be evaluated;
 * it is reported as a miscalculation instead ("went negative").
 *
 * Returns 0 when the expectation holds, 1 when it does not or when either
 * measurement is out of range.
 */
long compare(const char *name_one, const char *name_eval, const char *name_two,
	     unsigned long long one, bool (*eval)(int, int), unsigned long long two)
{
	const unsigned long long vals[] = { one, two };
	bool good;
	size_t i;

	printf("\t%s %s %s (%lld %s %lld): ", name_one, name_eval, name_two,
	       (long long)one, name_eval, (long long)two);

	/* Check "one" first, then "two", before narrowing either to int. */
	for (i = 0; i < sizeof(vals) / sizeof(vals[0]); i++) {
		if (vals[i] > INT_MAX) {
			printf("Miscalculation! Measurement went negative: %lld\n",
			       (long long)vals[i]);
			return 1;
		}
	}

	good = eval((int)one, (int)two);
	printf("%s\n", good ? "✔️" : "❌");

	return good ? 0 : 1;
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
struct sock_filter bitmap_filter[] = {
|
||||
BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)),
|
||||
BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
|
||||
};
|
||||
struct sock_fprog bitmap_prog = {
|
||||
.len = (unsigned short)ARRAY_SIZE(bitmap_filter),
|
||||
.filter = bitmap_filter,
|
||||
};
|
||||
struct sock_filter filter[] = {
|
||||
BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, args[0])),
|
||||
BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
|
||||
};
|
||||
struct sock_fprog prog = {
|
||||
.len = (unsigned short)ARRAY_SIZE(filter),
|
||||
.filter = filter,
|
||||
};
|
||||
long ret;
|
||||
unsigned long long samples;
|
||||
unsigned long long native, filter1, filter2;
|
||||
|
||||
long ret, bits;
|
||||
unsigned long long samples, calc;
|
||||
unsigned long long native, filter1, filter2, bitmap1, bitmap2;
|
||||
unsigned long long entry, per_filter1, per_filter2;
|
||||
|
||||
printf("Current BPF sysctl settings:\n");
|
||||
system("sysctl net.core.bpf_jit_enable");
|
||||
|
@ -101,35 +161,82 @@ int main(int argc, char *argv[])
|
|||
ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
|
||||
assert(ret == 0);
|
||||
|
||||
/* One filter */
|
||||
/* One filter resulting in a bitmap */
|
||||
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog);
|
||||
assert(ret == 0);
|
||||
|
||||
bitmap1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
|
||||
printf("getpid RET_ALLOW 1 filter (bitmap): %llu ns\n", bitmap1);
|
||||
|
||||
/* Second filter resulting in a bitmap */
|
||||
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog);
|
||||
assert(ret == 0);
|
||||
|
||||
bitmap2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
|
||||
printf("getpid RET_ALLOW 2 filters (bitmap): %llu ns\n", bitmap2);
|
||||
|
||||
/* Third filter, can no longer be converted to bitmap */
|
||||
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
|
||||
assert(ret == 0);
|
||||
|
||||
filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
|
||||
printf("getpid RET_ALLOW 1 filter: %llu ns\n", filter1);
|
||||
printf("getpid RET_ALLOW 3 filters (full): %llu ns\n", filter1);
|
||||
|
||||
if (filter1 == native)
|
||||
printf("No overhead measured!? Try running again with more samples.\n");
|
||||
|
||||
/* Two filters */
|
||||
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
|
||||
/* Fourth filter, can not be converted to bitmap because of filter 3 */
|
||||
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog);
|
||||
assert(ret == 0);
|
||||
|
||||
filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
|
||||
printf("getpid RET_ALLOW 2 filters: %llu ns\n", filter2);
|
||||
printf("getpid RET_ALLOW 4 filters (full): %llu ns\n", filter2);
|
||||
|
||||
/* Calculations */
|
||||
printf("Estimated total seccomp overhead for 1 filter: %llu ns\n",
|
||||
/* Estimations */
|
||||
#define ESTIMATE(fmt, var, what) do { \
|
||||
var = (what); \
|
||||
printf("Estimated " fmt ": %llu ns\n", var); \
|
||||
if (var > INT_MAX) \
|
||||
goto more_samples; \
|
||||
} while (0)
|
||||
|
||||
ESTIMATE("total seccomp overhead for 1 bitmapped filter", calc,
|
||||
bitmap1 - native);
|
||||
ESTIMATE("total seccomp overhead for 2 bitmapped filters", calc,
|
||||
bitmap2 - native);
|
||||
ESTIMATE("total seccomp overhead for 3 full filters", calc,
|
||||
filter1 - native);
|
||||
|
||||
printf("Estimated total seccomp overhead for 2 filters: %llu ns\n",
|
||||
ESTIMATE("total seccomp overhead for 4 full filters", calc,
|
||||
filter2 - native);
|
||||
|
||||
printf("Estimated seccomp per-filter overhead: %llu ns\n",
|
||||
ESTIMATE("seccomp entry overhead", entry,
|
||||
bitmap1 - native - (bitmap2 - bitmap1));
|
||||
ESTIMATE("seccomp per-filter overhead (last 2 diff)", per_filter1,
|
||||
filter2 - filter1);
|
||||
ESTIMATE("seccomp per-filter overhead (filters / 4)", per_filter2,
|
||||
(filter2 - native - entry) / 4);
|
||||
|
||||
printf("Estimated seccomp entry overhead: %llu ns\n",
|
||||
filter1 - native - (filter2 - filter1));
|
||||
printf("Expectations:\n");
|
||||
ret |= compare("native", "≤", "1 bitmap", native, le, bitmap1);
|
||||
bits = compare("native", "≤", "1 filter", native, le, filter1);
|
||||
if (bits)
|
||||
goto more_samples;
|
||||
|
||||
ret |= compare("per-filter (last 2 diff)", "≈", "per-filter (filters / 4)",
|
||||
per_filter1, approx, per_filter2);
|
||||
|
||||
bits = compare("1 bitmapped", "≈", "2 bitmapped",
|
||||
bitmap1 - native, approx, bitmap2 - native);
|
||||
if (bits) {
|
||||
printf("Skipping constant action bitmap expectations: they appear unsupported.\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret |= compare("entry", "≈", "1 bitmapped", entry, approx, bitmap1 - native);
|
||||
ret |= compare("entry", "≈", "2 bitmapped", entry, approx, bitmap2 - native);
|
||||
ret |= compare("native + entry + (per filter * 4)", "≈", "4 filters total",
|
||||
entry + (per_filter1 * 4) + native, approx, filter2);
|
||||
if (ret == 0)
|
||||
goto out;
|
||||
|
||||
more_samples:
|
||||
printf("Saw unexpected benchmark result. Try running again with more samples?\n");
|
||||
out:
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -1 +1 @@
|
|||
timeout=90
|
||||
timeout=120
|
||||
|
|
Loading…
Reference in New Issue