From 58990d1ff3f7896ee341030e9a7c2e4002570683 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 7 Jun 2018 17:40:03 +0200 Subject: [PATCH 1/4] bpf: reject passing modified ctx to helper functions As commit 28e33f9d78ee ("bpf: disallow arithmetic operations on context pointer") already describes, f1174f77b50c ("bpf/verifier: rework value tracking") removed the specific white-listed cases we had previously where we would allow for pointer arithmetic in order to further generalize it, and allow e.g. context access via modified registers. While the dereferencing of modified context pointers had been forbidden through 28e33f9d78ee, syzkaller did recently manage to trigger several KASAN splats for slab out of bounds access and use after frees by simply passing a modified context pointer to a helper function which would then do the bad access since verifier allowed it in adjust_ptr_min_max_vals(). Rejecting arithmetic on ctx pointer in adjust_ptr_min_max_vals() generally could break existing programs as there's a valid use case in tracing in combination with passing the ctx to helpers as bpf_probe_read(), where the register then becomes unknown at verification time due to adding a non-constant offset to it. An access sequence may look like the following: offset = args->filename; /* field __data_loc filename */ bpf_probe_read(&dst, len, (char *)args + offset); // args is ctx There are two options: i) we could special case the ctx and as soon as we add a constant or bounded offset to it (hence ctx type wouldn't change) we could turn the ctx into an unknown scalar, or ii) we generalize the sanity test for ctx member access into a small helper and assert it on the ctx register that was passed as a function argument. Fwiw, latter is more obvious and less complex at the same time, and one case that may potentially be legitimate in future for ctx member access at least would be for ctx to carry a const offset. Therefore, fix follows approach from ii) and adds test cases to BPF kselftests. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Reported-by: syzbot+3d0b2441dbb71751615e@syzkaller.appspotmail.com Reported-by: syzbot+c8504affd4fdd0c1b626@syzkaller.appspotmail.com Reported-by: syzbot+e5190cb881d8660fb1a3@syzkaller.appspotmail.com Reported-by: syzbot+efae31b384d5badbd620@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Edward Cree Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 48 +++++++++++------ tools/testing/selftests/bpf/test_verifier.c | 58 ++++++++++++++++++++- 2 files changed, 88 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d6403b5166f4..cced0c1e63e2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1617,6 +1617,30 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif +static int check_ctx_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + /* Access to ctx or passing it to a helper is only allowed in + * its original, unmodified form. + */ + + if (reg->off) { + verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n", + regno, reg->off); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf); + return -EACCES; + } + + return 0; +} + /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE */ @@ -1686,24 +1710,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "R%d leaks addr into ctx\n", value_regno); return -EACCES; } - /* ctx accesses must be at a fixed offset, so that we can - * determine what type of data were returned. - */ - if (reg->off) { - verbose(env, - "dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n", - regno, reg->off, off - reg->off); - return -EACCES; - } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, - "variable ctx access var_off=%s off=%d size=%d", - tn_buf, off, size); - return -EACCES; - } + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; + err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a @@ -1984,6 +1995,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_CTX; if (type != expected_type) goto err_type; + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 7cb1d74057ce..2ecd27b670d7 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -8647,7 +8647,7 @@ static struct bpf_test tests[] = { offsetof(struct __sk_buff, mark)), BPF_EXIT_INSN(), }, - .errstr = "dereference of modified ctx ptr R1 off=68+8, ctx+const is allowed, ctx+const+const is not", + .errstr = "dereference of modified ctx ptr", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, @@ -12258,6 +12258,62 @@ static struct bpf_test tests[] = { .result = ACCEPT, .retval = 5, }, + { + "pass unmodified ctx pointer to helper", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_csum_update), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + }, + { + "pass modified ctx pointer to helper, 1", + .insns = { + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -612), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_csum_update), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = REJECT, + .errstr = "dereference of modified ctx ptr", + }, + { + "pass modified ctx pointer to helper, 2", + .insns = { + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -612), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_socket_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result_unpriv = REJECT, + .result = REJECT, + .errstr_unpriv = "dereference of modified ctx ptr", + .errstr = "dereference of modified ctx ptr", + }, + { + "pass modified ctx pointer to helper, 3", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_3, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_csum_update), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = REJECT, + .errstr = "variable ctx access var_off=(0x0; 0x4)", + }, }; static int probe_filter_length(const struct bpf_insn *fp) From 23316a366e1654e4ad05817c6075bc1019efb30a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 6 Jun 2018 09:12:44 -0700 Subject: [PATCH 2/4] tools/bpf: fix selftest get_cgroup_id_user Commit f269099a7e7a ("tools/bpf: add a selftest for bpf_get_current_cgroup_id() helper") added a test for bpf_get_current_cgroup_id() helper. The bpf program is attached to tracepoint syscalls/sys_enter_nanosleep and will record the cgroup id if the tracepoint is hit. The test program creates a cgroup and attachs itself to this cgroup and expects that the test program process cgroup id is the same as the cgroup_id retrieved by the bpf program. In a light system where no other processes called nanosleep syscall, the test case can pass. In a busy system where many different processes can hit syscalls/sys_enter_nanosleep tracepoint, the cgroup id recorded by bpf program may not match the test program process cgroup_id. This patch fixed an issue by communicating the test program pid to bpf program. The bpf program only records cgroup id if the current task pid is the same as passed-in pid. This ensures that the recorded cgroup_id is for the cgroup within which the test program resides. Fixes: f269099a7e7a ("tools/bpf: add a selftest for bpf_get_current_cgroup_id() helper") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/get_cgroup_id_kern.c | 14 +++++++++++++- tools/testing/selftests/bpf/get_cgroup_id_user.c | 12 ++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/get_cgroup_id_kern.c b/tools/testing/selftests/bpf/get_cgroup_id_kern.c index 2cf8cb23f209..014dba10b8a5 100644 --- a/tools/testing/selftests/bpf/get_cgroup_id_kern.c +++ b/tools/testing/selftests/bpf/get_cgroup_id_kern.c @@ -11,12 +11,24 @@ struct bpf_map_def SEC("maps") cg_ids = { .max_entries = 1, }; +struct bpf_map_def SEC("maps") pidmap = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u32), + .max_entries = 1, +}; + SEC("tracepoint/syscalls/sys_enter_nanosleep") int trace(void *ctx) { - __u32 key = 0; + __u32 pid = bpf_get_current_pid_tgid(); + __u32 key = 0, *expected_pid; __u64 *val; + expected_pid = bpf_map_lookup_elem(&pidmap, &key); + if (!expected_pid || *expected_pid != pid) + return 0; + val = bpf_map_lookup_elem(&cg_ids, &key); if (val) *val = bpf_get_current_cgroup_id(); diff --git a/tools/testing/selftests/bpf/get_cgroup_id_user.c b/tools/testing/selftests/bpf/get_cgroup_id_user.c index ea19a42e5894..e8da7b39158d 100644 --- a/tools/testing/selftests/bpf/get_cgroup_id_user.c +++ b/tools/testing/selftests/bpf/get_cgroup_id_user.c @@ -50,13 +50,13 @@ int main(int argc, char **argv) const char *probe_name = "syscalls/sys_enter_nanosleep"; const char *file = "get_cgroup_id_kern.o"; int err, bytes, efd, prog_fd, pmu_fd; + int cgroup_fd, cgidmap_fd, pidmap_fd; struct perf_event_attr attr = {}; - int cgroup_fd, cgidmap_fd; struct bpf_object *obj; __u64 kcgid = 0, ucgid; + __u32 key = 0, pid; int exit_code = 1; char buf[256]; - __u32 key = 0; err = setup_cgroup_environment(); if (CHECK(err, "setup_cgroup_environment", "err %d errno %d\n", err, @@ -81,6 +81,14 @@ int main(int argc, char **argv) cgidmap_fd, errno)) goto close_prog; + pidmap_fd = bpf_find_map(__func__, obj, "pidmap"); + if (CHECK(pidmap_fd < 0, "bpf_find_map", "err %d errno %d\n", + pidmap_fd, errno)) + goto close_prog; + + pid = getpid(); + bpf_map_update_elem(pidmap_fd, &key, &pid, 0); + snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%s/id", probe_name); efd = open(buf, O_RDONLY, 0); From a5a16e43529b5040760ebf9bd9b056dd34861f93 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 7 Jun 2018 15:37:34 +0200 Subject: [PATCH 3/4] xsk: Fix umem fill/completion queue mmap on 32-bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With gcc-4.1.2 on 32-bit: net/xdp/xsk.c:663: warning: integer constant is too large for ‘long’ type net/xdp/xsk.c:665: warning: integer constant is too large for ‘long’ type Add the missing "ULL" suffixes to the large XDP_UMEM_PGOFF_*_RING values to fix this. net/xdp/xsk.c:663: warning: comparison is always false due to limited range of data type net/xdp/xsk.c:665: warning: comparison is always false due to limited range of data type "unsigned long" is 32-bit on 32-bit systems, hence the offset is truncated, and can never be equal to any of the XDP_UMEM_PGOFF_*_RING values. Use loff_t (and the required cast) to fix this. Fixes: 423f38329d267969 ("xsk: add umem fill queue support and mmap") Fixes: fe2308328cd2f26e ("xsk: add umem completion queue support and mmap") Signed-off-by: Geert Uytterhoeven Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/uapi/linux/if_xdp.h | 4 ++-- net/xdp/xsk.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 1fa0e977ea8d..caed8b1614ff 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -63,8 +63,8 @@ struct xdp_statistics { /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 #define XDP_PGOFF_TX_RING 0x80000000 -#define XDP_UMEM_PGOFF_FILL_RING 0x100000000 -#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000 +#define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL +#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL /* Rx/Tx descriptor */ struct xdp_desc { diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c6ed2454f7ce..36919a254ba3 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -643,7 +643,7 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, static int xsk_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { - unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; unsigned long size = vma->vm_end - vma->vm_start; struct xdp_sock *xs = xdp_sk(sock->sk); struct xsk_queue *q = NULL; From c09290c5637692a9bfe7740e4c5e693efff12810 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 8 Jun 2018 00:06:01 +0200 Subject: [PATCH 4/4] bpf, xdp: fix crash in xdp_umem_unaccount_pages syzkaller was able to trigger the following panic for AF_XDP: BUG: KASAN: null-ptr-deref in atomic64_sub include/asm-generic/atomic-instrumented.h:144 [inline] BUG: KASAN: null-ptr-deref in atomic_long_sub include/asm-generic/atomic-long.h:199 [inline] BUG: KASAN: null-ptr-deref in xdp_umem_unaccount_pages.isra.4+0x3d/0x80 net/xdp/xdp_umem.c:135 Write of size 8 at addr 0000000000000060 by task syz-executor246/4527 CPU: 1 PID: 4527 Comm: syz-executor246 Not tainted 4.17.0+ #89 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1b9/0x294 lib/dump_stack.c:113 kasan_report_error mm/kasan/report.c:352 [inline] kasan_report.cold.7+0x6d/0x2fe mm/kasan/report.c:412 check_memory_region_inline mm/kasan/kasan.c:260 [inline] check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267 kasan_check_write+0x14/0x20 mm/kasan/kasan.c:278 atomic64_sub include/asm-generic/atomic-instrumented.h:144 [inline] atomic_long_sub include/asm-generic/atomic-long.h:199 [inline] xdp_umem_unaccount_pages.isra.4+0x3d/0x80 net/xdp/xdp_umem.c:135 xdp_umem_reg net/xdp/xdp_umem.c:334 [inline] xdp_umem_create+0xd6c/0x10f0 net/xdp/xdp_umem.c:349 xsk_setsockopt+0x443/0x550 net/xdp/xsk.c:531 __sys_setsockopt+0x1bd/0x390 net/socket.c:1935 __do_sys_setsockopt net/socket.c:1946 [inline] __se_sys_setsockopt net/socket.c:1943 [inline] __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1943 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe In xdp_umem_reg() the call to xdp_umem_account_pages() passed with CAP_IPC_LOCK where we didn't need to end up charging rlimit on memlock for the current user and therefore umem->user continues to be NULL. Later on through fault injection syzkaller triggered a failure in either umem->pgs or umem->pages allocation such that we bail out and undo accounting in xdp_umem_unaccount_pages() where we eventually hit the panic since it tries to deref the umem->user. The code is pretty close to mm_account_pinned_pages() and mm_unaccount_pinned_pages() pair and potentially could reuse it even in a later cleanup, and it appears that the initial commit c0c77d8fb787 ("xsk: add user memory registration support sockopt") got this right while later follow-up introduced the bug via a49049ea2576 ("xsk: simplified umem setup"). Fixes: a49049ea2576 ("xsk: simplified umem setup") Reported-by: syzbot+979217770b09ebf5c407@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- net/xdp/xdp_umem.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 7eb4948a38d2..b9ef487c4618 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -132,8 +132,10 @@ static void xdp_umem_unpin_pages(struct xdp_umem *umem) static void xdp_umem_unaccount_pages(struct xdp_umem *umem) { - atomic_long_sub(umem->npgs, &umem->user->locked_vm); - free_uid(umem->user); + if (umem->user) { + atomic_long_sub(umem->npgs, &umem->user->locked_vm); + free_uid(umem->user); + } } static void xdp_umem_release(struct xdp_umem *umem)