From 9f2dfda2f2f1c6181c3732c16b85c59ab2d195e0 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Wed, 16 Dec 2015 21:59:56 +0100 Subject: [PATCH 01/13] uml: fix hostfs mknod() An inverted return value check in hostfs_mknod() caused the function to return success after handling it as an error (and cleaning up). It resulted in the following segfault when trying to bind() a named unix socket: Pid: 198, comm: a.out Not tainted 4.4.0-rc4 RIP: 0033:[<0000000061077df6>] RSP: 00000000daae5d60 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 000000006092a460 RCX: 00000000dfc54208 RDX: 0000000061073ef1 RSI: 0000000000000070 RDI: 00000000e027d600 RBP: 00000000daae5de0 R08: 00000000da980ac0 R09: 0000000000000000 R10: 0000000000000003 R11: 00007fb1ae08f72a R12: 0000000000000000 R13: 000000006092a460 R14: 00000000daaa97c0 R15: 00000000daaa9a88 Kernel panic - not syncing: Kernel mode fault at addr 0x40, ip 0x61077df6 CPU: 0 PID: 198 Comm: a.out Not tainted 4.4.0-rc4 #1 Stack: e027d620 dfc54208 0000006f da981398 61bee000 0000c1ed daae5de0 0000006e e027d620 dfcd4208 00000005 6092a460 Call Trace: [<60dedc67>] SyS_bind+0xf7/0x110 [<600587be>] handle_syscall+0x7e/0x80 [<60066ad7>] userspace+0x3e7/0x4e0 [<6006321f>] ? save_registers+0x1f/0x40 [<6006c88e>] ? arch_prctl+0x1be/0x1f0 [<60054985>] fork_handler+0x85/0x90 Let's also get rid of the "cosmic ray protection" while we're at it. Fixes: e9193059b1b3 "hostfs: fix races in dentry_name() and inode_name()" Signed-off-by: Vegard Nossum Cc: Jeff Dike Cc: Al Viro Cc: stable@vger.kernel.org Signed-off-by: Richard Weinberger --- fs/hostfs/hostfs_kern.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 2ac99db3750e..5a7b3229b956 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -730,15 +730,13 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, init_special_inode(inode, mode, dev); err = do_mknod(name, mode, MAJOR(dev), MINOR(dev)); - if (!err) + if (err) goto out_free; err = read_name(inode, name); __putname(name); if (err) goto out_put; - if (err) - goto out_put; d_instantiate(dentry, inode); return 0; From 0754fb298f2f2719f0393491d010d46cfb25d043 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 18 Dec 2015 21:28:53 +0100 Subject: [PATCH 02/13] uml: flush stdout before forking I was seeing some really weird behaviour where piping UML's output somewhere would cause output to get duplicated: $ ./vmlinux | head -n 40 Checking that ptrace can change system call numbers...Core dump limits : soft - 0 hard - NONE OK Checking syscall emulation patch for ptrace...Core dump limits : soft - 0 hard - NONE OK Checking advanced syscall emulation patch for ptrace...Core dump limits : soft - 0 hard - NONE OK Core dump limits : soft - 0 hard - NONE This is because these tests do a fork() which duplicates the non-empty stdout buffer, then glibc flushes the duplicated buffer as each child exits. A simple workaround is to flush before forking. Cc: stable@vger.kernel.org Signed-off-by: Vegard Nossum Signed-off-by: Richard Weinberger --- arch/um/os-Linux/start_up.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c index 47f1ff056a54..22a358ef1b0c 100644 --- a/arch/um/os-Linux/start_up.c +++ b/arch/um/os-Linux/start_up.c @@ -94,6 +94,8 @@ static int start_ptraced_child(void) { int pid, n, status; + fflush(stdout); + pid = fork(); if (pid == 0) ptrace_child(); From d5e3f5cbe5cee7fe6da26566559a978547179b37 Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Mon, 21 Dec 2015 11:28:02 +0000 Subject: [PATCH 03/13] um: Prevent IRQ handler reentrancy The existing IRQ handler design in UML does not prevent reentrancy This is mitigated by fd-enable/fd-disable semantics for the IO portion of the UML subsystem. The timer, however, can and is re-entered resulting in very deep stack usage and occasional stack exhaustion. This patch prevents this by checking if there is a timer interrupt in-flight before processing any pending timer interrupts. Signed-off-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/os-Linux/signal.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index c211153ca69a..7801666514ed 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -62,6 +62,7 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) static int signals_enabled; static unsigned int signals_pending; +static unsigned int signals_active = 0; void sig_handler(int sig, struct siginfo *si, mcontext_t *mc) { @@ -101,7 +102,12 @@ void timer_alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc) block_signals(); + signals_active |= SIGALRM_MASK; + timer_real_alarm_handler(mc); + + signals_active &= ~SIGALRM_MASK; + set_signals(enabled); } @@ -286,8 +292,16 @@ void unblock_signals(void) if (save_pending & SIGIO_MASK) sig_handler_common(SIGIO, NULL, NULL); - if (save_pending & SIGALRM_MASK) + /* Do not reenter the handler */ + + if ((save_pending & SIGALRM_MASK) && (!(signals_active & SIGALRM_MASK))) timer_real_alarm_handler(NULL); + + /* Rerun the loop only if there is still pending SIGIO and not in TIMER handler */ + + if (!(signals_pending & SIGIO_MASK) && (signals_active & SIGALRM_MASK)) + return; + } } From 470a166e8c5a4da4be88545b1c4dde308abac5b2 Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Mon, 21 Dec 2015 11:28:03 +0000 Subject: [PATCH 04/13] um: Do not change hard IRQ flags in soft IRQ processing Software IRQ processing in generic architectures assumes that the exit out of hard IRQ may have re-enabled interrupts (some architectures may have an implicit EOI). It presumes them enabled and toggles the flags once more just in case unless this is turned off in the architecture specific hardirq.h by setting __ARCH_IRQ_EXIT_IRQS_DISABLED This patch adds this to UML where due to the way IRQs are handled it is an optimization (it works fine without it too). Signed-off-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/include/asm/hardirq.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 arch/um/include/asm/hardirq.h diff --git a/arch/um/include/asm/hardirq.h b/arch/um/include/asm/hardirq.h new file mode 100644 index 000000000000..756f0778e327 --- /dev/null +++ b/arch/um/include/asm/hardirq.h @@ -0,0 +1,23 @@ +#ifndef __ASM_UM_HARDIRQ_H +#define __ASM_UM_HARDIRQ_H + +#include +#include + +typedef struct { + unsigned int __softirq_pending; +} ____cacheline_aligned irq_cpustat_t; + +#include /* Standard mappings for irq_cpustat_t above */ +#include + +#ifndef ack_bad_irq +static inline void ack_bad_irq(unsigned int irq) +{ + printk(KERN_CRIT "unexpected IRQ trap at vector %02x\n", irq); +} +#endif + +#define __ARCH_IRQ_EXIT_IRQS_DISABLED 1 + +#endif /* __ASM_UM_HARDIRQ_H */ From 8c6157b6b30a765ec233a1be5f9446f24a5283de Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Mon, 21 Dec 2015 18:54:00 +0000 Subject: [PATCH 05/13] um: Update UBD to use pread/pwrite family of functions This decreases the number of syscalls per read/write by half. Signed-off-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/ubd_kern.c | 27 +++++---------------------- arch/um/include/shared/os.h | 2 ++ arch/um/os-Linux/file.c | 19 +++++++++++++++++++ 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index e8ab93c3e638..39ba20755e03 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -535,11 +535,7 @@ static int read_cow_bitmap(int fd, void *buf, int offset, int len) { int err; - err = os_seek_file(fd, offset); - if (err < 0) - return err; - - err = os_read_file(fd, buf, len); + err = os_pread_file(fd, buf, len, offset); if (err < 0) return err; @@ -1377,14 +1373,8 @@ static int update_bitmap(struct io_thread_req *req) if(req->cow_offset == -1) return 0; - n = os_seek_file(req->fds[1], req->cow_offset); - if(n < 0){ - printk("do_io - bitmap lseek failed : err = %d\n", -n); - return 1; - } - - n = os_write_file(req->fds[1], &req->bitmap_words, - sizeof(req->bitmap_words)); + n = os_pwrite_file(req->fds[1], &req->bitmap_words, + sizeof(req->bitmap_words), req->cow_offset); if(n != sizeof(req->bitmap_words)){ printk("do_io - bitmap update failed, err = %d fd = %d\n", -n, req->fds[1]); @@ -1399,7 +1389,6 @@ static void do_io(struct io_thread_req *req) char *buf; unsigned long len; int n, nsectors, start, end, bit; - int err; __u64 off; if (req->op == UBD_FLUSH) { @@ -1428,18 +1417,12 @@ static void do_io(struct io_thread_req *req) len = (end - start) * req->sectorsize; buf = &req->buffer[start * req->sectorsize]; - err = os_seek_file(req->fds[bit], off); - if(err < 0){ - printk("do_io - lseek failed : err = %d\n", -err); - req->error = 1; - return; - } if(req->op == UBD_READ){ n = 0; do { buf = &buf[n]; len -= n; - n = os_read_file(req->fds[bit], buf, len); + n = os_pread_file(req->fds[bit], buf, len, off); if (n < 0) { printk("do_io - read failed, err = %d " "fd = %d\n", -n, req->fds[bit]); @@ -1449,7 +1432,7 @@ static void do_io(struct io_thread_req *req) } while((n < len) && (n != 0)); if (n < len) memset(&buf[n], 0, len - n); } else { - n = os_write_file(req->fds[bit], buf, len); + n = os_pwrite_file(req->fds[bit], buf, len, off); if(n != len){ printk("do_io - write failed err = %d " "fd = %d\n", -n, req->fds[bit]); diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 868e6c3f83dd..7a04ddd85334 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -146,6 +146,8 @@ extern int os_read_file(int fd, void *buf, int len); extern int os_write_file(int fd, const void *buf, int count); extern int os_sync_file(int fd); extern int os_file_size(const char *file, unsigned long long *size_out); +extern int os_pread_file(int fd, void *buf, int len, unsigned long long offset); +extern int os_pwrite_file(int fd, const void *buf, int count, unsigned long long offset); extern int os_file_modtime(const char *file, unsigned long *modtime); extern int os_pipe(int *fd, int stream, int close_on_exec); extern int os_set_fd_async(int fd); diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index 26e0164895e4..2db18cbbb0ea 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -264,6 +264,15 @@ int os_read_file(int fd, void *buf, int len) return n; } +int os_pread_file(int fd, void *buf, int len, unsigned long long offset) +{ + int n = pread(fd, buf, len, offset); + + if (n < 0) + return -errno; + return n; +} + int os_write_file(int fd, const void *buf, int len) { int n = write(fd, (void *) buf, len); @@ -282,6 +291,16 @@ int os_sync_file(int fd) return n; } +int os_pwrite_file(int fd, const void *buf, int len, unsigned long long offset) +{ + int n = pwrite(fd, (void *) buf, len, offset); + + if (n < 0) + return -errno; + return n; +} + + int os_file_size(const char *file, unsigned long long *size_out) { struct uml_stat buf; From a7df4716d19594b7b3f106f0bc0ca1c548e508e6 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 31 Dec 2015 17:06:17 +0100 Subject: [PATCH 06/13] um: link with -lpthread Similarly to commit fb1770aa78a43530940d0c2dd161e77bc705bdac, with gcc 5 on Ubuntu and CONFIG_STATIC_LINK=y I was seeing these linker errors: /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/librt.a(timer_create.o): In function `__timer_create_new': (.text+0xcd): undefined reference to `pthread_once' /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/librt.a(timer_create.o): In function `__timer_create_new': (.text+0x126): undefined reference to `pthread_attr_init' /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/librt.a(timer_create.o): In function `__timer_create_new': (.text+0x168): undefined reference to `pthread_attr_setdetachstate' [...] Obviously we also need -lpthread for librt.a. Cc: stable@vger.kernel.org # 4.4 Signed-off-by: Vegard Nossum Signed-off-by: Richard Weinberger --- scripts/link-vmlinux.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index dacf71a43ad4..ba6c34ea5429 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -62,7 +62,7 @@ vmlinux_link() -Wl,--start-group \ ${KBUILD_VMLINUX_MAIN} \ -Wl,--end-group \ - -lutil -lrt ${1} + -lutil -lrt -lpthread ${1} rm -f linux fi } From e04c989eb785af61d2895d76d38c09166296f9c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Tue, 29 Dec 2015 21:35:44 +0100 Subject: [PATCH 07/13] um: Fix ptrace GETREGS/SETREGS bugs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fix two related bugs: * PTRACE_GETREGS doesn't get the right orig_ax (syscall) value * PTRACE_SETREGS can't set the orig_ax value (erased by initial value) Get rid of the now useless and error-prone get_syscall(). Fix inconsistent behavior in the ptrace implementation for i386 when updating orig_eax automatically update the syscall number as well. This is now updated in handle_syscall(). Signed-off-by: Mickaël Salaün Cc: Jeff Dike Cc: Richard Weinberger Cc: Thomas Gleixner Cc: Kees Cook Cc: Andy Lutomirski Cc: Will Drewry Cc: Thomas Meyer Cc: Nicolas Iooss Cc: Anton Ivanov Cc: Meredydd Luff Cc: David Drysdale Signed-off-by: Richard Weinberger Acked-by: Kees Cook --- arch/um/include/shared/os.h | 1 - arch/um/kernel/skas/syscall.c | 26 ++++++++++++++------------ arch/um/os-Linux/skas/process.c | 7 ------- arch/x86/um/ptrace_32.c | 8 +++----- 4 files changed, 17 insertions(+), 25 deletions(-) diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 7a04ddd85334..de5d572225f3 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -284,7 +284,6 @@ extern void initial_thread_cb_skas(void (*proc)(void *), void *arg); extern void halt_skas(void); extern void reboot_skas(void); -extern int get_syscall(struct uml_pt_regs *regs); /* irq.c */ extern int os_waiting_for_events(struct irq_fd *active_fds); diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index 1683b8efdfda..6cadce761bcf 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -7,29 +7,31 @@ #include #include #include +#include #include -#include void handle_syscall(struct uml_pt_regs *r) { struct pt_regs *regs = container_of(r, struct pt_regs, regs); - long result; int syscall; - if (syscall_trace_enter(regs)) { - result = -ENOSYS; + /* Initialize the syscall number and default return value. */ + UPT_SYSCALL_NR(r) = PT_SYSCALL_NR(r->gp); + PT_REGS_SET_SYSCALL_RETURN(regs, -ENOSYS); + + if (syscall_trace_enter(regs)) goto out; - } - syscall = get_syscall(r); + /* Update the syscall number after orig_ax has potentially been updated + * with ptrace. + */ + UPT_SYSCALL_NR(r) = PT_SYSCALL_NR(r->gp); + syscall = UPT_SYSCALL_NR(r); - if ((syscall > __NR_syscall_max) || syscall < 0) - result = -ENOSYS; - else - result = EXECUTE_SYSCALL(syscall, regs); + if (syscall >= 0 && syscall <= __NR_syscall_max) + PT_REGS_SET_SYSCALL_RETURN(regs, + EXECUTE_SYSCALL(syscall, regs)); out: - PT_REGS_SET_SYSCALL_RETURN(regs, result); - syscall_trace_leave(regs); } diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index b856c66ebd3a..23025d645160 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -172,13 +172,6 @@ static void handle_trap(int pid, struct uml_pt_regs *regs, handle_syscall(regs); } -int get_syscall(struct uml_pt_regs *regs) -{ - UPT_SYSCALL_NR(regs) = PT_SYSCALL_NR(regs->gp); - - return UPT_SYSCALL_NR(regs); -} - extern char __syscall_stub_start[]; static int userspace_tramp(void *stack) diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index a29756f2d940..47c78d5e5c32 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -68,6 +68,7 @@ static const int reg_offsets[] = { [EFL] = HOST_EFLAGS, [UESP] = HOST_SP, [SS] = HOST_SS, + [ORIG_EAX] = HOST_ORIG_AX, }; int putreg(struct task_struct *child, int regno, unsigned long value) @@ -83,6 +84,7 @@ int putreg(struct task_struct *child, int regno, unsigned long value) case EAX: case EIP: case UESP: + case ORIG_EAX: break; case FS: if (value && (value & 3) != 3) @@ -108,9 +110,6 @@ int putreg(struct task_struct *child, int regno, unsigned long value) value &= FLAG_MASK; child->thread.regs.regs.gp[HOST_EFLAGS] |= value; return 0; - case ORIG_EAX: - child->thread.regs.regs.syscall = value; - return 0; default : panic("Bad register in putreg() : %d\n", regno); } @@ -143,8 +142,6 @@ unsigned long getreg(struct task_struct *child, int regno) regno >>= 2; switch (regno) { - case ORIG_EAX: - return child->thread.regs.regs.syscall; case FS: case GS: case DS: @@ -163,6 +160,7 @@ unsigned long getreg(struct task_struct *child, int regno) case EDI: case EBP: case EFL: + case ORIG_EAX: break; default: panic("Bad register in getreg() : %d\n", regno); From 4a0b88070406323487bad730d8945f482151a145 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Tue, 29 Dec 2015 21:35:45 +0100 Subject: [PATCH 08/13] selftests/seccomp: Remove the need for HAVE_ARCH_TRACEHOOK MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some architectures do not implement PTRACE_GETREGSET nor PTRACE_SETREGSET (required by HAVE_ARCH_TRACEHOOK) but only implement PTRACE_GETREGS and PTRACE_SETREGS (e.g. User-mode Linux). This improve seccomp selftest portability for architectures without HAVE_ARCH_TRACEHOOK support by defining a new trigger HAVE_GETREGS. For now, this is only enabled for i386 and x86_64 architectures. This is required to be able to run this tests on User-mode Linux. Signed-off-by: Mickaël Salaün Cc: Jeff Dike Cc: Richard Weinberger Cc: Kees Cook Cc: Andy Lutomirski Cc: Will Drewry Cc: Shuah Khan Cc: Meredydd Luff Cc: David Drysdale Signed-off-by: Richard Weinberger Acked-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 27 ++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 882fe83a3554..b9453b838162 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1246,11 +1246,24 @@ TEST_F(TRACE_poke, getpid_runs_normally) # error "Do not know how to find your architecture's registers and syscalls" #endif +/* Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for + * architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux). + */ +#if defined(__x86_64__) || defined(__i386__) +#define HAVE_GETREGS +#endif + /* Architecture-specific syscall fetching routine. */ int get_syscall(struct __test_metadata *_metadata, pid_t tracee) { - struct iovec iov; ARCH_REGS regs; +#ifdef HAVE_GETREGS + EXPECT_EQ(0, ptrace(PTRACE_GETREGS, tracee, 0, ®s)) { + TH_LOG("PTRACE_GETREGS failed"); + return -1; + } +#else + struct iovec iov; iov.iov_base = ®s; iov.iov_len = sizeof(regs); @@ -1258,6 +1271,7 @@ int get_syscall(struct __test_metadata *_metadata, pid_t tracee) TH_LOG("PTRACE_GETREGSET failed"); return -1; } +#endif return regs.SYSCALL_NUM; } @@ -1266,13 +1280,16 @@ int get_syscall(struct __test_metadata *_metadata, pid_t tracee) void change_syscall(struct __test_metadata *_metadata, pid_t tracee, int syscall) { - struct iovec iov; int ret; ARCH_REGS regs; - +#ifdef HAVE_GETREGS + ret = ptrace(PTRACE_GETREGS, tracee, 0, ®s); +#else + struct iovec iov; iov.iov_base = ®s; iov.iov_len = sizeof(regs); ret = ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov); +#endif EXPECT_EQ(0, ret); #if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \ @@ -1312,9 +1329,13 @@ void change_syscall(struct __test_metadata *_metadata, if (syscall == -1) regs.SYSCALL_RET = 1; +#ifdef HAVE_GETREGS + ret = ptrace(PTRACE_SETREGS, tracee, 0, ®s); +#else iov.iov_base = ®s; iov.iov_len = sizeof(regs); ret = ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &iov); +#endif EXPECT_EQ(0, ret); } From d8f8b8445648c267a24f30a72533e77cb6543f21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Tue, 29 Dec 2015 21:35:46 +0100 Subject: [PATCH 09/13] um: Add full asm/syscall.h support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add subarchitecture-independent implementation of asm-generic/syscall.h allowing access to user system call parameters and results: * syscall_get_nr() * syscall_rollback() * syscall_get_error() * syscall_get_return_value() * syscall_set_return_value() * syscall_get_arguments() * syscall_set_arguments() * syscall_get_arch() provided by arch/x86/um/asm/syscall.h This provides the necessary syscall helpers needed by HAVE_ARCH_SECCOMP_FILTER plus syscall_get_error(). This is inspired from Meredydd Luff's patch (https://gerrit.chromium.org/gerrit/21425). Signed-off-by: Mickaël Salaün Cc: Jeff Dike Cc: Richard Weinberger Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Kees Cook Cc: Andy Lutomirski Cc: Will Drewry Cc: Meredydd Luff Cc: David Drysdale Signed-off-by: Richard Weinberger Acked-by: Kees Cook --- arch/um/include/asm/syscall-generic.h | 138 ++++++++++++++++++++++++++ arch/x86/um/asm/syscall.h | 1 + 2 files changed, 139 insertions(+) create mode 100644 arch/um/include/asm/syscall-generic.h diff --git a/arch/um/include/asm/syscall-generic.h b/arch/um/include/asm/syscall-generic.h new file mode 100644 index 000000000000..9fb9cf8cd39a --- /dev/null +++ b/arch/um/include/asm/syscall-generic.h @@ -0,0 +1,138 @@ +/* + * Access to user system call parameters and results + * + * See asm-generic/syscall.h for function descriptions. + * + * Copyright (C) 2015 Mickaël Salaün + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __UM_SYSCALL_GENERIC_H +#define __UM_SYSCALL_GENERIC_H + +#include +#include +#include +#include + +static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) +{ + + return PT_REGS_SYSCALL_NR(regs); +} + +static inline void syscall_rollback(struct task_struct *task, + struct pt_regs *regs) +{ + /* do nothing */ +} + +static inline long syscall_get_error(struct task_struct *task, + struct pt_regs *regs) +{ + const long error = regs_return_value(regs); + + return IS_ERR_VALUE(error) ? error : 0; +} + +static inline long syscall_get_return_value(struct task_struct *task, + struct pt_regs *regs) +{ + return regs_return_value(regs); +} + +static inline void syscall_set_return_value(struct task_struct *task, + struct pt_regs *regs, + int error, long val) +{ + PT_REGS_SET_SYSCALL_RETURN(regs, (long) error ?: val); +} + +static inline void syscall_get_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + unsigned long *args) +{ + const struct uml_pt_regs *r = ®s->regs; + + switch (i) { + case 0: + if (!n--) + break; + *args++ = UPT_SYSCALL_ARG1(r); + case 1: + if (!n--) + break; + *args++ = UPT_SYSCALL_ARG2(r); + case 2: + if (!n--) + break; + *args++ = UPT_SYSCALL_ARG3(r); + case 3: + if (!n--) + break; + *args++ = UPT_SYSCALL_ARG4(r); + case 4: + if (!n--) + break; + *args++ = UPT_SYSCALL_ARG5(r); + case 5: + if (!n--) + break; + *args++ = UPT_SYSCALL_ARG6(r); + case 6: + if (!n--) + break; + default: + BUG(); + break; + } +} + +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + const unsigned long *args) +{ + struct uml_pt_regs *r = ®s->regs; + + switch (i) { + case 0: + if (!n--) + break; + UPT_SYSCALL_ARG1(r) = *args++; + case 1: + if (!n--) + break; + UPT_SYSCALL_ARG2(r) = *args++; + case 2: + if (!n--) + break; + UPT_SYSCALL_ARG3(r) = *args++; + case 3: + if (!n--) + break; + UPT_SYSCALL_ARG4(r) = *args++; + case 4: + if (!n--) + break; + UPT_SYSCALL_ARG5(r) = *args++; + case 5: + if (!n--) + break; + UPT_SYSCALL_ARG6(r) = *args++; + case 6: + if (!n--) + break; + default: + BUG(); + break; + } +} + +/* See arch/x86/um/asm/syscall.h for syscall_get_arch() definition. */ + +#endif /* __UM_SYSCALL_GENERIC_H */ diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h index 81d6562ce01d..11ab90dc5f14 100644 --- a/arch/x86/um/asm/syscall.h +++ b/arch/x86/um/asm/syscall.h @@ -1,6 +1,7 @@ #ifndef __UM_ASM_SYSCALL_H #define __UM_ASM_SYSCALL_H +#include #include typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, From c50b4659e444b020657e01bdf769c965e5597cb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Tue, 29 Dec 2015 21:35:47 +0100 Subject: [PATCH 10/13] um: Add seccomp support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This brings SECCOMP_MODE_STRICT and SECCOMP_MODE_FILTER support through prctl(2) and seccomp(2) to User-mode Linux for i386 and x86_64 subarchitectures. secure_computing() is called first in handle_syscall() so that the syscall emulation will be aborted quickly if matching a seccomp rule. This is inspired from Meredydd Luff's patch (https://gerrit.chromium.org/gerrit/21425). Signed-off-by: Mickaël Salaün Cc: Jeff Dike Cc: Richard Weinberger Cc: Ingo Molnar Cc: Kees Cook Cc: Andy Lutomirski Cc: Will Drewry Cc: Chris Metcalf Cc: Michael Ellerman Cc: James Hogan Cc: Meredydd Luff Cc: David Drysdale Signed-off-by: Richard Weinberger Acked-by: Kees Cook --- .../seccomp/seccomp-filter/arch-support.txt | 2 +- arch/um/Kconfig.common | 1 + arch/um/Kconfig.um | 16 ++++++++++++++++ arch/um/include/asm/thread_info.h | 2 ++ arch/um/kernel/skas/syscall.c | 5 +++++ 5 files changed, 25 insertions(+), 1 deletion(-) diff --git a/Documentation/features/seccomp/seccomp-filter/arch-support.txt b/Documentation/features/seccomp/seccomp-filter/arch-support.txt index 76d39d66a5d7..4f66ec133951 100644 --- a/Documentation/features/seccomp/seccomp-filter/arch-support.txt +++ b/Documentation/features/seccomp/seccomp-filter/arch-support.txt @@ -33,7 +33,7 @@ | sh: | TODO | | sparc: | TODO | | tile: | ok | - | um: | TODO | + | um: | ok | | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common index d195a87ca542..cc0013475444 100644 --- a/arch/um/Kconfig.common +++ b/arch/um/Kconfig.common @@ -2,6 +2,7 @@ config UML bool default y select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_SECCOMP_FILTER select HAVE_UID16 select HAVE_FUTEX_CMPXCHG if FUTEX select GENERIC_IRQ_SHOW diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um index 28a9885e3a37..4b2ed5858b2e 100644 --- a/arch/um/Kconfig.um +++ b/arch/um/Kconfig.um @@ -104,3 +104,19 @@ config PGTABLE_LEVELS int default 3 if 3_LEVEL_PGTABLES default 2 + +config SECCOMP + def_bool y + prompt "Enable seccomp to safely compute untrusted bytecode" + ---help--- + This kernel feature is useful for number crunching applications + that may need to compute untrusted bytecode during their + execution. By using pipes or other transports made available to + the process as file descriptors supporting the read/write + syscalls, it's possible to isolate those applications in + their own address space using seccomp. Once seccomp is + enabled via prctl(PR_SET_SECCOMP), it cannot be disabled + and the task is only allowed to execute a few safe syscalls + defined by each seccomp mode. + + If unsure, say Y. diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index 53968aaf76f9..053baff03674 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -62,11 +62,13 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_AUDIT 6 #define TIF_RESTORE_SIGMASK 7 #define TIF_NOTIFY_RESUME 8 +#define TIF_SECCOMP 9 /* secure computing */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_MEMDIE (1 << TIF_MEMDIE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) +#define _TIF_SECCOMP (1 << TIF_SECCOMP) #endif diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index 6cadce761bcf..48b0dcbd87be 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -19,6 +20,10 @@ void handle_syscall(struct uml_pt_regs *r) UPT_SYSCALL_NR(r) = PT_SYSCALL_NR(r->gp); PT_REGS_SET_SYSCALL_RETURN(regs, -ENOSYS); + /* Do the secure computing check first; failures should be fast. */ + if (secure_computing() == -1) + return; + if (syscall_trace_enter(regs)) goto out; From 42d91f612c879627c925d3779c36877cd440f9f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 24 Dec 2015 13:12:11 +0100 Subject: [PATCH 11/13] um: Fix build error and kconfig for i386 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix build error by generating elfcore.o only when ELF_CORE (depending on COREDUMP) is selected: arch/x86/um/built-in.o: In function `elf_core_write_extra_phdrs': (.text+0x3e62): undefined reference to `dump_emit' arch/x86/um/built-in.o: In function `elf_core_write_extra_data': (.text+0x3eef): undefined reference to `dump_emit' Fixes: 5d2acfc7b974 ("kconfig: make allnoconfig disable options behind EMBEDDED and EXPERT") Signed-off-by: Mickaël Salaün Cc: Jeff Dike Cc: Richard Weinberger Cc: Josh Triplett Cc: Paul E. McKenney Cc: Michal Marek Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Richard Weinberger Reviewed-by: Josh Triplett --- arch/x86/um/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index a8fecc226946..3ee2bb6b440b 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -17,7 +17,7 @@ obj-y = bug.o bugs_$(BITS).o delay.o fault.o ksyms.o ldt.o \ ifeq ($(CONFIG_X86_32),y) obj-y += checksum_32.o -obj-$(CONFIG_BINFMT_ELF) += elfcore.o +obj-$(CONFIG_ELF_CORE) += elfcore.o subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o From 571d2f0c341fa15dbbb4fb73c80bd740ef37a9f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Tue, 22 Dec 2015 22:15:09 +0100 Subject: [PATCH 12/13] um: Do not set unsecure permission for temporary file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the insecure 0777 mode for temporary file to prohibit other users to change the executable mapped code. An attacker could gain access to the mapped file descriptor from the temporary file (before it is unlinked) in a read-only mode but it should not be accessible in write mode to avoid arbitrary code execution. To not change the hostfs behavior, the temporary file creation permission now depends on the current umask(2) and the implementation of mkstemp(3). Signed-off-by: Mickaël Salaün Cc: Jeff Dike Cc: Richard Weinberger Acked-by: Tristan Schmelcher Signed-off-by: Richard Weinberger --- arch/um/os-Linux/mem.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c index 897e9ad0c108..840d573f7e38 100644 --- a/arch/um/os-Linux/mem.c +++ b/arch/um/os-Linux/mem.c @@ -142,12 +142,6 @@ static int __init create_tmp_file(unsigned long long len) if (fd < 0) exit(1); - err = fchmod(fd, 0777); - if (err < 0) { - perror("fchmod"); - exit(1); - } - /* * Seek to len - 1 because writing a character there will * increase the file size by one byte, to the desired length. From 3e46b25376321db119bc8507ce8c8841c580e736 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Tue, 22 Dec 2015 22:15:10 +0100 Subject: [PATCH 13/13] um: Use race-free temporary file creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Open the memory mapped file with the O_TMPFILE flag when available. Signed-off-by: Mickaël Salaün Cc: Jeff Dike Cc: Richard Weinberger Acked-by: Tristan Schmelcher Signed-off-by: Richard Weinberger --- arch/um/os-Linux/mem.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c index 840d573f7e38..8b1767668515 100644 --- a/arch/um/os-Linux/mem.c +++ b/arch/um/os-Linux/mem.c @@ -106,6 +106,17 @@ static int __init make_tempfile(const char *template) } } +#ifdef O_TMPFILE + fd = open(tempdir, O_CLOEXEC | O_RDWR | O_EXCL | O_TMPFILE, 0700); + /* + * If the running system does not support O_TMPFILE flag then retry + * without it. + */ + if (fd != -1 || (errno != EINVAL && errno != EISDIR && + errno != EOPNOTSUPP)) + return fd; +#endif + tempname = malloc(strlen(tempdir) + strlen(template) + 1); if (tempname == NULL) return -1;