From 157cc18122b4a1456d19048e151a164216c4a704 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 22 Apr 2022 09:48:54 -0500 Subject: [PATCH 01/12] signal: Rename send_signal send_signal_locked Rename send_signal and __send_signal to send_signal_locked and __send_signal_locked to make send_signal usable outside of signal.c. Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-1-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/signal.h | 2 ++ kernel/signal.c | 24 ++++++++++++------------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/include/linux/signal.h b/include/linux/signal.h index a6db6f2ae113..55605bdf5ce9 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -283,6 +283,8 @@ extern int do_send_sig_info(int sig, struct kernel_siginfo *info, extern int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *); +extern int send_signal_locked(int sig, struct kernel_siginfo *info, + struct task_struct *p, enum pid_type type); extern int sigprocmask(int, sigset_t *, sigset_t *); extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); diff --git a/kernel/signal.c b/kernel/signal.c index 30cd1ca43bcd..b0403197b0ad 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1071,8 +1071,8 @@ static inline bool legacy_queue(struct sigpending *signals, int sig) return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } -static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, - enum pid_type type, bool force) +static int __send_signal_locked(int sig, struct kernel_siginfo *info, + struct task_struct *t, enum pid_type type, bool force) { struct sigpending *pending; struct sigqueue *q; @@ -1212,8 +1212,8 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) return ret; } -static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, - enum pid_type type) +int send_signal_locked(int sig, struct kernel_siginfo *info, + struct task_struct *t, enum pid_type type) { /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */ bool force = false; @@ -1245,7 +1245,7 @@ static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct force = true; } } - return __send_signal(sig, info, t, type, force); + return __send_signal_locked(sig, info, t, type, force); } static void print_fatal_signal(int signr) @@ -1284,7 +1284,7 @@ __setup("print-fatal-signals=", setup_print_fatal_signals); int __group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p) { - return send_signal(sig, info, p, PIDTYPE_TGID); + return send_signal_locked(sig, info, p, PIDTYPE_TGID); } int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, @@ -1294,7 +1294,7 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p int ret = -ESRCH; if (lock_task_sighand(p, &flags)) { - ret = send_signal(sig, info, p, type); + ret = send_signal_locked(sig, info, p, type); unlock_task_sighand(p, &flags); } @@ -1347,7 +1347,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, if (action->sa.sa_handler == SIG_DFL && (!t->ptrace || (handler == HANDLER_EXIT))) t->signal->flags &= ~SIGNAL_UNKILLABLE; - ret = send_signal(sig, info, t, PIDTYPE_PID); + ret = send_signal_locked(sig, info, t, PIDTYPE_PID); spin_unlock_irqrestore(&t->sighand->siglock, flags); return ret; @@ -1567,7 +1567,7 @@ int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, if (sig) { if (lock_task_sighand(p, &flags)) { - ret = __send_signal(sig, &info, p, PIDTYPE_TGID, false); + ret = __send_signal_locked(sig, &info, p, PIDTYPE_TGID, false); unlock_task_sighand(p, &flags); } else ret = -ESRCH; @@ -2103,7 +2103,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) * parent's namespaces. */ if (valid_signal(sig) && sig) - __send_signal(sig, &info, tsk->parent, PIDTYPE_TGID, false); + __send_signal_locked(sig, &info, tsk->parent, PIDTYPE_TGID, false); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); @@ -2601,7 +2601,7 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) /* If the (new) signal is now blocked, requeue it. */ if (sigismember(¤t->blocked, signr) || fatal_signal_pending(current)) { - send_signal(signr, info, current, type); + send_signal_locked(signr, info, current, type); signr = 0; } @@ -4793,7 +4793,7 @@ void kdb_send_sig(struct task_struct *t, int sig) "the deadlock.\n"); return; } - ret = send_signal(sig, SEND_SIG_PRIV, t, PIDTYPE_PID); + ret = send_signal_locked(sig, SEND_SIG_PRIV, t, PIDTYPE_PID); spin_unlock(&t->sighand->siglock); if (ret) kdb_printf("Fail to deliver Signal %d to process %d.\n", From e71ba124078e391879e0bf111529fa2d630d106c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 22 Apr 2022 09:28:50 -0500 Subject: [PATCH 02/12] signal: Replace __group_send_sig_info with send_signal_locked The function __group_send_sig_info is just a light wrapper around send_signal_locked with one parameter fixed to a constant value. As the wrapper adds no real value update the code to directly call the wrapped function. Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-2-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- drivers/tty/tty_jobctrl.c | 4 ++-- include/linux/signal.h | 1 - kernel/signal.c | 8 +------- kernel/time/posix-cpu-timers.c | 6 +++--- 4 files changed, 6 insertions(+), 13 deletions(-) diff --git a/drivers/tty/tty_jobctrl.c b/drivers/tty/tty_jobctrl.c index 80b86a7992b5..0d04287da098 100644 --- a/drivers/tty/tty_jobctrl.c +++ b/drivers/tty/tty_jobctrl.c @@ -215,8 +215,8 @@ int tty_signal_session_leader(struct tty_struct *tty, int exit_session) spin_unlock_irq(&p->sighand->siglock); continue; } - __group_send_sig_info(SIGHUP, SEND_SIG_PRIV, p); - __group_send_sig_info(SIGCONT, SEND_SIG_PRIV, p); + send_signal_locked(SIGHUP, SEND_SIG_PRIV, p, PIDTYPE_TGID); + send_signal_locked(SIGCONT, SEND_SIG_PRIV, p, PIDTYPE_TGID); put_pid(p->signal->tty_old_pgrp); /* A noop */ spin_lock(&tty->ctrl.lock); tty_pgrp = get_pid(tty->ctrl.pgrp); diff --git a/include/linux/signal.h b/include/linux/signal.h index 55605bdf5ce9..3b98e7a28538 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -282,7 +282,6 @@ extern int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); -extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern int send_signal_locked(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int sigprocmask(int, sigset_t *, sigset_t *); diff --git a/kernel/signal.c b/kernel/signal.c index b0403197b0ad..72d96614effc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1281,12 +1281,6 @@ static int __init setup_print_fatal_signals(char *str) __setup("print-fatal-signals=", setup_print_fatal_signals); -int -__group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p) -{ - return send_signal_locked(sig, info, p, PIDTYPE_TGID); -} - int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) { @@ -2173,7 +2167,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, spin_lock_irqsave(&sighand->siglock, flags); if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN && !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) - __group_send_sig_info(SIGCHLD, &info, parent); + send_signal_locked(SIGCHLD, &info, parent, PIDTYPE_TGID); /* * Even if SIGCHLD is not generated, we must wake up wait4 calls. */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0a97193984db..cb925e8ef9a8 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -870,7 +870,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) { if (tsk->dl.dl_overrun) { tsk->dl.dl_overrun = 0; - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + send_signal_locked(SIGXCPU, SEND_SIG_PRIV, tsk, PIDTYPE_TGID); } } @@ -884,7 +884,7 @@ static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) rt ? "RT" : "CPU", hard ? "hard" : "soft", current->comm, task_pid_nr(current)); } - __group_send_sig_info(signo, SEND_SIG_PRIV, current); + send_signal_locked(signo, SEND_SIG_PRIV, current, PIDTYPE_TGID); return true; } @@ -958,7 +958,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, trace_itimer_expire(signo == SIGPROF ? ITIMER_PROF : ITIMER_VIRTUAL, task_tgid(tsk), cur_time); - __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); + send_signal_locked(signo, SEND_SIG_PRIV, tsk, PIDTYPE_TGID); } if (it->expires && it->expires < *expires) From c200e4bb44e80b343c09841e7caaaca0aac5e5fa Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Apr 2022 16:30:17 -0500 Subject: [PATCH 03/12] ptrace/um: Replace PT_DTRACE with TIF_SINGLESTEP User mode linux is the last user of the PT_DTRACE flag. Using the flag to indicate single stepping is a little confusing and worse changing tsk->ptrace without locking could potentionally cause problems. So use a thread info flag with a better name instead of flag in tsk->ptrace. Remove the definition PT_DTRACE as uml is the last user. Cc: stable@vger.kernel.org Acked-by: Johannes Berg Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-3-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- arch/um/include/asm/thread_info.h | 2 ++ arch/um/kernel/exec.c | 2 +- arch/um/kernel/process.c | 2 +- arch/um/kernel/ptrace.c | 8 ++++---- arch/um/kernel/signal.c | 4 ++-- include/linux/ptrace.h | 1 - 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index 1395cbd7e340..c7b4b49826a2 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -60,6 +60,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_RESTORE_SIGMASK 7 #define TIF_NOTIFY_RESUME 8 #define TIF_SECCOMP 9 /* secure computing */ +#define TIF_SINGLESTEP 10 /* single stepping userspace */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) @@ -68,5 +69,6 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_MEMDIE (1 << TIF_MEMDIE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #endif diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index c85e40c72779..58938d75871a 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -43,7 +43,7 @@ void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp) { PT_REGS_IP(regs) = eip; PT_REGS_SP(regs) = esp; - current->ptrace &= ~PT_DTRACE; + clear_thread_flag(TIF_SINGLESTEP); #ifdef SUBARCH_EXECVE1 SUBARCH_EXECVE1(regs->regs); #endif diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 80504680be08..88c5c7844281 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -335,7 +335,7 @@ int singlestepping(void * t) { struct task_struct *task = t ? t : current; - if (!(task->ptrace & PT_DTRACE)) + if (!test_thread_flag(TIF_SINGLESTEP)) return 0; if (task->thread.singlestep_syscall) diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index bfaf6ab1ac03..5154b27de580 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -11,7 +11,7 @@ void user_enable_single_step(struct task_struct *child) { - child->ptrace |= PT_DTRACE; + set_tsk_thread_flag(child, TIF_SINGLESTEP); child->thread.singlestep_syscall = 0; #ifdef SUBARCH_SET_SINGLESTEPPING @@ -21,7 +21,7 @@ void user_enable_single_step(struct task_struct *child) void user_disable_single_step(struct task_struct *child) { - child->ptrace &= ~PT_DTRACE; + clear_tsk_thread_flag(child, TIF_SINGLESTEP); child->thread.singlestep_syscall = 0; #ifdef SUBARCH_SET_SINGLESTEPPING @@ -120,7 +120,7 @@ static void send_sigtrap(struct uml_pt_regs *regs, int error_code) } /* - * XXX Check PT_DTRACE vs TIF_SINGLESTEP for singlestepping check and + * XXX Check TIF_SINGLESTEP for singlestepping check and * PT_PTRACED vs TIF_SYSCALL_TRACE for syscall tracing check */ int syscall_trace_enter(struct pt_regs *regs) @@ -144,7 +144,7 @@ void syscall_trace_leave(struct pt_regs *regs) audit_syscall_exit(regs); /* Fake a debug trap */ - if (ptraced & PT_DTRACE) + if (test_thread_flag(TIF_SINGLESTEP)) send_sigtrap(®s->regs, 0); if (!test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index 88cd9b5c1b74..ae4658f576ab 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -53,7 +53,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) unsigned long sp; int err; - if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) + if (test_thread_flag(TIF_SINGLESTEP) && (current->ptrace & PT_PTRACED)) singlestep = 1; /* Did we come from a system call? */ @@ -128,7 +128,7 @@ void do_signal(struct pt_regs *regs) * on the host. The tracing thread will check this flag and * PTRACE_SYSCALL if necessary. */ - if (current->ptrace & PT_DTRACE) + if (test_thread_flag(TIF_SINGLESTEP)) current->thread.singlestep_syscall = is_syscall(PT_REGS_IP(¤t->thread.regs)); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 15b3d176b6b4..4c06f9f8ef3f 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -30,7 +30,6 @@ extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, #define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */ #define PT_PTRACED 0x00000001 -#define PT_DTRACE 0x00000002 /* delayed trace (used on m68k, i386) */ #define PT_OPT_FLAG_SHIFT 3 /* PT_TRACE_* event enable flags */ From 4a3d2717d140401df7501a95e454180831a0c5af Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Apr 2022 16:45:37 -0500 Subject: [PATCH 04/12] ptrace/xtensa: Replace PT_SINGLESTEP with TIF_SINGLESTEP xtensa is the last user of the PT_SINGLESTEP flag. Changing tsk->ptrace in user_enable_single_step and user_disable_single_step without locking could potentiallly cause problems. So use a thread info flag instead of a flag in tsk->ptrace. Use TIF_SINGLESTEP that xtensa already had defined but unused. Remove the definitions of PT_SINGLESTEP and PT_BLOCKSTEP as they have no more users. Cc: stable@vger.kernel.org Acked-by: Max Filippov Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-4-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- arch/xtensa/kernel/ptrace.c | 4 ++-- arch/xtensa/kernel/signal.c | 4 ++-- include/linux/ptrace.h | 6 ------ 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c index 323c678a691f..b952e67cc0cc 100644 --- a/arch/xtensa/kernel/ptrace.c +++ b/arch/xtensa/kernel/ptrace.c @@ -225,12 +225,12 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) void user_enable_single_step(struct task_struct *child) { - child->ptrace |= PT_SINGLESTEP; + set_tsk_thread_flag(child, TIF_SINGLESTEP); } void user_disable_single_step(struct task_struct *child) { - child->ptrace &= ~PT_SINGLESTEP; + clear_tsk_thread_flag(child, TIF_SINGLESTEP); } /* diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index 6f68649e86ba..ac50ec46c8f1 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -473,7 +473,7 @@ static void do_signal(struct pt_regs *regs) /* Set up the stack frame */ ret = setup_frame(&ksig, sigmask_to_save(), regs); signal_setup_done(ret, &ksig, 0); - if (current->ptrace & PT_SINGLESTEP) + if (test_thread_flag(TIF_SINGLESTEP)) task_pt_regs(current)->icountlevel = 1; return; @@ -499,7 +499,7 @@ static void do_signal(struct pt_regs *regs) /* If there's no signal to deliver, we just restore the saved mask. */ restore_saved_sigmask(); - if (current->ptrace & PT_SINGLESTEP) + if (test_thread_flag(TIF_SINGLESTEP)) task_pt_regs(current)->icountlevel = 1; return; } diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 4c06f9f8ef3f..c952c5ba8fab 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -46,12 +46,6 @@ extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, #define PT_EXITKILL (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT) #define PT_SUSPEND_SECCOMP (PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT) -/* single stepping state bits (used on ARM and PA-RISC) */ -#define PT_SINGLESTEP_BIT 31 -#define PT_SINGLESTEP (1< Date: Fri, 29 Apr 2022 11:09:11 -0500 Subject: [PATCH 05/12] ptrace: Remove arch_ptrace_attach The last remaining implementation of arch_ptrace_attach is ia64's ptrace_attach_sync_user_rbs which was added at the end of 2007 in commit aa91a2e90044 ("[IA64] Synchronize RBS on PTRACE_ATTACH"). Reading the comments and examining the code ptrace_attach_sync_user_rbs has the sole purpose of saving registers to the stack when ptrace_attach changes TASK_STOPPED to TASK_TRACED. In all other cases arch_ptrace_stop takes care of the register saving. In commit d79fdd6d96f4 ("ptrace: Clean transitions between TASK_STOPPED and TRACED") modified ptrace_attach to wake up the thread and enter ptrace_stop normally even when the thread starts out stopped. This makes ptrace_attach_sync_user_rbs completely unnecessary. So just remove it. I read through the code to verify that ptrace_attach_sync_user_rbs is unnecessary. What I found is that the code is quite dead. Reading ptrace_attach_sync_user_rbs it is easy to see that the it does nothing unless __state == TASK_STOPPED. Calling arch_ptrace_attach (aka ptrace_attach_sync_user_rbs) after ptrace_traceme it is easy to see that because we are talking about the current process the value of __state is TASK_RUNNING. Which means ptrace_attach_sync_user_rbs does nothing. The only other call of arch_ptrace_attach (aka ptrace_attach_sync_user_rbs) is after ptrace_attach. If the task is running (and PTRACE_SEIZE is not specified), a SIGSTOP is sent which results in do_signal_stop setting JOBCTL_TRAP_STOP on the target task (as it is ptraced) and the target task stopping in ptrace_stop with __state == TASK_TRACED. If the task was already stopped then ptrace_attach sets JOBCTL_TRAPPING and JOBCTL_TRAP_STOP, wakes it out of __TASK_STOPPED, and waits until the JOBCTL_TRAPPING_BIT is clear. At which point the task stops in ptrace_stop. In both cases there are a couple of funning excpetions such as if the traced task receiveds a SIGCONT, or is set a fatal signal. However in all of those cases the tracee never stops in __state TASK_STOPPED. Which is a long way of saying that ptrace_attach_sync_user_rbs is guaranteed never to do anything. Cc: linux-ia64@vger.kernel.org Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-4-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- arch/ia64/include/asm/ptrace.h | 4 --- arch/ia64/kernel/ptrace.c | 57 ---------------------------------- kernel/ptrace.c | 18 ----------- 3 files changed, 79 deletions(-) diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h index a10a498eede1..402874489890 100644 --- a/arch/ia64/include/asm/ptrace.h +++ b/arch/ia64/include/asm/ptrace.h @@ -139,10 +139,6 @@ static inline long regs_return_value(struct pt_regs *regs) #define arch_ptrace_stop_needed() \ (!test_thread_flag(TIF_RESTORE_RSE)) - extern void ptrace_attach_sync_user_rbs (struct task_struct *); - #define arch_ptrace_attach(child) \ - ptrace_attach_sync_user_rbs(child) - #define arch_has_single_step() (1) #define arch_has_block_step() (1) diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index a19acd9f5e1f..a45f529046c3 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -617,63 +617,6 @@ void ia64_sync_krbs(void) unw_init_running(do_sync_rbs, ia64_sync_kernel_rbs); } -/* - * After PTRACE_ATTACH, a thread's register backing store area in user - * space is assumed to contain correct data whenever the thread is - * stopped. arch_ptrace_stop takes care of this on tracing stops. - * But if the child was already stopped for job control when we attach - * to it, then it might not ever get into ptrace_stop by the time we - * want to examine the user memory containing the RBS. - */ -void -ptrace_attach_sync_user_rbs (struct task_struct *child) -{ - int stopped = 0; - struct unw_frame_info info; - - /* - * If the child is in TASK_STOPPED, we need to change that to - * TASK_TRACED momentarily while we operate on it. This ensures - * that the child won't be woken up and return to user mode while - * we are doing the sync. (It can only be woken up for SIGKILL.) - */ - - read_lock(&tasklist_lock); - if (child->sighand) { - spin_lock_irq(&child->sighand->siglock); - if (READ_ONCE(child->__state) == TASK_STOPPED && - !test_and_set_tsk_thread_flag(child, TIF_RESTORE_RSE)) { - set_notify_resume(child); - - WRITE_ONCE(child->__state, TASK_TRACED); - stopped = 1; - } - spin_unlock_irq(&child->sighand->siglock); - } - read_unlock(&tasklist_lock); - - if (!stopped) - return; - - unw_init_from_blocked_task(&info, child); - do_sync_rbs(&info, ia64_sync_user_rbs); - - /* - * Now move the child back into TASK_STOPPED if it should be in a - * job control stop, so that SIGCONT can be used to wake it up. - */ - read_lock(&tasklist_lock); - if (child->sighand) { - spin_lock_irq(&child->sighand->siglock); - if (READ_ONCE(child->__state) == TASK_TRACED && - (child->signal->flags & SIGNAL_STOP_STOPPED)) { - WRITE_ONCE(child->__state, TASK_STOPPED); - } - spin_unlock_irq(&child->sighand->siglock); - } - read_unlock(&tasklist_lock); -} - /* * Write f32-f127 back to task->thread.fph if it has been modified. */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ccc4b465775b..da30dcd477a0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1285,10 +1285,6 @@ int ptrace_request(struct task_struct *child, long request, return ret; } -#ifndef arch_ptrace_attach -#define arch_ptrace_attach(child) do { } while (0) -#endif - SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, unsigned long, data) { @@ -1297,8 +1293,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, if (request == PTRACE_TRACEME) { ret = ptrace_traceme(); - if (!ret) - arch_ptrace_attach(current); goto out; } @@ -1310,12 +1304,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { ret = ptrace_attach(child, request, addr, data); - /* - * Some architectures need to do book-keeping after - * a ptrace attach. - */ - if (!ret) - arch_ptrace_attach(child); goto out_put_task_struct; } @@ -1455,12 +1443,6 @@ COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid, if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { ret = ptrace_attach(child, request, addr, data); - /* - * Some architectures need to do book-keeping after - * a ptrace attach. - */ - if (!ret) - arch_ptrace_attach(child); goto out_put_task_struct; } From cb3c19c93d656caa6fe63d6277aabd7e570f1d03 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 29 Apr 2022 09:16:10 -0500 Subject: [PATCH 06/12] signal: Use lockdep_assert_held instead of assert_spin_locked The distinction is that assert_spin_locked() checks if the lock is held *by*anyone* whereas lockdep_assert_held() asserts the current context holds the lock. Also, the check goes away if you build without lockdep. Suggested-by: Peter Zijlstra Link: https://lkml.kernel.org/r/Ympr/+PX4XgT/UKU@hirez.programming.kicks-ass.net Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-6-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index 72d96614effc..3fd2ce133387 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -884,7 +884,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, static void ptrace_trap_notify(struct task_struct *t) { WARN_ON_ONCE(!(t->ptrace & PT_SEIZED)); - assert_spin_locked(&t->sighand->siglock); + lockdep_assert_held(&t->sighand->siglock); task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); @@ -1079,7 +1079,7 @@ static int __send_signal_locked(int sig, struct kernel_siginfo *info, int override_rlimit; int ret = 0, result; - assert_spin_locked(&t->sighand->siglock); + lockdep_assert_held(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, force)) From 6a2d90ba027adba528509ffa27097cffd3879257 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 29 Apr 2022 09:23:55 -0500 Subject: [PATCH 07/12] ptrace: Reimplement PTRACE_KILL by always sending SIGKILL The current implementation of PTRACE_KILL is buggy and has been for many years as it assumes it's target has stopped in ptrace_stop. At a quick skim it looks like this assumption has existed since ptrace support was added in linux v1.0. While PTRACE_KILL has been deprecated we can not remove it as a quick search with google code search reveals many existing programs calling it. When the ptracee is not stopped at ptrace_stop some fields would be set that are ignored except in ptrace_stop. Making the userspace visible behavior of PTRACE_KILL a noop in those case. As the usual rules are not obeyed it is not clear what the consequences are of calling PTRACE_KILL on a running process. Presumably userspace does not do this as it achieves nothing. Replace the implementation of PTRACE_KILL with a simple send_sig_info(SIGKILL) followed by a return 0. This changes the observable user space behavior only in that PTRACE_KILL on a process not stopped in ptrace_stop will also kill it. As that has always been the intent of the code this seems like a reasonable change. Cc: stable@vger.kernel.org Reported-by: Al Viro Suggested-by: Al Viro Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-7-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- arch/x86/kernel/step.c | 3 +-- kernel/ptrace.c | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 0f3c307b37b3..8e2b2552b5ee 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -180,8 +180,7 @@ void set_task_blockstep(struct task_struct *task, bool on) * * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if * task is current or it can't be running, otherwise we can race - * with __switch_to_xtra(). We rely on ptrace_freeze_traced() but - * PTRACE_KILL is not safe. + * with __switch_to_xtra(). We rely on ptrace_freeze_traced(). */ local_irq_disable(); debugctl = get_debugctlmsr(); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index da30dcd477a0..7105821595bc 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1236,9 +1236,8 @@ int ptrace_request(struct task_struct *child, long request, return ptrace_resume(child, request, data); case PTRACE_KILL: - if (child->exit_state) /* already dead */ - return 0; - return ptrace_resume(child, request, SIGKILL); + send_sig_info(SIGKILL, SEND_SIG_NOINFO, child); + return 0; #ifdef CONFIG_HAVE_ARCH_TRACEHOOK case PTRACE_GETREGSET: From 7b0fe1367ef2d2591c20f03c4e64b7230e1ebcd7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 5 May 2022 12:25:36 -0500 Subject: [PATCH 08/12] ptrace: Document that wait_task_inactive can't fail After ptrace_freeze_traced succeeds it is known that the tracee has a __state value of __TASK_TRACED and that no __ptrace_unlink will happen because the tracer is waiting for the tracee, and the tracee is in ptrace_stop. The function ptrace_freeze_traced can succeed at any point after ptrace_stop has set TASK_TRACED and dropped siglock. The read_lock on tasklist_lock only excludes ptrace_attach. This means that the !current->ptrace which executes under a read_lock of tasklist_lock will never see a ptrace_freeze_trace as the tracer must have gone away before the tasklist_lock was taken and ptrace_attach can not occur until the read_lock is dropped. As ptrace_freeze_traced depends upon ptrace_attach running before it can run that excludes ptrace_freeze_traced until __state is set to TASK_RUNNING. This means that task_is_traced will fail in ptrace_freeze_attach and ptrace_freeze_attached will fail. On the current->ptrace branch of ptrace_stop which will be reached any time after ptrace_freeze_traced has succeed it is known that __state is __TASK_TRACED and schedule() will be called with that state. Use a WARN_ON_ONCE to document that wait_task_inactive(TASK_TRACED) should never fail. Remove the stale comment about may_ptrace_stop. Strictly speaking this is not true because if PREEMPT_RT is enabled wait_task_inactive can fail because __state can be changed. I don't see this as a problem as the ptrace code is currently broken on PREMPT_RT, and this is one of the issues. Failing and warning when the assumptions of the code are broken is good. Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-8-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- kernel/ptrace.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 7105821595bc..05953ac9f7bd 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -266,17 +266,9 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) } read_unlock(&tasklist_lock); - if (!ret && !ignore_state) { - if (!wait_task_inactive(child, __TASK_TRACED)) { - /* - * This can only happen if may_ptrace_stop() fails and - * ptrace_stop() changes ->state back to TASK_RUNNING, - * so we should not worry about leaking __TASK_TRACED. - */ - WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED); - ret = -ESRCH; - } - } + if (!ret && !ignore_state && + WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED))) + ret = -ESRCH; return ret; } From 57b6de08b5f6586851c2261ef0cc16cd275615e7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 4 May 2022 13:39:58 -0500 Subject: [PATCH 09/12] ptrace: Admit ptrace_stop can generate spuriuos SIGTRAPs Long ago and far away there was a BUG_ON at the start of ptrace_stop that did "BUG_ON(!(current->ptrace & PT_PTRACED));" [1]. The BUG_ON had never triggered but examination of the code showed that the BUG_ON could actually trigger. To complement removing the BUG_ON an attempt to better handle the race was added. The code detected the tracer had gone away and did not call do_notify_parent_cldstop. The code also attempted to prevent ptrace_report_syscall from sending spurious SIGTRAPs when the tracer went away. The code to detect when the tracer had gone away before sending a signal to tracer was a legitimate fix and continues to work to this date. The code to prevent sending spurious SIGTRAPs is a failure. At the time and until today the code only catches it when the tracer goes away after siglock is dropped and before read_lock is acquired. If the tracer goes away after read_lock is dropped a spurious SIGTRAP can still be sent to the tracee. The tracer going away after read_lock is dropped is the far likelier case as it is the bigger window. Given that the attempt to prevent the generation of a SIGTRAP was a failure and continues to be a failure remove the code that attempts to do that. This simplifies the code in ptrace_stop and makes ptrace_stop much easier to reason about. To successfully deal with the tracer going away, all of the tracer's instrumentation of the child would need to be removed, and reliably detecting when the tracer has set a signal to continue with would need to be implemented. [1] 66519f549ae5 ("[PATCH] fix ptracer death race yielding bogus BUG_ON") History-Tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-9-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 92 ++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 54 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index 3fd2ce133387..d2d0c753156c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2187,13 +2187,12 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, * with. If the code did not stop because the tracer is gone, * the stop signal remains unchanged unless clear_code. */ -static int ptrace_stop(int exit_code, int why, int clear_code, - unsigned long message, kernel_siginfo_t *info) +static int ptrace_stop(int exit_code, int why, unsigned long message, + kernel_siginfo_t *info) __releases(¤t->sighand->siglock) __acquires(¤t->sighand->siglock) { bool gstop_done = false; - bool read_code = true; if (arch_ptrace_stop_needed()) { /* @@ -2212,7 +2211,14 @@ static int ptrace_stop(int exit_code, int why, int clear_code, /* * schedule() will not sleep if there is a pending signal that * can awaken the task. + * + * After this point ptrace_signal_wake_up will clear TASK_TRACED + * if ptrace_unlink happens. Handle previous ptrace_unlinks + * here to prevent ptrace_stop sleeping in schedule. */ + if (!current->ptrace) + return exit_code; + set_special_state(TASK_TRACED); /* @@ -2259,54 +2265,33 @@ static int ptrace_stop(int exit_code, int why, int clear_code, spin_unlock_irq(¤t->sighand->siglock); read_lock(&tasklist_lock); - if (likely(current->ptrace)) { - /* - * Notify parents of the stop. - * - * While ptraced, there are two parents - the ptracer and - * the real_parent of the group_leader. The ptracer should - * know about every stop while the real parent is only - * interested in the completion of group stop. The states - * for the two don't interact with each other. Notify - * separately unless they're gonna be duplicates. - */ + /* + * Notify parents of the stop. + * + * While ptraced, there are two parents - the ptracer and + * the real_parent of the group_leader. The ptracer should + * know about every stop while the real parent is only + * interested in the completion of group stop. The states + * for the two don't interact with each other. Notify + * separately unless they're gonna be duplicates. + */ + if (current->ptrace) do_notify_parent_cldstop(current, true, why); - if (gstop_done && ptrace_reparented(current)) - do_notify_parent_cldstop(current, false, why); + if (gstop_done && (!current->ptrace || ptrace_reparented(current))) + do_notify_parent_cldstop(current, false, why); - /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive. - * - * XXX: implement read_unlock_no_resched(). - */ - preempt_disable(); - read_unlock(&tasklist_lock); - cgroup_enter_frozen(); - preempt_enable_no_resched(); - freezable_schedule(); - cgroup_leave_frozen(true); - } else { - /* - * By the time we got the lock, our tracer went away. - * Don't drop the lock yet, another tracer may come. - * - * If @gstop_done, the ptracer went away between group stop - * completion and here. During detach, it would have set - * JOBCTL_STOP_PENDING on us and we'll re-enter - * TASK_STOPPED in do_signal_stop() on return, so notifying - * the real parent of the group stop completion is enough. - */ - if (gstop_done) - do_notify_parent_cldstop(current, false, why); - - /* tasklist protects us from ptrace_freeze_traced() */ - __set_current_state(TASK_RUNNING); - read_code = false; - if (clear_code) - exit_code = 0; - read_unlock(&tasklist_lock); - } + /* + * Don't want to allow preemption here, because + * sys_ptrace() needs this task to be inactive. + * + * XXX: implement read_unlock_no_resched(). + */ + preempt_disable(); + read_unlock(&tasklist_lock); + cgroup_enter_frozen(); + preempt_enable_no_resched(); + freezable_schedule(); + cgroup_leave_frozen(true); /* * We are back. Now reacquire the siglock before touching @@ -2314,8 +2299,7 @@ static int ptrace_stop(int exit_code, int why, int clear_code, * any signal-sending on another CPU that wants to examine it. */ spin_lock_irq(¤t->sighand->siglock); - if (read_code) - exit_code = current->exit_code; + exit_code = current->exit_code; current->last_siginfo = NULL; current->ptrace_message = 0; current->exit_code = 0; @@ -2343,7 +2327,7 @@ static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long mes info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); /* Let the debugger run. */ - return ptrace_stop(exit_code, why, 1, message, &info); + return ptrace_stop(exit_code, why, message, &info); } int ptrace_notify(int exit_code, unsigned long message) @@ -2515,7 +2499,7 @@ static void do_jobctl_trap(void) CLD_STOPPED, 0); } else { WARN_ON_ONCE(!signr); - ptrace_stop(signr, CLD_STOPPED, 0, 0, NULL); + ptrace_stop(signr, CLD_STOPPED, 0, NULL); } } @@ -2568,7 +2552,7 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) * comment in dequeue_signal(). */ current->jobctl |= JOBCTL_STOP_DEQUEUED; - signr = ptrace_stop(signr, CLD_TRAPPED, 0, 0, info); + signr = ptrace_stop(signr, CLD_TRAPPED, 0, info); /* We're back. Did the debugger cancel the sig? */ if (signr == 0) From 2500ad1c7fa42ad734677853961a3a8bec0772c5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 29 Apr 2022 08:43:34 -0500 Subject: [PATCH 10/12] ptrace: Don't change __state Stop playing with tsk->__state to remove TASK_WAKEKILL while a ptrace command is executing. Instead remove TASK_WAKEKILL from the definition of TASK_TRACED, and implement a new jobctl flag TASK_PTRACE_FROZEN. This new flag is set in jobctl_freeze_task and cleared when ptrace_stop is awoken or in jobctl_unfreeze_task (when ptrace_stop remains asleep). In signal_wake_up add __TASK_TRACED to state along with TASK_WAKEKILL when the wake up is for a fatal signal. Skip adding __TASK_TRACED when TASK_PTRACE_FROZEN is not set. This has the same effect as changing TASK_TRACED to __TASK_TRACED as all of the wake_ups that use TASK_KILLABLE go through signal_wake_up. Handle a ptrace_stop being called with a pending fatal signal. Previously it would have been handled by schedule simply failing to sleep. As TASK_WAKEKILL is no longer part of TASK_TRACED schedule will sleep with a fatal_signal_pending. The code in signal_wake_up guarantees that the code will be awaked by any fatal signal that codes after TASK_TRACED is set. Previously the __state value of __TASK_TRACED was changed to TASK_RUNNING when woken up or back to TASK_TRACED when the code was left in ptrace_stop. Now when woken up ptrace_stop now clears JOBCTL_PTRACE_FROZEN and when left sleeping ptrace_unfreezed_traced clears JOBCTL_PTRACE_FROZEN. Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-10-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/sched.h | 2 +- include/linux/sched/jobctl.h | 2 ++ include/linux/sched/signal.h | 5 +++-- kernel/ptrace.c | 21 ++++++++------------- kernel/sched/core.c | 5 +---- kernel/signal.c | 14 ++++++-------- 6 files changed, 21 insertions(+), 28 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d5e3c00b74e1..610f2fdb1e2c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -103,7 +103,7 @@ struct task_group; /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) -#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) +#define TASK_TRACED __TASK_TRACED #define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h index fa067de9f1a9..d556c3425963 100644 --- a/include/linux/sched/jobctl.h +++ b/include/linux/sched/jobctl.h @@ -19,6 +19,7 @@ struct task_struct; #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ #define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */ +#define JOBCTL_PTRACE_FROZEN_BIT 24 /* frozen for ptrace */ #define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) @@ -28,6 +29,7 @@ struct task_struct; #define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) #define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) #define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT) +#define JOBCTL_PTRACE_FROZEN (1UL << JOBCTL_PTRACE_FROZEN_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3c8b34876744..e66948abbee4 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -435,9 +435,10 @@ extern void calculate_sigpending(void); extern void signal_wake_up_state(struct task_struct *t, unsigned int state); -static inline void signal_wake_up(struct task_struct *t, bool resume) +static inline void signal_wake_up(struct task_struct *t, bool fatal) { - signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0); + fatal = fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN); + signal_wake_up_state(t, fatal ? TASK_WAKEKILL | __TASK_TRACED : 0); } static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) { diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 05953ac9f7bd..83ed28262708 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -197,7 +197,7 @@ static bool ptrace_freeze_traced(struct task_struct *task) spin_lock_irq(&task->sighand->siglock); if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && !__fatal_signal_pending(task)) { - WRITE_ONCE(task->__state, __TASK_TRACED); + task->jobctl |= JOBCTL_PTRACE_FROZEN; ret = true; } spin_unlock_irq(&task->sighand->siglock); @@ -207,23 +207,19 @@ static bool ptrace_freeze_traced(struct task_struct *task) static void ptrace_unfreeze_traced(struct task_struct *task) { - if (READ_ONCE(task->__state) != __TASK_TRACED) - return; - - WARN_ON(!task->ptrace || task->parent != current); + unsigned long flags; /* - * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely. - * Recheck state under the lock to close this race. + * The child may be awake and may have cleared + * JOBCTL_PTRACE_FROZEN (see ptrace_resume). The child will + * not set JOBCTL_PTRACE_FROZEN or enter __TASK_TRACED anew. */ - spin_lock_irq(&task->sighand->siglock); - if (READ_ONCE(task->__state) == __TASK_TRACED) { + if (lock_task_sighand(task, &flags)) { + task->jobctl &= ~JOBCTL_PTRACE_FROZEN; if (__fatal_signal_pending(task)) wake_up_state(task, __TASK_TRACED); - else - WRITE_ONCE(task->__state, TASK_TRACED); + unlock_task_sighand(task, &flags); } - spin_unlock_irq(&task->sighand->siglock); } /** @@ -256,7 +252,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) */ read_lock(&tasklist_lock); if (child->ptrace && child->parent == current) { - WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED); /* * child->sighand can't be NULL, release_task() * does ptrace_unlink() before __exit_signal(). diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d575b4914925..3c351707e830 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6304,10 +6304,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) /* * We must load prev->state once (task_struct::state is volatile), such - * that: - * - * - we form a control dependency vs deactivate_task() below. - * - ptrace_{,un}freeze_traced() can change ->state underneath us. + * that we form a control dependency vs deactivate_task() below. */ prev_state = READ_ONCE(prev->__state); if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { diff --git a/kernel/signal.c b/kernel/signal.c index d2d0c753156c..a58b68a2d3c6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2209,14 +2209,12 @@ static int ptrace_stop(int exit_code, int why, unsigned long message, } /* - * schedule() will not sleep if there is a pending signal that - * can awaken the task. - * - * After this point ptrace_signal_wake_up will clear TASK_TRACED - * if ptrace_unlink happens. Handle previous ptrace_unlinks - * here to prevent ptrace_stop sleeping in schedule. + * After this point ptrace_signal_wake_up or signal_wake_up + * will clear TASK_TRACED if ptrace_unlink happens or a fatal + * signal comes in. Handle previous ptrace_unlinks and fatal + * signals here to prevent ptrace_stop sleeping in schedule. */ - if (!current->ptrace) + if (!current->ptrace || __fatal_signal_pending(current)) return exit_code; set_special_state(TASK_TRACED); @@ -2305,7 +2303,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message, current->exit_code = 0; /* LISTENING can be set only during STOP traps, clear it */ - current->jobctl &= ~JOBCTL_LISTENING; + current->jobctl &= ~(JOBCTL_LISTENING | JOBCTL_PTRACE_FROZEN); /* * Queued signals ignored us while we were stopped for tracing. From 5b4197cb287daf3cfd008fbf8682a1d6f4b13c0b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 29 Apr 2022 10:50:17 -0500 Subject: [PATCH 11/12] ptrace: Always take siglock in ptrace_resume Make code analysis simpler and future changes easier by always taking siglock in ptrace_resume. Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-11-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- kernel/ptrace.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 83ed28262708..36a5b7a00d2f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -837,8 +837,6 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, static int ptrace_resume(struct task_struct *child, long request, unsigned long data) { - bool need_siglock; - if (!valid_signal(data)) return -EIO; @@ -874,18 +872,11 @@ static int ptrace_resume(struct task_struct *child, long request, * Note that we need siglock even if ->exit_code == data and/or this * status was not reported yet, the new status must not be cleared by * wait_task_stopped() after resume. - * - * If data == 0 we do not care if wait_task_stopped() reports the old - * status and clears the code too; this can't race with the tracee, it - * takes siglock after resume. */ - need_siglock = data && !thread_group_empty(current); - if (need_siglock) - spin_lock_irq(&child->sighand->siglock); + spin_lock_irq(&child->sighand->siglock); child->exit_code = data; wake_up_state(child, __TASK_TRACED); - if (need_siglock) - spin_unlock_irq(&child->sighand->siglock); + spin_unlock_irq(&child->sighand->siglock); return 0; } From 31cae1eaae4fd65095ad6a3659db467bc3c2599e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 May 2022 15:57:47 -0500 Subject: [PATCH 12/12] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state Currently ptrace_stop() / do_signal_stop() rely on the special states TASK_TRACED and TASK_STOPPED resp. to keep unique state. That is, this state exists only in task->__state and nowhere else. There's two spots of bother with this: - PREEMPT_RT has task->saved_state which complicates matters, meaning task_is_{traced,stopped}() needs to check an additional variable. - An alternative freezer implementation that itself relies on a special TASK state would loose TASK_TRACED/TASK_STOPPED and will result in misbehaviour. As such, add additional state to task->jobctl to track this state outside of task->__state. NOTE: this doesn't actually fix anything yet, just adds extra state. --EWB * didn't add a unnecessary newline in signal.h * Update t->jobctl in signal_wake_up and ptrace_signal_wake_up instead of in signal_wake_up_state. This prevents the clearing of TASK_STOPPED and TASK_TRACED from getting lost. * Added warnings if JOBCTL_STOPPED or JOBCTL_TRACED are not cleared Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220421150654.757693825@infradead.org Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-12-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- include/linux/sched.h | 8 +++----- include/linux/sched/jobctl.h | 6 ++++++ include/linux/sched/signal.h | 19 +++++++++++++++---- kernel/ptrace.c | 16 +++++++++++++--- kernel/signal.c | 10 ++++++++-- 5 files changed, 45 insertions(+), 14 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 610f2fdb1e2c..cbe5c899599c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -118,11 +118,9 @@ struct task_group; #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) -#define task_is_traced(task) ((READ_ONCE(task->__state) & __TASK_TRACED) != 0) - -#define task_is_stopped(task) ((READ_ONCE(task->__state) & __TASK_STOPPED) != 0) - -#define task_is_stopped_or_traced(task) ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0) +#define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) +#define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0) +#define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0) /* * Special states are those that do not use the normal wait-loop pattern. See diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h index d556c3425963..68876d0a7ef9 100644 --- a/include/linux/sched/jobctl.h +++ b/include/linux/sched/jobctl.h @@ -21,6 +21,9 @@ struct task_struct; #define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */ #define JOBCTL_PTRACE_FROZEN_BIT 24 /* frozen for ptrace */ +#define JOBCTL_STOPPED_BIT 26 /* do_signal_stop() */ +#define JOBCTL_TRACED_BIT 27 /* ptrace_stop() */ + #define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) #define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT) @@ -31,6 +34,9 @@ struct task_struct; #define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT) #define JOBCTL_PTRACE_FROZEN (1UL << JOBCTL_PTRACE_FROZEN_BIT) +#define JOBCTL_STOPPED (1UL << JOBCTL_STOPPED_BIT) +#define JOBCTL_TRACED (1UL << JOBCTL_TRACED_BIT) + #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index e66948abbee4..07ba3404fcde 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -294,8 +294,10 @@ static inline int kernel_dequeue_signal(void) static inline void kernel_signal_stop(void) { spin_lock_irq(¤t->sighand->siglock); - if (current->jobctl & JOBCTL_STOP_DEQUEUED) + if (current->jobctl & JOBCTL_STOP_DEQUEUED) { + current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); + } spin_unlock_irq(¤t->sighand->siglock); schedule(); @@ -437,12 +439,21 @@ extern void signal_wake_up_state(struct task_struct *t, unsigned int state); static inline void signal_wake_up(struct task_struct *t, bool fatal) { - fatal = fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN); - signal_wake_up_state(t, fatal ? TASK_WAKEKILL | __TASK_TRACED : 0); + unsigned int state = 0; + if (fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN)) { + t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED); + state = TASK_WAKEKILL | __TASK_TRACED; + } + signal_wake_up_state(t, state); } static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) { - signal_wake_up_state(t, resume ? __TASK_TRACED : 0); + unsigned int state = 0; + if (resume) { + t->jobctl &= ~JOBCTL_TRACED; + state = __TASK_TRACED; + } + signal_wake_up_state(t, state); } void task_join_group_stop(struct task_struct *task); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 36a5b7a00d2f..328a34a99124 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -185,7 +185,12 @@ static bool looks_like_a_spurious_pid(struct task_struct *task) return true; } -/* Ensure that nothing can wake it up, even SIGKILL */ +/* + * Ensure that nothing can wake it up, even SIGKILL + * + * A task is switched to this state while a ptrace operation is in progress; + * such that the ptrace operation is uninterruptible. + */ static bool ptrace_freeze_traced(struct task_struct *task) { bool ret = false; @@ -216,8 +221,10 @@ static void ptrace_unfreeze_traced(struct task_struct *task) */ if (lock_task_sighand(task, &flags)) { task->jobctl &= ~JOBCTL_PTRACE_FROZEN; - if (__fatal_signal_pending(task)) + if (__fatal_signal_pending(task)) { + task->jobctl &= ~TASK_TRACED; wake_up_state(task, __TASK_TRACED); + } unlock_task_sighand(task, &flags); } } @@ -462,8 +469,10 @@ static int ptrace_attach(struct task_struct *task, long request, * in and out of STOPPED are protected by siglock. */ if (task_is_stopped(task) && - task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) + task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) { + task->jobctl &= ~JOBCTL_STOPPED; signal_wake_up_state(task, __TASK_STOPPED); + } spin_unlock(&task->sighand->siglock); @@ -875,6 +884,7 @@ static int ptrace_resume(struct task_struct *child, long request, */ spin_lock_irq(&child->sighand->siglock); child->exit_code = data; + child->jobctl &= ~JOBCTL_TRACED; wake_up_state(child, __TASK_TRACED); spin_unlock_irq(&child->sighand->siglock); diff --git a/kernel/signal.c b/kernel/signal.c index a58b68a2d3c6..e782c2611b64 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -762,7 +762,10 @@ static int dequeue_synchronous_signal(kernel_siginfo_t *info) */ void signal_wake_up_state(struct task_struct *t, unsigned int state) { + lockdep_assert_held(&t->sighand->siglock); + set_tsk_thread_flag(t, TIF_SIGPENDING); + /* * TASK_WAKEKILL also means wake it up in the stopped/traced/killable * case. We don't check t->state here because there is a race with it @@ -930,9 +933,10 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) for_each_thread(p, t) { flush_sigqueue_mask(&flush, &t->pending); task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); - if (likely(!(t->ptrace & PT_SEIZED))) + if (likely(!(t->ptrace & PT_SEIZED))) { + t->jobctl &= ~JOBCTL_STOPPED; wake_up_state(t, __TASK_STOPPED); - else + } else ptrace_trap_notify(t); } @@ -2218,6 +2222,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message, return exit_code; set_special_state(TASK_TRACED); + current->jobctl |= JOBCTL_TRACED; /* * We're committing to trapping. TRACED should be visible before @@ -2436,6 +2441,7 @@ static bool do_signal_stop(int signr) if (task_participate_group_stop(current)) notify = CLD_STOPPED; + current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); spin_unlock_irq(¤t->sighand->siglock);