futex: Replace PF_EXITPIDONE with a state
The futex exit handling relies on PF_ flags. That's suboptimal as it
requires a smp_mb() and an ugly lock/unlock of the exiting task's pi_lock
in the middle of do_exit() to enforce the observability of PF_EXITING in
the futex code.

Add a futex_state member to task_struct and convert the PF_EXITPIDONE
logic over to the new state. The PF_EXITING dependency will be cleaned up
in a later step.

This prepares for handling various futex exit issues later.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20191106224556.149449274@linutronix.de
parent ba31c1a485
commit 3d4775df0a
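As a concrete illustration of the handshake this patch introduces, the following is a minimal userspace sketch (C11 atomics plus pthreads, compile with gcc -pthread), not kernel code: the exiting side publishes FUTEX_STATE_DEAD with a single lockless store, and the waiter re-checks the state under a lock standing in for p->pi_lock, retrying on -EAGAIN until the state becomes visible, which is the worst case described in the futex_exit_done() kerneldoc in the diff below. Apart from the FUTEX_STATE_* names, everything here (fake_task, fake_handle_exit_race, and so on) is invented for the illustration and deliberately simplified.

#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

enum { FUTEX_STATE_OK, FUTEX_STATE_DEAD };

/* Stand-in for the relevant task_struct members; not the kernel layout. */
struct fake_task {
	pthread_mutex_t pi_lock;   /* plays the role of p->pi_lock */
	atomic_uint futex_state;   /* the kernel uses a plain unsigned int */
};

/* Exiting side: what futex_exit_done() amounts to, one lockless store. */
static void fake_futex_exit_done(struct fake_task *tsk)
{
	atomic_store_explicit(&tsk->futex_state, FUTEX_STATE_DEAD,
			      memory_order_relaxed);
}

/*
 * Waiter side: a much simplified stand-in for the handle_exit_race()
 * check (the real function also re-reads the futex word).
 */
static int fake_handle_exit_race(struct fake_task *tsk)
{
	if (atomic_load_explicit(&tsk->futex_state,
				 memory_order_relaxed) != FUTEX_STATE_DEAD)
		return -EAGAIN;	/* exit cleanup not finished, retry */
	return -ESRCH;		/* owner is gone and cleaned up */
}

static void *exiting_thread(void *arg)
{
	struct fake_task *tsk = arg;

	/* Pretend to run the futex exit cleanup, then publish the state. */
	nanosleep(&(struct timespec){ .tv_nsec = 1000 * 1000 }, NULL);
	fake_futex_exit_done(tsk);
	return NULL;
}

int main(void)
{
	struct fake_task tsk = {
		.pi_lock = PTHREAD_MUTEX_INITIALIZER,
		.futex_state = FUTEX_STATE_OK,
	};
	pthread_t exiter;
	int ret;

	pthread_create(&exiter, NULL, exiting_thread, &tsk);

	/* The waiter loops until the exit state becomes visible. */
	do {
		pthread_mutex_lock(&tsk.pi_lock);
		ret = fake_handle_exit_race(&tsk);
		pthread_mutex_unlock(&tsk.pi_lock);
	} while (ret == -EAGAIN);

	printf("observed FUTEX_STATE_DEAD, ret = %d (-ESRCH)\n", ret);
	pthread_join(exiter, NULL);
	return 0;
}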
include/linux/futex.h
@@ -50,6 +50,10 @@ union futex_key {
 #define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } }
 
 #ifdef CONFIG_FUTEX
+enum {
+	FUTEX_STATE_OK,
+	FUTEX_STATE_DEAD,
+};
 
 static inline void futex_init_task(struct task_struct *tsk)
 {
@@ -59,6 +63,34 @@ static inline void futex_init_task(struct task_struct *tsk)
 #endif
 	INIT_LIST_HEAD(&tsk->pi_state_list);
 	tsk->pi_state_cache = NULL;
+	tsk->futex_state = FUTEX_STATE_OK;
+}
+
+/**
+ * futex_exit_done - Sets the tasks futex state to FUTEX_STATE_DEAD
+ * @tsk:	task to set the state on
+ *
+ * Set the futex exit state of the task lockless. The futex waiter code
+ * observes that state when a task is exiting and loops until the task has
+ * actually finished the futex cleanup. The worst case for this is that the
+ * waiter runs through the wait loop until the state becomes visible.
+ *
+ * This has two callers:
+ *
+ * - futex_mm_release() after the futex exit cleanup has been done
+ *
+ * - do_exit() from the recursive fault handling path.
+ *
+ * In case of a recursive fault this is best effort. Either the futex exit
+ * code has run already or not. If the OWNER_DIED bit has been set on the
+ * futex then the waiter can take it over. If not, the problem is pushed
+ * back to user space. If the futex exit code did not run yet, then an
+ * already queued waiter might block forever, but there is nothing which
+ * can be done about that.
+ */
+static inline void futex_exit_done(struct task_struct *tsk)
+{
+	tsk->futex_state = FUTEX_STATE_DEAD;
 }
 
 void futex_mm_release(struct task_struct *tsk);
@@ -68,6 +100,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 #else
 static inline void futex_init_task(struct task_struct *tsk) { }
 static inline void futex_mm_release(struct task_struct *tsk) { }
+static inline void futex_exit_done(struct task_struct *tsk) { }
 static inline long do_futex(u32 __user *uaddr, int op, u32 val,
 			    ktime_t *timeout, u32 __user *uaddr2,
 			    u32 val2, u32 val3)
include/linux/sched.h
@@ -1053,6 +1053,7 @@ struct task_struct {
 #endif
 	struct list_head		pi_state_list;
 	struct futex_pi_state		*pi_state_cache;
+	unsigned int			futex_state;
 #endif
 #ifdef CONFIG_PERF_EVENTS
 	struct perf_event_context	*perf_event_ctxp[perf_nr_task_contexts];
@@ -1441,7 +1442,6 @@ extern struct pid *cad_pid;
  */
 #define PF_IDLE			0x00000002	/* I am an IDLE thread */
 #define PF_EXITING		0x00000004	/* Getting shut down */
-#define PF_EXITPIDONE		0x00000008	/* PI exit done on shut down */
 #define PF_VCPU			0x00000010	/* I'm a virtual CPU */
 #define PF_WQ_WORKER		0x00000020	/* I'm a workqueue worker */
 #define PF_FORKNOEXEC		0x00000040	/* Forked but didn't exec */
kernel/exit.c
@@ -746,16 +746,7 @@ void __noreturn do_exit(long code)
 	 */
 	if (unlikely(tsk->flags & PF_EXITING)) {
 		pr_alert("Fixing recursive fault but reboot is needed!\n");
-		/*
-		 * We can do this unlocked here. The futex code uses
-		 * this flag just to verify whether the pi state
-		 * cleanup has been done or not. In the worst case it
-		 * loops once more. We pretend that the cleanup was
-		 * done as there is no way to return. Either the
-		 * OWNER_DIED bit is set by now or we push the blocked
-		 * task into the wait for ever nirwana as well.
-		 */
-		tsk->flags |= PF_EXITPIDONE;
+		futex_exit_done(tsk);
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule();
 	}
@@ -846,12 +837,7 @@ void __noreturn do_exit(long code)
 	 * Make sure we are holding no locks:
 	 */
 	debug_check_no_locks_held();
-	/*
-	 * We can do this unlocked here. The futex code uses this flag
-	 * just to verify whether the pi state cleanup has been done
-	 * or not. In the worst case it loops once more.
-	 */
-	tsk->flags |= PF_EXITPIDONE;
+	futex_exit_done(tsk);
 
 	if (tsk->io_context)
 		exit_io_context(tsk);
kernel/futex.c
@@ -1182,9 +1182,10 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
 	u32 uval2;
 
 	/*
-	 * If PF_EXITPIDONE is not yet set, then try again.
+	 * If the futex exit state is not yet FUTEX_STATE_DEAD, wait
+	 * for it to finish.
 	 */
-	if (tsk && !(tsk->flags & PF_EXITPIDONE))
+	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
 		return -EAGAIN;
 
 	/*
@@ -1203,8 +1204,9 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
 	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
 	 *  }				     if (!tsk->flags & PF_EXITING) {
 	 *  ...				       attach();
-	 *  tsk->flags |= PF_EXITPIDONE;     } else {
-	 *				       if (!(tsk->flags & PF_EXITPIDONE))
+	 *  tsk->futex_state =		     } else {
+	 *	FUTEX_STATE_DEAD;	       if (tsk->futex_state !=
+	 *					   FUTEX_STATE_DEAD)
 	 *					 return -EAGAIN;
 	 *				       return -ESRCH; <--- FAIL
 	 *				     }
@@ -1260,17 +1262,16 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
 	}
 
 	/*
-	 * We need to look at the task state flags to figure out,
-	 * whether the task is exiting. To protect against the do_exit
-	 * change of the task flags, we do this protected by
-	 * p->pi_lock:
+	 * We need to look at the task state to figure out, whether the
+	 * task is exiting. To protect against the change of the task state
+	 * in futex_exit_release(), we do this protected by p->pi_lock:
 	 */
 	raw_spin_lock_irq(&p->pi_lock);
-	if (unlikely(p->flags & PF_EXITING)) {
+	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
 		/*
-		 * The task is on the way out. When PF_EXITPIDONE is
-		 * set, we know that the task has finished the
-		 * cleanup:
+		 * The task is on the way out. When the futex state is
+		 * FUTEX_STATE_DEAD, we know that the task has finished
+		 * the cleanup:
 		 */
 		int ret = handle_exit_race(uaddr, uval, p);
 