A set of PI futex fixes:

- Address a longstanding issue where the user space part of the PI futex is not writeable. The kernel returns with inconsistent state which can in the worst case result in a UAF of a tasks kernel stack. The solution is to establish consistent kernel state which makes future operations on the futex fail because user space and kernel space state are inconsistent. Not a problem as PI futexes fundamentaly require a functional RW mapping and if user space pulls the rug under it, then it can keep the pieces it asked for. - Address an issue where the return value is incorrect in case that the futex was acquired after a timeout/signal made the waiter drop out of the rtmutex wait. In one of the corner cases the kernel returned an error code despite having successfully acquired the futex. -----BEGIN PGP SIGNATURE----- iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAmATCSYTHHRnbHhAbGlu dXRyb25peC5kZQAKCRCmGPVMDXSYoT8zEADDFWxDp9SwgMKBf98ZJqfPvPbIbmqW xhJr1R+RLBNBmYEahQKFPWqlPf0JMDzIjYeu8fxkwPVwsTBzSgJ7Lka1acDI/TQG X6kXTIgX/SNdkYZ534b+FcPutYW+KFrpprebIF5U8NLII2xnlNrcVaVzeUvs5qgx ssjP79eJAWFSCXl2CJFVneMIcbrU+7czgjW0Cr84Z6O0GanJqPWVzngzgts2bvZN K29BGQrWcQhq0W641f5toLYoF+kq98cgtMlwY49quZW9pbNToKeUXDrOro6y8fkg J8dYdzFH5KS9SHxACP19y4ILXqCi3GIkDs+RsUtcyY1ZoY20DLV4rEvG26pX9azO 5b6MeWsWHwHHxnp1apYenq0QjvfKKuJZdet1xctC+vivl+4RX4m2IvFONJLDQ++o fJ2j1m/39szg8gw3oObtL0zIPMczFMKgh7rCNkkTedghXevynMoi88RE643ELq8d MFr0C0n2ThHFbewtdyOZtZ6zERAKmPynx57iIETC63sChxiUfwmhFkxIuFUQkkwg G239O4dIVLGT4JKkOgyftRyog8JOzvdQb7qh1aYoj6tJNevVXIfuFOo6gRFp2SXp T2Rot+3TJaUS+f/GqsyHreJYhpLK7KSesjdYS1xYA9oWVtgt0AKVcY1AbSpyBifk ACXMtP8cjU0KgA== =wzqT -----END PGP SIGNATURE----- Merge tag 'locking-urgent-2021-01-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull locking fixes from Thomas Gleixner: "A set of PI futex fixes: - Address a longstanding issue where the user space part of the PI futex is not writeable. The kernel returns with inconsistent state which can in the worst case result in a UAF of a tasks kernel stack. The solution is to establish consistent kernel state which makes future operations on the futex fail because user space and kernel space state are inconsistent. Not a problem as PI futexes fundamentaly require a functional RW mapping and if user space pulls the rug under it, then it can keep the pieces it asked for. - Address an issue where the return value is incorrect in case that the futex was acquired after a timeout/signal made the waiter drop out of the rtmutex wait. In one of the corner cases the kernel returned an error code despite having successfully acquired the futex" * tag 'locking-urgent-2021-01-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: futex: Handle faults correctly for PI futexes futex: Simplify fixup_pi_state_owner() futex: Use pi_state_update_owner() in put_pi_state() rtmutex: Remove unused argument from rt_mutex_proxy_unlock() futex: Provide and use pi_state_update_owner() futex: Replace pointless printk in fixup_owner() futex: Ensure the correct return value from futex_lock_pi()
2021-01-28 11:18:43 -08:00 · 2021-01-28 11:18:43 -08:00 · c64396cc36
parent e5ff2cb9cf 34b1a1ce14
commit c64396cc36
3 changed files with 98 additions and 127 deletions
--- a/kernel/futex.c
+++ b/kernel/futex.c
@ -763,6 +763,29 @@ static struct futex_pi_state *alloc_pi_state(void)
 	return pi_state;
 }

+static void pi_state_update_owner(struct futex_pi_state *pi_state,
+				  struct task_struct *new_owner)
+{
+	struct task_struct *old_owner = pi_state->owner;
+
+	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
+
+	if (old_owner) {
+		raw_spin_lock(&old_owner->pi_lock);
+		WARN_ON(list_empty(&pi_state->list));
+		list_del_init(&pi_state->list);
+		raw_spin_unlock(&old_owner->pi_lock);
+	}
+
+	if (new_owner) {
+		raw_spin_lock(&new_owner->pi_lock);
+		WARN_ON(!list_empty(&pi_state->list));
+		list_add(&pi_state->list, &new_owner->pi_state_list);
+		pi_state->owner = new_owner;
+		raw_spin_unlock(&new_owner->pi_lock);
+	}
+}
+
 static void get_pi_state(struct futex_pi_state *pi_state)
 {
 	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
@ -785,17 +808,11 @@ static void put_pi_state(struct futex_pi_state *pi_state)
 	 * and has cleaned up the pi_state already
 	 */
 	if (pi_state->owner) {
-		struct task_struct *owner;
 		unsigned long flags;

 		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
-		owner = pi_state->owner;
-		if (owner) {
-			raw_spin_lock(&owner->pi_lock);
-			list_del_init(&pi_state->list);
-			raw_spin_unlock(&owner->pi_lock);
-		}
-		rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
+		pi_state_update_owner(pi_state, NULL);
+		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
 		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
 	}

@ -941,7 +958,8 @@ static inline void exit_pi_state_list(struct task_struct *curr) { }
 *	FUTEX_OWNER_DIED bit. See [4]
 *
 * [10] There is no transient state which leaves owner and user space
- *	TID out of sync.
+ *	TID out of sync. Except one error case where the kernel is denied
+ *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
@ -1521,26 +1539,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
 			ret = -EINVAL;
 	}

-	if (ret)
-		goto out_unlock;
-
+	if (!ret) {
 		/*
-	 * This is a point of no return; once we modify the uval there is no
-	 * going back and subsequent operations must not fail.
+		 * This is a point of no return; once we modified the uval
+		 * there is no going back and subsequent operations must
+		 * not fail.
 		 */
-
-	raw_spin_lock(&pi_state->owner->pi_lock);
-	WARN_ON(list_empty(&pi_state->list));
-	list_del_init(&pi_state->list);
-	raw_spin_unlock(&pi_state->owner->pi_lock);
-
-	raw_spin_lock(&new_owner->pi_lock);
-	WARN_ON(!list_empty(&pi_state->list));
-	list_add(&pi_state->list, &new_owner->pi_state_list);
-	pi_state->owner = new_owner;
-	raw_spin_unlock(&new_owner->pi_lock);
-
+		pi_state_update_owner(pi_state, new_owner);
 		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+	}

 out_unlock:
 	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
@ -2323,18 +2330,13 @@ static void unqueue_me_pi(struct futex_q *q)
 	spin_unlock(q->lock_ptr);
 }

-static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 				  struct task_struct *argowner)
 {
 	struct futex_pi_state *pi_state = q->pi_state;
-	u32 uval, curval, newval;
 	struct task_struct *oldowner, *newowner;
-	u32 newtid;
-	int ret, err = 0;
-
-	lockdep_assert_held(q->lock_ptr);
-
-	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+	u32 uval, curval, newval, newtid;
+	int err = 0;

 	oldowner = pi_state->owner;

@ -2368,14 +2370,12 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 			 * We raced against a concurrent self; things are
 			 * already fixed up. Nothing to do.
 			 */
-			ret = 0;
-			goto out_unlock;
+			return 0;
 		}

 		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
-			/* We got the lock after all, nothing to fix. */
-			ret = 0;
-			goto out_unlock;
+			/* We got the lock. pi_state is correct. Tell caller. */
+			return 1;
 		}

 		/*
@ -2402,8 +2402,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 			 * We raced against a concurrent self; things are
 			 * already fixed up. Nothing to do.
 			 */
-			ret = 0;
-			goto out_unlock;
+			return 1;
 		}
 		newowner = argowner;
 	}
@ -2433,22 +2432,9 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 	 * We fixed up user space. Now we need to fix the pi_state
 	 * itself.
 	 */
-	if (pi_state->owner != NULL) {
-		raw_spin_lock(&pi_state->owner->pi_lock);
-		WARN_ON(list_empty(&pi_state->list));
-		list_del_init(&pi_state->list);
-		raw_spin_unlock(&pi_state->owner->pi_lock);
-	}
+	pi_state_update_owner(pi_state, newowner);

-	pi_state->owner = newowner;
-
-	raw_spin_lock(&newowner->pi_lock);
-	WARN_ON(!list_empty(&pi_state->list));
-	list_add(&pi_state->list, &newowner->pi_state_list);
-	raw_spin_unlock(&newowner->pi_lock);
-	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-
-	return 0;
+	return argowner == current;

 	/*
 	 * In order to reschedule or handle a page fault, we need to drop the
@ -2469,17 +2455,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,

 	switch (err) {
 	case -EFAULT:
-		ret = fault_in_user_writeable(uaddr);
+		err = fault_in_user_writeable(uaddr);
 		break;

 	case -EAGAIN:
 		cond_resched();
-		ret = 0;
+		err = 0;
 		break;

 	default:
 		WARN_ON_ONCE(1);
-		ret = err;
 		break;
 	}

@ -2489,17 +2474,44 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 	/*
 	 * Check if someone else fixed it for us:
 	 */
-	if (pi_state->owner != oldowner) {
-		ret = 0;
-		goto out_unlock;
-	}
-
-	if (ret)
-		goto out_unlock;
+	if (pi_state->owner != oldowner)
+		return argowner == current;

+	/* Retry if err was -EAGAIN or the fault in succeeded */
+	if (!err)
 		goto retry;

-out_unlock:
+	/*
+	 * fault_in_user_writeable() failed so user state is immutable. At
+	 * best we can make the kernel state consistent but user state will
+	 * be most likely hosed and any subsequent unlock operation will be
+	 * rejected due to PI futex rule [10].
+	 *
+	 * Ensure that the rtmutex owner is also the pi_state owner despite
+	 * the user space value claiming something different. There is no
+	 * point in unlocking the rtmutex if current is the owner as it
+	 * would need to wait until the next waiter has taken the rtmutex
+	 * to guarantee consistent state. Keep it simple. Userspace asked
+	 * for this wreckaged state.
+	 *
+	 * The rtmutex has an owner - either current or some other
+	 * task. See the EAGAIN loop above.
+	 */
+	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
+
+	return err;
+}
+
+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+				struct task_struct *argowner)
+{
+	struct futex_pi_state *pi_state = q->pi_state;
+	int ret;
+
+	lockdep_assert_held(q->lock_ptr);
+
+	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+	ret = __fixup_pi_state_owner(uaddr, q, argowner);
 	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 	return ret;
 }
@ -2523,8 +2535,6 @@ static long futex_wait_restart(struct restart_block *restart);
 */
 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 {
-	int ret = 0;
-
 	if (locked) {
 		/*
 		 * Got the lock. We might not be the anticipated owner if we
@ -2535,8 +2545,8 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 		 * stable state, anything else needs more attention.
 		 */
 		if (q->pi_state->owner != current)
-			ret = fixup_pi_state_owner(uaddr, q, current);
-		return ret ? ret : locked;
+			return fixup_pi_state_owner(uaddr, q, current);
+		return 1;
 	}

 	/*
@ -2547,23 +2557,17 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 	 * Another speculative read; pi_state->owner == current is unstable
 	 * but needs our attention.
 	 */
-	if (q->pi_state->owner == current) {
-		ret = fixup_pi_state_owner(uaddr, q, NULL);
-		return ret;
-	}
+	if (q->pi_state->owner == current)
+		return fixup_pi_state_owner(uaddr, q, NULL);

 	/*
 	 * Paranoia check. If we did not take the lock, then we should not be
-	 * the owner of the rt_mutex.
+	 * the owner of the rt_mutex. Warn and establish consistent state.
 	 */
-	if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
-		printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
-				"pi-state %p\n", ret,
-				q->pi_state->pi_mutex.owner,
-				q->pi_state->owner);
-	}
+	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
+		return fixup_pi_state_owner(uaddr, q, current);

-	return ret;
+	return 0;
 }

 /**
@ -2771,7 +2775,6 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 			 ktime_t *time, int trylock)
 {
 	struct hrtimer_sleeper timeout, *to;
-	struct futex_pi_state *pi_state = NULL;
 	struct task_struct *exiting = NULL;
 	struct rt_mutex_waiter rt_waiter;
 	struct futex_hash_bucket *hb;
@ -2907,23 +2910,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 	if (res)
 		ret = (res < 0) ? res : 0;

-	/*
-	 * If fixup_owner() faulted and was unable to handle the fault, unlock
-	 * it and return the fault to userspace.
-	 */
-	if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
-		pi_state = q.pi_state;
-		get_pi_state(pi_state);
-	}
-
 	/* Unqueue and drop the lock */
 	unqueue_me_pi(&q);
-
-	if (pi_state) {
-		rt_mutex_futex_unlock(&pi_state->pi_mutex);
-		put_pi_state(pi_state);
-	}
-
 	goto out;

 out_unlock_put_key:
@ -3183,7 +3171,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 				 u32 __user *uaddr2)
 {
 	struct hrtimer_sleeper timeout, *to;
-	struct futex_pi_state *pi_state = NULL;
 	struct rt_mutex_waiter rt_waiter;
 	struct futex_hash_bucket *hb;
 	union futex_key key2 = FUTEX_KEY_INIT;
@ -3261,16 +3248,17 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		if (q.pi_state && (q.pi_state->owner != current)) {
 			spin_lock(q.lock_ptr);
 			ret = fixup_pi_state_owner(uaddr2, &q, current);
-			if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
-				pi_state = q.pi_state;
-				get_pi_state(pi_state);
-			}
 			/*
 			 * Drop the reference to the pi state which
 			 * the requeue_pi() code acquired for us.
 			 */
 			put_pi_state(q.pi_state);
 			spin_unlock(q.lock_ptr);
+			/*
+			 * Adjust the return value. It's either -EFAULT or
+			 * success (1) but the caller expects 0 for success.
+			 */
+			ret = ret < 0 ? ret : 0;
 		}
 	} else {
 		struct rt_mutex *pi_mutex;
@ -3301,25 +3289,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		if (res)
 			ret = (res < 0) ? res : 0;

-		/*
-		 * If fixup_pi_state_owner() faulted and was unable to handle
-		 * the fault, unlock the rt_mutex and return the fault to
-		 * userspace.
-		 */
-		if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
-			pi_state = q.pi_state;
-			get_pi_state(pi_state);
-		}
-
 		/* Unqueue and drop the lock. */
 		unqueue_me_pi(&q);
 	}

-	if (pi_state) {
-		rt_mutex_futex_unlock(&pi_state->pi_mutex);
-		put_pi_state(pi_state);
-	}
-
 	if (ret == -EINTR) {
 		/*
 		 * We've already been requeued, but cannot restart by calling
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@ -1716,8 +1716,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
 * possible because it belongs to the pi_state which is about to be freed
 * and it is not longer visible to other tasks.
 */
-void rt_mutex_proxy_unlock(struct rt_mutex *lock,
-			   struct task_struct *proxy_owner)
+void rt_mutex_proxy_unlock(struct rt_mutex *lock)
 {
 	debug_rt_mutex_proxy_unlock(lock);
 	rt_mutex_set_owner(lock, NULL);
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@ -133,8 +133,7 @@ enum rtmutex_chainwalk {
 extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
 extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
 				       struct task_struct *proxy_owner);
-extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
-				  struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
 extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
 extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
 				     struct rt_mutex_waiter *waiter,