From db341a049ec7e87053c91008cb452d0bfa6dde72 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Tue, 6 Aug 2019 03:08:08 -0500
Subject: [PATCH 01/11] ata: rb532_cf: Fix unused variable warning in
 rb532_pata_driver_remove
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the following warning (Building: rb532_defconfig mips):

drivers/ata/pata_rb532_cf.c: In function ‘rb532_pata_driver_remove’:
drivers/ata/pata_rb532_cf.c:161:24: warning: unused variable ‘info’ [-Wunused-variable]
  struct rb532_cf_info *info = ah->private_data;
                        ^~~~

Fixes: cd56f35e52d9 ("ata: rb532_cf: Convert to use GPIO descriptors")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/ata/pata_rb532_cf.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/ata/pata_rb532_cf.c b/drivers/ata/pata_rb532_cf.c
index 7c37f2ff09e4..deae466395de 100644
--- a/drivers/ata/pata_rb532_cf.c
+++ b/drivers/ata/pata_rb532_cf.c
@@ -158,7 +158,6 @@ static int rb532_pata_driver_probe(struct platform_device *pdev)
 static int rb532_pata_driver_remove(struct platform_device *pdev)
 {
 	struct ata_host *ah = platform_get_drvdata(pdev);
-	struct rb532_cf_info *info = ah->private_data;
 
 	ata_host_detach(ah);
 

From e15c2ffa1091c4f72370f01af4de8f9dddeb17a6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 6 Aug 2019 13:34:31 -0600
Subject: [PATCH 02/11] block: fix O_DIRECT error handling for bio fragments

0eb6ddfb865c tried to fix this up, but introduced a use-after-free
of dio. Additionally, we still had an issue with error handling,
as reported by Darrick:

"I noticed a regression in xfs/747 (an unreleased xfstest for the
xfs_scrub media scanning feature) on 5.3-rc3.  I'll condense that down
to a simpler reproducer:

error-test: 0 209 linear 8:48 0
error-test: 209 1 error
error-test: 210 6446894 linear 8:48 210

Basically we have a ~3G /dev/sdd and we set up device mapper to fail IO
for sector 209 and to pass the io to the scsi device everywhere else.

On 5.3-rc3, performing a directio pread of this range with a < 1M buffer
(in other words, a request for fewer than MAX_BIO_PAGES bytes) yields
EIO like you'd expect:

pread64(3, 0x7f880e1c7000, 1048576, 0)  = -1 EIO (Input/output error)
pread: Input/output error
+++ exited with 0 +++

But doing it with a larger buffer succeeds(!):

pread64(3, "XFSB\0\0\20\0\0\0\0\0\0\fL\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1146880, 0) = 1146880
read 1146880/1146880 bytes at offset 0
1 MiB, 1 ops; 0.0009 sec (1.124 GiB/sec and 1052.6316 ops/sec)
+++ exited with 0 +++

(Note that the part of the buffer corresponding to the dm-error area is
uninitialized)

On 5.3-rc2, both commands would fail with EIO like you'd expect.  The
only change between rc2 and rc3 is commit 0eb6ddfb865c ("block: Fix
__blkdev_direct_IO() for bio fragments").

AFAICT we end up in __blkdev_direct_IO with a 1120K buffer, which gets
split into two bios: one for the first BIO_MAX_PAGES worth of data (1MB)
and a second one for the 96k after that."

Fix this by noting that it's always safe to dereference dio if we get
BLK_QC_T_EAGAIN returned, as end_io hasn't been run for that case. So
we can safely increment the dio size before calling submit_bio(), and
then decrement it on failure (not that it really matters, as the bio
and dio are going away).

For error handling, return to the original method of just using 'ret'
for tracking the error, and track the size in dio->size.

Fixes: 0eb6ddfb865c ("block: Fix __blkdev_direct_IO() for bio fragments")
Fixes: 6a43074e2f46 ("block: properly handle IOCB_NOWAIT for async O_DIRECT IO")
Reported-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index a6f7c892cb4a..131e2e0582a6 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -349,7 +349,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	loff_t pos = iocb->ki_pos;
 	blk_qc_t qc = BLK_QC_T_NONE;
 	gfp_t gfp;
-	ssize_t ret;
+	int ret;
 
 	if ((pos | iov_iter_alignment(iter)) &
 	    (bdev_logical_block_size(bdev) - 1))
@@ -386,8 +386,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 
 	ret = 0;
 	for (;;) {
-		int err;
-
 		bio_set_dev(bio, bdev);
 		bio->bi_iter.bi_sector = pos >> 9;
 		bio->bi_write_hint = iocb->ki_hint;
@@ -395,10 +393,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		bio->bi_end_io = blkdev_bio_end_io;
 		bio->bi_ioprio = iocb->ki_ioprio;
 
-		err = bio_iov_iter_get_pages(bio, iter);
-		if (unlikely(err)) {
-			if (!ret)
-				ret = err;
+		ret = bio_iov_iter_get_pages(bio, iter);
+		if (unlikely(ret)) {
 			bio->bi_status = BLK_STS_IOERR;
 			bio_endio(bio);
 			break;
@@ -421,7 +417,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		if (nowait)
 			bio->bi_opf |= (REQ_NOWAIT | REQ_NOWAIT_INLINE);
 
-		dio->size += bio->bi_iter.bi_size;
 		pos += bio->bi_iter.bi_size;
 
 		nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
@@ -433,13 +428,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 				polled = true;
 			}
 
+			dio->size += bio->bi_iter.bi_size;
 			qc = submit_bio(bio);
 			if (qc == BLK_QC_T_EAGAIN) {
-				if (!ret)
-					ret = -EAGAIN;
+				dio->size -= bio->bi_iter.bi_size;
+				ret = -EAGAIN;
 				goto error;
 			}
-			ret = dio->size;
 
 			if (polled)
 				WRITE_ONCE(iocb->ki_cookie, qc);
@@ -460,18 +455,17 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 			atomic_inc(&dio->ref);
 		}
 
+		dio->size += bio->bi_iter.bi_size;
 		qc = submit_bio(bio);
 		if (qc == BLK_QC_T_EAGAIN) {
-			if (!ret)
-				ret = -EAGAIN;
+			dio->size -= bio->bi_iter.bi_size;
+			ret = -EAGAIN;
 			goto error;
 		}
-		ret = dio->size;
 
 		bio = bio_alloc(gfp, nr_pages);
 		if (!bio) {
-			if (!ret)
-				ret = -EAGAIN;
+			ret = -EAGAIN;
 			goto error;
 		}
 	}
@@ -496,6 +490,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 out:
 	if (!ret)
 		ret = blk_status_to_errno(dio->bio.bi_status);
+	if (likely(!ret))
+		ret = dio->size;
 
 	bio_put(&dio->bio);
 	return ret;

From 2d7271501720038381d45fb3dcbe4831228fc8cc Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 7 Aug 2019 12:20:52 -0600
Subject: [PATCH 03/11] libata: have ata_scsi_rw_xlat() fail invalid
 passthrough requests

For passthrough requests, libata-scsi takes what the user passes in
as gospel. This can be problematic if the user fills in the CDB
incorrectly. One example of that is in request sizes. For read/write
commands, the CDB contains fields describing the transfer length of
the request. These should match with the SG_IO header fields, but
libata-scsi currently does no validation of that.

Check that the number of blocks in the CDB for passthrough requests
matches what was mapped into the request. If the CDB asks for more
data than the validated SG_IO header fields, error it.

Reported-by: Krishna Ram Prakash R <krp@gtux.in>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/ata/libata-scsi.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 391ac0503dc0..76d0f9de767b 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1786,6 +1786,21 @@ static unsigned int ata_scsi_verify_xlat(struct ata_queued_cmd *qc)
 	return 1;
 }
 
+static bool ata_check_nblocks(struct scsi_cmnd *scmd, u32 n_blocks)
+{
+	struct request *rq = scmd->request;
+	u32 req_blocks;
+
+	if (!blk_rq_is_passthrough(rq))
+		return true;
+
+	req_blocks = blk_rq_bytes(rq) / scmd->device->sector_size;
+	if (n_blocks > req_blocks)
+		return false;
+
+	return true;
+}
+
 /**
  *	ata_scsi_rw_xlat - Translate SCSI r/w command into an ATA one
  *	@qc: Storage for translated ATA taskfile
@@ -1830,6 +1845,8 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc)
 		scsi_10_lba_len(cdb, &block, &n_block);
 		if (cdb[1] & (1 << 3))
 			tf_flags |= ATA_TFLAG_FUA;
+		if (!ata_check_nblocks(scmd, n_block))
+			goto invalid_fld;
 		break;
 	case READ_6:
 	case WRITE_6:
@@ -1844,6 +1861,8 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc)
 		 */
 		if (!n_block)
 			n_block = 256;
+		if (!ata_check_nblocks(scmd, n_block))
+			goto invalid_fld;
 		break;
 	case READ_16:
 	case WRITE_16:
@@ -1854,6 +1873,8 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc)
 		scsi_16_lba_len(cdb, &block, &n_block);
 		if (cdb[1] & (1 << 3))
 			tf_flags |= ATA_TFLAG_FUA;
+		if (!ata_check_nblocks(scmd, n_block))
+			goto invalid_fld;
 		break;
 	default:
 		DPRINTK("no-byte command\n");

From 752ead44491e8c91e14d7079625c5916b30921c5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 7 Aug 2019 12:23:57 -0600
Subject: [PATCH 04/11] libata: add SG safety checks in SFF pio transfers

Abort processing of a command if we run out of mapped data in the
SG list. This should never happen, but a previous bug caused it to
be possible. Play it safe and attempt to abort nicely if we don't
have more SG segments left.

Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/ata/libata-sff.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index 10aa27882142..4f115adb4ee8 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -658,6 +658,10 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
 	unsigned int offset;
 	unsigned char *buf;
 
+	if (!qc->cursg) {
+		qc->curbytes = qc->nbytes;
+		return;
+	}
 	if (qc->curbytes == qc->nbytes - qc->sect_size)
 		ap->hsm_task_state = HSM_ST_LAST;
 
@@ -683,6 +687,8 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
 
 	if (qc->cursg_ofs == qc->cursg->length) {
 		qc->cursg = sg_next(qc->cursg);
+		if (!qc->cursg)
+			ap->hsm_task_state = HSM_ST_LAST;
 		qc->cursg_ofs = 0;
 	}
 }

From 430380b4637aec646996b4aef67ad417593923b2 Mon Sep 17 00:00:00 2001
From: He Zhe <zhe.he@windriver.com>
Date: Thu, 8 Aug 2019 11:09:54 +0800
Subject: [PATCH 05/11] block: aoe: Fix kernel crash due to atomic sleep when
 exiting

Since commit 3582dd291788 ("aoe: convert aoeblk to blk-mq"), aoedev_downdev
has had the possibility of sleeping and causing the following crash.

BUG: scheduling while atomic: rmmod/2242/0x00000003
Modules linked in: aoe
Preemption disabled at:
[<ffffffffc01d95e5>] flush+0x95/0x4a0 [aoe]
CPU: 7 PID: 2242 Comm: rmmod Tainted: G          I       5.2.3 #1
Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009
Call Trace:
 dump_stack+0x4f/0x6a
 ? flush+0x95/0x4a0 [aoe]
 __schedule_bug.cold+0x44/0x54
 __schedule+0x44f/0x680
 schedule+0x44/0xd0
 blk_mq_freeze_queue_wait+0x46/0xb0
 ? wait_woken+0x80/0x80
 blk_mq_freeze_queue+0x1b/0x20
 aoedev_downdev+0x111/0x160 [aoe]
 flush+0xff/0x4a0 [aoe]
 aoedev_exit+0x23/0x30 [aoe]
 aoe_exit+0x35/0x948 [aoe]
 __se_sys_delete_module+0x183/0x210
 __x64_sys_delete_module+0x16/0x20
 do_syscall_64+0x4d/0x130
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f24e0043b07
Code: 73 01 c3 48 8b 0d 89 73 0b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f
1f 84 00 00 00 00 00 0f 1f 44 00 00 b8 b0 00 00 00 0f 05 <48> 3d 01 f0 ff
ff 73 01 c3 48 8b 0d 59 73 0b 00 f7 d8 64 89 01 48
RSP: 002b:00007ffe18f7f1e8 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f24e0043b07
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000555c3ecf87c8
RBP: 00007ffe18f7f1f0 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f24e00b4ac0 R11: 0000000000000206 R12: 00007ffe18f7f238
R13: 00007ffe18f7f410 R14: 00007ffe18f80e73 R15: 0000555c3ecf8760

This patch handles it the same way as pass two: it drops the locks and
restarts pass one after aoedev_downdev is done.

Fixes: 3582dd291788 ("aoe: convert aoeblk to blk-mq")
Signed-off-by: He Zhe <zhe.he@windriver.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/aoe/aoedev.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 5b49f1b33ebe..e2ea2356da06 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -323,10 +323,14 @@ flush(const char __user *str, size_t cnt, int exiting)
 	}
 
 	flush_scheduled_work();
-	/* pass one: without sleeping, do aoedev_downdev */
+	/* pass one: do aoedev_downdev, which might sleep */
+restart1:
 	spin_lock_irqsave(&devlist_lock, flags);
 	for (d = devlist; d; d = d->next) {
 		spin_lock(&d->lock);
+		if (d->flags & DEVFL_TKILL)
+			goto cont;
+
 		if (exiting) {
 			/* unconditionally take each device down */
 		} else if (specified) {
@@ -338,8 +342,11 @@ flush(const char __user *str, size_t cnt, int exiting)
 		|| d->ref)
 			goto cont;
 
+		spin_unlock(&d->lock);
+		spin_unlock_irqrestore(&devlist_lock, flags);
 		aoedev_downdev(d);
 		d->flags |= DEVFL_TKILL;
+		goto restart1;
 cont:
 		spin_unlock(&d->lock);
 	}
@@ -348,7 +355,7 @@ flush(const char __user *str, size_t cnt, int exiting)
 	/* pass two: call freedev, which might sleep,
 	 * for aoedevs marked with DEVFL_TKILL
 	 */
-restart:
+restart2:
 	spin_lock_irqsave(&devlist_lock, flags);
 	for (d = devlist; d; d = d->next) {
 		spin_lock(&d->lock);
@@ -357,7 +364,7 @@ flush(const char __user *str, size_t cnt, int exiting)
 			spin_unlock(&d->lock);
 			spin_unlock_irqrestore(&devlist_lock, flags);
 			freedev(d);
-			goto restart;
+			goto restart2;
 		}
 		spin_unlock(&d->lock);
 	}

From 08d383a74948b43eb6e96c86153e63cbf276f1fa Mon Sep 17 00:00:00 2001
From: Paolo Valente <paolo.valente@linaro.org>
Date: Wed, 7 Aug 2019 16:17:53 +0200
Subject: [PATCH 06/11] block, bfq: reset last_completed_rq_bfqq if the pointed
 queue is freed

Since commit 13a857a4c4e8 ("block, bfq: detect wakers and
unconditionally inject their I/O"), BFQ stores, in a per-device
pointer last_completed_rq_bfqq, the last bfq_queue that had an I/O
request completed. If some bfq_queue receives new I/O right after the
last request of last_completed_rq_bfqq has been completed, then
last_completed_rq_bfqq may be a waker bfq_queue.

But if the bfq_queue last_completed_rq_bfqq points to is freed, then
last_completed_rq_bfqq becomes a dangling reference. This commit
resets last_completed_rq_bfqq if the pointed bfq_queue is freed.

Fixes: 13a857a4c4e8 ("block, bfq: detect wakers and unconditionally inject their I/O")
Reported-by: Douglas Anderson <dianders@chromium.org>
Tested-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 586fcfe227ea..b2009650afc2 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -1924,12 +1924,13 @@ static void bfq_add_request(struct request *rq)
 		 * confirmed no later than during the next
 		 * I/O-plugging interval for bfqq.
 		 */
-		if (!bfq_bfqq_has_short_ttime(bfqq) &&
+		if (bfqd->last_completed_rq_bfqq &&
+		    !bfq_bfqq_has_short_ttime(bfqq) &&
 		    ktime_get_ns() - bfqd->last_completion <
 		    200 * NSEC_PER_USEC) {
 			if (bfqd->last_completed_rq_bfqq != bfqq &&
-				   bfqd->last_completed_rq_bfqq !=
-				   bfqq->waker_bfqq) {
+			    bfqd->last_completed_rq_bfqq !=
+			    bfqq->waker_bfqq) {
 				/*
 				 * First synchronization detected with
 				 * a candidate waker queue, or with a
@@ -4808,6 +4809,9 @@ void bfq_put_queue(struct bfq_queue *bfqq)
 			bfqq->bfqd->burst_size--;
 	}
 
+	if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq)
+		bfqq->bfqd->last_completed_rq_bfqq = NULL;
+
 	kmem_cache_free(bfq_pool, bfqq);
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	bfqg_and_blkg_put(bfqg);

From 3f758e844aa9800eb660d60ee10226fa802594d4 Mon Sep 17 00:00:00 2001
From: Paolo Valente <paolo.valente@linaro.org>
Date: Wed, 7 Aug 2019 16:17:54 +0200
Subject: [PATCH 07/11] block, bfq: move update of waker and woken list to
 queue freeing

Since commit 13a857a4c4e8 ("block, bfq: detect wakers and
unconditionally inject their I/O"), every bfq_queue has a pointer to a
waker bfq_queue and a list of the bfq_queues it may wake. In this
respect, when a bfq_queue, say Q, remains with no I/O source attached
to it, Q cannot be woken by any other bfq_queue, and cannot wake any
other bfq_queue. Then Q must be removed from the woken list of its
possible waker bfq_queue, and all bfq_queues in the woken list of Q
must stop having a waker bfq_queue.

Q remains with no I/O source in two cases: when the last process
associated with Q exits or when such a process gets associated with a
different bfq_queue. Unfortunately, commit 13a857a4c4e8 ("block, bfq:
detect wakers and unconditionally inject their I/O") performed the
above updates only in the first case.

This commit fixes this bug by moving these updates to when Q gets
freed. This is a simple and safe way to handle all cases, as both the
above events, process exit and re-association, lead to Q being freed
soon, and because dangling references would come out only after Q gets
freed (if no update were performed).

Fixes: 13a857a4c4e8 ("block, bfq: detect wakers and unconditionally inject their I/O")
Reported-by: Douglas Anderson <dianders@chromium.org>
Tested-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.c | 44 +++++++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index b2009650afc2..5f477501bb3d 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -4765,6 +4765,8 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
  */
 void bfq_put_queue(struct bfq_queue *bfqq)
 {
+	struct bfq_queue *item;
+	struct hlist_node *n;
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	struct bfq_group *bfqg = bfqq_group(bfqq);
 #endif
@@ -4809,6 +4811,33 @@ void bfq_put_queue(struct bfq_queue *bfqq)
 			bfqq->bfqd->burst_size--;
 	}
 
+	/*
+	 * bfqq does not exist any longer, so it cannot be woken by
+	 * any other queue, and cannot wake any other queue. Then bfqq
+	 * must be removed from the woken list of its possible waker
+	 * queue, and all queues in the woken list of bfqq must stop
+	 * having a waker queue. Strictly speaking, these updates
+	 * should be performed when bfqq remains with no I/O source
+	 * attached to it, which happens before bfqq gets freed. In
+	 * particular, this happens when the last process associated
+	 * with bfqq exits or gets associated with a different
+	 * queue. However, both events lead to bfqq being freed soon,
+	 * and dangling references would come out only after bfqq gets
+	 * freed. So these updates are done here, as a simple and safe
+	 * way to handle all cases.
+	 */
+	/* remove bfqq from woken list */
+	if (!hlist_unhashed(&bfqq->woken_list_node))
+		hlist_del_init(&bfqq->woken_list_node);
+
+	/* reset waker for all queues in woken list */
+	hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
+				  woken_list_node) {
+		item->waker_bfqq = NULL;
+		bfq_clear_bfqq_has_waker(item);
+		hlist_del_init(&item->woken_list_node);
+	}
+
 	if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq)
 		bfqq->bfqd->last_completed_rq_bfqq = NULL;
 
@@ -4839,9 +4868,6 @@ static void bfq_put_cooperator(struct bfq_queue *bfqq)
 
 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
-	struct bfq_queue *item;
-	struct hlist_node *n;
-
 	if (bfqq == bfqd->in_service_queue) {
 		__bfq_bfqq_expire(bfqd, bfqq, BFQQE_BUDGET_TIMEOUT);
 		bfq_schedule_dispatch(bfqd);
@@ -4851,18 +4877,6 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 
 	bfq_put_cooperator(bfqq);
 
-	/* remove bfqq from woken list */
-	if (!hlist_unhashed(&bfqq->woken_list_node))
-		hlist_del_init(&bfqq->woken_list_node);
-
-	/* reset waker for all queues in woken list */
-	hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
-				  woken_list_node) {
-		item->waker_bfqq = NULL;
-		bfq_clear_bfqq_has_waker(item);
-		hlist_del_init(&item->woken_list_node);
-	}
-
 	bfq_put_queue(bfqq); /* release process reference */
 }
 

From fd03177c33b287c6541f4048f1d67b7b45a1abc9 Mon Sep 17 00:00:00 2001
From: Paolo Valente <paolo.valente@linaro.org>
Date: Wed, 7 Aug 2019 19:21:11 +0200
Subject: [PATCH 08/11] block, bfq: handle NULL return value by bfq_init_rq()

As reported in [1], the call bfq_init_rq(rq) may return NULL in case
of OOM (in particular, if rq->elv.icq is NULL because memory
allocation failed in ioc_create_icq()).

This commit handles this circumstance.

[1] https://lkml.org/lkml/2019/7/22/824

Cc: Hsin-Yi Wang <hsinyi@google.com>
Cc: Nicolas Boichat <drinkcat@chromium.org>
Cc: Doug Anderson <dianders@chromium.org>
Reported-by: Guenter Roeck <linux@roeck-us.net>
Reported-by: Hsin-Yi Wang <hsinyi@google.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 5f477501bb3d..b33be928d164 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -2251,9 +2251,14 @@ static void bfq_request_merged(struct request_queue *q, struct request *req,
 	    blk_rq_pos(container_of(rb_prev(&req->rb_node),
 				    struct request, rb_node))) {
 		struct bfq_queue *bfqq = bfq_init_rq(req);
-		struct bfq_data *bfqd = bfqq->bfqd;
+		struct bfq_data *bfqd;
 		struct request *prev, *next_rq;
 
+		if (!bfqq)
+			return;
+
+		bfqd = bfqq->bfqd;
+
 		/* Reposition request in its sort_list */
 		elv_rb_del(&bfqq->sort_list, req);
 		elv_rb_add(&bfqq->sort_list, req);
@@ -2300,6 +2305,9 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
 	struct bfq_queue *bfqq = bfq_init_rq(rq),
 		*next_bfqq = bfq_init_rq(next);
 
+	if (!bfqq)
+		return;
+
 	/*
 	 * If next and rq belong to the same bfq_queue and next is older
 	 * than rq, then reposition rq in the fifo (by substituting next
@@ -5454,12 +5462,12 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 
 	spin_lock_irq(&bfqd->lock);
 	bfqq = bfq_init_rq(rq);
-	if (at_head || blk_rq_is_passthrough(rq)) {
+	if (!bfqq || at_head || blk_rq_is_passthrough(rq)) {
 		if (at_head)
 			list_add(&rq->queuelist, &bfqd->dispatch);
 		else
 			list_add_tail(&rq->queuelist, &bfqd->dispatch);
-	} else { /* bfqq is assumed to be non null here */
+	} else {
 		idle_timer_disabled = __bfq_insert_request(bfqd, rq);
 		/*
 		 * Update bfqq, because, if a queue merge has occurred

From e91455bad5cff40a8c232f2204a5104127e3fec2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 7 Aug 2019 11:36:47 +0200
Subject: [PATCH 09/11] bdev: Fixup error handling in blkdev_get()

Commit 89e524c04fa9 ("loop: Fix mount(2) failure due to race with
LOOP_SET_FD") converted blkdev_get() to use the new helpers for
finishing claiming of a block device. However the conversion botched the
error handling in blkdev_get() and thus the bdev has been marked as held
even in case __blkdev_get() returned error. This led to occasional
warnings with block/001 test from blktests like:

kernel: WARNING: CPU: 5 PID: 907 at fs/block_dev.c:1899 __blkdev_put+0x396/0x3a0

Correct the error handling.

CC: stable@vger.kernel.org
Fixes: 89e524c04fa9 ("loop: Fix mount(2) failure due to race with LOOP_SET_FD")
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 131e2e0582a6..eb657ab94060 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1750,7 +1750,10 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 
 		/* finish claiming */
 		mutex_lock(&bdev->bd_mutex);
-		bd_finish_claiming(bdev, whole, holder);
+		if (!res)
+			bd_finish_claiming(bdev, whole, holder);
+		else
+			bd_abort_claiming(bdev, whole, holder);
 		/*
 		 * Block event polling for write claims if requested.  Any
 		 * write holder makes the write_holder state stick until

From d0a255e795ab976481565f6ac178314b34fbf891 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 8 Aug 2019 11:17:01 -0400
Subject: [PATCH 10/11] loop: set PF_MEMALLOC_NOIO for the worker thread

A deadlock with the stack traces shown below was observed.

The loop thread does a GFP_KERNEL allocation, it calls into dm-bufio
shrinker and the shrinker depends on I/O completion in the dm-bufio
subsystem.

In order to fix the deadlock (and other similar ones), we set the flag
PF_MEMALLOC_NOIO at loop thread entry.

PID: 474    TASK: ffff8813e11f4600  CPU: 10  COMMAND: "kswapd0"
   #0 [ffff8813dedfb938] __schedule at ffffffff8173f405
   #1 [ffff8813dedfb990] schedule at ffffffff8173fa27
   #2 [ffff8813dedfb9b0] schedule_timeout at ffffffff81742fec
   #3 [ffff8813dedfba60] io_schedule_timeout at ffffffff8173f186
   #4 [ffff8813dedfbaa0] bit_wait_io at ffffffff8174034f
   #5 [ffff8813dedfbac0] __wait_on_bit at ffffffff8173fec8
   #6 [ffff8813dedfbb10] out_of_line_wait_on_bit at ffffffff8173ff81
   #7 [ffff8813dedfbb90] __make_buffer_clean at ffffffffa038736f [dm_bufio]
   #8 [ffff8813dedfbbb0] __try_evict_buffer at ffffffffa0387bb8 [dm_bufio]
   #9 [ffff8813dedfbbd0] dm_bufio_shrink_scan at ffffffffa0387cc3 [dm_bufio]
  #10 [ffff8813dedfbc40] shrink_slab at ffffffff811a87ce
  #11 [ffff8813dedfbd30] shrink_zone at ffffffff811ad778
  #12 [ffff8813dedfbdc0] kswapd at ffffffff811ae92f
  #13 [ffff8813dedfbec0] kthread at ffffffff810a8428
  #14 [ffff8813dedfbf50] ret_from_fork at ffffffff81745242

  PID: 14127  TASK: ffff881455749c00  CPU: 11  COMMAND: "loop1"
   #0 [ffff88272f5af228] __schedule at ffffffff8173f405
   #1 [ffff88272f5af280] schedule at ffffffff8173fa27
   #2 [ffff88272f5af2a0] schedule_preempt_disabled at ffffffff8173fd5e
   #3 [ffff88272f5af2b0] __mutex_lock_slowpath at ffffffff81741fb5
   #4 [ffff88272f5af330] mutex_lock at ffffffff81742133
   #5 [ffff88272f5af350] dm_bufio_shrink_count at ffffffffa03865f9 [dm_bufio]
   #6 [ffff88272f5af380] shrink_slab at ffffffff811a86bd
   #7 [ffff88272f5af470] shrink_zone at ffffffff811ad778
   #8 [ffff88272f5af500] do_try_to_free_pages at ffffffff811adb34
   #9 [ffff88272f5af590] try_to_free_pages at ffffffff811adef8
  #10 [ffff88272f5af610] __alloc_pages_nodemask at ffffffff811a09c3
  #11 [ffff88272f5af710] alloc_pages_current at ffffffff811e8b71
  #12 [ffff88272f5af760] new_slab at ffffffff811f4523
  #13 [ffff88272f5af7b0] __slab_alloc at ffffffff8173a1b5
  #14 [ffff88272f5af880] kmem_cache_alloc at ffffffff811f484b
  #15 [ffff88272f5af8d0] do_blockdev_direct_IO at ffffffff812535b3
  #16 [ffff88272f5afb00] __blockdev_direct_IO at ffffffff81255dc3
  #17 [ffff88272f5afb30] xfs_vm_direct_IO at ffffffffa01fe3fc [xfs]
  #18 [ffff88272f5afb90] generic_file_read_iter at ffffffff81198994
  #19 [ffff88272f5afc50] __dta_xfs_file_read_iter_2398 at ffffffffa020c970 [xfs]
  #20 [ffff88272f5afcc0] lo_rw_aio at ffffffffa0377042 [loop]
  #21 [ffff88272f5afd70] loop_queue_work at ffffffffa0377c3b [loop]
  #22 [ffff88272f5afe60] kthread_worker_fn at ffffffff810a8a0c
  #23 [ffff88272f5afec0] kthread at ffffffff810a8428
  #24 [ffff88272f5aff50] ret_from_fork at ffffffff81745242

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 3036883fc9f8..ab7ca5989097 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -885,7 +885,7 @@ static void loop_unprepare_queue(struct loop_device *lo)
 
 static int loop_kthread_worker_fn(void *worker_ptr)
 {
-	current->flags |= PF_LESS_THROTTLE;
+	current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO;
 	return kthread_worker_fn(worker_ptr);
 }
 

From 20621fedb2a696e4dc60bc1c5de37cf21976abcb Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Fri, 9 Aug 2019 14:14:05 +0800
Subject: [PATCH 11/11] bcache: Revert "bcache: use sysfs_match_string()
 instead of __sysfs_match_string()"

This reverts commit 89e0341af082dbc170019f908846f4a424efc86b.

In drivers/md/bcache/sysfs.c:bch_snprint_string_list(), a NULL pointer at
the end of the list is necessary. Removing the NULL last element from each
list causes the following panic:

[ 4340.455652] bcache: register_cache() registered cache device nvme0n1
[ 4340.464603] bcache: register_bdev() registered backing device sdk
[ 4421.587335] bcache: bch_cached_dev_run() cached dev sdk is running already
[ 4421.587348] bcache: bch_cached_dev_attach() Caching sdk as bcache0 on set 354e1d46-d99f-4d8b-870b-078b80dc88a6
[ 5139.247950] general protection fault: 0000 [#1] SMP NOPTI
[ 5139.247970] CPU: 9 PID: 5896 Comm: cat Not tainted 4.12.14-95.29-default #1 SLE12-SP4
[ 5139.247988] Hardware name: HPE ProLiant DL380 Gen10/ProLiant DL380 Gen10, BIOS U30 04/18/2019
[ 5139.248006] task: ffff888fb25c0b00 task.stack: ffff9bbacc704000
[ 5139.248021] RIP: 0010:string+0x21/0x70
[ 5139.248030] RSP: 0018:ffff9bbacc707bf0 EFLAGS: 00010286
[ 5139.248043] RAX: ffffffffa7e432e3 RBX: ffff8881c20da02a RCX: ffff0a00ffffff04
[ 5139.248058] RDX: 3f00656863616362 RSI: ffff8881c20db000 RDI: ffffffffffffffff
[ 5139.248075] RBP: ffff8881c20db000 R08: 0000000000000000 R09: ffff8881c20da02a
[ 5139.248090] R10: 0000000000000004 R11: 0000000000000000 R12: ffff9bbacc707c48
[ 5139.248104] R13: 0000000000000fd6 R14: ffffffffc0665855 R15: ffffffffc0665855
[ 5139.248119] FS:  00007faf253b8700(0000) GS:ffff88903f840000(0000) knlGS:0000000000000000
[ 5139.248137] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 5139.248149] CR2: 00007faf25395008 CR3: 0000000f72150006 CR4: 00000000007606e0
[ 5139.248164] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 5139.248179] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 5139.248193] PKRU: 55555554
[ 5139.248200] Call Trace:
[ 5139.248210]  vsnprintf+0x1fb/0x510
[ 5139.248221]  snprintf+0x39/0x40
[ 5139.248238]  bch_snprint_string_list.constprop.15+0x5b/0x90 [bcache]
[ 5139.248256]  __bch_cached_dev_show+0x44d/0x5f0 [bcache]
[ 5139.248270]  ? __alloc_pages_nodemask+0xb2/0x210
[ 5139.248284]  bch_cached_dev_show+0x2c/0x50 [bcache]
[ 5139.248297]  sysfs_kf_seq_show+0xbb/0x190
[ 5139.248308]  seq_read+0xfc/0x3c0
[ 5139.248317]  __vfs_read+0x26/0x140
[ 5139.248327]  vfs_read+0x87/0x130
[ 5139.248336]  SyS_read+0x42/0x90
[ 5139.248346]  do_syscall_64+0x74/0x160
[ 5139.248358]  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[ 5139.248370] RIP: 0033:0x7faf24eea370
[ 5139.248379] RSP: 002b:00007fff82d03f38 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[ 5139.248395] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007faf24eea370
[ 5139.248411] RDX: 0000000000020000 RSI: 00007faf25396000 RDI: 0000000000000003
[ 5139.248426] RBP: 00007faf25396000 R08: 00000000ffffffff R09: 0000000000000000
[ 5139.248441] R10: 000000007c9d4d41 R11: 0000000000000246 R12: 00007faf25396000
[ 5139.248456] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000fff
[ 5139.248892] Code: ff ff ff 0f 1f 80 00 00 00 00 49 89 f9 48 89 cf 48 c7 c0 e3 32 e4 a7 48 c1 ff 30 48 81 fa ff 0f 00 00 48 0f 46 d0 48 85 ff 74 45 <44> 0f b6 02 48 8d 42 01 45 84 c0 74 38 48 01 fa 4c 89 cf eb 0e

The simplest fix is to revert commit 89e0341af082 ("bcache: use
sysfs_match_string() instead of __sysfs_match_string()").

This bug was introduced in Linux v5.2, so stable tree maintainers only
need to apply this fix to Linux v5.2.

Fixes: 89e0341af082 ("bcache: use sysfs_match_string() instead of __sysfs_match_string()")
Cc: stable@vger.kernel.org
Cc: Alexandru Ardelean <alexandru.ardelean@analog.com>
Reported-by: Peifeng Lin <pflin@suse.com>
Acked-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/sysfs.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 9f0826712845..e2059af90791 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -23,24 +23,28 @@ static const char * const bch_cache_modes[] = {
 	"writethrough",
 	"writeback",
 	"writearound",
-	"none"
+	"none",
+	NULL
 };
 
 /* Default is 0 ("auto") */
 static const char * const bch_stop_on_failure_modes[] = {
 	"auto",
-	"always"
+	"always",
+	NULL
 };
 
 static const char * const cache_replacement_policies[] = {
 	"lru",
 	"fifo",
-	"random"
+	"random",
+	NULL
 };
 
 static const char * const error_actions[] = {
 	"unregister",
-	"panic"
+	"panic",
+	NULL
 };
 
 write_attribute(attach);
@@ -338,7 +342,7 @@ STORE(__cached_dev)
 	}
 
 	if (attr == &sysfs_cache_mode) {
-		v = sysfs_match_string(bch_cache_modes, buf);
+		v = __sysfs_match_string(bch_cache_modes, -1, buf);
 		if (v < 0)
 			return v;
 
@@ -349,7 +353,7 @@ STORE(__cached_dev)
 	}
 
 	if (attr == &sysfs_stop_when_cache_set_failed) {
-		v = sysfs_match_string(bch_stop_on_failure_modes, buf);
+		v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf);
 		if (v < 0)
 			return v;
 
@@ -816,7 +820,7 @@ STORE(__bch_cache_set)
 			    0, UINT_MAX);
 
 	if (attr == &sysfs_errors) {
-		v = sysfs_match_string(error_actions, buf);
+		v = __sysfs_match_string(error_actions, -1, buf);
 		if (v < 0)
 			return v;
 
@@ -1088,7 +1092,7 @@ STORE(__bch_cache)
 	}
 
 	if (attr == &sysfs_cache_replacement_policy) {
-		v = sysfs_match_string(cache_replacement_policies, buf);
+		v = __sysfs_match_string(cache_replacement_policies, -1, buf);
 		if (v < 0)
 			return v;