From fec356a61aa3d3a66416b4321f1279e09e0f256f Mon Sep 17 00:00:00 2001 From: Wu Bo Date: Wed, 19 May 2021 13:01:09 +0800 Subject: [PATCH 1/7] nvmet: fix memory leak in nvmet_alloc_ctrl() When creating ctrl in nvmet_alloc_ctrl(), if the cntlid_min is larger than cntlid_max of the subsystem, and jumps to the "out_free_changed_ns_list" label, but the ctrl->sqs lack of be freed. Fix this by jumping to the "out_free_sqs" label. Fixes: 94a39d61f80f ("nvmet: make ctrl-id configurable") Signed-off-by: Wu Bo Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 25cc2ee8de3f..1853db38b682 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1372,7 +1372,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, goto out_free_changed_ns_list; if (subsys->cntlid_min > subsys->cntlid_max) - goto out_free_changed_ns_list; + goto out_free_sqs; ret = ida_simple_get(&cntlid_ida, subsys->cntlid_min, subsys->cntlid_max, From 03504e3b54cc8118cc26c064e60a0b00c2308708 Mon Sep 17 00:00:00 2001 From: Wu Bo Date: Wed, 19 May 2021 13:01:10 +0800 Subject: [PATCH 2/7] nvme-loop: fix memory leak in nvme_loop_create_ctrl() When creating loop ctrl in nvme_loop_create_ctrl(), if nvme_init_ctrl() fails, the loop ctrl should be freed before jumping to the "out" label. Fixes: 3a85a5de29ea ("nvme-loop: add a NVMe loopback host driver") Signed-off-by: Wu Bo Signed-off-by: Christoph Hellwig --- drivers/nvme/target/loop.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 74b3b150e1a5..cb30cb942e1d 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -590,8 +590,10 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, 0 /* no quirks, we're perfect! */); - if (ret) + if (ret) { + kfree(ctrl); goto out; + } if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) WARN_ON_ONCE(1); From 825619b09ad351894d2c6fb6705f5b3711d145c7 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 17 May 2021 14:07:45 -0700 Subject: [PATCH 3/7] nvme-tcp: fix possible use-after-completion Commit db5ad6b7f8cd ("nvme-tcp: try to send request in queue_rq context") added a second context that may perform a network send. This means that now RX and TX are not serialized in nvme_tcp_io_work and can run concurrently. While there is correct mutual exclusion in the TX path (where the send_mutex protect the queue socket send activity) RX activity, and more specifically request completion may run concurrently. This means we must guarantee that any mutation of the request state related to its lifetime, bytes sent must not be accessed when a completion may have possibly arrived back (and processed). The race may trigger when a request completion arrives, processed _and_ reused as a fresh new request, exactly in the (relatively short) window between the last data payload sent and before the request iov_iter is advanced. Consider the following race: 1. 16K write request is queued 2. The nvme command and the data is sent to the controller (in-capsule or solicited by r2t) 3. After the last payload is sent but before the req.iter is advanced, the controller sends back a completion. 4. The completion is processed, the request is completed, and reused to transfer a new request (write or read) 5. The new request is queued, and the driver reset the request parameters (nvme_tcp_setup_cmd_pdu). 6. Now context in (2) resumes execution and advances the req.iter ==> use-after-completion as this is already a new request. Fix this by making sure the request is not advanced after the last data payload send, knowing that a completion may have arrived already. An alternative solution would have been to delay the request completion or state change waiting for reference counting on the TX path, but besides adding atomic operations to the hot-path, it may present challenges in multi-stage R2T scenarios where a r2t handler needs to be deferred to an async execution. Reported-by: Narayan Ayalasomayajula Tested-by: Anil Mishra Reviewed-by: Keith Busch Cc: stable@vger.kernel.org # v5.8+ Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 0222e23f5936..b97d2732a80f 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -943,7 +943,6 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) if (ret <= 0) return ret; - nvme_tcp_advance_req(req, ret); if (queue->data_digest) nvme_tcp_ddgst_update(queue->snd_hash, page, offset, ret); @@ -960,6 +959,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) } return 1; } + nvme_tcp_advance_req(req, ret); } return -EAGAIN; } From a0fdd1418007f83565d3f2e04b47923ba93a9b8c Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 17 May 2021 15:36:43 -0700 Subject: [PATCH 4/7] nvme-tcp: rerun io_work if req_list is not empty A possible race condition exists where the request to send data is enqueued from nvme_tcp_handle_r2t()'s will not be observed by nvme_tcp_send_all() if it happens to be running. The driver relies on io_work to send the enqueued request when it is runs again, but the concurrently running nvme_tcp_send_all() may not have released the send_mutex at that time. If no future commands are enqueued to re-kick the io_work, the request will timeout in the SEND_H2C state, resulting in a timeout error like: nvme nvme0: queue 1: timeout request 0x3 type 6 Ensure the io_work continues to run as long as the req_list is not empty. Fixes: db5ad6b7f8cdd ("nvme-tcp: try to send request in queue_rq context") Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index b97d2732a80f..34f4b3402f7c 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1140,7 +1140,8 @@ static void nvme_tcp_io_work(struct work_struct *w) pending = true; else if (unlikely(result < 0)) break; - } + } else + pending = !llist_empty(&queue->req_list); result = nvme_tcp_try_recv(queue); if (result > 0) From a7d139145a6640172516b193abf6d2398620aa14 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 10 May 2021 21:56:35 -0700 Subject: [PATCH 5/7] nvme-fc: clear q_live at beginning of association teardown The __nvmf_check_ready() routine used to bounce all filesystem io if the controller state isn't LIVE. However, a later patch changed the logic so that it rejection ends up being based on the Q live check. The FC transport has a slightly different sequence from rdma and tcp for shutting down queues/marking them non-live. FC marks its queue non-live after aborting all ios and waiting for their termination, leaving a rather large window for filesystem io to continue to hit the transport. Unfortunately this resulted in filesystem I/O or applications seeing I/O errors. Change the FC transport to mark the queues non-live at the first sign of teardown for the association (when I/O is initially terminated). Fixes: 73a5379937ec ("nvme-fabrics: allow to queue requests for live queues") Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Reviewed-by: Himanshu Madhani Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index d9ab9e7871d0..256e87721a01 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2461,6 +2461,18 @@ nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) static void __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) { + int q; + + /* + * if aborting io, the queues are no longer good, mark them + * all as not live. + */ + if (ctrl->ctrl.queue_count > 1) { + for (q = 1; q < ctrl->ctrl.queue_count; q++) + clear_bit(NVME_FC_Q_LIVE, &ctrl->queues[q].flags); + } + clear_bit(NVME_FC_Q_LIVE, &ctrl->queues[0].flags); + /* * If io queues are present, stop them and terminate all outstanding * ios on them. As FC allocates FC exchange for each io, the From 6c60ff048ca1e0739f39aa25996543c6e662a46c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 May 2021 15:18:41 +0200 Subject: [PATCH 6/7] block: prevent block device lookups at the beginning of del_gendisk As an artifact of how gendisk lookup used to work in earlier kernels, GENHD_FL_UP is only cleared very late in del_gendisk, and a global lock is used to prevent opens from succeeding while del_gendisk is tearing down the gendisk. Switch to clearing the flag early and under bd_mutex so that callers can use bd_mutex to stabilize the flag, which removes the need for the global mutex. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210514131842.1600568-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 11 +---------- fs/block_dev.c | 15 +++++---------- include/linux/genhd.h | 2 -- 3 files changed, 6 insertions(+), 22 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 39ca97b0edc6..9f8cb7beaad1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -29,8 +29,6 @@ static struct kobject *block_depr; -DECLARE_RWSEM(bdev_lookup_sem); - /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) static DEFINE_IDA(ext_devt_ida); @@ -609,13 +607,8 @@ void del_gendisk(struct gendisk *disk) blk_integrity_del(disk); disk_del_events(disk); - /* - * Block lookups of the disk until all bdevs are unhashed and the - * disk is marked as dead (GENHD_FL_UP cleared). - */ - down_write(&bdev_lookup_sem); - mutex_lock(&disk->part0->bd_mutex); + disk->flags &= ~GENHD_FL_UP; blk_drop_partitions(disk); mutex_unlock(&disk->part0->bd_mutex); @@ -629,8 +622,6 @@ void del_gendisk(struct gendisk *disk) remove_inode_hash(disk->part0->bd_inode); set_capacity(disk, 0); - disk->flags &= ~GENHD_FL_UP; - up_write(&bdev_lookup_sem); if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); diff --git a/fs/block_dev.c b/fs/block_dev.c index eb265d72fce8..580bae995b87 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1298,6 +1298,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) struct gendisk *disk = bdev->bd_disk; int ret = 0; + if (!(disk->flags & GENHD_FL_UP)) + return -ENXIO; + if (!bdev->bd_openers) { if (!bdev_is_partition(bdev)) { ret = 0; @@ -1332,8 +1335,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) whole->bd_part_count++; mutex_unlock(&whole->bd_mutex); - if (!(disk->flags & GENHD_FL_UP) || - !bdev_nr_sectors(bdev)) { + if (!bdev_nr_sectors(bdev)) { __blkdev_put(whole, mode, 1); bdput(whole); return -ENXIO; @@ -1364,16 +1366,12 @@ struct block_device *blkdev_get_no_open(dev_t dev) struct block_device *bdev; struct gendisk *disk; - down_read(&bdev_lookup_sem); bdev = bdget(dev); if (!bdev) { - up_read(&bdev_lookup_sem); blk_request_module(dev); - down_read(&bdev_lookup_sem); - bdev = bdget(dev); if (!bdev) - goto unlock; + return NULL; } disk = bdev->bd_disk; @@ -1383,14 +1381,11 @@ struct block_device *blkdev_get_no_open(dev_t dev) goto put_disk; if (!try_module_get(bdev->bd_disk->fops->owner)) goto put_disk; - up_read(&bdev_lookup_sem); return bdev; put_disk: put_disk(disk); bdput: bdput(bdev); -unlock: - up_read(&bdev_lookup_sem); return NULL; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7e9660ea967d..6fc26f7bdf71 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -306,8 +306,6 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, } #endif /* CONFIG_SYSFS */ -extern struct rw_semaphore bdev_lookup_sem; - dev_t blk_lookup_devt(const char *name, int partno); void blk_request_module(dev_t devt); #ifdef CONFIG_BLOCK From bc6a385132601c29a6da1dbf8148c0d3c9ad36dc Mon Sep 17 00:00:00 2001 From: Gulam Mohamed Date: Fri, 14 May 2021 15:18:42 +0200 Subject: [PATCH 7/7] block: fix a race between del_gendisk and BLKRRPART When BLKRRPART is called concurrently with del_gendisk, the partitions rescan can create a stale partition that will never be be cleaned up. Fix this by checking the the disk is up before rescanning partitions while under bd_mutex. Signed-off-by: Gulam Mohamed [hch: split from a larger patch] Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210514131842.1600568-3-hch@lst.de Signed-off-by: Jens Axboe --- fs/block_dev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/block_dev.c b/fs/block_dev.c index 580bae995b87..4494411fa4d3 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1244,6 +1244,9 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) lockdep_assert_held(&bdev->bd_mutex); + if (!(disk->flags & GENHD_FL_UP)) + return -ENXIO; + rescan: if (bdev->bd_part_count) return -EBUSY;