From e43269e6e5c49d7fec599e6bba71963935b0e4ba Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 14 May 2019 14:07:38 -0600 Subject: [PATCH 01/23] nvme-pci: Fix controller freeze wait disabling If a controller disabling didn't start a freeze, don't wait for the operation to complete. Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2a8708c9ac18..d4e442160048 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2376,7 +2376,7 @@ static void nvme_pci_disable(struct nvme_dev *dev) static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { - bool dead = true; + bool dead = true, freeze = false; struct pci_dev *pdev = to_pci_dev(dev->dev); mutex_lock(&dev->shutdown_lock); @@ -2384,8 +2384,10 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) u32 csts = readl(dev->bar + NVME_REG_CSTS); if (dev->ctrl.state == NVME_CTRL_LIVE || - dev->ctrl.state == NVME_CTRL_RESETTING) + dev->ctrl.state == NVME_CTRL_RESETTING) { + freeze = true; nvme_start_freeze(&dev->ctrl); + } dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || pdev->error_state != pci_channel_io_normal); } @@ -2394,10 +2396,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) * Give the controller a chance to complete all entered requests if * doing a safe shutdown. */ - if (!dead) { - if (shutdown) - nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); - } + if (!dead && shutdown && freeze) + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); nvme_stop_queues(&dev->ctrl); From 39a9dd81f864aa20be896bb34b4bbc2501a2453d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 14 May 2019 14:10:41 -0600 Subject: [PATCH 02/23] nvme-pci: Don't disable on timeout in reset state The reset state doesn't dispatch commands that it needs to wait for anymore. If a timeout occurs in this state, the reset work is already disabling the controller, so just reset the request's timer. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d4e442160048..c72755311ffa 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1298,13 +1298,14 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) shutdown = true; /* fall through */ case NVME_CTRL_CONNECTING: - case NVME_CTRL_RESETTING: dev_warn_ratelimited(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); nvme_dev_disable(dev, shutdown); nvme_req(req)->flags |= NVME_REQ_CANCELLED; return BLK_EH_DONE; + case NVME_CTRL_RESETTING: + return BLK_EH_RESET_TIMER; default: break; } From 2036f7263d70e67d70a67899a468588cb7356bc9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 14 May 2019 14:27:53 -0600 Subject: [PATCH 03/23] nvme-pci: Unblock reset_work on IO failure The reset_work waits for queued IO to complete before setting the controller to live. If any of these times out and requeues, we won't be able to restart the controller because the reset_work is already running. Flush all entered requests to a failed completion if a timeout occurs in the connecting state, and ensure the controller can't transition to the live state after we've unblocked it from waiting for completions. 
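For illustration, here is why the DELETING transition closes the race (a hedged sketch, relying on the 5.2-era controller state machine rather than anything shown in this patch): nvme_change_ctrl_state() only permits a transition to LIVE from the NEW, RESETTING or CONNECTING states, so once the timeout handler has moved a CONNECTING controller to DELETING, the end of reset_work fails roughly like this and unwinds the reset instead of going live:

	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
		dev_warn(dev->ctrl.device,
			 "failed to mark controller live state\n");
		result = -ENODEV;
		goto out;	/* tear the reset down */
	}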
Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c72755311ffa..8df176ffcbc1 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1257,7 +1257,6 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; struct nvme_command cmd; - bool shutdown = false; u32 csts = readl(dev->bar + NVME_REG_CSTS); /* If PCI error recovery process is happening, we cannot reset or @@ -1294,14 +1293,14 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) * shutdown, so we return BLK_EH_DONE. */ switch (dev->ctrl.state) { - case NVME_CTRL_DELETING: - shutdown = true; - /* fall through */ case NVME_CTRL_CONNECTING: + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + /* fall through */ + case NVME_CTRL_DELETING: dev_warn_ratelimited(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); - nvme_dev_disable(dev, shutdown); + nvme_dev_disable(dev, true); nvme_req(req)->flags |= NVME_REQ_CANCELLED; return BLK_EH_DONE; case NVME_CTRL_RESETTING: From d6135c3a1ec0cddda7b8b8e1b5b4abeeafd98289 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 14 May 2019 14:46:09 -0600 Subject: [PATCH 04/23] nvme-pci: Sync queues on reset A controller with multiple namespaces may have multiple request_queues with their own timeout work. If a controller fails with IO outstanding to different namespaces, each request queue may attempt to handle it, so ensure there is no previously scheduled timeout work executing prior to starting controller initialization by synchronizing with each queue.
Reviewed-by: Minwoo Im Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 12 ++++++++++++ drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 1 + 3 files changed, 14 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7da80f375315..f6879e417386 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3880,6 +3880,18 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_start_queues); + +void nvme_sync_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_sync_queue(ns->queue); + up_read(&ctrl->namespaces_rwsem); +} +EXPORT_SYMBOL_GPL(nvme_sync_queues); + /* * Check we didn't inadvertently grow the command structure sizes: */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 5ee75b5ff83f..55553d293a98 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -441,6 +441,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); void nvme_kill_queues(struct nvme_ctrl *ctrl); +void nvme_sync_queues(struct nvme_ctrl *ctrl); void nvme_unfreeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8df176ffcbc1..599065ed6a32 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2492,6 +2492,7 @@ static void nvme_reset_work(struct work_struct *work) */ if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) nvme_dev_disable(dev, false); + nvme_sync_queues(&dev->ctrl); mutex_lock(&dev->shutdown_lock); result = nvme_pci_enable(dev); From 6fa0321a96043b5a983bbefa785859d664645840 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 17 May 2019 10:08:19 -0600 Subject: [PATCH 05/23] nvme: Fix known effects We're trying to append known effects to the ones reported in the controller's log. The original patch accomplished this, but something went wrong when the patch was merged, causing the effects log to override the known effects. Link: http://lists.infradead.org/pipermail/linux-nvme/2019-May/023710.html Fixes: f4524cc45626 ("nvme-pci: add known admin effects to augument admin effects log page") Cc: Maxim Levitsky Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f6879e417386..308b9ce820cd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1257,9 +1257,9 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return 0; } - effects |= nvme_known_admin_effects(opcode); if (ctrl->effects) effects = le32_to_cpu(ctrl->effects->acs[opcode]); + effects |= nvme_known_admin_effects(opcode); /* * For simplicity, IO to all namespaces is quiesced even if the command From 100c815cbd56480b3e31518475b04719c363614a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 May 2019 02:47:33 -0700 Subject: [PATCH 06/23] nvme: fix srcu locking on error return in nvme_get_ns_from_disk If we can't get a namespace, don't leak the SRCU lock. nvme_ioctl was working around this, but nvme_pr_command wasn't handling this properly. Just do what callers would usually expect.
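For illustration, the calling convention after this change looks roughly like the nvme_ioctl() hunk below; the unlock on the failure path now happens inside the helper, so callers only drop the reference on success:

	ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
	if (unlikely(!ns))
		return -EWOULDBLOCK;	/* SRCU already unlocked for us */

	ret = nvme_ns_ioctl(ns, cmd, arg);
	nvme_put_ns_from_disk(head, srcu_idx);	/* balances the read lock */
	return ret;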
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 308b9ce820cd..421bffd95aee 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1361,9 +1361,14 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, { #ifdef CONFIG_NVME_MULTIPATH if (disk->fops == &nvme_ns_head_ops) { + struct nvme_ns *ns; + *head = disk->private_data; *srcu_idx = srcu_read_lock(&(*head)->srcu); - return nvme_find_path(*head); + ns = nvme_find_path(*head); + if (!ns) + srcu_read_unlock(&(*head)->srcu, *srcu_idx); + return ns; } #endif *head = NULL; @@ -1410,9 +1415,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); if (unlikely(!ns)) - ret = -EWOULDBLOCK; - else - ret = nvme_ns_ioctl(ns, cmd, arg); + return -EWOULDBLOCK; + + ret = nvme_ns_ioctl(ns, cmd, arg); nvme_put_ns_from_disk(head, srcu_idx); return ret; } From 3f98bcc58cd5f1e4668db289dcab771874cc0920 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 May 2019 02:47:34 -0700 Subject: [PATCH 07/23] nvme: remove the ifdef around nvme_nvm_ioctl We already have a proper stub if lightnvm is not enabled, so don't bother with the ifdef. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 421bffd95aee..4352f8582213 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1395,10 +1395,8 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg) case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); default: -#ifdef CONFIG_NVM if (ns->ndev) return nvme_nvm_ioctl(ns, cmd, arg); -#endif if (is_sed_ioctl(cmd)) return sed_ioctl(ns->ctrl->opal_dev, cmd, (void __user *) arg); From 90ec611adcf20b96d0c2b7166497d53e4301a57f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 May 2019 02:47:35 -0700 Subject: [PATCH 08/23] nvme: merge nvme_ns_ioctl into nvme_ioctl Merge the two functions to make future changes a little easier. 
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 47 ++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4352f8582213..bc288990fd50 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1382,32 +1382,11 @@ static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) srcu_read_unlock(&head->srcu, idx); } -static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg) -{ - switch (cmd) { - case NVME_IOCTL_ID: - force_successful_syscall_return(); - return ns->head->ns_id; - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); - case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg); - case NVME_IOCTL_SUBMIT_IO: - return nvme_submit_io(ns, (void __user *)arg); - default: - if (ns->ndev) - return nvme_nvm_ioctl(ns, cmd, arg); - if (is_sed_ioctl(cmd)) - return sed_ioctl(ns->ctrl->opal_dev, cmd, - (void __user *) arg); - return -ENOTTY; - } -} - static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct nvme_ns_head *head = NULL; + void __user *argp = (void __user *)arg; struct nvme_ns *ns; int srcu_idx, ret; @@ -1415,7 +1394,29 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, if (unlikely(!ns)) return -EWOULDBLOCK; - ret = nvme_ns_ioctl(ns, cmd, arg); + switch (cmd) { + case NVME_IOCTL_ID: + force_successful_syscall_return(); + ret = ns->head->ns_id; + break; + case NVME_IOCTL_ADMIN_CMD: + ret = nvme_user_cmd(ns->ctrl, NULL, argp); + break; + case NVME_IOCTL_IO_CMD: + ret = nvme_user_cmd(ns->ctrl, ns, argp); + break; + case NVME_IOCTL_SUBMIT_IO: + ret = nvme_submit_io(ns, argp); + break; + default: + if (ns->ndev) + ret = nvme_nvm_ioctl(ns, cmd, arg); + else if (is_sed_ioctl(cmd)) + ret = sed_ioctl(ns->ctrl->opal_dev, cmd, argp); + else + ret = -ENOTTY; + } + nvme_put_ns_from_disk(head, srcu_idx); return ret; } From 5fb4aac756acacf260b9ebd88747251effa3a2f2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 May 2019 11:47:36 +0200 Subject: [PATCH 09/23] nvme: release namespace SRCU protection before performing controller ioctls Holding the SRCU critical section protecting the namespace list can cause deadlocks when using the per-namespace admin passthrough ioctl to delete a namespace. Release it earlier when performing per-controller ioctls to avoid that. Reported-by: Kenneth Heitke Reviewed-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bc288990fd50..d4226c18eb71 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1394,14 +1394,31 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, if (unlikely(!ns)) return -EWOULDBLOCK; + /* + * Handle ioctls that apply to the controller instead of the namespace + * separately and drop the ns SRCU reference early. This avoids a + * deadlock when deleting namespaces using the passthrough interface.
+ */ + if (cmd == NVME_IOCTL_ADMIN_CMD || is_sed_ioctl(cmd)) { + struct nvme_ctrl *ctrl = ns->ctrl; + + nvme_get_ctrl(ns->ctrl); + nvme_put_ns_from_disk(head, srcu_idx); + + if (cmd == NVME_IOCTL_ADMIN_CMD) + ret = nvme_user_cmd(ctrl, NULL, argp); + else + ret = sed_ioctl(ctrl->opal_dev, cmd, argp); + + nvme_put_ctrl(ctrl); + return ret; + } + switch (cmd) { case NVME_IOCTL_ID: force_successful_syscall_return(); ret = ns->head->ns_id; break; - case NVME_IOCTL_ADMIN_CMD: - ret = nvme_user_cmd(ns->ctrl, NULL, argp); - break; case NVME_IOCTL_IO_CMD: ret = nvme_user_cmd(ns->ctrl, ns, argp); break; @@ -1411,8 +1428,6 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, default: if (ns->ndev) ret = nvme_nvm_ioctl(ns, cmd, arg); - else if (is_sed_ioctl(cmd)) - ret = sed_ioctl(ns->ctrl->opal_dev, cmd, argp); else ret = -ENOTTY; } From 510a405d945bc985abc513fafe45890cac34fafa Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Thu, 16 May 2019 19:30:07 -0700 Subject: [PATCH 10/23] nvme: fix memory leak for power latency tolerance Unconditionally hide device pm latency tolerance when uninitializing the controller to ensure all qos resources are released so that we're not leaking this memory. This is safe to call if none were allocated in the first place, or were previously freed. Fixes: c5552fde102fc("nvme: Enable autonomous power state transitions") Suggested-by: Keith Busch Tested-by: David Milburn Signed-off-by: Yufen Yu [changelog] Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d4226c18eb71..e1449c196f20 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3700,6 +3700,7 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl); void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) { + dev_pm_qos_hide_latency_tolerance(ctrl->device); cdev_device_del(&ctrl->cdev, ctrl->device); } EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); From 2d466c7a574d0b893a233735f133c60115013c0e Mon Sep 17 00:00:00 2001 From: Laine Walker-Avina Date: Mon, 20 May 2019 10:13:04 -0700 Subject: [PATCH 11/23] nvme: copy MTFA field from identify controller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We use the controller's reported maximum firmware activation time as our timeout before resetting a controller for a failed activation notice, but this value was never being read so we could only use the default timeout. Copy the Identify Controller MTFA field to the corresponding nvme_ctrl's mtfa field. Fixes: b6dccf7fae433 (“nvme: add support for FW activation without reset”). Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Reviewed-by: Minwoo Im Signed-off-by: Laine Walker-Avina [changelog, fix endian] Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e1449c196f20..1b7c2afd84cb 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2576,6 +2576,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->oacs = le16_to_cpu(id->oacs); ctrl->oncs = le16_to_cpu(id->oncs); + ctrl->mtfa = le16_to_cpu(id->mtfa); ctrl->oaes = le32_to_cpu(id->oaes); atomic_set(&ctrl->abort_limit, id->acl + 1); ctrl->vwc = id->vwc; From 0decfd8bd8237eb470f699a0ecfa126c4befb03b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 21 May 2019 09:04:57 -0600 Subject: [PATCH 12/23] nvme: update MAINTAINERS Use my kernel.org email for nvme. This forwards to all my accounts. 
Signed-off-by: Keith Busch --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 005902ea1450..032a6aa728be 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11227,7 +11227,7 @@ F: drivers/video/fbdev/riva/ F: drivers/video/fbdev/nvidia/ NVM EXPRESS DRIVER -M: Keith Busch +M: Keith Busch M: Jens Axboe M: Christoph Hellwig M: Sagi Grimberg From cb9e0e5006064a807b5d722c7e3c42f307193792 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 21 May 2019 10:56:43 -0600 Subject: [PATCH 13/23] nvme-pci: use blk-mq mapping for unmanaged irqs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a device is providing a single IRQ vector, the IO queue will share that vector with the admin queue. This is an unmanaged vector, so does not have a valid PCI IRQ affinity. Avoid trying to extract a managed affinity in this case and let blk-mq set up the cpu:queue mapping instead. Otherwise we'd hit the following warning when the device is using MSI: WARNING: CPU: 4 PID: 7 at drivers/pci/msi.c:1272 pci_irq_get_affinity+0x66/0x80 Modules linked in: nvme nvme_core serio_raw CPU: 4 PID: 7 Comm: kworker/u16:0 Tainted: G W 5.2.0-rc1+ #494 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 Workqueue: nvme-reset-wq nvme_reset_work [nvme] RIP: 0010:pci_irq_get_affinity+0x66/0x80 Code: 0b 31 c0 c3 83 e2 10 48 c7 c0 b0 83 35 91 74 2a 48 8b 87 d8 03 00 00 48 85 c0 74 0e 48 8b 50 30 48 85 d2 74 05 39 70 14 77 05 <0f> 0b 31 c0 c3 48 63 f6 48 8d 04 76 48 8d 04 c2 f3 c3 48 8b 40 30 RSP: 0000:ffffb5abc01d3cc8 EFLAGS: 00010246 RAX: ffff9536786a39c0 RBX: 0000000000000000 RCX: 0000000000000080 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff9536781ed000 RBP: ffff95367346a008 R08: ffff95367d43f080 R09: ffff953678c07800 R10: ffff953678164800 R11: 0000000000000000 R12: 0000000000000000 R13: ffff9536781ed000 R14: 00000000ffffffff R15: ffff95367346a008 FS: 0000000000000000(0000) GS:ffff95367d400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fdf814a3ff0 CR3: 000000001a20f000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: blk_mq_pci_map_queues+0x37/0xd0 nvme_pci_map_queues+0x80/0xb0 [nvme] blk_mq_alloc_tag_set+0x133/0x2f0 nvme_reset_work+0x105d/0x1590 [nvme] process_one_work+0x291/0x530 worker_thread+0x218/0x3d0 ? process_one_work+0x530/0x530 kthread+0x111/0x130 ? 
kthread_park+0x90/0x90 ret_from_fork+0x1f/0x30 ---[ end trace 74587339d93c83c0 ]--- Fixes: 22b5560195bd6 ("nvme-pci: Separate IO and admin queue IRQ vectors") Reported-by: Iván Chavero Reviewed-by: Ming Lei Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 599065ed6a32..f562154551ce 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -464,7 +464,7 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set) * affinity), so use the regular blk-mq cpu mapping */ map->queue_offset = qoff; - if (i != HCTX_TYPE_POLL) + if (i != HCTX_TYPE_POLL && offset) blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset); else blk_mq_map_queues(map); From 389468e76b6778c714af49f590de9efe3415c162 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Fri, 17 May 2019 16:29:18 -0400 Subject: [PATCH 14/23] aoe: list new maintainer for aoe driver Justin Sanders, who has extensive experience with ATA over Ethernet in general and AoE SCSI and block-device drivers in particular, is ready to take on the role of aoe maintainer. The driver needs a more active maintainer. Signed-off-by: Ed Cashin Signed-off-by: Jens Axboe --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 9cc6767e1b12..634f1d035767 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2627,7 +2627,7 @@ F: Documentation/devicetree/bindings/eeprom/at24.txt F: drivers/misc/eeprom/at24.c ATA OVER ETHERNET (AOE) DRIVER -M: "Ed L. Cashin" +M: "Justin Sanders" W: http://www.openaoe.org/ S: Supported F: Documentation/aoe/ From f381c6a4bd0ae0fde2d6340f1b9bb0f58d915de6 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Mon, 20 May 2019 19:23:56 +0200 Subject: [PATCH 15/23] bio: fix improper use of smp_mb__before_atomic() This barrier only applies to the read-modify-write operations; in particular, it does not apply to the atomic_set() primitive. Replace the barrier with an smp_mb(). Fixes: dac56212e8127 ("bio: skip atomic inc/dec of ->bi_cnt for most use cases") Cc: stable@vger.kernel.org Reported-by: "Paul E. McKenney" Reported-by: Peter Zijlstra Signed-off-by: Andrea Parri Reviewed-by: Ming Lei Cc: Jens Axboe Cc: Ming Lei Cc: linux-block@vger.kernel.org Cc: "Paul E. McKenney" Cc: Peter Zijlstra Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index ea73df36529a..0f23b5682640 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -210,7 +210,7 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count) { if (count != 1) { bio->bi_flags |= (1 << BIO_REFFED); - smp_mb__before_atomic(); + smp_mb(); } atomic_set(&bio->__bi_cnt, count); } From a0934fd2b1208458e55fc4b48f55889809fce666 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Mon, 20 May 2019 19:23:57 +0200 Subject: [PATCH 16/23] sbitmap: fix improper use of smp_mb__before_atomic() This barrier only applies to the read-modify-write operations; in particular, it does not apply to the atomic_set() primitive. Replace the barrier with an smp_mb(). Fixes: 6c0ca7ae292ad ("sbitmap: fix wakeup hang after sbq resize") Cc: stable@vger.kernel.org Reported-by: "Paul E. McKenney" Reported-by: Peter Zijlstra Signed-off-by: Andrea Parri Reviewed-by: Ming Lei Cc: Jens Axboe Cc: Omar Sandoval Cc: Ming Lei Cc: linux-block@vger.kernel.org Cc: "Paul E. 
McKenney" Cc: Peter Zijlstra Signed-off-by: Jens Axboe --- lib/sbitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 155fe38756ec..4a7fc4915dfc 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -435,7 +435,7 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, * to ensure that the batch size is updated before the wait * counts. */ - smp_mb__before_atomic(); + smp_mb(); for (i = 0; i < SBQ_WAIT_QUEUES; i++) atomic_set(&sbq->ws[i].wait_cnt, 1); } From eded341c085bebdd653f8086c02179098cb81748 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 May 2019 09:01:40 +0200 Subject: [PATCH 17/23] block: don't decrement nr_phys_segments for physically contiguous segments Currently ll_merge_requests_fn, unlike all other merge functions, reduces nr_phys_segments by one if the last segment of the previous request and the first segment of the next request are contiguous. While this seems like a nice solution to avoid building smaller than possible requests it causes a mismatch between the segments actually present in the request and those iterated over by the bvec iterators, including __rq_for_each_bio. This can for example mistrigger the single segment optimization in the nvme-pci driver, and might lead to a mismatching nr_phys_segments count when recalculating the number of segments when inserting a cloned request. We could possibly work around this by making the bvec iterators take the front and back segment size into account, but that would require moving them from the bio to the bio_iter and spreading this mess over all users of bvecs. Or we could simply remove this optimization under the assumption that most users already build good enough bvecs, and that the bio merge path never cared about this optimization either. The latter is what this patch does. Fixes: dff824b2aadb ("nvme-pci: optimize mapping of small single segment requests")
Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 21e87a714a73..80a5a0facb87 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -358,7 +358,6 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, unsigned front_seg_size; struct bio *fbio, *bbio; struct bvec_iter iter; - bool new_bio = false; if (!bio) return 0; @@ -379,31 +378,12 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, nr_phys_segs = 0; for_each_bio(bio) { bio_for_each_bvec(bv, bio, iter) { - if (new_bio) { - if (seg_size + bv.bv_len - > queue_max_segment_size(q)) - goto new_segment; - if (!biovec_phys_mergeable(q, &bvprv, &bv)) - goto new_segment; - - seg_size += bv.bv_len; - - if (nr_phys_segs == 1 && seg_size > - front_seg_size) - front_seg_size = seg_size; - - continue; - } -new_segment: bvec_split_segs(q, &bv, &nr_phys_segs, &seg_size, &front_seg_size, NULL, UINT_MAX); - new_bio = false; } bbio = bio; - if (likely(bio->bi_iter.bi_size)) { + if (likely(bio->bi_iter.bi_size)) bvprv = bv; - new_bio = true; - } } fbio->bi_seg_front_size = front_seg_size; @@ -725,7 +705,6 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, req->bio->bi_seg_front_size = seg_size; if (next->nr_phys_segments == 1) next->biotail->bi_seg_back_size = seg_size; - total_phys_segments--; } if (total_phys_segments > queue_max_segments(q)) From 09324d32d2a0843e66652a087da6f77924358e62 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 May 2019 09:01:41 +0200 Subject: [PATCH 18/23] block: force an unlimited segment size on queues with a virt boundary We currently fail to update the front/back segment size in the bio when deciding to allow an otherwise gappy segment to a device with a virt boundary. The reason why this did not cause problems is that devices with a virt boundary fundamentally don't use segments as we know them and thus don't care. Make that assumption formal by forcing an unlimited segment size in this case. Fixes: f6970f83ef79 ("block: don't check if adjacent bvecs in one bio can be mergeable") Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-settings.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/block/blk-settings.c b/block/blk-settings.c index 3facc41476be..2ae348c101a0 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -310,6 +310,9 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) __func__, max_size); } + /* see blk_queue_virt_boundary() for the explanation */ + WARN_ON_ONCE(q->limits.virt_boundary_mask); + q->limits.max_segment_size = max_size; } EXPORT_SYMBOL(blk_queue_max_segment_size); @@ -742,6 +745,14 @@ EXPORT_SYMBOL(blk_queue_segment_boundary); void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask) { q->limits.virt_boundary_mask = mask; + + /* + * Devices that require a virtual boundary do not support scatter/gather + * I/O natively, but instead require a descriptor list entry for each + * page (which might not be identical to the Linux PAGE_SIZE). Because + * of that they are not limited by our notion of "segment size". + */
+ q->limits.max_segment_size = UINT_MAX; } EXPORT_SYMBOL(blk_queue_virt_boundary); From 200a9aff7b02feea30b01141b0df9bc19457a232 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 May 2019 09:01:42 +0200 Subject: [PATCH 19/23] block: remove the segment size check in bio_will_gap We fundamentally do not have a maximum segment size for devices with a virt boundary. So don't bother checking it, especially given that the existing checks didn't properly work to start with, as we never fully update the front/back segment size and miss the bi_seg_front_size that would have been required for some cases. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-merge.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 80a5a0facb87..eee2c02c50ce 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -12,23 +12,6 @@ #include "blk.h" -/* - * Check if the two bvecs from two bios can be merged to one segment. If yes, - * no need to check gap between the two bios since the 1st bio and the 1st bvec - * in the 2nd bio can be handled in one segment. - */ -static inline bool bios_segs_mergeable(struct request_queue *q, - struct bio *prev, struct bio_vec *prev_last_bv, - struct bio_vec *next_first_bv) -{ - if (!biovec_phys_mergeable(q, prev_last_bv, next_first_bv)) - return false; - if (prev->bi_seg_back_size + next_first_bv->bv_len > - queue_max_segment_size(q)) - return false; - return true; -} - static inline bool bio_will_gap(struct request_queue *q, struct request *prev_rq, struct bio *prev, struct bio *next) { @@ -60,7 +43,7 @@ static inline bool bio_will_gap(struct request_queue *q, */ bio_get_last_bvec(prev, &pb); bio_get_first_bvec(next, &nb); - if (bios_segs_mergeable(q, prev, &pb, &nb)) + if (biovec_phys_mergeable(q, &pb, &nb)) return false; return __bvec_gap_to_prev(q, &pb, nb.bv_offset); } From 6869875fbc04042ad01654591da60862706e86e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 May 2019 09:01:43 +0200 Subject: [PATCH 20/23] block: remove the bi_seg_{front,back}_size fields in struct bio At this point these fields aren't used for anything, so we can remove them. Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 94 +++++---------------------------------- include/linux/blk_types.h | 7 --- 2 files changed, 12 insertions(+), 89 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index eee2c02c50ce..17713d7d98d5 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -162,8 +162,7 @@ static unsigned get_max_segment_size(struct request_queue *q, * variables.
*/ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv, - unsigned *nsegs, unsigned *last_seg_size, - unsigned *front_seg_size, unsigned *sectors, unsigned max_segs) + unsigned *nsegs, unsigned *sectors, unsigned max_segs) { unsigned len = bv->bv_len; unsigned total_len = 0; @@ -185,28 +184,12 @@ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv, break; } - if (!new_nsegs) - return !!len; - - /* update front segment size */ - if (!*nsegs) { - unsigned first_seg_size; - - if (new_nsegs == 1) - first_seg_size = get_max_segment_size(q, bv->bv_offset); - else - first_seg_size = queue_max_segment_size(q); - - if (*front_seg_size < first_seg_size) - *front_seg_size = first_seg_size; + if (new_nsegs) { + *nsegs += new_nsegs; + if (sectors) + *sectors += total_len >> 9; } - /* update other varibles */ - *last_seg_size = seg_size; - *nsegs += new_nsegs; - if (sectors) - *sectors += total_len >> 9; - /* split in the middle of the bvec if len != 0 */ return !!len; } @@ -218,8 +201,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, { struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; - unsigned seg_size = 0, nsegs = 0, sectors = 0; - unsigned front_seg_size = bio->bi_seg_front_size; + unsigned nsegs = 0, sectors = 0; bool do_split = true; struct bio *new = NULL; const unsigned max_sectors = get_max_io_size(q, bio); @@ -243,8 +225,6 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, /* split in the middle of bvec */ bv.bv_len = (max_sectors - sectors) << 9; bvec_split_segs(q, &bv, &nsegs, - &seg_size, - &front_seg_size, §ors, max_segs); } goto split; @@ -258,12 +238,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) { nsegs++; - seg_size = bv.bv_len; sectors += bv.bv_len >> 9; - if (nsegs == 1 && seg_size > front_seg_size) - front_seg_size = seg_size; - } else if (bvec_split_segs(q, &bv, &nsegs, &seg_size, - &front_seg_size, §ors, max_segs)) { + } else if (bvec_split_segs(q, &bv, &nsegs, §ors, + max_segs)) { goto split; } } @@ -278,10 +255,6 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, bio = new; } - bio->bi_seg_front_size = front_seg_size; - if (seg_size > bio->bi_seg_back_size) - bio->bi_seg_back_size = seg_size; - return do_split ? 
new : NULL; } @@ -336,17 +309,13 @@ EXPORT_SYMBOL(blk_queue_split); static unsigned int __blk_recalc_rq_segments(struct request_queue *q, struct bio *bio) { - struct bio_vec uninitialized_var(bv), bvprv = { NULL }; - unsigned int seg_size, nr_phys_segs; - unsigned front_seg_size; - struct bio *fbio, *bbio; + unsigned int nr_phys_segs = 0; struct bvec_iter iter; + struct bio_vec bv; if (!bio) return 0; - front_seg_size = bio->bi_seg_front_size; - switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: @@ -356,23 +325,11 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, return 1; } - fbio = bio; - seg_size = 0; - nr_phys_segs = 0; for_each_bio(bio) { - bio_for_each_bvec(bv, bio, iter) { - bvec_split_segs(q, &bv, &nr_phys_segs, &seg_size, - &front_seg_size, NULL, UINT_MAX); - } - bbio = bio; - if (likely(bio->bi_iter.bi_size)) - bvprv = bv; + bio_for_each_bvec(bv, bio, iter) + bvec_split_segs(q, &bv, &nr_phys_segs, NULL, UINT_MAX); } - fbio->bi_seg_front_size = front_seg_size; - if (seg_size > bbio->bi_seg_back_size) - bbio->bi_seg_back_size = seg_size; - return nr_phys_segs; } @@ -392,24 +349,6 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio) bio_set_flag(bio, BIO_SEG_VALID); } -static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, - struct bio *nxt) -{ - struct bio_vec end_bv = { NULL }, nxt_bv; - - if (bio->bi_seg_back_size + nxt->bi_seg_front_size > - queue_max_segment_size(q)) - return 0; - - if (!bio_has_data(bio)) - return 1; - - bio_get_last_bvec(bio, &end_bv); - bio_get_first_bvec(nxt, &nxt_bv); - - return biovec_phys_mergeable(q, &end_bv, &nxt_bv); -} - static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) { @@ -669,8 +608,6 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { int total_phys_segments; - unsigned int seg_size = - req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size; if (req_gap_back_merge(req, next->bio)) return 0; @@ -683,13 +620,6 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, return 0; total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; - if (blk_phys_contig_segment(q, req->biotail, next->bio)) { - if (req->nr_phys_segments == 1) - req->bio->bi_seg_front_size = seg_size; - if (next->nr_phys_segments == 1) - next->biotail->bi_seg_back_size = seg_size; - } - if (total_phys_segments > queue_max_segments(q)) return 0; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index be418275763c..95202f80676c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -159,13 +159,6 @@ struct bio { */ unsigned int bi_phys_segments; - /* - * To keep track of the max segment size, we account for the - * sizes of the first and last mergeable segments in this bio. - */ - unsigned int bi_seg_front_size; - unsigned int bi_seg_back_size; - struct bvec_iter bi_iter; atomic_t __bi_remaining; From 7996a8b5511a72465b0b286763c2d8f412b8874a Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 21 May 2019 11:25:55 +0800 Subject: [PATCH 21/23] blk-mq: fix hang caused by freeze/unfreeze sequence The following is a description of a hang in blk_mq_freeze_queue_wait(). The hang happens on attempt to freeze a queue while another task does queue unfreeze. 
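Before the fix, both paths bumped an atomic depth counter and then acted on the percpu ref with no mutual exclusion between the two side effects, roughly (a hedged reconstruction of the pre-patch logic, visible as the removed lines in the diff below):

	/* freeze */
	if (atomic_inc_return(&q->mq_freeze_depth) == 1)
		percpu_ref_kill(&q->q_usage_counter);

	/* unfreeze */
	if (atomic_dec_return(&q->mq_freeze_depth) == 0)
		percpu_ref_resurrect(&q->q_usage_counter);

The counter itself stays consistent, but nothing orders one task's percpu_ref_kill() against another task's percpu_ref_resurrect().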
The root cause is an incorrect sequence of percpu_ref_resurrect() and percpu_ref_kill() and as a result those two can be swapped: CPU#0 CPU#1 ---------------- ----------------- q1 = blk_mq_init_queue(shared_tags) q2 = blk_mq_init_queue(shared_tags): blk_mq_add_queue_tag_set(shared_tags): blk_mq_update_tag_set_depth(shared_tags): list_for_each_entry() blk_mq_freeze_queue(q1) > percpu_ref_kill() > blk_mq_freeze_queue_wait() blk_cleanup_queue(q1) blk_mq_freeze_queue(q1) > percpu_ref_kill() ^^^^^^ freeze_depth can't guarantee the order blk_mq_unfreeze_queue() > percpu_ref_resurrect() > blk_mq_freeze_queue_wait() ^^^^^^ Hang here!!!! This wrong sequence raises kernel warning: percpu_ref_kill_and_confirm called more than once on blk_queue_usage_counter_release! WARNING: CPU: 0 PID: 11854 at lib/percpu-refcount.c:336 percpu_ref_kill_and_confirm+0x99/0xb0 But the most unpleasant effect is a hang of a blk_mq_freeze_queue_wait(), which waits for a zero of a q_usage_counter, which never happens because percpu-ref was reinited (instead of being killed) and stays in PERCPU state forever. How to reproduce: - "insmod null_blk.ko shared_tags=1 nr_devices=0 queue_mode=2" - cpu0: python Script.py 0; taskset the corresponding process running on cpu0 - cpu1: python Script.py 1; taskset the corresponding process running on cpu1 Script.py: ------ #!/usr/bin/python3 import os import sys while True: on = "echo 1 > /sys/kernel/config/nullb/%s/power" % sys.argv[1] off = "echo 0 > /sys/kernel/config/nullb/%s/power" % sys.argv[1] os.system(on) os.system(off) ------ This bug was first reported and fixed by Roman, previous discussion: [1] Message id: 1443287365-4244-7-git-send-email-akinobu.mita@gmail.com [2] Message id: 1443563240-29306-6-git-send-email-tj@kernel.org [3] https://patchwork.kernel.org/patch/9268199/ Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Roman Pen Signed-off-by: Bob Liu Signed-off-by: Jens Axboe --- block/blk-core.c | 3 ++- block/blk-mq.c | 19 ++++++++++--------- include/linux/blkdev.h | 7 ++++++- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 419d600e6637..1bf83a0df0f6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -413,7 +413,7 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) smp_rmb(); wait_event(q->mq_freeze_wq, - (atomic_read(&q->mq_freeze_depth) == 0 && + (!q->mq_freeze_depth && (pm || (blk_pm_request_resume(q), !blk_queue_pm_only(q)))) || blk_queue_dying(q)); @@ -503,6 +503,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); + mutex_init(&q->mq_freeze_lock); /* * Init percpu_ref in atomic mode so that it's faster to shutdown. 
diff --git a/block/blk-mq.c b/block/blk-mq.c index 08a6248d8536..32b8ad3d341b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -144,13 +144,14 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, void blk_freeze_queue_start(struct request_queue *q) { - int freeze_depth; - - freeze_depth = atomic_inc_return(&q->mq_freeze_depth); - if (freeze_depth == 1) { + mutex_lock(&q->mq_freeze_lock); + if (++q->mq_freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); + mutex_unlock(&q->mq_freeze_lock); if (queue_is_mq(q)) blk_mq_run_hw_queues(q, false); + } else { + mutex_unlock(&q->mq_freeze_lock); } } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -199,14 +200,14 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); void blk_mq_unfreeze_queue(struct request_queue *q) { - int freeze_depth; - - freeze_depth = atomic_dec_return(&q->mq_freeze_depth); - WARN_ON_ONCE(freeze_depth < 0); - if (!freeze_depth) { + mutex_lock(&q->mq_freeze_lock); + q->mq_freeze_depth--; + WARN_ON_ONCE(q->mq_freeze_depth < 0); + if (!q->mq_freeze_depth) { percpu_ref_resurrect(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); } + mutex_unlock(&q->mq_freeze_lock); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1aafeb923e7b..592669bcc536 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -542,7 +542,7 @@ struct request_queue { struct list_head unused_hctx_list; spinlock_t unused_hctx_lock; - atomic_t mq_freeze_depth; + int mq_freeze_depth; #if defined(CONFIG_BLK_DEV_BSG) struct bsg_class_device bsg_dev; @@ -554,6 +554,11 @@ struct request_queue { #endif struct rcu_head rcu_head; wait_queue_head_t mq_freeze_wq; + /* + * Protect concurrent access to q_usage_counter by + * percpu_ref_kill() and percpu_ref_reinit(). + */ + struct mutex mq_freeze_lock; struct percpu_ref q_usage_counter; struct blk_mq_tag_set *tag_set; From 486f069253c3c738dec62daeb16f7232b2cca065 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 22 May 2019 08:47:54 -0600 Subject: [PATCH 22/23] tools/io_uring: fix Makefile for pthread library link Currently fails with: io_uring-bench.o: In function `main': /home/axboe/git/linux-block/tools/io_uring/io_uring-bench.c:560: undefined reference to `pthread_create' /home/axboe/git/linux-block/tools/io_uring/io_uring-bench.c:588: undefined reference to `pthread_join' collect2: error: ld returned 1 exit status Makefile:11: recipe for target 'io_uring-bench' failed make: *** [io_uring-bench] Error 1 Move -lpthread to the end. Signed-off-by: Jens Axboe --- tools/io_uring/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/io_uring/Makefile b/tools/io_uring/Makefile index f79522fc37b5..00f146c54c53 100644 --- a/tools/io_uring/Makefile +++ b/tools/io_uring/Makefile @@ -8,7 +8,7 @@ all: io_uring-cp io_uring-bench $(CC) $(CFLAGS) -o $@ $^ io_uring-bench: syscall.o io_uring-bench.o - $(CC) $(CFLAGS) $(LDLIBS) -o $@ $^ + $(CC) $(CFLAGS) -o $@ $^ $(LDLIBS) io_uring-cp: setup.o syscall.o queue.o From 004d564f908790efe815a6510a542ac1227ef2a2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 22 May 2019 08:59:12 -0600 Subject: [PATCH 23/23] tools/io_uring: sync with liburing Various fixes and changes have been applied to liburing since we copied some select bits to the kernel testing/examples part, sync up with liburing to get those changes. Most notable is the change that split the CQE reading into the peek and seen event, instead of being just a single function. 
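With the split API, a consumer peeks (or waits) for a CQE and then explicitly marks it as seen; a minimal sketch (handle_completion() is a hypothetical callback, the io_uring_* calls are the ones introduced by this patch):

	struct io_uring_cqe *cqe;
	int ret;

	ret = io_uring_wait_cqe(&ring, &cqe);	/* or io_uring_peek_cqe() */
	if (ret < 0)
		return ret;
	if (cqe) {
		handle_completion(io_uring_cqe_get_data(cqe));
		io_uring_cqe_seen(&ring, cqe);	/* advances the CQ head */
	}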
Also fixes an unsigned wrap issue in io_uring_submit(), leak of 'fd' in setup if we fail, and various other little issues. Signed-off-by: Jens Axboe --- tools/io_uring/io_uring-cp.c | 23 +++++++++---- tools/io_uring/liburing.h | 64 +++++++++++++++++++++++++++++------- tools/io_uring/queue.c | 36 ++++++++------------ tools/io_uring/setup.c | 10 ++++-- tools/io_uring/syscall.c | 48 +++++++++++++++++---------- 5 files changed, 119 insertions(+), 62 deletions(-) diff --git a/tools/io_uring/io_uring-cp.c b/tools/io_uring/io_uring-cp.c index 633f65bb43a7..81461813ec62 100644 --- a/tools/io_uring/io_uring-cp.c +++ b/tools/io_uring/io_uring-cp.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -85,11 +86,16 @@ static int queue_read(struct io_uring *ring, off_t size, off_t offset) struct io_uring_sqe *sqe; struct io_data *data; - sqe = io_uring_get_sqe(ring); - if (!sqe) + data = malloc(size + sizeof(*data)); + if (!data) return 1; - data = malloc(size + sizeof(*data)); + sqe = io_uring_get_sqe(ring); + if (!sqe) { + free(data); + return 1; + } + data->read = 1; data->offset = data->first_offset = offset; @@ -166,22 +172,23 @@ static int copy_file(struct io_uring *ring, off_t insize) struct io_data *data; if (!got_comp) { - ret = io_uring_wait_completion(ring, &cqe); + ret = io_uring_wait_cqe(ring, &cqe); got_comp = 1; } else - ret = io_uring_get_completion(ring, &cqe); + ret = io_uring_peek_cqe(ring, &cqe); if (ret < 0) { - fprintf(stderr, "io_uring_get_completion: %s\n", + fprintf(stderr, "io_uring_peek_cqe: %s\n", strerror(-ret)); return 1; } if (!cqe) break; - data = (struct io_data *) (uintptr_t) cqe->user_data; + data = io_uring_cqe_get_data(cqe); if (cqe->res < 0) { if (cqe->res == -EAGAIN) { queue_prepped(ring, data); + io_uring_cqe_seen(ring, cqe); continue; } fprintf(stderr, "cqe failed: %s\n", @@ -193,6 +200,7 @@ static int copy_file(struct io_uring *ring, off_t insize) data->iov.iov_len -= cqe->res; data->offset += cqe->res; queue_prepped(ring, data); + io_uring_cqe_seen(ring, cqe); continue; } @@ -209,6 +217,7 @@ static int copy_file(struct io_uring *ring, off_t insize) free(data); writes--; } + io_uring_cqe_seen(ring, cqe); } } diff --git a/tools/io_uring/liburing.h b/tools/io_uring/liburing.h index cab0f50257ba..5f305c86b892 100644 --- a/tools/io_uring/liburing.h +++ b/tools/io_uring/liburing.h @@ -1,10 +1,16 @@ #ifndef LIB_URING_H #define LIB_URING_H +#ifdef __cplusplus +extern "C" { +#endif + #include #include #include #include "../../include/uapi/linux/io_uring.h" +#include +#include "barrier.h" /* * Library interface to io_uring @@ -46,7 +52,7 @@ struct io_uring { * System calls */ extern int io_uring_setup(unsigned entries, struct io_uring_params *p); -extern int io_uring_enter(unsigned fd, unsigned to_submit, +extern int io_uring_enter(int fd, unsigned to_submit, unsigned min_complete, unsigned flags, sigset_t *sig); extern int io_uring_register(int fd, unsigned int opcode, void *arg, unsigned int nr_args); @@ -59,13 +65,32 @@ extern int io_uring_queue_init(unsigned entries, struct io_uring *ring, extern int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring); extern void io_uring_queue_exit(struct io_uring *ring); -extern int io_uring_get_completion(struct io_uring *ring, +extern int io_uring_peek_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr); -extern int io_uring_wait_completion(struct io_uring *ring, +extern int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr); extern int 
io_uring_submit(struct io_uring *ring); extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring); +/* + * Must be called after io_uring_{peek,wait}_cqe() after the cqe has + * been processed by the application. + */ +static inline void io_uring_cqe_seen(struct io_uring *ring, + struct io_uring_cqe *cqe) +{ + if (cqe) { + struct io_uring_cq *cq = &ring->cq; + + (*cq->khead)++; + /* + * Ensure that the kernel sees our new head, the kernel has + * the matching read barrier. + */ + write_barrier(); + } +} + /* * Command prep helpers */ @@ -74,8 +99,14 @@ static inline void io_uring_sqe_set_data(struct io_uring_sqe *sqe, void *data) sqe->user_data = (unsigned long) data; } +static inline void *io_uring_cqe_get_data(struct io_uring_cqe *cqe) +{ + return (void *) (uintptr_t) cqe->user_data; +} + static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, - void *addr, unsigned len, off_t offset) + const void *addr, unsigned len, + off_t offset) { memset(sqe, 0, sizeof(*sqe)); sqe->opcode = op; @@ -86,8 +117,8 @@ static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, } static inline void io_uring_prep_readv(struct io_uring_sqe *sqe, int fd, - struct iovec *iovecs, unsigned nr_vecs, - off_t offset) + const struct iovec *iovecs, + unsigned nr_vecs, off_t offset) { io_uring_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset); } @@ -100,14 +131,14 @@ static inline void io_uring_prep_read_fixed(struct io_uring_sqe *sqe, int fd, } static inline void io_uring_prep_writev(struct io_uring_sqe *sqe, int fd, - struct iovec *iovecs, unsigned nr_vecs, - off_t offset) + const struct iovec *iovecs, + unsigned nr_vecs, off_t offset) { io_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset); } static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd, - void *buf, unsigned nbytes, + const void *buf, unsigned nbytes, off_t offset) { io_uring_prep_rw(IORING_OP_WRITE_FIXED, sqe, fd, buf, nbytes, offset); @@ -131,13 +162,22 @@ static inline void io_uring_prep_poll_remove(struct io_uring_sqe *sqe, } static inline void io_uring_prep_fsync(struct io_uring_sqe *sqe, int fd, - int datasync) + unsigned fsync_flags) { memset(sqe, 0, sizeof(*sqe)); sqe->opcode = IORING_OP_FSYNC; sqe->fd = fd; - if (datasync) - sqe->fsync_flags = IORING_FSYNC_DATASYNC; + sqe->fsync_flags = fsync_flags; } +static inline void io_uring_prep_nop(struct io_uring_sqe *sqe) +{ + memset(sqe, 0, sizeof(*sqe)); + sqe->opcode = IORING_OP_NOP; +} + +#ifdef __cplusplus +} +#endif + #endif diff --git a/tools/io_uring/queue.c b/tools/io_uring/queue.c index 88505e873ad9..321819c132c7 100644 --- a/tools/io_uring/queue.c +++ b/tools/io_uring/queue.c @@ -8,8 +8,8 @@ #include "liburing.h" #include "barrier.h" -static int __io_uring_get_completion(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr, int wait) +static int __io_uring_get_cqe(struct io_uring *ring, + struct io_uring_cqe **cqe_ptr, int wait) { struct io_uring_cq *cq = &ring->cq; const unsigned mask = *cq->kring_mask; @@ -39,34 +39,25 @@ static int __io_uring_get_completion(struct io_uring *ring, return -errno; } while (1); - if (*cqe_ptr) { - *cq->khead = head + 1; - /* - * Ensure that the kernel sees our new head, the kernel has - * the matching read barrier. - */ - write_barrier(); - } - return 0; } /* - * Return an IO completion, if one is readily available + * Return an IO completion, if one is readily available. Returns 0 with + * cqe_ptr filled in on success, -errno on failure. 
*/ -int io_uring_get_completion(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr) +int io_uring_peek_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) { - return __io_uring_get_completion(ring, cqe_ptr, 0); + return __io_uring_get_cqe(ring, cqe_ptr, 0); } /* - * Return an IO completion, waiting for it if necessary + * Return an IO completion, waiting for it if necessary. Returns 0 with + * cqe_ptr filled in on success, -errno on failure. */ -int io_uring_wait_completion(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr) +int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) { - return __io_uring_get_completion(ring, cqe_ptr, 1); + return __io_uring_get_cqe(ring, cqe_ptr, 1); } /* @@ -78,7 +69,7 @@ int io_uring_submit(struct io_uring *ring) { struct io_uring_sq *sq = &ring->sq; const unsigned mask = *sq->kring_mask; - unsigned ktail, ktail_next, submitted; + unsigned ktail, ktail_next, submitted, to_submit; int ret; /* @@ -100,7 +91,8 @@ int io_uring_submit(struct io_uring *ring) */ submitted = 0; ktail = ktail_next = *sq->ktail; - while (sq->sqe_head < sq->sqe_tail) { + to_submit = sq->sqe_tail - sq->sqe_head; + while (to_submit--) { ktail_next++; read_barrier(); @@ -136,7 +128,7 @@ int io_uring_submit(struct io_uring *ring) if (ret < 0) return -errno; - return 0; + return ret; } /* diff --git a/tools/io_uring/setup.c b/tools/io_uring/setup.c index 4da19a77132c..0b50fcd78520 100644 --- a/tools/io_uring/setup.c +++ b/tools/io_uring/setup.c @@ -27,7 +27,7 @@ static int io_uring_mmap(int fd, struct io_uring_params *p, sq->kdropped = ptr + p->sq_off.dropped; sq->array = ptr + p->sq_off.array; - size = p->sq_entries * sizeof(struct io_uring_sqe), + size = p->sq_entries * sizeof(struct io_uring_sqe); sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES); @@ -79,7 +79,7 @@ int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags) { struct io_uring_params p; - int fd; + int fd, ret; memset(&p, 0, sizeof(p)); p.flags = flags; @@ -88,7 +88,11 @@ int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags) if (fd < 0) return fd; - return io_uring_queue_mmap(fd, &p, ring); + ret = io_uring_queue_mmap(fd, &p, ring); + if (ret) + close(fd); + + return ret; } void io_uring_queue_exit(struct io_uring *ring) diff --git a/tools/io_uring/syscall.c b/tools/io_uring/syscall.c index 6b835e5c6a5b..b22e0aa54e9d 100644 --- a/tools/io_uring/syscall.c +++ b/tools/io_uring/syscall.c @@ -7,34 +7,46 @@ #include #include "liburing.h" -#if defined(__x86_64) || defined(__i386__) -#ifndef __NR_sys_io_uring_setup -#define __NR_sys_io_uring_setup 425 -#endif -#ifndef __NR_sys_io_uring_enter -#define __NR_sys_io_uring_enter 426 -#endif -#ifndef __NR_sys_io_uring_register -#define __NR_sys_io_uring_register 427 -#endif -#else -#error "Arch not supported yet" +#ifdef __alpha__ +/* + * alpha is the only exception, all other architectures + * have common numbers for new system calls. 
+ */ +# ifndef __NR_io_uring_setup +# define __NR_io_uring_setup 535 +# endif +# ifndef __NR_io_uring_enter +# define __NR_io_uring_enter 536 +# endif +# ifndef __NR_io_uring_register +# define __NR_io_uring_register 537 +# endif +#else /* !__alpha__ */ +# ifndef __NR_io_uring_setup +# define __NR_io_uring_setup 425 +# endif +# ifndef __NR_io_uring_enter +# define __NR_io_uring_enter 426 +# endif +# ifndef __NR_io_uring_register +# define __NR_io_uring_register 427 +# endif #endif int io_uring_register(int fd, unsigned int opcode, void *arg, unsigned int nr_args) { - return syscall(__NR_sys_io_uring_register, fd, opcode, arg, nr_args); + return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args); } -int io_uring_setup(unsigned entries, struct io_uring_params *p) +int io_uring_setup(unsigned int entries, struct io_uring_params *p) { - return syscall(__NR_sys_io_uring_setup, entries, p); + return syscall(__NR_io_uring_setup, entries, p); } -int io_uring_enter(unsigned fd, unsigned to_submit, unsigned min_complete, - unsigned flags, sigset_t *sig) +int io_uring_enter(int fd, unsigned int to_submit, unsigned int min_complete, + unsigned int flags, sigset_t *sig) { - return syscall(__NR_sys_io_uring_enter, fd, to_submit, min_complete, + return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags, sig, _NSIG / 8); }