From b6dccf7fae4331b0ea41cf087e3f02d5db9161dc Mon Sep 17 00:00:00 2001 From: Arnav Dawn Date: Wed, 12 Jul 2017 16:10:40 +0530 Subject: [PATCH 01/74] nvme: add support for FW activation without reset This patch adds support for handling FW activation without reset. On completion of the FW-activation-starting AER, all queues are paused until CSTS.PP is cleared or the wait times out (exceeding the maximum time for FW activation, MTFA). If the device fails to clear CSTS.PP within MTFA, the driver issues a controller reset. Signed-off-by: Arnav Dawn Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 75 ++++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 2 ++ include/linux/nvme.h | 9 +++++ 3 files changed, 86 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c596dd3c58b1..1a1dedbac39b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -76,6 +76,11 @@ static DEFINE_SPINLOCK(dev_list_lock); static struct class *nvme_class; +static __le32 nvme_get_log_dw10(u8 lid, size_t size) +{ + return cpu_to_le32((((size / 4) - 1) << 16) | lid); +} + int nvme_reset_ctrl(struct nvme_ctrl *ctrl) { if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) @@ -2534,6 +2539,71 @@ static void nvme_async_event_work(struct work_struct *work) spin_unlock_irq(&ctrl->lock); } +static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) +{ + + u32 csts; + + if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) + return false; + + if (csts == ~0) + return false; + + return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP)); +} + +static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) +{ + struct nvme_command c = { }; + struct nvme_fw_slot_info_log *log; + + log = kmalloc(sizeof(*log), GFP_KERNEL); + if (!log) + return; + + c.common.opcode = nvme_admin_get_log_page; + c.common.nsid = cpu_to_le32(0xffffffff); + c.common.cdw10[0] = nvme_get_log_dw10(NVME_LOG_FW_SLOT, sizeof(*log)); + + if (!nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log))) + dev_warn(ctrl->device, + "Get FW SLOT INFO log error\n"); + kfree(log); +} + +static void nvme_fw_act_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(work, + struct nvme_ctrl, fw_act_work); + unsigned long fw_act_timeout; + + if (ctrl->mtfa) + fw_act_timeout = jiffies + + msecs_to_jiffies(ctrl->mtfa * 100); + else + fw_act_timeout = jiffies + + msecs_to_jiffies(admin_timeout * 1000); + + nvme_stop_queues(ctrl); + while (nvme_ctrl_pp_status(ctrl)) { + if (time_after(jiffies, fw_act_timeout)) { + dev_warn(ctrl->device, + "Fw activation timeout, reset controller\n"); + nvme_reset_ctrl(ctrl); + break; + } + msleep(100); + } + + if (ctrl->state != NVME_CTRL_LIVE) + return; + + nvme_start_queues(ctrl); + /* read FW slot information to clear the AER */ + nvme_get_fw_slot_info(ctrl); +} + void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, union nvme_result *res) { @@ -2560,6 +2630,9 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, dev_info(ctrl->device, "rescanning\n"); nvme_queue_scan(ctrl); break; + case NVME_AER_NOTICE_FW_ACT_STARTING: + schedule_work(&ctrl->fw_act_work); + break; default: dev_warn(ctrl->device, "async event result %08x\n", result); } @@ -2607,6 +2680,7 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl) nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); flush_work(&ctrl->scan_work); + cancel_work_sync(&ctrl->fw_act_work); } EXPORT_SYMBOL_GPL(nvme_stop_ctrl); @@ -2670,6 +2744,7 @@
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->quirks = quirks; INIT_WORK(&ctrl->scan_work, nvme_scan_work); INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); + INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); ret = nvme_set_instance(ctrl); if (ret) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 8f2a168ddc01..b40b9af4564f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -142,6 +142,7 @@ struct nvme_ctrl { u16 cntlid; u32 ctrl_config; + u16 mtfa; u32 queue_count; u64 cap; @@ -167,6 +168,7 @@ struct nvme_ctrl { struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; + struct work_struct fw_act_work; /* Power saving configuration */ u64 ps_max_latency_us; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 25d8225dbd04..5d0f2f4fc3b8 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -146,6 +146,7 @@ enum { NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, NVME_CSTS_NSSRO = 1 << 4, + NVME_CSTS_PP = 1 << 5, NVME_CSTS_SHST_NORMAL = 0 << 2, NVME_CSTS_SHST_OCCUR = 1 << 2, NVME_CSTS_SHST_CMPLT = 2 << 2, @@ -376,6 +377,13 @@ struct nvme_smart_log { __u8 rsvd216[296]; }; +struct nvme_fw_slot_info_log { + __u8 afi; + __u8 rsvd1[7]; + __le64 frs[7]; + __u8 rsvd64[448]; +}; + enum { NVME_SMART_CRIT_SPARE = 1 << 0, NVME_SMART_CRIT_TEMPERATURE = 1 << 1, @@ -386,6 +394,7 @@ enum { enum { NVME_AER_NOTICE_NS_CHANGED = 0x0002, + NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102, }; struct nvme_lba_range_type { From 62346eaeb2f1a0524b35eaa2f479596f40491165 Mon Sep 17 00:00:00 2001 From: Arnav Dawn Date: Wed, 12 Jul 2017 16:11:53 +0530 Subject: [PATCH 02/74] nvme: define NVME_NSID_ALL Define the constant "0xffffffff" (used as nsid for all namespaces) as NVME_NSID_ALL. 
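For illustration, the cdw10 encoding built by nvme_get_log_dw10() in the first patch packs the zero's-based dword count (NUMD) into bits 31:16 and the log page identifier into bits 7:0; the firmware slot log is addressed with NSID 0xffffffff (the value this patch names NVME_NSID_ALL) because it is a controller-wide log page. A minimal self-contained sketch of the same arithmetic (user-space C; the names here are illustrative and not part of the patches):

#include <stdint.h>
#include <stdio.h>

/* Mirrors the driver helper: NUMD is the transfer length in dwords, zero's based. */
static uint32_t get_log_dw10(uint8_t lid, size_t size)
{
	return (uint32_t)((((size / 4) - 1) << 16) | lid);
}

int main(void)
{
	/* Firmware slot log (LID 0x03) is 512 bytes -> NUMD = 127 (0x7f). */
	printf("cdw10 = 0x%08x\n", (unsigned)get_log_dw10(0x03, 512));
	return 0;
}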
Signed-off-by: Arnav Dawn Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 6 +++--- include/linux/nvme.h | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1a1dedbac39b..a6afeedc009a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -312,7 +312,7 @@ static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) memset(&c, 0, sizeof(c)); c.directive.opcode = nvme_admin_directive_send; - c.directive.nsid = cpu_to_le32(0xffffffff); + c.directive.nsid = cpu_to_le32(NVME_NSID_ALL); c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE; c.directive.dtype = NVME_DIR_IDENTIFY; c.directive.tdtype = NVME_DIR_STREAMS; @@ -362,7 +362,7 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl) if (ret) return ret; - ret = nvme_get_stream_params(ctrl, &s, 0xffffffff); + ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL); if (ret) return ret; @@ -2563,7 +2563,7 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) return; c.common.opcode = nvme_admin_get_log_page; - c.common.nsid = cpu_to_le32(0xffffffff); + c.common.nsid = cpu_to_le32(NVME_NSID_ALL); c.common.cdw10[0] = nvme_get_log_dw10(NVME_LOG_FW_SLOT, sizeof(*log)); if (!nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log))) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5d0f2f4fc3b8..1d79046bf9d4 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -32,6 +32,8 @@ #define NVME_RDMA_IP_PORT 4420 +#define NVME_NSID_ALL 0xffffffff + enum nvme_subsys_type { NVME_NQN_DISC = 1, /* Discovery type target subsystem */ NVME_NQN_NVME = 2, /* NVME type target subsystem */ From dbf86b39005d26b21c52a23720e15fb850d71cdc Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 16 Aug 2017 09:51:29 +0200 Subject: [PATCH 03/74] nvme: add support for NVMe 1.3 Timestamp Feature NVME's Timestamp feature allows controllers to be aware of the epoch time in milliseconds. This patch adds the set features hook for various transports through the identify path, so that resets and resumes can update the controller as necessary. 
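For context, the Timestamp feature carries the number of milliseconds since the Unix epoch in the first 6 bytes (48 bits) of an 8-byte little-endian payload; the driver change below builds that value with ktime_to_ms(ktime_get_real()). A rough user-space equivalent of the conversion, for illustration only:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec now;
	uint64_t ms;

	clock_gettime(CLOCK_REALTIME, &now);
	ms = (uint64_t)now.tv_sec * 1000 + (uint64_t)now.tv_nsec / 1000000;

	/* Bytes 0-5 of the feature payload hold the timestamp; the remaining
	 * bytes are attributes/reserved, so mask to 48 bits for display. */
	printf("timestamp = %" PRIu64 " ms since epoch\n", ms & ((1ULL << 48) - 1));
	return 0;
}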
Signed-off-by: Jon Derrick [hch: rebased on top of nvme-4.13 error handling changes, changed nvme_configure_timestamp to return the status] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 21 +++++++++++++++++++++ include/linux/nvme.h | 2 ++ 2 files changed, 23 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a6afeedc009a..0b979e16655e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1507,6 +1507,23 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, blk_queue_write_cache(q, vwc, vwc); } +static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) +{ + __le64 ts; + int ret; + + if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP)) + return 0; + + ts = cpu_to_le64(ktime_to_ms(ktime_get_real())); + ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts), + NULL); + if (ret) + dev_warn_once(ctrl->device, + "could not set timestamp (%d)\n", ret); + return ret; +} + static int nvme_configure_apst(struct nvme_ctrl *ctrl) { /* @@ -1861,6 +1878,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ret = nvme_configure_apst(ctrl); if (ret < 0) return ret; + + ret = nvme_configure_timestamp(ctrl); + if (ret < 0) + return ret; ret = nvme_configure_directives(ctrl); if (ret < 0) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1d79046bf9d4..a12b47073273 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -254,6 +254,7 @@ enum { NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, NVME_CTRL_ONCS_DSM = 1 << 2, NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, + NVME_CTRL_ONCS_TIMESTAMP = 1 << 6, NVME_CTRL_VWC_PRESENT = 1 << 0, NVME_CTRL_OACS_SEC_SUPP = 1 << 0, NVME_CTRL_OACS_DIRECTIVES = 1 << 5, @@ -688,6 +689,7 @@ enum { NVME_FEAT_ASYNC_EVENT = 0x0b, NVME_FEAT_AUTO_PST = 0x0c, NVME_FEAT_HOST_MEM_BUF = 0x0d, + NVME_FEAT_TIMESTAMP = 0x0e, NVME_FEAT_KATO = 0x0f, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, From 1645d5036f9201bd0925aff099fa92fbe7afaff4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Jul 2017 19:46:36 +0200 Subject: [PATCH 04/74] nvmet: use NVME_NSID_ALL Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Max Gurtovoy --- drivers/nvme/target/admin-cmd.c | 2 +- drivers/nvme/target/configfs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index a53bb6635b83..dffd8064163b 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -100,7 +100,7 @@ static u16 nvmet_get_smart_log(struct nvmet_req *req, u16 status; WARN_ON(req == NULL || slog == NULL); - if (req->cmd->get_log_page.nsid == cpu_to_le32(0xFFFFFFFF)) + if (req->cmd->get_log_page.nsid == cpu_to_le32(NVME_NSID_ALL)) status = nvmet_get_smart_log_all(req, slog); else status = nvmet_get_smart_log_nsid(req, slog); diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 0a0067e771f5..b6aeb1d70951 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -444,7 +444,7 @@ static struct config_group *nvmet_ns_make(struct config_group *group, goto out; ret = -EINVAL; - if (nsid == 0 || nsid == 0xffffffff) + if (nsid == 0 || nsid == NVME_NSID_ALL) goto out; ret = -ENOMEM; From 130c24b5be80512be973e3a614abda75f5896b42 Mon Sep 17 00:00:00 2001 From: Guan Junxiong Date: Fri, 4 Aug 2017 17:27:47 +0800 Subject: [PATCH 05/74] nvmet: fix the return error code of target if host is not allowed nvmf target shall return NVME_SC_CONNECT_INVALID_HOST 
instead of the general code INVALID_PARAM when the given host nqn is not allowed to connect. Refer to section 2.2.1 of the NVMe over Fabrics spec. Signed-off-by: Guan Junxiong Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index f4b02bb4a1a8..ea7eb00dcb87 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -749,6 +749,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, hostnqn, subsysnqn); req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn); up_read(&nvmet_config_sem); + status = NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR; goto out_put_subsystem; } up_read(&nvmet_config_sem); From 1c35be8c8a13fa714c1c783f14653ca9d3e3721f Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 3 Aug 2017 11:28:48 +0200 Subject: [PATCH 06/74] nvmet-fcloop: remove ALL_OPTS define ALL_OPTS isn't used anywhere, remove it. Signed-off-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 1bb9d5b311b1..1cb9847ec261 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -193,9 +193,6 @@ fcloop_parse_nm_options(struct device *dev, u64 *nname, u64 *pname, #define TGTPORT_OPTS (NVMF_OPT_WWNN | NVMF_OPT_WWPN) -#define ALL_OPTS (NVMF_OPT_WWNN | NVMF_OPT_WWPN | NVMF_OPT_ROLES | \ - NVMF_OPT_FCADDR | NVMF_OPT_LPWWNN | NVMF_OPT_LPWWPN) - static DEFINE_SPINLOCK(fcloop_lock); static LIST_HEAD(fcloop_lports); From 4897ad4e0812c358c8bdd7aa25fdc2201ae2de0b Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 3 Aug 2017 11:28:47 +0200 Subject: [PATCH 07/74] nvme-rdma: remove NVME_RDMA_MAX_SEGMENT_SIZE NVME_RDMA_MAX_SEGMENT_SIZE is not used anywhere, zap it. Signed-off-by: Johannes Thumshirn Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 9ff0eb3a625e..ffd73890d077 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -36,8 +36,6 @@ #define NVME_RDMA_CONNECT_TIMEOUT_MS 3000 /* 3 second */ -#define NVME_RDMA_MAX_SEGMENT_SIZE 0xffffff /* 24-bit SGL field */ - #define NVME_RDMA_MAX_SEGMENTS 256 #define NVME_RDMA_MAX_INLINE_SEGMENTS 1 From 90af35123d3be8b011a8a3f69ce46fd431c55b25 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:28 +0300 Subject: [PATCH 08/74] nvme-rdma: move nvme_rdma_configure_admin_queue code location We will call it from other places so avoid having to forward declare it. Also move it next to nvme_rdma_destroy_admin_queue. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 185 ++++++++++++++++++++------------------- 1 file changed, 94 insertions(+), 91 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ffd73890d077..3cecb087ee3a 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -149,6 +149,9 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static const struct blk_mq_ops nvme_rdma_mq_ops; +static const struct blk_mq_ops nvme_rdma_admin_mq_ops; + /* XXX: really should move to a generic header sooner or later..
*/ static inline void put_unaligned_le24(u32 val, u8 *p) { @@ -653,6 +656,97 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl) nvme_rdma_dev_put(ctrl->device); } +static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) +{ + int error; + + error = nvme_rdma_init_queue(ctrl, 0, NVME_AQ_DEPTH); + if (error) + return error; + + ctrl->device = ctrl->queues[0].device; + + /* + * We need a reference on the device as long as the tag_set is alive, + * as the MRs in the request structures need a valid ib_device. + */ + error = -EINVAL; + if (!nvme_rdma_dev_get(ctrl->device)) + goto out_free_queue; + + ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS, + ctrl->device->dev->attrs.max_fast_reg_page_list_len); + + memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); + ctrl->admin_tag_set.ops = &nvme_rdma_admin_mq_ops; + ctrl->admin_tag_set.queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; + ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ + ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; + ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_rdma_request) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + ctrl->admin_tag_set.driver_data = ctrl; + ctrl->admin_tag_set.nr_hw_queues = 1; + ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + + error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); + if (error) + goto out_put_dev; + + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.admin_q)) { + error = PTR_ERR(ctrl->ctrl.admin_q); + goto out_free_tagset; + } + + error = nvmf_connect_admin_queue(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); + + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, + &ctrl->ctrl.cap); + if (error) { + dev_err(ctrl->ctrl.device, + "prop_get NVME_REG_CAP failed\n"); + goto out_cleanup_queue; + } + + ctrl->ctrl.sqsize = + min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); + + error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + if (error) + goto out_cleanup_queue; + + ctrl->ctrl.max_hw_sectors = + (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9); + + error = nvme_init_identify(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev, + &ctrl->async_event_sqe, sizeof(struct nvme_command), + DMA_TO_DEVICE); + if (error) + goto out_cleanup_queue; + + return 0; + +out_cleanup_queue: + blk_cleanup_queue(ctrl->ctrl.admin_q); +out_free_tagset: + /* disconnect and drain the queue before freeing the tagset */ + nvme_rdma_stop_queue(&ctrl->queues[0]); + blk_mq_free_tag_set(&ctrl->admin_tag_set); +out_put_dev: + nvme_rdma_dev_put(ctrl->device); +out_free_queue: + nvme_rdma_free_queue(&ctrl->queues[0]); + return error; +} + static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); @@ -1517,97 +1611,6 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { .timeout = nvme_rdma_timeout, }; -static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) -{ - int error; - - error = nvme_rdma_init_queue(ctrl, 0, NVME_AQ_DEPTH); - if (error) - return error; - - ctrl->device = ctrl->queues[0].device; - - /* - * We need a reference on the device as long as the tag_set is alive, - * as the MRs in the request structures need a valid ib_device. 
- */ - error = -EINVAL; - if (!nvme_rdma_dev_get(ctrl->device)) - goto out_free_queue; - - ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS, - ctrl->device->dev->attrs.max_fast_reg_page_list_len); - - memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); - ctrl->admin_tag_set.ops = &nvme_rdma_admin_mq_ops; - ctrl->admin_tag_set.queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; - ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ - ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; - ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_rdma_request) + - SG_CHUNK_SIZE * sizeof(struct scatterlist); - ctrl->admin_tag_set.driver_data = ctrl; - ctrl->admin_tag_set.nr_hw_queues = 1; - ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; - - error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); - if (error) - goto out_put_dev; - - ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); - if (IS_ERR(ctrl->ctrl.admin_q)) { - error = PTR_ERR(ctrl->ctrl.admin_q); - goto out_free_tagset; - } - - error = nvmf_connect_admin_queue(&ctrl->ctrl); - if (error) - goto out_cleanup_queue; - - set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); - - error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, - &ctrl->ctrl.cap); - if (error) { - dev_err(ctrl->ctrl.device, - "prop_get NVME_REG_CAP failed\n"); - goto out_cleanup_queue; - } - - ctrl->ctrl.sqsize = - min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); - - error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); - if (error) - goto out_cleanup_queue; - - ctrl->ctrl.max_hw_sectors = - (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9); - - error = nvme_init_identify(&ctrl->ctrl); - if (error) - goto out_cleanup_queue; - - error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev, - &ctrl->async_event_sqe, sizeof(struct nvme_command), - DMA_TO_DEVICE); - if (error) - goto out_cleanup_queue; - - return 0; - -out_cleanup_queue: - blk_cleanup_queue(ctrl->ctrl.admin_q); -out_free_tagset: - /* disconnect and drain the queue before freeing the tagset */ - nvme_rdma_stop_queue(&ctrl->queues[0]); - blk_mq_free_tag_set(&ctrl->admin_tag_set); -out_put_dev: - nvme_rdma_dev_put(ctrl->device); -out_free_queue: - nvme_rdma_free_queue(&ctrl->queues[0]); - return error; -} - static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) { cancel_work_sync(&ctrl->err_work); From 34b6c2315eb66e6411261aa440f6e3c4cded3506 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:29 +0300 Subject: [PATCH 09/74] nvme: Add admin_tagset pointer to nvme_ctrl Will be used when we centralize control flows. 
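To illustrate why the core wants this pointer (a hypothetical sketch, not part of this series): once every transport publishes its admin tag set here, generic teardown code could walk outstanding admin commands without knowing the transport type, roughly:

/* Hypothetical core-level helper; it relies only on fields nvme_ctrl
 * already has plus the admin_tagset pointer this patch adds. */
static void nvme_cancel_admin_requests(struct nvme_ctrl *ctrl)
{
	blk_mq_quiesce_queue(ctrl->admin_q);
	blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl);
	blk_mq_unquiesce_queue(ctrl->admin_q);
}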
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 1 + drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 1 + drivers/nvme/host/rdma.c | 1 + drivers/nvme/target/loop.c | 1 + 5 files changed, 5 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 1438be649866..1912df412692 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2731,6 +2731,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); if (ret) goto out_free_queues; + ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); if (IS_ERR(ctrl->ctrl.admin_q)) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b40b9af4564f..2c8a02be46fd 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -125,6 +125,7 @@ struct nvme_ctrl { struct kref kref; int instance; struct blk_mq_tag_set *tagset; + struct blk_mq_tag_set *admin_tagset; struct list_head namespaces; struct mutex namespaces_mutex; struct device *device; /* char device */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 925467b31a33..e6283745ecd2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1376,6 +1376,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) if (blk_mq_alloc_tag_set(&dev->admin_tagset)) return -ENOMEM; + dev->ctrl.admin_tagset = &dev->admin_tagset; dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset); if (IS_ERR(dev->ctrl.admin_q)) { diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 3cecb087ee3a..6ef56500fc9c 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -691,6 +691,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); if (error) goto out_put_dev; + ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); if (IS_ERR(ctrl->ctrl.admin_q)) { diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 717ed7ddb2f6..92628c432926 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -375,6 +375,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); if (error) goto out_free_sq; + ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); if (IS_ERR(ctrl->ctrl.admin_q)) { From b28a308ee7774341312de28405e53a4a30cb7d31 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:30 +0300 Subject: [PATCH 10/74] nvme-rdma: move tagset allocation to a dedicated routine We always pair tagset allocation with rdma device reference and it shares some code, centralize it with an argument if its an admin or IO tagset. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 131 ++++++++++++++++++++++----------------- 1 file changed, 75 insertions(+), 56 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 6ef56500fc9c..3f580c198100 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -646,14 +646,79 @@ static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) return ret; } +static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl, bool admin) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + struct blk_mq_tag_set *set = admin ? 
+ &ctrl->admin_tag_set : &ctrl->tag_set; + + blk_mq_free_tag_set(set); + nvme_rdma_dev_put(ctrl->device); +} + +static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, + bool admin) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + struct blk_mq_tag_set *set; + int ret; + + if (admin) { + set = &ctrl->admin_tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_rdma_admin_mq_ops; + set->queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; + set->reserved_tags = 2; /* connect + keep-alive */ + set->numa_node = NUMA_NO_NODE; + set->cmd_size = sizeof(struct nvme_rdma_request) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + set->driver_data = ctrl; + set->nr_hw_queues = 1; + set->timeout = ADMIN_TIMEOUT; + } else { + set = &ctrl->tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_rdma_mq_ops; + set->queue_depth = nctrl->opts->queue_size; + set->reserved_tags = 1; /* fabric connect */ + set->numa_node = NUMA_NO_NODE; + set->flags = BLK_MQ_F_SHOULD_MERGE; + set->cmd_size = sizeof(struct nvme_rdma_request) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + set->driver_data = ctrl; + set->nr_hw_queues = nctrl->queue_count - 1; + set->timeout = NVME_IO_TIMEOUT; + } + + ret = blk_mq_alloc_tag_set(set); + if (ret) + goto out; + + /* + * We need a reference on the device as long as the tag_set is alive, + * as the MRs in the request structures need a valid ib_device. + */ + ret = nvme_rdma_dev_get(ctrl->device); + if (!ret) { + ret = -EINVAL; + goto out_free_tagset; + } + + return set; + +out_free_tagset: + blk_mq_free_tag_set(set); +out: + return ERR_PTR(ret); +} + static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl) { nvme_rdma_free_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); nvme_rdma_stop_and_free_queue(&ctrl->queues[0]); blk_cleanup_queue(ctrl->ctrl.admin_q); - blk_mq_free_tag_set(&ctrl->admin_tag_set); - nvme_rdma_dev_put(ctrl->device); + nvme_rdma_free_tagset(&ctrl->ctrl, true); } static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) @@ -666,32 +731,12 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) ctrl->device = ctrl->queues[0].device; - /* - * We need a reference on the device as long as the tag_set is alive, - * as the MRs in the request structures need a valid ib_device. 
- */ - error = -EINVAL; - if (!nvme_rdma_dev_get(ctrl->device)) - goto out_free_queue; - ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS, ctrl->device->dev->attrs.max_fast_reg_page_list_len); - memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); - ctrl->admin_tag_set.ops = &nvme_rdma_admin_mq_ops; - ctrl->admin_tag_set.queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; - ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ - ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; - ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_rdma_request) + - SG_CHUNK_SIZE * sizeof(struct scatterlist); - ctrl->admin_tag_set.driver_data = ctrl; - ctrl->admin_tag_set.nr_hw_queues = 1; - ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; - - error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); - if (error) - goto out_put_dev; - ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; + ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true); + if (IS_ERR(ctrl->ctrl.admin_tagset)) + goto out_free_queue; ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); if (IS_ERR(ctrl->ctrl.admin_q)) { @@ -740,9 +785,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) out_free_tagset: /* disconnect and drain the queue before freeing the tagset */ nvme_rdma_stop_queue(&ctrl->queues[0]); - blk_mq_free_tag_set(&ctrl->admin_tag_set); -out_put_dev: - nvme_rdma_dev_put(ctrl->device); + nvme_rdma_free_tagset(&ctrl->ctrl, true); out_free_queue: nvme_rdma_free_queue(&ctrl->queues[0]); return error; @@ -1644,8 +1687,7 @@ static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_uninit_ctrl(&ctrl->ctrl); if (ctrl->ctrl.tagset) { blk_cleanup_queue(ctrl->ctrl.connect_q); - blk_mq_free_tag_set(&ctrl->tag_set); - nvme_rdma_dev_put(ctrl->device); + nvme_rdma_free_tagset(&ctrl->ctrl, false); } nvme_put_ctrl(&ctrl->ctrl); @@ -1765,31 +1807,10 @@ static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl) if (ret) return ret; - /* - * We need a reference on the device as long as the tag_set is alive, - * as the MRs in the request structures need a valid ib_device. 
- */ - ret = -EINVAL; - if (!nvme_rdma_dev_get(ctrl->device)) + ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true); + if (IS_ERR(ctrl->ctrl.tagset)) goto out_free_io_queues; - memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); - ctrl->tag_set.ops = &nvme_rdma_mq_ops; - ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; - ctrl->tag_set.reserved_tags = 1; /* fabric connect */ - ctrl->tag_set.numa_node = NUMA_NO_NODE; - ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; - ctrl->tag_set.cmd_size = sizeof(struct nvme_rdma_request) + - SG_CHUNK_SIZE * sizeof(struct scatterlist); - ctrl->tag_set.driver_data = ctrl; - ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; - ctrl->tag_set.timeout = NVME_IO_TIMEOUT; - - ret = blk_mq_alloc_tag_set(&ctrl->tag_set); - if (ret) - goto out_put_dev; - ctrl->ctrl.tagset = &ctrl->tag_set; - ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); if (IS_ERR(ctrl->ctrl.connect_q)) { ret = PTR_ERR(ctrl->ctrl.connect_q); @@ -1805,9 +1826,7 @@ static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl) out_cleanup_connect_q: blk_cleanup_queue(ctrl->ctrl.connect_q); out_free_tag_set: - blk_mq_free_tag_set(&ctrl->tag_set); -out_put_dev: - nvme_rdma_dev_put(ctrl->device); + nvme_rdma_free_tagset(&ctrl->ctrl, false); out_free_io_queues: nvme_rdma_free_io_queues(ctrl); return ret; From 18398af2dcca3044c74fd2697f1d5b624b7848fe Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:31 +0300 Subject: [PATCH 11/74] nvme-rdma: disable the controller on resets Mimic the pci driver as a controller disable might be more lightweight than a shutdown. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 3f580c198100..b1e67cd41c81 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1655,7 +1655,7 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { .timeout = nvme_rdma_timeout, }; -static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) +static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { cancel_work_sync(&ctrl->err_work); cancel_delayed_work_sync(&ctrl->reconnect_work); @@ -1667,8 +1667,10 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) nvme_rdma_free_io_queues(ctrl); } - if (test_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags)) + if (shutdown) nvme_shutdown_ctrl(&ctrl->ctrl); + else + nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, @@ -1682,7 +1684,7 @@ static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_stop_ctrl(&ctrl->ctrl); nvme_remove_namespaces(&ctrl->ctrl); if (shutdown) - nvme_rdma_shutdown_ctrl(ctrl); + nvme_rdma_shutdown_ctrl(ctrl, shutdown); nvme_uninit_ctrl(&ctrl->ctrl); if (ctrl->ctrl.tagset) { @@ -1746,7 +1748,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) bool changed; nvme_stop_ctrl(&ctrl->ctrl); - nvme_rdma_shutdown_ctrl(ctrl); + nvme_rdma_shutdown_ctrl(ctrl, false); ret = nvme_rdma_configure_admin_queue(ctrl); if (ret) { From 3f02fffb74ae7486232fef6ca4341c5e9719c759 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:32 +0300 Subject: [PATCH 12/74] nvme-rdma: don't free tagset on resets We're not supposed to do that. 
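The split this patch establishes can be sketched as follows (an illustrative outline in C, not a literal excerpt of the diff that follows): block-layer resources such as the tag set and request queue live for the lifetime of the controller, while the RDMA queue state is torn down and rebuilt on every reset.

/* Illustrative outline of the admin-queue setup after this change. */
static int configure_admin_queue_outline(struct nvme_rdma_ctrl *ctrl, bool new)
{
	/* always: allocate and connect the RDMA admin queue */

	if (new) {
		/* first-time creation: allocate the tag set and the
		 * blk-mq admin request queue */
	} else {
		/* reset/reconnect: keep them, just reinit the requests
		 * against the (possibly new) ib_device */
	}

	/* then read CAP, enable the controller, identify, ... */
	return 0;
}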
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 49 ++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index b1e67cd41c81..b8c07f2f2204 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -712,16 +712,20 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, return ERR_PTR(ret); } -static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl) +static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, + bool remove) { nvme_rdma_free_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); nvme_rdma_stop_and_free_queue(&ctrl->queues[0]); - blk_cleanup_queue(ctrl->ctrl.admin_q); - nvme_rdma_free_tagset(&ctrl->ctrl, true); + if (remove) { + blk_cleanup_queue(ctrl->ctrl.admin_q); + nvme_rdma_free_tagset(&ctrl->ctrl, true); + } } -static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) +static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, + bool new) { int error; @@ -734,14 +738,21 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS, ctrl->device->dev->attrs.max_fast_reg_page_list_len); - ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true); - if (IS_ERR(ctrl->ctrl.admin_tagset)) - goto out_free_queue; + if (new) { + ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true); + if (IS_ERR(ctrl->ctrl.admin_tagset)) + goto out_free_queue; - ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); - if (IS_ERR(ctrl->ctrl.admin_q)) { - error = PTR_ERR(ctrl->ctrl.admin_q); - goto out_free_tagset; + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.admin_q)) { + error = PTR_ERR(ctrl->ctrl.admin_q); + goto out_free_tagset; + } + } else { + error = blk_mq_reinit_tagset(&ctrl->admin_tag_set, + nvme_rdma_reinit_request); + if (error) + goto out_free_queue; } error = nvmf_connect_admin_queue(&ctrl->ctrl); @@ -781,11 +792,11 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) return 0; out_cleanup_queue: - blk_cleanup_queue(ctrl->ctrl.admin_q); + if (new) + blk_cleanup_queue(ctrl->ctrl.admin_q); out_free_tagset: - /* disconnect and drain the queue before freeing the tagset */ - nvme_rdma_stop_queue(&ctrl->queues[0]); - nvme_rdma_free_tagset(&ctrl->ctrl, true); + if (new) + nvme_rdma_free_tagset(&ctrl->ctrl, true); out_free_queue: nvme_rdma_free_queue(&ctrl->queues[0]); return error; @@ -1676,7 +1687,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); - nvme_rdma_destroy_admin_queue(ctrl); + nvme_rdma_destroy_admin_queue(ctrl, shutdown); } static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) @@ -1750,7 +1761,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) nvme_stop_ctrl(&ctrl->ctrl); nvme_rdma_shutdown_ctrl(ctrl, false); - ret = nvme_rdma_configure_admin_queue(ctrl); + ret = nvme_rdma_configure_admin_queue(ctrl, false); if (ret) { /* ctrl is already shutdown, just remove the ctrl */ INIT_WORK(&ctrl->delete_work, nvme_rdma_remove_ctrl_work); @@ -1891,7 +1902,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, if (!ctrl->queues) goto 
out_uninit_ctrl; - ret = nvme_rdma_configure_admin_queue(ctrl); + ret = nvme_rdma_configure_admin_queue(ctrl, true); if (ret) goto out_kfree_queues; @@ -1948,7 +1959,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, return &ctrl->ctrl; out_remove_admin_queue: - nvme_rdma_destroy_admin_queue(ctrl); + nvme_rdma_destroy_admin_queue(ctrl, true); out_kfree_queues: kfree(ctrl->queues); out_uninit_ctrl: From 31fdf18401703df246e20fc0382ed3594e6cf8d8 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 28 Aug 2017 21:40:06 +0200 Subject: [PATCH 13/74] nvme-rdma: reuse configure/destroy_admin_queue No need to open-code it. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index b8c07f2f2204..cdf14226bf07 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -857,24 +857,8 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) goto requeue; } - nvme_rdma_stop_and_free_queue(&ctrl->queues[0]); - - ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set, - nvme_rdma_reinit_request); - if (ret) - goto requeue; - - ret = nvme_rdma_init_queue(ctrl, 0, NVME_AQ_DEPTH); - if (ret) - goto requeue; - - ret = nvmf_connect_admin_queue(&ctrl->ctrl); - if (ret) - goto requeue; - - set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); - - ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + nvme_rdma_destroy_admin_queue(ctrl, false); + ret = nvme_rdma_configure_admin_queue(ctrl, false); if (ret) goto requeue; From a57bd54122232b32414748fc8b14634bfd74a7ff Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 28 Aug 2017 21:41:10 +0200 Subject: [PATCH 14/74] nvme-rdma: introduce configure/destroy io queues Make a symmetrical handling with admin queue. 
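Once admin and I/O handling are symmetric, the reconnect path in the diff below reduces to roughly this shape (a condensed sketch; error handling and state checks omitted):

/* Condensed view of nvme_rdma_reconnect_ctrl_work() after this patch. */
static void reconnect_outline(struct nvme_rdma_ctrl *ctrl)
{
	if (ctrl->ctrl.queue_count > 1)
		nvme_rdma_destroy_io_queues(ctrl, false);
	nvme_rdma_destroy_admin_queue(ctrl, false);

	nvme_rdma_configure_admin_queue(ctrl, false);
	if (ctrl->ctrl.queue_count > 1)
		nvme_rdma_configure_io_queues(ctrl, false);

	nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
}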
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 166 +++++++++++++++++++-------------------- 1 file changed, 83 insertions(+), 83 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index cdf14226bf07..b5fdd2b009c0 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -562,22 +562,20 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) { + if (!test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags)) + return; + rdma_disconnect(queue->cm_id); ib_drain_qp(queue->qp); } static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) -{ - nvme_rdma_destroy_queue_ib(queue); - rdma_destroy_id(queue->cm_id); -} - -static void nvme_rdma_stop_and_free_queue(struct nvme_rdma_queue *queue) { if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags)) return; - nvme_rdma_stop_queue(queue); - nvme_rdma_free_queue(queue); + + nvme_rdma_destroy_queue_ib(queue); + rdma_destroy_id(queue->cm_id); } static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl) @@ -585,7 +583,15 @@ static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl) int i; for (i = 1; i < ctrl->ctrl.queue_count; i++) - nvme_rdma_stop_and_free_queue(&ctrl->queues[i]); + nvme_rdma_free_queue(&ctrl->queues[i]); +} + +static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->ctrl.queue_count; i++) + nvme_rdma_stop_queue(&ctrl->queues[i]); } static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl) @@ -597,15 +603,15 @@ static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl) if (ret) { dev_info(ctrl->ctrl.device, "failed to connect i/o queue: %d\n", ret); - goto out_free_queues; + goto out_stop_queues; } set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags); } return 0; -out_free_queues: - nvme_rdma_free_io_queues(ctrl); +out_stop_queues: + nvme_rdma_stop_io_queues(ctrl); return ret; } @@ -641,7 +647,7 @@ static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) out_free_queues: for (i--; i >= 1; i--) - nvme_rdma_stop_and_free_queue(&ctrl->queues[i]); + nvme_rdma_free_queue(&ctrl->queues[i]); return ret; } @@ -717,11 +723,12 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, { nvme_rdma_free_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); - nvme_rdma_stop_and_free_queue(&ctrl->queues[0]); + nvme_rdma_stop_queue(&ctrl->queues[0]); if (remove) { blk_cleanup_queue(ctrl->ctrl.admin_q); nvme_rdma_free_tagset(&ctrl->ctrl, true); } + nvme_rdma_free_queue(&ctrl->queues[0]); } static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, @@ -802,6 +809,62 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, return error; } +static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, + bool remove) +{ + nvme_rdma_stop_io_queues(ctrl); + if (remove) { + blk_cleanup_queue(ctrl->ctrl.connect_q); + nvme_rdma_free_tagset(&ctrl->ctrl, false); + } + nvme_rdma_free_io_queues(ctrl); +} + +static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) +{ + int ret; + + ret = nvme_rdma_init_io_queues(ctrl); + if (ret) + return ret; + + if (new) { + ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, false); + if (IS_ERR(ctrl->ctrl.tagset)) + goto out_free_io_queues; + + ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); + if (IS_ERR(ctrl->ctrl.connect_q)) { + ret = 
PTR_ERR(ctrl->ctrl.connect_q); + goto out_free_tag_set; + } + } else { + ret = blk_mq_reinit_tagset(&ctrl->tag_set, + nvme_rdma_reinit_request); + if (ret) + goto out_free_io_queues; + + blk_mq_update_nr_hw_queues(&ctrl->tag_set, + ctrl->ctrl.queue_count - 1); + } + + ret = nvme_rdma_connect_io_queues(ctrl); + if (ret) + goto out_cleanup_connect_q; + + return 0; + +out_cleanup_connect_q: + if (new) + blk_cleanup_queue(ctrl->ctrl.connect_q); +out_free_tag_set: + if (new) + nvme_rdma_free_tagset(&ctrl->ctrl, false); +out_free_io_queues: + nvme_rdma_free_io_queues(ctrl); + return ret; +} + static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); @@ -848,14 +911,8 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) ++ctrl->ctrl.nr_reconnects; - if (ctrl->ctrl.queue_count > 1) { - nvme_rdma_free_io_queues(ctrl); - - ret = blk_mq_reinit_tagset(&ctrl->tag_set, - nvme_rdma_reinit_request); - if (ret) - goto requeue; - } + if (ctrl->ctrl.queue_count > 1) + nvme_rdma_destroy_io_queues(ctrl, false); nvme_rdma_destroy_admin_queue(ctrl, false); ret = nvme_rdma_configure_admin_queue(ctrl, false); @@ -863,16 +920,9 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) goto requeue; if (ctrl->ctrl.queue_count > 1) { - ret = nvme_rdma_init_io_queues(ctrl); + ret = nvme_rdma_configure_io_queues(ctrl, false); if (ret) goto requeue; - - ret = nvme_rdma_connect_io_queues(ctrl); - if (ret) - goto requeue; - - blk_mq_update_nr_hw_queues(&ctrl->tag_set, - ctrl->ctrl.queue_count - 1); } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); @@ -1659,7 +1709,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_stop_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); - nvme_rdma_free_io_queues(ctrl); + nvme_rdma_destroy_io_queues(ctrl, shutdown); } if (shutdown) @@ -1682,11 +1732,6 @@ static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_rdma_shutdown_ctrl(ctrl, shutdown); nvme_uninit_ctrl(&ctrl->ctrl); - if (ctrl->ctrl.tagset) { - blk_cleanup_queue(ctrl->ctrl.connect_q); - nvme_rdma_free_tagset(&ctrl->ctrl, false); - } - nvme_put_ctrl(&ctrl->ctrl); } @@ -1753,21 +1798,9 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) } if (ctrl->ctrl.queue_count > 1) { - ret = blk_mq_reinit_tagset(&ctrl->tag_set, - nvme_rdma_reinit_request); + ret = nvme_rdma_configure_io_queues(ctrl, false); if (ret) goto del_dead_ctrl; - - ret = nvme_rdma_init_io_queues(ctrl); - if (ret) - goto del_dead_ctrl; - - ret = nvme_rdma_connect_io_queues(ctrl); - if (ret) - goto del_dead_ctrl; - - blk_mq_update_nr_hw_queues(&ctrl->tag_set, - ctrl->ctrl.queue_count - 1); } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); @@ -1796,39 +1829,6 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .get_address = nvmf_get_address, }; -static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl) -{ - int ret; - - ret = nvme_rdma_init_io_queues(ctrl); - if (ret) - return ret; - - ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true); - if (IS_ERR(ctrl->ctrl.tagset)) - goto out_free_io_queues; - - ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); - if (IS_ERR(ctrl->ctrl.connect_q)) { - ret = PTR_ERR(ctrl->ctrl.connect_q); - goto out_free_tag_set; - } - - ret = nvme_rdma_connect_io_queues(ctrl); - if (ret) - goto out_cleanup_connect_q; - - return 0; - -out_cleanup_connect_q: - 
blk_cleanup_queue(ctrl->ctrl.connect_q); -out_free_tag_set: - nvme_rdma_free_tagset(&ctrl->ctrl, false); -out_free_io_queues: - nvme_rdma_free_io_queues(ctrl); - return ret; -} - static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { @@ -1921,7 +1921,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, } if (opts->nr_io_queues) { - ret = nvme_rdma_create_io_queues(ctrl); + ret = nvme_rdma_configure_io_queues(ctrl, true); if (ret) goto out_remove_admin_queue; } From 148b4e7ff31e4bb90cf7851ad1bcd305c292be2c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:35 +0300 Subject: [PATCH 15/74] nvme-rdma: stop queues instead of simply flipping their state If we move the queues from LIVE state, we might as well stop them (drain for rdma). Do it after we stop the request queues to prevent a stray request sneaking in .queue_rq after we stop the queue. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index b5fdd2b009c0..34d3ed8b249e 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -945,16 +945,15 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) { struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, err_work); - int i; nvme_stop_ctrl(&ctrl->ctrl); - for (i = 0; i < ctrl->ctrl.queue_count; i++) - clear_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags); - - if (ctrl->ctrl.queue_count > 1) + if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); + nvme_rdma_stop_io_queues(ctrl); + } blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_rdma_stop_queue(&ctrl->queues[0]); /* We must take care of fastfail/requeue all our inflight requests */ if (ctrl->ctrl.queue_count > 1) From 41e8cfa117cee46a732436dd303de4772c60965c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:36 +0300 Subject: [PATCH 16/74] nvme-rdma: rename nvme_rdma_init_queue to nvme_rdma_alloc_queue Give it a name symmetric to nvme_rdma_free_queue. Also pass in the ctrl sqsize+1 and not the opts queue_size. And suppress a superfluous failure message.
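The sqsize + 1 deserves a note: sqsize is a zero's-based value that nvme_rdma_configure_admin_queue() clamps against CAP.MQES, so it can end up smaller than the queue_size given on the command line. A small illustration (assuming, as the driver does at create time, that sqsize starts out as queue_size - 1):

/* Illustrative only: how many entries an I/O queue actually gets. */
static int io_queue_entries(int queue_size, int cap_mqes)
{
	int sqsize = queue_size - 1;	/* zero's based */

	if (sqsize > cap_mqes)
		sqsize = cap_mqes;	/* the min_t() clamp on CAP.MQES */

	return sqsize + 1;		/* what nvme_rdma_alloc_queue() is passed */
}

/* e.g. queue_size = 128, MQES = 63 -> 64 entries, not 128 */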
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 34d3ed8b249e..d87874e4902d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -504,7 +504,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) return ret; } -static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, +static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, int idx, size_t queue_size) { struct nvme_rdma_queue *queue; @@ -615,7 +615,7 @@ static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl) return ret; } -static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) +static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; unsigned int nr_io_queues; @@ -634,13 +634,10 @@ static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) "creating %d I/O queues.\n", nr_io_queues); for (i = 1; i < ctrl->ctrl.queue_count; i++) { - ret = nvme_rdma_init_queue(ctrl, i, - ctrl->ctrl.opts->queue_size); - if (ret) { - dev_info(ctrl->ctrl.device, - "failed to initialize i/o queue: %d\n", ret); + ret = nvme_rdma_alloc_queue(ctrl, i, + ctrl->ctrl.sqsize + 1); + if (ret) goto out_free_queues; - } } return 0; @@ -736,7 +733,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, { int error; - error = nvme_rdma_init_queue(ctrl, 0, NVME_AQ_DEPTH); + error = nvme_rdma_alloc_queue(ctrl, 0, NVME_AQ_DEPTH); if (error) return error; @@ -824,7 +821,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) { int ret; - ret = nvme_rdma_init_io_queues(ctrl); + ret = nvme_rdma_alloc_io_queues(ctrl); if (ret) return ret; From 68e16fcfaf9bbde573e89f783cf1ca60acb49cf5 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:37 +0300 Subject: [PATCH 17/74] nvme-rdma: introduce nvme_rdma_start_queue This should pair with nvme_rdma_stop_queue. While this is not a complete inverse, it still pairs up pretty well because in fabrics we don't have a disconnect capsule (yet) but we simply teardown the transport association. 
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index d87874e4902d..09c4ea8332d7 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -594,24 +594,38 @@ static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl) nvme_rdma_stop_queue(&ctrl->queues[i]); } -static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl) +static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) +{ + int ret; + + if (idx) + ret = nvmf_connect_io_queue(&ctrl->ctrl, idx); + else + ret = nvmf_connect_admin_queue(&ctrl->ctrl); + + if (!ret) + set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[idx].flags); + else + dev_info(ctrl->ctrl.device, + "failed to connect queue: %d ret=%d\n", idx, ret); + return ret; +} + +static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl) { int i, ret = 0; for (i = 1; i < ctrl->ctrl.queue_count; i++) { - ret = nvmf_connect_io_queue(&ctrl->ctrl, i); - if (ret) { - dev_info(ctrl->ctrl.device, - "failed to connect i/o queue: %d\n", ret); + ret = nvme_rdma_start_queue(ctrl, i); + if (ret) goto out_stop_queues; - } - set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags); } return 0; out_stop_queues: - nvme_rdma_stop_io_queues(ctrl); + for (i--; i >= 1; i--) + nvme_rdma_stop_queue(&ctrl->queues[i]); return ret; } @@ -759,12 +773,10 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, goto out_free_queue; } - error = nvmf_connect_admin_queue(&ctrl->ctrl); + error = nvme_rdma_start_queue(ctrl, 0); if (error) goto out_cleanup_queue; - set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); - error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); if (error) { @@ -845,7 +857,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) ctrl->ctrl.queue_count - 1); } - ret = nvme_rdma_connect_io_queues(ctrl); + ret = nvme_rdma_start_io_queues(ctrl); if (ret) goto out_cleanup_connect_q; From 370ae6e450260c8bd3e1ea75c1390f746d3c71ee Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:38 +0300 Subject: [PATCH 18/74] nvme-rdma: cleanup error path in controller reset No need to queue an extra work to indirect controller removal, just call the ctrl remove routine. 
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 09c4ea8332d7..1bf7c3a55c96 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1732,13 +1732,10 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_rdma_destroy_admin_queue(ctrl, shutdown); } -static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) +static void nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl) { - nvme_stop_ctrl(&ctrl->ctrl); nvme_remove_namespaces(&ctrl->ctrl); - if (shutdown) - nvme_rdma_shutdown_ctrl(ctrl, shutdown); - + nvme_rdma_shutdown_ctrl(ctrl, true); nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); } @@ -1748,7 +1745,8 @@ static void nvme_rdma_del_ctrl_work(struct work_struct *work) struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, delete_work); - __nvme_rdma_remove_ctrl(ctrl, true); + nvme_stop_ctrl(&ctrl->ctrl); + nvme_rdma_remove_ctrl(ctrl); } static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl) @@ -1780,14 +1778,6 @@ static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl) return ret; } -static void nvme_rdma_remove_ctrl_work(struct work_struct *work) -{ - struct nvme_rdma_ctrl *ctrl = container_of(work, - struct nvme_rdma_ctrl, delete_work); - - __nvme_rdma_remove_ctrl(ctrl, false); -} - static void nvme_rdma_reset_ctrl_work(struct work_struct *work) { struct nvme_rdma_ctrl *ctrl = @@ -1799,16 +1789,13 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) nvme_rdma_shutdown_ctrl(ctrl, false); ret = nvme_rdma_configure_admin_queue(ctrl, false); - if (ret) { - /* ctrl is already shutdown, just remove the ctrl */ - INIT_WORK(&ctrl->delete_work, nvme_rdma_remove_ctrl_work); - goto del_dead_ctrl; - } + if (ret) + goto out_fail; if (ctrl->ctrl.queue_count > 1) { ret = nvme_rdma_configure_io_queues(ctrl, false); if (ret) - goto del_dead_ctrl; + goto out_fail; } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); @@ -1818,10 +1805,9 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) return; -del_dead_ctrl: - /* Deleting this dead controller... */ +out_fail: dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); - WARN_ON(!queue_work(nvme_wq, &ctrl->delete_work)); + nvme_rdma_remove_ctrl(ctrl); } static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { From 09fdc23b29618127709711606d4fa439a6839852 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Jul 2017 09:22:39 +0300 Subject: [PATCH 19/74] nvme-rdma: call ops->reg_read64 instead of nvmf_reg_read64 To make the nvme_rdma_configure_admin_queue generic in preparation of moving it to common code. 
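As a sketch of where this is heading (hypothetical, not part of the patch): with register access going through the ops table, the same setup code can live in common code and work for any transport that fills in reg_read64:

/* Hypothetical transport-agnostic helper enabled by using the ops table. */
static int nvme_read_cap(struct nvme_ctrl *ctrl, u64 *cap)
{
	return ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, cap);
}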
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 1bf7c3a55c96..b51e7df63df5 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -777,7 +777,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, if (error) goto out_cleanup_queue; - error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, + error = ctrl->ctrl.ops->reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); if (error) { dev_err(ctrl->ctrl.device, From 9b483da15dd0c94b62ae4e789b37dfff4410f45e Mon Sep 17 00:00:00 2001 From: Guan Junxiong Date: Thu, 3 Aug 2017 21:40:26 +0800 Subject: [PATCH 20/74] nvme-fabrics: log a warning if hostid is invalid This helps users to quickly spot the reason of why connection fails if the hostid is not compliant with the uuid format. Signed-off-by: Guan Junxiong Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 5f5cd306f76d..fc3b6552f467 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -735,6 +735,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, goto out; } if (uuid_parse(p, &hostid)) { + pr_err("Invalid hostid %s\n", p); ret = -EINVAL; goto out; } From caaa15c5097d58545075a8bbdf208078b87d5f28 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 15 Aug 2017 12:24:05 +0300 Subject: [PATCH 21/74] nvme: fix identify namespace logging Use ctrl->device and lose the func name. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0b979e16655e..b2daeafbeadd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1167,7 +1167,7 @@ static void nvme_config_discard(struct nvme_ns *ns) static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) { if (nvme_identify_ns(ns->ctrl, ns->ns_id, id)) { - dev_warn(ns->ctrl->dev, "%s: Identify failure\n", __func__); + dev_warn(ns->ctrl->device, "Identify namespace failed\n"); return -ENODEV; } From ad4e05b24c428d6125f6f10bd300600cae5079d4 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 13 Aug 2017 19:21:06 +0300 Subject: [PATCH 22/74] nvme: add symbolic constants for CC identifiers Signed-off-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 14 +++++++------- include/linux/nvme.h | 24 +++++++++++++++--------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index ea7eb00dcb87..7c23eaf8e563 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -538,37 +538,37 @@ EXPORT_SYMBOL_GPL(nvmet_req_uninit); static inline bool nvmet_cc_en(u32 cc) { - return cc & 0x1; + return (cc >> NVME_CC_EN_SHIFT) & 0x1; } static inline u8 nvmet_cc_css(u32 cc) { - return (cc >> 4) & 0x7; + return (cc >> NVME_CC_CSS_SHIFT) & 0x7; } static inline u8 nvmet_cc_mps(u32 cc) { - return (cc >> 7) & 0xf; + return (cc >> NVME_CC_MPS_SHIFT) & 0xf; } static inline u8 nvmet_cc_ams(u32 cc) { - return (cc >> 11) & 0x7; + return (cc >> NVME_CC_AMS_SHIFT) & 0x7; } static inline u8 nvmet_cc_shn(u32 cc) { - return (cc >> 14) & 0x3; + return (cc >> NVME_CC_SHN_SHIFT) & 0x3; } static inline u8 
nvmet_cc_iosqes(u32 cc) { - return (cc >> 16) & 0xf; + return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf; } static inline u8 nvmet_cc_iocqes(u32 cc) { - return (cc >> 20) & 0xf; + return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf; } static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index a12b47073273..7b4322dc2975 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -135,16 +135,22 @@ enum { enum { NVME_CC_ENABLE = 1 << 0, NVME_CC_CSS_NVM = 0 << 4, + NVME_CC_EN_SHIFT = 0, + NVME_CC_CSS_SHIFT = 4, NVME_CC_MPS_SHIFT = 7, - NVME_CC_ARB_RR = 0 << 11, - NVME_CC_ARB_WRRU = 1 << 11, - NVME_CC_ARB_VS = 7 << 11, - NVME_CC_SHN_NONE = 0 << 14, - NVME_CC_SHN_NORMAL = 1 << 14, - NVME_CC_SHN_ABRUPT = 2 << 14, - NVME_CC_SHN_MASK = 3 << 14, - NVME_CC_IOSQES = NVME_NVM_IOSQES << 16, - NVME_CC_IOCQES = NVME_NVM_IOCQES << 20, + NVME_CC_AMS_SHIFT = 11, + NVME_CC_SHN_SHIFT = 14, + NVME_CC_IOSQES_SHIFT = 16, + NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_ARB_RR = 0 << NVME_CC_AMS_SHIFT, + NVME_CC_ARB_WRRU = 1 << NVME_CC_AMS_SHIFT, + NVME_CC_ARB_VS = 7 << NVME_CC_AMS_SHIFT, + NVME_CC_SHN_NONE = 0 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_NORMAL = 1 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_ABRUPT = 2 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, + NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, + NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, NVME_CSTS_NSSRO = 1 << 4, From 60b43f627a71aaf233ef5af90f72e207c29781b4 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 13 Aug 2017 19:21:07 +0300 Subject: [PATCH 23/74] nvme: rename AMS symbolic constants to fit specification Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- include/linux/nvme.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b2daeafbeadd..1db8de0bee87 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1445,7 +1445,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) ctrl->ctrl_config = NVME_CC_CSS_NVM; ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; - ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; ctrl->ctrl_config |= NVME_CC_ENABLE; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 7b4322dc2975..05560c2ecc83 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -142,9 +142,9 @@ enum { NVME_CC_SHN_SHIFT = 14, NVME_CC_IOSQES_SHIFT = 16, NVME_CC_IOCQES_SHIFT = 20, - NVME_CC_ARB_RR = 0 << NVME_CC_AMS_SHIFT, - NVME_CC_ARB_WRRU = 1 << NVME_CC_AMS_SHIFT, - NVME_CC_ARB_VS = 7 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT, NVME_CC_SHN_NONE = 0 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_NORMAL = 1 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_ABRUPT = 2 << NVME_CC_SHN_SHIFT, From 5533d42480d6ced6765401c55a3622b4c437d7eb Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 31 Jul 2017 13:20:30 -0700 Subject: [PATCH 24/74] nvme-fc: Reattach to localports on re-registration If the LLDD resets or detaches from an fc port, the LLDD will deregister all remoteports seen by the fc port and deregister the localport associated with the fc port. 
The teardown of the localport structure will be held off due to reference counting until all the remoteports are removed (and they are held off until all controllers/associations to terminated). Currently, if the fc port is reinit/reattached and registered again as a localport it is treated as an independent entity from the prior localport and all prior remoteports and controllers cannot be revived. They are created as new and separate entities. This patch changes the localport registration to look at the known localports that are waiting to be torndown. If they are the same port based on wwn's, the local port is transitioned out of the teardown state. This allows the remote ports and controller connections to be reestablished and resumed as long as the localport can also be reregistered within the timeout windows. The patch adds a new routine nvme_fc_attach_to_unreg_lport() with the functionality and moves the lport get/put routines to avoid forward references. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 144 ++++++++++++++++++++++++++++++----------- 1 file changed, 106 insertions(+), 38 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 1912df412692..d2e882c0f496 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -220,6 +220,90 @@ static int __nvme_fc_del_ctrl(struct nvme_fc_ctrl *); static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *, struct nvme_fc_queue *, unsigned int); +static void +nvme_fc_free_lport(struct kref *ref) +{ + struct nvme_fc_lport *lport = + container_of(ref, struct nvme_fc_lport, ref); + unsigned long flags; + + WARN_ON(lport->localport.port_state != FC_OBJSTATE_DELETED); + WARN_ON(!list_empty(&lport->endp_list)); + + /* remove from transport list */ + spin_lock_irqsave(&nvme_fc_lock, flags); + list_del(&lport->port_list); + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + /* let the LLDD know we've finished tearing it down */ + lport->ops->localport_delete(&lport->localport); + + ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); + ida_destroy(&lport->endp_cnt); + + put_device(lport->dev); + + kfree(lport); +} + +static void +nvme_fc_lport_put(struct nvme_fc_lport *lport) +{ + kref_put(&lport->ref, nvme_fc_free_lport); +} + +static int +nvme_fc_lport_get(struct nvme_fc_lport *lport) +{ + return kref_get_unless_zero(&lport->ref); +} + + +static struct nvme_fc_lport * +nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo) +{ + struct nvme_fc_lport *lport; + unsigned long flags; + + spin_lock_irqsave(&nvme_fc_lock, flags); + + list_for_each_entry(lport, &nvme_fc_lport_list, port_list) { + if (lport->localport.node_name != pinfo->node_name || + lport->localport.port_name != pinfo->port_name) + continue; + + if (lport->localport.port_state != FC_OBJSTATE_DELETED) { + lport = ERR_PTR(-EEXIST); + goto out_done; + } + + if (!nvme_fc_lport_get(lport)) { + /* + * fails if ref cnt already 0. 
If so, + * act as if lport already deleted + */ + lport = NULL; + goto out_done; + } + + /* resume the lport */ + + lport->localport.port_role = pinfo->port_role; + lport->localport.port_id = pinfo->port_id; + lport->localport.port_state = FC_OBJSTATE_ONLINE; + + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + return lport; + } + + lport = NULL; + +out_done: + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + return lport; +} /** * nvme_fc_register_localport - transport entry point called by an @@ -257,6 +341,28 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, goto out_reghost_failed; } + /* + * look to see if there is already a localport that had been + * deregistered and in the process of waiting for all the + * references to fully be removed. If the references haven't + * expired, we can simply re-enable the localport. Remoteports + * and controller reconnections should resume naturally. + */ + newrec = nvme_fc_attach_to_unreg_lport(pinfo); + + /* found an lport, but something about its state is bad */ + if (IS_ERR(newrec)) { + ret = PTR_ERR(newrec); + goto out_reghost_failed; + + /* found existing lport, which was resumed */ + } else if (newrec) { + *portptr = &newrec->localport; + return 0; + } + + /* nothing found - allocate a new localport struct */ + newrec = kmalloc((sizeof(*newrec) + template->local_priv_sz), GFP_KERNEL); if (!newrec) { @@ -310,44 +416,6 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, } EXPORT_SYMBOL_GPL(nvme_fc_register_localport); -static void -nvme_fc_free_lport(struct kref *ref) -{ - struct nvme_fc_lport *lport = - container_of(ref, struct nvme_fc_lport, ref); - unsigned long flags; - - WARN_ON(lport->localport.port_state != FC_OBJSTATE_DELETED); - WARN_ON(!list_empty(&lport->endp_list)); - - /* remove from transport list */ - spin_lock_irqsave(&nvme_fc_lock, flags); - list_del(&lport->port_list); - spin_unlock_irqrestore(&nvme_fc_lock, flags); - - /* let the LLDD know we've finished tearing it down */ - lport->ops->localport_delete(&lport->localport); - - ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); - ida_destroy(&lport->endp_cnt); - - put_device(lport->dev); - - kfree(lport); -} - -static void -nvme_fc_lport_put(struct nvme_fc_lport *lport) -{ - kref_put(&lport->ref, nvme_fc_free_lport); -} - -static int -nvme_fc_lport_get(struct nvme_fc_lport *lport) -{ - return kref_get_unless_zero(&lport->ref); -} - /** * nvme_fc_unregister_localport - transport entry point called by an * LLDD to deregister/remove a previously From 48fa362b6c3f4d69bdb6310b46626049092475e0 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 31 Jul 2017 13:21:14 -0700 Subject: [PATCH 25/74] nvmet-fc: simplify sg list handling The existing nvmet_fc sg list handling has 2 faults: a) the request between LLDD and transport has too large of an sg list (256 elements), which is normally 256k (64 elements). b) sglist handling doesn't optimize on the fact that each element is a page. This patch removes the static sg list in the request and uses the dynamic list already present in the nvmet_fc transport. It also simplies the handling of the sg list on multiple sequences to take advantage of the per-page divisions. 
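[Editor's note, illustration only] A rough sketch of the per-page simplification described above: once the data sg list is built with exactly one page per element, the current transfer offset selects the starting element by plain division, and no per-element walking or offset bookkeeping is needed. The lines mirror the hunk that follows; tlen, fod and fcpreq are the fields used there.

    /* one page per sg element: offset picks the element, length gives the count */
    tlen = min_t(u32, tgtport->max_sg_cnt * PAGE_SIZE,
                 fod->total_length - fod->offset);
    fcpreq->sg = &fod->data_sg[fod->offset / PAGE_SIZE];
    fcpreq->sg_cnt = DIV_ROUND_UP(tlen, PAGE_SIZE);
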
Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 48 +++++++--------------------------- include/linux/nvme-fc-driver.h | 2 +- 2 files changed, 11 insertions(+), 39 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 309c84aa7595..421e43bf1dd7 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -58,7 +58,8 @@ struct nvmet_fc_ls_iod { struct work_struct work; } __aligned(sizeof(unsigned long long)); -#define NVMET_FC_MAX_KB_PER_XFR 256 +#define NVMET_FC_MAX_SEQ_LENGTH (256 * 1024) +#define NVMET_FC_MAX_XFR_SGENTS (NVMET_FC_MAX_SEQ_LENGTH / PAGE_SIZE) enum nvmet_fcp_datadir { NVMET_FCP_NODATA, @@ -74,9 +75,7 @@ struct nvmet_fc_fcp_iod { struct nvme_fc_ersp_iu rspiubuf; dma_addr_t rspdma; struct scatterlist *data_sg; - struct scatterlist *next_sg; int data_sg_cnt; - u32 next_sg_offset; u32 total_length; u32 offset; enum nvmet_fcp_datadir io_dir; @@ -112,6 +111,7 @@ struct nvmet_fc_tgtport { struct ida assoc_cnt; struct nvmet_port *port; struct kref ref; + u32 max_sg_cnt; }; struct nvmet_fc_defer_fcp_req { @@ -994,6 +994,8 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo, INIT_LIST_HEAD(&newrec->assoc_list); kref_init(&newrec->ref); ida_init(&newrec->assoc_cnt); + newrec->max_sg_cnt = min_t(u32, NVMET_FC_MAX_XFR_SGENTS, + template->max_sgl_segments); ret = nvmet_fc_alloc_ls_iodlist(newrec); if (ret) { @@ -1866,51 +1868,23 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, struct nvmet_fc_fcp_iod *fod, u8 op) { struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; - struct scatterlist *sg, *datasg; unsigned long flags; - u32 tlen, sg_off; + u32 tlen; int ret; fcpreq->op = op; fcpreq->offset = fod->offset; fcpreq->timeout = NVME_FC_TGTOP_TIMEOUT_SEC; - tlen = min_t(u32, (NVMET_FC_MAX_KB_PER_XFR * 1024), + + tlen = min_t(u32, tgtport->max_sg_cnt * PAGE_SIZE, (fod->total_length - fod->offset)); - tlen = min_t(u32, tlen, NVME_FC_MAX_SEGMENTS * PAGE_SIZE); - tlen = min_t(u32, tlen, fod->tgtport->ops->max_sgl_segments - * PAGE_SIZE); fcpreq->transfer_length = tlen; fcpreq->transferred_length = 0; fcpreq->fcp_error = 0; fcpreq->rsplen = 0; - fcpreq->sg_cnt = 0; - - datasg = fod->next_sg; - sg_off = fod->next_sg_offset; - - for (sg = fcpreq->sg ; tlen; sg++) { - *sg = *datasg; - if (sg_off) { - sg->offset += sg_off; - sg->length -= sg_off; - sg->dma_address += sg_off; - sg_off = 0; - } - if (tlen < sg->length) { - sg->length = tlen; - fod->next_sg = datasg; - fod->next_sg_offset += tlen; - } else if (tlen == sg->length) { - fod->next_sg_offset = 0; - fod->next_sg = sg_next(datasg); - } else { - fod->next_sg_offset = 0; - datasg = sg_next(datasg); - } - tlen -= sg->length; - fcpreq->sg_cnt++; - } + fcpreq->sg = &fod->data_sg[fod->offset / PAGE_SIZE]; + fcpreq->sg_cnt = DIV_ROUND_UP(tlen, PAGE_SIZE); /* * If the last READDATA request: check if LLDD supports @@ -2225,8 +2199,6 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, fod->req.sg = fod->data_sg; fod->req.sg_cnt = fod->data_sg_cnt; fod->offset = 0; - fod->next_sg = fod->data_sg; - fod->next_sg_offset = 0; if (fod->io_dir == NVMET_FCP_WRITE) { /* pull the data over before invoking nvmet layer */ diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 2591878c1d48..9c5cb4480806 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -624,7 +624,7 @@ struct nvmefc_tgt_fcp_req { u32 timeout; u32 transfer_length; struct fc_ba_rjt ba_rjt; - struct scatterlist 
sg[NVME_FC_MAX_SEGMENTS]; + struct scatterlist *sg; int sg_cnt; void *rspaddr; dma_addr_t rspdma; From 01f33c336e2d298ea5d4ce5d6e5bcd12865cc30f Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Mon, 14 Aug 2017 22:12:38 +0200 Subject: [PATCH 26/74] string.h: add memcpy_and_pad() This helper function is useful for the nvme subsystem, and maybe others. Note: the warnings reported by the kbuild test robot for this patch are actually generated by the use of CONFIG_PROFILE_ALL_BRANCHES together with __FORTIFY_INLINE. Signed-off-by: Martin Wilck Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- include/linux/string.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/include/linux/string.h b/include/linux/string.h index a467e617eeb0..0bec4151b0eb 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -200,6 +200,7 @@ static inline const char *kbasename(const char *path) void fortify_panic(const char *name) __noreturn __cold; void __read_overflow(void) __compiletime_error("detected read beyond size of object passed as 1st parameter"); void __read_overflow2(void) __compiletime_error("detected read beyond size of object passed as 2nd parameter"); +void __read_overflow3(void) __compiletime_error("detected read beyond size of object passed as 3rd parameter"); void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) @@ -395,4 +396,33 @@ __FORTIFY_INLINE char *strcpy(char *p, const char *q) #endif +/** + * memcpy_and_pad - Copy one buffer to another with padding + * @dest: Where to copy to + * @dest_len: The destination buffer size + * @src: Where to copy from + * @count: The number of bytes to copy + * @pad: Character to use for padding if space is left in destination. + */ +__FORTIFY_INLINE void memcpy_and_pad(void *dest, size_t dest_len, + const void *src, size_t count, int pad) +{ + size_t dest_size = __builtin_object_size(dest, 0); + size_t src_size = __builtin_object_size(src, 0); + + if (__builtin_constant_p(dest_len) && __builtin_constant_p(count)) { + if (dest_size < dest_len && dest_size < count) + __write_overflow(); + else if (src_size < dest_len && src_size < count) + __read_overflow3(); + } + if (dest_size < dest_len) + fortify_panic(__func__); + if (dest_len > count) { + memcpy(dest, src, count); + memset(dest + count, pad, dest_len - count); + } else + memcpy(dest, src, dest_len); +} + #endif /* _LINUX_STRING_H_ */ From 17c39d053a46b300fee786857458857086a4844e Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Mon, 14 Aug 2017 22:12:39 +0200 Subject: [PATCH 27/74] nvmet: use memcpy_and_pad for identify sn/fr This changes the earlier patch "nvmet: don't report 0-bytes in serial number" to use the memcpy_and_pad() helper introduced in a previous patch. 
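[Editor's note, illustration only] A small usage sketch of the helper introduced above (the buffer contents are illustrative): memcpy_and_pad() copies at most dest_len bytes and space-fills whatever remains, so fixed-width Identify fields never carry stray NULs or uninitialized bytes.

    char fr[8];

    /* copies min(count, dest_len) bytes, pads the remainder with ' ' */
    memcpy_and_pad(fr, sizeof(fr), "4.13", 4, ' ');   /* fr == "4.13    " */
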
Signed-off-by: Martin Wilck Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index dffd8064163b..9496c71d2257 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -168,15 +168,6 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) nvmet_req_complete(req, status); } -static void copy_and_pad(char *dst, int dst_len, const char *src, int src_len) -{ - int len = min(src_len, dst_len); - - memcpy(dst, src, len); - if (dst_len > len) - memset(dst + len, ' ', dst_len - len); -} - static void nvmet_execute_identify_ctrl(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; @@ -196,8 +187,9 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) bin2hex(id->sn, &ctrl->subsys->serial, min(sizeof(ctrl->subsys->serial), sizeof(id->sn) / 2)); - copy_and_pad(id->mn, sizeof(id->mn), model, sizeof(model) - 1); - copy_and_pad(id->fr, sizeof(id->fr), UTS_RELEASE, strlen(UTS_RELEASE)); + memcpy_and_pad(id->mn, sizeof(id->mn), model, sizeof(model) - 1, ' '); + memcpy_and_pad(id->fr, sizeof(id->fr), + UTS_RELEASE, strlen(UTS_RELEASE), ' '); id->rab = 6; From a7b7c7a105a528e6c2a0a2581b814a5acacb4c38 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 14 Aug 2017 15:29:26 +0300 Subject: [PATCH 28/74] nvme-rdma: Use unlikely macro in the fast path This patch slightly improves performance (mainly for small block sizes). Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index b51e7df63df5..6a7682620d87 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1047,7 +1047,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, if (req->mr->need_inval) { res = nvme_rdma_inv_rkey(queue, req); - if (res < 0) { + if (unlikely(res < 0)) { dev_err(ctrl->ctrl.device, "Queueing INV WR for rkey %#x failed (%d)\n", req->mr->rkey, res); @@ -1112,7 +1112,7 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, int nr; nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE); - if (nr < count) { + if (unlikely(nr < count)) { if (nr < 0) return nr; return -EINVAL; @@ -1248,7 +1248,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, first = ≀ ret = ib_post_send(queue->qp, first, &bad_wr); - if (ret) { + if (unlikely(ret)) { dev_err(queue->ctrl->ctrl.device, "%s failed with error code %d\n", __func__, ret); } @@ -1274,7 +1274,7 @@ static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue, wr.num_sge = 1; ret = ib_post_recv(queue->qp, &wr, &bad_wr); - if (ret) { + if (unlikely(ret)) { dev_err(queue->ctrl->ctrl.device, "%s failed with error code %d\n", __func__, ret); } @@ -1634,7 +1634,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(rq); err = nvme_rdma_map_data(queue, rq, c); - if (err < 0) { + if (unlikely(err < 0)) { dev_err(queue->ctrl->ctrl.device, "Failed to map data (%d)\n", err); nvme_cleanup_cmd(rq); @@ -1648,7 +1648,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, flush = true; err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, req->mr->need_inval ? 
&req->reg_wr.wr : NULL, flush); - if (err) { + if (unlikely(err)) { nvme_rdma_unmap_data(queue, rq); goto err; } From 5228b3280b9bb8fa6aef59f891cca64a028e9b36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Sun, 27 Aug 2017 15:56:37 +0200 Subject: [PATCH 29/74] nvme: fix uninitialized prp2 value on small transfers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The value of iod->first_dma ends up as prp2 in NVMe commands. In case there is not enough data to cross a page boundary, iod->first_dma is never initialized and contains random data. Comply with the NVMe specification and fill in 0 in that case. Signed-off-by: Jan H. Schönherr Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e6283745ecd2..544805a2421b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -555,8 +555,10 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) int nprps, i; length -= (page_size - offset); - if (length <= 0) + if (length <= 0) { + iod->first_dma = 0; return BLK_STS_OK; + } dma_len -= (page_size - offset); if (dma_len) { From 07fbd32a6b215d8b2fc01ccc89622207b9b782fd Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 25 Aug 2017 19:14:50 -0400 Subject: [PATCH 30/74] nvme: honor RTD3 Entry Latency for shutdowns If an NVMe controller reports RTD3 Entry Latency larger than shutdown_timeout, up to a maximum of 60 seconds, use that value to set the shutdown timer. Otherwise fall back to the module parameter which defaults to 5 seconds. Signed-off-by: Martin K. Petersen Reviewed-by: Sagi Grimberg [hch: removed do_div, made transition time local scope] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 16 +++++++++++++++- drivers/nvme/host/nvme.h | 1 + 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1db8de0bee87..745058905da6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1458,7 +1458,7 @@ EXPORT_SYMBOL_GPL(nvme_enable_ctrl); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) { - unsigned long timeout = jiffies + (shutdown_timeout * HZ); + unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ); u32 csts; int ret; @@ -1826,6 +1826,20 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->sgls = le32_to_cpu(id->sgls); ctrl->kas = le16_to_cpu(id->kas); + if (id->rtd3e) { + /* us -> s */ + u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000; + + ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time, + shutdown_timeout, 60); + + if (ctrl->shutdown_timeout != shutdown_timeout) + dev_warn(ctrl->device, + "Shutdown timeout set to %u seconds\n", + ctrl->shutdown_timeout); + } else + ctrl->shutdown_timeout = shutdown_timeout; + ctrl->npss = id->npss; ctrl->apsta = id->apsta; prev_apst_enabled = ctrl->apst_enabled; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2c8a02be46fd..9b959ee18cb6 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -162,6 +162,7 @@ struct nvme_ctrl { u16 kas; u8 npss; u8 apsta; + unsigned int shutdown_timeout; unsigned int kato; bool subsystem; unsigned long quirks; From a751da33945a2c94a0449d1005c3c973e5acfefb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 22 Aug 2017 10:17:03 +0200 Subject: [PATCH 31/74] nvme: report more detailed status codes to the block 
layer Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg --- drivers/nvme/host/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 745058905da6..4c49ec4349bc 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -113,7 +113,16 @@ static blk_status_t nvme_error_status(struct request *req) case NVME_SC_WRITE_FAULT: case NVME_SC_READ_ERROR: case NVME_SC_UNWRITTEN_BLOCK: + case NVME_SC_ACCESS_DENIED: + case NVME_SC_READ_ONLY: return BLK_STS_MEDIUM; + case NVME_SC_GUARD_CHECK: + case NVME_SC_APPTAG_CHECK: + case NVME_SC_REFTAG_CHECK: + case NVME_SC_INVALID_PI: + return BLK_STS_PROTECTION; + case NVME_SC_RESERVATION_CONFLICT: + return BLK_STS_NEXUS; default: return BLK_STS_IOERR; } From 0a72bbba493bebd53fa78e6741b86a3003f070d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 22 Aug 2017 11:42:24 +0200 Subject: [PATCH 32/74] nvme: allow calling nvme_change_ctrl_state from irq context Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg --- drivers/nvme/host/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4c49ec4349bc..d63e1fcf4437 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -176,9 +176,10 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state) { enum nvme_ctrl_state old_state; + unsigned long flags; bool changed = false; - spin_lock_irq(&ctrl->lock); + spin_lock_irqsave(&ctrl->lock, flags); old_state = ctrl->state; switch (new_state) { @@ -239,7 +240,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, if (changed) ctrl->state = new_state; - spin_unlock_irq(&ctrl->lock); + spin_unlock_irqrestore(&ctrl->lock, flags); return changed; } From 57eeaf8ec67d5c5dee0ee9bb5b19866b8eef780d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Aug 2017 15:47:37 +0200 Subject: [PATCH 33/74] nvme: remove unused struct nvme_ns fields And move the flags for the flags field near that field while touching this area. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg --- drivers/nvme/host/nvme.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9b959ee18cb6..936c4056d98e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -211,13 +211,9 @@ struct nvme_ns { bool ext; u8 pi_type; unsigned long flags; - u16 noiob; - #define NVME_NS_REMOVING 0 #define NVME_NS_DEAD 1 - - u64 mode_select_num_blocks; - u32 mode_select_block_len; + u16 noiob; }; struct nvme_ctrl_ops { From cdbff4f26bd9fab11bfac22097f836892a5c3612 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Aug 2017 16:14:47 +0200 Subject: [PATCH 34/74] nvme: remove nvme_revalidate_ns The function is used in two places, and the shared code for those will diverge later in this series. Instead factor out a new helper to get the ids for a namespace, simplify the calling conventions for nvme_identify_ns and just open code the sequence. 
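[Editor's note, illustration only] For orientation, a sketch of the sequence both callers now open code, mirroring the hunks below: nvme_identify_ns() returns the allocated Identify buffer or NULL, the caller checks NCAP itself, and nvme_report_ns_ids() fills in the persistent identifiers.

    id = nvme_identify_ns(ctrl, nsid);
    if (!id)
            return -ENODEV;
    if (id->ncap == 0) {            /* namespace has no capacity: treat as gone */
            ret = -ENODEV;
            goto out;
    }
    nvme_report_ns_ids(ctrl, nsid, id, ns->eui, ns->nguid, &ns->uuid);
out:
    kfree(id);
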
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg --- drivers/nvme/host/core.c | 100 +++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 47 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d63e1fcf4437..b87cf3a6e9ac 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -783,7 +783,8 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) return error; } -static int nvme_identify_ns_descs(struct nvme_ns *ns, unsigned nsid) +static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, + u8 *eui64, u8 *nguid, uuid_t *uuid) { struct nvme_command c = { }; int status; @@ -799,7 +800,7 @@ static int nvme_identify_ns_descs(struct nvme_ns *ns, unsigned nsid) if (!data) return -ENOMEM; - status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, data, + status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data, NVME_IDENTIFY_DATA_SIZE); if (status) goto free_data; @@ -813,33 +814,33 @@ static int nvme_identify_ns_descs(struct nvme_ns *ns, unsigned nsid) switch (cur->nidt) { case NVME_NIDT_EUI64: if (cur->nidl != NVME_NIDT_EUI64_LEN) { - dev_warn(ns->ctrl->device, + dev_warn(ctrl->device, "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n", cur->nidl); goto free_data; } len = NVME_NIDT_EUI64_LEN; - memcpy(ns->eui, data + pos + sizeof(*cur), len); + memcpy(eui64, data + pos + sizeof(*cur), len); break; case NVME_NIDT_NGUID: if (cur->nidl != NVME_NIDT_NGUID_LEN) { - dev_warn(ns->ctrl->device, + dev_warn(ctrl->device, "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n", cur->nidl); goto free_data; } len = NVME_NIDT_NGUID_LEN; - memcpy(ns->nguid, data + pos + sizeof(*cur), len); + memcpy(nguid, data + pos + sizeof(*cur), len); break; case NVME_NIDT_UUID: if (cur->nidl != NVME_NIDT_UUID_LEN) { - dev_warn(ns->ctrl->device, + dev_warn(ctrl->device, "ctrl returned bogus length: %d for NVME_NIDT_UUID\n", cur->nidl); goto free_data; } len = NVME_NIDT_UUID_LEN; - uuid_copy(&ns->uuid, data + pos + sizeof(*cur)); + uuid_copy(uuid, data + pos + sizeof(*cur)); break; default: /* Skip unnkown types */ @@ -864,9 +865,10 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000); } -static int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, - struct nvme_id_ns **id) +static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl, + unsigned nsid) { + struct nvme_id_ns *id; struct nvme_command c = { }; int error; @@ -875,15 +877,18 @@ static int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, c.identify.nsid = cpu_to_le32(nsid); c.identify.cns = NVME_ID_CNS_NS; - *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); - if (!*id) - return -ENOMEM; + id = kmalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return NULL; - error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, - sizeof(struct nvme_id_ns)); - if (error) - kfree(*id); - return error; + error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + if (error) { + dev_warn(ctrl->device, "Identify namespace failed\n"); + kfree(id); + return NULL; + } + + return id; } static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, @@ -1174,32 +1179,21 @@ static void nvme_config_discard(struct nvme_ns *ns) blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX); } -static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) +static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned 
int nsid, + struct nvme_id_ns *id, u8 *eui64, u8 *nguid, uuid_t *uuid) { - if (nvme_identify_ns(ns->ctrl, ns->ns_id, id)) { - dev_warn(ns->ctrl->device, "Identify namespace failed\n"); - return -ENODEV; - } - - if ((*id)->ncap == 0) { - kfree(*id); - return -ENODEV; - } - - if (ns->ctrl->vs >= NVME_VS(1, 1, 0)) - memcpy(ns->eui, (*id)->eui64, sizeof(ns->eui)); - if (ns->ctrl->vs >= NVME_VS(1, 2, 0)) - memcpy(ns->nguid, (*id)->nguid, sizeof(ns->nguid)); - if (ns->ctrl->vs >= NVME_VS(1, 3, 0)) { + if (ctrl->vs >= NVME_VS(1, 1, 0)) + memcpy(eui64, id->eui64, sizeof(id->eui64)); + if (ctrl->vs >= NVME_VS(1, 2, 0)) + memcpy(nguid, id->nguid, sizeof(id->nguid)); + if (ctrl->vs >= NVME_VS(1, 3, 0)) { /* Don't treat error as fatal we potentially * already have a NGUID or EUI-64 */ - if (nvme_identify_ns_descs(ns, ns->ns_id)) - dev_warn(ns->ctrl->device, + if (nvme_identify_ns_descs(ctrl, nsid, eui64, nguid, uuid)) + dev_warn(ctrl->device, "%s: Identify Descriptors failed\n", __func__); } - - return 0; } static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) @@ -1240,22 +1234,28 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) static int nvme_revalidate_disk(struct gendisk *disk) { struct nvme_ns *ns = disk->private_data; - struct nvme_id_ns *id = NULL; - int ret; + struct nvme_ctrl *ctrl = ns->ctrl; + struct nvme_id_ns *id; + int ret = 0; if (test_bit(NVME_NS_DEAD, &ns->flags)) { set_capacity(disk, 0); return -ENODEV; } - ret = nvme_revalidate_ns(ns, &id); - if (ret) - return ret; + id = nvme_identify_ns(ctrl, ns->ns_id); + if (!id) + return -ENODEV; - __nvme_revalidate_disk(disk, id); + if (id->ncap == 0) { + ret = -ENODEV; + goto out; + } + + nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid); +out: kfree(id); - - return 0; + return ret; } static char nvme_pr_type(enum pr_type type) @@ -2361,9 +2361,15 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance); - if (nvme_revalidate_ns(ns, &id)) + id = nvme_identify_ns(ctrl, nsid); + if (!id) goto out_free_queue; + if (id->ncap == 0) + goto out_free_id; + + nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid); + if (nvme_nvm_ns_supported(ns, id) && nvme_nvm_register(ns, disk_name, node)) { dev_warn(ctrl->device, "%s: LightNVM init failure\n", __func__); From 1d5df6af8c7469f9ae3e66e7bed0782cfe4f95db Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Aug 2017 14:10:00 +0200 Subject: [PATCH 35/74] nvme: don't blindly overwrite identifiers on disk revalidate Instead validate that these identifiers do not change, as that is prohibited by the specification. 
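[Editor's note, illustration only] A sketch of the check the revalidate path gains (the hunk below is the authoritative version): the identifiers are read into local scratch variables and compared against the values recorded when the namespace was first probed, and any mismatch fails the revalidate instead of silently rebinding the disk to a different namespace identity.

    u8 eui64[8] = { 0 }, nguid[16] = { 0 };
    uuid_t uuid = uuid_null;

    nvme_report_ns_ids(ctrl, ns->ns_id, id, eui64, nguid, &uuid);
    if (!uuid_equal(&ns->uuid, &uuid) ||
        memcmp(ns->nguid, nguid, sizeof(ns->nguid)) ||
        memcmp(ns->eui, eui64, sizeof(ns->eui)))
            return -ENODEV;         /* identifiers must never change */
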
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/nvme/host/core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b87cf3a6e9ac..b0dd58db110e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1236,6 +1236,8 @@ static int nvme_revalidate_disk(struct gendisk *disk) struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_id_ns *id; + u8 eui64[8] = { 0 }, nguid[16] = { 0 }; + uuid_t uuid = uuid_null; int ret = 0; if (test_bit(NVME_NS_DEAD, &ns->flags)) { @@ -1252,7 +1254,15 @@ static int nvme_revalidate_disk(struct gendisk *disk) goto out; } - nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid); + nvme_report_ns_ids(ctrl, ns->ns_id, id, eui64, nguid, &uuid); + if (!uuid_equal(&ns->uuid, &uuid) || + memcmp(&ns->nguid, &nguid, sizeof(ns->nguid)) || + memcmp(&ns->eui, &eui64, sizeof(ns->eui))) { + dev_err(ctrl->device, + "identifiers changed for nsid %d\n", ns->ns_id); + ret = -ENODEV; + } + out: kfree(id); return ret; From 489beb91e66a237254e23ac1a0fe1beac23d87c5 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 29 Aug 2017 10:33:44 -0700 Subject: [PATCH 36/74] nvme-fabrics: Convert nvmf_transports_mutex to an rwsem The mutex protects against the list of transports changing while a controller is being created, but using a plain old mutex means that it also serializes controller creation. This unnecessarily slows down creating multiple controllers - for example for the RDMA transport, creating a controller involves establishing one connection for every IO queue, which involves even more network/software round trips, so the delay can become significant. The simplest way to fix this is to change the mutex to an rwsem and only hold it for writing when the list is being mutated. Since we can take the rwsem for reading while creating a controller, we can create multiple controllers in parallel. 
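[Editor's note, illustration only] The resulting locking pattern, sketched for clarity (it matches the hunks below): list mutation keeps exclusive access, while the long-running controller-creation path only takes the semaphore for reading, so multiple creations can proceed concurrently.

    /* writers: transport (un)registration still serializes list changes */
    down_write(&nvmf_transports_rwsem);
    list_add_tail(&ops->entry, &nvmf_transports);
    up_write(&nvmf_transports_rwsem);

    /* readers: controller creation only needs the list to stay stable */
    down_read(&nvmf_transports_rwsem);
    ops = nvmf_lookup_transport(opts);
    /* ... create the controller under the read lock ... */
    up_read(&nvmf_transports_rwsem);
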
Signed-off-by: Roland Dreier Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index fc3b6552f467..53a9598e3aee 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -22,7 +22,7 @@ #include "fabrics.h" static LIST_HEAD(nvmf_transports); -static DEFINE_MUTEX(nvmf_transports_mutex); +static DECLARE_RWSEM(nvmf_transports_rwsem); static LIST_HEAD(nvmf_hosts); static DEFINE_MUTEX(nvmf_hosts_mutex); @@ -495,9 +495,9 @@ int nvmf_register_transport(struct nvmf_transport_ops *ops) if (!ops->create_ctrl) return -EINVAL; - mutex_lock(&nvmf_transports_mutex); + down_write(&nvmf_transports_rwsem); list_add_tail(&ops->entry, &nvmf_transports); - mutex_unlock(&nvmf_transports_mutex); + up_write(&nvmf_transports_rwsem); return 0; } @@ -514,9 +514,9 @@ EXPORT_SYMBOL_GPL(nvmf_register_transport); */ void nvmf_unregister_transport(struct nvmf_transport_ops *ops) { - mutex_lock(&nvmf_transports_mutex); + down_write(&nvmf_transports_rwsem); list_del(&ops->entry); - mutex_unlock(&nvmf_transports_mutex); + up_write(&nvmf_transports_rwsem); } EXPORT_SYMBOL_GPL(nvmf_unregister_transport); @@ -525,7 +525,7 @@ static struct nvmf_transport_ops *nvmf_lookup_transport( { struct nvmf_transport_ops *ops; - lockdep_assert_held(&nvmf_transports_mutex); + lockdep_assert_held(&nvmf_transports_rwsem); list_for_each_entry(ops, &nvmf_transports, entry) { if (strcmp(ops->name, opts->transport) == 0) @@ -851,7 +851,7 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) goto out_free_opts; opts->mask &= ~NVMF_REQUIRED_OPTS; - mutex_lock(&nvmf_transports_mutex); + down_read(&nvmf_transports_rwsem); ops = nvmf_lookup_transport(opts); if (!ops) { pr_info("no handler found for transport %s.\n", @@ -878,16 +878,16 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) dev_warn(ctrl->device, "controller returned incorrect NQN: \"%s\".\n", ctrl->subnqn); - mutex_unlock(&nvmf_transports_mutex); + up_read(&nvmf_transports_rwsem); ctrl->ops->delete_ctrl(ctrl); return ERR_PTR(-EINVAL); } - mutex_unlock(&nvmf_transports_mutex); + up_read(&nvmf_transports_rwsem); return ctrl; out_unlock: - mutex_unlock(&nvmf_transports_mutex); + up_read(&nvmf_transports_rwsem); out_free_opts: nvmf_free_options(opts); return ERR_PTR(ret); From 1cad65620fecfb24cdeefa5533628a4f293783e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Aug 2017 17:46:01 -0400 Subject: [PATCH 37/74] nvme: factor metadata handling out of __nvme_submit_user_cmd Keep the metadata code in a separate helper instead of making the main function more complicated. 
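[Editor's note, illustration only] A brief sketch of the contract the new helper gives its caller, mirroring the hunk below: nvme_add_user_metadata() allocates the kernel metadata buffer, copies user data in for writes, attaches it to the bio as an integrity payload, and returns either the buffer or an ERR_PTR(), so the caller shrinks to a single error check.

    if (disk && meta_buffer && meta_len) {
            meta = nvme_add_user_metadata(bio, meta_buffer, meta_len,
                                          meta_seed, write);
            if (IS_ERR(meta)) {
                    ret = PTR_ERR(meta);
                    goto out_unmap;
            }
    }
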
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 76 +++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b0dd58db110e..a520c841c63c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -600,6 +600,40 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, } EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); +static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, + unsigned len, u32 seed, bool write) +{ + struct bio_integrity_payload *bip; + int ret = -ENOMEM; + void *buf; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + goto out; + + ret = -EFAULT; + if (write && copy_from_user(buf, ubuf, len)) + goto out_free_meta; + + bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); + if (IS_ERR(bip)) { + ret = PTR_ERR(bip); + goto out_free_meta; + } + + bip->bip_iter.bi_size = len; + bip->bip_iter.bi_sector = seed; + ret = bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)); + if (ret == len) + return buf; + ret = -ENOMEM; +out_free_meta: + kfree(buf); +out: + return ERR_PTR(ret); +} + int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, u32 meta_seed, @@ -625,46 +659,17 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, if (ret) goto out; bio = req->bio; - - if (!disk) - goto submit; bio->bi_disk = disk; - - if (meta_buffer && meta_len) { - struct bio_integrity_payload *bip; - - meta = kmalloc(meta_len, GFP_KERNEL); - if (!meta) { - ret = -ENOMEM; + if (disk && meta_buffer && meta_len) { + meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, + meta_seed, write); + if (IS_ERR(meta)) { + ret = PTR_ERR(meta); goto out_unmap; } - - if (write) { - if (copy_from_user(meta, meta_buffer, - meta_len)) { - ret = -EFAULT; - goto out_free_meta; - } - } - - bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); - if (IS_ERR(bip)) { - ret = PTR_ERR(bip); - goto out_free_meta; - } - - bip->bip_iter.bi_size = meta_len; - bip->bip_iter.bi_sector = meta_seed; - - ret = bio_integrity_add_page(bio, virt_to_page(meta), - meta_len, offset_in_page(meta)); - if (ret != meta_len) { - ret = -ENOMEM; - goto out_free_meta; - } } } - submit: + blk_execute_rq(req->q, disk, req, 0); if (nvme_req(req)->flags & NVME_REQ_CANCELLED) ret = -EINTR; @@ -676,7 +681,6 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, if (copy_to_user(meta_buffer, meta, meta_len)) ret = -EFAULT; } - out_free_meta: kfree(meta); out_unmap: if (bio) From b5d8af5b521bdb4167808979df37e9defabeb707 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 29 Aug 2017 17:46:02 -0400 Subject: [PATCH 38/74] nvme/pci: Use req_op to determine DIF remapping Only read and write commands need DIF remapping. Everything else uses a passthrough integrity payload. 
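[Editor's note, illustration only] A short note on the distinction this relies on, with an illustrative line (the real hunks follow): rq_data_dir() only reports transfer direction and also classifies data-out commands such as discard or write zeroes as writes, whereas req_op() names the specific operation, so only genuine reads and writes get their protection information remapped.

    if (req_op(req) == REQ_OP_WRITE)        /* not merely rq_data_dir(req) */
            nvme_dif_remap(req, nvme_dif_prep);
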
Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 544805a2421b..11874afb2422 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -668,7 +668,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1) goto out_unmap; - if (rq_data_dir(req)) + if (req_op(req) == REQ_OP_WRITE) nvme_dif_remap(req, nvme_dif_prep); if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir)) @@ -696,7 +696,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) if (iod->nents) { dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); if (blk_integrity_rq(req)) { - if (!rq_data_dir(req)) + if (req_op(req) == REQ_OP_READ) nvme_dif_remap(req, nvme_dif_complete); dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); } From 485783ca63e3a47fa5a30df8c6745a6299607e4d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 29 Aug 2017 17:46:03 -0400 Subject: [PATCH 39/74] nvme: Make nvme user functions static These functions are used only locally in the nvme core. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 10 +++++----- drivers/nvme/host/nvme.h | 7 ------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a520c841c63c..01f39a977f95 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -634,10 +634,10 @@ static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, return ERR_PTR(ret); } -int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, - void __user *ubuffer, unsigned bufflen, - void __user *meta_buffer, unsigned meta_len, u32 meta_seed, - u32 *result, unsigned timeout) +static int __nvme_submit_user_cmd(struct request_queue *q, + struct nvme_command *cmd, void __user *ubuffer, + unsigned bufflen, void __user *meta_buffer, unsigned meta_len, + u32 meta_seed, u32 *result, unsigned timeout) { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; @@ -690,7 +690,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, return ret; } -int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, +static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, u32 *result, unsigned timeout) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 936c4056d98e..a19a587d60ed 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -314,13 +314,6 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, unsigned timeout, int qid, int at_head, int flags); -int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, - void __user *ubuffer, unsigned bufflen, u32 *result, - unsigned timeout); -int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, - void __user *ubuffer, unsigned bufflen, - void __user *meta_buffer, unsigned meta_len, u32 meta_seed, - u32 *result, unsigned timeout); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void 
nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); From 63263d60e0f9f37bfd5e6a1e83a62f0e62fc459f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 29 Aug 2017 17:46:04 -0400 Subject: [PATCH 40/74] nvme: Use metadata for passthrough commands The ioctls' struct allows the user to provide a metadata address and length for a passthrough command. This patch uses these values that were previously ignored and deletes the now unused wrapper function. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 01f39a977f95..277a7a02cba5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -634,7 +634,7 @@ static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, return ERR_PTR(ret); } -static int __nvme_submit_user_cmd(struct request_queue *q, +static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, u32 meta_seed, u32 *result, unsigned timeout) @@ -690,14 +690,6 @@ static int __nvme_submit_user_cmd(struct request_queue *q, return ret; } -static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, - void __user *ubuffer, unsigned bufflen, u32 *result, - unsigned timeout) -{ - return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0, - result, timeout); -} - static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) { struct nvme_ctrl *ctrl = rq->end_io_data; @@ -987,7 +979,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.apptag = cpu_to_le16(io.apptag); c.rw.appmask = cpu_to_le16(io.appmask); - return __nvme_submit_user_cmd(ns->queue, &c, + return nvme_submit_user_cmd(ns->queue, &c, (void __user *)(uintptr_t)io.addr, length, metadata, meta_len, io.slba, NULL, 0); } @@ -1025,7 +1017,8 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, (void __user *)(uintptr_t)cmd.addr, cmd.data_len, - &cmd.result, timeout); + (void __user *)(uintptr_t)cmd.metadata, cmd.metadata, + 0, &cmd.result, timeout); if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) return -EFAULT; From 28dd5cf70aaac2a12a16847ae0a978f0b0575194 Mon Sep 17 00:00:00 2001 From: Omri Mann Date: Wed, 30 Aug 2017 15:22:59 +0300 Subject: [PATCH 41/74] nvmet: add support for reporting the host identifier And fix the Get/Set Log Page implementation to take all 8 bits of the feature identifier into account. 
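[Editor's note, illustration only] To make the new behaviour concrete, a sketch of the Get Features handling added below (field names are the ones in the hunk): the feature identifier now masks all 8 bits of CDW10, and the Host Identifier is only returned when the caller asked for the 128-bit form.

    switch (cdw10 & 0xff) {                 /* was 0xf: FIDs above 0x0f aliased */
    /* ... other features elided ... */
    case NVME_FEAT_HOST_ID:
            /* only the 128-bit host identifier (fabrics) is supported */
            if (!(req->cmd->common.cdw10[1] & cpu_to_le32(1 << 0))) {
                    status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
                    break;
            }
            status = nvmet_copy_to_sgl(req, 0, &req->sq->ctrl->hostid,
                                       sizeof(req->sq->ctrl->hostid));
            break;
    }
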
Signed-off-by: Omri Mann Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig [hch: used the UUID API, updated changelog] --- drivers/nvme/target/admin-cmd.c | 17 +++++++++++++++-- drivers/nvme/target/fabrics-cmd.c | 1 + drivers/nvme/target/nvmet.h | 1 + 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 9496c71d2257..c4a0bf36e752 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -443,7 +443,7 @@ static void nvmet_execute_set_features(struct nvmet_req *req) u32 val32; u16 status = 0; - switch (cdw10 & 0xf) { + switch (cdw10 & 0xff) { case NVME_FEAT_NUM_QUEUES: nvmet_set_result(req, (subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16)); @@ -453,6 +453,9 @@ static void nvmet_execute_set_features(struct nvmet_req *req) req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000); nvmet_set_result(req, req->sq->ctrl->kato); break; + case NVME_FEAT_HOST_ID: + status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + break; default: status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; @@ -467,7 +470,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req) u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); u16 status = 0; - switch (cdw10 & 0xf) { + switch (cdw10 & 0xff) { /* * These features are mandatory in the spec, but we don't * have a useful way to implement them. We'll eventually @@ -501,6 +504,16 @@ static void nvmet_execute_get_features(struct nvmet_req *req) case NVME_FEAT_KATO: nvmet_set_result(req, req->sq->ctrl->kato * 1000); break; + case NVME_FEAT_HOST_ID: + /* need 128-bit host identifier flag */ + if (!(req->cmd->common.cdw10[1] & cpu_to_le32(1 << 0))) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + status = nvmet_copy_to_sgl(req, 0, &req->sq->ctrl->hostid, + sizeof(req->sq->ctrl->hostid)); + break; default: status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 3cc17269504b..859a66725291 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -154,6 +154,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) le32_to_cpu(c->kato), &ctrl); if (status) goto out; + uuid_copy(&ctrl->hostid, &d->hostid); status = nvmet_install_queue(ctrl, req); if (status) { diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index e3b244c7e443..7d261ab894f4 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -115,6 +115,7 @@ struct nvmet_ctrl { u32 cc; u32 csts; + uuid_t hostid; u16 cntlid; u32 kato; From 80294c3bbf3ceb20530ee4aa44bbaf354222b021 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 31 Aug 2017 08:46:29 +0200 Subject: [PATCH 42/74] block, bfq: make lookup_next_entity push up vtime on expirations To provide a very smooth service, bfq starts to serve a bfq_queue only if the queue is 'eligible', i.e., if the same queue would have started to be served in the ideal, perfectly fair system that bfq simulates internally. This is obtained by associating each queue with a virtual start time, and by computing a special system virtual time quantity: a queue is eligible only if the system virtual time has reached the virtual start time of the queue. Finally, bfq guarantees that, when a new queue must be set in service, there is always at least one eligible entity for each active parent entity in the scheduler. 
To provide this guarantee, the function __bfq_lookup_next_entity pushes up, for each parent entity on which it is invoked, the system virtual time to the minimum among the virtual start times of the entities in the active tree for the parent entity (more precisely, the push up occurs if the system virtual time happens to be lower than all such virtual start times). There is however a circumstance in which __bfq_lookup_next_entity cannot push up the system virtual time for a parent entity, even if the system virtual time is lower than the virtual start times of all the child entities in the active tree. It happens if one of the child entities is in service. In fact, in such a case, there is already an eligible entity, the in-service one, even if it may not be not present in the active tree (because in-service entities may be removed from the active tree). Unfortunately, in the last re-design of the hierarchical-scheduling engine, the reset of the pointer to the in-service entity for a given parent entity--reset to be done as a consequence of the expiration of the in-service entity--always happens after the function __bfq_lookup_next_entity has been invoked. This causes the function to think that there is still an entity in service for the parent entity, and then that the system virtual time cannot be pushed up, even if actually such a no-more-in-service entity has already been properly reinserted into the active tree (or in some other tree if no more active). Yet, the system virtual time *had* to be pushed up, to be ready to correctly choose the next queue to serve. Because of the lack of this push up, bfq may wrongly set in service a queue that had been speculatively pre-computed as the possible next-in-service queue, but that would no more be the one to serve after the expiration and the reinsertion into the active trees of the previously in-service entities. This commit addresses this issue by making __bfq_lookup_next_entity properly push up the system virtual time if an expiration is occurring. Signed-off-by: Paolo Valente Tested-by: Lee Tibbert Tested-by: Oleksandr Natalenko Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 4 ++-- block/bfq-iosched.h | 4 ++-- block/bfq-wf2q.c | 58 +++++++++++++++++++++++++++++++++------------ 3 files changed, 47 insertions(+), 19 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 79484469c2f7..8f3d22330d64 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -720,7 +720,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, entity->budget = new_budget; bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); - bfq_requeue_bfqq(bfqd, bfqq); + bfq_requeue_bfqq(bfqd, bfqq, false); } } @@ -2563,7 +2563,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_del_bfqq_busy(bfqd, bfqq, true); } else { - bfq_requeue_bfqq(bfqd, bfqq); + bfq_requeue_bfqq(bfqd, bfqq, true); /* * Resort priority tree of potential close cooperators. 
*/ diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index cc4ea8574483..ac0809c72c98 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -817,7 +817,6 @@ extern const int bfq_timeout; struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync); void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync); struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); -void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity, struct rb_root *root); @@ -917,7 +916,8 @@ void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd); void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool ins_into_idle_tree, bool expiration); void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); +void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool expiration); void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool expiration); void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 911aa7431dbe..5f26cc21fdd1 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -44,7 +44,8 @@ static unsigned int bfq_class_idx(struct bfq_entity *entity) BFQ_DEFAULT_GRP_CLASS - 1; } -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + bool expiration); static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); @@ -54,6 +55,8 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); * @new_entity: if not NULL, pointer to the entity whose activation, * requeueing or repositionig triggered the invocation of * this function. + * @expiration: id true, this function is being invoked after the + * expiration of the in-service entity * * This function is called to update sd->next_in_service, which, in * its turn, may change as a consequence of the insertion or @@ -72,7 +75,8 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); * entity. 
*/ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *new_entity) + struct bfq_entity *new_entity, + bool expiration) { struct bfq_entity *next_in_service = sd->next_in_service; bool parent_sched_may_change = false; @@ -130,7 +134,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, if (replace_next) next_in_service = new_entity; } else /* invoked because of a deactivation: lookup needed */ - next_in_service = bfq_lookup_next_entity(sd); + next_in_service = bfq_lookup_next_entity(sd, expiration); if (next_in_service) { parent_sched_may_change = !sd->next_in_service || @@ -1127,10 +1131,12 @@ static void __bfq_activate_requeue_entity(struct bfq_entity *entity, * @requeue: true if this is a requeue, which implies that bfqq is * being expired; thus ALL its ancestors stop being served and must * therefore be requeued + * @expiration: true if this function is being invoked in the expiration path + * of the in-service queue */ static void bfq_activate_requeue_entity(struct bfq_entity *entity, bool non_blocking_wait_rq, - bool requeue) + bool requeue, bool expiration) { struct bfq_sched_data *sd; @@ -1138,7 +1144,8 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity, sd = entity->sched_data; __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); - if (!bfq_update_next_in_service(sd, entity) && !requeue) + if (!bfq_update_next_in_service(sd, entity, expiration) && + !requeue) break; } } @@ -1194,6 +1201,8 @@ bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree) * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. * @entity: the entity to deactivate. * @ins_into_idle_tree: true if the entity can be put into the idle tree + * @expiration: true if this function is being invoked in the expiration path + * of the in-service queue */ static void bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree, @@ -1222,7 +1231,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, * then, since entity has just been * deactivated, a new one must be found. */ - bfq_update_next_in_service(sd, NULL); + bfq_update_next_in_service(sd, NULL, expiration); if (sd->next_in_service || sd->in_service_entity) { /* @@ -1281,7 +1290,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, __bfq_requeue_entity(entity); sd = entity->sched_data; - if (!bfq_update_next_in_service(sd, entity) && + if (!bfq_update_next_in_service(sd, entity, expiration) && !expiration) /* * next_in_service unchanged or not causing @@ -1416,12 +1425,14 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) /** * bfq_lookup_next_entity - return the first eligible entity in @sd. * @sd: the sched_data. + * @expiration: true if we are on the expiration path of the in-service queue * * This function is invoked when there has been a change in the trees - * for sd, and we need know what is the new next entity after this - * change. + * for sd, and we need to know what is the new next entity to serve + * after this change. */ -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + bool expiration) { struct bfq_service_tree *st = sd->service_tree; struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); @@ -1448,8 +1459,24 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) * class, unless the idle class needs to be served. 
*/ for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { + /* + * If expiration is true, then bfq_lookup_next_entity + * is being invoked as a part of the expiration path + * of the in-service queue. In this case, even if + * sd->in_service_entity is not NULL, + * sd->in_service_entiy at this point is actually not + * in service any more, and, if needed, has already + * been properly queued or requeued into the right + * tree. The reason why sd->in_service_entity is still + * not NULL here, even if expiration is true, is that + * sd->in_service_entiy is reset as a last step in the + * expiration path. So, if expiration is true, tell + * __bfq_lookup_next_entity that there is no + * sd->in_service_entity. + */ entity = __bfq_lookup_next_entity(st + class_idx, - sd->in_service_entity); + sd->in_service_entity && + !expiration); if (entity) break; @@ -1562,7 +1589,7 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) for_each_entity(entity) { struct bfq_sched_data *sd = entity->sched_data; - if (!bfq_update_next_in_service(sd, NULL)) + if (!bfq_update_next_in_service(sd, NULL, false)) break; } @@ -1610,16 +1637,17 @@ void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) struct bfq_entity *entity = &bfqq->entity; bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), - false); + false, false); bfq_clear_bfqq_non_blocking_wait_rq(bfqq); } -void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool expiration) { struct bfq_entity *entity = &bfqq->entity; bfq_activate_requeue_entity(entity, false, - bfqq == bfqd->in_service_queue); + bfqq == bfqd->in_service_queue, expiration); } /* From a02195ce864069c3aa41da792364bdfb3cdbc39f Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 31 Aug 2017 08:46:30 +0200 Subject: [PATCH 43/74] block, bfq: remove direct switch to an entity in higher class If the function bfq_update_next_in_service is invoked as a consequence of the activation or requeueing of an entity, say E, and finds out that E belongs to a higher-priority class than that of the current next-in-service entity, then it sets next_in_service directly to E. But this may lead to anomalous schedules, because E may happen not be eligible for service, because its virtual start time is higher than the system virtual time for its service tree. This commit addresses this issue by simply removing this direct switch. Signed-off-by: Paolo Valente Tested-by: Lee Tibbert Tested-by: Oleksandr Natalenko Signed-off-by: Jens Axboe --- block/bfq-wf2q.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 5f26cc21fdd1..86a928d05507 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -86,9 +86,8 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, * or repositiong of an entity that does not coincide with * sd->next_in_service, then a full lookup in the active tree * can be avoided. In fact, it is enough to check whether the - * just-modified entity has a higher priority than - * sd->next_in_service, or, even if it has the same priority - * as sd->next_in_service, is eligible and has a lower virtual + * just-modified entity has the same priority as + * sd->next_in_service, is eligible and has a lower virtual * finish time than sd->next_in_service. If this compound * condition holds, then the new entity becomes the new * next_in_service. Otherwise no change is needed. 
@@ -104,9 +103,8 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, /* * If there is already a next_in_service candidate - * entity, then compare class priorities or timestamps - * to decide whether to replace sd->service_tree with - * new_entity. + * entity, then compare timestamps to decide whether + * to replace sd->service_tree with new_entity. */ if (next_in_service) { unsigned int new_entity_class_idx = @@ -114,10 +112,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, struct bfq_service_tree *st = sd->service_tree + new_entity_class_idx; - /* - * For efficiency, evaluate the most likely - * sub-condition first. - */ replace_next = (new_entity_class_idx == bfq_class_idx(next_in_service) @@ -125,10 +119,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, !bfq_gt(new_entity->start, st->vtime) && bfq_gt(next_in_service->finish, - new_entity->finish)) - || - new_entity_class_idx < - bfq_class_idx(next_in_service); + new_entity->finish)); } if (replace_next) From 24d90bb2f3251a8cb63b65e8eb61713d6395362c Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 31 Aug 2017 08:46:31 +0200 Subject: [PATCH 44/74] block, bfq: guarantee update_next_in_service always returns an eligible entity If the function bfq_update_next_in_service is invoked as a consequence of the activation or requeueing of an entity, say E, then it doesn't invoke bfq_lookup_next_entity to get the next-in-service entity. In contrast, it follows a shorter path: if E happens to be eligible (see commit "bfq-sq-mq: make lookup_next_entity push up vtime on expirations" for details on eligibility) and to have a lower virtual finish time than the current candidate as next-in-service entity, then E directly becomes the next-in-service entity. Unfortunately, there is a corner case for which this shorter path makes bfq_update_next_in_service choose a non eligible entity: it occurs if both E and the current next-in-service entity happen to be non eligible when bfq_update_next_in_service is invoked. In this case, E is not set as next-in-service, and, since bfq_lookup_next_entity is not invoked, the state of the parent entity is not updated so as to end up with an eligible entity as the proper next-in-service entity. In this respect, next-in-service is actually allowed to be non eligible while some queue is in service: since no system-virtual-time push-up can be performed in that case (see again commit "bfq-sq-mq: make lookup_next_entity push up vtime on expirations" for details), next-in-service is chosen, speculatively, as a function of the possible value that the system virtual time may get after a push up. But the correctness of the schedule breaks if next-in-service is still a non eligible entity when it is time to set in service the next entity. Unfortunately, this may happen in the above corner case. This commit fixes this problem by making bfq_update_next_in_service invoke bfq_lookup_next_entity not only if the above shorter path cannot be taken, but also if the shorter path is taken but fails to yield an eligible next-in-service entity. 
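To visualize the resulting control flow, here is a condensed sketch of what bfq_update_next_in_service() looks like after this change (the hunk below is authoritative; eligible_and_earlier() is only a placeholder for the class/eligibility/finish-time comparison done in the real code):

	/*
	 * Sketch only: the short path may pick new_entity directly; a full
	 * tree lookup is performed whenever that path is not taken or does
	 * not yield a candidate.
	 */
	bool change_without_lookup = false;

	if (new_entity && new_entity != sd->next_in_service) {
		/* short path: eligible entity with an earlier virtual finish time */
		change_without_lookup = eligible_and_earlier(new_entity, next_in_service, sd);
		if (change_without_lookup)
			next_in_service = new_entity;
	}

	if (!change_without_lookup)	/* short path not taken or not conclusive */
		next_in_service = bfq_lookup_next_entity(sd, expiration);
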
Signed-off-by: Paolo Valente Tested-by: Lee Tibbert Tested-by: Oleksandr Natalenko Signed-off-by: Jens Axboe --- block/bfq-wf2q.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 86a928d05507..414ba686a847 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -80,6 +80,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, { struct bfq_entity *next_in_service = sd->next_in_service; bool parent_sched_may_change = false; + bool change_without_lookup = false; /* * If this update is triggered by the activation, requeueing @@ -99,7 +100,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, * set to true, and left as true if * sd->next_in_service is NULL. */ - bool replace_next = true; + change_without_lookup = true; /* * If there is already a next_in_service candidate @@ -112,7 +113,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, struct bfq_service_tree *st = sd->service_tree + new_entity_class_idx; - replace_next = + change_without_lookup = (new_entity_class_idx == bfq_class_idx(next_in_service) && @@ -122,15 +123,16 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, new_entity->finish)); } - if (replace_next) + if (change_without_lookup) next_in_service = new_entity; - } else /* invoked because of a deactivation: lookup needed */ + } + + if (!change_without_lookup) /* lookup needed */ next_in_service = bfq_lookup_next_entity(sd, expiration); - if (next_in_service) { + if (next_in_service) parent_sched_may_change = !sd->next_in_service || bfq_update_parent_budget(next_in_service); - } sd->next_in_service = next_in_service; From 8a0740c4109d646d8697d359962edea47301c652 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 24 Aug 2017 00:03:41 -0700 Subject: [PATCH 45/74] loop: get rid of lo_blocksize This is only used for setting the soft block size on the struct block_device once and then never used again. Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/loop.c | 10 ++-------- drivers/block/loop.h | 1 - 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 407cb172d6e3..efad2d46a018 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -573,8 +573,6 @@ static void do_loop_switch(struct loop_device *lo, struct switch_request *p) mapping = file->f_mapping; mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); lo->lo_backing_file = file; - lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? - mapping->host->i_bdev->bd_block_size : PAGE_SIZE; lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); loop_update_dio(lo); @@ -867,7 +865,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, struct file *file, *f; struct inode *inode; struct address_space *mapping; - unsigned lo_blocksize; int lo_flags = 0; int error; loff_t size; @@ -911,9 +908,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, !file->f_op->write_iter) lo_flags |= LO_FLAGS_READ_ONLY; - lo_blocksize = S_ISBLK(inode->i_mode) ? 
- inode->i_bdev->bd_block_size : PAGE_SIZE; - error = -EFBIG; size = get_loop_size(lo, file); if ((loff_t)(sector_t)size != size) @@ -927,7 +921,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0); lo->use_dio = false; - lo->lo_blocksize = lo_blocksize; lo->lo_device = bdev; lo->lo_flags = lo_flags; lo->lo_backing_file = file; @@ -947,7 +940,8 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, /* let user-space know about the new size */ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); - set_blocksize(bdev, lo_blocksize); + set_blocksize(bdev, S_ISBLK(inode->i_mode) ? + block_size(inode->i_bdev) : PAGE_SIZE); lo->lo_state = Lo_bound; if (part_shift) diff --git a/drivers/block/loop.h b/drivers/block/loop.h index fecd3f97ef8c..efe57189d01e 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -48,7 +48,6 @@ struct loop_device { struct file * lo_backing_file; struct block_device *lo_device; - unsigned lo_blocksize; void *key_data; gfp_t old_gfp_mask; From 6c6b6f28b3335fd85ec833ee0005d9c9dca6c003 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 24 Aug 2017 00:03:42 -0700 Subject: [PATCH 46/74] loop: set physical block size to PAGE_SIZE The physical block size is "the lowest possible sector size that the hardware can operate on without reverting to read-modify-write operations" (from the comment on blk_queue_physical_block_size()). Since loop does buffered I/O on the backing file by default, the RMW unit is a page. This isn't the case for direct I/O mode, but let's keep it simple. Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index efad2d46a018..e3f190016d4f 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1764,6 +1764,8 @@ static int loop_add(struct loop_device **l, int i) } lo->lo_queue->queuedata = lo; + blk_queue_physical_block_size(lo->lo_queue, PAGE_SIZE); + /* * It doesn't make sense to enable merge because the I/O * submitted to backing file is handled page by page. From 89e4fdecb51cf5535867026274bc97de9480ade5 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 24 Aug 2017 00:03:43 -0700 Subject: [PATCH 47/74] loop: add ioctl for changing logical block size This is a different approach from the first attempt in f2c6df7dbf9a ("loop: support 4k physical blocksize"). Rather than extending LOOP_{GET,SET}_STATUS, add a separate ioctl just for setting the block size. 
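As a usage illustration (not part of the patch), a userspace program could set a 4 KB logical block size on an already-bound loop device roughly as follows; /dev/loop0 is an arbitrary example, and the value must be a power of two between 512 and PAGE_SIZE or the ioctl fails with -EINVAL:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/loop.h>		/* provides LOOP_SET_BLOCK_SIZE after this patch */

	int main(void)
	{
		int fd = open("/dev/loop0", O_RDWR);	/* example device, must already be bound */

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (ioctl(fd, LOOP_SET_BLOCK_SIZE, 4096UL) < 0)
			perror("LOOP_SET_BLOCK_SIZE");
		close(fd);
		return 0;
	}

Note that, per the lo_ioctl() hunk, the caller needs the device opened for writing or CAP_SYS_ADMIN.
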
Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/loop.c | 24 ++++++++++++++++++++++++ include/uapi/linux/loop.h | 1 + 2 files changed, 25 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index e3f190016d4f..ac106b287d75 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1047,6 +1047,7 @@ static int loop_clr_fd(struct loop_device *lo) memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); + blk_queue_logical_block_size(lo->lo_queue, 512); if (bdev) { bdput(bdev); invalidate_bdev(bdev); @@ -1330,6 +1331,24 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg) return error; } +static int loop_set_block_size(struct loop_device *lo, unsigned long arg) +{ + if (lo->lo_state != Lo_bound) + return -ENXIO; + + if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg)) + return -EINVAL; + + blk_mq_freeze_queue(lo->lo_queue); + + blk_queue_logical_block_size(lo->lo_queue, arg); + loop_update_dio(lo); + + blk_mq_unfreeze_queue(lo->lo_queue); + + return 0; +} + static int lo_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1378,6 +1397,11 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) err = loop_set_dio(lo, arg); break; + case LOOP_SET_BLOCK_SIZE: + err = -EPERM; + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + err = loop_set_block_size(lo, arg); + break; default: err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL; } diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h index c8125ec1f4f2..23158dbe2424 100644 --- a/include/uapi/linux/loop.h +++ b/include/uapi/linux/loop.h @@ -88,6 +88,7 @@ struct loop_info64 { #define LOOP_CHANGE_FD 0x4C06 #define LOOP_SET_CAPACITY 0x4C07 #define LOOP_SET_DIRECT_IO 0x4C08 +#define LOOP_SET_BLOCK_SIZE 0x4C09 /* /dev/loop-control interface */ #define LOOP_CTL_ADD 0x4C80 From 43cade803ebeb002403d4b704e041ce800e5b0e1 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 24 Aug 2017 00:03:44 -0700 Subject: [PATCH 48/74] loop: fold loop_switch() into callers The comments here are really outdated, and blk-mq made flushing much simpler, so just fold the two cases into the callers. 
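With blk-mq, the "flush without switching" case that loop_flush() used to handle reduces to freezing and immediately unfreezing the request queue, which waits for all in-flight requests; the pattern used in the lo_release() hunk below is simply:

	blk_mq_freeze_queue(lo->lo_queue);	/* wait for outstanding requests */
	blk_mq_unfreeze_queue(lo->lo_queue);
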
Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/loop.c | 76 +++++++------------------------------------- 1 file changed, 11 insertions(+), 65 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ac106b287d75..f6c204f62b1e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -546,72 +546,12 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) } } -struct switch_request { - struct file *file; - struct completion wait; -}; - static inline void loop_update_dio(struct loop_device *lo) { __loop_update_dio(lo, io_is_direct(lo->lo_backing_file) | lo->use_dio); } -/* - * Do the actual switch; called from the BIO completion routine - */ -static void do_loop_switch(struct loop_device *lo, struct switch_request *p) -{ - struct file *file = p->file; - struct file *old_file = lo->lo_backing_file; - struct address_space *mapping; - - /* if no new file, only flush of queued bios requested */ - if (!file) - return; - - mapping = file->f_mapping; - mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); - lo->lo_backing_file = file; - lo->old_gfp_mask = mapping_gfp_mask(mapping); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); - loop_update_dio(lo); -} - -/* - * loop_switch performs the hard work of switching a backing store. - * First it needs to flush existing IO, it does this by sending a magic - * BIO down the pipe. The completion of this BIO does the actual switch. - */ -static int loop_switch(struct loop_device *lo, struct file *file) -{ - struct switch_request w; - - w.file = file; - - /* freeze queue and wait for completion of scheduled requests */ - blk_mq_freeze_queue(lo->lo_queue); - - /* do the switch action */ - do_loop_switch(lo, &w); - - /* unfreeze */ - blk_mq_unfreeze_queue(lo->lo_queue); - - return 0; -} - -/* - * Helper to flush the IOs in loop, but keeping loop thread running - */ -static int loop_flush(struct loop_device *lo) -{ - /* loop not yet configured, no running thread, nothing to flush */ - if (lo->lo_state != Lo_bound) - return 0; - return loop_switch(lo, NULL); -} - static void loop_reread_partitions(struct loop_device *lo, struct block_device *bdev) { @@ -676,9 +616,14 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, goto out_putf; /* and ... switch */ - error = loop_switch(lo, file); - if (error) - goto out_putf; + blk_mq_freeze_queue(lo->lo_queue); + mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); + lo->lo_backing_file = file; + lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); + mapping_set_gfp_mask(file->f_mapping, + lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); + loop_update_dio(lo); + blk_mq_unfreeze_queue(lo->lo_queue); fput(old_file); if (lo->lo_flags & LO_FLAGS_PARTSCAN) @@ -1601,12 +1546,13 @@ static void lo_release(struct gendisk *disk, fmode_t mode) err = loop_clr_fd(lo); if (!err) return; - } else { + } else if (lo->lo_state == Lo_bound) { /* * Otherwise keep thread (if running) and config, * but flush possible ongoing bios in thread. 
*/ - loop_flush(lo); + blk_mq_freeze_queue(lo->lo_queue); + blk_mq_unfreeze_queue(lo->lo_queue); } mutex_unlock(&lo->lo_ctl_mutex); From 233f0bf415e28adca96f61289e424ce4cfa9a9c0 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 31 Aug 2017 20:00:30 +0200 Subject: [PATCH 49/74] doc, block, bfq: fix some typos and remove stale stuff In addition to containing some typos and stale sentences, the file bfq-iosched.txt still mentioned a set of sysfs parameters that have been removed from this version of bfq. This commit fixes all these issues. Signed-off-by: Paolo Valente Reviewed-by: Jeremy Hickman Reviewed-by: Laurentiu Nicola Signed-off-by: Jens Axboe --- Documentation/block/bfq-iosched.txt | 66 +++++------------------------ 1 file changed, 10 insertions(+), 56 deletions(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 05e2822a80b3..03ff4cc79911 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -16,14 +16,16 @@ throughput. So, when needed for achieving a lower latency, BFQ builds schedules that may lead to a lower throughput. If your main or only goal, for a given device, is to achieve the maximum-possible throughput at all times, then do switch off all low-latency heuristics -for that device, by setting low_latency to 0. Full details in Section 3. +for that device, by setting low_latency to 0. See Section 3 for +details on how to configure BFQ for the desired tradeoff between +latency and throughput, or on how to maximize throughput. On average CPUs, the current version of BFQ can handle devices performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a reference, 30-50 KIOPS correspond to very high bandwidths with sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and -to 120-200 MB/s with 4KB random I/O. BFQ has not yet been tested on -multi-queue devices. +to 120-200 MB/s with 4KB random I/O. BFQ is currently being tested on +multi-queue devices too. The table of contents follow. Impatients can just jump to Section 3. @@ -154,10 +156,10 @@ plus a lot of code, are borrowed from CFQ. - With respect to idling for service guarantees, if several processes are competing for the device at the same time, but - all processes (and groups, after the following commit) have - the same weight, then BFQ guarantees the expected throughput - distribution without ever idling the device. Throughput is - thus as high as possible in this common scenario. + all processes and groups have the same weight, then BFQ + guarantees the expected throughput distribution without ever + idling the device. Throughput is thus as high as possible in + this common scenario. - If low-latency mode is enabled (default configuration), BFQ executes some special heuristics to detect interactive and soft @@ -191,10 +193,7 @@ plus a lot of code, are borrowed from CFQ. - Queues are scheduled according to a variant of WF2Q+, named B-WF2Q+, and implemented using an augmented rb-tree to preserve an O(log N) overall complexity. See [2] for more details. B-WF2Q+ is - also ready for hierarchical scheduling. However, for a cleaner - logical breakdown, the code that enables and completes - hierarchical support is provided in the next commit, which focuses - exactly on this feature. + also ready for hierarchical scheduling, details in Section 4. - B-WF2Q+ guarantees a tight deviation with respect to an ideal, perfectly fair, and smooth service. 
In particular, B-WF2Q+ @@ -427,51 +426,6 @@ Read-only parameter, used to show the weights of the currently active BFQ queues. -wr_ tunables ------------- - -BFQ exports a few parameters to control/tune the behavior of -low-latency heuristics. - -wr_coeff - -Factor by which the weight of a weight-raised queue is multiplied. If -the queue is deemed soft real-time, then the weight is further -multiplied by an additional, constant factor. - -wr_max_time - -Maximum duration of a weight-raising period for an interactive task -(ms). If set to zero (default value), then this value is computed -automatically, as a function of the peak rate of the device. In any -case, when the value of this parameter is read, it always reports the -current duration, regardless of whether it has been set manually or -computed automatically. - -wr_max_softrt_rate - -Maximum service rate below which a queue is deemed to be associated -with a soft real-time application, and is then weight-raised -accordingly (sectors/sec). - -wr_min_idle_time - -Minimum idle period after which interactive weight-raising may be -reactivated for a queue (in ms). - -wr_rt_max_time - -Maximum weight-raising duration for soft real-time queues (in ms). The -start time from which this duration is considered is automatically -moved forward if the queue is detected to be still soft real-time -before the current soft real-time weight-raising period finishes. - -wr_min_inter_arr_async - -Minimum period between I/O request arrivals after which weight-raising -may be reactivated for an already busy async queue (in ms). - - 4. Group scheduling with BFQ ============================ From 2670cd1674055ab48a9607472c5ff14781b9b2ea Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 31 Aug 2017 20:00:31 +0200 Subject: [PATCH 50/74] doc, block, bfq: better describe how to properly configure bfq Many users have reported the lack of an HOWTO for properly configuring bfq as a function of the goal one wants to achieve (max responsiveness, max throughput, ...). In fact, all needed details are already provided in the documentation file bfq-iosched.txt. Yet the document lacks guidance on which parameter descriptions to look at. This commit adds some simple direction. Signed-off-by: Paolo Valente Reviewed-by: Jeremy Hickman Reviewed-by: Laurentiu Nicola Signed-off-by: Jens Axboe --- Documentation/block/bfq-iosched.txt | 78 ++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 03ff4cc79911..3d6951d63489 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -35,7 +35,7 @@ CONTENTS 1-1 Personal systems 1-2 Server systems 2. How does BFQ work? -3. What are BFQ's tunable? +3. What are BFQ's tunables and how to properly configure BFQ? 4. BFQ group scheduling 4-1 Service guarantees provided 4-2 Interface @@ -147,12 +147,12 @@ plus a lot of code, are borrowed from CFQ. contrast, BFQ may idle the device for a short time interval, giving the process the chance to go on being served if it issues a new request in time. Device idling typically boosts the - throughput on rotational devices, if processes do synchronous - and sequential I/O. In addition, under BFQ, device idling is - also instrumental in guaranteeing the desired throughput - fraction to processes issuing sync requests (see the description - of the slice_idle tunable in this document, or [1, 2], for more - details). 
+ throughput on rotational devices and on non-queueing flash-based + devices, if processes do synchronous and sequential I/O. In + addition, under BFQ, device idling is also instrumental in + guaranteeing the desired throughput fraction to processes + issuing sync requests (see the description of the slice_idle + tunable in this document, or [1, 2], for more details). - With respect to idling for service guarantees, if several processes are competing for the device at the same time, but @@ -161,6 +161,15 @@ plus a lot of code, are borrowed from CFQ. idling the device. Throughput is thus as high as possible in this common scenario. + - On flash-based storage with internal queueing of commands + (typically NCQ), device idling happens to be always detrimental + for throughput. So, with these devices, BFQ performs idling + only when strictly needed for service guarantees, i.e., for + guaranteeing low latency or fairness. In these cases, overall + throughput may be sub-optimal. No solution currently exists to + provide both strong service guarantees and optimal throughput + on devices with internal queueing. + - If low-latency mode is enabled (default configuration), BFQ executes some special heuristics to detect interactive and soft real-time applications (e.g., video or audio players/streamers), @@ -248,13 +257,24 @@ plus a lot of code, are borrowed from CFQ. the Idle class, to prevent it from starving. -3. What are BFQ's tunable? -========================== +3. What are BFQ's tunables and how to properly configure BFQ? +============================================================= -The tunables back_seek-max, back_seek_penalty, fifo_expire_async and -fifo_expire_sync below are the same as in CFQ. Their description is -just copied from that for CFQ. Some considerations in the description -of slice_idle are copied from CFQ too. +Most BFQ tunables affect service guarantees (basically latency and +fairness) and throughput. For full details on how to choose the +desired tradeoff between service guarantees and throughput, see the +parameters slice_idle, strict_guarantees and low_latency. For details +on how to maximise throughput, see slice_idle, timeout_sync and +max_budget. The other performance-related parameters have been +inherited from, and have been preserved mostly for compatibility with +CFQ. So far, no performance improvement has been reported after +changing the latter parameters in BFQ. + +In particular, the tunables back_seek-max, back_seek_penalty, +fifo_expire_async and fifo_expire_sync below are the same as in +CFQ. Their description is just copied from that for CFQ. Some +considerations in the description of slice_idle are copied from CFQ +too. per-process ioprio and weight ----------------------------- @@ -284,15 +304,17 @@ number of seeks and see improved throughput. Setting slice_idle to 0 will remove all the idling on queues and one should see an overall improved throughput on faster storage devices -like multiple SATA/SAS disks in hardware RAID configuration. +like multiple SATA/SAS disks in hardware RAID configuration, as well +as flash-based storage with internal command queueing (and +parallelism). So depending on storage and workload, it might be useful to set slice_idle=0. In general for SATA/SAS disks and software RAID of SATA/SAS disks keeping slice_idle enabled should be useful. 
For any configurations where there are multiple spindles behind single LUN -(Host based hardware RAID controller or for storage arrays), setting -slice_idle=0 might end up in better throughput and acceptable -latencies. +(Host based hardware RAID controller or for storage arrays), or with +flash-based fast storage, setting slice_idle=0 might end up in better +throughput and acceptable latencies. Idling is however necessary to have service guarantees enforced in case of differentiated weights or differentiated I/O-request lengths. @@ -311,13 +333,14 @@ There is an important flipside for idling: apart from the above cases where it is beneficial also for throughput, idling can severely impact throughput. One important case is random workload. Because of this issue, BFQ tends to avoid idling as much as possible, when it is not -beneficial also for throughput. As a consequence of this behavior, and -of further issues described for the strict_guarantees tunable, -short-term service guarantees may be occasionally violated. And, in -some cases, these guarantees may be more important than guaranteeing -maximum throughput. For example, in video playing/streaming, a very -low drop rate may be more important than maximum throughput. In these -cases, consider setting the strict_guarantees parameter. +beneficial also for throughput (as detailed in Section 2). As a +consequence of this behavior, and of further issues described for the +strict_guarantees tunable, short-term service guarantees may be +occasionally violated. And, in some cases, these guarantees may be +more important than guaranteeing maximum throughput. For example, in +video playing/streaming, a very low drop rate may be more important +than maximum throughput. In these cases, consider setting the +strict_guarantees parameter. strict_guarantees ----------------- @@ -419,6 +442,13 @@ The default value is 0, which enables auto-tuning: BFQ sets max_budget to the maximum number of sectors that can be served during timeout_sync, according to the estimated peak rate. +For specific devices, some users have occasionally reported to have +reached a higher throughput by setting max_budget explicitly, i.e., by +setting max_budget to a higher value than 0. In particular, they have +set max_budget to higher values than those to which BFQ would have set +it with auto-tuning. An alternative way to achieve this goal is to +just increase the value of timeout_sync, leaving max_budget equal to 0. + weights ------- From 40a5fce495715c48c2e02668144e68a507ac5a30 Mon Sep 17 00:00:00 2001 From: Daniel Verkamp Date: Wed, 30 Aug 2017 15:18:19 -0700 Subject: [PATCH 51/74] nvme-fabrics: generate spec-compliant UUID NQNs The default host NQN, which is generated based on the host's UUID, does not follow the UUID-based NQN format laid out in the NVMe 1.3 specification. Remove the "NVMf:" portion of the NQN to match the spec. 
Signed-off-by: Daniel Verkamp Reviewed-by: Max Gurtovoy Cc: stable@vger.kernel.org Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 53a9598e3aee..47307752dc65 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -75,7 +75,7 @@ static struct nvmf_host *nvmf_host_default(void) kref_init(&host->ref); snprintf(host->nqn, NVMF_NQN_SIZE, - "nqn.2014-08.org.nvmexpress:NVMf:uuid:%pUb", &host->id); + "nqn.2014-08.org.nvmexpress:uuid:%pUb", &host->id); mutex_lock(&nvmf_hosts_mutex); list_add_tail(&host->list, &nvmf_hosts); From 54bb0ade6627a183c211345761ec46e4bf0048fe Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 31 Aug 2017 22:09:45 -0700 Subject: [PATCH 52/74] block/loop: set hw_sectors Loop can handle any size of request. Limiting it to 255 sectors just burns the CPU for bio split and request merge for underlayer disk and also cause bad fs block allocation in directio mode. Reviewed-by: Omar Sandoval Reviewed-by: Ming Lei Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/loop.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f6c204f62b1e..9eff4d3ab1f3 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1736,6 +1736,7 @@ static int loop_add(struct loop_device **l, int i) blk_queue_physical_block_size(lo->lo_queue, PAGE_SIZE); + blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS); /* * It doesn't make sense to enable merge because the I/O * submitted to backing file is handled page by page. From 40326d8a33d5b70039849d233975b63c733d94a2 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 31 Aug 2017 22:09:46 -0700 Subject: [PATCH 53/74] block/loop: allow request merge for directio mode Currently loop disables merge. While it makes sense for buffer IO mode, directio mode can benefit from request merge. Without merge, loop could send small size IO to underlayer disk and harm performance. 
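As a usage note (not part of the patch), merging only takes effect while the loop device runs in direct I/O mode, since the driver clears QUEUE_FLAG_NOMERGES only in that case. Userspace can request direct I/O with the existing LOOP_SET_DIRECT_IO ioctl; a minimal sketch, reusing an open file descriptor to the loop device as in the earlier example:

	/* enable direct I/O so QUEUE_FLAG_NOMERGES is cleared and merging applies */
	if (ioctl(fd, LOOP_SET_DIRECT_IO, 1UL) < 0)
		perror("LOOP_SET_DIRECT_IO");

The kernel may reject the request if the backing file or its alignment does not support direct I/O.
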
Reviewed-by: Omar Sandoval Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/loop.c | 66 ++++++++++++++++++++++++++++++++++---------- drivers/block/loop.h | 1 + 2 files changed, 52 insertions(+), 15 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 9eff4d3ab1f3..3a35121f8a6f 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -213,10 +213,13 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) */ blk_mq_freeze_queue(lo->lo_queue); lo->use_dio = use_dio; - if (use_dio) + if (use_dio) { + queue_flag_clear_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue); lo->lo_flags |= LO_FLAGS_DIRECT_IO; - else + } else { + queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue); lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; + } blk_mq_unfreeze_queue(lo->lo_queue); } @@ -464,6 +467,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) { struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); + kfree(cmd->bvec); + cmd->bvec = NULL; cmd->ret = ret; blk_mq_complete_request(cmd->rq); } @@ -473,22 +478,50 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, { struct iov_iter iter; struct bio_vec *bvec; - struct bio *bio = cmd->rq->bio; + struct request *rq = cmd->rq; + struct bio *bio = rq->bio; struct file *file = lo->lo_backing_file; + unsigned int offset; + int segments = 0; int ret; - /* nomerge for loop request queue */ - WARN_ON(cmd->rq->bio != cmd->rq->biotail); + if (rq->bio != rq->biotail) { + struct req_iterator iter; + struct bio_vec tmp; + + __rq_for_each_bio(bio, rq) + segments += bio_segments(bio); + bvec = kmalloc(sizeof(struct bio_vec) * segments, GFP_NOIO); + if (!bvec) + return -EIO; + cmd->bvec = bvec; + + /* + * The bios of the request may be started from the middle of + * the 'bvec' because of bio splitting, so we can't directly + * copy bio->bi_iov_vec to new bvec. The rq_for_each_segment + * API will take care of all details for us. + */ + rq_for_each_segment(tmp, rq, iter) { + *bvec = tmp; + bvec++; + } + bvec = cmd->bvec; + offset = 0; + } else { + /* + * Same here, this bio may be started from the middle of the + * 'bvec' because of bio splitting, so offset from the bvec + * must be passed to iov iterator + */ + offset = bio->bi_iter.bi_bvec_done; + bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); + segments = bio_segments(bio); + } - bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); iov_iter_bvec(&iter, ITER_BVEC | rw, bvec, - bio_segments(bio), blk_rq_bytes(cmd->rq)); - /* - * This bio may be started from the middle of the 'bvec' - * because of bio splitting, so offset from the bvec must - * be passed to iov iterator - */ - iter.iov_offset = bio->bi_iter.bi_bvec_done; + segments, blk_rq_bytes(rq)); + iter.iov_offset = offset; cmd->iocb.ki_pos = pos; cmd->iocb.ki_filp = file; @@ -1737,9 +1770,12 @@ static int loop_add(struct loop_device **l, int i) blk_queue_physical_block_size(lo->lo_queue, PAGE_SIZE); blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS); + /* - * It doesn't make sense to enable merge because the I/O - * submitted to backing file is handled page by page. + * By default, we do buffer IO, so it doesn't make sense to enable + * merge because the I/O submitted to backing file is handled page by + * page. For directio mode, merge does help to dispatch bigger request + * to underlayer disk. We will enable merge once directio is enabled. 
*/ queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue); diff --git a/drivers/block/loop.h b/drivers/block/loop.h index efe57189d01e..43d20d37b79a 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -71,6 +71,7 @@ struct loop_cmd { bool use_aio; /* use AIO interface to handle I/O */ long ret; struct kiocb iocb; + struct bio_vec *bvec; }; /* Support for loadable transfer modules */ From fa393d1b9c6326c227a24915a6f00721a288bde9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 30 Aug 2017 11:42:07 -0700 Subject: [PATCH 54/74] bfq: Annotate fall-through in a switch statement This patch avoids that gcc 7 issues a warning about fall-through when building with W=1. Acked-by: Paolo Valente Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 8f3d22330d64..856f4199a470 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3780,6 +3780,7 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) default: dev_err(bfqq->bfqd->queue->backing_dev_info->dev, "bfq: bad prio class %d\n", ioprio_class); + /* fall through */ case IOPRIO_CLASS_NONE: /* * No prio set, inherit CPU scheduling settings. From dfb79af5469a028e23ba2592a577d9b6f8a5651f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 30 Aug 2017 11:42:08 -0700 Subject: [PATCH 55/74] bfq: Declare local functions static Acked-by: Paolo Valente Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 78b2e0db4fb2..ceefb9a706d6 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -206,7 +206,7 @@ static void bfqg_get(struct bfq_group *bfqg) bfqg->ref++; } -void bfqg_put(struct bfq_group *bfqg) +static void bfqg_put(struct bfq_group *bfqg) { bfqg->ref--; @@ -385,7 +385,7 @@ static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); } -struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) +static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) { struct bfq_group_data *bgd; @@ -395,7 +395,7 @@ struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) return &bgd->pd; } -void bfq_cpd_init(struct blkcg_policy_data *cpd) +static void bfq_cpd_init(struct blkcg_policy_data *cpd) { struct bfq_group_data *d = cpd_to_bfqgd(cpd); @@ -403,12 +403,12 @@ void bfq_cpd_init(struct blkcg_policy_data *cpd) CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; } -void bfq_cpd_free(struct blkcg_policy_data *cpd) +static void bfq_cpd_free(struct blkcg_policy_data *cpd) { kfree(cpd_to_bfqgd(cpd)); } -struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) +static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) { struct bfq_group *bfqg; @@ -426,7 +426,7 @@ struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) return &bfqg->pd; } -void bfq_pd_init(struct blkg_policy_data *pd) +static void bfq_pd_init(struct blkg_policy_data *pd) { struct blkcg_gq *blkg = pd_to_blkg(pd); struct bfq_group *bfqg = blkg_to_bfqg(blkg); @@ -445,7 +445,7 @@ void bfq_pd_init(struct blkg_policy_data *pd) bfqg->rq_pos_tree = RB_ROOT; } -void bfq_pd_free(struct blkg_policy_data *pd) +static void bfq_pd_free(struct blkg_policy_data *pd) { struct bfq_group *bfqg = pd_to_bfqg(pd); @@ -453,7 +453,7 @@ void bfq_pd_free(struct blkg_policy_data *pd) bfqg_put(bfqg); } -void bfq_pd_reset_stats(struct 
blkg_policy_data *pd) +static void bfq_pd_reset_stats(struct blkg_policy_data *pd) { struct bfq_group *bfqg = pd_to_bfqg(pd); @@ -740,7 +740,7 @@ static void bfq_reparent_active_entities(struct bfq_data *bfqd, * blkio already grabs the queue_lock for us, so no need to use * RCU-based magic */ -void bfq_pd_offline(struct blkg_policy_data *pd) +static void bfq_pd_offline(struct blkg_policy_data *pd) { struct bfq_service_tree *st; struct bfq_group *bfqg = pd_to_bfqg(pd); From 2f79136ba2df07ce7563158f7948c4ce770f8132 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 30 Aug 2017 11:42:09 -0700 Subject: [PATCH 56/74] bfq: Check kstrtoul() return value Make sysfs writes fail for invalid numbers instead of storing uninitialized data copied from the stack. This patch removes all uninitialized_var() occurrences from the BFQ source code. Acked-by: Paolo Valente Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 52 ++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 856f4199a470..0f286cd5da95 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4802,13 +4802,15 @@ static ssize_t bfq_var_show(unsigned int var, char *page) return sprintf(page, "%u\n", var); } -static void bfq_var_store(unsigned long *var, const char *page) +static int bfq_var_store(unsigned long *var, const char *page) { unsigned long new_val; int ret = kstrtoul(page, 10, &new_val); - if (ret == 0) - *var = new_val; + if (ret) + return ret; + *var = new_val; + return 0; } #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ @@ -4849,8 +4851,12 @@ static ssize_t \ __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct bfq_data *bfqd = e->elevator_data; \ - unsigned long uninitialized_var(__data); \ - bfq_var_store(&__data, (page)); \ + unsigned long __data; \ + int ret; \ + \ + ret = bfq_var_store(&__data, (page)); \ + if (ret) \ + return ret; \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ @@ -4877,8 +4883,12 @@ STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ { \ struct bfq_data *bfqd = e->elevator_data; \ - unsigned long uninitialized_var(__data); \ - bfq_var_store(&__data, (page)); \ + unsigned long __data; \ + int ret; \ + \ + ret = bfq_var_store(&__data, (page)); \ + if (ret) \ + return ret; \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ @@ -4894,9 +4904,12 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, const char *page, size_t count) { struct bfq_data *bfqd = e->elevator_data; - unsigned long uninitialized_var(__data); + unsigned long __data; + int ret; - bfq_var_store(&__data, (page)); + ret = bfq_var_store(&__data, (page)); + if (ret) + return ret; if (__data == 0) bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); @@ -4919,9 +4932,12 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, const char *page, size_t count) { struct bfq_data *bfqd = e->elevator_data; - unsigned long uninitialized_var(__data); + unsigned long __data; + int ret; - bfq_var_store(&__data, (page)); + ret = bfq_var_store(&__data, (page)); + if (ret) + return ret; if (__data < 1) __data = 1; @@ -4939,9 +4955,12 @@ static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, const char *page, size_t count) { struct bfq_data *bfqd = e->elevator_data; - unsigned long uninitialized_var(__data); + unsigned 
long __data; + int ret; - bfq_var_store(&__data, (page)); + ret = bfq_var_store(&__data, (page)); + if (ret) + return ret; if (__data > 1) __data = 1; @@ -4958,9 +4977,12 @@ static ssize_t bfq_low_latency_store(struct elevator_queue *e, const char *page, size_t count) { struct bfq_data *bfqd = e->elevator_data; - unsigned long uninitialized_var(__data); + unsigned long __data; + int ret; - bfq_var_store(&__data, (page)); + ret = bfq_var_store(&__data, (page)); + if (ret) + return ret; if (__data > 1) __data = 1; From 1530486cda2df87d66af8bd3aa069c12b6fada41 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 30 Aug 2017 11:42:10 -0700 Subject: [PATCH 57/74] bfq: Suppress compiler warnings about comparisons This patch avoids that the following warnings are reported when building with W=1: block/bfq-iosched.c: In function 'bfq_back_seek_max_store': block/bfq-iosched.c:4860:13: warning: comparison of unsigned expression < 0 is always false [-Wtype-limits] if (__data < (MIN)) \ ^ block/bfq-iosched.c:4876:1: note: in expansion of macro 'STORE_FUNCTION' STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); ^~~~~~~~~~~~~~ block/bfq-iosched.c: In function 'bfq_slice_idle_store': block/bfq-iosched.c:4860:13: warning: comparison of unsigned expression < 0 is always false [-Wtype-limits] if (__data < (MIN)) \ ^ block/bfq-iosched.c:4879:1: note: in expansion of macro 'STORE_FUNCTION' STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); ^~~~~~~~~~~~~~ block/bfq-iosched.c: In function 'bfq_slice_idle_us_store': block/bfq-iosched.c:4892:13: warning: comparison of unsigned expression < 0 is always false [-Wtype-limits] if (__data < (MIN)) \ ^ block/bfq-iosched.c:4899:1: note: in expansion of macro 'USEC_STORE_FUNCTION' USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, ^~~~~~~~~~~~~~~~~~~ Acked-by: Paolo Valente Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0f286cd5da95..7adde1b84e19 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4851,16 +4851,16 @@ static ssize_t \ __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct bfq_data *bfqd = e->elevator_data; \ - unsigned long __data; \ + unsigned long __data, __min = (MIN), __max = (MAX); \ int ret; \ \ ret = bfq_var_store(&__data, (page)); \ if (ret) \ return ret; \ - if (__data < (MIN)) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ + if (__data < __min) \ + __data = __min; \ + else if (__data > __max) \ + __data = __max; \ if (__CONV == 1) \ *(__PTR) = msecs_to_jiffies(__data); \ else if (__CONV == 2) \ @@ -4883,16 +4883,16 @@ STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ { \ struct bfq_data *bfqd = e->elevator_data; \ - unsigned long __data; \ + unsigned long __data, __min = (MIN), __max = (MAX); \ int ret; \ \ ret = bfq_var_store(&__data, (page)); \ if (ret) \ return ret; \ - if (__data < (MIN)) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ + if (__data < __min) \ + __data = __min; \ + else if (__data > __max) \ + __data = __max; \ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ return count; \ } From 12cd3a2fe3ba6d1a2cf007e4e8dcfbe66d3d0a28 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 30 Aug 2017 11:42:11 -0700 
Subject: [PATCH 58/74] bfq: Use icq_to_bic() consistently Some code uses icq_to_bic() to convert an io_cq pointer to a bfq_io_cq pointer while other code uses a direct cast. Convert the code that uses a direct cast such that it uses icq_to_bic(). Acked-by: Paolo Valente Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 7adde1b84e19..fd09d4d4ada7 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -239,7 +239,7 @@ static int T_slow[2]; static int T_fast[2]; static int device_speed_thresh[2]; -#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) +#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) From 92d773324b7edbd36bf0c28c1e0157763aeccc92 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 1 Sep 2017 11:15:17 -0700 Subject: [PATCH 59/74] block/loop: fix use after free lo_rw_aio->call_read_iter-> 1 aops->direct_IO 2 iov_iter_revert lo_rw_aio_complete could happen between 1 and 2, the bio and bvec could be freed before 2, which accesses bvec. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/loop.c | 16 +++++++++++++--- drivers/block/loop.h | 5 ++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 3a35121f8a6f..78c47c4b584d 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -463,14 +463,21 @@ static void lo_complete_rq(struct request *rq) blk_mq_end_request(rq, cmd->ret < 0 ? BLK_STS_IOERR : BLK_STS_OK); } +static void lo_rw_aio_do_completion(struct loop_cmd *cmd) +{ + if (!atomic_dec_and_test(&cmd->ref)) + return; + kfree(cmd->bvec); + cmd->bvec = NULL; + blk_mq_complete_request(cmd->rq); +} + static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) { struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); - kfree(cmd->bvec); - cmd->bvec = NULL; cmd->ret = ret; - blk_mq_complete_request(cmd->rq); + lo_rw_aio_do_completion(cmd); } static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, @@ -518,6 +525,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); segments = bio_segments(bio); } + atomic_set(&cmd->ref, 2); iov_iter_bvec(&iter, ITER_BVEC | rw, bvec, segments, blk_rq_bytes(rq)); @@ -533,6 +541,8 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, else ret = call_read_iter(file, &cmd->iocb, &iter); + lo_rw_aio_do_completion(cmd); + if (ret != -EIOCBQUEUED) cmd->iocb.ki_complete(&cmd->iocb, ret, 0); return 0; diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 43d20d37b79a..b0ba4a5951c4 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -68,7 +68,10 @@ struct loop_cmd { struct kthread_work work; struct request *rq; struct list_head list; - bool use_aio; /* use AIO interface to handle I/O */ + union { + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ + }; long ret; struct kiocb iocb; struct bio_vec *bvec; From bc75705d00637c5f7b0346bf63094a9899c3d516 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 1 Sep 2017 11:15:18 -0700 Subject: [PATCH 60/74] block/loop: remove unused field nobody uses the list. 
Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/loop.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/loop.h b/drivers/block/loop.h index b0ba4a5951c4..f68c1d50802f 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -67,7 +67,6 @@ struct loop_device { struct loop_cmd { struct kthread_work work; struct request *rq; - struct list_head list; union { bool use_aio; /* use AIO interface to handle I/O */ atomic_t ref; /* only for aio */ From 4b758df21ee7081ab41448d21d60367efaa625b3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Sep 2017 14:25:51 +0800 Subject: [PATCH 61/74] bcache: Fix leak of bdev reference If blkdev_get_by_path() in register_bcache() fails, we try to lookup the block device using lookup_bdev() to detect which situation we are in to properly report error. However we never drop the reference returned to us from lookup_bdev(). Fix that. Signed-off-by: Jan Kara Acked-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 974d832e54a6..c3fedd265d18 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1964,6 +1964,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, else err = "device busy"; mutex_unlock(&bch_register_lock); + if (!IS_ERR(bdev)) + bdput(bdev); if (attr == &ksysfs_register_quiet) goto out; } From c81ffa32a214c84b08900fbc9d432187bd948eba Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:52 +0800 Subject: [PATCH 62/74] bcache: fix sequential large write IO bypass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sequential write IOs were tested with bs=1M by FIO in writeback cache mode, these IOs were expected to be bypassed, but actually they did not. We debug the code, and find in check_should_bypass(): if (!congested && mode == CACHE_MODE_WRITEBACK && op_is_write(bio_op(bio)) && (bio->bi_opf & REQ_SYNC)) goto rescale that means, If in writeback mode, a write IO with REQ_SYNC flag will not be bypassed though it is a sequential large IO, It's not a correct thing to do actually, so this patch remove these codes. Signed-off-by: tang.junhui Reviewed-by: Kent Overstreet Reviewed-by: Eric Wheeler Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 0e1463d0c334..de97d86ddff4 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -400,12 +400,6 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) if (!congested && !dc->sequential_cutoff) goto rescale; - if (!congested && - mode == CACHE_MODE_WRITEBACK && - op_is_write(bio->bi_opf) && - op_is_sync(bio->bi_opf)) - goto rescale; - spin_lock(&dc->io_lock); hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash) From 69daf03adef5f7bc13e0ac86b4b8007df1767aab Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:53 +0800 Subject: [PATCH 63/74] bcache: do not subtract sectors_to_gc for bypassed IO Since bypassed IOs use no bucket, so do not subtract sectors_to_gc to trigger gc thread. 
Signed-off-by: tang.junhui Acked-by: Coly Li Reviewed-by: Eric Wheeler Reviewed-by: Christoph Hellwig Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index de97d86ddff4..681b4f12b05a 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -196,12 +196,12 @@ static void bch_data_insert_start(struct closure *cl) struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); struct bio *bio = op->bio, *n; - if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) - wake_up_gc(op->c); - if (op->bypass) return bch_data_invalidate(cl); + if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) + wake_up_gc(op->c); + /* * Journal writes are marked REQ_PREFLUSH; if the original write was a * flush, it'll wait on the journal write. From 09b3efec81def807fb225359e34a8e72866dd9c4 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 6 Sep 2017 14:25:54 +0800 Subject: [PATCH 64/74] bcache: Don't reinvent the wheel but use existing llist API Although llist provides proper APIs, they are not used. Make them used. Signed-off-by: Byungchul Park Acked-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/closure.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 864e673aec39..7d5286b05036 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -70,21 +70,10 @@ void __closure_wake_up(struct closure_waitlist *wait_list) list = llist_del_all(&wait_list->list); /* We first reverse the list to preserve FIFO ordering and fairness */ - - while (list) { - struct llist_node *t = list; - list = llist_next(list); - - t->next = reverse; - reverse = t; - } + reverse = llist_reverse_order(list); /* Then do the wakeups */ - - while (reverse) { - cl = container_of(reverse, struct closure, list); - reverse = llist_next(reverse); - + llist_for_each_entry(cl, reverse, list) { closure_set_waiting(cl, 0); closure_sub(cl, CLOSURE_WAITING + 1); } From 0b43f49dc4d6d3789e936731dc16af94cb57d568 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:55 +0800 Subject: [PATCH 65/74] bcache: gc does not work when triggering by manual command I try to execute the following command to trigger gc thread: [root@localhost internal]# echo 1 > trigger_gc But it does not work, I debug the code in gc_should_run(), It works only if in invalidating or sectors_to_gc < 0. So set sectors_to_gc to -1 to meet the condition when we trigger gc by manual command. (Code comments aded by Coly Li) Signed-off-by: Tang Junhui Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index f90f13616980..09295c07ea88 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -615,8 +615,21 @@ STORE(__bch_cache_set) bch_cache_accounting_clear(&c->accounting); } - if (attr == &sysfs_trigger_gc) + if (attr == &sysfs_trigger_gc) { + /* + * Garbage collection thread only works when sectors_to_gc < 0, + * when users write to sysfs entry trigger_gc, most of time + * they want to forcibly triger gargage collection. Here -1 is + * set to c->sectors_to_gc, to make gc_should_run() give a + * chance to permit gc thread to run. 
"give a chance" means + * before going into gc_should_run(), there is still chance + * that c->sectors_to_gc being set to other positive value. So + * writing sysfs entry trigger_gc won't always make sure gc + * thread takes effect. + */ + atomic_set(&c->sectors_to_gc, -1); wake_up_gc(c); + } if (attr == &sysfs_prune_cache) { struct shrink_control sc; From a8394090a9129b40f9d90dcb7f4a49d60c727ca6 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:56 +0800 Subject: [PATCH 66/74] bcache: correct cache_dirty_target in __update_writeback_rate() __update_write_rate() uses a Proportion-Differentiation Controller algorithm to control writeback rate. A dirty target number is used in this PD controller to control writeback rate. A larger target number will make the writeback rate smaller, on the versus, a smaller target number will make the writeback rate larger. bcache uses the following steps to calculate the target number, 1) cache_sectors = all-buckets-of-cache-set * buckets-size 2) cache_dirty_target = cache_sectors * cached-device-writeback_percent 3) target = cache_dirty_target * (sectors-of-cached-device/sectors-of-all-cached-devices-of-this-cache-set) The calculation at step 1) for cache_sectors is incorrect, which does not consider dirty blocks occupied by flash only volume. A flash only volume can be took as a bcache device without cached device. All data sectors allocated for it are persistent on cache device and marked dirty, they are not touched by bcache writeback and garbage collection code. So data blocks of flash only volume should be ignore when calculating cache_sectors of cache set. Current code does not subtract dirty sectors of flash only volume, which results a larger target number from the above 3 steps. And in sequence the cache device's writeback rate is smaller then a correct value, writeback speed is slower on all cached devices. This patch fixes the incorrect slower writeback rate by subtracting dirty sectors of flash only volumes in __update_writeback_rate(). 
(Commit log composed by Coly Li to pass checkpatch.pl checking) Signed-off-by: Tang Junhui Reviewed-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 3 ++- drivers/md/bcache/writeback.h | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index c49022a8dc9d..5abe6472b66b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -21,7 +21,8 @@ static void __update_writeback_rate(struct cached_dev *dc) { struct cache_set *c = dc->disk.c; - uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; + uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - + bcache_flash_devs_sectors_dirty(c); uint64_t cache_dirty_target = div_u64(cache_sectors * dc->writeback_percent, 100); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 629bd1a502fd..8789b9c8c484 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -14,6 +14,25 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } +static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c) +{ + uint64_t i, ret = 0; + + mutex_lock(&bch_register_lock); + + for (i = 0; i < c->nr_uuids; i++) { + struct bcache_device *d = c->devices[i]; + + if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) + continue; + ret += bcache_dev_sectors_dirty(d); + } + + mutex_unlock(&bch_register_lock); + + return ret; +} + static inline unsigned offset_to_stripe(struct bcache_device *d, uint64_t offset) { From 77fa100f27475d08a569b9d51c17722130f089e7 Mon Sep 17 00:00:00 2001 From: Tony Asleson Date: Wed, 6 Sep 2017 14:25:57 +0800 Subject: [PATCH 67/74] bcache: Correct return value for sysfs attach errors If you encounter any errors in bch_cached_dev_attach it will return a negative error code. The variable 'v' which stores the result is unsigned, thus user space sees a very large value returned for bytes written which can cause incorrect user space behavior. Utilize 1 signed variable to use throughout the function to preserve error return capability. Signed-off-by: Tony Asleson Acked-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 09295c07ea88..104c57cd666c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -192,7 +192,7 @@ STORE(__cached_dev) { struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj); - unsigned v = size; + ssize_t v = size; struct cache_set *c; struct kobj_uevent_env *env; @@ -227,7 +227,7 @@ STORE(__cached_dev) bch_cached_dev_run(dc); if (attr == &sysfs_cache_mode) { - ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1); + v = bch_read_string_list(buf, bch_cache_modes + 1); if (v < 0) return v; From 89b1fc54c257104df007c5888e3705e52b973d45 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:58 +0800 Subject: [PATCH 68/74] bcache: increase the number of open buckets In currently, we only alloc 6 open buckets for each cache set, but in usually, we always attach about 10 or so backend devices for each cache set, and the each bcache device are always accessed by about 10 or so threads in top application layer. 
So 6 open buckets are too few, It has led to that each of the same thread write data to different buckets, which would cause low efficiency write-back, and also cause buckets inefficient, and would be Very easy to run out of. I add debug message in bch_open_buckets_alloc() to print alloc bucket info, and test with ten bcache devices with a cache set, and each bcache device is accessed by ten threads. From the debug message, we can see that, after the modification, One bucket is more likely to assign to the same thread, and the data from the same thread are more likely to write the same bucket. Usually the same thread always write/read the same backend device, so it is good for write-back and also promote the usage efficiency of buckets. Signed-off-by: Tang Junhui Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index ca4abe1ccd8d..cacbe2dbd5c3 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -68,6 +68,8 @@ #include #include +#define MAX_OPEN_BUCKETS 128 + /* Bucket heap / gen */ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) @@ -671,7 +673,7 @@ int bch_open_buckets_alloc(struct cache_set *c) spin_lock_init(&c->data_bucket_lock); - for (i = 0; i < 6; i++) { + for (i = 0; i < MAX_OPEN_BUCKETS; i++) { struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); if (!b) return -ENOMEM; From 9baf30972b5568d8b5bc8b3c46a6ec5b58100463 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:59 +0800 Subject: [PATCH 69/74] bcache: fix for gc and write-back race gc and write-back get raced (see the email "bcache get stucked" I sended before): gc thread write-back thread | |bch_writeback_thread() |bch_gc_thread() | | |==>read_dirty() |==>bch_btree_gc() | |==>btree_root() //get btree root | | //node write locker | |==>bch_btree_gc_root() | | |==>read_dirty_submit() | |==>write_dirty() | |==>continue_at(cl, | | write_dirty_finish, | | system_wq); | |==>write_dirty_finish()//excute | | //in system_wq | |==>bch_btree_insert() | |==>bch_btree_map_leaf_nodes() | |==>__bch_btree_map_nodes() | |==>btree_root //try to get btree | | //root node read | | //lock | |-----stuck here |==>bch_btree_set_root() |==>bch_journal_meta() |==>bch_journal() |==>journal_try_write() |==>journal_write_unlocked() //journal_full(&c->journal) | //condition satisfied |==>continue_at(cl, journal_write, system_wq); //try to excute | //journal_write in system_wq | //but work queue is excuting | //write_dirty_finish() |==>closure_sync(); //wait journal_write execute | //over and wake up gc, |-------------stuck here |==>release root node write locker This patch alloc a separate work-queue for write-back thread to avoid such race. 
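As a rough outline of the pattern the patch adopts (a driver-private workqueue so the write-back completion path no longer shares system_wq with work that gc may end up waiting on), consider the following module-style sketch; the example_* names are placeholders and this is not the bcache code itself:

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_write_wq;
static struct work_struct example_work;

static void example_write_fn(struct work_struct *w)
{
	/* Runs on the private workqueue, so it cannot sit behind unrelated
	 * work items queued on system_wq. */
}

static int __init example_init(void)
{
	/* WQ_MEM_RECLAIM guarantees forward progress under memory pressure,
	 * which matters for a writeback path. */
	example_write_wq = alloc_workqueue("example_write_wq", WQ_MEM_RECLAIM, 0);
	if (!example_write_wq)
		return -ENOMEM;

	INIT_WORK(&example_work, example_write_fn);
	queue_work(example_write_wq, &example_work);
	return 0;
}

static void __exit example_exit(void)
{
	/* destroy_workqueue() drains any pending work before freeing it. */
	destroy_workqueue(example_write_wq);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");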
(Commit log re-organized by Coly Li to pass checkpatch.pl checking) Signed-off-by: Tang Junhui Acked-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/super.c | 2 ++ drivers/md/bcache/writeback.c | 9 +++++++-- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index dee542fff68e..2ed9bd231d84 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -333,6 +333,7 @@ struct cached_dev { /* Limit number of writeback bios in flight */ struct semaphore in_flight; struct task_struct *writeback_thread; + struct workqueue_struct *writeback_write_wq; struct keybuf writeback_keys; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c3fedd265d18..3b724fa2b68d 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1059,6 +1059,8 @@ static void cached_dev_free(struct closure *cl) cancel_delayed_work_sync(&dc->writeback_rate_update); if (!IS_ERR_OR_NULL(dc->writeback_thread)) kthread_stop(dc->writeback_thread); + if (dc->writeback_write_wq) + destroy_workqueue(dc->writeback_write_wq); mutex_lock(&bch_register_lock); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 5abe6472b66b..34131439e28c 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -187,7 +187,7 @@ static void write_dirty(struct closure *cl) closure_bio_submit(&io->bio, cl); - continue_at(cl, write_dirty_finish, system_wq); + continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); } static void read_dirty_endio(struct bio *bio) @@ -207,7 +207,7 @@ static void read_dirty_submit(struct closure *cl) closure_bio_submit(&io->bio, cl); - continue_at(cl, write_dirty, system_wq); + continue_at(cl, write_dirty, io->dc->writeback_write_wq); } static void read_dirty(struct cached_dev *dc) @@ -516,6 +516,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) int bch_cached_dev_writeback_start(struct cached_dev *dc) { + dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", + WQ_MEM_RECLAIM, 0); + if (!dc->writeback_write_wq) + return -ENOMEM; + dc->writeback_thread = kthread_create(bch_writeback_thread, dc, "bcache_writeback"); if (IS_ERR(dc->writeback_thread)) From da22f0eea555baf9b0a84b52afe56db2052cfe8d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 Sep 2017 14:26:00 +0800 Subject: [PATCH 70/74] bcache: silence static checker warning In olden times, closure_return() used to have a hidden return built in. We removed the hidden return but forgot to add a new return here. If "c" were NULL we would oops on the next line, but fortunately "c" is never NULL. Let's just remove the if statement. Signed-off-by: Dan Carpenter Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 3b724fa2b68d..1ae63374366c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1376,9 +1376,6 @@ static void cache_set_flush(struct closure *cl) struct btree *b; unsigned i; - if (!c) - closure_return(cl); - bch_cache_accounting_destroy(&c->accounting); kobject_put(&c->internal); From 7b6a8570e02a20d0b6cadb9689e4b2e6217c390c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 Sep 2017 14:26:01 +0800 Subject: [PATCH 71/74] bcache: Update continue_at() documentation continue_at() doesn't have a return statement anymore. 
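In practice the updated comment means callers are responsible for ending their own execution: continue_at() hands the closure over, but control returns to the caller. A hedged sketch of the expected call shape, assuming bcache's closure.h and with example_next_step/example_wq as placeholder names:

static void example_step(struct closure *cl)
{
	/* ... do this stage's work ... */

	continue_at(cl, example_next_step, example_wq);
	/*
	 * continue_at() no longer expands to a return: execution continues
	 * here, but the ref on cl has been dropped, so nothing past this
	 * point may touch cl or anything it owns. Returning immediately is
	 * the safe pattern.
	 */
	return;
}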
Signed-off-by: Dan Carpenter Acked-by: Coly Li Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/bcache/closure.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 1ec84ca81146..295b7e43f92c 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -312,8 +312,6 @@ static inline void closure_wake_up(struct closure_waitlist *list) * been dropped with closure_put()), it will resume execution at @fn running out * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). * - * NOTE: This macro expands to a return in the calling function! - * * This is because after calling continue_at() you no longer have a ref on @cl, * and whatever @cl owns may be freed out from under you - a running closure fn * has a ref on its own closure which continue_at() drops. @@ -340,8 +338,6 @@ do { \ * Causes @fn to be executed out of @cl, in @wq context (or called directly if * @wq is NULL). * - * NOTE: like continue_at(), this macro expands to a return in the caller! - * * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, * thus it's not safe to touch anything protected by @cl after a * continue_at_nobarrier(). From 9276717b9e297a62d1151a43d1cd286213f68eb7 Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Wed, 6 Sep 2017 14:26:02 +0800 Subject: [PATCH 72/74] bcache: fix bch_hprint crash and improve output Most importantly, solve a crash where %llu was used to format signed numbers. This would cause a buffer overflow when reading sysfs writeback_rate_debug, as only 20 bytes were allocated for this and %llu writes 20 characters plus a null. Always use the units mechanism rather than having different output paths for simplicity. Also, correct problems with display output where 1.10 was a larger number than 1.09, by multiplying by 10 and then dividing by 1024 instead of dividing by 100. (Remainders of >= 1000 would print as .10). Minor changes: Always display the decimal point instead of trying to omit it based on number of digits shown. Decide what units to use based on 1000 as a threshold, not 1024 (in other words, always print at most 3 digits before the decimal point). Signed-off-by: Michael Lyle Reported-by: Dmitry Yu Okunev Acked-by: Kent Overstreet Reviewed-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/util.c | 42 +++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 8c3a938f4bf0..176d3c2ef5f5 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -74,24 +74,44 @@ STRTO_H(strtouint, unsigned int) STRTO_H(strtoll, long long) STRTO_H(strtoull, unsigned long long) +/** + * bch_hprint() - formats @v to human readable string for sysfs. + * + * @v - signed 64 bit integer + * @buf - the (at least 8 byte) buffer to format the result into. + * + * Returns the number of bytes used by format. + */ ssize_t bch_hprint(char *buf, int64_t v) { static const char units[] = "?kMGTPEZY"; - char dec[4] = ""; - int u, t = 0; + int u = 0, t; - for (u = 0; v >= 1024 || v <= -1024; u++) { - t = v & ~(~0 << 10); - v >>= 10; - } + uint64_t q; - if (!u) - return sprintf(buf, "%llu", v); + if (v < 0) + q = -v; + else + q = v; - if (v < 100 && v > -100) - snprintf(dec, sizeof(dec), ".%i", t / 100); + /* For as long as the number is more than 3 digits, but at least + * once, shift right / divide by 1024. 
Keep the remainder for + * a digit after the decimal point. + */ + do { + u++; - return sprintf(buf, "%lli%s%c", v, dec, units[u]); + t = q & ~(~0 << 10); + q >>= 10; + } while (q >= 1000); + + if (v < 0) + /* '-', up to 3 digits, '.', 1 digit, 1 character, null; + * yields 8 bytes. + */ + return sprintf(buf, "-%llu.%i%c", q, t * 10 / 1024, units[u]); + else + return sprintf(buf, "%llu.%i%c", q, t * 10 / 1024, units[u]); } ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], From bf09375337077b692d21d062c30697c86f2872d3 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 5 Sep 2017 14:24:47 -0700 Subject: [PATCH 73/74] loop: set physical block size to logical block size Commit 6c6b6f28b333 ("loop: set physical block size to PAGE_SIZE") caused mkfs.xfs to barf on ppc64 [1]. Always using PAGE_SIZE as the physical block size still makes the most sense semantically, but let's just lie and always set it to the same value as the logical block size (same goes for io_min). In the future we might want to at least bump up io_min to PAGE_SIZE but I'm sick of these stupid changes so let's play it safe. 1: https://marc.info/?l=linux-xfs&m=150459024723753&w=2 Tested-by: Chandan Rajendra Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/loop.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 78c47c4b584d..85de67334695 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1036,6 +1036,8 @@ static int loop_clr_fd(struct loop_device *lo) memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); blk_queue_logical_block_size(lo->lo_queue, 512); + blk_queue_physical_block_size(lo->lo_queue, 512); + blk_queue_io_min(lo->lo_queue, 512); if (bdev) { bdput(bdev); invalidate_bdev(bdev); @@ -1330,6 +1332,8 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) blk_mq_freeze_queue(lo->lo_queue); blk_queue_logical_block_size(lo->lo_queue, arg); + blk_queue_physical_block_size(lo->lo_queue, arg); + blk_queue_io_min(lo->lo_queue, arg); loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue); @@ -1777,8 +1781,6 @@ static int loop_add(struct loop_device **l, int i) } lo->lo_queue->queuedata = lo; - blk_queue_physical_block_size(lo->lo_queue, PAGE_SIZE); - blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS); /* From 175206cf9ab63161dec74d9cd7f9992e062491f5 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Thu, 7 Sep 2017 01:28:53 +0800 Subject: [PATCH 74/74] bcache: initialize dirty stripes in flash_dev_run() bcache uses a Proportion-Differentiation Controller algorithm to control writeback rate to cached devices. In the PD controller algorithm, dirty stripes of thin flash device should not be counted in, because flash only volumes never write back dirty data. Currently dirty stripe counter for thin flash device is not initialized when the thin flash device starts. Which means the following calculation in PD controller will reference an undefined dirty stripes number, and all cached devices attached to the same cache set where the thin flash device lies on may have an inaccurate writeback rate. This patch calles bch_sectors_dirty_init() in flash_dev_run(), to correctly initialize dirty stripe counter when the thin flash device starts to run. 
This patch also does following parameter data type change, -void bch_sectors_dirty_init(struct cached_dev *dc); +void bch_sectors_dirty_init(struct bcache_device *); to call this function conveniently in flash_dev_run(). (Commit log is composed by Coly Li) Signed-off-by: Tang Junhui Reviewed-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 3 ++- drivers/md/bcache/writeback.c | 8 ++++---- drivers/md/bcache/writeback.h | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1ae63374366c..fc0a31b13ac4 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1026,7 +1026,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) } if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { - bch_sectors_dirty_init(dc); + bch_sectors_dirty_init(&dc->disk); atomic_set(&dc->has_dirty, 1); atomic_inc(&dc->count); bch_writeback_queue(dc); @@ -1230,6 +1230,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) goto err; bcache_device_attach(d, c, u - c->uuids); + bch_sectors_dirty_init(d); bch_flash_dev_request_init(d); add_disk(d->disk); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 34131439e28c..e663ca082183 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -482,17 +482,17 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, return MAP_CONTINUE; } -void bch_sectors_dirty_init(struct cached_dev *dc) +void bch_sectors_dirty_init(struct bcache_device *d) { struct sectors_dirty_init op; bch_btree_op_init(&op.op, -1); - op.inode = dc->disk.id; + op.inode = d->id; - bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), + bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), sectors_dirty_init_fn, 0); - dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk); + d->sectors_dirty_last = bcache_dev_sectors_dirty(d); } void bch_cached_dev_writeback_init(struct cached_dev *dc) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 8789b9c8c484..e35421d20d2e 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -103,7 +103,7 @@ static inline void bch_writeback_add(struct cached_dev *dc) void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); -void bch_sectors_dirty_init(struct cached_dev *dc); +void bch_sectors_dirty_init(struct bcache_device *); void bch_cached_dev_writeback_init(struct cached_dev *); int bch_cached_dev_writeback_start(struct cached_dev *);