IB/srp: Prevent mapping failures

If both max_sectors and the queue_depth are high enough, the MR pool
can be depleted temporarily. This causes the SRP initiator to report
mapping failures. Although the SRP initiator recovers from such
mapping failures, prevent them from occurring by allocating more
memory regions.

Additionally, only enable memory registration if at least two
pages can be registered per memory region.

Reported-by: Laurence Oberman <loberman@redhat.com>
Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Tested-by: Laurence Oberman <loberman@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Doug Ledford <dledford@redhat.com>
Bart Van Assche 2016-05-12 10:50:35 -07:00 committed by Doug Ledford
parent 835ee624c9
commit 509c5f33f4
2 changed files with 119 additions and 19 deletions

drivers/infiniband/ulp/srp/ib_srp.c

@@ -468,7 +468,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
 	struct ib_qp *qp;
 	struct ib_fmr_pool *fmr_pool = NULL;
 	struct srp_fr_pool *fr_pool = NULL;
-	const int m = dev->use_fast_reg ? 3 : 1;
+	const int m = 1 + dev->use_fast_reg * target->mr_per_cmd * 2;
 	int ret;
 
 	init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL);
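Note on the new send-queue multiplier: with fast registration, each in-flight command needs room on the QP send queue not only for the SRP_CMD work request itself but also, per memory region, for one registration and one later invalidation work request. A minimal sketch of that accounting (my reading of the driver; the breakdown is not spelled out in the patch):

	/* Send-queue slots needed per in-flight SCSI command (sketch). */
	static inline int send_wrs_per_cmd(int use_fast_reg, int mr_per_cmd)
	{
		/* 1 SRP_CMD send, plus per MR: 1 IB_WR_REG_MR + 1 IB_WR_LOCAL_INV */
		return 1 + (use_fast_reg ? 2 * mr_per_cmd : 0);
	}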
@@ -849,7 +849,7 @@ static int srp_alloc_req_data(struct srp_rdma_ch *ch)
 
 	for (i = 0; i < target->req_ring_size; ++i) {
 		req = &ch->req_ring[i];
-		mr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *),
+		mr_list = kmalloc(target->mr_per_cmd * sizeof(void *),
				  GFP_KERNEL);
 		if (!mr_list)
 			goto out;
@@ -1298,9 +1298,16 @@ static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc)
 	srp_handle_qp_err(cq, wc, "FAST REG");
 }
 
+/*
+ * Map up to sg_nents elements of state->sg where *sg_offset_p is the offset
+ * where to start in the first element. If sg_offset_p != NULL then
+ * *sg_offset_p is updated to the offset in state->sg[retval] of the first
+ * byte that has not yet been mapped.
+ */
 static int srp_map_finish_fr(struct srp_map_state *state,
 			     struct srp_request *req,
-			     struct srp_rdma_ch *ch, int sg_nents)
+			     struct srp_rdma_ch *ch, int sg_nents,
+			     unsigned int *sg_offset_p)
 {
 	struct srp_target_port *target = ch->target;
 	struct srp_device *dev = target->srp_host->srp_dev;
@@ -1316,9 +1323,13 @@ static int srp_map_finish_fr(struct srp_map_state *state,
 
 	WARN_ON_ONCE(!dev->use_fast_reg);
 
 	if (sg_nents == 1 && target->global_mr) {
-		srp_map_desc(state, sg_dma_address(state->sg),
-			     sg_dma_len(state->sg),
+		unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
+
+		srp_map_desc(state, sg_dma_address(state->sg) + sg_offset,
+			     sg_dma_len(state->sg) - sg_offset,
 			     target->global_mr->rkey);
+		if (sg_offset_p)
+			*sg_offset_p = 0;
 		return 1;
 	}
@@ -1329,15 +1340,18 @@ static int srp_map_finish_fr(struct srp_map_state *state,
 	rkey = ib_inc_rkey(desc->mr->rkey);
 	ib_update_fast_reg_key(desc->mr, rkey);
 
-	n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, NULL, dev->mr_page_size);
+	n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, sg_offset_p,
+			 dev->mr_page_size);
 	if (unlikely(n < 0)) {
 		srp_fr_pool_put(ch->fr_pool, &desc, 1);
-		pr_debug("%s: ib_map_mr_sg(%d) returned %d.\n",
-			 dev_name(&req->scmnd->device->sdev_gendev), sg_nents,
-			 n);
+		pr_debug("%s: ib_map_mr_sg(%d, %d) returned %d.\n",
+			 dev_name(&req->scmnd->device->sdev_gendev), sg_nents,
+			 sg_offset_p ? *sg_offset_p : -1, n);
 		return n;
 	}
 
+	WARN_ON_ONCE(desc->mr->length == 0);
+
 	req->reg_cqe.done = srp_reg_mr_err_done;
 
 	wr.wr.next = NULL;
@@ -1358,8 +1372,10 @@ static int srp_map_finish_fr(struct srp_map_state *state,
 		     desc->mr->length, desc->mr->rkey);
 
 	err = ib_post_send(ch->qp, &wr.wr, &bad_wr);
-	if (unlikely(err))
+	if (unlikely(err)) {
+		WARN_ON_ONCE(err == -ENOMEM);
 		return err;
+	}
 
 	return n;
 }
@@ -1416,7 +1432,7 @@ static int srp_map_sg_fmr(struct srp_map_state *state, struct srp_rdma_ch *ch,
 	state->desc = req->indirect_desc;
 	state->pages = req->map_page;
 	state->fmr.next = req->fmr_list;
-	state->fmr.end = req->fmr_list + ch->target->cmd_sg_cnt;
+	state->fmr.end = req->fmr_list + ch->target->mr_per_cmd;
 
 	for_each_sg(scat, sg, count, i) {
 		ret = srp_map_sg_entry(state, ch, sg, i);
@@ -1435,9 +1451,11 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch,
 			 struct srp_request *req, struct scatterlist *scat,
 			 int count)
 {
+	unsigned int sg_offset = 0;
+
 	state->desc = req->indirect_desc;
 	state->fr.next = req->fr_list;
-	state->fr.end = req->fr_list + ch->target->cmd_sg_cnt;
+	state->fr.end = req->fr_list + ch->target->mr_per_cmd;
 	state->sg = scat;
 
 	if (count == 0)
@@ -1446,7 +1464,7 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch,
 	while (count) {
 		int i, n;
 
-		n = srp_map_finish_fr(state, req, ch, count);
+		n = srp_map_finish_fr(state, req, ch, count, &sg_offset);
 		if (unlikely(n < 0))
 			return n;
 
@@ -1511,9 +1529,10 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req,
 #ifdef CONFIG_NEED_SG_DMA_LENGTH
 		idb_sg->dma_length = idb_sg->length;	      /* hack^2 */
 #endif
-		ret = srp_map_finish_fr(&state, req, ch, 1);
+		ret = srp_map_finish_fr(&state, req, ch, 1, NULL);
 		if (ret < 0)
 			return ret;
+		WARN_ON_ONCE(ret < 1);
 	} else if (dev->use_fmr) {
 		state.pages = idb_pages;
 		state.pages[0] = (req->indirect_dma_addr &
@@ -1531,6 +1550,32 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req,
 	return 0;
 }
 
+#if defined(DYNAMIC_DATA_DEBUG)
+static void srp_check_mapping(struct srp_map_state *state,
+			      struct srp_rdma_ch *ch, struct srp_request *req,
+			      struct scatterlist *scat, int count)
+{
+	struct srp_device *dev = ch->target->srp_host->srp_dev;
+	struct srp_fr_desc **pfr;
+	u64 desc_len = 0, mr_len = 0;
+	int i;
+
+	for (i = 0; i < state->ndesc; i++)
+		desc_len += be32_to_cpu(req->indirect_desc[i].len);
+	if (dev->use_fast_reg)
+		for (i = 0, pfr = req->fr_list; i < state->nmdesc; i++, pfr++)
+			mr_len += (*pfr)->mr->length;
+	else if (dev->use_fmr)
+		for (i = 0; i < state->nmdesc; i++)
+			mr_len += be32_to_cpu(req->indirect_desc[i].len);
+	if (desc_len != scsi_bufflen(req->scmnd) ||
+	    mr_len > scsi_bufflen(req->scmnd))
+		pr_err("Inconsistent: scsi len %d <> desc len %lld <> mr len %lld; ndesc %d; nmdesc = %d\n",
+		       scsi_bufflen(req->scmnd), desc_len, mr_len,
+		       state->ndesc, state->nmdesc);
+}
+#endif
+
 /**
  * srp_map_data() - map SCSI data buffer onto an SRP request
  * @scmnd: SCSI command to map
@@ -1616,6 +1661,15 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
 	if (ret < 0)
 		goto unmap;
 
+#if defined(DYNAMIC_DEBUG)
+	{
+		DEFINE_DYNAMIC_DEBUG_METADATA(ddm,
+			"Memory mapping consistency check");
+		if (unlikely(ddm.flags & _DPRINTK_FLAGS_PRINT))
+			srp_check_mapping(&state, ch, req, scat, count);
+	}
+#endif
+
 	/* We've mapped the request, now pull as much of the indirect
 	 * descriptor table as we can into the command buffer. If this
 	 * target is not using an external indirect table, we are
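This block wires srp_check_mapping() into dynamic debug: the relatively expensive consistency check is compiled in but only runs once the corresponding debug statement has been enabled at runtime, e.g. with something along the lines of echo 'format "Memory mapping consistency check" +p' > /sys/kernel/debug/dynamic_debug/control on a CONFIG_DYNAMIC_DEBUG kernel (invocation assumed, not part of the commit). Note also that this call site is guarded by DYNAMIC_DEBUG while the definition above is guarded by DYNAMIC_DATA_DEBUG; neither macro appears to be defined by the kernel build, so the check compiles away unless one is defined by hand.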
@@ -2580,6 +2634,20 @@ static int srp_reset_host(struct scsi_cmnd *scmnd)
 	return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED;
 }
 
+static int srp_slave_alloc(struct scsi_device *sdev)
+{
+	struct Scsi_Host *shost = sdev->host;
+	struct srp_target_port *target = host_to_target(shost);
+	struct srp_device *srp_dev = target->srp_host->srp_dev;
+	struct ib_device *ibdev = srp_dev->dev;
+
+	if (!(ibdev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG))
+		blk_queue_virt_boundary(sdev->request_queue,
+					~srp_dev->mr_page_mask);
+
+	return 0;
+}
+
 static int srp_slave_configure(struct scsi_device *sdev)
 {
 	struct Scsi_Host *shost = sdev->host;
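srp_slave_alloc() asks the block layer to avoid SG layouts that FR/FMR cannot express: unless the HCA advertises IB_DEVICE_SG_GAPS_REG, adjacent segments of a request must meet on an mr_page_size boundary. A small self-contained sketch of the boundary test, essentially the rule the block layer applies, with an assumed 4 KiB MR page size and example addresses:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long mr_page_size = 4096;		/* assumed HCA page size */
		unsigned long long mr_page_mask = ~(mr_page_size - 1);
		unsigned long long boundary = ~mr_page_mask;	/* 0xfff: what srp_slave_alloc() passes */

		/* Two adjacent segments may share a request only if the first
		 * ends and the second starts on the boundary. */
		unsigned long long prev_end = 0x10000 + 512;	/* example addresses */
		unsigned long long next_start = 0x23000;

		printf("gap violates boundary: %d\n",
		       ((prev_end | next_start) & boundary) != 0);	/* -> 1 */
		return 0;
	}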
@@ -2771,6 +2839,7 @@ static struct scsi_host_template srp_template = {
 	.module				= THIS_MODULE,
 	.name				= "InfiniBand SRP initiator",
 	.proc_name			= DRV_NAME,
+	.slave_alloc			= srp_slave_alloc,
 	.slave_configure		= srp_slave_configure,
 	.info				= srp_target_info,
 	.queuecommand			= srp_queuecommand,
@@ -3177,6 +3246,7 @@ static ssize_t srp_create_target(struct device *dev,
 	struct srp_device *srp_dev = host->srp_dev;
 	struct ib_device *ibdev = srp_dev->dev;
 	int ret, node_idx, node, cpu, i;
+	unsigned int max_sectors_per_mr, mr_per_cmd = 0;
 	bool multich = false;
 
 	target_host = scsi_host_alloc(&srp_template,
@@ -3233,8 +3303,33 @@
 		target->sg_tablesize = target->cmd_sg_cnt;
 	}
 
+	if (srp_dev->use_fast_reg || srp_dev->use_fmr) {
+		/*
+		 * FR and FMR can only map one HCA page per entry. If the
+		 * start address is not aligned on a HCA page boundary two
+		 * entries will be used for the head and the tail although
+		 * these two entries combined contain at most one HCA page of
+		 * data. Hence the "+ 1" in the calculation below.
+		 *
+		 * The indirect data buffer descriptor is contiguous so the
+		 * memory for that buffer will only be registered if
+		 * register_always is true. Hence add one to mr_per_cmd if
+		 * register_always has been set.
+		 */
+		max_sectors_per_mr = srp_dev->max_pages_per_mr <<
+			(ilog2(srp_dev->mr_page_size) - 9);
+		mr_per_cmd = register_always +
+			(target->scsi_host->max_sectors + 1 +
+			 max_sectors_per_mr - 1) / max_sectors_per_mr;
+		pr_debug("max_sectors = %u; max_pages_per_mr = %u; mr_page_size = %u; max_sectors_per_mr = %u; mr_per_cmd = %u\n",
+			 target->scsi_host->max_sectors,
+			 srp_dev->max_pages_per_mr, srp_dev->mr_page_size,
+			 max_sectors_per_mr, mr_per_cmd);
+	}
+
 	target_host->sg_tablesize = target->sg_tablesize;
-	target->mr_pool_size = target->scsi_host->can_queue;
+	target->mr_pool_size = target->scsi_host->can_queue * mr_per_cmd;
+	target->mr_per_cmd = mr_per_cmd;
 	target->indirect_size = target->sg_tablesize *
 		sizeof (struct srp_direct_buf);
 	target->max_iu_len = sizeof (struct srp_cmd) +
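To make the arithmetic above concrete, here is the same computation with assumed example values (not taken from the patch): one MR covering 512 pages of 4 KiB maps 4096 sectors, so a command of 8192 sectors needs three MRs (the "+ 1" covers an unaligned head/tail pair) plus one for the indirect descriptor when register_always is set:

	#include <stdio.h>

	int main(void)
	{
		/* Assumed example values, not from the patch. */
		unsigned int mr_page_size = 4096;	/* 2^12 */
		unsigned int max_pages_per_mr = 512;
		unsigned int max_sectors = 8192;	/* 512-byte sectors */
		unsigned int register_always = 1;
		unsigned int can_queue = 64;

		unsigned int max_sectors_per_mr = max_pages_per_mr << (12 - 9);
		unsigned int mr_per_cmd = register_always +
			(max_sectors + 1 + max_sectors_per_mr - 1) / max_sectors_per_mr;

		printf("max_sectors_per_mr = %u, mr_per_cmd = %u, pool = %u MRs\n",
		       max_sectors_per_mr, mr_per_cmd, mr_per_cmd * can_queue);
		/* -> max_sectors_per_mr = 4096, mr_per_cmd = 4, pool = 256 MRs */
		return 0;
	}

With these numbers the MR pool grows from 64 MRs (the old can_queue-sized pool) to 256, which is what prevents the temporary pool depletion described in the changelog.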
@@ -3441,6 +3536,9 @@ static void srp_add_one(struct ib_device *device)
 	srp_dev->mr_page_mask	= ~((u64) srp_dev->mr_page_size - 1);
 	max_pages_per_mr	= device->attrs.max_mr_size;
 	do_div(max_pages_per_mr, srp_dev->mr_page_size);
+	pr_debug("%s: %llu / %u = %llu <> %u\n", __func__,
+		 device->attrs.max_mr_size, srp_dev->mr_page_size,
+		 max_pages_per_mr, SRP_MAX_PAGES_PER_MR);
 	srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR,
 					  max_pages_per_mr);
 
@@ -3448,12 +3546,13 @@ static void srp_add_one(struct ib_device *device)
 			device->map_phys_fmr && device->unmap_fmr);
 	srp_dev->has_fr = (device->attrs.device_cap_flags &
 			   IB_DEVICE_MEM_MGT_EXTENSIONS);
-	if (!srp_dev->has_fmr && !srp_dev->has_fr)
+	if (!srp_dev->has_fmr && !srp_dev->has_fr) {
 		dev_warn(&device->dev, "neither FMR nor FR is supported\n");
-
-	srp_dev->use_fast_reg = (srp_dev->has_fr &&
-				 (!srp_dev->has_fmr || prefer_fr));
-	srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr;
+	} else if (device->attrs.max_mr_size >= 2 * srp_dev->mr_page_size) {
+		srp_dev->use_fast_reg = (srp_dev->has_fr &&
+					 (!srp_dev->has_fmr || prefer_fr));
+		srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr;
+	}
 
 	if (srp_dev->use_fast_reg) {
 		srp_dev->max_pages_per_mr =
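This last ib_srp.c hunk implements the second paragraph of the changelog: memory registration (FR or FMR) is now only enabled when the device can register at least two pages per memory region (max_mr_size >= 2 * mr_page_size, i.e. at least 8 KiB with 4 KiB HCA pages); otherwise both use_fast_reg and use_fmr stay false and the initiator falls back to the global memory region.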

drivers/infiniband/ulp/srp/ib_srp.h

@@ -203,6 +203,7 @@ struct srp_target_port {
 	unsigned int		scsi_id;
 	unsigned int		sg_tablesize;
 	int			mr_pool_size;
+	int			mr_per_cmd;
 	int			queue_size;
 	int			req_ring_size;
 	int			comp_vector;