IB/srp: Fix srp_map_sg_dma()
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 369a75e..3322ed7 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -70,6 +70,7 @@ static unsigned int indirect_sg_entries;
 static bool allow_ext_sg;
 static bool prefer_fr = true;
 static bool register_always = true;
+static bool never_register;
 static int topspin_workarounds = 1;
 
 module_param(srp_sg_tablesize, uint, 0444);
@@ -99,6 +100,9 @@ module_param(register_always, bool, 0444);
 MODULE_PARM_DESC(register_always,
                 "Use memory registration even for contiguous memory regions");
 
+module_param(never_register, bool, 0444);
+MODULE_PARM_DESC(never_register, "Never register memory");
+
 static const struct kernel_param_ops srp_tmo_ops;
 
 static int srp_reconnect_delay = 10;
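The new never_register parameter is the counterpart of register_always: as the srp_add_one() changes further down show, it makes the driver skip FR/FMR entirely and fall back to the global DMA MR. Since the parameter is read-only (0444) it has to be set at load time, e.g. with "modprobe ib_srp never_register=1".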
@@ -316,7 +320,7 @@ static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target)
        struct ib_fmr_pool_param fmr_param;
 
        memset(&fmr_param, 0, sizeof(fmr_param));
-       fmr_param.pool_size         = target->scsi_host->can_queue;
+       fmr_param.pool_size         = target->mr_pool_size;
        fmr_param.dirty_watermark   = fmr_param.pool_size / 4;
        fmr_param.cache             = 1;
        fmr_param.max_pages_per_fmr = dev->max_pages_per_mr;
@@ -441,23 +445,22 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
 {
        struct srp_device *dev = target->srp_host->srp_dev;
 
-       return srp_create_fr_pool(dev->dev, dev->pd,
-                                 target->scsi_host->can_queue,
+       return srp_create_fr_pool(dev->dev, dev->pd, target->mr_pool_size,
                                  dev->max_pages_per_mr);
 }
 
 /**
  * srp_destroy_qp() - destroy an RDMA queue pair
- * @ch: SRP RDMA channel.
+ * @qp: RDMA queue pair.
  *
 * Drain the qp before destroying it.  This prevents the receive
 * completion handler from accessing the queue pair while it is
 * being destroyed.
  */
-static void srp_destroy_qp(struct srp_rdma_ch *ch)
+static void srp_destroy_qp(struct ib_qp *qp)
 {
-       ib_drain_rq(ch->qp);
-       ib_destroy_qp(ch->qp);
+       ib_drain_rq(qp);
+       ib_destroy_qp(qp);
 }
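ib_drain_rq() moves the queue pair into the error state and blocks until the flush completion of a marker receive work request has been observed, so by the time ib_destroy_qp() runs no receive completion handler can still be executing against the QP.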
 
 static int srp_create_ch_ib(struct srp_rdma_ch *ch)
@@ -469,7 +472,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
        struct ib_qp *qp;
        struct ib_fmr_pool *fmr_pool = NULL;
        struct srp_fr_pool *fr_pool = NULL;
-       const int m = dev->use_fast_reg ? 3 : 1;
+       const int m = 1 + dev->use_fast_reg * target->mr_per_cmd * 2;
        int ret;
 
        init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL);
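The send-queue sizing factor m now tracks the per-command registration budget: besides the single send WR, a command that uses fast registration can post one IB_WR_REG_MR and one IB_WR_LOCAL_INV work request per memory region, and it may use up to target->mr_per_cmd regions. Hence 1 + 2 * mr_per_cmd send-queue slots per command (e.g. 5 for mr_per_cmd = 2) instead of the old fixed 3, which assumed a single MR per command.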
@@ -530,7 +533,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
        }
 
        if (ch->qp)
-               srp_destroy_qp(ch);
+               srp_destroy_qp(ch->qp);
        if (ch->recv_cq)
                ib_free_cq(ch->recv_cq);
        if (ch->send_cq)
@@ -554,7 +557,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
        return 0;
 
 err_qp:
-       srp_destroy_qp(ch);
+       srp_destroy_qp(qp);
 
 err_send_cq:
        ib_free_cq(send_cq);
@@ -597,7 +600,7 @@ static void srp_free_ch_ib(struct srp_target_port *target,
                        ib_destroy_fmr_pool(ch->fmr_pool);
        }
 
-       srp_destroy_qp(ch);
+       srp_destroy_qp(ch->qp);
        ib_free_cq(ch->send_cq);
        ib_free_cq(ch->recv_cq);
 
@@ -850,7 +853,7 @@ static int srp_alloc_req_data(struct srp_rdma_ch *ch)
 
        for (i = 0; i < target->req_ring_size; ++i) {
                req = &ch->req_ring[i];
-               mr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *),
+               mr_list = kmalloc(target->mr_per_cmd * sizeof(void *),
                                  GFP_KERNEL);
                if (!mr_list)
                        goto out;
@@ -1112,7 +1115,7 @@ static struct scsi_cmnd *srp_claim_req(struct srp_rdma_ch *ch,
 }
 
 /**
- * srp_free_req() - Unmap data and add request to the free request list.
+ * srp_free_req() - Unmap data and adjust ch->req_lim.
  * @ch:     SRP RDMA channel.
  * @req:    Request to be freed.
  * @scmnd:  SCSI command associated with @req.
@@ -1299,9 +1302,16 @@ static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc)
        srp_handle_qp_err(cq, wc, "FAST REG");
 }
 
+/*
+ * Map up to sg_nents elements of state->sg, where *sg_offset_p is the offset
+ * at which to start in the first element. If sg_offset_p != NULL then
+ * *sg_offset_p is updated to the offset in state->sg[retval] of the first
+ * byte that has not yet been mapped.
+ */
 static int srp_map_finish_fr(struct srp_map_state *state,
                             struct srp_request *req,
-                            struct srp_rdma_ch *ch, int sg_nents)
+                            struct srp_rdma_ch *ch, int sg_nents,
+                            unsigned int *sg_offset_p)
 {
        struct srp_target_port *target = ch->target;
        struct srp_device *dev = target->srp_host->srp_dev;
@@ -1316,13 +1326,14 @@ static int srp_map_finish_fr(struct srp_map_state *state,
 
        WARN_ON_ONCE(!dev->use_fast_reg);
 
-       if (sg_nents == 0)
-               return 0;
-
        if (sg_nents == 1 && target->global_mr) {
-               srp_map_desc(state, sg_dma_address(state->sg),
-                            sg_dma_len(state->sg),
+               unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
+
+               srp_map_desc(state, sg_dma_address(state->sg) + sg_offset,
+                            sg_dma_len(state->sg) - sg_offset,
                             target->global_mr->rkey);
+               if (sg_offset_p)
+                       *sg_offset_p = 0;
                return 1;
        }
 
@@ -1333,9 +1344,17 @@ static int srp_map_finish_fr(struct srp_map_state *state,
        rkey = ib_inc_rkey(desc->mr->rkey);
        ib_update_fast_reg_key(desc->mr, rkey);
 
-       n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, dev->mr_page_size);
-       if (unlikely(n < 0))
+       n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, sg_offset_p,
+                        dev->mr_page_size);
+       if (unlikely(n < 0)) {
+               srp_fr_pool_put(ch->fr_pool, &desc, 1);
+               pr_debug("%s: ib_map_mr_sg(%d, %d) returned %d.\n",
+                        dev_name(&req->scmnd->device->sdev_gendev), sg_nents,
+                        sg_offset_p ? *sg_offset_p : -1, n);
                return n;
+       }
+
+       WARN_ON_ONCE(desc->mr->length == 0);
 
        req->reg_cqe.done = srp_reg_mr_err_done;
 
@@ -1357,8 +1376,10 @@ static int srp_map_finish_fr(struct srp_map_state *state,
                     desc->mr->length, desc->mr->rkey);
 
        err = ib_post_send(ch->qp, &wr.wr, &bad_wr);
-       if (unlikely(err))
+       if (unlikely(err)) {
+               WARN_ON_ONCE(err == -ENOMEM);
                return err;
+       }
 
        return n;
 }
@@ -1398,7 +1419,7 @@ static int srp_map_sg_entry(struct srp_map_state *state,
        /*
         * If the last entry of the MR wasn't a full page, then we need to
         * close it out and start a new one -- we can only merge at page
-        * boundries.
+        * boundaries.
         */
        ret = 0;
        if (len != dev->mr_page_size)
@@ -1413,10 +1434,9 @@ static int srp_map_sg_fmr(struct srp_map_state *state, struct srp_rdma_ch *ch,
        struct scatterlist *sg;
        int i, ret;
 
-       state->desc = req->indirect_desc;
        state->pages = req->map_page;
        state->fmr.next = req->fmr_list;
-       state->fmr.end = req->fmr_list + ch->target->cmd_sg_cnt;
+       state->fmr.end = req->fmr_list + ch->target->mr_per_cmd;
 
        for_each_sg(scat, sg, count, i) {
                ret = srp_map_sg_entry(state, ch, sg, i);
@@ -1428,8 +1448,6 @@ static int srp_map_sg_fmr(struct srp_map_state *state, struct srp_rdma_ch *ch,
        if (ret)
                return ret;
 
-       req->nmdesc = state->nmdesc;
-
        return 0;
 }
 
@@ -1437,15 +1455,19 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch,
                         struct srp_request *req, struct scatterlist *scat,
                         int count)
 {
-       state->desc = req->indirect_desc;
+       unsigned int sg_offset = 0;
+
        state->fr.next = req->fr_list;
-       state->fr.end = req->fr_list + ch->target->cmd_sg_cnt;
+       state->fr.end = req->fr_list + ch->target->mr_per_cmd;
        state->sg = scat;
 
+       if (count == 0)
+               return 0;
+
        while (count) {
                int i, n;
 
-               n = srp_map_finish_fr(state, req, ch, count);
+               n = srp_map_finish_fr(state, req, ch, count, &sg_offset);
                if (unlikely(n < 0))
                        return n;
 
@@ -1454,8 +1476,6 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch,
                        state->sg = sg_next(state->sg);
        }
 
-       req->nmdesc = state->nmdesc;
-
        return 0;
 }
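Note how the loop now copes with partial mappings: when ib_map_mr_sg() stops short of count elements, sg_offset tells the next srp_map_finish_fr() call where in the current element to resume, and a fresh MR from the pool picks up the remainder.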
 
@@ -1468,15 +1488,12 @@ static int srp_map_sg_dma(struct srp_map_state *state, struct srp_rdma_ch *ch,
        struct scatterlist *sg;
        int i;
 
-       state->desc = req->indirect_desc;
        for_each_sg(scat, sg, count, i) {
                srp_map_desc(state, ib_sg_dma_address(dev->dev, sg),
                             ib_sg_dma_len(dev->dev, sg),
                             target->global_mr->rkey);
        }
 
-       req->nmdesc = state->nmdesc;
-
        return 0;
 }
 
@@ -1509,14 +1526,15 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req,
 
        if (dev->use_fast_reg) {
                state.sg = idb_sg;
-               sg_set_buf(idb_sg, req->indirect_desc, idb_len);
+               sg_init_one(idb_sg, req->indirect_desc, idb_len);
                idb_sg->dma_address = req->indirect_dma_addr; /* hack! */
 #ifdef CONFIG_NEED_SG_DMA_LENGTH
                idb_sg->dma_length = idb_sg->length;          /* hack^2 */
 #endif
-               ret = srp_map_finish_fr(&state, req, ch, 1);
+               ret = srp_map_finish_fr(&state, req, ch, 1, NULL);
                if (ret < 0)
                        return ret;
+               WARN_ON_ONCE(ret < 1);
        } else if (dev->use_fmr) {
                state.pages = idb_pages;
                state.pages[0] = (req->indirect_dma_addr &
@@ -1534,6 +1552,41 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req,
        return 0;
 }
 
+#if defined(DYNAMIC_DEBUG)
+static void srp_check_mapping(struct srp_map_state *state,
+                             struct srp_rdma_ch *ch, struct srp_request *req,
+                             struct scatterlist *scat, int count)
+{
+       struct srp_device *dev = ch->target->srp_host->srp_dev;
+       struct srp_fr_desc **pfr;
+       u64 desc_len = 0, mr_len = 0;
+       int i;
+
+       for (i = 0; i < state->ndesc; i++)
+               desc_len += be32_to_cpu(req->indirect_desc[i].len);
+       if (dev->use_fast_reg)
+               for (i = 0, pfr = req->fr_list; i < state->nmdesc; i++, pfr++)
+                       mr_len += (*pfr)->mr->length;
+       else if (dev->use_fmr)
+               for (i = 0; i < state->nmdesc; i++)
+                       mr_len += be32_to_cpu(req->indirect_desc[i].len);
+       if (desc_len != scsi_bufflen(req->scmnd) ||
+           mr_len > scsi_bufflen(req->scmnd))
+               pr_err("Inconsistent: scsi len %d <> desc len %lld <> mr len %lld; ndesc %d; nmdesc = %d\n",
+                      scsi_bufflen(req->scmnd), desc_len, mr_len,
+                      state->ndesc, state->nmdesc);
+}
+#endif
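+
+/*
+ * Note: srp_check_mapping() is compiled in only when DYNAMIC_DEBUG is
+ * defined and runs only once the dynamic debug statement in srp_map_data()
+ * has been enabled at runtime, e.g. (assuming CONFIG_DYNAMIC_DEBUG and a
+ * mounted debugfs):
+ * echo 'format "Memory mapping consistency check" +p' \
+ *	> /sys/kernel/debug/dynamic_debug/control
+ */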
+
+/**
+ * srp_map_data() - map SCSI data buffer onto an SRP request
+ * @scmnd: SCSI command to map
+ * @ch: SRP RDMA channel
+ * @req: SRP request
+ *
+ * Returns the length in bytes of the SRP_CMD IU or a negative value if
+ * mapping failed.
+ */
 static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
                        struct srp_request *req)
 {
@@ -1600,12 +1653,25 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
                                   target->indirect_size, DMA_TO_DEVICE);
 
        memset(&state, 0, sizeof(state));
+       state.desc = req->indirect_desc;
        if (dev->use_fast_reg)
-               srp_map_sg_fr(&state, ch, req, scat, count);
+               ret = srp_map_sg_fr(&state, ch, req, scat, count);
        else if (dev->use_fmr)
-               srp_map_sg_fmr(&state, ch, req, scat, count);
+               ret = srp_map_sg_fmr(&state, ch, req, scat, count);
        else
-               srp_map_sg_dma(&state, ch, req, scat, count);
+               ret = srp_map_sg_dma(&state, ch, req, scat, count);
+       req->nmdesc = state.nmdesc;
+       if (ret < 0)
+               goto unmap;
+
+#if defined(DYNAMIC_DEBUG)
+       {
+               DEFINE_DYNAMIC_DEBUG_METADATA(ddm,
+                       "Memory mapping consistency check");
+               if (unlikely(ddm.flags & _DPRINTK_FLAGS_PRINT))
+                       srp_check_mapping(&state, ch, req, scat, count);
+       }
+#endif
 
        /* We've mapped the request, now pull as much of the indirect
         * descriptor table as we can into the command buffer. If this
@@ -1628,7 +1694,8 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
                                                !target->allow_ext_sg)) {
                shost_printk(KERN_ERR, target->scsi_host,
                             "Could not fit S/G list into SRP_CMD\n");
-               return -EIO;
+               ret = -EIO;
+               goto unmap;
        }
 
        count = min(state.ndesc, target->cmd_sg_cnt);
@@ -1646,7 +1713,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
                ret = srp_map_idb(ch, req, state.gen.next, state.gen.end,
                                  idb_len, &idb_rkey);
                if (ret < 0)
-                       return ret;
+                       goto unmap;
                req->nmdesc++;
        } else {
                idb_rkey = cpu_to_be32(target->global_mr->rkey);
@@ -1672,6 +1739,12 @@ map_complete:
                cmd->buf_fmt = fmt;
 
        return len;
+
+unmap:
+       srp_unmap_data(scmnd, ch, req);
+       if (ret == -ENOMEM && req->nmdesc >= target->mr_pool_size)
+               ret = -E2BIG;
+       return ret;
 }
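The unified unmap error path ensures that a command that fails halfway through mapping releases whatever MRs it had already registered. Translating -ENOMEM into -E2BIG once the whole pool (mr_pool_size descriptors) has been consumed distinguishes a request that is too large to ever map from a transient allocation failure.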
 
 /*
@@ -2564,6 +2637,20 @@ static int srp_reset_host(struct scsi_cmnd *scmnd)
        return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED;
 }
 
+static int srp_slave_alloc(struct scsi_device *sdev)
+{
+       struct Scsi_Host *shost = sdev->host;
+       struct srp_target_port *target = host_to_target(shost);
+       struct srp_device *srp_dev = target->srp_host->srp_dev;
+       struct ib_device *ibdev = srp_dev->dev;
+
+       if (!(ibdev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG))
+               blk_queue_virt_boundary(sdev->request_queue,
+                                       ~srp_dev->mr_page_mask);
+
+       return 0;
+}
+
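Unless the HCA advertises IB_DEVICE_SG_GAPS_REG (i.e. it can register SG lists with arbitrary gaps), blk_queue_virt_boundary() makes the block layer build requests in which only the first scatterlist element may start, and only the last may end, off an mr_page_size boundary. That is exactly the granularity FMR and FR page lists can express, and it is what makes the mr_per_cmd upper bound computed in srp_create_target() safe.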
 static int srp_slave_configure(struct scsi_device *sdev)
 {
        struct Scsi_Host *shost = sdev->host;
@@ -2755,6 +2842,7 @@ static struct scsi_host_template srp_template = {
        .module                         = THIS_MODULE,
        .name                           = "InfiniBand SRP initiator",
        .proc_name                      = DRV_NAME,
+       .slave_alloc                    = srp_slave_alloc,
        .slave_configure                = srp_slave_configure,
        .info                           = srp_target_info,
        .queuecommand                   = srp_queuecommand,
@@ -2829,7 +2917,7 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target)
                goto out;
        }
 
-       pr_debug(PFX "%s: SCSI scan succeeded - detected %d LUNs\n",
+       pr_debug("%s: SCSI scan succeeded - detected %d LUNs\n",
                 dev_name(&target->scsi_host->shost_gendev),
                 srp_sdev_count(target->scsi_host));
 
@@ -3161,6 +3249,7 @@ static ssize_t srp_create_target(struct device *dev,
        struct srp_device *srp_dev = host->srp_dev;
        struct ib_device *ibdev = srp_dev->dev;
        int ret, node_idx, node, cpu, i;
+       unsigned int max_sectors_per_mr, mr_per_cmd = 0;
        bool multich = false;
 
        target_host = scsi_host_alloc(&srp_template,
@@ -3217,7 +3306,33 @@ static ssize_t srp_create_target(struct device *dev,
                target->sg_tablesize = target->cmd_sg_cnt;
        }
 
+       if (srp_dev->use_fast_reg || srp_dev->use_fmr) {
+               /*
+                * FR and FMR can only map one HCA page per entry. If the
+                * start address is not aligned on a HCA page boundary two
+                * entries will be used for the head and the tail although
+                * these two entries combined contain at most one HCA page of
+                * data. Hence the "+ 1" in the calculation below.
+                *
+                * The indirect data buffer descriptor is contiguous so the
+                * memory for that buffer will only be registered if
+                * register_always is true. Hence add one to mr_per_cmd if
+                * register_always has been set.
+                */
+               max_sectors_per_mr = srp_dev->max_pages_per_mr <<
+                                 (ilog2(srp_dev->mr_page_size) - 9);
+               mr_per_cmd = register_always +
+                       (target->scsi_host->max_sectors + 1 +
+                        max_sectors_per_mr - 1) / max_sectors_per_mr;
+               pr_debug("max_sectors = %u; max_pages_per_mr = %u; mr_page_size = %u; max_sectors_per_mr = %u; mr_per_cmd = %u\n",
+                        target->scsi_host->max_sectors,
+                        srp_dev->max_pages_per_mr, srp_dev->mr_page_size,
+                        max_sectors_per_mr, mr_per_cmd);
+       }
+
        target_host->sg_tablesize = target->sg_tablesize;
+       target->mr_pool_size = target->scsi_host->can_queue * mr_per_cmd;
+       target->mr_per_cmd = mr_per_cmd;
        target->indirect_size = target->sg_tablesize *
                                sizeof (struct srp_direct_buf);
        target->max_iu_len = sizeof (struct srp_cmd) +
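As a sanity check of the formula above, here is a standalone user-space sketch; the device numbers are made up for illustration and mirror what the pr_debug() above would print for such a device:

#include <stdio.h>

/* Illustrative values only: a 4 KiB HCA page size, 256 pages per MR and a
 * 512 KiB (1024-sector) max transfer size, with register_always set. */
int main(void)
{
	unsigned int mr_page_size = 4096;
	unsigned int max_pages_per_mr = 256;
	unsigned int max_sectors = 1024;
	unsigned int register_always = 1;

	/* ilog2(4096) - 9 == 3: one HCA page holds eight 512-byte sectors. */
	unsigned int max_sectors_per_mr =
		max_pages_per_mr << (__builtin_ctz(mr_page_size) - 9);
	/* "+ 1" pays for an unaligned head/tail pair; round up to whole MRs. */
	unsigned int mr_per_cmd = register_always +
		(max_sectors + 1 + max_sectors_per_mr - 1) / max_sectors_per_mr;

	/* Prints: max_sectors_per_mr = 2048, mr_per_cmd = 2 */
	printf("max_sectors_per_mr = %u, mr_per_cmd = %u\n",
	       max_sectors_per_mr, mr_per_cmd);
	return 0;
}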
@@ -3410,21 +3525,10 @@ static void srp_add_one(struct ib_device *device)
        int mr_page_shift, p;
        u64 max_pages_per_mr;
 
-       srp_dev = kmalloc(sizeof *srp_dev, GFP_KERNEL);
+       srp_dev = kzalloc(sizeof(*srp_dev), GFP_KERNEL);
        if (!srp_dev)
                return;
 
-       srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
-                           device->map_phys_fmr && device->unmap_fmr);
-       srp_dev->has_fr = (device->attrs.device_cap_flags &
-                          IB_DEVICE_MEM_MGT_EXTENSIONS);
-       if (!srp_dev->has_fmr && !srp_dev->has_fr)
-               dev_warn(&device->dev, "neither FMR nor FR is supported\n");
-
-       srp_dev->use_fast_reg = (srp_dev->has_fr &&
-                                (!srp_dev->has_fmr || prefer_fr));
-       srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr;
-
        /*
         * Use the smallest page size supported by the HCA, down to a
         * minimum of 4096 bytes. We're unlikely to build large sglists
@@ -3435,8 +3539,25 @@ static void srp_add_one(struct ib_device *device)
        srp_dev->mr_page_mask   = ~((u64) srp_dev->mr_page_size - 1);
        max_pages_per_mr        = device->attrs.max_mr_size;
        do_div(max_pages_per_mr, srp_dev->mr_page_size);
+       pr_debug("%s: %llu / %u = %llu <> %u\n", __func__,
+                device->attrs.max_mr_size, srp_dev->mr_page_size,
+                max_pages_per_mr, SRP_MAX_PAGES_PER_MR);
        srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR,
                                          max_pages_per_mr);
+
+       srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
+                           device->map_phys_fmr && device->unmap_fmr);
+       srp_dev->has_fr = (device->attrs.device_cap_flags &
+                          IB_DEVICE_MEM_MGT_EXTENSIONS);
+       if (!never_register && !srp_dev->has_fmr && !srp_dev->has_fr) {
+               dev_warn(&device->dev, "neither FMR nor FR is supported\n");
+       } else if (!never_register &&
+                  device->attrs.max_mr_size >= 2 * srp_dev->mr_page_size) {
+               srp_dev->use_fast_reg = (srp_dev->has_fr &&
+                                        (!srp_dev->has_fmr || prefer_fr));
+               srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr;
+       }
+
        if (srp_dev->use_fast_reg) {
                srp_dev->max_pages_per_mr =
                        min_t(u32, srp_dev->max_pages_per_mr,
@@ -3456,15 +3577,14 @@ static void srp_add_one(struct ib_device *device)
        if (IS_ERR(srp_dev->pd))
                goto free_dev;
 
-       if (!register_always || (!srp_dev->has_fmr && !srp_dev->has_fr)) {
+       if (never_register || !register_always ||
+           (!srp_dev->has_fmr && !srp_dev->has_fr)) {
                srp_dev->global_mr = ib_get_dma_mr(srp_dev->pd,
                                                   IB_ACCESS_LOCAL_WRITE |
                                                   IB_ACCESS_REMOTE_READ |
                                                   IB_ACCESS_REMOTE_WRITE);
                if (IS_ERR(srp_dev->global_mr))
                        goto err_pd;
-       } else {
-               srp_dev->global_mr = NULL;
        }
 
        for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {