Merge tag 'nfs-for-4.9-1' of git://git.linux-nfs.org/projects/anna/linux-nfs
[cascardo/linux.git] net/sunrpc/xprtrdma/transport.c
index 81f0e87..ed5e285 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -97,7 +97,7 @@ static struct ctl_table xr_tunables_table[] = {
                .data           = &xprt_rdma_max_inline_read,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = proc_dointvec_minmax,
                .extra1         = &min_inline_size,
                .extra2         = &max_inline_size,
        },
@@ -106,7 +106,7 @@ static struct ctl_table xr_tunables_table[] = {
                .data           = &xprt_rdma_max_inline_write,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = proc_dointvec_minmax,
                .extra1         = &min_inline_size,
                .extra2         = &max_inline_size,
        },
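For context (not part of this patch): both entries above already carried .extra1/.extra2 bounds, but plain proc_dointvec ignores them; only proc_dointvec_minmax enforces the range. A minimal sketch with hypothetical names, showing the behavior the switch buys:

	/* Hypothetical example, not from transport.c: with proc_dointvec_minmax,
	 * a write to the sysctl outside [example_min, example_max] is rejected
	 * with -EINVAL, whereas proc_dointvec would have accepted any integer
	 * and left .extra1/.extra2 unused.
	 */
	static unsigned int example_min = 1024;
	static unsigned int example_max = 65536;
	static unsigned int example_value = 4096;

	static struct ctl_table example_table[] = {
		{
			.procname	= "example_tunable",
			.data		= &example_value,
			.maxlen		= sizeof(unsigned int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec_minmax,
			.extra1		= &example_min,
			.extra2		= &example_max,
		},
		{ },
	};
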
@@ -477,115 +477,152 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
        }
 }
 
-/*
- * The RDMA allocate/free functions need the task structure as a place
- * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
- * sequence.
+/* Allocate a fixed-size buffer in which to construct and send the
+ * RPC-over-RDMA header for this request.
+ */
+static bool
+rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+                   gfp_t flags)
+{
+       size_t size = RPCRDMA_HDRBUF_SIZE;
+       struct rpcrdma_regbuf *rb;
+
+       if (req->rl_rdmabuf)
+               return true;
+
+       rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags);
+       if (IS_ERR(rb))
+               return false;
+
+       r_xprt->rx_stats.hardway_register_count += size;
+       req->rl_rdmabuf = rb;
+       return true;
+}
+
+static bool
+rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+                   size_t size, gfp_t flags)
+{
+       struct rpcrdma_regbuf *rb;
+
+       if (req->rl_sendbuf && rdmab_length(req->rl_sendbuf) >= size)
+               return true;
+
+       rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags);
+       if (IS_ERR(rb))
+               return false;
+
+       rpcrdma_free_regbuf(req->rl_sendbuf);
+       r_xprt->rx_stats.hardway_register_count += size;
+       req->rl_sendbuf = rb;
+       return true;
+}
+
+/* The rq_rcv_buf is used only if a Reply chunk is necessary.
+ * The decision to use a Reply chunk is made later in
+ * rpcrdma_marshal_req. This buffer is registered at that time.
  *
- * The RPC layer allocates both send and receive buffers in the same call
- * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer).
- * We may register rq_rcv_buf when using reply chunks.
+ * Otherwise, the associated RPC Reply arrives in a separate
+ * Receive buffer, arbitrarily chosen by the HCA. The buffer
+ * allocated here for the RPC Reply is not utilized in that
+ * case. See rpcrdma_inline_fixup.
+ *
+ * A regbuf is used here to remember the buffer size.
  */
-static void *
-xprt_rdma_allocate(struct rpc_task *task, size_t size)
+static bool
+rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+                   size_t size, gfp_t flags)
 {
-       struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
-       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
        struct rpcrdma_regbuf *rb;
+
+       if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size)
+               return true;
+
+       rb = rpcrdma_alloc_regbuf(size, DMA_NONE, flags);
+       if (IS_ERR(rb))
+               return false;
+
+       rpcrdma_free_regbuf(req->rl_recvbuf);
+       r_xprt->rx_stats.hardway_register_count += size;
+       req->rl_recvbuf = rb;
+       return true;
+}
+
+/**
+ * xprt_rdma_allocate - allocate transport resources for an RPC
+ * @task: RPC task
+ *
+ * Return values:
+ *        0:   Success; rq_buffer points to RPC buffer to use
+ *   ENOMEM:   Out of memory, call again later
+ *      EIO:   A permanent error occurred, do not retry
+ *
+ * The RDMA allocate/free functions need the task structure as a place
+ * to hide the struct rpcrdma_req, which is necessary for the actual
+ * send/recv sequence.
+ *
+ * xprt_rdma_allocate provides buffers that are already mapped for
+ * DMA, and a local DMA lkey is provided for each.
+ */
+static int
+xprt_rdma_allocate(struct rpc_task *task)
+{
+       struct rpc_rqst *rqst = task->tk_rqstp;
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
        struct rpcrdma_req *req;
-       size_t min_size;
        gfp_t flags;
 
        req = rpcrdma_buffer_get(&r_xprt->rx_buf);
        if (req == NULL)
-               return NULL;
+               return -ENOMEM;
 
        flags = RPCRDMA_DEF_GFP;
        if (RPC_IS_SWAPPER(task))
                flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
 
-       if (req->rl_rdmabuf == NULL)
-               goto out_rdmabuf;
-       if (req->rl_sendbuf == NULL)
-               goto out_sendbuf;
-       if (size > req->rl_sendbuf->rg_size)
-               goto out_sendbuf;
-
-out:
-       dprintk("RPC:       %s: size %zd, request 0x%p\n", __func__, size, req);
-       req->rl_connect_cookie = 0;     /* our reserved value */
-       req->rl_task = task;
-       return req->rl_sendbuf->rg_base;
-
-out_rdmabuf:
-       min_size = RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
-       rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags);
-       if (IS_ERR(rb))
+       if (!rpcrdma_get_rdmabuf(r_xprt, req, flags))
                goto out_fail;
-       req->rl_rdmabuf = rb;
-
-out_sendbuf:
-       /* XDR encoding and RPC/RDMA marshaling of this request has not
-        * yet occurred. Thus a lower bound is needed to prevent buffer
-        * overrun during marshaling.
-        *
-        * RPC/RDMA marshaling may choose to send payload bearing ops
-        * inline, if the result is smaller than the inline threshold.
-        * The value of the "size" argument accounts for header
-        * requirements but not for the payload in these cases.
-        *
-        * Likewise, allocate enough space to receive a reply up to the
-        * size of the inline threshold.
-        *
-        * It's unlikely that both the send header and the received
-        * reply will be large, but slush is provided here to allow
-        * flexibility when marshaling.
-        */
-       min_size = RPCRDMA_INLINE_READ_THRESHOLD(task->tk_rqstp);
-       min_size += RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
-       if (size < min_size)
-               size = min_size;
-
-       rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags);
-       if (IS_ERR(rb))
+       if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags))
+               goto out_fail;
+       if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags))
                goto out_fail;
-       rb->rg_owner = req;
 
-       r_xprt->rx_stats.hardway_register_count += size;
-       rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf);
-       req->rl_sendbuf = rb;
-       goto out;
+       dprintk("RPC: %5u %s: send size = %zd, recv size = %zd, req = %p\n",
+               task->tk_pid, __func__, rqst->rq_callsize,
+               rqst->rq_rcvsize, req);
+
+       req->rl_connect_cookie = 0;     /* our reserved value */
+       rpcrdma_set_xprtdata(rqst, req);
+       rqst->rq_buffer = req->rl_sendbuf->rg_base;
+       rqst->rq_rbuffer = req->rl_recvbuf->rg_base;
+       return 0;
 
 out_fail:
        rpcrdma_buffer_put(req);
-       return NULL;
+       return -ENOMEM;
 }
 
-/*
- * This function returns all RDMA resources to the pool.
+/**
+ * xprt_rdma_free - release resources allocated by xprt_rdma_allocate
+ * @task: RPC task
+ *
+ * Caller guarantees rqst->rq_buffer is non-NULL.
  */
 static void
-xprt_rdma_free(void *buffer)
+xprt_rdma_free(struct rpc_task *task)
 {
-       struct rpcrdma_req *req;
-       struct rpcrdma_xprt *r_xprt;
-       struct rpcrdma_regbuf *rb;
-
-       if (buffer == NULL)
-               return;
+       struct rpc_rqst *rqst = task->tk_rqstp;
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+       struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
-       rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]);
-       req = rb->rg_owner;
        if (req->rl_backchannel)
                return;
 
-       r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
-
        dprintk("RPC:       %s: called on 0x%p\n", __func__, req->rl_reply);
 
-       r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req,
-                                           !RPC_IS_ASYNC(req->rl_task));
-
+       ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task));
+       rpcrdma_unmap_sges(ia, req);
        rpcrdma_buffer_put(req);
 }
 
@@ -685,10 +722,11 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
                   r_xprt->rx_stats.failed_marshal_count,
                   r_xprt->rx_stats.bad_reply_count,
                   r_xprt->rx_stats.nomsg_call_count);
-       seq_printf(seq, "%lu %lu %lu\n",
+       seq_printf(seq, "%lu %lu %lu %lu\n",
                   r_xprt->rx_stats.mrs_recovered,
                   r_xprt->rx_stats.mrs_orphaned,
-                  r_xprt->rx_stats.mrs_allocated);
+                  r_xprt->rx_stats.mrs_allocated,
+                  r_xprt->rx_stats.local_inv_needed);
 }
 
 static int