/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * verbs.c
 *
 * Encapsulates the major functions managing:
 *  o adapters
 *  o endpoints
 *  o connections
 *  o buffer memory
 */

#include <linux/interrupt.h>
#include <linux/slab.h>
#include <asm/bitops.h>

#include "xprt_rdma.h"

/*
 * Globals/Macros
 */

#ifdef RPC_DEBUG
# define RPCDBG_FACILITY        RPCDBG_TRANS
#endif

static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);

/*
 * internal functions
 */

/*
 * handle replies in tasklet context, using a single, global list
 * rdma tasklet function -- just turn around and call the func
 * for all replies on the list
 */

static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
static LIST_HEAD(rpcrdma_tasklets_g);

static void
rpcrdma_run_tasklet(unsigned long data)
{
        struct rpcrdma_rep *rep;
        void (*func)(struct rpcrdma_rep *);
        unsigned long flags;

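        /* The tasklet argument is not used; the self-assignment below
         * just keeps the parameter referenced.
         */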
        data = data;
        spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
        while (!list_empty(&rpcrdma_tasklets_g)) {
                rep = list_entry(rpcrdma_tasklets_g.next,
                                 struct rpcrdma_rep, rr_list);
                list_del(&rep->rr_list);
                func = rep->rr_func;
                rep->rr_func = NULL;
                spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);

                if (func)
                        func(rep);
                else
                        rpcrdma_recv_buffer_put(rep);

                spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
        }
        spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
}

static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);

static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
        struct rpcrdma_ep *ep = context;

        dprintk("RPC:       %s: QP error %X on device %s ep %p\n",
                __func__, event->event, event->device->name, context);
        if (ep->rep_connected == 1) {
                ep->rep_connected = -EIO;
                ep->rep_func(ep);
                wake_up_all(&ep->rep_connect_wait);
        }
}

static void
rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
{
        struct rpcrdma_ep *ep = context;

        dprintk("RPC:       %s: CQ error %X on device %s ep %p\n",
                __func__, event->event, event->device->name, context);
        if (ep->rep_connected == 1) {
                ep->rep_connected = -EIO;
                ep->rep_func(ep);
                wake_up_all(&ep->rep_connect_wait);
        }
}

static void
rpcrdma_sendcq_process_wc(struct ib_wc *wc)
{
        struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;

        dprintk("RPC:       %s: frmr %p status %X opcode %d\n",
                __func__, frmr, wc->status, wc->opcode);

        if (wc->wr_id == 0ULL)
                return;
        if (wc->status != IB_WC_SUCCESS)
                frmr->r.frmr.fr_state = FRMR_IS_STALE;
}

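/*
 * Poll the send CQ in batches of RPCRDMA_POLLSIZE completions, up to a
 * total budget of RPCRDMA_WC_BUDGET per upcall, so that one busy CQ
 * cannot monopolize the CPU.
 */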
static int
rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
        struct ib_wc *wcs;
        int budget, count, rc;

        budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
        do {
                wcs = ep->rep_send_wcs;

                rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
                if (rc <= 0)
                        return rc;

                count = rc;
                while (count-- > 0)
                        rpcrdma_sendcq_process_wc(wcs++);
        } while (rc == RPCRDMA_POLLSIZE && --budget);
        return 0;
}

/*
 * Handle send, fast_reg_mr, and local_inv completions.
 *
 * Send events are typically suppressed and thus do not result
 * in an upcall. Occasionally one is signaled, however. This
 * prevents the provider's completion queue from wrapping and
 * losing a completion.
 */
static void
rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
{
        struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
        int rc;

        rc = rpcrdma_sendcq_poll(cq, ep);
        if (rc) {
                dprintk("RPC:       %s: ib_poll_cq failed: %i\n",
                        __func__, rc);
                return;
        }

        rc = ib_req_notify_cq(cq,
                        IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
        if (rc == 0)
                return;
        if (rc < 0) {
                dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
                        __func__, rc);
                return;
        }

        rpcrdma_sendcq_poll(cq, ep);
}

static void
rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
{
        struct rpcrdma_rep *rep =
                        (struct rpcrdma_rep *)(unsigned long)wc->wr_id;

        dprintk("RPC:       %s: rep %p status %X opcode %X length %u\n",
                __func__, rep, wc->status, wc->opcode, wc->byte_len);

        if (wc->status != IB_WC_SUCCESS) {
                rep->rr_len = ~0U;
                goto out_schedule;
        }
        if (wc->opcode != IB_WC_RECV)
                return;

        rep->rr_len = wc->byte_len;
        ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
                        rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);

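        /* A complete RPC/RDMA header is at least 16 bytes (four 32-bit
         * fields). Pick up the credit grant advertised by the server
         * and clamp it to a sane range before the reply handler runs.
         */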
        if (rep->rr_len >= 16) {
                struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
                unsigned int credits = ntohl(p->rm_credit);

                if (credits == 0)
                        credits = 1;    /* don't deadlock */
                else if (credits > rep->rr_buffer->rb_max_requests)
                        credits = rep->rr_buffer->rb_max_requests;
                atomic_set(&rep->rr_buffer->rb_credits, credits);
        }

out_schedule:
        list_add_tail(&rep->rr_list, sched_list);
}

static int
rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
        struct list_head sched_list;
        struct ib_wc *wcs;
        int budget, count, rc;
        unsigned long flags;

        INIT_LIST_HEAD(&sched_list);
        budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
        do {
                wcs = ep->rep_recv_wcs;

                rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
                if (rc <= 0)
                        goto out_schedule;

                count = rc;
                while (count-- > 0)
                        rpcrdma_recvcq_process_wc(wcs++, &sched_list);
        } while (rc == RPCRDMA_POLLSIZE && --budget);
        rc = 0;

out_schedule:
        spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
        list_splice_tail(&sched_list, &rpcrdma_tasklets_g);
        spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
        tasklet_schedule(&rpcrdma_tasklet_g);
        return rc;
}

/*
 * Handle receive completions.
 *
 * This handler is reentrant, but it processes events one at a time to
 * preserve the ordering of receives, on which server credit accounting
 * depends.
 *
 * It is the responsibility of the scheduled tasklet to return
 * recv buffers to the pool. NOTE: this affects synchronization of
 * connection shutdown. That is, the structures required for
 * the completion of the reply handler must remain intact until
 * all memory has been reclaimed.
 */
static void
rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
{
        struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
        int rc;

        rc = rpcrdma_recvcq_poll(cq, ep);
        if (rc) {
                dprintk("RPC:       %s: ib_poll_cq failed: %i\n",
                        __func__, rc);
                return;
        }

        rc = ib_req_notify_cq(cq,
                        IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
        if (rc == 0)
                return;
        if (rc < 0) {
                dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
                        __func__, rc);
                return;
        }

        rpcrdma_recvcq_poll(cq, ep);
}

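/*
 * Drain both completion queues by calling the upcall handlers directly.
 * Flushed receives are handed off to the reply tasklet as usual.
 */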
static void
rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
{
        rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep);
        rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep);
}

#ifdef RPC_DEBUG
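/* Human-readable event names, indexed by enum rdma_cm_event_type */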
static const char * const conn[] = {
        "address resolved",
        "address error",
        "route resolved",
        "route error",
        "connect request",
        "connect response",
        "connect error",
        "unreachable",
        "rejected",
        "established",
        "disconnected",
        "device removal",
        "multicast join",
        "multicast error",
        "address change",
        "timewait exit",
};

#define CONNECTION_MSG(status)                                          \
        ((status) < ARRAY_SIZE(conn) ?                                  \
                conn[(status)] : "unrecognized connection error")
#endif

static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
        struct rpcrdma_xprt *xprt = id->context;
        struct rpcrdma_ia *ia = &xprt->rx_ia;
        struct rpcrdma_ep *ep = &xprt->rx_ep;
#ifdef RPC_DEBUG
        struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
#endif
        struct ib_qp_attr attr;
        struct ib_qp_init_attr iattr;
        int connstate = 0;

        switch (event->event) {
        case RDMA_CM_EVENT_ADDR_RESOLVED:
        case RDMA_CM_EVENT_ROUTE_RESOLVED:
                ia->ri_async_rc = 0;
                complete(&ia->ri_done);
                break;
        case RDMA_CM_EVENT_ADDR_ERROR:
                ia->ri_async_rc = -EHOSTUNREACH;
                dprintk("RPC:       %s: CM address resolution error, ep 0x%p\n",
                        __func__, ep);
                complete(&ia->ri_done);
                break;
        case RDMA_CM_EVENT_ROUTE_ERROR:
                ia->ri_async_rc = -ENETUNREACH;
                dprintk("RPC:       %s: CM route resolution error, ep 0x%p\n",
                        __func__, ep);
                complete(&ia->ri_done);
                break;
        case RDMA_CM_EVENT_ESTABLISHED:
                connstate = 1;
                ib_query_qp(ia->ri_id->qp, &attr,
                        IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
                        &iattr);
                dprintk("RPC:       %s: %d responder resources"
                        " (%d initiator)\n",
                        __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
                goto connected;
        case RDMA_CM_EVENT_CONNECT_ERROR:
                connstate = -ENOTCONN;
                goto connected;
        case RDMA_CM_EVENT_UNREACHABLE:
                connstate = -ENETDOWN;
                goto connected;
        case RDMA_CM_EVENT_REJECTED:
                connstate = -ECONNREFUSED;
                goto connected;
        case RDMA_CM_EVENT_DISCONNECTED:
                connstate = -ECONNABORTED;
                goto connected;
        case RDMA_CM_EVENT_DEVICE_REMOVAL:
                connstate = -ENODEV;
connected:
                atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
                dprintk("RPC:       %s: %sconnected\n",
                                        __func__, connstate > 0 ? "" : "dis");
                ep->rep_connected = connstate;
                ep->rep_func(ep);
                wake_up_all(&ep->rep_connect_wait);
                /*FALLTHROUGH*/
        default:
                dprintk("RPC:       %s: %pI4:%u (ep 0x%p): %s\n",
                        __func__, &addr->sin_addr.s_addr,
                        ntohs(addr->sin_port), ep,
                        CONNECTION_MSG(event->event));
                break;
        }

#ifdef RPC_DEBUG
        if (connstate == 1) {
                int ird = attr.max_dest_rd_atomic;
                int tird = ep->rep_remote_cma.responder_resources;
                printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
                        "on %s, memreg %d slots %d ird %d%s\n",
                        &addr->sin_addr.s_addr,
                        ntohs(addr->sin_port),
                        ia->ri_id->device->name,
                        ia->ri_memreg_strategy,
                        xprt->rx_buf.rb_max_requests,
                        ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
        } else if (connstate < 0) {
                printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
                        &addr->sin_addr.s_addr,
                        ntohs(addr->sin_port),
                        connstate);
        }
#endif

        return 0;
}

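/*
 * Create a connection identifier, then resolve the server's address and
 * a route to it. rpcrdma_conn_upcall() records the outcome of each step
 * in ri_async_rc and completes ri_done; each wait below is bounded by
 * the CM timeout in case that upcall never arrives.
 */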
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
                        struct rpcrdma_ia *ia, struct sockaddr *addr)
{
        struct rdma_cm_id *id;
        int rc;

        init_completion(&ia->ri_done);

        id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
        if (IS_ERR(id)) {
                rc = PTR_ERR(id);
                dprintk("RPC:       %s: rdma_create_id() failed %i\n",
                        __func__, rc);
                return id;
        }

        ia->ri_async_rc = -ETIMEDOUT;
        rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
        if (rc) {
                dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
                        __func__, rc);
                goto out;
        }
        wait_for_completion_interruptible_timeout(&ia->ri_done,
                                msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
        rc = ia->ri_async_rc;
        if (rc)
                goto out;

        ia->ri_async_rc = -ETIMEDOUT;
        rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
        if (rc) {
                dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
                        __func__, rc);
                goto out;
        }
        wait_for_completion_interruptible_timeout(&ia->ri_done,
                                msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
        rc = ia->ri_async_rc;
        if (rc)
                goto out;

        return id;

out:
        rdma_destroy_id(id);
        return ERR_PTR(rc);
}

/*
 * Drain any cq, prior to teardown.
 */
static void
rpcrdma_clean_cq(struct ib_cq *cq)
{
        struct ib_wc wc;
        int count = 0;

        while (1 == ib_poll_cq(cq, 1, &wc))
                ++count;

        if (count)
                dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
                        __func__, count, wc.opcode);
}

/*
 * Exported functions.
 */

/*
 * Open and initialize an Interface Adapter.
 *  o initializes fields of struct rpcrdma_ia, including
 *    interface and provider attributes and protection zone.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
        int rc, mem_priv;
        struct ib_device_attr devattr;
        struct rpcrdma_ia *ia = &xprt->rx_ia;

        ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
        if (IS_ERR(ia->ri_id)) {
                rc = PTR_ERR(ia->ri_id);
                goto out1;
        }

        ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
        if (IS_ERR(ia->ri_pd)) {
                rc = PTR_ERR(ia->ri_pd);
                dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
                        __func__, rc);
                goto out2;
        }

        /*
         * Query the device to determine if the requested memory
         * registration strategy is supported. If it isn't, set the
         * strategy to a globally supported model.
         */
        rc = ib_query_device(ia->ri_id->device, &devattr);
        if (rc) {
                dprintk("RPC:       %s: ib_query_device failed %d\n",
                        __func__, rc);
                goto out2;
        }

        if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
                ia->ri_have_dma_lkey = 1;
                ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
        }

        if (memreg == RPCRDMA_FRMR) {
                /* Requires both frmr reg and local dma lkey */
                if ((devattr.device_cap_flags &
                     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
                    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
                        dprintk("RPC:       %s: FRMR registration "
                                "not supported by HCA\n", __func__);
                        memreg = RPCRDMA_MTHCAFMR;
                } else {
                        /* Mind the ia limit on FRMR page list depth */
                        ia->ri_max_frmr_depth = min_t(unsigned int,
                                RPCRDMA_MAX_DATA_SEGS,
                                devattr.max_fast_reg_page_list_len);
                }
        }
        if (memreg == RPCRDMA_MTHCAFMR) {
                if (!ia->ri_id->device->alloc_fmr) {
                        dprintk("RPC:       %s: MTHCAFMR registration "
                                "not supported by HCA\n", __func__);
                        memreg = RPCRDMA_ALLPHYSICAL;
                }
        }

        /*
         * Optionally obtain an underlying physical identity mapping in
         * order to do a memory window-based bind. This base registration
         * is protected from remote access - that is enabled only by binding
         * for the specific bytes targeted during each RPC operation, and
         * revoked after the corresponding completion similar to a storage
         * adapter.
         */
        switch (memreg) {
        case RPCRDMA_FRMR:
                break;
        case RPCRDMA_ALLPHYSICAL:
                mem_priv = IB_ACCESS_LOCAL_WRITE |
                                IB_ACCESS_REMOTE_WRITE |
                                IB_ACCESS_REMOTE_READ;
                goto register_setup;
        case RPCRDMA_MTHCAFMR:
                if (ia->ri_have_dma_lkey)
                        break;
                mem_priv = IB_ACCESS_LOCAL_WRITE;
        register_setup:
                ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
                if (IS_ERR(ia->ri_bind_mem)) {
                        printk(KERN_ALERT "%s: ib_get_dma_mr for "
                                "phys register failed with %lX\n",
                                __func__, PTR_ERR(ia->ri_bind_mem));
                        rc = -ENOMEM;
                        goto out2;
                }
                break;
        default:
                printk(KERN_ERR "RPC: Unsupported memory "
                                "registration mode: %d\n", memreg);
                rc = -ENOMEM;
                goto out2;
        }
        dprintk("RPC:       %s: memory registration strategy is %d\n",
                __func__, memreg);

        /* Else will do memory reg/dereg for each chunk */
        ia->ri_memreg_strategy = memreg;

        rwlock_init(&ia->ri_qplock);
        return 0;
out2:
        rdma_destroy_id(ia->ri_id);
        ia->ri_id = NULL;
out1:
        return rc;
}

/*
 * Clean up/close an IA.
 *   o if event handles and PD have been initialized, free them.
 *   o close the IA
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
        int rc;

        dprintk("RPC:       %s: entering\n", __func__);
        if (ia->ri_bind_mem != NULL) {
                rc = ib_dereg_mr(ia->ri_bind_mem);
                dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
                        __func__, rc);
        }
        if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
                if (ia->ri_id->qp)
                        rdma_destroy_qp(ia->ri_id);
                rdma_destroy_id(ia->ri_id);
                ia->ri_id = NULL;
        }
        if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
                rc = ib_dealloc_pd(ia->ri_pd);
                dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
                        __func__, rc);
        }
}

/*
 * Create unconnected endpoint.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
                                struct rpcrdma_create_data_internal *cdata)
{
        struct ib_device_attr devattr;
        struct ib_cq *sendcq, *recvcq;
        int rc, err;

        rc = ib_query_device(ia->ri_id->device, &devattr);
        if (rc) {
                dprintk("RPC:       %s: ib_query_device failed %d\n",
                        __func__, rc);
                return rc;
        }

        /* check provider's send/recv wr limits */
        if (cdata->max_requests > devattr.max_qp_wr)
                cdata->max_requests = devattr.max_qp_wr;

        ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
        ep->rep_attr.qp_context = ep;
        /* send_cq and recv_cq initialized below */
        ep->rep_attr.srq = NULL;
        ep->rep_attr.cap.max_send_wr = cdata->max_requests;
        switch (ia->ri_memreg_strategy) {
        case RPCRDMA_FRMR: {
                int depth = 7;

                /* Add room for frmr register and invalidate WRs.
                 * 1. FRMR reg WR for head
                 * 2. FRMR invalidate WR for head
                 * 3. N FRMR reg WRs for pagelist
                 * 4. N FRMR invalidate WRs for pagelist
                 * 5. FRMR reg WR for tail
                 * 6. FRMR invalidate WR for tail
                 * 7. The RDMA_SEND WR
                 */

                /* Calculate N if the device max FRMR depth is smaller than
                 * RPCRDMA_MAX_DATA_SEGS.
                 */
                if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
                        int delta = RPCRDMA_MAX_DATA_SEGS -
                                    ia->ri_max_frmr_depth;

                        do {
                                depth += 2; /* FRMR reg + invalidate */
                                delta -= ia->ri_max_frmr_depth;
                        } while (delta > 0);

                }
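                /* For example, if the device caps the page list depth at
                 * 16 and RPCRDMA_MAX_DATA_SEGS is 64, delta starts at 48,
                 * the loop above runs three times, and depth becomes
                 * 7 + 6 = 13 WRs per RPC.
                 */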
                ep->rep_attr.cap.max_send_wr *= depth;
                if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
                        cdata->max_requests = devattr.max_qp_wr / depth;
                        if (!cdata->max_requests)
                                return -EINVAL;
                        ep->rep_attr.cap.max_send_wr = cdata->max_requests *
                                                       depth;
                }
                break;
        }
        default:
                break;
        }
        ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
        ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
        ep->rep_attr.cap.max_recv_sge = 1;
        ep->rep_attr.cap.max_inline_data = 0;
        ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
        ep->rep_attr.qp_type = IB_QPT_RC;
        ep->rep_attr.port_num = ~0;

        dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
                "iovs: send %d recv %d\n",
                __func__,
                ep->rep_attr.cap.max_send_wr,
                ep->rep_attr.cap.max_recv_wr,
                ep->rep_attr.cap.max_send_sge,
                ep->rep_attr.cap.max_recv_sge);

        /* set trigger for requesting send completion */
        ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
        if (ep->rep_cqinit <= 2)
                ep->rep_cqinit = 0;
        INIT_CQCOUNT(ep);
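        /* rep_cqinit is roughly half the send queue depth: the send path
         * counts posted WRs down from it (see DECR_CQCOUNT) and requests
         * a signaled completion when the count is exhausted, so the send
         * CQ cannot wrap unnoticed while most sends remain unsignaled.
         */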
        ep->rep_ia = ia;
        init_waitqueue_head(&ep->rep_connect_wait);
        INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);

        sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
                                  rpcrdma_cq_async_error_upcall, ep,
                                  ep->rep_attr.cap.max_send_wr + 1, 0);
        if (IS_ERR(sendcq)) {
                rc = PTR_ERR(sendcq);
                dprintk("RPC:       %s: failed to create send CQ: %i\n",
                        __func__, rc);
                goto out1;
        }

        rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
        if (rc) {
                dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
                        __func__, rc);
                goto out2;
        }

        recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
                                  rpcrdma_cq_async_error_upcall, ep,
                                  ep->rep_attr.cap.max_recv_wr + 1, 0);
        if (IS_ERR(recvcq)) {
                rc = PTR_ERR(recvcq);
                dprintk("RPC:       %s: failed to create recv CQ: %i\n",
                        __func__, rc);
                goto out2;
        }

        rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
        if (rc) {
                dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
                        __func__, rc);
                ib_destroy_cq(recvcq);
                goto out2;
        }

        ep->rep_attr.send_cq = sendcq;
        ep->rep_attr.recv_cq = recvcq;

        /* Initialize cma parameters */

        /* RPC/RDMA does not use private data */
        ep->rep_remote_cma.private_data = NULL;
        ep->rep_remote_cma.private_data_len = 0;

        /* Client offers RDMA Read but does not initiate */
        ep->rep_remote_cma.initiator_depth = 0;
        if (devattr.max_qp_rd_atom > 32)        /* arbitrary but <= 255 */
                ep->rep_remote_cma.responder_resources = 32;
        else
                ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;

        ep->rep_remote_cma.retry_count = 7;
        ep->rep_remote_cma.flow_control = 0;
        ep->rep_remote_cma.rnr_retry_count = 0;

        return 0;

out2:
        err = ib_destroy_cq(sendcq);
        if (err)
                dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
                        __func__, err);
out1:
        return rc;
}

/*
 * rpcrdma_ep_destroy
 *
 * Disconnect and destroy endpoint. After this, the only
 * valid operations on the ep are to free it (if dynamically
 * allocated) or re-create it.
 */
void
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
        int rc;

        dprintk("RPC:       %s: entering, connected is %d\n",
                __func__, ep->rep_connected);

        cancel_delayed_work_sync(&ep->rep_connect_worker);

        if (ia->ri_id->qp) {
                rpcrdma_ep_disconnect(ep, ia);
                rdma_destroy_qp(ia->ri_id);
                ia->ri_id->qp = NULL;
        }

        /* padding - could be done in rpcrdma_buffer_destroy... */
        if (ep->rep_pad_mr) {
                rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
                ep->rep_pad_mr = NULL;
        }

        rpcrdma_clean_cq(ep->rep_attr.recv_cq);
        rc = ib_destroy_cq(ep->rep_attr.recv_cq);
        if (rc)
                dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
                        __func__, rc);

        rpcrdma_clean_cq(ep->rep_attr.send_cq);
        rc = ib_destroy_cq(ep->rep_attr.send_cq);
        if (rc)
                dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
                        __func__, rc);
}

/*
 * Connect unconnected endpoint.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
        struct rdma_cm_id *id, *old;
        int rc = 0;
        int retry_count = 0;

        if (ep->rep_connected != 0) {
                struct rpcrdma_xprt *xprt;
retry:
                dprintk("RPC:       %s: reconnecting...\n", __func__);

                rpcrdma_ep_disconnect(ep, ia);
                rpcrdma_flush_cqs(ep);

                if (ia->ri_memreg_strategy == RPCRDMA_FRMR)
                        rpcrdma_reset_frmrs(ia);

                xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
                id = rpcrdma_create_id(xprt, ia,
                                (struct sockaddr *)&xprt->rx_data.addr);
                if (IS_ERR(id)) {
                        rc = -EHOSTUNREACH;
                        goto out;
                }
                /* TEMP TEMP TEMP - fail if new device:
                 * Deregister/remarshal *all* requests!
                 * Close and recreate adapter, pd, etc!
                 * Re-determine all attributes still sane!
                 * More stuff I haven't thought of!
                 * Rrrgh!
                 */
                if (ia->ri_id->device != id->device) {
                        printk("RPC:       %s: can't reconnect on "
                                "different device!\n", __func__);
                        rdma_destroy_id(id);
                        rc = -ENETUNREACH;
                        goto out;
                }
                /* END TEMP */
                rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
                if (rc) {
                        dprintk("RPC:       %s: rdma_create_qp failed %i\n",
                                __func__, rc);
                        rdma_destroy_id(id);
                        rc = -ENETUNREACH;
                        goto out;
                }

                write_lock(&ia->ri_qplock);
                old = ia->ri_id;
                ia->ri_id = id;
                write_unlock(&ia->ri_qplock);

                rdma_destroy_qp(old);
                rdma_destroy_id(old);
        } else {
                dprintk("RPC:       %s: connecting...\n", __func__);
                rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
                if (rc) {
                        dprintk("RPC:       %s: rdma_create_qp failed %i\n",
                                __func__, rc);
                        /* do not update ep->rep_connected */
                        return -ENETUNREACH;
                }
        }

        ep->rep_connected = 0;

        rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
        if (rc) {
                dprintk("RPC:       %s: rdma_connect() failed with %i\n",
                                __func__, rc);
                goto out;
        }

        wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

        /*
         * Check the connection state. A non-peer reject indicates no
         * listener (ECONNREFUSED), which may be a transient state. All
         * other errors indicate a transport condition that has already
         * had a best-effort recovery attempt.
         */
        if (ep->rep_connected == -ECONNREFUSED &&
            ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
                dprintk("RPC:       %s: non-peer_reject, retry\n", __func__);
                goto retry;
        }
        if (ep->rep_connected <= 0) {
                /* Sometimes, the only way to reliably connect to remote
                 * CMs is to use same nonzero values for ORD and IRD. */
                if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
                    (ep->rep_remote_cma.responder_resources == 0 ||
                     ep->rep_remote_cma.initiator_depth !=
                                ep->rep_remote_cma.responder_resources)) {
                        if (ep->rep_remote_cma.responder_resources == 0)
                                ep->rep_remote_cma.responder_resources = 1;
                        ep->rep_remote_cma.initiator_depth =
                                ep->rep_remote_cma.responder_resources;
                        goto retry;
                }
                rc = ep->rep_connected;
        } else {
                dprintk("RPC:       %s: connected\n", __func__);
        }

out:
        if (rc)
                ep->rep_connected = rc;
        return rc;
}

/*
 * rpcrdma_ep_disconnect
 *
 * This is separate from destroy to facilitate the ability
 * to reconnect without recreating the endpoint.
 *
 * This call is not reentrant, and must not be made in parallel
 * on the same endpoint.
 */
void
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
        int rc;

        rpcrdma_flush_cqs(ep);
        rc = rdma_disconnect(ia->ri_id);
        if (!rc) {
                /* returns without wait if not connected */
                wait_event_interruptible(ep->rep_connect_wait,
                                                        ep->rep_connected != 1);
                dprintk("RPC:       %s: after wait, %sconnected\n", __func__,
                        (ep->rep_connected == 1) ? "still " : "dis");
        } else {
                dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
                ep->rep_connected = rc;
        }
}

static int
rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
{
        int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
        struct ib_fmr_attr fmr_attr = {
                .max_pages      = RPCRDMA_MAX_DATA_SEGS,
                .max_maps       = 1,
                .page_shift     = PAGE_SHIFT
        };
        struct rpcrdma_mw *r;
        int i, rc;

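        /* Allocate enough MWs for every credit to map a full set of
         * RPCRDMA_MAX_SEGS segments, plus one spare request's worth.
         */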
        i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
        dprintk("RPC:       %s: initializing %d FMRs\n", __func__, i);

        while (i--) {
                r = kzalloc(sizeof(*r), GFP_KERNEL);
                if (r == NULL)
                        return -ENOMEM;

                r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
                if (IS_ERR(r->r.fmr)) {
                        rc = PTR_ERR(r->r.fmr);
                        dprintk("RPC:       %s: ib_alloc_fmr failed %i\n",
                                __func__, rc);
                        goto out_free;
                }

                list_add(&r->mw_list, &buf->rb_mws);
                list_add(&r->mw_all, &buf->rb_all);
        }
        return 0;

out_free:
        kfree(r);
        return rc;
}

static int
rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
{
        struct rpcrdma_frmr *f;
        struct rpcrdma_mw *r;
        int i, rc;

        i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
        dprintk("RPC:       %s: initializing %d FRMRs\n", __func__, i);

        while (i--) {
                r = kzalloc(sizeof(*r), GFP_KERNEL);
                if (r == NULL)
                        return -ENOMEM;
                f = &r->r.frmr;

                f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
                                                ia->ri_max_frmr_depth);
                if (IS_ERR(f->fr_mr)) {
                        rc = PTR_ERR(f->fr_mr);
                        dprintk("RPC:       %s: ib_alloc_fast_reg_mr "
                                "failed %i\n", __func__, rc);
                        goto out_free;
                }

                f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
                                                        ia->ri_max_frmr_depth);
                if (IS_ERR(f->fr_pgl)) {
                        rc = PTR_ERR(f->fr_pgl);
                        dprintk("RPC:       %s: ib_alloc_fast_reg_page_list "
                                "failed %i\n", __func__, rc);

                        ib_dereg_mr(f->fr_mr);
                        goto out_free;
                }

                list_add(&r->mw_list, &buf->rb_mws);
                list_add(&r->mw_all, &buf->rb_all);
        }

        return 0;

out_free:
        kfree(r);
        return rc;
}

int
rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
        struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
{
        char *p;
        size_t len, rlen, wlen;
        int i, rc;

        buf->rb_max_requests = cdata->max_requests;
        spin_lock_init(&buf->rb_lock);
        atomic_set(&buf->rb_credits, 1);

        /* Need to allocate:
         *   1.  arrays for send and recv pointers
         *   2.  arrays of struct rpcrdma_req to fill in pointers
         *   3.  array of struct rpcrdma_rep for replies
         *   4.  padding, if any
         * Send/recv buffers in req/rep need to be registered
         */
        len = buf->rb_max_requests *
                (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
        len += cdata->padding;

        p = kzalloc(len, GFP_KERNEL);
        if (p == NULL) {
                dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
                        __func__, len);
                rc = -ENOMEM;
                goto out;
        }
        buf->rb_pool = p;       /* for freeing it later */

        buf->rb_send_bufs = (struct rpcrdma_req **) p;
        p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
        buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
        p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
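        /* p now points just past the two pointer arrays; the optional
         * padding buffer registered below follows them in rb_pool.
         */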

        /*
         * Register the zeroed pad buffer, if any.
         */
        if (cdata->padding) {
                rc = rpcrdma_register_internal(ia, p, cdata->padding,
                                            &ep->rep_pad_mr, &ep->rep_pad);
                if (rc)
                        goto out;
        }
        p += cdata->padding;

        INIT_LIST_HEAD(&buf->rb_mws);
        INIT_LIST_HEAD(&buf->rb_all);
        switch (ia->ri_memreg_strategy) {
        case RPCRDMA_FRMR:
                rc = rpcrdma_init_frmrs(ia, buf);
                if (rc)
                        goto out;
                break;
        case RPCRDMA_MTHCAFMR:
                rc = rpcrdma_init_fmrs(ia, buf);
                if (rc)
                        goto out;
                break;
        default:
                break;
        }

        /*
         * Allocate/init the request/reply buffers. Doing this
         * using kmalloc for now -- one for each buf.
         */
        wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
        rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
        dprintk("RPC:       %s: wlen = %zu, rlen = %zu\n",
                __func__, wlen, rlen);

        for (i = 0; i < buf->rb_max_requests; i++) {
                struct rpcrdma_req *req;
                struct rpcrdma_rep *rep;

                req = kmalloc(wlen, GFP_KERNEL);
                if (req == NULL) {
                        dprintk("RPC:       %s: request buffer %d alloc"
                                " failed\n", __func__, i);
                        rc = -ENOMEM;
                        goto out;
                }
                memset(req, 0, sizeof(struct rpcrdma_req));
                buf->rb_send_bufs[i] = req;
                buf->rb_send_bufs[i]->rl_buffer = buf;

                rc = rpcrdma_register_internal(ia, req->rl_base,
                                wlen - offsetof(struct rpcrdma_req, rl_base),
                                &buf->rb_send_bufs[i]->rl_handle,
                                &buf->rb_send_bufs[i]->rl_iov);
                if (rc)
                        goto out;

                buf->rb_send_bufs[i]->rl_size = wlen -
                                                sizeof(struct rpcrdma_req);

                rep = kmalloc(rlen, GFP_KERNEL);
                if (rep == NULL) {
                        dprintk("RPC:       %s: reply buffer %d alloc failed\n",
                                __func__, i);
                        rc = -ENOMEM;
                        goto out;
                }
                memset(rep, 0, sizeof(struct rpcrdma_rep));
                buf->rb_recv_bufs[i] = rep;
                buf->rb_recv_bufs[i]->rr_buffer = buf;

                rc = rpcrdma_register_internal(ia, rep->rr_base,
                                rlen - offsetof(struct rpcrdma_rep, rr_base),
                                &buf->rb_recv_bufs[i]->rr_handle,
                                &buf->rb_recv_bufs[i]->rr_iov);
                if (rc)
                        goto out;

        }
        dprintk("RPC:       %s: max_requests %d\n",
                __func__, buf->rb_max_requests);
        /* done */
        return 0;
out:
        rpcrdma_buffer_destroy(buf);
        return rc;
}

static void
rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
{
        struct rpcrdma_mw *r;
        int rc;

        while (!list_empty(&buf->rb_all)) {
                r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
                list_del(&r->mw_all);
                list_del(&r->mw_list);

                rc = ib_dealloc_fmr(r->r.fmr);
                if (rc)
                        dprintk("RPC:       %s: ib_dealloc_fmr failed %i\n",
                                __func__, rc);

                kfree(r);
        }
}

static void
rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
{
        struct rpcrdma_mw *r;
        int rc;

        while (!list_empty(&buf->rb_all)) {
                r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
                list_del(&r->mw_all);
                list_del(&r->mw_list);

                rc = ib_dereg_mr(r->r.frmr.fr_mr);
                if (rc)
                        dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
                                __func__, rc);
                ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);

                kfree(r);
        }
}

void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
        struct rpcrdma_ia *ia = rdmab_to_ia(buf);
        int i;

        /* clean up in reverse order from create
         *   1.  recv mr memory (mr free, then kfree)
         *   2.  send mr memory (mr free, then kfree)
         *   3.  MWs
         */
        dprintk("RPC:       %s: entering\n", __func__);

        for (i = 0; i < buf->rb_max_requests; i++) {
                if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
                        rpcrdma_deregister_internal(ia,
                                        buf->rb_recv_bufs[i]->rr_handle,
                                        &buf->rb_recv_bufs[i]->rr_iov);
                        kfree(buf->rb_recv_bufs[i]);
                }
                if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
                        rpcrdma_deregister_internal(ia,
                                        buf->rb_send_bufs[i]->rl_handle,
                                        &buf->rb_send_bufs[i]->rl_iov);
                        kfree(buf->rb_send_bufs[i]);
                }
        }

        switch (ia->ri_memreg_strategy) {
        case RPCRDMA_FRMR:
                rpcrdma_destroy_frmrs(buf);
                break;
        case RPCRDMA_MTHCAFMR:
                rpcrdma_destroy_fmrs(buf);
                break;
        default:
                break;
        }

        kfree(buf->rb_pool);
}

/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
 * an unusable state. Find FRMRs in this state and dereg / reg
 * each.  FRMRs that are VALID and attached to an rpcrdma_req are
 * also torn down.
 *
 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
 *
 * This is invoked only in the transport connect worker in order
 * to serialize with rpcrdma_register_frmr_external().
 */
static void
rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
{
        struct rpcrdma_xprt *r_xprt =
                                container_of(ia, struct rpcrdma_xprt, rx_ia);
        struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
        struct list_head *pos;
        struct rpcrdma_mw *r;
        int rc;

        list_for_each(pos, &buf->rb_all) {
                r = list_entry(pos, struct rpcrdma_mw, mw_all);

                if (r->r.frmr.fr_state == FRMR_IS_INVALID)
                        continue;

                rc = ib_dereg_mr(r->r.frmr.fr_mr);
                if (rc)
                        dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
                                __func__, rc);
                ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);

                r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
                                        ia->ri_max_frmr_depth);
                if (IS_ERR(r->r.frmr.fr_mr)) {
                        rc = PTR_ERR(r->r.frmr.fr_mr);
                        dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
                                " failed %i\n", __func__, rc);
                        continue;
                }
                r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
                                        ia->ri_id->device,
                                        ia->ri_max_frmr_depth);
                if (IS_ERR(r->r.frmr.fr_pgl)) {
                        rc = PTR_ERR(r->r.frmr.fr_pgl);
                        dprintk("RPC:       %s: "
                                "ib_alloc_fast_reg_page_list "
                                "failed %i\n", __func__, rc);

                        ib_dereg_mr(r->r.frmr.fr_mr);
                        continue;
                }
                r->r.frmr.fr_state = FRMR_IS_INVALID;
        }
}

/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
 * some req segments uninitialized.
 */
static void
rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
{
        if (*mw) {
                list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
                *mw = NULL;
        }
}

/* Cycle mw's back in reverse order, and "spin" them.
 * This delays and scrambles reuse as much as possible.
 */
static void
rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
{
        struct rpcrdma_mr_seg *seg = req->rl_segments;
        struct rpcrdma_mr_seg *seg1 = seg;
        int i;

        for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
                rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf);
        rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf);
}

static void
rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
{
        buf->rb_send_bufs[--buf->rb_send_index] = req;
        req->rl_niovs = 0;
        if (req->rl_reply) {
                buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
                req->rl_reply->rr_func = NULL;
                req->rl_reply = NULL;
        }
}

/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
 * Redo only the ib_post_send().
 */
static void
rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
{
        struct rpcrdma_xprt *r_xprt =
                                container_of(ia, struct rpcrdma_xprt, rx_ia);
        struct ib_send_wr invalidate_wr, *bad_wr;
        int rc;

        dprintk("RPC:       %s: FRMR %p is stale\n", __func__, r);

        /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
        r->r.frmr.fr_state = FRMR_IS_INVALID;

        memset(&invalidate_wr, 0, sizeof(invalidate_wr));
        invalidate_wr.wr_id = (unsigned long)(void *)r;
        invalidate_wr.opcode = IB_WR_LOCAL_INV;
        invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
        DECR_CQCOUNT(&r_xprt->rx_ep);

        dprintk("RPC:       %s: frmr %p invalidating rkey %08x\n",
                __func__, r, r->r.frmr.fr_mr->rkey);

        read_lock(&ia->ri_qplock);
        rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
        read_unlock(&ia->ri_qplock);
        if (rc) {
                /* Force rpcrdma_buffer_get() to retry */
                r->r.frmr.fr_state = FRMR_IS_STALE;
                dprintk("RPC:       %s: ib_post_send failed, %i\n",
                        __func__, rc);
        }
}

static void
rpcrdma_retry_flushed_linv(struct list_head *stale,
                           struct rpcrdma_buffer *buf)
{
        struct rpcrdma_ia *ia = rdmab_to_ia(buf);
        struct list_head *pos;
        struct rpcrdma_mw *r;
        unsigned long flags;

        list_for_each(pos, stale) {
                r = list_entry(pos, struct rpcrdma_mw, mw_list);
                rpcrdma_retry_local_inv(r, ia);
        }

        spin_lock_irqsave(&buf->rb_lock, flags);
        list_splice_tail(stale, &buf->rb_mws);
        spin_unlock_irqrestore(&buf->rb_lock, flags);
}

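/*
 * Fill req->rl_segments[] from rb_mws. FRMRs whose LOCAL_INV was
 * flushed by a disconnect are set aside on @stale so the invalidate
 * can be retried once rb_lock has been dropped.
 */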
1440 static struct rpcrdma_req *
1441 rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
1442                          struct list_head *stale)
1443 {
1444         struct rpcrdma_mw *r;
1445         int i;
1446
1447         i = RPCRDMA_MAX_SEGS - 1;
1448         while (!list_empty(&buf->rb_mws)) {
1449                 r = list_entry(buf->rb_mws.next,
1450                                struct rpcrdma_mw, mw_list);
1451                 list_del(&r->mw_list);
1452                 if (r->r.frmr.fr_state == FRMR_IS_STALE) {
1453                         list_add(&r->mw_list, stale);
1454                         continue;
1455                 }
1456                 req->rl_segments[i].mr_chunk.rl_mw = r;
1457                 if (unlikely(i-- == 0))
1458                         return req;     /* Success */
1459         }
1460
1461         /* Not enough entries on rb_mws for this req */
1462         rpcrdma_buffer_put_sendbuf(req, buf);
1463         rpcrdma_buffer_put_mrs(req, buf);
1464         return NULL;
1465 }
1466
1467 static struct rpcrdma_req *
1468 rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1469 {
1470         struct rpcrdma_mw *r;
1471         int i;
1472
1473         i = RPCRDMA_MAX_SEGS - 1;
1474         while (!list_empty(&buf->rb_mws)) {
1475                 r = list_entry(buf->rb_mws.next,
1476                                struct rpcrdma_mw, mw_list);
1477                 list_del(&r->mw_list);
1478                 req->rl_segments[i].mr_chunk.rl_mw = r;
1479                 if (unlikely(i-- == 0))
1480                         return req;     /* Success */
1481         }
1482
1483         /* Not enough entries on rb_mws for this req */
1484         rpcrdma_buffer_put_sendbuf(req, buf);
1485         rpcrdma_buffer_put_mrs(req, buf);
1486         return NULL;
1487 }
1488
1489 /*
1490  * Get a set of request/reply buffers.
1491  *
1492  * Reply buffer (if needed) is attached to send buffer upon return.
1493  * Rule:
1494  *    rb_send_index and rb_recv_index MUST always be pointing to the
1495  *    *next* available buffer (non-NULL). They are incremented after
1496  *    removing buffers, and decremented *before* returning them.
1497  */
1498 struct rpcrdma_req *
1499 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1500 {
1501         struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1502         struct list_head stale;
1503         struct rpcrdma_req *req;
1504         unsigned long flags;
1505
1506         spin_lock_irqsave(&buffers->rb_lock, flags);
1507         if (buffers->rb_send_index == buffers->rb_max_requests) {
1508                 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1509                 dprintk("RPC:       %s: out of request buffers\n", __func__);
1510                 return NULL;
1511         }
1512
1513         req = buffers->rb_send_bufs[buffers->rb_send_index];
1514         if (buffers->rb_send_index < buffers->rb_recv_index) {
1515                 dprintk("RPC:       %s: %d extra receives outstanding (ok)\n",
1516                         __func__,
1517                         buffers->rb_recv_index - buffers->rb_send_index);
1518                 req->rl_reply = NULL;
1519         } else {
1520                 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1521                 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1522         }
1523         buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1524
1525         INIT_LIST_HEAD(&stale);
1526         switch (ia->ri_memreg_strategy) {
1527         case RPCRDMA_FRMR:
1528                 req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
1529                 break;
1530         case RPCRDMA_MTHCAFMR:
1531                 req = rpcrdma_buffer_get_fmrs(req, buffers);
1532                 break;
1533         default:
1534                 break;
1535         }
1536         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1537         if (!list_empty(&stale))
1538                 rpcrdma_retry_flushed_linv(&stale, buffers);
1539         return req;
1540 }
1541
1542 /*
1543  * Put request/reply buffers back into pool.
1544  * Pre-decrement counter/array index.
1545  */
1546 void
1547 rpcrdma_buffer_put(struct rpcrdma_req *req)
1548 {
1549         struct rpcrdma_buffer *buffers = req->rl_buffer;
1550         struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1551         unsigned long flags;
1552
1553         spin_lock_irqsave(&buffers->rb_lock, flags);
1554         rpcrdma_buffer_put_sendbuf(req, buffers);
1555         switch (ia->ri_memreg_strategy) {
1556         case RPCRDMA_FRMR:
1557         case RPCRDMA_MTHCAFMR:
1558                 rpcrdma_buffer_put_mrs(req, buffers);
1559                 break;
1560         default:
1561                 break;
1562         }
1563         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1564 }
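/*
 * Illustrative sketch only: how a caller might pair rpcrdma_buffer_get()
 * with rpcrdma_buffer_put().  The function name is hypothetical; it simply
 * shows the indexing rule documented above in action (get advances
 * rb_send_index/rb_recv_index, put walks them back).
 */
#if 0
static int rpcrdma_buffer_pair_example(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_req *req;

	req = rpcrdma_buffer_get(buffers);	/* takes rb_lock internally */
	if (req == NULL)
		return -ENOMEM;			/* send buffer pool exhausted */

	/* ... marshal and post the RPC here ... */

	rpcrdma_buffer_put(req);	/* returns req, any attached reply, and reserved MWs */
	return 0;
}
#endif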
1565
1566 /*
1567  * Recover reply buffers from pool.
1568  * This happens when recovering from error conditions.
1569  * Post-increment counter/array index.
1570  */
1571 void
1572 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1573 {
1574         struct rpcrdma_buffer *buffers = req->rl_buffer;
1575         unsigned long flags;
1576
1577         if (req->rl_iov.length == 0)    /* oversized buffer from xprt_rdma_allocate(): rl_buffer points at the base req */
1578                 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1579         spin_lock_irqsave(&buffers->rb_lock, flags);
1580         if (buffers->rb_recv_index < buffers->rb_max_requests) {
1581                 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1582                 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1583         }
1584         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1585 }
1586
1587 /*
1588  * Put reply buffers back into pool when not attached to
1589  * request. This happens in error conditions.
1590  */
1591 void
1592 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1593 {
1594         struct rpcrdma_buffer *buffers = rep->rr_buffer;
1595         unsigned long flags;
1596
1597         rep->rr_func = NULL;
1598         spin_lock_irqsave(&buffers->rb_lock, flags);
1599         buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1600         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1601 }
1602
1603 /*
1604  * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1605  */
1606
1607 int
1608 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1609                                 struct ib_mr **mrp, struct ib_sge *iov)
1610 {
1611         struct ib_phys_buf ipb;
1612         struct ib_mr *mr;
1613         int rc;
1614
1615         /*
1616          * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1617          */
1618         iov->addr = ib_dma_map_single(ia->ri_id->device,
1619                         va, len, DMA_BIDIRECTIONAL);
1620         if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
1621                 return -ENOMEM;
1622
1623         iov->length = len;
1624
1625         if (ia->ri_have_dma_lkey) {
1626                 *mrp = NULL;
1627                 iov->lkey = ia->ri_dma_lkey;
1628                 return 0;
1629         } else if (ia->ri_bind_mem != NULL) {
1630                 *mrp = NULL;
1631                 iov->lkey = ia->ri_bind_mem->lkey;
1632                 return 0;
1633         }
1634
1635         ipb.addr = iov->addr;
1636         ipb.size = iov->length;
1637         mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1638                         IB_ACCESS_LOCAL_WRITE, &iov->addr);
1639
1640         dprintk("RPC:       %s: phys convert: 0x%llx "
1641                         "registered 0x%llx length %d\n",
1642                         __func__, (unsigned long long)ipb.addr,
1643                         (unsigned long long)iov->addr, len);
1644
1645         if (IS_ERR(mr)) {
1646                 *mrp = NULL;
1647                 rc = PTR_ERR(mr);
1648                 dprintk("RPC:       %s: failed with %i\n", __func__, rc);
1649         } else {
1650                 *mrp = mr;
1651                 iov->lkey = mr->lkey;
1652                 rc = 0;
1653         }
1654
1655         return rc;
1656 }
1657
1658 int
1659 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1660                                 struct ib_mr *mr, struct ib_sge *iov)
1661 {
1662         int rc;
1663
1664         ib_dma_unmap_single(ia->ri_id->device,
1665                         iov->addr, iov->length, DMA_BIDIRECTIONAL);
1666
1667         if (mr == NULL)
1668                 return 0;
1669
1670         rc = ib_dereg_mr(mr);
1671         if (rc)
1672                 dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
1673         return rc;
1674 }
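/*
 * Illustrative sketch only: a hypothetical caller registering a kmalloc'ed
 * scratch buffer with the wrappers above and tearing it down again.  The
 * helper name and the 1024-byte size are made up for the example.
 */
#if 0
static int rpcrdma_internal_reg_example(struct rpcrdma_ia *ia)
{
	struct ib_mr *mr;
	struct ib_sge iov;
	void *va;
	int rc;

	va = kmalloc(1024, GFP_KERNEL);
	if (va == NULL)
		return -ENOMEM;

	rc = rpcrdma_register_internal(ia, va, 1024, &mr, &iov);
	if (rc == 0) {
		/* iov.addr/iov.length/iov.lkey are now usable in a work request */
		rpcrdma_deregister_internal(ia, mr, &iov);
	}
	kfree(va);
	return rc;
}
#endif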
1675
1676 /*
1677  * Wrappers for chunk registration, shared by read/write chunk code.
1678  */
1679
1680 static void
1681 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1682 {
1683         seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1684         seg->mr_dmalen = seg->mr_len;
1685         if (seg->mr_page)
1686                 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1687                                 seg->mr_page, offset_in_page(seg->mr_offset),
1688                                 seg->mr_dmalen, seg->mr_dir);
1689         else
1690                 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1691                                 seg->mr_offset,
1692                                 seg->mr_dmalen, seg->mr_dir);
1693         if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1694                 dprintk("RPC:       %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1695                         __func__,
1696                         (unsigned long long)seg->mr_dma,
1697                         seg->mr_offset, seg->mr_dmalen);
1698         }
1699 }
1700
1701 static void
1702 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1703 {
1704         if (seg->mr_page)
1705                 ib_dma_unmap_page(ia->ri_id->device,
1706                                 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1707         else
1708                 ib_dma_unmap_single(ia->ri_id->device,
1709                                 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1710 }
1711
1712 static int
1713 rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1714                         int *nsegs, int writing, struct rpcrdma_ia *ia,
1715                         struct rpcrdma_xprt *r_xprt)
1716 {
1717         struct rpcrdma_mr_seg *seg1 = seg;
1718         struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw;
1719         struct rpcrdma_frmr *frmr = &mw->r.frmr;
1720         struct ib_mr *mr = frmr->fr_mr;
1721         struct ib_send_wr fastreg_wr, *bad_wr;
1722         u8 key;
1723         int len, pageoff;
1724         int i, rc;
1725         int seg_len;
1726         u64 pa;
1727         int page_no;
1728
1729         pageoff = offset_in_page(seg1->mr_offset);
1730         seg1->mr_offset -= pageoff;     /* start of page */
1731         seg1->mr_len += pageoff;
1732         len = -pageoff;         /* cancel out the pageoff folded into seg1->mr_len above */
1733         if (*nsegs > ia->ri_max_frmr_depth)
1734                 *nsegs = ia->ri_max_frmr_depth;
1735         for (page_no = i = 0; i < *nsegs;) {
1736                 rpcrdma_map_one(ia, seg, writing);
1737                 pa = seg->mr_dma;
1738                 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
1739                         frmr->fr_pgl->page_list[page_no++] = pa;
1740                         pa += PAGE_SIZE;
1741                 }
1742                 len += seg->mr_len;
1743                 ++seg;
1744                 ++i;
1745                 /* Check for holes */
1746                 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1747                     offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1748                         break;
1749         }
1750         dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
1751                 __func__, mw, i);
1752
1753         frmr->fr_state = FRMR_IS_VALID;
1754
1755         memset(&fastreg_wr, 0, sizeof(fastreg_wr));
1756         fastreg_wr.wr_id = (unsigned long)(void *)mw;
1757         fastreg_wr.opcode = IB_WR_FAST_REG_MR;
1758         fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1759         fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
1760         fastreg_wr.wr.fast_reg.page_list_len = page_no;
1761         fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1762         fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
1763         if (fastreg_wr.wr.fast_reg.length < len) {
1764                 rc = -EIO;
1765                 goto out_err;
1766         }
1767
1768         /* Bump the key */
1769         key = (u8)(mr->rkey & 0x000000FF);
1770         ib_update_fast_reg_key(mr, ++key);
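	/*
	 * ib_update_fast_reg_key() replaces only the low-order byte of the
	 * MR's lkey/rkey, so each re-registration of this FRMR advertises a
	 * fresh key rather than the one used before the last invalidate.
	 */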
1771
1772         fastreg_wr.wr.fast_reg.access_flags = (writing ?
1773                                 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1774                                 IB_ACCESS_REMOTE_READ);
1775         fastreg_wr.wr.fast_reg.rkey = mr->rkey;
1776         DECR_CQCOUNT(&r_xprt->rx_ep);
1777
1778         rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
1779         if (rc) {
1780                 dprintk("RPC:       %s: failed ib_post_send for register,"
1781                         " status %i\n", __func__, rc);
1782                 ib_update_fast_reg_key(mr, --key);
1783                 goto out_err;
1784         } else {
1785                 seg1->mr_rkey = mr->rkey;
1786                 seg1->mr_base = seg1->mr_dma + pageoff;
1787                 seg1->mr_nsegs = i;
1788                 seg1->mr_len = len;
1789         }
1790         *nsegs = i;
1791         return 0;
1792 out_err:
1793         frmr->fr_state = FRMR_IS_INVALID;
1794         while (i--)
1795                 rpcrdma_unmap_one(ia, --seg);
1796         return rc;
1797 }
1798
1799 static int
1800 rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1801                         struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1802 {
1803         struct rpcrdma_mr_seg *seg1 = seg;
1804         struct ib_send_wr invalidate_wr, *bad_wr;
1805         int rc;
1806
1807         seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
1808
1809         memset(&invalidate_wr, 0, sizeof(invalidate_wr));
1810         invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1811         invalidate_wr.opcode = IB_WR_LOCAL_INV;
1812         invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1813         DECR_CQCOUNT(&r_xprt->rx_ep);
1814
1815         read_lock(&ia->ri_qplock);
1816         while (seg1->mr_nsegs--)
1817                 rpcrdma_unmap_one(ia, seg++);
1818         rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1819         read_unlock(&ia->ri_qplock);
1820         if (rc) {
1821                 /* Force rpcrdma_buffer_get() to retry */
1822                 seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
1823                 dprintk("RPC:       %s: failed ib_post_send for invalidate,"
1824                         " status %i\n", __func__, rc);
1825         }
1826         return rc;
1827 }
1828
1829 static int
1830 rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1831                         int *nsegs, int writing, struct rpcrdma_ia *ia)
1832 {
1833         struct rpcrdma_mr_seg *seg1 = seg;
1834         u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1835         int len, pageoff, i, rc;
1836
1837         pageoff = offset_in_page(seg1->mr_offset);
1838         seg1->mr_offset -= pageoff;     /* start of page */
1839         seg1->mr_len += pageoff;
1840         len = -pageoff;
1841         if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1842                 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1843         for (i = 0; i < *nsegs;) {
1844                 rpcrdma_map_one(ia, seg, writing);
1845                 physaddrs[i] = seg->mr_dma;
1846                 len += seg->mr_len;
1847                 ++seg;
1848                 ++i;
1849                 /* Check for holes */
1850                 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1851                     offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1852                         break;
1853         }
1854         rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1855                                 physaddrs, i, seg1->mr_dma);
1856         if (rc) {
1857                 dprintk("RPC:       %s: failed ib_map_phys_fmr "
1858                         "%u@0x%llx+%i (%d)... status %i\n", __func__,
1859                         len, (unsigned long long)seg1->mr_dma,
1860                         pageoff, i, rc);
1861                 while (i--)
1862                         rpcrdma_unmap_one(ia, --seg);
1863         } else {
1864                 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1865                 seg1->mr_base = seg1->mr_dma + pageoff;
1866                 seg1->mr_nsegs = i;
1867                 seg1->mr_len = len;
1868         }
1869         *nsegs = i;
1870         return rc;
1871 }
1872
1873 static int
1874 rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1875                         struct rpcrdma_ia *ia)
1876 {
1877         struct rpcrdma_mr_seg *seg1 = seg;
1878         LIST_HEAD(l);
1879         int rc;
1880
1881         list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1882         rc = ib_unmap_fmr(&l);
1883         read_lock(&ia->ri_qplock);
1884         while (seg1->mr_nsegs--)
1885                 rpcrdma_unmap_one(ia, seg++);
1886         read_unlock(&ia->ri_qplock);
1887         if (rc)
1888                 dprintk("RPC:       %s: failed ib_unmap_fmr,"
1889                         " status %i\n", __func__, rc);
1890         return rc;
1891 }
1892
1893 int
1894 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1895                         int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1896 {
1897         struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1898         int rc = 0;
1899
1900         switch (ia->ri_memreg_strategy) {
1901
1902         case RPCRDMA_ALLPHYSICAL:
1903                 rpcrdma_map_one(ia, seg, writing);
1904                 seg->mr_rkey = ia->ri_bind_mem->rkey;
1905                 seg->mr_base = seg->mr_dma;
1906                 seg->mr_nsegs = 1;
1907                 nsegs = 1;
1908                 break;
1909
1910         /* Registration using frmr registration */
1911         case RPCRDMA_FRMR:
1912                 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1913                 break;
1914
1915         /* Registration using fmr memory registration */
1916         case RPCRDMA_MTHCAFMR:
1917                 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1918                 break;
1919
1920         default:
1921                 return -1;
1922         }
1923         if (rc)
1924                 return -1;
1925
1926         return nsegs;
1927 }
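/*
 * Illustrative sketch only: rpcrdma_register_external() reports how many
 * segments it coalesced into a single registration (or -1 on failure), so
 * chunk-building code typically walks a segment array with a loop of the
 * following shape.  The wrapper name below is hypothetical.
 */
#if 0
static int rpcrdma_register_all_example(struct rpcrdma_mr_seg *seg,
					int nsegs, int writing,
					struct rpcrdma_xprt *r_xprt)
{
	int n;

	while (nsegs > 0) {
		n = rpcrdma_register_external(seg, nsegs, writing, r_xprt);
		if (n <= 0)
			return -EIO;	/* caller unwinds what was already registered */
		seg += n;
		nsegs -= n;
	}
	return 0;
}
#endif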
1928
1929 int
1930 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1931                 struct rpcrdma_xprt *r_xprt)
1932 {
1933         struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1934         int nsegs = seg->mr_nsegs, rc;
1935
1936         switch (ia->ri_memreg_strategy) {
1937
1938         case RPCRDMA_ALLPHYSICAL:
1939                 read_lock(&ia->ri_qplock);
1940                 rpcrdma_unmap_one(ia, seg);
1941                 read_unlock(&ia->ri_qplock);
1942                 break;
1943
1944         case RPCRDMA_FRMR:
1945                 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1946                 break;
1947
1948         case RPCRDMA_MTHCAFMR:
1949                 rc = rpcrdma_deregister_fmr_external(seg, ia);
1950                 break;
1951
1952         default:
1953                 break;
1954         }
1955         return nsegs;
1956 }
1957
1958 /*
1959  * Prepost any receive buffer, then post send.
1960  *
1961  * Receive buffer is donated to hardware, reclaimed upon recv completion.
1962  */
1963 int
1964 rpcrdma_ep_post(struct rpcrdma_ia *ia,
1965                 struct rpcrdma_ep *ep,
1966                 struct rpcrdma_req *req)
1967 {
1968         struct ib_send_wr send_wr, *send_wr_fail;
1969         struct rpcrdma_rep *rep = req->rl_reply;
1970         int rc;
1971
1972         if (rep) {
1973                 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1974                 if (rc)
1975                         goto out;
1976                 req->rl_reply = NULL;
1977         }
1978
1979         send_wr.next = NULL;
1980         send_wr.wr_id = 0ULL;   /* no send cookie */
1981         send_wr.sg_list = req->rl_send_iov;
1982         send_wr.num_sge = req->rl_niovs;
1983         send_wr.opcode = IB_WR_SEND;
1984         if (send_wr.num_sge == 4)       /* sg[2] is the constant zero pad, no sync needed; sync the tail in sg[3] */
1985                 ib_dma_sync_single_for_device(ia->ri_id->device,
1986                         req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1987                         DMA_TO_DEVICE);
1988         ib_dma_sync_single_for_device(ia->ri_id->device,
1989                 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1990                 DMA_TO_DEVICE);
1991         ib_dma_sync_single_for_device(ia->ri_id->device,
1992                 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1993                 DMA_TO_DEVICE);
1994
1995         if (DECR_CQCOUNT(ep) > 0)
1996                 send_wr.send_flags = 0;
1997         else { /* Provider must take a send completion every now and then */
1998                 INIT_CQCOUNT(ep);
1999                 send_wr.send_flags = IB_SEND_SIGNALED;
2000         }
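	/*
	 * The CQCOUNT macros throttle completion signalling: most sends go
	 * out unsignalled, and IB_SEND_SIGNALED is requested only when the
	 * per-endpoint budget is used up, so the provider can periodically
	 * retire the accumulated send WRs.
	 */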
2001
2002         rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
2003         if (rc)
2004                 dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,
2005                         rc);
2006 out:
2007         return rc;
2008 }
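/*
 * Illustrative sketch only: the transport send path hands a fully
 * marshalled req to rpcrdma_ep_post(); the attached rl_reply, if any, is
 * pre-posted as the receive that will carry the server's reply.  The
 * wrapper name below is hypothetical.
 */
#if 0
static int rpcrdma_send_one_example(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req)
{
	return rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req);
}
#endif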
2009
2010 /*
2011  * (Re)post a receive buffer.
2012  */
2013 int
2014 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
2015                      struct rpcrdma_ep *ep,
2016                      struct rpcrdma_rep *rep)
2017 {
2018         struct ib_recv_wr recv_wr, *recv_wr_fail;
2019         int rc;
2020
2021         recv_wr.next = NULL;
2022         recv_wr.wr_id = (u64) (unsigned long) rep;
2023         recv_wr.sg_list = &rep->rr_iov;
2024         recv_wr.num_sge = 1;
2025
2026         ib_dma_sync_single_for_cpu(ia->ri_id->device,
2027                 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
2028
2029         rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
2030
2031         if (rc)
2032                 dprintk("RPC:       %s: ib_post_recv returned %i\n", __func__,
2033                         rc);
2034         return rc;
2035 }
2036
2037 /* Physical mapping means one Read/Write list entry per page.
2038  * All list entries must fit within an inline buffer.
2039  *
2040  * NB: The server must return a Write list for NFS READ,
2041  *     which has the same constraint. Factor in the inline
2042  *     rsize as well.
2043  */
2044 static size_t
2045 rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
2046 {
2047         struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
2048         unsigned int inline_size, pages;
2049
2050         inline_size = min_t(unsigned int,
2051                             cdata->inline_wsize, cdata->inline_rsize);
2052         inline_size -= RPCRDMA_HDRLEN_MIN;
2053         pages = inline_size / sizeof(struct rpcrdma_segment);
2054         return pages << PAGE_SHIFT;
2055 }
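/*
 * Worked example (assumed values, for illustration only): with 1024-byte
 * inline buffers, a 28-byte minimal RPC-over-RDMA header, and 16-byte
 * struct rpcrdma_segment entries, (1024 - 28) / 16 = 62 list entries fit,
 * so with 4 KB pages the physical strategy tops out around
 * 62 << 12 = 253952 bytes per RPC.
 */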
2056
2057 static size_t
2058 rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
2059 {
2060         return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
2061 }
2062
2063 size_t
2064 rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
2065 {
2066         size_t result;
2067
2068         switch (r_xprt->rx_ia.ri_memreg_strategy) {
2069         case RPCRDMA_ALLPHYSICAL:
2070                 result = rpcrdma_physical_max_payload(r_xprt);
2071                 break;
2072         default:
2073                 result = rpcrdma_mr_max_payload(r_xprt);
2074         }
2075         return result;
2076 }