1 /*
2  * NVMe over Fabrics RDMA host code.
3  * Copyright (c) 2015-2016 HGST, a Western Digital Company.
4  *
5  * This program is free software; you can redistribute it and/or modify it
6  * under the terms and conditions of the GNU General Public License,
7  * version 2, as published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12  * more details.
13  */
14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15 #include <linux/module.h>
16 #include <linux/init.h>
17 #include <linux/slab.h>
18 #include <linux/err.h>
19 #include <linux/string.h>
20 #include <linux/atomic.h>
21 #include <linux/blk-mq.h>
22 #include <linux/types.h>
23 #include <linux/list.h>
24 #include <linux/mutex.h>
25 #include <linux/scatterlist.h>
26 #include <linux/nvme.h>
27 #include <asm/unaligned.h>
28
29 #include <rdma/ib_verbs.h>
30 #include <rdma/rdma_cm.h>
31 #include <rdma/ib_cm.h>
32 #include <linux/nvme-rdma.h>
33
34 #include "nvme.h"
35 #include "fabrics.h"
36
37
38 #define NVME_RDMA_CONNECT_TIMEOUT_MS    1000            /* 1 second */
39
40 #define NVME_RDMA_MAX_SEGMENT_SIZE      0xffffff        /* 24-bit SGL field */
41
42 #define NVME_RDMA_MAX_SEGMENTS          256
43
44 #define NVME_RDMA_MAX_INLINE_SEGMENTS   1
45
46 /*
47  * We handle AEN commands ourselves and don't even let the
48  * block layer know about them.
49  */
50 #define NVME_RDMA_NR_AEN_COMMANDS      1
51 #define NVME_RDMA_AQ_BLKMQ_DEPTH       \
52         (NVMF_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS)
53
54 struct nvme_rdma_device {
55         struct ib_device       *dev;
56         struct ib_pd           *pd;
57         struct kref             ref;
58         struct list_head        entry;
59 };
60
61 struct nvme_rdma_qe {
62         struct ib_cqe           cqe;
63         void                    *data;
64         u64                     dma;
65 };
66
67 struct nvme_rdma_queue;
68 struct nvme_rdma_request {
69         struct ib_mr            *mr;
70         struct nvme_rdma_qe     sqe;
71         struct ib_sge           sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
72         u32                     num_sge;
73         int                     nents;
74         bool                    inline_data;
75         struct ib_reg_wr        reg_wr;
76         struct ib_cqe           reg_cqe;
77         struct nvme_rdma_queue  *queue;
78         struct sg_table         sg_table;
79         struct scatterlist      first_sgl[];
80 };
81
82 enum nvme_rdma_queue_flags {
83         NVME_RDMA_Q_CONNECTED = (1 << 0),
84         NVME_RDMA_IB_QUEUE_ALLOCATED = (1 << 1),
85         NVME_RDMA_Q_DELETING = (1 << 2),
86 };
87
88 struct nvme_rdma_queue {
89         struct nvme_rdma_qe     *rsp_ring;
90         u8                      sig_count;
91         int                     queue_size;
92         size_t                  cmnd_capsule_len;
93         struct nvme_rdma_ctrl   *ctrl;
94         struct nvme_rdma_device *device;
95         struct ib_cq            *ib_cq;
96         struct ib_qp            *qp;
97
98         unsigned long           flags;
99         struct rdma_cm_id       *cm_id;
100         int                     cm_error;
101         struct completion       cm_done;
102 };
103
104 struct nvme_rdma_ctrl {
105         /* read and written in the hot path */
106         spinlock_t              lock;
107
108         /* read only in the hot path */
109         struct nvme_rdma_queue  *queues;
110         u32                     queue_count;
111
112         /* other member variables */
113         struct blk_mq_tag_set   tag_set;
114         struct work_struct      delete_work;
115         struct work_struct      reset_work;
116         struct work_struct      err_work;
117
118         struct nvme_rdma_qe     async_event_sqe;
119
120         int                     reconnect_delay;
121         struct delayed_work     reconnect_work;
122
123         struct list_head        list;
124
125         struct blk_mq_tag_set   admin_tag_set;
126         struct nvme_rdma_device *device;
127
128         u64                     cap;
129         u32                     max_fr_pages;
130
131         union {
132                 struct sockaddr addr;
133                 struct sockaddr_in addr_in;
134         };
135
136         struct nvme_ctrl        ctrl;
137 };
138
139 static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
140 {
141         return container_of(ctrl, struct nvme_rdma_ctrl, ctrl);
142 }
143
144 static LIST_HEAD(device_list);
145 static DEFINE_MUTEX(device_list_mutex);
146
147 static LIST_HEAD(nvme_rdma_ctrl_list);
148 static DEFINE_MUTEX(nvme_rdma_ctrl_mutex);
149
150 static struct workqueue_struct *nvme_rdma_wq;
151
152 /*
153  * Disabling this option makes small I/O go faster, but is fundamentally
154  * unsafe.  With it turned off we will have to register a global rkey that
155  * allows read and write access to all physical memory.
156  */
157 static bool register_always = true;
158 module_param(register_always, bool, 0444);
159 MODULE_PARM_DESC(register_always,
160          "Use memory registration even for contiguous memory regions");
161
162 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
163                 struct rdma_cm_event *event);
164 static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
165
166 /* XXX: really should move to a generic header sooner or later.. */
167 static inline void put_unaligned_le24(u32 val, u8 *p)
168 {
169         *p++ = val;
170         *p++ = val >> 8;
171         *p++ = val >> 16;
172 }
173
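/* Index of @queue within its controller's queues array; index 0 is the admin queue. */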
174 static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
175 {
176         return queue - queue->ctrl->queues;
177 }
178
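/* In-capsule data space: whatever the command capsule has left after the SQE itself. */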
179 static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
180 {
181         return queue->cmnd_capsule_len - sizeof(struct nvme_command);
182 }
183
184 static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
185                 size_t capsule_size, enum dma_data_direction dir)
186 {
187         ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir);
188         kfree(qe->data);
189 }
190
191 static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
192                 size_t capsule_size, enum dma_data_direction dir)
193 {
194         qe->data = kzalloc(capsule_size, GFP_KERNEL);
195         if (!qe->data)
196                 return -ENOMEM;
197
198         qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir);
199         if (ib_dma_mapping_error(ibdev, qe->dma)) {
200                 kfree(qe->data);
201                 return -ENOMEM;
202         }
203
204         return 0;
205 }
206
207 static void nvme_rdma_free_ring(struct ib_device *ibdev,
208                 struct nvme_rdma_qe *ring, size_t ib_queue_size,
209                 size_t capsule_size, enum dma_data_direction dir)
210 {
211         int i;
212
213         for (i = 0; i < ib_queue_size; i++)
214                 nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir);
215         kfree(ring);
216 }
217
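/* Allocate a ring of DMA-mapped queue entries; on failure, undo the entries set up so far. */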
218 static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
219                 size_t ib_queue_size, size_t capsule_size,
220                 enum dma_data_direction dir)
221 {
222         struct nvme_rdma_qe *ring;
223         int i;
224
225         ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL);
226         if (!ring)
227                 return NULL;
228
229         for (i = 0; i < ib_queue_size; i++) {
230                 if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
231                         goto out_free_ring;
232         }
233
234         return ring;
235
236 out_free_ring:
237         nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir);
238         return NULL;
239 }
240
241 static void nvme_rdma_qp_event(struct ib_event *event, void *context)
242 {
243         pr_debug("QP event %d\n", event->event);
244 }
245
246 static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
247 {
248         wait_for_completion_interruptible_timeout(&queue->cm_done,
249                         msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
250         return queue->cm_error;
251 }
252
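/*
 * @factor sizes the send queue for the multiple send WRs (MR registration,
 * SEND, local invalidate) that a single request may post.
 */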
253 static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
254 {
255         struct nvme_rdma_device *dev = queue->device;
256         struct ib_qp_init_attr init_attr;
257         int ret;
258
259         memset(&init_attr, 0, sizeof(init_attr));
260         init_attr.event_handler = nvme_rdma_qp_event;
261         /* +1 for drain */
262         init_attr.cap.max_send_wr = factor * queue->queue_size + 1;
263         /* +1 for drain */
264         init_attr.cap.max_recv_wr = queue->queue_size + 1;
265         init_attr.cap.max_recv_sge = 1;
266         init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS;
267         init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
268         init_attr.qp_type = IB_QPT_RC;
269         init_attr.send_cq = queue->ib_cq;
270         init_attr.recv_cq = queue->ib_cq;
271
272         ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
273
274         queue->qp = queue->cm_id->qp;
275         return ret;
276 }
277
278 static int nvme_rdma_reinit_request(void *data, struct request *rq)
279 {
280         struct nvme_rdma_ctrl *ctrl = data;
281         struct nvme_rdma_device *dev = ctrl->device;
282         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
283         int ret = 0;
284
285         if (!req->mr->need_inval)
286                 goto out;
287
288         ib_dereg_mr(req->mr);
289
290         req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
291                         ctrl->max_fr_pages);
292         if (IS_ERR(req->mr)) {
293                 ret = PTR_ERR(req->mr);
294                 req->mr = NULL;
295                 goto out;
296         }
297
298         req->mr->need_inval = false;
299
300 out:
301         return ret;
302 }
303
304 static void __nvme_rdma_exit_request(struct nvme_rdma_ctrl *ctrl,
305                 struct request *rq, unsigned int queue_idx)
306 {
307         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
308         struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
309         struct nvme_rdma_device *dev = queue->device;
310
311         if (req->mr)
312                 ib_dereg_mr(req->mr);
313
314         nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
315                         DMA_TO_DEVICE);
316 }
317
318 static void nvme_rdma_exit_request(void *data, struct request *rq,
319                                 unsigned int hctx_idx, unsigned int rq_idx)
320 {
321         return __nvme_rdma_exit_request(data, rq, hctx_idx + 1);
322 }
323
324 static void nvme_rdma_exit_admin_request(void *data, struct request *rq,
325                                 unsigned int hctx_idx, unsigned int rq_idx)
326 {
327         return __nvme_rdma_exit_request(data, rq, 0);
328 }
329
330 static int __nvme_rdma_init_request(struct nvme_rdma_ctrl *ctrl,
331                 struct request *rq, unsigned int queue_idx)
332 {
333         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
334         struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
335         struct nvme_rdma_device *dev = queue->device;
336         struct ib_device *ibdev = dev->dev;
337         int ret;
338
339         BUG_ON(queue_idx >= ctrl->queue_count);
340
341         ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
342                         DMA_TO_DEVICE);
343         if (ret)
344                 return ret;
345
346         req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
347                         ctrl->max_fr_pages);
348         if (IS_ERR(req->mr)) {
349                 ret = PTR_ERR(req->mr);
350                 goto out_free_qe;
351         }
352
353         req->queue = queue;
354
355         return 0;
356
357 out_free_qe:
358         nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
359                         DMA_TO_DEVICE);
360         return ret;
361 }
362
363 static int nvme_rdma_init_request(void *data, struct request *rq,
364                                 unsigned int hctx_idx, unsigned int rq_idx,
365                                 unsigned int numa_node)
366 {
367         return __nvme_rdma_init_request(data, rq, hctx_idx + 1);
368 }
369
370 static int nvme_rdma_init_admin_request(void *data, struct request *rq,
371                                 unsigned int hctx_idx, unsigned int rq_idx,
372                                 unsigned int numa_node)
373 {
374         return __nvme_rdma_init_request(data, rq, 0);
375 }
376
377 static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
378                 unsigned int hctx_idx)
379 {
380         struct nvme_rdma_ctrl *ctrl = data;
381         struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1];
382
383         BUG_ON(hctx_idx >= ctrl->queue_count);
384
385         hctx->driver_data = queue;
386         return 0;
387 }
388
389 static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
390                 unsigned int hctx_idx)
391 {
392         struct nvme_rdma_ctrl *ctrl = data;
393         struct nvme_rdma_queue *queue = &ctrl->queues[0];
394
395         BUG_ON(hctx_idx != 0);
396
397         hctx->driver_data = queue;
398         return 0;
399 }
400
401 static void nvme_rdma_free_dev(struct kref *ref)
402 {
403         struct nvme_rdma_device *ndev =
404                 container_of(ref, struct nvme_rdma_device, ref);
405
406         mutex_lock(&device_list_mutex);
407         list_del(&ndev->entry);
408         mutex_unlock(&device_list_mutex);
409
410         ib_dealloc_pd(ndev->pd);
411         kfree(ndev);
412 }
413
414 static void nvme_rdma_dev_put(struct nvme_rdma_device *dev)
415 {
416         kref_put(&dev->ref, nvme_rdma_free_dev);
417 }
418
419 static int nvme_rdma_dev_get(struct nvme_rdma_device *dev)
420 {
421         return kref_get_unless_zero(&dev->ref);
422 }
423
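/*
 * Look up the nvme_rdma_device for this CM ID's ib_device (matched by node
 * GUID), or create one.  A reference is returned that the caller must drop
 * with nvme_rdma_dev_put().
 */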
424 static struct nvme_rdma_device *
425 nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
426 {
427         struct nvme_rdma_device *ndev;
428
429         mutex_lock(&device_list_mutex);
430         list_for_each_entry(ndev, &device_list, entry) {
431                 if (ndev->dev->node_guid == cm_id->device->node_guid &&
432                     nvme_rdma_dev_get(ndev))
433                         goto out_unlock;
434         }
435
436         ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
437         if (!ndev)
438                 goto out_err;
439
440         ndev->dev = cm_id->device;
441         kref_init(&ndev->ref);
442
443         ndev->pd = ib_alloc_pd(ndev->dev,
444                 register_always ? 0 : IB_PD_UNSAFE_GLOBAL_RKEY);
445         if (IS_ERR(ndev->pd))
446                 goto out_free_dev;
447
448         if (!(ndev->dev->attrs.device_cap_flags &
449               IB_DEVICE_MEM_MGT_EXTENSIONS)) {
450                 dev_err(&ndev->dev->dev,
451                         "Memory registrations not supported.\n");
452                 goto out_free_pd;
453         }
454
455         list_add(&ndev->entry, &device_list);
456 out_unlock:
457         mutex_unlock(&device_list_mutex);
458         return ndev;
459
460 out_free_pd:
461         ib_dealloc_pd(ndev->pd);
462 out_free_dev:
463         kfree(ndev);
464 out_err:
465         mutex_unlock(&device_list_mutex);
466         return NULL;
467 }
468
469 static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
470 {
471         struct nvme_rdma_device *dev;
472         struct ib_device *ibdev;
473
474         if (!test_and_clear_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags))
475                 return;
476
477         dev = queue->device;
478         ibdev = dev->dev;
479         rdma_destroy_qp(queue->cm_id);
480         ib_free_cq(queue->ib_cq);
481
482         nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
483                         sizeof(struct nvme_completion), DMA_FROM_DEVICE);
484
485         nvme_rdma_dev_put(dev);
486 }
487
488 static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue,
489                 struct nvme_rdma_device *dev)
490 {
491         struct ib_device *ibdev = dev->dev;
492         const int send_wr_factor = 3;                   /* MR, SEND, INV */
493         const int cq_factor = send_wr_factor + 1;       /* + RECV */
494         int comp_vector, idx = nvme_rdma_queue_idx(queue);
495         int ret;
497
498         queue->device = dev;
499
500         /*
501          * The admin queue is barely used once the controller is live, so don't
502          * bother to spread it out.
503          */
504         if (idx == 0)
505                 comp_vector = 0;
506         else
507                 comp_vector = idx % ibdev->num_comp_vectors;
508
510         /* +1 for ib_stop_cq */
511         queue->ib_cq = ib_alloc_cq(dev->dev, queue,
512                                 cq_factor * queue->queue_size + 1, comp_vector,
513                                 IB_POLL_SOFTIRQ);
514         if (IS_ERR(queue->ib_cq)) {
515                 ret = PTR_ERR(queue->ib_cq);
516                 goto out;
517         }
518
519         ret = nvme_rdma_create_qp(queue, send_wr_factor);
520         if (ret)
521                 goto out_destroy_ib_cq;
522
523         queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size,
524                         sizeof(struct nvme_completion), DMA_FROM_DEVICE);
525         if (!queue->rsp_ring) {
526                 ret = -ENOMEM;
527                 goto out_destroy_qp;
528         }
529         set_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags);
530
531         return 0;
532
533 out_destroy_qp:
534         ib_destroy_qp(queue->qp);
535 out_destroy_ib_cq:
536         ib_free_cq(queue->ib_cq);
537 out:
538         return ret;
539 }
540
541 static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
542                 int idx, size_t queue_size)
543 {
544         struct nvme_rdma_queue *queue;
545         int ret;
546
547         queue = &ctrl->queues[idx];
548         queue->ctrl = ctrl;
549         init_completion(&queue->cm_done);
550
551         if (idx > 0)
552                 queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
553         else
554                 queue->cmnd_capsule_len = sizeof(struct nvme_command);
555
556         queue->queue_size = queue_size;
557
558         queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
559                         RDMA_PS_TCP, IB_QPT_RC);
560         if (IS_ERR(queue->cm_id)) {
561                 dev_info(ctrl->ctrl.device,
562                         "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id));
563                 return PTR_ERR(queue->cm_id);
564         }
565
566         queue->cm_error = -ETIMEDOUT;
567         ret = rdma_resolve_addr(queue->cm_id, NULL, &ctrl->addr,
568                         NVME_RDMA_CONNECT_TIMEOUT_MS);
569         if (ret) {
570                 dev_info(ctrl->ctrl.device,
571                         "rdma_resolve_addr failed (%d).\n", ret);
572                 goto out_destroy_cm_id;
573         }
574
575         ret = nvme_rdma_wait_for_cm(queue);
576         if (ret) {
577                 dev_info(ctrl->ctrl.device,
578                         "rdma_resolve_addr wait failed (%d).\n", ret);
579                 goto out_destroy_cm_id;
580         }
581
582         clear_bit(NVME_RDMA_Q_DELETING, &queue->flags);
583         set_bit(NVME_RDMA_Q_CONNECTED, &queue->flags);
584
585         return 0;
586
587 out_destroy_cm_id:
588         nvme_rdma_destroy_queue_ib(queue);
589         rdma_destroy_id(queue->cm_id);
590         return ret;
591 }
592
593 static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
594 {
595         rdma_disconnect(queue->cm_id);
596         ib_drain_qp(queue->qp);
597 }
598
599 static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
600 {
601         nvme_rdma_destroy_queue_ib(queue);
602         rdma_destroy_id(queue->cm_id);
603 }
604
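/* NVME_RDMA_Q_DELETING makes teardown idempotent if error recovery races with it. */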
605 static void nvme_rdma_stop_and_free_queue(struct nvme_rdma_queue *queue)
606 {
607         if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags))
608                 return;
609         nvme_rdma_stop_queue(queue);
610         nvme_rdma_free_queue(queue);
611 }
612
613 static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl)
614 {
615         int i;
616
617         for (i = 1; i < ctrl->queue_count; i++)
618                 nvme_rdma_stop_and_free_queue(&ctrl->queues[i]);
619 }
620
621 static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl)
622 {
623         int i, ret = 0;
624
625         for (i = 1; i < ctrl->queue_count; i++) {
626                 ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
627                 if (ret)
628                         break;
629         }
630
631         return ret;
632 }
633
634 static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl)
635 {
636         int i, ret;
637
638         for (i = 1; i < ctrl->queue_count; i++) {
639                 ret = nvme_rdma_init_queue(ctrl, i,
640                                            ctrl->ctrl.opts->queue_size);
641                 if (ret) {
642                         dev_info(ctrl->ctrl.device,
643                                 "failed to initialize i/o queue: %d\n", ret);
644                         goto out_free_queues;
645                 }
646         }
647
648         return 0;
649
650 out_free_queues:
651         for (i--; i >= 1; i--)
652                 nvme_rdma_stop_and_free_queue(&ctrl->queues[i]);
653
654         return ret;
655 }
656
657 static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl)
658 {
659         nvme_rdma_free_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe,
660                         sizeof(struct nvme_command), DMA_TO_DEVICE);
661         nvme_rdma_stop_and_free_queue(&ctrl->queues[0]);
662         blk_cleanup_queue(ctrl->ctrl.admin_q);
663         blk_mq_free_tag_set(&ctrl->admin_tag_set);
664         nvme_rdma_dev_put(ctrl->device);
665 }
666
667 static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
668 {
669         struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
670
671         if (list_empty(&ctrl->list))
672                 goto free_ctrl;
673
674         mutex_lock(&nvme_rdma_ctrl_mutex);
675         list_del(&ctrl->list);
676         mutex_unlock(&nvme_rdma_ctrl_mutex);
677
678         kfree(ctrl->queues);
679         nvmf_free_options(nctrl->opts);
680 free_ctrl:
681         kfree(ctrl);
682 }
683
684 static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
685 {
686         struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
687                         struct nvme_rdma_ctrl, reconnect_work);
688         bool changed;
689         int ret;
690
691         if (ctrl->queue_count > 1) {
692                 nvme_rdma_free_io_queues(ctrl);
693
694                 ret = blk_mq_reinit_tagset(&ctrl->tag_set);
695                 if (ret)
696                         goto requeue;
697         }
698
699         nvme_rdma_stop_and_free_queue(&ctrl->queues[0]);
700
701         ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set);
702         if (ret)
703                 goto requeue;
704
705         ret = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH);
706         if (ret)
707                 goto requeue;
708
709         blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
710
711         ret = nvmf_connect_admin_queue(&ctrl->ctrl);
712         if (ret)
713                 goto stop_admin_q;
714
715         ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
716         if (ret)
717                 goto stop_admin_q;
718
719         nvme_start_keep_alive(&ctrl->ctrl);
720
721         if (ctrl->queue_count > 1) {
722                 ret = nvme_rdma_init_io_queues(ctrl);
723                 if (ret)
724                         goto stop_admin_q;
725
726                 ret = nvme_rdma_connect_io_queues(ctrl);
727                 if (ret)
728                         goto stop_admin_q;
729         }
730
731         changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
732         WARN_ON_ONCE(!changed);
733
734         if (ctrl->queue_count > 1) {
735                 nvme_start_queues(&ctrl->ctrl);
736                 nvme_queue_scan(&ctrl->ctrl);
737                 nvme_queue_async_events(&ctrl->ctrl);
738         }
739
740         dev_info(ctrl->ctrl.device, "Successfully reconnected\n");
741
742         return;
743
744 stop_admin_q:
745         blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
746 requeue:
747         /* Make sure we are not resetting/deleting */
748         if (ctrl->ctrl.state == NVME_CTRL_RECONNECTING) {
749                 dev_info(ctrl->ctrl.device,
750                         "Failed reconnect attempt, requeueing...\n");
751                 queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
752                                         ctrl->reconnect_delay * HZ);
753         }
754 }
755
756 static void nvme_rdma_error_recovery_work(struct work_struct *work)
757 {
758         struct nvme_rdma_ctrl *ctrl = container_of(work,
759                         struct nvme_rdma_ctrl, err_work);
760         int i;
761
762         nvme_stop_keep_alive(&ctrl->ctrl);
763
764         for (i = 0; i < ctrl->queue_count; i++)
765                 clear_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[i].flags);
766
767         if (ctrl->queue_count > 1)
768                 nvme_stop_queues(&ctrl->ctrl);
769         blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
770
771         /* We must fast-fail or requeue all of our inflight requests */
772         if (ctrl->queue_count > 1)
773                 blk_mq_tagset_busy_iter(&ctrl->tag_set,
774                                         nvme_cancel_request, &ctrl->ctrl);
775         blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
776                                 nvme_cancel_request, &ctrl->ctrl);
777
778         dev_info(ctrl->ctrl.device, "reconnecting in %d seconds\n",
779                 ctrl->reconnect_delay);
780
781         queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
782                                 ctrl->reconnect_delay * HZ);
783 }
784
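/*
 * Only schedule recovery work if we can transition to RECONNECTING; this
 * also keeps us from racing with reset/delete transitions.
 */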
785 static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
786 {
787         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING))
788                 return;
789
790         queue_work(nvme_rdma_wq, &ctrl->err_work);
791 }
792
793 static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
794                 const char *op)
795 {
796         struct nvme_rdma_queue *queue = cq->cq_context;
797         struct nvme_rdma_ctrl *ctrl = queue->ctrl;
798
799         if (ctrl->ctrl.state == NVME_CTRL_LIVE)
800                 dev_info(ctrl->ctrl.device,
801                              "%s for CQE 0x%p failed with status %s (%d)\n",
802                              op, wc->wr_cqe,
803                              ib_wc_status_msg(wc->status), wc->status);
804         nvme_rdma_error_recovery(ctrl);
805 }
806
807 static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc)
808 {
809         if (unlikely(wc->status != IB_WC_SUCCESS))
810                 nvme_rdma_wr_error(cq, wc, "MEMREG");
811 }
812
813 static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
814 {
815         if (unlikely(wc->status != IB_WC_SUCCESS))
816                 nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
817 }
818
819 static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
820                 struct nvme_rdma_request *req)
821 {
822         struct ib_send_wr *bad_wr;
823         struct ib_send_wr wr = {
824                 .opcode             = IB_WR_LOCAL_INV,
825                 .next               = NULL,
826                 .num_sge            = 0,
827                 .send_flags         = 0,
828                 .ex.invalidate_rkey = req->mr->rkey,
829         };
830
831         req->reg_cqe.done = nvme_rdma_inv_rkey_done;
832         wr.wr_cqe = &req->reg_cqe;
833
834         return ib_post_send(queue->qp, &wr, &bad_wr);
835 }
836
837 static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
838                 struct request *rq)
839 {
840         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
841         struct nvme_rdma_ctrl *ctrl = queue->ctrl;
842         struct nvme_rdma_device *dev = queue->device;
843         struct ib_device *ibdev = dev->dev;
844         int res;
845
846         if (!blk_rq_bytes(rq))
847                 return;
848
849         if (req->mr->need_inval) {
850                 res = nvme_rdma_inv_rkey(queue, req);
851                 if (res < 0) {
852                         dev_err(ctrl->ctrl.device,
853                                 "Queueing INV WR for rkey %#x failed (%d)\n",
854                                 req->mr->rkey, res);
855                         nvme_rdma_error_recovery(queue->ctrl);
856                 }
857         }
858
859         ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
860                         req->nents, rq_data_dir(rq) ==
861                                     WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
862
863         nvme_cleanup_cmd(rq);
864         sg_free_table_chained(&req->sg_table, true);
865 }
866
867 static int nvme_rdma_set_sg_null(struct nvme_command *c)
868 {
869         struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
870
871         sg->addr = 0;
872         put_unaligned_le24(0, sg->length);
873         put_unaligned_le32(0, sg->key);
874         sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
875         return 0;
876 }
877
878 static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
879                 struct nvme_rdma_request *req, struct nvme_command *c)
880 {
881         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
882
883         req->sge[1].addr = sg_dma_address(req->sg_table.sgl);
884         req->sge[1].length = sg_dma_len(req->sg_table.sgl);
885         req->sge[1].lkey = queue->device->pd->local_dma_lkey;
886
887         sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
888         sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
889         sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
890
891         req->inline_data = true;
892         req->num_sge++;
893         return 0;
894 }
895
896 static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue,
897                 struct nvme_rdma_request *req, struct nvme_command *c)
898 {
899         struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
900
901         sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl));
902         put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length);
903         put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key);
904         sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
905         return 0;
906 }
907
908 static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
909                 struct nvme_rdma_request *req, struct nvme_command *c,
910                 int count)
911 {
912         struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
913         int nr;
914
915         nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE);
916         if (nr < count) {
917                 if (nr < 0)
918                         return nr;
919                 return -EINVAL;
920         }
921
922         ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
923
924         req->reg_cqe.done = nvme_rdma_memreg_done;
925         memset(&req->reg_wr, 0, sizeof(req->reg_wr));
926         req->reg_wr.wr.opcode = IB_WR_REG_MR;
927         req->reg_wr.wr.wr_cqe = &req->reg_cqe;
928         req->reg_wr.wr.num_sge = 0;
929         req->reg_wr.mr = req->mr;
930         req->reg_wr.key = req->mr->rkey;
931         req->reg_wr.access = IB_ACCESS_LOCAL_WRITE |
932                              IB_ACCESS_REMOTE_READ |
933                              IB_ACCESS_REMOTE_WRITE;
934
935         req->mr->need_inval = true;
936
937         sg->addr = cpu_to_le64(req->mr->iova);
938         put_unaligned_le24(req->mr->length, sg->length);
939         put_unaligned_le32(req->mr->rkey, sg->key);
940         sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) |
941                         NVME_SGL_FMT_INVALIDATE;
942
943         return 0;
944 }
945
946 static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
947                 struct request *rq, unsigned int map_len,
948                 struct nvme_command *c)
949 {
950         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
951         struct nvme_rdma_device *dev = queue->device;
952         struct ib_device *ibdev = dev->dev;
953         int nents, count;
954         int ret;
955
956         req->num_sge = 1;
957         req->inline_data = false;
958         req->mr->need_inval = false;
959
960         c->common.flags |= NVME_CMD_SGL_METABUF;
961
962         if (!blk_rq_bytes(rq))
963                 return nvme_rdma_set_sg_null(c);
964
965         req->sg_table.sgl = req->first_sgl;
966         ret = sg_alloc_table_chained(&req->sg_table, rq->nr_phys_segments,
967                                 req->sg_table.sgl);
968         if (ret)
969                 return -ENOMEM;
970
971         nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
972         BUG_ON(nents > rq->nr_phys_segments);
973         req->nents = nents;
974
975         count = ib_dma_map_sg(ibdev, req->sg_table.sgl, nents,
976                     rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
977         if (unlikely(count <= 0)) {
978                 sg_free_table_chained(&req->sg_table, true);
979                 return -EIO;
980         }
981
982         if (count == 1) {
983                 if (rq_data_dir(rq) == WRITE &&
984                     map_len <= nvme_rdma_inline_data_size(queue) &&
985                     nvme_rdma_queue_idx(queue))
986                         return nvme_rdma_map_sg_inline(queue, req, c);
987
988                 if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY)
989                         return nvme_rdma_map_sg_single(queue, req, c);
990         }
991
992         return nvme_rdma_map_sg_fr(queue, req, c, count);
993 }
994
995 static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
996 {
997         if (unlikely(wc->status != IB_WC_SUCCESS))
998                 nvme_rdma_wr_error(cq, wc, "SEND");
999 }
1000
1001 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
1002                 struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
1003                 struct ib_send_wr *first, bool flush)
1004 {
1005         struct ib_send_wr wr, *bad_wr;
1006         int ret;
1007
1008         sge->addr   = qe->dma;
1009         sge->length = sizeof(struct nvme_command);
1010         sge->lkey   = queue->device->pd->local_dma_lkey;
1011
1012         qe->cqe.done = nvme_rdma_send_done;
1013
1014         wr.next       = NULL;
1015         wr.wr_cqe     = &qe->cqe;
1016         wr.sg_list    = sge;
1017         wr.num_sge    = num_sge;
1018         wr.opcode     = IB_WR_SEND;
1019         wr.send_flags = 0;
1020
1021         /*
1022          * Unsignalled send completions are another giant disaster in the
1023          * IB Verbs spec:  If we don't regularly post signalled sends
1024          * the send queue will fill up and only a QP reset will rescue us.
1025          * It would have been way too obvious to handle this in hardware or
1026          * at least in the RDMA stack...
1027          *
1028          * This messy and racy code snippet is copied and pasted from the iSER
1029          * initiator, and the magic '32' comes from there as well.
1030          *
1031          * Always signal the flushes. The magic request used for the flush
1032          * sequencer is not allocated in our driver's tagset and it is
1033          * triggered to be freed by blk_cleanup_queue(). So we need to
1034          * always mark it as signaled to ensure that the "wr_cqe", which is
1035          * embedded in the request's payload, is not freed when __ib_process_cq()
1036          * calls wr_cqe->done().
1037          */
1038         if ((++queue->sig_count % 32) == 0 || flush)
1039                 wr.send_flags |= IB_SEND_SIGNALED;
1040
1041         if (first)
1042                 first->next = &wr;
1043         else
1044                 first = &wr;
1045
1046         ret = ib_post_send(queue->qp, first, &bad_wr);
1047         if (ret) {
1048                 dev_err(queue->ctrl->ctrl.device,
1049                              "%s failed with error code %d\n", __func__, ret);
1050         }
1051         return ret;
1052 }
1053
1054 static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue,
1055                 struct nvme_rdma_qe *qe)
1056 {
1057         struct ib_recv_wr wr, *bad_wr;
1058         struct ib_sge list;
1059         int ret;
1060
1061         list.addr   = qe->dma;
1062         list.length = sizeof(struct nvme_completion);
1063         list.lkey   = queue->device->pd->local_dma_lkey;
1064
1065         qe->cqe.done = nvme_rdma_recv_done;
1066
1067         wr.next     = NULL;
1068         wr.wr_cqe   = &qe->cqe;
1069         wr.sg_list  = &list;
1070         wr.num_sge  = 1;
1071
1072         ret = ib_post_recv(queue->qp, &wr, &bad_wr);
1073         if (ret) {
1074                 dev_err(queue->ctrl->ctrl.device,
1075                         "%s failed with error code %d\n", __func__, ret);
1076         }
1077         return ret;
1078 }
1079
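/* Queue 0 uses the admin tag set; I/O queue index i maps to tag_set.tags[i - 1]. */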
1080 static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
1081 {
1082         u32 queue_idx = nvme_rdma_queue_idx(queue);
1083
1084         if (queue_idx == 0)
1085                 return queue->ctrl->admin_tag_set.tags[queue_idx];
1086         return queue->ctrl->tag_set.tags[queue_idx - 1];
1087 }
1088
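/*
 * AEN commands are given command_id NVME_RDMA_AQ_BLKMQ_DEPTH, which lies
 * outside the blk-mq tag space, so the receive path can tell them apart
 * from regular tagged requests.
 */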
1089 static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
1090 {
1091         struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
1092         struct nvme_rdma_queue *queue = &ctrl->queues[0];
1093         struct ib_device *dev = queue->device->dev;
1094         struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe;
1095         struct nvme_command *cmd = sqe->data;
1096         struct ib_sge sge;
1097         int ret;
1098
1099         if (WARN_ON_ONCE(aer_idx != 0))
1100                 return;
1101
1102         ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE);
1103
1104         memset(cmd, 0, sizeof(*cmd));
1105         cmd->common.opcode = nvme_admin_async_event;
1106         cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH;
1107         cmd->common.flags |= NVME_CMD_SGL_METABUF;
1108         nvme_rdma_set_sg_null(cmd);
1109
1110         ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
1111                         DMA_TO_DEVICE);
1112
1113         ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false);
1114         WARN_ON_ONCE(ret);
1115 }
1116
1117 static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
1118                 struct nvme_completion *cqe, struct ib_wc *wc, int tag)
1119 {
1120         u16 status = le16_to_cpu(cqe->status);
1121         struct request *rq;
1122         struct nvme_rdma_request *req;
1123         int ret = 0;
1124
1125         status >>= 1;
1126
1127         rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id);
1128         if (!rq) {
1129                 dev_err(queue->ctrl->ctrl.device,
1130                         "tag 0x%x on QP %#x not found\n",
1131                         cqe->command_id, queue->qp->qp_num);
1132                 nvme_rdma_error_recovery(queue->ctrl);
1133                 return ret;
1134         }
1135         req = blk_mq_rq_to_pdu(rq);
1136
1137         if (rq->cmd_type == REQ_TYPE_DRV_PRIV && rq->special)
1138                 memcpy(rq->special, cqe, sizeof(*cqe));
1139
1140         if (rq->tag == tag)
1141                 ret = 1;
1142
1143         if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) &&
1144             wc->ex.invalidate_rkey == req->mr->rkey)
1145                 req->mr->need_inval = false;
1146
1147         blk_mq_complete_request(rq, status);
1148
1149         return ret;
1150 }
1151
1152 static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
1153 {
1154         struct nvme_rdma_qe *qe =
1155                 container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
1156         struct nvme_rdma_queue *queue = cq->cq_context;
1157         struct ib_device *ibdev = queue->device->dev;
1158         struct nvme_completion *cqe = qe->data;
1159         const size_t len = sizeof(struct nvme_completion);
1160         int ret = 0;
1161
1162         if (unlikely(wc->status != IB_WC_SUCCESS)) {
1163                 nvme_rdma_wr_error(cq, wc, "RECV");
1164                 return 0;
1165         }
1166
1167         ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1168         /*
1169          * AEN requests are special as they don't time out and can
1170          * survive any kind of queue freeze and often don't respond to
1171          * aborts.  We don't even bother to allocate a struct request
1172          * for them but rather special case them here.
1173          */
1174         if (unlikely(nvme_rdma_queue_idx(queue) == 0 &&
1175                         cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH))
1176                 nvme_complete_async_event(&queue->ctrl->ctrl, cqe);
1177         else
1178                 ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag);
1179         ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1180
1181         nvme_rdma_post_recv(queue, qe);
1182         return ret;
1183 }
1184
1185 static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
1186 {
1187         __nvme_rdma_recv_done(cq, wc, -1);
1188 }
1189
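/* The connection is up: pre-post one receive per response ring entry. */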
1190 static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
1191 {
1192         int ret, i;
1193
1194         for (i = 0; i < queue->queue_size; i++) {
1195                 ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]);
1196                 if (ret)
1197                         goto out_destroy_queue_ib;
1198         }
1199
1200         return 0;
1201
1202 out_destroy_queue_ib:
1203         nvme_rdma_destroy_queue_ib(queue);
1204         return ret;
1205 }
1206
1207 static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
1208                 struct rdma_cm_event *ev)
1209 {
1210         if (ev->param.conn.private_data_len) {
1211                 struct nvme_rdma_cm_rej *rej =
1212                         (struct nvme_rdma_cm_rej *)ev->param.conn.private_data;
1213
1214                 dev_err(queue->ctrl->ctrl.device,
1215                         "Connect rejected, status %d.\n", le16_to_cpu(rej->sts));
1216                 /* XXX: Think of something clever to do here... */
1217         } else {
1218                 dev_err(queue->ctrl->ctrl.device,
1219                         "Connect rejected, no private data.\n");
1220         }
1221
1222         return -ECONNRESET;
1223 }
1224
1225 static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
1226 {
1227         struct nvme_rdma_device *dev;
1228         int ret;
1229
1230         dev = nvme_rdma_find_get_device(queue->cm_id);
1231         if (!dev) {
1232                 dev_err(queue->cm_id->device->dma_device,
1233                         "no client data found!\n");
1234                 return -ECONNREFUSED;
1235         }
1236
1237         ret = nvme_rdma_create_queue_ib(queue, dev);
1238         if (ret) {
1239                 nvme_rdma_dev_put(dev);
1240                 goto out;
1241         }
1242
1243         ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
1244         if (ret) {
1245                 dev_err(queue->ctrl->ctrl.device,
1246                         "rdma_resolve_route failed (%d).\n",
1247                         queue->cm_error);
1248                 goto out_destroy_queue;
1249         }
1250
1251         return 0;
1252
1253 out_destroy_queue:
1254         nvme_rdma_destroy_queue_ib(queue);
1255 out:
1256         return ret;
1257 }
1258
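/*
 * Build the NVMe/RDMA CM request private data (record format, QID, host
 * queue sizes) and initiate the RDMA connection.
 */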
1259 static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
1260 {
1261         struct nvme_rdma_ctrl *ctrl = queue->ctrl;
1262         struct rdma_conn_param param = { };
1263         struct nvme_rdma_cm_req priv = { };
1264         int ret;
1265
1266         param.qp_num = queue->qp->qp_num;
1267         param.flow_control = 1;
1268
1269         param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom;
1270         /* maximum retry count */
1271         param.retry_count = 7;
1272         param.rnr_retry_count = 7;
1273         param.private_data = &priv;
1274         param.private_data_len = sizeof(priv);
1275
1276         priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1277         priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue));
1278         /*
1279          * set the admin queue depth to the minimum size
1280          * specified by the Fabrics standard.
1281          */
1282         if (priv.qid == 0) {
1283                 priv.hrqsize = cpu_to_le16(NVMF_AQ_DEPTH);
1284                 priv.hsqsize = cpu_to_le16(NVMF_AQ_DEPTH - 1);
1285         } else {
1286                 /*
1287                  * The current interpretation of the fabrics spec is
1288                  * that at a minimum you make hrqsize sqsize+1, i.e. a
1289                  * 1's based representation of sqsize.
1290                  */
1291                 priv.hrqsize = cpu_to_le16(queue->queue_size);
1292                 priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize);
1293         }
1294
1295         ret = rdma_connect(queue->cm_id, &param);
1296         if (ret) {
1297                 dev_err(ctrl->ctrl.device,
1298                         "rdma_connect failed (%d).\n", ret);
1299                 goto out_destroy_queue_ib;
1300         }
1301
1302         return 0;
1303
1304 out_destroy_queue_ib:
1305         nvme_rdma_destroy_queue_ib(queue);
1306         return ret;
1307 }
1308
1309 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
1310                 struct rdma_cm_event *ev)
1311 {
1312         struct nvme_rdma_queue *queue = cm_id->context;
1313         int cm_error = 0;
1314
1315         dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n",
1316                 rdma_event_msg(ev->event), ev->event,
1317                 ev->status, cm_id);
1318
1319         switch (ev->event) {
1320         case RDMA_CM_EVENT_ADDR_RESOLVED:
1321                 cm_error = nvme_rdma_addr_resolved(queue);
1322                 break;
1323         case RDMA_CM_EVENT_ROUTE_RESOLVED:
1324                 cm_error = nvme_rdma_route_resolved(queue);
1325                 break;
1326         case RDMA_CM_EVENT_ESTABLISHED:
1327                 queue->cm_error = nvme_rdma_conn_established(queue);
1328                 /* complete cm_done regardless of success/failure */
1329                 complete(&queue->cm_done);
1330                 return 0;
1331         case RDMA_CM_EVENT_REJECTED:
1332                 cm_error = nvme_rdma_conn_rejected(queue, ev);
1333                 break;
1334         case RDMA_CM_EVENT_ADDR_ERROR:
1335         case RDMA_CM_EVENT_ROUTE_ERROR:
1336         case RDMA_CM_EVENT_CONNECT_ERROR:
1337         case RDMA_CM_EVENT_UNREACHABLE:
1338                 dev_dbg(queue->ctrl->ctrl.device,
1339                         "CM error event %d\n", ev->event);
1340                 cm_error = -ECONNRESET;
1341                 break;
1342         case RDMA_CM_EVENT_DISCONNECTED:
1343         case RDMA_CM_EVENT_ADDR_CHANGE:
1344         case RDMA_CM_EVENT_TIMEWAIT_EXIT:
1345                 dev_dbg(queue->ctrl->ctrl.device,
1346                         "disconnect received - connection closed\n");
1347                 nvme_rdma_error_recovery(queue->ctrl);
1348                 break;
1349         case RDMA_CM_EVENT_DEVICE_REMOVAL:
1350                 /* device removal is handled via the ib_client API */
1351                 break;
1352         default:
1353                 dev_err(queue->ctrl->ctrl.device,
1354                         "Unexpected RDMA CM event (%d)\n", ev->event);
1355                 nvme_rdma_error_recovery(queue->ctrl);
1356                 break;
1357         }
1358
1359         if (cm_error) {
1360                 queue->cm_error = cm_error;
1361                 complete(&queue->cm_done);
1362         }
1363
1364         return 0;
1365 }
1366
1367 static enum blk_eh_timer_return
1368 nvme_rdma_timeout(struct request *rq, bool reserved)
1369 {
1370         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1371
1372         /* queue error recovery */
1373         nvme_rdma_error_recovery(req->queue->ctrl);
1374
1375         /* fail with DNR on cmd timeout */
1376         rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
1377
1378         return BLK_EH_HANDLED;
1379 }
1380
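/*
 * blk-mq .queue_rq handler: set up the command capsule, map the data and
 * post the SEND, chained after an MR registration WR when one is needed.
 */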
1381 static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
1382                 const struct blk_mq_queue_data *bd)
1383 {
1384         struct nvme_ns *ns = hctx->queue->queuedata;
1385         struct nvme_rdma_queue *queue = hctx->driver_data;
1386         struct request *rq = bd->rq;
1387         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1388         struct nvme_rdma_qe *sqe = &req->sqe;
1389         struct nvme_command *c = sqe->data;
1390         bool flush = false;
1391         struct ib_device *dev;
1392         unsigned int map_len;
1393         int ret;
1394
1395         WARN_ON_ONCE(rq->tag < 0);
1396
1397         dev = queue->device->dev;
1398         ib_dma_sync_single_for_cpu(dev, sqe->dma,
1399                         sizeof(struct nvme_command), DMA_TO_DEVICE);
1400
1401         ret = nvme_setup_cmd(ns, rq, c);
1402         if (ret)
1403                 return ret;
1404
1405         c->common.command_id = rq->tag;
1406         blk_mq_start_request(rq);
1407
1408         map_len = nvme_map_len(rq);
1409         ret = nvme_rdma_map_data(queue, rq, map_len, c);
1410         if (ret < 0) {
1411                 dev_err(queue->ctrl->ctrl.device,
1412                              "Failed to map data (%d)\n", ret);
1413                 nvme_cleanup_cmd(rq);
1414                 goto err;
1415         }
1416
1417         ib_dma_sync_single_for_device(dev, sqe->dma,
1418                         sizeof(struct nvme_command), DMA_TO_DEVICE);
1419
1420         if (rq->cmd_type == REQ_TYPE_FS && req_op(rq) == REQ_OP_FLUSH)
1421                 flush = true;
1422         ret = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
1423                         req->mr->need_inval ? &req->reg_wr.wr : NULL, flush);
1424         if (ret) {
1425                 nvme_rdma_unmap_data(queue, rq);
1426                 goto err;
1427         }
1428
1429         return BLK_MQ_RQ_QUEUE_OK;
1430 err:
1431         return (ret == -ENOMEM || ret == -EAGAIN) ?
1432                 BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR;
1433 }
1434
1435 static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
1436 {
1437         struct nvme_rdma_queue *queue = hctx->driver_data;
1438         struct ib_cq *cq = queue->ib_cq;
1439         struct ib_wc wc;
1440         int found = 0;
1441
1442         ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
1443         while (ib_poll_cq(cq, 1, &wc) > 0) {
1444                 struct ib_cqe *cqe = wc.wr_cqe;
1445
1446                 if (cqe) {
1447                         if (cqe->done == nvme_rdma_recv_done)
1448                                 found |= __nvme_rdma_recv_done(cq, &wc, tag);
1449                         else
1450                                 cqe->done(cq, &wc);
1451                 }
1452         }
1453
1454         return found;
1455 }
1456
1457 static void nvme_rdma_complete_rq(struct request *rq)
1458 {
1459         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1460         struct nvme_rdma_queue *queue = req->queue;
1461         int error = 0;
1462
1463         nvme_rdma_unmap_data(queue, rq);
1464
1465         if (unlikely(rq->errors)) {
1466                 if (nvme_req_needs_retry(rq, rq->errors)) {
1467                         nvme_requeue_req(rq);
1468                         return;
1469                 }
1470
1471                 if (rq->cmd_type == REQ_TYPE_DRV_PRIV)
1472                         error = rq->errors;
1473                 else
1474                         error = nvme_error_status(rq->errors);
1475         }
1476
1477         blk_mq_end_request(rq, error);
1478 }
1479
1480 static struct blk_mq_ops nvme_rdma_mq_ops = {
1481         .queue_rq       = nvme_rdma_queue_rq,
1482         .complete       = nvme_rdma_complete_rq,
1483         .map_queue      = blk_mq_map_queue,
1484         .init_request   = nvme_rdma_init_request,
1485         .exit_request   = nvme_rdma_exit_request,
1486         .reinit_request = nvme_rdma_reinit_request,
1487         .init_hctx      = nvme_rdma_init_hctx,
1488         .poll           = nvme_rdma_poll,
1489         .timeout        = nvme_rdma_timeout,
1490 };
1491
1492 static struct blk_mq_ops nvme_rdma_admin_mq_ops = {
1493         .queue_rq       = nvme_rdma_queue_rq,
1494         .complete       = nvme_rdma_complete_rq,
1495         .map_queue      = blk_mq_map_queue,
1496         .init_request   = nvme_rdma_init_admin_request,
1497         .exit_request   = nvme_rdma_exit_admin_request,
1498         .reinit_request = nvme_rdma_reinit_request,
1499         .init_hctx      = nvme_rdma_init_admin_hctx,
1500         .timeout        = nvme_rdma_timeout,
1501 };
1502
1503 static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl)
1504 {
1505         int error;
1506
1507         error = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH);
1508         if (error)
1509                 return error;
1510
1511         ctrl->device = ctrl->queues[0].device;
1512
1513         /*
1514          * We need a reference on the device as long as the tag_set is alive,
1515          * as the MRs in the request structures need a valid ib_device.
1516          */
1517         error = -EINVAL;
1518         if (!nvme_rdma_dev_get(ctrl->device))
1519                 goto out_free_queue;
1520
1521         ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS,
1522                 ctrl->device->dev->attrs.max_fast_reg_page_list_len);
1523
1524         memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
1525         ctrl->admin_tag_set.ops = &nvme_rdma_admin_mq_ops;
1526         ctrl->admin_tag_set.queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH;
1527         ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */
1528         ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
1529         ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_rdma_request) +
1530                 SG_CHUNK_SIZE * sizeof(struct scatterlist);
1531         ctrl->admin_tag_set.driver_data = ctrl;
1532         ctrl->admin_tag_set.nr_hw_queues = 1;
1533         ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
1534
1535         error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
1536         if (error)
1537                 goto out_put_dev;
1538
1539         ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
1540         if (IS_ERR(ctrl->ctrl.admin_q)) {
1541                 error = PTR_ERR(ctrl->ctrl.admin_q);
1542                 goto out_free_tagset;
1543         }
1544
1545         error = nvmf_connect_admin_queue(&ctrl->ctrl);
1546         if (error)
1547                 goto out_cleanup_queue;
1548
1549         error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap);
1550         if (error) {
1551                 dev_err(ctrl->ctrl.device,
1552                         "prop_get NVME_REG_CAP failed\n");
1553                 goto out_cleanup_queue;
1554         }
1555
1556         ctrl->ctrl.sqsize =
1557                 min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize);
1558
1559         error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
1560         if (error)
1561                 goto out_cleanup_queue;
1562
        ctrl->ctrl.max_hw_sectors =
                (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9);

        error = nvme_init_identify(&ctrl->ctrl);
        if (error)
                goto out_cleanup_queue;

        error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev,
                        &ctrl->async_event_sqe, sizeof(struct nvme_command),
                        DMA_TO_DEVICE);
        if (error)
                goto out_cleanup_queue;

        nvme_start_keep_alive(&ctrl->ctrl);

        return 0;

out_cleanup_queue:
        blk_cleanup_queue(ctrl->ctrl.admin_q);
out_free_tagset:
        /* disconnect and drain the queue before freeing the tagset */
        nvme_rdma_stop_queue(&ctrl->queues[0]);
        blk_mq_free_tag_set(&ctrl->admin_tag_set);
out_put_dev:
        nvme_rdma_dev_put(ctrl->device);
out_free_queue:
        nvme_rdma_free_queue(&ctrl->queues[0]);
        return error;
}

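/*
 * Tear the controller down: stop keep-alive and any pending error/reconnect
 * work first so nothing races with the teardown, fail all outstanding I/O
 * and free the I/O queues, send a shutdown to the controller if the admin
 * queue is still connected, and finally cancel admin commands and destroy
 * the admin queue.
 */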
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl)
{
        nvme_stop_keep_alive(&ctrl->ctrl);
        cancel_work_sync(&ctrl->err_work);
        cancel_delayed_work_sync(&ctrl->reconnect_work);

        if (ctrl->queue_count > 1) {
                nvme_stop_queues(&ctrl->ctrl);
                blk_mq_tagset_busy_iter(&ctrl->tag_set,
                                        nvme_cancel_request, &ctrl->ctrl);
                nvme_rdma_free_io_queues(ctrl);
        }

        if (test_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[0].flags))
                nvme_shutdown_ctrl(&ctrl->ctrl);

        blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
        blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
                                nvme_cancel_request, &ctrl->ctrl);
        nvme_rdma_destroy_admin_queue(ctrl);
}

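/*
 * Final controller removal.  @shutdown is false when the caller has already
 * torn the controller down (e.g. after a failed reset), so the shutdown is
 * not done twice.
 */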
static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
{
        nvme_uninit_ctrl(&ctrl->ctrl);
        if (shutdown)
                nvme_rdma_shutdown_ctrl(ctrl);

        if (ctrl->ctrl.tagset) {
                blk_cleanup_queue(ctrl->ctrl.connect_q);
                blk_mq_free_tag_set(&ctrl->tag_set);
                nvme_rdma_dev_put(ctrl->device);
        }

        nvme_put_ctrl(&ctrl->ctrl);
}

static void nvme_rdma_del_ctrl_work(struct work_struct *work)
{
        struct nvme_rdma_ctrl *ctrl = container_of(work,
                                struct nvme_rdma_ctrl, delete_work);

        __nvme_rdma_remove_ctrl(ctrl, true);
}

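/*
 * Schedule asynchronous controller deletion.  Returns -EBUSY if the state
 * machine refuses the transition to DELETING or the delete work is already
 * queued.
 */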
static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl)
{
        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
                return -EBUSY;

        if (!queue_work(nvme_rdma_wq, &ctrl->delete_work))
                return -EBUSY;

        return 0;
}

static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl)
{
        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
        int ret = 0;

        /*
         * Keep a reference until all work is flushed since
         * __nvme_rdma_del_ctrl can free the ctrl mem
         */
        if (!kref_get_unless_zero(&ctrl->ctrl.kref))
                return -EBUSY;
        ret = __nvme_rdma_del_ctrl(ctrl);
        if (!ret)
                flush_work(&ctrl->delete_work);
        nvme_put_ctrl(&ctrl->ctrl);
        return ret;
}

static void nvme_rdma_remove_ctrl_work(struct work_struct *work)
{
        struct nvme_rdma_ctrl *ctrl = container_of(work,
                                struct nvme_rdma_ctrl, delete_work);

        __nvme_rdma_remove_ctrl(ctrl, false);
}

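/*
 * Controller reset: tear everything down, rebuild the admin queue and then
 * the I/O queues.  If the rebuild fails the controller is unrecoverable, so
 * it is removed without going through a second shutdown (note the switch to
 * nvme_rdma_remove_ctrl_work below).
 */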
static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
{
        struct nvme_rdma_ctrl *ctrl = container_of(work,
                                        struct nvme_rdma_ctrl, reset_work);
        int ret;
        bool changed;

        nvme_rdma_shutdown_ctrl(ctrl);

        ret = nvme_rdma_configure_admin_queue(ctrl);
        if (ret) {
                /* ctrl is already shutdown, just remove the ctrl */
                INIT_WORK(&ctrl->delete_work, nvme_rdma_remove_ctrl_work);
                goto del_dead_ctrl;
        }

        if (ctrl->queue_count > 1) {
                ret = blk_mq_reinit_tagset(&ctrl->tag_set);
                if (ret)
                        goto del_dead_ctrl;

                ret = nvme_rdma_init_io_queues(ctrl);
                if (ret)
                        goto del_dead_ctrl;

                ret = nvme_rdma_connect_io_queues(ctrl);
                if (ret)
                        goto del_dead_ctrl;
        }

        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
        WARN_ON_ONCE(!changed);

        if (ctrl->queue_count > 1) {
                nvme_start_queues(&ctrl->ctrl);
                nvme_queue_scan(&ctrl->ctrl);
                nvme_queue_async_events(&ctrl->ctrl);
        }

        return;

del_dead_ctrl:
        /* Deleting this dead controller... */
        dev_warn(ctrl->ctrl.device, "Removing after reset failure\n");
        WARN_ON(!queue_work(nvme_rdma_wq, &ctrl->delete_work));
}

static int nvme_rdma_reset_ctrl(struct nvme_ctrl *nctrl)
{
        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);

        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
                return -EBUSY;

        if (!queue_work(nvme_rdma_wq, &ctrl->reset_work))
                return -EBUSY;

        flush_work(&ctrl->reset_work);

        return 0;
}

static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
        .name                   = "rdma",
        .module                 = THIS_MODULE,
        .is_fabrics             = true,
        .reg_read32             = nvmf_reg_read32,
        .reg_read64             = nvmf_reg_read64,
        .reg_write32            = nvmf_reg_write32,
        .reset_ctrl             = nvme_rdma_reset_ctrl,
        .free_ctrl              = nvme_rdma_free_ctrl,
        .submit_async_event     = nvme_rdma_submit_async_event,
        .delete_ctrl            = nvme_rdma_del_ctrl,
        .get_subsysnqn          = nvmf_get_subsysnqn,
        .get_address            = nvmf_get_address,
};

static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl)
{
        struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
        int ret;

        ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues);
        if (ret)
                return ret;

        ctrl->queue_count = opts->nr_io_queues + 1;
        if (ctrl->queue_count < 2)
                return 0;

        dev_info(ctrl->ctrl.device,
                "creating %d I/O queues.\n", opts->nr_io_queues);

        ret = nvme_rdma_init_io_queues(ctrl);
        if (ret)
                return ret;

        /*
         * We need a reference on the device as long as the tag_set is alive,
         * as the MRs in the request structures need a valid ib_device.
         */
        ret = -EINVAL;
        if (!nvme_rdma_dev_get(ctrl->device))
                goto out_free_io_queues;

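        /*
         * I/O tag set: sized from the user-requested queue size, with one
         * tag reserved for the fabrics Connect on each queue.  As with the
         * admin set, cmd_size reserves an inline scatterlist of
         * SG_CHUNK_SIZE entries per request.  Queue 0 is the admin queue,
         * hence nr_hw_queues = queue_count - 1.
         */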
        memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
        ctrl->tag_set.ops = &nvme_rdma_mq_ops;
        ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
        ctrl->tag_set.reserved_tags = 1; /* fabric connect */
        ctrl->tag_set.numa_node = NUMA_NO_NODE;
        ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
        ctrl->tag_set.cmd_size = sizeof(struct nvme_rdma_request) +
                SG_CHUNK_SIZE * sizeof(struct scatterlist);
        ctrl->tag_set.driver_data = ctrl;
        ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1;
        ctrl->tag_set.timeout = NVME_IO_TIMEOUT;

        ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
        if (ret)
                goto out_put_dev;
        ctrl->ctrl.tagset = &ctrl->tag_set;

        ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
        if (IS_ERR(ctrl->ctrl.connect_q)) {
                ret = PTR_ERR(ctrl->ctrl.connect_q);
                goto out_free_tag_set;
        }

        ret = nvme_rdma_connect_io_queues(ctrl);
        if (ret)
                goto out_cleanup_connect_q;

        return 0;

out_cleanup_connect_q:
        blk_cleanup_queue(ctrl->ctrl.connect_q);
out_free_tag_set:
        blk_mq_free_tag_set(&ctrl->tag_set);
out_put_dev:
        nvme_rdma_dev_put(ctrl->device);
out_free_io_queues:
        nvme_rdma_free_io_queues(ctrl);
        return ret;
}

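/*
 * Parse the target address.  Only dotted-quad IPv4 is handled here (see the
 * XXX below); anything else is rejected with -EINVAL.
 */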
static int nvme_rdma_parse_ipaddr(struct sockaddr_in *in_addr, char *p)
{
        u8 *addr = (u8 *)&in_addr->sin_addr.s_addr;
        size_t buflen = strlen(p);

        /* XXX: handle IPv6 addresses */

        if (buflen > INET_ADDRSTRLEN)
                return -EINVAL;
        if (in4_pton(p, buflen, addr, '\0', NULL) == 0)
                return -EINVAL;
        in_addr->sin_family = AF_INET;
        return 0;
}

static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
                struct nvmf_ctrl_options *opts)
{
        struct nvme_rdma_ctrl *ctrl;
        int ret;
        bool changed;

        ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
        if (!ctrl)
                return ERR_PTR(-ENOMEM);
        ctrl->ctrl.opts = opts;
        INIT_LIST_HEAD(&ctrl->list);

        ret = nvme_rdma_parse_ipaddr(&ctrl->addr_in, opts->traddr);
        if (ret) {
                pr_err("malformed IP address passed: %s\n", opts->traddr);
                goto out_free_ctrl;
        }

        if (opts->mask & NVMF_OPT_TRSVCID) {
                u16 port;

                ret = kstrtou16(opts->trsvcid, 0, &port);
                if (ret)
                        goto out_free_ctrl;

                ctrl->addr_in.sin_port = cpu_to_be16(port);
        } else {
                ctrl->addr_in.sin_port = cpu_to_be16(NVME_RDMA_IP_PORT);
        }

        ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
                                0 /* no quirks, we're perfect! */);
        if (ret)
                goto out_free_ctrl;

        ctrl->reconnect_delay = opts->reconnect_delay;
        INIT_DELAYED_WORK(&ctrl->reconnect_work,
                        nvme_rdma_reconnect_ctrl_work);
        INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
        INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work);
        INIT_WORK(&ctrl->reset_work, nvme_rdma_reset_ctrl_work);
        spin_lock_init(&ctrl->lock);

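        /*
         * queue_count includes the admin queue; sqsize is zero-based, so a
         * requested queue_size of N becomes an sqsize of N - 1.
         */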
        ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
        ctrl->ctrl.sqsize = opts->queue_size - 1;
        ctrl->ctrl.kato = opts->kato;

        ret = -ENOMEM;
        ctrl->queues = kcalloc(ctrl->queue_count, sizeof(*ctrl->queues),
                                GFP_KERNEL);
        if (!ctrl->queues)
                goto out_uninit_ctrl;

        ret = nvme_rdma_configure_admin_queue(ctrl);
        if (ret)
                goto out_kfree_queues;

        /* sanity check icdoff */
        if (ctrl->ctrl.icdoff) {
                dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
                goto out_remove_admin_queue;
        }

        /* sanity check keyed sgls */
        if (!(ctrl->ctrl.sgls & (1 << 20))) {
                dev_err(ctrl->ctrl.device, "Mandatory keyed sgls are not supported\n");
                goto out_remove_admin_queue;
        }

        if (opts->queue_size > ctrl->ctrl.maxcmd) {
                /* warn if maxcmd is lower than queue_size */
                dev_warn(ctrl->ctrl.device,
                        "queue_size %zu > ctrl maxcmd %u, clamping down\n",
                        opts->queue_size, ctrl->ctrl.maxcmd);
                opts->queue_size = ctrl->ctrl.maxcmd;
        }

        if (opts->nr_io_queues) {
                ret = nvme_rdma_create_io_queues(ctrl);
                if (ret)
                        goto out_remove_admin_queue;
        }

        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
        WARN_ON_ONCE(!changed);

        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
                ctrl->ctrl.opts->subsysnqn, &ctrl->addr);

        kref_get(&ctrl->ctrl.kref);

        mutex_lock(&nvme_rdma_ctrl_mutex);
        list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
        mutex_unlock(&nvme_rdma_ctrl_mutex);

        if (opts->nr_io_queues) {
                nvme_queue_scan(&ctrl->ctrl);
                nvme_queue_async_events(&ctrl->ctrl);
        }

        return &ctrl->ctrl;

out_remove_admin_queue:
        nvme_stop_keep_alive(&ctrl->ctrl);
        nvme_rdma_destroy_admin_queue(ctrl);
out_kfree_queues:
        kfree(ctrl->queues);
out_uninit_ctrl:
        nvme_uninit_ctrl(&ctrl->ctrl);
        nvme_put_ctrl(&ctrl->ctrl);
        if (ret > 0)
                ret = -EIO;
        return ERR_PTR(ret);
out_free_ctrl:
        kfree(ctrl);
        return ERR_PTR(ret);
}

static struct nvmf_transport_ops nvme_rdma_transport = {
        .name           = "rdma",
        .required_opts  = NVMF_OPT_TRADDR,
        .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY,
        .create_ctrl    = nvme_rdma_create_ctrl,
};

static void nvme_rdma_add_one(struct ib_device *ib_device)
{
}

static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
        struct nvme_rdma_ctrl *ctrl;

        /* Delete all controllers using this device */
        mutex_lock(&nvme_rdma_ctrl_mutex);
        list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
                if (ctrl->device->dev != ib_device)
                        continue;
                dev_info(ctrl->ctrl.device,
                        "Removing ctrl: NQN \"%s\", addr %pISp\n",
                        ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
                __nvme_rdma_del_ctrl(ctrl);
        }
        mutex_unlock(&nvme_rdma_ctrl_mutex);

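        /*
         * Wait for the queued delete work to finish so no controller is
         * still using the ib_device once we return to the RDMA core.
         */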
        flush_workqueue(nvme_rdma_wq);
}

static struct ib_client nvme_rdma_ib_client = {
        .name   = "nvme_rdma",
        .add = nvme_rdma_add_one,
        .remove = nvme_rdma_remove_one
};

static int __init nvme_rdma_init_module(void)
{
        int ret;

        nvme_rdma_wq = create_workqueue("nvme_rdma_wq");
        if (!nvme_rdma_wq)
                return -ENOMEM;

        ret = ib_register_client(&nvme_rdma_ib_client);
        if (ret) {
                destroy_workqueue(nvme_rdma_wq);
                return ret;
        }

        nvmf_register_transport(&nvme_rdma_transport);
        return 0;
}

static void __exit nvme_rdma_cleanup_module(void)
{
        nvmf_unregister_transport(&nvme_rdma_transport);
        ib_unregister_client(&nvme_rdma_ib_client);
        destroy_workqueue(nvme_rdma_wq);
}

module_init(nvme_rdma_init_module);
module_exit(nvme_rdma_cleanup_module);

MODULE_LICENSE("GPL v2");