/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <linux/kref.h>
#include <linux/random.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <rdma/ib_verbs.h>
#include "mlx5_ib.h"

enum {
	MAX_PENDING_REG_MR = 8,
};
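/*
 * At most MAX_PENDING_REG_MR asynchronous mkey-creation commands may be in
 * flight per cache entry at any time; add_keys() enforces this via
 * ent->pending.
 */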
#define MLX5_UMR_ALIGN 2048
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
static __be64 mlx5_ib_update_mtt_emergency_buffer[
		MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)]
	__aligned(MLX5_UMR_ALIGN);
static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
#endif

static int clean_mr(struct mlx5_ib_mr *mr);
static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
	/* Wait until all page fault handlers using the mr complete. */
	synchronize_srcu(&dev->mr_srcu);
#endif

	return err;
}
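/*
 * Map an MR order (log2 of the number of pages) to an index into the MR
 * cache, relative to the smallest order the cache holds (cache->ent[0]).
 */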
static int order2idx(struct mlx5_ib_dev *dev, int order)
{
	struct mlx5_mr_cache *cache = &dev->cache;

	if (order < cache->ent[0].order)
		return 0;

	return order - cache->ent[0].order;
}
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
static void update_odp_mr(struct mlx5_ib_mr *mr)
{
	if (mr->umem->odp_data) {
		/*
		 * This barrier prevents the compiler from moving the
		 * setting of umem->odp_data->private to point to our
		 * MR, before reg_umr finished, to ensure that MR
		 * initialization has finished before we start to
		 * handle invalidations.
		 */
		smp_wmb();
		mr->umem->odp_data->private = mr;
		/*
		 * Make sure we will see the new
		 * umem->odp_data->private value in the invalidation
		 * routines, before we can get page faults on the
		 * MR. Page faults can happen once we put the MR in
		 * the tree, below this line. Without the barrier,
		 * there can be a fault handling and an invalidation
		 * before umem->odp_data->private == mr is visible to
		 * the invalidation handler.
		 */
		smp_wmb();
	}
}
#endif
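/*
 * Completion callback for the asynchronous mkey creation issued by
 * add_keys(). On failure it arms dev->delay_timer so that further cache
 * fills back off for a while; on success it derives the full mkey from the
 * returned index, adds the MR to its cache entry's free list and inserts
 * it into the device mkey radix tree.
 */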
static void reg_mr_callback(int status, void *context)
{
	struct mlx5_ib_mr *mr = context;
	struct mlx5_ib_dev *dev = mr->dev;
	struct mlx5_mr_cache *cache = &dev->cache;
	int c = order2idx(dev, mr->order);
	struct mlx5_cache_ent *ent = &cache->ent[c];
	struct mlx5_mr_table *table = &dev->mdev->priv.mr_table;

	spin_lock_irqsave(&ent->lock, flags);
	spin_unlock_irqrestore(&ent->lock, flags);
	mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
	mod_timer(&dev->delay_timer, jiffies + HZ);

	if (mr->out.hdr.status) {
		mlx5_ib_warn(dev, "failed - status %d, syndrome 0x%x\n",
			     be32_to_cpu(mr->out.hdr.syndrome));
		mod_timer(&dev->delay_timer, jiffies + HZ);
	}

	spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags);
	key = dev->mdev->priv.mkey_key++;
	spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags);
	mr->mmr.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key;

	cache->last_add = jiffies;

	spin_lock_irqsave(&ent->lock, flags);
	list_add_tail(&mr->list, &ent->head);
	spin_unlock_irqrestore(&ent->lock, flags);

	write_lock_irqsave(&table->lock, flags);
	err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmr.key),
	pr_err("Error inserting to mr tree. 0x%x\n", -err);
	write_unlock_irqrestore(&table->lock, flags);
}
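/*
 * add_keys() posts up to 'num' asynchronous mkey-creation commands for cache
 * entry 'c', each covering 1 << ent->order pages, while keeping at most
 * MAX_PENDING_REG_MR commands in flight; reg_mr_callback() completes them.
 */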
static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[c];
	struct mlx5_create_mkey_mbox_in *in;
	struct mlx5_ib_mr *mr;
	int npages = 1 << ent->order;

	in = kzalloc(sizeof(*in), GFP_KERNEL);

	for (i = 0; i < num; i++) {
		if (ent->pending >= MAX_PENDING_REG_MR) {
		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
		mr->order = ent->order;
		in->seg.status = MLX5_MKEY_STATUS_FREE;
		in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2);
		in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
		in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN;
		in->seg.log2_page_size = 12;

		spin_lock_irq(&ent->lock);
		spin_unlock_irq(&ent->lock);
		err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in,
					    sizeof(*in), reg_mr_callback,
		spin_lock_irq(&ent->lock);
		spin_unlock_irq(&ent->lock);
		mlx5_ib_warn(dev, "create mkey failed %d\n", err);
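/*
 * remove_keys() pops up to 'num' MRs from the head of the entry's free list
 * and destroys their mkeys, stopping early if the list empties.
 */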
static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[c];
	struct mlx5_ib_mr *mr;

	for (i = 0; i < num; i++) {
		spin_lock_irq(&ent->lock);
		if (list_empty(&ent->head)) {
			spin_unlock_irq(&ent->lock);
		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
		spin_unlock_irq(&ent->lock);
		err = destroy_mkey(dev, mr);
		mlx5_ib_warn(dev, "failed destroy mkey\n");
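/*
 * debugfs knobs: for every cache entry, writing "size" grows or shrinks the
 * entry to the requested number of MRs (via add_keys()/remove_keys()), while
 * "limit" sets the low-water mark below which the entry is refilled; "cur"
 * and "miss" expose the current count and the allocation misses.
 */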
static ssize_t size_write(struct file *filp, const char __user *buf,
			  size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	struct mlx5_ib_dev *dev = ent->dev;

	if (copy_from_user(lbuf, buf, sizeof(lbuf)))

	c = order2idx(dev, ent->order);
	lbuf[sizeof(lbuf) - 1] = 0;

	if (sscanf(lbuf, "%u", &var) != 1)

	if (var < ent->limit)

	if (var > ent->size) {
		err = add_keys(dev, c, var - ent->size);
		if (err && err != -EAGAIN)
		usleep_range(3000, 5000);
	} else if (var < ent->size) {
		remove_keys(dev, c, ent->size - var);

static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
			 loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;

	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size);

	if (copy_to_user(buf, lbuf, err))

static const struct file_operations size_fops = {
	.owner	= THIS_MODULE,

static ssize_t limit_write(struct file *filp, const char __user *buf,
			   size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	struct mlx5_ib_dev *dev = ent->dev;

	if (copy_from_user(lbuf, buf, sizeof(lbuf)))

	c = order2idx(dev, ent->order);
	lbuf[sizeof(lbuf) - 1] = 0;

	if (sscanf(lbuf, "%u", &var) != 1)

	if (ent->cur < ent->limit) {
		err = add_keys(dev, c, 2 * ent->limit - ent->cur);

static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
			  loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;

	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);

	if (copy_to_user(buf, lbuf, err))

static const struct file_operations limit_fops = {
	.owner	= THIS_MODULE,
	.write	= limit_write,
static int someone_adding(struct mlx5_mr_cache *cache)
{
	int i;

	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
		if (cache->ent[i].cur < cache->ent[i].limit)
			return 1;
	}

	return 0;
}
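/*
 * __cache_work_func() keeps each cache entry between limit and 2 * limit
 * free MRs: it adds keys (one at a time) while below 2 * limit, and lazily
 * removes surplus keys once the entry has been over 2 * limit and idle for
 * a long time, deferring to someone_adding() and need_resched() so that the
 * shrink work never competes with ongoing registrations.
 */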
static void __cache_work_func(struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_dev *dev = ent->dev;
	struct mlx5_mr_cache *cache = &dev->cache;
	int i = order2idx(dev, ent->order);

	ent = &dev->cache.ent[i];
	if (ent->cur < 2 * ent->limit && !dev->fill_delay) {
		err = add_keys(dev, i, 1);
		if (ent->cur < 2 * ent->limit) {
			if (err == -EAGAIN) {
				mlx5_ib_dbg(dev, "returned eagain, order %d\n",
				queue_delayed_work(cache->wq, &ent->dwork,
						   msecs_to_jiffies(3));
				mlx5_ib_warn(dev, "command failed order %d, err %d\n",
				queue_delayed_work(cache->wq, &ent->dwork,
						   msecs_to_jiffies(1000));
				queue_work(cache->wq, &ent->work);
	} else if (ent->cur > 2 * ent->limit) {
		/*
		 * The remove_keys() logic is performed as a garbage-collection
		 * task. Such a task is intended to run when no other active
		 * processes are running.
		 *
		 * need_resched() returns true if there are user tasks to be
		 * activated in the near future.
		 *
		 * In that case we do not execute remove_keys() and instead
		 * postpone the garbage-collection work to the next cycle, in
		 * order to free CPU resources for other tasks.
		 */
		if (!need_resched() && !someone_adding(cache) &&
		    time_after(jiffies, cache->last_add + 300 * HZ)) {
			remove_keys(dev, i, 1);
			if (ent->cur > ent->limit)
				queue_work(cache->wq, &ent->work);
			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
static void delayed_cache_work_func(struct work_struct *work)
{
	struct mlx5_cache_ent *ent;

	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
	__cache_work_func(ent);
}

static void cache_work_func(struct work_struct *work)
{
	struct mlx5_cache_ent *ent;

	ent = container_of(work, struct mlx5_cache_ent, work);
	__cache_work_func(ent);
}
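/*
 * alloc_cached_mr() tries to take a pre-created MR from the cache, starting
 * at the entry matching the requested order and falling back to larger
 * orders; every visited entry that drops below its limit (and every miss)
 * kicks the background fill work.
 */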
static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_ib_mr *mr = NULL;
	struct mlx5_cache_ent *ent;

	c = order2idx(dev, order);
	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
		mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);

	for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) {
		ent = &cache->ent[i];

		mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);

		spin_lock_irq(&ent->lock);
		if (!list_empty(&ent->head)) {
			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
			spin_unlock_irq(&ent->lock);
			if (ent->cur < ent->limit)
				queue_work(cache->wq, &ent->work);
		spin_unlock_irq(&ent->lock);
		queue_work(cache->wq, &ent->work);

	cache->ent[c].miss++;
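/*
 * free_cached_mr() returns an MR to its cache entry's free list and, when
 * the entry has grown past 2 * limit, schedules the cache work to shrink it
 * back down.
 */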
static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;

	c = order2idx(dev, mr->order);
	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
		mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c);

	ent = &cache->ent[c];
	spin_lock_irq(&ent->lock);
	list_add_tail(&mr->list, &ent->head);
	if (ent->cur > 2 * ent->limit)
	spin_unlock_irq(&ent->lock);

	queue_work(cache->wq, &ent->work);

static void clean_keys(struct mlx5_ib_dev *dev, int c)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[c];
	struct mlx5_ib_mr *mr;

	cancel_delayed_work(&ent->dwork);
	spin_lock_irq(&ent->lock);
	if (list_empty(&ent->head)) {
		spin_unlock_irq(&ent->lock);
	mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
	spin_unlock_irq(&ent->lock);
	err = destroy_mkey(dev, mr);
	mlx5_ib_warn(dev, "failed destroy mkey\n");
static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;

	if (!mlx5_debugfs_root)

	cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);

	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
		ent = &cache->ent[i];
		sprintf(ent->name, "%d", ent->order);
		ent->dir = debugfs_create_dir(ent->name, cache->root);

		ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent,
		ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent,
		ent->fcur = debugfs_create_u32("cur", 0400, ent->dir,
		ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir,
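/*
 * Each cache entry thus gets a debugfs directory named after its order, with
 * "size", "limit", "cur" and "miss" files. Example (assuming mlx5_core puts
 * priv.dbg_root under /sys/kernel/debug/mlx5/<pci-device>):
 *
 *   cat .../mr_cache/8/cur           # MRs currently cached for this order
 *   echo 64 > .../mr_cache/8/size    # grow or shrink the entry to 64 MRs
 *   echo 16 > .../mr_cache/8/limit   # set the refill low-water mark
 */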
static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
{
	if (!mlx5_debugfs_root)

	debugfs_remove_recursive(dev->cache.root);

static void delay_time_func(unsigned long ctx)
{
	struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx;

	dev->fill_delay = 0;
}
int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;

	cache->wq = create_singlethread_workqueue("mkey_cache");
		mlx5_ib_warn(dev, "failed to create work queue\n");

	setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev);
	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
		INIT_LIST_HEAD(&cache->ent[i].head);
		spin_lock_init(&cache->ent[i].lock);

		ent = &cache->ent[i];
		INIT_LIST_HEAD(&ent->head);
		spin_lock_init(&ent->lock);

		if (dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE)
			limit = dev->mdev->profile->mr_cache[i].limit;

		INIT_WORK(&ent->work, cache_work_func);
		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
		queue_work(cache->wq, &ent->work);

	err = mlx5_mr_cache_debugfs_init(dev);
		mlx5_ib_warn(dev, "cache debugfs failure\n");

int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
{
	dev->cache.stopped = 1;
	flush_workqueue(dev->cache.wq);

	mlx5_mr_cache_debugfs_cleanup(dev);

	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)

	destroy_workqueue(dev->cache.wq);
	del_timer_sync(&dev->delay_timer);
struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_core_dev *mdev = dev->mdev;
	struct mlx5_create_mkey_mbox_in *in;
	struct mlx5_mkey_seg *seg;
	struct mlx5_ib_mr *mr;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
		return ERR_PTR(-ENOMEM);

	in = kzalloc(sizeof(*in), GFP_KERNEL);

	seg->flags = convert_access(acc) | MLX5_ACCESS_MODE_PA;
	seg->flags_pd = cpu_to_be32(to_mpd(pd)->pdn | MLX5_MKEY_LEN64);
	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);

	err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL,

	mr->ibmr.lkey = mr->mmr.key;
	mr->ibmr.rkey = mr->mmr.key;
static int get_octo_len(u64 addr, u64 len, int page_size)
{
	u64 offset;
	int npages;

	offset = addr & (page_size - 1);
	npages = ALIGN(len + offset, page_size) >> ilog2(page_size);
	return (npages + 1) / 2;
}
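/*
 * get_octo_len() returns the translation-table size in 16-byte "octowords":
 * two 8-byte page-table entries fit per octoword, hence the (npages + 1) / 2
 * round-up. For example, a 1 MiB region starting on a 4 KiB boundary needs
 * 256 page entries, i.e. 128 octowords; the same region starting 2 KiB into
 * a page spans 257 pages and therefore 129 octowords.
 */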
static int use_umr(int order)
{
	return order <= MLX5_MAX_UMR_SHIFT;
}
static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
			  int npages, int page_shift, int *size,
			  __be64 **mr_pas, dma_addr_t *dma)
{
	struct device *ddev = dev->ib_dev.dma_device;

	/*
	 * UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes.
	 * To avoid copying garbage after the pas array, we allocate
	 * a little more.
	 */
	*size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT);
	*mr_pas = kmalloc(*size + MLX5_UMR_ALIGN - 1, GFP_KERNEL);

	pas = PTR_ALIGN(*mr_pas, MLX5_UMR_ALIGN);
	mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT);
	/* Clear padding after the actual pages. */
	memset(pas + npages, 0, *size - npages * sizeof(u64));

	*dma = dma_map_single(ddev, pas, *size, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, *dma)) {
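/*
 * The prep_umr_*_wqe() helpers below fill in a mlx5_umr_wr work request:
 * prep_umr_wqe_common() sets the scatter entry for the DMA-mapped page list,
 * the UMR opcode and the page shift; prep_umr_reg_wqe() adds the target
 * virtual address, length, access flags and PD of the registration, while
 * prep_umr_unreg_wqe() only carries the mkey and the unregister flags.
 */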
static void prep_umr_wqe_common(struct ib_pd *pd, struct ib_send_wr *wr,
				struct ib_sge *sg, u64 dma, int n, u32 key,
				int page_shift)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_umr_wr *umrwr = umr_wr(wr);

	sg->length = ALIGN(sizeof(u64) * n, 64);
	sg->lkey = dev->umrc.pd->local_dma_lkey;

	wr->opcode = MLX5_IB_WR_UMR;

	umrwr->page_shift = page_shift;

static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
			     struct ib_sge *sg, u64 dma, int n, u32 key,
			     int page_shift, u64 virt_addr, u64 len,
			     int access_flags)
{
	struct mlx5_umr_wr *umrwr = umr_wr(wr);

	prep_umr_wqe_common(pd, wr, sg, dma, n, key, page_shift);

	umrwr->target.virt_addr = virt_addr;
	umrwr->access_flags = access_flags;

static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev,
			       struct ib_send_wr *wr, u32 key)
{
	struct mlx5_umr_wr *umrwr = umr_wr(wr);

	wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE;
	wr->opcode = MLX5_IB_WR_UMR;
static struct ib_umem *mr_umem_get(struct ib_pd *pd, u64 start, u64 length,
				   int access_flags, int *npages,
				   int *page_shift, int *ncont, int *order)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem *umem = ib_umem_get(pd->uobject->context, start, length,
		mlx5_ib_err(dev, "umem get failed (%ld)\n", PTR_ERR(umem));

	mlx5_ib_cont_pages(umem, start, npages, page_shift, ncont, order);
		mlx5_ib_warn(dev, "avoid zero region\n");
		ib_umem_release(umem);
		return ERR_PTR(-EINVAL);

	mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
		    *npages, *ncont, *order, *page_shift);
void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context)
{
	struct mlx5_ib_umr_context *context;

	err = ib_poll_cq(cq, 1, &wc);
		pr_warn("poll cq error %d\n", err);

	context = (struct mlx5_ib_umr_context *) (unsigned long) wc.wr_id;
	context->status = wc.status;
	complete(&context->done);

	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
}
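/*
 * reg_umr() registers a user MR through the UMR path: take a pre-created
 * mkey from the MR cache (refilling the entry with add_keys() if it is
 * empty), DMA-map the page list with dma_map_mr_pas(), post a UMR work
 * request on the dedicated UMR QP and wait for mlx5_umr_cq_handler() to
 * complete the umr_context.
 */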
static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
				  u64 virt_addr, u64 len, int npages,
				  int page_shift, int order, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct device *ddev = dev->ib_dev.dma_device;
	struct umr_common *umrc = &dev->umrc;
	struct mlx5_ib_umr_context umr_context;
	struct mlx5_umr_wr umrwr;
	struct ib_send_wr *bad;
	struct mlx5_ib_mr *mr;

	for (i = 0; i < 1; i++) {
		mr = alloc_cached_mr(dev, order);

		err = add_keys(dev, order2idx(dev, order), 1);
		if (err && err != -EAGAIN) {
			mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);

		return ERR_PTR(-EAGAIN);

	err = dma_map_mr_pas(dev, umem, npages, page_shift, &size, &mr_pas,

	memset(&umrwr, 0, sizeof(umrwr));
	umrwr.wr.wr_id = (u64)(unsigned long)&umr_context;
	prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmr.key,
			 page_shift, virt_addr, len, access_flags);

	mlx5_ib_init_umr_context(&umr_context);

	err = ib_post_send(umrc->qp, &umrwr.wr, &bad);
		mlx5_ib_warn(dev, "post send failed, err %d\n", err);

	wait_for_completion(&umr_context.done);
	if (umr_context.status != IB_WC_SUCCESS) {
		mlx5_ib_warn(dev, "reg umr failed\n");

	mr->mmr.iova = virt_addr;
	mr->mmr.pd = to_mpd(pd)->pdn;

	dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);

	free_cached_mr(dev, mr);
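/*
 * mlx5_ib_update_mtt() rewrites a window of MTT entries of an ODP-backed MR
 * via UMR, processing at most one page worth of entries per iteration. The
 * scratch buffer normally comes from get_zeroed_page(GFP_ATOMIC); if that
 * fails (we may be called from an invalidation context), it falls back to
 * the statically allocated emergency buffer, serialized by its mutex.
 */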
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
	struct mlx5_ib_dev *dev = mr->dev;
	struct device *ddev = dev->ib_dev.dma_device;
	struct umr_common *umrc = &dev->umrc;
	struct mlx5_ib_umr_context umr_context;
	struct ib_umem *umem = mr->umem;
	struct ib_send_wr *bad;
	struct mlx5_umr_wr wr;
	const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64);
	const int page_index_mask = page_index_alignment - 1;
	size_t pages_mapped = 0;
	size_t pages_to_map = 0;
	size_t pages_iter = 0;
	int use_emergency_buf = 0;

	/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
	 * so we need to align the offset and length accordingly */
	if (start_page_index & page_index_mask) {
		npages += start_page_index & page_index_mask;
		start_page_index &= ~page_index_mask;
	}

	pages_to_map = ALIGN(npages, page_index_alignment);

	if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES)

	size = sizeof(u64) * pages_to_map;
	size = min_t(int, PAGE_SIZE, size);
	/* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim
	 * code, when we are called from an invalidation. The pas buffer must
	 * be 2k-aligned for Connect-IB. */
	pas = (__be64 *)get_zeroed_page(GFP_ATOMIC);
		mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n");
		pas = mlx5_ib_update_mtt_emergency_buffer;
		size = MLX5_UMR_MTT_MIN_CHUNK_SIZE;
		use_emergency_buf = 1;
		mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
		memset(pas, 0, size);

	pages_iter = size / sizeof(u64);
	dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, dma)) {
		mlx5_ib_err(dev, "unable to map DMA during MTT update.\n");

	for (pages_mapped = 0;
	     pages_mapped < pages_to_map && !err;
	     pages_mapped += pages_iter, start_page_index += pages_iter) {
		dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);

		npages = min_t(size_t,
			       ib_umem_num_pages(umem) - start_page_index);

		__mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT,
				       start_page_index, npages, pas,
				       MLX5_IB_MTT_PRESENT);
		/* Clear padding after the pages brought from the
		 * umem. */
		memset(pas + npages, 0, size - npages * sizeof(u64));

		dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);

		memset(&wr, 0, sizeof(wr));
		wr.wr.wr_id = (u64)(unsigned long)&umr_context;

		sg.length = ALIGN(npages * sizeof(u64),
				  MLX5_UMR_MTT_ALIGNMENT);
		sg.lkey = dev->umrc.pd->local_dma_lkey;

		wr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
				   MLX5_IB_SEND_UMR_UPDATE_MTT;
		wr.wr.sg_list = &sg;
		wr.wr.opcode = MLX5_IB_WR_UMR;
		wr.npages = sg.length / sizeof(u64);
		wr.page_shift = PAGE_SHIFT;
		wr.mkey = mr->mmr.key;
		wr.target.offset = start_page_index;

		mlx5_ib_init_umr_context(&umr_context);

		err = ib_post_send(umrc->qp, &wr.wr, &bad);
			mlx5_ib_err(dev, "UMR post send failed, err %d\n", err);

		wait_for_completion(&umr_context.done);
		if (umr_context.status != IB_WC_SUCCESS) {
			mlx5_ib_err(dev, "UMR completion failed, code %d\n",
				    umr_context.status);

	dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);

	if (!use_emergency_buf)
		free_page((unsigned long)pas);
	else
		mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
/*
 * If ibmr is NULL it will be allocated by reg_create.
 * Else, the given ibmr will be used.
 */
static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
				     u64 virt_addr, u64 length,
				     struct ib_umem *umem, int npages,
				     int page_shift, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_create_mkey_mbox_in *in;
	struct mlx5_ib_mr *mr;
	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));

	mr = ibmr ? to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL);
		return ERR_PTR(-ENOMEM);

	inlen = sizeof(*in) + sizeof(*in->pas) * ((npages + 1) / 2) * 2;
	in = mlx5_vzalloc(inlen);

	mlx5_ib_populate_pas(dev, umem, page_shift, in->pas,
			     pg_cap ? MLX5_IB_MTT_PRESENT : 0);

	/* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags
	 * in the page list submitted with the command. */
	in->flags = pg_cap ? cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0;
	in->seg.flags = convert_access(access_flags) |
		MLX5_ACCESS_MODE_MTT;
	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
	in->seg.start_addr = cpu_to_be64(virt_addr);
	in->seg.len = cpu_to_be64(length);
	in->seg.bsfs_octo_size = 0;
	in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift));
	in->seg.log2_page_size = page_shift;
	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
	in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length,

	err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, inlen, NULL,
		mlx5_ib_warn(dev, "create mkey failed\n");

	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key);

	return ERR_PTR(err);
static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
			  int npages, u64 length, int access_flags)
{
	mr->npages = npages;
	atomic_add(npages, &dev->mdev->priv.reg_pages);
	mr->ibmr.lkey = mr->mmr.key;
	mr->ibmr.rkey = mr->mmr.key;
	mr->ibmr.length = length;
}
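/*
 * mlx5_ib_reg_user_mr() pins the user buffer with mr_umem_get() and then
 * picks a registration path: orders small enough for use_umr() go through
 * reg_umr() and the MR cache (with reg_create() as the fallback when the
 * cache is empty), everything else goes straight to reg_create(); ODP
 * regions are only supported on the UMR path.
 */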
struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 virt_addr, int access_flags,
				  struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	struct ib_umem *umem;

	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
		    start, virt_addr, length, access_flags);
	umem = mr_umem_get(pd, start, length, access_flags, &npages,
			   &page_shift, &ncont, &order);
		return (void *)umem;

	if (use_umr(order)) {
		mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift,
			     order, access_flags);
		if (PTR_ERR(mr) == -EAGAIN) {
			mlx5_ib_dbg(dev, "cache empty for order %d", order);
	} else if (access_flags & IB_ACCESS_ON_DEMAND) {
		pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB");

	mr = reg_create(NULL, pd, virt_addr, length, umem, ncont,
			page_shift, access_flags);

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmr.key);

	set_mr_fields(dev, mr, npages, length, access_flags);

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING

	ib_umem_release(umem);
	return ERR_PTR(err);
static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	struct umr_common *umrc = &dev->umrc;
	struct mlx5_ib_umr_context umr_context;
	struct mlx5_umr_wr umrwr;
	struct ib_send_wr *bad;

	memset(&umrwr.wr, 0, sizeof(umrwr));
	umrwr.wr.wr_id = (u64)(unsigned long)&umr_context;
	prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmr.key);

	mlx5_ib_init_umr_context(&umr_context);

	err = ib_post_send(umrc->qp, &umrwr.wr, &bad);
		mlx5_ib_dbg(dev, "err %d\n", err);

	wait_for_completion(&umr_context.done);

	if (umr_context.status != IB_WC_SUCCESS) {
		mlx5_ib_warn(dev, "unreg umr failed\n");
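/*
 * mlx5_alloc_priv_descs() over-allocates the descriptor array so that the
 * descriptors handed to the device can be aligned up to MLX5_UMR_ALIGN
 * (2048 bytes), then DMA-maps the aligned region; mlx5_free_priv_descs()
 * undoes both steps.
 */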
mlx5_alloc_priv_descs(struct ib_device *device,
		      struct mlx5_ib_mr *mr,
	int size = ndescs * desc_size;

	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);

	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
	if (!mr->descs_alloc)

	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);

	mr->desc_map = dma_map_single(device->dma_device, mr->descs,
				      size, DMA_TO_DEVICE);
	if (dma_mapping_error(device->dma_device, mr->desc_map)) {
		kfree(mr->descs_alloc);

mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
	struct ib_device *device = mr->ibmr.device;
	int size = mr->max_descs * mr->desc_size;

	dma_unmap_single(device->dma_device, mr->desc_map,
			 size, DMA_TO_DEVICE);
	kfree(mr->descs_alloc);
static int clean_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	int umred = mr->umred;

	if (mlx5_core_destroy_psv(dev->mdev,
				  mr->sig->psv_memory.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
			     mr->sig->psv_memory.psv_idx);
	if (mlx5_core_destroy_psv(dev->mdev,
				  mr->sig->psv_wire.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
			     mr->sig->psv_wire.psv_idx);

	mlx5_free_priv_descs(mr);

	err = destroy_mkey(dev, mr);
		mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",

	err = unreg_umr(dev, mr);
		mlx5_ib_warn(dev, "failed unregister\n");

	free_cached_mr(dev, mr);
int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
{
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	int npages = mr->npages;
	struct ib_umem *umem = mr->umem;

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
	if (umem && umem->odp_data) {
		/* Prevent new page faults from succeeding */
		mr->live = 0;
		/* Wait for all running page-fault handlers to finish. */
		synchronize_srcu(&dev->mr_srcu);
		/* Destroy all page mappings */
		mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
		/*
		 * We kill the umem before the MR for ODP,
		 * so that there will not be any invalidations in
		 * flight, looking at the *mr struct.
		 */
		ib_umem_release(umem);
		atomic_sub(npages, &dev->mdev->priv.reg_pages);

		/* Avoid double-freeing the umem. */
		umem = NULL;
	}
#endif

	if (umem) {
		ib_umem_release(umem);
		atomic_sub(npages, &dev->mdev->priv.reg_pages);
	}
struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
			       enum ib_mr_type mr_type,
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_create_mkey_mbox_in *in;
	struct mlx5_ib_mr *mr;
	int access_mode, err;
	int ndescs = roundup(max_num_sg, 4);

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
		return ERR_PTR(-ENOMEM);

	in = kzalloc(sizeof(*in), GFP_KERNEL);

	in->seg.status = MLX5_MKEY_STATUS_FREE;
	in->seg.xlt_oct_size = cpu_to_be32(ndescs);
	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);

	if (mr_type == IB_MR_TYPE_MEM_REG) {
		access_mode = MLX5_ACCESS_MODE_MTT;
		in->seg.log2_page_size = PAGE_SHIFT;

		err = mlx5_alloc_priv_descs(pd->device, mr,
					    ndescs, sizeof(u64));

		mr->desc_size = sizeof(u64);
		mr->max_descs = ndescs;
	} else if (mr_type == IB_MR_TYPE_SIGNATURE) {
		in->seg.flags_pd = cpu_to_be32(be32_to_cpu(in->seg.flags_pd) |
		in->seg.bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE);
		mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);

		/* create mem & wire PSVs */
		err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn,

		access_mode = MLX5_ACCESS_MODE_KLM;
		mr->sig->psv_memory.psv_idx = psv_index[0];
		mr->sig->psv_wire.psv_idx = psv_index[1];

		mr->sig->sig_status_checked = true;
		mr->sig->sig_err_exists = false;
		/* Next UMR, Arm SIGERR */
		++mr->sig->sigerr_count;
		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);

	in->seg.flags = MLX5_PERM_UMR_EN | access_mode;
	err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in),
		goto err_destroy_psv;

	mr->ibmr.lkey = mr->mmr.key;
	mr->ibmr.rkey = mr->mmr.key;

	if (mlx5_core_destroy_psv(dev->mdev,
				  mr->sig->psv_memory.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
			     mr->sig->psv_memory.psv_idx);
	if (mlx5_core_destroy_psv(dev->mdev,
				  mr->sig->psv_wire.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
			     mr->sig->psv_wire.psv_idx);

	mlx5_free_priv_descs(mr);

	return ERR_PTR(err);
int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			    struct ib_mr_status *mr_status)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);

	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
		pr_err("Invalid status check mask\n");

	mr_status->fail_status = 0;
	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
			pr_err("signature status check requested on a non-signature enabled MR\n");

		mmr->sig->sig_status_checked = true;
		if (!mmr->sig->sig_err_exists)

		if (ibmr->lkey == mmr->sig->err_item.key)
			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
			       sizeof(mr_status->sig_err));

			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
			mr_status->sig_err.sig_err_offset = 0;
			mr_status->sig_err.key = mmr->sig->err_item.key;

		mmr->sig->sig_err_exists = false;
		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
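/*
 * mlx5_set_page() is the ib_sg_to_pages() callback used by
 * mlx5_ib_map_mr_sg(): it appends each page address, with the local
 * read/write enable bits set, to the MR's private descriptor array.
 * mlx5_ib_map_mr_sg() syncs that DMA-mapped array for the CPU before the
 * conversion and back to the device afterwards.
 */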
static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);

	if (unlikely(mr->ndescs == mr->max_descs))

	descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

int mlx5_ib_map_mr_sg(struct ib_mr *ibmr,
		      struct scatterlist *sg,
	struct mlx5_ib_mr *mr = to_mmr(ibmr);

	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
				   mr->desc_size * mr->max_descs,

	n = ib_sg_to_pages(ibmr, sg, sg_nents, mlx5_set_page);

	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
				      mr->desc_size * mr->max_descs,