diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 49fc3ca..32a28bd 100644
@@ -37,6 +37,7 @@
 #include <linux/export.h>
 #include <linux/delay.h>
 #include <rdma/ib_umem.h>
+#include <rdma/ib_umem_odp.h>
 #include <rdma/ib_verbs.h>
 #include "mlx5_ib.h"
 
@@ -44,9 +45,27 @@ enum {
        MAX_PENDING_REG_MR = 8,
 };
 
-enum {
-       MLX5_UMR_ALIGN  = 2048
-};
+#define MLX5_UMR_ALIGN 2048
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
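+/*
+ * Fallback scratch buffer for mlx5_ib_update_mtt(): used when a page
+ * cannot be allocated with GFP_ATOMIC; access is serialized by the
+ * mutex below.
+ */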
+static __be64 mlx5_ib_update_mtt_emergency_buffer[
+               MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)]
+       __aligned(MLX5_UMR_ALIGN);
+static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
+#endif
+
+static int clean_mr(struct mlx5_ib_mr *mr);
+
+static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+{
+       int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       /* Wait until all page fault handlers using the mr complete. */
+       synchronize_srcu(&dev->mr_srcu);
+#endif
+
+       return err;
+}
 
 static int order2idx(struct mlx5_ib_dev *dev, int order)
 {
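
For context, the synchronize_srcu() in destroy_mkey() above only gives the intended guarantee if the ODP page-fault path holds dev->mr_srcu for as long as it may dereference the MR. A minimal sketch of that assumed read-side pattern (the helper below is illustrative, not part of this patch):

        /* Illustrative reader side: any code that may touch the MR must hold
         * the SRCU read lock, so destroy_mkey() cannot return while such
         * code is still running. */
        static void example_use_mr_under_srcu(struct mlx5_ib_dev *dev,
                                              struct mlx5_ib_mr *mr)
        {
                int idx = srcu_read_lock(&dev->mr_srcu);

                if (mr->live) {
                        /* ... resolve pages / look at mr state here ... */
                }
                srcu_read_unlock(&dev->mr_srcu, idx);
        }
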
@@ -185,7 +204,7 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
                ent->cur--;
                ent->size--;
                spin_unlock_irq(&ent->lock);
-               err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+               err = destroy_mkey(dev, mr);
                if (err)
                        mlx5_ib_warn(dev, "failed destroy mkey\n");
                else
@@ -476,7 +495,7 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c)
                ent->cur--;
                ent->size--;
                spin_unlock_irq(&ent->lock);
-               err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+               err = destroy_mkey(dev, mr);
                if (err)
                        mlx5_ib_warn(dev, "failed destroy mkey\n");
                else
@@ -806,6 +825,8 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
        mr->mmr.size = len;
        mr->mmr.pd = to_mpd(pd)->pdn;
 
+       mr->live = 1;
+
 unmap_dma:
        up(&umrc->sem);
        dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
@@ -822,6 +843,128 @@ free_mr:
        return mr;
 }
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
+                      int zap)
+{
+       struct mlx5_ib_dev *dev = mr->dev;
+       struct device *ddev = dev->ib_dev.dma_device;
+       struct umr_common *umrc = &dev->umrc;
+       struct mlx5_ib_umr_context umr_context;
+       struct ib_umem *umem = mr->umem;
+       int size;
+       __be64 *pas;
+       dma_addr_t dma;
+       struct ib_send_wr wr, *bad;
+       struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr.wr.fast_reg;
+       struct ib_sge sg;
+       int err = 0;
+       const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64);
+       const int page_index_mask = page_index_alignment - 1;
+       size_t pages_mapped = 0;
+       size_t pages_to_map = 0;
+       size_t pages_iter = 0;
+       int use_emergency_buf = 0;
+
+       /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
+        * so we need to align the offset and length accordingly */
+       if (start_page_index & page_index_mask) {
+               npages += start_page_index & page_index_mask;
+               start_page_index &= ~page_index_mask;
+       }
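+       /* e.g. with a 0x40-byte MLX5_UMR_MTT_ALIGNMENT and 8-byte MTT
+        * entries, page_index_alignment is 8, so a request starting at
+        * page 13 is widened to start at page 8 and npages grows by 5. */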
+
+       pages_to_map = ALIGN(npages, page_index_alignment);
+
+       if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES)
+               return -EINVAL;
+
+       size = sizeof(u64) * pages_to_map;
+       size = min_t(int, PAGE_SIZE, size);
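+       /* Cap the scratch buffer at one page; larger ranges are copied in
+        * chunks of pages_iter entries by the loop below. */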
+       /* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim
+        * code, when we are called from an invalidation. The pas buffer must
+        * be 2k-aligned for Connect-IB. */
+       pas = (__be64 *)get_zeroed_page(GFP_ATOMIC);
+       if (!pas) {
+               mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n");
+               pas = mlx5_ib_update_mtt_emergency_buffer;
+               size = MLX5_UMR_MTT_MIN_CHUNK_SIZE;
+               use_emergency_buf = 1;
+               mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
+               memset(pas, 0, size);
+       }
+       pages_iter = size / sizeof(u64);
+       dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
+       if (dma_mapping_error(ddev, dma)) {
+               mlx5_ib_err(dev, "unable to map DMA during MTT update.\n");
+               err = -ENOMEM;
+               goto free_pas;
+       }
+
+       for (pages_mapped = 0;
+            pages_mapped < pages_to_map && !err;
+            pages_mapped += pages_iter, start_page_index += pages_iter) {
+               dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
+
+               npages = min_t(size_t,
+                              pages_iter,
+                              ib_umem_num_pages(umem) - start_page_index);
+
+               if (!zap) {
+                       __mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT,
+                                              start_page_index, npages, pas,
+                                              MLX5_IB_MTT_PRESENT);
+                       /* Clear padding after the pages brought from the
+                        * umem. */
+                       memset(pas + npages, 0, size - npages * sizeof(u64));
+               }
+
+               dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
+
+               memset(&wr, 0, sizeof(wr));
+               wr.wr_id = (u64)(unsigned long)&umr_context;
+
+               sg.addr = dma;
+               sg.length = ALIGN(npages * sizeof(u64),
+                               MLX5_UMR_MTT_ALIGNMENT);
+               sg.lkey = dev->umrc.mr->lkey;
+
+               wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
+                               MLX5_IB_SEND_UMR_UPDATE_MTT;
+               wr.sg_list = &sg;
+               wr.num_sge = 1;
+               wr.opcode = MLX5_IB_WR_UMR;
+               umrwr->npages = sg.length / sizeof(u64);
+               umrwr->page_shift = PAGE_SHIFT;
+               umrwr->mkey = mr->mmr.key;
+               umrwr->target.offset = start_page_index;
+
+               mlx5_ib_init_umr_context(&umr_context);
+               down(&umrc->sem);
+               err = ib_post_send(umrc->qp, &wr, &bad);
+               if (err) {
+                       mlx5_ib_err(dev, "UMR post send failed, err %d\n", err);
+               } else {
+                       wait_for_completion(&umr_context.done);
+                       if (umr_context.status != IB_WC_SUCCESS) {
+                               mlx5_ib_err(dev, "UMR completion failed, code %d\n",
+                                           umr_context.status);
+                               err = -EFAULT;
+                       }
+               }
+               up(&umrc->sem);
+       }
+       dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
+
+free_pas:
+       if (!use_emergency_buf)
+               free_page((unsigned long)pas);
+       else
+               mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
+
+       return err;
+}
+#endif
+
 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
                                     u64 length, struct ib_umem *umem,
                                     int npages, int page_shift,
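
The mlx5_ib_update_mtt() helper added above is meant to be driven from the ODP code elsewhere in the driver: the page-fault path writes MTT entries for pages it has just made present (zap = 0), while the invalidation path clears them so the HCA faults again on the next access (zap = 1). A hedged sketch of that assumed call pattern (the wrappers are illustrative, not part of this patch):

        /* Illustrative callers only. */
        static int example_fill_range(struct mlx5_ib_mr *mr, u64 first_page,
                                      int npages)
        {
                /* Write MTT entries for pages that were just made present. */
                return mlx5_ib_update_mtt(mr, first_page, npages, 0);
        }

        static int example_zap_range(struct mlx5_ib_mr *mr, u64 first_page,
                                     int npages)
        {
                /* Zero the MTT entries so further access faults again. */
                return mlx5_ib_update_mtt(mr, first_page, npages, 1);
        }
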
@@ -869,6 +1012,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
                goto err_2;
        }
        mr->umem = umem;
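+       /* Mark the MR live: the ODP page-fault path only serves faults on
+        * live MRs (live is cleared again when the MR is torn down). */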
+       mr->live = 1;
        kvfree(in);
 
        mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key);
@@ -923,6 +1067,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                        mlx5_ib_dbg(dev, "cache empty for order %d", order);
                        mr = NULL;
                }
+       } else if (access_flags & IB_ACCESS_ON_DEMAND) {
+               err = -EINVAL;
+               pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB\n");
+               goto error;
        }
 
        if (!mr)
@@ -938,16 +1086,51 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
        mr->umem = umem;
        mr->npages = npages;
-       spin_lock(&dev->mr_lock);
-       dev->mdev->priv.reg_pages += npages;
-       spin_unlock(&dev->mr_lock);
+       atomic_add(npages, &dev->mdev->priv.reg_pages);
        mr->ibmr.lkey = mr->mmr.key;
        mr->ibmr.rkey = mr->mmr.key;
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       if (umem->odp_data) {
+               /*
+                * This barrier prevents the compiler from moving the
+                * setting of umem->odp_data->private to point to our
+                * MR before reg_umr has finished, to ensure that the
+                * MR initialization has finished before we start to
+                * handle invalidations.
+                */
+               smp_wmb();
+               mr->umem->odp_data->private = mr;
+               /*
+                * Make sure we will see the new
+                * umem->odp_data->private value in the invalidation
+                * routines, before we can get page faults on the
+                * MR. Page faults can happen once we put the MR in
+                * a page fault could be handled and an invalidation
+                * could run before umem->odp_data->private == mr is
+                * visible to the invalidation handler.
+                * the invalidation handler.
+                */
+               smp_wmb();
+       }
+#endif
+
        return &mr->ibmr;
 
 error:
+       /*
+        * Destroy the umem *before* destroying the MR, to ensure we
+        * will not have any in-flight notifiers when destroying the
+        * MR.
+        *
+        * As the MR is completely invalid to begin with, and this
+        * error path is only taken if we can't push the mr entry into
+        * the pagefault tree, this is safe.
+        */
+
        ib_umem_release(umem);
+       /* Kill the MR, and return an error code. */
+       clean_mr(mr);
        return ERR_PTR(err);
 }
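
The smp_wmb() calls above publish a fully initialized MR to the ODP code that later reads umem->odp_data->private. A rough sketch of what that assumed reader looks like (names are illustrative; the real invalidation handler lives in the driver's ODP support code):

        /* Illustrative reader side of the publication above. */
        static void example_invalidate(struct ib_umem *umem, unsigned long start,
                                       unsigned long end)
        {
                struct mlx5_ib_mr *mr = umem->odp_data->private;

                /* Not yet published, or already being torn down: nothing to do. */
                if (!mr || !mr->live)
                        return;

                /* ... zap the MTT entries covering [start, end) ... */
        }
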
 
@@ -984,17 +1167,14 @@ error:
        return err;
 }
 
-int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
+static int clean_mr(struct mlx5_ib_mr *mr)
 {
-       struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
-       struct mlx5_ib_mr *mr = to_mmr(ibmr);
-       struct ib_umem *umem = mr->umem;
-       int npages = mr->npages;
+       struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
        int umred = mr->umred;
        int err;
 
        if (!umred) {
-               err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+               err = destroy_mkey(dev, mr);
                if (err) {
                        mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
                                     mr->mmr.key, err);
@@ -1009,15 +1189,47 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
                free_cached_mr(dev, mr);
        }
 
-       if (umem) {
+       if (!umred)
+               kfree(mr);
+
+       return 0;
+}
+
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
+{
+       struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
+       struct mlx5_ib_mr *mr = to_mmr(ibmr);
+       int npages = mr->npages;
+       struct ib_umem *umem = mr->umem;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       if (umem && umem->odp_data) {
+               /* Prevent new page faults from succeeding */
+               mr->live = 0;
+               /* Wait for all running page-fault handlers to finish. */
+               synchronize_srcu(&dev->mr_srcu);
+               /* Destroy all page mappings */
+               mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
+                                        ib_umem_end(umem));
+               /*
+                * For ODP we kill the umem before the MR, so that no
+                * invalidation can still be in flight looking at the
+                * *mr struct.
+                */
                ib_umem_release(umem);
-               spin_lock(&dev->mr_lock);
-               dev->mdev->priv.reg_pages -= npages;
-               spin_unlock(&dev->mr_lock);
+               atomic_sub(npages, &dev->mdev->priv.reg_pages);
+
+               /* Avoid double-freeing the umem. */
+               umem = NULL;
        }
+#endif
 
-       if (!umred)
-               kfree(mr);
+       clean_mr(mr);
+
+       if (umem) {
+               ib_umem_release(umem);
+               atomic_sub(npages, &dev->mdev->priv.reg_pages);
+       }
 
        return 0;
 }
@@ -1126,7 +1338,7 @@ int mlx5_ib_destroy_mr(struct ib_mr *ibmr)
                kfree(mr->sig);
        }
 
-       err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+       err = destroy_mkey(dev, mr);
        if (err) {
                mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
                             mr->mmr.key, err);