powerpc/iommu/powernv: Release replaced TCE
[cascardo/linux.git] drivers/vfio/vfio_iommu_spapr_tce.c
/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <asm/iommu.h>
#include <asm/tce.h>

#define DRIVER_VERSION  "0.1"
#define DRIVER_AUTHOR   "aik@ozlabs.ru"
#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
                struct iommu_group *iommu_group);

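/*
 * Charge @npages against the current task's RLIMIT_MEMLOCK. Fails with
 * -ENOMEM if the limit would be exceeded and the task does not have
 * CAP_IPC_LOCK. Paired with decrement_locked_vm() below.
 */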
static long try_increment_locked_vm(long npages)
{
        long ret = 0, locked, lock_limit;

        if (!current || !current->mm)
                return -ESRCH; /* process exited */

        if (!npages)
                return 0;

        down_write(&current->mm->mmap_sem);
        locked = current->mm->locked_vm + npages;
        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
        if (locked > lock_limit && !capable(CAP_IPC_LOCK))
                ret = -ENOMEM;
        else
                current->mm->locked_vm += npages;

        pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
                        npages << PAGE_SHIFT,
                        current->mm->locked_vm << PAGE_SHIFT,
                        rlimit(RLIMIT_MEMLOCK),
                        ret ? " - exceeded" : "");

        up_write(&current->mm->mmap_sem);

        return ret;
}

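/* Undo a charge made by try_increment_locked_vm(); clamps at zero. */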
static void decrement_locked_vm(long npages)
{
        if (!current || !current->mm || !npages)
                return; /* process exited */

        down_write(&current->mm->mmap_sem);
        if (WARN_ON_ONCE(npages > current->mm->locked_vm))
                npages = current->mm->locked_vm;
        current->mm->locked_vm -= npages;
        pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
                        npages << PAGE_SHIFT,
                        current->mm->locked_vm << PAGE_SHIFT,
                        rlimit(RLIMIT_MEMLOCK));
        up_write(&current->mm->mmap_sem);
}

/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

/*
 * The container descriptor supports only a single group per container.
 * This is required by the API, as the container is not supplied with
 * an IOMMU group at initialization time.
 */
struct tce_container {
        struct mutex lock;
        struct iommu_group *grp;
        bool enabled;
        unsigned long locked_pages;
};

static bool tce_page_is_contained(struct page *page, unsigned page_shift)
{
        /*
         * Check that the TCE table granularity is not bigger than the size of
         * a page we just found. Otherwise the hardware can get access to
         * a bigger memory chunk than it should.
         */
        return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
}

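/*
 * Find the IOMMU table whose DMA window contains @ioba. Returns the
 * table's index within the group (and the table itself via @ptbl),
 * or -1 if no window covers the address.
 */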
static long tce_iommu_find_table(struct tce_container *container,
                phys_addr_t ioba, struct iommu_table **ptbl)
{
        long i;
        struct iommu_table_group *table_group;

        table_group = iommu_group_get_iommudata(container->grp);
        if (!table_group)
                return -1;

        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                struct iommu_table *tbl = table_group->tables[i];

                if (tbl) {
                        unsigned long entry = ioba >> tbl->it_page_shift;
                        unsigned long start = tbl->it_offset;
                        unsigned long end = start + tbl->it_size;

                        if ((start <= entry) && (entry < end)) {
                                *ptbl = tbl;
                                return i;
                        }
                }
        }

        return -1;
}

static int tce_iommu_enable(struct tce_container *container)
{
        int ret = 0;
        unsigned long locked;
        struct iommu_table *tbl;
        struct iommu_table_group *table_group;

        if (!container->grp)
                return -ENXIO;

        if (!current->mm)
                return -ESRCH; /* process exited */

        if (container->enabled)
                return -EBUSY;

        /*
         * When userspace pages are mapped into the IOMMU, they are effectively
         * locked memory, so, theoretically, we need to update the accounting
         * of locked pages on each map and unmap. For powerpc, the map/unmap
         * paths can be very hot, though, and the accounting would kill
         * performance, especially since it would be difficult or impossible
         * to handle the accounting in real mode.
         *
         * To address that, rather than precisely accounting every page, we
         * instead account for a worst case on locked memory when the iommu is
         * enabled and disabled. The worst-case upper bound on locked memory
         * is the size of the whole iommu window, which is usually relatively
         * small (compared to total memory sizes) on POWER hardware.
         *
         * Also, we don't have a nice way to fail an H_PUT_TCE on ulimit
         * grounds: that would effectively kill the guest at random points,
         * so it is much better to enforce the limit based on the maximum
         * that the guest can map.
         *
         * Unfortunately at the moment it counts whole tables, no matter how
         * much memory the guest has. I.e. for a 4GB guest and 4 IOMMU groups,
         * each with a 2GB DMA window, 8GB will be counted here. The reason
         * for this is that we cannot tell here the amount of RAM used by the
         * guest as this information is only available from KVM and VFIO is
         * KVM agnostic.
         */
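        /*
         * Worked example (illustrative numbers, not taken from this code):
         * a 2GB DMA window of 4K IOMMU pages on a 64K-page kernel gives
         * it_size = 512K entries, so (it_size << 12) >> 16 = 32768 system
         * pages charged once here instead of on every TCE update.
         */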
        table_group = iommu_group_get_iommudata(container->grp);
        if (!table_group)
                return -ENODEV;

        tbl = table_group->tables[0];
        locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
        ret = try_increment_locked_vm(locked);
        if (ret)
                return ret;

        container->locked_pages = locked;

        container->enabled = true;

        return ret;
}

static void tce_iommu_disable(struct tce_container *container)
{
        if (!container->enabled)
                return;

        container->enabled = false;

        if (!current->mm)
                return;

        decrement_locked_vm(container->locked_pages);
}

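/*
 * VFIO driver "open" callback: allocates the per-container state.
 * The IOMMU group is attached later via tce_iommu_attach_group().
 */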
static void *tce_iommu_open(unsigned long arg)
{
        struct tce_container *container;

        if (arg != VFIO_SPAPR_TCE_IOMMU) {
                pr_err("tce_vfio: Wrong IOMMU type\n");
                return ERR_PTR(-EINVAL);
        }

        container = kzalloc(sizeof(*container), GFP_KERNEL);
        if (!container)
                return ERR_PTR(-ENOMEM);

        mutex_init(&container->lock);

        return container;
}

static void tce_iommu_release(void *iommu_data)
{
        struct tce_container *container = iommu_data;

        WARN_ON(container->grp);

        if (container->grp)
                tce_iommu_detach_group(iommu_data, container->grp);

        tce_iommu_disable(container);
        mutex_destroy(&container->lock);

        kfree(container);
}

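/* Drop the page reference taken by tce_iommu_use_page(). */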
static void tce_iommu_unuse_page(struct tce_container *container,
                unsigned long hpa)
{
        struct page *page;

        page = pfn_to_page(hpa >> PAGE_SHIFT);
        put_page(page);
}

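/*
 * Clear @pages TCE entries starting at @entry: each entry is exchanged
 * with an empty (DMA_NONE) one, and any page that was previously mapped
 * there is released.
 */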
static int tce_iommu_clear(struct tce_container *container,
                struct iommu_table *tbl,
                unsigned long entry, unsigned long pages)
{
        unsigned long oldhpa;
        long ret;
        enum dma_data_direction direction;

        for ( ; pages; --pages, ++entry) {
                direction = DMA_NONE;
                oldhpa = 0;
                ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
                if (ret)
                        continue;

                if (direction == DMA_NONE)
                        continue;

                tce_iommu_unuse_page(container, oldhpa);
        }

        return 0;
}

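/*
 * Pin the user page backing @tce and return its host physical address
 * in @hpa. The reference is dropped later by tce_iommu_unuse_page().
 */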
static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
        struct page *page = NULL;
        enum dma_data_direction direction = iommu_tce_direction(tce);

        if (get_user_pages_fast(tce & PAGE_MASK, 1,
                        direction != DMA_TO_DEVICE, &page) != 1)
                return -EFAULT;

        *hpa = __pa((unsigned long) page_address(page));

        return 0;
}

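/*
 * Pin and map @pages TCEs starting at @entry. Each iommu_tce_xchg()
 * returns the previous entry's contents; if that entry was valid, the
 * replaced page is released here (the "Release replaced TCE" behaviour
 * from the patch subject). On failure, entries built so far are cleared.
 */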
static long tce_iommu_build(struct tce_container *container,
                struct iommu_table *tbl,
                unsigned long entry, unsigned long tce, unsigned long pages,
                enum dma_data_direction direction)
{
        long i, ret = 0;
        struct page *page;
        unsigned long hpa;
        enum dma_data_direction dirtmp;

        for (i = 0; i < pages; ++i) {
                unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

                ret = tce_iommu_use_page(tce, &hpa);
                if (ret)
                        break;

                page = pfn_to_page(hpa >> PAGE_SHIFT);
                if (!tce_page_is_contained(page, tbl->it_page_shift)) {
                        ret = -EPERM;
                        break;
                }

                hpa |= offset;
                dirtmp = direction;
                ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
                if (ret) {
                        tce_iommu_unuse_page(container, hpa);
                        pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
                                        __func__, entry << tbl->it_page_shift,
                                        tce, ret);
                        break;
                }

                if (dirtmp != DMA_NONE)
                        tce_iommu_unuse_page(container, hpa);

                tce += IOMMU_PAGE_SIZE(tbl);
        }

        if (ret)
                tce_iommu_clear(container, tbl, entry, i);

        return ret;
}

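/*
 * ioctl handler for the SPAPR TCE backend. A minimal userspace sequence
 * driving it might look like the sketch below (a hypothetical example,
 * error handling and argument setup omitted; "group_fd" is an
 * already-opened /dev/vfio/<group> descriptor):
 *
 *      int fd = open("/dev/vfio/vfio", O_RDWR);
 *      ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &fd);
 *      ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
 *      ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *      ioctl(fd, VFIO_IOMMU_ENABLE);
 *      ioctl(fd, VFIO_IOMMU_MAP_DMA, &map);
 *      ioctl(fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
 */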
static long tce_iommu_ioctl(void *iommu_data,
                                 unsigned int cmd, unsigned long arg)
{
        struct tce_container *container = iommu_data;
        unsigned long minsz;
        long ret;

        switch (cmd) {
        case VFIO_CHECK_EXTENSION:
                switch (arg) {
                case VFIO_SPAPR_TCE_IOMMU:
                        ret = 1;
                        break;
                default:
                        ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
                        break;
                }

                return (ret < 0) ? 0 : ret;

        case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
                struct vfio_iommu_spapr_tce_info info;
                struct iommu_table *tbl;
                struct iommu_table_group *table_group;

                if (WARN_ON(!container->grp))
                        return -ENXIO;

                table_group = iommu_group_get_iommudata(container->grp);

                tbl = table_group->tables[0];
                if (WARN_ON_ONCE(!tbl))
                        return -ENXIO;

                minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
                                dma32_window_size);

                if (copy_from_user(&info, (void __user *)arg, minsz))
                        return -EFAULT;

                if (info.argsz < minsz)
                        return -EINVAL;

                info.dma32_window_start = tbl->it_offset << tbl->it_page_shift;
                info.dma32_window_size = tbl->it_size << tbl->it_page_shift;
                info.flags = 0;

                if (copy_to_user((void __user *)arg, &info, minsz))
                        return -EFAULT;

                return 0;
        }
        case VFIO_IOMMU_MAP_DMA: {
                struct vfio_iommu_type1_dma_map param;
                struct iommu_table *tbl = NULL;
                long num;
                enum dma_data_direction direction;

                if (!container->enabled)
                        return -EPERM;

                minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

                if (copy_from_user(&param, (void __user *)arg, minsz))
                        return -EFAULT;

                if (param.argsz < minsz)
                        return -EINVAL;

                if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
                                VFIO_DMA_MAP_FLAG_WRITE))
                        return -EINVAL;

                num = tce_iommu_find_table(container, param.iova, &tbl);
                if (num < 0)
                        return -ENXIO;

                if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
                                (param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
                        return -EINVAL;

                /* iova is checked by the IOMMU API */
                if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
                        if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
                                direction = DMA_BIDIRECTIONAL;
                        else
                                direction = DMA_TO_DEVICE;
                } else {
                        if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
                                direction = DMA_FROM_DEVICE;
                        else
                                return -EINVAL;
                }

                ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
                if (ret)
                        return ret;

                ret = tce_iommu_build(container, tbl,
                                param.iova >> tbl->it_page_shift,
                                param.vaddr,
                                param.size >> tbl->it_page_shift,
                                direction);

                iommu_flush_tce(tbl);

                return ret;
        }
        case VFIO_IOMMU_UNMAP_DMA: {
                struct vfio_iommu_type1_dma_unmap param;
                struct iommu_table *tbl = NULL;
                long num;

                if (!container->enabled)
                        return -EPERM;

                minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
                                size);

                if (copy_from_user(&param, (void __user *)arg, minsz))
                        return -EFAULT;

                if (param.argsz < minsz)
                        return -EINVAL;

                /* No flags are currently supported */
                if (param.flags)
                        return -EINVAL;

                num = tce_iommu_find_table(container, param.iova, &tbl);
                if (num < 0)
                        return -ENXIO;

                if (param.size & ~IOMMU_PAGE_MASK(tbl))
                        return -EINVAL;

                ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
                                param.size >> tbl->it_page_shift);
                if (ret)
                        return ret;

                ret = tce_iommu_clear(container, tbl,
                                param.iova >> tbl->it_page_shift,
                                param.size >> tbl->it_page_shift);
                iommu_flush_tce(tbl);

                return ret;
        }
        case VFIO_IOMMU_ENABLE:
                mutex_lock(&container->lock);
                ret = tce_iommu_enable(container);
                mutex_unlock(&container->lock);
                return ret;

        case VFIO_IOMMU_DISABLE:
                mutex_lock(&container->lock);
                tce_iommu_disable(container);
                mutex_unlock(&container->lock);
                return 0;
        case VFIO_EEH_PE_OP:
                if (!container->grp)
                        return -ENODEV;

                return vfio_spapr_iommu_eeh_ioctl(container->grp,
                                                  cmd, arg);
        }

        return -ENOTTY;
}

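/*
 * Ownership transfer between the platform DMA setup and VFIO. Table
 * groups without take/release_ownership ops have ownership taken per
 * table; otherwise the DDW (dynamic DMA windows) ops are used.
 */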
static void tce_iommu_release_ownership(struct tce_container *container,
                struct iommu_table_group *table_group)
{
        int i;

        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                struct iommu_table *tbl = table_group->tables[i];

                if (!tbl)
                        continue;

                tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
                if (tbl->it_map)
                        iommu_release_ownership(tbl);
        }
}

static int tce_iommu_take_ownership(struct tce_container *container,
                struct iommu_table_group *table_group)
{
        int i, j, rc = 0;

        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                struct iommu_table *tbl = table_group->tables[i];

                if (!tbl || !tbl->it_map)
                        continue;

                rc = iommu_take_ownership(tbl);
                if (rc) {
                        for (j = 0; j < i; ++j)
                                iommu_release_ownership(
                                                table_group->tables[j]);

                        return rc;
                }
        }

        return 0;
}

static void tce_iommu_release_ownership_ddw(struct tce_container *container,
                struct iommu_table_group *table_group)
{
        table_group->ops->release_ownership(table_group);
}

static long tce_iommu_take_ownership_ddw(struct tce_container *container,
                struct iommu_table_group *table_group)
{
        table_group->ops->take_ownership(table_group);

        return 0;
}

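/*
 * VFIO driver "attach_group" callback: only one group per container is
 * supported, and the container must not already be enabled.
 */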
static int tce_iommu_attach_group(void *iommu_data,
                struct iommu_group *iommu_group)
{
        int ret;
        struct tce_container *container = iommu_data;
        struct iommu_table_group *table_group;

        mutex_lock(&container->lock);

        /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
                        iommu_group_id(iommu_group), iommu_group); */
        if (container->grp) {
                pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
                                iommu_group_id(container->grp),
                                iommu_group_id(iommu_group));
                ret = -EBUSY;
                goto unlock_exit;
        }

        if (container->enabled) {
                pr_err("tce_vfio: attaching group #%u to enabled container\n",
                                iommu_group_id(iommu_group));
                ret = -EBUSY;
                goto unlock_exit;
        }

        table_group = iommu_group_get_iommudata(iommu_group);
        if (!table_group) {
                ret = -ENXIO;
                goto unlock_exit;
        }

        if (!table_group->ops || !table_group->ops->take_ownership ||
                        !table_group->ops->release_ownership)
                ret = tce_iommu_take_ownership(container, table_group);
        else
                ret = tce_iommu_take_ownership_ddw(container, table_group);

        if (!ret)
                container->grp = iommu_group;

unlock_exit:
        mutex_unlock(&container->lock);

        return ret;
}

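/*
 * VFIO driver "detach_group" callback: disables the container if it is
 * still enabled and hands table ownership back to the platform.
 */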
static void tce_iommu_detach_group(void *iommu_data,
                struct iommu_group *iommu_group)
{
        struct tce_container *container = iommu_data;
        struct iommu_table_group *table_group;

        mutex_lock(&container->lock);
        if (iommu_group != container->grp) {
                pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
                                iommu_group_id(iommu_group),
                                iommu_group_id(container->grp));
                goto unlock_exit;
        }

        if (container->enabled) {
                pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
                                iommu_group_id(container->grp));
                tce_iommu_disable(container);
        }

        /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
           iommu_group_id(iommu_group), iommu_group); */
        container->grp = NULL;

        table_group = iommu_group_get_iommudata(iommu_group);
        BUG_ON(!table_group);

        if (!table_group->ops || !table_group->ops->release_ownership)
                tce_iommu_release_ownership(container, table_group);
        else
                tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
        mutex_unlock(&container->lock);
}

const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
        .name           = "iommu-vfio-powerpc",
        .owner          = THIS_MODULE,
        .open           = tce_iommu_open,
        .release        = tce_iommu_release,
        .ioctl          = tce_iommu_ioctl,
        .attach_group   = tce_iommu_attach_group,
        .detach_group   = tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
        return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
        vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);