1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  */
19
20 #include <linux/init.h>
21 #include <linux/bitmap.h>
22 #include <linux/debugfs.h>
23 #include <linux/export.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/spinlock.h>
28 #include <linux/pci.h>
29 #include <linux/dmar.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/mempool.h>
32 #include <linux/memory.h>
33 #include <linux/timer.h>
34 #include <linux/iova.h>
35 #include <linux/iommu.h>
36 #include <linux/intel-iommu.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <linux/dma-contiguous.h>
43 #include <asm/irq_remapping.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
46
47 #include "irq_remapping.h"
48
49 #define ROOT_SIZE               VTD_PAGE_SIZE
50 #define CONTEXT_SIZE            VTD_PAGE_SIZE
51
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
64
65 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
66 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
67
68 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
69    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
70 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
71                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
72 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
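/*
 * Example (illustrative): with gaw == 48 and 4KiB VT-d pages,
 * __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 and DOMAIN_MAX_ADDR(48) ==
 * 0xfffffffff000ULL; on 32-bit kernels DOMAIN_MAX_PFN() clamps the
 * result to ULONG_MAX so PFN arithmetic stays within 'unsigned long'.
 */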
73
74 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
75 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
76 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
77
78 /* page table handling */
79 #define LEVEL_STRIDE            (9)
80 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
81
82 /*
83  * This bitmap is used to advertise the page sizes our hardware supports
84  * to the IOMMU core, which will then use this information to split
85  * physically contiguous memory regions it is mapping into page sizes
86  * that we support.
87  *
88  * Traditionally the IOMMU core just handed us the mappings directly,
89  * after making sure the size is an order of a 4KiB page and that the
90  * mapping has natural alignment.
91  *
92  * To retain this behavior, we currently advertise that we support
93  * all page sizes that are an order of 4KiB.
94  *
95  * If at some point we'd like to utilize the IOMMU core's new behavior,
96  * we could change this to advertise the real page sizes we support.
97  */
98 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
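/*
 * ~0xFFFUL leaves bits 0-11 clear and sets every bit from 12 upwards,
 * i.e. bit 12 == 4KiB, bit 21 == 2MiB, bit 30 == 1GiB and so on are
 * all advertised.
 */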
99
100 static inline int agaw_to_level(int agaw)
101 {
102         return agaw + 2;
103 }
104
105 static inline int agaw_to_width(int agaw)
106 {
107         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
108 }
109
110 static inline int width_to_agaw(int width)
111 {
112         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
113 }
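/*
 * Illustrative values: agaw 1 is a 3-level table covering 39 bits and
 * agaw 2 is a 4-level table covering 48 bits (the default domain
 * width), so width_to_agaw(48) == 2 and agaw_to_level(2) == 4.
 */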
114
115 static inline unsigned int level_to_offset_bits(int level)
116 {
117         return (level - 1) * LEVEL_STRIDE;
118 }
119
120 static inline int pfn_level_offset(unsigned long pfn, int level)
121 {
122         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
123 }
124
125 static inline unsigned long level_mask(int level)
126 {
127         return -1UL << level_to_offset_bits(level);
128 }
129
130 static inline unsigned long level_size(int level)
131 {
132         return 1UL << level_to_offset_bits(level);
133 }
134
135 static inline unsigned long align_to_level(unsigned long pfn, int level)
136 {
137         return (pfn + level_size(level) - 1) & level_mask(level);
138 }
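/*
 * For example, a level 2 entry spans level_size(2) == 512 VT-d pages
 * (2MiB) and align_to_level(pfn, 2) rounds pfn up to the next 512-page
 * boundary; a level 3 entry spans 1GiB worth of 4KiB pages.
 */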
139
140 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
141 {
142         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
143 }
144
145 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
146    are never going to work. */
147 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
148 {
149         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
150 }
151
152 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
153 {
154         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
155 }
156 static inline unsigned long page_to_dma_pfn(struct page *pg)
157 {
158         return mm_to_dma_pfn(page_to_pfn(pg));
159 }
160 static inline unsigned long virt_to_dma_pfn(void *p)
161 {
162         return page_to_dma_pfn(virt_to_page(p));
163 }
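/*
 * When PAGE_SHIFT == VTD_PAGE_SHIFT (4KiB MM pages) these conversions
 * are the identity; with 64KiB MM pages, for instance, one mm pfn
 * covers sixteen VT-d pfns.
 */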
164
165 /* global iommu list, set NULL for ignored DMAR units */
166 static struct intel_iommu **g_iommus;
167
168 static void __init check_tylersburg_isoch(void);
169 static int rwbf_quirk;
170
171 /*
172  * set to 1 to panic the kernel if VT-d can't be successfully enabled
173  * (used when the kernel is launched with TXT)
174  */
175 static int force_on = 0;
176
177 /*
178  * 0: Present
179  * 1-11: Reserved
180  * 12-63: Context Ptr (12 - (haw-1))
181  * 64-127: Reserved
182  */
183 struct root_entry {
184         u64     val;
185         u64     rsvd1;
186 };
187 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
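/* With 16-byte entries in a 4KiB root table, ROOT_ENTRY_NR == 256:
   one root entry per PCI bus number. */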
188 static inline bool root_present(struct root_entry *root)
189 {
190         return (root->val & 1);
191 }
192 static inline void set_root_present(struct root_entry *root)
193 {
194         root->val |= 1;
195 }
196 static inline void set_root_value(struct root_entry *root, unsigned long value)
197 {
198         root->val &= ~VTD_PAGE_MASK;
199         root->val |= value & VTD_PAGE_MASK;
200 }
201
202 static inline struct context_entry *
203 get_context_addr_from_root(struct root_entry *root)
204 {
205         return (struct context_entry *)
206                 (root_present(root)?phys_to_virt(
207                 root->val & VTD_PAGE_MASK) :
208                 NULL);
209 }
210
211 /*
212  * low 64 bits:
213  * 0: present
214  * 1: fault processing disable
215  * 2-3: translation type
216  * 12-63: address space root
217  * high 64 bits:
218  * 0-2: address width
219  * 3-6: aval
220  * 8-23: domain id
221  */
222 struct context_entry {
223         u64 lo;
224         u64 hi;
225 };
226
227 static inline bool context_present(struct context_entry *context)
228 {
229         return (context->lo & 1);
230 }
231 static inline void context_set_present(struct context_entry *context)
232 {
233         context->lo |= 1;
234 }
235
236 static inline void context_set_fault_enable(struct context_entry *context)
237 {
238         context->lo &= (((u64)-1) << 2) | 1;
239 }
240
241 static inline void context_set_translation_type(struct context_entry *context,
242                                                 unsigned long value)
243 {
244         context->lo &= (((u64)-1) << 4) | 3;
245         context->lo |= (value & 3) << 2;
246 }
247
248 static inline void context_set_address_root(struct context_entry *context,
249                                             unsigned long value)
250 {
251         context->lo &= ~VTD_PAGE_MASK;
252         context->lo |= value & VTD_PAGE_MASK;
253 }
254
255 static inline void context_set_address_width(struct context_entry *context,
256                                              unsigned long value)
257 {
258         context->hi |= value & 7;
259 }
260
261 static inline void context_set_domain_id(struct context_entry *context,
262                                          unsigned long value)
263 {
264         context->hi |= (value & ((1 << 16) - 1)) << 8;
265 }
266
267 static inline void context_clear_entry(struct context_entry *context)
268 {
269         context->lo = 0;
270         context->hi = 0;
271 }
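/*
 * Illustrative sketch (not a new code path): a context entry is
 * programmed by combining the helpers above, roughly
 *
 *	context_set_domain_id(context, id);
 *	context_set_address_width(context, agaw);
 *	context_set_address_root(context, virt_to_phys(pgd));
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 *
 * followed by a cache flush of the entry when the IOMMU is not
 * coherent.
 */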
272
273 /*
274  * 0: readable
275  * 1: writable
276  * 2-6: reserved
277  * 7: super page
278  * 8-10: available
279  * 11: snoop behavior
280  * 12-63: Host physical address
281  */
282 struct dma_pte {
283         u64 val;
284 };
285
286 static inline void dma_clear_pte(struct dma_pte *pte)
287 {
288         pte->val = 0;
289 }
290
291 static inline u64 dma_pte_addr(struct dma_pte *pte)
292 {
293 #ifdef CONFIG_64BIT
294         return pte->val & VTD_PAGE_MASK;
295 #else
296         /* Must have a full atomic 64-bit read */
297         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
298 #endif
299 }
300
301 static inline bool dma_pte_present(struct dma_pte *pte)
302 {
303         return (pte->val & 3) != 0;
304 }
305
306 static inline bool dma_pte_superpage(struct dma_pte *pte)
307 {
308         return (pte->val & DMA_PTE_LARGE_PAGE);
309 }
310
311 static inline int first_pte_in_page(struct dma_pte *pte)
312 {
313         return !((unsigned long)pte & ~VTD_PAGE_MASK);
314 }
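/*
 * dma_pte_present() treats a PTE as present when either the read or
 * the write permission bit (bits 0-1) is set; first_pte_in_page() is
 * true when the PTE sits at offset 0 of its 512-entry (4KiB) table
 * page.
 */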
315
316 /*
317  * This domain is a static identity mapping domain.
318  *      1. This domain creates a static 1:1 mapping to all usable memory.
319  *      2. It maps to each iommu if successful.
320  *      3. Each iommu maps to this domain if successful.
321  */
322 static struct dmar_domain *si_domain;
323 static int hw_pass_through = 1;
324
325 /* domain represents a virtual machine; more than one device
326  * across iommus may be owned by one domain, e.g. a kvm guest.
327  */
328 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
329
330 /* si_domain contains multiple devices */
331 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
332
333 struct dmar_domain {
334         int     id;                     /* domain id */
335         int     nid;                    /* node id */
336         DECLARE_BITMAP(iommu_bmp, DMAR_UNITS_SUPPORTED);
337                                         /* bitmap of iommus this domain uses*/
338
339         struct list_head devices;       /* all devices' list */
340         struct iova_domain iovad;       /* iova's that belong to this domain */
341
342         struct dma_pte  *pgd;           /* virtual address */
343         int             gaw;            /* max guest address width */
344
345         /* adjusted guest address width, 0 is level 2 30-bit */
346         int             agaw;
347
348         int             flags;          /* flags to find out type of domain */
349
350         int             iommu_coherency;/* indicate coherency of iommu access */
351         int             iommu_snooping; /* indicate snooping control feature*/
352         int             iommu_count;    /* reference count of iommu */
353         int             iommu_superpage;/* Level of superpages supported:
354                                            0 == 4KiB (no superpages), 1 == 2MiB,
355                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
356         spinlock_t      iommu_lock;     /* protect iommu set in domain */
357         u64             max_addr;       /* maximum mapped address */
358 };
359
360 /* PCI domain-device relationship */
361 struct device_domain_info {
362         struct list_head link;  /* link to domain siblings */
363         struct list_head global; /* link to global list */
364         u8 bus;                 /* PCI bus number */
365         u8 devfn;               /* PCI devfn number */
366         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
367         struct intel_iommu *iommu; /* IOMMU used by this device */
368         struct dmar_domain *domain; /* pointer to domain */
369 };
370
371 struct dmar_rmrr_unit {
372         struct list_head list;          /* list of rmrr units   */
373         struct acpi_dmar_header *hdr;   /* ACPI header          */
374         u64     base_address;           /* reserved base address*/
375         u64     end_address;            /* reserved end address */
376         struct dmar_dev_scope *devices; /* target devices */
377         int     devices_cnt;            /* target device count */
378 };
379
380 struct dmar_atsr_unit {
381         struct list_head list;          /* list of ATSR units */
382         struct acpi_dmar_header *hdr;   /* ACPI header */
383         struct dmar_dev_scope *devices; /* target devices */
384         int devices_cnt;                /* target device count */
385         u8 include_all:1;               /* include all ports */
386 };
387
388 static LIST_HEAD(dmar_atsr_units);
389 static LIST_HEAD(dmar_rmrr_units);
390
391 #define for_each_rmrr_units(rmrr) \
392         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
393
394 static void flush_unmaps_timeout(unsigned long data);
395
396 static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
397
398 #define HIGH_WATER_MARK 250
399 struct deferred_flush_tables {
400         int next;
401         struct iova *iova[HIGH_WATER_MARK];
402         struct dmar_domain *domain[HIGH_WATER_MARK];
403         struct page *freelist[HIGH_WATER_MARK];
404 };
405
406 static struct deferred_flush_tables *deferred_flush;
407
408 /* number of IOMMUs; sizes g_iommus and bounds searches of domain->iommu_bmp */
409 static int g_num_of_iommus;
410
411 static DEFINE_SPINLOCK(async_umap_flush_lock);
412 static LIST_HEAD(unmaps_to_do);
413
414 static int timer_on;
415 static long list_size;
416
417 static void domain_exit(struct dmar_domain *domain);
418 static void domain_remove_dev_info(struct dmar_domain *domain);
419 static void domain_remove_one_dev_info(struct dmar_domain *domain,
420                                        struct device *dev);
421 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
422                                            struct device *dev);
423 static int domain_detach_iommu(struct dmar_domain *domain,
424                                struct intel_iommu *iommu);
425
426 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
427 int dmar_disabled = 0;
428 #else
429 int dmar_disabled = 1;
430 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
431
432 int intel_iommu_enabled = 0;
433 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
434
435 static int dmar_map_gfx = 1;
436 static int dmar_forcedac;
437 static int intel_iommu_strict;
438 static int intel_iommu_superpage = 1;
439
440 int intel_iommu_gfx_mapped;
441 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
442
443 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
444 static DEFINE_SPINLOCK(device_domain_lock);
445 static LIST_HEAD(device_domain_list);
446
447 static const struct iommu_ops intel_iommu_ops;
448
449 static int __init intel_iommu_setup(char *str)
450 {
451         if (!str)
452                 return -EINVAL;
453         while (*str) {
454                 if (!strncmp(str, "on", 2)) {
455                         dmar_disabled = 0;
456                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
457                 } else if (!strncmp(str, "off", 3)) {
458                         dmar_disabled = 1;
459                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
460                 } else if (!strncmp(str, "igfx_off", 8)) {
461                         dmar_map_gfx = 0;
462                         printk(KERN_INFO
463                                 "Intel-IOMMU: disable GFX device mapping\n");
464                 } else if (!strncmp(str, "forcedac", 8)) {
465                         printk(KERN_INFO
466                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
467                         dmar_forcedac = 1;
468                 } else if (!strncmp(str, "strict", 6)) {
469                         printk(KERN_INFO
470                                 "Intel-IOMMU: disable batched IOTLB flush\n");
471                         intel_iommu_strict = 1;
472                 } else if (!strncmp(str, "sp_off", 6)) {
473                         printk(KERN_INFO
474                                 "Intel-IOMMU: disable supported super page\n");
475                         intel_iommu_superpage = 0;
476                 }
477
478                 str += strcspn(str, ",");
479                 while (*str == ',')
480                         str++;
481         }
482         return 0;
483 }
484 __setup("intel_iommu=", intel_iommu_setup);
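/*
 * Example kernel command line usage (options are comma separated):
 *
 *	intel_iommu=on
 *	intel_iommu=on,strict,sp_off
 */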
485
486 static struct kmem_cache *iommu_domain_cache;
487 static struct kmem_cache *iommu_devinfo_cache;
488 static struct kmem_cache *iommu_iova_cache;
489
490 static inline void *alloc_pgtable_page(int node)
491 {
492         struct page *page;
493         void *vaddr = NULL;
494
495         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
496         if (page)
497                 vaddr = page_address(page);
498         return vaddr;
499 }
500
501 static inline void free_pgtable_page(void *vaddr)
502 {
503         free_page((unsigned long)vaddr);
504 }
505
506 static inline void *alloc_domain_mem(void)
507 {
508         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
509 }
510
511 static void free_domain_mem(void *vaddr)
512 {
513         kmem_cache_free(iommu_domain_cache, vaddr);
514 }
515
516 static inline void * alloc_devinfo_mem(void)
517 {
518         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
519 }
520
521 static inline void free_devinfo_mem(void *vaddr)
522 {
523         kmem_cache_free(iommu_devinfo_cache, vaddr);
524 }
525
526 struct iova *alloc_iova_mem(void)
527 {
528         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
529 }
530
531 void free_iova_mem(struct iova *iova)
532 {
533         kmem_cache_free(iommu_iova_cache, iova);
534 }
535
536 static inline int domain_type_is_vm(struct dmar_domain *domain)
537 {
538         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
539 }
540
541 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
542 {
543         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
544                                 DOMAIN_FLAG_STATIC_IDENTITY);
545 }
546
547 static inline int domain_pfn_supported(struct dmar_domain *domain,
548                                        unsigned long pfn)
549 {
550         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
551
552         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
553 }
554
555 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
556 {
557         unsigned long sagaw;
558         int agaw = -1;
559
560         sagaw = cap_sagaw(iommu->cap);
561         for (agaw = width_to_agaw(max_gaw);
562              agaw >= 0; agaw--) {
563                 if (test_bit(agaw, &sagaw))
564                         break;
565         }
566
567         return agaw;
568 }
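/*
 * Example: if the SAGAW field only advertises 3-level (39-bit) tables,
 * the loop above falls back from width_to_agaw(48) == 2 to agaw 1,
 * even though DEFAULT_DOMAIN_ADDRESS_WIDTH is 48.
 */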
569
570 /*
571  * Calculate max SAGAW for each iommu.
572  */
573 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
574 {
575         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
576 }
577
578 /*
579  * calculate agaw for each iommu.
580  * "SAGAW" may be different across iommus, use a default agaw, and
581  * get a supported less agaw for iommus that don't support the default agaw.
582  */
583 int iommu_calculate_agaw(struct intel_iommu *iommu)
584 {
585         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
586 }
587
588 /* This function only returns the single iommu in a domain */
589 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
590 {
591         int iommu_id;
592
593         /* si_domain and vm domain should not get here. */
594         BUG_ON(domain_type_is_vm_or_si(domain));
595         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
596         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
597                 return NULL;
598
599         return g_iommus[iommu_id];
600 }
601
602 static void domain_update_iommu_coherency(struct dmar_domain *domain)
603 {
604         struct dmar_drhd_unit *drhd;
605         struct intel_iommu *iommu;
606         int i, found = 0;
607
608         domain->iommu_coherency = 1;
609
610         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
611                 found = 1;
612                 if (!ecap_coherent(g_iommus[i]->ecap)) {
613                         domain->iommu_coherency = 0;
614                         break;
615                 }
616         }
617         if (found)
618                 return;
619
620         /* No hardware attached; use lowest common denominator */
621         rcu_read_lock();
622         for_each_active_iommu(iommu, drhd) {
623                 if (!ecap_coherent(iommu->ecap)) {
624                         domain->iommu_coherency = 0;
625                         break;
626                 }
627         }
628         rcu_read_unlock();
629 }
630
631 static int domain_update_iommu_snooping(struct intel_iommu *skip)
632 {
633         struct dmar_drhd_unit *drhd;
634         struct intel_iommu *iommu;
635         int ret = 1;
636
637         rcu_read_lock();
638         for_each_active_iommu(iommu, drhd) {
639                 if (iommu != skip) {
640                         if (!ecap_sc_support(iommu->ecap)) {
641                                 ret = 0;
642                                 break;
643                         }
644                 }
645         }
646         rcu_read_unlock();
647
648         return ret;
649 }
650
651 static int domain_update_iommu_superpage(struct intel_iommu *skip)
652 {
653         struct dmar_drhd_unit *drhd;
654         struct intel_iommu *iommu;
655         int mask = 0xf;
656
657         if (!intel_iommu_superpage) {
658                 return 0;
659         }
660
661         /* set iommu_superpage to the smallest common denominator */
662         rcu_read_lock();
663         for_each_active_iommu(iommu, drhd) {
664                 if (iommu != skip) {
665                         mask &= cap_super_page_val(iommu->cap);
666                         if (!mask)
667                                 break;
668                 }
669         }
670         rcu_read_unlock();
671
672         return fls(mask);
673 }
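/*
 * fls() maps the intersected superpage capability mask onto the
 * iommu_superpage encoding used in struct dmar_domain: 0 means 4KiB
 * only, 1 means 2MiB, 2 means 1GiB, and so on; a single IOMMU without
 * superpage support forces the whole domain to 0.
 */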
674
675 /* Some capabilities may be different across iommus */
676 static void domain_update_iommu_cap(struct dmar_domain *domain)
677 {
678         domain_update_iommu_coherency(domain);
679         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
680         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
681 }
682
683 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
684 {
685         struct dmar_drhd_unit *drhd = NULL;
686         struct intel_iommu *iommu;
687         struct device *tmp;
688         struct pci_dev *ptmp, *pdev = NULL;
689         u16 segment = 0;
690         int i;
691
692         if (dev_is_pci(dev)) {
693                 pdev = to_pci_dev(dev);
694                 segment = pci_domain_nr(pdev->bus);
695         } else if (ACPI_COMPANION(dev))
696                 dev = &ACPI_COMPANION(dev)->dev;
697
698         rcu_read_lock();
699         for_each_active_iommu(iommu, drhd) {
700                 if (pdev && segment != drhd->segment)
701                         continue;
702
703                 for_each_active_dev_scope(drhd->devices,
704                                           drhd->devices_cnt, i, tmp) {
705                         if (tmp == dev) {
706                                 *bus = drhd->devices[i].bus;
707                                 *devfn = drhd->devices[i].devfn;
708                                 goto out;
709                         }
710
711                         if (!pdev || !dev_is_pci(tmp))
712                                 continue;
713
714                         ptmp = to_pci_dev(tmp);
715                         if (ptmp->subordinate &&
716                             ptmp->subordinate->number <= pdev->bus->number &&
717                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
718                                 goto got_pdev;
719                 }
720
721                 if (pdev && drhd->include_all) {
722                 got_pdev:
723                         *bus = pdev->bus->number;
724                         *devfn = pdev->devfn;
725                         goto out;
726                 }
727         }
728         iommu = NULL;
729  out:
730         rcu_read_unlock();
731
732         return iommu;
733 }
734
735 static void domain_flush_cache(struct dmar_domain *domain,
736                                void *addr, int size)
737 {
738         if (!domain->iommu_coherency)
739                 clflush_cache_range(addr, size);
740 }
741
742 /* Gets context entry for a given bus and devfn */
743 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
744                 u8 bus, u8 devfn)
745 {
746         struct root_entry *root;
747         struct context_entry *context;
748         unsigned long phy_addr;
749         unsigned long flags;
750
751         spin_lock_irqsave(&iommu->lock, flags);
752         root = &iommu->root_entry[bus];
753         context = get_context_addr_from_root(root);
754         if (!context) {
755                 context = (struct context_entry *)
756                                 alloc_pgtable_page(iommu->node);
757                 if (!context) {
758                         spin_unlock_irqrestore(&iommu->lock, flags);
759                         return NULL;
760                 }
761                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
762                 phy_addr = virt_to_phys((void *)context);
763                 set_root_value(root, phy_addr);
764                 set_root_present(root);
765                 __iommu_flush_cache(iommu, root, sizeof(*root));
766         }
767         spin_unlock_irqrestore(&iommu->lock, flags);
768         return &context[devfn];
769 }
770
771 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
772 {
773         struct root_entry *root;
774         struct context_entry *context;
775         int ret;
776         unsigned long flags;
777
778         spin_lock_irqsave(&iommu->lock, flags);
779         root = &iommu->root_entry[bus];
780         context = get_context_addr_from_root(root);
781         if (!context) {
782                 ret = 0;
783                 goto out;
784         }
785         ret = context_present(&context[devfn]);
786 out:
787         spin_unlock_irqrestore(&iommu->lock, flags);
788         return ret;
789 }
790
791 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
792 {
793         struct root_entry *root;
794         struct context_entry *context;
795         unsigned long flags;
796
797         spin_lock_irqsave(&iommu->lock, flags);
798         root = &iommu->root_entry[bus];
799         context = get_context_addr_from_root(root);
800         if (context) {
801                 context_clear_entry(&context[devfn]);
802                 __iommu_flush_cache(iommu, &context[devfn], \
803                         sizeof(*context));
804         }
805         spin_unlock_irqrestore(&iommu->lock, flags);
806 }
807
808 static void free_context_table(struct intel_iommu *iommu)
809 {
810         struct root_entry *root;
811         int i;
812         unsigned long flags;
813         struct context_entry *context;
814
815         spin_lock_irqsave(&iommu->lock, flags);
816         if (!iommu->root_entry) {
817                 goto out;
818         }
819         for (i = 0; i < ROOT_ENTRY_NR; i++) {
820                 root = &iommu->root_entry[i];
821                 context = get_context_addr_from_root(root);
822                 if (context)
823                         free_pgtable_page(context);
824         }
825         free_pgtable_page(iommu->root_entry);
826         iommu->root_entry = NULL;
827 out:
828         spin_unlock_irqrestore(&iommu->lock, flags);
829 }
830
831 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
832                                       unsigned long pfn, int *target_level)
833 {
834         struct dma_pte *parent, *pte = NULL;
835         int level = agaw_to_level(domain->agaw);
836         int offset;
837
838         BUG_ON(!domain->pgd);
839
840         if (!domain_pfn_supported(domain, pfn))
841                 /* Address beyond IOMMU's addressing capabilities. */
842                 return NULL;
843
844         parent = domain->pgd;
845
846         while (1) {
847                 void *tmp_page;
848
849                 offset = pfn_level_offset(pfn, level);
850                 pte = &parent[offset];
851                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
852                         break;
853                 if (level == *target_level)
854                         break;
855
856                 if (!dma_pte_present(pte)) {
857                         uint64_t pteval;
858
859                         tmp_page = alloc_pgtable_page(domain->nid);
860
861                         if (!tmp_page)
862                                 return NULL;
863
864                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
865                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
866                         if (cmpxchg64(&pte->val, 0ULL, pteval))
867                                 /* Someone else set it while we were thinking; use theirs. */
868                                 free_pgtable_page(tmp_page);
869                         else
870                                 domain_flush_cache(domain, pte, sizeof(*pte));
871                 }
872                 if (level == 1)
873                         break;
874
875                 parent = phys_to_virt(dma_pte_addr(pte));
876                 level--;
877         }
878
879         if (!*target_level)
880                 *target_level = level;
881
882         return pte;
883 }
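/*
 * Callers pass *target_level == 0 to look up the existing leaf for a
 * pfn (stopping early at superpages or non-present entries), or a
 * specific level (1 == 4KiB, 2 == 2MiB, ...) to have the table
 * populated down to that level; on return *target_level holds the
 * level of the returned PTE.
 */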
884
885
886 /* return address's pte at specific level */
887 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
888                                          unsigned long pfn,
889                                          int level, int *large_page)
890 {
891         struct dma_pte *parent, *pte = NULL;
892         int total = agaw_to_level(domain->agaw);
893         int offset;
894
895         parent = domain->pgd;
896         while (level <= total) {
897                 offset = pfn_level_offset(pfn, total);
898                 pte = &parent[offset];
899                 if (level == total)
900                         return pte;
901
902                 if (!dma_pte_present(pte)) {
903                         *large_page = total;
904                         break;
905                 }
906
907                 if (dma_pte_superpage(pte)) {
908                         *large_page = total;
909                         return pte;
910                 }
911
912                 parent = phys_to_virt(dma_pte_addr(pte));
913                 total--;
914         }
915         return NULL;
916 }
917
918 /* clear last level pte, a tlb flush should be followed */
919 static void dma_pte_clear_range(struct dmar_domain *domain,
920                                 unsigned long start_pfn,
921                                 unsigned long last_pfn)
922 {
923         unsigned int large_page = 1;
924         struct dma_pte *first_pte, *pte;
925
926         BUG_ON(!domain_pfn_supported(domain, start_pfn));
927         BUG_ON(!domain_pfn_supported(domain, last_pfn));
928         BUG_ON(start_pfn > last_pfn);
929
930         /* we don't need lock here; nobody else touches the iova range */
931         do {
932                 large_page = 1;
933                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
934                 if (!pte) {
935                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
936                         continue;
937                 }
938                 do {
939                         dma_clear_pte(pte);
940                         start_pfn += lvl_to_nr_pages(large_page);
941                         pte++;
942                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
943
944                 domain_flush_cache(domain, first_pte,
945                                    (void *)pte - (void *)first_pte);
946
947         } while (start_pfn && start_pfn <= last_pfn);
948 }
949
950 static void dma_pte_free_level(struct dmar_domain *domain, int level,
951                                struct dma_pte *pte, unsigned long pfn,
952                                unsigned long start_pfn, unsigned long last_pfn)
953 {
954         pfn = max(start_pfn, pfn);
955         pte = &pte[pfn_level_offset(pfn, level)];
956
957         do {
958                 unsigned long level_pfn;
959                 struct dma_pte *level_pte;
960
961                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
962                         goto next;
963
964                 level_pfn = pfn & level_mask(level - 1);
965                 level_pte = phys_to_virt(dma_pte_addr(pte));
966
967                 if (level > 2)
968                         dma_pte_free_level(domain, level - 1, level_pte,
969                                            level_pfn, start_pfn, last_pfn);
970
971                 /* If range covers entire pagetable, free it */
972                 if (!(start_pfn > level_pfn ||
973                       last_pfn < level_pfn + level_size(level) - 1)) {
974                         dma_clear_pte(pte);
975                         domain_flush_cache(domain, pte, sizeof(*pte));
976                         free_pgtable_page(level_pte);
977                 }
978 next:
979                 pfn += level_size(level);
980         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
981 }
982
983 /* free page table pages. last level pte should already be cleared */
984 static void dma_pte_free_pagetable(struct dmar_domain *domain,
985                                    unsigned long start_pfn,
986                                    unsigned long last_pfn)
987 {
988         BUG_ON(!domain_pfn_supported(domain, start_pfn));
989         BUG_ON(!domain_pfn_supported(domain, last_pfn));
990         BUG_ON(start_pfn > last_pfn);
991
992         dma_pte_clear_range(domain, start_pfn, last_pfn);
993
994         /* We don't need lock here; nobody else touches the iova range */
995         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
996                            domain->pgd, 0, start_pfn, last_pfn);
997
998         /* free pgd */
999         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1000                 free_pgtable_page(domain->pgd);
1001                 domain->pgd = NULL;
1002         }
1003 }
1004
1005 /* When a page at a given level is being unlinked from its parent, we don't
1006    need to *modify* it at all. All we need to do is make a list of all the
1007    pages which can be freed just as soon as we've flushed the IOTLB and we
1008    know the hardware page-walk will no longer touch them.
1009    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1010    be freed. */
1011 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1012                                             int level, struct dma_pte *pte,
1013                                             struct page *freelist)
1014 {
1015         struct page *pg;
1016
1017         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1018         pg->freelist = freelist;
1019         freelist = pg;
1020
1021         if (level == 1)
1022                 return freelist;
1023
1024         pte = page_address(pg);
1025         do {
1026                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1027                         freelist = dma_pte_list_pagetables(domain, level - 1,
1028                                                            pte, freelist);
1029                 pte++;
1030         } while (!first_pte_in_page(pte));
1031
1032         return freelist;
1033 }
1034
1035 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1036                                         struct dma_pte *pte, unsigned long pfn,
1037                                         unsigned long start_pfn,
1038                                         unsigned long last_pfn,
1039                                         struct page *freelist)
1040 {
1041         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1042
1043         pfn = max(start_pfn, pfn);
1044         pte = &pte[pfn_level_offset(pfn, level)];
1045
1046         do {
1047                 unsigned long level_pfn;
1048
1049                 if (!dma_pte_present(pte))
1050                         goto next;
1051
1052                 level_pfn = pfn & level_mask(level);
1053
1054                 /* If range covers entire pagetable, free it */
1055                 if (start_pfn <= level_pfn &&
1056                     last_pfn >= level_pfn + level_size(level) - 1) {
1057                         /* These subordinate page tables are going away entirely. Don't
1058                            bother to clear them; we're just going to *free* them. */
1059                         if (level > 1 && !dma_pte_superpage(pte))
1060                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1061
1062                         dma_clear_pte(pte);
1063                         if (!first_pte)
1064                                 first_pte = pte;
1065                         last_pte = pte;
1066                 } else if (level > 1) {
1067                         /* Recurse down into a level that isn't *entirely* obsolete */
1068                         freelist = dma_pte_clear_level(domain, level - 1,
1069                                                        phys_to_virt(dma_pte_addr(pte)),
1070                                                        level_pfn, start_pfn, last_pfn,
1071                                                        freelist);
1072                 }
1073 next:
1074                 pfn += level_size(level);
1075         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1076
1077         if (first_pte)
1078                 domain_flush_cache(domain, first_pte,
1079                                    (void *)++last_pte - (void *)first_pte);
1080
1081         return freelist;
1082 }
1083
1084 /* We can't just free the pages because the IOMMU may still be walking
1085    the page tables, and may have cached the intermediate levels. The
1086    pages can only be freed after the IOTLB flush has been done. */
1087 struct page *domain_unmap(struct dmar_domain *domain,
1088                           unsigned long start_pfn,
1089                           unsigned long last_pfn)
1090 {
1091         struct page *freelist = NULL;
1092
1093         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1094         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1095         BUG_ON(start_pfn > last_pfn);
1096
1097         /* we don't need lock here; nobody else touches the iova range */
1098         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1099                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1100
1101         /* free pgd */
1102         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1103                 struct page *pgd_page = virt_to_page(domain->pgd);
1104                 pgd_page->freelist = freelist;
1105                 freelist = pgd_page;
1106
1107                 domain->pgd = NULL;
1108         }
1109
1110         return freelist;
1111 }
1112
1113 void dma_free_pagelist(struct page *freelist)
1114 {
1115         struct page *pg;
1116
1117         while ((pg = freelist)) {
1118                 freelist = pg->freelist;
1119                 free_pgtable_page(page_address(pg));
1120         }
1121 }
1122
1123 /* iommu handling */
1124 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1125 {
1126         struct root_entry *root;
1127         unsigned long flags;
1128
1129         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1130         if (!root) {
1131                 pr_err("IOMMU: allocating root entry for %s failed\n",
1132                         iommu->name);
1133                 return -ENOMEM;
1134         }
1135
1136         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1137
1138         spin_lock_irqsave(&iommu->lock, flags);
1139         iommu->root_entry = root;
1140         spin_unlock_irqrestore(&iommu->lock, flags);
1141
1142         return 0;
1143 }
1144
1145 static void iommu_set_root_entry(struct intel_iommu *iommu)
1146 {
1147         void *addr;
1148         u32 sts;
1149         unsigned long flag;
1150
1151         addr = iommu->root_entry;
1152
1153         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1154         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
1155
1156         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1157
1158         /* Make sure hardware completes it */
1159         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1160                       readl, (sts & DMA_GSTS_RTPS), sts);
1161
1162         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1163 }
1164
1165 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1166 {
1167         u32 val;
1168         unsigned long flag;
1169
1170         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1171                 return;
1172
1173         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1174         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1175
1176         /* Make sure hardware completes it */
1177         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1178                       readl, (!(val & DMA_GSTS_WBFS)), val);
1179
1180         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1181 }
1182
1183 /* return value determines whether we need a write buffer flush */
1184 static void __iommu_flush_context(struct intel_iommu *iommu,
1185                                   u16 did, u16 source_id, u8 function_mask,
1186                                   u64 type)
1187 {
1188         u64 val = 0;
1189         unsigned long flag;
1190
1191         switch (type) {
1192         case DMA_CCMD_GLOBAL_INVL:
1193                 val = DMA_CCMD_GLOBAL_INVL;
1194                 break;
1195         case DMA_CCMD_DOMAIN_INVL:
1196                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1197                 break;
1198         case DMA_CCMD_DEVICE_INVL:
1199                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1200                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1201                 break;
1202         default:
1203                 BUG();
1204         }
1205         val |= DMA_CCMD_ICC;
1206
1207         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1208         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1209
1210         /* Make sure hardware completes it */
1211         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1212                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1213
1214         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1215 }
1216
1217 /* return value determines whether we need a write buffer flush */
1218 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1219                                 u64 addr, unsigned int size_order, u64 type)
1220 {
1221         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1222         u64 val = 0, val_iva = 0;
1223         unsigned long flag;
1224
1225         switch (type) {
1226         case DMA_TLB_GLOBAL_FLUSH:
1227                 /* a global flush doesn't need to set IVA_REG */
1228                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1229                 break;
1230         case DMA_TLB_DSI_FLUSH:
1231                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1232                 break;
1233         case DMA_TLB_PSI_FLUSH:
1234                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1235                 /* IH bit is passed in as part of address */
1236                 val_iva = size_order | addr;
1237                 break;
1238         default:
1239                 BUG();
1240         }
1241         /* Note: set drain read/write */
1242 #if 0
1243         /*
1244          * This is probably just to be super secure. Looks like we can
1245          * ignore it without any impact.
1246          */
1247         if (cap_read_drain(iommu->cap))
1248                 val |= DMA_TLB_READ_DRAIN;
1249 #endif
1250         if (cap_write_drain(iommu->cap))
1251                 val |= DMA_TLB_WRITE_DRAIN;
1252
1253         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1254         /* Note: Only uses first TLB reg currently */
1255         if (val_iva)
1256                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1257         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1258
1259         /* Make sure hardware completes it */
1260         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1261                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1262
1263         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1264
1265         /* check IOTLB invalidation granularity */
1266         if (DMA_TLB_IAIG(val) == 0)
1267                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1268         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1269                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1270                         (unsigned long long)DMA_TLB_IIRG(type),
1271                         (unsigned long long)DMA_TLB_IAIG(val));
1272 }
1273
1274 static struct device_domain_info *
1275 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1276                          u8 bus, u8 devfn)
1277 {
1278         int found = 0;
1279         unsigned long flags;
1280         struct device_domain_info *info;
1281         struct pci_dev *pdev;
1282
1283         if (!ecap_dev_iotlb_support(iommu->ecap))
1284                 return NULL;
1285
1286         if (!iommu->qi)
1287                 return NULL;
1288
1289         spin_lock_irqsave(&device_domain_lock, flags);
1290         list_for_each_entry(info, &domain->devices, link)
1291                 if (info->iommu == iommu && info->bus == bus &&
1292                     info->devfn == devfn) {
1293                         found = 1;
1294                         break;
1295                 }
1296         spin_unlock_irqrestore(&device_domain_lock, flags);
1297
1298         if (!found || !info->dev || !dev_is_pci(info->dev))
1299                 return NULL;
1300
1301         pdev = to_pci_dev(info->dev);
1302
1303         if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1304                 return NULL;
1305
1306         if (!dmar_find_matched_atsr_unit(pdev))
1307                 return NULL;
1308
1309         return info;
1310 }
1311
1312 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1313 {
1314         if (!info || !dev_is_pci(info->dev))
1315                 return;
1316
1317         pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1318 }
1319
1320 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1321 {
1322         if (!info->dev || !dev_is_pci(info->dev) ||
1323             !pci_ats_enabled(to_pci_dev(info->dev)))
1324                 return;
1325
1326         pci_disable_ats(to_pci_dev(info->dev));
1327 }
1328
1329 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1330                                   u64 addr, unsigned mask)
1331 {
1332         u16 sid, qdep;
1333         unsigned long flags;
1334         struct device_domain_info *info;
1335
1336         spin_lock_irqsave(&device_domain_lock, flags);
1337         list_for_each_entry(info, &domain->devices, link) {
1338                 struct pci_dev *pdev;
1339                 if (!info->dev || !dev_is_pci(info->dev))
1340                         continue;
1341
1342                 pdev = to_pci_dev(info->dev);
1343                 if (!pci_ats_enabled(pdev))
1344                         continue;
1345
1346                 sid = info->bus << 8 | info->devfn;
1347                 qdep = pci_ats_queue_depth(pdev);
1348                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1349         }
1350         spin_unlock_irqrestore(&device_domain_lock, flags);
1351 }
1352
1353 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1354                                   unsigned long pfn, unsigned int pages, int ih, int map)
1355 {
1356         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1357         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1358
1359         BUG_ON(pages == 0);
1360
1361         if (ih)
1362                 ih = 1 << 6;
1363         /*
1364          * Fallback to domain selective flush if no PSI support or the size is
1365          * too big.
1366          * PSI requires page size to be 2 ^ x, and the base address is naturally
1367          * aligned to the size
1368          */
1369         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1370                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1371                                                 DMA_TLB_DSI_FLUSH);
1372         else
1373                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1374                                                 DMA_TLB_PSI_FLUSH);
1375
1376         /*
1377          * In caching mode, changes of pages from non-present to present require
1378          * flush. However, device IOTLB doesn't need to be flushed in this case.
1379          */
1380         if (!cap_caching_mode(iommu->cap) || !map)
1381                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1382 }
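/*
 * Example: flushing 9 pages rounds up to 16, so mask == 4 and the
 * invalidation covers a naturally aligned 16-page (64KiB) region; if
 * mask exceeds cap_max_amask_val() the code above falls back to a
 * domain-selective flush instead.
 */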
1383
1384 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1385 {
1386         u32 pmen;
1387         unsigned long flags;
1388
1389         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1390         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1391         pmen &= ~DMA_PMEN_EPM;
1392         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1393
1394         /* wait for the protected region status bit to clear */
1395         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1396                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1397
1398         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1399 }
1400
1401 static void iommu_enable_translation(struct intel_iommu *iommu)
1402 {
1403         u32 sts;
1404         unsigned long flags;
1405
1406         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1407         iommu->gcmd |= DMA_GCMD_TE;
1408         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1409
1410         /* Make sure hardware completes it */
1411         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1412                       readl, (sts & DMA_GSTS_TES), sts);
1413
1414         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1415 }
1416
1417 static void iommu_disable_translation(struct intel_iommu *iommu)
1418 {
1419         u32 sts;
1420         unsigned long flag;
1421
1422         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1423         iommu->gcmd &= ~DMA_GCMD_TE;
1424         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1425
1426         /* Make sure hardware completes it */
1427         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1428                       readl, (!(sts & DMA_GSTS_TES)), sts);
1429
1430         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1431 }
1432
1433
1434 static int iommu_init_domains(struct intel_iommu *iommu)
1435 {
1436         unsigned long ndomains;
1437         unsigned long nlongs;
1438
1439         ndomains = cap_ndoms(iommu->cap);
1440         pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1441                  iommu->seq_id, ndomains);
1442         nlongs = BITS_TO_LONGS(ndomains);
1443
1444         spin_lock_init(&iommu->lock);
1445
1446         /* TBD: there might be 64K domains,
1447          * consider other allocation for future chip
1448          */
1449         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1450         if (!iommu->domain_ids) {
1451                 pr_err("IOMMU%d: allocating domain id array failed\n",
1452                        iommu->seq_id);
1453                 return -ENOMEM;
1454         }
1455         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1456                         GFP_KERNEL);
1457         if (!iommu->domains) {
1458                 pr_err("IOMMU%d: allocating domain array failed\n",
1459                        iommu->seq_id);
1460                 kfree(iommu->domain_ids);
1461                 iommu->domain_ids = NULL;
1462                 return -ENOMEM;
1463         }
1464
1465         /*
1466          * if Caching mode is set, then invalid translations are tagged
1467          * with domainid 0. Hence we need to pre-allocate it.
1468          */
1469         if (cap_caching_mode(iommu->cap))
1470                 set_bit(0, iommu->domain_ids);
1471         return 0;
1472 }
1473
1474 static void disable_dmar_iommu(struct intel_iommu *iommu)
1475 {
1476         struct dmar_domain *domain;
1477         int i;
1478
1479         if ((iommu->domains) && (iommu->domain_ids)) {
1480                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1481                         /*
1482                          * Domain id 0 is reserved for invalid translation
1483                          * if hardware supports caching mode.
1484                          */
1485                         if (cap_caching_mode(iommu->cap) && i == 0)
1486                                 continue;
1487
1488                         domain = iommu->domains[i];
1489                         clear_bit(i, iommu->domain_ids);
1490                         if (domain_detach_iommu(domain, iommu) == 0 &&
1491                             !domain_type_is_vm(domain))
1492                                 domain_exit(domain);
1493                 }
1494         }
1495
1496         if (iommu->gcmd & DMA_GCMD_TE)
1497                 iommu_disable_translation(iommu);
1498 }
1499
1500 static void free_dmar_iommu(struct intel_iommu *iommu)
1501 {
1502         if ((iommu->domains) && (iommu->domain_ids)) {
1503                 kfree(iommu->domains);
1504                 kfree(iommu->domain_ids);
1505                 iommu->domains = NULL;
1506                 iommu->domain_ids = NULL;
1507         }
1508
1509         g_iommus[iommu->seq_id] = NULL;
1510
1511         /* free context mapping */
1512         free_context_table(iommu);
1513 }
1514
1515 static struct dmar_domain *alloc_domain(int flags)
1516 {
1517         /* domain id for virtual machine, it won't be set in context */
1518         static atomic_t vm_domid = ATOMIC_INIT(0);
1519         struct dmar_domain *domain;
1520
1521         domain = alloc_domain_mem();
1522         if (!domain)
1523                 return NULL;
1524
1525         memset(domain, 0, sizeof(*domain));
1526         domain->nid = -1;
1527         domain->flags = flags;
1528         spin_lock_init(&domain->iommu_lock);
1529         INIT_LIST_HEAD(&domain->devices);
1530         if (flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1531                 domain->id = atomic_inc_return(&vm_domid);
1532
1533         return domain;
1534 }
1535
1536 static int __iommu_attach_domain(struct dmar_domain *domain,
1537                                  struct intel_iommu *iommu)
1538 {
1539         int num;
1540         unsigned long ndomains;
1541
1542         ndomains = cap_ndoms(iommu->cap);
1543         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1544         if (num < ndomains) {
1545                 set_bit(num, iommu->domain_ids);
1546                 iommu->domains[num] = domain;
1547         } else {
1548                 num = -ENOSPC;
1549         }
1550
1551         return num;
1552 }
1553
1554 static int iommu_attach_domain(struct dmar_domain *domain,
1555                                struct intel_iommu *iommu)
1556 {
1557         int num;
1558         unsigned long flags;
1559
1560         spin_lock_irqsave(&iommu->lock, flags);
1561         num = __iommu_attach_domain(domain, iommu);
1562         spin_unlock_irqrestore(&iommu->lock, flags);
1563         if (num < 0)
1564                 pr_err("IOMMU: no free domain ids\n");
1565
1566         return num;
1567 }
1568
1569 static int iommu_attach_vm_domain(struct dmar_domain *domain,
1570                                   struct intel_iommu *iommu)
1571 {
1572         int num;
1573         unsigned long ndomains;
1574
1575         ndomains = cap_ndoms(iommu->cap);
1576         for_each_set_bit(num, iommu->domain_ids, ndomains)
1577                 if (iommu->domains[num] == domain)
1578                         return num;
1579
1580         return __iommu_attach_domain(domain, iommu);
1581 }
1582
1583 static void iommu_detach_domain(struct dmar_domain *domain,
1584                                 struct intel_iommu *iommu)
1585 {
1586         unsigned long flags;
1587         int num, ndomains;
1588
1589         spin_lock_irqsave(&iommu->lock, flags);
1590         if (domain_type_is_vm_or_si(domain)) {
1591                 ndomains = cap_ndoms(iommu->cap);
1592                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1593                         if (iommu->domains[num] == domain) {
1594                                 clear_bit(num, iommu->domain_ids);
1595                                 iommu->domains[num] = NULL;
1596                                 break;
1597                         }
1598                 }
1599         } else {
1600                 clear_bit(domain->id, iommu->domain_ids);
1601                 iommu->domains[domain->id] = NULL;
1602         }
1603         spin_unlock_irqrestore(&iommu->lock, flags);
1604 }
1605
1606 static void domain_attach_iommu(struct dmar_domain *domain,
1607                                struct intel_iommu *iommu)
1608 {
1609         unsigned long flags;
1610
1611         spin_lock_irqsave(&domain->iommu_lock, flags);
1612         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1613                 domain->iommu_count++;
1614                 if (domain->iommu_count == 1)
1615                         domain->nid = iommu->node;
1616                 domain_update_iommu_cap(domain);
1617         }
1618         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1619 }
1620
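/*
 * Note on the return value: domain_detach_iommu() returns the number of
 * IOMMUs still referencing the domain once the bit for this iommu has been
 * cleared, or INT_MAX if the domain was never attached to it.
 * disable_dmar_iommu() above treats a return value of 0 as permission to
 * destroy non-VM domains.
 */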
1621 static int domain_detach_iommu(struct dmar_domain *domain,
1622                                struct intel_iommu *iommu)
1623 {
1624         unsigned long flags;
1625         int count = INT_MAX;
1626
1627         spin_lock_irqsave(&domain->iommu_lock, flags);
1628         if (test_and_clear_bit(iommu->seq_id, domain->iommu_bmp)) {
1629                 count = --domain->iommu_count;
1630                 domain_update_iommu_cap(domain);
1631         }
1632         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1633
1634         return count;
1635 }
1636
1637 static struct iova_domain reserved_iova_list;
1638 static struct lock_class_key reserved_rbtree_key;
1639
1640 static int dmar_init_reserved_ranges(void)
1641 {
1642         struct pci_dev *pdev = NULL;
1643         struct iova *iova;
1644         int i;
1645
1646         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1647
1648         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1649                 &reserved_rbtree_key);
1650
1651         /* IOAPIC ranges shouldn't be accessed by DMA */
1652         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1653                 IOVA_PFN(IOAPIC_RANGE_END));
1654         if (!iova) {
1655                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1656                 return -ENODEV;
1657         }
1658
1659         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1660         for_each_pci_dev(pdev) {
1661                 struct resource *r;
1662
1663                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1664                         r = &pdev->resource[i];
1665                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1666                                 continue;
1667                         iova = reserve_iova(&reserved_iova_list,
1668                                             IOVA_PFN(r->start),
1669                                             IOVA_PFN(r->end));
1670                         if (!iova) {
1671                                 printk(KERN_ERR "Reserve iova failed\n");
1672                                 return -ENODEV;
1673                         }
1674                 }
1675         }
1676         return 0;
1677 }
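
/*
 * Worked example (illustrative, assuming 4KiB pages): the IOAPIC window
 * 0xfee00000-0xfeefffff is reserved as PFNs 0xfee00-0xfeeff, and a 1MiB
 * PCI BAR at 0xe0000000 as PFNs 0xe0000-0xe00ff, so the IOVA allocator
 * never hands out addresses that alias MMIO and could be routed as
 * peer-to-peer transactions.
 */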
1678
1679 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1680 {
1681         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1682 }
1683
1684 static inline int guestwidth_to_adjustwidth(int gaw)
1685 {
1686         int agaw;
1687         int r = (gaw - 12) % 9;
1688
1689         if (r == 0)
1690                 agaw = gaw;
1691         else
1692                 agaw = gaw + 9 - r;
1693         if (agaw > 64)
1694                 agaw = 64;
1695         return agaw;
1696 }
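
/*
 * Worked example (illustrative): each page-table level translates 9 address
 * bits on top of the 12-bit page offset, so gaw = 48 gives r = 0 and
 * agaw = 48, while gaw = 40 gives r = (40 - 12) % 9 = 1 and
 * agaw = 40 + 9 - 1 = 48, i.e. the width is rounded up to the next level
 * boundary and capped at 64.
 */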
1697
1698 static int domain_init(struct dmar_domain *domain, int guest_width)
1699 {
1700         struct intel_iommu *iommu;
1701         int adjust_width, agaw;
1702         unsigned long sagaw;
1703
1704         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1705         domain_reserve_special_ranges(domain);
1706
1707         /* calculate AGAW */
1708         iommu = domain_get_iommu(domain);
1709         if (guest_width > cap_mgaw(iommu->cap))
1710                 guest_width = cap_mgaw(iommu->cap);
1711         domain->gaw = guest_width;
1712         adjust_width = guestwidth_to_adjustwidth(guest_width);
1713         agaw = width_to_agaw(adjust_width);
1714         sagaw = cap_sagaw(iommu->cap);
1715         if (!test_bit(agaw, &sagaw)) {
1716                 /* hardware doesn't support it, choose a bigger one */
1717                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1718                 agaw = find_next_bit(&sagaw, 5, agaw);
1719                 if (agaw >= 5)
1720                         return -ENODEV;
1721         }
1722         domain->agaw = agaw;
1723
1724         if (ecap_coherent(iommu->ecap))
1725                 domain->iommu_coherency = 1;
1726         else
1727                 domain->iommu_coherency = 0;
1728
1729         if (ecap_sc_support(iommu->ecap))
1730                 domain->iommu_snooping = 1;
1731         else
1732                 domain->iommu_snooping = 0;
1733
1734         if (intel_iommu_superpage)
1735                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1736         else
1737                 domain->iommu_superpage = 0;
1738
1739         domain->nid = iommu->node;
1740
1741         /* always allocate the top pgd */
1742         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1743         if (!domain->pgd)
1744                 return -ENOMEM;
1745         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1746         return 0;
1747 }
1748
1749 static void domain_exit(struct dmar_domain *domain)
1750 {
1751         struct dmar_drhd_unit *drhd;
1752         struct intel_iommu *iommu;
1753         struct page *freelist = NULL;
1754
1755         /* Domain 0 is reserved, so don't process it */
1756         if (!domain)
1757                 return;
1758
1759         /* Flush any lazy unmaps that may reference this domain */
1760         if (!intel_iommu_strict)
1761                 flush_unmaps_timeout(0);
1762
1763         /* remove associated devices */
1764         domain_remove_dev_info(domain);
1765
1766         /* destroy iovas */
1767         put_iova_domain(&domain->iovad);
1768
1769         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1770
1771         /* clear attached or cached domains */
1772         rcu_read_lock();
1773         for_each_active_iommu(iommu, drhd)
1774                 iommu_detach_domain(domain, iommu);
1775         rcu_read_unlock();
1776
1777         dma_free_pagelist(freelist);
1778
1779         free_domain_mem(domain);
1780 }
1781
1782 static int domain_context_mapping_one(struct dmar_domain *domain,
1783                                       struct intel_iommu *iommu,
1784                                       u8 bus, u8 devfn, int translation)
1785 {
1786         struct context_entry *context;
1787         unsigned long flags;
1788         struct dma_pte *pgd;
1789         int id;
1790         int agaw;
1791         struct device_domain_info *info = NULL;
1792
1793         pr_debug("Set context mapping for %02x:%02x.%d\n",
1794                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1795
1796         BUG_ON(!domain->pgd);
1797         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1798                translation != CONTEXT_TT_MULTI_LEVEL);
1799
1800         context = device_to_context_entry(iommu, bus, devfn);
1801         if (!context)
1802                 return -ENOMEM;
1803         spin_lock_irqsave(&iommu->lock, flags);
1804         if (context_present(context)) {
1805                 spin_unlock_irqrestore(&iommu->lock, flags);
1806                 return 0;
1807         }
1808
1809         id = domain->id;
1810         pgd = domain->pgd;
1811
1812         if (domain_type_is_vm_or_si(domain)) {
1813                 if (domain_type_is_vm(domain)) {
1814                         id = iommu_attach_vm_domain(domain, iommu);
1815                         if (id < 0) {
1816                                 spin_unlock_irqrestore(&iommu->lock, flags);
1817                                 pr_err("IOMMU: no free domain ids\n");
1818                                 return -EFAULT;
1819                         }
1820                 }
1821
1822                 /* Skip top levels of page tables for
1823                  * an iommu which has less agaw than the default.
1824                  * Unnecessary for PT mode.
1825                  */
1826                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1827                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1828                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1829                                 if (!dma_pte_present(pgd)) {
1830                                         spin_unlock_irqrestore(&iommu->lock, flags);
1831                                         return -ENOMEM;
1832                                 }
1833                         }
1834                 }
1835         }
1836
1837         context_set_domain_id(context, id);
1838
1839         if (translation != CONTEXT_TT_PASS_THROUGH) {
1840                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1841                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1842                                      CONTEXT_TT_MULTI_LEVEL;
1843         }
1844         /*
1845          * In pass through mode, AW must be programmed to indicate the largest
1846          * AGAW value supported by hardware. And ASR is ignored by hardware.
1847          */
1848         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1849                 context_set_address_width(context, iommu->msagaw);
1850         else {
1851                 context_set_address_root(context, virt_to_phys(pgd));
1852                 context_set_address_width(context, iommu->agaw);
1853         }
1854
1855         context_set_translation_type(context, translation);
1856         context_set_fault_enable(context);
1857         context_set_present(context);
1858         domain_flush_cache(domain, context, sizeof(*context));
1859
1860         /*
1861          * It's a non-present to present mapping. If hardware doesn't cache
1862          * non-present entries we only need to flush the write-buffer. If it
1863          * _does_ cache non-present entries, then it does so in the special
1864          * domain #0, which we have to flush:
1865          */
1866         if (cap_caching_mode(iommu->cap)) {
1867                 iommu->flush.flush_context(iommu, 0,
1868                                            (((u16)bus) << 8) | devfn,
1869                                            DMA_CCMD_MASK_NOBIT,
1870                                            DMA_CCMD_DEVICE_INVL);
1871                 iommu->flush.flush_iotlb(iommu, id, 0, 0, DMA_TLB_DSI_FLUSH);
1872         } else {
1873                 iommu_flush_write_buffer(iommu);
1874         }
1875         iommu_enable_dev_iotlb(info);
1876         spin_unlock_irqrestore(&iommu->lock, flags);
1877
1878         domain_attach_iommu(domain, iommu);
1879
1880         return 0;
1881 }
1882
1883 struct domain_context_mapping_data {
1884         struct dmar_domain *domain;
1885         struct intel_iommu *iommu;
1886         int translation;
1887 };
1888
1889 static int domain_context_mapping_cb(struct pci_dev *pdev,
1890                                      u16 alias, void *opaque)
1891 {
1892         struct domain_context_mapping_data *data = opaque;
1893
1894         return domain_context_mapping_one(data->domain, data->iommu,
1895                                           PCI_BUS_NUM(alias), alias & 0xff,
1896                                           data->translation);
1897 }
1898
1899 static int
1900 domain_context_mapping(struct dmar_domain *domain, struct device *dev,
1901                        int translation)
1902 {
1903         struct intel_iommu *iommu;
1904         u8 bus, devfn;
1905         struct domain_context_mapping_data data;
1906
1907         iommu = device_to_iommu(dev, &bus, &devfn);
1908         if (!iommu)
1909                 return -ENODEV;
1910
1911         if (!dev_is_pci(dev))
1912                 return domain_context_mapping_one(domain, iommu, bus, devfn,
1913                                                   translation);
1914
1915         data.domain = domain;
1916         data.iommu = iommu;
1917         data.translation = translation;
1918
1919         return pci_for_each_dma_alias(to_pci_dev(dev),
1920                                       &domain_context_mapping_cb, &data);
1921 }
1922
1923 static int domain_context_mapped_cb(struct pci_dev *pdev,
1924                                     u16 alias, void *opaque)
1925 {
1926         struct intel_iommu *iommu = opaque;
1927
1928         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
1929 }
1930
1931 static int domain_context_mapped(struct device *dev)
1932 {
1933         struct intel_iommu *iommu;
1934         u8 bus, devfn;
1935
1936         iommu = device_to_iommu(dev, &bus, &devfn);
1937         if (!iommu)
1938                 return -ENODEV;
1939
1940         if (!dev_is_pci(dev))
1941                 return device_context_mapped(iommu, bus, devfn);
1942
1943         return !pci_for_each_dma_alias(to_pci_dev(dev),
1944                                        domain_context_mapped_cb, iommu);
1945 }
1946
1947 /* Returns a number of VTD pages, but aligned to MM page size */
1948 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1949                                             size_t size)
1950 {
1951         host_addr &= ~PAGE_MASK;
1952         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1953 }
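
/*
 * Worked example (illustrative, assuming 4KiB MM and VT-d pages): for
 * host_addr = 0x1fff and size = 2, the in-page offset is 0xfff, so
 * PAGE_ALIGN(0xfff + 2) = 0x2000 and the result is 2 VT-d pages even
 * though only two bytes were requested.
 */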
1954
1955 /* Return largest possible superpage level for a given mapping */
1956 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1957                                           unsigned long iov_pfn,
1958                                           unsigned long phy_pfn,
1959                                           unsigned long pages)
1960 {
1961         int support, level = 1;
1962         unsigned long pfnmerge;
1963
1964         support = domain->iommu_superpage;
1965
1966         /* To use a large page, the virtual *and* physical addresses
1967            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1968            of them will mean we have to use smaller pages. So just
1969            merge them and check both at once. */
1970         pfnmerge = iov_pfn | phy_pfn;
1971
1972         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1973                 pages >>= VTD_STRIDE_SHIFT;
1974                 if (!pages)
1975                         break;
1976                 pfnmerge >>= VTD_STRIDE_SHIFT;
1977                 level++;
1978                 support--;
1979         }
1980         return level;
1981 }
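
/*
 * Worked example (illustrative): with 2MiB superpages supported
 * (domain->iommu_superpage >= 1), an IOVA PFN and a physical PFN that are
 * both 2MiB aligned (low 9 bits of iov_pfn | phy_pfn clear) and a run of
 * at least 512 pages yield level 2, letting __domain_mapping() below write
 * one large PTE instead of 512 small ones.
 */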
1982
1983 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1984                             struct scatterlist *sg, unsigned long phys_pfn,
1985                             unsigned long nr_pages, int prot)
1986 {
1987         struct dma_pte *first_pte = NULL, *pte = NULL;
1988         phys_addr_t uninitialized_var(pteval);
1989         unsigned long sg_res;
1990         unsigned int largepage_lvl = 0;
1991         unsigned long lvl_pages = 0;
1992
1993         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
1994
1995         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1996                 return -EINVAL;
1997
1998         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1999
2000         if (sg)
2001                 sg_res = 0;
2002         else {
2003                 sg_res = nr_pages + 1;
2004                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2005         }
2006
2007         while (nr_pages > 0) {
2008                 uint64_t tmp;
2009
2010                 if (!sg_res) {
2011                         sg_res = aligned_nrpages(sg->offset, sg->length);
2012                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2013                         sg->dma_length = sg->length;
2014                         pteval = page_to_phys(sg_page(sg)) | prot;
2015                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2016                 }
2017
2018                 if (!pte) {
2019                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2020
2021                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2022                         if (!pte)
2023                                 return -ENOMEM;
2024                         /* It is a large page */
2025                         if (largepage_lvl > 1) {
2026                                 pteval |= DMA_PTE_LARGE_PAGE;
2027                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2028                                 /*
2029                                  * Ensure that old small page tables are
2030                                  * removed to make room for superpage,
2031                                  * if they exist.
2032                                  */
2033                                 dma_pte_free_pagetable(domain, iov_pfn,
2034                                                        iov_pfn + lvl_pages - 1);
2035                         } else {
2036                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2037                         }
2038
2039                 }
2040                 /* We don't need a lock here; nobody else
2041                  * touches this iova range.
2042                  */
2043                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2044                 if (tmp) {
2045                         static int dumps = 5;
2046                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2047                                iov_pfn, tmp, (unsigned long long)pteval);
2048                         if (dumps) {
2049                                 dumps--;
2050                                 debug_dma_dump_mappings(NULL);
2051                         }
2052                         WARN_ON(1);
2053                 }
2054
2055                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2056
2057                 BUG_ON(nr_pages < lvl_pages);
2058                 BUG_ON(sg_res < lvl_pages);
2059
2060                 nr_pages -= lvl_pages;
2061                 iov_pfn += lvl_pages;
2062                 phys_pfn += lvl_pages;
2063                 pteval += lvl_pages * VTD_PAGE_SIZE;
2064                 sg_res -= lvl_pages;
2065
2066                 /* If the next PTE would be the first in a new page, then we
2067                    need to flush the cache on the entries we've just written.
2068                    And then we'll need to recalculate 'pte', so clear it and
2069                    let it get set again in the if (!pte) block above.
2070
2071                    If we're done (!nr_pages) we need to flush the cache too.
2072
2073                    Also if we've been setting superpages, we may need to
2074                    recalculate 'pte' and switch back to smaller pages for the
2075                    end of the mapping, if the trailing size is not enough to
2076                    use another superpage (i.e. sg_res < lvl_pages). */
2077                 pte++;
2078                 if (!nr_pages || first_pte_in_page(pte) ||
2079                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2080                         domain_flush_cache(domain, first_pte,
2081                                            (void *)pte - (void *)first_pte);
2082                         pte = NULL;
2083                 }
2084
2085                 if (!sg_res && nr_pages)
2086                         sg = sg_next(sg);
2087         }
2088         return 0;
2089 }
2090
2091 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2092                                     struct scatterlist *sg, unsigned long nr_pages,
2093                                     int prot)
2094 {
2095         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2096 }
2097
2098 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2099                                      unsigned long phys_pfn, unsigned long nr_pages,
2100                                      int prot)
2101 {
2102         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2103 }
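
/*
 * Usage sketch (illustrative): callers pick the wrapper matching their
 * input.  Identity-mapping a physically contiguous range, for instance,
 * looks roughly like
 *
 *	domain_pfn_mapping(domain, first_vpfn, first_vpfn,
 *			   last_vpfn - first_vpfn + 1,
 *			   DMA_PTE_READ|DMA_PTE_WRITE);
 *
 * while DMA-API scatterlist mappings go through domain_sg_mapping().
 */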
2104
2105 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2106 {
2107         if (!iommu)
2108                 return;
2109
2110         clear_context_table(iommu, bus, devfn);
2111         iommu->flush.flush_context(iommu, 0, 0, 0,
2112                                            DMA_CCMD_GLOBAL_INVL);
2113         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2114 }
2115
2116 static inline void unlink_domain_info(struct device_domain_info *info)
2117 {
2118         assert_spin_locked(&device_domain_lock);
2119         list_del(&info->link);
2120         list_del(&info->global);
2121         if (info->dev)
2122                 info->dev->archdata.iommu = NULL;
2123 }
2124
2125 static void domain_remove_dev_info(struct dmar_domain *domain)
2126 {
2127         struct device_domain_info *info, *tmp;
2128         unsigned long flags;
2129
2130         spin_lock_irqsave(&device_domain_lock, flags);
2131         list_for_each_entry_safe(info, tmp, &domain->devices, link) {
2132                 unlink_domain_info(info);
2133                 spin_unlock_irqrestore(&device_domain_lock, flags);
2134
2135                 iommu_disable_dev_iotlb(info);
2136                 iommu_detach_dev(info->iommu, info->bus, info->devfn);
2137
2138                 if (domain_type_is_vm(domain)) {
2139                         iommu_detach_dependent_devices(info->iommu, info->dev);
2140                         domain_detach_iommu(domain, info->iommu);
2141                 }
2142
2143                 free_devinfo_mem(info);
2144                 spin_lock_irqsave(&device_domain_lock, flags);
2145         }
2146         spin_unlock_irqrestore(&device_domain_lock, flags);
2147 }
2148
2149 /*
2150  * find_domain
2151  * Note: we use struct device->archdata.iommu to store the info
2152  */
2153 static struct dmar_domain *find_domain(struct device *dev)
2154 {
2155         struct device_domain_info *info;
2156
2157         /* No lock here, assumes no domain exit in normal case */
2158         info = dev->archdata.iommu;
2159         if (info)
2160                 return info->domain;
2161         return NULL;
2162 }
2163
2164 static inline struct device_domain_info *
2165 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2166 {
2167         struct device_domain_info *info;
2168
2169         list_for_each_entry(info, &device_domain_list, global)
2170                 if (info->iommu->segment == segment && info->bus == bus &&
2171                     info->devfn == devfn)
2172                         return info;
2173
2174         return NULL;
2175 }
2176
2177 static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
2178                                                 int bus, int devfn,
2179                                                 struct device *dev,
2180                                                 struct dmar_domain *domain)
2181 {
2182         struct dmar_domain *found = NULL;
2183         struct device_domain_info *info;
2184         unsigned long flags;
2185
2186         info = alloc_devinfo_mem();
2187         if (!info)
2188                 return NULL;
2189
2190         info->bus = bus;
2191         info->devfn = devfn;
2192         info->dev = dev;
2193         info->domain = domain;
2194         info->iommu = iommu;
2195
2196         spin_lock_irqsave(&device_domain_lock, flags);
2197         if (dev)
2198                 found = find_domain(dev);
2199         else {
2200                 struct device_domain_info *info2;
2201                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2202                 if (info2)
2203                         found = info2->domain;
2204         }
2205         if (found) {
2206                 spin_unlock_irqrestore(&device_domain_lock, flags);
2207                 free_devinfo_mem(info);
2208                 /* Caller must free the original domain */
2209                 return found;
2210         }
2211
2212         list_add(&info->link, &domain->devices);
2213         list_add(&info->global, &device_domain_list);
2214         if (dev)
2215                 dev->archdata.iommu = info;
2216         spin_unlock_irqrestore(&device_domain_lock, flags);
2217
2218         return domain;
2219 }
2220
2221 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2222 {
2223         *(u16 *)opaque = alias;
2224         return 0;
2225 }
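
/*
 * Illustrative note: pci_for_each_dma_alias() reports each alias as a u16
 * with the bus number in the high byte and the devfn in the low byte.  An
 * alias of 0x0308, for example, decodes as bus 0x03, slot 1, function 0,
 * which is why callers split it with PCI_BUS_NUM(alias) and alias & 0xff.
 */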
2226
2227 /* domain is initialized */
2228 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2229 {
2230         struct dmar_domain *domain, *tmp;
2231         struct intel_iommu *iommu;
2232         struct device_domain_info *info;
2233         u16 dma_alias;
2234         unsigned long flags;
2235         u8 bus, devfn;
2236
2237         domain = find_domain(dev);
2238         if (domain)
2239                 return domain;
2240
2241         iommu = device_to_iommu(dev, &bus, &devfn);
2242         if (!iommu)
2243                 return NULL;
2244
2245         if (dev_is_pci(dev)) {
2246                 struct pci_dev *pdev = to_pci_dev(dev);
2247
2248                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2249
2250                 spin_lock_irqsave(&device_domain_lock, flags);
2251                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2252                                                       PCI_BUS_NUM(dma_alias),
2253                                                       dma_alias & 0xff);
2254                 if (info) {
2255                         iommu = info->iommu;
2256                         domain = info->domain;
2257                 }
2258                 spin_unlock_irqrestore(&device_domain_lock, flags);
2259
2260                 /* DMA alias already has a domain, use it */
2261                 if (info)
2262                         goto found_domain;
2263         }
2264
2265         /* Allocate and initialize new domain for the device */
2266         domain = alloc_domain(0);
2267         if (!domain)
2268                 return NULL;
2269         domain->id = iommu_attach_domain(domain, iommu);
2270         if (domain->id < 0) {
2271                 free_domain_mem(domain);
2272                 return NULL;
2273         }
2274         domain_attach_iommu(domain, iommu);
2275         if (domain_init(domain, gaw)) {
2276                 domain_exit(domain);
2277                 return NULL;
2278         }
2279
2280         /* register PCI DMA alias device */
2281         if (dev_is_pci(dev)) {
2282                 tmp = dmar_insert_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2283                                            dma_alias & 0xff, NULL, domain);
2284
2285                 if (!tmp || tmp != domain) {
2286                         domain_exit(domain);
2287                         domain = tmp;
2288                 }
2289
2290                 if (!domain)
2291                         return NULL;
2292         }
2293
2294 found_domain:
2295         tmp = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2296
2297         if (!tmp || tmp != domain) {
2298                 domain_exit(domain);
2299                 domain = tmp;
2300         }
2301
2302         return domain;
2303 }
2304
2305 static int iommu_identity_mapping;
2306 #define IDENTMAP_ALL            1
2307 #define IDENTMAP_GFX            2
2308 #define IDENTMAP_AZALIA         4
2309
2310 static int iommu_domain_identity_map(struct dmar_domain *domain,
2311                                      unsigned long long start,
2312                                      unsigned long long end)
2313 {
2314         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2315         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2316
2317         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2318                           dma_to_mm_pfn(last_vpfn))) {
2319                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2320                 return -ENOMEM;
2321         }
2322
2323         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2324                  start, end, domain->id);
2325         /*
2326          * The RMRR range might overlap with a physical memory range,
2327          * so clear it first
2328          */
2329         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2330
2331         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2332                                   last_vpfn - first_vpfn + 1,
2333                                   DMA_PTE_READ|DMA_PTE_WRITE);
2334 }
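
/*
 * Worked example (illustrative, assuming 4KiB pages): the ISA/LPC unity
 * mapping set up via iommu_prepare_isa() below ends up here with start = 0
 * and end = 16MiB - 1, i.e. first_vpfn = 0 and last_vpfn = 0xfff, so 4096
 * pages are reserved in the domain's IOVA space and mapped 1:1.
 */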
2335
2336 static int iommu_prepare_identity_map(struct device *dev,
2337                                       unsigned long long start,
2338                                       unsigned long long end)
2339 {
2340         struct dmar_domain *domain;
2341         int ret;
2342
2343         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2344         if (!domain)
2345                 return -ENOMEM;
2346
2347         /* For _hardware_ passthrough, don't bother. But for software
2348            passthrough, we do it anyway -- it may indicate a memory
2349            range which is reserved in E820 and so didn't get set
2350            up to start with in si_domain */
2351         if (domain == si_domain && hw_pass_through) {
2352                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2353                        dev_name(dev), start, end);
2354                 return 0;
2355         }
2356
2357         printk(KERN_INFO
2358                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2359                dev_name(dev), start, end);
2360
2361         if (end < start) {
2362                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2363                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2364                         dmi_get_system_info(DMI_BIOS_VENDOR),
2365                         dmi_get_system_info(DMI_BIOS_VERSION),
2366                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2367                 ret = -EIO;
2368                 goto error;
2369         }
2370
2371         if (end >> agaw_to_width(domain->agaw)) {
2372                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2373                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2374                      agaw_to_width(domain->agaw),
2375                      dmi_get_system_info(DMI_BIOS_VENDOR),
2376                      dmi_get_system_info(DMI_BIOS_VERSION),
2377                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2378                 ret = -EIO;
2379                 goto error;
2380         }
2381
2382         ret = iommu_domain_identity_map(domain, start, end);
2383         if (ret)
2384                 goto error;
2385
2386         /* context entry init */
2387         ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2388         if (ret)
2389                 goto error;
2390
2391         return 0;
2392
2393  error:
2394         domain_exit(domain);
2395         return ret;
2396 }
2397
2398 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2399                                          struct device *dev)
2400 {
2401         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2402                 return 0;
2403         return iommu_prepare_identity_map(dev, rmrr->base_address,
2404                                           rmrr->end_address);
2405 }
2406
2407 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2408 static inline void iommu_prepare_isa(void)
2409 {
2410         struct pci_dev *pdev;
2411         int ret;
2412
2413         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2414         if (!pdev)
2415                 return;
2416
2417         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2418         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2419
2420         if (ret)
2421                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2422                        "floppy might not work\n");
2423
2424         pci_dev_put(pdev);
2425 }
2426 #else
2427 static inline void iommu_prepare_isa(void)
2428 {
2429         return;
2430 }
2431 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2432
2433 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2434
2435 static int __init si_domain_init(int hw)
2436 {
2437         struct dmar_drhd_unit *drhd;
2438         struct intel_iommu *iommu;
2439         int nid, ret = 0;
2440         bool first = true;
2441
2442         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2443         if (!si_domain)
2444                 return -EFAULT;
2445
2446         for_each_active_iommu(iommu, drhd) {
2447                 ret = iommu_attach_domain(si_domain, iommu);
2448                 if (ret < 0) {
2449                         domain_exit(si_domain);
2450                         return -EFAULT;
2451                 } else if (first) {
2452                         si_domain->id = ret;
2453                         first = false;
2454                 } else if (si_domain->id != ret) {
2455                         domain_exit(si_domain);
2456                         return -EFAULT;
2457                 }
2458                 domain_attach_iommu(si_domain, iommu);
2459         }
2460
2461         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2462                 domain_exit(si_domain);
2463                 return -EFAULT;
2464         }
2465
2466         pr_debug("IOMMU: identity mapping domain is domain %d\n",
2467                  si_domain->id);
2468
2469         if (hw)
2470                 return 0;
2471
2472         for_each_online_node(nid) {
2473                 unsigned long start_pfn, end_pfn;
2474                 int i;
2475
2476                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2477                         ret = iommu_domain_identity_map(si_domain,
2478                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2479                         if (ret)
2480                                 return ret;
2481                 }
2482         }
2483
2484         return 0;
2485 }
2486
2487 static int identity_mapping(struct device *dev)
2488 {
2489         struct device_domain_info *info;
2490
2491         if (likely(!iommu_identity_mapping))
2492                 return 0;
2493
2494         info = dev->archdata.iommu;
2495         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2496                 return (info->domain == si_domain);
2497
2498         return 0;
2499 }
2500
2501 static int domain_add_dev_info(struct dmar_domain *domain,
2502                                struct device *dev, int translation)
2503 {
2504         struct dmar_domain *ndomain;
2505         struct intel_iommu *iommu;
2506         u8 bus, devfn;
2507         int ret;
2508
2509         iommu = device_to_iommu(dev, &bus, &devfn);
2510         if (!iommu)
2511                 return -ENODEV;
2512
2513         ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2514         if (ndomain != domain)
2515                 return -EBUSY;
2516
2517         ret = domain_context_mapping(domain, dev, translation);
2518         if (ret) {
2519                 domain_remove_one_dev_info(domain, dev);
2520                 return ret;
2521         }
2522
2523         return 0;
2524 }
2525
2526 static bool device_has_rmrr(struct device *dev)
2527 {
2528         struct dmar_rmrr_unit *rmrr;
2529         struct device *tmp;
2530         int i;
2531
2532         rcu_read_lock();
2533         for_each_rmrr_units(rmrr) {
2534                 /*
2535                  * Return TRUE if this RMRR contains the device that
2536                  * is passed in.
2537                  */
2538                 for_each_active_dev_scope(rmrr->devices,
2539                                           rmrr->devices_cnt, i, tmp)
2540                         if (tmp == dev) {
2541                                 rcu_read_unlock();
2542                                 return true;
2543                         }
2544         }
2545         rcu_read_unlock();
2546         return false;
2547 }
2548
2549 /*
2550  * There are a couple cases where we need to restrict the functionality of
2551  * devices associated with RMRRs.  The first is when evaluating a device for
2552  * identity mapping because problems exist when devices are moved in and out
2553  * of domains and their respective RMRR information is lost.  This means that
2554  * a device with associated RMRRs will never be in a "passthrough" domain.
2555  * The second is use of the device through the IOMMU API.  This interface
2556  * expects to have full control of the IOVA space for the device.  We cannot
2557  * satisfy both the requirement that RMRR access is maintained and have an
2558  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2559  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2560  * We therefore prevent devices associated with an RMRR from participating in
2561  * the IOMMU API, which eliminates them from device assignment.
2562  *
2563  * In both cases we assume that PCI USB devices with RMRRs have them largely
2564  * for historical reasons and that the RMRR space is not actively used post
2565  * boot.  This exclusion may change if vendors begin to abuse it.
2566  */
2567 static bool device_is_rmrr_locked(struct device *dev)
2568 {
2569         if (!device_has_rmrr(dev))
2570                 return false;
2571
2572         if (dev_is_pci(dev)) {
2573                 struct pci_dev *pdev = to_pci_dev(dev);
2574
2575                 if ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
2576                         return false;
2577         }
2578
2579         return true;
2580 }
2581
2582 static int iommu_should_identity_map(struct device *dev, int startup)
2583 {
2584
2585         if (dev_is_pci(dev)) {
2586                 struct pci_dev *pdev = to_pci_dev(dev);
2587
2588                 if (device_is_rmrr_locked(dev))
2589                         return 0;
2590
2591                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2592                         return 1;
2593
2594                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2595                         return 1;
2596
2597                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2598                         return 0;
2599
2600                 /*
2601                  * We want to start off with all devices in the 1:1 domain, and
2602                  * take them out later if we find they can't access all of memory.
2603                  *
2604                  * However, we can't do this for PCI devices behind bridges,
2605                  * because all PCI devices behind the same bridge will end up
2606                  * with the same source-id on their transactions.
2607                  *
2608                  * Practically speaking, we can't change things around for these
2609                  * devices at run-time, because we can't be sure there'll be no
2610                  * DMA transactions in flight for any of their siblings.
2611                  *
2612                  * So PCI devices (unless they're on the root bus) as well as
2613                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2614                  * the 1:1 domain, just in _case_ one of their siblings turns out
2615                  * not to be able to map all of memory.
2616                  */
2617                 if (!pci_is_pcie(pdev)) {
2618                         if (!pci_is_root_bus(pdev->bus))
2619                                 return 0;
2620                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2621                                 return 0;
2622                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2623                         return 0;
2624         } else {
2625                 if (device_has_rmrr(dev))
2626                         return 0;
2627         }
2628
2629         /*
2630          * At boot time, we don't yet know if devices will be 64-bit capable.
2631          * Assume that they will — if they turn out not to be, then we can
2632          * take them out of the 1:1 domain later.
2633          */
2634         if (!startup) {
2635                 /*
2636                  * If the device's dma_mask is less than the system's memory
2637                  * size then this is not a candidate for identity mapping.
2638                  */
2639                 u64 dma_mask = *dev->dma_mask;
2640
2641                 if (dev->coherent_dma_mask &&
2642                     dev->coherent_dma_mask < dma_mask)
2643                         dma_mask = dev->coherent_dma_mask;
2644
2645                 return dma_mask >= dma_get_required_mask(dev);
2646         }
2647
2648         return 1;
2649 }
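
/*
 * Worked example (illustrative): on a host with 8GiB of RAM,
 * dma_get_required_mask() reports a mask wider than 32 bits, so a device
 * limited to DMA_BIT_MASK(32) fails the check above at run time and is
 * later taken out of the 1:1 domain, while a device advertising
 * DMA_BIT_MASK(64) stays identity mapped.
 */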
2650
2651 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2652 {
2653         int ret;
2654
2655         if (!iommu_should_identity_map(dev, 1))
2656                 return 0;
2657
2658         ret = domain_add_dev_info(si_domain, dev,
2659                                   hw ? CONTEXT_TT_PASS_THROUGH :
2660                                        CONTEXT_TT_MULTI_LEVEL);
2661         if (!ret)
2662                 pr_info("IOMMU: %s identity mapping for device %s\n",
2663                         hw ? "hardware" : "software", dev_name(dev));
2664         else if (ret == -ENODEV)
2665                 /* device not associated with an iommu */
2666                 ret = 0;
2667
2668         return ret;
2669 }
2670
2671
2672 static int __init iommu_prepare_static_identity_mapping(int hw)
2673 {
2674         struct pci_dev *pdev = NULL;
2675         struct dmar_drhd_unit *drhd;
2676         struct intel_iommu *iommu;
2677         struct device *dev;
2678         int i;
2679         int ret = 0;
2680
2681         ret = si_domain_init(hw);
2682         if (ret)
2683                 return -EFAULT;
2684
2685         for_each_pci_dev(pdev) {
2686                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2687                 if (ret)
2688                         return ret;
2689         }
2690
2691         for_each_active_iommu(iommu, drhd)
2692                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2693                         struct acpi_device_physical_node *pn;
2694                         struct acpi_device *adev;
2695
2696                         if (dev->bus != &acpi_bus_type)
2697                                 continue;
2698
2699                         adev = to_acpi_device(dev);
2700                         mutex_lock(&adev->physical_node_lock);
2701                         list_for_each_entry(pn, &adev->physical_node_list, node) {
2702                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2703                                 if (ret)
2704                                         break;
2705                         }
2706                         mutex_unlock(&adev->physical_node_lock);
2707                         if (ret)
2708                                 return ret;
2709                 }
2710
2711         return 0;
2712 }
2713
2714 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2715 {
2716         /*
2717          * Start from a sane iommu hardware state.
2718          * If queued invalidation was already initialized by us
2719          * (for example, while enabling interrupt-remapping) then
2720          * things are already rolling from a sane state.
2721          */
2722         if (!iommu->qi) {
2723                 /*
2724                  * Clear any previous faults.
2725                  */
2726                 dmar_fault(-1, iommu);
2727                 /*
2728                  * Disable queued invalidation if supported and already enabled
2729                  * before OS handover.
2730                  */
2731                 dmar_disable_qi(iommu);
2732         }
2733
2734         if (dmar_enable_qi(iommu)) {
2735                 /*
2736                  * Queued Invalidate not enabled, use Register Based Invalidate
2737                  */
2738                 iommu->flush.flush_context = __iommu_flush_context;
2739                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2740                 pr_info("IOMMU: %s using Register based invalidation\n",
2741                         iommu->name);
2742         } else {
2743                 iommu->flush.flush_context = qi_flush_context;
2744                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2745                 pr_info("IOMMU: %s using Queued invalidation\n", iommu->name);
2746         }
2747 }
2748
2749 static int __init init_dmars(void)
2750 {
2751         struct dmar_drhd_unit *drhd;
2752         struct dmar_rmrr_unit *rmrr;
2753         struct device *dev;
2754         struct intel_iommu *iommu;
2755         int i, ret;
2756
2757         /*
2758          * for each drhd
2759          *    allocate root
2760          *    initialize and program root entry to not present
2761          * endfor
2762          */
2763         for_each_drhd_unit(drhd) {
2764                 /*
2765                  * lock not needed as this is only incremented in the single-
2766                  * threaded kernel __init code path; all other accesses are
2767                  * read-only
2768                  */
2769                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
2770                         g_num_of_iommus++;
2771                         continue;
2772                 }
2773                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2774                           DMAR_UNITS_SUPPORTED);
2775         }
2776
2777         /* Preallocate enough resources for IOMMU hot-addition */
2778         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
2779                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
2780
2781         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2782                         GFP_KERNEL);
2783         if (!g_iommus) {
2784                 printk(KERN_ERR "Allocating global iommu array failed\n");
2785                 ret = -ENOMEM;
2786                 goto error;
2787         }
2788
2789         deferred_flush = kzalloc(g_num_of_iommus *
2790                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2791         if (!deferred_flush) {
2792                 ret = -ENOMEM;
2793                 goto free_g_iommus;
2794         }
2795
2796         for_each_active_iommu(iommu, drhd) {
2797                 g_iommus[iommu->seq_id] = iommu;
2798
2799                 ret = iommu_init_domains(iommu);
2800                 if (ret)
2801                         goto free_iommu;
2802
2803                 /*
2804                  * TBD:
2805                  * we could share the same root & context tables
2806                  * we could share the same root & context tables
2807                  */
2808                 ret = iommu_alloc_root_entry(iommu);
2809                 if (ret)
2810                         goto free_iommu;
2811                 if (!ecap_pass_through(iommu->ecap))
2812                         hw_pass_through = 0;
2813         }
2814
2815         for_each_active_iommu(iommu, drhd)
2816                 intel_iommu_init_qi(iommu);
2817
2818         if (iommu_pass_through)
2819                 iommu_identity_mapping |= IDENTMAP_ALL;
2820
2821 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2822         iommu_identity_mapping |= IDENTMAP_GFX;
2823 #endif
2824
2825         check_tylersburg_isoch();
2826
2827         /*
2828          * If pass through is not set or not enabled, set up context entries
2829          * for identity mappings for rmrr, gfx, and isa, and may fall back to
2830          * static identity mapping if iommu_identity_mapping is set.
2831          */
2832         if (iommu_identity_mapping) {
2833                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2834                 if (ret) {
2835                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2836                         goto free_iommu;
2837                 }
2838         }
2839         /*
2840          * For each rmrr
2841          *   for each dev attached to rmrr
2842          *   do
2843          *     locate drhd for dev, alloc domain for dev
2844          *     allocate free domain
2845          *     allocate page table entries for rmrr
2846          *     if context not allocated for bus
2847          *           allocate and init context
2848          *           set present in root table for this bus
2849          *     init context with domain, translation etc
2850          *    endfor
2851          * endfor
2852          */
2853         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2854         for_each_rmrr_units(rmrr) {
2855                 /* some BIOSes list non-existent devices in the DMAR table. */
2856                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2857                                           i, dev) {
2858                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
2859                         if (ret)
2860                                 printk(KERN_ERR
2861                                        "IOMMU: mapping reserved region failed\n");
2862                 }
2863         }
2864
2865         iommu_prepare_isa();
2866
2867         /*
2868          * for each drhd
2869          *   enable fault log
2870          *   global invalidate context cache
2871          *   global invalidate iotlb
2872          *   enable translation
2873          */
2874         for_each_iommu(iommu, drhd) {
2875                 if (drhd->ignored) {
2876                         /*
2877                          * we always have to disable PMRs or DMA may fail on
2878                          * this device
2879                          */
2880                         if (force_on)
2881                                 iommu_disable_protect_mem_regions(iommu);
2882                         continue;
2883                 }
2884
2885                 iommu_flush_write_buffer(iommu);
2886
2887                 ret = dmar_set_interrupt(iommu);
2888                 if (ret)
2889                         goto free_iommu;
2890
2891                 iommu_set_root_entry(iommu);
2892
2893                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2894                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2895                 iommu_enable_translation(iommu);
2896                 iommu_disable_protect_mem_regions(iommu);
2897         }
2898
2899         return 0;
2900
2901 free_iommu:
2902         for_each_active_iommu(iommu, drhd) {
2903                 disable_dmar_iommu(iommu);
2904                 free_dmar_iommu(iommu);
2905         }
2906         kfree(deferred_flush);
2907 free_g_iommus:
2908         kfree(g_iommus);
2909 error:
2910         return ret;
2911 }
2912
2913 /* This takes a number of _MM_ pages, not VTD pages */
2914 static struct iova *intel_alloc_iova(struct device *dev,
2915                                      struct dmar_domain *domain,
2916                                      unsigned long nrpages, uint64_t dma_mask)
2917 {
2918         struct iova *iova = NULL;
2919
2920         /* Restrict dma_mask to the width that the iommu can handle */
2921         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2922
2923         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2924                 /*
2925                  * First try to allocate an io virtual address in
2926                  * DMA_BIT_MASK(32) and if that fails then try allocating
2927                  * from higher range
2928                  */
2929                 iova = alloc_iova(&domain->iovad, nrpages,
2930                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2931                 if (iova)
2932                         return iova;
2933         }
2934         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2935         if (unlikely(!iova)) {
2936                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2937                        nrpages, dev_name(dev));
2938                 return NULL;
2939         }
2940
2941         return iova;
2942 }
2943
2944 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
2945 {
2946         struct dmar_domain *domain;
2947         int ret;
2948
2949         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2950         if (!domain) {
2951                 printk(KERN_ERR "Allocating domain for %s failed\n",
2952                        dev_name(dev));
2953                 return NULL;
2954         }
2955
2956         /* make sure context mapping is ok */
2957         if (unlikely(!domain_context_mapped(dev))) {
2958                 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2959                 if (ret) {
2960                         printk(KERN_ERR "Domain context map for %s failed\n",
2961                                dev_name(dev));
2962                         return NULL;
2963                 }
2964         }
2965
2966         return domain;
2967 }
2968
2969 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
2970 {
2971         struct device_domain_info *info;
2972
2973         /* No lock here, assumes no domain exit in normal case */
2974         info = dev->archdata.iommu;
2975         if (likely(info))
2976                 return info->domain;
2977
2978         return __get_valid_domain_for_dev(dev);
2979 }
2980
2981 static int iommu_dummy(struct device *dev)
2982 {
2983         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2984 }
2985
2986 /* Check if the dev needs to go through the non-identity map and unmap process. */
2987 static int iommu_no_mapping(struct device *dev)
2988 {
2989         int found;
2990
2991         if (iommu_dummy(dev))
2992                 return 1;
2993
2994         if (!iommu_identity_mapping)
2995                 return 0;
2996
2997         found = identity_mapping(dev);
2998         if (found) {
2999                 if (iommu_should_identity_map(dev, 0))
3000                         return 1;
3001                 else {
3002                         /*
3003                          * The 32 bit DMA device is removed from si_domain and
3004                          * falls back to non-identity mapping.
3005                          */
3006                         domain_remove_one_dev_info(si_domain, dev);
3007                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
3008                                dev_name(dev));
3009                         return 0;
3010                 }
3011         } else {
3012                 /*
3013                  * A 64 bit DMA device detached from a VM is put back into
3014                  * si_domain for identity mapping.
3015                  */
3016                 if (iommu_should_identity_map(dev, 0)) {
3017                         int ret;
3018                         ret = domain_add_dev_info(si_domain, dev,
3019                                                   hw_pass_through ?
3020                                                   CONTEXT_TT_PASS_THROUGH :
3021                                                   CONTEXT_TT_MULTI_LEVEL);
3022                         if (!ret) {
3023                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
3024                                        dev_name(dev));
3025                                 return 1;
3026                         }
3027                 }
3028         }
3029
3030         return 0;
3031 }
3032
3033 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3034                                      size_t size, int dir, u64 dma_mask)
3035 {
3036         struct dmar_domain *domain;
3037         phys_addr_t start_paddr;
3038         struct iova *iova;
3039         int prot = 0;
3040         int ret;
3041         struct intel_iommu *iommu;
3042         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3043
3044         BUG_ON(dir == DMA_NONE);
3045
3046         if (iommu_no_mapping(dev))
3047                 return paddr;
3048
3049         domain = get_valid_domain_for_dev(dev);
3050         if (!domain)
3051                 return 0;
3052
3053         iommu = domain_get_iommu(domain);
3054         size = aligned_nrpages(paddr, size);
3055
3056         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3057         if (!iova)
3058                 goto error;
3059
3060         /*
3061          * Check if DMAR supports zero-length reads on write-only
3062          * mappings.
3063          */
3064         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3065                         !cap_zlr(iommu->cap))
3066                 prot |= DMA_PTE_READ;
3067         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3068                 prot |= DMA_PTE_WRITE;
3069         /*
3070          * paddr - (paddr + size) might span a partial page, so we map the whole
3071          * page.  Note: if two parts of one page are separately mapped, we
3072          * might have two guest_addrs mapping to the same host paddr, but this
3073          * is not a big problem.
3074          */
3075         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3076                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3077         if (ret)
3078                 goto error;
3079
3080         /* it's a non-present to present mapping. Only flush if caching mode */
3081         if (cap_caching_mode(iommu->cap))
3082                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
3083         else
3084                 iommu_flush_write_buffer(iommu);
3085
3086         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3087         start_paddr += paddr & ~PAGE_MASK;
3088         return start_paddr;
3089
3090 error:
3091         if (iova)
3092                 __free_iova(&domain->iovad, iova);
3093         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
3094                 dev_name(dev), size, (unsigned long long)paddr, dir);
3095         return 0;
3096 }
3097
3098 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3099                                  unsigned long offset, size_t size,
3100                                  enum dma_data_direction dir,
3101                                  struct dma_attrs *attrs)
3102 {
3103         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3104                                   dir, *dev->dma_mask);
3105 }
3106
3107 static void flush_unmaps(void)
3108 {
3109         int i, j;
3110
3111         timer_on = 0;
3112
3113         /* just flush them all */
3114         for (i = 0; i < g_num_of_iommus; i++) {
3115                 struct intel_iommu *iommu = g_iommus[i];
3116                 if (!iommu)
3117                         continue;
3118
3119                 if (!deferred_flush[i].next)
3120                         continue;
3121
3122                 /* In caching mode, global flushes make emulation expensive */
3123                 if (!cap_caching_mode(iommu->cap))
3124                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3125                                          DMA_TLB_GLOBAL_FLUSH);
3126                 for (j = 0; j < deferred_flush[i].next; j++) {
3127                         unsigned long mask;
3128                         struct iova *iova = deferred_flush[i].iova[j];
3129                         struct dmar_domain *domain = deferred_flush[i].domain[j];
3130
3131                         /* On real hardware multiple invalidations are expensive */
3132                         if (cap_caching_mode(iommu->cap))
3133                                 iommu_flush_iotlb_psi(iommu, domain->id,
3134                                         iova->pfn_lo, iova_size(iova),
3135                                         !deferred_flush[i].freelist[j], 0);
3136                         else {
3137                                 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3138                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3139                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3140                         }
3141                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3142                         if (deferred_flush[i].freelist[j])
3143                                 dma_free_pagelist(deferred_flush[i].freelist[j]);
3144                 }
3145                 deferred_flush[i].next = 0;
3146         }
3147
3148         list_size = 0;
3149 }
3150
3151 static void flush_unmaps_timeout(unsigned long data)
3152 {
3153         unsigned long flags;
3154
3155         spin_lock_irqsave(&async_umap_flush_lock, flags);
3156         flush_unmaps();
3157         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3158 }
3159
3160 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3161 {
3162         unsigned long flags;
3163         int next, iommu_id;
3164         struct intel_iommu *iommu;
3165
3166         spin_lock_irqsave(&async_umap_flush_lock, flags);
3167         if (list_size == HIGH_WATER_MARK)
3168                 flush_unmaps();
3169
3170         iommu = domain_get_iommu(dom);
3171         iommu_id = iommu->seq_id;
3172
3173         next = deferred_flush[iommu_id].next;
3174         deferred_flush[iommu_id].domain[next] = dom;
3175         deferred_flush[iommu_id].iova[next] = iova;
3176         deferred_flush[iommu_id].freelist[next] = freelist;
3177         deferred_flush[iommu_id].next++;
3178
3179         if (!timer_on) {
3180                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3181                 timer_on = 1;
3182         }
3183         list_size++;
3184         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3185 }
3186
3187 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3188 {
3189         struct dmar_domain *domain;
3190         unsigned long start_pfn, last_pfn;
3191         struct iova *iova;
3192         struct intel_iommu *iommu;
3193         struct page *freelist;
3194
3195         if (iommu_no_mapping(dev))
3196                 return;
3197
3198         domain = find_domain(dev);
3199         BUG_ON(!domain);
3200
3201         iommu = domain_get_iommu(domain);
3202
3203         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3204         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3205                       (unsigned long long)dev_addr))
3206                 return;
3207
3208         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3209         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3210
3211         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3212                  dev_name(dev), start_pfn, last_pfn);
3213
3214         freelist = domain_unmap(domain, start_pfn, last_pfn);
3215
3216         if (intel_iommu_strict) {
3217                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3218                                       last_pfn - start_pfn + 1, !freelist, 0);
3219                 /* free iova */
3220                 __free_iova(&domain->iovad, iova);
3221                 dma_free_pagelist(freelist);
3222         } else {
3223                 add_unmap(domain, iova, freelist);
3224                 /*
3225                  * queue up the release of the unmap to save the roughly 1/6th
3226                  * of the cpu time otherwise spent on the iotlb flush operation...
3227                  */
3228         }
3229 }
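/*
 * Editorial note: the strict/deferred split above is controlled by the
 * existing intel_iommu_strict flag; booting with "intel_iommu=strict" on the
 * kernel command line selects the synchronous flush-and-free path, while the
 * default lazily batches the work through add_unmap() and the
 * flush_unmaps_timeout() timer shown above.
 */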
3230
3231 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3232                              size_t size, enum dma_data_direction dir,
3233                              struct dma_attrs *attrs)
3234 {
3235         intel_unmap(dev, dev_addr);
3236 }
3237
3238 static void *intel_alloc_coherent(struct device *dev, size_t size,
3239                                   dma_addr_t *dma_handle, gfp_t flags,
3240                                   struct dma_attrs *attrs)
3241 {
3242         struct page *page = NULL;
3243         int order;
3244
3245         size = PAGE_ALIGN(size);
3246         order = get_order(size);
3247
3248         if (!iommu_no_mapping(dev))
3249                 flags &= ~(GFP_DMA | GFP_DMA32);
3250         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3251                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3252                         flags |= GFP_DMA;
3253                 else
3254                         flags |= GFP_DMA32;
3255         }
3256
3257         if (flags & __GFP_WAIT) {
3258                 unsigned int count = size >> PAGE_SHIFT;
3259
3260                 page = dma_alloc_from_contiguous(dev, count, order);
3261                 if (page && iommu_no_mapping(dev) &&
3262                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3263                         dma_release_from_contiguous(dev, page, count);
3264                         page = NULL;
3265                 }
3266         }
3267
3268         if (!page)
3269                 page = alloc_pages(flags, order);
3270         if (!page)
3271                 return NULL;
3272         memset(page_address(page), 0, size);
3273
3274         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3275                                          DMA_BIDIRECTIONAL,
3276                                          dev->coherent_dma_mask);
3277         if (*dma_handle)
3278                 return page_address(page);
3279         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3280                 __free_pages(page, order);
3281
3282         return NULL;
3283 }
3284
3285 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3286                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3287 {
3288         int order;
3289         struct page *page = virt_to_page(vaddr);
3290
3291         size = PAGE_ALIGN(size);
3292         order = get_order(size);
3293
3294         intel_unmap(dev, dma_handle);
3295         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3296                 __free_pages(page, order);
3297 }
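/*
 * Illustrative usage sketch (editorial, hypothetical driver code): coherent
 * allocations from an ordinary PCI driver, e.g.
 *
 *	void *cpu_addr;
 *	dma_addr_t dma_handle;
 *
 *	cpu_addr = dma_alloc_coherent(&pdev->dev, size, &dma_handle, GFP_KERNEL);
 *	...
 *	dma_free_coherent(&pdev->dev, size, cpu_addr, dma_handle);
 *
 * dispatch to intel_alloc_coherent() and intel_free_coherent() above through
 * the dma_map_ops table defined below.
 */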
3298
3299 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3300                            int nelems, enum dma_data_direction dir,
3301                            struct dma_attrs *attrs)
3302 {
3303         intel_unmap(dev, sglist[0].dma_address);
3304 }
3305
3306 static int intel_nontranslate_map_sg(struct device *hddev,
3307         struct scatterlist *sglist, int nelems, int dir)
3308 {
3309         int i;
3310         struct scatterlist *sg;
3311
3312         for_each_sg(sglist, sg, nelems, i) {
3313                 BUG_ON(!sg_page(sg));
3314                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3315                 sg->dma_length = sg->length;
3316         }
3317         return nelems;
3318 }
3319
3320 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3321                         enum dma_data_direction dir, struct dma_attrs *attrs)
3322 {
3323         int i;
3324         struct dmar_domain *domain;
3325         size_t size = 0;
3326         int prot = 0;
3327         struct iova *iova = NULL;
3328         int ret;
3329         struct scatterlist *sg;
3330         unsigned long start_vpfn;
3331         struct intel_iommu *iommu;
3332
3333         BUG_ON(dir == DMA_NONE);
3334         if (iommu_no_mapping(dev))
3335                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3336
3337         domain = get_valid_domain_for_dev(dev);
3338         if (!domain)
3339                 return 0;
3340
3341         iommu = domain_get_iommu(domain);
3342
3343         for_each_sg(sglist, sg, nelems, i)
3344                 size += aligned_nrpages(sg->offset, sg->length);
3345
3346         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3347                                 *dev->dma_mask);
3348         if (!iova) {
3349                 sglist->dma_length = 0;
3350                 return 0;
3351         }
3352
3353         /*
3354          * Check if DMAR supports zero-length reads on write-only
3355          * mappings.
3356          */
3357         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3358                         !cap_zlr(iommu->cap))
3359                 prot |= DMA_PTE_READ;
3360         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3361                 prot |= DMA_PTE_WRITE;
3362
3363         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3364
3365         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3366         if (unlikely(ret)) {
3367                 dma_pte_free_pagetable(domain, start_vpfn,
3368                                        start_vpfn + size - 1);
3369                 __free_iova(&domain->iovad, iova);
3370                 return 0;
3371         }
3372
3373         /* it's a non-present to present mapping. Only flush if caching mode */
3374         if (cap_caching_mode(iommu->cap))
3375                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3376         else
3377                 iommu_flush_write_buffer(iommu);
3378
3379         return nelems;
3380 }
3381
3382 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3383 {
3384         return !dma_addr;
3385 }
3386
3387 struct dma_map_ops intel_dma_ops = {
3388         .alloc = intel_alloc_coherent,
3389         .free = intel_free_coherent,
3390         .map_sg = intel_map_sg,
3391         .unmap_sg = intel_unmap_sg,
3392         .map_page = intel_map_page,
3393         .unmap_page = intel_unmap_page,
3394         .mapping_error = intel_mapping_error,
3395 };
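/*
 * Illustrative usage sketch (editorial, assumes a hypothetical PCI driver):
 * once dma_ops points at intel_dma_ops (see intel_iommu_init() below), an
 * ordinary streaming mapping such as
 *
 *	dma_addr_t handle;
 *
 *	handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
 *	if (dma_mapping_error(&pdev->dev, handle))
 *		goto err;
 *
 * is routed through intel_map_page()/__intel_map_single(), and the matching
 * dma_unmap_single() ends up in intel_unmap_page() above.
 */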
3396
3397 static inline int iommu_domain_cache_init(void)
3398 {
3399         int ret = 0;
3400
3401         iommu_domain_cache = kmem_cache_create("iommu_domain",
3402                                          sizeof(struct dmar_domain),
3403                                          0,
3404                                          SLAB_HWCACHE_ALIGN,
3406                                          NULL);
3407         if (!iommu_domain_cache) {
3408                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3409                 ret = -ENOMEM;
3410         }
3411
3412         return ret;
3413 }
3414
3415 static inline int iommu_devinfo_cache_init(void)
3416 {
3417         int ret = 0;
3418
3419         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3420                                          sizeof(struct device_domain_info),
3421                                          0,
3422                                          SLAB_HWCACHE_ALIGN,
3423                                          NULL);
3424         if (!iommu_devinfo_cache) {
3425                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3426                 ret = -ENOMEM;
3427         }
3428
3429         return ret;
3430 }
3431
3432 static inline int iommu_iova_cache_init(void)
3433 {
3434         int ret = 0;
3435
3436         iommu_iova_cache = kmem_cache_create("iommu_iova",
3437                                          sizeof(struct iova),
3438                                          0,
3439                                          SLAB_HWCACHE_ALIGN,
3440                                          NULL);
3441         if (!iommu_iova_cache) {
3442                 printk(KERN_ERR "Couldn't create iova cache\n");
3443                 ret = -ENOMEM;
3444         }
3445
3446         return ret;
3447 }
3448
3449 static int __init iommu_init_mempool(void)
3450 {
3451         int ret;
3452         ret = iommu_iova_cache_init();
3453         if (ret)
3454                 return ret;
3455
3456         ret = iommu_domain_cache_init();
3457         if (ret)
3458                 goto domain_error;
3459
3460         ret = iommu_devinfo_cache_init();
3461         if (!ret)
3462                 return ret;
3463
3464         kmem_cache_destroy(iommu_domain_cache);
3465 domain_error:
3466         kmem_cache_destroy(iommu_iova_cache);
3467
3468         return -ENOMEM;
3469 }
3470
3471 static void __init iommu_exit_mempool(void)
3472 {
3473         kmem_cache_destroy(iommu_devinfo_cache);
3474         kmem_cache_destroy(iommu_domain_cache);
3475         kmem_cache_destroy(iommu_iova_cache);
3476
3477 }
3478
3479 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3480 {
3481         struct dmar_drhd_unit *drhd;
3482         u32 vtbar;
3483         int rc;
3484
3485         /* We know that this device on this chipset has its own IOMMU.
3486          * If we find it under a different IOMMU, then the BIOS is lying
3487          * to us. Hope that the IOMMU for this device is actually
3488          * disabled, and it needs no translation...
3489          */
3490         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3491         if (rc) {
3492                 /* "can't" happen */
3493                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3494                 return;
3495         }
3496         vtbar &= 0xffff0000;
3497
3498         /* we know that this iommu should be at offset 0xa000 from vtbar */
3499         drhd = dmar_find_matched_drhd_unit(pdev);
3500         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3501                             TAINT_FIRMWARE_WORKAROUND,
3502                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3503                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3504 }
3505 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3506
3507 static void __init init_no_remapping_devices(void)
3508 {
3509         struct dmar_drhd_unit *drhd;
3510         struct device *dev;
3511         int i;
3512
3513         for_each_drhd_unit(drhd) {
3514                 if (!drhd->include_all) {
3515                         for_each_active_dev_scope(drhd->devices,
3516                                                   drhd->devices_cnt, i, dev)
3517                                 break;
3518                         /* ignore DMAR unit if no devices exist */
3519                         if (i == drhd->devices_cnt)
3520                                 drhd->ignored = 1;
3521                 }
3522         }
3523
3524         for_each_active_drhd_unit(drhd) {
3525                 if (drhd->include_all)
3526                         continue;
3527
3528                 for_each_active_dev_scope(drhd->devices,
3529                                           drhd->devices_cnt, i, dev)
3530                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3531                                 break;
3532                 if (i < drhd->devices_cnt)
3533                         continue;
3534
3535                 /* This IOMMU has *only* gfx devices. Either bypass it or
3536                    set the gfx_mapped flag, as appropriate */
3537                 if (dmar_map_gfx) {
3538                         intel_iommu_gfx_mapped = 1;
3539                 } else {
3540                         drhd->ignored = 1;
3541                         for_each_active_dev_scope(drhd->devices,
3542                                                   drhd->devices_cnt, i, dev)
3543                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3544                 }
3545         }
3546 }
3547
3548 #ifdef CONFIG_SUSPEND
3549 static int init_iommu_hw(void)
3550 {
3551         struct dmar_drhd_unit *drhd;
3552         struct intel_iommu *iommu = NULL;
3553
3554         for_each_active_iommu(iommu, drhd)
3555                 if (iommu->qi)
3556                         dmar_reenable_qi(iommu);
3557
3558         for_each_iommu(iommu, drhd) {
3559                 if (drhd->ignored) {
3560                         /*
3561                          * we always have to disable PMRs or DMA may fail on
3562                          * this device
3563                          */
3564                         if (force_on)
3565                                 iommu_disable_protect_mem_regions(iommu);
3566                         continue;
3567                 }
3568
3569                 iommu_flush_write_buffer(iommu);
3570
3571                 iommu_set_root_entry(iommu);
3572
3573                 iommu->flush.flush_context(iommu, 0, 0, 0,
3574                                            DMA_CCMD_GLOBAL_INVL);
3575                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3576                 iommu_enable_translation(iommu);
3577                 iommu_disable_protect_mem_regions(iommu);
3578         }
3579
3580         return 0;
3581 }
3582
3583 static void iommu_flush_all(void)
3584 {
3585         struct dmar_drhd_unit *drhd;
3586         struct intel_iommu *iommu;
3587
3588         for_each_active_iommu(iommu, drhd) {
3589                 iommu->flush.flush_context(iommu, 0, 0, 0,
3590                                            DMA_CCMD_GLOBAL_INVL);
3591                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3592                                          DMA_TLB_GLOBAL_FLUSH);
3593         }
3594 }
3595
3596 static int iommu_suspend(void)
3597 {
3598         struct dmar_drhd_unit *drhd;
3599         struct intel_iommu *iommu = NULL;
3600         unsigned long flag;
3601
3602         for_each_active_iommu(iommu, drhd) {
3603                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3604                                                  GFP_ATOMIC);
3605                 if (!iommu->iommu_state)
3606                         goto nomem;
3607         }
3608
3609         iommu_flush_all();
3610
3611         for_each_active_iommu(iommu, drhd) {
3612                 iommu_disable_translation(iommu);
3613
3614                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3615
3616                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3617                         readl(iommu->reg + DMAR_FECTL_REG);
3618                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3619                         readl(iommu->reg + DMAR_FEDATA_REG);
3620                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3621                         readl(iommu->reg + DMAR_FEADDR_REG);
3622                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3623                         readl(iommu->reg + DMAR_FEUADDR_REG);
3624
3625                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3626         }
3627         return 0;
3628
3629 nomem:
3630         for_each_active_iommu(iommu, drhd)
3631                 kfree(iommu->iommu_state);
3632
3633         return -ENOMEM;
3634 }
3635
3636 static void iommu_resume(void)
3637 {
3638         struct dmar_drhd_unit *drhd;
3639         struct intel_iommu *iommu = NULL;
3640         unsigned long flag;
3641
3642         if (init_iommu_hw()) {
3643                 if (force_on)
3644                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3645                 else
3646                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3647                 return;
3648         }
3649
3650         for_each_active_iommu(iommu, drhd) {
3651
3652                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3653
3654                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3655                         iommu->reg + DMAR_FECTL_REG);
3656                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3657                         iommu->reg + DMAR_FEDATA_REG);
3658                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3659                         iommu->reg + DMAR_FEADDR_REG);
3660                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3661                         iommu->reg + DMAR_FEUADDR_REG);
3662
3663                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3664         }
3665
3666         for_each_active_iommu(iommu, drhd)
3667                 kfree(iommu->iommu_state);
3668 }
3669
3670 static struct syscore_ops iommu_syscore_ops = {
3671         .resume         = iommu_resume,
3672         .suspend        = iommu_suspend,
3673 };
3674
3675 static void __init init_iommu_pm_ops(void)
3676 {
3677         register_syscore_ops(&iommu_syscore_ops);
3678 }
3679
3680 #else
3681 static inline void init_iommu_pm_ops(void) {}
3682 #endif  /* CONFIG_SUSPEND */
3683
3684
3685 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3686 {
3687         struct acpi_dmar_reserved_memory *rmrr;
3688         struct dmar_rmrr_unit *rmrru;
3689
3690         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3691         if (!rmrru)
3692                 return -ENOMEM;
3693
3694         rmrru->hdr = header;
3695         rmrr = (struct acpi_dmar_reserved_memory *)header;
3696         rmrru->base_address = rmrr->base_address;
3697         rmrru->end_address = rmrr->end_address;
3698         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3699                                 ((void *)rmrr) + rmrr->header.length,
3700                                 &rmrru->devices_cnt);
3701         if (rmrru->devices_cnt && rmrru->devices == NULL) {
3702                 kfree(rmrru);
3703                 return -ENOMEM;
3704         }
3705
3706         list_add(&rmrru->list, &dmar_rmrr_units);
3707
3708         return 0;
3709 }
3710
3711 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3712 {
3713         struct dmar_atsr_unit *atsru;
3714         struct acpi_dmar_atsr *tmp;
3715
3716         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3717                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3718                 if (atsr->segment != tmp->segment)
3719                         continue;
3720                 if (atsr->header.length != tmp->header.length)
3721                         continue;
3722                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3723                         return atsru;
3724         }
3725
3726         return NULL;
3727 }
3728
3729 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3730 {
3731         struct acpi_dmar_atsr *atsr;
3732         struct dmar_atsr_unit *atsru;
3733
3734         if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
3735                 return 0;
3736
3737         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3738         atsru = dmar_find_atsr(atsr);
3739         if (atsru)
3740                 return 0;
3741
3742         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3743         if (!atsru)
3744                 return -ENOMEM;
3745
3746         /*
3747          * If memory is allocated from slab by the ACPI _DSM method, we need to
3748          * copy the memory content because the memory buffer will be freed
3749          * on return.
3750          */
3751         atsru->hdr = (void *)(atsru + 1);
3752         memcpy(atsru->hdr, hdr, hdr->length);
3753         atsru->include_all = atsr->flags & 0x1;
3754         if (!atsru->include_all) {
3755                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3756                                 (void *)atsr + atsr->header.length,
3757                                 &atsru->devices_cnt);
3758                 if (atsru->devices_cnt && atsru->devices == NULL) {
3759                         kfree(atsru);
3760                         return -ENOMEM;
3761                 }
3762         }
3763
3764         list_add_rcu(&atsru->list, &dmar_atsr_units);
3765
3766         return 0;
3767 }
3768
3769 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3770 {
3771         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3772         kfree(atsru);
3773 }
3774
3775 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3776 {
3777         struct acpi_dmar_atsr *atsr;
3778         struct dmar_atsr_unit *atsru;
3779
3780         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3781         atsru = dmar_find_atsr(atsr);
3782         if (atsru) {
3783                 list_del_rcu(&atsru->list);
3784                 synchronize_rcu();
3785                 intel_iommu_free_atsr(atsru);
3786         }
3787
3788         return 0;
3789 }
3790
3791 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3792 {
3793         int i;
3794         struct device *dev;
3795         struct acpi_dmar_atsr *atsr;
3796         struct dmar_atsr_unit *atsru;
3797
3798         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3799         atsru = dmar_find_atsr(atsr);
3800         if (!atsru)
3801                 return 0;
3802
3803         if (!atsru->include_all && atsru->devices && atsru->devices_cnt)
3804                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3805                                           i, dev)
3806                         return -EBUSY;
3807
3808         return 0;
3809 }
3810
3811 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3812 {
3813         int sp, ret = 0;
3814         struct intel_iommu *iommu = dmaru->iommu;
3815
3816         if (g_iommus[iommu->seq_id])
3817                 return 0;
3818
3819         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3820                 pr_warn("IOMMU: %s doesn't support hardware pass through.\n",
3821                         iommu->name);
3822                 return -ENXIO;
3823         }
3824         if (!ecap_sc_support(iommu->ecap) &&
3825             domain_update_iommu_snooping(iommu)) {
3826                 pr_warn("IOMMU: %s doesn't support snooping.\n",
3827                         iommu->name);
3828                 return -ENXIO;
3829         }
3830         sp = domain_update_iommu_superpage(iommu) - 1;
3831         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3832                 pr_warn("IOMMU: %s doesn't support large page.\n",
3833                         iommu->name);
3834                 return -ENXIO;
3835         }
3836
3837         /*
3838          * Disable translation if already enabled prior to OS handover.
3839          */
3840         if (iommu->gcmd & DMA_GCMD_TE)
3841                 iommu_disable_translation(iommu);
3842
3843         g_iommus[iommu->seq_id] = iommu;
3844         ret = iommu_init_domains(iommu);
3845         if (ret == 0)
3846                 ret = iommu_alloc_root_entry(iommu);
3847         if (ret)
3848                 goto out;
3849
3850         if (dmaru->ignored) {
3851                 /*
3852                  * we always have to disable PMRs or DMA may fail on this device
3853                  */
3854                 if (force_on)
3855                         iommu_disable_protect_mem_regions(iommu);
3856                 return 0;
3857         }
3858
3859         intel_iommu_init_qi(iommu);
3860         iommu_flush_write_buffer(iommu);
3861         ret = dmar_set_interrupt(iommu);
3862         if (ret)
3863                 goto disable_iommu;
3864
3865         iommu_set_root_entry(iommu);
3866         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3867         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3868         iommu_enable_translation(iommu);
3869
3870         if (si_domain) {
3871                 ret = iommu_attach_domain(si_domain, iommu);
3872                 if (ret < 0 || si_domain->id != ret)
3873                         goto disable_iommu;
3874                 domain_attach_iommu(si_domain, iommu);
3875         }
3876
3877         iommu_disable_protect_mem_regions(iommu);
3878         return 0;
3879
3880 disable_iommu:
3881         disable_dmar_iommu(iommu);
3882 out:
3883         free_dmar_iommu(iommu);
3884         return ret;
3885 }
3886
3887 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3888 {
3889         int ret = 0;
3890         struct intel_iommu *iommu = dmaru->iommu;
3891
3892         if (!intel_iommu_enabled)
3893                 return 0;
3894         if (iommu == NULL)
3895                 return -EINVAL;
3896
3897         if (insert) {
3898                 ret = intel_iommu_add(dmaru);
3899         } else {
3900                 disable_dmar_iommu(iommu);
3901                 free_dmar_iommu(iommu);
3902         }
3903
3904         return ret;
3905 }
3906
3907 static void intel_iommu_free_dmars(void)
3908 {
3909         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3910         struct dmar_atsr_unit *atsru, *atsr_n;
3911
3912         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3913                 list_del(&rmrru->list);
3914                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3915                 kfree(rmrru);
3916         }
3917
3918         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3919                 list_del(&atsru->list);
3920                 intel_iommu_free_atsr(atsru);
3921         }
3922 }
3923
3924 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3925 {
3926         int i, ret = 1;
3927         struct pci_bus *bus;
3928         struct pci_dev *bridge = NULL;
3929         struct device *tmp;
3930         struct acpi_dmar_atsr *atsr;
3931         struct dmar_atsr_unit *atsru;
3932
3933         dev = pci_physfn(dev);
3934         for (bus = dev->bus; bus; bus = bus->parent) {
3935                 bridge = bus->self;
3936                 if (!bridge || !pci_is_pcie(bridge) ||
3937                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3938                         return 0;
3939                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3940                         break;
3941         }
3942         if (!bridge)
3943                 return 0;
3944
3945         rcu_read_lock();
3946         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3947                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3948                 if (atsr->segment != pci_domain_nr(dev->bus))
3949                         continue;
3950
3951                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3952                         if (tmp == &bridge->dev)
3953                                 goto out;
3954
3955                 if (atsru->include_all)
3956                         goto out;
3957         }
3958         ret = 0;
3959 out:
3960         rcu_read_unlock();
3961
3962         return ret;
3963 }
3964
3965 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3966 {
3967         int ret = 0;
3968         struct dmar_rmrr_unit *rmrru;
3969         struct dmar_atsr_unit *atsru;
3970         struct acpi_dmar_atsr *atsr;
3971         struct acpi_dmar_reserved_memory *rmrr;
3972
3973         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3974                 return 0;
3975
3976         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3977                 rmrr = container_of(rmrru->hdr,
3978                                     struct acpi_dmar_reserved_memory, header);
3979                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3980                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3981                                 ((void *)rmrr) + rmrr->header.length,
3982                                 rmrr->segment, rmrru->devices,
3983                                 rmrru->devices_cnt);
3984                         if (ret < 0)
3985                                 return ret;
3986                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3987                         dmar_remove_dev_scope(info, rmrr->segment,
3988                                 rmrru->devices, rmrru->devices_cnt);
3989                 }
3990         }
3991
3992         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3993                 if (atsru->include_all)
3994                         continue;
3995
3996                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3997                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3998                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3999                                         (void *)atsr + atsr->header.length,
4000                                         atsr->segment, atsru->devices,
4001                                         atsru->devices_cnt);
4002                         if (ret > 0)
4003                                 break;
4004                         else if (ret < 0)
4005                                 return ret;
4006                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
4007                         if (dmar_remove_dev_scope(info, atsr->segment,
4008                                         atsru->devices, atsru->devices_cnt))
4009                                 break;
4010                 }
4011         }
4012
4013         return 0;
4014 }
4015
4016 /*
4017  * Here we only respond to a device being removed or unbound from its driver.
4018  *
4019  * An added device is not attached to its DMAR domain here yet. That will
4020  * happen when the device is mapped to an iova.
4021  */
4022 static int device_notifier(struct notifier_block *nb,
4023                                   unsigned long action, void *data)
4024 {
4025         struct device *dev = data;
4026         struct dmar_domain *domain;
4027
4028         if (iommu_dummy(dev))
4029                 return 0;
4030
4031         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4032                 return 0;
4033
4034         /*
4035          * If the device is still attached to a device driver we can't
4036          * tear down the domain yet as DMA mappings may still be in use.
4037          * Wait for the BUS_NOTIFY_UNBOUND_DRIVER event to do that.
4038          */
4039         if (action == BUS_NOTIFY_DEL_DEVICE && dev->driver != NULL)
4040                 return 0;
4041
4042         domain = find_domain(dev);
4043         if (!domain)
4044                 return 0;
4045
4046         down_read(&dmar_global_lock);
4047         domain_remove_one_dev_info(domain, dev);
4048         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4049                 domain_exit(domain);
4050         up_read(&dmar_global_lock);
4051
4052         return 0;
4053 }
4054
4055 static struct notifier_block device_nb = {
4056         .notifier_call = device_notifier,
4057 };
4058
4059 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4060                                        unsigned long val, void *v)
4061 {
4062         struct memory_notify *mhp = v;
4063         unsigned long long start, end;
4064         unsigned long start_vpfn, last_vpfn;
4065
4066         switch (val) {
4067         case MEM_GOING_ONLINE:
4068                 start = mhp->start_pfn << PAGE_SHIFT;
4069                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4070                 if (iommu_domain_identity_map(si_domain, start, end)) {
4071                         pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
4072                                 start, end);
4073                         return NOTIFY_BAD;
4074                 }
4075                 break;
4076
4077         case MEM_OFFLINE:
4078         case MEM_CANCEL_ONLINE:
4079                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4080                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4081                 while (start_vpfn <= last_vpfn) {
4082                         struct iova *iova;
4083                         struct dmar_drhd_unit *drhd;
4084                         struct intel_iommu *iommu;
4085                         struct page *freelist;
4086
4087                         iova = find_iova(&si_domain->iovad, start_vpfn);
4088                         if (iova == NULL) {
4089                                 pr_debug("dmar: failed to get IOVA for PFN %lx\n",
4090                                          start_vpfn);
4091                                 break;
4092                         }
4093
4094                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4095                                                      start_vpfn, last_vpfn);
4096                         if (iova == NULL) {
4097                                 pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
4098                                         start_vpfn, last_vpfn);
4099                                 return NOTIFY_BAD;
4100                         }
4101
4102                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4103                                                iova->pfn_hi);
4104
4105                         rcu_read_lock();
4106                         for_each_active_iommu(iommu, drhd)
4107                                 iommu_flush_iotlb_psi(iommu, si_domain->id,
4108                                         iova->pfn_lo, iova_size(iova),
4109                                         !freelist, 0);
4110                         rcu_read_unlock();
4111                         dma_free_pagelist(freelist);
4112
4113                         start_vpfn = iova->pfn_hi + 1;
4114                         free_iova_mem(iova);
4115                 }
4116                 break;
4117         }
4118
4119         return NOTIFY_OK;
4120 }
4121
4122 static struct notifier_block intel_iommu_memory_nb = {
4123         .notifier_call = intel_iommu_memory_notifier,
4124         .priority = 0
4125 };
4126
4127
4128 static ssize_t intel_iommu_show_version(struct device *dev,
4129                                         struct device_attribute *attr,
4130                                         char *buf)
4131 {
4132         struct intel_iommu *iommu = dev_get_drvdata(dev);
4133         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4134         return sprintf(buf, "%d:%d\n",
4135                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4136 }
4137 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4138
4139 static ssize_t intel_iommu_show_address(struct device *dev,
4140                                         struct device_attribute *attr,
4141                                         char *buf)
4142 {
4143         struct intel_iommu *iommu = dev_get_drvdata(dev);
4144         return sprintf(buf, "%llx\n", iommu->reg_phys);
4145 }
4146 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4147
4148 static ssize_t intel_iommu_show_cap(struct device *dev,
4149                                     struct device_attribute *attr,
4150                                     char *buf)
4151 {
4152         struct intel_iommu *iommu = dev_get_drvdata(dev);
4153         return sprintf(buf, "%llx\n", iommu->cap);
4154 }
4155 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4156
4157 static ssize_t intel_iommu_show_ecap(struct device *dev,
4158                                     struct device_attribute *attr,
4159                                     char *buf)
4160 {
4161         struct intel_iommu *iommu = dev_get_drvdata(dev);
4162         return sprintf(buf, "%llx\n", iommu->ecap);
4163 }
4164 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4165
4166 static struct attribute *intel_iommu_attrs[] = {
4167         &dev_attr_version.attr,
4168         &dev_attr_address.attr,
4169         &dev_attr_cap.attr,
4170         &dev_attr_ecap.attr,
4171         NULL,
4172 };
4173
4174 static struct attribute_group intel_iommu_group = {
4175         .name = "intel-iommu",
4176         .attrs = intel_iommu_attrs,
4177 };
4178
4179 const struct attribute_group *intel_iommu_groups[] = {
4180         &intel_iommu_group,
4181         NULL,
4182 };
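/*
 * Editorial note: these attributes are registered per IOMMU through
 * iommu_device_create() in intel_iommu_init() below, and typically show up
 * in sysfs along the lines of (exact path may vary by kernel configuration):
 *
 *	# cat /sys/class/iommu/dmar0/intel-iommu/cap
 *	# cat /sys/class/iommu/dmar0/intel-iommu/ecap
 */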
4183
4184 int __init intel_iommu_init(void)
4185 {
4186         int ret = -ENODEV;
4187         struct dmar_drhd_unit *drhd;
4188         struct intel_iommu *iommu;
4189
4190         /* VT-d is required for a TXT/tboot launch, so enforce that */
4191         force_on = tboot_force_iommu();
4192
4193         if (iommu_init_mempool()) {
4194                 if (force_on)
4195                         panic("tboot: Failed to initialize iommu memory\n");
4196                 return -ENOMEM;
4197         }
4198
4199         down_write(&dmar_global_lock);
4200         if (dmar_table_init()) {
4201                 if (force_on)
4202                         panic("tboot: Failed to initialize DMAR table\n");
4203                 goto out_free_dmar;
4204         }
4205
4206         /*
4207          * Disable translation if already enabled prior to OS handover.
4208          */
4209         for_each_active_iommu(iommu, drhd)
4210                 if (iommu->gcmd & DMA_GCMD_TE)
4211                         iommu_disable_translation(iommu);
4212
4213         if (dmar_dev_scope_init() < 0) {
4214                 if (force_on)
4215                         panic("tboot: Failed to initialize DMAR device scope\n");
4216                 goto out_free_dmar;
4217         }
4218
4219         if (no_iommu || dmar_disabled)
4220                 goto out_free_dmar;
4221
4222         if (list_empty(&dmar_rmrr_units))
4223                 printk(KERN_INFO "DMAR: No RMRR found\n");
4224
4225         if (list_empty(&dmar_atsr_units))
4226                 printk(KERN_INFO "DMAR: No ATSR found\n");
4227
4228         if (dmar_init_reserved_ranges()) {
4229                 if (force_on)
4230                         panic("tboot: Failed to reserve iommu ranges\n");
4231                 goto out_free_reserved_range;
4232         }
4233
4234         init_no_remapping_devices();
4235
4236         ret = init_dmars();
4237         if (ret) {
4238                 if (force_on)
4239                         panic("tboot: Failed to initialize DMARs\n");
4240                 printk(KERN_ERR "IOMMU: dmar init failed\n");
4241                 goto out_free_reserved_range;
4242         }
4243         up_write(&dmar_global_lock);
4244         printk(KERN_INFO
4245         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
4246
4247         init_timer(&unmap_timer);
4248 #ifdef CONFIG_SWIOTLB
4249         swiotlb = 0;
4250 #endif
4251         dma_ops = &intel_dma_ops;
4252
4253         init_iommu_pm_ops();
4254
4255         for_each_active_iommu(iommu, drhd)
4256                 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4257                                                        intel_iommu_groups,
4258                                                        iommu->name);
4259
4260         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4261         bus_register_notifier(&pci_bus_type, &device_nb);
4262         if (si_domain && !hw_pass_through)
4263                 register_memory_notifier(&intel_iommu_memory_nb);
4264
4265         intel_iommu_enabled = 1;
4266
4267         return 0;
4268
4269 out_free_reserved_range:
4270         put_iova_domain(&reserved_iova_list);
4271 out_free_dmar:
4272         intel_iommu_free_dmars();
4273         up_write(&dmar_global_lock);
4274         iommu_exit_mempool();
4275         return ret;
4276 }
4277
4278 static int iommu_detach_dev_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4279 {
4280         struct intel_iommu *iommu = opaque;
4281
4282         iommu_detach_dev(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4283         return 0;
4284 }
4285
4286 /*
4287  * NB - intel-iommu lacks any sort of reference counting for the users of
4288  * dependent devices.  If multiple endpoints have intersecting dependent
4289  * devices, unbinding the driver from any one of them will possibly leave
4290  * the others unable to operate.
4291  */
4292 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
4293                                            struct device *dev)
4294 {
4295         if (!iommu || !dev || !dev_is_pci(dev))
4296                 return;
4297
4298         pci_for_each_dma_alias(to_pci_dev(dev), &iommu_detach_dev_cb, iommu);
4299 }
4300
4301 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4302                                        struct device *dev)
4303 {
4304         struct device_domain_info *info, *tmp;
4305         struct intel_iommu *iommu;
4306         unsigned long flags;
4307         int found = 0;
4308         u8 bus, devfn;
4309
4310         iommu = device_to_iommu(dev, &bus, &devfn);
4311         if (!iommu)
4312                 return;
4313
4314         spin_lock_irqsave(&device_domain_lock, flags);
4315         list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4316                 if (info->iommu == iommu && info->bus == bus &&
4317                     info->devfn == devfn) {
4318                         unlink_domain_info(info);
4319                         spin_unlock_irqrestore(&device_domain_lock, flags);
4320
4321                         iommu_disable_dev_iotlb(info);
4322                         iommu_detach_dev(iommu, info->bus, info->devfn);
4323                         iommu_detach_dependent_devices(iommu, dev);
4324                         free_devinfo_mem(info);
4325
4326                         spin_lock_irqsave(&device_domain_lock, flags);
4327
4328                         if (found)
4329                                 break;
4330                         else
4331                                 continue;
4332                 }
4333
4334                 /* if there are no other devices under the same iommu
4335                  * owned by this domain, clear this iommu from iommu_bmp
4336                  * and update the iommu count and coherency
4337                  */
4338                 if (info->iommu == iommu)
4339                         found = 1;
4340         }
4341
4342         spin_unlock_irqrestore(&device_domain_lock, flags);
4343
4344         if (found == 0) {
4345                 domain_detach_iommu(domain, iommu);
4346                 if (!domain_type_is_vm_or_si(domain))
4347                         iommu_detach_domain(domain, iommu);
4348         }
4349 }
4350
4351 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4352 {
4353         int adjust_width;
4354
4355         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
4356         domain_reserve_special_ranges(domain);
4357
4358         /* calculate AGAW */
4359         domain->gaw = guest_width;
4360         adjust_width = guestwidth_to_adjustwidth(guest_width);
4361         domain->agaw = width_to_agaw(adjust_width);
4362
4363         domain->iommu_coherency = 0;
4364         domain->iommu_snooping = 0;
4365         domain->iommu_superpage = 0;
4366         domain->max_addr = 0;
4367
4368         /* always allocate the top pgd */
4369         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4370         if (!domain->pgd)
4371                 return -ENOMEM;
4372         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4373         return 0;
4374 }
4375
4376 static int intel_iommu_domain_init(struct iommu_domain *domain)
4377 {
4378         struct dmar_domain *dmar_domain;
4379
4380         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4381         if (!dmar_domain) {
4382                 printk(KERN_ERR
4383                         "intel_iommu_domain_init: dmar_domain == NULL\n");
4384                 return -ENOMEM;
4385         }
4386         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4387                 printk(KERN_ERR
4388                         "intel_iommu_domain_init() failed\n");
4389                 domain_exit(dmar_domain);
4390                 return -ENOMEM;
4391         }
4392         domain_update_iommu_cap(dmar_domain);
4393         domain->priv = dmar_domain;
4394
4395         domain->geometry.aperture_start = 0;
4396         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4397         domain->geometry.force_aperture = true;
4398
4399         return 0;
4400 }
4401
4402 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
4403 {
4404         struct dmar_domain *dmar_domain = domain->priv;
4405
4406         domain->priv = NULL;
4407         domain_exit(dmar_domain);
4408 }
4409
4410 static int intel_iommu_attach_device(struct iommu_domain *domain,
4411                                      struct device *dev)
4412 {
4413         struct dmar_domain *dmar_domain = domain->priv;
4414         struct intel_iommu *iommu;
4415         int addr_width;
4416         u8 bus, devfn;
4417
4418         if (device_is_rmrr_locked(dev)) {
4419                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4420                 return -EPERM;
4421         }
4422
4423         /* normally dev is not mapped */
4424         if (unlikely(domain_context_mapped(dev))) {
4425                 struct dmar_domain *old_domain;
4426
4427                 old_domain = find_domain(dev);
4428                 if (old_domain) {
4429                         if (domain_type_is_vm_or_si(dmar_domain))
4430                                 domain_remove_one_dev_info(old_domain, dev);
4431                         else
4432                                 domain_remove_dev_info(old_domain);
4433                 }
4434         }
4435
4436         iommu = device_to_iommu(dev, &bus, &devfn);
4437         if (!iommu)
4438                 return -ENODEV;
4439
4440         /* check if this iommu agaw is sufficient for max mapped address */
4441         addr_width = agaw_to_width(iommu->agaw);
4442         if (addr_width > cap_mgaw(iommu->cap))
4443                 addr_width = cap_mgaw(iommu->cap);
4444
4445         if (dmar_domain->max_addr > (1LL << addr_width)) {
4446                 printk(KERN_ERR "%s: iommu width (%d) is not "
4447                        "sufficient for the mapped address (%llx)\n",
4448                        __func__, addr_width, dmar_domain->max_addr);
4449                 return -EFAULT;
4450         }
4451         dmar_domain->gaw = addr_width;
4452
4453         /*
4454          * Knock out extra levels of page tables if necessary
4455          */
4456         while (iommu->agaw < dmar_domain->agaw) {
4457                 struct dma_pte *pte;
4458
4459                 pte = dmar_domain->pgd;
4460                 if (dma_pte_present(pte)) {
4461                         dmar_domain->pgd = (struct dma_pte *)
4462                                 phys_to_virt(dma_pte_addr(pte));
4463                         free_pgtable_page(pte);
4464                 }
4465                 dmar_domain->agaw--;
4466         }
4467
4468         return domain_add_dev_info(dmar_domain, dev, CONTEXT_TT_MULTI_LEVEL);
4469 }
4470
4471 static void intel_iommu_detach_device(struct iommu_domain *domain,
4472                                       struct device *dev)
4473 {
4474         struct dmar_domain *dmar_domain = domain->priv;
4475
4476         domain_remove_one_dev_info(dmar_domain, dev);
4477 }
4478
4479 static int intel_iommu_map(struct iommu_domain *domain,
4480                            unsigned long iova, phys_addr_t hpa,
4481                            size_t size, int iommu_prot)
4482 {
4483         struct dmar_domain *dmar_domain = domain->priv;
4484         u64 max_addr;
4485         int prot = 0;
4486         int ret;
4487
4488         if (iommu_prot & IOMMU_READ)
4489                 prot |= DMA_PTE_READ;
4490         if (iommu_prot & IOMMU_WRITE)
4491                 prot |= DMA_PTE_WRITE;
4492         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4493                 prot |= DMA_PTE_SNP;
4494
4495         max_addr = iova + size;
4496         if (dmar_domain->max_addr < max_addr) {
4497                 u64 end;
4498
4499                 /* check if minimum agaw is sufficient for mapped address */
4500                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4501                 if (end < max_addr) {
4502                         printk(KERN_ERR
4503                                "%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4504                                __func__, dmar_domain->gaw, max_addr);
4505                         return -EFAULT;
4506                 }
4507                 dmar_domain->max_addr = max_addr;
4508         }
4509         /* Round up to the next page if size plus the low bits of hpa
4510            spill onto another page; note that 'size' becomes a page count. */
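        /* For instance (hypothetical values): hpa = 0x1800 with size = 0x1000
           straddles two 4KiB pages, so aligned_nrpages() returns 2 even though
           size alone is only one page. */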
4511         size = aligned_nrpages(hpa, size);
4512         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4513                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4514         return ret;
4515 }
4516
4517 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4518                                 unsigned long iova, size_t size)
4519 {
4520         struct dmar_domain *dmar_domain = domain->priv;
4521         struct page *freelist = NULL;
4522         struct intel_iommu *iommu;
4523         unsigned long start_pfn, last_pfn;
4524         unsigned int npages;
4525         int iommu_id, num, ndomains, level = 0;
4526
4527         /* Cope with horrid API which requires us to unmap more than the
4528            size argument if it happens to be a large-page mapping. */
4529         if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4530                 BUG();
4531
4532         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4533                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
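        /* e.g. (illustrative): a 2MiB superpage leaf reports level 2, so
           level_to_offset_bits(2) = 9 and size is bumped to
           4KiB << 9 = 2MiB regardless of what the caller passed in. */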
4534
4535         start_pfn = iova >> VTD_PAGE_SHIFT;
4536         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4537
4538         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4539
4540         npages = last_pfn - start_pfn + 1;
4541
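        /*
         * Flush the IOTLB of every IOMMU this domain is attached to; the
         * domain may be known under a different domain-id on each unit,
         * hence the search over domain_ids below.
         */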
4542         for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4543                 iommu = g_iommus[iommu_id];
4544
4545                 /*
4546                  * find bit position of dmar_domain
4547                  */
4548                 ndomains = cap_ndoms(iommu->cap);
4549                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4550                         if (iommu->domains[num] == dmar_domain)
4551                                 iommu_flush_iotlb_psi(iommu, num, start_pfn,
4552                                                       npages, !freelist, 0);
4553                 }
4554
4555         }
4556
4557         dma_free_pagelist(freelist);
4558
4559         if (dmar_domain->max_addr == iova + size)
4560                 dmar_domain->max_addr = iova;
4561
4562         return size;
4563 }
4564
4565 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4566                                             dma_addr_t iova)
4567 {
4568         struct dmar_domain *dmar_domain = domain->priv;
4569         struct dma_pte *pte;
4570         int level = 0;
4571         u64 phys = 0;
4572
4573         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4574         if (pte)
4575                 phys = dma_pte_addr(pte);
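        /* Note: dma_pte_addr() returns the page-frame address stored in the
           leaf PTE, so the result is aligned to the page size at 'level';
           the low bits of the iova are not added back in here. */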
4576
4577         return phys;
4578 }
4579
4580 static bool intel_iommu_capable(enum iommu_cap cap)
4581 {
4582         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4583                 return domain_update_iommu_snooping(NULL) == 1;
4584         if (cap == IOMMU_CAP_INTR_REMAP)
4585                 return irq_remapping_enabled == 1;
4586
4587         return false;
4588 }
4589
4590 static int intel_iommu_add_device(struct device *dev)
4591 {
4592         struct intel_iommu *iommu;
4593         struct iommu_group *group;
4594         u8 bus, devfn;
4595
4596         iommu = device_to_iommu(dev, &bus, &devfn);
4597         if (!iommu)
4598                 return -ENODEV;
4599
4600         iommu_device_link(iommu->iommu_dev, dev);
4601
4602         group = iommu_group_get_for_dev(dev);
4603
4604         if (IS_ERR(group))
4605                 return PTR_ERR(group);
4606
4607         iommu_group_put(group);
4608         return 0;
4609 }
4610
4611 static void intel_iommu_remove_device(struct device *dev)
4612 {
4613         struct intel_iommu *iommu;
4614         u8 bus, devfn;
4615
4616         iommu = device_to_iommu(dev, &bus, &devfn);
4617         if (!iommu)
4618                 return;
4619
4620         iommu_group_remove_device(dev);
4621
4622         iommu_device_unlink(iommu->iommu_dev, dev);
4623 }
4624
4625 static const struct iommu_ops intel_iommu_ops = {
4626         .capable        = intel_iommu_capable,
4627         .domain_init    = intel_iommu_domain_init,
4628         .domain_destroy = intel_iommu_domain_destroy,
4629         .attach_dev     = intel_iommu_attach_device,
4630         .detach_dev     = intel_iommu_detach_device,
4631         .map            = intel_iommu_map,
4632         .unmap          = intel_iommu_unmap,
4633         .map_sg         = default_iommu_map_sg,
4634         .iova_to_phys   = intel_iommu_iova_to_phys,
4635         .add_device     = intel_iommu_add_device,
4636         .remove_device  = intel_iommu_remove_device,
4637         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4638 };
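/*
 * Illustrative sketch (not part of this driver): once intel_iommu_init()
 * has registered intel_iommu_ops with bus_set_iommu(), the callbacks above
 * are reached through the generic IOMMU API.  A consumer such as VFIO does
 * roughly the following (error handling omitted, iova/paddr hypothetical):
 *
 *        struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *        iommu_attach_device(dom, dev);         -> intel_iommu_attach_device()
 *        iommu_map(dom, iova, paddr, PAGE_SIZE,
 *                  IOMMU_READ | IOMMU_WRITE);   -> intel_iommu_map()
 *        iommu_unmap(dom, iova, PAGE_SIZE);     -> intel_iommu_unmap()
 *        iommu_detach_device(dom, dev);         -> intel_iommu_detach_device()
 *        iommu_domain_free(dom);                -> intel_iommu_domain_destroy()
 */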
4639
4640 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4641 {
4642         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4643         printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4644         dmar_map_gfx = 0;
4645 }
4646
4647 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4648 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4649 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4650 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4651 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4652 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4653 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4654
4655 static void quirk_iommu_rwbf(struct pci_dev *dev)
4656 {
4657         /*
4658          * Mobile 4 Series Chipset neglects to set RWBF capability,
4659          * but needs it. Same seems to hold for the desktop versions.
4660          */
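        /*
         * Background (summarising the VT-d spec, not this file): RWBF in
         * the Capability register means software must explicitly flush the
         * chipset write buffer before invalidations are guaranteed to see
         * updated page tables.  Setting rwbf_quirk makes
         * iommu_flush_write_buffer() perform that flush even though the
         * hardware neglects to advertise the bit.
         */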
4661         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4662         rwbf_quirk = 1;
4663 }
4664
4665 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4666 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4667 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4668 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4669 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4670 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4671 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4672
4673 #define GGC 0x52
4674 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4675 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4676 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4677 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4678 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4679 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4680 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4681 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
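/*
 * Informal reading of the encodings above: the field at bits 11:8 of GGC
 * describes the graphics stolen-memory/GTT configuration, and its top bit
 * (GGC_MEMORY_VT_ENABLED) marks the layouts that reserve space for a VT
 * shadow GTT.  The quirk below only tests that bit.
 */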
4682
4683 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4684 {
4685         unsigned short ggc;
4686
4687         if (pci_read_config_word(dev, GGC, &ggc))
4688                 return;
4689
4690         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4691                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4692                 dmar_map_gfx = 0;
4693         } else if (dmar_map_gfx) {
4694                 /* we have to ensure the gfx device is idle before we flush */
4695                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4696                 intel_iommu_strict = 1;
4697         }
4698 }
4699 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4700 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4701 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4702 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4703
4704 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4705    ISOCH DMAR unit for the Azalia sound device, but not give it any
4706    TLB entries, which causes it to deadlock. Check for that.  We do
4707    this in a function called from init_dmars(), instead of in a PCI
4708    quirk, because we don't want to print the obnoxious "BIOS broken"
4709    message if VT-d is actually disabled.
4710 */
4711 static void __init check_tylersburg_isoch(void)
4712 {
4713         struct pci_dev *pdev;
4714         uint32_t vtisochctrl;
4715
4716         /* If there's no Azalia in the system anyway, forget it. */
4717         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4718         if (!pdev)
4719                 return;
4720         pci_dev_put(pdev);
4721
4722         /* System Management Registers. Might be hidden, in which case
4723            we can't do the sanity check. But that's OK, because the
4724            known-broken BIOSes _don't_ actually hide it, so far. */
4725         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4726         if (!pdev)
4727                 return;
4728
4729         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4730                 pci_dev_put(pdev);
4731                 return;
4732         }
4733
4734         pci_dev_put(pdev);
4735
4736         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4737         if (vtisochctrl & 1)
4738                 return;
4739
4740         /* Drop all bits other than the number of TLB entries */
4741         vtisochctrl &= 0x1c;
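        /* As the check and message below assume, the masked field holds the
           TLB entry count directly: 0x10 == 16 entries, 0 == none. */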
4742
4743         /* If we have the recommended number of TLB entries (16), fine. */
4744         if (vtisochctrl == 0x10)
4745                 return;
4746
4747         /* Zero TLB entries? You get to ride the short bus to school. */
4748         if (!vtisochctrl) {
4749                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4750                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4751                      dmi_get_system_info(DMI_BIOS_VENDOR),
4752                      dmi_get_system_info(DMI_BIOS_VERSION),
4753                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4754                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4755                 return;
4756         }
4757
4758         printk(KERN_WARNING "DMAR: Recommended number of TLB entries for the ISOCH DMAR unit is 16; your BIOS set %d\n",
4759                vtisochctrl);
4760 }