drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18  * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19  * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
20  */
21
22 #include <linux/init.h>
23 #include <linux/bitmap.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/sysdev.h>
28 #include <linux/spinlock.h>
29 #include <linux/pci.h>
30 #include <linux/dmar.h>
31 #include <linux/dma-mapping.h>
32 #include <linux/mempool.h>
33 #include "iova.h"
34 #include "intel-iommu.h"
35 #include <asm/proto.h> /* force_iommu is declared in this header on x86-64 */
36 #include <asm/cacheflush.h>
37 #include <asm/gart.h>
38 #include "pci.h"
39
40 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
41 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
42
43 #define IOAPIC_RANGE_START      (0xfee00000)
44 #define IOAPIC_RANGE_END        (0xfeefffff)
45 #define IOVA_START_ADDR         (0x1000)
46
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
48
49 #define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */
50
51 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
52
53 static void domain_remove_dev_info(struct dmar_domain *domain);
54
55 static int dmar_disabled;
56 static int __initdata dmar_map_gfx = 1;
57 static int dmar_forcedac;
58
59 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
60 static DEFINE_SPINLOCK(device_domain_lock);
61 static LIST_HEAD(device_domain_list);
62
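/*
 * Handle the "intel_iommu=" boot parameter.  The options are comma
 * separated:
 *   off      - disable DMAR/IOMMU support entirely
 *   igfx_off - leave translation on but skip mapping of graphics devices
 *   forcedac - force DAC (64-bit) addressing for PCI devices
 */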
63 static int __init intel_iommu_setup(char *str)
64 {
65         if (!str)
66                 return -EINVAL;
67         while (*str) {
68                 if (!strncmp(str, "off", 3)) {
69                         dmar_disabled = 1;
70                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
71                 } else if (!strncmp(str, "igfx_off", 8)) {
72                         dmar_map_gfx = 0;
73                         printk(KERN_INFO
74                                 "Intel-IOMMU: disable GFX device mapping\n");
75                 } else if (!strncmp(str, "forcedac", 8)) {
76                         printk (KERN_INFO
77                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
78                         dmar_forcedac = 1;
79                 }
80
81                 str += strcspn(str, ",");
82                 while (*str == ',')
83                         str++;
84         }
85         return 0;
86 }
87 __setup("intel_iommu=", intel_iommu_setup);
88
89 static struct kmem_cache *iommu_domain_cache;
90 static struct kmem_cache *iommu_devinfo_cache;
91 static struct kmem_cache *iommu_iova_cache;
92
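/*
 * Allocation helpers used below.  They may be called in atomic context,
 * so they use GFP_ATOMIC and temporarily set PF_MEMALLOC to allow dipping
 * into the emergency reserves; the caller's original PF_MEMALLOC state is
 * restored before returning.
 */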
93 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
94 {
95         unsigned int flags;
96         void *vaddr;
97
98         /* trying to avoid low memory issues */
99         flags = current->flags & PF_MEMALLOC;
100         current->flags |= PF_MEMALLOC;
101         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
102         current->flags &= (~PF_MEMALLOC | flags);
103         return vaddr;
104 }
105
106
107 static inline void *alloc_pgtable_page(void)
108 {
109         unsigned int flags;
110         void *vaddr;
111
112         /* trying to avoid low memory issues */
113         flags = current->flags & PF_MEMALLOC;
114         current->flags |= PF_MEMALLOC;
115         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
116         current->flags &= (~PF_MEMALLOC | flags);
117         return vaddr;
118 }
119
120 static inline void free_pgtable_page(void *vaddr)
121 {
122         free_page((unsigned long)vaddr);
123 }
124
125 static inline void *alloc_domain_mem(void)
126 {
127         return iommu_kmem_cache_alloc(iommu_domain_cache);
128 }
129
130 static inline void free_domain_mem(void *vaddr)
131 {
132         kmem_cache_free(iommu_domain_cache, vaddr);
133 }
134
135 static inline void * alloc_devinfo_mem(void)
136 {
137         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
138 }
139
140 static inline void free_devinfo_mem(void *vaddr)
141 {
142         kmem_cache_free(iommu_devinfo_cache, vaddr);
143 }
144
145 struct iova *alloc_iova_mem(void)
146 {
147         return iommu_kmem_cache_alloc(iommu_iova_cache);
148 }
149
150 void free_iova_mem(struct iova *iova)
151 {
152         kmem_cache_free(iommu_iova_cache, iova);
153 }
154
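/*
 * If this IOMMU does not snoop the CPU caches (the extended capability
 * coherency bit is clear), CPU updates to root entries, context entries
 * and page-table entries must be flushed with clflush before the hardware
 * can observe them; on coherent hardware this is a no-op.
 */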
155 static inline void __iommu_flush_cache(
156         struct intel_iommu *iommu, void *addr, int size)
157 {
158         if (!ecap_coherent(iommu->ecap))
159                 clflush_cache_range(addr, size);
160 }
161
162 /* Gets context entry for a given bus and devfn */
163 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
164                 u8 bus, u8 devfn)
165 {
166         struct root_entry *root;
167         struct context_entry *context;
168         unsigned long phy_addr;
169         unsigned long flags;
170
171         spin_lock_irqsave(&iommu->lock, flags);
172         root = &iommu->root_entry[bus];
173         context = get_context_addr_from_root(root);
174         if (!context) {
175                 context = (struct context_entry *)alloc_pgtable_page();
176                 if (!context) {
177                         spin_unlock_irqrestore(&iommu->lock, flags);
178                         return NULL;
179                 }
180                 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
181                 phy_addr = virt_to_phys((void *)context);
182                 set_root_value(root, phy_addr);
183                 set_root_present(root);
184                 __iommu_flush_cache(iommu, root, sizeof(*root));
185         }
186         spin_unlock_irqrestore(&iommu->lock, flags);
187         return &context[devfn];
188 }
189
190 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
191 {
192         struct root_entry *root;
193         struct context_entry *context;
194         int ret;
195         unsigned long flags;
196
197         spin_lock_irqsave(&iommu->lock, flags);
198         root = &iommu->root_entry[bus];
199         context = get_context_addr_from_root(root);
200         if (!context) {
201                 ret = 0;
202                 goto out;
203         }
204         ret = context_present(context[devfn]);
205 out:
206         spin_unlock_irqrestore(&iommu->lock, flags);
207         return ret;
208 }
209
210 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
211 {
212         struct root_entry *root;
213         struct context_entry *context;
214         unsigned long flags;
215
216         spin_lock_irqsave(&iommu->lock, flags);
217         root = &iommu->root_entry[bus];
218         context = get_context_addr_from_root(root);
219         if (context) {
220                 context_clear_entry(context[devfn]);
221                 __iommu_flush_cache(iommu, &context[devfn],
222                         sizeof(*context));
223         }
224         spin_unlock_irqrestore(&iommu->lock, flags);
225 }
226
227 static void free_context_table(struct intel_iommu *iommu)
228 {
229         struct root_entry *root;
230         int i;
231         unsigned long flags;
232         struct context_entry *context;
233
234         spin_lock_irqsave(&iommu->lock, flags);
235         if (!iommu->root_entry) {
236                 goto out;
237         }
238         for (i = 0; i < ROOT_ENTRY_NR; i++) {
239                 root = &iommu->root_entry[i];
240                 context = get_context_addr_from_root(root);
241                 if (context)
242                         free_pgtable_page(context);
243         }
244         free_pgtable_page(iommu->root_entry);
245         iommu->root_entry = NULL;
246 out:
247         spin_unlock_irqrestore(&iommu->lock, flags);
248 }
249
250 /* page table handling */
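/*
 * Each page-table level decodes LEVEL_STRIDE (9) bits of the address on
 * top of the 12-bit page offset, so an adjusted guest address width
 * (agaw) corresponds to a depth of agaw + 2 levels and a width of
 * 30 + 9 * agaw bits.  For example, agaw 2 gives a 48-bit address space
 * walked through 4 levels, matching DEFAULT_DOMAIN_ADDRESS_WIDTH above.
 */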
251 #define LEVEL_STRIDE            (9)
252 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
253
254 static inline int agaw_to_level(int agaw)
255 {
256         return agaw + 2;
257 }
258
259 static inline int agaw_to_width(int agaw)
260 {
261         return 30 + agaw * LEVEL_STRIDE;
262
263 }
264
265 static inline int width_to_agaw(int width)
266 {
267         return (width - 30) / LEVEL_STRIDE;
268 }
269
270 static inline unsigned int level_to_offset_bits(int level)
271 {
272         return (12 + (level - 1) * LEVEL_STRIDE);
273 }
274
275 static inline int address_level_offset(u64 addr, int level)
276 {
277         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
278 }
279
280 static inline u64 level_mask(int level)
281 {
282         return ((u64)-1 << level_to_offset_bits(level));
283 }
284
285 static inline u64 level_size(int level)
286 {
287         return ((u64)1 << level_to_offset_bits(level));
288 }
289
290 static inline u64 align_to_level(u64 addr, int level)
291 {
292         return ((addr + level_size(level) - 1) & level_mask(level));
293 }
294
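/*
 * Walk (and build) the domain's page table down to the level-1 entry that
 * maps the 4K page containing @addr.  Missing intermediate tables are
 * allocated on the way down, flushed for non-coherent hardware, and
 * created read/write; the leaf entry filled in by the caller controls the
 * actual access permissions.
 */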
295 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
296 {
297         int addr_width = agaw_to_width(domain->agaw);
298         struct dma_pte *parent, *pte = NULL;
299         int level = agaw_to_level(domain->agaw);
300         int offset;
301         unsigned long flags;
302
303         BUG_ON(!domain->pgd);
304
305         addr &= (((u64)1) << addr_width) - 1;
306         parent = domain->pgd;
307
308         spin_lock_irqsave(&domain->mapping_lock, flags);
309         while (level > 0) {
310                 void *tmp_page;
311
312                 offset = address_level_offset(addr, level);
313                 pte = &parent[offset];
314                 if (level == 1)
315                         break;
316
317                 if (!dma_pte_present(*pte)) {
318                         tmp_page = alloc_pgtable_page();
319
320                         if (!tmp_page) {
321                                 spin_unlock_irqrestore(&domain->mapping_lock,
322                                         flags);
323                                 return NULL;
324                         }
325                         __iommu_flush_cache(domain->iommu, tmp_page,
326                                         PAGE_SIZE_4K);
327                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
328                         /*
329                          * upper-level tables always set r/w; the last-level page
330                          * table controls read/write access
331                          */
332                         dma_set_pte_readable(*pte);
333                         dma_set_pte_writable(*pte);
334                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
335                 }
336                 parent = phys_to_virt(dma_pte_addr(*pte));
337                 level--;
338         }
339
340         spin_unlock_irqrestore(&domain->mapping_lock, flags);
341         return pte;
342 }
343
344 /* return address's pte at specific level */
345 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
346                 int level)
347 {
348         struct dma_pte *parent, *pte = NULL;
349         int total = agaw_to_level(domain->agaw);
350         int offset;
351
352         parent = domain->pgd;
353         while (level <= total) {
354                 offset = address_level_offset(addr, total);
355                 pte = &parent[offset];
356                 if (level == total)
357                         return pte;
358
359                 if (!dma_pte_present(*pte))
360                         break;
361                 parent = phys_to_virt(dma_pte_addr(*pte));
362                 total--;
363         }
364         return NULL;
365 }
366
367 /* clear one page's page table */
368 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
369 {
370         struct dma_pte *pte = NULL;
371
372         /* get last level pte */
373         pte = dma_addr_level_pte(domain, addr, 1);
374
375         if (pte) {
376                 dma_clear_pte(*pte);
377                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
378         }
379 }
380
381 /* clear last level pte, a tlb flush should be followed */
382 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
383 {
384         int addr_width = agaw_to_width(domain->agaw);
385
386         start &= (((u64)1) << addr_width) - 1;
387         end &= (((u64)1) << addr_width) - 1;
388         /* in case it's partial page */
389         start = PAGE_ALIGN_4K(start);
390         end &= PAGE_MASK_4K;
391
392         /* we don't need lock here, nobody else touches the iova range */
393         while (start < end) {
394                 dma_pte_clear_one(domain, start);
395                 start += PAGE_SIZE_4K;
396         }
397 }
398
399 /* free page table pages. last level pte should already be cleared */
400 static void dma_pte_free_pagetable(struct dmar_domain *domain,
401         u64 start, u64 end)
402 {
403         int addr_width = agaw_to_width(domain->agaw);
404         struct dma_pte *pte;
405         int total = agaw_to_level(domain->agaw);
406         int level;
407         u64 tmp;
408
409         start &= (((u64)1) << addr_width) - 1;
410         end &= (((u64)1) << addr_width) - 1;
411
412         /* we don't need lock here, nobody else touches the iova range */
413         level = 2;
414         while (level <= total) {
415                 tmp = align_to_level(start, level);
416                 if (tmp >= end || (tmp + level_size(level) > end))
417                         return;
418
419                 while (tmp < end) {
420                         pte = dma_addr_level_pte(domain, tmp, level);
421                         if (pte) {
422                                 free_pgtable_page(
423                                         phys_to_virt(dma_pte_addr(*pte)));
424                                 dma_clear_pte(*pte);
425                                 __iommu_flush_cache(domain->iommu,
426                                                 pte, sizeof(*pte));
427                         }
428                         tmp += level_size(level);
429                 }
430                 level++;
431         }
432         /* free pgd */
433         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
434                 free_pgtable_page(domain->pgd);
435                 domain->pgd = NULL;
436         }
437 }
438
439 /* iommu handling */
440 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
441 {
442         struct root_entry *root;
443         unsigned long flags;
444
445         root = (struct root_entry *)alloc_pgtable_page();
446         if (!root)
447                 return -ENOMEM;
448
449         __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
450
451         spin_lock_irqsave(&iommu->lock, flags);
452         iommu->root_entry = root;
453         spin_unlock_irqrestore(&iommu->lock, flags);
454
455         return 0;
456 }
457
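/*
 * Busy-wait helper for register-based commands: repeatedly read the
 * register at @offset with @op until @cond is true, and panic if the
 * hardware has not responded within DMAR_OPERATION_TIMEOUT (one minute).
 */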
458 #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
459 {\
460         unsigned long start_time = jiffies;\
461         while (1) {\
462                 sts = op (iommu->reg + offset);\
463                 if (cond)\
464                         break;\
465                 if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
466                         panic("DMAR hardware is malfunctioning\n");\
467                 cpu_relax();\
468         }\
469 }
470
471 static void iommu_set_root_entry(struct intel_iommu *iommu)
472 {
473         void *addr;
474         u32 cmd, sts;
475         unsigned long flag;
476
477         addr = iommu->root_entry;
478
479         spin_lock_irqsave(&iommu->register_lock, flag);
480         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
481
482         cmd = iommu->gcmd | DMA_GCMD_SRTP;
483         writel(cmd, iommu->reg + DMAR_GCMD_REG);
484
485         /* Make sure the hardware completes it */
486         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
487                 readl, (sts & DMA_GSTS_RTPS), sts);
488
489         spin_unlock_irqrestore(&iommu->register_lock, flag);
490 }
491
492 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
493 {
494         u32 val;
495         unsigned long flag;
496
497         if (!cap_rwbf(iommu->cap))
498                 return;
499         val = iommu->gcmd | DMA_GCMD_WBF;
500
501         spin_lock_irqsave(&iommu->register_lock, flag);
502         writel(val, iommu->reg + DMAR_GCMD_REG);
503
504         /* Make sure the hardware completes it */
505         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
506                         readl, (!(val & DMA_GSTS_WBFS)), val);
507
508         spin_unlock_irqrestore(&iommu->register_lock, flag);
509 }
510
511 /* the return value determines whether we need a write buffer flush */
512 static int __iommu_flush_context(struct intel_iommu *iommu,
513         u16 did, u16 source_id, u8 function_mask, u64 type,
514         int non_present_entry_flush)
515 {
516         u64 val = 0;
517         unsigned long flag;
518
519         /*
520          * In the non-present entry flush case, if the hardware doesn't cache
521          * non-present entries we do nothing, and if it does cache non-present
522          * entries we flush the entries of domain 0 (that domain id is used to
523          * cache any non-present entries)
524          */
525         if (non_present_entry_flush) {
526                 if (!cap_caching_mode(iommu->cap))
527                         return 1;
528                 else
529                         did = 0;
530         }
531
532         switch (type) {
533         case DMA_CCMD_GLOBAL_INVL:
534                 val = DMA_CCMD_GLOBAL_INVL;
535                 break;
536         case DMA_CCMD_DOMAIN_INVL:
537                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
538                 break;
539         case DMA_CCMD_DEVICE_INVL:
540                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
541                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
542                 break;
543         default:
544                 BUG();
545         }
546         val |= DMA_CCMD_ICC;
547
548         spin_lock_irqsave(&iommu->register_lock, flag);
549         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
550
551         /* Make sure the hardware completes it */
552         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
553                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
554
555         spin_unlock_irqrestore(&iommu->register_lock, flag);
556
557         /* flushing the context entry will implicitly flush the write buffer */
558         return 0;
559 }
560
561 static inline int iommu_flush_context_global(struct intel_iommu *iommu,
562         int non_present_entry_flush)
563 {
564         return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
565                 non_present_entry_flush);
566 }
567
568 static inline int iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
569         int non_present_entry_flush)
570 {
571         return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
572                 non_present_entry_flush);
573 }
574
575 static inline int iommu_flush_context_device(struct intel_iommu *iommu,
576         u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
577 {
578         return __iommu_flush_context(iommu, did, source_id, function_mask,
579                 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
580 }
581
582 /* the return value determines whether we need a write buffer flush */
583 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
584         u64 addr, unsigned int size_order, u64 type,
585         int non_present_entry_flush)
586 {
587         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
588         u64 val = 0, val_iva = 0;
589         unsigned long flag;
590
591         /*
592          * In the non-present entry flush case, if the hardware doesn't cache
593          * non-present entries we do nothing, and if it does cache non-present
594          * entries we flush the entries of domain 0 (that domain id is used to
595          * cache any non-present entries)
596          */
597         if (non_present_entry_flush) {
598                 if (!cap_caching_mode(iommu->cap))
599                         return 1;
600                 else
601                         did = 0;
602         }
603
604         switch (type) {
605         case DMA_TLB_GLOBAL_FLUSH:
606                 /* a global flush doesn't need to set IVA_REG */
607                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
608                 break;
609         case DMA_TLB_DSI_FLUSH:
610                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
611                 break;
612         case DMA_TLB_PSI_FLUSH:
613                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
614                 /* Note: always flush non-leaf currently */
615                 val_iva = size_order | addr;
616                 break;
617         default:
618                 BUG();
619         }
620         /* Note: set drain read/write */
621 #if 0
622         /*
623          * This is probably only needed to be extra safe; it looks like we
624          * can ignore it without any impact.
625          */
626         if (cap_read_drain(iommu->cap))
627                 val |= DMA_TLB_READ_DRAIN;
628 #endif
629         if (cap_write_drain(iommu->cap))
630                 val |= DMA_TLB_WRITE_DRAIN;
631
632         spin_lock_irqsave(&iommu->register_lock, flag);
633         /* Note: Only uses first TLB reg currently */
634         if (val_iva)
635                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
636         dmar_writeq(iommu->reg + tlb_offset + 8, val);
637
638         /* Make sure the hardware completes it */
639         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
640                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
641
642         spin_unlock_irqrestore(&iommu->register_lock, flag);
643
644         /* check IOTLB invalidation granularity */
645         if (DMA_TLB_IAIG(val) == 0)
646                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
647         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
648                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
649                         DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
650         /* flushing the IOTLB will implicitly flush the write buffer */
651         return 0;
652 }
653
654 static inline int iommu_flush_iotlb_global(struct intel_iommu *iommu,
655         int non_present_entry_flush)
656 {
657         return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
658                 non_present_entry_flush);
659 }
660
661 static inline int iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
662         int non_present_entry_flush)
663 {
664         return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
665                 non_present_entry_flush);
666 }
667
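/*
 * Page-selective IOTLB invalidation.  The hardware takes a base address
 * plus a mask that is the log2 of the number of 4K pages, so the request
 * is rounded up to a power of two; if PSI is unsupported, or the rounded
 * size exceeds the maximum mask the capability advertises, fall back to a
 * domain-selective flush.
 */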
668 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
669         u64 addr, unsigned int pages, int non_present_entry_flush)
670 {
671         unsigned int mask;
672
673         BUG_ON(addr & (~PAGE_MASK_4K));
674         BUG_ON(pages == 0);
675
676         /* Fallback to domain selective flush if no PSI support */
677         if (!cap_pgsel_inv(iommu->cap))
678                 return iommu_flush_iotlb_dsi(iommu, did,
679                         non_present_entry_flush);
680
681         /*
682          * PSI requires the flush size to be a power of 2, and the base address
683          * to be naturally aligned to that size
684          */
685         mask = ilog2(__roundup_pow_of_two(pages));
686         /* Fallback to domain selective flush if size is too big */
687         if (mask > cap_max_amask_val(iommu->cap))
688                 return iommu_flush_iotlb_dsi(iommu, did,
689                         non_present_entry_flush);
690
691         return __iommu_flush_iotlb(iommu, did, addr, mask,
692                 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
693 }
694
695 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
696 {
697         u32 pmen;
698         unsigned long flags;
699
700         spin_lock_irqsave(&iommu->register_lock, flags);
701         pmen = readl(iommu->reg + DMAR_PMEN_REG);
702         pmen &= ~DMA_PMEN_EPM;
703         writel(pmen, iommu->reg + DMAR_PMEN_REG);
704
705         /* wait for the protected region status bit to clear */
706         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
707                 readl, !(pmen & DMA_PMEN_PRS), pmen);
708
709         spin_unlock_irqrestore(&iommu->register_lock, flags);
710 }
711
712 static int iommu_enable_translation(struct intel_iommu *iommu)
713 {
714         u32 sts;
715         unsigned long flags;
716
717         spin_lock_irqsave(&iommu->register_lock, flags);
718         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
719
720         /* Make sure the hardware completes it */
721         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
722                 readl, (sts & DMA_GSTS_TES), sts);
723
724         iommu->gcmd |= DMA_GCMD_TE;
725         spin_unlock_irqrestore(&iommu->register_lock, flags);
726         return 0;
727 }
728
729 static int iommu_disable_translation(struct intel_iommu *iommu)
730 {
731         u32 sts;
732         unsigned long flag;
733
734         spin_lock_irqsave(&iommu->register_lock, flag);
735         iommu->gcmd &= ~DMA_GCMD_TE;
736         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
737
738         /* Make sure the hardware completes it */
739         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
740                 readl, (!(sts & DMA_GSTS_TES)), sts);
741
742         spin_unlock_irqrestore(&iommu->register_lock, flag);
743         return 0;
744 }
745
746 /* iommu interrupt handling. Most of it is MSI-like. */
747
748 static const char *fault_reason_strings[] =
749 {
750         "Software",
751         "Present bit in root entry is clear",
752         "Present bit in context entry is clear",
753         "Invalid context entry",
754         "Access beyond MGAW",
755         "PTE Write access is not set",
756         "PTE Read access is not set",
757         "Next page table ptr is invalid",
758         "Root table address invalid",
759         "Context table ptr is invalid",
760         "non-zero reserved fields in RTP",
761         "non-zero reserved fields in CTP",
762         "non-zero reserved fields in PTE",
763 };
764 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
765
766 const char *dmar_get_fault_reason(u8 fault_reason)
767 {
768         if (fault_reason > MAX_FAULT_REASON_IDX)
769                 return "Unknown";
770         else
771                 return fault_reason_strings[fault_reason];
772 }
773
774 void dmar_msi_unmask(unsigned int irq)
775 {
776         struct intel_iommu *iommu = get_irq_data(irq);
777         unsigned long flag;
778
779         /* unmask it */
780         spin_lock_irqsave(&iommu->register_lock, flag);
781         writel(0, iommu->reg + DMAR_FECTL_REG);
782         /* Read a reg to force flush the post write */
783         readl(iommu->reg + DMAR_FECTL_REG);
784         spin_unlock_irqrestore(&iommu->register_lock, flag);
785 }
786
787 void dmar_msi_mask(unsigned int irq)
788 {
789         unsigned long flag;
790         struct intel_iommu *iommu = get_irq_data(irq);
791
792         /* mask it */
793         spin_lock_irqsave(&iommu->register_lock, flag);
794         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
795         /* Read a reg to force flush the post write */
796         readl(iommu->reg + DMAR_FECTL_REG);
797         spin_unlock_irqrestore(&iommu->register_lock, flag);
798 }
799
800 void dmar_msi_write(int irq, struct msi_msg *msg)
801 {
802         struct intel_iommu *iommu = get_irq_data(irq);
803         unsigned long flag;
804
805         spin_lock_irqsave(&iommu->register_lock, flag);
806         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
807         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
808         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
809         spin_unlock_irqrestore(&iommu->register_lock, flag);
810 }
811
812 void dmar_msi_read(int irq, struct msi_msg *msg)
813 {
814         struct intel_iommu *iommu = get_irq_data(irq);
815         unsigned long flag;
816
817         spin_lock_irqsave(&iommu->register_lock, flag);
818         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
819         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
820         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
821         spin_unlock_irqrestore(&iommu->register_lock, flag);
822 }
823
824 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
825                 u8 fault_reason, u16 source_id, u64 addr)
826 {
827         const char *reason;
828
829         reason = dmar_get_fault_reason(fault_reason);
830
831         printk(KERN_ERR
832                 "DMAR:[%s] Request device [%02x:%02x.%d] "
833                 "fault addr %llx \n"
834                 "DMAR:[fault reason %02d] %s\n",
835                 (type ? "DMA Read" : "DMA Write"),
836                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
837                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
838         return 0;
839 }
840
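/*
 * Primary fault interrupt handler.  Starting at the fault record index
 * reported in the fault status register, walk the 16-byte fault recording
 * registers, log each pending fault (type, reason, source id and faulting
 * address), write the F bit back to clear the record, and finally clear
 * the primary fault overflow flag if it was set.
 */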
841 #define PRIMARY_FAULT_REG_LEN (16)
842 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
843 {
844         struct intel_iommu *iommu = dev_id;
845         int reg, fault_index;
846         u32 fault_status;
847         unsigned long flag;
848
849         spin_lock_irqsave(&iommu->register_lock, flag);
850         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
851
852         /* TBD: ignore advanced fault log currently */
853         if (!(fault_status & DMA_FSTS_PPF))
854                 goto clear_overflow;
855
856         fault_index = dma_fsts_fault_record_index(fault_status);
857         reg = cap_fault_reg_offset(iommu->cap);
858         while (1) {
859                 u8 fault_reason;
860                 u16 source_id;
861                 u64 guest_addr;
862                 int type;
863                 u32 data;
864
865                 /* highest 32 bits */
866                 data = readl(iommu->reg + reg +
867                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
868                 if (!(data & DMA_FRCD_F))
869                         break;
870
871                 fault_reason = dma_frcd_fault_reason(data);
872                 type = dma_frcd_type(data);
873
874                 data = readl(iommu->reg + reg +
875                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
876                 source_id = dma_frcd_source_id(data);
877
878                 guest_addr = dmar_readq(iommu->reg + reg +
879                                 fault_index * PRIMARY_FAULT_REG_LEN);
880                 guest_addr = dma_frcd_page_addr(guest_addr);
881                 /* clear the fault */
882                 writel(DMA_FRCD_F, iommu->reg + reg +
883                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
884
885                 spin_unlock_irqrestore(&iommu->register_lock, flag);
886
887                 iommu_page_fault_do_one(iommu, type, fault_reason,
888                                 source_id, guest_addr);
889
890                 fault_index++;
891                 if (fault_index > cap_num_fault_regs(iommu->cap))
892                         fault_index = 0;
893                 spin_lock_irqsave(&iommu->register_lock, flag);
894         }
895 clear_overflow:
896         /* clear primary fault overflow */
897         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
898         if (fault_status & DMA_FSTS_PFO)
899                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
900
901         spin_unlock_irqrestore(&iommu->register_lock, flag);
902         return IRQ_HANDLED;
903 }
904
905 int dmar_set_interrupt(struct intel_iommu *iommu)
906 {
907         int irq, ret;
908
909         irq = create_irq();
910         if (!irq) {
911                 printk(KERN_ERR "IOMMU: no free vectors\n");
912                 return -EINVAL;
913         }
914
915         set_irq_data(irq, iommu);
916         iommu->irq = irq;
917
918         ret = arch_setup_dmar_msi(irq);
919         if (ret) {
920                 set_irq_data(irq, NULL);
921                 iommu->irq = 0;
922                 destroy_irq(irq);
923                 return 0;
924         }
925
926         /* Call the handler once to make sure pending faults are cleared */
927         iommu_page_fault(irq, iommu);
928
929         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
930         if (ret)
931                 printk(KERN_ERR "IOMMU: can't request irq\n");
932         return ret;
933 }
934
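/*
 * Allocate the per-IOMMU domain bookkeeping: a bitmap of domain ids sized
 * from the capability register and an array of domain pointers.  With
 * caching mode set, domain id 0 is reserved because the hardware tags
 * cached non-present entries with it.
 */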
935 static int iommu_init_domains(struct intel_iommu *iommu)
936 {
937         unsigned long ndomains;
938         unsigned long nlongs;
939
940         ndomains = cap_ndoms(iommu->cap);
941         pr_debug("Number of Domains supported <%ld>\n", ndomains);
942         nlongs = BITS_TO_LONGS(ndomains);
943
944         /* TBD: there might be 64K domains;
945          * consider a different allocation scheme for future chips
946          */
947         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
948         if (!iommu->domain_ids) {
949                 printk(KERN_ERR "Allocating domain id array failed\n");
950                 return -ENOMEM;
951         }
952         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
953                         GFP_KERNEL);
954         if (!iommu->domains) {
955                 printk(KERN_ERR "Allocating domain array failed\n");
956                 kfree(iommu->domain_ids);
957                 return -ENOMEM;
958         }
959
960         /*
961          * if Caching mode is set, then invalid translations are tagged
962          * with domainid 0. Hence we need to pre-allocate it.
963          */
964         if (cap_caching_mode(iommu->cap))
965                 set_bit(0, iommu->domain_ids);
966         return 0;
967 }
968
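/*
 * Map a DRHD unit's register window and read its capability and extended
 * capability registers.  The initial 4K mapping is enlarged if the IOTLB
 * registers or fault recording registers live beyond the first page;
 * domain bookkeeping and the per-IOMMU locks are then initialized.
 */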
969 static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
970 {
971         struct intel_iommu *iommu;
972         int ret;
973         int map_size;
974         u32 ver;
975
976         iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
977         if (!iommu)
978                 return NULL;
979         iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
980         if (!iommu->reg) {
981                 printk(KERN_ERR "IOMMU: can't map the region\n");
982                 goto error;
983         }
984         iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
985         iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
986
987         /* the registers might be more than one page */
988         map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
989                 cap_max_fault_reg_offset(iommu->cap));
990         map_size = PAGE_ALIGN_4K(map_size);
991         if (map_size > PAGE_SIZE_4K) {
992                 iounmap(iommu->reg);
993                 iommu->reg = ioremap(drhd->reg_base_addr, map_size);
994                 if (!iommu->reg) {
995                         printk(KERN_ERR "IOMMU: can't map the region\n");
996                         goto error;
997                 }
998         }
999
1000         ver = readl(iommu->reg + DMAR_VER_REG);
1001         pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
1002                 drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
1003                 iommu->cap, iommu->ecap);
1004         ret = iommu_init_domains(iommu);
1005         if (ret)
1006                 goto error_unmap;
1007         spin_lock_init(&iommu->lock);
1008         spin_lock_init(&iommu->register_lock);
1009
1010         drhd->iommu = iommu;
1011         return iommu;
1012 error_unmap:
1013         iounmap(iommu->reg);
1014 error:
1015         kfree(iommu);
1016         return NULL;
1017 }
1018
1019 static void domain_exit(struct dmar_domain *domain);
1020 static void free_iommu(struct intel_iommu *iommu)
1021 {
1022         struct dmar_domain *domain;
1023         int i;
1024
1025         if (!iommu)
1026                 return;
1027
1028         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1029         for (; i < cap_ndoms(iommu->cap); ) {
1030                 domain = iommu->domains[i];
1031                 clear_bit(i, iommu->domain_ids);
1032                 domain_exit(domain);
1033                 i = find_next_bit(iommu->domain_ids,
1034                         cap_ndoms(iommu->cap), i+1);
1035         }
1036
1037         if (iommu->gcmd & DMA_GCMD_TE)
1038                 iommu_disable_translation(iommu);
1039
1040         if (iommu->irq) {
1041                 set_irq_data(iommu->irq, NULL);
1042                 /* This will mask the irq */
1043                 free_irq(iommu->irq, iommu);
1044                 destroy_irq(iommu->irq);
1045         }
1046
1047         kfree(iommu->domains);
1048         kfree(iommu->domain_ids);
1049
1050         /* free context mapping */
1051         free_context_table(iommu);
1052
1053         if (iommu->reg)
1054                 iounmap(iommu->reg);
1055         kfree(iommu);
1056 }
1057
1058 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1059 {
1060         unsigned long num;
1061         unsigned long ndomains;
1062         struct dmar_domain *domain;
1063         unsigned long flags;
1064
1065         domain = alloc_domain_mem();
1066         if (!domain)
1067                 return NULL;
1068
1069         ndomains = cap_ndoms(iommu->cap);
1070
1071         spin_lock_irqsave(&iommu->lock, flags);
1072         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1073         if (num >= ndomains) {
1074                 spin_unlock_irqrestore(&iommu->lock, flags);
1075                 free_domain_mem(domain);
1076                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1077                 return NULL;
1078         }
1079
1080         set_bit(num, iommu->domain_ids);
1081         domain->id = num;
1082         domain->iommu = iommu;
1083         iommu->domains[num] = domain;
1084         spin_unlock_irqrestore(&iommu->lock, flags);
1085
1086         return domain;
1087 }
1088
1089 static void iommu_free_domain(struct dmar_domain *domain)
1090 {
1091         unsigned long flags;
1092
1093         spin_lock_irqsave(&domain->iommu->lock, flags);
1094         clear_bit(domain->id, domain->iommu->domain_ids);
1095         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1096 }
1097
1098 static struct iova_domain reserved_iova_list;
1099
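/*
 * Record the IOVA ranges that must never be handed out to devices: the
 * IOAPIC window and every PCI memory resource, so that DMA addresses
 * cannot alias MMIO and be routed as peer-to-peer accesses.
 */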
1100 static void dmar_init_reserved_ranges(void)
1101 {
1102         struct pci_dev *pdev = NULL;
1103         struct iova *iova;
1104         int i;
1105         u64 addr, size;
1106
1107         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1108
1109         /* IOAPIC ranges shouldn't be accessed by DMA */
1110         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1111                 IOVA_PFN(IOAPIC_RANGE_END));
1112         if (!iova)
1113                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1114
1115         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1116         for_each_pci_dev(pdev) {
1117                 struct resource *r;
1118
1119                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1120                         r = &pdev->resource[i];
1121                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1122                                 continue;
1123                         addr = r->start;
1124                         addr &= PAGE_MASK_4K;
1125                         size = r->end - addr;
1126                         size = PAGE_ALIGN_4K(size);
1127                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1128                                 IOVA_PFN(size + addr) - 1);
1129                         if (!iova)
1130                                 printk(KERN_ERR "Reserve iova failed\n");
1131                 }
1132         }
1133
1134 }
1135
1136 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1137 {
1138         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1139 }
1140
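/*
 * Round a guest address width up to the nearest width the page-table
 * format can represent (a 12-bit page offset plus whole 9-bit table
 * indexes), capped at 64 bits.  For example, a gaw of 48 is already
 * representable, while a gaw of 36 is rounded up to 39.
 */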
1141 static inline int guestwidth_to_adjustwidth(int gaw)
1142 {
1143         int agaw;
1144         int r = (gaw - 12) % 9;
1145
1146         if (r == 0)
1147                 agaw = gaw;
1148         else
1149                 agaw = gaw + 9 - r;
1150         if (agaw > 64)
1151                 agaw = 64;
1152         return agaw;
1153 }
1154
1155 static int domain_init(struct dmar_domain *domain, int guest_width)
1156 {
1157         struct intel_iommu *iommu;
1158         int adjust_width, agaw;
1159         unsigned long sagaw;
1160
1161         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1162         spin_lock_init(&domain->mapping_lock);
1163
1164         domain_reserve_special_ranges(domain);
1165
1166         /* calculate AGAW */
1167         iommu = domain->iommu;
1168         if (guest_width > cap_mgaw(iommu->cap))
1169                 guest_width = cap_mgaw(iommu->cap);
1170         domain->gaw = guest_width;
1171         adjust_width = guestwidth_to_adjustwidth(guest_width);
1172         agaw = width_to_agaw(adjust_width);
1173         sagaw = cap_sagaw(iommu->cap);
1174         if (!test_bit(agaw, &sagaw)) {
1175                 /* hardware doesn't support it, choose a bigger one */
1176                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1177                 agaw = find_next_bit(&sagaw, 5, agaw);
1178                 if (agaw >= 5)
1179                         return -ENODEV;
1180         }
1181         domain->agaw = agaw;
1182         INIT_LIST_HEAD(&domain->devices);
1183
1184         /* always allocate the top pgd */
1185         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1186         if (!domain->pgd)
1187                 return -ENOMEM;
1188         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1189         return 0;
1190 }
1191
1192 static void domain_exit(struct dmar_domain *domain)
1193 {
1194         u64 end;
1195
1196         /* Domain 0 is reserved, so don't process it */
1197         if (!domain)
1198                 return;
1199
1200         domain_remove_dev_info(domain);
1201         /* destroy iovas */
1202         put_iova_domain(&domain->iovad);
1203         end = DOMAIN_MAX_ADDR(domain->gaw);
1204         end = end & (~PAGE_MASK_4K);
1205
1206         /* clear ptes */
1207         dma_pte_clear_range(domain, 0, end);
1208
1209         /* free page tables */
1210         dma_pte_free_pagetable(domain, 0, end);
1211
1212         iommu_free_domain(domain);
1213         free_domain_mem(domain);
1214 }
1215
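/*
 * Point the context entry for (bus, devfn) at this domain: set the domain
 * id, address width, page-table root and translation type, mark the entry
 * present and flush it.  Since this is a non-present to present change,
 * the context cache and/or write buffer is flushed afterwards per the
 * non-present-entry flush rules above.
 */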
1216 static int domain_context_mapping_one(struct dmar_domain *domain,
1217                 u8 bus, u8 devfn)
1218 {
1219         struct context_entry *context;
1220         struct intel_iommu *iommu = domain->iommu;
1221         unsigned long flags;
1222
1223         pr_debug("Set context mapping for %02x:%02x.%d\n",
1224                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1225         BUG_ON(!domain->pgd);
1226         context = device_to_context_entry(iommu, bus, devfn);
1227         if (!context)
1228                 return -ENOMEM;
1229         spin_lock_irqsave(&iommu->lock, flags);
1230         if (context_present(*context)) {
1231                 spin_unlock_irqrestore(&iommu->lock, flags);
1232                 return 0;
1233         }
1234
1235         context_set_domain_id(*context, domain->id);
1236         context_set_address_width(*context, domain->agaw);
1237         context_set_address_root(*context, virt_to_phys(domain->pgd));
1238         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1239         context_set_fault_enable(*context);
1240         context_set_present(*context);
1241         __iommu_flush_cache(iommu, context, sizeof(*context));
1242
1243         /* it's a non-present to present mapping */
1244         if (iommu_flush_context_device(iommu, domain->id,
1245                         (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1246                 iommu_flush_write_buffer(iommu);
1247         else
1248                 iommu_flush_iotlb_dsi(iommu, 0, 0);
1249         spin_unlock_irqrestore(&iommu->lock, flags);
1250         return 0;
1251 }
1252
1253 static int
1254 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1255 {
1256         int ret;
1257         struct pci_dev *tmp, *parent;
1258
1259         ret = domain_context_mapping_one(domain, pdev->bus->number,
1260                 pdev->devfn);
1261         if (ret)
1262                 return ret;
1263
1264         /* dependent device mapping */
1265         tmp = pci_find_upstream_pcie_bridge(pdev);
1266         if (!tmp)
1267                 return 0;
1268         /* Secondary interface's bus number and devfn 0 */
1269         parent = pdev->bus->self;
1270         while (parent != tmp) {
1271                 ret = domain_context_mapping_one(domain, parent->bus->number,
1272                         parent->devfn);
1273                 if (ret)
1274                         return ret;
1275                 parent = parent->bus->self;
1276         }
1277         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1278                 return domain_context_mapping_one(domain,
1279                         tmp->subordinate->number, 0);
1280         else /* this is a legacy PCI bridge */
1281                 return domain_context_mapping_one(domain,
1282                         tmp->bus->number, tmp->devfn);
1283 }
1284
1285 static int domain_context_mapped(struct dmar_domain *domain,
1286         struct pci_dev *pdev)
1287 {
1288         int ret;
1289         struct pci_dev *tmp, *parent;
1290
1291         ret = device_context_mapped(domain->iommu,
1292                 pdev->bus->number, pdev->devfn);
1293         if (!ret)
1294                 return ret;
1295         /* dependent device mapping */
1296         tmp = pci_find_upstream_pcie_bridge(pdev);
1297         if (!tmp)
1298                 return ret;
1299         /* Secondary interface's bus number and devfn 0 */
1300         parent = pdev->bus->self;
1301         while (parent != tmp) {
1302                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1303                         parent->devfn);
1304                 if (!ret)
1305                         return ret;
1306                 parent = parent->bus->self;
1307         }
1308         if (tmp->is_pcie)
1309                 return device_context_mapped(domain->iommu,
1310                         tmp->subordinate->number, 0);
1311         else
1312                 return device_context_mapped(domain->iommu,
1313                         tmp->bus->number, tmp->devfn);
1314 }
1315
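/*
 * Map the physical range [hpa, hpa + size) at IO virtual address @iova,
 * one 4K page at a time, applying the DMA_PTE_READ/DMA_PTE_WRITE bits
 * from @prot to each leaf PTE.  The caller owns the IOVA range, so no
 * locking is needed here.
 */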
1316 static int
1317 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1318                         u64 hpa, size_t size, int prot)
1319 {
1320         u64 start_pfn, end_pfn;
1321         struct dma_pte *pte;
1322         int index;
1323
1324         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1325                 return -EINVAL;
1326         iova &= PAGE_MASK_4K;
1327         start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1328         end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1329         index = 0;
1330         while (start_pfn < end_pfn) {
1331                 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1332                 if (!pte)
1333                         return -ENOMEM;
1334                 /* We don't need lock here, nobody else
1335                  * touches the iova range
1336                  */
1337                 BUG_ON(dma_pte_addr(*pte));
1338                 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1339                 dma_set_pte_prot(*pte, prot);
1340                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1341                 start_pfn++;
1342                 index++;
1343         }
1344         return 0;
1345 }
1346
1347 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1348 {
1349         clear_context_table(domain->iommu, bus, devfn);
1350         iommu_flush_context_global(domain->iommu, 0);
1351         iommu_flush_iotlb_global(domain->iommu, 0);
1352 }
1353
1354 static void domain_remove_dev_info(struct dmar_domain *domain)
1355 {
1356         struct device_domain_info *info;
1357         unsigned long flags;
1358
1359         spin_lock_irqsave(&device_domain_lock, flags);
1360         while (!list_empty(&domain->devices)) {
1361                 info = list_entry(domain->devices.next,
1362                         struct device_domain_info, link);
1363                 list_del(&info->link);
1364                 list_del(&info->global);
1365                 if (info->dev)
1366                         info->dev->dev.archdata.iommu = NULL;
1367                 spin_unlock_irqrestore(&device_domain_lock, flags);
1368
1369                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1370                 free_devinfo_mem(info);
1371
1372                 spin_lock_irqsave(&device_domain_lock, flags);
1373         }
1374         spin_unlock_irqrestore(&device_domain_lock, flags);
1375 }
1376
1377 /*
1378  * find_domain
1379  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1380  */
1381 struct dmar_domain *
1382 find_domain(struct pci_dev *pdev)
1383 {
1384         struct device_domain_info *info;
1385
1386         /* No lock here, assumes no domain exit in normal case */
1387         info = pdev->dev.archdata.iommu;
1388         if (info)
1389                 return info->domain;
1390         return NULL;
1391 }
1392
1393 static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
1394      struct pci_dev *dev)
1395 {
1396         int index;
1397
1398         while (dev) {
1399                 for (index = 0; index < cnt; index ++)
1400                         if (dev == devices[index])
1401                                 return 1;
1402
1403                 /* Check our parent */
1404                 dev = dev->bus->self;
1405         }
1406
1407         return 0;
1408 }
1409
1410 static struct dmar_drhd_unit *
1411 dmar_find_matched_drhd_unit(struct pci_dev *dev)
1412 {
1413         struct dmar_drhd_unit *drhd = NULL;
1414
1415         list_for_each_entry(drhd, &dmar_drhd_units, list) {
1416                 if (drhd->include_all || dmar_pci_device_match(drhd->devices,
1417                                                 drhd->devices_cnt, dev))
1418                         return drhd;
1419         }
1420
1421         return NULL;
1422 }
1423
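/*
 * Find or create the remapping domain for a device: reuse the device's
 * existing domain if it has one, share the domain registered for an
 * upstream PCIe-to-PCI bridge when there is one (all devices behind such
 * a bridge end up in the same, multi-device domain), and otherwise
 * allocate a fresh domain on the DRHD unit that covers the device.
 */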
1424 /* domain is initialized */
1425 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1426 {
1427         struct dmar_domain *domain, *found = NULL;
1428         struct intel_iommu *iommu;
1429         struct dmar_drhd_unit *drhd;
1430         struct device_domain_info *info, *tmp;
1431         struct pci_dev *dev_tmp;
1432         unsigned long flags;
1433         int bus = 0, devfn = 0;
1434
1435         domain = find_domain(pdev);
1436         if (domain)
1437                 return domain;
1438
1439         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1440         if (dev_tmp) {
1441                 if (dev_tmp->is_pcie) {
1442                         bus = dev_tmp->subordinate->number;
1443                         devfn = 0;
1444                 } else {
1445                         bus = dev_tmp->bus->number;
1446                         devfn = dev_tmp->devfn;
1447                 }
1448                 spin_lock_irqsave(&device_domain_lock, flags);
1449                 list_for_each_entry(info, &device_domain_list, global) {
1450                         if (info->bus == bus && info->devfn == devfn) {
1451                                 found = info->domain;
1452                                 break;
1453                         }
1454                 }
1455                 spin_unlock_irqrestore(&device_domain_lock, flags);
1456                 /* the pcie-to-pci bridge already has a domain, use it */
1457                 if (found) {
1458                         domain = found;
1459                         goto found_domain;
1460                 }
1461         }
1462
1463         /* Allocate new domain for the device */
1464         drhd = dmar_find_matched_drhd_unit(pdev);
1465         if (!drhd) {
1466                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1467                         pci_name(pdev));
1468                 return NULL;
1469         }
1470         iommu = drhd->iommu;
1471
1472         domain = iommu_alloc_domain(iommu);
1473         if (!domain)
1474                 goto error;
1475
1476         if (domain_init(domain, gaw)) {
1477                 domain_exit(domain);
1478                 goto error;
1479         }
1480
1481         /* register pcie-to-pci device */
1482         if (dev_tmp) {
1483                 info = alloc_devinfo_mem();
1484                 if (!info) {
1485                         domain_exit(domain);
1486                         goto error;
1487                 }
1488                 info->bus = bus;
1489                 info->devfn = devfn;
1490                 info->dev = NULL;
1491                 info->domain = domain;
1492                 /* This domain is shared by devices under p2p bridge */
1493                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1494
1495                 /* the pcie-to-pci bridge already has a domain, use it */
1496                 found = NULL;
1497                 spin_lock_irqsave(&device_domain_lock, flags);
1498                 list_for_each_entry(tmp, &device_domain_list, global) {
1499                         if (tmp->bus == bus && tmp->devfn == devfn) {
1500                                 found = tmp->domain;
1501                                 break;
1502                         }
1503                 }
1504                 if (found) {
1505                         free_devinfo_mem(info);
1506                         domain_exit(domain);
1507                         domain = found;
1508                 } else {
1509                         list_add(&info->link, &domain->devices);
1510                         list_add(&info->global, &device_domain_list);
1511                 }
1512                 spin_unlock_irqrestore(&device_domain_lock, flags);
1513         }
1514
1515 found_domain:
1516         info = alloc_devinfo_mem();
1517         if (!info)
1518                 goto error;
1519         info->bus = pdev->bus->number;
1520         info->devfn = pdev->devfn;
1521         info->dev = pdev;
1522         info->domain = domain;
1523         spin_lock_irqsave(&device_domain_lock, flags);
1524         /* somebody is fast */
1525         found = find_domain(pdev);
1526         if (found != NULL) {
1527                 spin_unlock_irqrestore(&device_domain_lock, flags);
1528                 if (found != domain) {
1529                         domain_exit(domain);
1530                         domain = found;
1531                 }
1532                 free_devinfo_mem(info);
1533                 return domain;
1534         }
1535         list_add(&info->link, &domain->devices);
1536         list_add(&info->global, &device_domain_list);
1537         pdev->dev.archdata.iommu = info;
1538         spin_unlock_irqrestore(&device_domain_lock, flags);
1539         return domain;
1540 error:
1541         /* recheck it here, maybe others set it */
1542         return find_domain(pdev);
1543 }
1544
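/*
 * Set up a 1:1 (identity) mapping of [start, end) for a device, used for
 * RMRR regions and the graphics/ISA workarounds below: reserve the IOVA
 * range so it is never reallocated, map each page with iova equal to the
 * physical address, then install the device's context entry.
 */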
1545 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1546 {
1547         struct dmar_domain *domain;
1548         unsigned long size;
1549         u64 base;
1550         int ret;
1551
1552         printk(KERN_INFO
1553                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1554                 pci_name(pdev), start, end);
1555         /* page table init */
1556         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1557         if (!domain)
1558                 return -ENOMEM;
1559
1560         /* The address might not be aligned */
1561         base = start & PAGE_MASK_4K;
1562         size = end - base;
1563         size = PAGE_ALIGN_4K(size);
1564         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1565                         IOVA_PFN(base + size) - 1)) {
1566                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1567                 ret = -ENOMEM;
1568                 goto error;
1569         }
1570
1571         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1572                 size, base, pci_name(pdev));
1573         /*
1574          * The RMRR range might overlap with a physical memory range,
1575          * so clear any existing mapping first
1576          */
1577         dma_pte_clear_range(domain, base, base + size);
1578
1579         ret = domain_page_mapping(domain, base, base, size,
1580                 DMA_PTE_READ|DMA_PTE_WRITE);
1581         if (ret)
1582                 goto error;
1583
1584         /* context entry init */
1585         ret = domain_context_mapping(domain, pdev);
1586         if (!ret)
1587                 return 0;
1588 error:
1589         domain_exit(domain);
1590         return ret;
1592 }
1593
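/*
 * Identity-map the RMRR region for @pdev unless translation is already
 * bypassed for the device (dummy domain info).
 */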
1594 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1595         struct pci_dev *pdev)
1596 {
1597         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1598                 return 0;
1599         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1600                 rmrr->end_address + 1);
1601 }
1602
1603 #ifdef CONFIG_DMAR_GFX_WA
1604 extern int arch_get_ram_range(int slot, u64 *addr, u64 *size);
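/*
 * Graphics workaround: give every graphics device that is not already
 * bypassed a 1:1 mapping of the system RAM ranges reported by
 * arch_get_ram_range().
 */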
1605 static void __init iommu_prepare_gfx_mapping(void)
1606 {
1607         struct pci_dev *pdev = NULL;
1608         u64 base, size;
1609         int slot;
1610         int ret;
1611
1612         for_each_pci_dev(pdev) {
1613                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1614                                 !IS_GFX_DEVICE(pdev))
1615                         continue;
1616                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1617                         pci_name(pdev));
1618                 slot = arch_get_ram_range(0, &base, &size);
1619                 while (slot >= 0) {
1620                         ret = iommu_prepare_identity_map(pdev,
1621                                         base, base + size);
1622                         if (ret)
1623                                 goto error;
1624                         slot = arch_get_ram_range(slot, &base, &size);
1625                 }
1626                 continue;
1627 error:
1628                 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1629         }
1630 }
1631 #endif
1632
1633 #ifdef CONFIG_DMAR_FLOPPY_WA
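/*
 * Floppy workaround: identity-map the low 16MB for the ISA/LPC bridge so
 * that legacy floppy DMA keeps working behind the IOMMU.
 */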
1634 static inline void iommu_prepare_isa(void)
1635 {
1636         struct pci_dev *pdev;
1637         int ret;
1638
1639         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1640         if (!pdev)
1641                 return;
1642
1643         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1644         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1645
1646         if (ret)
1647                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1648                         "floppy might not work\n");
1649
1650 }
1651 #else
1652 static inline void iommu_prepare_isa(void)
1653 {
1654         return;
1655 }
1656 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1657
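/*
 * Bring up all DMAR units: allocate an intel_iommu and root entry per
 * DRHD, set up the RMRR and workaround identity maps, then program the
 * root entry, flush the caches and enable translation on each unit.
 */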
1658 int __init init_dmars(void)
1659 {
1660         struct dmar_drhd_unit *drhd;
1661         struct dmar_rmrr_unit *rmrr;
1662         struct pci_dev *pdev;
1663         struct intel_iommu *iommu;
1664         int ret, unit = 0;
1665
1666         /*
1667          * for each drhd
1668          *    allocate root
1669          *    initialize and program root entry to not present
1670          * endfor
1671          */
1672         for_each_drhd_unit(drhd) {
1673                 if (drhd->ignored)
1674                         continue;
1675                 iommu = alloc_iommu(drhd);
1676                 if (!iommu) {
1677                         ret = -ENOMEM;
1678                         goto error;
1679                 }
1680
1681                 /*
1682                  * TBD:
1683                  * we could share the same root & context tables
1684                  * among all IOMMUs. Need to split it out later.
1685                  */
1686                 ret = iommu_alloc_root_entry(iommu);
1687                 if (ret) {
1688                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1689                         goto error;
1690                 }
1691         }
1692
1693         /*
1694          * For each rmrr
1695          *   for each dev attached to rmrr
1696          *   do
1697          *     locate drhd for dev, alloc domain for dev
1698          *     allocate free domain
1699          *     allocate page table entries for rmrr
1700          *     if context not allocated for bus
1701          *           allocate and init context
1702          *           set present in root table for this bus
1703          *     init context with domain, translation etc
1704          *    endfor
1705          * endfor
1706          */
1707         for_each_rmrr_units(rmrr) {
1708                 int i;
1709                 for (i = 0; i < rmrr->devices_cnt; i++) {
1710                         pdev = rmrr->devices[i];
1711                         /* some BIOSes list non-existent devices in the DMAR table */
1712                         if (!pdev)
1713                                 continue;
1714                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1715                         if (ret)
1716                                 printk(KERN_ERR
1717                                  "IOMMU: mapping reserved region failed\n");
1718                 }
1719         }
1720
1721         iommu_prepare_gfx_mapping();
1722
1723         iommu_prepare_isa();
1724
1725         /*
1726          * for each drhd
1727          *   enable fault log
1728          *   global invalidate context cache
1729          *   global invalidate iotlb
1730          *   enable translation
1731          */
1732         for_each_drhd_unit(drhd) {
1733                 if (drhd->ignored)
1734                         continue;
1735                 iommu = drhd->iommu;
1736                 sprintf(iommu->name, "dmar%d", unit++);
1737
1738                 iommu_flush_write_buffer(iommu);
1739
1740                 ret = dmar_set_interrupt(iommu);
1741                 if (ret)
1742                         goto error;
1743
1744                 iommu_set_root_entry(iommu);
1745
1746                 iommu_flush_context_global(iommu, 0);
1747                 iommu_flush_iotlb_global(iommu, 0);
1748
1749                 iommu_disable_protect_mem_regions(iommu);
1750
1751                 ret = iommu_enable_translation(iommu);
1752                 if (ret)
1753                         goto error;
1754         }
1755
1756         return 0;
1757 error:
1758         for_each_drhd_unit(drhd) {
1759                 if (drhd->ignored)
1760                         continue;
1761                 iommu = drhd->iommu;
1762                 free_iommu(iommu);
1763         }
1764         return ret;
1765 }
1766
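/* Bytes needed to map host_addr..host_addr+size in whole 4K pages */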
1767 static inline u64 aligned_size(u64 host_addr, size_t size)
1768 {
1769         u64 addr;
1770         addr = (host_addr & (~PAGE_MASK_4K)) + size;
1771         return PAGE_ALIGN_4K(addr);
1772 }
1773
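/*
 * Allocate an IOVA range of @size bytes that ends no higher than @end
 * (clamped to the domain's guest address width).
 */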
1774 struct iova *
1775 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1776 {
1777         struct iova *piova;
1778
1779         /* Make sure it's in range */
1780         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1781         if (!size || (IOVA_START_ADDR + size > end))
1782                 return NULL;
1783
1784         piova = alloc_iova(&domain->iovad,
1785                         size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1786         return piova;
1787 }
1788
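/*
 * IOVA allocation policy for a device: devices limited to 32-bit DMA (or
 * all devices when forcedac is set) allocate straight from their DMA
 * mask; everyone else tries below 4GB first and falls back to the full
 * mask.
 */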
1789 static struct iova *
1790 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1791                 size_t size)
1792 {
1793         struct pci_dev *pdev = to_pci_dev(dev);
1794         struct iova *iova = NULL;
1795
1796         if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1797                 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1798         } else  {
1799                 /*
1800                  * First try to allocate an io virtual address in
1801                  * DMA_32BIT_MASK and if that fails then try allocating
1802                  * from higher range
1803                  */
1804                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1805                 if (!iova)
1806                         iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1807         }
1808
1809         if (!iova) {
1810                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
1811                 return NULL;
1812         }
1813
1814         return iova;
1815 }
1816
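/*
 * Find (or create) the domain for @pdev and make sure its context entry
 * is programmed before it is used for DMA mappings.
 */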
1817 static struct dmar_domain *
1818 get_valid_domain_for_dev(struct pci_dev *pdev)
1819 {
1820         struct dmar_domain *domain;
1821         int ret;
1822
1823         domain = get_domain_for_dev(pdev,
1824                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1825         if (!domain) {
1826                 printk(KERN_ERR
1827                         "Allocating domain for %s failed\n", pci_name(pdev));
1828                 return NULL;
1829         }
1830
1831         /* make sure context mapping is ok */
1832         if (unlikely(!domain_context_mapped(domain, pdev))) {
1833                 ret = domain_context_mapping(domain, pdev);
1834                 if (ret) {
1835                         printk(KERN_ERR
1836                                 "Domain context map for %s failed\n",
1837                                 pci_name(pdev));
1838                         return NULL;
1839                 }
1840         }
1841
1842         return domain;
1843 }
1844
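/*
 * Map a single buffer for DMA: allocate an IOVA below the device's DMA
 * mask, build the page-table entries with the appropriate read/write
 * bits, flush the IOTLB (or write buffer) and return the bus address.
 * Returns 0 on failure.
 */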
1845 static dma_addr_t intel_map_single(struct device *hwdev, void *addr,
1846         size_t size, int dir)
1847 {
1848         struct pci_dev *pdev = to_pci_dev(hwdev);
1849         int ret;
1850         struct dmar_domain *domain;
1851         unsigned long start_addr;
1852         struct iova *iova;
1853         int prot = 0;
1854
1855         BUG_ON(dir == DMA_NONE);
1856         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1857                 return virt_to_bus(addr);
1858
1859         domain = get_valid_domain_for_dev(pdev);
1860         if (!domain)
1861                 return 0;
1862
1863         addr = (void *)virt_to_phys(addr);
1864         size = aligned_size((u64)addr, size);
1865
1866         iova = __intel_alloc_iova(hwdev, domain, size);
1867         if (!iova)
1868                 goto error;
1869
1870         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1871
1872         /*
1873          * Check if the DMAR hardware supports zero-length reads on
1874          * write-only mappings.
1875          */
1876         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
1877                         !cap_zlr(domain->iommu->cap))
1878                 prot |= DMA_PTE_READ;
1879         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1880                 prot |= DMA_PTE_WRITE;
1881         /*
1882          * addr to (addr + size) might span a partial page, so map the whole
1883          * page.  Note: if two parts of one page are mapped separately, we
1884          * might end up with two guest addresses mapping to the same host
1885          * address, but this is not a big problem.
1886          */
1887         ret = domain_page_mapping(domain, start_addr,
1888                 ((u64)addr) & PAGE_MASK_4K, size, prot);
1889         if (ret)
1890                 goto error;
1891
1892         pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1893                 pci_name(pdev), size, (u64)addr,
1894                 size, (u64)start_addr, dir);
1895
1896         /* it's a non-present to present mapping */
1897         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1898                         start_addr, size >> PAGE_SHIFT_4K, 1);
1899         if (ret)
1900                 iommu_flush_write_buffer(domain->iommu);
1901
1902         return (start_addr + ((u64)addr & (~PAGE_MASK_4K)));
1903
1904 error:
1905         if (iova)
1906                 __free_iova(&domain->iovad, iova);
1907         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
1908                 pci_name(pdev), size, (u64)addr, dir);
1909         return 0;
1910 }
1911
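/*
 * Undo intel_map_single(): clear the PTEs, free the page-table pages,
 * flush the IOTLB and release the IOVA range.
 */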
1912 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1913         size_t size, int dir)
1914 {
1915         struct pci_dev *pdev = to_pci_dev(dev);
1916         struct dmar_domain *domain;
1917         unsigned long start_addr;
1918         struct iova *iova;
1919
1920         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1921                 return;
1922         domain = find_domain(pdev);
1923         BUG_ON(!domain);
1924
1925         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1926         if (!iova)
1927                 return;
1928
1929         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1930         size = aligned_size((u64)dev_addr, size);
1931
1932         pr_debug("Device %s unmapping: %lx@%llx\n",
1933                 pci_name(pdev), size, (u64)start_addr);
1934
1935         /*  clear the whole page */
1936         dma_pte_clear_range(domain, start_addr, start_addr + size);
1937         /* free page tables */
1938         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1939
1940         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
1941                         size >> PAGE_SHIFT_4K, 0))
1942                 iommu_flush_write_buffer(domain->iommu);
1943
1944         /* free iova */
1945         __free_iova(&domain->iovad, iova);
1946 }
1947
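/*
 * Allocate zeroed pages for a coherent buffer and map them
 * bidirectionally through intel_map_single().
 */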
1948 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
1949                        dma_addr_t *dma_handle, gfp_t flags)
1950 {
1951         void *vaddr;
1952         int order;
1953
1954         size = PAGE_ALIGN_4K(size);
1955         order = get_order(size);
1956         flags &= ~(GFP_DMA | GFP_DMA32);
1957
1958         vaddr = (void *)__get_free_pages(flags, order);
1959         if (!vaddr)
1960                 return NULL;
1961         memset(vaddr, 0, size);
1962
1963         *dma_handle = intel_map_single(hwdev, vaddr, size, DMA_BIDIRECTIONAL);
1964         if (*dma_handle)
1965                 return vaddr;
1966         free_pages((unsigned long)vaddr, order);
1967         return NULL;
1968 }
1969
1970 static void intel_free_coherent(struct device *hwdev, size_t size,
1971         void *vaddr, dma_addr_t dma_handle)
1972 {
1973         int order;
1974
1975         size = PAGE_ALIGN_4K(size);
1976         order = get_order(size);
1977
1978         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
1979         free_pages((unsigned long)vaddr, order);
1980 }
1981
1982 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
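/*
 * Undo intel_map_sg(): recompute the total aligned size of the list,
 * clear and free the page tables for the whole region, flush the IOTLB
 * and release the IOVA range.
 */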
1983 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
1984         int nelems, int dir)
1985 {
1986         int i;
1987         struct pci_dev *pdev = to_pci_dev(hwdev);
1988         struct dmar_domain *domain;
1989         unsigned long start_addr;
1990         struct iova *iova;
1991         size_t size = 0;
1992         void *addr;
1993         struct scatterlist *sg;
1994
1995         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1996                 return;
1997
1998         domain = find_domain(pdev);
1999
2000         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2001         if (!iova)
2002                 return;
2003         for_each_sg(sglist, sg, nelems, i) {
2004                 addr = SG_ENT_VIRT_ADDRESS(sg);
2005                 size += aligned_size((u64)addr, sg->length);
2006         }
2007
2008         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2009
2010         /*  clear the whole page */
2011         dma_pte_clear_range(domain, start_addr, start_addr + size);
2012         /* free page tables */
2013         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2014
2015         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2016                         size >> PAGE_SHIFT_4K, 0))
2017                 iommu_flush_write_buffer(domain->iommu);
2018
2019         /* free iova */
2020         __free_iova(&domain->iovad, iova);
2021 }
2022
2023 static int intel_nontranslate_map_sg(struct device *hwdev,
2024         struct scatterlist *sglist, int nelems, int dir)
2025 {
2026         int i;
2027         struct scatterlist *sg;
2028
2029         for_each_sg(sglist, sg, nelems, i) {
2030                 BUG_ON(!sg_page(sg));
2031                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2032                 sg->dma_length = sg->length;
2033         }
2034         return nelems;
2035 }
2036
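/*
 * Map a scatterlist into one contiguous IOVA range, placing the entries
 * back to back; on any mapping failure the partial mapping is torn down
 * and 0 is returned.
 */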
2037 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2038                                 int nelems, int dir)
2039 {
2040         void *addr;
2041         int i;
2042         struct pci_dev *pdev = to_pci_dev(hwdev);
2043         struct dmar_domain *domain;
2044         size_t size = 0;
2045         int prot = 0;
2046         size_t offset = 0;
2047         struct iova *iova = NULL;
2048         int ret;
2049         struct scatterlist *sg;
2050         unsigned long start_addr;
2051
2052         BUG_ON(dir == DMA_NONE);
2053         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2054                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2055
2056         domain = get_valid_domain_for_dev(pdev);
2057         if (!domain)
2058                 return 0;
2059
2060         for_each_sg(sglist, sg, nelems, i) {
2061                 addr = SG_ENT_VIRT_ADDRESS(sg);
2062                 addr = (void *)virt_to_phys(addr);
2063                 size += aligned_size((u64)addr, sg->length);
2064         }
2065
2066         iova = __intel_alloc_iova(hwdev, domain, size);
2067         if (!iova) {
2068                 sglist->dma_length = 0;
2069                 return 0;
2070         }
2071
2072         /*
2073          * Check if the DMAR hardware supports zero-length reads on
2074          * write-only mappings.
2075          */
2076         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2077                         !cap_zlr(domain->iommu->cap))
2078                 prot |= DMA_PTE_READ;
2079         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2080                 prot |= DMA_PTE_WRITE;
2081
2082         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2083         offset = 0;
2084         for_each_sg(sglist, sg, nelems, i) {
2085                 addr = SG_ENT_VIRT_ADDRESS(sg);
2086                 addr = (void *)virt_to_phys(addr);
2087                 size = aligned_size((u64)addr, sg->length);
2088                 ret = domain_page_mapping(domain, start_addr + offset,
2089                         ((u64)addr) & PAGE_MASK_4K,
2090                         size, prot);
2091                 if (ret) {
2092                         /*  clear the page */
2093                         dma_pte_clear_range(domain, start_addr,
2094                                   start_addr + offset);
2095                         /* free page tables */
2096                         dma_pte_free_pagetable(domain, start_addr,
2097                                   start_addr + offset);
2098                         /* free iova */
2099                         __free_iova(&domain->iovad, iova);
2100                         return 0;
2101                 }
2102                 sg->dma_address = start_addr + offset +
2103                                 ((u64)addr & (~PAGE_MASK_4K));
2104                 sg->dma_length = sg->length;
2105                 offset += size;
2106         }
2107
2108         /* it's a non-present to present mapping */
2109         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2110                         start_addr, offset >> PAGE_SHIFT_4K, 1))
2111                 iommu_flush_write_buffer(domain->iommu);
2112         return nelems;
2113 }
2114
2115 static struct dma_mapping_ops intel_dma_ops = {
2116         .alloc_coherent = intel_alloc_coherent,
2117         .free_coherent = intel_free_coherent,
2118         .map_single = intel_map_single,
2119         .unmap_single = intel_unmap_single,
2120         .map_sg = intel_map_sg,
2121         .unmap_sg = intel_unmap_sg,
2122 };
2123
2124 static inline int iommu_domain_cache_init(void)
2125 {
2126         int ret = 0;
2127
2128         iommu_domain_cache = kmem_cache_create("iommu_domain",
2129                                          sizeof(struct dmar_domain),
2130                                          0,
2131                                          SLAB_HWCACHE_ALIGN,
2133                                          NULL);
2134         if (!iommu_domain_cache) {
2135                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2136                 ret = -ENOMEM;
2137         }
2138
2139         return ret;
2140 }
2141
2142 static inline int iommu_devinfo_cache_init(void)
2143 {
2144         int ret = 0;
2145
2146         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2147                                          sizeof(struct device_domain_info),
2148                                          0,
2149                                          SLAB_HWCACHE_ALIGN,
2151                                          NULL);
2152         if (!iommu_devinfo_cache) {
2153                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2154                 ret = -ENOMEM;
2155         }
2156
2157         return ret;
2158 }
2159
2160 static inline int iommu_iova_cache_init(void)
2161 {
2162         int ret = 0;
2163
2164         iommu_iova_cache = kmem_cache_create("iommu_iova",
2165                                          sizeof(struct iova),
2166                                          0,
2167                                          SLAB_HWCACHE_ALIGN,
2169                                          NULL);
2170         if (!iommu_iova_cache) {
2171                 printk(KERN_ERR "Couldn't create iova cache\n");
2172                 ret = -ENOMEM;
2173         }
2174
2175         return ret;
2176 }
2177
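/*
 * Create the slab caches used for iova, dmar_domain and
 * device_domain_info allocations; tear down on partial failure.
 */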
2178 static int __init iommu_init_mempool(void)
2179 {
2180         int ret;
2181         ret = iommu_iova_cache_init();
2182         if (ret)
2183                 return ret;
2184
2185         ret = iommu_domain_cache_init();
2186         if (ret)
2187                 goto domain_error;
2188
2189         ret = iommu_devinfo_cache_init();
2190         if (!ret)
2191                 return ret;
2192
2193         kmem_cache_destroy(iommu_domain_cache);
2194 domain_error:
2195         kmem_cache_destroy(iommu_iova_cache);
2196
2197         return -ENOMEM;
2198 }
2199
2200 static void __init iommu_exit_mempool(void)
2201 {
2202         kmem_cache_destroy(iommu_devinfo_cache);
2203         kmem_cache_destroy(iommu_domain_cache);
2204         kmem_cache_destroy(iommu_iova_cache);
2206 }
2207
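/*
 * Early detection: do nothing if swiotlb, no_iommu, an earlier detection
 * or intel_iommu=off already decided things; otherwise set iommu_detected
 * when early_dmar_detect() finds a DMAR table.
 */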
2208 void __init detect_intel_iommu(void)
2209 {
2210         if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2211                 return;
2212         if (early_dmar_detect()) {
2213                 iommu_detected = 1;
2214         }
2215 }
2216
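/*
 * Mark DRHD units that cover no PCI devices as ignored; when graphics
 * mapping is disabled, also ignore units that serve only graphics
 * devices and flag those devices to bypass translation.
 */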
2217 static void __init init_no_remapping_devices(void)
2218 {
2219         struct dmar_drhd_unit *drhd;
2220
2221         for_each_drhd_unit(drhd) {
2222                 if (!drhd->include_all) {
2223                         int i;
2224                         for (i = 0; i < drhd->devices_cnt; i++)
2225                                 if (drhd->devices[i] != NULL)
2226                                         break;
2227                         /* ignore DMAR unit if no pci devices exist */
2228                         if (i == drhd->devices_cnt)
2229                                 drhd->ignored = 1;
2230                 }
2231         }
2232
2233         if (dmar_map_gfx)
2234                 return;
2235
2236         for_each_drhd_unit(drhd) {
2237                 int i;
2238                 if (drhd->ignored || drhd->include_all)
2239                         continue;
2240
2241                 for (i = 0; i < drhd->devices_cnt; i++)
2242                         if (drhd->devices[i] &&
2243                                 !IS_GFX_DEVICE(drhd->devices[i]))
2244                                 break;
2245
2246                 if (i < drhd->devices_cnt)
2247                         continue;
2248
2249                 /* bypass IOMMU if it is just for gfx devices */
2250                 drhd->ignored = 1;
2251                 for (i = 0; i < drhd->devices_cnt; i++) {
2252                         if (!drhd->devices[i])
2253                                 continue;
2254                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2255                 }
2256         }
2257 }
2258
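/*
 * Main entry point: parse the DMAR table, set up the mempools and
 * reserved IOVA ranges, initialize all DMAR units and switch the DMA
 * API over to intel_dma_ops.
 */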
2259 int __init intel_iommu_init(void)
2260 {
2261         int ret = 0;
2262
2263         if (no_iommu || swiotlb || dmar_disabled)
2264                 return -ENODEV;
2265
2266         if (dmar_table_init())
2267                 return -ENODEV;
2268
2269         iommu_init_mempool();
2270         dmar_init_reserved_ranges();
2271
2272         init_no_remapping_devices();
2273
2274         ret = init_dmars();
2275         if (ret) {
2276                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2277                 put_iova_domain(&reserved_iova_list);
2278                 iommu_exit_mempool();
2279                 return ret;
2280         }
2281         printk(KERN_INFO
2282                 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2283
2284         force_iommu = 1;
2285         dma_ops = &intel_dma_ops;
2286         return 0;
2287 }
2288