KVM: CodingStyle cleanup
drivers/kvm/mmu.c (cascardo/linux.git)
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * MMU support
8  *
9  * Copyright (C) 2006 Qumranet, Inc.
10  *
11  * Authors:
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *   Avi Kivity   <avi@qumranet.com>
14  *
15  * This work is licensed under the terms of the GNU GPL, version 2.  See
16  * the COPYING file in the top-level directory.
17  *
18  */
19
20 #include "vmx.h"
21 #include "kvm.h"
22
23 #include <linux/types.h>
24 #include <linux/string.h>
25 #include <linux/mm.h>
26 #include <linux/highmem.h>
27 #include <linux/module.h>
28
29 #include <asm/page.h>
30 #include <asm/cmpxchg.h>
31
32 #undef MMU_DEBUG
33
34 #undef AUDIT
35
36 #ifdef AUDIT
37 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
38 #else
39 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
40 #endif
41
42 #ifdef MMU_DEBUG
43
44 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
45 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
46
47 #else
48
49 #define pgprintk(x...) do { } while (0)
50 #define rmap_printk(x...) do { } while (0)
51
52 #endif
53
54 #if defined(MMU_DEBUG) || defined(AUDIT)
55 static int dbg = 1;
56 #endif
57
58 #ifndef MMU_DEBUG
59 #define ASSERT(x) do { } while (0)
60 #else
61 #define ASSERT(x)                                                       \
62         if (!(x)) {                                                     \
63                 printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
64                        __FILE__, __LINE__, #x);                         \
65         }
66 #endif
67
68 #define PT64_PT_BITS 9
69 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
70 #define PT32_PT_BITS 10
71 #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
72
73 #define PT_WRITABLE_SHIFT 1
74
75 #define PT_PRESENT_MASK (1ULL << 0)
76 #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
77 #define PT_USER_MASK (1ULL << 2)
78 #define PT_PWT_MASK (1ULL << 3)
79 #define PT_PCD_MASK (1ULL << 4)
80 #define PT_ACCESSED_MASK (1ULL << 5)
81 #define PT_DIRTY_MASK (1ULL << 6)
82 #define PT_PAGE_SIZE_MASK (1ULL << 7)
83 #define PT_PAT_MASK (1ULL << 7)
84 #define PT_GLOBAL_MASK (1ULL << 8)
85 #define PT64_NX_MASK (1ULL << 63)
86
87 #define PT_PAT_SHIFT 7
88 #define PT_DIR_PAT_SHIFT 12
89 #define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
90
91 #define PT32_DIR_PSE36_SIZE 4
92 #define PT32_DIR_PSE36_SHIFT 13
93 #define PT32_DIR_PSE36_MASK \
94         (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
95
96
97 #define PT_FIRST_AVAIL_BITS_SHIFT 9
98 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
99
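/*
 * Bit 9 is the first pte bit the hardware ignores and leaves to software.
 * The shadow MMU uses it to mark sptes that map memory-mapped I/O
 * (see is_io_pte()).
 */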
100 #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
101
102 #define VALID_PAGE(x) ((x) != INVALID_PAGE)
103
104 #define PT64_LEVEL_BITS 9
105
106 #define PT64_LEVEL_SHIFT(level) \
107                 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
108
109 #define PT64_LEVEL_MASK(level) \
110                 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
111
112 #define PT64_INDEX(address, level)\
113         (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
114
115
116 #define PT32_LEVEL_BITS 10
117
118 #define PT32_LEVEL_SHIFT(level) \
119                 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
120
121 #define PT32_LEVEL_MASK(level) \
122                 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
123
124 #define PT32_INDEX(address, level)\
125         (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
126
127
128 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
129 #define PT64_DIR_BASE_ADDR_MASK \
130         (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
131
132 #define PT32_BASE_ADDR_MASK PAGE_MASK
133 #define PT32_DIR_BASE_ADDR_MASK \
134         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
135
136
137 #define PFERR_PRESENT_MASK (1U << 0)
138 #define PFERR_WRITE_MASK (1U << 1)
139 #define PFERR_USER_MASK (1U << 2)
140 #define PFERR_FETCH_MASK (1U << 4)
141
142 #define PT64_ROOT_LEVEL 4
143 #define PT32_ROOT_LEVEL 2
144 #define PT32E_ROOT_LEVEL 3
145
146 #define PT_DIRECTORY_LEVEL 2
147 #define PT_PAGE_TABLE_LEVEL 1
148
149 #define RMAP_EXT 4
150
151 struct kvm_rmap_desc {
152         u64 *shadow_ptes[RMAP_EXT];
153         struct kvm_rmap_desc *more;
154 };
155
156 static struct kmem_cache *pte_chain_cache;
157 static struct kmem_cache *rmap_desc_cache;
158 static struct kmem_cache *mmu_page_header_cache;
159
160 static u64 __read_mostly shadow_trap_nonpresent_pte;
161 static u64 __read_mostly shadow_notrap_nonpresent_pte;
162
163 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
164 {
165         shadow_trap_nonpresent_pte = trap_pte;
166         shadow_notrap_nonpresent_pte = notrap_pte;
167 }
168 EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
169
170 static int is_write_protection(struct kvm_vcpu *vcpu)
171 {
172         return vcpu->cr0 & X86_CR0_WP;
173 }
174
175 static int is_cpuid_PSE36(void)
176 {
177         return 1;
178 }
179
180 static int is_nx(struct kvm_vcpu *vcpu)
181 {
182         return vcpu->shadow_efer & EFER_NX;
183 }
184
185 static int is_present_pte(unsigned long pte)
186 {
187         return pte & PT_PRESENT_MASK;
188 }
189
190 static int is_shadow_present_pte(u64 pte)
191 {
192         pte &= ~PT_SHADOW_IO_MARK;
193         return pte != shadow_trap_nonpresent_pte
194                 && pte != shadow_notrap_nonpresent_pte;
195 }
196
197 static int is_writeble_pte(unsigned long pte)
198 {
199         return pte & PT_WRITABLE_MASK;
200 }
201
202 static int is_io_pte(unsigned long pte)
203 {
204         return pte & PT_SHADOW_IO_MARK;
205 }
206
207 static int is_rmap_pte(u64 pte)
208 {
209         return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
210                 == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
211 }
212
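/*
 * Sptes are 64 bits wide even on 32-bit PAE hosts, so they are written
 * with set_64bit(), which stores all 64 bits atomically and keeps the
 * hardware walker from ever seeing a half-updated spte.  Only the cast
 * differs between the 64-bit and 32-bit configurations.
 */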
213 static void set_shadow_pte(u64 *sptep, u64 spte)
214 {
215 #ifdef CONFIG_X86_64
216         set_64bit((unsigned long *)sptep, spte);
217 #else
218         set_64bit((unsigned long long *)sptep, spte);
219 #endif
220 }
221
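/*
 * Shadow page table updates may need several small objects (pte chains,
 * rmap descriptors, shadow page headers and the shadow pages themselves)
 * while a fault is being handled.  Each vcpu therefore keeps per-type
 * caches that are filled ahead of time by mmu_topup_memory_caches(), so
 * that the update paths can grab objects with mmu_memory_cache_alloc()
 * without having to cope with allocation failure halfway through.
 */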
222 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
223                                   struct kmem_cache *base_cache, int min)
224 {
225         void *obj;
226
227         if (cache->nobjs >= min)
228                 return 0;
229         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
230                 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
231                 if (!obj)
232                         return -ENOMEM;
233                 cache->objects[cache->nobjs++] = obj;
234         }
235         return 0;
236 }
237
238 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
239 {
240         while (mc->nobjs)
241                 kfree(mc->objects[--mc->nobjs]);
242 }
243
244 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
245                                        int min)
246 {
247         struct page *page;
248
249         if (cache->nobjs >= min)
250                 return 0;
251         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
252                 page = alloc_page(GFP_KERNEL);
253                 if (!page)
254                         return -ENOMEM;
255                 set_page_private(page, 0);
256                 cache->objects[cache->nobjs++] = page_address(page);
257         }
258         return 0;
259 }
260
261 static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
262 {
263         while (mc->nobjs)
264                 free_page((unsigned long)mc->objects[--mc->nobjs]);
265 }
266
267 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
268 {
269         int r;
270
271         kvm_mmu_free_some_pages(vcpu);
272         r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
273                                    pte_chain_cache, 4);
274         if (r)
275                 goto out;
276         r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
277                                    rmap_desc_cache, 1);
278         if (r)
279                 goto out;
280         r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 8);
281         if (r)
282                 goto out;
283         r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
284                                    mmu_page_header_cache, 4);
285 out:
286         return r;
287 }
288
289 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
290 {
291         mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
292         mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
293         mmu_free_memory_cache_page(&vcpu->mmu_page_cache);
294         mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
295 }
296
297 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
298                                     size_t size)
299 {
300         void *p;
301
302         BUG_ON(!mc->nobjs);
303         p = mc->objects[--mc->nobjs];
304         memset(p, 0, size);
305         return p;
306 }
307
308 static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
309 {
310         return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
311                                       sizeof(struct kvm_pte_chain));
312 }
313
314 static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
315 {
316         kfree(pc);
317 }
318
319 static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
320 {
321         return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
322                                       sizeof(struct kvm_rmap_desc));
323 }
324
325 static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
326 {
327         kfree(rd);
328 }
329
330 /*
331  * Take gfn and return the reverse mapping to it.
332  * Note: gfn must be unaliased before this function gets called.
333  */
334
335 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
336 {
337         struct kvm_memory_slot *slot;
338
339         slot = gfn_to_memslot(kvm, gfn);
340         return &slot->rmap[gfn - slot->base_gfn];
341 }
342
343 /*
344  * Reverse mapping data structures:
345  *
346  * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
347  * that points to page_address(page).
348  *
349  * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
350  * containing more mappings.
351  */
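/*
 * For example, with a single writable mapping whose spte lives at the
 * (made-up) address 0xffff810012345678, the rmap entry is simply
 *
 *	*rmapp == 0xffff810012345678
 *
 * Sptes are naturally aligned, so bit zero of the pointer is always clear.
 * Once a second mapping is added, the entry becomes
 *
 *	*rmapp == (unsigned long)desc | 1
 *
 * where desc holds up to RMAP_EXT sptes and chains further descriptors
 * through desc->more.
 */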
352 static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
353 {
354         struct kvm_mmu_page *page;
355         struct kvm_rmap_desc *desc;
356         unsigned long *rmapp;
357         int i;
358
359         if (!is_rmap_pte(*spte))
360                 return;
361         gfn = unalias_gfn(vcpu->kvm, gfn);
362         page = page_header(__pa(spte));
363         page->gfns[spte - page->spt] = gfn;
364         rmapp = gfn_to_rmap(vcpu->kvm, gfn);
365         if (!*rmapp) {
366                 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
367                 *rmapp = (unsigned long)spte;
368         } else if (!(*rmapp & 1)) {
369                 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
370                 desc = mmu_alloc_rmap_desc(vcpu);
371                 desc->shadow_ptes[0] = (u64 *)*rmapp;
372                 desc->shadow_ptes[1] = spte;
373                 *rmapp = (unsigned long)desc | 1;
374         } else {
375                 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
376                 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
377                 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
378                         desc = desc->more;
379                 if (desc->shadow_ptes[RMAP_EXT-1]) {
380                         desc->more = mmu_alloc_rmap_desc(vcpu);
381                         desc = desc->more;
382                 }
383                 for (i = 0; desc->shadow_ptes[i]; ++i)
384                         ;
385                 desc->shadow_ptes[i] = spte;
386         }
387 }
388
389 static void rmap_desc_remove_entry(unsigned long *rmapp,
390                                    struct kvm_rmap_desc *desc,
391                                    int i,
392                                    struct kvm_rmap_desc *prev_desc)
393 {
394         int j;
395
396         for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
397                 ;
398         desc->shadow_ptes[i] = desc->shadow_ptes[j];
399         desc->shadow_ptes[j] = NULL;
400         if (j != 0)
401                 return;
402         if (!prev_desc && !desc->more)
403                 *rmapp = (unsigned long)desc->shadow_ptes[0];
404         else
405                 if (prev_desc)
406                         prev_desc->more = desc->more;
407                 else
408                         *rmapp = (unsigned long)desc->more | 1;
409         mmu_free_rmap_desc(desc);
410 }
411
412 static void rmap_remove(struct kvm *kvm, u64 *spte)
413 {
414         struct kvm_rmap_desc *desc;
415         struct kvm_rmap_desc *prev_desc;
416         struct kvm_mmu_page *page;
417         unsigned long *rmapp;
418         int i;
419
420         if (!is_rmap_pte(*spte))
421                 return;
422         page = page_header(__pa(spte));
423         rmapp = gfn_to_rmap(kvm, page->gfns[spte - page->spt]);
424         if (!*rmapp) {
425                 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
426                 BUG();
427         } else if (!(*rmapp & 1)) {
428                 rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
429                 if ((u64 *)*rmapp != spte) {
430                         printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
431                                spte, *spte);
432                         BUG();
433                 }
434                 *rmapp = 0;
435         } else {
436                 rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
437                 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
438                 prev_desc = NULL;
439                 while (desc) {
440                         for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
441                                 if (desc->shadow_ptes[i] == spte) {
442                                         rmap_desc_remove_entry(rmapp,
443                                                                desc, i,
444                                                                prev_desc);
445                                         return;
446                                 }
447                         prev_desc = desc;
448                         desc = desc->more;
449                 }
450                 BUG();
451         }
452 }
453
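/*
 * Remove write access from every spte that maps @gfn.  Only writable sptes
 * are kept in the rmap (see is_rmap_pte()), so the chain is simply drained:
 * each spte is dropped from the rmap, rewritten without PT_WRITABLE_MASK,
 * and the remote tlbs are flushed so no vcpu keeps a stale writable
 * translation.
 */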
454 static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
455 {
456         struct kvm_rmap_desc *desc;
457         unsigned long *rmapp;
458         u64 *spte;
459
460         gfn = unalias_gfn(vcpu->kvm, gfn);
461         rmapp = gfn_to_rmap(vcpu->kvm, gfn);
462
463         while (*rmapp) {
464                 if (!(*rmapp & 1))
465                         spte = (u64 *)*rmapp;
466                 else {
467                         desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
468                         spte = desc->shadow_ptes[0];
469                 }
470                 BUG_ON(!spte);
471                 BUG_ON(!(*spte & PT_PRESENT_MASK));
472                 BUG_ON(!(*spte & PT_WRITABLE_MASK));
473                 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
474                 rmap_remove(vcpu->kvm, spte);
475                 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
476                 kvm_flush_remote_tlbs(vcpu->kvm);
477         }
478 }
479
480 #ifdef MMU_DEBUG
481 static int is_empty_shadow_page(u64 *spt)
482 {
483         u64 *pos;
484         u64 *end;
485
486         for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
487                 if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
488                         printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
489                                pos, *pos);
490                         return 0;
491                 }
492         return 1;
493 }
494 #endif
495
496 static void kvm_mmu_free_page(struct kvm *kvm,
497                               struct kvm_mmu_page *page_head)
498 {
499         ASSERT(is_empty_shadow_page(page_head->spt));
500         list_del(&page_head->link);
501         __free_page(virt_to_page(page_head->spt));
502         __free_page(virt_to_page(page_head->gfns));
503         kfree(page_head);
504         ++kvm->n_free_mmu_pages;
505 }
506
507 static unsigned kvm_page_table_hashfn(gfn_t gfn)
508 {
509         return gfn;
510 }
511
512 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
513                                                u64 *parent_pte)
514 {
515         struct kvm_mmu_page *page;
516
517         if (!vcpu->kvm->n_free_mmu_pages)
518                 return NULL;
519
520         page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
521                                       sizeof *page);
522         page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
523         page->gfns = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
524         set_page_private(virt_to_page(page->spt), (unsigned long)page);
525         list_add(&page->link, &vcpu->kvm->active_mmu_pages);
526         ASSERT(is_empty_shadow_page(page->spt));
527         page->slot_bitmap = 0;
528         page->multimapped = 0;
529         page->parent_pte = parent_pte;
530         --vcpu->kvm->n_free_mmu_pages;
531         return page;
532 }
533
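/*
 * A shadow page may be pointed to by more than one parent spte, e.g. when
 * the guest maps the same page table at several addresses.  The common
 * single-parent case is kept inline in page->parent_pte; on the second
 * parent the page switches to multimapped mode and the parents are tracked
 * in a list of kvm_pte_chain blocks instead.
 */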
534 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
535                                     struct kvm_mmu_page *page, u64 *parent_pte)
536 {
537         struct kvm_pte_chain *pte_chain;
538         struct hlist_node *node;
539         int i;
540
541         if (!parent_pte)
542                 return;
543         if (!page->multimapped) {
544                 u64 *old = page->parent_pte;
545
546                 if (!old) {
547                         page->parent_pte = parent_pte;
548                         return;
549                 }
550                 page->multimapped = 1;
551                 pte_chain = mmu_alloc_pte_chain(vcpu);
552                 INIT_HLIST_HEAD(&page->parent_ptes);
553                 hlist_add_head(&pte_chain->link, &page->parent_ptes);
554                 pte_chain->parent_ptes[0] = old;
555         }
556         hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
557                 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
558                         continue;
559                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
560                         if (!pte_chain->parent_ptes[i]) {
561                                 pte_chain->parent_ptes[i] = parent_pte;
562                                 return;
563                         }
564         }
565         pte_chain = mmu_alloc_pte_chain(vcpu);
566         BUG_ON(!pte_chain);
567         hlist_add_head(&pte_chain->link, &page->parent_ptes);
568         pte_chain->parent_ptes[0] = parent_pte;
569 }
570
571 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
572                                        u64 *parent_pte)
573 {
574         struct kvm_pte_chain *pte_chain;
575         struct hlist_node *node;
576         int i;
577
578         if (!page->multimapped) {
579                 BUG_ON(page->parent_pte != parent_pte);
580                 page->parent_pte = NULL;
581                 return;
582         }
583         hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
584                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
585                         if (!pte_chain->parent_ptes[i])
586                                 break;
587                         if (pte_chain->parent_ptes[i] != parent_pte)
588                                 continue;
589                         while (i + 1 < NR_PTE_CHAIN_ENTRIES
590                                 && pte_chain->parent_ptes[i + 1]) {
591                                 pte_chain->parent_ptes[i]
592                                         = pte_chain->parent_ptes[i + 1];
593                                 ++i;
594                         }
595                         pte_chain->parent_ptes[i] = NULL;
596                         if (i == 0) {
597                                 hlist_del(&pte_chain->link);
598                                 mmu_free_pte_chain(pte_chain);
599                                 if (hlist_empty(&page->parent_ptes)) {
600                                         page->multimapped = 0;
601                                         page->parent_pte = NULL;
602                                 }
603                         }
604                         return;
605                 }
606         BUG();
607 }
608
609 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
610                                                 gfn_t gfn)
611 {
612         unsigned index;
613         struct hlist_head *bucket;
614         struct kvm_mmu_page *page;
615         struct hlist_node *node;
616
617         pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
618         index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
619         bucket = &vcpu->kvm->mmu_page_hash[index];
620         hlist_for_each_entry(page, node, bucket, hash_link)
621                 if (page->gfn == gfn && !page->role.metaphysical) {
622                         pgprintk("%s: found role %x\n",
623                                  __FUNCTION__, page->role.word);
624                         return page;
625                 }
626         return NULL;
627 }
628
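/*
 * Look up, or create, the shadow page for @gfn with the given role.  The
 * role encodes everything that makes two shadowings of the same gfn
 * distinct: guest paging level, shadow level, metaphysical (no guest page
 * table behind it), hugepage access rights and, for 32-bit guests, the
 * quadrant.  A 32-bit guest page table holds 1024 entries while a shadow
 * page holds 512, so one guest page is shadowed by several shadow pages
 * and the quadrant selects which part this one covers.
 */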
629 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
630                                              gfn_t gfn,
631                                              gva_t gaddr,
632                                              unsigned level,
633                                              int metaphysical,
634                                              unsigned hugepage_access,
635                                              u64 *parent_pte)
636 {
637         union kvm_mmu_page_role role;
638         unsigned index;
639         unsigned quadrant;
640         struct hlist_head *bucket;
641         struct kvm_mmu_page *page;
642         struct hlist_node *node;
643
644         role.word = 0;
645         role.glevels = vcpu->mmu.root_level;
646         role.level = level;
647         role.metaphysical = metaphysical;
648         role.hugepage_access = hugepage_access;
649         if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
650                 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
651                 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
652                 role.quadrant = quadrant;
653         }
654         pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
655                  gfn, role.word);
656         index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
657         bucket = &vcpu->kvm->mmu_page_hash[index];
658         hlist_for_each_entry(page, node, bucket, hash_link)
659                 if (page->gfn == gfn && page->role.word == role.word) {
660                         mmu_page_add_parent_pte(vcpu, page, parent_pte);
661                         pgprintk("%s: found\n", __FUNCTION__);
662                         return page;
663                 }
664         page = kvm_mmu_alloc_page(vcpu, parent_pte);
665         if (!page)
666                 return page;
667         pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
668         page->gfn = gfn;
669         page->role = role;
670         hlist_add_head(&page->hash_link, bucket);
671         vcpu->mmu.prefetch_page(vcpu, page);
672         if (!metaphysical)
673                 rmap_write_protect(vcpu, gfn);
674         return page;
675 }
676
677 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
678                                          struct kvm_mmu_page *page)
679 {
680         unsigned i;
681         u64 *pt;
682         u64 ent;
683
684         pt = page->spt;
685
686         if (page->role.level == PT_PAGE_TABLE_LEVEL) {
687                 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
688                         if (is_shadow_present_pte(pt[i]))
689                                 rmap_remove(kvm, &pt[i]);
690                         pt[i] = shadow_trap_nonpresent_pte;
691                 }
692                 kvm_flush_remote_tlbs(kvm);
693                 return;
694         }
695
696         for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
697                 ent = pt[i];
698
699                 pt[i] = shadow_trap_nonpresent_pte;
700                 if (!is_shadow_present_pte(ent))
701                         continue;
702                 ent &= PT64_BASE_ADDR_MASK;
703                 mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
704         }
705         kvm_flush_remote_tlbs(kvm);
706 }
707
708 static void kvm_mmu_put_page(struct kvm_mmu_page *page,
709                              u64 *parent_pte)
710 {
711         mmu_page_remove_parent_pte(page, parent_pte);
712 }
713
714 static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
715 {
716         int i;
717
718         for (i = 0; i < KVM_MAX_VCPUS; ++i)
719                 if (kvm->vcpus[i])
720                         kvm->vcpus[i]->last_pte_updated = NULL;
721 }
722
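/*
 * Tear down a shadow page: detach it from every parent spte, unlink its
 * children (removing rmap entries at the last level), and then either free
 * it or, if it is still in use as a root, move it to the head of the
 * active list so it is reclaimed last.
 */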
723 static void kvm_mmu_zap_page(struct kvm *kvm,
724                              struct kvm_mmu_page *page)
725 {
726         u64 *parent_pte;
727
728         while (page->multimapped || page->parent_pte) {
729                 if (!page->multimapped)
730                         parent_pte = page->parent_pte;
731                 else {
732                         struct kvm_pte_chain *chain;
733
734                         chain = container_of(page->parent_ptes.first,
735                                              struct kvm_pte_chain, link);
736                         parent_pte = chain->parent_ptes[0];
737                 }
738                 BUG_ON(!parent_pte);
739                 kvm_mmu_put_page(page, parent_pte);
740                 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
741         }
742         kvm_mmu_page_unlink_children(kvm, page);
743         if (!page->root_count) {
744                 hlist_del(&page->hash_link);
745                 kvm_mmu_free_page(kvm, page);
746         } else
747                 list_move(&page->link, &kvm->active_mmu_pages);
748         kvm_mmu_reset_last_pte_updated(kvm);
749 }
750
751 /*
752  * Change the number of mmu pages allocated to the vm.
753  * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
754  */
755 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
756 {
757         /*
758          * If we set the number of mmu pages to be smaller than the
759          * number of active pages, we must free some mmu pages before we
760          * change the value.
761          */
762
763         if ((kvm->n_alloc_mmu_pages - kvm->n_free_mmu_pages) >
764             kvm_nr_mmu_pages) {
765                 int n_used_mmu_pages = kvm->n_alloc_mmu_pages
766                                        - kvm->n_free_mmu_pages;
767
768                 while (n_used_mmu_pages > kvm_nr_mmu_pages) {
769                         struct kvm_mmu_page *page;
770
771                         page = container_of(kvm->active_mmu_pages.prev,
772                                             struct kvm_mmu_page, link);
773                         kvm_mmu_zap_page(kvm, page);
774                         n_used_mmu_pages--;
775                 }
776                 kvm->n_free_mmu_pages = 0;
777         } else
779                 kvm->n_free_mmu_pages += kvm_nr_mmu_pages
780                                          - kvm->n_alloc_mmu_pages;
781
782         kvm->n_alloc_mmu_pages = kvm_nr_mmu_pages;
783 }
784
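/*
 * Zap every shadow page that shadows @gfn as a guest page table; once none
 * is left, later faults may map the gfn writable again.  Returns nonzero
 * if at least one page was zapped.
 */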
785 static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
786 {
787         unsigned index;
788         struct hlist_head *bucket;
789         struct kvm_mmu_page *page;
790         struct hlist_node *node, *n;
791         int r;
792
793         pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
794         r = 0;
795         index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
796         bucket = &vcpu->kvm->mmu_page_hash[index];
797         hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
798                 if (page->gfn == gfn && !page->role.metaphysical) {
799                         pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
800                                  page->role.word);
801                         kvm_mmu_zap_page(vcpu->kvm, page);
802                         r = 1;
803                 }
804         return r;
805 }
806
807 static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
808 {
809         struct kvm_mmu_page *page;
810
811         while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
812                 pgprintk("%s: zap %lx %x\n",
813                          __FUNCTION__, gfn, page->role.word);
814                 kvm_mmu_zap_page(vcpu->kvm, page);
815         }
816 }
817
818 static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
819 {
820         int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
821         struct kvm_mmu_page *page_head = page_header(__pa(pte));
822
823         __set_bit(slot, &page_head->slot_bitmap);
824 }
825
826 hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
827 {
828         hpa_t hpa = gpa_to_hpa(vcpu, gpa);
829
830         return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK) : hpa;
831 }
832
833 hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
834 {
835         struct page *page;
836
837         ASSERT((gpa & HPA_ERR_MASK) == 0);
838         page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
839         if (!page)
840                 return gpa | HPA_ERR_MASK;
841         return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
842                 | (gpa & (PAGE_SIZE-1));
843 }
844
845 hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
846 {
847         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
848
849         if (gpa == UNMAPPED_GVA)
850                 return UNMAPPED_GVA;
851         return gpa_to_hpa(vcpu, gpa);
852 }
853
854 struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
855 {
856         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
857
858         if (gpa == UNMAPPED_GVA)
859                 return NULL;
860         return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
861 }
862
863 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
864 {
865 }
866
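/*
 * Install a mapping for a guest running without paging (gva == gpa for the
 * nonpaging mmu).  The walk starts at the shadow root and descends one
 * level per iteration, creating intermediate shadow pages as needed; at
 * the last level the spte is made present, writable and user-accessible
 * and added to the rmap.
 */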
867 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
868 {
869         int level = PT32E_ROOT_LEVEL;
870         hpa_t table_addr = vcpu->mmu.root_hpa;
871
872         for (; ; level--) {
873                 u32 index = PT64_INDEX(v, level);
874                 u64 *table;
875                 u64 pte;
876
877                 ASSERT(VALID_PAGE(table_addr));
878                 table = __va(table_addr);
879
880                 if (level == 1) {
881                         pte = table[index];
882                         if (is_shadow_present_pte(pte) && is_writeble_pte(pte))
883                                 return 0;
884                         mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
885                         page_header_update_slot(vcpu->kvm, table, v);
886                         table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
887                                                                 PT_USER_MASK;
888                         rmap_add(vcpu, &table[index], v >> PAGE_SHIFT);
889                         return 0;
890                 }
891
892                 if (table[index] == shadow_trap_nonpresent_pte) {
893                         struct kvm_mmu_page *new_table;
894                         gfn_t pseudo_gfn;
895
896                         pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
897                                 >> PAGE_SHIFT;
898                         new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
899                                                      v, level - 1,
900                                                      1, 0, &table[index]);
901                         if (!new_table) {
902                                 pgprintk("nonpaging_map: ENOMEM\n");
903                                 return -ENOMEM;
904                         }
905
906                         table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
907                                 | PT_WRITABLE_MASK | PT_USER_MASK;
908                 }
909                 table_addr = table[index] & PT64_BASE_ADDR_MASK;
910         }
911 }
912
913 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
914                                     struct kvm_mmu_page *sp)
915 {
916         int i;
917
918         for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
919                 sp->spt[i] = shadow_trap_nonpresent_pte;
920 }
921
922 static void mmu_free_roots(struct kvm_vcpu *vcpu)
923 {
924         int i;
925         struct kvm_mmu_page *page;
926
927         if (!VALID_PAGE(vcpu->mmu.root_hpa))
928                 return;
929 #ifdef CONFIG_X86_64
930         if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
931                 hpa_t root = vcpu->mmu.root_hpa;
932
933                 page = page_header(root);
934                 --page->root_count;
935                 vcpu->mmu.root_hpa = INVALID_PAGE;
936                 return;
937         }
938 #endif
939         for (i = 0; i < 4; ++i) {
940                 hpa_t root = vcpu->mmu.pae_root[i];
941
942                 if (root) {
943                         root &= PT64_BASE_ADDR_MASK;
944                         page = page_header(root);
945                         --page->root_count;
946                 }
947                 vcpu->mmu.pae_root[i] = INVALID_PAGE;
948         }
949         vcpu->mmu.root_hpa = INVALID_PAGE;
950 }
951
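/*
 * (Re)build the shadow roots.  A 64-bit guest gets a single 4-level root;
 * otherwise four PAE root entries are set up, one per 1GB quadrant of the
 * guest address space, following the guest pdptrs when the guest itself
 * runs in PAE mode.
 */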
952 static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
953 {
954         int i;
955         gfn_t root_gfn;
956         struct kvm_mmu_page *page;
957
958         root_gfn = vcpu->cr3 >> PAGE_SHIFT;
959
960 #ifdef CONFIG_X86_64
961         if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
962                 hpa_t root = vcpu->mmu.root_hpa;
963
964                 ASSERT(!VALID_PAGE(root));
965                 page = kvm_mmu_get_page(vcpu, root_gfn, 0,
966                                         PT64_ROOT_LEVEL, 0, 0, NULL);
967                 root = __pa(page->spt);
968                 ++page->root_count;
969                 vcpu->mmu.root_hpa = root;
970                 return;
971         }
972 #endif
973         for (i = 0; i < 4; ++i) {
974                 hpa_t root = vcpu->mmu.pae_root[i];
975
976                 ASSERT(!VALID_PAGE(root));
977                 if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) {
978                         if (!is_present_pte(vcpu->pdptrs[i])) {
979                                 vcpu->mmu.pae_root[i] = 0;
980                                 continue;
981                         }
982                         root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
983                 } else if (vcpu->mmu.root_level == 0)
984                         root_gfn = 0;
985                 page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
986                                         PT32_ROOT_LEVEL, !is_paging(vcpu),
987                                         0, NULL);
988                 root = __pa(page->spt);
989                 ++page->root_count;
990                 vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
991         }
992         vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
993 }
994
995 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
996 {
997         return vaddr;
998 }
999
1000 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
1001                                u32 error_code)
1002 {
1003         gpa_t addr = gva;
1004         hpa_t paddr;
1005         int r;
1006
1007         r = mmu_topup_memory_caches(vcpu);
1008         if (r)
1009                 return r;
1010
1011         ASSERT(vcpu);
1012         ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
1013
1015         paddr = gpa_to_hpa(vcpu, addr & PT64_BASE_ADDR_MASK);
1016
1017         if (is_error_hpa(paddr))
1018                 return 1;
1019
1020         return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
1021 }
1022
1023 static void nonpaging_free(struct kvm_vcpu *vcpu)
1024 {
1025         mmu_free_roots(vcpu);
1026 }
1027
1028 static int nonpaging_init_context(struct kvm_vcpu *vcpu)
1029 {
1030         struct kvm_mmu *context = &vcpu->mmu;
1031
1032         context->new_cr3 = nonpaging_new_cr3;
1033         context->page_fault = nonpaging_page_fault;
1034         context->gva_to_gpa = nonpaging_gva_to_gpa;
1035         context->free = nonpaging_free;
1036         context->prefetch_page = nonpaging_prefetch_page;
1037         context->root_level = 0;
1038         context->shadow_root_level = PT32E_ROOT_LEVEL;
1039         context->root_hpa = INVALID_PAGE;
1040         return 0;
1041 }
1042
1043 static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
1044 {
1045         ++vcpu->stat.tlb_flush;
1046         kvm_x86_ops->tlb_flush(vcpu);
1047 }
1048
1049 static void paging_new_cr3(struct kvm_vcpu *vcpu)
1050 {
1051         pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
1052         mmu_free_roots(vcpu);
1053 }
1054
1055 static void inject_page_fault(struct kvm_vcpu *vcpu,
1056                               u64 addr,
1057                               u32 err_code)
1058 {
1059         kvm_x86_ops->inject_page_fault(vcpu, addr, err_code);
1060 }
1061
1062 static void paging_free(struct kvm_vcpu *vcpu)
1063 {
1064         nonpaging_free(vcpu);
1065 }
1066
1067 #define PTTYPE 64
1068 #include "paging_tmpl.h"
1069 #undef PTTYPE
1070
1071 #define PTTYPE 32
1072 #include "paging_tmpl.h"
1073 #undef PTTYPE
1074
1075 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1076 {
1077         struct kvm_mmu *context = &vcpu->mmu;
1078
1079         ASSERT(is_pae(vcpu));
1080         context->new_cr3 = paging_new_cr3;
1081         context->page_fault = paging64_page_fault;
1082         context->gva_to_gpa = paging64_gva_to_gpa;
1083         context->prefetch_page = paging64_prefetch_page;
1084         context->free = paging_free;
1085         context->root_level = level;
1086         context->shadow_root_level = level;
1087         context->root_hpa = INVALID_PAGE;
1088         return 0;
1089 }
1090
1091 static int paging64_init_context(struct kvm_vcpu *vcpu)
1092 {
1093         return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1094 }
1095
1096 static int paging32_init_context(struct kvm_vcpu *vcpu)
1097 {
1098         struct kvm_mmu *context = &vcpu->mmu;
1099
1100         context->new_cr3 = paging_new_cr3;
1101         context->page_fault = paging32_page_fault;
1102         context->gva_to_gpa = paging32_gva_to_gpa;
1103         context->free = paging_free;
1104         context->prefetch_page = paging32_prefetch_page;
1105         context->root_level = PT32_ROOT_LEVEL;
1106         context->shadow_root_level = PT32E_ROOT_LEVEL;
1107         context->root_hpa = INVALID_PAGE;
1108         return 0;
1109 }
1110
1111 static int paging32E_init_context(struct kvm_vcpu *vcpu)
1112 {
1113         return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1114 }
1115
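/*
 * Pick the mmu context matching the guest's current paging mode:
 * nonpaging (real mode or paging disabled), 4-level (long mode), PAE, or
 * plain 32-bit paging.
 */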
1116 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1117 {
1118         ASSERT(vcpu);
1119         ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1120
1121         if (!is_paging(vcpu))
1122                 return nonpaging_init_context(vcpu);
1123         else if (is_long_mode(vcpu))
1124                 return paging64_init_context(vcpu);
1125         else if (is_pae(vcpu))
1126                 return paging32E_init_context(vcpu);
1127         else
1128                 return paging32_init_context(vcpu);
1129 }
1130
1131 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1132 {
1133         ASSERT(vcpu);
1134         if (VALID_PAGE(vcpu->mmu.root_hpa)) {
1135                 vcpu->mmu.free(vcpu);
1136                 vcpu->mmu.root_hpa = INVALID_PAGE;
1137         }
1138 }
1139
1140 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1141 {
1142         destroy_kvm_mmu(vcpu);
1143         return init_kvm_mmu(vcpu);
1144 }
1145 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
1146
1147 int kvm_mmu_load(struct kvm_vcpu *vcpu)
1148 {
1149         int r;
1150
1151         mutex_lock(&vcpu->kvm->lock);
1152         r = mmu_topup_memory_caches(vcpu);
1153         if (r)
1154                 goto out;
1155         mmu_alloc_roots(vcpu);
1156         kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
1157         kvm_mmu_flush_tlb(vcpu);
1158 out:
1159         mutex_unlock(&vcpu->kvm->lock);
1160         return r;
1161 }
1162 EXPORT_SYMBOL_GPL(kvm_mmu_load);
1163
1164 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
1165 {
1166         mmu_free_roots(vcpu);
1167 }
1168
1169 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1170                                   struct kvm_mmu_page *page,
1171                                   u64 *spte)
1172 {
1173         u64 pte;
1174         struct kvm_mmu_page *child;
1175
1176         pte = *spte;
1177         if (is_shadow_present_pte(pte)) {
1178                 if (page->role.level == PT_PAGE_TABLE_LEVEL)
1179                         rmap_remove(vcpu->kvm, spte);
1180                 else {
1181                         child = page_header(pte & PT64_BASE_ADDR_MASK);
1182                         mmu_page_remove_parent_pte(child, spte);
1183                 }
1184         }
1185         set_shadow_pte(spte, shadow_trap_nonpresent_pte);
1186         kvm_flush_remote_tlbs(vcpu->kvm);
1187 }
1188
1189 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1190                                   struct kvm_mmu_page *page,
1191                                   u64 *spte,
1192                                   const void *new, int bytes,
1193                                   int offset_in_pte)
1194 {
1195         if (page->role.level != PT_PAGE_TABLE_LEVEL)
1196                 return;
1197
1198         if (page->role.glevels == PT32_ROOT_LEVEL)
1199                 paging32_update_pte(vcpu, page, spte, new, bytes,
1200                                     offset_in_pte);
1201         else
1202                 paging64_update_pte(vcpu, page, spte, new, bytes,
1203                                     offset_in_pte);
1204 }
1205
1206 static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
1207 {
1208         u64 *spte = vcpu->last_pte_updated;
1209
1210         return !!(spte && (*spte & PT_ACCESSED_MASK));
1211 }
1212
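/*
 * Called when the guest writes to a page that is shadowed as a page table.
 * The affected sptes are zapped and, for last-level pages, rebuilt from the
 * new guest pte.  Misaligned writes and write floods instead cause the
 * whole shadow page to be zapped, since the page is probably no longer
 * being used as a page table.
 */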
1213 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1214                        const u8 *new, int bytes)
1215 {
1216         gfn_t gfn = gpa >> PAGE_SHIFT;
1217         struct kvm_mmu_page *page;
1218         struct hlist_node *node, *n;
1219         struct hlist_head *bucket;
1220         unsigned index;
1221         u64 *spte;
1222         unsigned offset = offset_in_page(gpa);
1223         unsigned pte_size;
1224         unsigned page_offset;
1225         unsigned misaligned;
1226         unsigned quadrant;
1227         int level;
1228         int flooded = 0;
1229         int npte;
1230
1231         pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1232         kvm_mmu_audit(vcpu, "pre pte write");
1233         if (gfn == vcpu->last_pt_write_gfn
1234             && !last_updated_pte_accessed(vcpu)) {
1235                 ++vcpu->last_pt_write_count;
1236                 if (vcpu->last_pt_write_count >= 3)
1237                         flooded = 1;
1238         } else {
1239                 vcpu->last_pt_write_gfn = gfn;
1240                 vcpu->last_pt_write_count = 1;
1241                 vcpu->last_pte_updated = NULL;
1242         }
1243         index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1244         bucket = &vcpu->kvm->mmu_page_hash[index];
1245         hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
1246                 if (page->gfn != gfn || page->role.metaphysical)
1247                         continue;
1248                 pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1249                 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1250                 misaligned |= bytes < 4;
1251                 if (misaligned || flooded) {
1252                         /*
1253                          * Misaligned accesses are too much trouble to fix
1254                          * up; also, they usually indicate a page is not used
1255                          * as a page table.
1256                          *
1257                          * If we're seeing too many writes to a page,
1258                          * it may no longer be a page table, or we may be
1259                          * forking, in which case it is better to unmap the
1260                          * page.
1261                          */
1262                         pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1263                                  gpa, bytes, page->role.word);
1264                         kvm_mmu_zap_page(vcpu->kvm, page);
1265                         continue;
1266                 }
1267                 page_offset = offset;
1268                 level = page->role.level;
1269                 npte = 1;
1270                 if (page->role.glevels == PT32_ROOT_LEVEL) {
1271                         page_offset <<= 1;      /* 32->64 */
1272                         /*
1273                          * A 32-bit pde maps 4MB while the shadow pdes map
1274                          * only 2MB.  So we need to double the offset again
1275                          * and zap two pdes instead of one.
1276                          */
1277                         if (level == PT32_ROOT_LEVEL) {
1278                                 page_offset &= ~7; /* kill rounding error */
1279                                 page_offset <<= 1;
1280                                 npte = 2;
1281                         }
1282                         quadrant = page_offset >> PAGE_SHIFT;
1283                         page_offset &= ~PAGE_MASK;
1284                         if (quadrant != page->role.quadrant)
1285                                 continue;
1286                 }
1287                 spte = &page->spt[page_offset / sizeof(*spte)];
1288                 while (npte--) {
1289                         mmu_pte_write_zap_pte(vcpu, page, spte);
1290                         mmu_pte_write_new_pte(vcpu, page, spte, new, bytes,
1291                                               page_offset & (pte_size - 1));
1292                         ++spte;
1293                 }
1294         }
1295         kvm_mmu_audit(vcpu, "post pte write");
1296 }
1297
1298 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1299 {
1300         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
1301
1302         return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
1303 }
1304
1305 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1306 {
1307         while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
1308                 struct kvm_mmu_page *page;
1309
1310                 page = container_of(vcpu->kvm->active_mmu_pages.prev,
1311                                     struct kvm_mmu_page, link);
1312                 kvm_mmu_zap_page(vcpu->kvm, page);
1313         }
1314 }
1315
1316 static void free_mmu_pages(struct kvm_vcpu *vcpu)
1317 {
1318         struct kvm_mmu_page *page;
1319
1320         while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
1321                 page = container_of(vcpu->kvm->active_mmu_pages.next,
1322                                     struct kvm_mmu_page, link);
1323                 kvm_mmu_zap_page(vcpu->kvm, page);
1324         }
1325         free_page((unsigned long)vcpu->mmu.pae_root);
1326 }
1327
1328 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1329 {
1330         struct page *page;
1331         int i;
1332
1333         ASSERT(vcpu);
1334
1335         if (vcpu->kvm->n_requested_mmu_pages)
1336                 vcpu->kvm->n_free_mmu_pages = vcpu->kvm->n_requested_mmu_pages;
1337         else
1338                 vcpu->kvm->n_free_mmu_pages = vcpu->kvm->n_alloc_mmu_pages;
1339         /*
1340          * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1341          * Therefore we need to allocate shadow page tables in the first
1342          * 4GB of memory, which happens to fit the DMA32 zone.
1343          */
1344         page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1345         if (!page)
1346                 goto error_1;
1347         vcpu->mmu.pae_root = page_address(page);
1348         for (i = 0; i < 4; ++i)
1349                 vcpu->mmu.pae_root[i] = INVALID_PAGE;
1350
1351         return 0;
1352
1353 error_1:
1354         free_mmu_pages(vcpu);
1355         return -ENOMEM;
1356 }
1357
1358 int kvm_mmu_create(struct kvm_vcpu *vcpu)
1359 {
1360         ASSERT(vcpu);
1361         ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1362
1363         return alloc_mmu_pages(vcpu);
1364 }
1365
1366 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1367 {
1368         ASSERT(vcpu);
1369         ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1370
1371         return init_kvm_mmu(vcpu);
1372 }
1373
1374 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
1375 {
1376         ASSERT(vcpu);
1377
1378         destroy_kvm_mmu(vcpu);
1379         free_mmu_pages(vcpu);
1380         mmu_free_memory_caches(vcpu);
1381 }
1382
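/*
 * Write-protect every spte belonging to memory slot @slot.  Each shadow
 * page's slot_bitmap records which slots it maps, so unrelated shadow
 * pages are skipped.  Typically used when dirty logging is enabled for the
 * slot, so that guest writes fault and can be tracked.
 */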
1383 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
1384 {
1385         struct kvm_mmu_page *page;
1386
1387         list_for_each_entry(page, &kvm->active_mmu_pages, link) {
1388                 int i;
1389                 u64 *pt;
1390
1391                 if (!test_bit(slot, &page->slot_bitmap))
1392                         continue;
1393
1394                 pt = page->spt;
1395                 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1396                         /* avoid RMW */
1397                         if (pt[i] & PT_WRITABLE_MASK) {
1398                                 rmap_remove(kvm, &pt[i]);
1399                                 pt[i] &= ~PT_WRITABLE_MASK;
1400                         }
1401         }
1402 }
1403
1404 void kvm_mmu_zap_all(struct kvm *kvm)
1405 {
1406         struct kvm_mmu_page *page, *node;
1407
1408         list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link)
1409                 kvm_mmu_zap_page(kvm, page);
1410
1411         kvm_flush_remote_tlbs(kvm);
1412 }
1413
1414 void kvm_mmu_module_exit(void)
1415 {
1416         if (pte_chain_cache)
1417                 kmem_cache_destroy(pte_chain_cache);
1418         if (rmap_desc_cache)
1419                 kmem_cache_destroy(rmap_desc_cache);
1420         if (mmu_page_header_cache)
1421                 kmem_cache_destroy(mmu_page_header_cache);
1422 }
1423
1424 int kvm_mmu_module_init(void)
1425 {
1426         pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1427                                             sizeof(struct kvm_pte_chain),
1428                                             0, 0, NULL);
1429         if (!pte_chain_cache)
1430                 goto nomem;
1431         rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1432                                             sizeof(struct kvm_rmap_desc),
1433                                             0, 0, NULL);
1434         if (!rmap_desc_cache)
1435                 goto nomem;
1436
1437         mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
1438                                                   sizeof(struct kvm_mmu_page),
1439                                                   0, 0, NULL);
1440         if (!mmu_page_header_cache)
1441                 goto nomem;
1442
1443         return 0;
1444
1445 nomem:
1446         kvm_mmu_module_exit();
1447         return -ENOMEM;
1448 }
1449
1450 #ifdef AUDIT
1451
1452 static const char *audit_msg;
1453
1454 static gva_t canonicalize(gva_t gva)
1455 {
1456 #ifdef CONFIG_X86_64
1457         gva = (long long)(gva << 16) >> 16;
1458 #endif
1459         return gva;
1460 }
1461
1462 static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1463                                 gva_t va, int level)
1464 {
1465         u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
1466         int i;
1467         gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
1468
1469         for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1470                 u64 ent = pt[i];
1471
1472                 if (ent == shadow_trap_nonpresent_pte)
1473                         continue;
1474
1475                 va = canonicalize(va);
1476                 if (level > 1) {
1477                         if (ent == shadow_notrap_nonpresent_pte)
1478                                 printk(KERN_ERR "audit: (%s) nontrapping pte"
1479                                        " in nonleaf level: levels %d gva %lx"
1480                                        " level %d pte %llx\n", audit_msg,
1481                                        vcpu->mmu.root_level, va, level, ent);
1482
1483                         audit_mappings_page(vcpu, ent, va, level - 1);
1484                 } else {
1485                         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
1486                         hpa_t hpa = gpa_to_hpa(vcpu, gpa);
1487
1488                         if (is_shadow_present_pte(ent)
1489                             && (ent & PT64_BASE_ADDR_MASK) != hpa)
1490                                 printk(KERN_ERR "xx audit error: (%s) levels %d"
1491                                        " gva %lx gpa %llx hpa %llx ent %llx %d\n",
1492                                        audit_msg, vcpu->mmu.root_level,
1493                                        va, gpa, hpa, ent,
1494                                        is_shadow_present_pte(ent));
1495                         else if (ent == shadow_notrap_nonpresent_pte
1496                                  && !is_error_hpa(hpa))
1497                                 printk(KERN_ERR "audit: (%s) notrap shadow,"
1498                                        " valid guest gva %lx\n", audit_msg, va);
1499
1500                 }
1501         }
1502 }
1503
1504 static void audit_mappings(struct kvm_vcpu *vcpu)
1505 {
1506         unsigned i;
1507
1508         if (vcpu->mmu.root_level == 4)
1509                 audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
1510         else
1511                 for (i = 0; i < 4; ++i)
1512                         if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
1513                                 audit_mappings_page(vcpu,
1514                                                     vcpu->mmu.pae_root[i],
1515                                                     i << 30,
1516                                                     2);
1517 }
1518
1519 static int count_rmaps(struct kvm_vcpu *vcpu)
1520 {
1521         int nmaps = 0;
1522         int i, j, k;
1523
1524         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1525                 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
1526                 struct kvm_rmap_desc *d;
1527
1528                 for (j = 0; j < m->npages; ++j) {
1529                         unsigned long *rmapp = &m->rmap[j];
1530
1531                         if (!*rmapp)
1532                                 continue;
1533                         if (!(*rmapp & 1)) {
1534                                 ++nmaps;
1535                                 continue;
1536                         }
1537                         d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
1538                         while (d) {
1539                                 for (k = 0; k < RMAP_EXT; ++k)
1540                                         if (d->shadow_ptes[k])
1541                                                 ++nmaps;
1542                                         else
1543                                                 break;
1544                                 d = d->more;
1545                         }
1546                 }
1547         }
1548         return nmaps;
1549 }
1550
1551 static int count_writable_mappings(struct kvm_vcpu *vcpu)
1552 {
1553         int nmaps = 0;
1554         struct kvm_mmu_page *page;
1555         int i;
1556
1557         list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1558                 u64 *pt = page->spt;
1559
1560                 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1561                         continue;
1562
1563                 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1564                         u64 ent = pt[i];
1565
1566                         if (!(ent & PT_PRESENT_MASK))
1567                                 continue;
1568                         if (!(ent & PT_WRITABLE_MASK))
1569                                 continue;
1570                         ++nmaps;
1571                 }
1572         }
1573         return nmaps;
1574 }
1575
1576 static void audit_rmap(struct kvm_vcpu *vcpu)
1577 {
1578         int n_rmap = count_rmaps(vcpu);
1579         int n_actual = count_writable_mappings(vcpu);
1580
1581         if (n_rmap != n_actual)
1582                 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1583                        __FUNCTION__, audit_msg, n_rmap, n_actual);
1584 }
1585
1586 static void audit_write_protection(struct kvm_vcpu *vcpu)
1587 {
1588         struct kvm_mmu_page *page;
1589         struct kvm_memory_slot *slot;
1590         unsigned long *rmapp;
1591         gfn_t gfn;
1592
1593         list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1594                 if (page->role.metaphysical)
1595                         continue;
1596
1597                 slot = gfn_to_memslot(vcpu->kvm, page->gfn);
1598                 gfn = unalias_gfn(vcpu->kvm, page->gfn);
1599                 rmapp = &slot->rmap[gfn - slot->base_gfn];
1600                 if (*rmapp)
1601                         printk(KERN_ERR "%s: (%s) shadow page has writable"
1602                                " mappings: gfn %lx role %x\n",
1603                                __FUNCTION__, audit_msg, page->gfn,
1604                                page->role.word);
1605         }
1606 }
1607
1608 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
1609 {
1610         int olddbg = dbg;
1611
1612         dbg = 0;
1613         audit_msg = msg;
1614         audit_rmap(vcpu);
1615         audit_write_protection(vcpu);
1616         audit_mappings(vcpu);
1617         dbg = olddbg;
1618 }
1619
1620 #endif