[PATCH] KVM: MMU: Ensure freed shadow pages are clean

Shadow pages must be clean (all ptes zapped) by the time they are
freed, so assert this in kvm_mmu_free_page() and keep count of free
pages in kvm->n_free_mmu_pages.  The recursive release_pt_page_64()
is replaced by kvm_mmu_zap_page(), which detaches a shadow page from
its parent ptes and its children before freeing it.  This is used by
kvm_mmu_unprotect_page() to zap the shadow pages for a guest page
table so the guest can write to it, by kvm_mmu_free_some_pages() to
recycle shadow pages when the free list runs low, and by
kvm_mmu_pre_write() to drop pages that receive misaligned writes,
which usually mean the page is no longer used as a page table.  The
inval_page callback and nonpaging_flush() are no longer needed and
are removed.
---
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index 6dbd83b..b9ba240 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -303,15 +303,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
        }
 }
 
-static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
-{
-       struct kvm_mmu_page *page_head = page_header(page_hpa);
-
-       list_del(&page_head->link);
-       page_head->page_hpa = page_hpa;
-       list_add(&page_head->link, &vcpu->free_pages);
-}
-
 static int is_empty_shadow_page(hpa_t page_hpa)
 {
        u32 *pos;
@@ -323,6 +314,21 @@ static int is_empty_shadow_page(hpa_t page_hpa)
        return 1;
 }
 
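+/*
+ * Return a shadow page to the vcpu's free list.  The page must be
+ * clean, i.e. all of its ptes must already have been zapped.
+ */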
+static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
+{
+       struct kvm_mmu_page *page_head = page_header(page_hpa);
+
+       ASSERT(is_empty_shadow_page(page_hpa));
+       list_del(&page_head->link);
+       page_head->page_hpa = page_hpa;
+       list_add(&page_head->link, &vcpu->free_pages);
+       ++vcpu->kvm->n_free_mmu_pages;
+}
+
 static unsigned kvm_page_table_hashfn(gfn_t gfn)
 {
        return gfn;
@@ -344,6 +346,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
        page->global = 1;
        page->multimapped = 0;
        page->parent_pte = parent_pte;
+       --vcpu->kvm->n_free_mmu_pages;
        return page;
 }
 
@@ -402,12 +405,22 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
                                break;
                        if (pte_chain->parent_ptes[i] != parent_pte)
                                continue;
-                       while (i + 1 < NR_PTE_CHAIN_ENTRIES) {
+                       while (i + 1 < NR_PTE_CHAIN_ENTRIES
+                               && pte_chain->parent_ptes[i + 1]) {
                                pte_chain->parent_ptes[i]
                                        = pte_chain->parent_ptes[i + 1];
                                ++i;
                        }
                        pte_chain->parent_ptes[i] = NULL;
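+                       /* Free the pte chain if it is now empty. */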
+                       if (i == 0) {
+                               hlist_del(&pte_chain->link);
+                               kfree(pte_chain);
+                               if (hlist_empty(&page->parent_ptes)) {
+                                       page->multimapped = 0;
+                                       page->parent_pte = NULL;
+                               }
+                       }
                        return;
                }
        BUG();
@@ -478,6 +490,39 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
        return page;
 }
 
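+/*
+ * Clear every pte in a shadow page.  Leaf ptes drop their rmap
+ * entries; non-leaf ptes drop the parent link in the child page.
+ */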
+static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
+                                        struct kvm_mmu_page *page)
+{
+       unsigned i;
+       u64 *pt;
+       u64 ent;
+
+       pt = __va(page->page_hpa);
+
+       if (page->role.level == PT_PAGE_TABLE_LEVEL) {
+               for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+                       if (pt[i] & PT_PRESENT_MASK)
+                               rmap_remove(vcpu->kvm, &pt[i]);
+                       pt[i] = 0;
+               }
+               return;
+       }
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+               ent = pt[i];
+
+               pt[i] = 0;
+               if (!(ent & PT_PRESENT_MASK))
+                       continue;
+               ent &= PT64_BASE_ADDR_MASK;
+               mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
+       }
+}
+
 static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
                             struct kvm_mmu_page *page,
                             u64 *parent_pte)
@@ -485,6 +526,60 @@ static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
        mmu_page_remove_parent_pte(page, parent_pte);
 }
 
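+/*
+ * Tear down a shadow page: detach it from every parent pte, unlink
+ * its children, drop it from the hash, and return it to the free list.
+ */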
+static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu,
+                            struct kvm_mmu_page *page)
+{
+       u64 *parent_pte;
+
+       while (page->multimapped || page->parent_pte) {
+               if (!page->multimapped)
+                       parent_pte = page->parent_pte;
+               else {
+                       struct kvm_pte_chain *chain;
+
+                       chain = container_of(page->parent_ptes.first,
+                                            struct kvm_pte_chain, link);
+                       parent_pte = chain->parent_ptes[0];
+               }
+               BUG_ON(!parent_pte);
+               kvm_mmu_put_page(vcpu, page, parent_pte);
+               *parent_pte = 0;
+       }
+       kvm_mmu_page_unlink_children(vcpu, page);
+       hlist_del(&page->hash_link);
+       kvm_mmu_free_page(vcpu, page->page_hpa);
+}
+
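+/*
+ * Zap all shadow pages for @gfn, removing its write protection.
+ * Returns nonzero if any page was found and zapped.
+ */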
+static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+       unsigned index;
+       struct hlist_head *bucket;
+       struct kvm_mmu_page *page;
+       struct hlist_node *node, *n;
+       int r;
+
+       pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+       r = 0;
+       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+       bucket = &vcpu->kvm->mmu_page_hash[index];
+       hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
+               if (page->gfn == gfn && !page->role.metaphysical) {
+                       pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
+                                page->role.word);
+                       kvm_mmu_zap_page(vcpu, page);
+                       r = 1;
+               }
+       return r;
+}
+
 static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
 {
        int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
@@ -523,35 +610,6 @@ hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
        return gpa_to_hpa(vcpu, gpa);
 }
 
-
-static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa,
-                              int level)
-{
-       u64 *pos;
-       u64 *end;
-
-       ASSERT(vcpu);
-       ASSERT(VALID_PAGE(page_hpa));
-       ASSERT(level <= PT64_ROOT_LEVEL && level > 0);
-
-       for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE;
-            pos != end; pos++) {
-               u64 current_ent = *pos;
-
-               if (is_present_pte(current_ent)) {
-                       if (level != 1)
-                               release_pt_page_64(vcpu,
-                                                 current_ent &
-                                                 PT64_BASE_ADDR_MASK,
-                                                 level - 1);
-                       else
-                               rmap_remove(vcpu->kvm, pos);
-               }
-               *pos = 0;
-       }
-       kvm_mmu_free_page(vcpu, page_hpa);
-}
-
 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
 }
@@ -658,18 +716,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
        vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
 }
 
-static void nonpaging_flush(struct kvm_vcpu *vcpu)
-{
-       hpa_t root = vcpu->mmu.root_hpa;
-
-       ++kvm_stat.tlb_flush;
-       pgprintk("nonpaging_flush\n");
-       mmu_free_roots(vcpu);
-       mmu_alloc_roots(vcpu);
-       kvm_arch_ops->set_cr3(vcpu, root);
-       kvm_arch_ops->tlb_flush(vcpu);
-}
-
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
        return vaddr;
@@ -678,32 +724,18 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
                               u32 error_code)
 {
-       int ret;
        gpa_t addr = gva;
+       hpa_t paddr;
 
        ASSERT(vcpu);
        ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
 
-       for (;;) {
-            hpa_t paddr;
-
-            paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
-
-            if (is_error_hpa(paddr))
-                    return 1;
+       paddr = gpa_to_hpa(vcpu, addr & PT64_BASE_ADDR_MASK);
 
-            ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
-            if (ret) {
-                    nonpaging_flush(vcpu);
-                    continue;
-            }
-            break;
-       }
-       return ret;
-}
+       if (is_error_hpa(paddr))
+               return 1;
 
-static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
-{
+       return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
 }
 
 static void nonpaging_free(struct kvm_vcpu *vcpu)
@@ -717,7 +750,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 
        context->new_cr3 = nonpaging_new_cr3;
        context->page_fault = nonpaging_page_fault;
-       context->inval_page = nonpaging_inval_page;
        context->gva_to_gpa = nonpaging_gva_to_gpa;
        context->free = nonpaging_free;
        context->root_level = 0;
@@ -830,42 +862,6 @@ static int may_access(u64 pte, int write, int user)
        return 1;
 }
 
-/*
- * Remove a shadow pte.
- */
-static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
-{
-       hpa_t page_addr = vcpu->mmu.root_hpa;
-       int level = vcpu->mmu.shadow_root_level;
-
-       ++kvm_stat.invlpg;
-
-       for (; ; level--) {
-               u32 index = PT64_INDEX(addr, level);
-               u64 *table = __va(page_addr);
-
-               if (level == PT_PAGE_TABLE_LEVEL ) {
-                       rmap_remove(vcpu->kvm, &table[index]);
-                       table[index] = 0;
-                       return;
-               }
-
-               if (!is_present_pte(table[index]))
-                       return;
-
-               page_addr = table[index] & PT64_BASE_ADDR_MASK;
-
-               if (level == PT_DIRECTORY_LEVEL &&
-                         (table[index] & PT_SHADOW_PS_MARK)) {
-                       table[index] = 0;
-                       release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL);
-
-                       kvm_arch_ops->tlb_flush(vcpu);
-                       return;
-               }
-       }
-}
-
 static void paging_free(struct kvm_vcpu *vcpu)
 {
        nonpaging_free(vcpu);
@@ -886,7 +882,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
        ASSERT(is_pae(vcpu));
        context->new_cr3 = paging_new_cr3;
        context->page_fault = paging64_page_fault;
-       context->inval_page = paging_inval_page;
        context->gva_to_gpa = paging64_gva_to_gpa;
        context->free = paging_free;
        context->root_level = level;
@@ -909,7 +904,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 
        context->new_cr3 = paging_new_cr3;
        context->page_fault = paging32_page_fault;
-       context->inval_page = paging_inval_page;
        context->gva_to_gpa = paging32_gva_to_gpa;
        context->free = paging_free;
        context->root_level = PT32_ROOT_LEVEL;
@@ -961,21 +955,36 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
        gfn_t gfn = gpa >> PAGE_SHIFT;
        struct kvm_mmu_page *page;
        struct kvm_mmu_page *child;
-       struct hlist_node *node;
+       struct hlist_node *node, *n;
        struct hlist_head *bucket;
        unsigned index;
        u64 *spte;
        u64 pte;
        unsigned offset = offset_in_page(gpa);
+       unsigned pte_size;
        unsigned page_offset;
+       unsigned misaligned;
        int level;
 
        pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
        index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
        bucket = &vcpu->kvm->mmu_page_hash[index];
-       hlist_for_each_entry(page, node, bucket, hash_link) {
+       hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
                if (page->gfn != gfn || page->role.metaphysical)
                        continue;
+               pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
+               misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+               if (misaligned) {
+                       /*
+                        * Misaligned accesses are too much trouble to fix
+                        * up; also, they usually indicate a page is not used
+                        * as a page table.
+                        */
+                       pgprintk("misaligned: gpa %llx bytes %d role %x\n",
+                                gpa, bytes, page->role.word);
+                       kvm_mmu_zap_page(vcpu, page);
+                       continue;
+               }
                page_offset = offset;
                level = page->role.level;
                if (page->role.glevels == PT32_ROOT_LEVEL) {
@@ -1001,6 +1010,29 @@ void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
 {
 }
 
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
+{
+       gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
+
+       return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
+}
+
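+/*
+ * Recycle shadow pages from the tail of the active list until the
+ * free list again holds at least KVM_REFILL_PAGES pages.
+ */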
+void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+{
+       while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
+               struct kvm_mmu_page *page;
+
+               page = container_of(vcpu->kvm->active_mmu_pages.prev,
+                                   struct kvm_mmu_page, link);
+               kvm_mmu_zap_page(vcpu, page);
+       }
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_some_pages);
+
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 {
        while (!list_empty(&vcpu->free_pages)) {
@@ -1032,6 +1060,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
                page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
                memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
                list_add(&page_header->link, &vcpu->free_pages);
+               ++vcpu->kvm->n_free_mmu_pages;
        }
 
        /*