Merge git://git.kernel.org/pub/scm/linux/kernel/git/bart/ide-2.6
authorLinus Torvalds <torvalds@linux-foundation.org>
Mon, 5 May 2008 00:08:21 +0000 (17:08 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Mon, 5 May 2008 00:08:21 +0000 (17:08 -0700)
* git://git.kernel.org/pub/scm/linux/kernel/git/bart/ide-2.6:
  ide: IDE_HFLAG_SERIALIZE_DMA bugfix

18 files changed:
arch/powerpc/kvm/booke_guest.c
arch/powerpc/kvm/powerpc.c
arch/x86/kernel/kvmclock.c
arch/x86/kvm/i8254.c
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu.h
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/vmx.h
arch/x86/kvm/x86.c
arch/x86/kvm/x86_emulate.c
include/asm-powerpc/kvm_host.h
include/asm-powerpc/kvm_ppc.h
include/asm-x86/kvm_host.h
include/linux/sysfs.h
init/Kconfig
kernel/module.c
virt/kvm/kvm_main.c

index 6d9884a..712d89a 100644 (file)
@@ -49,6 +49,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "inst_emu",   VCPU_STAT(emulated_inst_exits) },
        { "dec",        VCPU_STAT(dec_exits) },
        { "ext_intr",   VCPU_STAT(ext_intr_exits) },
+       { "halt_wakeup", VCPU_STAT(halt_wakeup) },
        { NULL }
 };
 
@@ -338,6 +339,11 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                }
                break;
 
+       case BOOKE_INTERRUPT_FP_UNAVAIL:
+               kvmppc_queue_exception(vcpu, exit_nr);
+               r = RESUME_GUEST;
+               break;
+
        case BOOKE_INTERRUPT_DATA_STORAGE:
                vcpu->arch.dear = vcpu->arch.fault_dear;
                vcpu->arch.esr = vcpu->arch.fault_esr;
index bad40bd..777e0f3 100644 (file)
@@ -36,13 +36,12 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 
 int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 {
-       /* XXX implement me */
-       return 0;
+       return !!(v->arch.pending_exceptions);
 }
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-       return 1;
+       return !(v->arch.msr & MSR_WE);
 }
 
 
@@ -214,6 +213,11 @@ static void kvmppc_decrementer_func(unsigned long data)
        struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
 
        kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_DECREMENTER);
+
+       if (waitqueue_active(&vcpu->wq)) {
+               wake_up_interruptible(&vcpu->wq);
+               vcpu->stat.halt_wakeup++;
+       }
 }
 
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
@@ -339,6 +343,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
        int r;
        sigset_t sigsaved;
 
+       vcpu_load(vcpu);
+
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
@@ -363,12 +369,20 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
+       vcpu_put(vcpu);
+
        return r;
 }
 
 int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 {
        kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_EXTERNAL);
+
+       if (waitqueue_active(&vcpu->wq)) {
+               wake_up_interruptible(&vcpu->wq);
+               vcpu->stat.halt_wakeup++;
+       }
+
        return 0;
 }
 
index ddee040..4bc1be5 100644 (file)
@@ -133,6 +133,7 @@ static int kvm_register_clock(void)
        return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
 }
 
+#ifdef CONFIG_X86_LOCAL_APIC
 static void kvm_setup_secondary_clock(void)
 {
        /*
@@ -143,6 +144,7 @@ static void kvm_setup_secondary_clock(void)
        /* ok, done with our trickery, call native */
        setup_secondary_APIC_clock();
 }
+#endif
 
 /*
  * After the clock is registered, the host will keep writing to the
@@ -177,7 +179,9 @@ void __init kvmclock_init(void)
                pv_time_ops.get_wallclock = kvm_get_wallclock;
                pv_time_ops.set_wallclock = kvm_set_wallclock;
                pv_time_ops.sched_clock = kvm_clock_read;
+#ifdef CONFIG_X86_LOCAL_APIC
                pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
+#endif
                machine_ops.shutdown  = kvm_shutdown;
 #ifdef CONFIG_KEXEC
                machine_ops.crash_shutdown  = kvm_crash_shutdown;
index 4c943ea..3324d90 100644 (file)
@@ -288,6 +288,8 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
         * mode 1 is one shot, mode 2 is period, otherwise del timer */
        switch (ps->channels[0].mode) {
        case 1:
+        /* FIXME: enhance mode 4 precision */
+       case 4:
                create_pit_timer(&ps->pit_timer, val, 0);
                break;
        case 2:
index 2ad6f54..36c5406 100644 (file)
@@ -79,36 +79,6 @@ static int dbg = 1;
        }
 #endif
 
-#define PT64_PT_BITS 9
-#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
-#define PT32_PT_BITS 10
-#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
-
-#define PT_WRITABLE_SHIFT 1
-
-#define PT_PRESENT_MASK (1ULL << 0)
-#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
-#define PT_USER_MASK (1ULL << 2)
-#define PT_PWT_MASK (1ULL << 3)
-#define PT_PCD_MASK (1ULL << 4)
-#define PT_ACCESSED_MASK (1ULL << 5)
-#define PT_DIRTY_MASK (1ULL << 6)
-#define PT_PAGE_SIZE_MASK (1ULL << 7)
-#define PT_PAT_MASK (1ULL << 7)
-#define PT_GLOBAL_MASK (1ULL << 8)
-#define PT64_NX_SHIFT 63
-#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
-
-#define PT_PAT_SHIFT 7
-#define PT_DIR_PAT_SHIFT 12
-#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
-
-#define PT32_DIR_PSE36_SIZE 4
-#define PT32_DIR_PSE36_SHIFT 13
-#define PT32_DIR_PSE36_MASK \
-       (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
-
-
 #define PT_FIRST_AVAIL_BITS_SHIFT 9
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 
@@ -154,10 +124,6 @@ static int dbg = 1;
 #define PFERR_USER_MASK (1U << 2)
 #define PFERR_FETCH_MASK (1U << 4)
 
-#define PT64_ROOT_LEVEL 4
-#define PT32_ROOT_LEVEL 2
-#define PT32E_ROOT_LEVEL 3
-
 #define PT_DIRECTORY_LEVEL 2
 #define PT_PAGE_TABLE_LEVEL 1
 
@@ -186,6 +152,12 @@ static struct kmem_cache *mmu_page_header_cache;
 
 static u64 __read_mostly shadow_trap_nonpresent_pte;
 static u64 __read_mostly shadow_notrap_nonpresent_pte;
+static u64 __read_mostly shadow_base_present_pte;
+static u64 __read_mostly shadow_nx_mask;
+static u64 __read_mostly shadow_x_mask;        /* mutual exclusive with nx_mask */
+static u64 __read_mostly shadow_user_mask;
+static u64 __read_mostly shadow_accessed_mask;
+static u64 __read_mostly shadow_dirty_mask;
 
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 {
@@ -194,6 +166,23 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
 
+void kvm_mmu_set_base_ptes(u64 base_pte)
+{
+       shadow_base_present_pte = base_pte;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
+
+void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
+               u64 dirty_mask, u64 nx_mask, u64 x_mask)
+{
+       shadow_user_mask = user_mask;
+       shadow_accessed_mask = accessed_mask;
+       shadow_dirty_mask = dirty_mask;
+       shadow_nx_mask = nx_mask;
+       shadow_x_mask = x_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
+
 static int is_write_protection(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.cr0 & X86_CR0_WP;
@@ -232,7 +221,7 @@ static int is_writeble_pte(unsigned long pte)
 
 static int is_dirty_pte(unsigned long pte)
 {
-       return pte & PT_DIRTY_MASK;
+       return pte & shadow_dirty_mask;
 }
 
 static int is_rmap_pte(u64 pte)
@@ -387,7 +376,6 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 
        write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
        *write_count += 1;
-       WARN_ON(*write_count > KVM_PAGES_PER_HPAGE);
 }
 
 static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -547,7 +535,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
                return;
        sp = page_header(__pa(spte));
        pfn = spte_to_pfn(*spte);
-       if (*spte & PT_ACCESSED_MASK)
+       if (*spte & shadow_accessed_mask)
                kvm_set_pfn_accessed(pfn);
        if (is_writeble_pte(*spte))
                kvm_release_pfn_dirty(pfn);
@@ -1073,17 +1061,17 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
         * whether the guest actually used the pte (in order to detect
         * demand paging).
         */
-       spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
+       spte = shadow_base_present_pte | shadow_dirty_mask;
        if (!speculative)
                pte_access |= PT_ACCESSED_MASK;
        if (!dirty)
                pte_access &= ~ACC_WRITE_MASK;
-       if (!(pte_access & ACC_EXEC_MASK))
-               spte |= PT64_NX_MASK;
-
-       spte |= PT_PRESENT_MASK;
+       if (pte_access & ACC_EXEC_MASK)
+               spte |= shadow_x_mask;
+       else
+               spte |= shadow_nx_mask;
        if (pte_access & ACC_USER_MASK)
-               spte |= PT_USER_MASK;
+               spte |= shadow_user_mask;
        if (largepage)
                spte |= PT_PAGE_SIZE_MASK;
 
@@ -1188,8 +1176,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
                                return -ENOMEM;
                        }
 
-                       table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
-                               | PT_WRITABLE_MASK | PT_USER_MASK;
+                       table[index] = __pa(new_table->spt)
+                               | PT_PRESENT_MASK | PT_WRITABLE_MASK
+                               | shadow_user_mask | shadow_x_mask;
                }
                table_addr = table[index] & PT64_BASE_ADDR_MASK;
        }
@@ -1244,7 +1233,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
        if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
                return;
        spin_lock(&vcpu->kvm->mmu_lock);
-#ifdef CONFIG_X86_64
        if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
                hpa_t root = vcpu->arch.mmu.root_hpa;
 
@@ -1256,7 +1244,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
                spin_unlock(&vcpu->kvm->mmu_lock);
                return;
        }
-#endif
        for (i = 0; i < 4; ++i) {
                hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -1282,7 +1269,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 
        root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
 
-#ifdef CONFIG_X86_64
        if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
                hpa_t root = vcpu->arch.mmu.root_hpa;
 
@@ -1297,7 +1283,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
                vcpu->arch.mmu.root_hpa = root;
                return;
        }
-#endif
        metaphysical = !is_paging(vcpu);
        if (tdp_enabled)
                metaphysical = 1;
@@ -1377,7 +1362,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
        spin_lock(&vcpu->kvm->mmu_lock);
        kvm_mmu_free_some_pages(vcpu);
        r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
-                        largepage, gfn, pfn, TDP_ROOT_LEVEL);
+                        largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
        spin_unlock(&vcpu->kvm->mmu_lock);
 
        return r;
@@ -1484,7 +1469,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
        context->page_fault = tdp_page_fault;
        context->free = nonpaging_free;
        context->prefetch_page = nonpaging_prefetch_page;
-       context->shadow_root_level = TDP_ROOT_LEVEL;
+       context->shadow_root_level = kvm_x86_ops->get_tdp_level();
        context->root_hpa = INVALID_PAGE;
 
        if (!is_paging(vcpu)) {
@@ -1633,7 +1618,7 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
 {
        u64 *spte = vcpu->arch.last_pte_updated;
 
-       return !!(spte && (*spte & PT_ACCESSED_MASK));
+       return !!(spte && (*spte & shadow_accessed_mask));
 }
 
 static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
index e64e9f5..1730757 100644 (file)
@@ -3,11 +3,38 @@
 
 #include <linux/kvm_host.h>
 
-#ifdef CONFIG_X86_64
-#define TDP_ROOT_LEVEL PT64_ROOT_LEVEL
-#else
-#define TDP_ROOT_LEVEL PT32E_ROOT_LEVEL
-#endif
+#define PT64_PT_BITS 9
+#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
+#define PT32_PT_BITS 10
+#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
+
+#define PT_WRITABLE_SHIFT 1
+
+#define PT_PRESENT_MASK (1ULL << 0)
+#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
+#define PT_USER_MASK (1ULL << 2)
+#define PT_PWT_MASK (1ULL << 3)
+#define PT_PCD_MASK (1ULL << 4)
+#define PT_ACCESSED_MASK (1ULL << 5)
+#define PT_DIRTY_MASK (1ULL << 6)
+#define PT_PAGE_SIZE_MASK (1ULL << 7)
+#define PT_PAT_MASK (1ULL << 7)
+#define PT_GLOBAL_MASK (1ULL << 8)
+#define PT64_NX_SHIFT 63
+#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
+
+#define PT_PAT_SHIFT 7
+#define PT_DIR_PAT_SHIFT 12
+#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
+
+#define PT32_DIR_PSE36_SIZE 4
+#define PT32_DIR_PSE36_SHIFT 13
+#define PT32_DIR_PSE36_MASK \
+       (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
+
+#define PT64_ROOT_LEVEL 4
+#define PT32_ROOT_LEVEL 2
+#define PT32E_ROOT_LEVEL 3
 
 static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
index 89e0be2..ab22615 100644 (file)
@@ -1863,6 +1863,15 @@ static bool svm_cpu_has_accelerated_tpr(void)
        return false;
 }
 
+static int get_npt_level(void)
+{
+#ifdef CONFIG_X86_64
+       return PT64_ROOT_LEVEL;
+#else
+       return PT32E_ROOT_LEVEL;
+#endif
+}
+
 static struct kvm_x86_ops svm_x86_ops = {
        .cpu_has_kvm_support = has_svm,
        .disabled_by_bios = is_disabled,
@@ -1920,6 +1929,7 @@ static struct kvm_x86_ops svm_x86_ops = {
        .inject_pending_vectors = do_interrupt_requests,
 
        .set_tss_addr = svm_set_tss_addr,
+       .get_tdp_level = get_npt_level,
 };
 
 static int __init svm_init(void)
index 8e5d664..bfe4db1 100644 (file)
@@ -42,6 +42,9 @@ module_param(enable_vpid, bool, 0);
 static int flexpriority_enabled = 1;
 module_param(flexpriority_enabled, bool, 0);
 
+static int enable_ept = 1;
+module_param(enable_ept, bool, 0);
+
 struct vmcs {
        u32 revision_id;
        u32 abort;
@@ -84,7 +87,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
        return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
-static int init_rmode_tss(struct kvm *kvm);
+static int init_rmode(struct kvm *kvm);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -107,6 +110,11 @@ static struct vmcs_config {
        u32 vmentry_ctrl;
 } vmcs_config;
 
+struct vmx_capability {
+       u32 ept;
+       u32 vpid;
+} vmx_capability;
+
 #define VMX_SEGMENT_FIELD(seg)                                 \
        [VCPU_SREG_##seg] = {                                   \
                .selector = GUEST_##seg##_SELECTOR,             \
@@ -214,6 +222,32 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
                    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
 }
 
+static inline int cpu_has_vmx_invept_individual_addr(void)
+{
+       return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT));
+}
+
+static inline int cpu_has_vmx_invept_context(void)
+{
+       return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT));
+}
+
+static inline int cpu_has_vmx_invept_global(void)
+{
+       return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT));
+}
+
+static inline int cpu_has_vmx_ept(void)
+{
+       return (vmcs_config.cpu_based_2nd_exec_ctrl &
+               SECONDARY_EXEC_ENABLE_EPT);
+}
+
+static inline int vm_need_ept(void)
+{
+       return (cpu_has_vmx_ept() && enable_ept);
+}
+
 static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
 {
        return ((cpu_has_vmx_virtualize_apic_accesses()) &&
@@ -250,6 +284,18 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
                  : : "a"(&operand), "c"(ext) : "cc", "memory");
 }
 
+static inline void __invept(int ext, u64 eptp, gpa_t gpa)
+{
+       struct {
+               u64 eptp, gpa;
+       } operand = {eptp, gpa};
+
+       asm volatile (ASM_VMX_INVEPT
+                       /* CF==1 or ZF==1 --> rc = -1 */
+                       "; ja 1f ; ud2 ; 1:\n"
+                       : : "a" (&operand), "c" (ext) : "cc", "memory");
+}
+
 static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
 {
        int i;
@@ -301,6 +347,33 @@ static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
        __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
 }
 
+static inline void ept_sync_global(void)
+{
+       if (cpu_has_vmx_invept_global())
+               __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+}
+
+static inline void ept_sync_context(u64 eptp)
+{
+       if (vm_need_ept()) {
+               if (cpu_has_vmx_invept_context())
+                       __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+               else
+                       ept_sync_global();
+       }
+}
+
+static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
+{
+       if (vm_need_ept()) {
+               if (cpu_has_vmx_invept_individual_addr())
+                       __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
+                                       eptp, gpa);
+               else
+                       ept_sync_context(eptp);
+       }
+}
+
 static unsigned long vmcs_readl(unsigned long field)
 {
        unsigned long value;
@@ -388,6 +461,8 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
                eb |= 1u << 1;
        if (vcpu->arch.rmode.active)
                eb = ~0;
+       if (vm_need_ept())
+               eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
        vmcs_write32(EXCEPTION_BITMAP, eb);
 }
 
@@ -985,7 +1060,7 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 {
        u32 vmx_msr_low, vmx_msr_high;
-       u32 min, opt;
+       u32 min, opt, min2, opt2;
        u32 _pin_based_exec_control = 0;
        u32 _cpu_based_exec_control = 0;
        u32 _cpu_based_2nd_exec_control = 0;
@@ -1003,6 +1078,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
              CPU_BASED_CR8_LOAD_EXITING |
              CPU_BASED_CR8_STORE_EXITING |
 #endif
+             CPU_BASED_CR3_LOAD_EXITING |
+             CPU_BASED_CR3_STORE_EXITING |
              CPU_BASED_USE_IO_BITMAPS |
              CPU_BASED_MOV_DR_EXITING |
              CPU_BASED_USE_TSC_OFFSETING;
@@ -1018,11 +1095,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                                           ~CPU_BASED_CR8_STORE_EXITING;
 #endif
        if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
-               min = 0;
-               opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+               min2 = 0;
+               opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                        SECONDARY_EXEC_WBINVD_EXITING |
-                       SECONDARY_EXEC_ENABLE_VPID;
-               if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
+                       SECONDARY_EXEC_ENABLE_VPID |
+                       SECONDARY_EXEC_ENABLE_EPT;
+               if (adjust_vmx_controls(min2, opt2,
+                                       MSR_IA32_VMX_PROCBASED_CTLS2,
                                        &_cpu_based_2nd_exec_control) < 0)
                        return -EIO;
        }
@@ -1031,6 +1110,16 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
                _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
 #endif
+       if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
+               /* CR3 accesses don't need to cause VM Exits when EPT enabled */
+               min &= ~(CPU_BASED_CR3_LOAD_EXITING |
+                        CPU_BASED_CR3_STORE_EXITING);
+               if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
+                                       &_cpu_based_exec_control) < 0)
+                       return -EIO;
+               rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
+                     vmx_capability.ept, vmx_capability.vpid);
+       }
 
        min = 0;
 #ifdef CONFIG_X86_64
@@ -1256,7 +1345,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
        fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
 
        kvm_mmu_reset_context(vcpu);
-       init_rmode_tss(vcpu->kvm);
+       init_rmode(vcpu->kvm);
 }
 
 #ifdef CONFIG_X86_64
@@ -1304,8 +1393,64 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
        vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
 }
 
+static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
+{
+       if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+               if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
+                       printk(KERN_ERR "EPT: Fail to load pdptrs!\n");
+                       return;
+               }
+               vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
+               vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
+               vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
+               vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]);
+       }
+}
+
+static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+
+static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
+                                       unsigned long cr0,
+                                       struct kvm_vcpu *vcpu)
+{
+       if (!(cr0 & X86_CR0_PG)) {
+               /* From paging/starting to nonpaging */
+               vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+                            vmcs_config.cpu_based_exec_ctrl |
+                            (CPU_BASED_CR3_LOAD_EXITING |
+                             CPU_BASED_CR3_STORE_EXITING));
+               vcpu->arch.cr0 = cr0;
+               vmx_set_cr4(vcpu, vcpu->arch.cr4);
+               *hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
+               *hw_cr0 &= ~X86_CR0_WP;
+       } else if (!is_paging(vcpu)) {
+               /* From nonpaging to paging */
+               vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+                            vmcs_config.cpu_based_exec_ctrl &
+                            ~(CPU_BASED_CR3_LOAD_EXITING |
+                              CPU_BASED_CR3_STORE_EXITING));
+               vcpu->arch.cr0 = cr0;
+               vmx_set_cr4(vcpu, vcpu->arch.cr4);
+               if (!(vcpu->arch.cr0 & X86_CR0_WP))
+                       *hw_cr0 &= ~X86_CR0_WP;
+       }
+}
+
+static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
+                                       struct kvm_vcpu *vcpu)
+{
+       if (!is_paging(vcpu)) {
+               *hw_cr4 &= ~X86_CR4_PAE;
+               *hw_cr4 |= X86_CR4_PSE;
+       } else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
+               *hw_cr4 &= ~X86_CR4_PAE;
+}
+
 static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
+       unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) |
+                               KVM_VM_CR0_ALWAYS_ON;
+
        vmx_fpu_deactivate(vcpu);
 
        if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
@@ -1323,29 +1468,61 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        }
 #endif
 
+       if (vm_need_ept())
+               ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+
        vmcs_writel(CR0_READ_SHADOW, cr0);
-       vmcs_writel(GUEST_CR0,
-                   (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
+       vmcs_writel(GUEST_CR0, hw_cr0);
        vcpu->arch.cr0 = cr0;
 
        if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
                vmx_fpu_activate(vcpu);
 }
 
+static u64 construct_eptp(unsigned long root_hpa)
+{
+       u64 eptp;
+
+       /* TODO write the value reading from MSR */
+       eptp = VMX_EPT_DEFAULT_MT |
+               VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
+       eptp |= (root_hpa & PAGE_MASK);
+
+       return eptp;
+}
+
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
+       unsigned long guest_cr3;
+       u64 eptp;
+
+       guest_cr3 = cr3;
+       if (vm_need_ept()) {
+               eptp = construct_eptp(cr3);
+               vmcs_write64(EPT_POINTER, eptp);
+               ept_sync_context(eptp);
+               ept_load_pdptrs(vcpu);
+               guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
+                       VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+       }
+
        vmx_flush_tlb(vcpu);
-       vmcs_writel(GUEST_CR3, cr3);
+       vmcs_writel(GUEST_CR3, guest_cr3);
        if (vcpu->arch.cr0 & X86_CR0_PE)
                vmx_fpu_deactivate(vcpu);
 }
 
 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-       vmcs_writel(CR4_READ_SHADOW, cr4);
-       vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
-                   KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
+       unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ?
+                   KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+
        vcpu->arch.cr4 = cr4;
+       if (vm_need_ept())
+               ept_update_paging_mode_cr4(&hw_cr4, vcpu);
+
+       vmcs_writel(CR4_READ_SHADOW, cr4);
+       vmcs_writel(GUEST_CR4, hw_cr4);
 }
 
 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -1530,6 +1707,41 @@ out:
        return ret;
 }
 
+static int init_rmode_identity_map(struct kvm *kvm)
+{
+       int i, r, ret;
+       pfn_t identity_map_pfn;
+       u32 tmp;
+
+       if (!vm_need_ept())
+               return 1;
+       if (unlikely(!kvm->arch.ept_identity_pagetable)) {
+               printk(KERN_ERR "EPT: identity-mapping pagetable "
+                       "haven't been allocated!\n");
+               return 0;
+       }
+       if (likely(kvm->arch.ept_identity_pagetable_done))
+               return 1;
+       ret = 0;
+       identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT;
+       r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
+       if (r < 0)
+               goto out;
+       /* Set up identity-mapping pagetable for EPT in real mode */
+       for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
+               tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
+                       _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+               r = kvm_write_guest_page(kvm, identity_map_pfn,
+                               &tmp, i * sizeof(tmp), sizeof(tmp));
+               if (r < 0)
+                       goto out;
+       }
+       kvm->arch.ept_identity_pagetable_done = true;
+       ret = 1;
+out:
+       return ret;
+}
+
 static void seg_setup(int seg)
 {
        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -1564,6 +1776,31 @@ out:
        return r;
 }
 
+static int alloc_identity_pagetable(struct kvm *kvm)
+{
+       struct kvm_userspace_memory_region kvm_userspace_mem;
+       int r = 0;
+
+       down_write(&kvm->slots_lock);
+       if (kvm->arch.ept_identity_pagetable)
+               goto out;
+       kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
+       kvm_userspace_mem.flags = 0;
+       kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+       kvm_userspace_mem.memory_size = PAGE_SIZE;
+       r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
+       if (r)
+               goto out;
+
+       down_read(&current->mm->mmap_sem);
+       kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
+                       VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
+       up_read(&current->mm->mmap_sem);
+out:
+       up_write(&kvm->slots_lock);
+       return r;
+}
+
 static void allocate_vpid(struct vcpu_vmx *vmx)
 {
        int vpid;
@@ -1638,6 +1875,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
                                CPU_BASED_CR8_LOAD_EXITING;
 #endif
        }
+       if (!vm_need_ept())
+               exec_control |= CPU_BASED_CR3_STORE_EXITING |
+                               CPU_BASED_CR3_LOAD_EXITING;
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
 
        if (cpu_has_secondary_exec_ctrls()) {
@@ -1647,6 +1887,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
                                ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
                if (vmx->vpid == 0)
                        exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
+               if (!vm_need_ept())
+                       exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
        }
 
@@ -1722,6 +1964,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        return 0;
 }
 
+static int init_rmode(struct kvm *kvm)
+{
+       if (!init_rmode_tss(kvm))
+               return 0;
+       if (!init_rmode_identity_map(kvm))
+               return 0;
+       return 1;
+}
+
 static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1729,7 +1980,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        int ret;
 
        down_read(&vcpu->kvm->slots_lock);
-       if (!init_rmode_tss(vmx->vcpu.kvm)) {
+       if (!init_rmode(vmx->vcpu.kvm)) {
                ret = -ENOMEM;
                goto out;
        }
@@ -1994,6 +2245,9 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
                error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
        if (is_page_fault(intr_info)) {
+               /* EPT won't cause page fault directly */
+               if (vm_need_ept())
+                       BUG();
                cr2 = vmcs_readl(EXIT_QUALIFICATION);
                KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
                            (u32)((u64)cr2 >> 32), handler);
@@ -2323,6 +2577,64 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        return kvm_task_switch(vcpu, tss_selector, reason);
 }
 
+static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       u64 exit_qualification;
+       enum emulation_result er;
+       gpa_t gpa;
+       unsigned long hva;
+       int gla_validity;
+       int r;
+
+       exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+
+       if (exit_qualification & (1 << 6)) {
+               printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
+               return -ENOTSUPP;
+       }
+
+       gla_validity = (exit_qualification >> 7) & 0x3;
+       if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
+               printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
+               printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
+                       (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
+                       (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
+               printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
+                       (long unsigned int)exit_qualification);
+               kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+               kvm_run->hw.hardware_exit_reason = 0;
+               return -ENOTSUPP;
+       }
+
+       gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+       hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT);
+       if (!kvm_is_error_hva(hva)) {
+               r = kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
+               if (r < 0) {
+                       printk(KERN_ERR "EPT: Not enough memory!\n");
+                       return -ENOMEM;
+               }
+               return 1;
+       } else {
+               /* must be MMIO */
+               er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+
+               if (er == EMULATE_FAIL) {
+                       printk(KERN_ERR
+                        "EPT: Fail to handle EPT violation vmexit!er is %d\n",
+                        er);
+                       printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
+                        (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
+                        (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
+                       printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
+                               (long unsigned int)exit_qualification);
+                       return -ENOTSUPP;
+               } else if (er == EMULATE_DO_MMIO)
+                       return 0;
+       }
+       return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -2346,6 +2658,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
        [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
        [EXIT_REASON_WBINVD]                  = handle_wbinvd,
        [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
+       [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -2364,6 +2677,13 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP),
                    (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit);
 
+       /* Access CR3 don't cause VMExit in paging mode, so we need
+        * to sync with guest real CR3. */
+       if (vm_need_ept() && is_paging(vcpu)) {
+               vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+               ept_load_pdptrs(vcpu);
+       }
+
        if (unlikely(vmx->fail)) {
                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                kvm_run->fail_entry.hardware_entry_failure_reason
@@ -2372,7 +2692,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        }
 
        if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
-                               exit_reason != EXIT_REASON_EXCEPTION_NMI)
+                       (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
+                       exit_reason != EXIT_REASON_EPT_VIOLATION))
                printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
                       "exit reason is 0x%x\n", __func__, exit_reason);
        if (exit_reason < kvm_vmx_max_exit_handlers
@@ -2674,6 +2995,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
                return ERR_PTR(-ENOMEM);
 
        allocate_vpid(vmx);
+       if (id == 0 && vm_need_ept()) {
+               kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
+                       VMX_EPT_WRITABLE_MASK |
+                       VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
+               kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK,
+                               VMX_EPT_FAKE_DIRTY_MASK, 0ull,
+                               VMX_EPT_EXECUTABLE_MASK);
+               kvm_enable_tdp();
+       }
 
        err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
        if (err)
@@ -2706,6 +3036,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
                if (alloc_apic_access_page(kvm) != 0)
                        goto free_vmcs;
 
+       if (vm_need_ept())
+               if (alloc_identity_pagetable(kvm) != 0)
+                       goto free_vmcs;
+
        return &vmx->vcpu;
 
 free_vmcs:
@@ -2735,6 +3069,11 @@ static void __init vmx_check_processor_compat(void *rtn)
        }
 }
 
+static int get_ept_level(void)
+{
+       return VMX_EPT_DEFAULT_GAW + 1;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
@@ -2791,6 +3130,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .inject_pending_vectors = do_interrupt_requests,
 
        .set_tss_addr = vmx_set_tss_addr,
+       .get_tdp_level = get_ept_level,
 };
 
 static int __init vmx_init(void)
@@ -2843,9 +3183,14 @@ static int __init vmx_init(void)
        vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
        vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
 
+       if (cpu_has_vmx_ept())
+               bypass_guest_pf = 0;
+
        if (bypass_guest_pf)
                kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
 
+       ept_sync_global();
+
        return 0;
 
 out2:
index 5dff460..79d94c6 100644 (file)
@@ -35,6 +35,8 @@
 #define CPU_BASED_MWAIT_EXITING                 0x00000400
 #define CPU_BASED_RDPMC_EXITING                 0x00000800
 #define CPU_BASED_RDTSC_EXITING                 0x00001000
+#define CPU_BASED_CR3_LOAD_EXITING             0x00008000
+#define CPU_BASED_CR3_STORE_EXITING            0x00010000
 #define CPU_BASED_CR8_LOAD_EXITING              0x00080000
 #define CPU_BASED_CR8_STORE_EXITING             0x00100000
 #define CPU_BASED_TPR_SHADOW                    0x00200000
@@ -49,6 +51,7 @@
  * Definitions of Secondary Processor-Based VM-Execution Controls.
  */
 #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_ENABLE_EPT               0x00000002
 #define SECONDARY_EXEC_ENABLE_VPID              0x00000020
 #define SECONDARY_EXEC_WBINVD_EXITING          0x00000040
 
@@ -100,10 +103,22 @@ enum vmcs_field {
        VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
        APIC_ACCESS_ADDR                = 0x00002014,
        APIC_ACCESS_ADDR_HIGH           = 0x00002015,
+       EPT_POINTER                     = 0x0000201a,
+       EPT_POINTER_HIGH                = 0x0000201b,
+       GUEST_PHYSICAL_ADDRESS          = 0x00002400,
+       GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
        VMCS_LINK_POINTER               = 0x00002800,
        VMCS_LINK_POINTER_HIGH          = 0x00002801,
        GUEST_IA32_DEBUGCTL             = 0x00002802,
        GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
+       GUEST_PDPTR0                    = 0x0000280a,
+       GUEST_PDPTR0_HIGH               = 0x0000280b,
+       GUEST_PDPTR1                    = 0x0000280c,
+       GUEST_PDPTR1_HIGH               = 0x0000280d,
+       GUEST_PDPTR2                    = 0x0000280e,
+       GUEST_PDPTR2_HIGH               = 0x0000280f,
+       GUEST_PDPTR3                    = 0x00002810,
+       GUEST_PDPTR3_HIGH               = 0x00002811,
        PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
        CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
        EXCEPTION_BITMAP                = 0x00004004,
@@ -226,6 +241,8 @@ enum vmcs_field {
 #define EXIT_REASON_MWAIT_INSTRUCTION   36
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS         44
+#define EXIT_REASON_EPT_VIOLATION       48
+#define EXIT_REASON_EPT_MISCONFIG       49
 #define EXIT_REASON_WBINVD             54
 
 /*
@@ -316,15 +333,36 @@ enum vmcs_field {
 #define MSR_IA32_VMX_CR4_FIXED1                 0x489
 #define MSR_IA32_VMX_VMCS_ENUM                  0x48a
 #define MSR_IA32_VMX_PROCBASED_CTLS2            0x48b
+#define MSR_IA32_VMX_EPT_VPID_CAP               0x48c
 
 #define MSR_IA32_FEATURE_CONTROL                0x3a
 #define MSR_IA32_FEATURE_CONTROL_LOCKED         0x1
 #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED  0x4
 
 #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT       9
+#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT     10
 
 #define VMX_NR_VPIDS                           (1 << 16)
 #define VMX_VPID_EXTENT_SINGLE_CONTEXT         1
 #define VMX_VPID_EXTENT_ALL_CONTEXT            2
 
+#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR         0
+#define VMX_EPT_EXTENT_CONTEXT                 1
+#define VMX_EPT_EXTENT_GLOBAL                  2
+#define VMX_EPT_EXTENT_INDIVIDUAL_BIT          (1ull << 24)
+#define VMX_EPT_EXTENT_CONTEXT_BIT             (1ull << 25)
+#define VMX_EPT_EXTENT_GLOBAL_BIT              (1ull << 26)
+#define VMX_EPT_DEFAULT_GAW                    3
+#define VMX_EPT_MAX_GAW                                0x4
+#define VMX_EPT_MT_EPTE_SHIFT                  3
+#define VMX_EPT_GAW_EPTP_SHIFT                 3
+#define VMX_EPT_DEFAULT_MT                     0x6ull
+#define VMX_EPT_READABLE_MASK                  0x1ull
+#define VMX_EPT_WRITABLE_MASK                  0x2ull
+#define VMX_EPT_EXECUTABLE_MASK                        0x4ull
+#define VMX_EPT_FAKE_ACCESSED_MASK             (1ull << 62)
+#define VMX_EPT_FAKE_DIRTY_MASK                        (1ull << 63)
+
+#define VMX_EPT_IDENTITY_PAGETABLE_ADDR                0xfffbc000ul
+
 #endif
index 0ce5563..21338bd 100644 (file)
@@ -2417,6 +2417,9 @@ int kvm_arch_init(void *opaque)
 
        kvm_x86_ops = ops;
        kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
+       kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
+       kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
+                       PT_DIRTY_MASK, PT64_NX_MASK, 0);
        return 0;
 
 out:
@@ -3019,6 +3022,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 
        kvm_x86_ops->decache_regs(vcpu);
 
+       vcpu->arch.exception.pending = false;
+
        vcpu_put(vcpu);
 
        return 0;
@@ -3481,7 +3486,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
        }
 
        if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
-               cseg_desc.type &= ~(1 << 8); //clear the B flag
+               cseg_desc.type &= ~(1 << 1); //clear the B flag
                save_guest_segment_descriptor(vcpu, tr_seg.selector,
                                              &cseg_desc);
        }
@@ -3507,7 +3512,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
        }
 
        if (reason != TASK_SWITCH_IRET) {
-               nseg_desc.type |= (1 << 8);
+               nseg_desc.type |= (1 << 1);
                save_guest_segment_descriptor(vcpu, tss_selector,
                                              &nseg_desc);
        }
@@ -3698,10 +3703,19 @@ void fx_init(struct kvm_vcpu *vcpu)
 {
        unsigned after_mxcsr_mask;
 
+       /*
+        * Touch the fpu the first time in non atomic context as if
+        * this is the first fpu instruction the exception handler
+        * will fire before the instruction returns and it'll have to
+        * allocate ram with GFP_KERNEL.
+        */
+       if (!used_math())
+               fx_save(&vcpu->arch.host_fx_image);
+
        /* Initialize guest FPU by resetting ours and saving into guest's */
        preempt_disable();
        fx_save(&vcpu->arch.host_fx_image);
-       fpu_init();
+       fx_finit();
        fx_save(&vcpu->arch.guest_fx_image);
        fx_restore(&vcpu->arch.host_fx_image);
        preempt_enable();
@@ -3906,6 +3920,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        kvm_free_physmem(kvm);
        if (kvm->arch.apic_access_page)
                put_page(kvm->arch.apic_access_page);
+       if (kvm->arch.ept_identity_pagetable)
+               put_page(kvm->arch.ept_identity_pagetable);
        kfree(kvm);
 }
 
index 2ca0838..f2a696d 100644 (file)
@@ -1761,6 +1761,7 @@ twobyte_insn:
                case 6: /* lmsw */
                        realmode_lmsw(ctxt->vcpu, (u16)c->src.val,
                                      &ctxt->eflags);
+                       c->dst.type = OP_NONE;
                        break;
                case 7: /* invlpg*/
                        emulate_invlpg(ctxt->vcpu, memop);
index 04ffbb8..81a69d7 100644 (file)
@@ -59,6 +59,7 @@ struct kvm_vcpu_stat {
        u32 emulated_inst_exits;
        u32 dec_exits;
        u32 ext_intr_exits;
+       u32 halt_wakeup;
 };
 
 struct tlbe {
index 7ac8203..b35a7e3 100644 (file)
@@ -77,12 +77,17 @@ static inline void kvmppc_clear_exception(struct kvm_vcpu *vcpu, int exception)
        clear_bit(priority, &vcpu->arch.pending_exceptions);
 }
 
+/* Helper function for "full" MSR writes. No need to call this if only EE is
+ * changing. */
 static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
 {
        if ((new_msr & MSR_PR) != (vcpu->arch.msr & MSR_PR))
                kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR);
 
        vcpu->arch.msr = new_msr;
+
+       if (vcpu->arch.msr & MSR_WE)
+               kvm_vcpu_block(vcpu);
 }
 
 #endif /* __POWERPC_KVM_PPC_H__ */
index 9d963cd..1d8cd01 100644 (file)
@@ -314,6 +314,9 @@ struct kvm_arch{
        struct page *apic_access_page;
 
        gpa_t wall_clock;
+
+       struct page *ept_identity_pagetable;
+       bool ept_identity_pagetable_done;
 };
 
 struct kvm_vm_stat {
@@ -422,6 +425,7 @@ struct kvm_x86_ops {
                                       struct kvm_run *run);
 
        int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
+       int (*get_tdp_level)(void);
 };
 
 extern struct kvm_x86_ops *kvm_x86_ops;
@@ -433,6 +437,9 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
+void kvm_mmu_set_base_ptes(u64 base_pte);
+void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
+               u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -620,7 +627,7 @@ static inline void fx_restore(struct i387_fxsave_struct *image)
        asm("fxrstor (%0)":: "r" (image));
 }
 
-static inline void fpu_init(void)
+static inline void fx_finit(void)
 {
        asm("finit");
 }
@@ -644,6 +651,7 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
 #define ASM_VMX_VMWRITE_RSP_RDX   ".byte 0x0f, 0x79, 0xd4"
 #define ASM_VMX_VMXOFF            ".byte 0x0f, 0x01, 0xc4"
 #define ASM_VMX_VMXON_RAX         ".byte 0xf3, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_INVEPT           ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
 #define ASM_VMX_INVVPID                  ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
 
 #define MSR_IA32_TIME_STAMP_COUNTER            0x010
index 27bad59..7858eac 100644 (file)
@@ -196,12 +196,6 @@ static inline int sysfs_update_group(struct kobject *kobj,
        return 0;
 }
 
-static inline int sysfs_update_group(struct kobject *kobj,
-                               const struct attribute_group *grp)
-{
-       return 0;
-}
-
 static inline void sysfs_remove_group(struct kobject *kobj,
                                      const struct attribute_group *grp)
 {
index 6a44def..f0e62e5 100644 (file)
@@ -825,6 +825,15 @@ menuconfig MODULES
 
          If unsure, say Y.
 
+config MODULE_FORCE_LOAD
+       bool "Forced module loading"
+       depends on MODULES
+       default n
+       help
+         This option allows loading of modules even if that would set the
+          'F' (forced) taint, due to lack of version info.  Which is
+         usually a really bad idea.
+
 config MODULE_UNLOAD
        bool "Module unloading"
        depends on MODULES
index 8674a39..8e4528c 100644 (file)
@@ -890,6 +890,19 @@ static struct module_attribute *modinfo_attrs[] = {
 
 static const char vermagic[] = VERMAGIC_STRING;
 
+static int try_to_force_load(struct module *mod, const char *symname)
+{
+#ifdef CONFIG_MODULE_FORCE_LOAD
+       if (!(tainted & TAINT_FORCED_MODULE))
+               printk("%s: no version for \"%s\" found: kernel tainted.\n",
+                      mod->name, symname);
+       add_taint_module(mod, TAINT_FORCED_MODULE);
+       return 0;
+#else
+       return -ENOEXEC;
+#endif
+}
+
 #ifdef CONFIG_MODVERSIONS
 static int check_version(Elf_Shdr *sechdrs,
                         unsigned int versindex,
@@ -914,18 +927,18 @@ static int check_version(Elf_Shdr *sechdrs,
 
                if (versions[i].crc == *crc)
                        return 1;
-               printk("%s: disagrees about version of symbol %s\n",
-                      mod->name, symname);
                DEBUGP("Found checksum %lX vs module %lX\n",
                       *crc, versions[i].crc);
-               return 0;
+               goto bad_version;
        }
-       /* Not in module's version table.  OK, but that taints the kernel. */
-       if (!(tainted & TAINT_FORCED_MODULE))
-               printk("%s: no version for \"%s\" found: kernel tainted.\n",
-                      mod->name, symname);
-       add_taint_module(mod, TAINT_FORCED_MODULE);
-       return 1;
+
+       if (!try_to_force_load(mod, symname))
+               return 1;
+
+bad_version:
+       printk("%s: disagrees about version of symbol %s\n",
+              mod->name, symname);
+       return 0;
 }
 
 static inline int check_modstruct_version(Elf_Shdr *sechdrs,
@@ -1853,9 +1866,9 @@ static struct module *load_module(void __user *umod,
        modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
        /* This is allowed: modprobe --force will invalidate it. */
        if (!modmagic) {
-               add_taint_module(mod, TAINT_FORCED_MODULE);
-               printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
-                      mod->name);
+               err = try_to_force_load(mod, "magic");
+               if (err)
+                       goto free_hdr;
        } else if (!same_magic(modmagic, vermagic)) {
                printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
                       mod->name, modmagic, vermagic);
@@ -2006,9 +2019,10 @@ static struct module *load_module(void __user *umod,
            (mod->num_gpl_future_syms && !gplfuturecrcindex) ||
            (mod->num_unused_syms && !unusedcrcindex) ||
            (mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
-               printk(KERN_WARNING "%s: No versions for exported symbols."
-                      " Tainting kernel.\n", mod->name);
-               add_taint_module(mod, TAINT_FORCED_MODULE);
+               printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
+               err = try_to_force_load(mod, "nocrc");
+               if (err)
+                       goto cleanup;
        }
 #endif
        markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
index e89338e..f7ba099 100644 (file)
@@ -522,6 +522,7 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
                return bad_hva();
        return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
 }
+EXPORT_SYMBOL_GPL(gfn_to_hva);
 
 /*
  * Requires current->mm->mmap_sem to be held