diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3e556c6..d4c58d8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -99,13 +99,15 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
 static bool __read_mostly nested = 0;
 module_param(nested, bool, S_IRUGO);
 
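+/*
+ * Host value of IA32_XSS, read once in setup_vmcs_config(); restored on
+ * VM-exit via the atomic MSR switch whenever the guest value differs.
+ */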
+static u64 __read_mostly host_xss;
+
 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
 #define KVM_VM_CR0_ALWAYS_ON                                           \
        (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
 #define KVM_CR4_GUEST_OWNED_BITS                                     \
        (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
-        | X86_CR4_OSXMMEXCPT)
+        | X86_CR4_OSXMMEXCPT | X86_CR4_TSD)
 
 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
@@ -214,6 +216,7 @@ struct __packed vmcs12 {
        u64 virtual_apic_page_addr;
        u64 apic_access_addr;
        u64 ept_pointer;
+       u64 xss_exit_bitmap;
        u64 guest_physical_address;
        u64 vmcs_link_pointer;
        u64 guest_ia32_debugctl;
@@ -616,6 +619,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
        FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
        FIELD64(EPT_POINTER, ept_pointer),
+       FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
        FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
        FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
        FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
@@ -720,12 +724,15 @@ static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD(HOST_RSP, host_rsp),
        FIELD(HOST_RIP, host_rip),
 };
-static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
 
 static inline short vmcs_field_to_offset(unsigned long field)
 {
-       if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
-               return -1;
+       BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
+
+       if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
+           vmcs_field_to_offset_table[field] == 0)
+               return -ENOENT;
+
        return vmcs_field_to_offset_table[field];
 }
 
@@ -758,6 +765,7 @@ static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
 static bool vmx_mpx_supported(void);
+static bool vmx_xsaves_supported(void);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
@@ -1098,6 +1106,12 @@ static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
        return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
 }
 
+static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
+{
+       return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) &&
+               vmx_xsaves_supported();
+}
+
 static inline bool is_exception(u32 intr_info)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -1659,12 +1673,20 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
        vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
 
        clear_atomic_switch_msr(vmx, MSR_EFER);
-       /* On ept, can't emulate nx, and must switch nx atomically */
-       if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
+
+       /*
+        * On EPT, we can't emulate NX, so we must switch EFER atomically.
+        * On CPUs that support "load IA32_EFER", always switch EFER
+        * atomically, since it's faster than switching it manually.
+        */
+       if (cpu_has_load_ia32_efer ||
+           (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
                guest_efer = vmx->vcpu.arch.efer;
                if (!(guest_efer & EFER_LMA))
                        guest_efer &= ~EFER_LME;
-               add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
+               if (guest_efer != host_efer)
+                       add_atomic_switch_msr(vmx, MSR_EFER,
+                                             guest_efer, host_efer);
                return false;
        }
 
@@ -2377,12 +2399,13 @@ static __init void nested_vmx_setup_ctls_msrs(void)
        nested_vmx_secondary_ctls_low = 0;
        nested_vmx_secondary_ctls_high &=
                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
-               SECONDARY_EXEC_UNRESTRICTED_GUEST |
-               SECONDARY_EXEC_WBINVD_EXITING;
+               SECONDARY_EXEC_WBINVD_EXITING |
+               SECONDARY_EXEC_XSAVES;
 
        if (enable_ept) {
                /* nested EPT: emulate EPT also to L1 */
-               nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
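+               /*
+                * Unrestricted guest mode requires EPT, so advertise it to
+                * L1 only together with EPT.
+                */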
+               nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT |
+                       SECONDARY_EXEC_UNRESTRICTED_GUEST;
                nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
                         VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
                         VMX_EPT_INVEPT_BIT;
@@ -2558,6 +2581,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
                if (!nested_vmx_allowed(vcpu))
                        return 1;
                return vmx_get_vmx_msr(vcpu, msr_index, pdata);
+       case MSR_IA32_XSS:
+               if (!vmx_xsaves_supported())
+                       return 1;
+               data = vcpu->arch.ia32_xss;
+               break;
        case MSR_TSC_AUX:
                if (!to_vmx(vcpu)->rdtscp_enabled)
                        return 1;
@@ -2649,6 +2677,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                return 1; /* they are read-only */
+       case MSR_IA32_XSS:
+               if (!vmx_xsaves_supported())
+                       return 1;
+               /*
+                * The only supported bit as of Skylake is bit 8, but
+                * it is not supported in KVM.
+                */
+               if (data != 0)
+                       return 1;
+               vcpu->arch.ia32_xss = data;
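+               /*
+                * Only pay for an atomic MSR switch when the guest and host
+                * IA32_XSS values actually differ.
+                */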
+               if (vcpu->arch.ia32_xss != host_xss)
+                       add_atomic_switch_msr(vmx, MSR_IA32_XSS,
+                               vcpu->arch.ia32_xss, host_xss);
+               else
+                       clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
+               break;
        case MSR_TSC_AUX:
                if (!vmx->rdtscp_enabled)
                        return 1;
@@ -2884,7 +2928,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                        SECONDARY_EXEC_ENABLE_INVPCID |
                        SECONDARY_EXEC_APIC_REGISTER_VIRT |
                        SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-                       SECONDARY_EXEC_SHADOW_VMCS;
+                       SECONDARY_EXEC_SHADOW_VMCS |
+                       SECONDARY_EXEC_XSAVES;
                if (adjust_vmx_controls(min2, opt2,
                                        MSR_IA32_VMX_PROCBASED_CTLS2,
                                        &_cpu_based_2nd_exec_control) < 0)
@@ -3007,6 +3052,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                }
        }
 
+       if (cpu_has_xsaves)
+               rdmsrl(MSR_IA32_XSS, host_xss);
+
        return 0;
 }
 
@@ -3110,76 +3158,6 @@ static __init int alloc_kvm_area(void)
        return 0;
 }
 
-static __init int hardware_setup(void)
-{
-       if (setup_vmcs_config(&vmcs_config) < 0)
-               return -EIO;
-
-       if (boot_cpu_has(X86_FEATURE_NX))
-               kvm_enable_efer_bits(EFER_NX);
-
-       if (!cpu_has_vmx_vpid())
-               enable_vpid = 0;
-       if (!cpu_has_vmx_shadow_vmcs())
-               enable_shadow_vmcs = 0;
-       if (enable_shadow_vmcs)
-               init_vmcs_shadow_fields();
-
-       if (!cpu_has_vmx_ept() ||
-           !cpu_has_vmx_ept_4levels()) {
-               enable_ept = 0;
-               enable_unrestricted_guest = 0;
-               enable_ept_ad_bits = 0;
-       }
-
-       if (!cpu_has_vmx_ept_ad_bits())
-               enable_ept_ad_bits = 0;
-
-       if (!cpu_has_vmx_unrestricted_guest())
-               enable_unrestricted_guest = 0;
-
-       if (!cpu_has_vmx_flexpriority()) {
-               flexpriority_enabled = 0;
-
-               /*
-                * set_apic_access_page_addr() is used to reload apic access
-                * page upon invalidation.  No need to do anything if the
-                * processor does not have the APIC_ACCESS_ADDR VMCS field.
-                */
-               kvm_x86_ops->set_apic_access_page_addr = NULL;
-       }
-
-       if (!cpu_has_vmx_tpr_shadow())
-               kvm_x86_ops->update_cr8_intercept = NULL;
-
-       if (enable_ept && !cpu_has_vmx_ept_2m_page())
-               kvm_disable_largepages();
-
-       if (!cpu_has_vmx_ple())
-               ple_gap = 0;
-
-       if (!cpu_has_vmx_apicv())
-               enable_apicv = 0;
-
-       if (enable_apicv)
-               kvm_x86_ops->update_cr8_intercept = NULL;
-       else {
-               kvm_x86_ops->hwapic_irr_update = NULL;
-               kvm_x86_ops->deliver_posted_interrupt = NULL;
-               kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
-       }
-
-       if (nested)
-               nested_vmx_setup_ctls_msrs();
-
-       return alloc_kvm_area();
-}
-
-static __exit void hardware_unsetup(void)
-{
-       free_kvm_area();
-}
-
 static bool emulation_required(struct kvm_vcpu *vcpu)
 {
        return emulate_invalid_guest_state && !guest_state_valid(vcpu);
@@ -4396,6 +4374,7 @@ static void ept_set_mmio_spte_mask(void)
        kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
 }
 
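+/* A value of zero means XSAVES and XRSTORS never cause a VM exit. */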
+#define VMX_XSS_EXIT_BITMAP 0
 /*
  * Sets up the vmcs for emulated real mode.
  */
@@ -4505,6 +4484,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
        set_cr4_guest_host_mask(vmx);
 
+       if (vmx_xsaves_supported())
+               vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
+
        return 0;
 }
 
@@ -5163,13 +5145,20 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 static int handle_dr(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification;
-       int dr, reg;
+       int dr, dr7, reg;
+
+       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
+
+       /* First, if DR does not exist, trigger UD */
+       if (!kvm_require_dr(vcpu, dr))
+               return 1;
 
        /* Do not handle if the CPL > 0, will trigger GP on re-entry */
        if (!kvm_require_cpl(vcpu, 0))
                return 1;
-       dr = vmcs_readl(GUEST_DR7);
-       if (dr & DR7_GD) {
+       dr7 = vmcs_readl(GUEST_DR7);
+       if (dr7 & DR7_GD) {
                /*
                 * As the vm-exit takes precedence over the debug trap, we
                 * need to emulate the latter, either for the host or the
@@ -5177,17 +5166,14 @@ static int handle_dr(struct kvm_vcpu *vcpu)
                 */
                if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
                        vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
-                       vcpu->run->debug.arch.dr7 = dr;
-                       vcpu->run->debug.arch.pc =
-                               vmcs_readl(GUEST_CS_BASE) +
-                               vmcs_readl(GUEST_RIP);
+                       vcpu->run->debug.arch.dr7 = dr7;
+                       vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
                        vcpu->run->debug.arch.exception = DB_VECTOR;
                        vcpu->run->exit_reason = KVM_EXIT_DEBUG;
                        return 0;
                } else {
-                       vcpu->arch.dr7 &= ~DR7_GD;
+                       vcpu->arch.dr6 &= ~15;
                        vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
-                       vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
                        kvm_queue_exception(vcpu, DB_VECTOR);
                        return 1;
                }
@@ -5209,8 +5195,6 @@ static int handle_dr(struct kvm_vcpu *vcpu)
                return 1;
        }
 
-       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-       dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
        reg = DEBUG_REG_ACCESS_REG(exit_qualification);
        if (exit_qualification & TYPE_MOV_FROM_DR) {
                unsigned long val;
@@ -5391,6 +5375,20 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
        return 1;
 }
 
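+/*
+ * XSAVES/XRSTORS exits are not expected: the XSS-exiting bitmap is
+ * programmed to VMX_XSS_EXIT_BITMAP (0), so neither instruction should
+ * cause a VM exit.
+ */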
+static int handle_xsaves(struct kvm_vcpu *vcpu)
+{
+       skip_emulated_instruction(vcpu);
+       WARN(1, "this should never happen\n");
+       return 1;
+}
+
+static int handle_xrstors(struct kvm_vcpu *vcpu)
+{
+       skip_emulated_instruction(vcpu);
+       WARN(1, "this should never happen\n");
+       return 1;
+}
+
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
        if (likely(fasteoi)) {
@@ -5492,7 +5490,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
        }
 
        /* clear all local breakpoint enable flags */
-       vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55);
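+       /* 0x155 covers L0-L3 and the LE (local exact breakpoints) bit. */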
+       vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155);
 
        /*
         * TODO: What about debug traps on tss switch?
@@ -5539,11 +5537,11 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
        trace_kvm_page_fault(gpa, exit_qualification);
 
        /* It is a write fault? */
-       error_code = exit_qualification & (1U << 1);
+       error_code = exit_qualification & PFERR_WRITE_MASK;
        /* It is a fetch fault? */
-       error_code |= (exit_qualification & (1U << 2)) << 2;
+       error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK;
        /* ept page table is present? */
-       error_code |= (exit_qualification >> 3) & 0x1;
+       error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK;
 
        vcpu->arch.exit_qualification = exit_qualification;
 
@@ -5785,6 +5783,204 @@ static void update_ple_window_actual_max(void)
                                            ple_window_grow, INT_MIN);
 }
 
+static __init int hardware_setup(void)
+{
+       int r = -ENOMEM, i, msr;
+
+       rdmsrl_safe(MSR_EFER, &host_efer);
+
+       for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
+               kvm_define_shared_msr(i, vmx_msr_index[i]);
+
+       vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_io_bitmap_a)
+               return r;
+
+       vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_io_bitmap_b)
+               goto out;
+
+       vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_msr_bitmap_legacy)
+               goto out1;
+
+       vmx_msr_bitmap_legacy_x2apic =
+                               (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_msr_bitmap_legacy_x2apic)
+               goto out2;
+
+       vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_msr_bitmap_longmode)
+               goto out3;
+
+       vmx_msr_bitmap_longmode_x2apic =
+                               (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_msr_bitmap_longmode_x2apic)
+               goto out4;
+       vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_vmread_bitmap)
+               goto out5;
+
+       vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_vmwrite_bitmap)
+               goto out6;
+
+       memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
+       memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
+
+       /*
+        * Allow direct access to the PC debug port (it is often used for I/O
+        * delays, but the vmexits simply slow things down).
+        */
+       memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
+       clear_bit(0x80, vmx_io_bitmap_a);
+
+       memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
+
+       memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
+       memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
+
+       if (setup_vmcs_config(&vmcs_config) < 0) {
+               r = -EIO;
+               goto out7;
+       }
+
+       if (boot_cpu_has(X86_FEATURE_NX))
+               kvm_enable_efer_bits(EFER_NX);
+
+       if (!cpu_has_vmx_vpid())
+               enable_vpid = 0;
+       if (!cpu_has_vmx_shadow_vmcs())
+               enable_shadow_vmcs = 0;
+       if (enable_shadow_vmcs)
+               init_vmcs_shadow_fields();
+
+       if (!cpu_has_vmx_ept() ||
+           !cpu_has_vmx_ept_4levels()) {
+               enable_ept = 0;
+               enable_unrestricted_guest = 0;
+               enable_ept_ad_bits = 0;
+       }
+
+       if (!cpu_has_vmx_ept_ad_bits())
+               enable_ept_ad_bits = 0;
+
+       if (!cpu_has_vmx_unrestricted_guest())
+               enable_unrestricted_guest = 0;
+
+       if (!cpu_has_vmx_flexpriority()) {
+               flexpriority_enabled = 0;
+
+               /*
+                * set_apic_access_page_addr() is used to reload apic access
+                * page upon invalidation.  No need to do anything if the
+                * processor does not have the APIC_ACCESS_ADDR VMCS field.
+                */
+               kvm_x86_ops->set_apic_access_page_addr = NULL;
+       }
+
+       if (!cpu_has_vmx_tpr_shadow())
+               kvm_x86_ops->update_cr8_intercept = NULL;
+
+       if (enable_ept && !cpu_has_vmx_ept_2m_page())
+               kvm_disable_largepages();
+
+       if (!cpu_has_vmx_ple())
+               ple_gap = 0;
+
+       if (!cpu_has_vmx_apicv())
+               enable_apicv = 0;
+
+       if (enable_apicv)
+               kvm_x86_ops->update_cr8_intercept = NULL;
+       else {
+               kvm_x86_ops->hwapic_irr_update = NULL;
+               kvm_x86_ops->deliver_posted_interrupt = NULL;
+               kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
+       }
+
+       if (nested)
+               nested_vmx_setup_ctls_msrs();
+
+       vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
+       vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
+       vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
+       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
+       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
+       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+       vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
+
+       memcpy(vmx_msr_bitmap_legacy_x2apic,
+                       vmx_msr_bitmap_legacy, PAGE_SIZE);
+       memcpy(vmx_msr_bitmap_longmode_x2apic,
+                       vmx_msr_bitmap_longmode, PAGE_SIZE);
+
+       if (enable_apicv) {
+               for (msr = 0x800; msr <= 0x8ff; msr++)
+                       vmx_disable_intercept_msr_read_x2apic(msr);
+
+               /* According to the SDM, in x2apic mode the whole id reg is
+                * used, but KVM only uses the highest eight bits, so it
+                * needs to be intercepted. */
+               vmx_enable_intercept_msr_read_x2apic(0x802);
+               /* TMCCT */
+               vmx_enable_intercept_msr_read_x2apic(0x839);
+               /* TPR */
+               vmx_disable_intercept_msr_write_x2apic(0x808);
+               /* EOI */
+               vmx_disable_intercept_msr_write_x2apic(0x80b);
+               /* SELF-IPI */
+               vmx_disable_intercept_msr_write_x2apic(0x83f);
+       }
+
+       if (enable_ept) {
+               kvm_mmu_set_mask_ptes(0ull,
+                       (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
+                       (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
+                       0ull, VMX_EPT_EXECUTABLE_MASK);
+               ept_set_mmio_spte_mask();
+               kvm_enable_tdp();
+       } else
+               kvm_disable_tdp();
+
+       update_ple_window_actual_max();
+
+       return alloc_kvm_area();
+
+out7:
+       free_page((unsigned long)vmx_vmwrite_bitmap);
+out6:
+       free_page((unsigned long)vmx_vmread_bitmap);
+out5:
+       free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+out4:
+       free_page((unsigned long)vmx_msr_bitmap_longmode);
+out3:
+       free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+out2:
+       free_page((unsigned long)vmx_msr_bitmap_legacy);
+out1:
+       free_page((unsigned long)vmx_io_bitmap_b);
+out:
+       free_page((unsigned long)vmx_io_bitmap_a);
+
+       return r;
+}
+
+static __exit void hardware_unsetup(void)
+{
+       free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+       free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+       free_page((unsigned long)vmx_msr_bitmap_legacy);
+       free_page((unsigned long)vmx_msr_bitmap_longmode);
+       free_page((unsigned long)vmx_io_bitmap_b);
+       free_page((unsigned long)vmx_io_bitmap_a);
+       free_page((unsigned long)vmx_vmwrite_bitmap);
+       free_page((unsigned long)vmx_vmread_bitmap);
+
+       free_kvm_area();
+}
+
 /*
  * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
  * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@ -6361,58 +6557,60 @@ static inline int vmcs_field_readonly(unsigned long field)
  * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
  * 64-bit fields are to be returned).
  */
-static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
-                                       unsigned long field, u64 *ret)
+static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
+                                 unsigned long field, u64 *ret)
 {
        short offset = vmcs_field_to_offset(field);
        char *p;
 
        if (offset < 0)
-               return 0;
+               return offset;
 
        p = ((char *)(get_vmcs12(vcpu))) + offset;
 
        switch (vmcs_field_type(field)) {
        case VMCS_FIELD_TYPE_NATURAL_WIDTH:
                *ret = *((natural_width *)p);
-               return 1;
+               return 0;
        case VMCS_FIELD_TYPE_U16:
                *ret = *((u16 *)p);
-               return 1;
+               return 0;
        case VMCS_FIELD_TYPE_U32:
                *ret = *((u32 *)p);
-               return 1;
+               return 0;
        case VMCS_FIELD_TYPE_U64:
                *ret = *((u64 *)p);
-               return 1;
+               return 0;
        default:
-               return 0; /* can never happen. */
+               WARN_ON(1);
+               return -ENOENT;
        }
 }
 
 
-static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu,
-                                   unsigned long field, u64 field_value){
+static inline int vmcs12_write_any(struct kvm_vcpu *vcpu,
+                                  unsigned long field, u64 field_value){
        short offset = vmcs_field_to_offset(field);
        char *p = ((char *) get_vmcs12(vcpu)) + offset;
        if (offset < 0)
-               return false;
+               return offset;
 
        switch (vmcs_field_type(field)) {
        case VMCS_FIELD_TYPE_U16:
                *(u16 *)p = field_value;
-               return true;
+               return 0;
        case VMCS_FIELD_TYPE_U32:
                *(u32 *)p = field_value;
-               return true;
+               return 0;
        case VMCS_FIELD_TYPE_U64:
                *(u64 *)p = field_value;
-               return true;
+               return 0;
        case VMCS_FIELD_TYPE_NATURAL_WIDTH:
                *(natural_width *)p = field_value;
-               return true;
+               return 0;
        default:
-               return false; /* can never happen. */
+               WARN_ON(1);
+               return -ENOENT;
        }
 
 }
@@ -6445,6 +6643,9 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
                case VMCS_FIELD_TYPE_NATURAL_WIDTH:
                        field_value = vmcs_readl(field);
                        break;
+               default:
+                       WARN_ON(1);
+                       continue;
                }
                vmcs12_write_any(&vmx->vcpu, field, field_value);
        }
@@ -6490,6 +6691,9 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
                        case VMCS_FIELD_TYPE_NATURAL_WIDTH:
                                vmcs_writel(field, (long)field_value);
                                break;
+                       default:
+                               WARN_ON(1);
+                               break;
                        }
                }
        }
@@ -6528,7 +6732,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
        /* Decode instruction info and find the field to read */
        field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
        /* Read the field, zero-extended to a u64 field_value */
-       if (!vmcs12_read_any(vcpu, field, &field_value)) {
+       if (vmcs12_read_any(vcpu, field, &field_value) < 0) {
                nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
                skip_emulated_instruction(vcpu);
                return 1;
@@ -6598,7 +6802,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                return 1;
        }
 
-       if (!vmcs12_write_any(vcpu, field, field_value)) {
+       if (vmcs12_write_any(vcpu, field, field_value) < 0) {
                nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
                skip_emulated_instruction(vcpu);
                return 1;
@@ -6802,6 +7006,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
        [EXIT_REASON_INVEPT]                  = handle_invept,
        [EXIT_REASON_INVVPID]                 = handle_invvpid,
+       [EXIT_REASON_XSAVES]                  = handle_xsaves,
+       [EXIT_REASON_XRSTORS]                 = handle_xrstors,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -7089,6 +7295,14 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
        case EXIT_REASON_XSETBV:
                return 1;
+       case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
+               /*
+                * This should never happen, since it is not possible to
+                * set XSS to a non-zero value---neither in L1 nor in L2.
+                * If it were, XSS would have to be checked against
+                * the XSS exit bitmap in vmcs12.
+                */
+               return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
        default:
                return 1;
        }
@@ -7277,6 +7491,9 @@ static void vmx_set_rvi(int vector)
        u16 status;
        u8 old;
 
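+       /* A max_irr of -1 means no interrupt is pending; report RVI as 0. */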
+       if (vector == -1)
+               vector = 0;
+
        status = vmcs_read16(GUEST_INTR_STATUS);
        old = (u8)status & 0xff;
        if ((u8)vector != old) {
@@ -7288,22 +7505,23 @@ static void vmx_set_rvi(int vector)
 
 static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
 {
+       if (!is_guest_mode(vcpu)) {
+               vmx_set_rvi(max_irr);
+               return;
+       }
+
        if (max_irr == -1)
                return;
 
        /*
-        * If a vmexit is needed, vmx_check_nested_events handles it.
+        * In guest mode.  If a vmexit is needed, vmx_check_nested_events
+        * handles it.
         */
-       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+       if (nested_exit_on_intr(vcpu))
                return;
 
-       if (!is_guest_mode(vcpu)) {
-               vmx_set_rvi(max_irr);
-               return;
-       }
-
        /*
-        * Fall back to pre-APICv interrupt injection since L2
+        * Else, fall back to pre-APICv interrupt injection since L2
         * is run without virtual interrupt delivery.
         */
        if (!kvm_event_needs_reinjection(vcpu) &&
@@ -7400,6 +7618,12 @@ static bool vmx_mpx_supported(void)
                (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
 }
 
+static bool vmx_xsaves_supported(void)
+{
+       return vmcs_config.cpu_based_2nd_exec_ctrl &
+               SECONDARY_EXEC_XSAVES;
+}
+
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
        u32 exit_intr_info;
@@ -8135,6 +8359,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
        vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
 
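+       /* While running L2, use L1's XSS-exiting bitmap. */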
+       if (nested_cpu_has_xsaves(vmcs12))
+               vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
        vmcs_write64(VMCS_LINK_POINTER, -1ull);
 
        exec_control = vmcs12->pin_based_vm_exec_control;
@@ -8775,6 +9001,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
        if (vmx_mpx_supported())
                vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+       if (nested_cpu_has_xsaves(vmcs12))
+               vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
 
        /* update exit information fields: */
 
@@ -9176,6 +9404,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .check_intercept = vmx_check_intercept,
        .handle_external_intr = vmx_handle_external_intr,
        .mpx_supported = vmx_mpx_supported,
+       .xsaves_supported = vmx_xsaves_supported,
 
        .check_nested_events = vmx_check_nested_events,
 
@@ -9184,150 +9413,21 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 static int __init vmx_init(void)
 {
-       int r, i, msr;
-
-       rdmsrl_safe(MSR_EFER, &host_efer);
-
-       for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
-               kvm_define_shared_msr(i, vmx_msr_index[i]);
-
-       vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_io_bitmap_a)
-               return -ENOMEM;
-
-       r = -ENOMEM;
-
-       vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_io_bitmap_b)
-               goto out;
-
-       vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_msr_bitmap_legacy)
-               goto out1;
-
-       vmx_msr_bitmap_legacy_x2apic =
-                               (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_msr_bitmap_legacy_x2apic)
-               goto out2;
-
-       vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_msr_bitmap_longmode)
-               goto out3;
-
-       vmx_msr_bitmap_longmode_x2apic =
-                               (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_msr_bitmap_longmode_x2apic)
-               goto out4;
-       vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_vmread_bitmap)
-               goto out5;
-
-       vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_vmwrite_bitmap)
-               goto out6;
-
-       memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
-       memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
-
-       /*
-        * Allow direct access to the PC debug port (it is often used for I/O
-        * delays, but the vmexits simply slow things down).
-        */
-       memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
-       clear_bit(0x80, vmx_io_bitmap_a);
-
-       memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
-
-       memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
-       memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
-
-       set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
-
-       r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
-                    __alignof__(struct vcpu_vmx), THIS_MODULE);
+       int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+                     __alignof__(struct vcpu_vmx), THIS_MODULE);
        if (r)
-               goto out7;
+               return r;
 
 #ifdef CONFIG_KEXEC
        rcu_assign_pointer(crash_vmclear_loaded_vmcss,
                           crash_vmclear_local_loaded_vmcss);
 #endif
 
-       vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
-       vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
-       vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
-       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
-       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
-       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
-       vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
-
-       memcpy(vmx_msr_bitmap_legacy_x2apic,
-                       vmx_msr_bitmap_legacy, PAGE_SIZE);
-       memcpy(vmx_msr_bitmap_longmode_x2apic,
-                       vmx_msr_bitmap_longmode, PAGE_SIZE);
-
-       if (enable_apicv) {
-               for (msr = 0x800; msr <= 0x8ff; msr++)
-                       vmx_disable_intercept_msr_read_x2apic(msr);
-
-               /* According SDM, in x2apic mode, the whole id reg is used.
-                * But in KVM, it only use the highest eight bits. Need to
-                * intercept it */
-               vmx_enable_intercept_msr_read_x2apic(0x802);
-               /* TMCCT */
-               vmx_enable_intercept_msr_read_x2apic(0x839);
-               /* TPR */
-               vmx_disable_intercept_msr_write_x2apic(0x808);
-               /* EOI */
-               vmx_disable_intercept_msr_write_x2apic(0x80b);
-               /* SELF-IPI */
-               vmx_disable_intercept_msr_write_x2apic(0x83f);
-       }
-
-       if (enable_ept) {
-               kvm_mmu_set_mask_ptes(0ull,
-                       (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
-                       (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
-                       0ull, VMX_EPT_EXECUTABLE_MASK);
-               ept_set_mmio_spte_mask();
-               kvm_enable_tdp();
-       } else
-               kvm_disable_tdp();
-
-       update_ple_window_actual_max();
-
        return 0;
-
-out7:
-       free_page((unsigned long)vmx_vmwrite_bitmap);
-out6:
-       free_page((unsigned long)vmx_vmread_bitmap);
-out5:
-       free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-out4:
-       free_page((unsigned long)vmx_msr_bitmap_longmode);
-out3:
-       free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
-out2:
-       free_page((unsigned long)vmx_msr_bitmap_legacy);
-out1:
-       free_page((unsigned long)vmx_io_bitmap_b);
-out:
-       free_page((unsigned long)vmx_io_bitmap_a);
-       return r;
 }
 
 static void __exit vmx_exit(void)
 {
-       free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
-       free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-       free_page((unsigned long)vmx_msr_bitmap_legacy);
-       free_page((unsigned long)vmx_msr_bitmap_longmode);
-       free_page((unsigned long)vmx_io_bitmap_b);
-       free_page((unsigned long)vmx_io_bitmap_a);
-       free_page((unsigned long)vmx_vmwrite_bitmap);
-       free_page((unsigned long)vmx_vmread_bitmap);
-
 #ifdef CONFIG_KEXEC
        RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
        synchronize_rcu();