Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 2 Aug 2016 20:11:27 +0000 (16:11 -0400)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 2 Aug 2016 20:11:27 +0000 (16:11 -0400)
Pull KVM updates from Paolo Bonzini:

 - ARM: GICv3 ITS emulation and various fixes.  Removal of the
   old VGIC implementation.

 - s390: support for trapping software breakpoints, nested
   virtualization (vSIE), the STHYI opcode, initial extensions
   for CPU model support.

 - MIPS: support for MIPS64 hosts (32-bit guests only) and lots
   of cleanups, preliminary to this and the upcoming support for
   hardware virtualization extensions.

 - x86: support for execute-only mappings in nested EPT; reduced
   vmexit latency for TSC deadline timer (by about 30%) on Intel
   hosts; support for more than 255 vCPUs.

 - PPC: bugfixes.

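[Editor's note, not part of the pull request: features such as these are exposed to userspace as KVM capabilities, which a VMM probes with the standard KVM_CHECK_EXTENSION ioctl before relying on them. A minimal sketch in C, assuming a uapi <linux/kvm.h> new enough to define KVM_CAP_PPC_HTM (introduced by the first commit in the shortlog below):

    /*
     * Illustrative only. Checks whether the running kernel advertises
     * KVM_CAP_PPC_HTM via KVM_CHECK_EXTENSION on /dev/kvm.
     */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
            if (kvm < 0) {
                    perror("open /dev/kvm");
                    return 1;
            }
            /* Return value > 0 means the capability is present. */
            int htm = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_HTM);
            printf("KVM_CAP_PPC_HTM: %s\n", htm > 0 ? "available" : "unavailable");
            return 0;
    }

The same probe pattern applies to any of the capabilities added in this merge; only the KVM_CAP_* constant changes.]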
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (302 commits)
  KVM: PPC: Introduce KVM_CAP_PPC_HTM
  MIPS: Select HAVE_KVM for MIPS64_R{2,6}
  MIPS: KVM: Reset CP0_PageMask during host TLB flush
  MIPS: KVM: Fix ptr->int cast via KVM_GUEST_KSEGX()
  MIPS: KVM: Sign extend MFC0/RDHWR results
  MIPS: KVM: Fix 64-bit big endian dynamic translation
  MIPS: KVM: Fail if ebase doesn't fit in CP0_EBase
  MIPS: KVM: Use 64-bit CP0_EBase when appropriate
  MIPS: KVM: Set CP0_Status.KX on MIPS64
  MIPS: KVM: Make entry code MIPS64 friendly
  MIPS: KVM: Use kmap instead of CKSEG0ADDR()
  MIPS: KVM: Use virt_to_phys() to get commpage PFN
  MIPS: Fix definition of KSEGX() for 64-bit
  KVM: VMX: Add VMCS to CPU's loaded VMCSs before VMPTRLD
  kvm: x86: nVMX: maintain internal copy of current VMCS
  KVM: PPC: Book3S HV: Save/restore TM state in H_CEDE
  KVM: PPC: Book3S HV: Pull out TM state save/restore into separate procedures
  KVM: arm64: vgic-its: Simplify MAPI error handling
  KVM: arm64: vgic-its: Make vgic_its_cmd_handle_mapi similar to other handlers
  KVM: arm64: vgic-its: Turn device_id validation into generic ID validation
  ...

40 files changed:
arch/arm/include/asm/pgtable.h
arch/arm/kvm/arm.c
arch/arm64/include/asm/cpufeature.h
arch/arm64/include/asm/virt.h
arch/arm64/kernel/cpufeature.c
arch/arm64/kvm/hyp/switch.c
arch/powerpc/include/asm/paca.h
arch/powerpc/kernel/Makefile
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/idle_book3s.S
arch/powerpc/kernel/traps.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/platforms/powernv/opal-wrappers.S
arch/s390/include/asm/diag.h
arch/s390/include/asm/kvm_host.h
arch/s390/include/asm/mmu.h
arch/s390/include/asm/mmu_context.h
arch/s390/include/asm/page.h
arch/s390/include/asm/pgtable.h
arch/s390/include/asm/processor.h
arch/s390/kernel/diag.c
arch/s390/kvm/intercept.c
arch/s390/kvm/kvm-s390.c
arch/s390/mm/fault.c
arch/s390/mm/gmap.c
arch/s390/mm/pgalloc.c
arch/s390/mm/pgtable.c
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/iommu.c
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
include/linux/context_tracking.h
include/linux/irqchip/arm-gic-v3.h
mm/gup.c
virt/kvm/arm/vgic/vgic-init.c
virt/kvm/kvm_main.c

@@@ -97,7 -97,9 +97,9 @@@ extern pgprot_t               pgprot_s2_device
  #define PAGE_READONLY_EXEC    _MOD_PROT(pgprot_user, L_PTE_USER | L_PTE_RDONLY)
  #define PAGE_KERNEL           _MOD_PROT(pgprot_kernel, L_PTE_XN)
  #define PAGE_KERNEL_EXEC      pgprot_kernel
- #define PAGE_HYP              _MOD_PROT(pgprot_kernel, L_PTE_HYP)
+ #define PAGE_HYP              _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_XN)
+ #define PAGE_HYP_EXEC         _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY)
+ #define PAGE_HYP_RO           _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY | L_PTE_XN)
  #define PAGE_HYP_DEVICE               _MOD_PROT(pgprot_hyp_device, L_PTE_HYP)
  #define PAGE_S2                       _MOD_PROT(pgprot_s2, L_PTE_S2_RDONLY)
  #define PAGE_S2_DEVICE                _MOD_PROT(pgprot_s2_device, L_PTE_S2_RDONLY)
@@@ -182,6 -184,7 +184,6 @@@ extern pgd_t swapper_pg_dir[PTRS_PER_PG
  #define pgd_offset_k(addr)    pgd_offset(&init_mm, addr)
  
  #define pmd_none(pmd)         (!pmd_val(pmd))
 -#define pmd_present(pmd)      (pmd_val(pmd))
  
  static inline pte_t *pmd_page_vaddr(pmd_t pmd)
  {
diff --combined arch/arm/kvm/arm.c
@@@ -20,6 -20,7 +20,7 @@@
  #include <linux/errno.h>
  #include <linux/err.h>
  #include <linux/kvm_host.h>
+ #include <linux/list.h>
  #include <linux/module.h>
  #include <linux/vmalloc.h>
  #include <linux/fs.h>
@@@ -122,7 -123,7 +123,7 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
        if (ret)
                goto out_fail_alloc;
  
-       ret = create_hyp_mappings(kvm, kvm + 1);
+       ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
        if (ret)
                goto out_free_stage2_pgd;
  
@@@ -201,7 -202,7 +202,7 @@@ int kvm_vm_ioctl_check_extension(struc
                r = KVM_MAX_VCPUS;
                break;
        default:
-               r = kvm_arch_dev_ioctl_check_extension(ext);
+               r = kvm_arch_dev_ioctl_check_extension(kvm, ext);
                break;
        }
        return r;
@@@ -239,7 -240,7 +240,7 @@@ struct kvm_vcpu *kvm_arch_vcpu_create(s
        if (err)
                goto free_vcpu;
  
-       err = create_hyp_mappings(vcpu, vcpu + 1);
+       err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
        if (err)
                goto vcpu_uninit;
  
@@@ -263,7 -264,6 +264,7 @@@ void kvm_arch_vcpu_free(struct kvm_vcp
        kvm_timer_vcpu_terminate(vcpu);
        kvm_vgic_vcpu_destroy(vcpu);
        kvm_pmu_vcpu_destroy(vcpu);
 +      kvm_vcpu_uninit(vcpu);
        kmem_cache_free(kvm_vcpu_cache, vcpu);
  }
  
@@@ -377,7 -377,7 +378,7 @@@ void force_vm_exit(const cpumask_t *mas
  
  /**
   * need_new_vmid_gen - check that the VMID is still valid
-  * @kvm: The VM's VMID to checkt
+  * @kvm: The VM's VMID to check
   *
   * return true if there is a new generation of VMIDs being used
   *
@@@ -616,7 -616,7 +617,7 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
                 * Enter the guest
                 */
                trace_kvm_entry(*vcpu_pc(vcpu));
-               __kvm_guest_enter();
+               guest_enter_irqoff();
                vcpu->mode = IN_GUEST_MODE;
  
                ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
                local_irq_enable();
  
                /*
-                * We do local_irq_enable() before calling kvm_guest_exit() so
+                * We do local_irq_enable() before calling guest_exit() so
                 * that if a timer interrupt hits while running the guest we
                 * account that tick as being spent in the guest.  We enable
-                * preemption after calling kvm_guest_exit() so that if we get
+                * preemption after calling guest_exit() so that if we get
                 * preempted we make sure ticks after that is not counted as
                 * guest time.
                 */
-               kvm_guest_exit();
+               guest_exit();
                trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
  
                /*
@@@ -1039,7 -1039,6 +1040,6 @@@ long kvm_arch_vm_ioctl(struct file *fil
  
  static void cpu_init_hyp_mode(void *dummy)
  {
-       phys_addr_t boot_pgd_ptr;
        phys_addr_t pgd_ptr;
        unsigned long hyp_stack_ptr;
        unsigned long stack_page;
        /* Switch from the HYP stub to our own HYP init vector */
        __hyp_set_vectors(kvm_get_idmap_vector());
  
-       boot_pgd_ptr = kvm_mmu_get_boot_httbr();
        pgd_ptr = kvm_mmu_get_httbr();
        stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
        hyp_stack_ptr = stack_page + PAGE_SIZE;
        vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector);
  
-       __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+       __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
        __cpu_init_stage2();
  
        kvm_arm_init_debug();
@@@ -1076,15 -1074,9 +1075,9 @@@ static void cpu_hyp_reinit(void
  
  static void cpu_hyp_reset(void)
  {
-       phys_addr_t boot_pgd_ptr;
-       phys_addr_t phys_idmap_start;
-       if (!is_kernel_in_hyp_mode()) {
-               boot_pgd_ptr = kvm_mmu_get_boot_httbr();
-               phys_idmap_start = kvm_get_idmap_start();
-               __cpu_reset_hyp_mode(boot_pgd_ptr, phys_idmap_start);
-       }
+       if (!is_kernel_in_hyp_mode())
+               __cpu_reset_hyp_mode(hyp_default_vectors,
+                                    kvm_get_idmap_start());
  }
  
  static void _kvm_arch_hardware_enable(void *discard)
@@@ -1294,14 -1286,14 +1287,14 @@@ static int init_hyp_mode(void
         * Map the Hyp-code called directly from the host
         */
        err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
-                                 kvm_ksym_ref(__hyp_text_end));
+                                 kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
        if (err) {
                kvm_err("Cannot map world-switch code\n");
                goto out_err;
        }
  
        err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
-                                 kvm_ksym_ref(__end_rodata));
+                                 kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
        if (err) {
                kvm_err("Cannot map rodata section\n");
                goto out_err;
         */
        for_each_possible_cpu(cpu) {
                char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
-               err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE);
+               err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
+                                         PAGE_HYP);
  
                if (err) {
                        kvm_err("Cannot map hyp stack\n");
                kvm_cpu_context_t *cpu_ctxt;
  
                cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu);
-               err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1);
+               err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP);
  
                if (err) {
                        kvm_err("Cannot map host CPU state: %d\n", err);
                }
        }
  
- #ifndef CONFIG_HOTPLUG_CPU
-       free_boot_hyp_pgd();
- #endif
        /* set size of VMID supported by CPU */
        kvm_vmid_bits = kvm_get_vmid_bits();
        kvm_info("%d-bit VMID\n", kvm_vmid_bits);
@@@ -36,8 -36,9 +36,9 @@@
  #define ARM64_HAS_VIRT_HOST_EXTN              11
  #define ARM64_WORKAROUND_CAVIUM_27456         12
  #define ARM64_HAS_32BIT_EL0                   13
+ #define ARM64_HYP_OFFSET_LOW                  14
  
- #define ARM64_NCAPS                           14
+ #define ARM64_NCAPS                           15
  
  #ifndef __ASSEMBLY__
  
@@@ -191,9 -192,7 +192,9 @@@ void __init setup_cpu_features(void)
  
  void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps,
                            const char *info);
 +void enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps);
  void check_local_cpu_errata(void);
 +void __init enable_errata_workarounds(void);
  
  void verify_local_cpu_errata(void);
  void verify_local_cpu_capabilities(void);
   */
  #define HVC_SET_VECTORS 1
  
 +/*
 + * HVC_SOFT_RESTART - CPU soft reset, used by the cpu_soft_restart routine.
 + */
 +#define HVC_SOFT_RESTART 2
 +
  #define BOOT_CPU_MODE_EL1     (0xe11)
  #define BOOT_CPU_MODE_EL2     (0xe12)
  
@@@ -87,6 -82,10 +87,10 @@@ extern void verify_cpu_run_el(void)
  static inline void verify_cpu_run_el(void) {}
  #endif
  
+ /* The section containing the hypervisor idmap text */
+ extern char __hyp_idmap_text_start[];
+ extern char __hyp_idmap_text_end[];
  /* The section containing the hypervisor text */
  extern char __hyp_text_start[];
  extern char __hyp_text_end[];
@@@ -726,6 -726,19 +726,19 @@@ static bool runs_at_el2(const struct ar
        return is_kernel_in_hyp_mode();
  }
  
+ static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry,
+                          int __unused)
+ {
+       phys_addr_t idmap_addr = virt_to_phys(__hyp_idmap_text_start);
+       /*
+        * Activate the lower HYP offset only if:
+        * - the idmap doesn't clash with it,
+        * - the kernel is not running at EL2.
+        */
+       return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode();
+ }
  static const struct arm64_cpu_capabilities arm64_features[] = {
        {
                .desc = "GIC system register CPU interface",
                .field_pos = ID_AA64PFR0_EL0_SHIFT,
                .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT,
        },
+       {
+               .desc = "Reduced HYP mapping offset",
+               .capability = ARM64_HYP_OFFSET_LOW,
+               .def_scope = SCOPE_SYSTEM,
+               .matches = hyp_offset_low,
+       },
        {},
  };
  
@@@ -913,7 -932,8 +932,7 @@@ void update_cpu_capabilities(const stru
   * Run through the enabled capabilities and enable() it on all active
   * CPUs
   */
 -static void __init
 -enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps)
 +void __init enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps)
  {
        for (; caps->matches; caps++)
                if (caps->enable && cpus_have_cap(caps->capability))
@@@ -1035,7 -1055,6 +1054,7 @@@ void __init setup_cpu_features(void
  
        /* Set the CPU feature capabilies */
        setup_feature_capabilities();
 +      enable_errata_workarounds();
        setup_elf_hwcaps(arm64_elf_hwcaps);
  
        if (system_supports_32bit_el0())
@@@ -198,7 -198,7 +198,7 @@@ static bool __hyp_text __translate_far_
  static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
  {
        u64 esr = read_sysreg_el2(esr);
 -      u8 ec = esr >> ESR_ELx_EC_SHIFT;
 +      u8 ec = ESR_ELx_EC(esr);
        u64 hpfar, far;
  
        vcpu->arch.fault.esr_el2 = esr;
@@@ -299,9 -299,16 +299,16 @@@ static const char __hyp_panic_string[] 
  
  static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par)
  {
-       unsigned long str_va = (unsigned long)__hyp_panic_string;
+       unsigned long str_va;
  
-       __hyp_do_panic(hyp_kern_va(str_va),
+       /*
+        * Force the panic string to be loaded from the literal pool,
+        * making sure it is a kernel address and not a PC-relative
+        * reference.
+        */
+       asm volatile("ldr %0, =__hyp_panic_string" : "=r" (str_va));
+       __hyp_do_panic(str_va,
                       spsr,  elr,
                       read_sysreg(esr_el2),   read_sysreg_el2(far),
                       read_sysreg(hpfar_el2), par,
@@@ -25,7 -25,7 +25,8 @@@
  #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
  #include <asm/kvm_book3s_asm.h>
  #endif
 +#include <asm/accounting.h>
+ #include <asm/hmi.h>
  
  register struct paca_struct *local_paca asm("r13");
  
@@@ -182,10 -182,21 +183,15 @@@ struct paca_struct 
         */
        u16 in_mce;
        u8 hmi_event_available;          /* HMI event is available */
+       /*
+        * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for
+        * more details
+        */
+       struct sibling_subcore_state *sibling_subcore_state;
  #endif
  
        /* Stuff for accurate time accounting */
 -      u64 user_time;                  /* accumulated usermode TB ticks */
 -      u64 system_time;                /* accumulated system TB ticks */
 -      u64 user_time_scaled;           /* accumulated usermode SPURR ticks */
 -      u64 starttime;                  /* TB value snapshot */
 -      u64 starttime_user;             /* TB value on exit to usermode */
 -      u64 startspurr;                 /* SPURR value snapshot */
 -      u64 utime_sspurr;               /* ->user_time when ->startspurr set */
 +      struct cpu_accounting_data accounting;
        u64 stolen_time;                /* TB ticks taken by hypervisor */
        u64 dtl_ridx;                   /* read index in dispatch log */
        struct dtl_entry *dtl_curr;     /* pointer corresponding to dtl_ridx */
@@@ -41,12 -41,13 +41,12 @@@ obj-$(CONFIG_VDSO32)               += vdso32
  obj-$(CONFIG_HAVE_HW_BREAKPOINT)      += hw_breakpoint.o
  obj-$(CONFIG_PPC_BOOK3S_64)   += cpu_setup_ppc970.o cpu_setup_pa6t.o
  obj-$(CONFIG_PPC_BOOK3S_64)   += cpu_setup_power.o
- obj-$(CONFIG_PPC_BOOK3S_64)   += mce.o mce_power.o
+ obj-$(CONFIG_PPC_BOOK3S_64)   += mce.o mce_power.o hmi.o
 -obj64-$(CONFIG_RELOCATABLE)   += reloc_64.o
  obj-$(CONFIG_PPC_BOOK3E_64)   += exceptions-64e.o idle_book3e.o
  obj-$(CONFIG_PPC64)           += vdso64/
  obj-$(CONFIG_ALTIVEC)         += vecemu.o
  obj-$(CONFIG_PPC_970_NAP)     += idle_power4.o
 -obj-$(CONFIG_PPC_P7_NAP)      += idle_power7.o
 +obj-$(CONFIG_PPC_P7_NAP)      += idle_book3s.o
  procfs-y                      := proc_powerpc.o
  obj-$(CONFIG_PROC_FS)         += $(procfs-y)
  rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI) := rtas_pci.o
@@@ -86,7 -87,7 +86,7 @@@ extra-$(CONFIG_FSL_BOOKE)     := head_fsl_b
  extra-$(CONFIG_8xx)           := head_8xx.o
  extra-y                               += vmlinux.lds
  
 -obj-$(CONFIG_RELOCATABLE_PPC32)       += reloc_32.o
 +obj-$(CONFIG_RELOCATABLE)     += reloc_$(CONFIG_WORD_SIZE).o
  
  obj-$(CONFIG_PPC32)           += entry_32.o setup_32.o
  obj-$(CONFIG_PPC64)           += dma-iommu.o iommu.o
@@@ -107,9 -107,25 +107,9 @@@ BEGIN_FTR_SECTIO
        beq     9f
  
        cmpwi   cr3,r13,2
 -
 -      /*
 -       * Check if last bit of HSPGR0 is set. This indicates whether we are
 -       * waking up from winkle.
 -       */
        GET_PACA(r13)
 -      clrldi  r5,r13,63
 -      clrrdi  r13,r13,1
 -      cmpwi   cr4,r5,1
 -      mtspr   SPRN_HSPRG0,r13
 -
 -      lbz     r0,PACA_THREAD_IDLE_STATE(r13)
 -      cmpwi   cr2,r0,PNV_THREAD_NAP
 -      bgt     cr2,8f                          /* Either sleep or Winkle */
 -
 -      /* Waking up from nap should not cause hypervisor state loss */
 -      bgt     cr3,.
 +      bl      pnv_restore_hyp_resource
  
 -      /* Waking up from nap */
        li      r0,PNV_THREAD_RUNNING
        stb     r0,PACA_THREAD_IDLE_STATE(r13)  /* Clear thread state */
  
  
        /* Return SRR1 from power7_nap() */
        mfspr   r3,SPRN_SRR1
 -      beq     cr3,2f
 -      b       power7_wakeup_noloss
 -2:    b       power7_wakeup_loss
 -
 -      /* Fast Sleep wakeup on PowerNV */
 -8:    GET_PACA(r13)
 -      b       power7_wakeup_tb_loss
 +      blt     cr3,2f
 +      b       pnv_wakeup_loss
 +2:    b       pnv_wakeup_noloss
  
  9:
  END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
@@@ -331,12 -351,6 +331,12 @@@ hv_doorbell_trampoline
        EXCEPTION_PROLOG_0(PACA_EXGEN)
        b       h_doorbell_hv
  
 +      . = 0xea0
 +hv_virt_irq_trampoline:
 +      SET_SCRATCH0(r13)
 +      EXCEPTION_PROLOG_0(PACA_EXGEN)
 +      b       h_virt_irq_hv
 +
        /* We need to deal with the Altivec unavailable exception
         * here which is at 0xf20, thus in the middle of the
         * prolog code of the PerformanceMonitor one. A little
@@@ -587,9 -601,6 +587,9 @@@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR
        MASKABLE_EXCEPTION_HV_OOL(0xe82, h_doorbell)
        KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe82)
  
 +      MASKABLE_EXCEPTION_HV_OOL(0xea2, h_virt_irq)
 +      KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xea2)
 +
        /* moved from 0xf00 */
        STD_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
        KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf00)
@@@ -669,8 -680,8 +669,10 @@@ _GLOBAL(__replay_interrupt
  BEGIN_FTR_SECTION
        cmpwi   r3,0xe80
        beq     h_doorbell_common
 +      cmpwi   r3,0xea0
 +      beq     h_virt_irq_common
+       cmpwi   r3,0xe60
+       beq     hmi_exception_common
  FTR_SECTION_ELSE
        cmpwi   r3,0xa00
        beq     doorbell_super_common
@@@ -745,7 -756,6 +747,7 @@@ kvmppc_skip_Hinterrupt
  #else
        STD_EXCEPTION_COMMON_ASYNC(0xe80, h_doorbell, unknown_exception)
  #endif
 +      STD_EXCEPTION_COMMON_ASYNC(0xea0, h_virt_irq, do_IRQ)
        STD_EXCEPTION_COMMON_ASYNC(0xf00, performance_monitor, performance_monitor_exception)
        STD_EXCEPTION_COMMON(0x1300, instruction_breakpoint, instruction_breakpoint_exception)
        STD_EXCEPTION_COMMON(0x1502, denorm, unknown_exception)
  #else
        STD_EXCEPTION_COMMON(0x1700, altivec_assist, unknown_exception)
  #endif
 -#ifdef CONFIG_CBE_RAS
 -      STD_EXCEPTION_COMMON(0x1200, cbe_system_error, cbe_system_error_exception)
 -      STD_EXCEPTION_COMMON(0x1600, cbe_maintenance, cbe_maintenance_exception)
 -      STD_EXCEPTION_COMMON(0x1800, cbe_thermal, cbe_thermal_exception)
 -#endif /* CONFIG_CBE_RAS */
  
        /*
         * Relocation-on interrupts: A subset of the interrupts can be delivered
@@@ -864,12 -879,6 +866,12 @@@ h_doorbell_relon_trampoline
        EXCEPTION_PROLOG_0(PACA_EXGEN)
        b       h_doorbell_relon_hv
  
 +      . = 0x4ea0
 +h_virt_irq_relon_trampoline:
 +      SET_SCRATCH0(r13)
 +      EXCEPTION_PROLOG_0(PACA_EXGEN)
 +      b       h_virt_irq_relon_hv
 +
        . = 0x4f00
  performance_monitor_relon_pseries_trampoline:
        SET_SCRATCH0(r13)
@@@ -1124,10 -1133,12 +1126,10 @@@ END_FTR_SECTION_IFSET(CPU_FTR_VSX
        bl      vsx_unavailable_exception
        b       ret_from_except
  
 -      STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception)
 -      STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception)
 -
        /* Equivalents to the above handlers for relocation-on interrupt vectors */
        STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
        MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
 +      MASKABLE_RELON_EXCEPTION_HV_OOL(0xea0, h_virt_irq)
  
        STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
        STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
@@@ -1161,18 -1172,9 +1163,18 @@@ fwnmi_data_area
        . = 0x8000
  #endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */
  
 +      STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception)
 +      STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception)
 +
 +#ifdef CONFIG_CBE_RAS
 +      STD_EXCEPTION_COMMON(0x1200, cbe_system_error, cbe_system_error_exception)
 +      STD_EXCEPTION_COMMON(0x1600, cbe_maintenance, cbe_maintenance_exception)
 +      STD_EXCEPTION_COMMON(0x1800, cbe_thermal, cbe_thermal_exception)
 +#endif /* CONFIG_CBE_RAS */
 +
        .globl hmi_exception_early
  hmi_exception_early:
-       EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60)
+       EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, 0xe62)
        mr      r10,r1                  /* Save r1                      */
        ld      r1,PACAEMERGSP(r13)     /* Use emergency stack          */
        subi    r1,r1,INT_FRAME_SIZE    /* alloc stack frame            */
@@@ -1289,7 -1291,7 +1291,7 @@@ machine_check_handle_early
        GET_PACA(r13)
        ld      r1,PACAR1(r13)
        li      r3,PNV_THREAD_NAP
 -      b       power7_enter_nap_mode
 +      b       pnv_enter_arch207_idle_mode
  4:
  #endif
        /*
@@@ -1399,12 -1401,11 +1401,12 @@@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_RADIX
        lwz     r9,PACA_EXSLB+EX_CCR(r13)       /* get saved CR */
  
        mtlr    r10
 -BEGIN_MMU_FTR_SECTION
 -      b       2f
 -END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX)
        andi.   r10,r12,MSR_RI  /* check for unrecoverable exception */
 +BEGIN_MMU_FTR_SECTION
        beq-    2f
 +FTR_SECTION_ELSE
 +      b       2f
 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX)
  
  .machine      push
  .machine      "power4"
index 335eb6c,0000000..8a56a51
mode 100644,000000..100644
--- /dev/null
@@@ -1,662 -1,0 +1,664 @@@
-       bl      opal_rm_handle_hmi;                                     \
 +/*
 + *  This file contains idle entry/exit functions for POWER7,
 + *  POWER8 and POWER9 CPUs.
 + *
 + *  This program is free software; you can redistribute it and/or
 + *  modify it under the terms of the GNU General Public License
 + *  as published by the Free Software Foundation; either version
 + *  2 of the License, or (at your option) any later version.
 + */
 +
 +#include <linux/threads.h>
 +#include <asm/processor.h>
 +#include <asm/page.h>
 +#include <asm/cputable.h>
 +#include <asm/thread_info.h>
 +#include <asm/ppc_asm.h>
 +#include <asm/asm-offsets.h>
 +#include <asm/ppc-opcode.h>
 +#include <asm/hw_irq.h>
 +#include <asm/kvm_book3s_asm.h>
 +#include <asm/opal.h>
 +#include <asm/cpuidle.h>
 +#include <asm/book3s/64/mmu-hash.h>
 +#include <asm/mmu.h>
 +
 +#undef DEBUG
 +
 +/*
 + * Use unused space in the interrupt stack to save and restore
 + * registers for winkle support.
 + */
 +#define _SDR1 GPR3
 +#define _RPR  GPR4
 +#define _SPURR        GPR5
 +#define _PURR GPR6
 +#define _TSCR GPR7
 +#define _DSCR GPR8
 +#define _AMOR GPR9
 +#define _WORT GPR10
 +#define _WORC GPR11
 +#define _PTCR GPR12
 +
 +#define PSSCR_HV_TEMPLATE     PSSCR_ESL | PSSCR_EC | \
 +                              PSSCR_PSLL_MASK | PSSCR_TR_MASK | \
 +                              PSSCR_MTL_MASK
 +
 +/* Idle state entry routines */
 +
 +#define       IDLE_STATE_ENTER_SEQ(IDLE_INST)                         \
 +      /* Magic NAP/SLEEP/WINKLE mode enter sequence */        \
 +      std     r0,0(r1);                                       \
 +      ptesync;                                                \
 +      ld      r0,0(r1);                                       \
 +1:    cmp     cr0,r0,r0;                                      \
 +      bne     1b;                                             \
 +      IDLE_INST;                                              \
 +      b       .
 +
 +      .text
 +
 +/*
 + * Used by threads before entering deep idle states. Saves SPRs
 + * in interrupt stack frame
 + */
 +save_sprs_to_stack:
 +      /*
 +       * Note all register i.e per-core, per-subcore or per-thread is saved
 +       * here since any thread in the core might wake up first
 +       */
 +BEGIN_FTR_SECTION
 +      mfspr   r3,SPRN_PTCR
 +      std     r3,_PTCR(r1)
 +      /*
 +       * Note - SDR1 is dropped in Power ISA v3. Hence not restoring
 +       * SDR1 here
 +       */
 +FTR_SECTION_ELSE
 +      mfspr   r3,SPRN_SDR1
 +      std     r3,_SDR1(r1)
 +ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
 +      mfspr   r3,SPRN_RPR
 +      std     r3,_RPR(r1)
 +      mfspr   r3,SPRN_SPURR
 +      std     r3,_SPURR(r1)
 +      mfspr   r3,SPRN_PURR
 +      std     r3,_PURR(r1)
 +      mfspr   r3,SPRN_TSCR
 +      std     r3,_TSCR(r1)
 +      mfspr   r3,SPRN_DSCR
 +      std     r3,_DSCR(r1)
 +      mfspr   r3,SPRN_AMOR
 +      std     r3,_AMOR(r1)
 +      mfspr   r3,SPRN_WORT
 +      std     r3,_WORT(r1)
 +      mfspr   r3,SPRN_WORC
 +      std     r3,_WORC(r1)
 +
 +      blr
 +
 +/*
 + * Used by threads when the lock bit of core_idle_state is set.
 + * Threads will spin in HMT_LOW until the lock bit is cleared.
 + * r14 - pointer to core_idle_state
 + * r15 - used to load contents of core_idle_state
 + */
 +
 +core_idle_lock_held:
 +      HMT_LOW
 +3:    lwz     r15,0(r14)
 +      andi.   r15,r15,PNV_CORE_IDLE_LOCK_BIT
 +      bne     3b
 +      HMT_MEDIUM
 +      lwarx   r15,0,r14
 +      blr
 +
 +/*
 + * Pass requested state in r3:
 + *    r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8
 + *       - Requested STOP state in POWER9
 + *
 + * To check IRQ_HAPPENED in r4
 + *    0 - don't check
 + *    1 - check
 + *
 + * Address to 'rfid' to in r5
 + */
 +_GLOBAL(pnv_powersave_common)
 +      /* Use r3 to pass state nap/sleep/winkle */
 +      /* NAP is a state loss, we create a regs frame on the
 +       * stack, fill it up with the state we care about and
 +       * stick a pointer to it in PACAR1. We really only
 +       * need to save PC, some CR bits and the NV GPRs,
 +       * but for now an interrupt frame will do.
 +       */
 +      mflr    r0
 +      std     r0,16(r1)
 +      stdu    r1,-INT_FRAME_SIZE(r1)
 +      std     r0,_LINK(r1)
 +      std     r0,_NIP(r1)
 +
 +      /* Hard disable interrupts */
 +      mfmsr   r9
 +      rldicl  r9,r9,48,1
 +      rotldi  r9,r9,16
 +      mtmsrd  r9,1                    /* hard-disable interrupts */
 +
 +      /* Check if something happened while soft-disabled */
 +      lbz     r0,PACAIRQHAPPENED(r13)
 +      andi.   r0,r0,~PACA_IRQ_HARD_DIS@l
 +      beq     1f
 +      cmpwi   cr0,r4,0
 +      beq     1f
 +      addi    r1,r1,INT_FRAME_SIZE
 +      ld      r0,16(r1)
 +      li      r3,0                    /* Return 0 (no nap) */
 +      mtlr    r0
 +      blr
 +
 +1:    /* We mark irqs hard disabled as this is the state we'll
 +       * be in when returning and we need to tell arch_local_irq_restore()
 +       * about it
 +       */
 +      li      r0,PACA_IRQ_HARD_DIS
 +      stb     r0,PACAIRQHAPPENED(r13)
 +
 +      /* We haven't lost state ... yet */
 +      li      r0,0
 +      stb     r0,PACA_NAPSTATELOST(r13)
 +
 +      /* Continue saving state */
 +      SAVE_GPR(2, r1)
 +      SAVE_NVGPRS(r1)
 +      mfcr    r4
 +      std     r4,_CCR(r1)
 +      std     r9,_MSR(r1)
 +      std     r1,PACAR1(r13)
 +
 +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 +      /* Tell KVM we're entering idle */
 +      li      r4,KVM_HWTHREAD_IN_IDLE
 +      stb     r4,HSTATE_HWTHREAD_STATE(r13)
 +#endif
 +
 +      /*
 +       * Go to real mode to do the nap, as required by the architecture.
 +       * Also, we need to be in real mode before setting hwthread_state,
 +       * because as soon as we do that, another thread can switch
 +       * the MMU context to the guest.
 +       */
 +      LOAD_REG_IMMEDIATE(r7, MSR_IDLE)
 +      li      r6, MSR_RI
 +      andc    r6, r9, r6
 +      mtmsrd  r6, 1           /* clear RI before setting SRR0/1 */
 +      mtspr   SPRN_SRR0, r5
 +      mtspr   SPRN_SRR1, r7
 +      rfid
 +
 +      .globl pnv_enter_arch207_idle_mode
 +pnv_enter_arch207_idle_mode:
 +      stb     r3,PACA_THREAD_IDLE_STATE(r13)
 +      cmpwi   cr3,r3,PNV_THREAD_SLEEP
 +      bge     cr3,2f
 +      IDLE_STATE_ENTER_SEQ(PPC_NAP)
 +      /* No return */
 +2:
 +      /* Sleep or winkle */
 +      lbz     r7,PACA_THREAD_MASK(r13)
 +      ld      r14,PACA_CORE_IDLE_STATE_PTR(r13)
 +lwarx_loop1:
 +      lwarx   r15,0,r14
 +
 +      andi.   r9,r15,PNV_CORE_IDLE_LOCK_BIT
 +      bnel    core_idle_lock_held
 +
 +      andc    r15,r15,r7                      /* Clear thread bit */
 +
 +      andi.   r15,r15,PNV_CORE_IDLE_THREAD_BITS
 +
 +/*
 + * If cr0 = 0, then current thread is the last thread of the core entering
 + * sleep. Last thread needs to execute the hardware bug workaround code if
 + * required by the platform.
 + * Make the workaround call unconditionally here. The below branch call is
 + * patched out when the idle states are discovered if the platform does not
 + * require it.
 + */
 +.global pnv_fastsleep_workaround_at_entry
 +pnv_fastsleep_workaround_at_entry:
 +      beq     fastsleep_workaround_at_entry
 +
 +      stwcx.  r15,0,r14
 +      bne-    lwarx_loop1
 +      isync
 +
 +common_enter: /* common code for all the threads entering sleep or winkle */
 +      bgt     cr3,enter_winkle
 +      IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
 +
 +fastsleep_workaround_at_entry:
 +      ori     r15,r15,PNV_CORE_IDLE_LOCK_BIT
 +      stwcx.  r15,0,r14
 +      bne-    lwarx_loop1
 +      isync
 +
 +      /* Fast sleep workaround */
 +      li      r3,1
 +      li      r4,1
 +      bl      opal_rm_config_cpu_idle_state
 +
 +      /* Clear Lock bit */
 +      li      r0,0
 +      lwsync
 +      stw     r0,0(r14)
 +      b       common_enter
 +
 +enter_winkle:
 +      bl      save_sprs_to_stack
 +
 +      IDLE_STATE_ENTER_SEQ(PPC_WINKLE)
 +
 +/*
 + * r3 - requested stop state
 + */
 +power_enter_stop:
 +/*
 + * Check if the requested state is a deep idle state.
 + */
 +      LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
 +      ld      r4,ADDROFF(pnv_first_deep_stop_state)(r5)
 +      cmpd    r3,r4
 +      bge     2f
 +      IDLE_STATE_ENTER_SEQ(PPC_STOP)
 +2:
 +/*
 + * Entering deep idle state.
 + * Clear thread bit in PACA_CORE_IDLE_STATE, save SPRs to
 + * stack and enter stop
 + */
 +      lbz     r7,PACA_THREAD_MASK(r13)
 +      ld      r14,PACA_CORE_IDLE_STATE_PTR(r13)
 +
 +lwarx_loop_stop:
 +      lwarx   r15,0,r14
 +      andi.   r9,r15,PNV_CORE_IDLE_LOCK_BIT
 +      bnel    core_idle_lock_held
 +      andc    r15,r15,r7                      /* Clear thread bit */
 +
 +      stwcx.  r15,0,r14
 +      bne-    lwarx_loop_stop
 +      isync
 +
 +      bl      save_sprs_to_stack
 +
 +      IDLE_STATE_ENTER_SEQ(PPC_STOP)
 +
 +_GLOBAL(power7_idle)
 +      /* Now check if user or arch enabled NAP mode */
 +      LOAD_REG_ADDRBASE(r3,powersave_nap)
 +      lwz     r4,ADDROFF(powersave_nap)(r3)
 +      cmpwi   0,r4,0
 +      beqlr
 +      li      r3, 1
 +      /* fall through */
 +
 +_GLOBAL(power7_nap)
 +      mr      r4,r3
 +      li      r3,PNV_THREAD_NAP
 +      LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode)
 +      b       pnv_powersave_common
 +      /* No return */
 +
 +_GLOBAL(power7_sleep)
 +      li      r3,PNV_THREAD_SLEEP
 +      li      r4,1
 +      LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode)
 +      b       pnv_powersave_common
 +      /* No return */
 +
 +_GLOBAL(power7_winkle)
 +      li      r3,PNV_THREAD_WINKLE
 +      li      r4,1
 +      LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode)
 +      b       pnv_powersave_common
 +      /* No return */
 +
 +#define CHECK_HMI_INTERRUPT                                           \
 +      mfspr   r0,SPRN_SRR1;                                           \
 +BEGIN_FTR_SECTION_NESTED(66);                                         \
 +      rlwinm  r0,r0,45-31,0xf;  /* extract wake reason field (P8) */  \
 +FTR_SECTION_ELSE_NESTED(66);                                          \
 +      rlwinm  r0,r0,45-31,0xe;  /* P7 wake reason field is 3 bits */  \
 +ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);              \
 +      cmpwi   r0,0xa;                 /* Hypervisor maintenance ? */  \
 +      bne     20f;                                                    \
 +      /* Invoke opal call to handle hmi */                            \
 +      ld      r2,PACATOC(r13);                                        \
 +      ld      r1,PACAR1(r13);                                         \
 +      std     r3,ORIG_GPR3(r1);       /* Save original r3 */          \
++      li      r3,0;                   /* NULL argument */             \
++      bl      hmi_exception_realmode;                                 \
++      nop;                                                            \
 +      ld      r3,ORIG_GPR3(r1);       /* Restore original r3 */       \
 +20:   nop;
 +
 +
 +/*
 + * r3 - requested stop state
 + */
 +_GLOBAL(power9_idle_stop)
 +      LOAD_REG_IMMEDIATE(r4, PSSCR_HV_TEMPLATE)
 +      or      r4,r4,r3
 +      mtspr   SPRN_PSSCR, r4
 +      li      r4, 1
 +      LOAD_REG_ADDR(r5,power_enter_stop)
 +      b       pnv_powersave_common
 +      /* No return */
 +/*
 + * Called from reset vector. Check whether we have woken up with
 + * hypervisor state loss. If yes, restore hypervisor state and return
 + * back to reset vector.
 + *
 + * r13 - Contents of HSPRG0
 + * cr3 - set to gt if waking up with partial/complete hypervisor state loss
 + */
 +_GLOBAL(pnv_restore_hyp_resource)
 +      ld      r2,PACATOC(r13);
 +BEGIN_FTR_SECTION
 +      /*
 +       * POWER ISA 3. Use PSSCR to determine if we
 +       * are waking up from deep idle state
 +       */
 +      LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
 +      ld      r4,ADDROFF(pnv_first_deep_stop_state)(r5)
 +
 +      mfspr   r5,SPRN_PSSCR
 +      /*
 +       * 0-3 bits correspond to Power-Saving Level Status
 +       * which indicates the idle state we are waking up from
 +       */
 +      rldicl  r5,r5,4,60
 +      cmpd    cr4,r5,r4
 +      bge     cr4,pnv_wakeup_tb_loss
 +      /*
 +       * Waking up without hypervisor state loss. Return to
 +       * reset vector
 +       */
 +      blr
 +
 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 +
 +      /*
 +       * POWER ISA 2.07 or less.
 +       * Check if last bit of HSPGR0 is set. This indicates whether we are
 +       * waking up from winkle.
 +       */
 +      clrldi  r5,r13,63
 +      clrrdi  r13,r13,1
 +      cmpwi   cr4,r5,1
 +      mtspr   SPRN_HSPRG0,r13
 +
 +      lbz     r0,PACA_THREAD_IDLE_STATE(r13)
 +      cmpwi   cr2,r0,PNV_THREAD_NAP
 +      bgt     cr2,pnv_wakeup_tb_loss  /* Either sleep or Winkle */
 +
 +      /*
 +       * We fall through here if PACA_THREAD_IDLE_STATE shows we are waking
 +       * up from nap. At this stage CR3 shouldn't contains 'gt' since that
 +       * indicates we are waking with hypervisor state loss from nap.
 +       */
 +      bgt     cr3,.
 +
 +      blr     /* Return back to System Reset vector from where
 +                 pnv_restore_hyp_resource was invoked */
 +
 +/*
 + * Called if waking up from idle state which can cause either partial or
 + * complete hyp state loss.
 + * In POWER8, called if waking up from fastsleep or winkle
 + * In POWER9, called if waking up from stop state >= pnv_first_deep_stop_state
 + *
 + * r13 - PACA
 + * cr3 - gt if waking up with partial/complete hypervisor state loss
 + * cr4 - eq if waking up from complete hypervisor state loss.
 + */
 +_GLOBAL(pnv_wakeup_tb_loss)
 +      ld      r1,PACAR1(r13)
 +      /*
 +       * Before entering any idle state, the NVGPRs are saved in the stack
 +       * and they are restored before switching to the process context. Hence
 +       * until they are restored, they are free to be used.
 +       *
 +       * Save SRR1 and LR in NVGPRs as they might be clobbered in
 +       * opal_call() (called in CHECK_HMI_INTERRUPT). SRR1 is required
 +       * to determine the wakeup reason if we branch to kvm_start_guest. LR
 +       * is required to return back to reset vector after hypervisor state
 +       * restore is complete.
 +       */
 +      mflr    r17
 +      mfspr   r16,SPRN_SRR1
 +BEGIN_FTR_SECTION
 +      CHECK_HMI_INTERRUPT
 +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 +
 +      lbz     r7,PACA_THREAD_MASK(r13)
 +      ld      r14,PACA_CORE_IDLE_STATE_PTR(r13)
 +lwarx_loop2:
 +      lwarx   r15,0,r14
 +      andi.   r9,r15,PNV_CORE_IDLE_LOCK_BIT
 +      /*
 +       * Lock bit is set in one of the 2 cases-
 +       * a. In the sleep/winkle enter path, the last thread is executing
 +       * fastsleep workaround code.
 +       * b. In the wake up path, another thread is executing fastsleep
 +       * workaround undo code or resyncing timebase or restoring context
 +       * In either case loop until the lock bit is cleared.
 +       */
 +      bnel    core_idle_lock_held
 +
 +      cmpwi   cr2,r15,0
 +
 +      /*
 +       * At this stage
 +       * cr2 - eq if first thread to wakeup in core
 +       * cr3-  gt if waking up with partial/complete hypervisor state loss
 +       * cr4 - eq if waking up from complete hypervisor state loss.
 +       */
 +
 +      ori     r15,r15,PNV_CORE_IDLE_LOCK_BIT
 +      stwcx.  r15,0,r14
 +      bne-    lwarx_loop2
 +      isync
 +
 +BEGIN_FTR_SECTION
 +      lbz     r4,PACA_SUBCORE_SIBLING_MASK(r13)
 +      and     r4,r4,r15
 +      cmpwi   r4,0    /* Check if first in subcore */
 +
 +      or      r15,r15,r7              /* Set thread bit */
 +      beq     first_thread_in_subcore
 +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 +
 +      or      r15,r15,r7              /* Set thread bit */
 +      beq     cr2,first_thread_in_core
 +
 +      /* Not first thread in core or subcore to wake up */
 +      b       clear_lock
 +
 +first_thread_in_subcore:
 +      /*
 +       * If waking up from sleep, subcore state is not lost. Hence
 +       * skip subcore state restore
 +       */
 +      bne     cr4,subcore_state_restored
 +
 +      /* Restore per-subcore state */
 +      ld      r4,_SDR1(r1)
 +      mtspr   SPRN_SDR1,r4
 +
 +      ld      r4,_RPR(r1)
 +      mtspr   SPRN_RPR,r4
 +      ld      r4,_AMOR(r1)
 +      mtspr   SPRN_AMOR,r4
 +
 +subcore_state_restored:
 +      /*
 +       * Check if the thread is also the first thread in the core. If not,
 +       * skip to clear_lock.
 +       */
 +      bne     cr2,clear_lock
 +
 +first_thread_in_core:
 +
 +      /*
 +       * First thread in the core waking up from any state which can cause
 +       * partial or complete hypervisor state loss. It needs to
 +       * call the fastsleep workaround code if the platform requires it.
 +       * Call it unconditionally here. The below branch instruction will
 +       * be patched out if the platform does not have fastsleep or does not
 +       * require the workaround. Patching will be performed during the
 +       * discovery of idle-states.
 +       */
 +.global pnv_fastsleep_workaround_at_exit
 +pnv_fastsleep_workaround_at_exit:
 +      b       fastsleep_workaround_at_exit
 +
 +timebase_resync:
 +      /*
 +       * Use cr3 which indicates that we are waking up with atleast partial
 +       * hypervisor state loss to determine if TIMEBASE RESYNC is needed.
 +       */
 +      ble     cr3,clear_lock
 +      /* Time base re-sync */
 +      bl      opal_rm_resync_timebase;
 +      /*
 +       * If waking up from sleep, per core state is not lost, skip to
 +       * clear_lock.
 +       */
 +      bne     cr4,clear_lock
 +
 +      /*
 +       * First thread in the core to wake up and its waking up with
 +       * complete hypervisor state loss. Restore per core hypervisor
 +       * state.
 +       */
 +BEGIN_FTR_SECTION
 +      ld      r4,_PTCR(r1)
 +      mtspr   SPRN_PTCR,r4
 +      ld      r4,_RPR(r1)
 +      mtspr   SPRN_RPR,r4
 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 +
 +      ld      r4,_TSCR(r1)
 +      mtspr   SPRN_TSCR,r4
 +      ld      r4,_WORC(r1)
 +      mtspr   SPRN_WORC,r4
 +
 +clear_lock:
 +      andi.   r15,r15,PNV_CORE_IDLE_THREAD_BITS
 +      lwsync
 +      stw     r15,0(r14)
 +
 +common_exit:
 +      /*
 +       * Common to all threads.
 +       *
 +       * If waking up from sleep, hypervisor state is not lost. Hence
 +       * skip hypervisor state restore.
 +       */
 +      bne     cr4,hypervisor_state_restored
 +
 +      /* Waking up from winkle */
 +
 +BEGIN_MMU_FTR_SECTION
 +      b       no_segments
 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX)
 +      /* Restore SLB  from PACA */
 +      ld      r8,PACA_SLBSHADOWPTR(r13)
 +
 +      .rept   SLB_NUM_BOLTED
 +      li      r3, SLBSHADOW_SAVEAREA
 +      LDX_BE  r5, r8, r3
 +      addi    r3, r3, 8
 +      LDX_BE  r6, r8, r3
 +      andis.  r7,r5,SLB_ESID_V@h
 +      beq     1f
 +      slbmte  r6,r5
 +1:    addi    r8,r8,16
 +      .endr
 +no_segments:
 +
 +      /* Restore per thread state */
 +
 +      ld      r4,_SPURR(r1)
 +      mtspr   SPRN_SPURR,r4
 +      ld      r4,_PURR(r1)
 +      mtspr   SPRN_PURR,r4
 +      ld      r4,_DSCR(r1)
 +      mtspr   SPRN_DSCR,r4
 +      ld      r4,_WORT(r1)
 +      mtspr   SPRN_WORT,r4
 +
 +      /* Call cur_cpu_spec->cpu_restore() */
 +      LOAD_REG_ADDR(r4, cur_cpu_spec)
 +      ld      r4,0(r4)
 +      ld      r12,CPU_SPEC_RESTORE(r4)
 +#ifdef PPC64_ELF_ABI_v1
 +      ld      r12,0(r12)
 +#endif
 +      mtctr   r12
 +      bctrl
 +
 +hypervisor_state_restored:
 +
 +      mtspr   SPRN_SRR1,r16
 +      mtlr    r17
 +      blr     /* Return back to System Reset vector from where
 +                 pnv_restore_hyp_resource was invoked */
 +
 +fastsleep_workaround_at_exit:
 +      li      r3,1
 +      li      r4,0
 +      bl      opal_rm_config_cpu_idle_state
 +      b       timebase_resync
 +
 +/*
 + * R3 here contains the value that will be returned to the caller
 + * of power7_nap.
 + */
 +_GLOBAL(pnv_wakeup_loss)
 +      ld      r1,PACAR1(r13)
 +BEGIN_FTR_SECTION
 +      CHECK_HMI_INTERRUPT
 +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 +      REST_NVGPRS(r1)
 +      REST_GPR(2, r1)
 +      ld      r6,_CCR(r1)
 +      ld      r4,_MSR(r1)
 +      ld      r5,_NIP(r1)
 +      addi    r1,r1,INT_FRAME_SIZE
 +      mtcr    r6
 +      mtspr   SPRN_SRR1,r4
 +      mtspr   SPRN_SRR0,r5
 +      rfid
 +
 +/*
 + * R3 here contains the value that will be returned to the caller
 + * of power7_nap.
 + */
 +_GLOBAL(pnv_wakeup_noloss)
 +      lbz     r0,PACA_NAPSTATELOST(r13)
 +      cmpwi   r0,0
 +      bne     pnv_wakeup_loss
 +BEGIN_FTR_SECTION
 +      CHECK_HMI_INTERRUPT
 +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 +      ld      r1,PACAR1(r13)
 +      ld      r6,_CCR(r1)
 +      ld      r4,_MSR(r1)
 +      ld      r5,_NIP(r1)
 +      addi    r1,r1,INT_FRAME_SIZE
 +      mtcr    r6
 +      mtspr   SPRN_SRR1,r4
 +      mtspr   SPRN_SRR0,r5
 +      rfid
@@@ -60,7 -60,7 +60,8 @@@
  #include <asm/switch_to.h>
  #include <asm/tm.h>
  #include <asm/debug.h>
 +#include <asm/asm-prototypes.h>
+ #include <asm/hmi.h>
  #include <sysdev/fsl_pci.h>
  
  #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
@@@ -308,9 -308,13 +309,13 @@@ long hmi_exception_realmode(struct pt_r
  {
        __this_cpu_inc(irq_stat.hmi_exceptions);
  
+       wait_for_subcore_guest_exit();
        if (ppc_md.hmi_exception_early)
                ppc_md.hmi_exception_early(regs);
  
+       wait_for_tb_resync();
        return 0;
  }
  
@@@ -1377,7 -1381,6 +1382,7 @@@ void facility_unavailable_exception(str
                [FSCR_TM_LG] = "TM",
                [FSCR_EBB_LG] = "EBB",
                [FSCR_TAR_LG] = "TAR",
 +              [FSCR_LM_LG] = "LM",
        };
        char *facility = "unknown";
        u64 value;
                        rd = (instword >> 21) & 0x1f;
                        current->thread.dscr = regs->gpr[rd];
                        current->thread.dscr_inherit = 1;
 -                      mtspr(SPRN_FSCR, value | FSCR_DSCR);
 +                      current->thread.fscr |= FSCR_DSCR;
 +                      mtspr(SPRN_FSCR, current->thread.fscr);
                }
  
                /* Read from DSCR (mfspr RT, 0x03) */
                        emulate_single_step(regs);
                }
                return;
 +      } else if ((status == FSCR_LM_LG) && cpu_has_feature(CPU_FTR_ARCH_300)) {
 +              /*
 +               * This process has touched LM, so turn it on forever
 +               * for this process
 +               */
 +              current->thread.fscr |= FSCR_LM;
 +              mtspr(SPRN_FSCR, current->thread.fscr);
 +              return;
        }
  
        if ((status < ARRAY_SIZE(facility_strings)) &&
@@@ -29,6 -29,7 +29,7 @@@
  #include <asm/kvm_book3s_asm.h>
  #include <asm/book3s/64/mmu-hash.h>
  #include <asm/tm.h>
+ #include <asm/opal.h>
  
  #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
  
@@@ -373,6 -374,18 +374,18 @@@ kvm_secondary_got_guest
        lwsync
        std     r0, HSTATE_KVM_VCORE(r13)
  
+       /*
+        * All secondaries exiting guest will fall through this path.
+        * Before proceeding, just check for HMI interrupt and
+        * invoke opal hmi handler. By now we are sure that the
+        * primary thread on this core/subcore has already made partition
+        * switch/TB resync and we are good to call opal hmi handler.
+        */
+       cmpwi   r12, BOOK3S_INTERRUPT_HMI
+       bne     kvm_no_guest
+       li      r3,0                    /* NULL argument */
+       bl      hmi_exception_realmode
  /*
   * At this point we have finished executing in the guest.
   * We need to wait for hwthread_req to become zero, since
@@@ -392,7 -405,7 +405,7 @@@ kvm_no_guest
        cmpwi   r3, 0
        bne     54f
  /*
 - * We jump to power7_wakeup_loss, which will return to the caller
 + * We jump to pnv_wakeup_loss, which will return to the caller
   * of power7_nap in the powernv cpu offline loop.  The value we
   * put in r3 becomes the return value for power7_nap.
   */
        rlwimi  r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
        mtspr   SPRN_LPCR, r4
        li      r3, 0
 -      b       power7_wakeup_loss
 +      b       pnv_wakeup_loss
  
  53:   HMT_LOW
        ld      r5, HSTATE_KVM_VCORE(r13)
   * whole-core mode, so we need to nap.
   */
  kvm_unsplit_nap:
+       /*
+        * When secondaries are napping in kvm_unsplit_nap() with
+        * hwthread_req = 1, HMI goes ignored even though subcores are
+        * already exited the guest. Hence HMI keeps waking up secondaries
+        * from nap in a loop and secondaries always go back to nap since
+        * no vcore is assigned to them. This makes impossible for primary
+        * thread to get hold of secondary threads resulting into a soft
+        * lockup in KVM path.
+        *
+        * Let us check if HMI is pending and handle it before we go to nap.
+        */
+       cmpwi   r12, BOOK3S_INTERRUPT_HMI
+       bne     55f
+       li      r3, 0                   /* NULL argument */
+       bl      hmi_exception_realmode
+ 55:
        /*
         * Ensure that secondary doesn't nap when it has
         * its vcore pointer set.
@@@ -601,6 -630,11 +630,11 @@@ BEGIN_FTR_SECTIO
        mtspr   SPRN_DPDES, r8
  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  
+       /* Mark the subcore state as inside guest */
+       bl      kvmppc_subcore_enter_guest
+       nop
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       ld      r4, HSTATE_KVM_VCPU(r13)
        li      r0,1
        stb     r0,VCORE_IN_GUEST(r5)   /* signal secondaries to continue */
  
@@@ -655,112 -689,8 +689,8 @@@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S
  
  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
  BEGIN_FTR_SECTION
-       b       skip_tm
- END_FTR_SECTION_IFCLR(CPU_FTR_TM)
-       /* Turn on TM/FP/VSX/VMX so we can restore them. */
-       mfmsr   r5
-       li      r6, MSR_TM >> 32
-       sldi    r6, r6, 32
-       or      r5, r5, r6
-       ori     r5, r5, MSR_FP
-       oris    r5, r5, (MSR_VEC | MSR_VSX)@h
-       mtmsrd  r5
-       /*
-        * The user may change these outside of a transaction, so they must
-        * always be context switched.
-        */
-       ld      r5, VCPU_TFHAR(r4)
-       ld      r6, VCPU_TFIAR(r4)
-       ld      r7, VCPU_TEXASR(r4)
-       mtspr   SPRN_TFHAR, r5
-       mtspr   SPRN_TFIAR, r6
-       mtspr   SPRN_TEXASR, r7
-       ld      r5, VCPU_MSR(r4)
-       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
-       beq     skip_tm /* TM not active in guest */
-       /* Make sure the failure summary is set, otherwise we'll program check
-        * when we trechkpt.  It's possible that this might have been not set
-        * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
-        * host.
-        */
-       oris    r7, r7, (TEXASR_FS)@h
-       mtspr   SPRN_TEXASR, r7
-       /*
-        * We need to load up the checkpointed state for the guest.
-        * We need to do this early as it will blow away any GPRs, VSRs and
-        * some SPRs.
-        */
-       mr      r31, r4
-       addi    r3, r31, VCPU_FPRS_TM
-       bl      load_fp_state
-       addi    r3, r31, VCPU_VRS_TM
-       bl      load_vr_state
-       mr      r4, r31
-       lwz     r7, VCPU_VRSAVE_TM(r4)
-       mtspr   SPRN_VRSAVE, r7
-       ld      r5, VCPU_LR_TM(r4)
-       lwz     r6, VCPU_CR_TM(r4)
-       ld      r7, VCPU_CTR_TM(r4)
-       ld      r8, VCPU_AMR_TM(r4)
-       ld      r9, VCPU_TAR_TM(r4)
-       mtlr    r5
-       mtcr    r6
-       mtctr   r7
-       mtspr   SPRN_AMR, r8
-       mtspr   SPRN_TAR, r9
-       /*
-        * Load up PPR and DSCR values but don't put them in the actual SPRs
-        * till the last moment to avoid running with userspace PPR and DSCR for
-        * too long.
-        */
-       ld      r29, VCPU_DSCR_TM(r4)
-       ld      r30, VCPU_PPR_TM(r4)
-       std     r2, PACATMSCRATCH(r13) /* Save TOC */
-       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
-       li      r5, 0
-       mtmsrd  r5, 1
-       /* Load GPRs r0-r28 */
-       reg = 0
-       .rept   29
-       ld      reg, VCPU_GPRS_TM(reg)(r31)
-       reg = reg + 1
-       .endr
-       mtspr   SPRN_DSCR, r29
-       mtspr   SPRN_PPR, r30
-       /* Load final GPRs */
-       ld      29, VCPU_GPRS_TM(29)(r31)
-       ld      30, VCPU_GPRS_TM(30)(r31)
-       ld      31, VCPU_GPRS_TM(31)(r31)
-       /* TM checkpointed state is now setup.  All GPRs are now volatile. */
-       TRECHKPT
-       /* Now let's get back the state we need. */
-       HMT_MEDIUM
-       GET_PACA(r13)
-       ld      r29, HSTATE_DSCR(r13)
-       mtspr   SPRN_DSCR, r29
-       ld      r4, HSTATE_KVM_VCPU(r13)
-       ld      r1, HSTATE_HOST_R1(r13)
-       ld      r2, PACATMSCRATCH(r13)
-       /* Set the MSR RI since we have our registers back. */
-       li      r5, MSR_RI
-       mtmsrd  r5, 1
- skip_tm:
+       bl      kvmppc_restore_tm
+ END_FTR_SECTION_IFSET(CPU_FTR_TM)
  #endif
  
        /* Load guest PMU registers */
@@@ -841,12 -771,6 +771,6 @@@ BEGIN_FTR_SECTIO
        /* Skip next section on POWER7 */
        b       8f
  END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
-       /* Turn on TM so we can access TFHAR/TFIAR/TEXASR */
-       mfmsr   r8
-       li      r0, 1
-       rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
-       mtmsrd  r8
        /* Load up POWER8-specific registers */
        ld      r5, VCPU_IAMR(r4)
        lwz     r6, VCPU_PSPB(r4)
@@@ -1436,106 -1360,8 +1360,8 @@@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S
  
  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
  BEGIN_FTR_SECTION
-       b       2f
- END_FTR_SECTION_IFCLR(CPU_FTR_TM)
-       /* Turn on TM. */
-       mfmsr   r8
-       li      r0, 1
-       rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
-       mtmsrd  r8
-       ld      r5, VCPU_MSR(r9)
-       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
-       beq     1f      /* TM not active in guest. */
-       li      r3, TM_CAUSE_KVM_RESCHED
-       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
-       li      r5, 0
-       mtmsrd  r5, 1
-       /* All GPRs are volatile at this point. */
-       TRECLAIM(R3)
-       /* Temporarily store r13 and r9 so we have some regs to play with */
-       SET_SCRATCH0(r13)
-       GET_PACA(r13)
-       std     r9, PACATMSCRATCH(r13)
-       ld      r9, HSTATE_KVM_VCPU(r13)
-       /* Get a few more GPRs free. */
-       std     r29, VCPU_GPRS_TM(29)(r9)
-       std     r30, VCPU_GPRS_TM(30)(r9)
-       std     r31, VCPU_GPRS_TM(31)(r9)
-       /* Save away PPR and DSCR soon so don't run with user values. */
-       mfspr   r31, SPRN_PPR
-       HMT_MEDIUM
-       mfspr   r30, SPRN_DSCR
-       ld      r29, HSTATE_DSCR(r13)
-       mtspr   SPRN_DSCR, r29
-       /* Save all but r9, r13 & r29-r31 */
-       reg = 0
-       .rept   29
-       .if (reg != 9) && (reg != 13)
-       std     reg, VCPU_GPRS_TM(reg)(r9)
-       .endif
-       reg = reg + 1
-       .endr
-       /* ... now save r13 */
-       GET_SCRATCH0(r4)
-       std     r4, VCPU_GPRS_TM(13)(r9)
-       /* ... and save r9 */
-       ld      r4, PACATMSCRATCH(r13)
-       std     r4, VCPU_GPRS_TM(9)(r9)
-       /* Reload stack pointer and TOC. */
-       ld      r1, HSTATE_HOST_R1(r13)
-       ld      r2, PACATOC(r13)
-       /* Set MSR RI now we have r1 and r13 back. */
-       li      r5, MSR_RI
-       mtmsrd  r5, 1
-       /* Save away checkpinted SPRs. */
-       std     r31, VCPU_PPR_TM(r9)
-       std     r30, VCPU_DSCR_TM(r9)
-       mflr    r5
-       mfcr    r6
-       mfctr   r7
-       mfspr   r8, SPRN_AMR
-       mfspr   r10, SPRN_TAR
-       std     r5, VCPU_LR_TM(r9)
-       stw     r6, VCPU_CR_TM(r9)
-       std     r7, VCPU_CTR_TM(r9)
-       std     r8, VCPU_AMR_TM(r9)
-       std     r10, VCPU_TAR_TM(r9)
-       /* Restore r12 as trap number. */
-       lwz     r12, VCPU_TRAP(r9)
-       /* Save FP/VSX. */
-       addi    r3, r9, VCPU_FPRS_TM
-       bl      store_fp_state
-       addi    r3, r9, VCPU_VRS_TM
-       bl      store_vr_state
-       mfspr   r6, SPRN_VRSAVE
-       stw     r6, VCPU_VRSAVE_TM(r9)
- 1:
-       /*
-        * We need to save these SPRs after the treclaim so that the software
-        * error code is recorded correctly in the TEXASR.  Also the user may
-        * change these outside of a transaction, so they must always be
-        * context switched.
-        */
-       mfspr   r5, SPRN_TFHAR
-       mfspr   r6, SPRN_TFIAR
-       mfspr   r7, SPRN_TEXASR
-       std     r5, VCPU_TFHAR(r9)
-       std     r6, VCPU_TFIAR(r9)
-       std     r7, VCPU_TEXASR(r9)
- 2:
+       bl      kvmppc_save_tm
+ END_FTR_SECTION_IFSET(CPU_FTR_TM)
  #endif
  
        /* Increment yield count if they have a VPA */
@@@ -1683,6 -1509,23 +1509,23 @@@ BEGIN_FTR_SECTIO
        mtspr   SPRN_DPDES, r8
  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  
+       /* If HMI, call kvmppc_realmode_hmi_handler() */
+       cmpwi   r12, BOOK3S_INTERRUPT_HMI
+       bne     27f
+       bl      kvmppc_realmode_hmi_handler
+       nop
+       li      r12, BOOK3S_INTERRUPT_HMI
+       /*
+        * At this point kvmppc_realmode_hmi_handler has already resynced
+        * the timebase, so there is no need to subtract the guest timebase
+        * offset from the timebase here.
+        *
+        * Also, do not call kvmppc_subcore_exit_guest() here, because it
+        * has already been invoked from kvmppc_realmode_hmi_handler().
+        */
+       b       30f
+ 27:
        /* Subtract timebase offset from timebase */
        ld      r8,VCORE_TB_OFFSET(r5)
        cmpdi   r8,0
        addis   r8,r8,0x100             /* if so, increment upper 40 bits */
        mtspr   SPRN_TBU40,r8
  
+ 17:   bl      kvmppc_subcore_exit_guest
+       nop
+ 30:   ld      r5,HSTATE_KVM_VCORE(r13)
+       ld      r4,VCORE_KVM(r5)        /* pointer to struct kvm */
        /* Reset PCR */
- 17:   ld      r0, VCORE_PCR(r5)
+       ld      r0, VCORE_PCR(r5)
        cmpdi   r0, 0
        beq     18f
        li      r0, 0
@@@ -2245,6 -2093,13 +2093,13 @@@ _GLOBAL(kvmppc_h_cede)                /* r3 = vcpu po
        /* save FP state */
        bl      kvmppc_save_fp
  
+ #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ BEGIN_FTR_SECTION
+       ld      r9, HSTATE_KVM_VCPU(r13)
+       bl      kvmppc_save_tm
+ END_FTR_SECTION_IFSET(CPU_FTR_TM)
+ #endif
        /*
         * Set DEC to the smaller of DEC and HDEC, so that we wake
         * no later than the end of our timeslice (HDEC interrupts
@@@ -2321,6 -2176,12 +2176,12 @@@ kvm_end_cede
        bl      kvmhv_accumulate_time
  #endif
  
+ #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ BEGIN_FTR_SECTION
+       bl      kvmppc_restore_tm
+ END_FTR_SECTION_IFSET(CPU_FTR_TM)
+ #endif
        /* load up FP state */
        bl      kvmppc_load_fp
  
@@@ -2461,6 -2322,8 +2322,8 @@@ BEGIN_FTR_SECTIO
        cmpwi   r6, 3                   /* hypervisor doorbell? */
        beq     3f
  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       cmpwi   r6, 0xa                 /* Hypervisor maintenance? */
+       beq     4f
        li      r3, 1                   /* anything else, return 1 */
  0:    blr
  
        li      r3, -1
        blr
  
+       /* Woken up due to Hypervisor maintenance interrupt */
+ 4:    li      r12, BOOK3S_INTERRUPT_HMI
+       li      r3, 1
+       blr
  /*
   * Determine what sort of external interrupt is pending (if any).
   * Returns:
@@@ -2631,6 -2499,239 +2499,239 @@@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC
        mr      r4,r31
        blr
  
+ #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ /*
+  * Save transactional state and TM-related registers.
+  * Called with r9 pointing to the vcpu struct.
+  * This can modify all checkpointed registers, but
+  * restores r1, r2 and r9 (vcpu pointer) before exit.
+  */
+ kvmppc_save_tm:
+       mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+       /* Turn on TM. */
+       mfmsr   r8
+       li      r0, 1
+       rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+       mtmsrd  r8
+       ld      r5, VCPU_MSR(r9)
+       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+       beq     1f      /* TM not active in guest. */
+       std     r1, HSTATE_HOST_R1(r13)
+       li      r3, TM_CAUSE_KVM_RESCHED
+       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
+       li      r5, 0
+       mtmsrd  r5, 1
+       /* All GPRs are volatile at this point. */
+       TRECLAIM(R3)
+       /* Temporarily store r13 and r9 so we have some regs to play with */
+       SET_SCRATCH0(r13)
+       GET_PACA(r13)
+       std     r9, PACATMSCRATCH(r13)
+       ld      r9, HSTATE_KVM_VCPU(r13)
+       /* Get a few more GPRs free. */
+       std     r29, VCPU_GPRS_TM(29)(r9)
+       std     r30, VCPU_GPRS_TM(30)(r9)
+       std     r31, VCPU_GPRS_TM(31)(r9)
+       /* Save PPR and DSCR early so we don't run with user values. */
+       mfspr   r31, SPRN_PPR
+       HMT_MEDIUM
+       mfspr   r30, SPRN_DSCR
+       ld      r29, HSTATE_DSCR(r13)
+       mtspr   SPRN_DSCR, r29
+       /* Save all but r9, r13 & r29-r31 */
+       reg = 0
+       .rept   29
+       .if (reg != 9) && (reg != 13)
+       std     reg, VCPU_GPRS_TM(reg)(r9)
+       .endif
+       reg = reg + 1
+       .endr
+       /* ... now save r13 */
+       GET_SCRATCH0(r4)
+       std     r4, VCPU_GPRS_TM(13)(r9)
+       /* ... and save r9 */
+       ld      r4, PACATMSCRATCH(r13)
+       std     r4, VCPU_GPRS_TM(9)(r9)
+       /* Reload stack pointer and TOC. */
+       ld      r1, HSTATE_HOST_R1(r13)
+       ld      r2, PACATOC(r13)
+       /* Set MSR RI now we have r1 and r13 back. */
+       li      r5, MSR_RI
+       mtmsrd  r5, 1
+       /* Save away checkpointed SPRs. */
+       std     r31, VCPU_PPR_TM(r9)
+       std     r30, VCPU_DSCR_TM(r9)
+       mflr    r5
+       mfcr    r6
+       mfctr   r7
+       mfspr   r8, SPRN_AMR
+       mfspr   r10, SPRN_TAR
+       std     r5, VCPU_LR_TM(r9)
+       stw     r6, VCPU_CR_TM(r9)
+       std     r7, VCPU_CTR_TM(r9)
+       std     r8, VCPU_AMR_TM(r9)
+       std     r10, VCPU_TAR_TM(r9)
+       /* Restore r12 as trap number. */
+       lwz     r12, VCPU_TRAP(r9)
+       /* Save FP/VSX. */
+       addi    r3, r9, VCPU_FPRS_TM
+       bl      store_fp_state
+       addi    r3, r9, VCPU_VRS_TM
+       bl      store_vr_state
+       mfspr   r6, SPRN_VRSAVE
+       stw     r6, VCPU_VRSAVE_TM(r9)
+ 1:
+       /*
+        * We need to save these SPRs after the treclaim so that the software
+        * error code is recorded correctly in the TEXASR.  Also the user may
+        * change these outside of a transaction, so they must always be
+        * context switched.
+        */
+       mfspr   r5, SPRN_TFHAR
+       mfspr   r6, SPRN_TFIAR
+       mfspr   r7, SPRN_TEXASR
+       std     r5, VCPU_TFHAR(r9)
+       std     r6, VCPU_TFIAR(r9)
+       std     r7, VCPU_TEXASR(r9)
+       ld      r0, PPC_LR_STKOFF(r1)
+       mtlr    r0
+       blr
+ /*
+  * Restore transactional state and TM-related registers.
+  * Called with r4 pointing to the vcpu struct.
+  * This potentially modifies all checkpointed registers.
+  * It restores r1, r2, r4 from the PACA.
+  */
+ kvmppc_restore_tm:
+       mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+       /* Turn on TM/FP/VSX/VMX so we can restore them. */
+       mfmsr   r5
+       li      r6, MSR_TM >> 32
+       sldi    r6, r6, 32
+       or      r5, r5, r6
+       ori     r5, r5, MSR_FP
+       oris    r5, r5, (MSR_VEC | MSR_VSX)@h
+       mtmsrd  r5
+       /*
+        * The user may change these outside of a transaction, so they must
+        * always be context switched.
+        */
+       ld      r5, VCPU_TFHAR(r4)
+       ld      r6, VCPU_TFIAR(r4)
+       ld      r7, VCPU_TEXASR(r4)
+       mtspr   SPRN_TFHAR, r5
+       mtspr   SPRN_TFIAR, r6
+       mtspr   SPRN_TEXASR, r7
+       ld      r5, VCPU_MSR(r4)
+       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+       beqlr           /* TM not active in guest */
+       std     r1, HSTATE_HOST_R1(r13)
+       /* Make sure the failure summary is set; otherwise we'll take a
+        * program check when we trechkpt.  It's possible that this was not
+        * set by a kvmppc_set_one_reg() call, but we shouldn't let that
+        * crash the host.
+        */
+       oris    r7, r7, (TEXASR_FS)@h
+       mtspr   SPRN_TEXASR, r7
+       /*
+        * We need to load up the checkpointed state for the guest.
+        * We need to do this early as it will blow away any GPRs, VSRs and
+        * some SPRs.
+        */
+       mr      r31, r4
+       addi    r3, r31, VCPU_FPRS_TM
+       bl      load_fp_state
+       addi    r3, r31, VCPU_VRS_TM
+       bl      load_vr_state
+       mr      r4, r31
+       lwz     r7, VCPU_VRSAVE_TM(r4)
+       mtspr   SPRN_VRSAVE, r7
+       ld      r5, VCPU_LR_TM(r4)
+       lwz     r6, VCPU_CR_TM(r4)
+       ld      r7, VCPU_CTR_TM(r4)
+       ld      r8, VCPU_AMR_TM(r4)
+       ld      r9, VCPU_TAR_TM(r4)
+       mtlr    r5
+       mtcr    r6
+       mtctr   r7
+       mtspr   SPRN_AMR, r8
+       mtspr   SPRN_TAR, r9
+       /*
+        * Load up PPR and DSCR values but don't put them in the actual SPRs
+        * till the last moment to avoid running with userspace PPR and DSCR for
+        * too long.
+        */
+       ld      r29, VCPU_DSCR_TM(r4)
+       ld      r30, VCPU_PPR_TM(r4)
+       std     r2, PACATMSCRATCH(r13) /* Save TOC */
+       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
+       li      r5, 0
+       mtmsrd  r5, 1
+       /* Load GPRs r0-r28 */
+       reg = 0
+       .rept   29
+       ld      reg, VCPU_GPRS_TM(reg)(r31)
+       reg = reg + 1
+       .endr
+       mtspr   SPRN_DSCR, r29
+       mtspr   SPRN_PPR, r30
+       /* Load final GPRs */
+       ld      29, VCPU_GPRS_TM(29)(r31)
+       ld      30, VCPU_GPRS_TM(30)(r31)
+       ld      31, VCPU_GPRS_TM(31)(r31)
+       /* TM checkpointed state is now setup.  All GPRs are now volatile. */
+       TRECHKPT
+       /* Now let's get back the state we need. */
+       HMT_MEDIUM
+       GET_PACA(r13)
+       ld      r29, HSTATE_DSCR(r13)
+       mtspr   SPRN_DSCR, r29
+       ld      r4, HSTATE_KVM_VCPU(r13)
+       ld      r1, HSTATE_HOST_R1(r13)
+       ld      r2, PACATMSCRATCH(r13)
+       /* Set the MSR RI since we have our registers back. */
+       li      r5, MSR_RI
+       mtmsrd  r5, 1
+       ld      r0, PPC_LR_STKOFF(r1)
+       mtlr    r0
+       blr
+ #endif
  /*
   * We come here if we get any exception or interrupt while we are
   * executing host real mode code while in guest MMU context.
@@@ -35,7 -35,7 +35,7 @@@
  #include <asm/mmu_context.h>
  #include <asm/switch_to.h>
  #include <asm/firmware.h>
 -#include <asm/hvcall.h>
 +#include <asm/setup.h>
  #include <linux/gfp.h>
  #include <linux/sched.h>
  #include <linux/vmalloc.h>
@@@ -914,7 -914,7 +914,7 @@@ int kvmppc_handle_exit_pr(struct kvm_ru
        /* We get here with MSR.EE=1 */
  
        trace_kvm_exit(exit_nr, vcpu);
-       kvm_guest_exit();
+       guest_exit();
  
        switch (exit_nr) {
        case BOOK3S_INTERRUPT_INST_STORAGE:
                int emul;
  
  program_interrupt:
-               flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+               /*
+                * shadow_srr1 only contains valid flags if we came here via
+                * a program exception. The other exceptions (emulation assist,
+                * FP unavailable, etc.) do not provide flags in SRR1, so use
+                * an illegal-instruction exception when injecting a program
+                * interrupt into the guest.
+                */
+               if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
+                       flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+               else
+                       flags = SRR1_PROGILL;
  
                emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
                if (emul != EMULATE_DONE) {
@@@ -1531,7 -1541,7 +1541,7 @@@ static int kvmppc_vcpu_run_pr(struct kv
  
        kvmppc_clear_debug(vcpu);
  
-       /* No need for kvm_guest_exit. It's done in handle_exit.
+       /* No need for guest_exit. It's done in handle_exit.
           We also get here with interrupts enabled. */
  
        /* Make sure we save the guest FPU/Altivec/VSX state */
@@@ -1690,7 -1700,7 +1700,7 @@@ static int kvmppc_core_init_vm_pr(struc
        if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
                spin_lock(&kvm_global_user_count_lock);
                if (++kvm_global_user_count == 1)
 -                      pSeries_disable_reloc_on_exc();
 +                      pseries_disable_reloc_on_exc();
                spin_unlock(&kvm_global_user_count_lock);
        }
        return 0;
@@@ -1706,7 -1716,7 +1716,7 @@@ static void kvmppc_core_destroy_vm_pr(s
                spin_lock(&kvm_global_user_count_lock);
                BUG_ON(kvm_global_user_count == 0);
                if (--kvm_global_user_count == 0)
 -                      pSeries_enable_reloc_on_exc();
 +                      pseries_enable_reloc_on_exc();
                spin_unlock(&kvm_global_user_count_lock);
        }
  }
@@@ -59,12 -59,11 +59,11 @@@ END_FTR_SECTION(0, 1);                                             
  #define OPAL_CALL(name, token)                \
   _GLOBAL_TOC(name);                   \
        mflr    r0;                     \
 -      std     r0,16(r1);              \
 +      std     r0,PPC_LR_STKOFF(r1);   \
        li      r0,token;               \
        OPAL_BRANCH(opal_tracepoint_entry) \
        mfcr    r12;                    \
        stw     r12,8(r1);              \
-       std     r1,PACAR1(r13);         \
        li      r11,0;                  \
        mfmsr   r12;                    \
        ori     r11,r11,MSR_EE;         \
@@@ -92,7 -91,7 +91,7 @@@ opal_return
        FIXUP_ENDIAN
        ld      r2,PACATOC(r13);
        lwz     r4,8(r1);
 -      ld      r5,16(r1);
 +      ld      r5,PPC_LR_STKOFF(r1);
        ld      r6,PACASAVEDMSR(r13);
        mtspr   SPRN_SRR0,r5;
        mtspr   SPRN_SRR1,r6;
@@@ -127,7 -126,6 +126,6 @@@ opal_tracepoint_entry
        mfcr    r12
        std     r11,16(r1)
        stw     r12,8(r1)
-       std     r1,PACAR1(r13)
        li      r11,0
        mfmsr   r12
        ori     r11,r11,MSR_EE
@@@ -157,37 -155,43 +155,37 @@@ opal_tracepoint_return
        blr
  #endif
  
 -/*
 - * Make opal call in realmode. This is a generic function to be called
 - * from realmode. It handles endianness.
 - *
 - * r13 - paca pointer
 - * r1  - stack pointer
 - * r0  - opal token
 - */
 -_GLOBAL(opal_call_realmode)
 -      mflr    r12
 -      std     r12,PPC_LR_STKOFF(r1)
 -      ld      r2,PACATOC(r13)
 -      /* Set opal return address */
 -      LOAD_REG_ADDR(r12,return_from_opal_call)
 -      mtlr    r12
 -
 -      mfmsr   r12
 -#ifdef __LITTLE_ENDIAN__
 -      /* Handle endian-ness */
 -      li      r11,MSR_LE
 -      andc    r12,r12,r11
 -#endif
 -      mtspr   SPRN_HSRR1,r12
 -      LOAD_REG_ADDR(r11,opal)
 -      ld      r12,8(r11)
 -      ld      r2,0(r11)
 -      mtspr   SPRN_HSRR0,r12
 +#define OPAL_CALL_REAL(name, token)                   \
 + _GLOBAL_TOC(name);                                   \
 +      mflr    r0;                                     \
 +      std     r0,PPC_LR_STKOFF(r1);                   \
 +      li      r0,token;                               \
 +      mfcr    r12;                                    \
 +      stw     r12,8(r1);                              \
 +                                                      \
 +      /* Set opal return address */                   \
 +      LOAD_REG_ADDR(r11, opal_return_realmode);       \
 +      mtlr    r11;                                    \
 +      mfmsr   r12;                                    \
 +      li      r11,MSR_LE;                             \
 +      andc    r12,r12,r11;                            \
 +      mtspr   SPRN_HSRR1,r12;                         \
 +      LOAD_REG_ADDR(r11,opal);                        \
 +      ld      r12,8(r11);                             \
 +      ld      r2,0(r11);                              \
 +      mtspr   SPRN_HSRR0,r12;                         \
        hrfid
  
 -return_from_opal_call:
 -#ifdef __LITTLE_ENDIAN__
 +opal_return_realmode:
        FIXUP_ENDIAN
 -#endif
 +      ld      r2,PACATOC(r13);
 +      lwz     r11,8(r1);
        ld      r12,PPC_LR_STKOFF(r1)
 +      mtcr    r11;
        mtlr    r12
        blr
  
 +
  OPAL_CALL(opal_invalid_call,                  OPAL_INVALID_CALL);
  OPAL_CALL(opal_console_write,                 OPAL_CONSOLE_WRITE);
  OPAL_CALL(opal_console_read,                  OPAL_CONSOLE_READ);
@@@ -265,7 -269,6 +263,7 @@@ OPAL_CALL(opal_validate_flash,                     OPAL_F
  OPAL_CALL(opal_manage_flash,                  OPAL_FLASH_MANAGE);
  OPAL_CALL(opal_update_flash,                  OPAL_FLASH_UPDATE);
  OPAL_CALL(opal_resync_timebase,                       OPAL_RESYNC_TIMEBASE);
 +OPAL_CALL_REAL(opal_rm_resync_timebase,               OPAL_RESYNC_TIMEBASE);
  OPAL_CALL(opal_check_token,                   OPAL_CHECK_TOKEN);
  OPAL_CALL(opal_dump_init,                     OPAL_DUMP_INIT);
  OPAL_CALL(opal_dump_info,                     OPAL_DUMP_INFO);
@@@ -273,7 -276,6 +271,7 @@@ OPAL_CALL(opal_dump_info2,                 OPAL_DUMP_
  OPAL_CALL(opal_dump_read,                     OPAL_DUMP_READ);
  OPAL_CALL(opal_dump_ack,                      OPAL_DUMP_ACK);
  OPAL_CALL(opal_get_msg,                               OPAL_GET_MSG);
 +OPAL_CALL(opal_write_oppanel_async,           OPAL_WRITE_OPPANEL_ASYNC);
  OPAL_CALL(opal_check_completion,              OPAL_CHECK_ASYNC_COMPLETION);
  OPAL_CALL(opal_dump_resend_notification,      OPAL_DUMP_RESEND);
  OPAL_CALL(opal_sync_host_reboot,              OPAL_SYNC_HOST_REBOOT);
@@@ -281,9 -283,7 +279,9 @@@ OPAL_CALL(opal_sensor_read,                        OPAL_SENS
  OPAL_CALL(opal_get_param,                     OPAL_GET_PARAM);
  OPAL_CALL(opal_set_param,                     OPAL_SET_PARAM);
  OPAL_CALL(opal_handle_hmi,                    OPAL_HANDLE_HMI);
 +OPAL_CALL_REAL(opal_rm_handle_hmi,            OPAL_HANDLE_HMI);
  OPAL_CALL(opal_config_cpu_idle_state,         OPAL_CONFIG_CPU_IDLE_STATE);
 +OPAL_CALL_REAL(opal_rm_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE);
  OPAL_CALL(opal_slw_set_reg,                   OPAL_SLW_SET_REG);
  OPAL_CALL(opal_register_dump_region,          OPAL_REGISTER_DUMP_REGION);
  OPAL_CALL(opal_unregister_dump_region,                OPAL_UNREGISTER_DUMP_REGION);
@@@ -300,13 -300,3 +298,13 @@@ OPAL_CALL(opal_prd_msg,                          OPAL_PRD_MSG
  OPAL_CALL(opal_leds_get_ind,                  OPAL_LEDS_GET_INDICATOR);
  OPAL_CALL(opal_leds_set_ind,                  OPAL_LEDS_SET_INDICATOR);
  OPAL_CALL(opal_console_flush,                 OPAL_CONSOLE_FLUSH);
 +OPAL_CALL(opal_get_device_tree,                       OPAL_GET_DEVICE_TREE);
 +OPAL_CALL(opal_pci_get_presence_state,                OPAL_PCI_GET_PRESENCE_STATE);
 +OPAL_CALL(opal_pci_get_power_state,           OPAL_PCI_GET_POWER_STATE);
 +OPAL_CALL(opal_pci_set_power_state,           OPAL_PCI_SET_POWER_STATE);
 +OPAL_CALL(opal_int_get_xirr,                  OPAL_INT_GET_XIRR);
 +OPAL_CALL(opal_int_set_cppr,                  OPAL_INT_SET_CPPR);
 +OPAL_CALL(opal_int_eoi,                               OPAL_INT_EOI);
 +OPAL_CALL(opal_int_set_mfrr,                  OPAL_INT_SET_MFRR);
 +OPAL_CALL(opal_pci_tce_kill,                  OPAL_PCI_TCE_KILL);
 +OPAL_CALL_REAL(opal_rm_pci_tce_kill,          OPAL_PCI_TCE_KILL);
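
The OPAL_CALL_REAL() stubs above generate only the real-mode assembly entry points; C callers still need prototypes, which presumably sit next to the normal-mode declarations in asm/opal.h rather than in the hunks shown here. A minimal sketch of what such declarations could look like, restricted to the zero-argument calls to avoid guessing parameter lists:

#include <linux/types.h>

/* Assumed declarations, mirroring the asm stubs above; not taken from this diff. */
int64_t opal_rm_resync_timebase(void);
int64_t opal_rm_handle_hmi(void);
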
@@@ -49,7 -49,7 +49,7 @@@ static inline void diag10_range(unsigne
        diag_stat_inc(DIAG_STAT_X010);
        asm volatile(
                "0:     diag    %0,%1,0x10\n"
 -              "1:\n"
 +              "1:     nopr    %%r7\n"
                EX_TABLE(0b, 1b)
                EX_TABLE(1b, 1b)
                : : "a" (start_addr), "a" (end_addr));
@@@ -78,4 -78,153 +78,153 @@@ struct diag210 
  
  extern int diag210(struct diag210 *addr);
  
+ /* bit is set in flags when physical cpu info is included in diag 204 data */
+ #define DIAG204_LPAR_PHYS_FLG 0x80
+ #define DIAG204_LPAR_NAME_LEN 8               /* lpar name len in diag 204 data */
+ #define DIAG204_CPU_NAME_LEN 16               /* type name len of cpus in diag224 name table */
+ /* diag 204 subcodes */
+ enum diag204_sc {
+       DIAG204_SUBC_STIB4 = 4,
+       DIAG204_SUBC_RSI = 5,
+       DIAG204_SUBC_STIB6 = 6,
+       DIAG204_SUBC_STIB7 = 7
+ };
+ /* The two available diag 204 data formats */
+ enum diag204_format {
+       DIAG204_INFO_SIMPLE = 0,
+       DIAG204_INFO_EXT = 0x00010000
+ };
+ enum diag204_cpu_flags {
+       DIAG204_CPU_ONLINE = 0x20,
+       DIAG204_CPU_CAPPED = 0x40,
+ };
+ struct diag204_info_blk_hdr {
+       __u8  npar;
+       __u8  flags;
+       __u16 tslice;
+       __u16 phys_cpus;
+       __u16 this_part;
+       __u64 curtod;
+ } __packed;
+ struct diag204_x_info_blk_hdr {
+       __u8  npar;
+       __u8  flags;
+       __u16 tslice;
+       __u16 phys_cpus;
+       __u16 this_part;
+       __u64 curtod1;
+       __u64 curtod2;
+       char reserved[40];
+ } __packed;
+ struct diag204_part_hdr {
+       __u8 pn;
+       __u8 cpus;
+       char reserved[6];
+       char part_name[DIAG204_LPAR_NAME_LEN];
+ } __packed;
+ struct diag204_x_part_hdr {
+       __u8  pn;
+       __u8  cpus;
+       __u8  rcpus;
+       __u8  pflag;
+       __u32 mlu;
+       char  part_name[DIAG204_LPAR_NAME_LEN];
+       char  lpc_name[8];
+       char  os_name[8];
+       __u64 online_cs;
+       __u64 online_es;
+       __u8  upid;
+       __u8  reserved:3;
+       __u8  mtid:5;
+       char  reserved1[2];
+       __u32 group_mlu;
+       char  group_name[8];
+       char  hardware_group_name[8];
+       char  reserved2[24];
+ } __packed;
+ struct diag204_cpu_info {
+       __u16 cpu_addr;
+       char  reserved1[2];
+       __u8  ctidx;
+       __u8  cflag;
+       __u16 weight;
+       __u64 acc_time;
+       __u64 lp_time;
+ } __packed;
+ struct diag204_x_cpu_info {
+       __u16 cpu_addr;
+       char  reserved1[2];
+       __u8  ctidx;
+       __u8  cflag;
+       __u16 weight;
+       __u64 acc_time;
+       __u64 lp_time;
+       __u16 min_weight;
+       __u16 cur_weight;
+       __u16 max_weight;
+       char  reseved2[2];
+       __u64 online_time;
+       __u64 wait_time;
+       __u32 pma_weight;
+       __u32 polar_weight;
+       __u32 cpu_type_cap;
+       __u32 group_cpu_type_cap;
+       char  reserved3[32];
+ } __packed;
+ struct diag204_phys_hdr {
+       char reserved1[1];
+       __u8 cpus;
+       char reserved2[6];
+       char mgm_name[8];
+ } __packed;
+ struct diag204_x_phys_hdr {
+       char reserved1[1];
+       __u8 cpus;
+       char reserved2[6];
+       char mgm_name[8];
+       char reserved3[80];
+ } __packed;
+ struct diag204_phys_cpu {
+       __u16 cpu_addr;
+       char  reserved1[2];
+       __u8  ctidx;
+       char  reserved2[3];
+       __u64 mgm_time;
+       char  reserved3[8];
+ } __packed;
+ struct diag204_x_phys_cpu {
+       __u16 cpu_addr;
+       char  reserved1[2];
+       __u8  ctidx;
+       char  reserved2[1];
+       __u16 weight;
+       __u64 mgm_time;
+       char  reserved3[80];
+ } __packed;
+ struct diag204_x_part_block {
+       struct diag204_x_part_hdr hdr;
+       struct diag204_x_cpu_info cpus[];
+ } __packed;
+ struct diag204_x_phys_block {
+       struct diag204_x_phys_hdr hdr;
+       struct diag204_x_phys_cpu cpus[];
+ } __packed;
+ int diag204(unsigned long subcode, unsigned long size, void *addr);
+ int diag224(void *ptr);
  #endif /* _ASM_S390_DIAG_H */
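
For orientation, here is a minimal sketch (not from this patch) of how the new diag204()/diag224() wrappers can be driven. The OR-combination of subcode and data format follows the existing hypfs usage; the four-page buffer, the page-count unit for the size argument, and the example_* naming are assumptions:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <asm/diag.h>

static int example_read_lpar_data(void)
{
	void *buf, *cpu_names;
	int rc = -ENOMEM;

	buf = (void *)__get_free_pages(GFP_KERNEL, 2);	/* 4 pages, assumed sufficient */
	cpu_names = (void *)get_zeroed_page(GFP_KERNEL);
	if (!buf || !cpu_names)
		goto out;
	/* subcode and data format are OR'ed into the first argument */
	rc = diag204(DIAG204_SUBC_STIB6 | DIAG204_INFO_EXT, 4, buf);
	if (rc < 0)
		goto out;
	/* diag224 fills a one-page CPU-type name table, indexed by ctidx */
	rc = diag224(cpu_names);
out:
	free_pages((unsigned long)buf, 2);
	free_page((unsigned long)cpu_names);
	return rc;
}
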
@@@ -43,6 -43,7 +43,7 @@@
  /* s390-specific vcpu->requests bit members */
  #define KVM_REQ_ENABLE_IBS         8
  #define KVM_REQ_DISABLE_IBS        9
+ #define KVM_REQ_ICPT_OPEREXC       10
  
  #define SIGP_CTRL_C           0x80
  #define SIGP_CTRL_SCN_MASK    0x3f
@@@ -145,7 -146,7 +146,7 @@@ struct kvm_s390_sie_block 
        __u64   cputm;                  /* 0x0028 */
        __u64   ckc;                    /* 0x0030 */
        __u64   epoch;                  /* 0x0038 */
-       __u8    reserved40[4];          /* 0x0040 */
+       __u32   svcc;                   /* 0x0040 */
  #define LCTL_CR0      0x8000
  #define LCTL_CR6      0x0200
  #define LCTL_CR9      0x0040
  #define LCTL_CR14     0x0002
        __u16   lctl;                   /* 0x0044 */
        __s16   icpua;                  /* 0x0046 */
+ #define ICTL_OPEREXC  0x80000000
  #define ICTL_PINT     0x20000000
  #define ICTL_LPSW     0x00400000
  #define ICTL_STCTL    0x00040000
  #define ICPT_INST     0x04
  #define ICPT_PROGI    0x08
  #define ICPT_INSTPROGI        0x0C
+ #define ICPT_EXTINT   0x14
+ #define ICPT_VALIDITY 0x20
+ #define ICPT_STOP     0x28
  #define ICPT_OPEREXC  0x2C
  #define ICPT_PARTEXEC 0x38
  #define ICPT_IOINST   0x40
        __u32   scaol;                  /* 0x0064 */
        __u8    reserved68[4];          /* 0x0068 */
        __u32   todpr;                  /* 0x006c */
-       __u8    reserved70[32];         /* 0x0070 */
+       __u8    reserved70[16];         /* 0x0070 */
+       __u64   mso;                    /* 0x0080 */
+       __u64   msl;                    /* 0x0088 */
        psw_t   gpsw;                   /* 0x0090 */
        __u64   gg14;                   /* 0x00a0 */
        __u64   gg15;                   /* 0x00a8 */
        __u8    reserved1e6[2];         /* 0x01e6 */
        __u64   itdba;                  /* 0x01e8 */
        __u64   riccbd;                 /* 0x01f0 */
-       __u8    reserved1f8[8];         /* 0x01f8 */
+       __u64   gvrd;                   /* 0x01f8 */
  } __attribute__((packed));
  
  struct kvm_s390_itdb {
@@@ -245,7 -252,6 +252,7 @@@ struct kvm_vcpu_stat 
        u32 exit_stop_request;
        u32 exit_validity;
        u32 exit_instruction;
 +      u32 exit_pei;
        u32 halt_successful_poll;
        u32 halt_attempted_poll;
        u32 halt_poll_invalid;
        u32 instruction_stctg;
        u32 exit_program_interruption;
        u32 exit_instr_and_program;
+       u32 exit_operation_exception;
        u32 deliver_external_call;
        u32 deliver_emergency_signal;
        u32 deliver_service_signal;
        u32 instruction_stsi;
        u32 instruction_stfl;
        u32 instruction_tprot;
+       u32 instruction_sie;
        u32 instruction_essa;
+       u32 instruction_sthyi;
        u32 instruction_sigp_sense;
        u32 instruction_sigp_sense_running;
        u32 instruction_sigp_external_call;
@@@ -541,12 -550,16 +551,16 @@@ struct kvm_guestdbg_info_arch 
  
  struct kvm_vcpu_arch {
        struct kvm_s390_sie_block *sie_block;
+       /* if vsie is active, currently executed shadow sie control block */
+       struct kvm_s390_sie_block *vsie_block;
        unsigned int      host_acrs[NUM_ACRS];
        struct fpu        host_fpregs;
        struct kvm_s390_local_interrupt local_int;
        struct hrtimer    ckc_timer;
        struct kvm_s390_pgm_info pgm;
        struct gmap *gmap;
+       /* backup location for the currently enabled gmap when scheduled out */
+       struct gmap *enabled_gmap;
        struct kvm_guestdbg_info_arch guestdbg;
        unsigned long pfault_token;
        unsigned long pfault_select;
@@@ -631,6 -644,14 +645,14 @@@ struct sie_page2 
        u8 reserved900[0x1000 - 0x900];                 /* 0x0900 */
  } __packed;
  
+ struct kvm_s390_vsie {
+       struct mutex mutex;
+       struct radix_tree_root addr_to_page;
+       int page_count;
+       int next;
+       struct page *pages[KVM_MAX_VCPUS];
+ };
  struct kvm_arch{
        void *sca;
        int use_esca;
        int user_cpu_state_ctrl;
        int user_sigp;
        int user_stsi;
+       int user_instr0;
        struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
        wait_queue_head_t ipte_wq;
        int ipte_lock_count;
        struct mutex ipte_mutex;
+       struct ratelimit_state sthyi_limit;
        spinlock_t start_stop_lock;
        struct sie_page2 *sie_page2;
        struct kvm_s390_cpu_model model;
        struct kvm_s390_crypto crypto;
+       struct kvm_s390_vsie vsie;
        u64 epoch;
+       /* subset of available cpu features enabled by user space */
+       DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
  };
  
  #define KVM_HVA_ERR_BAD               (-1UL)
@@@ -6,10 -6,11 +6,11 @@@
  
  typedef struct {
        cpumask_t cpu_attach_mask;
 -      atomic_t attach_count;
 +      atomic_t flush_count;
        unsigned int flush_mm;
-       spinlock_t list_lock;
+       spinlock_t pgtable_lock;
        struct list_head pgtable_list;
+       spinlock_t gmap_lock;
        struct list_head gmap_list;
        unsigned long asce;
        unsigned long asce_limit;
        unsigned int use_skey:1;
  } mm_context_t;
  
- #define INIT_MM_CONTEXT(name)                                               \
-       .context.list_lock    = __SPIN_LOCK_UNLOCKED(name.context.list_lock), \
-       .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list),    \
+ #define INIT_MM_CONTEXT(name)                                            \
+       .context.pgtable_lock =                                            \
+                       __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock),   \
+       .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
+       .context.gmap_lock = __SPIN_LOCK_UNLOCKED(name.context.gmap_lock), \
        .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list),
  
  static inline int tprot(unsigned long addr)
  static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
  {
-       spin_lock_init(&mm->context.list_lock);
+       spin_lock_init(&mm->context.pgtable_lock);
        INIT_LIST_HEAD(&mm->context.pgtable_list);
+       spin_lock_init(&mm->context.gmap_lock);
        INIT_LIST_HEAD(&mm->context.gmap_list);
        cpumask_clear(&mm->context.cpu_attach_mask);
 -      atomic_set(&mm->context.attach_count, 0);
 +      atomic_set(&mm->context.flush_count, 0);
        mm->context.flush_mm = 0;
  #ifdef CONFIG_PGSTE
        mm->context.alloc_pgste = page_table_allocate_pgste;
@@@ -90,12 -91,15 +91,12 @@@ static inline void switch_mm(struct mm_
        S390_lowcore.user_asce = next->context.asce;
        if (prev == next)
                return;
 -      if (MACHINE_HAS_TLB_LC)
 -              cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
 +      cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
 +      cpumask_set_cpu(cpu, mm_cpumask(next));
        /* Clear old ASCE by loading the kernel ASCE. */
        __ctl_load(S390_lowcore.kernel_asce, 1, 1);
        __ctl_load(S390_lowcore.kernel_asce, 7, 7);
 -      atomic_inc(&next->context.attach_count);
 -      atomic_dec(&prev->context.attach_count);
 -      if (MACHINE_HAS_TLB_LC)
 -              cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask);
 +      cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask);
  }
  
  #define finish_arch_post_lock_switch finish_arch_post_lock_switch
@@@ -107,9 -111,10 +108,9 @@@ static inline void finish_arch_post_loc
        load_kernel_asce();
        if (mm) {
                preempt_disable();
 -              while (atomic_read(&mm->context.attach_count) >> 16)
 +              while (atomic_read(&mm->context.flush_count))
                        cpu_relax();
  
 -              cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
                if (mm->context.flush_mm)
                        __tlb_flush_mm(mm);
                preempt_enable();
@@@ -124,6 -129,7 +125,6 @@@ static inline void activate_mm(struct m
                                 struct mm_struct *next)
  {
        switch_mm(prev, next, current);
 -      cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
        set_user_asce(next);
  }
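
The single list_lock that used to protect both lists is split above into pgtable_lock for pgtable_list and gmap_lock for gmap_list, so gmap registration no longer contends with page-table list handling. A hypothetical illustration of the resulting locking pattern (the real gmap code may differ in detail):

#include <linux/list.h>
#include <linux/mm_types.h>
#include <linux/spinlock.h>
#include <asm/gmap.h>

/* Hypothetical: additions to gmap_list take only gmap_lock, leaving
 * pgtable_lock free for page-table list traffic.
 */
static void example_add_gmap(struct mm_struct *mm, struct gmap *gmap)
{
	spin_lock(&mm->context.gmap_lock);
	list_add(&gmap->list, &mm->context.gmap_list);
	spin_unlock(&mm->context.gmap_lock);
}
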
  
@@@ -21,7 -21,6 +21,7 @@@
  #define HPAGE_SIZE    (1UL << HPAGE_SHIFT)
  #define HPAGE_MASK    (~(HPAGE_SIZE - 1))
  #define HUGETLB_PAGE_ORDER    (HPAGE_SHIFT - PAGE_SHIFT)
 +#define HUGE_MAX_HSTATE               2
  
  #define ARCH_HAS_SETCLEAR_HUGE_PTE
  #define ARCH_HAS_HUGE_PTE_TYPE
  #include <asm/setup.h>
  #ifndef __ASSEMBLY__
  
 +void __storage_key_init_range(unsigned long start, unsigned long end);
 +
  static inline void storage_key_init_range(unsigned long start, unsigned long end)
  {
 -#if PAGE_DEFAULT_KEY
 -      __storage_key_init_range(start, end);
 -#endif
 +      if (PAGE_DEFAULT_KEY)
 +              __storage_key_init_range(start, end);
  }
  
  #define clear_page(page)      memset((page), 0, PAGE_SIZE)
@@@ -111,13 -109,14 +111,14 @@@ static inline unsigned char page_get_st
  
  static inline int page_reset_referenced(unsigned long addr)
  {
-       unsigned int ipm;
+       int cc;
  
        asm volatile(
                "       rrbe    0,%1\n"
                "       ipm     %0\n"
-               : "=d" (ipm) : "a" (addr) : "cc");
-       return !!(ipm & 0x20000000);
+               "       srl     %0,28\n"
+               : "=d" (cc) : "a" (addr) : "cc");
+       return cc;
  }
  
  /* Bits int the storage key */
@@@ -148,6 -147,8 +149,8 @@@ static inline int devmem_is_allowed(uns
  #define virt_to_page(kaddr)   pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
  #define page_to_phys(page)    (page_to_pfn(page) << PAGE_SHIFT)
  #define virt_addr_valid(kaddr)        pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+ #define pfn_to_virt(pfn)      __va((pfn) << PAGE_SHIFT)
+ #define page_to_virt(page)    pfn_to_virt(page_to_pfn(page))
  
  #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \
                                 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
  #include <linux/mm_types.h>
  #include <linux/page-flags.h>
  #include <linux/radix-tree.h>
 +#include <linux/atomic.h>
  #include <asm/bug.h>
  #include <asm/page.h>
  
 -extern pgd_t swapper_pg_dir[] __attribute__ ((aligned (4096)));
 +extern pgd_t swapper_pg_dir[];
  extern void paging_init(void);
  extern void vmem_map_init(void);
 +pmd_t *vmem_pmd_alloc(void);
 +pte_t *vmem_pte_alloc(void);
 +
 +enum {
 +      PG_DIRECT_MAP_4K = 0,
 +      PG_DIRECT_MAP_1M,
 +      PG_DIRECT_MAP_2G,
 +      PG_DIRECT_MAP_MAX
 +};
 +
 +extern atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX];
 +
 +static inline void update_page_count(int level, long count)
 +{
 +      if (IS_ENABLED(CONFIG_PROC_FS))
 +              atomic_long_add(count, &direct_pages_count[level]);
 +}
 +
 +struct seq_file;
 +void arch_report_meminfo(struct seq_file *m);
  
  /*
   * The S390 doesn't have any external MMU info: the kernel page
@@@ -242,8 -221,8 +242,8 @@@ static inline int is_module_addr(void *
   * swap                               .11..ttttt.0
   * prot-none, clean, old      .11.xx0000.1
   * prot-none, clean, young    .11.xx0001.1
 - * prot-none, dirty, old      .10.xx0010.1
 - * prot-none, dirty, young    .10.xx0011.1
 + * prot-none, dirty, old      .11.xx0010.1
 + * prot-none, dirty, young    .11.xx0011.1
   * read-only, clean, old      .11.xx0100.1
   * read-only, clean, young    .01.xx0101.1
   * read-only, dirty, old      .11.xx0110.1
  /* Bits in the region table entry */
  #define _REGION_ENTRY_ORIGIN  ~0xfffUL/* region/segment table origin      */
  #define _REGION_ENTRY_PROTECT 0x200   /* region protection bit            */
+ #define _REGION_ENTRY_OFFSET  0xc0    /* region table offset              */
  #define _REGION_ENTRY_INVALID 0x20    /* invalid region table entry       */
  #define _REGION_ENTRY_TYPE_MASK       0x0c    /* region/segment table type mask   */
  #define _REGION_ENTRY_TYPE_R1 0x0c    /* region first table type          */
  #define _REGION3_ENTRY                (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH)
  #define _REGION3_ENTRY_EMPTY  (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID)
  
 -#define _REGION3_ENTRY_LARGE  0x400   /* RTTE-format control, large page  */
 -#define _REGION3_ENTRY_RO     0x200   /* page protection bit              */
 +#define _REGION3_ENTRY_ORIGIN_LARGE ~0x7fffffffUL /* large page address            */
 +#define _REGION3_ENTRY_ORIGIN  ~0x7ffUL/* region third table origin        */
 +
 +#define _REGION3_ENTRY_DIRTY  0x2000  /* SW region dirty bit */
 +#define _REGION3_ENTRY_YOUNG  0x1000  /* SW region young bit */
 +#define _REGION3_ENTRY_LARGE  0x0400  /* RTTE-format control, large page  */
 +#define _REGION3_ENTRY_READ   0x0002  /* SW region read bit */
 +#define _REGION3_ENTRY_WRITE  0x0001  /* SW region write bit */
 +
 +#ifdef CONFIG_MEM_SOFT_DIRTY
 +#define _REGION3_ENTRY_SOFT_DIRTY 0x4000 /* SW region soft dirty bit */
 +#else
 +#define _REGION3_ENTRY_SOFT_DIRTY 0x0000 /* SW region soft dirty bit */
 +#endif
 +
 +#define _REGION_ENTRY_BITS     0xfffffffffffff227UL
 +#define _REGION_ENTRY_BITS_LARGE 0xffffffff8000fe27UL
  
  /* Bits in the segment table entry */
  #define _SEGMENT_ENTRY_BITS   0xfffffffffffffe33UL
  #define _SEGMENT_ENTRY_DIRTY  0x2000  /* SW segment dirty bit */
  #define _SEGMENT_ENTRY_YOUNG  0x1000  /* SW segment young bit */
  #define _SEGMENT_ENTRY_LARGE  0x0400  /* STE-format control, large page */
 -#define _SEGMENT_ENTRY_READ   0x0002  /* SW segment read bit */
 -#define _SEGMENT_ENTRY_WRITE  0x0001  /* SW segment write bit */
 +#define _SEGMENT_ENTRY_WRITE  0x0002  /* SW segment write bit */
 +#define _SEGMENT_ENTRY_READ   0x0001  /* SW segment read bit */
  
  #ifdef CONFIG_MEM_SOFT_DIRTY
  #define _SEGMENT_ENTRY_SOFT_DIRTY 0x4000 /* SW segment soft dirty bit */
  #endif
  
  /*
 - * Segment table entry encoding (R = read-only, I = invalid, y = young bit):
 - *                            dy..R...I...rw
 + * Segment table and region3 table entry encoding
 + * (R = read-only, I = invalid, y = young bit):
 + *                            dy..R...I...wr
   * prot-none, clean, old      00..1...1...00
   * prot-none, clean, young    01..1...1...00
   * prot-none, dirty, old      10..1...1...00
   * prot-none, dirty, young    11..1...1...00
 - * read-only, clean, old      00..1...1...10
 - * read-only, clean, young    01..1...0...10
 - * read-only, dirty, old      10..1...1...10
 - * read-only, dirty, young    11..1...0...10
 + * read-only, clean, old      00..1...1...01
 + * read-only, clean, young    01..1...0...01
 + * read-only, dirty, old      10..1...1...01
 + * read-only, dirty, young    11..1...0...01
   * read-write, clean, old     00..1...1...11
   * read-write, clean, young   01..1...0...11
   * read-write, dirty, old     10..0...1...11
  #define PGSTE_GC_BIT  0x0002000000000000UL
  #define PGSTE_UC_BIT  0x0000800000000000UL    /* user dirty (migration) */
  #define PGSTE_IN_BIT  0x0000400000000000UL    /* IPTE notify bit */
+ #define PGSTE_VSIE_BIT        0x0000200000000000UL    /* ref'd in a shadow table */
  
  /* Guest Page State used for virtualization */
  #define _PGSTE_GPS_ZERO               0x0000000080000000UL
  /*
   * Page protection definitions.
   */
 -#define PAGE_NONE     __pgprot(_PAGE_PRESENT | _PAGE_INVALID)
 +#define PAGE_NONE     __pgprot(_PAGE_PRESENT | _PAGE_INVALID | _PAGE_PROTECT)
  #define PAGE_READ     __pgprot(_PAGE_PRESENT | _PAGE_READ | \
                                 _PAGE_INVALID | _PAGE_PROTECT)
  #define PAGE_WRITE    __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \
                                 _SEGMENT_ENTRY_READ)
  #define SEGMENT_WRITE __pgprot(_SEGMENT_ENTRY_READ | \
                                 _SEGMENT_ENTRY_WRITE)
 +#define SEGMENT_KERNEL        __pgprot(_SEGMENT_ENTRY |       \
 +                               _SEGMENT_ENTRY_LARGE | \
 +                               _SEGMENT_ENTRY_READ |  \
 +                               _SEGMENT_ENTRY_WRITE | \
 +                               _SEGMENT_ENTRY_YOUNG | \
 +                               _SEGMENT_ENTRY_DIRTY)
 +#define SEGMENT_KERNEL_RO __pgprot(_SEGMENT_ENTRY |   \
 +                               _SEGMENT_ENTRY_LARGE | \
 +                               _SEGMENT_ENTRY_READ |  \
 +                               _SEGMENT_ENTRY_YOUNG | \
 +                               _SEGMENT_ENTRY_PROTECT)
 +
 +/*
 + * Region3 entry (large page) protection definitions.
 + */
 +
 +#define REGION3_KERNEL        __pgprot(_REGION_ENTRY_TYPE_R3 | \
 +                               _REGION3_ENTRY_LARGE |  \
 +                               _REGION3_ENTRY_READ |   \
 +                               _REGION3_ENTRY_WRITE |  \
 +                               _REGION3_ENTRY_YOUNG |  \
 +                               _REGION3_ENTRY_DIRTY)
 +#define REGION3_KERNEL_RO __pgprot(_REGION_ENTRY_TYPE_R3 | \
 +                                 _REGION3_ENTRY_LARGE |  \
 +                                 _REGION3_ENTRY_READ |   \
 +                                 _REGION3_ENTRY_YOUNG |  \
 +                                 _REGION_ENTRY_PROTECT)
  
  static inline int mm_has_pgste(struct mm_struct *mm)
  {
@@@ -488,53 -426,6 +490,53 @@@ static inline int mm_use_skey(struct mm
        return 0;
  }
  
 +static inline void csp(unsigned int *ptr, unsigned int old, unsigned int new)
 +{
 +      register unsigned long reg2 asm("2") = old;
 +      register unsigned long reg3 asm("3") = new;
 +      unsigned long address = (unsigned long)ptr | 1;
 +
 +      asm volatile(
 +              "       csp     %0,%3"
 +              : "+d" (reg2), "+m" (*ptr)
 +              : "d" (reg3), "d" (address)
 +              : "cc");
 +}
 +
 +static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new)
 +{
 +      register unsigned long reg2 asm("2") = old;
 +      register unsigned long reg3 asm("3") = new;
 +      unsigned long address = (unsigned long)ptr | 1;
 +
 +      asm volatile(
 +              "       .insn   rre,0xb98a0000,%0,%3"
 +              : "+d" (reg2), "+m" (*ptr)
 +              : "d" (reg3), "d" (address)
 +              : "cc");
 +}
 +
 +#define CRDTE_DTT_PAGE                0x00UL
 +#define CRDTE_DTT_SEGMENT     0x10UL
 +#define CRDTE_DTT_REGION3     0x14UL
 +#define CRDTE_DTT_REGION2     0x18UL
 +#define CRDTE_DTT_REGION1     0x1cUL
 +
 +static inline void crdte(unsigned long old, unsigned long new,
 +                       unsigned long table, unsigned long dtt,
 +                       unsigned long address, unsigned long asce)
 +{
 +      register unsigned long reg2 asm("2") = old;
 +      register unsigned long reg3 asm("3") = new;
 +      register unsigned long reg4 asm("4") = table | dtt;
 +      register unsigned long reg5 asm("5") = address;
 +
 +      asm volatile(".insn rrf,0xb98f0000,%0,%2,%4,0"
 +                   : "+d" (reg2)
 +                   : "d" (reg3), "d" (reg4), "d" (reg5), "a" (asce)
 +                   : "memory", "cc");
 +}
 +
  /*
   * pgd/pmd/pte query functions
   */
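
As a usage note, the csp() helper introduced above is exactly what the reworked __pmdp_csp() later in this patch reduces to; a minimal sketch of invalidating a segment-table entry with it (the second 32-bit word of the 8-byte entry carries the invalid bit, hence the +1):

/* Sketch: compare-and-swap-and-purge the entry into its invalid form,
 * the same pattern __pmdp_csp() uses further below.
 */
static inline void example_csp_invalidate(pmd_t *pmdp)
{
	csp((unsigned int *)pmdp + 1, pmd_val(*pmdp),
	    pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
}
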
@@@ -576,7 -467,7 +578,7 @@@ static inline int pud_none(pud_t pud
  {
        if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3)
                return 0;
 -      return (pud_val(pud) & _REGION_ENTRY_INVALID) != 0UL;
 +      return pud_val(pud) == _REGION3_ENTRY_EMPTY;
  }
  
  static inline int pud_large(pud_t pud)
        return !!(pud_val(pud) & _REGION3_ENTRY_LARGE);
  }
  
 +static inline unsigned long pud_pfn(pud_t pud)
 +{
 +      unsigned long origin_mask;
 +
 +      origin_mask = _REGION3_ENTRY_ORIGIN;
 +      if (pud_large(pud))
 +              origin_mask = _REGION3_ENTRY_ORIGIN_LARGE;
 +      return (pud_val(pud) & origin_mask) >> PAGE_SHIFT;
 +}
 +
 +static inline int pmd_large(pmd_t pmd)
 +{
 +      return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0;
 +}
 +
 +static inline int pmd_bad(pmd_t pmd)
 +{
 +      if (pmd_large(pmd))
 +              return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS_LARGE) != 0;
 +      return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
 +}
 +
  static inline int pud_bad(pud_t pud)
  {
 -      /*
 -       * With dynamic page table levels the pud can be a region table
 -       * entry or a segment table entry. Check for the bit that are
 -       * invalid for either table entry.
 -       */
 -      unsigned long mask =
 -              ~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INVALID &
 -              ~_REGION_ENTRY_TYPE_MASK & ~_REGION_ENTRY_LENGTH;
 -      return (pud_val(pud) & mask) != 0;
 +      if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3)
 +              return pmd_bad(__pmd(pud_val(pud)));
 +      if (pud_large(pud))
 +              return (pud_val(pud) & ~_REGION_ENTRY_BITS_LARGE) != 0;
 +      return (pud_val(pud) & ~_REGION_ENTRY_BITS) != 0;
  }
  
  static inline int pmd_present(pmd_t pmd)
@@@ -627,6 -500,11 +629,6 @@@ static inline int pmd_none(pmd_t pmd
        return pmd_val(pmd) == _SEGMENT_ENTRY_INVALID;
  }
  
 -static inline int pmd_large(pmd_t pmd)
 -{
 -      return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0;
 -}
 -
  static inline unsigned long pmd_pfn(pmd_t pmd)
  {
        unsigned long origin_mask;
        return (pmd_val(pmd) & origin_mask) >> PAGE_SHIFT;
  }
  
 -static inline int pmd_bad(pmd_t pmd)
 -{
 -      if (pmd_large(pmd))
 -              return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS_LARGE) != 0;
 -      return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
 -}
 -
  #define __HAVE_ARCH_PMD_WRITE
  static inline int pmd_write(pmd_t pmd)
  {
@@@ -1002,15 -887,26 +1004,26 @@@ static inline int ptep_set_access_flags
  void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t entry);
  void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
- void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+ void ptep_notify(struct mm_struct *mm, unsigned long addr,
+                pte_t *ptep, unsigned long bits);
+ int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr,
+                   pte_t *ptep, int prot, unsigned long bit);
  void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep , int reset);
  void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+                   pte_t *sptep, pte_t *tptep, pte_t pte);
+ void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
  
  bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
  int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
                          unsigned char key, bool nq);
- unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr);
+ int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+                              unsigned char key, unsigned char *oldkey,
+                              bool nq, bool mr, bool mc);
+ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr);
+ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+                         unsigned char *key);
  
  /*
   * Certain architectures need to do special things when PTEs
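
The storage-key helpers change calling convention here: get_guest_storage_key() now returns an error code and hands the key back through a pointer (it previously returned the key itself), and cond_set_guest_storage_key() exposes the SSKE-style nq/mr/mc controls. A minimal sketch of the new pattern, with hypothetical naming and arbitrary flag values:

#include <linux/mm_types.h>
#include <asm/pgtable.h>

static int example_update_guest_key(struct mm_struct *mm, unsigned long addr,
				    unsigned char new_key)
{
	unsigned char old_key;
	int rc;

	rc = get_guest_storage_key(mm, addr, &old_key);
	if (rc)
		return rc;
	/* nq/mr/mc mirror SSKE's controls; all false here is purely illustrative */
	return cond_set_guest_storage_key(mm, addr, new_key, &old_key,
					  false, false, false);
}
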
@@@ -1080,7 -976,6 +1093,7 @@@ static inline pmd_t *pmd_offset(pud_t *
  #define pte_page(x) pfn_to_page(pte_pfn(x))
  
  #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd))
 +#define pud_page(pud) pfn_to_page(pud_pfn(pud))
  
  /* Find an entry in the lowest level page table.. */
  #define pte_offset(pmd, addr) ((pte_t *) pmd_deref(*(pmd)) + pte_index(addr))
  #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address)
  #define pte_unmap(pte) do { } while (0)
  
 -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
 -static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot)
 -{
 -      /*
 -       * pgprot is PAGE_NONE, PAGE_READ, or PAGE_WRITE (see __Pxxx / __Sxxx)
 -       * Convert to segment table entry format.
 -       */
 -      if (pgprot_val(pgprot) == pgprot_val(PAGE_NONE))
 -              return pgprot_val(SEGMENT_NONE);
 -      if (pgprot_val(pgprot) == pgprot_val(PAGE_READ))
 -              return pgprot_val(SEGMENT_READ);
 -      return pgprot_val(SEGMENT_WRITE);
 -}
 -
  static inline pmd_t pmd_wrprotect(pmd_t pmd)
  {
        pmd_val(pmd) &= ~_SEGMENT_ENTRY_WRITE;
@@@ -1124,56 -1033,6 +1137,56 @@@ static inline pmd_t pmd_mkdirty(pmd_t p
        return pmd;
  }
  
 +static inline pud_t pud_wrprotect(pud_t pud)
 +{
 +      pud_val(pud) &= ~_REGION3_ENTRY_WRITE;
 +      pud_val(pud) |= _REGION_ENTRY_PROTECT;
 +      return pud;
 +}
 +
 +static inline pud_t pud_mkwrite(pud_t pud)
 +{
 +      pud_val(pud) |= _REGION3_ENTRY_WRITE;
 +      if (pud_large(pud) && !(pud_val(pud) & _REGION3_ENTRY_DIRTY))
 +              return pud;
 +      pud_val(pud) &= ~_REGION_ENTRY_PROTECT;
 +      return pud;
 +}
 +
 +static inline pud_t pud_mkclean(pud_t pud)
 +{
 +      if (pud_large(pud)) {
 +              pud_val(pud) &= ~_REGION3_ENTRY_DIRTY;
 +              pud_val(pud) |= _REGION_ENTRY_PROTECT;
 +      }
 +      return pud;
 +}
 +
 +static inline pud_t pud_mkdirty(pud_t pud)
 +{
 +      if (pud_large(pud)) {
 +              pud_val(pud) |= _REGION3_ENTRY_DIRTY |
 +                              _REGION3_ENTRY_SOFT_DIRTY;
 +              if (pud_val(pud) & _REGION3_ENTRY_WRITE)
 +                      pud_val(pud) &= ~_REGION_ENTRY_PROTECT;
 +      }
 +      return pud;
 +}
 +
 +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
 +static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot)
 +{
 +      /*
 +       * pgprot is PAGE_NONE, PAGE_READ, or PAGE_WRITE (see __Pxxx / __Sxxx)
 +       * Convert to segment table entry format.
 +       */
 +      if (pgprot_val(pgprot) == pgprot_val(PAGE_NONE))
 +              return pgprot_val(SEGMENT_NONE);
 +      if (pgprot_val(pgprot) == pgprot_val(PAGE_READ))
 +              return pgprot_val(SEGMENT_READ);
 +      return pgprot_val(SEGMENT_WRITE);
 +}
 +
  static inline pmd_t pmd_mkyoung(pmd_t pmd)
  {
        if (pmd_large(pmd)) {
@@@ -1222,8 -1081,15 +1235,8 @@@ static inline pmd_t mk_pmd_phys(unsigne
  
  static inline void __pmdp_csp(pmd_t *pmdp)
  {
 -      register unsigned long reg2 asm("2") = pmd_val(*pmdp);
 -      register unsigned long reg3 asm("3") = pmd_val(*pmdp) |
 -                                             _SEGMENT_ENTRY_INVALID;
 -      register unsigned long reg4 asm("4") = ((unsigned long) pmdp) + 5;
 -
 -      asm volatile(
 -              "       csp %1,%3"
 -              : "=m" (*pmdp)
 -              : "d" (reg2), "d" (reg3), "d" (reg4), "m" (*pmdp) : "cc");
 +      csp((unsigned int *)pmdp + 1, pmd_val(*pmdp),
 +          pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
  }
  
  static inline void __pmdp_idte(unsigned long address, pmd_t *pmdp)
                : "cc" );
  }
  
 +static inline void __pudp_idte(unsigned long address, pud_t *pudp)
 +{
 +      unsigned long r3o;
 +
 +      r3o = (unsigned long) pudp - pud_index(address) * sizeof(pud_t);
 +      r3o |= _ASCE_TYPE_REGION3;
 +      asm volatile(
 +              "       .insn   rrf,0xb98e0000,%2,%3,0,0"
 +              : "=m" (*pudp)
 +              : "m" (*pudp), "a" (r3o), "a" ((address & PUD_MASK))
 +              : "cc");
 +}
 +
  static inline void __pmdp_idte_local(unsigned long address, pmd_t *pmdp)
  {
        unsigned long sto;
                : "cc" );
  }
  
 +static inline void __pudp_idte_local(unsigned long address, pud_t *pudp)
 +{
 +      unsigned long r3o;
 +
 +      r3o = (unsigned long) pudp - pud_index(address) * sizeof(pud_t);
 +      r3o |= _ASCE_TYPE_REGION3;
 +      asm volatile(
 +              "       .insn   rrf,0xb98e0000,%2,%3,0,1"
 +              : "=m" (*pudp)
 +              : "m" (*pudp), "a" (r3o), "a" ((address & PUD_MASK))
 +              : "cc");
 +}
 +
  pmd_t pmdp_xchg_direct(struct mm_struct *, unsigned long, pmd_t *, pmd_t);
  pmd_t pmdp_xchg_lazy(struct mm_struct *, unsigned long, pmd_t *, pmd_t);
 +pud_t pudp_xchg_direct(struct mm_struct *, unsigned long, pud_t *, pud_t);
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  
@@@ -77,10 -77,7 +77,10 @@@ static inline void get_cpu_id(struct cp
        asm volatile("stidp %0" : "=Q" (*ptr));
  }
  
 -extern void s390_adjust_jiffies(void);
 +void s390_adjust_jiffies(void);
 +void s390_update_cpu_mhz(void);
 +void cpu_detect_mhz_feature(void);
 +
  extern const struct seq_operations cpuinfo_op;
  extern int sysctl_ieee_emulation_warnings;
  extern void execve_tail(void);
@@@ -112,6 -109,8 +112,8 @@@ struct thread_struct 
          unsigned long ksp;              /* kernel stack pointer             */
        mm_segment_t mm_segment;
        unsigned long gmap_addr;        /* address of last gmap fault. */
+       unsigned int gmap_write_flag;   /* gmap fault write indication */
+       unsigned int gmap_int_code;     /* int code of last gmap fault */
        unsigned int gmap_pfault;       /* signal of a pending guest pfault */
        struct per_regs per_user;       /* User specified PER registers */
        struct per_event per_event;     /* Cause of the last PER trap */
@@@ -236,18 -235,6 +238,18 @@@ void cpu_relax(void)
  
  #define cpu_relax_lowlatency()  barrier()
  
 +#define ECAG_CACHE_ATTRIBUTE  0
 +#define ECAG_CPU_ATTRIBUTE    1
 +
 +static inline unsigned long __ecag(unsigned int asi, unsigned char parm)
 +{
 +      unsigned long val;
 +
 +      asm volatile(".insn     rsy,0xeb000000004c,%0,0,0(%1)" /* ecag */
 +                   : "=d" (val) : "a" (asi << 8 | parm));
 +      return val;
 +}
 +
  static inline void psw_set_key(unsigned int key)
  {
        asm volatile("spka 0(%0)" : : "d" (key));
diff --combined arch/s390/kernel/diag.c
@@@ -162,6 -162,28 +162,30 @@@ int diag14(unsigned long rx, unsigned l
  }
  EXPORT_SYMBOL(diag14);
  
 -static inline int __diag204(unsigned long subcode, unsigned long size, void *addr)
++static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr)
+ {
 -      register unsigned long _subcode asm("0") = subcode;
++      register unsigned long _subcode asm("0") = *subcode;
+       register unsigned long _size asm("1") = size;
+       asm volatile(
+               "       diag    %2,%0,0x204\n"
 -              "0:\n"
++              "0:     nopr    %%r7\n"
+               EX_TABLE(0b,0b)
+               : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
 -      if (_subcode)
 -              return -1;
++      *subcode = _subcode;
+       return _size;
+ }
+ int diag204(unsigned long subcode, unsigned long size, void *addr)
+ {
+       diag_stat_inc(DIAG_STAT_X204);
 -      return __diag204(subcode, size, addr);
++      size = __diag204(&subcode, size, addr);
++      if (subcode)
++              return -1;
++      return size;
+ }
+ EXPORT_SYMBOL(diag204);
  /*
   * Diagnose 210: Get information about a virtual device
   */
@@@ -196,3 -218,18 +220,18 @@@ int diag210(struct diag210 *addr
        return ccode;
  }
  EXPORT_SYMBOL(diag210);
+ int diag224(void *ptr)
+ {
+       int rc = -EOPNOTSUPP;
+       diag_stat_inc(DIAG_STAT_X224);
+       asm volatile(
+               "       diag    %1,%2,0x224\n"
+               "0:     lhi     %0,0x0\n"
+               "1:\n"
+               EX_TABLE(0b,1b)
+               : "+d" (rc) :"d" (0), "d" (ptr) : "memory");
+       return rc;
+ }
+ EXPORT_SYMBOL(diag224);
@@@ -341,8 -341,6 +341,8 @@@ static int handle_mvpg_pei(struct kvm_v
  
  static int handle_partial_execution(struct kvm_vcpu *vcpu)
  {
 +      vcpu->stat.exit_pei++;
 +
        if (vcpu->arch.sie_block->ipa == 0xb254)        /* MVPG */
                return handle_mvpg_pei(vcpu);
        if (vcpu->arch.sie_block->ipa >> 8 == 0xae)     /* SIGP */
        return -EOPNOTSUPP;
  }
  
+ static int handle_operexc(struct kvm_vcpu *vcpu)
+ {
+       vcpu->stat.exit_operation_exception++;
+       trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa,
+                                     vcpu->arch.sie_block->ipb);
+       if (vcpu->arch.sie_block->ipa == 0xb256 &&
+           test_kvm_facility(vcpu->kvm, 74))
+               return handle_sthyi(vcpu);
+       if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0)
+               return -EOPNOTSUPP;
+       return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+ }
  int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
  {
+       int rc, per_rc = 0;
        if (kvm_is_ucontrol(vcpu->kvm))
                return -EOPNOTSUPP;
  
        case 0x18:
                return handle_noop(vcpu);
        case 0x04:
-               return handle_instruction(vcpu);
+               rc = handle_instruction(vcpu);
+               break;
        case 0x08:
                return handle_prog(vcpu);
        case 0x14:
                return handle_validity(vcpu);
        case 0x28:
                return handle_stop(vcpu);
+       case 0x2c:
+               rc = handle_operexc(vcpu);
+               break;
        case 0x38:
-               return handle_partial_execution(vcpu);
+               rc = handle_partial_execution(vcpu);
+               break;
        default:
                return -EOPNOTSUPP;
        }
+       /* process PER, also if the instruction is processed in user space */
+       if (vcpu->arch.sie_block->icptstatus & 0x02 &&
+           (!rc || rc == -EOPNOTSUPP))
+               per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu);
+       return per_rc ? per_rc : rc;
  }
diff --combined arch/s390/kvm/kvm-s390.c
  #include <linux/init.h>
  #include <linux/kvm.h>
  #include <linux/kvm_host.h>
+ #include <linux/mman.h>
  #include <linux/module.h>
  #include <linux/random.h>
  #include <linux/slab.h>
  #include <linux/timer.h>
  #include <linux/vmalloc.h>
+ #include <linux/bitmap.h>
  #include <asm/asm-offsets.h>
  #include <asm/lowcore.h>
 -#include <asm/etr.h>
 +#include <asm/stp.h>
  #include <asm/pgtable.h>
  #include <asm/gmap.h>
  #include <asm/nmi.h>
  #include <asm/switch_to.h>
  #include <asm/isc.h>
  #include <asm/sclp.h>
 -#include <asm/etr.h>
+ #include <asm/cpacf.h>
++#include <asm/timex.h>
  #include "kvm-s390.h"
  #include "gaccess.h"
  
@@@ -61,9 -65,9 +65,10 @@@ struct kvm_stats_debugfs_item debugfs_e
        { "exit_external_request", VCPU_STAT(exit_external_request) },
        { "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) },
        { "exit_instruction", VCPU_STAT(exit_instruction) },
 +      { "exit_pei", VCPU_STAT(exit_pei) },
        { "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
        { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
+       { "exit_operation_exception", VCPU_STAT(exit_operation_exception) },
        { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
        { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
        { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
@@@ -94,6 -98,8 +99,8 @@@
        { "instruction_stsi", VCPU_STAT(instruction_stsi) },
        { "instruction_stfl", VCPU_STAT(instruction_stfl) },
        { "instruction_tprot", VCPU_STAT(instruction_tprot) },
+       { "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
+       { "instruction_sie", VCPU_STAT(instruction_sie) },
        { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
        { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
        { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
        { NULL }
  };
  
+ /* allow nested virtualization in KVM (if enabled by user space) */
+ static int nested;
+ module_param(nested, int, S_IRUGO);
+ MODULE_PARM_DESC(nested, "Nested virtualization support");
  /* upper facilities limit for kvm */
  unsigned long kvm_s390_fac_list_mask[16] = {
        0xffe6000000000000UL,
@@@ -131,7 -142,13 +143,13 @@@ unsigned long kvm_s390_fac_list_mask_si
        return ARRAY_SIZE(kvm_s390_fac_list_mask);
  }
  
+ /* available cpu features supported by kvm */
+ static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+ /* available subfunctions indicated via query / "test bit" */
+ static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
  static struct gmap_notifier gmap_notifier;
+ static struct gmap_notifier vsie_gmap_notifier;
  debug_info_t *kvm_s390_dbf;
  
  /* Section: not file related */
@@@ -141,7 -158,8 +159,8 @@@ int kvm_arch_hardware_enable(void
        return 0;
  }
  
- static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address);
+ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+                             unsigned long end);
  
  /*
   * This callback is executed during stop_machine(). All CPUs are therefore
@@@ -163,6 -181,8 +182,8 @@@ static int kvm_clock_sync(struct notifi
                        vcpu->arch.sie_block->epoch -= *delta;
                        if (vcpu->arch.cputm_enabled)
                                vcpu->arch.cputm_start += *delta;
+                       if (vcpu->arch.vsie_block)
+                               vcpu->arch.vsie_block->epoch -= *delta;
                }
        }
        return NOTIFY_OK;
@@@ -175,7 -195,9 +196,9 @@@ static struct notifier_block kvm_clock_
  int kvm_arch_hardware_setup(void)
  {
        gmap_notifier.notifier_call = kvm_gmap_notifier;
-       gmap_register_ipte_notifier(&gmap_notifier);
+       gmap_register_pte_notifier(&gmap_notifier);
+       vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
+       gmap_register_pte_notifier(&vsie_gmap_notifier);
        atomic_notifier_chain_register(&s390_epoch_delta_notifier,
                                       &kvm_clock_notifier);
        return 0;
  
  void kvm_arch_hardware_unsetup(void)
  {
-       gmap_unregister_ipte_notifier(&gmap_notifier);
+       gmap_unregister_pte_notifier(&gmap_notifier);
+       gmap_unregister_pte_notifier(&vsie_gmap_notifier);
        atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
                                         &kvm_clock_notifier);
  }
  
 -              etr_ptff(kvm_s390_available_subfunc.ptff, ETR_PTFF_QAF);
+ static void allow_cpu_feat(unsigned long nr)
+ {
+       set_bit_inv(nr, kvm_s390_available_cpu_feat);
+ }
+ static inline int plo_test_bit(unsigned char nr)
+ {
+       register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
+       int cc = 3; /* subfunction not available */
+       asm volatile(
+               /* Parameter registers are ignored for "test bit" */
+               "       plo     0,0,0,0(0)\n"
+               "       ipm     %0\n"
+               "       srl     %0,28\n"
+               : "=d" (cc)
+               : "d" (r0)
+               : "cc");
+       return cc == 0;
+ }
+ static void kvm_s390_cpu_feat_init(void)
+ {
+       int i;
+       for (i = 0; i < 256; ++i) {
+               if (plo_test_bit(i))
+                       kvm_s390_available_subfunc.plo[i >> 3] |= 0x80 >> (i & 7);
+       }
+       if (test_facility(28)) /* TOD-clock steering */
++              ptff(kvm_s390_available_subfunc.ptff,
++                   sizeof(kvm_s390_available_subfunc.ptff),
++                   PTFF_QAF);
+       if (test_facility(17)) { /* MSA */
+               __cpacf_query(CPACF_KMAC, kvm_s390_available_subfunc.kmac);
+               __cpacf_query(CPACF_KMC, kvm_s390_available_subfunc.kmc);
+               __cpacf_query(CPACF_KM, kvm_s390_available_subfunc.km);
+               __cpacf_query(CPACF_KIMD, kvm_s390_available_subfunc.kimd);
+               __cpacf_query(CPACF_KLMD, kvm_s390_available_subfunc.klmd);
+       }
+       if (test_facility(76)) /* MSA3 */
+               __cpacf_query(CPACF_PCKMO, kvm_s390_available_subfunc.pckmo);
+       if (test_facility(77)) { /* MSA4 */
+               __cpacf_query(CPACF_KMCTR, kvm_s390_available_subfunc.kmctr);
+               __cpacf_query(CPACF_KMF, kvm_s390_available_subfunc.kmf);
+               __cpacf_query(CPACF_KMO, kvm_s390_available_subfunc.kmo);
+               __cpacf_query(CPACF_PCC, kvm_s390_available_subfunc.pcc);
+       }
+       if (test_facility(57)) /* MSA5 */
+               __cpacf_query(CPACF_PPNO, kvm_s390_available_subfunc.ppno);
+       if (MACHINE_HAS_ESOP)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
+       /*
+        * We need SIE support, ESOP (PROT_READ protection for gmap_shadow),
+        * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing).
+        */
+       if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao ||
+           !test_facility(3) || !nested)
+               return;
+       allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2);
+       if (sclp.has_64bscao)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO);
+       if (sclp.has_siif)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF);
+       if (sclp.has_gpere)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE);
+       if (sclp.has_gsls)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS);
+       if (sclp.has_ib)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB);
+       if (sclp.has_cei)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI);
+       if (sclp.has_ibs)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS);
+       /*
+        * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make
+        * all skey handling functions read/set the skey from the PGSTE
+        * instead of the real storage key.
+        *
+        * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will make
+        * pages being detected as preserved although they are resident.
+        *
+        * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will
+        * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY.
+        *
+        * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and
+        * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be
+        * correctly shadowed. We can do that for the PGSTE but not for PTE.I.
+        *
+        * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We
+        * cannot easily shadow the SCA because of the ipte lock.
+        */
+ }
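kvm_s390_cpu_feat_init() stores the PLO "test bit" results MSB-first, the same layout the CPACF query blocks use: subfunction i ends up in byte i >> 3 under mask 0x80 >> (i & 7). A small hedged helper that reads a bit back out of such a query bitmap; the function name is illustrative and not part of this merge:

/* Hedged sketch: test bit nr in an MSB-first query bitmap such as
 * kvm_s390_available_subfunc.plo. */
static inline int subfunc_bit_set(const unsigned char *query, int nr)
{
	return (query[nr >> 3] & (0x80 >> (nr & 7))) != 0;
}

/* e.g. subfunc_bit_set(kvm_s390_available_subfunc.plo, 3) */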
  int kvm_arch_init(void *opaque)
  {
        kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long));
                return -ENOMEM;
        }
  
+       kvm_s390_cpu_feat_init();
        /* Register floating interrupt controller interface. */
        return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC);
  }
@@@ -244,6 -364,7 +367,7 @@@ int kvm_vm_ioctl_check_extension(struc
        case KVM_CAP_S390_USER_STSI:
        case KVM_CAP_S390_SKEYS:
        case KVM_CAP_S390_IRQ_STATE:
+       case KVM_CAP_S390_USER_INSTR0:
                r = 1;
                break;
        case KVM_CAP_S390_MEM_OP:
                break;
        case KVM_CAP_NR_VCPUS:
        case KVM_CAP_MAX_VCPUS:
-               r = sclp.has_esca ? KVM_S390_ESCA_CPU_SLOTS
-                                 : KVM_S390_BSCA_CPU_SLOTS;
+               r = KVM_S390_BSCA_CPU_SLOTS;
+               if (sclp.has_esca && sclp.has_64bscao)
+                       r = KVM_S390_ESCA_CPU_SLOTS;
                break;
        case KVM_CAP_NR_MEMSLOTS:
                r = KVM_USER_MEM_SLOTS;
@@@ -335,6 -457,16 +460,16 @@@ out
        return r;
  }
  
+ static void icpt_operexc_on_all_vcpus(struct kvm *kvm)
+ {
+       unsigned int i;
+       struct kvm_vcpu *vcpu;
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC, vcpu);
+       }
+ }
  static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
  {
        int r;
                break;
        case KVM_CAP_S390_VECTOR_REGISTERS:
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus)) {
+               if (kvm->created_vcpus) {
                        r = -EBUSY;
                } else if (MACHINE_HAS_VX) {
                        set_kvm_facility(kvm->arch.model.fac_mask, 129);
        case KVM_CAP_S390_RI:
                r = -EINVAL;
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus)) {
+               if (kvm->created_vcpus) {
                        r = -EBUSY;
                } else if (test_facility(64)) {
                        set_kvm_facility(kvm->arch.model.fac_mask, 64);
                kvm->arch.user_stsi = 1;
                r = 0;
                break;
+       case KVM_CAP_S390_USER_INSTR0:
+               VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_INSTR0");
+               kvm->arch.user_instr0 = 1;
+               icpt_operexc_on_all_vcpus(kvm);
+               r = 0;
+               break;
        default:
                r = -EINVAL;
                break;
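KVM_CAP_S390_USER_INSTR0 is a VM-wide capability, so a VMM enables it with KVM_ENABLE_CAP on the VM file descriptor; icpt_operexc_on_all_vcpus() then queues KVM_REQ_ICPT_OPEREXC so that already-created vCPUs set ICTL_OPEREXC as well. A hedged user-space sketch, assuming the usual /dev/kvm VM fd and omitting error handling:

/* Hedged sketch: enabling the new capability from a VMM. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_user_instr0(int vm_fd)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_S390_USER_INSTR0 };

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}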
@@@ -418,21 -556,23 +559,23 @@@ static int kvm_s390_set_mem_control(str
        unsigned int idx;
        switch (attr->attr) {
        case KVM_S390_VM_MEM_ENABLE_CMMA:
-               /* enable CMMA only for z10 and later (EDAT_1) */
-               ret = -EINVAL;
-               if (!MACHINE_IS_LPAR || !MACHINE_HAS_EDAT1)
+               ret = -ENXIO;
+               if (!sclp.has_cmma)
                        break;
  
                ret = -EBUSY;
                VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support");
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus) == 0) {
+               if (!kvm->created_vcpus) {
                        kvm->arch.use_cmma = 1;
                        ret = 0;
                }
                mutex_unlock(&kvm->lock);
                break;
        case KVM_S390_VM_MEM_CLR_CMMA:
+               ret = -ENXIO;
+               if (!sclp.has_cmma)
+                       break;
                ret = -EINVAL;
                if (!kvm->arch.use_cmma)
                        break;
                if (!new_limit)
                        return -EINVAL;
  
-               /* gmap_alloc takes last usable address */
+               /* gmap_create takes last usable address */
                if (new_limit != KVM_S390_NO_MEM_LIMIT)
                        new_limit -= 1;
  
                ret = -EBUSY;
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus) == 0) {
-                       /* gmap_alloc will round the limit up */
-                       struct gmap *new = gmap_alloc(current->mm, new_limit);
+               if (!kvm->created_vcpus) {
+                       /* gmap_create will round the limit up */
+                       struct gmap *new = gmap_create(current->mm, new_limit);
  
                        if (!new) {
                                ret = -ENOMEM;
                        } else {
-                               gmap_free(kvm->arch.gmap);
+                               gmap_remove(kvm->arch.gmap);
                                new->private = kvm;
                                kvm->arch.gmap = new;
                                ret = 0;
@@@ -644,7 -784,7 +787,7 @@@ static int kvm_s390_set_processor(struc
        int ret = 0;
  
        mutex_lock(&kvm->lock);
-       if (atomic_read(&kvm->online_vcpus)) {
+       if (kvm->created_vcpus) {
                ret = -EBUSY;
                goto out;
        }
                kvm->arch.model.cpuid = proc->cpuid;
                lowest_ibc = sclp.ibc >> 16 & 0xfff;
                unblocked_ibc = sclp.ibc & 0xfff;
 -              if (lowest_ibc) {
 +              if (lowest_ibc && proc->ibc) {
                        if (proc->ibc > unblocked_ibc)
                                kvm->arch.model.ibc = unblocked_ibc;
                        else if (proc->ibc < lowest_ibc)
@@@ -676,6 -816,39 +819,39 @@@ out
        return ret;
  }
  
+ static int kvm_s390_set_processor_feat(struct kvm *kvm,
+                                      struct kvm_device_attr *attr)
+ {
+       struct kvm_s390_vm_cpu_feat data;
+       int ret = -EBUSY;
+       if (copy_from_user(&data, (void __user *)attr->addr, sizeof(data)))
+               return -EFAULT;
+       if (!bitmap_subset((unsigned long *) data.feat,
+                          kvm_s390_available_cpu_feat,
+                          KVM_S390_VM_CPU_FEAT_NR_BITS))
+               return -EINVAL;
+       mutex_lock(&kvm->lock);
+       if (!atomic_read(&kvm->online_vcpus)) {
+               bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat,
+                           KVM_S390_VM_CPU_FEAT_NR_BITS);
+               ret = 0;
+       }
+       mutex_unlock(&kvm->lock);
+       return ret;
+ }
+ static int kvm_s390_set_processor_subfunc(struct kvm *kvm,
+                                         struct kvm_device_attr *attr)
+ {
+       /*
+        * Once supported by kernel + hw, we have to store the subfunctions
+        * in kvm->arch and remember that user space configured them.
+        */
+       return -ENXIO;
+ }
  static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
  {
        int ret = -ENXIO;
        case KVM_S390_VM_CPU_PROCESSOR:
                ret = kvm_s390_set_processor(kvm, attr);
                break;
+       case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+               ret = kvm_s390_set_processor_feat(kvm, attr);
+               break;
+       case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+               ret = kvm_s390_set_processor_subfunc(kvm, attr);
+               break;
        }
        return ret;
  }
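The new processor-feature attribute travels over the VM's KVM_SET_DEVICE_ATTR ioctl in the KVM_S390_VM_CPU_MODEL group and is refused with -EBUSY once vCPUs exist, so user space is expected to read KVM_S390_VM_CPU_MACHINE_FEAT first, mask the bitmap down, and write it back before creating vCPUs. A hedged sketch of the setter half; the helper name is illustrative:

/* Hedged sketch: configuring guest CPU features before vCPU creation. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_cpu_feat(int vm_fd, struct kvm_s390_vm_cpu_feat *feat)
{
	struct kvm_device_attr attr = {
		.group = KVM_S390_VM_CPU_MODEL,
		.attr  = KVM_S390_VM_CPU_PROCESSOR_FEAT,
		.addr  = (__u64)(unsigned long)feat,
	};

	return ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
}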
@@@ -732,6 -911,50 +914,50 @@@ out
        return ret;
  }
  
+ static int kvm_s390_get_processor_feat(struct kvm *kvm,
+                                      struct kvm_device_attr *attr)
+ {
+       struct kvm_s390_vm_cpu_feat data;
+       bitmap_copy((unsigned long *) data.feat, kvm->arch.cpu_feat,
+                   KVM_S390_VM_CPU_FEAT_NR_BITS);
+       if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
+               return -EFAULT;
+       return 0;
+ }
+ static int kvm_s390_get_machine_feat(struct kvm *kvm,
+                                    struct kvm_device_attr *attr)
+ {
+       struct kvm_s390_vm_cpu_feat data;
+       bitmap_copy((unsigned long *) data.feat,
+                   kvm_s390_available_cpu_feat,
+                   KVM_S390_VM_CPU_FEAT_NR_BITS);
+       if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
+               return -EFAULT;
+       return 0;
+ }
+ static int kvm_s390_get_processor_subfunc(struct kvm *kvm,
+                                         struct kvm_device_attr *attr)
+ {
+       /*
+        * Once we can actually configure subfunctions (kernel + hw support),
+        * we have to check if they were already set by user space, if so copy
+        * them from kvm->arch.
+        */
+       return -ENXIO;
+ }
+ static int kvm_s390_get_machine_subfunc(struct kvm *kvm,
+                                       struct kvm_device_attr *attr)
+ {
+       if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc,
+           sizeof(struct kvm_s390_vm_cpu_subfunc)))
+               return -EFAULT;
+       return 0;
+ }
  static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
  {
        int ret = -ENXIO;
        case KVM_S390_VM_CPU_MACHINE:
                ret = kvm_s390_get_machine(kvm, attr);
                break;
+       case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+               ret = kvm_s390_get_processor_feat(kvm, attr);
+               break;
+       case KVM_S390_VM_CPU_MACHINE_FEAT:
+               ret = kvm_s390_get_machine_feat(kvm, attr);
+               break;
+       case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+               ret = kvm_s390_get_processor_subfunc(kvm, attr);
+               break;
+       case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
+               ret = kvm_s390_get_machine_subfunc(kvm, attr);
+               break;
        }
        return ret;
  }
@@@ -803,6 -1038,8 +1041,8 @@@ static int kvm_s390_vm_has_attr(struct 
                switch (attr->attr) {
                case KVM_S390_VM_MEM_ENABLE_CMMA:
                case KVM_S390_VM_MEM_CLR_CMMA:
+                       ret = sclp.has_cmma ? 0 : -ENXIO;
+                       break;
                case KVM_S390_VM_MEM_LIMIT_SIZE:
                        ret = 0;
                        break;
                switch (attr->attr) {
                case KVM_S390_VM_CPU_PROCESSOR:
                case KVM_S390_VM_CPU_MACHINE:
+               case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+               case KVM_S390_VM_CPU_MACHINE_FEAT:
+               case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
                        ret = 0;
                        break;
+               /* configuring subfunctions is not supported yet */
+               case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
                default:
                        ret = -ENXIO;
                        break;
@@@ -858,7 -1100,6 +1103,6 @@@ static long kvm_s390_get_skeys(struct k
  {
        uint8_t *keys;
        uint64_t hva;
-       unsigned long curkey;
        int i, r = 0;
  
        if (args->flags != 0)
        if (!keys)
                return -ENOMEM;
  
+       down_read(&current->mm->mmap_sem);
        for (i = 0; i < args->count; i++) {
                hva = gfn_to_hva(kvm, args->start_gfn + i);
                if (kvm_is_error_hva(hva)) {
                        r = -EFAULT;
-                       goto out;
+                       break;
                }
  
-               curkey = get_guest_storage_key(current->mm, hva);
-               if (IS_ERR_VALUE(curkey)) {
-                       r = curkey;
-                       goto out;
-               }
-               keys[i] = curkey;
+               r = get_guest_storage_key(current->mm, hva, &keys[i]);
+               if (r)
+                       break;
+       }
+       up_read(&current->mm->mmap_sem);
+       if (!r) {
+               r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
+                                sizeof(uint8_t) * args->count);
+               if (r)
+                       r = -EFAULT;
        }
  
-       r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
-                        sizeof(uint8_t) * args->count);
-       if (r)
-               r = -EFAULT;
- out:
        kvfree(keys);
        return r;
  }
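After this rework the keys are gathered under mmap_sem and copied out in one chunk, so a single KVM_S390_GET_SKEYS call fails with -EFAULT as soon as one guest frame in the range has no valid mapping. A hedged user-space sketch of the ioctl; the caller is responsible for sizing the key buffer and the helper name is illustrative:

/* Hedged sketch: fetching storage keys for guest frames
 * [start_gfn, start_gfn + count). */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int get_skeys(int vm_fd, __u64 start_gfn, __u64 count, __u8 *keys)
{
	struct kvm_s390_skeys args = {
		.start_gfn     = start_gfn,
		.count         = count,		/* bounded by KVM_S390_SKEYS_MAX */
		.skeydata_addr = (__u64)(unsigned long)keys,
	};

	return ioctl(vm_fd, KVM_S390_GET_SKEYS, &args);
}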
@@@ -935,24 -1177,25 +1180,25 @@@ static long kvm_s390_set_skeys(struct k
        if (r)
                goto out;
  
+       down_read(&current->mm->mmap_sem);
        for (i = 0; i < args->count; i++) {
                hva = gfn_to_hva(kvm, args->start_gfn + i);
                if (kvm_is_error_hva(hva)) {
                        r = -EFAULT;
-                       goto out;
+                       break;
                }
  
                /* Lowest order bit is reserved */
                if (keys[i] & 0x01) {
                        r = -EINVAL;
-                       goto out;
+                       break;
                }
  
-               r = set_guest_storage_key(current->mm, hva,
-                                         (unsigned long)keys[i], 0);
+               r = set_guest_storage_key(current->mm, hva, keys[i], 0);
                if (r)
-                       goto out;
+                       break;
        }
+       up_read(&current->mm->mmap_sem);
  out:
        kvfree(keys);
        return r;
@@@ -1129,6 -1372,7 +1375,7 @@@ static void sca_dispose(struct kvm *kvm
  
  int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
  {
+       gfp_t alloc_flags = GFP_KERNEL;
        int i, rc;
        char debug_name[16];
        static unsigned long sca_offset;
  
        rc = -ENOMEM;
  
+       ratelimit_state_init(&kvm->arch.sthyi_limit, 5 * HZ, 500);
        kvm->arch.use_esca = 0; /* start with basic SCA */
+       if (!sclp.has_64bscao)
+               alloc_flags |= GFP_DMA;
        rwlock_init(&kvm->arch.sca_lock);
-       kvm->arch.sca = (struct bsca_block *) get_zeroed_page(GFP_KERNEL);
+       kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
        if (!kvm->arch.sca)
                goto out_err;
        spin_lock(&kvm_lock);
        memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask,
               S390_ARCH_FAC_LIST_SIZE_BYTE);
  
+       set_kvm_facility(kvm->arch.model.fac_mask, 74);
+       set_kvm_facility(kvm->arch.model.fac_list, 74);
        kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid();
        kvm->arch.model.ibc = sclp.ibc & 0x0fff;
  
                else
                        kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE,
                                                    sclp.hamax + 1);
-               kvm->arch.gmap = gmap_alloc(current->mm, kvm->arch.mem_limit - 1);
+               kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1);
                if (!kvm->arch.gmap)
                        goto out_err;
                kvm->arch.gmap->private = kvm;
        kvm->arch.epoch = 0;
  
        spin_lock_init(&kvm->arch.start_stop_lock);
+       kvm_s390_vsie_init(kvm);
        KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
  
        return 0;
@@@ -1245,7 -1497,7 +1500,7 @@@ void kvm_arch_vcpu_destroy(struct kvm_v
                sca_del_vcpu(vcpu);
  
        if (kvm_is_ucontrol(vcpu->kvm))
-               gmap_free(vcpu->arch.gmap);
+               gmap_remove(vcpu->arch.gmap);
  
        if (vcpu->kvm->arch.use_cmma)
                kvm_s390_vcpu_unsetup_cmma(vcpu);
@@@ -1278,16 -1530,17 +1533,17 @@@ void kvm_arch_destroy_vm(struct kvm *kv
        debug_unregister(kvm->arch.dbf);
        free_page((unsigned long)kvm->arch.sie_page2);
        if (!kvm_is_ucontrol(kvm))
-               gmap_free(kvm->arch.gmap);
+               gmap_remove(kvm->arch.gmap);
        kvm_s390_destroy_adapters(kvm);
        kvm_s390_clear_float_irqs(kvm);
+       kvm_s390_vsie_destroy(kvm);
        KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
  }
  
  /* Section: vcpu related */
  static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
  {
-       vcpu->arch.gmap = gmap_alloc(current->mm, -1UL);
+       vcpu->arch.gmap = gmap_create(current->mm, -1UL);
        if (!vcpu->arch.gmap)
                return -ENOMEM;
        vcpu->arch.gmap->private = vcpu->kvm;
@@@ -1396,7 -1649,7 +1652,7 @@@ static int sca_can_add_vcpu(struct kvm 
  
        if (id < KVM_S390_BSCA_CPU_SLOTS)
                return true;
-       if (!sclp.has_esca)
+       if (!sclp.has_esca || !sclp.has_64bscao)
                return false;
  
        mutex_lock(&kvm->lock);
@@@ -1537,7 -1790,7 +1793,7 @@@ void kvm_arch_vcpu_load(struct kvm_vcp
  
        save_access_regs(vcpu->arch.host_acrs);
        restore_access_regs(vcpu->run->s.regs.acrs);
-       gmap_enable(vcpu->arch.gmap);
+       gmap_enable(vcpu->arch.enabled_gmap);
        atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
        if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
                __start_cpu_timer_accounting(vcpu);
@@@ -1550,7 -1803,8 +1806,8 @@@ void kvm_arch_vcpu_put(struct kvm_vcpu 
        if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
                __stop_cpu_timer_accounting(vcpu);
        atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
-       gmap_disable(vcpu->arch.gmap);
+       vcpu->arch.enabled_gmap = gmap_get_enabled();
+       gmap_disable(vcpu->arch.enabled_gmap);
  
        /* Save guest register state */
        save_fpu_regs();
@@@ -1599,7 -1853,10 +1856,10 @@@ void kvm_arch_vcpu_postcreate(struct kv
                vcpu->arch.gmap = vcpu->kvm->arch.gmap;
                sca_add_vcpu(vcpu);
        }
+       if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0)
+               vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
+       /* make vcpu_load load the right gmap on the first trigger */
+       vcpu->arch.enabled_gmap = vcpu->arch.gmap;
  }
  
  static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
@@@ -1658,15 -1915,21 +1918,21 @@@ int kvm_arch_vcpu_setup(struct kvm_vcp
  
        kvm_s390_vcpu_setup_model(vcpu);
  
-       vcpu->arch.sie_block->ecb = 0x02;
+       /* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */
+       if (MACHINE_HAS_ESOP)
+               vcpu->arch.sie_block->ecb |= 0x02;
        if (test_kvm_facility(vcpu->kvm, 9))
                vcpu->arch.sie_block->ecb |= 0x04;
-       if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73))
+       if (test_kvm_facility(vcpu->kvm, 73))
                vcpu->arch.sie_block->ecb |= 0x10;
  
-       if (test_kvm_facility(vcpu->kvm, 8))
+       if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi)
                vcpu->arch.sie_block->ecb2 |= 0x08;
-       vcpu->arch.sie_block->eca   = 0xC1002000U;
+       vcpu->arch.sie_block->eca = 0x1002000U;
+       if (sclp.has_cei)
+               vcpu->arch.sie_block->eca |= 0x80000000U;
+       if (sclp.has_ib)
+               vcpu->arch.sie_block->eca |= 0x40000000U;
        if (sclp.has_siif)
                vcpu->arch.sie_block->eca |= 1;
        if (sclp.has_sigpif)
@@@ -1716,6 -1979,10 +1982,10 @@@ struct kvm_vcpu *kvm_arch_vcpu_create(s
        vcpu->arch.sie_block = &sie_page->sie_block;
        vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb;
  
+       /* the real guest size will always be smaller than msl */
+       vcpu->arch.sie_block->mso = 0;
+       vcpu->arch.sie_block->msl = sclp.hamax;
        vcpu->arch.sie_block->icpua = id;
        spin_lock_init(&vcpu->arch.local_int.lock);
        vcpu->arch.local_int.float_int = &kvm->arch.float_int;
@@@ -1784,16 -2051,25 +2054,25 @@@ void kvm_s390_sync_request(int req, str
        kvm_s390_vcpu_request(vcpu);
  }
  
- static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
+ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+                             unsigned long end)
  {
-       int i;
        struct kvm *kvm = gmap->private;
        struct kvm_vcpu *vcpu;
+       unsigned long prefix;
+       int i;
  
+       if (gmap_is_shadow(gmap))
+               return;
+       if (start >= 1UL << 31)
+               /* We are only interested in prefix pages */
+               return;
        kvm_for_each_vcpu(i, vcpu, kvm) {
                /* match against both prefix pages */
-               if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) {
-                       VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
+               prefix = kvm_s390_get_prefix(vcpu);
+               if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) {
+                       VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx",
+                                  start, end);
                        kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu);
                }
        }
@@@ -2002,6 -2278,8 +2281,8 @@@ int kvm_arch_vcpu_ioctl_set_guest_debug
  
        if (dbg->control & ~VALID_GUESTDBG_FLAGS)
                return -EINVAL;
+       if (!sclp.has_gpere)
+               return -EINVAL;
  
        if (dbg->control & KVM_GUESTDBG_ENABLE) {
                vcpu->guest_debug = dbg->control;
@@@ -2070,16 -2348,16 +2351,16 @@@ retry
                return 0;
        /*
         * We use MMU_RELOAD just to re-arm the ipte notifier for the
-        * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
+        * guest prefix page. gmap_mprotect_notify will wait on the ptl lock.
         * This ensures that the ipte instruction for this request has
         * already finished. We might race against a second unmapper that
         * wants to set the blocking bit. Let's just retry the request loop.
         */
        if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
                int rc;
-               rc = gmap_ipte_notify(vcpu->arch.gmap,
-                                     kvm_s390_get_prefix(vcpu),
-                                     PAGE_SIZE * 2);
+               rc = gmap_mprotect_notify(vcpu->arch.gmap,
+                                         kvm_s390_get_prefix(vcpu),
+                                         PAGE_SIZE * 2, PROT_WRITE);
                if (rc)
                        return rc;
                goto retry;
                goto retry;
        }
  
+       if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) {
+               vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
+               goto retry;
+       }
        /* nothing to do, just clear the request */
        clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
  
@@@ -2362,14 -2645,14 +2648,14 @@@ static int __vcpu_run(struct kvm_vcpu *
                 * guest_enter and guest_exit should be no uaccess.
                 */
                local_irq_disable();
-               __kvm_guest_enter();
+               guest_enter_irqoff();
                __disable_cpu_timer_accounting(vcpu);
                local_irq_enable();
                exit_reason = sie64a(vcpu->arch.sie_block,
                                     vcpu->run->s.regs.gprs);
                local_irq_disable();
                __enable_cpu_timer_accounting(vcpu);
-               __kvm_guest_exit();
+               guest_exit_irqoff();
                local_irq_enable();
                vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
  
@@@ -2598,6 -2881,8 +2884,8 @@@ static void __disable_ibs_on_all_vcpus(
  
  static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
  {
+       if (!sclp.has_ibs)
+               return;
        kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu);
        kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu);
  }
diff --combined arch/s390/mm/fault.c
@@@ -418,6 -418,8 +418,8 @@@ static inline int do_exception(struct p
                (struct gmap *) S390_lowcore.gmap : NULL;
        if (gmap) {
                current->thread.gmap_addr = address;
+               current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
+               current->thread.gmap_int_code = regs->int_code & 0xffff;
                address = __gmap_translate(gmap, address);
                if (address == -EFAULT) {
                        fault = VM_FAULT_BADMAP;
@@@ -456,7 -458,7 +458,7 @@@ retry
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
 -      fault = handle_mm_fault(mm, vma, address, flags);
 +      fault = handle_mm_fault(vma, address, flags);
        /* No reason to continue if interrupted by SIGKILL. */
        if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
                fault = VM_FAULT_SIGNAL;
@@@ -624,7 -626,7 +626,7 @@@ void pfault_fini(void
        diag_stat_inc(DIAG_STAT_X258);
        asm volatile(
                "       diag    %0,0,0x258\n"
 -              "0:\n"
 +              "0:     nopr    %%r7\n"
                EX_TABLE(0b,0b)
                : : "a" (&refbk), "m" (refbk) : "cc");
  }
diff --combined arch/s390/mm/gmap.c
  #include <asm/gmap.h>
  #include <asm/tlb.h>
  
+ #define GMAP_SHADOW_FAKE_TABLE 1ULL
  /**
-  * gmap_alloc - allocate a guest address space
+  * gmap_alloc - allocate and initialize a guest address space
   * @mm: pointer to the parent mm_struct
   * @limit: maximum address of the gmap address space
   *
   * Returns a guest address space structure.
   */
- struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
+ static struct gmap *gmap_alloc(unsigned long limit)
  {
        struct gmap *gmap;
        struct page *page;
        if (!gmap)
                goto out;
        INIT_LIST_HEAD(&gmap->crst_list);
+       INIT_LIST_HEAD(&gmap->children);
+       INIT_LIST_HEAD(&gmap->pt_list);
        INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
        INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
+       INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
        spin_lock_init(&gmap->guest_table_lock);
-       gmap->mm = mm;
+       spin_lock_init(&gmap->shadow_lock);
+       atomic_set(&gmap->ref_count, 1);
        page = alloc_pages(GFP_KERNEL, 2);
        if (!page)
                goto out_free;
@@@ -70,9 -76,6 +76,6 @@@
        gmap->asce = atype | _ASCE_TABLE_LENGTH |
                _ASCE_USER_BITS | __pa(table);
        gmap->asce_end = limit;
-       down_write(&mm->mmap_sem);
-       list_add(&gmap->list, &mm->context.gmap_list);
-       up_write(&mm->mmap_sem);
        return gmap;
  
  out_free:
  out:
        return NULL;
  }
- EXPORT_SYMBOL_GPL(gmap_alloc);
+ /**
+  * gmap_create - create a guest address space
+  * @mm: pointer to the parent mm_struct
+  * @limit: maximum size of the gmap address space
+  *
+  * Returns a guest address space structure.
+  */
+ struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
+ {
+       struct gmap *gmap;
+       gmap = gmap_alloc(limit);
+       if (!gmap)
+               return NULL;
+       gmap->mm = mm;
+       spin_lock(&mm->context.gmap_lock);
+       list_add_rcu(&gmap->list, &mm->context.gmap_list);
+       spin_unlock(&mm->context.gmap_lock);
+       return gmap;
+ }
+ EXPORT_SYMBOL_GPL(gmap_create);
  
  static void gmap_flush_tlb(struct gmap *gmap)
  {
        if (MACHINE_HAS_IDTE)
 -              __tlb_flush_asce(gmap->mm, gmap->asce);
 +              __tlb_flush_idte(gmap->asce);
        else
                __tlb_flush_global();
  }
@@@ -114,31 -138,117 +138,117 @@@ static void gmap_radix_tree_free(struc
        } while (nr > 0);
  }
  
+ static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
+ {
+       struct gmap_rmap *rmap, *rnext, *head;
+       struct radix_tree_iter iter;
+       unsigned long indices[16];
+       unsigned long index;
+       void **slot;
+       int i, nr;
+       /* A radix tree is freed by deleting all of its entries */
+       index = 0;
+       do {
+               nr = 0;
+               radix_tree_for_each_slot(slot, root, &iter, index) {
+                       indices[nr] = iter.index;
+                       if (++nr == 16)
+                               break;
+               }
+               for (i = 0; i < nr; i++) {
+                       index = indices[i];
+                       head = radix_tree_delete(root, index);
+                       gmap_for_each_rmap_safe(rmap, rnext, head)
+                               kfree(rmap);
+               }
+       } while (nr > 0);
+ }
  /**
   * gmap_free - free a guest address space
   * @gmap: pointer to the guest address space structure
+  *
+  * No locks required. There are no references to this gmap anymore.
   */
- void gmap_free(struct gmap *gmap)
static void gmap_free(struct gmap *gmap)
  {
        struct page *page, *next;
  
-       /* Flush tlb. */
-       if (MACHINE_HAS_IDTE)
-               __tlb_flush_idte(gmap->asce);
-       else
-               __tlb_flush_global();
+       /* Flush tlb of all gmaps (if not already done for shadows) */
+       if (!(gmap_is_shadow(gmap) && gmap->removed))
+               gmap_flush_tlb(gmap);
        /* Free all segment & region tables. */
        list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
                __free_pages(page, 2);
        gmap_radix_tree_free(&gmap->guest_to_host);
        gmap_radix_tree_free(&gmap->host_to_guest);
-       down_write(&gmap->mm->mmap_sem);
-       list_del(&gmap->list);
-       up_write(&gmap->mm->mmap_sem);
+       /* Free additional data for a shadow gmap */
+       if (gmap_is_shadow(gmap)) {
+               /* Free all page tables. */
+               list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
+                       page_table_free_pgste(page);
+               gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
+               /* Release reference to the parent */
+               gmap_put(gmap->parent);
+       }
        kfree(gmap);
  }
- EXPORT_SYMBOL_GPL(gmap_free);
+ /**
+  * gmap_get - increase reference counter for guest address space
+  * @gmap: pointer to the guest address space structure
+  *
+  * Returns the gmap pointer
+  */
+ struct gmap *gmap_get(struct gmap *gmap)
+ {
+       atomic_inc(&gmap->ref_count);
+       return gmap;
+ }
+ EXPORT_SYMBOL_GPL(gmap_get);
+ /**
+  * gmap_put - decrease reference counter for guest address space
+  * @gmap: pointer to the guest address space structure
+  *
+  * If the reference counter reaches zero the guest address space is freed.
+  */
+ void gmap_put(struct gmap *gmap)
+ {
+       if (atomic_dec_return(&gmap->ref_count) == 0)
+               gmap_free(gmap);
+ }
+ EXPORT_SYMBOL_GPL(gmap_put);
+ /**
+  * gmap_remove - remove a guest address space but do not free it yet
+  * @gmap: pointer to the guest address space structure
+  */
+ void gmap_remove(struct gmap *gmap)
+ {
+       struct gmap *sg, *next;
+       /* Remove all shadow gmaps linked to this gmap */
+       if (!list_empty(&gmap->children)) {
+               spin_lock(&gmap->shadow_lock);
+               list_for_each_entry_safe(sg, next, &gmap->children, list) {
+                       list_del(&sg->list);
+                       gmap_put(sg);
+               }
+               spin_unlock(&gmap->shadow_lock);
+       }
+       /* Remove gmap from the pre-mm list */
+       spin_lock(&gmap->mm->context.gmap_lock);
+       list_del_rcu(&gmap->list);
+       spin_unlock(&gmap->mm->context.gmap_lock);
+       synchronize_rcu();
+       /* Put reference */
+       gmap_put(gmap);
+ }
+ EXPORT_SYMBOL_GPL(gmap_remove);
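Taken together, gmap_create()/gmap_remove() and gmap_get()/gmap_put() replace the old gmap_alloc()/gmap_free() pair: creation returns a structure holding one reference, gmap_remove() unlinks it from the mm (dropping any shadow children) and puts that reference, and gmap_free() only runs once every additional holder has called gmap_put(). A hedged lifecycle sketch built from the API above; the surrounding function is purely illustrative:

/* Hedged sketch of the gmap reference-counting lifecycle. */
static int example_gmap_lifecycle(struct mm_struct *mm, unsigned long limit)
{
	struct gmap *g;

	g = gmap_create(mm, limit);	/* returns with ref_count == 1 */
	if (!g)
		return -ENOMEM;
	gmap_get(g);			/* extra reference, e.g. held by a shadow user */
	gmap_remove(g);			/* unlink from the mm, drop the creation reference */
	gmap_put(g);			/* last reference gone: gmap_free() runs */
	return 0;
}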
  
  /**
   * gmap_enable - switch primary space to the guest address space
@@@ -160,6 -270,17 +270,17 @@@ void gmap_disable(struct gmap *gmap
  }
  EXPORT_SYMBOL_GPL(gmap_disable);
  
+ /**
+  * gmap_get_enabled - get a pointer to the currently enabled gmap
+  *
+  * Returns a pointer to the currently enabled gmap, or NULL if none is enabled.
+  */
+ struct gmap *gmap_get_enabled(void)
+ {
+       return (struct gmap *) S390_lowcore.gmap;
+ }
+ EXPORT_SYMBOL_GPL(gmap_get_enabled);
  /*
   * gmap_alloc_table is assumed to be called with mmap_sem held
   */
@@@ -175,7 -296,7 +296,7 @@@ static int gmap_alloc_table(struct gma
                return -ENOMEM;
        new = (unsigned long *) page_to_phys(page);
        crst_table_init(new, init);
-       spin_lock(&gmap->mm->page_table_lock);
+       spin_lock(&gmap->guest_table_lock);
        if (*table & _REGION_ENTRY_INVALID) {
                list_add(&page->lru, &gmap->crst_list);
                *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
                page->index = gaddr;
                page = NULL;
        }
-       spin_unlock(&gmap->mm->page_table_lock);
+       spin_unlock(&gmap->guest_table_lock);
        if (page)
                __free_pages(page, 2);
        return 0;
@@@ -219,6 -340,7 +340,7 @@@ static int __gmap_unlink_by_vmaddr(stru
        unsigned long *entry;
        int flush = 0;
  
+       BUG_ON(gmap_is_shadow(gmap));
        spin_lock(&gmap->guest_table_lock);
        entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
        if (entry) {
@@@ -258,6 -380,7 +380,7 @@@ int gmap_unmap_segment(struct gmap *gma
        unsigned long off;
        int flush;
  
+       BUG_ON(gmap_is_shadow(gmap));
        if ((to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || to + len < to)
@@@ -289,6 -412,7 +412,7 @@@ int gmap_map_segment(struct gmap *gmap
        unsigned long off;
        int flush;
  
+       BUG_ON(gmap_is_shadow(gmap));
        if ((from | to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || from + len < from || to + len < to ||
@@@ -326,6 -450,8 +450,8 @@@ EXPORT_SYMBOL_GPL(gmap_map_segment)
   * This function does not establish potentially missing page table entries.
   * The mmap_sem of the mm that belongs to the address space must be held
   * when this function gets called.
+  *
+  * Note: Can also be called for shadow gmaps.
   */
  unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
  {
  
        vmaddr = (unsigned long)
                radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
+       /* Note: guest_to_host is empty for a shadow gmap */
        return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
  }
  EXPORT_SYMBOL_GPL(__gmap_translate);
@@@ -369,11 -496,13 +496,13 @@@ void gmap_unlink(struct mm_struct *mm, 
        struct gmap *gmap;
        int flush;
  
-       list_for_each_entry(gmap, &mm->context.gmap_list, list) {
+       rcu_read_lock();
+       list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
                flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
                if (flush)
                        gmap_flush_tlb(gmap);
        }
+       rcu_read_unlock();
  }
  
  /**
@@@ -397,6 -526,7 +526,7 @@@ int __gmap_link(struct gmap *gmap, unsi
        pmd_t *pmd;
        int rc;
  
+       BUG_ON(gmap_is_shadow(gmap));
        /* Create higher level tables in the gmap page table */
        table = gmap->table;
        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
        VM_BUG_ON(pgd_none(*pgd));
        pud = pud_offset(pgd, vmaddr);
        VM_BUG_ON(pud_none(*pud));
 +      /* large puds cannot yet be handled */
 +      if (pud_large(*pud))
 +              return -EFAULT;
        pmd = pmd_offset(pud, vmaddr);
        VM_BUG_ON(pmd_none(*pmd));
        /* large pmds cannot yet be handled */
@@@ -552,116 -679,1412 +682,1412 @@@ static LIST_HEAD(gmap_notifier_list)
  static DEFINE_SPINLOCK(gmap_notifier_lock);
  
  /**
-  * gmap_register_ipte_notifier - register a pte invalidation callback
+  * gmap_register_pte_notifier - register a pte invalidation callback
   * @nb: pointer to the gmap notifier block
   */
- void gmap_register_ipte_notifier(struct gmap_notifier *nb)
+ void gmap_register_pte_notifier(struct gmap_notifier *nb)
  {
        spin_lock(&gmap_notifier_lock);
-       list_add(&nb->list, &gmap_notifier_list);
+       list_add_rcu(&nb->list, &gmap_notifier_list);
        spin_unlock(&gmap_notifier_lock);
  }
- EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
+ EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
  
  /**
-  * gmap_unregister_ipte_notifier - remove a pte invalidation callback
+  * gmap_unregister_pte_notifier - remove a pte invalidation callback
   * @nb: pointer to the gmap notifier block
   */
- void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
+ void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
  {
        spin_lock(&gmap_notifier_lock);
-       list_del_init(&nb->list);
+       list_del_rcu(&nb->list);
        spin_unlock(&gmap_notifier_lock);
+       synchronize_rcu();
+ }
+ EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
+ /**
+  * gmap_call_notifier - call all registered invalidation callbacks
+  * @gmap: pointer to guest mapping meta data structure
+  * @start: start virtual address in the guest address space
+  * @end: end virtual address in the guest address space
+  */
+ static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
+                              unsigned long end)
+ {
+       struct gmap_notifier *nb;
+       list_for_each_entry(nb, &gmap_notifier_list, list)
+               nb->notifier_call(gmap, start, end);
+ }
+ /**
+  * gmap_table_walk - walk the gmap page tables
+  * @gmap: pointer to guest mapping meta data structure
+  * @gaddr: virtual address in the guest address space
+  * @level: page table level to stop at
+  *
+  * Returns a table entry pointer for the given guest address and @level
+  * @level=0 : returns a pointer to a page table entry (or NULL)
+  * @level=1 : returns a pointer to a segment table entry (or NULL)
+  * @level=2 : returns a pointer to a region-3 table entry (or NULL)
+  * @level=3 : returns a pointer to a region-2 table entry (or NULL)
+  * @level=4 : returns a pointer to a region-1 table entry (or NULL)
+  *
+  * Returns NULL if the gmap page tables could not be walked to the
+  * requested level.
+  *
+  * Note: Can also be called for shadow gmaps.
+  */
+ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
+                                            unsigned long gaddr, int level)
+ {
+       unsigned long *table;
+       if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
+               return NULL;
+       if (gmap_is_shadow(gmap) && gmap->removed)
+               return NULL;
+       if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11)))
+               return NULL;
+       table = gmap->table;
+       switch (gmap->asce & _ASCE_TYPE_MASK) {
+       case _ASCE_TYPE_REGION1:
+               table += (gaddr >> 53) & 0x7ff;
+               if (level == 4)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               /* Fallthrough */
+       case _ASCE_TYPE_REGION2:
+               table += (gaddr >> 42) & 0x7ff;
+               if (level == 3)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               /* Fallthrough */
+       case _ASCE_TYPE_REGION3:
+               table += (gaddr >> 31) & 0x7ff;
+               if (level == 2)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               /* Fallthrough */
+       case _ASCE_TYPE_SEGMENT:
+               table += (gaddr >> 20) & 0x7ff;
+               if (level == 1)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+               table += (gaddr >> 12) & 0xff;
+       }
+       return table;
+ }
+ /**
+  * gmap_pte_op_walk - walk the gmap page table, get the page table lock
+  *                  and return the pte pointer
+  * @gmap: pointer to guest mapping meta data structure
+  * @gaddr: virtual address in the guest address space
+  * @ptl: pointer to the spinlock pointer
+  *
+  * Returns a pointer to the locked pte for a guest address, or NULL
+  *
+  * Note: Can also be called for shadow gmaps.
+  */
+ static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
+                              spinlock_t **ptl)
+ {
+       unsigned long *table;
+       if (gmap_is_shadow(gmap))
+               spin_lock(&gmap->guest_table_lock);
+       /* Walk the gmap page table, lock and get pte pointer */
+       table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
+       if (!table || *table & _SEGMENT_ENTRY_INVALID) {
+               if (gmap_is_shadow(gmap))
+                       spin_unlock(&gmap->guest_table_lock);
+               return NULL;
+       }
+       if (gmap_is_shadow(gmap)) {
+               *ptl = &gmap->guest_table_lock;
+               return pte_offset_map((pmd_t *) table, gaddr);
+       }
+       return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
+ }
+ /**
+  * gmap_pte_op_fixup - force a page in and connect the gmap page table
+  * @gmap: pointer to guest mapping meta data structure
+  * @gaddr: virtual address in the guest address space
+  * @vmaddr: address in the host process address space
+  * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+  *
+  * Returns 0 if the caller can retry __gmap_translate (might fail again),
+  * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
+  * up or connecting the gmap page table.
+  */
+ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
+                            unsigned long vmaddr, int prot)
+ {
+       struct mm_struct *mm = gmap->mm;
+       unsigned int fault_flags;
+       bool unlocked = false;
+       BUG_ON(gmap_is_shadow(gmap));
+       fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
+       if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
+               return -EFAULT;
+       if (unlocked)
+               /* lost mmap_sem, caller has to retry __gmap_translate */
+               return 0;
+       /* Connect the page tables */
+       return __gmap_link(gmap, gaddr, vmaddr);
  }
- EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
  
  /**
-  * gmap_ipte_notify - mark a range of ptes for invalidation notification
+  * gmap_pte_op_end - release the page table lock
+  * @ptl: pointer to the spinlock pointer
+  */
+ static void gmap_pte_op_end(spinlock_t *ptl)
+ {
+       spin_unlock(ptl);
+ }
+ /*
+  * gmap_protect_range - remove access rights to memory and set pgste bits
   * @gmap: pointer to guest mapping meta data structure
   * @gaddr: virtual address in the guest address space
   * @len: size of area
+  * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+  * @bits: pgste notification bits to set
+  *
+  * Returns 0 if successfully protected, -ENOMEM if out of memory and
+  * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
+  *
+  * Called with sg->mm->mmap_sem in read.
   *
-  * Returns 0 if for each page in the given range a gmap mapping exists and
-  * the invalidation notification could be set. If the gmap mapping is missing
-  * for one or more pages -EFAULT is returned. If no memory could be allocated
-  * -ENOMEM is returned. This function establishes missing page table entries.
+  * Note: Can also be called for shadow gmaps.
   */
- int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
+ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
+                             unsigned long len, int prot, unsigned long bits)
  {
-       unsigned long addr;
+       unsigned long vmaddr;
        spinlock_t *ptl;
        pte_t *ptep;
-       bool unlocked;
-       int rc = 0;
+       int rc;
+       while (len) {
+               rc = -EAGAIN;
+               ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+               if (ptep) {
+                       rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits);
+                       gmap_pte_op_end(ptl);
+               }
+               if (rc) {
+                       vmaddr = __gmap_translate(gmap, gaddr);
+                       if (IS_ERR_VALUE(vmaddr))
+                               return vmaddr;
+                       rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
+                       if (rc)
+                               return rc;
+                       continue;
+               }
+               gaddr += PAGE_SIZE;
+               len -= PAGE_SIZE;
+       }
+       return 0;
+ }
  
-       if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
+ /**
+  * gmap_mprotect_notify - change access rights for a range of ptes and
+  *                        call the notifier if any pte changes again
+  * @gmap: pointer to guest mapping meta data structure
+  * @gaddr: virtual address in the guest address space
+  * @len: size of area
+  * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+  *
+  * Returns 0 if for each page in the given range a gmap mapping exists,
+  * the new access rights could be set and the notifier could be armed.
+  * If the gmap mapping is missing for one or more pages -EFAULT is
+  * returned. If no memory could be allocated -ENOMEM is returned.
+  * This function establishes missing page table entries.
+  */
+ int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
+                        unsigned long len, int prot)
+ {
+       int rc;
+       if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
+               return -EINVAL;
+       if (!MACHINE_HAS_ESOP && prot == PROT_READ)
                return -EINVAL;
        down_read(&gmap->mm->mmap_sem);
-       while (len) {
-               unlocked = false;
-               /* Convert gmap address and connect the page tables */
-               addr = __gmap_translate(gmap, gaddr);
-               if (IS_ERR_VALUE(addr)) {
-                       rc = addr;
-                       break;
+       rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT);
+       up_read(&gmap->mm->mmap_sem);
+       return rc;
+ }
+ EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
+ /**
+  * gmap_read_table - get an unsigned long value from a guest page table using
+  *                   absolute addressing, without marking the page referenced.
+  * @gmap: pointer to guest mapping meta data structure
+  * @gaddr: virtual address in the guest address space
+  * @val: pointer to the unsigned long value to return
+  *
+  * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
+  * if reading using the virtual address failed.
+  *
+  * Called with gmap->mm->mmap_sem in read.
+  */
+ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
+ {
+       unsigned long address, vmaddr;
+       spinlock_t *ptl;
+       pte_t *ptep, pte;
+       int rc;
+       while (1) {
+               rc = -EAGAIN;
+               ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+               if (ptep) {
+                       pte = *ptep;
+                       if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
+                               address = pte_val(pte) & PAGE_MASK;
+                               address += gaddr & ~PAGE_MASK;
+                               *val = *(unsigned long *) address;
+                               pte_val(*ptep) |= _PAGE_YOUNG;
+                               /* Do *NOT* clear the _PAGE_INVALID bit! */
+                               rc = 0;
+                       }
+                       gmap_pte_op_end(ptl);
                }
-               /* Get the page mapped */
-               if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
-                                    &unlocked)) {
-                       rc = -EFAULT;
+               if (!rc)
+                       break;
+               vmaddr = __gmap_translate(gmap, gaddr);
+               if (IS_ERR_VALUE(vmaddr)) {
+                       rc = vmaddr;
                        break;
                }
-               /* While trying to map mmap_sem got unlocked. Let us retry */
-               if (unlocked)
-                       continue;
-               rc = __gmap_link(gmap, gaddr, addr);
+               rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
                if (rc)
                        break;
-               /* Walk the process page table, lock and get pte pointer */
-               ptep = get_locked_pte(gmap->mm, addr, &ptl);
-               VM_BUG_ON(!ptep);
-               /* Set notification bit in the pgste of the pte */
-               if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
-                       ptep_set_notify(gmap->mm, addr, ptep);
-                       gaddr += PAGE_SIZE;
-                       len -= PAGE_SIZE;
-               }
-               pte_unmap_unlock(ptep, ptl);
        }
-       up_read(&gmap->mm->mmap_sem);
        return rc;
  }
- EXPORT_SYMBOL_GPL(gmap_ipte_notify);
+ EXPORT_SYMBOL_GPL(gmap_read_table);
  
  /**
-  * ptep_notify - call all invalidation callbacks for a specific pte.
-  * @mm: pointer to the process mm_struct
-  * @addr: virtual address in the process address space
-  * @pte: pointer to the page table entry
+  * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
+  * @sg: pointer to the shadow guest address space structure
+  * @vmaddr: vm address associated with the rmap
+  * @rmap: pointer to the rmap structure
   *
-  * This function is assumed to be called with the page table lock held
-  * for the pte to notify.
+  * Called with the sg->guest_table_lock
   */
- void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
+ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
+                                   struct gmap_rmap *rmap)
  {
-       unsigned long offset, gaddr;
-       unsigned long *table;
-       struct gmap_notifier *nb;
-       struct gmap *gmap;
+       void **slot;
  
-       offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
-       offset = offset * (4096 / sizeof(pte_t));
-       spin_lock(&gmap_notifier_lock);
-       list_for_each_entry(gmap, &mm->context.gmap_list, list) {
-               table = radix_tree_lookup(&gmap->host_to_guest,
-                                         vmaddr >> PMD_SHIFT);
-               if (!table)
+       BUG_ON(!gmap_is_shadow(sg));
+       slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
+       if (slot) {
+               rmap->next = radix_tree_deref_slot_protected(slot,
+                                                       &sg->guest_table_lock);
+               radix_tree_replace_slot(slot, rmap);
+       } else {
+               rmap->next = NULL;
+               radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
+                                 rmap);
+       }
+ }
+ /**
+  * gmap_protect_rmap - modify access rights to memory and create an rmap
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow gmap
+  * @paddr: address in the parent guest address space
+  * @len: length of the memory area to protect
+  * @prot: indicates access rights: none, read-only or read-write
+  *
+  * Returns 0 if successfully protected and the rmap was created, -ENOMEM
+  * if out of memory and -EFAULT if paddr is invalid.
+  */
+ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
+                            unsigned long paddr, unsigned long len, int prot)
+ {
+       struct gmap *parent;
+       struct gmap_rmap *rmap;
+       unsigned long vmaddr;
+       spinlock_t *ptl;
+       pte_t *ptep;
+       int rc;
+       BUG_ON(!gmap_is_shadow(sg));
+       parent = sg->parent;
+       while (len) {
+               vmaddr = __gmap_translate(parent, paddr);
+               if (IS_ERR_VALUE(vmaddr))
+                       return vmaddr;
+               rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+               if (!rmap)
+                       return -ENOMEM;
+               rmap->raddr = raddr;
+               rc = radix_tree_preload(GFP_KERNEL);
+               if (rc) {
+                       kfree(rmap);
+                       return rc;
+               }
+               rc = -EAGAIN;
+               ptep = gmap_pte_op_walk(parent, paddr, &ptl);
+               if (ptep) {
+                       spin_lock(&sg->guest_table_lock);
+                       rc = ptep_force_prot(parent->mm, paddr, ptep, prot,
+                                            PGSTE_VSIE_BIT);
+                       if (!rc)
+                               gmap_insert_rmap(sg, vmaddr, rmap);
+                       spin_unlock(&sg->guest_table_lock);
+                       gmap_pte_op_end(ptl);
+               }
+               radix_tree_preload_end();
+               if (rc) {
+                       kfree(rmap);
+                       rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+                       if (rc)
+                               return rc;
                        continue;
-               gaddr = __gmap_segment_gaddr(table) + offset;
-               list_for_each_entry(nb, &gmap_notifier_list, list)
-                       nb->notifier_call(gmap, gaddr);
+               }
+               paddr += PAGE_SIZE;
+               len -= PAGE_SIZE;
        }
-       spin_unlock(&gmap_notifier_lock);
+       return 0;
+ }
+ #define _SHADOW_RMAP_MASK     0x7
+ #define _SHADOW_RMAP_REGION1  0x5
+ #define _SHADOW_RMAP_REGION2  0x4
+ #define _SHADOW_RMAP_REGION3  0x3
+ #define _SHADOW_RMAP_SEGMENT  0x2
+ #define _SHADOW_RMAP_PGTABLE  0x1
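
The low bits of an rmap address encode which shadow table level the rmap protects; gmap_shadow_notify() further below uses exactly this to dispatch to the matching unshadow function. A sketch of the convention (helper names are illustrative only):

/* raddr is table aligned, so the low bits are free to carry the level */
static inline unsigned long example_rmap_raddr(unsigned long raddr,
                                               unsigned long level)
{
        return raddr | (level & _SHADOW_RMAP_MASK);
}

static inline unsigned long example_rmap_level(unsigned long rmap_raddr)
{
        return rmap_raddr & _SHADOW_RMAP_MASK;  /* e.g. _SHADOW_RMAP_SEGMENT */
}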
+ /**
+  * gmap_idte_one - invalidate a single region or segment table entry
+  * @asce: region or segment table *origin* + table-type bits
+  * @vaddr: virtual address to identify the table entry to flush
+  *
+  * The invalid bit of a single region or segment table entry is set
+  * and the associated TLB entries depending on the entry are flushed.
+  * The table-type of the @asce identifies the portion of the @vaddr
+  * that is used as the invalidation index.
+  */
+ static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
+ {
+       asm volatile(
+               "       .insn   rrf,0xb98e0000,%0,%1,0,0"
+               : : "a" (asce), "a" (vaddr) : "cc", "memory");
+ }
+ /**
+  * gmap_unshadow_page - remove a page from a shadow page table
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow guest address space
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
+ {
+       unsigned long *table;
+       BUG_ON(!gmap_is_shadow(sg));
+       table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
+       if (!table || *table & _PAGE_INVALID)
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1);
+       ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
+ }
+ /**
+  * __gmap_unshadow_pgt - remove all entries from a shadow page table
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow guest address space
+  * @pgt: pointer to the start of a shadow page table
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
+                               unsigned long *pgt)
+ {
+       int i;
+       BUG_ON(!gmap_is_shadow(sg));
+       for (i = 0; i < 256; i++, raddr += 1UL << 12)
+               pgt[i] = _PAGE_INVALID;
+ }
+ /**
+  * gmap_unshadow_pgt - remove a shadow page table from a segment entry
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: address in the shadow guest address space
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
+ {
+       unsigned long sto, *ste, *pgt;
+       struct page *page;
+       BUG_ON(!gmap_is_shadow(sg));
+       ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
+       if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1);
+       sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff));
+       gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
+       pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
+       *ste = _SEGMENT_ENTRY_EMPTY;
+       __gmap_unshadow_pgt(sg, raddr, pgt);
+       /* Free page table */
+       page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       page_table_free_pgste(page);
+ }
+ /**
+  * __gmap_unshadow_sgt - remove all entries from a shadow segment table
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow guest address space
+  * @sgt: pointer to the start of a shadow segment table
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
+                               unsigned long *sgt)
+ {
+       unsigned long asce, *pgt;
+       struct page *page;
+       int i;
+       BUG_ON(!gmap_is_shadow(sg));
+       asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT;
+       for (i = 0; i < 2048; i++, raddr += 1UL << 20) {
+               if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
+                       continue;
+               pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
+               sgt[i] = _SEGMENT_ENTRY_EMPTY;
+               __gmap_unshadow_pgt(sg, raddr, pgt);
+               /* Free page table */
+               page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               page_table_free_pgste(page);
+       }
+ }
+ /**
+  * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow guest address space
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
+ {
+       unsigned long r3o, *r3e, *sgt;
+       struct page *page;
+       BUG_ON(!gmap_is_shadow(sg));
+       r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
+       if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1);
+       r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff));
+       gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
+       sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
+       *r3e = _REGION3_ENTRY_EMPTY;
+       __gmap_unshadow_sgt(sg, raddr, sgt);
+       /* Free segment table */
+       page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       __free_pages(page, 2);
+ }
+ /**
+  * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: address in the shadow guest address space
+  * @r3t: pointer to the start of a shadow region-3 table
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
+                               unsigned long *r3t)
+ {
+       unsigned long asce, *sgt;
+       struct page *page;
+       int i;
+       BUG_ON(!gmap_is_shadow(sg));
+       asce = (unsigned long) r3t | _ASCE_TYPE_REGION3;
+       for (i = 0; i < 2048; i++, raddr += 1UL << 31) {
+               if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
+                       continue;
+               sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
+               r3t[i] = _REGION3_ENTRY_EMPTY;
+               __gmap_unshadow_sgt(sg, raddr, sgt);
+               /* Free segment table */
+               page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               __free_pages(page, 2);
+       }
+ }
+ /**
+  * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow guest address space
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
+ {
+       unsigned long r2o, *r2e, *r3t;
+       struct page *page;
+       BUG_ON(!gmap_is_shadow(sg));
+       r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
+       if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1);
+       r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff));
+       gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
+       r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
+       *r2e = _REGION2_ENTRY_EMPTY;
+       __gmap_unshadow_r3t(sg, raddr, r3t);
+       /* Free region 3 table */
+       page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       __free_pages(page, 2);
+ }
+ /**
+  * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow guest address space
+  * @r2t: pointer to the start of a shadow region-2 table
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
+                               unsigned long *r2t)
+ {
+       unsigned long asce, *r3t;
+       struct page *page;
+       int i;
+       BUG_ON(!gmap_is_shadow(sg));
+       asce = (unsigned long) r2t | _ASCE_TYPE_REGION2;
+       for (i = 0; i < 2048; i++, raddr += 1UL << 42) {
+               if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
+                       continue;
+               r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
+               r2t[i] = _REGION2_ENTRY_EMPTY;
+               __gmap_unshadow_r3t(sg, raddr, r3t);
+               /* Free region 3 table */
+               page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               __free_pages(page, 2);
+       }
+ }
+ /**
+  * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow guest address space
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
+ {
+       unsigned long r1o, *r1e, *r2t;
+       struct page *page;
+       BUG_ON(!gmap_is_shadow(sg));
+       r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
+       if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1);
+       r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff));
+       gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
+       r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
+       *r1e = _REGION1_ENTRY_EMPTY;
+       __gmap_unshadow_r2t(sg, raddr, r2t);
+       /* Free region 2 table */
+       page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       __free_pages(page, 2);
+ }
+ /**
+  * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow guest address space
+  * @r1t: pointer to the start of a shadow region-1 table
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
+                               unsigned long *r1t)
+ {
+       unsigned long asce, *r2t;
+       struct page *page;
+       int i;
+       BUG_ON(!gmap_is_shadow(sg));
+       asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
+       for (i = 0; i < 2048; i++, raddr += 1UL << 53) {
+               if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
+                       continue;
+               r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
+               __gmap_unshadow_r2t(sg, raddr, r2t);
+               /* Clear entry and flush translation r1t -> r2t */
+               gmap_idte_one(asce, raddr);
+               r1t[i] = _REGION1_ENTRY_EMPTY;
+               /* Free region 2 table */
+               page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               __free_pages(page, 2);
+       }
+ }
+ /**
+  * gmap_unshadow - remove a shadow page table completely
+  * @sg: pointer to the shadow guest address space structure
+  *
+  * Called with sg->guest_table_lock
+  */
+ static void gmap_unshadow(struct gmap *sg)
+ {
+       unsigned long *table;
+       BUG_ON(!gmap_is_shadow(sg));
+       if (sg->removed)
+               return;
+       sg->removed = 1;
+       gmap_call_notifier(sg, 0, -1UL);
+       gmap_flush_tlb(sg);
+       table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
+       switch (sg->asce & _ASCE_TYPE_MASK) {
+       case _ASCE_TYPE_REGION1:
+               __gmap_unshadow_r1t(sg, 0, table);
+               break;
+       case _ASCE_TYPE_REGION2:
+               __gmap_unshadow_r2t(sg, 0, table);
+               break;
+       case _ASCE_TYPE_REGION3:
+               __gmap_unshadow_r3t(sg, 0, table);
+               break;
+       case _ASCE_TYPE_SEGMENT:
+               __gmap_unshadow_sgt(sg, 0, table);
+               break;
+       }
+ }
+ /**
+  * gmap_find_shadow - find a specific asce in the list of shadow tables
+  * @parent: pointer to the parent gmap
+  * @asce: ASCE for which the shadow table is created
+  * @edat_level: edat level to be used for the shadow translation
+  *
+  * Returns the pointer to a gmap if a shadow table with the given asce is
+  * already available, ERR_PTR(-EAGAIN) if another one is just being created,
+  * otherwise NULL
+  */
+ static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
+                                    int edat_level)
+ {
+       struct gmap *sg;
+       list_for_each_entry(sg, &parent->children, list) {
+               if (sg->orig_asce != asce || sg->edat_level != edat_level ||
+                   sg->removed)
+                       continue;
+               if (!sg->initialized)
+                       return ERR_PTR(-EAGAIN);
+               atomic_inc(&sg->ref_count);
+               return sg;
+       }
+       return NULL;
+ }
+ /**
+  * gmap_shadow_valid - check if a shadow guest address space matches the
+  *                     given properties and is still valid
+  * @sg: pointer to the shadow guest address space structure
+  * @asce: ASCE for which the shadow table is requested
+  * @edat_level: edat level to be used for the shadow translation
+  *
+  * Returns 1 if the gmap shadow is still valid and matches the given
+  * properties; the caller can continue using it. Returns 0 otherwise; the
+  * caller has to request a new shadow gmap in this case.
+  *
+  */
+ int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
+ {
+       if (sg->removed)
+               return 0;
+       return sg->orig_asce == asce && sg->edat_level == edat_level;
+ }
+ EXPORT_SYMBOL_GPL(gmap_shadow_valid);
+ /**
+  * gmap_shadow - create/find a shadow guest address space
+  * @parent: pointer to the parent gmap
+  * @asce: ASCE for which the shadow table is created
+  * @edat_level: edat level to be used for the shadow translation
+  *
+  * The pages of the top level page table referred to by the asce parameter
+  * will be set to read-only and marked in the PGSTEs of the kvm process.
+  * The shadow table will be removed automatically on any change to the
+  * PTE mapping for the source table.
+  *
+  * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
+  * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
+  * parent gmap table could not be protected.
+  */
+ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+                        int edat_level)
+ {
+       struct gmap *sg, *new;
+       unsigned long limit;
+       int rc;
+       BUG_ON(gmap_is_shadow(parent));
+       spin_lock(&parent->shadow_lock);
+       sg = gmap_find_shadow(parent, asce, edat_level);
+       spin_unlock(&parent->shadow_lock);
+       if (sg)
+               return sg;
+       /* Create a new shadow gmap */
+       limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
+       if (asce & _ASCE_REAL_SPACE)
+               limit = -1UL;
+       new = gmap_alloc(limit);
+       if (!new)
+               return ERR_PTR(-ENOMEM);
+       new->mm = parent->mm;
+       new->parent = gmap_get(parent);
+       new->orig_asce = asce;
+       new->edat_level = edat_level;
+       new->initialized = false;
+       spin_lock(&parent->shadow_lock);
+       /* Recheck if another CPU created the same shadow */
+       sg = gmap_find_shadow(parent, asce, edat_level);
+       if (sg) {
+               spin_unlock(&parent->shadow_lock);
+               gmap_free(new);
+               return sg;
+       }
+       if (asce & _ASCE_REAL_SPACE) {
+               /* only allow one real-space gmap shadow */
+               list_for_each_entry(sg, &parent->children, list) {
+                       if (sg->orig_asce & _ASCE_REAL_SPACE) {
+                               spin_lock(&sg->guest_table_lock);
+                               gmap_unshadow(sg);
+                               spin_unlock(&sg->guest_table_lock);
+                               list_del(&sg->list);
+                               gmap_put(sg);
+                               break;
+                       }
+               }
+       }
+       atomic_set(&new->ref_count, 2);
+       list_add(&new->list, &parent->children);
+       if (asce & _ASCE_REAL_SPACE) {
+               /* nothing to protect, return right away */
+               new->initialized = true;
+               spin_unlock(&parent->shadow_lock);
+               return new;
+       }
+       spin_unlock(&parent->shadow_lock);
+       /* protect after insertion, so it will get properly invalidated */
+       down_read(&parent->mm->mmap_sem);
+       rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
+                               ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096,
+                               PROT_READ, PGSTE_VSIE_BIT);
+       up_read(&parent->mm->mmap_sem);
+       spin_lock(&parent->shadow_lock);
+       new->initialized = true;
+       if (rc) {
+               list_del(&new->list);
+               gmap_free(new);
+               new = ERR_PTR(rc);
+       }
+       spin_unlock(&parent->shadow_lock);
+       return new;
+ }
+ EXPORT_SYMBOL_GPL(gmap_shadow);
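
A hedged usage sketch of the create/find interface: a caller that caches a shadow gmap would first check it with gmap_shadow_valid() and otherwise ask gmap_shadow() for a fresh one. The helper name and the caching policy are illustrative, not taken from this series:

static struct gmap *example_get_shadow(struct gmap *parent, struct gmap *cached,
                                       unsigned long asce, int edat_level)
{
        if (cached && gmap_shadow_valid(cached, asce, edat_level))
                return cached;  /* still matches the guest ASCE and edat level */
        /* may return ERR_PTR(-ENOMEM), ERR_PTR(-EFAULT) or ERR_PTR(-EAGAIN) */
        return gmap_shadow(parent, asce, edat_level);
}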
+ /**
+  * gmap_shadow_r2t - create an empty shadow region 2 table
+  * @sg: pointer to the shadow guest address space structure
+  * @saddr: faulting address in the shadow gmap
+  * @r2t: parent gmap address of the region 2 table to get shadowed
+  * @fake: r2t references contiguous guest memory block, not a r2t
+  *
+  * The r2t parameter specifies the address of the source table. The
+  * four pages of the source table are made read-only in the parent gmap
+  * address space. A write to the source table area @r2t will automatically
+  * remove the shadow r2 table and all of its descendants.
+  *
+  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+  * shadow table structure is incomplete, -ENOMEM if out of memory and
+  * -EFAULT if an address in the parent gmap could not be resolved.
+  *
+  * Called with sg->mm->mmap_sem in read.
+  */
+ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+                   int fake)
+ {
+       unsigned long raddr, origin, offset, len;
+       unsigned long *s_r2t, *table;
+       struct page *page;
+       int rc;
+       BUG_ON(!gmap_is_shadow(sg));
+       /* Allocate a shadow region second table */
+       page = alloc_pages(GFP_KERNEL, 2);
+       if (!page)
+               return -ENOMEM;
+       page->index = r2t & _REGION_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_r2t = (unsigned long *) page_to_phys(page);
+       /* Install shadow region second table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _REGION_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _REGION_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
+                _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
+       if (sg->edat_level >= 1)
+               *table |= (r2t & _REGION_ENTRY_PROTECT);
+       list_add(&page->lru, &sg->crst_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_REGION_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make r2t read-only in parent gmap page table */
+       raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1;
+       origin = r2t & _REGION_ENTRY_ORIGIN;
+       offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+       len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 4);
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+                             (unsigned long) s_r2t)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_REGION_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_r2t(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+ out_free:
+       spin_unlock(&sg->guest_table_lock);
+       __free_pages(page, 2);
+       return rc;
+ }
+ EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
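
The protected byte range is derived from the origin, table-offset and table-length fields of the guest region-table entry; the same arithmetic recurs in gmap_shadow_r3t() and gmap_shadow_sgt() below. A sketch of that computation (the helper is illustrative only):

/* Byte range of the guest table described by a region-table entry. */
static inline void example_crst_range(unsigned long entry,
                                      unsigned long *start, unsigned long *len)
{
        unsigned long origin = entry & _REGION_ENTRY_ORIGIN;
        unsigned long offset = ((entry & _REGION_ENTRY_OFFSET) >> 6) * 4096;

        *start = origin + offset;
        *len = ((entry & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
}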
+ /**
+  * gmap_shadow_r3t - create a shadow region 3 table
+  * @sg: pointer to the shadow guest address space structure
+  * @saddr: faulting address in the shadow gmap
+  * @r3t: parent gmap address of the region 3 table to get shadowed
+  * @fake: r3t references contiguous guest memory block, not a r3t
+  *
+  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+  * shadow table structure is incomplete, -ENOMEM if out of memory and
+  * -EFAULT if an address in the parent gmap could not be resolved.
+  *
+  * Called with sg->mm->mmap_sem in read.
+  */
+ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+                   int fake)
+ {
+       unsigned long raddr, origin, offset, len;
+       unsigned long *s_r3t, *table;
+       struct page *page;
+       int rc;
+       BUG_ON(!gmap_is_shadow(sg));
+       /* Allocate a shadow region third table */
+       page = alloc_pages(GFP_KERNEL, 2);
+       if (!page)
+               return -ENOMEM;
+       page->index = r3t & _REGION_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_r3t = (unsigned long *) page_to_phys(page);
+       /* Install shadow region third table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _REGION_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _REGION_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
+                _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
+       if (sg->edat_level >= 1)
+               *table |= (r3t & _REGION_ENTRY_PROTECT);
+       list_add(&page->lru, &sg->crst_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_REGION_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make r3t read-only in parent gmap page table */
+       raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2;
+       origin = r3t & _REGION_ENTRY_ORIGIN;
+       offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+       len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 3);
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+                             (unsigned long) s_r3t)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_REGION_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_r3t(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+ out_free:
+       spin_unlock(&sg->guest_table_lock);
+       __free_pages(page, 2);
+       return rc;
+ }
+ EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
+ /**
+  * gmap_shadow_sgt - create a shadow segment table
+  * @sg: pointer to the shadow guest address space structure
+  * @saddr: faulting address in the shadow gmap
+  * @sgt: parent gmap address of the segment table to get shadowed
+  * @fake: sgt references contiguous guest memory block, not a sgt
+  *
+  * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
+  * shadow table structure is incomplete, -ENOMEM if out of memory and
+  * -EFAULT if an address in the parent gmap could not be resolved.
+  *
+  * Called with sg->mm->mmap_sem in read.
+  */
+ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+                   int fake)
+ {
+       unsigned long raddr, origin, offset, len;
+       unsigned long *s_sgt, *table;
+       struct page *page;
+       int rc;
+       BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
+       /* Allocate a shadow segment table */
+       page = alloc_pages(GFP_KERNEL, 2);
+       if (!page)
+               return -ENOMEM;
+       page->index = sgt & _REGION_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_sgt = (unsigned long *) page_to_phys(page);
+       /* Install shadow segment table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _REGION_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _REGION_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
+                _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
+       if (sg->edat_level >= 1)
+               *table |= sgt & _REGION_ENTRY_PROTECT;
+       list_add(&page->lru, &sg->crst_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_REGION_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make sgt read-only in parent gmap page table */
+       raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3;
+       origin = sgt & _REGION_ENTRY_ORIGIN;
+       offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+       len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 2);
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+                             (unsigned long) s_sgt)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_REGION_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_sgt(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+ out_free:
+       spin_unlock(&sg->guest_table_lock);
+       __free_pages(page, 2);
+       return rc;
+ }
+ EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
+ /**
+  * gmap_shadow_pgt_lookup - find a shadow page table
+  * @sg: pointer to the shadow guest address space structure
+  * @saddr: the address in the shadow guest address space
+  * @pgt: parent gmap address of the page table to get shadowed
+  * @dat_protection: if the pgtable is marked as protected by dat
+  * @fake: pgt references contiguous guest memory block, not a pgtable
+  *
+  * Returns 0 if the shadow page table was found and -EAGAIN if the page
+  * table was not found.
+  *
+  * Called with sg->mm->mmap_sem in read.
+  */
+ int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+                          unsigned long *pgt, int *dat_protection,
+                          int *fake)
+ {
+       unsigned long *table;
+       struct page *page;
+       int rc;
+       BUG_ON(!gmap_is_shadow(sg));
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+       if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
+               /* Shadow page tables are full pages (pte+pgste) */
+               page = pfn_to_page(*table >> PAGE_SHIFT);
+               *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
+               *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
+               *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
+               rc = 0;
+       } else  {
+               rc = -EAGAIN;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+ }
+ EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
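
A hedged sketch of how the lookup pairs with gmap_shadow_pgt() below during shadow fault handling: -EAGAIN from the lookup means the page table still has to be instantiated from the parent table the caller resolved. Helper name and parameters are illustrative:

static int example_ensure_shadow_pgt(struct gmap *sg, unsigned long saddr,
                                     unsigned long parent_pgt, int parent_fake)
{
        unsigned long pgt;
        int dat_protection, fake, rc;

        rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
        if (rc != -EAGAIN)
                return rc;      /* 0: a shadow page table already exists */
        /* not shadowed yet (or raced with unshadow): create it */
        return gmap_shadow_pgt(sg, saddr, parent_pgt, parent_fake);
}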
+ /**
+  * gmap_shadow_pgt - instantiate a shadow page table
+  * @sg: pointer to the shadow guest address space structure
+  * @saddr: faulting address in the shadow gmap
+  * @pgt: parent gmap address of the page table to get shadowed
+  * @fake: pgt references contiguous guest memory block, not a pgtable
+  *
+  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+  * shadow table structure is incomplete, -ENOMEM if out of memory and
+  * -EFAULT if an address in the parent gmap could not be resolved.
+  *
+  * Called with sg->mm->mmap_sem in read.
+  */
+ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+                   int fake)
+ {
+       unsigned long raddr, origin;
+       unsigned long *s_pgt, *table;
+       struct page *page;
+       int rc;
+       BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
+       /* Allocate a shadow page table */
+       page = page_table_alloc_pgste(sg->mm);
+       if (!page)
+               return -ENOMEM;
+       page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_pgt = (unsigned long *) page_to_phys(page);
+       /* Install shadow page table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _SEGMENT_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _SEGMENT_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
+                (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
+       list_add(&page->lru, &sg->pt_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_SEGMENT_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make pgt read-only in parent gmap page table (not the pgste) */
+       raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT;
+       origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
+       rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 1);
+               if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
+                             (unsigned long) s_pgt)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_SEGMENT_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_pgt(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+ out_free:
+       spin_unlock(&sg->guest_table_lock);
+       page_table_free_pgste(page);
+       return rc;
+ }
+ EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
+ /**
+  * gmap_shadow_page - create a shadow page mapping
+  * @sg: pointer to the shadow guest address space structure
+  * @saddr: faulting address in the shadow gmap
+  * @pte: pte in parent gmap address space to get shadowed
+  *
+  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+  * shadow table structure is incomplete, -ENOMEM if out of memory and
+  * -EFAULT if an address in the parent gmap could not be resolved.
+  *
+  * Called with sg->mm->mmap_sem in read.
+  */
+ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
+ {
+       struct gmap *parent;
+       struct gmap_rmap *rmap;
+       unsigned long vmaddr, paddr;
+       spinlock_t *ptl;
+       pte_t *sptep, *tptep;
+       int prot;
+       int rc;
+       BUG_ON(!gmap_is_shadow(sg));
+       parent = sg->parent;
+       prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
+       rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+       if (!rmap)
+               return -ENOMEM;
+       rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
+       while (1) {
+               paddr = pte_val(pte) & PAGE_MASK;
+               vmaddr = __gmap_translate(parent, paddr);
+               if (IS_ERR_VALUE(vmaddr)) {
+                       rc = vmaddr;
+                       break;
+               }
+               rc = radix_tree_preload(GFP_KERNEL);
+               if (rc)
+                       break;
+               rc = -EAGAIN;
+               sptep = gmap_pte_op_walk(parent, paddr, &ptl);
+               if (sptep) {
+                       spin_lock(&sg->guest_table_lock);
+                       /* Get page table pointer */
+                       tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
+                       if (!tptep) {
+                               spin_unlock(&sg->guest_table_lock);
+                               gmap_pte_op_end(ptl);
+                               radix_tree_preload_end();
+                               break;
+                       }
+                       rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
+                       if (rc > 0) {
+                               /* Success and a new mapping */
+                               gmap_insert_rmap(sg, vmaddr, rmap);
+                               rmap = NULL;
+                               rc = 0;
+                       }
+                       gmap_pte_op_end(ptl);
+                       spin_unlock(&sg->guest_table_lock);
+               }
+               radix_tree_preload_end();
+               if (!rc)
+                       break;
+               rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+               if (rc)
+                       break;
+       }
+       kfree(rmap);
+       return rc;
+ }
+ EXPORT_SYMBOL_GPL(gmap_shadow_page);
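
A shadow fault would typically end with a single gmap_shadow_page() call, passing the pte the caller obtained from walking the guest's own page table; a minimal sketch (the helper and the origin of guest_pte_val are assumptions):

static int example_shadow_one_page(struct gmap *sg, unsigned long saddr,
                                   unsigned long guest_pte_val)
{
        /* guest_pte_val was read from the guest page table by the caller */
        return gmap_shadow_page(sg, saddr, __pte(guest_pte_val));
        /* 0, -EAGAIN, -ENOMEM or -EFAULT as documented above */
}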
+ /**
+  * gmap_shadow_notify - handle notifications for shadow gmap
+  * @sg: pointer to the shadow guest address space structure
+  * @vmaddr: vm address of the changed parent page table entry
+  * @offset: guest address offset of the entry within its segment
+  * @pte: pointer to the invalidated page table entry
+  *
+  * Called with sg->parent->shadow_lock.
+  */
+ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
+                              unsigned long offset, pte_t *pte)
+ {
+       struct gmap_rmap *rmap, *rnext, *head;
+       unsigned long gaddr, start, end, bits, raddr;
+       unsigned long *table;
+       BUG_ON(!gmap_is_shadow(sg));
+       spin_lock(&sg->parent->guest_table_lock);
+       table = radix_tree_lookup(&sg->parent->host_to_guest,
+                                 vmaddr >> PMD_SHIFT);
+       gaddr = table ? __gmap_segment_gaddr(table) + offset : 0;
+       spin_unlock(&sg->parent->guest_table_lock);
+       if (!table)
+               return;
+       spin_lock(&sg->guest_table_lock);
+       if (sg->removed) {
+               spin_unlock(&sg->guest_table_lock);
+               return;
+       }
+       /* Check for top level table */
+       start = sg->orig_asce & _ASCE_ORIGIN;
+       end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096;
+       if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
+           gaddr < end) {
+               /* The complete shadow table has to go */
+               gmap_unshadow(sg);
+               spin_unlock(&sg->guest_table_lock);
+               list_del(&sg->list);
+               gmap_put(sg);
+               return;
+       }
+       /* Remove the page table tree for one specific entry */
+       head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12);
+       gmap_for_each_rmap_safe(rmap, rnext, head) {
+               bits = rmap->raddr & _SHADOW_RMAP_MASK;
+               raddr = rmap->raddr ^ bits;
+               switch (bits) {
+               case _SHADOW_RMAP_REGION1:
+                       gmap_unshadow_r2t(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_REGION2:
+                       gmap_unshadow_r3t(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_REGION3:
+                       gmap_unshadow_sgt(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_SEGMENT:
+                       gmap_unshadow_pgt(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_PGTABLE:
+                       gmap_unshadow_page(sg, raddr);
+                       break;
+               }
+               kfree(rmap);
+       }
+       spin_unlock(&sg->guest_table_lock);
+ }
+ /**
+  * ptep_notify - call all invalidation callbacks for a specific pte.
+  * @mm: pointer to the process mm_struct
+  * @addr: virtual address in the process address space
+  * @pte: pointer to the page table entry
+  * @bits: bits from the pgste that caused the notify call
+  *
+  * This function is assumed to be called with the page table lock held
+  * for the pte to notify.
+  */
+ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
+                pte_t *pte, unsigned long bits)
+ {
+       unsigned long offset, gaddr;
+       unsigned long *table;
+       struct gmap *gmap, *sg, *next;
+       offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
+       offset = offset * (4096 / sizeof(pte_t));
+       rcu_read_lock();
+       list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+               if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
+                       spin_lock(&gmap->shadow_lock);
+                       list_for_each_entry_safe(sg, next,
+                                                &gmap->children, list)
+                               gmap_shadow_notify(sg, vmaddr, offset, pte);
+                       spin_unlock(&gmap->shadow_lock);
+               }
+               if (!(bits & PGSTE_IN_BIT))
+                       continue;
+               spin_lock(&gmap->guest_table_lock);
+               table = radix_tree_lookup(&gmap->host_to_guest,
+                                         vmaddr >> PMD_SHIFT);
+               if (table)
+                       gaddr = __gmap_segment_gaddr(table) + offset;
+               spin_unlock(&gmap->guest_table_lock);
+               if (table)
+                       gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
+       }
+       rcu_read_unlock();
  }
  EXPORT_SYMBOL_GPL(ptep_notify);
  
diff --combined arch/s390/mm/pgalloc.c
@@@ -137,6 -137,29 +137,29 @@@ static inline unsigned int atomic_xor_b
        return new;
  }
  
+ #ifdef CONFIG_PGSTE
+ struct page *page_table_alloc_pgste(struct mm_struct *mm)
+ {
+       struct page *page;
+       unsigned long *table;
+       page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
+       if (page) {
+               table = (unsigned long *) page_to_phys(page);
+               clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+               clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
+       }
+       return page;
+ }
+ void page_table_free_pgste(struct page *page)
+ {
+       __free_page(page);
+ }
+ #endif /* CONFIG_PGSTE */
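
A pgste page table occupies a full 4K page: the lower half holds the 256 ptes, the upper half the matching pgstes, which is what the two clear_table() calls above initialize. The pgste slot of a pte is therefore always PTRS_PER_PTE entries above it, as the pgste_get_lock()/pgste_set_unlock() code in pgtable.c below relies on; a sketch of that addressing (helper name illustrative):

/* The pgste that belongs to a pte lives PTRS_PER_PTE entries above it. */
static inline unsigned long *example_pgste_of(pte_t *ptep)
{
        return (unsigned long *) (ptep + PTRS_PER_PTE);
}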
  /*
   * page table entry allocation/free routines.
   */
@@@ -149,7 -172,7 +172,7 @@@ unsigned long *page_table_alloc(struct 
        /* Try to get a fragment of a 4K page as a 2K page table */
        if (!mm_alloc_pgste(mm)) {
                table = NULL;
-               spin_lock_bh(&mm->context.list_lock);
+               spin_lock_bh(&mm->context.pgtable_lock);
                if (!list_empty(&mm->context.pgtable_list)) {
                        page = list_first_entry(&mm->context.pgtable_list,
                                                struct page, lru);
                                list_del(&page->lru);
                        }
                }
-               spin_unlock_bh(&mm->context.list_lock);
+               spin_unlock_bh(&mm->context.pgtable_lock);
                if (table)
                        return table;
        }
        /* Allocate a fresh page */
 -      page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
 +      page = alloc_page(GFP_KERNEL);
        if (!page)
                return NULL;
        if (!pgtable_page_ctor(page)) {
                /* Return the first 2K fragment of the page */
                atomic_set(&page->_mapcount, 1);
                clear_table(table, _PAGE_INVALID, PAGE_SIZE);
-               spin_lock_bh(&mm->context.list_lock);
+               spin_lock_bh(&mm->context.pgtable_lock);
                list_add(&page->lru, &mm->context.pgtable_list);
-               spin_unlock_bh(&mm->context.list_lock);
+               spin_unlock_bh(&mm->context.pgtable_lock);
        }
        return table;
  }
@@@ -203,13 -226,13 +226,13 @@@ void page_table_free(struct mm_struct *
        if (!mm_alloc_pgste(mm)) {
                /* Free 2K page table fragment of a 4K page */
                bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
-               spin_lock_bh(&mm->context.list_lock);
+               spin_lock_bh(&mm->context.pgtable_lock);
                mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
                if (mask & 3)
                        list_add(&page->lru, &mm->context.pgtable_list);
                else
                        list_del(&page->lru);
-               spin_unlock_bh(&mm->context.list_lock);
+               spin_unlock_bh(&mm->context.pgtable_lock);
                if (mask != 0)
                        return;
        }
@@@ -235,13 -258,13 +258,13 @@@ void page_table_free_rcu(struct mmu_gat
                return;
        }
        bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
-       spin_lock_bh(&mm->context.list_lock);
+       spin_lock_bh(&mm->context.pgtable_lock);
        mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
        if (mask & 3)
                list_add_tail(&page->lru, &mm->context.pgtable_list);
        else
                list_del(&page->lru);
-       spin_unlock_bh(&mm->context.list_lock);
+       spin_unlock_bh(&mm->context.pgtable_lock);
        table = (unsigned long *) (__pa(table) | (1U << bit));
        tlb_remove_table(tlb, table);
  }
diff --combined arch/s390/mm/pgtable.c
  static inline pte_t ptep_flush_direct(struct mm_struct *mm,
                                      unsigned long addr, pte_t *ptep)
  {
 -      int active, count;
        pte_t old;
  
        old = *ptep;
        if (unlikely(pte_val(old) & _PAGE_INVALID))
                return old;
 -      active = (mm == current->active_mm) ? 1 : 0;
 -      count = atomic_add_return(0x10000, &mm->context.attach_count);
 -      if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
 +      atomic_inc(&mm->context.flush_count);
 +      if (MACHINE_HAS_TLB_LC &&
            cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
                __ptep_ipte_local(addr, ptep);
        else
                __ptep_ipte(addr, ptep);
 -      atomic_sub(0x10000, &mm->context.attach_count);
 +      atomic_dec(&mm->context.flush_count);
        return old;
  }
  
  static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
                                    unsigned long addr, pte_t *ptep)
  {
 -      int active, count;
        pte_t old;
  
        old = *ptep;
        if (unlikely(pte_val(old) & _PAGE_INVALID))
                return old;
 -      active = (mm == current->active_mm) ? 1 : 0;
 -      count = atomic_add_return(0x10000, &mm->context.attach_count);
 -      if ((count & 0xffff) <= active) {
 +      atomic_inc(&mm->context.flush_count);
 +      if (cpumask_equal(&mm->context.cpu_attach_mask,
 +                        cpumask_of(smp_processor_id()))) {
                pte_val(*ptep) |= _PAGE_INVALID;
                mm->context.flush_mm = 1;
        } else
                __ptep_ipte(addr, ptep);
 -      atomic_sub(0x10000, &mm->context.attach_count);
 +      atomic_dec(&mm->context.flush_count);
        return old;
  }
  
@@@ -67,6 -70,7 +67,6 @@@ static inline pgste_t pgste_get_lock(pt
  #ifdef CONFIG_PGSTE
        unsigned long old;
  
 -      preempt_disable();
        asm(
                "       lg      %0,%2\n"
                "0:     lgr     %1,%0\n"
@@@ -89,6 -93,7 +89,6 @@@ static inline void pgste_set_unlock(pte
                : "=Q" (ptep[PTRS_PER_PTE])
                : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE])
                : "cc", "memory");
 -      preempt_enable();
  #endif
  }
  
@@@ -174,14 -179,17 +174,17 @@@ static inline pgste_t pgste_set_pte(pte
        return pgste;
  }
  
- static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
-                                       unsigned long addr,
-                                       pte_t *ptep, pgste_t pgste)
+ static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
+                                      unsigned long addr,
+                                      pte_t *ptep, pgste_t pgste)
  {
  #ifdef CONFIG_PGSTE
-       if (pgste_val(pgste) & PGSTE_IN_BIT) {
-               pgste_val(pgste) &= ~PGSTE_IN_BIT;
-               ptep_notify(mm, addr, ptep);
+       unsigned long bits;
+       bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
+       if (bits) {
+               pgste_val(pgste) ^= bits;
+               ptep_notify(mm, addr, ptep, bits);
        }
  #endif
        return pgste;
@@@ -194,7 -202,7 +197,7 @@@ static inline pgste_t ptep_xchg_start(s
  
        if (mm_has_pgste(mm)) {
                pgste = pgste_get_lock(ptep);
-               pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+               pgste = pgste_pte_notify(mm, addr, ptep, pgste);
        }
        return pgste;
  }
@@@ -225,11 -233,9 +228,11 @@@ pte_t ptep_xchg_direct(struct mm_struc
        pgste_t pgste;
        pte_t old;
  
 +      preempt_disable();
        pgste = ptep_xchg_start(mm, addr, ptep);
        old = ptep_flush_direct(mm, addr, ptep);
        ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
 +      preempt_enable();
        return old;
  }
  EXPORT_SYMBOL(ptep_xchg_direct);
@@@ -240,11 -246,9 +243,11 @@@ pte_t ptep_xchg_lazy(struct mm_struct *
        pgste_t pgste;
        pte_t old;
  
 +      preempt_disable();
        pgste = ptep_xchg_start(mm, addr, ptep);
        old = ptep_flush_lazy(mm, addr, ptep);
        ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
 +      preempt_enable();
        return old;
  }
  EXPORT_SYMBOL(ptep_xchg_lazy);
@@@ -255,7 -259,6 +258,7 @@@ pte_t ptep_modify_prot_start(struct mm_
        pgste_t pgste;
        pte_t old;
  
 +      preempt_disable();
        pgste = ptep_xchg_start(mm, addr, ptep);
        old = ptep_flush_lazy(mm, addr, ptep);
        if (mm_has_pgste(mm)) {
@@@ -279,13 -282,13 +282,13 @@@ void ptep_modify_prot_commit(struct mm_
        } else {
                *ptep = pte;
        }
 +      preempt_enable();
  }
  EXPORT_SYMBOL(ptep_modify_prot_commit);
  
  static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
                                      unsigned long addr, pmd_t *pmdp)
  {
 -      int active, count;
        pmd_t old;
  
        old = *pmdp;
                __pmdp_csp(pmdp);
                return old;
        }
 -      active = (mm == current->active_mm) ? 1 : 0;
 -      count = atomic_add_return(0x10000, &mm->context.attach_count);
 -      if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
 +      atomic_inc(&mm->context.flush_count);
 +      if (MACHINE_HAS_TLB_LC &&
            cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
                __pmdp_idte_local(addr, pmdp);
        else
                __pmdp_idte(addr, pmdp);
 -      atomic_sub(0x10000, &mm->context.attach_count);
 +      atomic_dec(&mm->context.flush_count);
        return old;
  }
  
  static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
                                    unsigned long addr, pmd_t *pmdp)
  {
 -      int active, count;
        pmd_t old;
  
        old = *pmdp;
        if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
                return old;
 -      active = (mm == current->active_mm) ? 1 : 0;
 -      count = atomic_add_return(0x10000, &mm->context.attach_count);
 -      if ((count & 0xffff) <= active) {
 +      atomic_inc(&mm->context.flush_count);
 +      if (cpumask_equal(&mm->context.cpu_attach_mask,
 +                        cpumask_of(smp_processor_id()))) {
                pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID;
                mm->context.flush_mm = 1;
        } else if (MACHINE_HAS_IDTE)
                __pmdp_idte(addr, pmdp);
        else
                __pmdp_csp(pmdp);
 -      atomic_sub(0x10000, &mm->context.attach_count);
 +      atomic_dec(&mm->context.flush_count);
        return old;
  }
  
@@@ -331,10 -336,8 +334,10 @@@ pmd_t pmdp_xchg_direct(struct mm_struc
  {
        pmd_t old;
  
 +      preempt_disable();
        old = pmdp_flush_direct(mm, addr, pmdp);
        *pmdp = new;
 +      preempt_enable();
        return old;
  }
  EXPORT_SYMBOL(pmdp_xchg_direct);
@@@ -344,53 -347,12 +347,53 @@@ pmd_t pmdp_xchg_lazy(struct mm_struct *
  {
        pmd_t old;
  
 +      preempt_disable();
        old = pmdp_flush_lazy(mm, addr, pmdp);
        *pmdp = new;
 +      preempt_enable();
        return old;
  }
  EXPORT_SYMBOL(pmdp_xchg_lazy);
  
 +static inline pud_t pudp_flush_direct(struct mm_struct *mm,
 +                                    unsigned long addr, pud_t *pudp)
 +{
 +      pud_t old;
 +
 +      old = *pudp;
 +      if (pud_val(old) & _REGION_ENTRY_INVALID)
 +              return old;
 +      if (!MACHINE_HAS_IDTE) {
 +              /*
 +               * Invalid bit position is the same for pmd and pud, so we can
 +               * re-use __pmdp_csp() here
 +               */
 +              __pmdp_csp((pmd_t *) pudp);
 +              return old;
 +      }
 +      atomic_inc(&mm->context.flush_count);
 +      if (MACHINE_HAS_TLB_LC &&
 +          cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
 +              __pudp_idte_local(addr, pudp);
 +      else
 +              __pudp_idte(addr, pudp);
 +      atomic_dec(&mm->context.flush_count);
 +      return old;
 +}
 +
 +pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
 +                     pud_t *pudp, pud_t new)
 +{
 +      pud_t old;
 +
 +      preempt_disable();
 +      old = pudp_flush_direct(mm, addr, pudp);
 +      *pudp = new;
 +      preempt_enable();
 +      return old;
 +}
 +EXPORT_SYMBOL(pudp_xchg_direct);
 +
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
@@@ -439,26 -401,106 +442,110 @@@ void ptep_set_pte_at(struct mm_struct *
        pgste_t pgste;
  
        /* the mm_has_pgste() check is done in set_pte_at() */
 +      preempt_disable();
        pgste = pgste_get_lock(ptep);
        pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
        pgste_set_key(ptep, pgste, entry, mm);
        pgste = pgste_set_pte(ptep, pgste, entry);
        pgste_set_unlock(ptep, pgste);
 +      preempt_enable();
  }
  
  void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
  {
        pgste_t pgste;
  
 +      preempt_disable();
        pgste = pgste_get_lock(ptep);
        pgste_val(pgste) |= PGSTE_IN_BIT;
        pgste_set_unlock(ptep, pgste);
 +      preempt_enable();
  }
  
+ /**
+  * ptep_force_prot - change access rights of a locked pte
+  * @mm: pointer to the process mm_struct
+  * @addr: virtual address in the guest address space
+  * @ptep: pointer to the page table entry
+  * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
+  * @bit: pgste bit to set (e.g. for notification)
+  *
+  * Returns 0 if the access rights were changed and -EAGAIN if the current
+  * and requested access rights are incompatible.
+  */
+ int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
+                   pte_t *ptep, int prot, unsigned long bit)
+ {
+       pte_t entry;
+       pgste_t pgste;
+       int pte_i, pte_p;
+       pgste = pgste_get_lock(ptep);
+       entry = *ptep;
+       /* Check pte entry after all locks have been acquired */
+       pte_i = pte_val(entry) & _PAGE_INVALID;
+       pte_p = pte_val(entry) & _PAGE_PROTECT;
+       if ((pte_i && (prot != PROT_NONE)) ||
+           (pte_p && (prot & PROT_WRITE))) {
+               pgste_set_unlock(ptep, pgste);
+               return -EAGAIN;
+       }
+       /* Change access rights and set pgste bit */
+       if (prot == PROT_NONE && !pte_i) {
+               ptep_flush_direct(mm, addr, ptep);
+               pgste = pgste_update_all(entry, pgste, mm);
+               pte_val(entry) |= _PAGE_INVALID;
+       }
+       if (prot == PROT_READ && !pte_p) {
+               ptep_flush_direct(mm, addr, ptep);
+               pte_val(entry) &= ~_PAGE_INVALID;
+               pte_val(entry) |= _PAGE_PROTECT;
+       }
+       pgste_val(pgste) |= bit;
+       pgste = pgste_set_pte(ptep, pgste, entry);
+       pgste_set_unlock(ptep, pgste);
+       return 0;
+ }
+ 
+ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+                   pte_t *sptep, pte_t *tptep, pte_t pte)
+ {
+       pgste_t spgste, tpgste;
+       pte_t spte, tpte;
+       int rc = -EAGAIN;
+       if (!(pte_val(*tptep) & _PAGE_INVALID))
+               return 0;       /* already shadowed */
+       spgste = pgste_get_lock(sptep);
+       spte = *sptep;
+       if (!(pte_val(spte) & _PAGE_INVALID) &&
+           !((pte_val(spte) & _PAGE_PROTECT) &&
+             !(pte_val(pte) & _PAGE_PROTECT))) {
+               pgste_val(spgste) |= PGSTE_VSIE_BIT;
+               tpgste = pgste_get_lock(tptep);
+               pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
+                               (pte_val(pte) & _PAGE_PROTECT);
+               /* don't touch the storage key - it belongs to parent pgste */
+               tpgste = pgste_set_pte(tptep, tpgste, tpte);
+               pgste_set_unlock(tptep, tpgste);
+               rc = 1;
+       }
+       pgste_set_unlock(sptep, spgste);
+       return rc;
+ }
+ 
+ void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
+ {
+       pgste_t pgste;
+       pgste = pgste_get_lock(ptep);
+       /* notifier is called by the caller */
+       ptep_flush_direct(mm, saddr, ptep);
+       /* don't touch the storage key - it belongs to parent pgste */
+       pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
+       pgste_set_unlock(ptep, pgste);
+ }
+ 
  static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
  {
        if (!non_swap_entry(entry))
@@@ -479,11 -521,10 +566,11 @@@ void ptep_zap_unused(struct mm_struct *
        pte_t pte;
  
        /* Zap unused and logically-zero pages */
 +      preempt_disable();
        pgste = pgste_get_lock(ptep);
        pgstev = pgste_val(pgste);
        pte = *ptep;
 -      if (pte_swap(pte) &&
 +      if (!reset && pte_swap(pte) &&
            ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
             (pgstev & _PGSTE_GPS_ZERO))) {
                ptep_zap_swap_entry(mm, pte_to_swp_entry(pte));
        if (reset)
                pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
        pgste_set_unlock(ptep, pgste);
 +      preempt_enable();
  }
  
  void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
        pgste_t pgste;
  
        /* Clear storage key */
 +      preempt_disable();
        pgste = pgste_get_lock(ptep);
        pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
                              PGSTE_GR_BIT | PGSTE_GC_BIT);
        if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
                page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
        pgste_set_unlock(ptep, pgste);
 +      preempt_enable();
  }
  
  /*
@@@ -532,7 -570,7 +619,7 @@@ bool test_and_clear_guest_dirty(struct 
        pgste_val(pgste) &= ~PGSTE_UC_BIT;
        pte = *ptep;
        if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
-               pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+               pgste = pgste_pte_notify(mm, addr, ptep, pgste);
                __ptep_ipte(addr, ptep);
                if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
                        pte_val(pte) |= _PAGE_PROTECT;
@@@ -555,12 -593,9 +642,9 @@@ int set_guest_storage_key(struct mm_str
        pgste_t old, new;
        pte_t *ptep;
  
-       down_read(&mm->mmap_sem);
        ptep = get_locked_pte(mm, addr, &ptl);
-       if (unlikely(!ptep)) {
-               up_read(&mm->mmap_sem);
+       if (unlikely(!ptep))
                return -EFAULT;
-       }
  
        new = old = pgste_get_lock(ptep);
        pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
  
        pgste_set_unlock(ptep, new);
        pte_unmap_unlock(ptep, ptl);
-       up_read(&mm->mmap_sem);
        return 0;
  }
  EXPORT_SYMBOL(set_guest_storage_key);
  
- unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
+ /**
+  * Conditionally set a guest storage key (handling csske).
+  * oldkey will be updated when either mr or mc is set and a pointer is given.
+  *
+  * Returns 0 if the guest's storage key update wasn't necessary, 1 if the guest
+  * storage key was updated and -EFAULT on access errors.
+  */
+ int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+                              unsigned char key, unsigned char *oldkey,
+                              bool nq, bool mr, bool mc)
+ {
+       unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
+       int rc;
+       /* we can drop the pgste lock between getting and setting the key */
+       if (mr | mc) {
+               rc = get_guest_storage_key(current->mm, addr, &tmp);
+               if (rc)
+                       return rc;
+               if (oldkey)
+                       *oldkey = tmp;
+               if (!mr)
+                       mask |= _PAGE_REFERENCED;
+               if (!mc)
+                       mask |= _PAGE_CHANGED;
+               if (!((tmp ^ key) & mask))
+                       return 0;
+       }
+       rc = set_guest_storage_key(current->mm, addr, key, nq);
+       return rc < 0 ? rc : 1;
+ }
+ EXPORT_SYMBOL(cond_set_guest_storage_key);
+ 
+ /**
+  * Reset a guest reference bit (rrbe), returning the reference and changed bit.
+  *
+  * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
+  */
+ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
  {
-       unsigned char key;
        spinlock_t *ptl;
-       pgste_t pgste;
+       pgste_t old, new;
        pte_t *ptep;
+       int cc = 0;
  
-       down_read(&mm->mmap_sem);
        ptep = get_locked_pte(mm, addr, &ptl);
-       if (unlikely(!ptep)) {
-               up_read(&mm->mmap_sem);
+       if (unlikely(!ptep))
                return -EFAULT;
-       }
-       pgste = pgste_get_lock(ptep);
  
-       if (pte_val(*ptep) & _PAGE_INVALID) {
-               key  = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
-               key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
-               key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
-               key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
-       } else {
-               key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+       new = old = pgste_get_lock(ptep);
+       /* Reset guest reference bit only */
+       pgste_val(new) &= ~PGSTE_GR_BIT;
  
-               /* Reflect guest's logical view, not physical */
-               if (pgste_val(pgste) & PGSTE_GR_BIT)
-                       key |= _PAGE_REFERENCED;
-               if (pgste_val(pgste) & PGSTE_GC_BIT)
-                       key |= _PAGE_CHANGED;
+       if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+               cc = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
+               /* Merge real referenced bit into host-set */
+               pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT;
        }
+       /* Reflect guest's logical view, not physical */
+       cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
+       /* Changing the guest storage key is considered a change of the page */
+       if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
+               pgste_val(new) |= PGSTE_UC_BIT;
+       pgste_set_unlock(ptep, new);
+       pte_unmap_unlock(ptep, ptl);
+       return 0;
+ }
+ EXPORT_SYMBOL(reset_guest_reference_bit);
+ 
+ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+                         unsigned char *key)
+ {
+       spinlock_t *ptl;
+       pgste_t pgste;
+       pte_t *ptep;
  
+       ptep = get_locked_pte(mm, addr, &ptl);
+       if (unlikely(!ptep))
+               return -EFAULT;
+       pgste = pgste_get_lock(ptep);
+       *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
+       if (!(pte_val(*ptep) & _PAGE_INVALID))
+               *key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+       /* Reflect guest's logical view, not physical */
+       *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
        pgste_set_unlock(ptep, pgste);
        pte_unmap_unlock(ptep, ptl);
-       up_read(&mm->mmap_sem);
-       return key;
+       return 0;
  }
  EXPORT_SYMBOL(get_guest_storage_key);
  #endif
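
For readers following the new s390 storage-key helpers above, here is a minimal standalone sketch of the conditional-SSKE decision they implement: when the MR/MC suppression flags are set, the reference/change bits are masked out before comparing the old and new key, and a redundant update is skipped. The demo_* names, the bit macros and the sample key values are illustrative only (they mirror the z/Architecture storage-key byte layout: ACC 0xf0, F 0x08, R 0x04, C 0x02); they are not code from this patch.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_ACC_BITS	0xf0	/* access-control bits of the storage key */
#define DEMO_FP_BIT	0x08	/* fetch-protection bit */
#define DEMO_REF_BIT	0x04	/* reference bit */
#define DEMO_CHG_BIT	0x02	/* change bit */

/* Mirrors the "!((tmp ^ key) & mask)" test in cond_set_guest_storage_key() */
static bool demo_key_update_needed(unsigned char old_key, unsigned char new_key,
				   bool mr, bool mc)
{
	unsigned char mask = DEMO_ACC_BITS | DEMO_FP_BIT;

	if (!mr)
		mask |= DEMO_REF_BIT;
	if (!mc)
		mask |= DEMO_CHG_BIT;
	return ((old_key ^ new_key) & mask) != 0;
}

int main(void)
{
	/* keys differ only in the reference bit (0x34 vs 0x30) */
	printf("MR set:   update needed = %d\n",
	       demo_key_update_needed(0x34, 0x30, true, false));
	printf("MR clear: update needed = %d\n",
	       demo_key_update_needed(0x34, 0x30, false, false));
	return 0;
}

Note also that, with the down_read()/up_read() pair dropped from set_guest_storage_key() and get_guest_storage_key() in this diff, callers of these helpers are expected to hold mmap_sem themselves.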
@@@ -27,7 -27,6 +27,7 @@@
  #include <linux/irqbypass.h>
  #include <linux/hyperv.h>
  
 +#include <asm/apic.h>
  #include <asm/pvclock-abi.h>
  #include <asm/desc.h>
  #include <asm/mtrr.h>
@@@ -35,8 -34,9 +35,9 @@@
  #include <asm/asm.h>
  #include <asm/kvm_page_track.h>
  
- #define KVM_MAX_VCPUS 255
- #define KVM_SOFT_MAX_VCPUS 160
+ #define KVM_MAX_VCPUS 288
+ #define KVM_SOFT_MAX_VCPUS 240
+ #define KVM_MAX_VCPU_ID 1023
  #define KVM_USER_MEM_SLOTS 509
  /* memory slots that are not exposed to userspace */
  #define KVM_PRIVATE_MEM_SLOTS 3
@@@ -599,6 -599,7 +600,7 @@@ struct kvm_vcpu_arch 
        u64 mcg_cap;
        u64 mcg_status;
        u64 mcg_ctl;
+       u64 mcg_ext_ctl;
        u64 *mce_banks;
  
        /* Cache MMIO info */
@@@ -682,9 -683,12 +684,12 @@@ struct kvm_arch_memory_slot 
  struct kvm_apic_map {
        struct rcu_head rcu;
        u8 mode;
-       struct kvm_lapic *phys_map[256];
-       /* first index is cluster id second is cpu id in a cluster */
-       struct kvm_lapic *logical_map[16][16];
+       u32 max_apic_id;
+       union {
+               struct kvm_lapic *xapic_flat_map[8];
+               struct kvm_lapic *xapic_cluster_map[16][4];
+       };
+       struct kvm_lapic *phys_map[];
  };
  
  /* Hyper-V emulation context */
@@@ -779,6 -783,9 +784,9 @@@ struct kvm_arch 
        u32 ldr_mode;
        struct page *avic_logical_id_table_page;
        struct page *avic_physical_id_table_page;
+       bool x2apic_format;
+       bool x2apic_broadcast_quirk_disabled;
  };
  
  struct kvm_vm_stat {
@@@ -1006,6 -1013,11 +1014,11 @@@ struct kvm_x86_ops 
        int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
                              uint32_t guest_irq, bool set);
        void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
+       int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc);
+       void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
+       void (*setup_mce)(struct kvm_vcpu *vcpu);
  };
  
  struct kvm_arch_async_pf {
@@@ -1026,7 -1038,7 +1039,7 @@@ void kvm_mmu_setup(struct kvm_vcpu *vcp
  void kvm_mmu_init_vm(struct kvm *kvm);
  void kvm_mmu_uninit_vm(struct kvm *kvm);
  void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask);
+               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask);
  
  void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
  void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@@ -1077,6 -1089,10 +1090,10 @@@ extern u32  kvm_max_guest_tsc_khz
  extern u8   kvm_tsc_scaling_ratio_frac_bits;
  /* maximum allowed value of TSC scaling ratio */
  extern u64  kvm_max_tsc_scaling_ratio;
+ /* 1ull << kvm_tsc_scaling_ratio_frac_bits */
+ extern u64  kvm_default_tsc_scaling_ratio;
+ extern u64 kvm_mce_cap_supported;
  
  enum emulation_result {
        EMULATE_DONE,         /* no further processing */
@@@ -1352,7 -1368,7 +1369,7 @@@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *v
  bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
                             struct kvm_vcpu **dest_vcpu);
  
- void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+ void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
                     struct kvm_lapic_irq *irq);
  
  static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
@@@ -1369,14 -1385,4 +1386,14 @@@ static inline void kvm_arch_vcpu_unbloc
  
  static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
  
 +static inline int kvm_cpu_get_apicid(int mps_cpu)
 +{
 +#ifdef CONFIG_X86_LOCAL_APIC
 +      return __default_cpu_present_to_apicid(mps_cpu);
 +#else
 +      WARN_ON_ONCE(1);
 +      return BAD_APICID;
 +#endif
 +}
 +
  #endif /* _ASM_X86_KVM_HOST_H */
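
The kvm_apic_map changes above replace the fixed 256-entry physical map with a flexible array sized by max_apic_id, which is needed once KVM_MAX_VCPUS grows to 288 and x2APIC IDs can exceed 255; in x2APIC mode the logical lookup simply indexes into phys_map, while xAPIC keeps its small flat/cluster tables in the union. Below is a standalone sketch of the resulting allocation-size rule; the demo_* types and the sample ID are made up, only the arithmetic matches the patch.

#include <stddef.h>
#include <stdio.h>

struct demo_lapic;			/* stand-in for struct kvm_lapic */

struct demo_apic_map {
	unsigned int max_apic_id;
	struct demo_lapic *phys_map[];	/* flexible array, indexed by APIC ID */
};

int main(void)
{
	unsigned int max_apic_id = 287;	/* e.g. a guest whose highest APIC ID is 287 */
	size_t sz = sizeof(struct demo_apic_map) +
		    sizeof(struct demo_lapic *) * ((size_t)max_apic_id + 1);

	printf("map covering %u APIC IDs: %zu bytes\n", max_apic_id + 1, sz);
	return 0;
}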
diff --combined arch/x86/kvm/iommu.c
  
  #include <linux/list.h>
  #include <linux/kvm_host.h>
 -#include <linux/module.h>
 +#include <linux/moduleparam.h>
  #include <linux/pci.h>
  #include <linux/stat.h>
- #include <linux/dmar.h>
  #include <linux/iommu.h>
- #include <linux/intel-iommu.h>
  #include "assigned-dev.h"
  
  static bool allow_unsafe_assigned_interrupts;
diff --combined arch/x86/kvm/lapic.c
@@@ -25,7 -25,7 +25,7 @@@
  #include <linux/smp.h>
  #include <linux/hrtimer.h>
  #include <linux/io.h>
 -#include <linux/module.h>
 +#include <linux/export.h>
  #include <linux/math64.h>
  #include <linux/slab.h>
  #include <asm/processor.h>
@@@ -115,26 -115,43 +115,43 @@@ static inline int apic_enabled(struct k
        (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
         APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
  
- /* The logical map is definitely wrong if we have multiple
-  * modes at the same time.  (Physical map is always right.)
-  */
- static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map)
- {
-       return !(map->mode & (map->mode - 1));
+ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
+               u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
+       switch (map->mode) {
+       case KVM_APIC_MODE_X2APIC: {
+               u32 offset = (dest_id >> 16) * 16;
+               u32 max_apic_id = map->max_apic_id;
+               if (offset <= max_apic_id) {
+                       u8 cluster_size = min(max_apic_id - offset + 1, 16U);
+                       *cluster = &map->phys_map[offset];
+                       *mask = dest_id & (0xffff >> (16 - cluster_size));
+               } else {
+                       *mask = 0;
+               }
+               return true;
+               }
+       case KVM_APIC_MODE_XAPIC_FLAT:
+               *cluster = map->xapic_flat_map;
+               *mask = dest_id & 0xff;
+               return true;
+       case KVM_APIC_MODE_XAPIC_CLUSTER:
+               *cluster = map->xapic_cluster_map[dest_id >> 4];
+               *mask = dest_id & 0xf;
+               return true;
+       default:
+               /* Not optimized. */
+               return false;
+       }
  }
  
- static inline void
- apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid)
+ static void kvm_apic_map_free(struct rcu_head *rcu)
  {
-       unsigned lid_bits;
+       struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu);
  
-       BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER !=  4);
-       BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT    !=  8);
-       BUILD_BUG_ON(KVM_APIC_MODE_X2APIC        != 16);
-       lid_bits = map->mode;
-       *cid = dest_id >> lid_bits;
-       *lid = dest_id & ((1 << lid_bits) - 1);
+       kvfree(map);
  }
  
  static void recalculate_apic_map(struct kvm *kvm)
        struct kvm_apic_map *new, *old = NULL;
        struct kvm_vcpu *vcpu;
        int i;
-       new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
+       u32 max_id = 255;
  
        mutex_lock(&kvm->arch.apic_map_lock);
  
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               if (kvm_apic_present(vcpu))
+                       max_id = max(max_id, kvm_apic_id(vcpu->arch.apic));
+       new = kvm_kvzalloc(sizeof(struct kvm_apic_map) +
+                          sizeof(struct kvm_lapic *) * ((u64)max_id + 1));
        if (!new)
                goto out;
  
+       new->max_apic_id = max_id;
        kvm_for_each_vcpu(i, vcpu, kvm) {
                struct kvm_lapic *apic = vcpu->arch.apic;
-               u16 cid, lid;
+               struct kvm_lapic **cluster;
+               u16 mask;
                u32 ldr, aid;
  
                if (!kvm_apic_present(vcpu))
                aid = kvm_apic_id(apic);
                ldr = kvm_lapic_get_reg(apic, APIC_LDR);
  
-               if (aid < ARRAY_SIZE(new->phys_map))
+               if (aid <= new->max_apic_id)
                        new->phys_map[aid] = apic;
  
                if (apic_x2apic_mode(apic)) {
                                new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
                }
  
-               if (!kvm_apic_logical_map_valid(new))
+               if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))
                        continue;
  
-               apic_logical_id(new, ldr, &cid, &lid);
-               if (lid && cid < ARRAY_SIZE(new->logical_map))
-                       new->logical_map[cid][ffs(lid) - 1] = apic;
+               if (mask)
+                       cluster[ffs(mask) - 1] = apic;
        }
  out:
        old = rcu_dereference_protected(kvm->arch.apic_map,
        mutex_unlock(&kvm->arch.apic_map_lock);
  
        if (old)
-               kfree_rcu(old, rcu);
+               call_rcu(&old->rcu, kvm_apic_map_free);
  
        kvm_make_scan_ioapic_request(kvm);
  }
@@@ -210,7 -234,7 +234,7 @@@ static inline void apic_set_spiv(struc
        }
  }
  
- static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
+ static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
  {
        kvm_lapic_set_reg(apic, APIC_ID, id << 24);
        recalculate_apic_map(apic->vcpu->kvm);
@@@ -222,11 -246,11 +246,11 @@@ static inline void kvm_apic_set_ldr(str
        recalculate_apic_map(apic->vcpu->kvm);
  }
  
- static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id)
+ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
  {
        u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
  
-       kvm_lapic_set_reg(apic, APIC_ID, id << 24);
+       kvm_lapic_set_reg(apic, APIC_ID, id);
        kvm_lapic_set_reg(apic, APIC_LDR, ldr);
        recalculate_apic_map(apic->vcpu->kvm);
  }
@@@ -599,17 -623,30 +623,30 @@@ static bool kvm_apic_match_logical_addr
        }
  }
  
- /* KVM APIC implementation has two quirks
-  *  - dest always begins at 0 while xAPIC MDA has offset 24,
-  *  - IOxAPIC messages have to be delivered (directly) to x2APIC.
+ /* The KVM local APIC implementation has two quirks:
+  *
+  *  - the xAPIC MDA stores the destination at bits 24-31, while this
+  *    is not true of struct kvm_lapic_irq's dest_id field.  This is
+  *    just a quirk in the API and is not problematic.
+  *
+  *  - in-kernel IOAPIC messages have to be delivered directly to
+  *    x2APIC, because the kernel does not support interrupt remapping.
+  *    In order to support broadcast without interrupt remapping, x2APIC
+  *    rewrites the destination of non-IPI messages from APIC_BROADCAST
+  *    to X2APIC_BROADCAST.
+  *
+  * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API.  This is
+  * important when userspace wants to use x2APIC-format MSIs, because
+  * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7".
   */
- static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source,
-                                               struct kvm_lapic *target)
+ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
+               struct kvm_lapic *source, struct kvm_lapic *target)
  {
        bool ipi = source != NULL;
        bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
  
-       if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda)
+       if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
+           !ipi && dest_id == APIC_BROADCAST && x2apic_mda)
                return X2APIC_BROADCAST;
  
        return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id);
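
As a standalone illustration of the MDA rules spelled out in the comment above: xAPIC destinations live in bits 31-24 of the MDA, x2APIC uses the raw 32-bit ID, and a non-IPI 0xff is rewritten to the x2APIC broadcast value only while the quirk is enabled. The demo_* names and constants are illustrative stand-ins for the APIC definitions, not the kernel's code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_APIC_BROADCAST	0xffU
#define DEMO_X2APIC_BROADCAST	0xffffffffU

static uint32_t demo_apic_mda(uint32_t dest_id, bool ipi, bool x2apic_mda,
			      bool broadcast_quirk_disabled)
{
	if (!broadcast_quirk_disabled &&
	    !ipi && dest_id == DEMO_APIC_BROADCAST && x2apic_mda)
		return DEMO_X2APIC_BROADCAST;

	return x2apic_mda ? dest_id : dest_id << 24;	/* xAPIC: bits 31-24 */
}

int main(void)
{
	/* IOAPIC message to 0xff is treated as broadcast while the quirk is on */
	printf("0x%x\n", (unsigned)demo_apic_mda(0xff, false, true, false));
	/* with KVM_CAP_X2APIC_API, 0xff means x2APIC cluster 0, CPUs 0-7 */
	printf("0x%x\n", (unsigned)demo_apic_mda(0xff, false, true, true));
	return 0;
}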
@@@ -619,7 -656,7 +656,7 @@@ bool kvm_apic_match_dest(struct kvm_vcp
                           int short_hand, unsigned int dest, int dest_mode)
  {
        struct kvm_lapic *target = vcpu->arch.apic;
-       u32 mda = kvm_apic_mda(dest, source, target);
+       u32 mda = kvm_apic_mda(vcpu, dest, source, target);
  
        apic_debug("target %p, source %p, dest 0x%x, "
                   "dest_mode 0x%x, short_hand 0x%x\n",
@@@ -671,102 -708,126 +708,126 @@@ static void kvm_apic_disabled_lapic_fou
        }
  }
  
- bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
-               struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+ static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src,
+               struct kvm_lapic_irq *irq, struct kvm_apic_map *map)
  {
-       struct kvm_apic_map *map;
-       unsigned long bitmap = 1;
-       struct kvm_lapic **dst;
-       int i;
-       bool ret, x2apic_ipi;
+       if (kvm->arch.x2apic_broadcast_quirk_disabled) {
+               if ((irq->dest_id == APIC_BROADCAST &&
+                               map->mode != KVM_APIC_MODE_X2APIC))
+                       return true;
+               if (irq->dest_id == X2APIC_BROADCAST)
+                       return true;
+       } else {
+               bool x2apic_ipi = src && *src && apic_x2apic_mode(*src);
+               if (irq->dest_id == (x2apic_ipi ?
+                                    X2APIC_BROADCAST : APIC_BROADCAST))
+                       return true;
+       }
  
-       *r = -1;
+       return false;
+ }
  
-       if (irq->shorthand == APIC_DEST_SELF) {
-               *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
-               return true;
-       }
+ /* Return true if the interrupt can be handled by using *bitmap as index mask
+  * for valid destinations in *dst array.
+  * Return false if kvm_apic_map_get_dest_lapic did nothing useful.
+  * Note: we may have zero kvm_lapic destinations when we return true, which
+  * means that the interrupt should be dropped.  In this case, *bitmap would be
+  * zero and *dst undefined.
+  */
+ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
+               struct kvm_lapic **src, struct kvm_lapic_irq *irq,
+               struct kvm_apic_map *map, struct kvm_lapic ***dst,
+               unsigned long *bitmap)
+ {
+       int i, lowest;
  
-       if (irq->shorthand)
+       if (irq->shorthand == APIC_DEST_SELF && src) {
+               *dst = src;
+               *bitmap = 1;
+               return true;
+       } else if (irq->shorthand)
                return false;
  
-       x2apic_ipi = src && apic_x2apic_mode(src);
-       if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
+       if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map))
                return false;
  
-       ret = true;
-       rcu_read_lock();
-       map = rcu_dereference(kvm->arch.apic_map);
-       if (!map) {
-               ret = false;
-               goto out;
+       if (irq->dest_mode == APIC_DEST_PHYSICAL) {
+               if (irq->dest_id > map->max_apic_id) {
+                       *bitmap = 0;
+               } else {
+                       *dst = &map->phys_map[irq->dest_id];
+                       *bitmap = 1;
+               }
+               return true;
        }
  
-       if (irq->dest_mode == APIC_DEST_PHYSICAL) {
-               if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
-                       goto out;
+       *bitmap = 0;
+       if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst,
+                               (u16 *)bitmap))
+               return false;
  
-               dst = &map->phys_map[irq->dest_id];
-       } else {
-               u16 cid;
+       if (!kvm_lowest_prio_delivery(irq))
+               return true;
  
-               if (!kvm_apic_logical_map_valid(map)) {
-                       ret = false;
-                       goto out;
+       if (!kvm_vector_hashing_enabled()) {
+               lowest = -1;
+               for_each_set_bit(i, bitmap, 16) {
+                       if (!(*dst)[i])
+                               continue;
+                       if (lowest < 0)
+                               lowest = i;
+                       else if (kvm_apic_compare_prio((*dst)[i]->vcpu,
+                                               (*dst)[lowest]->vcpu) < 0)
+                               lowest = i;
                }
+       } else {
+               if (!*bitmap)
+                       return true;
  
-               apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
+               lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap),
+                               bitmap, 16);
  
-               if (cid >= ARRAY_SIZE(map->logical_map))
-                       goto out;
+               if (!(*dst)[lowest]) {
+                       kvm_apic_disabled_lapic_found(kvm);
+                       *bitmap = 0;
+                       return true;
+               }
+       }
  
-               dst = map->logical_map[cid];
+       *bitmap = (lowest >= 0) ? 1 << lowest : 0;
  
-               if (!kvm_lowest_prio_delivery(irq))
-                       goto set_irq;
+       return true;
+ }
  
-               if (!kvm_vector_hashing_enabled()) {
-                       int l = -1;
-                       for_each_set_bit(i, &bitmap, 16) {
-                               if (!dst[i])
-                                       continue;
-                               if (l < 0)
-                                       l = i;
-                               else if (kvm_apic_compare_prio(dst[i]->vcpu,
-                                                       dst[l]->vcpu) < 0)
-                                       l = i;
-                       }
-                       bitmap = (l >= 0) ? 1 << l : 0;
-               } else {
-                       int idx;
-                       unsigned int dest_vcpus;
+ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+               struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+ {
+       struct kvm_apic_map *map;
+       unsigned long bitmap;
+       struct kvm_lapic **dst = NULL;
+       int i;
+       bool ret;
  
-                       dest_vcpus = hweight16(bitmap);
-                       if (dest_vcpus == 0)
-                               goto out;
+       *r = -1;
  
-                       idx = kvm_vector_to_index(irq->vector,
-                               dest_vcpus, &bitmap, 16);
+       if (irq->shorthand == APIC_DEST_SELF) {
+               *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
+               return true;
+       }
  
-                       if (!dst[idx]) {
-                               kvm_apic_disabled_lapic_found(kvm);
-                               goto out;
-                       }
+       rcu_read_lock();
+       map = rcu_dereference(kvm->arch.apic_map);
  
-                       bitmap = (idx >= 0) ? 1 << idx : 0;
+       ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
+       if (ret)
+               for_each_set_bit(i, &bitmap, 16) {
+                       if (!dst[i])
+                               continue;
+                       if (*r < 0)
+                               *r = 0;
+                       *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
                }
-       }
  
- set_irq:
-       for_each_set_bit(i, &bitmap, 16) {
-               if (!dst[i])
-                       continue;
-               if (*r < 0)
-                       *r = 0;
-               *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
-       }
- out:
        rcu_read_unlock();
        return ret;
  }
@@@ -789,8 -850,9 +850,9 @@@ bool kvm_intr_is_single_vcpu_fast(struc
                        struct kvm_vcpu **dest_vcpu)
  {
        struct kvm_apic_map *map;
+       unsigned long bitmap;
+       struct kvm_lapic **dst = NULL;
        bool ret = false;
-       struct kvm_lapic *dst = NULL;
  
        if (irq->shorthand)
                return false;
        rcu_read_lock();
        map = rcu_dereference(kvm->arch.apic_map);
  
-       if (!map)
-               goto out;
-       if (irq->dest_mode == APIC_DEST_PHYSICAL) {
-               if (irq->dest_id == 0xFF)
-                       goto out;
-               if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
-                       goto out;
-               dst = map->phys_map[irq->dest_id];
-               if (dst && kvm_apic_present(dst->vcpu))
-                       *dest_vcpu = dst->vcpu;
-               else
-                       goto out;
-       } else {
-               u16 cid;
-               unsigned long bitmap = 1;
-               int i, r = 0;
-               if (!kvm_apic_logical_map_valid(map))
-                       goto out;
-               apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
-               if (cid >= ARRAY_SIZE(map->logical_map))
-                       goto out;
-               if (kvm_vector_hashing_enabled() &&
-                               kvm_lowest_prio_delivery(irq)) {
-                       int idx;
-                       unsigned int dest_vcpus;
+       if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) &&
+                       hweight16(bitmap) == 1) {
+               unsigned long i = find_first_bit(&bitmap, 16);
  
-                       dest_vcpus = hweight16(bitmap);
-                       if (dest_vcpus == 0)
-                               goto out;
-                       idx = kvm_vector_to_index(irq->vector, dest_vcpus,
-                                                 &bitmap, 16);
-                       dst = map->logical_map[cid][idx];
-                       if (!dst) {
-                               kvm_apic_disabled_lapic_found(kvm);
-                               goto out;
-                       }
-                       *dest_vcpu = dst->vcpu;
-               } else {
-                       for_each_set_bit(i, &bitmap, 16) {
-                               dst = map->logical_map[cid][i];
-                               if (++r == 2)
-                                       goto out;
-                       }
-                       if (dst && kvm_apic_present(dst->vcpu))
-                               *dest_vcpu = dst->vcpu;
-                       else
-                               goto out;
+               if (dst[i]) {
+                       *dest_vcpu = dst[i]->vcpu;
+                       ret = true;
                }
        }
  
-       ret = true;
- out:
        rcu_read_unlock();
        return ret;
  }
@@@ -1127,12 -1136,6 +1136,6 @@@ static u32 __apic_read(struct kvm_lapi
                return 0;
  
        switch (offset) {
-       case APIC_ID:
-               if (apic_x2apic_mode(apic))
-                       val = kvm_apic_id(apic);
-               else
-                       val = kvm_apic_id(apic) << 24;
-               break;
        case APIC_ARBPRI:
                apic_debug("Access APIC ARBPRI register which is for P6\n");
                break;
@@@ -1310,10 -1313,111 +1313,112 @@@ void wait_lapic_expire(struct kvm_vcpu 
  
        /* __delay is delay_tsc whenever the hardware has TSC, thus always.  */
        if (guest_tsc < tsc_deadline)
 -              __delay(tsc_deadline - guest_tsc);
 +              __delay(min(tsc_deadline - guest_tsc,
 +                      nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
  }
  
+ static void start_sw_tscdeadline(struct kvm_lapic *apic)
+ {
+       u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+       u64 ns = 0;
+       ktime_t expire;
+       struct kvm_vcpu *vcpu = apic->vcpu;
+       unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+       unsigned long flags;
+       ktime_t now;
+       if (unlikely(!tscdeadline || !this_tsc_khz))
+               return;
+       local_irq_save(flags);
+       now = apic->lapic_timer.timer.base->get_time();
+       guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+       if (likely(tscdeadline > guest_tsc)) {
+               ns = (tscdeadline - guest_tsc) * 1000000ULL;
+               do_div(ns, this_tsc_khz);
+               expire = ktime_add_ns(now, ns);
+               expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
+               hrtimer_start(&apic->lapic_timer.timer,
+                               expire, HRTIMER_MODE_ABS_PINNED);
+       } else
+               apic_timer_expired(apic);
+       local_irq_restore(flags);
+ }
+ 
+ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
+ {
+       return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
+ }
+ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
+ 
+ static void cancel_hv_tscdeadline(struct kvm_lapic *apic)
+ {
+       kvm_x86_ops->cancel_hv_timer(apic->vcpu);
+       apic->lapic_timer.hv_timer_in_use = false;
+ }
+ 
+ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+ {
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+       WARN_ON(swait_active(&vcpu->wq));
+       cancel_hv_tscdeadline(apic);
+       apic_timer_expired(apic);
+ }
+ EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+ 
+ static bool start_hv_tscdeadline(struct kvm_lapic *apic)
+ {
+       u64 tscdeadline = apic->lapic_timer.tscdeadline;
+       if (atomic_read(&apic->lapic_timer.pending) ||
+               kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
+               if (apic->lapic_timer.hv_timer_in_use)
+                       cancel_hv_tscdeadline(apic);
+       } else {
+               apic->lapic_timer.hv_timer_in_use = true;
+               hrtimer_cancel(&apic->lapic_timer.timer);
+               /* In case the sw timer triggered in the window */
+               if (atomic_read(&apic->lapic_timer.pending))
+                       cancel_hv_tscdeadline(apic);
+       }
+       trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
+                       apic->lapic_timer.hv_timer_in_use);
+       return apic->lapic_timer.hv_timer_in_use;
+ }
+ 
+ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
+ {
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       WARN_ON(apic->lapic_timer.hv_timer_in_use);
+       if (apic_lvtt_tscdeadline(apic))
+               start_hv_tscdeadline(apic);
+ }
+ EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
+ 
+ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
+ {
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       /* Possibly the TSC deadline timer is not enabled yet */
+       if (!apic->lapic_timer.hv_timer_in_use)
+               return;
+       cancel_hv_tscdeadline(apic);
+       if (atomic_read(&apic->lapic_timer.pending))
+               return;
+       start_sw_tscdeadline(apic);
+ }
+ EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
+ 
  static void start_apic_timer(struct kvm_lapic *apic)
  {
        ktime_t now;
                           ktime_to_ns(ktime_add_ns(now,
                                        apic->lapic_timer.period)));
        } else if (apic_lvtt_tscdeadline(apic)) {
-               /* lapic timer in tsc deadline mode */
-               u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
-               u64 ns = 0;
-               ktime_t expire;
-               struct kvm_vcpu *vcpu = apic->vcpu;
-               unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
-               unsigned long flags;
-               if (unlikely(!tscdeadline || !this_tsc_khz))
-                       return;
-               local_irq_save(flags);
-               now = apic->lapic_timer.timer.base->get_time();
-               guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
-               if (likely(tscdeadline > guest_tsc)) {
-                       ns = (tscdeadline - guest_tsc) * 1000000ULL;
-                       do_div(ns, this_tsc_khz);
-                       expire = ktime_add_ns(now, ns);
-                       expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
-                       hrtimer_start(&apic->lapic_timer.timer,
-                                     expire, HRTIMER_MODE_ABS_PINNED);
-               } else
-                       apic_timer_expired(apic);
-               local_irq_restore(flags);
+               if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic)))
+                       start_sw_tscdeadline(apic);
        }
  }
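
When the hardware path above (the VMX preemption timer behind set_hv_timer) is unavailable or fails, start_sw_tscdeadline() falls back to an hrtimer and converts the remaining guest TSC ticks to nanoseconds as delta * 10^6 / tsc_khz. A tiny standalone sketch of that conversion, with made-up sample numbers:

#include <stdint.h>
#include <stdio.h>

/* kHz = 10^3 cycles per 10^-3 s, so ns = delta * 10^6 / kHz */
static uint64_t demo_tsc_delta_to_ns(uint64_t tsc_delta, uint64_t tsc_khz)
{
	return tsc_delta * 1000000ULL / tsc_khz;
}

int main(void)
{
	/* e.g. a 2.4 GHz guest TSC with the deadline 2,400,000 cycles away ~= 1 ms */
	printf("%llu ns\n",
	       (unsigned long long)demo_tsc_delta_to_ns(2400000, 2400000));
	return 0;
}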
  
@@@ -1413,7 -1493,7 +1494,7 @@@ int kvm_lapic_reg_write(struct kvm_lapi
        switch (reg) {
        case APIC_ID:           /* Local APIC ID */
                if (!apic_x2apic_mode(apic))
-                       kvm_apic_set_id(apic, val >> 24);
+                       kvm_apic_set_xapic_id(apic, val >> 24);
                else
                        ret = 1;
                break;
@@@ -1674,9 -1754,10 +1755,10 @@@ void kvm_lapic_set_base(struct kvm_vcp
  
        /* update jump label if enable bit changes */
        if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
-               if (value & MSR_IA32_APICBASE_ENABLE)
+               if (value & MSR_IA32_APICBASE_ENABLE) {
+                       kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
                        static_key_slow_dec_deferred(&apic_hw_disabled);
-               else
+               } else
                        static_key_slow_inc(&apic_hw_disabled.key);
                recalculate_apic_map(vcpu->kvm);
        }
@@@ -1716,8 -1797,11 +1798,11 @@@ void kvm_lapic_reset(struct kvm_vcpu *v
        /* Stop the timer in case it's a reset to an active apic */
        hrtimer_cancel(&apic->lapic_timer.timer);
  
-       if (!init_event)
-               kvm_apic_set_id(apic, vcpu->vcpu_id);
+       if (!init_event) {
+               kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE |
+                                        MSR_IA32_APICBASE_ENABLE);
+               kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+       }
        kvm_apic_set_version(apic->vcpu);
  
        for (i = 0; i < KVM_APIC_LVT_NUM; i++)
@@@ -1856,9 -1940,6 +1941,6 @@@ int kvm_create_lapic(struct kvm_vcpu *v
         * thinking that APIC state has changed.
         */
        vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
-       kvm_lapic_set_base(vcpu,
-                       APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
        static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
        kvm_lapic_reset(vcpu, false);
        kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
@@@ -1938,17 -2019,48 +2020,48 @@@ int kvm_get_apic_interrupt(struct kvm_v
        return vector;
  }
  
- void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
-               struct kvm_lapic_state *s)
+ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
+               struct kvm_lapic_state *s, bool set)
+ {
+       if (apic_x2apic_mode(vcpu->arch.apic)) {
+               u32 *id = (u32 *)(s->regs + APIC_ID);
+               if (vcpu->kvm->arch.x2apic_format) {
+                       if (*id != vcpu->vcpu_id)
+                               return -EINVAL;
+               } else {
+                       if (set)
+                               *id >>= 24;
+                       else
+                               *id <<= 24;
+               }
+       }
+       return 0;
+ }
+ 
+ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+ {
+       memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s));
+       return kvm_apic_state_fixup(vcpu, s, false);
+ }
+ 
+ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
+       int r;
  
        kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
        /* set SPIV separately to get count of SW disabled APICs right */
        apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
+       r = kvm_apic_state_fixup(vcpu, s, true);
+       if (r)
+               return r;
        memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
-       /* call kvm_apic_set_id() to put apic into apic_map */
-       kvm_apic_set_id(apic, kvm_apic_id(apic));
+       recalculate_apic_map(vcpu->kvm);
        kvm_apic_set_version(vcpu);
  
        apic_update_ppr(apic);
                kvm_rtc_eoi_tracking_restore_one(vcpu);
  
        vcpu->arch.apic_arb_prio = 0;
+       return 0;
  }
  
  void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
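
The kvm_apic_state_fixup() helper above exists because, with the legacy KVM_GET_LAPIC/KVM_SET_LAPIC format, an x2APIC vCPU's 32-bit ID is squeezed into bits 31-24 of the APIC_ID register on the way out and shifted back on the way in, which only works for IDs up to 255; the KVM_CAP_X2APIC_API format keeps the full ID and requires it to match vcpu_id. A standalone sketch of the two directions (demo_* names and sample IDs are illustrative only):

#include <stdint.h>
#include <stdio.h>

static uint32_t demo_id_to_user(uint32_t id, int new_format)
{
	return new_format ? id : id << 24;	/* GET direction */
}

static uint32_t demo_id_from_user(uint32_t reg, int new_format)
{
	return new_format ? reg : reg >> 24;	/* SET direction */
}

int main(void)
{
	printf("legacy: id 5   -> reg 0x%x\n", (unsigned)demo_id_to_user(5, 0));
	printf("new:    id 300 -> reg 0x%x -> id %u\n",
	       (unsigned)demo_id_to_user(300, 1),
	       (unsigned)demo_id_from_user(300, 1));
	return 0;
}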
diff --combined arch/x86/kvm/mmu.c
@@@ -29,8 -29,7 +29,8 @@@
  #include <linux/string.h>
  #include <linux/mm.h>
  #include <linux/highmem.h>
 -#include <linux/module.h>
 +#include <linux/moduleparam.h>
 +#include <linux/export.h>
  #include <linux/swap.h>
  #include <linux/hugetlb.h>
  #include <linux/compiler.h>
@@@ -176,6 -175,7 +176,7 @@@ static u64 __read_mostly shadow_user_ma
  static u64 __read_mostly shadow_accessed_mask;
  static u64 __read_mostly shadow_dirty_mask;
  static u64 __read_mostly shadow_mmio_mask;
+ static u64 __read_mostly shadow_present_mask;
  
  static void mmu_spte_set(u64 *sptep, u64 spte);
  static void mmu_free_roots(struct kvm_vcpu *vcpu);
@@@ -283,13 -283,14 +284,14 @@@ static bool check_mmio_spte(struct kvm_
  }
  
  void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask)
+               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask)
  {
        shadow_user_mask = user_mask;
        shadow_accessed_mask = accessed_mask;
        shadow_dirty_mask = dirty_mask;
        shadow_nx_mask = nx_mask;
        shadow_x_mask = x_mask;
+       shadow_present_mask = p_mask;
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
  
@@@ -305,7 -306,7 +307,7 @@@ static int is_nx(struct kvm_vcpu *vcpu
  
  static int is_shadow_present_pte(u64 pte)
  {
-       return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
+       return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte);
  }
  
  static int is_large_pte(u64 pte)
@@@ -524,7 -525,7 +526,7 @@@ static void mmu_spte_set(u64 *sptep, u6
  }
  
  /* Rules for using mmu_spte_update:
-  * Update the state bits, it means the mapped pfn is not changged.
+  * Update the state bits, it means the mapped pfn is not changed.
   *
   * Whenever we overwrite a writable spte with a read-only one we
   * should flush remote TLBs. Otherwise rmap_write_protect
@@@ -2246,10 -2247,9 +2248,9 @@@ static void link_shadow_page(struct kvm
  {
        u64 spte;
  
-       BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
-                       VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+       BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
  
-       spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
+       spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
               shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
  
        mmu_spte_set(sptep, spte);
@@@ -2516,13 -2516,19 +2517,19 @@@ static int set_spte(struct kvm_vcpu *vc
                    gfn_t gfn, kvm_pfn_t pfn, bool speculative,
                    bool can_unsync, bool host_writable)
  {
-       u64 spte;
+       u64 spte = 0;
        int ret = 0;
  
        if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
                return 0;
  
-       spte = PT_PRESENT_MASK;
+       /*
+        * For the EPT case, shadow_present_mask is 0 if hardware
+        * supports exec-only page table entries.  In that case,
+        * ACC_USER_MASK and shadow_user_mask are used to represent
+        * read access.  See FNAME(gpte_access) in paging_tmpl.h.
+        */
+       spte |= shadow_present_mask;
        if (!speculative)
                spte |= shadow_accessed_mask;
  
@@@ -3190,7 -3196,7 +3197,7 @@@ static int mmu_alloc_shadow_roots(struc
                MMU_WARN_ON(VALID_PAGE(root));
                if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
                        pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
-                       if (!is_present_gpte(pdptr)) {
+                       if (!(pdptr & PT_PRESENT_MASK)) {
                                vcpu->arch.mmu.pae_root[i] = 0;
                                continue;
                        }
@@@ -3915,9 -3921,7 +3922,7 @@@ static void update_permission_bitmask(s
                                 *   clearer.
                                 */
                                smap = cr4_smap && u && !uf && !ff;
-                       } else
-                               /* Not really needed: no U/S accesses on ept  */
-                               u = 1;
+                       }
  
                        fault = (ff && !x) || (uf && !u) || (wf && !w) ||
                                (smapf && smap);
diff --combined arch/x86/kvm/svm.c
@@@ -238,9 -238,7 +238,9 @@@ module_param(nested, int, S_IRUGO)
  
  /* enable / disable AVIC */
  static int avic;
 +#ifdef CONFIG_X86_LOCAL_APIC
  module_param(avic, int, S_IRUGO);
 +#endif
  
  static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
  static void svm_flush_tlb(struct kvm_vcpu *vcpu);
@@@ -983,14 -981,11 +983,14 @@@ static __init int svm_hardware_setup(vo
        } else
                kvm_disable_tdp();
  
 -      if (avic && (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)))
 -              avic = false;
 -
 -      if (avic)
 -              pr_info("AVIC enabled\n");
 +      if (avic) {
 +              if (!npt_enabled ||
 +                  !boot_cpu_has(X86_FEATURE_AVIC) ||
 +                  !IS_ENABLED(CONFIG_X86_LOCAL_APIC))
 +                      avic = false;
 +              else
 +                      pr_info("AVIC enabled\n");
 +      }
  
        return 0;
  
@@@ -1329,7 -1324,7 +1329,7 @@@ free_avic
  static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
  {
        u64 entry;
 -      int h_physical_id = __default_cpu_present_to_apicid(vcpu->cpu);
 +      int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu);
        struct vcpu_svm *svm = to_svm(vcpu);
  
        if (!kvm_vcpu_apicv_active(vcpu))
@@@ -1354,7 -1349,7 +1354,7 @@@ static void avic_vcpu_load(struct kvm_v
  {
        u64 entry;
        /* ID = 0xff (broadcast), ID > 0xff (reserved) */
 -      int h_physical_id = __default_cpu_present_to_apicid(cpu);
 +      int h_physical_id = kvm_cpu_get_apicid(cpu);
        struct vcpu_svm *svm = to_svm(vcpu);
  
        if (!kvm_vcpu_apicv_active(vcpu))
@@@ -1577,7 -1572,7 +1577,7 @@@ static unsigned long svm_get_rflags(str
  static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
  {
         /*
-         * Any change of EFLAGS.VM is accompained by a reload of SS
+         * Any change of EFLAGS.VM is accompanied by a reload of SS
          * (caused by either a task switch or an inter-privilege IRET),
          * so we do not need to update the CPL here.
          */
@@@ -4241,7 -4236,7 +4241,7 @@@ static void svm_deliver_avic_intr(struc
  
        if (avic_vcpu_is_running(vcpu))
                wrmsrl(SVM_AVIC_DOORBELL,
 -                     __default_cpu_present_to_apicid(vcpu->cpu));
 +                     kvm_cpu_get_apicid(vcpu->cpu));
        else
                kvm_vcpu_wake_up(vcpu);
  }
  static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
  {
        local_irq_enable();
+       /*
+        * We must have an instruction with interrupts enabled, so
+        * the timer interrupt isn't delayed by the interrupt shadow.
+        */
+       asm("nop");
+       local_irq_disable();
  }
  
  static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
diff --combined arch/x86/kvm/vmx.c
@@@ -110,6 -110,13 +110,13 @@@ module_param_named(pml, enable_pml, boo
  
  #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
  
+ /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
+ static int __read_mostly cpu_preemption_timer_multi;
+ static bool __read_mostly enable_preemption_timer = 1;
+ #ifdef CONFIG_X86_64
+ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
+ #endif
+ 
  #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
  #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
  #define KVM_VM_CR0_ALWAYS_ON                                          \
@@@ -398,6 -405,12 +405,12 @@@ struct nested_vmx 
        /* The host-usable pointer to the above */
        struct page *current_vmcs12_page;
        struct vmcs12 *current_vmcs12;
+       /*
+        * Cache of the guest's VMCS, existing outside of guest memory.
+        * Loaded from guest memory during VMPTRLD. Flushed to guest
+        * memory during VMXOFF, VMCLEAR, VMPTRLD.
+        */
+       struct vmcs12 *cached_vmcs12;
        struct vmcs *current_shadow_vmcs;
        /*
         * Indicates if the shadow vmcs must be updated with the
        struct pi_desc *pi_desc;
        bool pi_pending;
        u16 posted_intr_nv;
-       u64 msr_ia32_feature_control;
  
        struct hrtimer preemption_timer;
        bool preemption_timer_expired;
@@@ -597,11 -609,22 +609,22 @@@ struct vcpu_vmx 
  #define PML_ENTITY_NUM                512
        struct page *pml_pg;
  
+       /* apic deadline value in host tsc */
+       u64 hv_deadline_tsc;
        u64 current_tsc_ratio;
  
        bool guest_pkru_valid;
        u32 guest_pkru;
        u32 host_pkru;
+       /*
+        * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
+        * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
+        * in msr_ia32_feature_control_valid_bits.
+        */
+       u64 msr_ia32_feature_control;
+       u64 msr_ia32_feature_control_valid_bits;
  };
  
  enum segment_cache_field {
@@@ -841,7 -864,7 +864,7 @@@ static inline short vmcs_field_to_offse
  
  static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
  {
-       return to_vmx(vcpu)->nested.current_vmcs12;
+       return to_vmx(vcpu)->nested.cached_vmcs12;
  }
  
  static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
@@@ -1056,6 -1079,58 +1079,58 @@@ static inline bool cpu_has_vmx_virtual_
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
  }
  
+ /*
+  * Comment's format: document - errata name - stepping - processor name.
+  * Taken from
+  * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
+  */
+ static u32 vmx_preemption_cpu_tfms[] = {
+ /* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
+ 0x000206E6,
+ /* 323056.pdf - AAX65  - C2 - Xeon L3406 */
+ /* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
+ /* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
+ 0x00020652,
+ /* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
+ 0x00020655,
+ /* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
+ /* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
+ /*
+  * 320767.pdf - AAP86  - B1 -
+  * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
+  */
+ 0x000106E5,
+ /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
+ 0x000106A0,
+ /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
+ 0x000106A1,
+ /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
+ 0x000106A4,
+  /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
+  /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
+  /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
+ 0x000106A5,
+ };
+ 
+ static inline bool cpu_has_broken_vmx_preemption_timer(void)
+ {
+       u32 eax = cpuid_eax(0x00000001), i;
+       /* Clear the reserved bits */
+       eax &= ~(0x3U << 14 | 0xfU << 28);
+       for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
+               if (eax == vmx_preemption_cpu_tfms[i])
+                       return true;
+       return false;
+ }
+ 
+ static inline bool cpu_has_vmx_preemption_timer(void)
+ {
+       return vmcs_config.pin_based_exec_ctrl &
+               PIN_BASED_VMX_PREEMPTION_TIMER;
+ }
+ 
  static inline bool cpu_has_vmx_posted_intr(void)
  {
        return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
@@@ -1603,6 -1678,11 +1678,11 @@@ static __always_inline void vmcs_set_bi
        __vmcs_writel(field, __vmcs_readl(field) | mask);
  }
  
+ static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
+ {
+       vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
+ }
+ 
  static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
  {
        vmcs_write32(VM_ENTRY_CONTROLS, val);
@@@ -1631,6 -1711,11 +1711,11 @@@ static inline void vm_entry_controls_cl
        vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
  }
  
+ static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
+ {
+       vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
+ }
+ 
  static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
  {
        vmcs_write32(VM_EXIT_CONTROLS, val);
@@@ -2072,8 -2157,7 +2157,8 @@@ static void vmx_vcpu_pi_load(struct kvm
        unsigned int dest;
  
        if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
 -              !irq_remapping_cap(IRQ_POSTING_CAP))
 +              !irq_remapping_cap(IRQ_POSTING_CAP)  ||
 +              !kvm_vcpu_apicv_active(vcpu))
                return;
  
        do {
@@@ -2121,22 -2205,14 +2206,14 @@@ static void vmx_vcpu_load(struct kvm_vc
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+       bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
  
        if (!vmm_exclusive)
                kvm_cpu_vmxon(phys_addr);
-       else if (vmx->loaded_vmcs->cpu != cpu)
+       else if (!already_loaded)
                loaded_vmcs_clear(vmx->loaded_vmcs);
  
-       if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
-               per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
-               vmcs_load(vmx->loaded_vmcs->vmcs);
-       }
-       if (vmx->loaded_vmcs->cpu != cpu) {
-               struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
-               unsigned long sysenter_esp;
-               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+       if (!already_loaded) {
                local_irq_disable();
                crash_disable_local_vmclear(cpu);
  
                         &per_cpu(loaded_vmcss_on_cpu, cpu));
                crash_enable_local_vmclear(cpu);
                local_irq_enable();
+       }
+       if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+               per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+               vmcs_load(vmx->loaded_vmcs->vmcs);
+       }
+       if (!already_loaded) {
+               struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
+               unsigned long sysenter_esp;
+               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  
                /*
                 * Linux uses per-cpu TSS and GDT, so set these when switching
@@@ -2181,8 -2269,7 +2270,8 @@@ static void vmx_vcpu_pi_put(struct kvm_
        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
  
        if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
 -              !irq_remapping_cap(IRQ_POSTING_CAP))
 +              !irq_remapping_cap(IRQ_POSTING_CAP)  ||
 +              !kvm_vcpu_apicv_active(vcpu))
                return;
  
        /* Set SN when the vCPU is preempted */
@@@ -2707,7 -2794,8 +2796,7 @@@ static void nested_vmx_setup_ctls_msrs(
                SECONDARY_EXEC_APIC_REGISTER_VIRT |
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                SECONDARY_EXEC_WBINVD_EXITING |
 -              SECONDARY_EXEC_XSAVES |
 -              SECONDARY_EXEC_PCOMMIT;
 +              SECONDARY_EXEC_XSAVES;
  
        if (enable_ept) {
                /* nested EPT: emulate EPT also to L1 */
                vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
                         VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
                         VMX_EPT_INVEPT_BIT;
+               if (cpu_has_vmx_ept_execute_only())
+                       vmx->nested.nested_vmx_ept_caps |=
+                               VMX_EPT_EXECUTE_ONLY_BIT;
                vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
                /*
                 * For nested guests, we don't do anything specific
@@@ -2864,6 -2955,14 +2956,14 @@@ static int vmx_get_vmx_msr(struct kvm_v
        return 0;
  }
  
+ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
+                                                uint64_t val)
+ {
+       uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
+       return !(val & ~valid_bits);
+ }
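
The helper above reduces the IA32_FEATURE_CONTROL write check to a single mask
test: a value is acceptable only if it sets no bits outside the per-vCPU
msr_ia32_feature_control_valid_bits. A minimal userspace sketch of the same
test; the bit positions are assumptions taken from the architectural MSR
layout rather than from kernel headers.

#include <stdint.h>
#include <stdio.h>

/* Assumed IA32_FEATURE_CONTROL bit positions (architectural). */
#define FEATURE_CONTROL_LOCKED                     (1ULL << 0)
#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX  (1ULL << 2)
#define FEATURE_CONTROL_LMCE                       (1ULL << 20)

/* Same shape as vmx_feature_control_msr_valid() above. */
static int feature_control_valid(uint64_t val, uint64_t valid_bits)
{
        return !(val & ~valid_bits);
}

int main(void)
{
        uint64_t valid = FEATURE_CONTROL_LOCKED |
                         FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;

        printf("%d\n", feature_control_valid(FEATURE_CONTROL_LOCKED, valid)); /* 1 */
        printf("%d\n", feature_control_valid(FEATURE_CONTROL_LMCE, valid));   /* 0 */
        return 0;
}
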
  /*
   * Reads an msr value (of 'msr_index') into 'pdata'.
   * Returns 0 on success, non-0 otherwise.
@@@ -2905,10 -3004,15 +3005,15 @@@ static int vmx_get_msr(struct kvm_vcpu 
                        return 1;
                msr_info->data = vmcs_read64(GUEST_BNDCFGS);
                break;
-       case MSR_IA32_FEATURE_CONTROL:
-               if (!nested_vmx_allowed(vcpu))
+       case MSR_IA32_MCG_EXT_CTL:
+               if (!msr_info->host_initiated &&
+                   !(to_vmx(vcpu)->msr_ia32_feature_control &
+                     FEATURE_CONTROL_LMCE))
                        return 1;
-               msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+               msr_info->data = vcpu->arch.mcg_ext_ctl;
+               break;
+       case MSR_IA32_FEATURE_CONTROL:
+               msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
                break;
        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                if (!nested_vmx_allowed(vcpu))
@@@ -2998,12 -3102,20 +3103,20 @@@ static int vmx_set_msr(struct kvm_vcpu 
        case MSR_IA32_TSC_ADJUST:
                ret = kvm_set_msr_common(vcpu, msr_info);
                break;
+       case MSR_IA32_MCG_EXT_CTL:
+               if ((!msr_info->host_initiated &&
+                    !(to_vmx(vcpu)->msr_ia32_feature_control &
+                      FEATURE_CONTROL_LMCE)) ||
+                   (data & ~MCG_EXT_CTL_LMCE_EN))
+                       return 1;
+               vcpu->arch.mcg_ext_ctl = data;
+               break;
        case MSR_IA32_FEATURE_CONTROL:
-               if (!nested_vmx_allowed(vcpu) ||
-                   (to_vmx(vcpu)->nested.msr_ia32_feature_control &
+               if (!vmx_feature_control_msr_valid(vcpu, data) ||
+                   (to_vmx(vcpu)->msr_ia32_feature_control &
                     FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
                        return 1;
-               vmx->nested.msr_ia32_feature_control = data;
+               vmx->msr_ia32_feature_control = data;
                if (msr_info->host_initiated && data == 0)
                        vmx_leave_nested(vcpu);
                break;
@@@ -3269,6 -3381,7 +3382,6 @@@ static __init int setup_vmcs_config(str
                        SECONDARY_EXEC_SHADOW_VMCS |
                        SECONDARY_EXEC_XSAVES |
                        SECONDARY_EXEC_ENABLE_PML |
 -                      SECONDARY_EXEC_PCOMMIT |
                        SECONDARY_EXEC_TSC_SCALING;
                if (adjust_vmx_controls(min2, opt2,
                                        MSR_IA32_VMX_PROCBASED_CTLS2,
                      vmx_capability.ept, vmx_capability.vpid);
        }
  
-       min = VM_EXIT_SAVE_DEBUG_CONTROLS;
+       min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
  #ifdef CONFIG_X86_64
        min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
  #endif
        opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
-               VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;
+               VM_EXIT_CLEAR_BNDCFGS;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
                                &_vmexit_control) < 0)
                return -EIO;
  
        min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-       opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+       opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+                PIN_BASED_VMX_PREEMPTION_TIMER;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
                                &_pin_based_exec_control) < 0)
                return -EIO;
  
+       if (cpu_has_broken_vmx_preemption_timer())
+               _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
        if (!(_cpu_based_2nd_exec_control &
-               SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
-               !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
+               SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
                _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
  
        min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
  
        /*
         * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
-        * but due to arrata below it can't be used. Workaround is to use
+        * but due to errata below it can't be used. Workaround is to use
         * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
         *
         * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
@@@ -4781,6 -4896,8 +4896,8 @@@ static u32 vmx_pin_based_exec_ctrl(stru
  
        if (!kvm_vcpu_apicv_active(&vmx->vcpu))
                pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+       /* Enable the preemption timer dynamically */
+       pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
        return pin_based_exec_ctrl;
  }
  
@@@ -4856,6 -4973,9 +4973,6 @@@ static u32 vmx_secondary_exec_control(s
        if (!enable_pml)
                exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
  
 -      /* Currently, we allow L1 guest to directly run pcommit instruction. */
 -      exec_control &= ~SECONDARY_EXEC_PCOMMIT;
 -
        return exec_control;
  }
  
@@@ -4896,13 -5016,13 +5013,14 @@@ static int vmx_vcpu_setup(struct vcpu_v
  
        /* Control */
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+       vmx->hv_deadline_tsc = -1;
  
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
  
 -      if (cpu_has_secondary_exec_ctrls())
 +      if (cpu_has_secondary_exec_ctrls()) {
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
                                vmx_secondary_exec_control(vmx));
 +      }
  
        if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
                vmcs_write64(EOI_EXIT_BITMAP0, 0);
        if (vmx_xsaves_supported())
                vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
  
 +      if (enable_pml) {
 +              ASSERT(vmx->pml_pg);
 +              vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
 +              vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 +      }
 +
        return 0;
  }
  
@@@ -6016,12 -6130,14 +6134,14 @@@ static int handle_ept_violation(struct 
        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
        trace_kvm_page_fault(gpa, exit_qualification);
  
-       /* It is a write fault? */
-       error_code = exit_qualification & PFERR_WRITE_MASK;
+       /* Is it a read fault? */
+       error_code = (exit_qualification << 2) & PFERR_USER_MASK;
+       /* Is it a write fault? */
+       error_code |= exit_qualification & PFERR_WRITE_MASK;
        /* Is it a fetch fault? */
        error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK;
        /* Is the EPT paging-structure entry present? */
-       error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK;
+       error_code |= (exit_qualification & 0x38) != 0;
  
        vcpu->arch.exit_qualification = exit_qualification;
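
The decode above maps the EPT-violation exit qualification onto x86 page-fault
error-code bits: bit 0 (read access) is shifted onto PFERR_USER_MASK, which is
reused here to carry "read", bit 1 (write) maps straight onto PFERR_WRITE_MASK,
bit 2 (fetch) becomes PFERR_FETCH_MASK, and any of bits 5:3 being set is
treated as "the EPT entry was present". A standalone sketch of that bit
shuffling; the PFERR_* values are assumed from the usual error-code layout
rather than copied from KVM's headers.

#include <stdint.h>
#include <stdio.h>

#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK   (1U << 1)
#define PFERR_USER_MASK    (1U << 2)   /* reused to mean "read access" */
#define PFERR_FETCH_MASK   (1U << 4)

static uint32_t decode_ept_violation(uint64_t qual)
{
        uint32_t error_code;

        error_code  = (qual << 2) & PFERR_USER_MASK;    /* read access */
        error_code |=  qual       & PFERR_WRITE_MASK;   /* write access */
        error_code |= (qual << 2) & PFERR_FETCH_MASK;   /* instruction fetch */
        error_code |= (qual & 0x38) != 0;  /* any RWX bit -> PFERR_PRESENT_MASK */
        return error_code;
}

int main(void)
{
        /* Write to a readable+writable page: bit 1 (write) plus bits 3-4. */
        printf("%#x\n", decode_ept_violation(0x1a));    /* prints 0x3 */
        return 0;
}
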
  
@@@ -6355,9 -6471,6 +6475,6 @@@ static __init int hardware_setup(void
        for (msr = 0x800; msr <= 0x8ff; msr++)
                vmx_disable_intercept_msr_read_x2apic(msr);
  
-       /* According SDM, in x2apic mode, the whole id reg is used.  But in
-        * KVM, it only use the highest eight bits. Need to intercept it */
-       vmx_enable_intercept_msr_read_x2apic(0x802);
        /* TMCCT */
        vmx_enable_intercept_msr_read_x2apic(0x839);
        /* TPR */
        vmx_disable_intercept_msr_write_x2apic(0x83f);
  
        if (enable_ept) {
-               kvm_mmu_set_mask_ptes(0ull,
+               kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
                        (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
                        (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
-                       0ull, VMX_EPT_EXECUTABLE_MASK);
+                       0ull, VMX_EPT_EXECUTABLE_MASK,
+                       cpu_has_vmx_ept_execute_only() ?
+                                     0ull : VMX_EPT_READABLE_MASK);
                ept_set_mmio_spte_mask();
                kvm_enable_tdp();
        } else
                kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
        }
  
+       if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
+               u64 vmx_msr;
+               rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+               cpu_preemption_timer_multi =
+                        vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+       } else {
+               kvm_x86_ops->set_hv_timer = NULL;
+               kvm_x86_ops->cancel_hv_timer = NULL;
+       }
        kvm_set_posted_intr_wakeup_handler(wakeup_handler);
  
+       kvm_mce_cap_supported |= MCG_LMCE_P;
        return alloc_kvm_area();
  
  out8:
@@@ -6673,13 -6801,7 +6805,13 @@@ static int get_vmx_mem_address(struct k
  
        /* Checks for #GP/#SS exceptions. */
        exn = false;
 -      if (is_protmode(vcpu)) {
 +      if (is_long_mode(vcpu)) {
 +              /* Long mode: #GP(0)/#SS(0) if the memory address is in a
 +               * non-canonical form. This is the only check on the memory
 +               * destination for long mode!
 +               */
 +              exn = is_noncanonical_address(*ret);
 +      } else if (is_protmode(vcpu)) {
                /* Protected mode: apply checks for segment validity in the
                 * following order:
                 * - segment type check (#GP(0) may be thrown)
                         * execute-only code segment
                         */
                        exn = ((s.type & 0xa) == 8);
 -      }
 -      if (exn) {
 -              kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 -              return 1;
 -      }
 -      if (is_long_mode(vcpu)) {
 -              /* Long mode: #GP(0)/#SS(0) if the memory address is in a
 -               * non-canonical form. This is an only check for long mode.
 -               */
 -              exn = is_noncanonical_address(*ret);
 -      } else if (is_protmode(vcpu)) {
 +              if (exn) {
 +                      kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 +                      return 1;
 +              }
                /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
                 */
                exn = (s.unusable != 0);
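
With the reordering above, the canonical-address test is the first and only
memory-operand check applied in long mode, and #GP(0)/#SS(0) is raised as soon
as it fails. A canonical 64-bit address is one whose bits 63:47 all equal bit
47; a small sketch of such a test, hard-coding 48 implemented virtual-address
bits (an assumption; the real is_noncanonical_address() helper lives elsewhere
in KVM).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Canonical for 48 VA bits: the top 17 bits are all zero or all one,
 * i.e. bits 63:47 are a sign-extension of bit 47. */
static bool noncanonical_48(uint64_t la)
{
        uint64_t top = la >> 47;

        return top != 0 && top != 0x1ffff;
}

int main(void)
{
        printf("%d\n", noncanonical_48(0x00007fffffffffffULL)); /* 0 */
        printf("%d\n", noncanonical_48(0xffff800000000000ULL)); /* 0 */
        printf("%d\n", noncanonical_48(0x0000800000000000ULL)); /* 1 */
        return 0;
}
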
@@@ -6862,16 -6991,22 +6994,22 @@@ static int handle_vmon(struct kvm_vcpu 
                return 1;
        }
  
-       if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
+       if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
                        != VMXON_NEEDED_FEATURES) {
                kvm_inject_gp(vcpu, 0);
                return 1;
        }
  
+       vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+       if (!vmx->nested.cached_vmcs12)
+               return -ENOMEM;
        if (enable_shadow_vmcs) {
                shadow_vmcs = alloc_vmcs();
-               if (!shadow_vmcs)
+               if (!shadow_vmcs) {
+                       kfree(vmx->nested.cached_vmcs12);
                        return -ENOMEM;
+               }
                /* mark vmcs as shadow */
                shadow_vmcs->revision_id |= (1u << 31);
                /* init shadow vmcs */
@@@ -6942,6 -7077,11 +7080,11 @@@ static inline void nested_release_vmcs1
                vmcs_write64(VMCS_LINK_POINTER, -1ull);
        }
        vmx->nested.posted_intr_nv = -1;
+       /* Flush VMCS12 to guest memory */
+       memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12,
+              VMCS12_SIZE);
        kunmap(vmx->nested.current_vmcs12_page);
        nested_release_page(vmx->nested.current_vmcs12_page);
        vmx->nested.current_vmptr = -1ull;
@@@ -6962,6 -7102,7 +7105,7 @@@ static void free_nested(struct vcpu_vm
        nested_release_vmcs12(vmx);
        if (enable_shadow_vmcs)
                free_vmcs(vmx->nested.current_shadow_vmcs);
+       kfree(vmx->nested.cached_vmcs12);
        /* Unpin physical memory we referred to in current vmcs02 */
        if (vmx->nested.apic_access_page) {
                nested_release_page(vmx->nested.apic_access_page);
@@@ -7365,6 -7506,13 +7509,13 @@@ static int handle_vmptrld(struct kvm_vc
                vmx->nested.current_vmptr = vmptr;
                vmx->nested.current_vmcs12 = new_vmcs12;
                vmx->nested.current_vmcs12_page = page;
+               /*
+                * Load VMCS12 from guest memory since it is not already
+                * cached.
+                */
+               memcpy(vmx->nested.cached_vmcs12,
+                      vmx->nested.current_vmcs12, VMCS12_SIZE);
                if (enable_shadow_vmcs) {
                        vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
                                      SECONDARY_EXEC_SHADOW_VMCS);
@@@ -7560,6 -7708,19 +7711,12 @@@ static int handle_pml_full(struct kvm_v
        return 1;
  }
  
 -static int handle_pcommit(struct kvm_vcpu *vcpu)
 -{
 -      /* we never catch pcommit instruct for L1 guest. */
 -      WARN_ON(1);
 -      return 1;
 -}
 -
+ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+ {
+       kvm_lapic_expired_hv_timer(vcpu);
+       return 1;
+ }
  /*
   * The exit handlers return 1 if the exit was handled fully and guest execution
   * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@@ -7610,6 -7771,8 +7767,7 @@@ static int (*const kvm_vmx_exit_handler
        [EXIT_REASON_XSAVES]                  = handle_xsaves,
        [EXIT_REASON_XRSTORS]                 = handle_xrstors,
        [EXIT_REASON_PML_FULL]                = handle_pml_full,
 -      [EXIT_REASON_PCOMMIT]                 = handle_pcommit,
+       [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
  };
  
  static const int kvm_vmx_max_exit_handlers =
@@@ -7918,6 -8081,10 +8076,8 @@@ static bool nested_vmx_exit_handled(str
                 * the XSS exit bitmap in vmcs12.
                 */
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
 -      case EXIT_REASON_PCOMMIT:
 -              return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
+       case EXIT_REASON_PREEMPTION_TIMER:
+               return false;
        default:
                return true;
        }
@@@ -7929,6 -8096,22 +8089,6 @@@ static void vmx_get_exit_info(struct kv
        *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
  }
  
 -static int vmx_create_pml_buffer(struct vcpu_vmx *vmx)
 -{
 -      struct page *pml_pg;
 -
 -      pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
 -      if (!pml_pg)
 -              return -ENOMEM;
 -
 -      vmx->pml_pg = pml_pg;
 -
 -      vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
 -      vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 -
 -      return 0;
 -}
 -
  static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
  {
        if (vmx->pml_pg) {
@@@ -8200,7 -8383,6 +8360,7 @@@ static int vmx_handle_exit(struct kvm_v
        if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
                        (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
                        exit_reason != EXIT_REASON_EPT_VIOLATION &&
 +                      exit_reason != EXIT_REASON_PML_FULL &&
                        exit_reason != EXIT_REASON_TASK_SWITCH)) {
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
@@@ -8303,7 -8485,7 +8463,7 @@@ static void vmx_set_apic_access_page_ad
         * the next L2->L1 exit.
         */
        if (!is_guest_mode(vcpu) ||
-           !nested_cpu_has2(vmx->nested.current_vmcs12,
+           !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
                             SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
                vmcs_write64(APIC_ACCESS_ADDR, hpa);
  }
@@@ -8436,7 -8618,6 +8596,6 @@@ static void vmx_handle_external_intr(st
                        "push %[sp]\n\t"
  #endif
                        "pushf\n\t"
-                       "orl $0x200, (%%" _ASM_SP ")\n\t"
                        __ASM_SIZE(push) " $%c[cs]\n\t"
                        "call *%[entry]\n\t"
                        :
                        [ss]"i"(__KERNEL_DS),
                        [cs]"i"(__KERNEL_CS)
                        );
-       } else
-               local_irq_enable();
+       }
  }
  
  static bool vmx_has_high_real_mode_segbase(void)
@@@ -8601,6 -8781,26 +8759,26 @@@ static void atomic_switch_perf_msrs(str
                                        msrs[i].host);
  }
  
+ void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 tscl;
+       u32 delta_tsc;
+       if (vmx->hv_deadline_tsc == -1)
+               return;
+       tscl = rdtsc();
+       if (vmx->hv_deadline_tsc > tscl)
+       /* Guaranteed to fit in 32 bits; checked in vmx_set_hv_timer() */
+               delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+                       cpu_preemption_timer_multi);
+       else
+               delta_tsc = 0;
+       vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+ }
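
vmx_arm_hv_timer() above converts the absolute host-TSC deadline into what the
VMX preemption timer actually counts: the remaining TSC delta shifted right by
the rate read from IA32_VMX_MISC[4:0] (cpu_preemption_timer_multi). A hedged
sketch of that conversion; the rate value of 5 used in the example is purely
illustrative, real hardware reports its own.

#include <stdint.h>
#include <stdio.h>

/* The preemption timer ticks once every 2^rate TSC cycles. */
static uint32_t preemption_timer_value(uint64_t deadline_tsc, uint64_t now_tsc,
                                       unsigned int rate)
{
        if (deadline_tsc <= now_tsc)
                return 0;       /* deadline already passed: expire at once */
        /* The caller guarantees the shifted delta fits in 32 bits. */
        return (uint32_t)((deadline_tsc - now_tsc) >> rate);
}

int main(void)
{
        /* 3,200,000 TSC cycles away at rate 5 -> 100,000 timer ticks. */
        printf("%u\n", preemption_timer_value(13200000, 10000000, 5));
        return 0;
}
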
  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        atomic_switch_perf_msrs(vmx);
        debugctlmsr = get_debugctlmsr();
  
+       vmx_arm_hv_timer(vcpu);
        vmx->__launched = vmx->loaded_vmcs->launched;
        asm(
                /* Store host registers */
@@@ -8831,22 -9033,6 +9011,22 @@@ static void vmx_load_vmcs01(struct kvm_
        put_cpu();
  }
  
 +/*
 + * Ensure that the current vmcs of the logical processor is the
 + * vmcs01 of the vcpu before calling free_nested().
 + */
 +static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
 +{
 +       struct vcpu_vmx *vmx = to_vmx(vcpu);
 +       int r;
 +
 +       r = vcpu_load(vcpu);
 +       BUG_ON(r);
 +       vmx_load_vmcs01(vcpu);
 +       free_nested(vmx);
 +       vcpu_put(vcpu);
 +}
 +
  static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
                vmx_destroy_pml_buffer(vmx);
        free_vpid(vmx->vpid);
        leave_guest_mode(vcpu);
 -      vmx_load_vmcs01(vcpu);
 -      free_nested(vmx);
 +      vmx_free_vcpu_nested(vcpu);
        free_loaded_vmcs(vmx->loaded_vmcs);
        kfree(vmx->guest_msrs);
        kvm_vcpu_uninit(vcpu);
@@@ -8877,26 -9064,14 +9057,26 @@@ static struct kvm_vcpu *vmx_create_vcpu
        if (err)
                goto free_vcpu;
  
 +      err = -ENOMEM;
 +
 +      /*
 +       * If PML is turned on, a failure to enable PML just results in a
 +       * failure to create the vcpu, so we can simplify the PML logic (by
 +       * avoiding cases such as enabling PML on only some of the guest's
 +       * vcpus, etc.).
 +       */
 +      if (enable_pml) {
 +              vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
 +              if (!vmx->pml_pg)
 +                      goto uninit_vcpu;
 +      }
 +
        vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
        BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
                     > PAGE_SIZE);
  
 -      err = -ENOMEM;
 -      if (!vmx->guest_msrs) {
 -              goto uninit_vcpu;
 -      }
 +      if (!vmx->guest_msrs)
 +              goto free_pml;
  
        vmx->loaded_vmcs = &vmx->vmcs01;
        vmx->loaded_vmcs->vmcs = alloc_vmcs();
        vmx->nested.current_vmptr = -1ull;
        vmx->nested.current_vmcs12 = NULL;
  
 -      /*
 -       * If PML is turned on, failure on enabling PML just results in failure
 -       * of creating the vcpu, therefore we can simplify PML logic (by
 -       * avoiding dealing with cases, such as enabling PML partially on vcpus
 -       * for the guest, etc.
 -       */
 -      if (enable_pml) {
 -              err = vmx_create_pml_buffer(vmx);
 -              if (err)
 -                      goto free_vmcs;
 -      }
 -
+       vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
        return &vmx->vcpu;
  
  free_vmcs:
        free_loaded_vmcs(vmx->loaded_vmcs);
  free_msrs:
        kfree(vmx->guest_msrs);
 +free_pml:
 +      vmx_destroy_pml_buffer(vmx);
  uninit_vcpu:
        kvm_vcpu_uninit(&vmx->vcpu);
  free_vcpu:
@@@ -9080,6 -9267,22 +9262,13 @@@ static void vmx_cpuid_update(struct kvm
  
        if (cpu_has_secondary_exec_ctrls())
                vmcs_set_secondary_exec_control(secondary_exec_ctl);
 -      if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) {
 -              if (guest_cpuid_has_pcommit(vcpu))
 -                      vmx->nested.nested_vmx_secondary_ctls_high |=
 -                              SECONDARY_EXEC_PCOMMIT;
 -              else
 -                      vmx->nested.nested_vmx_secondary_ctls_high &=
 -                              ~SECONDARY_EXEC_PCOMMIT;
 -      }
 -
+       if (nested_vmx_allowed(vcpu))
+               to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+                       FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+       else
+               to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+                       ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
  }
  
  static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@@ -9636,9 -9839,14 +9825,14 @@@ static void prepare_vmcs02(struct kvm_v
        vmcs_write64(VMCS_LINK_POINTER, -1ull);
  
        exec_control = vmcs12->pin_based_vm_exec_control;
-       exec_control |= vmcs_config.pin_based_exec_ctrl;
+       /* Preemption timer setting is only taken from vmcs01.  */
        exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       exec_control |= vmcs_config.pin_based_exec_ctrl;
+       if (vmx->hv_deadline_tsc == -1)
+               exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
  
+       /* Posted interrupts setting is only taken from vmcs12.  */
        if (nested_cpu_has_posted_intr(vmcs12)) {
                /*
                 * Note that we use L0's vector here and in
                exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                                  SECONDARY_EXEC_RDTSCP |
                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 -                                SECONDARY_EXEC_APIC_REGISTER_VIRT |
 -                                SECONDARY_EXEC_PCOMMIT);
 +                                SECONDARY_EXEC_APIC_REGISTER_VIRT);
                if (nested_cpu_has(vmcs12,
                                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
                        exec_control |= vmcs12->secondary_vm_exec_control;
@@@ -10556,8 -10765,8 +10750,8 @@@ static void nested_vmx_vmexit(struct kv
                                       vmcs12->vm_exit_intr_error_code,
                                       KVM_ISA_VMX);
  
-       vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
-       vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
+       vm_entry_controls_reset_shadow(vmx);
+       vm_exit_controls_reset_shadow(vmx);
        vmx_segment_cache_clear(vmx);
  
        /* if no vmcs02 cache requested, remove the one we used */
  
        load_vmcs12_host_state(vcpu, vmcs12);
  
-       /* Update TSC_OFFSET if TSC was changed while L2 ran */
+       /* Update any VMCS fields that might have changed while L2 ran */
        vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+       if (vmx->hv_deadline_tsc == -1)
+               vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+                               PIN_BASED_VMX_PREEMPTION_TIMER);
+       else
+               vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+                             PIN_BASED_VMX_PREEMPTION_TIMER);
  
        /* This is needed for same reason as it was needed in prepare_vmcs02 */
        vmx->host_rsp = 0;
@@@ -10647,6 -10862,64 +10847,64 @@@ static int vmx_check_intercept(struct k
        return X86EMUL_CONTINUE;
  }
  
+ #ifdef CONFIG_X86_64
+ /* (a << shift) / divisor, return 1 if overflow otherwise 0 */
+ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
+                                 u64 divisor, u64 *result)
+ {
+       u64 low = a << shift, high = a >> (64 - shift);
+       /* Avoid overflowing divq: the quotient must fit in 64 bits */
+       if (high >= divisor)
+               return 1;
+       /* low holds the quotient, high holds the remainder (discarded) */
+       asm("divq %2\n\t" : "=a" (low), "=d" (high) :
+           "rm" (divisor), "0" (low), "1" (high));
+       *result = low;
+       return 0;
+ }
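
u64_shl_div_u64() above divides a 128-bit intermediate (a << shift) by a
64-bit divisor using divq on the high:low pair, and the high >= divisor
bail-out is precisely the condition under which divq would raise a divide
error because the quotient would not fit in 64 bits. A portable cross-check
using GCC/Clang's unsigned __int128 (an assumption: a compiler with 128-bit
integer support).

#include <stdint.h>
#include <stdio.h>

/* Same contract as u64_shl_div_u64(): 1 on overflow, else 0 with the
 * quotient stored in *result. */
static int shl_div_u64(uint64_t a, unsigned int shift, uint64_t divisor,
                       uint64_t *result)
{
        unsigned __int128 dividend = (unsigned __int128)a << shift;

        if ((uint64_t)(dividend >> 64) >= divisor)
                return 1;       /* quotient would not fit in 64 bits */
        *result = (uint64_t)(dividend / divisor);
        return 0;
}

int main(void)
{
        uint64_t q;

        if (!shl_div_u64(3, 48, 1000, &q))
                printf("(3 << 48) / 1000 = %llu\n", (unsigned long long)q);
        printf("overflow: %d\n", shl_div_u64(1, 64, 1, &q)); /* prints 1 */
        return 0;
}
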
+ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 tscl = rdtsc();
+       u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+       u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+       /* Convert to host delta tsc if tsc scaling is enabled */
+       if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+                       u64_shl_div_u64(delta_tsc,
+                               kvm_tsc_scaling_ratio_frac_bits,
+                               vcpu->arch.tsc_scaling_ratio,
+                               &delta_tsc))
+               return -ERANGE;
+       /*
+        * If the delta tsc doesn't fit in 32 bits after the preemption timer
+        * rate shift, we can't use the preemption timer at all.  It might fit
+        * on later vmentries, but checking on every vmentry is costly, so we
+        * just fall back to an hrtimer.
+        */
+       if (delta_tsc >> (cpu_preemption_timer_multi + 32))
+               return -ERANGE;
+       vmx->hv_deadline_tsc = tscl + delta_tsc;
+       vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+                       PIN_BASED_VMX_PREEMPTION_TIMER);
+       return 0;
+ }
+ static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       vmx->hv_deadline_tsc = -1;
+       vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+                       PIN_BASED_VMX_PREEMPTION_TIMER);
+ }
+ #endif
  static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
  {
        if (ple_gap)
@@@ -10691,7 -10964,7 +10949,7 @@@ static void vmx_enable_log_dirty_pt_mas
   *   this case, return 1, otherwise, return 0.
   *
   */
- static int vmx_pre_block(struct kvm_vcpu *vcpu)
+ static int pi_pre_block(struct kvm_vcpu *vcpu)
  {
        unsigned long flags;
        unsigned int dest;
        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
  
        if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
 -              !irq_remapping_cap(IRQ_POSTING_CAP))
 +              !irq_remapping_cap(IRQ_POSTING_CAP)  ||
 +              !kvm_vcpu_apicv_active(vcpu))
                return 0;
  
        vcpu->pre_pcpu = vcpu->cpu;
        return 0;
  }
  
- static void vmx_post_block(struct kvm_vcpu *vcpu)
+ static int vmx_pre_block(struct kvm_vcpu *vcpu)
+ {
+       if (pi_pre_block(vcpu))
+               return 1;
+       if (kvm_lapic_hv_timer_in_use(vcpu))
+               kvm_lapic_switch_to_sw_timer(vcpu);
+       return 0;
+ }
+ static void pi_post_block(struct kvm_vcpu *vcpu)
  {
        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
        struct pi_desc old, new;
        unsigned long flags;
  
        if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
 -              !irq_remapping_cap(IRQ_POSTING_CAP))
 +              !irq_remapping_cap(IRQ_POSTING_CAP)  ||
 +              !kvm_vcpu_apicv_active(vcpu))
                return;
  
        do {
        }
  }
  
+ static void vmx_post_block(struct kvm_vcpu *vcpu)
+ {
+       if (kvm_x86_ops->set_hv_timer)
+               kvm_lapic_switch_to_hv_timer(vcpu);
+       pi_post_block(vcpu);
+ }
  /*
   * vmx_update_pi_irte - set IRTE for Posted-Interrupts
   *
@@@ -10820,8 -11110,7 +11097,8 @@@ static int vmx_update_pi_irte(struct kv
        int idx, ret = -EINVAL;
  
        if (!kvm_arch_has_assigned_device(kvm) ||
 -              !irq_remapping_cap(IRQ_POSTING_CAP))
 +              !irq_remapping_cap(IRQ_POSTING_CAP) ||
 +              !kvm_vcpu_apicv_active(kvm->vcpus[0]))
                return 0;
  
        idx = srcu_read_lock(&kvm->irq_srcu);
                 * We will support full lowest-priority interrupt later.
                 */
  
-               kvm_set_msi_irq(e, &irq);
+               kvm_set_msi_irq(kvm, e, &irq);
                if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
                        /*
                         * Make sure the IRTE is in remapped mode if
        return ret;
  }
  
+ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+ {
+       if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+               to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+                       FEATURE_CONTROL_LMCE;
+       else
+               to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+                       ~FEATURE_CONTROL_LMCE;
+ }
  static struct kvm_x86_ops vmx_x86_ops = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
        .pmu_ops = &intel_pmu_ops,
  
        .update_pi_irte = vmx_update_pi_irte,
+ #ifdef CONFIG_X86_64
+       .set_hv_timer = vmx_set_hv_timer,
+       .cancel_hv_timer = vmx_cancel_hv_timer,
+ #endif
+       .setup_mce = vmx_setup_mce,
  };
  
  static int __init vmx_init(void)
diff --combined arch/x86/kvm/x86.c
@@@ -36,8 -36,7 +36,8 @@@
  #include <linux/kvm.h>
  #include <linux/fs.h>
  #include <linux/vmalloc.h>
 -#include <linux/module.h>
 +#include <linux/export.h>
 +#include <linux/moduleparam.h>
  #include <linux/mman.h>
  #include <linux/highmem.h>
  #include <linux/iommu.h>
@@@ -56,6 -55,9 +56,6 @@@
  #include <linux/irqbypass.h>
  #include <trace/events/kvm.h>
  
 -#define CREATE_TRACE_POINTS
 -#include "trace.h"
 -
  #include <asm/debugreg.h>
  #include <asm/msr.h>
  #include <asm/desc.h>
  #include <asm/div64.h>
  #include <asm/irq_remapping.h>
  
 +#define CREATE_TRACE_POINTS
 +#include "trace.h"
 +
  #define MAX_IO_MSRS 256
  #define KVM_MAX_MCE_BANKS 32
- #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
+ u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
+ EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
  
  #define emul_to_vcpu(ctxt) \
        container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
@@@ -90,8 -90,12 +91,12 @@@ static u64 __read_mostly efer_reserved_
  #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
  #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
  
+ #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
+                                     KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
  static void update_cr8_intercept(struct kvm_vcpu *vcpu);
  static void process_nmi(struct kvm_vcpu *vcpu);
+ static void enter_smm(struct kvm_vcpu *vcpu);
  static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
  
  struct kvm_x86_ops *kvm_x86_ops __read_mostly;
@@@ -114,7 -118,8 +119,8 @@@ u8   __read_mostly kvm_tsc_scaling_rati
  EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
  u64  __read_mostly kvm_max_tsc_scaling_ratio;
  EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
- static u64 __read_mostly kvm_default_tsc_scaling_ratio;
+ u64 __read_mostly kvm_default_tsc_scaling_ratio;
+ EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
  
  /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
  static u32 __read_mostly tsc_tolerance_ppm = 250;
@@@ -538,7 -543,7 +544,7 @@@ int load_pdptrs(struct kvm_vcpu *vcpu, 
                goto out;
        }
        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
-               if (is_present_gpte(pdpte[i]) &&
+               if ((pdpte[i] & PT_PRESENT_MASK) &&
                    (pdpte[i] &
                     vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
                        ret = 0;
@@@ -983,6 -988,7 +989,7 @@@ static u32 emulated_msrs[] = 
        MSR_IA32_MISC_ENABLE,
        MSR_IA32_MCG_STATUS,
        MSR_IA32_MCG_CTL,
+       MSR_IA32_MCG_EXT_CTL,
        MSR_IA32_SMBASE,
  };
  
@@@ -1162,7 -1168,7 +1169,7 @@@ static void kvm_write_wall_clock(struc
        int version;
        int r;
        struct pvclock_wall_clock wc;
-       struct timespec boot;
+       struct timespec64 boot;
  
        if (!wall_clock)
                return;
         * wall clock specified here.  guest system time equals host
         * system time for us, thus we must fill in host boot time here.
         */
-       getboottime(&boot);
+       getboottime64(&boot);
  
        if (kvm->arch.kvmclock_offset) {
-               struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
-               boot = timespec_sub(boot, ts);
+               struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
+               boot = timespec64_sub(boot, ts);
        }
-       wc.sec = boot.tv_sec;
+       wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
        wc.nsec = boot.tv_nsec;
        wc.version = version;
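
The switch to timespec64 above keeps the host-side arithmetic 2038-safe, but
the pvclock wall-clock structure still carries the boot time as a 32-bit
second count, which is what the new "overflow in 2106 guest time" note refers
to: an unsigned 32-bit seconds-since-1970 counter wraps early in 2106. A
trivial arithmetic check, with no kernel interfaces involved.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t wrap  = (uint64_t)UINT32_MAX + 1;        /* 2^32 seconds */
        uint64_t years = wrap / (365ULL * 24 * 60 * 60);  /* ignores leap days */

        printf("u32 seconds wrap ~%llu years after 1970, around year %llu\n",
               (unsigned long long)years, (unsigned long long)(1970 + years));
        return 0;
}
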
  
@@@ -1245,6 -1251,12 +1252,6 @@@ static atomic_t kvm_guest_has_master_cl
  static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
  static unsigned long max_tsc_khz;
  
 -static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 -{
 -      return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
 -                                 vcpu->arch.virtual_tsc_shift);
 -}
 -
  static u32 adjust_tsc_khz(u32 khz, s32 ppm)
  {
        u64 v = (u64)khz * (1000000 + ppm);
@@@ -2616,6 -2628,9 +2623,9 @@@ int kvm_vm_ioctl_check_extension(struc
        case KVM_CAP_TSC_CONTROL:
                r = kvm_has_tsc_control;
                break;
+       case KVM_CAP_X2APIC_API:
+               r = KVM_X2APIC_API_VALID_FLAGS;
+               break;
        default:
                r = 0;
                break;
@@@ -2678,11 -2693,9 +2688,9 @@@ long kvm_arch_dev_ioctl(struct file *fi
                break;
        }
        case KVM_X86_GET_MCE_CAP_SUPPORTED: {
-               u64 mce_cap;
-               mce_cap = KVM_MCE_CAP_SUPPORTED;
                r = -EFAULT;
-               if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+               if (copy_to_user(argp, &kvm_mce_cap_supported,
+                                sizeof(kvm_mce_cap_supported)))
                        goto out;
                r = 0;
                break;
@@@ -2734,6 -2747,11 +2742,11 @@@ void kvm_arch_vcpu_load(struct kvm_vcp
                                rdtsc() - vcpu->arch.last_host_tsc;
                if (tsc_delta < 0)
                        mark_tsc_unstable("KVM discovered backwards TSC");
+               if (kvm_lapic_hv_timer_in_use(vcpu) &&
+                               kvm_x86_ops->set_hv_timer(vcpu,
+                                       kvm_get_lapic_tscdeadline_msr(vcpu)))
+                       kvm_lapic_switch_to_sw_timer(vcpu);
                if (check_tsc_unstable()) {
                        u64 offset = kvm_compute_tsc_offset(vcpu,
                                                vcpu->arch.last_guest_tsc);
@@@ -2767,15 -2785,17 +2780,17 @@@ static int kvm_vcpu_ioctl_get_lapic(str
        if (vcpu->arch.apicv_active)
                kvm_x86_ops->sync_pir_to_irr(vcpu);
  
-       memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
-       return 0;
+       return kvm_apic_get_state(vcpu, s);
  }
  
  static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
                                    struct kvm_lapic_state *s)
  {
-       kvm_apic_post_state_restore(vcpu, s);
+       int r;
+       r = kvm_apic_set_state(vcpu, s);
+       if (r)
+               return r;
        update_cr8_intercept(vcpu);
  
        return 0;
@@@ -2860,7 -2880,7 +2875,7 @@@ static int kvm_vcpu_ioctl_x86_setup_mce
        r = -EINVAL;
        if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
                goto out;
-       if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+       if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
                goto out;
        r = 0;
        vcpu->arch.mcg_cap = mcg_cap;
        /* Init IA32_MCi_CTL to all 1s */
        for (bank = 0; bank < bank_num; bank++)
                vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+       if (kvm_x86_ops->setup_mce)
+               kvm_x86_ops->setup_mce(vcpu);
  out:
        return r;
  }
@@@ -3768,7 -3791,7 +3786,7 @@@ static int kvm_vm_ioctl_enable_cap(stru
                r = -EEXIST;
                if (irqchip_in_kernel(kvm))
                        goto split_irqchip_unlock;
-               if (atomic_read(&kvm->online_vcpus))
+               if (kvm->created_vcpus)
                        goto split_irqchip_unlock;
                r = kvm_setup_empty_irq_routing(kvm);
                if (r)
@@@ -3782,6 -3805,18 +3800,18 @@@ split_irqchip_unlock
                mutex_unlock(&kvm->lock);
                break;
        }
+       case KVM_CAP_X2APIC_API:
+               r = -EINVAL;
+               if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
+                       break;
+               if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
+                       kvm->arch.x2apic_format = true;
+               if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+                       kvm->arch.x2apic_broadcast_quirk_disabled = true;
+               r = 0;
+               break;
        default:
                r = -EINVAL;
                break;
@@@ -3833,7 -3868,7 +3863,7 @@@ long kvm_arch_vm_ioctl(struct file *fil
                if (kvm->arch.vpic)
                        goto create_irqchip_unlock;
                r = -EINVAL;
-               if (atomic_read(&kvm->online_vcpus))
+               if (kvm->created_vcpus)
                        goto create_irqchip_unlock;
                r = -ENOMEM;
                vpic = kvm_create_pic(kvm);
                                   sizeof(struct kvm_pit_config)))
                        goto out;
        create_pit:
-               mutex_lock(&kvm->slots_lock);
+               mutex_lock(&kvm->lock);
                r = -EEXIST;
                if (kvm->arch.vpit)
                        goto create_pit_unlock;
                if (kvm->arch.vpit)
                        r = 0;
        create_pit_unlock:
-               mutex_unlock(&kvm->slots_lock);
+               mutex_unlock(&kvm->lock);
                break;
        case KVM_GET_IRQCHIP: {
                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
        case KVM_SET_BOOT_CPU_ID:
                r = 0;
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus) != 0)
+               if (kvm->created_vcpus)
                        r = -EBUSY;
                else
                        kvm->arch.bsp_vcpu_id = arg;
@@@ -5297,13 -5332,8 +5327,8 @@@ static void kvm_smm_changed(struct kvm_
                /* This is a good place to trace that we are exiting SMM.  */
                trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
  
-               if (unlikely(vcpu->arch.smi_pending)) {
-                       kvm_make_request(KVM_REQ_SMI, vcpu);
-                       vcpu->arch.smi_pending = 0;
-               } else {
-                       /* Process a latched INIT, if any.  */
-                       kvm_make_request(KVM_REQ_EVENT, vcpu);
-               }
+               /* Process a latched INIT or SMI, if any.  */
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
        }
  
        kvm_mmu_reset_context(vcpu);
@@@ -5553,10 -5583,9 +5578,10 @@@ int kvm_fast_pio_out(struct kvm_vcpu *v
  }
  EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
  
 -static void tsc_bad(void *info)
 +static int kvmclock_cpu_down_prep(unsigned int cpu)
  {
        __this_cpu_write(cpu_tsc_khz, 0);
 +      return 0;
  }
  
  static void tsc_khz_changed(void *data)
@@@ -5661,18 -5690,35 +5686,18 @@@ static struct notifier_block kvmclock_c
        .notifier_call  = kvmclock_cpufreq_notifier
  };
  
 -static int kvmclock_cpu_notifier(struct notifier_block *nfb,
 -                                      unsigned long action, void *hcpu)
 +static int kvmclock_cpu_online(unsigned int cpu)
  {
 -      unsigned int cpu = (unsigned long)hcpu;
 -
 -      switch (action) {
 -              case CPU_ONLINE:
 -              case CPU_DOWN_FAILED:
 -                      smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
 -                      break;
 -              case CPU_DOWN_PREPARE:
 -                      smp_call_function_single(cpu, tsc_bad, NULL, 1);
 -                      break;
 -      }
 -      return NOTIFY_OK;
 +      tsc_khz_changed(NULL);
 +      return 0;
  }
  
 -static struct notifier_block kvmclock_cpu_notifier_block = {
 -      .notifier_call  = kvmclock_cpu_notifier,
 -      .priority = -INT_MAX
 -};
 -
  static void kvm_timer_init(void)
  {
        int cpu;
  
        max_tsc_khz = tsc_khz;
  
 -      cpu_notifier_register_begin();
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
  #ifdef CONFIG_CPU_FREQ
                struct cpufreq_policy policy;
                                          CPUFREQ_TRANSITION_NOTIFIER);
        }
        pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
 -      for_each_online_cpu(cpu)
 -              smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
 -
 -      __register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
 -      cpu_notifier_register_done();
  
 +      cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "AP_X86_KVM_CLK_ONLINE",
 +                        kvmclock_cpu_online, kvmclock_cpu_down_prep);
  }
  
  static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
@@@ -5849,8 -5898,8 +5874,8 @@@ int kvm_arch_init(void *opaque
        kvm_x86_ops = ops;
  
        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
-                       PT_DIRTY_MASK, PT64_NX_MASK, 0);
+                       PT_DIRTY_MASK, PT64_NX_MASK, 0,
+                       PT_PRESENT_MASK);
        kvm_timer_init();
  
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@@ -5878,7 -5927,7 +5903,7 @@@ void kvm_arch_exit(void
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
                                            CPUFREQ_TRANSITION_NOTIFIER);
 -      unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
 +      cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
  #ifdef CONFIG_X86_64
        pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
  #endif
@@@ -6084,7 -6133,10 +6109,10 @@@ static int inject_pending_event(struct 
        }
  
        /* try to inject new event if pending */
-       if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
+       if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+               vcpu->arch.smi_pending = false;
+               enter_smm(vcpu);
+       } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
                --vcpu->arch.nmi_pending;
                vcpu->arch.nmi_injected = true;
                kvm_x86_ops->set_nmi(vcpu);
                        kvm_x86_ops->set_irq(vcpu);
                }
        }
        return 0;
  }
  
@@@ -6130,7 -6183,7 +6159,7 @@@ static void process_nmi(struct kvm_vcp
  #define put_smstate(type, buf, offset, val)                     \
        *(type *)((buf) + (offset) - 0x7e00) = val
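
put_smstate() above stores one field of the SMM state-save map into a 512-byte
scratch buffer: the architectural offsets (0x7exx/0x7fxx, relative to
SMBASE + 0x8000) are rebased by subtracting 0x7e00, and the buffer is later
copied to guest memory at smbase + 0xfe00. A self-contained illustration of
that offset arithmetic; the 0x7ef8 slot (smbase) is the one used by the 32-bit
save code below, and the value written is arbitrary.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same shape as the kernel macro: rebase the architectural offset onto a
 * 512-byte buffer covering 0x7e00-0x7fff of the state-save map. */
#define put_smstate(type, buf, offset, val) \
        (*(type *)((buf) + (offset) - 0x7e00) = (val))

int main(void)
{
        unsigned char buf[512] = { 0 };
        uint32_t smbase;

        put_smstate(uint32_t, buf, 0x7ef8, 0x30000u);   /* smbase slot */

        memcpy(&smbase, buf + 0x7ef8 - 0x7e00, sizeof(smbase));
        printf("smbase %#x stored at buffer offset 0xf8\n", smbase);
        return 0;
}
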
  
- static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
+ static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
  {
        u32 flags = 0;
        flags |= seg->g       << 23;
        return flags;
  }
  
- static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
+ static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
  {
        struct kvm_segment seg;
        int offset;
  
        put_smstate(u32, buf, offset + 8, seg.base);
        put_smstate(u32, buf, offset + 4, seg.limit);
-       put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
+       put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
  }
  
  #ifdef CONFIG_X86_64
- static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
+ static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
  {
        struct kvm_segment seg;
        int offset;
        kvm_get_segment(vcpu, &seg, n);
        offset = 0x7e00 + n * 16;
  
-       flags = process_smi_get_segment_flags(&seg) >> 8;
+       flags = enter_smm_get_segment_flags(&seg) >> 8;
        put_smstate(u16, buf, offset, seg.selector);
        put_smstate(u16, buf, offset + 2, flags);
        put_smstate(u32, buf, offset + 4, seg.limit);
  }
  #endif
  
- static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
+ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
  {
        struct desc_ptr dt;
        struct kvm_segment seg;
        put_smstate(u32, buf, 0x7fc4, seg.selector);
        put_smstate(u32, buf, 0x7f64, seg.base);
        put_smstate(u32, buf, 0x7f60, seg.limit);
-       put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg));
+       put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
  
        kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
        put_smstate(u32, buf, 0x7fc0, seg.selector);
        put_smstate(u32, buf, 0x7f80, seg.base);
        put_smstate(u32, buf, 0x7f7c, seg.limit);
-       put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg));
+       put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
  
        kvm_x86_ops->get_gdt(vcpu, &dt);
        put_smstate(u32, buf, 0x7f74, dt.address);
        put_smstate(u32, buf, 0x7f54, dt.size);
  
        for (i = 0; i < 6; i++)
-               process_smi_save_seg_32(vcpu, buf, i);
+               enter_smm_save_seg_32(vcpu, buf, i);
  
        put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
  
        put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
  }
  
- static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
+ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
  {
  #ifdef CONFIG_X86_64
        struct desc_ptr dt;
  
        kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
        put_smstate(u16, buf, 0x7e90, seg.selector);
-       put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8);
+       put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
        put_smstate(u32, buf, 0x7e94, seg.limit);
        put_smstate(u64, buf, 0x7e98, seg.base);
  
  
        kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
        put_smstate(u16, buf, 0x7e70, seg.selector);
-       put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8);
+       put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
        put_smstate(u32, buf, 0x7e74, seg.limit);
        put_smstate(u64, buf, 0x7e78, seg.base);
  
        put_smstate(u64, buf, 0x7e68, dt.address);
  
        for (i = 0; i < 6; i++)
-               process_smi_save_seg_64(vcpu, buf, i);
+               enter_smm_save_seg_64(vcpu, buf, i);
  #else
        WARN_ON_ONCE(1);
  #endif
  }
  
- static void process_smi(struct kvm_vcpu *vcpu)
+ static void enter_smm(struct kvm_vcpu *vcpu)
  {
        struct kvm_segment cs, ds;
        struct desc_ptr dt;
        char buf[512];
        u32 cr0;
  
-       if (is_smm(vcpu)) {
-               vcpu->arch.smi_pending = true;
-               return;
-       }
        trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
        vcpu->arch.hflags |= HF_SMM_MASK;
        memset(buf, 0, 512);
        if (guest_cpuid_has_longmode(vcpu))
-               process_smi_save_state_64(vcpu, buf);
+               enter_smm_save_state_64(vcpu, buf);
        else
-               process_smi_save_state_32(vcpu, buf);
+               enter_smm_save_state_32(vcpu, buf);
  
        kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
  
        kvm_mmu_reset_context(vcpu);
  }
  
+ static void process_smi(struct kvm_vcpu *vcpu)
+ {
+       vcpu->arch.smi_pending = true;
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
  void kvm_make_scan_ioapic_request(struct kvm *kvm)
  {
        kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
@@@ -6555,8 -6609,18 +6585,18 @@@ static int vcpu_enter_guest(struct kvm_
  
                if (inject_pending_event(vcpu, req_int_win) != 0)
                        req_immediate_exit = true;
-               /* enable NMI/IRQ window open exits if needed */
                else {
+                       /* Enable NMI/IRQ window open exits if needed.
+                        *
+                        * SMIs have two cases: 1) they can be nested, and
+                        * then there is nothing to do here because RSM will
+                        * cause a vmexit anyway; 2) or the SMI can be pending
+                        * because inject_pending_event has completed the
+                        * injection of an IRQ or NMI from the previous vmexit,
+                        * and then we request an immediate exit to inject the SMI.
+                        */
+                       if (vcpu->arch.smi_pending && !is_smm(vcpu))
+                               req_immediate_exit = true;
                        if (vcpu->arch.nmi_pending)
                                kvm_x86_ops->enable_nmi_window(vcpu);
                        if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
  
        kvm_load_guest_xcr0(vcpu);
  
-       if (req_immediate_exit)
+       if (req_immediate_exit) {
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
                smp_send_reschedule(vcpu->cpu);
+       }
  
        trace_kvm_entry(vcpu->vcpu_id);
        wait_lapic_expire(vcpu);
-       __kvm_guest_enter();
+       guest_enter_irqoff();
  
        if (unlikely(vcpu->arch.switch_db_regs)) {
                set_debugreg(0, 7);
  
        ++vcpu->stat.exits;
  
-       /*
-        * We must have an instruction between local_irq_enable() and
-        * kvm_guest_exit(), so the timer interrupt isn't delayed by
-        * the interrupt shadow.  The stat.exits increment will do nicely.
-        * But we need to prevent reordering, hence this barrier():
-        */
-       barrier();
-       kvm_guest_exit();
+       guest_exit_irqoff();
  
+       local_irq_enable();
        preempt_enable();
  
        vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
@@@ -7409,6 -7468,7 +7444,7 @@@ void kvm_vcpu_reset(struct kvm_vcpu *vc
  {
        vcpu->arch.hflags = 0;
  
+       vcpu->arch.smi_pending = 0;
        atomic_set(&vcpu->arch.nmi_queued, 0);
        vcpu->arch.nmi_pending = 0;
        vcpu->arch.nmi_injected = false;
@@@ -7601,11 -7661,6 +7637,6 @@@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *v
        return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
  }
  
- bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
- {
-       return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
- }
  struct static_key kvm_no_apic_vcpu __read_mostly;
  EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
  
@@@ -7872,7 -7927,7 +7903,7 @@@ void kvm_arch_destroy_vm(struct kvm *kv
        kfree(kvm->arch.vpic);
        kfree(kvm->arch.vioapic);
        kvm_free_vcpus(kvm);
-       kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+       kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
        kvm_mmu_uninit_vm(kvm);
  }
  
@@@ -8380,7 -8435,7 +8411,7 @@@ void kvm_arch_irq_bypass_del_producer(s
        /*
         * When the producer of a consumer is unregistered, we change back to
         * remapped mode, so we can re-use the current implementation
-        * when the irq is masked/disabed or the consumer side (KVM
+        * when the irq is masked/disabled or the consumer side (KVM
         * in this case) doesn't want to receive the interrupts.
        */
        ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
@@@ -31,19 -31,6 +31,19 @@@ static inline void user_exit(void
                context_tracking_exit(CONTEXT_USER);
  }
  
 +/* Called with interrupts disabled.  */
 +static inline void user_enter_irqoff(void)
 +{
 +      if (context_tracking_is_enabled())
 +              __context_tracking_enter(CONTEXT_USER);
 +}
 +static inline void user_exit_irqoff(void)
 +{
 +      if (context_tracking_is_enabled())
 +              __context_tracking_exit(CONTEXT_USER);
 +}
 +
  static inline enum ctx_state exception_enter(void)
  {
        enum ctx_state prev_ctx;
@@@ -82,8 -69,6 +82,8 @@@ static inline enum ctx_state ct_state(v
  #else
  static inline void user_enter(void) { }
  static inline void user_exit(void) { }
 +static inline void user_enter_irqoff(void) { }
 +static inline void user_exit_irqoff(void) { }
  static inline enum ctx_state exception_enter(void) { return 0; }
  static inline void exception_exit(enum ctx_state prev_ctx) { }
  static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
@@@ -99,7 -84,8 +99,8 @@@ static inline void context_tracking_ini
  
  
  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- static inline void guest_enter(void)
+ /* must be called with irqs disabled */
+ static inline void guest_enter_irqoff(void)
  {
        if (vtime_accounting_cpu_enabled())
                vtime_guest_enter(current);
  
        if (context_tracking_is_enabled())
                __context_tracking_enter(CONTEXT_GUEST);
+       /* KVM does not hold any references to RCU-protected data when it
+        * switches the CPU into guest mode.  In fact, switching to guest mode
+        * is very similar to exiting to userspace from RCU's point of view.
+        * In addition, the CPU may stay in guest mode for quite a long time
+        * (up to one time slice).  Let's treat guest mode as a quiescent
+        * state, just like we do with user-mode execution.
+        */
+       if (!context_tracking_cpu_is_enabled())
+               rcu_virt_note_context_switch(smp_processor_id());
  }
  
- static inline void guest_exit(void)
+ static inline void guest_exit_irqoff(void)
  {
        if (context_tracking_is_enabled())
                __context_tracking_exit(CONTEXT_GUEST);
  }
  
  #else
- static inline void guest_enter(void)
+ static inline void guest_enter_irqoff(void)
  {
        /*
         * This is running in ioctl context so its safe
         */
        vtime_account_system(current);
        current->flags |= PF_VCPU;
+       rcu_virt_note_context_switch(smp_processor_id());
  }
  
- static inline void guest_exit(void)
+ static inline void guest_exit_irqoff(void)
  {
        /* Flush the guest cputime we spent on the guest */
        vtime_account_system(current);
  }
  #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
  
+ static inline void guest_enter(void)
+ {
+       unsigned long flags;
+       local_irq_save(flags);
+       guest_enter_irqoff();
+       local_irq_restore(flags);
+ }
+ static inline void guest_exit(void)
+ {
+       unsigned long flags;
+       local_irq_save(flags);
+       guest_exit_irqoff();
+       local_irq_restore(flags);
+ }
  #endif
  #define GICR_WAKER_ProcessorSleep     (1U << 1)
  #define GICR_WAKER_ChildrenAsleep     (1U << 2)
  
- #define GICR_PROPBASER_NonShareable   (0U << 10)
- #define GICR_PROPBASER_InnerShareable (1U << 10)
- #define GICR_PROPBASER_OuterShareable (2U << 10)
- #define GICR_PROPBASER_SHAREABILITY_MASK (3UL << 10)
- #define GICR_PROPBASER_nCnB           (0U << 7)
- #define GICR_PROPBASER_nC             (1U << 7)
- #define GICR_PROPBASER_RaWt           (2U << 7)
- #define GICR_PROPBASER_RaWb           (3U << 7)
- #define GICR_PROPBASER_WaWt           (4U << 7)
- #define GICR_PROPBASER_WaWb           (5U << 7)
- #define GICR_PROPBASER_RaWaWt         (6U << 7)
- #define GICR_PROPBASER_RaWaWb         (7U << 7)
- #define GICR_PROPBASER_CACHEABILITY_MASK (7U << 7)
- #define GICR_PROPBASER_IDBITS_MASK    (0x1f)
- #define GICR_PENDBASER_NonShareable   (0U << 10)
- #define GICR_PENDBASER_InnerShareable (1U << 10)
- #define GICR_PENDBASER_OuterShareable (2U << 10)
- #define GICR_PENDBASER_SHAREABILITY_MASK (3UL << 10)
- #define GICR_PENDBASER_nCnB           (0U << 7)
- #define GICR_PENDBASER_nC             (1U << 7)
- #define GICR_PENDBASER_RaWt           (2U << 7)
- #define GICR_PENDBASER_RaWb           (3U << 7)
- #define GICR_PENDBASER_WaWt           (4U << 7)
- #define GICR_PENDBASER_WaWb           (5U << 7)
- #define GICR_PENDBASER_RaWaWt         (6U << 7)
- #define GICR_PENDBASER_RaWaWb         (7U << 7)
- #define GICR_PENDBASER_CACHEABILITY_MASK (7U << 7)
+ #define GIC_BASER_CACHE_nCnB          0ULL
+ #define GIC_BASER_CACHE_SameAsInner   0ULL
+ #define GIC_BASER_CACHE_nC            1ULL
+ #define GIC_BASER_CACHE_RaWt          2ULL
+ #define GIC_BASER_CACHE_RaWb          3ULL
+ #define GIC_BASER_CACHE_WaWt          4ULL
+ #define GIC_BASER_CACHE_WaWb          5ULL
+ #define GIC_BASER_CACHE_RaWaWt                6ULL
+ #define GIC_BASER_CACHE_RaWaWb                7ULL
+ #define GIC_BASER_CACHE_MASK          7ULL
+ #define GIC_BASER_NonShareable                0ULL
+ #define GIC_BASER_InnerShareable      1ULL
+ #define GIC_BASER_OuterShareable      2ULL
+ #define GIC_BASER_SHAREABILITY_MASK   3ULL
+ #define GIC_BASER_CACHEABILITY(reg, inner_outer, type)                        \
+       (GIC_BASER_CACHE_##type << reg##_##inner_outer##_CACHEABILITY_SHIFT)
+ #define GIC_BASER_SHAREABILITY(reg, type)                             \
+       (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT)
+ #define GICR_PROPBASER_SHAREABILITY_SHIFT             (10)
+ #define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT               (7)
+ #define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT               (56)
+ #define GICR_PROPBASER_SHAREABILITY_MASK                              \
+       GIC_BASER_SHAREABILITY(GICR_PROPBASER, SHAREABILITY_MASK)
+ #define GICR_PROPBASER_INNER_CACHEABILITY_MASK                                \
+       GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, MASK)
+ #define GICR_PROPBASER_OUTER_CACHEABILITY_MASK                                \
+       GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, MASK)
+ #define GICR_PROPBASER_CACHEABILITY_MASK GICR_PROPBASER_INNER_CACHEABILITY_MASK
+ #define GICR_PROPBASER_InnerShareable                                 \
+       GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable)
+ #define GICR_PROPBASER_nCnB   GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nCnB)
+ #define GICR_PROPBASER_nC     GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC)
+ #define GICR_PROPBASER_RaWt   GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt)
+ #define GICR_PROPBASER_RaWb   GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb)
+ #define GICR_PROPBASER_WaWt   GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWt)
+ #define GICR_PROPBASER_WaWb   GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb)
+ #define GICR_PROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWt)
+ #define GICR_PROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb)
+ #define GICR_PROPBASER_IDBITS_MASK                    (0x1f)
+ #define GICR_PENDBASER_SHAREABILITY_SHIFT             (10)
+ #define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT               (7)
+ #define GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT               (56)
+ #define GICR_PENDBASER_SHAREABILITY_MASK                              \
+       GIC_BASER_SHAREABILITY(GICR_PENDBASER, SHAREABILITY_MASK)
+ #define GICR_PENDBASER_INNER_CACHEABILITY_MASK                                \
+       GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, MASK)
+ #define GICR_PENDBASER_OUTER_CACHEABILITY_MASK                                \
+       GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, MASK)
+ #define GICR_PENDBASER_CACHEABILITY_MASK GICR_PENDBASER_INNER_CACHEABILITY_MASK
+ #define GICR_PENDBASER_InnerShareable                                 \
+       GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)
+ #define GICR_PENDBASER_nCnB   GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nCnB)
+ #define GICR_PENDBASER_nC     GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC)
+ #define GICR_PENDBASER_RaWt   GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt)
+ #define GICR_PENDBASER_RaWb   GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb)
+ #define GICR_PENDBASER_WaWt   GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWt)
+ #define GICR_PENDBASER_WaWb   GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb)
+ #define GICR_PENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWt)
+ #define GICR_PENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWb)
+ #define GICR_PENDBASER_PTZ                            BIT_ULL(62)
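
The GIC_BASER_* helpers above compose a register value by shifting a named attribute to the field's own shift position. For illustration, a GICR_PROPBASER value built this way might look as follows (lpi_prop_pa and lpi_id_bits are placeholder parameters, not identifiers from this patch):

    /* Illustrative only: composing a GICR_PROPBASER value with the helpers
     * above.  lpi_prop_pa and lpi_id_bits are placeholders for driver-specific
     * values.
     */
    #include <linux/irqchip/arm-gic-v3.h>
    #include <linux/types.h>

    static u64 example_propbaser(phys_addr_t lpi_prop_pa, unsigned int lpi_id_bits)
    {
            return lpi_prop_pa |                            /* LPI property table base */
                   GICR_PROPBASER_InnerShareable |          /* expands to 1ULL << 10 */
                   GICR_PROPBASER_RaWaWb |                  /* expands to 7ULL << 7  */
                   ((lpi_id_bits - 1) & GICR_PROPBASER_IDBITS_MASK);
    }
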
  
  /*
   * Re-Distributor registers, offsets from SGI_base
  #define GITS_CWRITER                  0x0088
  #define GITS_CREADR                   0x0090
  #define GITS_BASER                    0x0100
+ #define GITS_IDREGS_BASE              0xffd0
+ #define GITS_PIDR0                    0xffe0
+ #define GITS_PIDR1                    0xffe4
  #define GITS_PIDR2                    GICR_PIDR2
+ #define GITS_PIDR4                    0xffd0
+ #define GITS_CIDR0                    0xfff0
+ #define GITS_CIDR1                    0xfff4
+ #define GITS_CIDR2                    0xfff8
+ #define GITS_CIDR3                    0xfffc
  
  #define GITS_TRANSLATER                       0x10040
  
  #define GITS_CTLR_ENABLE              (1U << 0)
  #define GITS_CTLR_QUIESCENT           (1U << 31)
  
+ #define GITS_TYPER_PLPIS              (1UL << 0)
+ #define GITS_TYPER_IDBITS_SHIFT               8
  #define GITS_TYPER_DEVBITS_SHIFT      13
  #define GITS_TYPER_DEVBITS(r)         ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1)
  #define GITS_TYPER_PTA                        (1UL << 19)
- #define GITS_CBASER_VALID             (1UL << 63)
- #define GITS_CBASER_nCnB              (0UL << 59)
- #define GITS_CBASER_nC                        (1UL << 59)
- #define GITS_CBASER_RaWt              (2UL << 59)
- #define GITS_CBASER_RaWb              (3UL << 59)
- #define GITS_CBASER_WaWt              (4UL << 59)
- #define GITS_CBASER_WaWb              (5UL << 59)
- #define GITS_CBASER_RaWaWt            (6UL << 59)
- #define GITS_CBASER_RaWaWb            (7UL << 59)
- #define GITS_CBASER_CACHEABILITY_MASK (7UL << 59)
- #define GITS_CBASER_NonShareable      (0UL << 10)
- #define GITS_CBASER_InnerShareable    (1UL << 10)
- #define GITS_CBASER_OuterShareable    (2UL << 10)
- #define GITS_CBASER_SHAREABILITY_MASK (3UL << 10)
+ #define GITS_TYPER_HWCOLLCNT_SHIFT    24
+ #define GITS_CBASER_VALID                     (1UL << 63)
+ #define GITS_CBASER_SHAREABILITY_SHIFT                (10)
+ #define GITS_CBASER_INNER_CACHEABILITY_SHIFT  (59)
+ #define GITS_CBASER_OUTER_CACHEABILITY_SHIFT  (53)
+ #define GITS_CBASER_SHAREABILITY_MASK                                 \
+       GIC_BASER_SHAREABILITY(GITS_CBASER, SHAREABILITY_MASK)
+ #define GITS_CBASER_INNER_CACHEABILITY_MASK                           \
+       GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, MASK)
+ #define GITS_CBASER_OUTER_CACHEABILITY_MASK                           \
+       GIC_BASER_CACHEABILITY(GITS_CBASER, OUTER, MASK)
+ #define GITS_CBASER_CACHEABILITY_MASK GITS_CBASER_INNER_CACHEABILITY_MASK
+ #define GITS_CBASER_InnerShareable                                    \
+       GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable)
+ #define GITS_CBASER_nCnB      GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nCnB)
+ #define GITS_CBASER_nC                GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC)
+ #define GITS_CBASER_RaWt      GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt)
+ #define GITS_CBASER_RaWb      GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWb)
+ #define GITS_CBASER_WaWt      GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWt)
+ #define GITS_CBASER_WaWb      GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb)
+ #define GITS_CBASER_RaWaWt    GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt)
+ #define GITS_CBASER_RaWaWb    GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb)
  
  #define GITS_BASER_NR_REGS            8
  
- #define GITS_BASER_VALID              (1UL << 63)
- #define GITS_BASER_INDIRECT           (1UL << 62)
- #define GITS_BASER_nCnB                       (0UL << 59)
- #define GITS_BASER_nC                 (1UL << 59)
- #define GITS_BASER_RaWt                       (2UL << 59)
- #define GITS_BASER_RaWb                       (3UL << 59)
- #define GITS_BASER_WaWt                       (4UL << 59)
- #define GITS_BASER_WaWb                       (5UL << 59)
- #define GITS_BASER_RaWaWt             (6UL << 59)
- #define GITS_BASER_RaWaWb             (7UL << 59)
- #define GITS_BASER_CACHEABILITY_MASK  (7UL << 59)
- #define GITS_BASER_TYPE_SHIFT         (56)
+ #define GITS_BASER_VALID                      (1UL << 63)
+ #define GITS_BASER_INDIRECT                   (1ULL << 62)
+ #define GITS_BASER_INNER_CACHEABILITY_SHIFT   (59)
+ #define GITS_BASER_OUTER_CACHEABILITY_SHIFT   (53)
+ #define GITS_BASER_INNER_CACHEABILITY_MASK                            \
+       GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK)
+ #define GITS_BASER_CACHEABILITY_MASK          GITS_BASER_INNER_CACHEABILITY_MASK
+ #define GITS_BASER_OUTER_CACHEABILITY_MASK                            \
+       GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK)
+ #define GITS_BASER_SHAREABILITY_MASK                                  \
+       GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK)
+ #define GITS_BASER_nCnB               GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nCnB)
+ #define GITS_BASER_nC         GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC)
+ #define GITS_BASER_RaWt               GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt)
+ #define GITS_BASER_RaWb               GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb)
+ #define GITS_BASER_WaWt               GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWt)
+ #define GITS_BASER_WaWb               GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb)
+ #define GITS_BASER_RaWaWt     GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWt)
+ #define GITS_BASER_RaWaWb     GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWb)
+ #define GITS_BASER_TYPE_SHIFT                 (56)
  #define GITS_BASER_TYPE(r)            (((r) >> GITS_BASER_TYPE_SHIFT) & 7)
- #define GITS_BASER_ENTRY_SIZE_SHIFT   (48)
+ #define GITS_BASER_ENTRY_SIZE_SHIFT           (48)
  #define GITS_BASER_ENTRY_SIZE(r)      ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0xff) + 1)
- #define GITS_BASER_NonShareable               (0UL << 10)
- #define GITS_BASER_InnerShareable     (1UL << 10)
- #define GITS_BASER_OuterShareable     (2UL << 10)
  #define GITS_BASER_SHAREABILITY_SHIFT (10)
- #define GITS_BASER_SHAREABILITY_MASK  (3UL << GITS_BASER_SHAREABILITY_SHIFT)
+ #define GITS_BASER_InnerShareable                                     \
+       GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable)
  #define GITS_BASER_PAGE_SIZE_SHIFT    (8)
  #define GITS_BASER_PAGE_SIZE_4K               (0UL << GITS_BASER_PAGE_SIZE_SHIFT)
  #define GITS_BASER_PAGE_SIZE_16K      (1UL << GITS_BASER_PAGE_SIZE_SHIFT)
  #define GITS_BASER_PAGE_SIZE_64K      (2UL << GITS_BASER_PAGE_SIZE_SHIFT)
  #define GITS_BASER_PAGE_SIZE_MASK     (3UL << GITS_BASER_PAGE_SIZE_SHIFT)
  #define GITS_BASER_PAGES_MAX          256
 +#define GITS_BASER_PAGES_SHIFT                (0)
+ #define GITS_BASER_NR_PAGES(r)                (((r) & 0xff) + 1)
  
  #define GITS_BASER_TYPE_NONE          0
  #define GITS_BASER_TYPE_DEVICE                1
  #define GITS_BASER_TYPE_RESERVED6     6
  #define GITS_BASER_TYPE_RESERVED7     7
  
 +#define GITS_LVL1_ENTRY_SIZE           (8UL)
 +
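
The field accessors above make decoding a GITS_BASER<n> register straightforward. A small sketch (the baser value is assumed to have been read from hardware; example_decode_baser is not a symbol from this patch):

    /* Sketch: decode one GITS_BASER<n> register with the accessors above. */
    #include <linux/irqchip/arm-gic-v3.h>
    #include <linux/printk.h>
    #include <linux/types.h>

    static void example_decode_baser(u64 baser)
    {
            unsigned int type       = GITS_BASER_TYPE(baser);        /* e.g. GITS_BASER_TYPE_DEVICE */
            unsigned int entry_size = GITS_BASER_ENTRY_SIZE(baser);  /* bytes per table entry */
            unsigned int nr_pages   = GITS_BASER_NR_PAGES(baser);    /* 1..256 pages */
            bool indirect           = !!(baser & GITS_BASER_INDIRECT);

            pr_info("ITS table: type %u, %u-byte entries, %u page(s), %s\n",
                    type, entry_size, nr_pages, indirect ? "indirect" : "flat");
    }
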
  /*
   * ITS commands
   */
  #define GITS_CMD_MAPD                 0x08
  #define GITS_CMD_MAPC                 0x09
- #define GITS_CMD_MAPVI                        0x0a
+ #define GITS_CMD_MAPTI                        0x0a
+ /* older GIC documentation used MAPVI for this command */
+ #define GITS_CMD_MAPVI                        GITS_CMD_MAPTI
+ #define GITS_CMD_MAPI                 0x0b
  #define GITS_CMD_MOVI                 0x01
  #define GITS_CMD_DISCARD              0x0f
  #define GITS_CMD_INV                  0x0c
  #define GITS_CMD_CLEAR                        0x04
  #define GITS_CMD_SYNC                 0x05
  
+ /*
+  * ITS error numbers
+  */
+ #define E_ITS_MOVI_UNMAPPED_INTERRUPT         0x010107
+ #define E_ITS_MOVI_UNMAPPED_COLLECTION                0x010109
+ #define E_ITS_CLEAR_UNMAPPED_INTERRUPT                0x010507
+ #define E_ITS_MAPD_DEVICE_OOR                 0x010801
+ #define E_ITS_MAPC_PROCNUM_OOR                        0x010902
+ #define E_ITS_MAPC_COLLECTION_OOR             0x010903
+ #define E_ITS_MAPTI_UNMAPPED_DEVICE           0x010a04
+ #define E_ITS_MAPTI_PHYSICALID_OOR            0x010a06
+ #define E_ITS_INV_UNMAPPED_INTERRUPT          0x010c07
+ #define E_ITS_INVALL_UNMAPPED_COLLECTION      0x010d09
+ #define E_ITS_MOVALL_PROCNUM_OOR              0x010e01
+ #define E_ITS_DISCARD_UNMAPPED_INTERRUPT      0x010f07
  /*
   * CPU interface registers
   */
  #define ICC_SGI1R_AFFINITY_1_SHIFT    16
  #define ICC_SGI1R_AFFINITY_1_MASK     (0xff << ICC_SGI1R_AFFINITY_1_SHIFT)
  #define ICC_SGI1R_SGI_ID_SHIFT                24
 -#define ICC_SGI1R_SGI_ID_MASK         (0xff << ICC_SGI1R_SGI_ID_SHIFT)
 +#define ICC_SGI1R_SGI_ID_MASK         (0xfULL << ICC_SGI1R_SGI_ID_SHIFT)
  #define ICC_SGI1R_AFFINITY_2_SHIFT    32
 -#define ICC_SGI1R_AFFINITY_2_MASK     (0xffULL << ICC_SGI1R_AFFINITY_1_SHIFT)
 +#define ICC_SGI1R_AFFINITY_2_MASK     (0xffULL << ICC_SGI1R_AFFINITY_2_SHIFT)
  #define ICC_SGI1R_IRQ_ROUTING_MODE_BIT        40
  #define ICC_SGI1R_AFFINITY_3_SHIFT    48
 -#define ICC_SGI1R_AFFINITY_3_MASK     (0xffULL << ICC_SGI1R_AFFINITY_1_SHIFT)
 +#define ICC_SGI1R_AFFINITY_3_MASK     (0xffULL << ICC_SGI1R_AFFINITY_3_SHIFT)
  
  #include <asm/arch_gicv3.h>
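
With the AFFINITY_2 and AFFINITY_3 masks now tied to their own shifts, a software-generated SGI register value can be assembled field by field. Roughly, and purely for illustration (all parameters are placeholders; the real driver derives them from MPIDR values and a CPU target list):

    /* Illustrative only: assembling an ICC_SGI1R_EL1 value from its fields. */
    #include <linux/irqchip/arm-gic-v3.h>
    #include <linux/types.h>

    static u64 example_sgi1r(u8 aff3, u8 aff2, u8 aff1, u8 sgi_id, u16 target_list)
    {
            return ((u64)aff3 << ICC_SGI1R_AFFINITY_3_SHIFT) |
                   ((u64)aff2 << ICC_SGI1R_AFFINITY_2_SHIFT) |
                   ((u64)sgi_id << ICC_SGI1R_SGI_ID_SHIFT)   |
                   ((u64)aff1 << ICC_SGI1R_AFFINITY_1_SHIFT) |
                   target_list;         /* bits [15:0]: one bit per CPU in the aff1 cluster */
    }
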
  
diff --combined mm/gup.c
+++ b/mm/gup.c
@@@ -279,8 -279,6 +279,8 @@@ struct page *follow_page_mask(struct vm
                        spin_unlock(ptl);
                        ret = 0;
                        split_huge_pmd(vma, pmd, address);
 +                      if (pmd_trans_unstable(pmd))
 +                              ret = -EBUSY;
                } else {
                        get_page(page);
                        spin_unlock(ptl);
                        ret = split_huge_page(page);
                        unlock_page(page);
                        put_page(page);
 +                      if (pmd_none(*pmd))
 +                              return no_page_table(vma, flags);
                }
  
                return ret ? ERR_PTR(ret) :
@@@ -354,6 -350,7 +354,6 @@@ unmap
  static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
                unsigned long address, unsigned int *flags, int *nonblocking)
  {
 -      struct mm_struct *mm = vma->vm_mm;
        unsigned int fault_flags = 0;
        int ret;
  
                fault_flags |= FAULT_FLAG_TRIED;
        }
  
 -      ret = handle_mm_fault(mm, vma, address, fault_flags);
 +      ret = handle_mm_fault(vma, address, fault_flags);
        if (ret & VM_FAULT_ERROR) {
                if (ret & VM_FAULT_OOM)
                        return -ENOMEM;
@@@ -693,7 -690,7 +693,7 @@@ retry
        if (!vma_permits_fault(vma, fault_flags))
                return -EFAULT;
  
 -      ret = handle_mm_fault(mm, vma, address, fault_flags);
 +      ret = handle_mm_fault(vma, address, fault_flags);
        major |= ret & VM_FAULT_MAJOR;
        if (ret & VM_FAULT_ERROR) {
                if (ret & VM_FAULT_OOM)
        }
        return 0;
  }
+ EXPORT_SYMBOL_GPL(fixup_user_fault);
  
  static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
                                                struct mm_struct *mm,
@@@ -157,6 -157,9 +157,9 @@@ static int kvm_vgic_dist_init(struct kv
        struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
        int i;
  
+       INIT_LIST_HEAD(&dist->lpi_list_head);
+       spin_lock_init(&dist->lpi_list_lock);
        dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL);
        if (!dist->spis)
                return  -ENOMEM;
                spin_lock_init(&irq->irq_lock);
                irq->vcpu = NULL;
                irq->target_vcpu = vcpu0;
+               kref_init(&irq->refcount);
                if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
                        irq->targets = 0;
                else
@@@ -211,6 -215,7 +215,7 @@@ static void kvm_vgic_vcpu_init(struct k
                irq->vcpu = NULL;
                irq->target_vcpu = vcpu;
                irq->targets = 1U << vcpu->vcpu_id;
+               kref_init(&irq->refcount);
                if (vgic_irq_is_sgi(i)) {
                        /* SGIs */
                        irq->enabled = 1;
@@@ -253,6 -258,9 +258,9 @@@ int vgic_init(struct kvm *kvm
        if (ret)
                goto out;
  
+       if (vgic_has_its(kvm))
+               dist->msis_require_devid = true;
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_vgic_vcpu_init(vcpu);
  
@@@ -271,7 -279,6 +279,6 @@@ static void kvm_vgic_dist_destroy(struc
        dist->initialized = false;
  
        kfree(dist->spis);
-       kfree(dist->redist_iodevs);
        dist->nr_spis = 0;
  
        mutex_unlock(&kvm->lock);
@@@ -353,19 -360,32 +360,19 @@@ out
  
  /* GENERIC PROBE */
  
 -static void vgic_init_maintenance_interrupt(void *info)
 +static int vgic_init_cpu_starting(unsigned int cpu)
  {
        enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0);
 +      return 0;
  }
  
 -static int vgic_cpu_notify(struct notifier_block *self,
 -                         unsigned long action, void *cpu)
 -{
 -      switch (action) {
 -      case CPU_STARTING:
 -      case CPU_STARTING_FROZEN:
 -              vgic_init_maintenance_interrupt(NULL);
 -              break;
 -      case CPU_DYING:
 -      case CPU_DYING_FROZEN:
 -              disable_percpu_irq(kvm_vgic_global_state.maint_irq);
 -              break;
 -      }
  
 -      return NOTIFY_OK;
 +static int vgic_init_cpu_dying(unsigned int cpu)
 +{
 +      disable_percpu_irq(kvm_vgic_global_state.maint_irq);
 +      return 0;
  }
  
 -static struct notifier_block vgic_cpu_nb = {
 -      .notifier_call = vgic_cpu_notify,
 -};
 -
  static irqreturn_t vgic_maintenance_handler(int irq, void *data)
  {
        /*
@@@ -421,14 -441,14 +428,14 @@@ int kvm_vgic_hyp_init(void
                return ret;
        }
  
 -      ret = __register_cpu_notifier(&vgic_cpu_nb);
 +      ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING,
 +                              "AP_KVM_ARM_VGIC_INIT_STARTING",
 +                              vgic_init_cpu_starting, vgic_init_cpu_dying);
        if (ret) {
                kvm_err("Cannot register vgic CPU notifier\n");
                goto out_free_irq;
        }
  
 -      on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1);
 -
        kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq);
        return 0;
  
diff --combined virt/kvm/kvm_main.c
@@@ -148,7 -148,6 +148,7 @@@ int vcpu_load(struct kvm_vcpu *vcpu
        put_cpu();
        return 0;
  }
 +EXPORT_SYMBOL_GPL(vcpu_load);
  
  void vcpu_put(struct kvm_vcpu *vcpu)
  {
        preempt_enable();
        mutex_unlock(&vcpu->mutex);
  }
 +EXPORT_SYMBOL_GPL(vcpu_put);
  
  static void ack_flush(void *_completed)
  {
@@@ -1444,6 -1442,52 +1444,52 @@@ static bool vma_is_valid(struct vm_area
        return true;
  }
  
+ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
+                              unsigned long addr, bool *async,
+                              bool write_fault, kvm_pfn_t *p_pfn)
+ {
+       unsigned long pfn;
+       int r;
+
+       r = follow_pfn(vma, addr, &pfn);
+       if (r) {
+               /*
+                * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
+                * not call the fault handler, so do it here.
+                */
+               bool unlocked = false;
+               r = fixup_user_fault(current, current->mm, addr,
+                                    (write_fault ? FAULT_FLAG_WRITE : 0),
+                                    &unlocked);
+               if (unlocked)
+                       return -EAGAIN;
+               if (r)
+                       return r;
+               r = follow_pfn(vma, addr, &pfn);
+               if (r)
+                       return r;
+       }
+       /*
+        * Get a reference here because callers of *hva_to_pfn* and
+        * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
+        * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
+        * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
+        * simply do nothing for reserved pfns.
+        *
+        * Whoever called remap_pfn_range is also going to call e.g.
+        * unmap_mapping_range before the underlying pages are freed,
+        * causing a call to our MMU notifier.
+        */ 
+       kvm_get_pfn(pfn);
+       *p_pfn = pfn;
+       return 0;
+ }
+
  /*
   * Pin guest page in memory and return its pfn.
   * @addr: host virtual address which maps memory to the guest
@@@ -1463,7 -1507,7 +1509,7 @@@ static kvm_pfn_t hva_to_pfn(unsigned lo
  {
        struct vm_area_struct *vma;
        kvm_pfn_t pfn = 0;
-       int npages;
+       int npages, r;
  
        /* we can do it either atomically or asynchronously, not both */
        BUG_ON(atomic && async);
                goto exit;
        }
  
+ retry:
        vma = find_vma_intersection(current->mm, addr, addr + 1);
  
        if (vma == NULL)
                pfn = KVM_PFN_ERR_FAULT;
-       else if ((vma->vm_flags & VM_PFNMAP)) {
-               pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
-                       vma->vm_pgoff;
-               BUG_ON(!kvm_is_reserved_pfn(pfn));
+       else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
+               r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn);
+               if (r == -EAGAIN)
+                       goto retry;
+               if (r < 0)
+                       pfn = KVM_PFN_ERR_FAULT;
        } else {
                if (async && vma_is_valid(vma, write_fault))
                        *async = true;
@@@ -2348,9 -2395,20 +2397,20 @@@ static int kvm_vm_ioctl_create_vcpu(str
        if (id >= KVM_MAX_VCPU_ID)
                return -EINVAL;
  
+       mutex_lock(&kvm->lock);
+       if (kvm->created_vcpus == KVM_MAX_VCPUS) {
+               mutex_unlock(&kvm->lock);
+               return -EINVAL;
+       }
+       kvm->created_vcpus++;
+       mutex_unlock(&kvm->lock);
        vcpu = kvm_arch_vcpu_create(kvm, id);
-       if (IS_ERR(vcpu))
-               return PTR_ERR(vcpu);
+       if (IS_ERR(vcpu)) {
+               r = PTR_ERR(vcpu);
+               goto vcpu_decrement;
+       }
  
        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
  
                goto vcpu_destroy;
  
        mutex_lock(&kvm->lock);
-       if (!kvm_vcpu_compatible(vcpu)) {
-               r = -EINVAL;
-               goto unlock_vcpu_destroy;
-       }
-       if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
-               r = -EINVAL;
-               goto unlock_vcpu_destroy;
-       }
        if (kvm_get_vcpu_by_id(kvm, id)) {
                r = -EEXIST;
                goto unlock_vcpu_destroy;
@@@ -2399,6 -2449,10 +2451,10 @@@ unlock_vcpu_destroy
        mutex_unlock(&kvm->lock);
  vcpu_destroy:
        kvm_arch_vcpu_destroy(vcpu);
+ vcpu_decrement:
+       mutex_lock(&kvm->lock);
+       kvm->created_vcpus--;
+       mutex_unlock(&kvm->lock);
        return r;
  }
  
@@@ -2943,7 -2997,7 +2999,7 @@@ static long kvm_vm_ioctl(struct file *f
                if (copy_from_user(&routing, argp, sizeof(routing)))
                        goto out;
                r = -EINVAL;
 -              if (routing.nr >= KVM_MAX_IRQ_ROUTES)
 +              if (routing.nr > KVM_MAX_IRQ_ROUTES)
                        goto out;
                if (routing.flags)
                        goto out;
@@@ -3050,7 -3104,6 +3106,7 @@@ static int kvm_dev_ioctl_create_vm(unsi
  {
        int r;
        struct kvm *kvm;
 +      struct file *file;
  
        kvm = kvm_create_vm(type);
        if (IS_ERR(kvm))
                return r;
        }
  #endif
 -      r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC);
 +      r = get_unused_fd_flags(O_CLOEXEC);
        if (r < 0) {
                kvm_put_kvm(kvm);
                return r;
        }
 +      file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
 +      if (IS_ERR(file)) {
 +              put_unused_fd(r);
 +              kvm_put_kvm(kvm);
 +              return PTR_ERR(file);
 +      }
  
        if (kvm_create_vm_debugfs(kvm, r) < 0) {
 -              kvm_put_kvm(kvm);
 +              put_unused_fd(r);
 +              fput(file);
                return -ENOMEM;
        }
  
 +      fd_install(r, file);
        return r;
  }
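
The KVM_CREATE_VM path above now reserves a descriptor first, creates the anon inode file, performs the remaining setup, and only then makes the fd visible with fd_install(), so the descriptor is never exposed to userspace before the debugfs setup has succeeded. The same generic pattern, sketched outside KVM (example_create_fd, my_fops and priv are placeholders):

    /* Generic sketch of the reserve-fd / create-file / fd_install pattern
     * used above; example_* names, my_fops and priv are placeholders.
     */
    #include <linux/anon_inodes.h>
    #include <linux/err.h>
    #include <linux/fcntl.h>
    #include <linux/file.h>
    #include <linux/fs.h>

    static int example_create_fd(const struct file_operations *my_fops, void *priv)
    {
            struct file *file;
            int fd;

            fd = get_unused_fd_flags(O_CLOEXEC);    /* reserve a descriptor number */
            if (fd < 0)
                    return fd;

            file = anon_inode_getfile("example", my_fops, priv, O_RDWR);
            if (IS_ERR(file)) {
                    put_unused_fd(fd);              /* release the unused number */
                    return PTR_ERR(file);
            }

            /* setup that may still fail goes here, before fd_install() */

            fd_install(fd, file);                   /* fd becomes visible to userspace */
            return fd;
    }
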
  
@@@ -3155,13 -3200,12 +3211,13 @@@ static void hardware_enable_nolock(voi
        }
  }
  
 -static void hardware_enable(void)
 +static int kvm_starting_cpu(unsigned int cpu)
  {
        raw_spin_lock(&kvm_count_lock);
        if (kvm_usage_count)
                hardware_enable_nolock(NULL);
        raw_spin_unlock(&kvm_count_lock);
 +      return 0;
  }
  
  static void hardware_disable_nolock(void *junk)
        kvm_arch_hardware_disable();
  }
  
 -static void hardware_disable(void)
 +static int kvm_dying_cpu(unsigned int cpu)
  {
        raw_spin_lock(&kvm_count_lock);
        if (kvm_usage_count)
                hardware_disable_nolock(NULL);
        raw_spin_unlock(&kvm_count_lock);
 +      return 0;
  }
  
  static void hardware_disable_all_nolock(void)
@@@ -3221,6 -3264,21 +3277,6 @@@ static int hardware_enable_all(void
        return r;
  }
  
 -static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 -                         void *v)
 -{
 -      val &= ~CPU_TASKS_FROZEN;
 -      switch (val) {
 -      case CPU_DYING:
 -              hardware_disable();
 -              break;
 -      case CPU_STARTING:
 -              hardware_enable();
 -              break;
 -      }
 -      return NOTIFY_OK;
 -}
 -
  static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
                      void *v)
  {
@@@ -3487,6 -3545,34 +3543,30 @@@ int kvm_io_bus_unregister_dev(struct kv
        return r;
  }
  
 -static struct notifier_block kvm_cpu_notifier = {
 -      .notifier_call = kvm_cpu_hotplug,
 -};
 -
+ struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+                                        gpa_t addr)
+ {
+       struct kvm_io_bus *bus;
+       int dev_idx, srcu_idx;
+       struct kvm_io_device *iodev = NULL;
+
+       srcu_idx = srcu_read_lock(&kvm->srcu);
+       bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+       dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
+       if (dev_idx < 0)
+               goto out_unlock;
+       iodev = bus->range[dev_idx].dev;
+ out_unlock:
+       srcu_read_unlock(&kvm->srcu, srcu_idx);
+       return iodev;
+ }
+ EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
+
  static int kvm_debugfs_open(struct inode *inode, struct file *file,
                           int (*get)(void *, u64 *), int (*set)(void *, u64),
                           const char *fmt)
@@@ -3737,8 -3823,7 +3817,8 @@@ int kvm_init(void *opaque, unsigned vcp
                        goto out_free_1;
        }
  
 -      r = register_cpu_notifier(&kvm_cpu_notifier);
 +      r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "AP_KVM_STARTING",
 +                                    kvm_starting_cpu, kvm_dying_cpu);
        if (r)
                goto out_free_2;
        register_reboot_notifier(&kvm_reboot_notifier);
@@@ -3792,7 -3877,7 +3872,7 @@@ out_free
        kmem_cache_destroy(kvm_vcpu_cache);
  out_free_3:
        unregister_reboot_notifier(&kvm_reboot_notifier);
 -      unregister_cpu_notifier(&kvm_cpu_notifier);
 +      cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
  out_free_2:
  out_free_1:
        kvm_arch_hardware_unsetup();
@@@ -3815,7 -3900,7 +3895,7 @@@ void kvm_exit(void
        kvm_async_pf_deinit();
        unregister_syscore_ops(&kvm_syscore_ops);
        unregister_reboot_notifier(&kvm_reboot_notifier);
 -      unregister_cpu_notifier(&kvm_cpu_notifier);
 +      cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
        on_each_cpu(hardware_disable_nolock, NULL, 1);
        kvm_arch_hardware_unsetup();
        kvm_arch_exit();