Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 2 Aug 2016 20:11:27 +0000 (16:11 -0400)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 2 Aug 2016 20:11:27 +0000 (16:11 -0400)
Pull KVM updates from Paolo Bonzini:

 - ARM: GICv3 ITS emulation and various fixes.  Removal of the
   old VGIC implementation.

 - s390: support for trapping software breakpoints, nested
   virtualization (vSIE), the STHYI opcode, initial extensions
   for CPU model support.

 - MIPS: support for MIPS64 hosts (32-bit guests only) and lots
   of cleanups, preliminary to this and the upcoming support for
   hardware virtualization extensions.

 - x86: support for execute-only mappings in nested EPT; reduced
   vmexit latency for TSC deadline timer (by about 30%) on Intel
   hosts; support for more than 255 vCPUs.

 - PPC: bugfixes.

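[Editor's note, not part of the pull request: features such as these are exposed to userspace as KVM capabilities, which a VMM probes with the standard KVM_CHECK_EXTENSION ioctl before relying on them. A minimal sketch in C, assuming a uapi <linux/kvm.h> new enough to define KVM_CAP_PPC_HTM (introduced by the first commit in the shortlog below):

    /*
     * Illustrative only. Checks whether the running kernel advertises
     * KVM_CAP_PPC_HTM via KVM_CHECK_EXTENSION on /dev/kvm.
     */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
            if (kvm < 0) {
                    perror("open /dev/kvm");
                    return 1;
            }
            /* Return value > 0 means the capability is present. */
            int htm = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_HTM);
            printf("KVM_CAP_PPC_HTM: %s\n", htm > 0 ? "available" : "unavailable");
            return 0;
    }

The same probe pattern applies to any of the capabilities added in this merge; only the KVM_CAP_* constant changes.]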
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (302 commits)
  KVM: PPC: Introduce KVM_CAP_PPC_HTM
  MIPS: Select HAVE_KVM for MIPS64_R{2,6}
  MIPS: KVM: Reset CP0_PageMask during host TLB flush
  MIPS: KVM: Fix ptr->int cast via KVM_GUEST_KSEGX()
  MIPS: KVM: Sign extend MFC0/RDHWR results
  MIPS: KVM: Fix 64-bit big endian dynamic translation
  MIPS: KVM: Fail if ebase doesn't fit in CP0_EBase
  MIPS: KVM: Use 64-bit CP0_EBase when appropriate
  MIPS: KVM: Set CP0_Status.KX on MIPS64
  MIPS: KVM: Make entry code MIPS64 friendly
  MIPS: KVM: Use kmap instead of CKSEG0ADDR()
  MIPS: KVM: Use virt_to_phys() to get commpage PFN
  MIPS: Fix definition of KSEGX() for 64-bit
  KVM: VMX: Add VMCS to CPU's loaded VMCSs before VMPTRLD
  kvm: x86: nVMX: maintain internal copy of current VMCS
  KVM: PPC: Book3S HV: Save/restore TM state in H_CEDE
  KVM: PPC: Book3S HV: Pull out TM state save/restore into separate procedures
  KVM: arm64: vgic-its: Simplify MAPI error handling
  KVM: arm64: vgic-its: Make vgic_its_cmd_handle_mapi similar to other handlers
  KVM: arm64: vgic-its: Turn device_id validation into generic ID validation
  ...

40 files changed:
arch/arm/include/asm/pgtable.h
arch/arm/kvm/arm.c
arch/arm64/include/asm/cpufeature.h
arch/arm64/include/asm/virt.h
arch/arm64/kernel/cpufeature.c
arch/arm64/kvm/hyp/switch.c
arch/powerpc/include/asm/paca.h
arch/powerpc/kernel/Makefile
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/idle_book3s.S
arch/powerpc/kernel/traps.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/platforms/powernv/opal-wrappers.S
arch/s390/include/asm/diag.h
arch/s390/include/asm/kvm_host.h
arch/s390/include/asm/mmu.h
arch/s390/include/asm/mmu_context.h
arch/s390/include/asm/page.h
arch/s390/include/asm/pgtable.h
arch/s390/include/asm/processor.h
arch/s390/kernel/diag.c
arch/s390/kvm/intercept.c
arch/s390/kvm/kvm-s390.c
arch/s390/mm/fault.c
arch/s390/mm/gmap.c
arch/s390/mm/pgalloc.c
arch/s390/mm/pgtable.c
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/iommu.c
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
include/linux/context_tracking.h
include/linux/irqchip/arm-gic-v3.h
mm/gup.c
virt/kvm/arm/vgic/vgic-init.c
virt/kvm/kvm_main.c

@@@ -97,7 -97,9 +97,9 @@@ extern pgprot_t               pgprot_s2_device
  #define PAGE_READONLY_EXEC    _MOD_PROT(pgprot_user, L_PTE_USER | L_PTE_RDONLY)
  #define PAGE_KERNEL           _MOD_PROT(pgprot_kernel, L_PTE_XN)
  #define PAGE_KERNEL_EXEC      pgprot_kernel
- #define PAGE_HYP              _MOD_PROT(pgprot_kernel, L_PTE_HYP)
+ #define PAGE_HYP              _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_XN)
+ #define PAGE_HYP_EXEC         _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY)
+ #define PAGE_HYP_RO           _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY | L_PTE_XN)
  #define PAGE_HYP_DEVICE               _MOD_PROT(pgprot_hyp_device, L_PTE_HYP)
  #define PAGE_S2                       _MOD_PROT(pgprot_s2, L_PTE_S2_RDONLY)
  #define PAGE_S2_DEVICE                _MOD_PROT(pgprot_s2_device, L_PTE_S2_RDONLY)
@@@ -182,6 -184,7 +184,6 @@@ extern pgd_t swapper_pg_dir[PTRS_PER_PG
  #define pgd_offset_k(addr)    pgd_offset(&init_mm, addr)
  
  #define pmd_none(pmd)         (!pmd_val(pmd))
 -#define pmd_present(pmd)      (pmd_val(pmd))
  
  static inline pte_t *pmd_page_vaddr(pmd_t pmd)
  {
diff --combined arch/arm/kvm/arm.c
@@@ -20,6 -20,7 +20,7 @@@
  #include <linux/errno.h>
  #include <linux/err.h>
  #include <linux/kvm_host.h>
+ #include <linux/list.h>
  #include <linux/module.h>
  #include <linux/vmalloc.h>
  #include <linux/fs.h>
@@@ -122,7 -123,7 +123,7 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
        if (ret)
                goto out_fail_alloc;
  
-       ret = create_hyp_mappings(kvm, kvm + 1);
+       ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
        if (ret)
                goto out_free_stage2_pgd;
  
@@@ -201,7 -202,7 +202,7 @@@ int kvm_vm_ioctl_check_extension(struc
                r = KVM_MAX_VCPUS;
                break;
        default:
-               r = kvm_arch_dev_ioctl_check_extension(ext);
+               r = kvm_arch_dev_ioctl_check_extension(kvm, ext);
                break;
        }
        return r;
@@@ -239,7 -240,7 +240,7 @@@ struct kvm_vcpu *kvm_arch_vcpu_create(s
        if (err)
                goto free_vcpu;
  
-       err = create_hyp_mappings(vcpu, vcpu + 1);
+       err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
        if (err)
                goto vcpu_uninit;
  
@@@ -263,7 -264,6 +264,7 @@@ void kvm_arch_vcpu_free(struct kvm_vcp
        kvm_timer_vcpu_terminate(vcpu);
        kvm_vgic_vcpu_destroy(vcpu);
        kvm_pmu_vcpu_destroy(vcpu);
 +      kvm_vcpu_uninit(vcpu);
        kmem_cache_free(kvm_vcpu_cache, vcpu);
  }
  
@@@ -377,7 -377,7 +378,7 @@@ void force_vm_exit(const cpumask_t *mas
  
  /**
   * need_new_vmid_gen - check that the VMID is still valid
-  * @kvm: The VM's VMID to checkt
+  * @kvm: The VM's VMID to check
   *
   * return true if there is a new generation of VMIDs being used
   *
@@@ -616,7 -616,7 +617,7 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
                 * Enter the guest
                 */
                trace_kvm_entry(*vcpu_pc(vcpu));
-               __kvm_guest_enter();
+               guest_enter_irqoff();
                vcpu->mode = IN_GUEST_MODE;
  
                ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
                local_irq_enable();
  
                /*
-                * We do local_irq_enable() before calling kvm_guest_exit() so
+                * We do local_irq_enable() before calling guest_exit() so
                 * that if a timer interrupt hits while running the guest we
                 * account that tick as being spent in the guest.  We enable
-                * preemption after calling kvm_guest_exit() so that if we get
+                * preemption after calling guest_exit() so that if we get
                 * preempted we make sure ticks after that is not counted as
                 * guest time.
                 */
-               kvm_guest_exit();
+               guest_exit();
                trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
  
                /*
@@@ -1039,7 -1039,6 +1040,6 @@@ long kvm_arch_vm_ioctl(struct file *fil
  
  static void cpu_init_hyp_mode(void *dummy)
  {
-       phys_addr_t boot_pgd_ptr;
        phys_addr_t pgd_ptr;
        unsigned long hyp_stack_ptr;
        unsigned long stack_page;
        /* Switch from the HYP stub to our own HYP init vector */
        __hyp_set_vectors(kvm_get_idmap_vector());
  
-       boot_pgd_ptr = kvm_mmu_get_boot_httbr();
        pgd_ptr = kvm_mmu_get_httbr();
        stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
        hyp_stack_ptr = stack_page + PAGE_SIZE;
        vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector);
  
-       __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+       __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
        __cpu_init_stage2();
  
        kvm_arm_init_debug();
@@@ -1076,15 -1074,9 +1075,9 @@@ static void cpu_hyp_reinit(void
  
  static void cpu_hyp_reset(void)
  {
-       phys_addr_t boot_pgd_ptr;
-       phys_addr_t phys_idmap_start;
-       if (!is_kernel_in_hyp_mode()) {
-               boot_pgd_ptr = kvm_mmu_get_boot_httbr();
-               phys_idmap_start = kvm_get_idmap_start();
-               __cpu_reset_hyp_mode(boot_pgd_ptr, phys_idmap_start);
-       }
+       if (!is_kernel_in_hyp_mode())
+               __cpu_reset_hyp_mode(hyp_default_vectors,
+                                    kvm_get_idmap_start());
  }
  
  static void _kvm_arch_hardware_enable(void *discard)
@@@ -1294,14 -1286,14 +1287,14 @@@ static int init_hyp_mode(void
         * Map the Hyp-code called directly from the host
         */
        err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
-                                 kvm_ksym_ref(__hyp_text_end));
+                                 kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
        if (err) {
                kvm_err("Cannot map world-switch code\n");
                goto out_err;
        }
  
        err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
-                                 kvm_ksym_ref(__end_rodata));
+                                 kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
        if (err) {
                kvm_err("Cannot map rodata section\n");
                goto out_err;
         */
        for_each_possible_cpu(cpu) {
                char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
-               err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE);
+               err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
+                                         PAGE_HYP);
  
                if (err) {
                        kvm_err("Cannot map hyp stack\n");
                kvm_cpu_context_t *cpu_ctxt;
  
                cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu);
-               err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1);
+               err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP);
  
                if (err) {
                        kvm_err("Cannot map host CPU state: %d\n", err);
                }
        }
  
- #ifndef CONFIG_HOTPLUG_CPU
-       free_boot_hyp_pgd();
- #endif
        /* set size of VMID supported by CPU */
        kvm_vmid_bits = kvm_get_vmid_bits();
        kvm_info("%d-bit VMID\n", kvm_vmid_bits);
@@@ -36,8 -36,9 +36,9 @@@
  #define ARM64_HAS_VIRT_HOST_EXTN              11
  #define ARM64_WORKAROUND_CAVIUM_27456         12
  #define ARM64_HAS_32BIT_EL0                   13
+ #define ARM64_HYP_OFFSET_LOW                  14
  
- #define ARM64_NCAPS                           14
+ #define ARM64_NCAPS                           15
  
  #ifndef __ASSEMBLY__
  
@@@ -191,9 -192,7 +192,9 @@@ void __init setup_cpu_features(void)
  
  void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps,
                            const char *info);
 +void enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps);
  void check_local_cpu_errata(void);
 +void __init enable_errata_workarounds(void);
  
  void verify_local_cpu_errata(void);
  void verify_local_cpu_capabilities(void);
   */
  #define HVC_SET_VECTORS 1
  
 +/*
 + * HVC_SOFT_RESTART - CPU soft reset, used by the cpu_soft_restart routine.
 + */
 +#define HVC_SOFT_RESTART 2
 +
  #define BOOT_CPU_MODE_EL1     (0xe11)
  #define BOOT_CPU_MODE_EL2     (0xe12)
  
@@@ -87,6 -82,10 +87,10 @@@ extern void verify_cpu_run_el(void)
  static inline void verify_cpu_run_el(void) {}
  #endif
  
+ /* The section containing the hypervisor idmap text */
+ extern char __hyp_idmap_text_start[];
+ extern char __hyp_idmap_text_end[];
  /* The section containing the hypervisor text */
  extern char __hyp_text_start[];
  extern char __hyp_text_end[];
@@@ -726,6 -726,19 +726,19 @@@ static bool runs_at_el2(const struct ar
        return is_kernel_in_hyp_mode();
  }
  
+ static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry,
+                          int __unused)
+ {
+       phys_addr_t idmap_addr = virt_to_phys(__hyp_idmap_text_start);
+       /*
+        * Activate the lower HYP offset only if:
+        * - the idmap doesn't clash with it,
+        * - the kernel is not running at EL2.
+        */
+       return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode();
+ }
  static const struct arm64_cpu_capabilities arm64_features[] = {
        {
                .desc = "GIC system register CPU interface",
                .field_pos = ID_AA64PFR0_EL0_SHIFT,
                .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT,
        },
+       {
+               .desc = "Reduced HYP mapping offset",
+               .capability = ARM64_HYP_OFFSET_LOW,
+               .def_scope = SCOPE_SYSTEM,
+               .matches = hyp_offset_low,
+       },
        {},
  };
  
@@@ -913,7 -932,8 +932,7 @@@ void update_cpu_capabilities(const stru
   * Run through the enabled capabilities and enable() it on all active
   * CPUs
   */
 -static void __init
 -enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps)
 +void __init enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps)
  {
        for (; caps->matches; caps++)
                if (caps->enable && cpus_have_cap(caps->capability))
@@@ -1035,7 -1055,6 +1054,7 @@@ void __init setup_cpu_features(void
  
        /* Set the CPU feature capabilies */
        setup_feature_capabilities();
 +      enable_errata_workarounds();
        setup_elf_hwcaps(arm64_elf_hwcaps);
  
        if (system_supports_32bit_el0())
@@@ -198,7 -198,7 +198,7 @@@ static bool __hyp_text __translate_far_
  static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
  {
        u64 esr = read_sysreg_el2(esr);
 -      u8 ec = esr >> ESR_ELx_EC_SHIFT;
 +      u8 ec = ESR_ELx_EC(esr);
        u64 hpfar, far;
  
        vcpu->arch.fault.esr_el2 = esr;
@@@ -299,9 -299,16 +299,16 @@@ static const char __hyp_panic_string[] 
  
  static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par)
  {
-       unsigned long str_va = (unsigned long)__hyp_panic_string;
+       unsigned long str_va;
  
-       __hyp_do_panic(hyp_kern_va(str_va),
+       /*
+        * Force the panic string to be loaded from the literal pool,
+        * making sure it is a kernel address and not a PC-relative
+        * reference.
+        */
+       asm volatile("ldr %0, =__hyp_panic_string" : "=r" (str_va));
+       __hyp_do_panic(str_va,
                       spsr,  elr,
                       read_sysreg(esr_el2),   read_sysreg_el2(far),
                       read_sysreg(hpfar_el2), par,
@@@ -25,7 -25,7 +25,8 @@@
  #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
  #include <asm/kvm_book3s_asm.h>
  #endif
 +#include <asm/accounting.h>
+ #include <asm/hmi.h>
  
  register struct paca_struct *local_paca asm("r13");
  
@@@ -182,10 -182,21 +183,15 @@@ struct paca_struct 
         */
        u16 in_mce;
        u8 hmi_event_available;          /* HMI event is available */
+       /*
+        * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for
+        * more details
+        */
+       struct sibling_subcore_state *sibling_subcore_state;
  #endif
  
        /* Stuff for accurate time accounting */
 -      u64 user_time;                  /* accumulated usermode TB ticks */
 -      u64 system_time;                /* accumulated system TB ticks */
 -      u64 user_time_scaled;           /* accumulated usermode SPURR ticks */
 -      u64 starttime;                  /* TB value snapshot */
 -      u64 starttime_user;             /* TB value on exit to usermode */
 -      u64 startspurr;                 /* SPURR value snapshot */
 -      u64 utime_sspurr;               /* ->user_time when ->startspurr set */
 +      struct cpu_accounting_data accounting;
        u64 stolen_time;                /* TB ticks taken by hypervisor */
        u64 dtl_ridx;                   /* read index in dispatch log */
        struct dtl_entry *dtl_curr;     /* pointer corresponding to dtl_ridx */
@@@ -41,12 -41,13 +41,12 @@@ obj-$(CONFIG_VDSO32)               += vdso32
  obj-$(CONFIG_HAVE_HW_BREAKPOINT)      += hw_breakpoint.o
  obj-$(CONFIG_PPC_BOOK3S_64)   += cpu_setup_ppc970.o cpu_setup_pa6t.o
  obj-$(CONFIG_PPC_BOOK3S_64)   += cpu_setup_power.o
- obj-$(CONFIG_PPC_BOOK3S_64)   += mce.o mce_power.o
+ obj-$(CONFIG_PPC_BOOK3S_64)   += mce.o mce_power.o hmi.o
 -obj64-$(CONFIG_RELOCATABLE)   += reloc_64.o
  obj-$(CONFIG_PPC_BOOK3E_64)   += exceptions-64e.o idle_book3e.o
  obj-$(CONFIG_PPC64)           += vdso64/
  obj-$(CONFIG_ALTIVEC)         += vecemu.o
  obj-$(CONFIG_PPC_970_NAP)     += idle_power4.o
 -obj-$(CONFIG_PPC_P7_NAP)      += idle_power7.o
 +obj-$(CONFIG_PPC_P7_NAP)      += idle_book3s.o
  procfs-y                      := proc_powerpc.o
  obj-$(CONFIG_PROC_FS)         += $(procfs-y)
  rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI) := rtas_pci.o
@@@ -86,7 -87,7 +86,7 @@@ extra-$(CONFIG_FSL_BOOKE)     := head_fsl_b
  extra-$(CONFIG_8xx)           := head_8xx.o
  extra-y                               += vmlinux.lds
  
 -obj-$(CONFIG_RELOCATABLE_PPC32)       += reloc_32.o
 +obj-$(CONFIG_RELOCATABLE)     += reloc_$(CONFIG_WORD_SIZE).o
  
  obj-$(CONFIG_PPC32)           += entry_32.o setup_32.o
  obj-$(CONFIG_PPC64)           += dma-iommu.o iommu.o
@@@ -107,9 -107,25 +107,9 @@@ BEGIN_FTR_SECTIO
        beq     9f
  
        cmpwi   cr3,r13,2
 -
 -      /*
 -       * Check if last bit of HSPGR0 is set. This indicates whether we are
 -       * waking up from winkle.
 -       */
        GET_PACA(r13)
 -      clrldi  r5,r13,63
 -      clrrdi  r13,r13,1
 -      cmpwi   cr4,r5,1
 -      mtspr   SPRN_HSPRG0,r13
 -
 -      lbz     r0,PACA_THREAD_IDLE_STATE(r13)
 -      cmpwi   cr2,r0,PNV_THREAD_NAP
 -      bgt     cr2,8f                          /* Either sleep or Winkle */
 -
 -      /* Waking up from nap should not cause hypervisor state loss */
 -      bgt     cr3,.
 +      bl      pnv_restore_hyp_resource
  
 -      /* Waking up from nap */
        li      r0,PNV_THREAD_RUNNING
        stb     r0,PACA_THREAD_IDLE_STATE(r13)  /* Clear thread state */
  
  
        /* Return SRR1 from power7_nap() */
        mfspr   r3,SPRN_SRR1
 -      beq     cr3,2f
 -      b       power7_wakeup_noloss
 -2:    b       power7_wakeup_loss
 -
 -      /* Fast Sleep wakeup on PowerNV */
 -8:    GET_PACA(r13)
 -      b       power7_wakeup_tb_loss
 +      blt     cr3,2f
 +      b       pnv_wakeup_loss
 +2:    b       pnv_wakeup_noloss
  
  9:
  END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
@@@ -331,12 -351,6 +331,12 @@@ hv_doorbell_trampoline
        EXCEPTION_PROLOG_0(PACA_EXGEN)
        b       h_doorbell_hv
  
 +      . = 0xea0
 +hv_virt_irq_trampoline:
 +      SET_SCRATCH0(r13)
 +      EXCEPTION_PROLOG_0(PACA_EXGEN)
 +      b       h_virt_irq_hv
 +
        /* We need to deal with the Altivec unavailable exception
         * here which is at 0xf20, thus in the middle of the
         * prolog code of the PerformanceMonitor one. A little
@@@ -587,9 -601,6 +587,9 @@@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR
        MASKABLE_EXCEPTION_HV_OOL(0xe82, h_doorbell)
        KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe82)
  
 +      MASKABLE_EXCEPTION_HV_OOL(0xea2, h_virt_irq)
 +      KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xea2)
 +
        /* moved from 0xf00 */
        STD_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
        KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf00)
@@@ -669,8 -680,8 +669,10 @@@ _GLOBAL(__replay_interrupt
  BEGIN_FTR_SECTION
        cmpwi   r3,0xe80
        beq     h_doorbell_common
 +      cmpwi   r3,0xea0
 +      beq     h_virt_irq_common
+       cmpwi   r3,0xe60
+       beq     hmi_exception_common
  FTR_SECTION_ELSE
        cmpwi   r3,0xa00
        beq     doorbell_super_common
@@@ -745,7 -756,6 +747,7 @@@ kvmppc_skip_Hinterrupt
  #else
        STD_EXCEPTION_COMMON_ASYNC(0xe80, h_doorbell, unknown_exception)
  #endif
 +      STD_EXCEPTION_COMMON_ASYNC(0xea0, h_virt_irq, do_IRQ)
        STD_EXCEPTION_COMMON_ASYNC(0xf00, performance_monitor, performance_monitor_exception)
        STD_EXCEPTION_COMMON(0x1300, instruction_breakpoint, instruction_breakpoint_exception)
        STD_EXCEPTION_COMMON(0x1502, denorm, unknown_exception)
  #else
        STD_EXCEPTION_COMMON(0x1700, altivec_assist, unknown_exception)
  #endif
 -#ifdef CONFIG_CBE_RAS
 -      STD_EXCEPTION_COMMON(0x1200, cbe_system_error, cbe_system_error_exception)
 -      STD_EXCEPTION_COMMON(0x1600, cbe_maintenance, cbe_maintenance_exception)
 -      STD_EXCEPTION_COMMON(0x1800, cbe_thermal, cbe_thermal_exception)
 -#endif /* CONFIG_CBE_RAS */
  
        /*
         * Relocation-on interrupts: A subset of the interrupts can be delivered
@@@ -864,12 -879,6 +866,12 @@@ h_doorbell_relon_trampoline
        EXCEPTION_PROLOG_0(PACA_EXGEN)
        b       h_doorbell_relon_hv
  
 +      . = 0x4ea0
 +h_virt_irq_relon_trampoline:
 +      SET_SCRATCH0(r13)
 +      EXCEPTION_PROLOG_0(PACA_EXGEN)
 +      b       h_virt_irq_relon_hv
 +
        . = 0x4f00
  performance_monitor_relon_pseries_trampoline:
        SET_SCRATCH0(r13)
@@@ -1124,10 -1133,12 +1126,10 @@@ END_FTR_SECTION_IFSET(CPU_FTR_VSX
        bl      vsx_unavailable_exception
        b       ret_from_except
  
 -      STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception)
 -      STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception)
 -
        /* Equivalents to the above handlers for relocation-on interrupt vectors */
        STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
        MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
 +      MASKABLE_RELON_EXCEPTION_HV_OOL(0xea0, h_virt_irq)
  
        STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
        STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
@@@ -1161,18 -1172,9 +1163,18 @@@ fwnmi_data_area
        . = 0x8000
  #endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */
  
 +      STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception)
 +      STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception)
 +
 +#ifdef CONFIG_CBE_RAS
 +      STD_EXCEPTION_COMMON(0x1200, cbe_system_error, cbe_system_error_exception)
 +      STD_EXCEPTION_COMMON(0x1600, cbe_maintenance, cbe_maintenance_exception)
 +      STD_EXCEPTION_COMMON(0x1800, cbe_thermal, cbe_thermal_exception)
 +#endif /* CONFIG_CBE_RAS */
 +
        .globl hmi_exception_early
  hmi_exception_early:
-       EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60)
+       EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, 0xe62)
        mr      r10,r1                  /* Save r1                      */
        ld      r1,PACAEMERGSP(r13)     /* Use emergency stack          */
        subi    r1,r1,INT_FRAME_SIZE    /* alloc stack frame            */
@@@ -1289,7 -1291,7 +1291,7 @@@ machine_check_handle_early
        GET_PACA(r13)
        ld      r1,PACAR1(r13)
        li      r3,PNV_THREAD_NAP
 -      b       power7_enter_nap_mode
 +      b       pnv_enter_arch207_idle_mode
  4:
  #endif
        /*
@@@ -1399,12 -1401,11 +1401,12 @@@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_RADIX
        lwz     r9,PACA_EXSLB+EX_CCR(r13)       /* get saved CR */
  
        mtlr    r10
 -BEGIN_MMU_FTR_SECTION
 -      b       2f
 -END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX)
        andi.   r10,r12,MSR_RI  /* check for unrecoverable exception */
 +BEGIN_MMU_FTR_SECTION
        beq-    2f
 +FTR_SECTION_ELSE
 +      b       2f
 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX)
  
  .machine      push
  .machine      "power4"
index 335eb6c,0000000..8a56a51
mode 100644,000000..100644
--- /dev/null
@@@ -1,662 -1,0 +1,664 @@@
-       bl      opal_rm_handle_hmi;                                     \
 +/*
 + *  This file contains idle entry/exit functions for POWER7,
 + *  POWER8 and POWER9 CPUs.
 + *
 + *  This program is free software; you can redistribute it and/or
 + *  modify it under the terms of the GNU General Public License
 + *  as published by the Free Software Foundation; either version
 + *  2 of the License, or (at your option) any later version.
 + */
 +
 +#include <linux/threads.h>
 +#include <asm/processor.h>
 +#include <asm/page.h>
 +#include <asm/cputable.h>
 +#include <asm/thread_info.h>
 +#include <asm/ppc_asm.h>
 +#include <asm/asm-offsets.h>
 +#include <asm/ppc-opcode.h>
 +#include <asm/hw_irq.h>
 +#include <asm/kvm_book3s_asm.h>
 +#include <asm/opal.h>
 +#include <asm/cpuidle.h>
 +#include <asm/book3s/64/mmu-hash.h>
 +#include <asm/mmu.h>
 +
 +#undef DEBUG
 +
 +/*
 + * Use unused space in the interrupt stack to save and restore
 + * registers for winkle support.
 + */
 +#define _SDR1 GPR3
 +#define _RPR  GPR4
 +#define _SPURR        GPR5
 +#define _PURR GPR6
 +#define _TSCR GPR7
 +#define _DSCR GPR8
 +#define _AMOR GPR9
 +#define _WORT GPR10
 +#define _WORC GPR11
 +#define _PTCR GPR12
 +
 +#define PSSCR_HV_TEMPLATE     PSSCR_ESL | PSSCR_EC | \
 +                              PSSCR_PSLL_MASK | PSSCR_TR_MASK | \
 +                              PSSCR_MTL_MASK
 +
 +/* Idle state entry routines */
 +
 +#define       IDLE_STATE_ENTER_SEQ(IDLE_INST)                         \
 +      /* Magic NAP/SLEEP/WINKLE mode enter sequence */        \
 +      std     r0,0(r1);                                       \
 +      ptesync;                                                \
 +      ld      r0,0(r1);                                       \
 +1:    cmp     cr0,r0,r0;                                      \
 +      bne     1b;                                             \
 +      IDLE_INST;                                              \
 +      b       .
 +
 +      .text
 +
 +/*
 + * Used by threads before entering deep idle states. Saves SPRs
 + * in interrupt stack frame
 + */
 +save_sprs_to_stack:
 +      /*
 +       * Note all register i.e per-core, per-subcore or per-thread is saved
 +       * here since any thread in the core might wake up first
 +       */
 +BEGIN_FTR_SECTION
 +      mfspr   r3,SPRN_PTCR
 +      std     r3,_PTCR(r1)
 +      /*
 +       * Note - SDR1 is dropped in Power ISA v3. Hence not restoring
 +       * SDR1 here
 +       */
 +FTR_SECTION_ELSE
 +      mfspr   r3,SPRN_SDR1
 +      std     r3,_SDR1(r1)
 +ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
 +      mfspr   r3,SPRN_RPR
 +      std     r3,_RPR(r1)
 +      mfspr   r3,SPRN_SPURR
 +      std     r3,_SPURR(r1)
 +      mfspr   r3,SPRN_PURR
 +      std     r3,_PURR(r1)
 +      mfspr   r3,SPRN_TSCR
 +      std     r3,_TSCR(r1)
 +      mfspr   r3,SPRN_DSCR
 +      std     r3,_DSCR(r1)
 +      mfspr   r3,SPRN_AMOR
 +      std     r3,_AMOR(r1)
 +      mfspr   r3,SPRN_WORT
 +      std     r3,_WORT(r1)
 +      mfspr   r3,SPRN_WORC
 +      std     r3,_WORC(r1)
 +
 +      blr
 +
 +/*
 + * Used by threads when the lock bit of core_idle_state is set.
 + * Threads will spin in HMT_LOW until the lock bit is cleared.
 + * r14 - pointer to core_idle_state
 + * r15 - used to load contents of core_idle_state
 + */
 +
 +core_idle_lock_held:
 +      HMT_LOW
 +3:    lwz     r15,0(r14)
 +      andi.   r15,r15,PNV_CORE_IDLE_LOCK_BIT
 +      bne     3b
 +      HMT_MEDIUM
 +      lwarx   r15,0,r14
 +      blr
 +
 +/*
 + * Pass requested state in r3:
 + *    r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8
 + *       - Requested STOP state in POWER9
 + *
 + * To check IRQ_HAPPENED in r4
 + *    0 - don't check
 + *    1 - check
 + *
 + * Address to 'rfid' to in r5
 + */
 +_GLOBAL(pnv_powersave_common)
 +      /* Use r3 to pass state nap/sleep/winkle */
 +      /* NAP is a state loss, we create a regs frame on the
 +       * stack, fill it up with the state we care about and
 +       * stick a pointer to it in PACAR1. We really only
 +       * need to save PC, some CR bits and the NV GPRs,
 +       * but for now an interrupt frame will do.
 +       */
 +      mflr    r0
 +      std     r0,16(r1)
 +      stdu    r1,-INT_FRAME_SIZE(r1)
 +      std     r0,_LINK(r1)
 +      std     r0,_NIP(r1)
 +
 +      /* Hard disable interrupts */
 +      mfmsr   r9
 +      rldicl  r9,r9,48,1
 +      rotldi  r9,r9,16
 +      mtmsrd  r9,1                    /* hard-disable interrupts */
 +
 +      /* Check if something happened while soft-disabled */
 +      lbz     r0,PACAIRQHAPPENED(r13)
 +      andi.   r0,r0,~PACA_IRQ_HARD_DIS@l
 +      beq     1f
 +      cmpwi   cr0,r4,0
 +      beq     1f
 +      addi    r1,r1,INT_FRAME_SIZE
 +      ld      r0,16(r1)
 +      li      r3,0                    /* Return 0 (no nap) */
 +      mtlr    r0
 +      blr
 +
 +1:    /* We mark irqs hard disabled as this is the state we'll
 +       * be in when returning and we need to tell arch_local_irq_restore()
 +       * about it
 +       */
 +      li      r0,PACA_IRQ_HARD_DIS
 +      stb     r0,PACAIRQHAPPENED(r13)
 +
 +      /* We haven't lost state ... yet */
 +      li      r0,0
 +      stb     r0,PACA_NAPSTATELOST(r13)
 +
 +      /* Continue saving state */
 +      SAVE_GPR(2, r1)
 +      SAVE_NVGPRS(r1)
 +      mfcr    r4
 +      std     r4,_CCR(r1)
 +      std     r9,_MSR(r1)
 +      std     r1,PACAR1(r13)
 +
 +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 +      /* Tell KVM we're entering idle */
 +      li      r4,KVM_HWTHREAD_IN_IDLE
 +      stb     r4,HSTATE_HWTHREAD_STATE(r13)
 +#endif
 +
 +      /*
 +       * Go to real mode to do the nap, as required by the architecture.
 +       * Also, we need to be in real mode before setting hwthread_state,
 +       * because as soon as we do that, another thread can switch
 +       * the MMU context to the guest.
 +       */
 +      LOAD_REG_IMMEDIATE(r7, MSR_IDLE)
 +      li      r6, MSR_RI
 +      andc    r6, r9, r6
 +      mtmsrd  r6, 1           /* clear RI before setting SRR0/1 */
 +      mtspr   SPRN_SRR0, r5
 +      mtspr   SPRN_SRR1, r7
 +      rfid
 +
 +      .globl pnv_enter_arch207_idle_mode
 +pnv_enter_arch207_idle_mode:
 +      stb     r3,PACA_THREAD_IDLE_STATE(r13)
 +      cmpwi   cr3,r3,PNV_THREAD_SLEEP
 +      bge     cr3,2f
 +      IDLE_STATE_ENTER_SEQ(PPC_NAP)
 +      /* No return */
 +2:
 +      /* Sleep or winkle */
 +      lbz     r7,PACA_THREAD_MASK(r13)
 +      ld      r14,PACA_CORE_IDLE_STATE_PTR(r13)
 +lwarx_loop1:
 +      lwarx   r15,0,r14
 +
 +      andi.   r9,r15,PNV_CORE_IDLE_LOCK_BIT
 +      bnel    core_idle_lock_held
 +
 +      andc    r15,r15,r7                      /* Clear thread bit */
 +
 +      andi.   r15,r15,PNV_CORE_IDLE_THREAD_BITS
 +
 +/*
 + * If cr0 = 0, then current thread is the last thread of the core entering
 + * sleep. Last thread needs to execute the hardware bug workaround code if
 + * required by the platform.
 + * Make the workaround call unconditionally here. The below branch call is
 + * patched out when the idle states are discovered if the platform does not
 + * require it.
 + */
 +.global pnv_fastsleep_workaround_at_entry
 +pnv_fastsleep_workaround_at_entry:
 +      beq     fastsleep_workaround_at_entry
 +
 +      stwcx.  r15,0,r14
 +      bne-    lwarx_loop1
 +      isync
 +
 +common_enter: /* common code for all the threads entering sleep or winkle */
 +      bgt     cr3,enter_winkle
 +      IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
 +
 +fastsleep_workaround_at_entry:
 +      ori     r15,r15,PNV_CORE_IDLE_LOCK_BIT
 +      stwcx.  r15,0,r14
 +      bne-    lwarx_loop1
 +      isync
 +
 +      /* Fast sleep workaround */
 +      li      r3,1
 +      li      r4,1
 +      bl      opal_rm_config_cpu_idle_state
 +
 +      /* Clear Lock bit */
 +      li      r0,0
 +      lwsync
 +      stw     r0,0(r14)
 +      b       common_enter
 +
 +enter_winkle:
 +      bl      save_sprs_to_stack
 +
 +      IDLE_STATE_ENTER_SEQ(PPC_WINKLE)
 +
 +/*
 + * r3 - requested stop state
 + */
 +power_enter_stop:
 +/*
 + * Check if the requested state is a deep idle state.
 + */
 +      LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
 +      ld      r4,ADDROFF(pnv_first_deep_stop_state)(r5)
 +      cmpd    r3,r4
 +      bge     2f
 +      IDLE_STATE_ENTER_SEQ(PPC_STOP)
 +2:
 +/*
 + * Entering deep idle state.
 + * Clear thread bit in PACA_CORE_IDLE_STATE, save SPRs to
 + * stack and enter stop
 + */
 +      lbz     r7,PACA_THREAD_MASK(r13)
 +      ld      r14,PACA_CORE_IDLE_STATE_PTR(r13)
 +
 +lwarx_loop_stop:
 +      lwarx   r15,0,r14
 +      andi.   r9,r15,PNV_CORE_IDLE_LOCK_BIT
 +      bnel    core_idle_lock_held
 +      andc    r15,r15,r7                      /* Clear thread bit */
 +
 +      stwcx.  r15,0,r14
 +      bne-    lwarx_loop_stop
 +      isync
 +
 +      bl      save_sprs_to_stack
 +
 +      IDLE_STATE_ENTER_SEQ(PPC_STOP)
 +
 +_GLOBAL(power7_idle)
 +      /* Now check if user or arch enabled NAP mode */
 +      LOAD_REG_ADDRBASE(r3,powersave_nap)
 +      lwz     r4,ADDROFF(powersave_nap)(r3)
 +      cmpwi   0,r4,0
 +      beqlr
 +      li      r3, 1
 +      /* fall through */
 +
 +_GLOBAL(power7_nap)
 +      mr      r4,r3
 +      li      r3,PNV_THREAD_NAP
 +      LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode)
 +      b       pnv_powersave_common
 +      /* No return */
 +
 +_GLOBAL(power7_sleep)
 +      li      r3,PNV_THREAD_SLEEP
 +      li      r4,1
 +      LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode)
 +      b       pnv_powersave_common
 +      /* No return */
 +
 +_GLOBAL(power7_winkle)
 +      li      r3,PNV_THREAD_WINKLE
 +      li      r4,1
 +      LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode)
 +      b       pnv_powersave_common
 +      /* No return */
 +
 +#define CHECK_HMI_INTERRUPT                                           \
 +      mfspr   r0,SPRN_SRR1;                                           \
 +BEGIN_FTR_SECTION_NESTED(66);                                         \
 +      rlwinm  r0,r0,45-31,0xf;  /* extract wake reason field (P8) */  \
 +FTR_SECTION_ELSE_NESTED(66);                                          \
 +      rlwinm  r0,r0,45-31,0xe;  /* P7 wake reason field is 3 bits */  \
 +ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);              \
 +      cmpwi   r0,0xa;                 /* Hypervisor maintenance ? */  \
 +      bne     20f;                                                    \
 +      /* Invoke opal call to handle hmi */                            \
 +      ld      r2,PACATOC(r13);                                        \
 +      ld      r1,PACAR1(r13);                                         \
 +      std     r3,ORIG_GPR3(r1);       /* Save original r3 */          \
++      li      r3,0;                   /* NULL argument */             \
++      bl      hmi_exception_realmode;                                 \
++      nop;                                                            \
 +      ld      r3,ORIG_GPR3(r1);       /* Restore original r3 */       \
 +20:   nop;
 +
 +
 +/*
 + * r3 - requested stop state
 + */
 +_GLOBAL(power9_idle_stop)
 +      LOAD_REG_IMMEDIATE(r4, PSSCR_HV_TEMPLATE)
 +      or      r4,r4,r3
 +      mtspr   SPRN_PSSCR, r4
 +      li      r4, 1
 +      LOAD_REG_ADDR(r5,power_enter_stop)
 +      b       pnv_powersave_common
 +      /* No return */
 +/*
 + * Called from reset vector. Check whether we have woken up with
 + * hypervisor state loss. If yes, restore hypervisor state and return
 + * back to reset vector.
 + *
 + * r13 - Contents of HSPRG0
 + * cr3 - set to gt if waking up with partial/complete hypervisor state loss
 + */
 +_GLOBAL(pnv_restore_hyp_resource)
 +      ld      r2,PACATOC(r13);
 +BEGIN_FTR_SECTION
 +      /*
 +       * POWER ISA 3. Use PSSCR to determine if we
 +       * are waking up from deep idle state
 +       */
 +      LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
 +      ld      r4,ADDROFF(pnv_first_deep_stop_state)(r5)
 +
 +      mfspr   r5,SPRN_PSSCR
 +      /*
 +       * 0-3 bits correspond to Power-Saving Level Status
 +       * which indicates the idle state we are waking up from
 +       */
 +      rldicl  r5,r5,4,60
 +      cmpd    cr4,r5,r4
 +      bge     cr4,pnv_wakeup_tb_loss
 +      /*
 +       * Waking up without hypervisor state loss. Return to
 +       * reset vector
 +       */
 +      blr
 +
 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 +
 +      /*
 +       * POWER ISA 2.07 or less.
 +       * Check if last bit of HSPGR0 is set. This indicates whether we are
 +       * waking up from winkle.
 +       */
 +      clrldi  r5,r13,63
 +      clrrdi  r13,r13,1
 +      cmpwi   cr4,r5,1
 +      mtspr   SPRN_HSPRG0,r13
 +
 +      lbz     r0,PACA_THREAD_IDLE_STATE(r13)
 +      cmpwi   cr2,r0,PNV_THREAD_NAP
 +      bgt     cr2,pnv_wakeup_tb_loss  /* Either sleep or Winkle */
 +
 +      /*
 +       * We fall through here if PACA_THREAD_IDLE_STATE shows we are waking
 +       * up from nap. At this stage CR3 shouldn't contains 'gt' since that
 +       * indicates we are waking with hypervisor state loss from nap.
 +       */
 +      bgt     cr3,.
 +
 +      blr     /* Return back to System Reset vector from where
 +                 pnv_restore_hyp_resource was invoked */
 +
 +/*
 + * Called if waking up from idle state which can cause either partial or
 + * complete hyp state loss.
 + * In POWER8, called if waking up from fastsleep or winkle
 + * In POWER9, called if waking up from stop state >= pnv_first_deep_stop_state
 + *
 + * r13 - PACA
 + * cr3 - gt if waking up with partial/complete hypervisor state loss
 + * cr4 - eq if waking up from complete hypervisor state loss.
 + */
 +_GLOBAL(pnv_wakeup_tb_loss)
 +      ld      r1,PACAR1(r13)
 +      /*
 +       * Before entering any idle state, the NVGPRs are saved in the stack
 +       * and they are restored before switching to the process context. Hence
 +       * until they are restored, they are free to be used.
 +       *
 +       * Save SRR1 and LR in NVGPRs as they might be clobbered in
 +       * opal_call() (called in CHECK_HMI_INTERRUPT). SRR1 is required
 +       * to determine the wakeup reason if we branch to kvm_start_guest. LR
 +       * is required to return back to reset vector after hypervisor state
 +       * restore is complete.
 +       */
 +      mflr    r17
 +      mfspr   r16,SPRN_SRR1
 +BEGIN_FTR_SECTION
 +      CHECK_HMI_INTERRUPT
 +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 +
 +      lbz     r7,PACA_THREAD_MASK(r13)
 +      ld      r14,PACA_CORE_IDLE_STATE_PTR(r13)
 +lwarx_loop2:
 +      lwarx   r15,0,r14
 +      andi.   r9,r15,PNV_CORE_IDLE_LOCK_BIT
 +      /*
 +       * Lock bit is set in one of the 2 cases-
 +       * a. In the sleep/winkle enter path, the last thread is executing
 +       * fastsleep workaround code.
 +       * b. In the wake up path, another thread is executing fastsleep
 +       * workaround undo code or resyncing timebase or restoring context
 +       * In either case loop until the lock bit is cleared.
 +       */
 +      bnel    core_idle_lock_held
 +
 +      cmpwi   cr2,r15,0
 +
 +      /*
 +       * At this stage
 +       * cr2 - eq if first thread to wakeup in core
 +       * cr3-  gt if waking up with partial/complete hypervisor state loss
 +       * cr4 - eq if waking up from complete hypervisor state loss.
 +       */
 +
 +      ori     r15,r15,PNV_CORE_IDLE_LOCK_BIT
 +      stwcx.  r15,0,r14
 +      bne-    lwarx_loop2
 +      isync
 +
 +BEGIN_FTR_SECTION
 +      lbz     r4,PACA_SUBCORE_SIBLING_MASK(r13)
 +      and     r4,r4,r15
 +      cmpwi   r4,0    /* Check if first in subcore */
 +
 +      or      r15,r15,r7              /* Set thread bit */
 +      beq     first_thread_in_subcore
 +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 +
 +      or      r15,r15,r7              /* Set thread bit */
 +      beq     cr2,first_thread_in_core
 +
 +      /* Not first thread in core or subcore to wake up */
 +      b       clear_lock
 +
 +first_thread_in_subcore:
 +      /*
 +       * If waking up from sleep, subcore state is not lost. Hence
 +       * skip subcore state restore
 +       */
 +      bne     cr4,subcore_state_restored
 +
 +      /* Restore per-subcore state */
 +      ld      r4,_SDR1(r1)
 +      mtspr   SPRN_SDR1,r4
 +
 +      ld      r4,_RPR(r1)
 +      mtspr   SPRN_RPR,r4
 +      ld      r4,_AMOR(r1)
 +      mtspr   SPRN_AMOR,r4
 +
 +subcore_state_restored:
 +      /*
 +       * Check if the thread is also the first thread in the core. If not,
 +       * skip to clear_lock.
 +       */
 +      bne     cr2,clear_lock
 +
 +first_thread_in_core:
 +
 +      /*
 +       * First thread in the core waking up from any state which can cause
 +       * partial or complete hypervisor state loss. It needs to
 +       * call the fastsleep workaround code if the platform requires it.
 +       * Call it unconditionally here. The below branch instruction will
 +       * be patched out if the platform does not have fastsleep or does not
 +       * require the workaround. Patching will be performed during the
 +       * discovery of idle-states.
 +       */
 +.global pnv_fastsleep_workaround_at_exit
 +pnv_fastsleep_workaround_at_exit:
 +      b       fastsleep_workaround_at_exit
 +
 +timebase_resync:
 +      /*
 +       * Use cr3 which indicates that we are waking up with atleast partial
 +       * hypervisor state loss to determine if TIMEBASE RESYNC is needed.
 +       */
 +      ble     cr3,clear_lock
 +      /* Time base re-sync */
 +      bl      opal_rm_resync_timebase;
 +      /*
 +       * If waking up from sleep, per core state is not lost, skip to
 +       * clear_lock.
 +       */
 +      bne     cr4,clear_lock
 +
 +      /*
 +       * First thread in the core to wake up and its waking up with
 +       * complete hypervisor state loss. Restore per core hypervisor
 +       * state.
 +       */
 +BEGIN_FTR_SECTION
 +      ld      r4,_PTCR(r1)
 +      mtspr   SPRN_PTCR,r4
 +      ld      r4,_RPR(r1)
 +      mtspr   SPRN_RPR,r4
 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 +
 +      ld      r4,_TSCR(r1)
 +      mtspr   SPRN_TSCR,r4
 +      ld      r4,_WORC(r1)
 +      mtspr   SPRN_WORC,r4
 +
 +clear_lock:
 +      andi.   r15,r15,PNV_CORE_IDLE_THREAD_BITS
 +      lwsync
 +      stw     r15,0(r14)
 +
 +common_exit:
 +      /*
 +       * Common to all threads.
 +       *
 +       * If waking up from sleep, hypervisor state is not lost. Hence
 +       * skip hypervisor state restore.
 +       */
 +      bne     cr4,hypervisor_state_restored
 +
 +      /* Waking up from winkle */
 +
 +BEGIN_MMU_FTR_SECTION
 +      b       no_segments
 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX)
 +      /* Restore SLB  from PACA */
 +      ld      r8,PACA_SLBSHADOWPTR(r13)
 +
 +      .rept   SLB_NUM_BOLTED
 +      li      r3, SLBSHADOW_SAVEAREA
 +      LDX_BE  r5, r8, r3
 +      addi    r3, r3, 8
 +      LDX_BE  r6, r8, r3
 +      andis.  r7,r5,SLB_ESID_V@h
 +      beq     1f
 +      slbmte  r6,r5
 +1:    addi    r8,r8,16
 +      .endr
 +no_segments:
 +
 +      /* Restore per thread state */
 +
 +      ld      r4,_SPURR(r1)
 +      mtspr   SPRN_SPURR,r4
 +      ld      r4,_PURR(r1)
 +      mtspr   SPRN_PURR,r4
 +      ld      r4,_DSCR(r1)
 +      mtspr   SPRN_DSCR,r4
 +      ld      r4,_WORT(r1)
 +      mtspr   SPRN_WORT,r4
 +
 +      /* Call cur_cpu_spec->cpu_restore() */
 +      LOAD_REG_ADDR(r4, cur_cpu_spec)
 +      ld      r4,0(r4)
 +      ld      r12,CPU_SPEC_RESTORE(r4)
 +#ifdef PPC64_ELF_ABI_v1
 +      ld      r12,0(r12)
 +#endif
 +      mtctr   r12
 +      bctrl
 +
 +hypervisor_state_restored:
 +
 +      mtspr   SPRN_SRR1,r16
 +      mtlr    r17
 +      blr     /* Return back to System Reset vector from where
 +                 pnv_restore_hyp_resource was invoked */
 +
 +fastsleep_workaround_at_exit:
 +      li      r3,1
 +      li      r4,0
 +      bl      opal_rm_config_cpu_idle_state
 +      b       timebase_resync
 +
 +/*
 + * R3 here contains the value that will be returned to the caller
 + * of power7_nap.
 + */
 +_GLOBAL(pnv_wakeup_loss)
 +      ld      r1,PACAR1(r13)
 +BEGIN_FTR_SECTION
 +      CHECK_HMI_INTERRUPT
 +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 +      REST_NVGPRS(r1)
 +      REST_GPR(2, r1)
 +      ld      r6,_CCR(r1)
 +      ld      r4,_MSR(r1)
 +      ld      r5,_NIP(r1)
 +      addi    r1,r1,INT_FRAME_SIZE
 +      mtcr    r6
 +      mtspr   SPRN_SRR1,r4
 +      mtspr   SPRN_SRR0,r5
 +      rfid
 +
 +/*
 + * R3 here contains the value that will be returned to the caller
 + * of power7_nap.
 + */
 +_GLOBAL(pnv_wakeup_noloss)
 +      lbz     r0,PACA_NAPSTATELOST(r13)
 +      cmpwi   r0,0
 +      bne     pnv_wakeup_loss
 +BEGIN_FTR_SECTION
 +      CHECK_HMI_INTERRUPT
 +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 +      ld      r1,PACAR1(r13)
 +      ld      r6,_CCR(r1)
 +      ld      r4,_MSR(r1)
 +      ld      r5,_NIP(r1)
 +      addi    r1,r1,INT_FRAME_SIZE
 +      mtcr    r6
 +      mtspr   SPRN_SRR1,r4
 +      mtspr   SPRN_SRR0,r5
 +      rfid
@@@ -60,7 -60,7 +60,8 @@@
  #include <asm/switch_to.h>
  #include <asm/tm.h>
  #include <asm/debug.h>
 +#include <asm/asm-prototypes.h>
+ #include <asm/hmi.h>
  #include <sysdev/fsl_pci.h>
  
  #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
@@@ -308,9 -308,13 +309,13 @@@ long hmi_exception_realmode(struct pt_r
  {
        __this_cpu_inc(irq_stat.hmi_exceptions);
  
+       wait_for_subcore_guest_exit();
        if (ppc_md.hmi_exception_early)
                ppc_md.hmi_exception_early(regs);
  
+       wait_for_tb_resync();
        return 0;
  }
  
@@@ -1377,7 -1381,6 +1382,7 @@@ void facility_unavailable_exception(str
                [FSCR_TM_LG] = "TM",
                [FSCR_EBB_LG] = "EBB",
                [FSCR_TAR_LG] = "TAR",
 +              [FSCR_LM_LG] = "LM",
        };
        char *facility = "unknown";
        u64 value;
                        rd = (instword >> 21) & 0x1f;
                        current->thread.dscr = regs->gpr[rd];
                        current->thread.dscr_inherit = 1;
 -                      mtspr(SPRN_FSCR, value | FSCR_DSCR);
 +                      current->thread.fscr |= FSCR_DSCR;
 +                      mtspr(SPRN_FSCR, current->thread.fscr);
                }
  
                /* Read from DSCR (mfspr RT, 0x03) */
                        emulate_single_step(regs);
                }
                return;
 +      } else if ((status == FSCR_LM_LG) && cpu_has_feature(CPU_FTR_ARCH_300)) {
 +              /*
 +               * This process has touched LM, so turn it on forever
 +               * for this process
 +               */
 +              current->thread.fscr |= FSCR_LM;
 +              mtspr(SPRN_FSCR, current->thread.fscr);
 +              return;
        }
  
        if ((status < ARRAY_SIZE(facility_strings)) &&
@@@ -29,6 -29,7 +29,7 @@@
  #include <asm/kvm_book3s_asm.h>
  #include <asm/book3s/64/mmu-hash.h>
  #include <asm/tm.h>
+ #include <asm/opal.h>
  
  #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
  
@@@ -373,6 -374,18 +374,18 @@@ kvm_secondary_got_guest
        lwsync
        std     r0, HSTATE_KVM_VCORE(r13)
  
+       /*
+        * All secondaries exiting guest will fall through this path.
+        * Before proceeding, just check for HMI interrupt and
+        * invoke opal hmi handler. By now we are sure that the
+        * primary thread on this core/subcore has already made partition
+        * switch/TB resync and we are good to call opal hmi handler.
+        */
+       cmpwi   r12, BOOK3S_INTERRUPT_HMI
+       bne     kvm_no_guest
+       li      r3,0                    /* NULL argument */
+       bl      hmi_exception_realmode
  /*
   * At this point we have finished executing in the guest.
   * We need to wait for hwthread_req to become zero, since
@@@ -392,7 -405,7 +405,7 @@@ kvm_no_guest
        cmpwi   r3, 0
        bne     54f
  /*
 - * We jump to power7_wakeup_loss, which will return to the caller
 + * We jump to pnv_wakeup_loss, which will return to the caller
   * of power7_nap in the powernv cpu offline loop.  The value we
   * put in r3 becomes the return value for power7_nap.
   */
        rlwimi  r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
        mtspr   SPRN_LPCR, r4
        li      r3, 0
 -      b       power7_wakeup_loss
 +      b       pnv_wakeup_loss
  
  53:   HMT_LOW
        ld      r5, HSTATE_KVM_VCORE(r13)
   * whole-core mode, so we need to nap.
   */
  kvm_unsplit_nap:
+       /*
+        * When secondaries are napping in kvm_unsplit_nap() with
+        * hwthread_req = 1, HMI goes ignored even though subcores are
+        * already exited the guest. Hence HMI keeps waking up secondaries
+        * from nap in a loop and secondaries always go back to nap since
+        * no vcore is assigned to them. This makes impossible for primary
+        * thread to get hold of secondary threads resulting into a soft
+        * lockup in KVM path.
+        *
+        * Let us check if HMI is pending and handle it before we go to nap.
+        */
+       cmpwi   r12, BOOK3S_INTERRUPT_HMI
+       bne     55f
+       li      r3, 0                   /* NULL argument */
+       bl      hmi_exception_realmode
+ 55:
        /*
         * Ensure that secondary doesn't nap when it has
         * its vcore pointer set.
@@@ -601,6 -630,11 +630,11 @@@ BEGIN_FTR_SECTIO
        mtspr   SPRN_DPDES, r8
  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  
+       /* Mark the subcore state as inside guest */
+       bl      kvmppc_subcore_enter_guest
+       nop
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       ld      r4, HSTATE_KVM_VCPU(r13)
        li      r0,1
        stb     r0,VCORE_IN_GUEST(r5)   /* signal secondaries to continue */
  
@@@ -655,112 -689,8 +689,8 @@@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S
  
  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
  BEGIN_FTR_SECTION
-       b       skip_tm
- END_FTR_SECTION_IFCLR(CPU_FTR_TM)
-       /* Turn on TM/FP/VSX/VMX so we can restore them. */
-       mfmsr   r5
-       li      r6, MSR_TM >> 32
-       sldi    r6, r6, 32
-       or      r5, r5, r6
-       ori     r5, r5, MSR_FP
-       oris    r5, r5, (MSR_VEC | MSR_VSX)@h
-       mtmsrd  r5
-       /*
-        * The user may change these outside of a transaction, so they must
-        * always be context switched.
-        */
-       ld      r5, VCPU_TFHAR(r4)
-       ld      r6, VCPU_TFIAR(r4)
-       ld      r7, VCPU_TEXASR(r4)
-       mtspr   SPRN_TFHAR, r5
-       mtspr   SPRN_TFIAR, r6
-       mtspr   SPRN_TEXASR, r7
-       ld      r5, VCPU_MSR(r4)
-       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
-       beq     skip_tm /* TM not active in guest */
-       /* Make sure the failure summary is set, otherwise we'll program check
-        * when we trechkpt.  It's possible that this might have been not set
-        * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
-        * host.
-        */
-       oris    r7, r7, (TEXASR_FS)@h
-       mtspr   SPRN_TEXASR, r7
-       /*
-        * We need to load up the checkpointed state for the guest.
-        * We need to do this early as it will blow away any GPRs, VSRs and
-        * some SPRs.
-        */
-       mr      r31, r4
-       addi    r3, r31, VCPU_FPRS_TM
-       bl      load_fp_state
-       addi    r3, r31, VCPU_VRS_TM
-       bl      load_vr_state
-       mr      r4, r31
-       lwz     r7, VCPU_VRSAVE_TM(r4)
-       mtspr   SPRN_VRSAVE, r7
-       ld      r5, VCPU_LR_TM(r4)
-       lwz     r6, VCPU_CR_TM(r4)
-       ld      r7, VCPU_CTR_TM(r4)
-       ld      r8, VCPU_AMR_TM(r4)
-       ld      r9, VCPU_TAR_TM(r4)
-       mtlr    r5
-       mtcr    r6
-       mtctr   r7
-       mtspr   SPRN_AMR, r8
-       mtspr   SPRN_TAR, r9
-       /*
-        * Load up PPR and DSCR values but don't put them in the actual SPRs
-        * till the last moment to avoid running with userspace PPR and DSCR for
-        * too long.
-        */
-       ld      r29, VCPU_DSCR_TM(r4)
-       ld      r30, VCPU_PPR_TM(r4)
-       std     r2, PACATMSCRATCH(r13) /* Save TOC */
-       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
-       li      r5, 0
-       mtmsrd  r5, 1
-       /* Load GPRs r0-r28 */
-       reg = 0
-       .rept   29
-       ld      reg, VCPU_GPRS_TM(reg)(r31)
-       reg = reg + 1
-       .endr
-       mtspr   SPRN_DSCR, r29
-       mtspr   SPRN_PPR, r30
-       /* Load final GPRs */
-       ld      29, VCPU_GPRS_TM(29)(r31)
-       ld      30, VCPU_GPRS_TM(30)(r31)
-       ld      31, VCPU_GPRS_TM(31)(r31)
-       /* TM checkpointed state is now setup.  All GPRs are now volatile. */
-       TRECHKPT
-       /* Now let's get back the state we need. */
-       HMT_MEDIUM
-       GET_PACA(r13)
-       ld      r29, HSTATE_DSCR(r13)
-       mtspr   SPRN_DSCR, r29
-       ld      r4, HSTATE_KVM_VCPU(r13)
-       ld      r1, HSTATE_HOST_R1(r13)
-       ld      r2, PACATMSCRATCH(r13)
-       /* Set the MSR RI since we have our registers back. */
-       li      r5, MSR_RI
-       mtmsrd  r5, 1
- skip_tm:
+       bl      kvmppc_restore_tm
+ END_FTR_SECTION_IFSET(CPU_FTR_TM)
  #endif
  
        /* Load guest PMU registers */
@@@ -841,12 -771,6 +771,6 @@@ BEGIN_FTR_SECTIO
        /* Skip next section on POWER7 */
        b       8f
  END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
-       /* Turn on TM so we can access TFHAR/TFIAR/TEXASR */
-       mfmsr   r8
-       li      r0, 1
-       rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
-       mtmsrd  r8
        /* Load up POWER8-specific registers */
        ld      r5, VCPU_IAMR(r4)
        lwz     r6, VCPU_PSPB(r4)
@@@ -1436,106 -1360,8 +1360,8 @@@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S
  
  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
  BEGIN_FTR_SECTION
-       b       2f
- END_FTR_SECTION_IFCLR(CPU_FTR_TM)
-       /* Turn on TM. */
-       mfmsr   r8
-       li      r0, 1
-       rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
-       mtmsrd  r8
-       ld      r5, VCPU_MSR(r9)
-       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
-       beq     1f      /* TM not active in guest. */
-       li      r3, TM_CAUSE_KVM_RESCHED
-       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
-       li      r5, 0
-       mtmsrd  r5, 1
-       /* All GPRs are volatile at this point. */
-       TRECLAIM(R3)
-       /* Temporarily store r13 and r9 so we have some regs to play with */
-       SET_SCRATCH0(r13)
-       GET_PACA(r13)
-       std     r9, PACATMSCRATCH(r13)
-       ld      r9, HSTATE_KVM_VCPU(r13)
-       /* Get a few more GPRs free. */
-       std     r29, VCPU_GPRS_TM(29)(r9)
-       std     r30, VCPU_GPRS_TM(30)(r9)
-       std     r31, VCPU_GPRS_TM(31)(r9)
-       /* Save away PPR and DSCR soon so don't run with user values. */
-       mfspr   r31, SPRN_PPR
-       HMT_MEDIUM
-       mfspr   r30, SPRN_DSCR
-       ld      r29, HSTATE_DSCR(r13)
-       mtspr   SPRN_DSCR, r29
-       /* Save all but r9, r13 & r29-r31 */
-       reg = 0
-       .rept   29
-       .if (reg != 9) && (reg != 13)
-       std     reg, VCPU_GPRS_TM(reg)(r9)
-       .endif
-       reg = reg + 1
-       .endr
-       /* ... now save r13 */
-       GET_SCRATCH0(r4)
-       std     r4, VCPU_GPRS_TM(13)(r9)
-       /* ... and save r9 */
-       ld      r4, PACATMSCRATCH(r13)
-       std     r4, VCPU_GPRS_TM(9)(r9)
-       /* Reload stack pointer and TOC. */
-       ld      r1, HSTATE_HOST_R1(r13)
-       ld      r2, PACATOC(r13)
-       /* Set MSR RI now we have r1 and r13 back. */
-       li      r5, MSR_RI
-       mtmsrd  r5, 1
-       /* Save away checkpinted SPRs. */
-       std     r31, VCPU_PPR_TM(r9)
-       std     r30, VCPU_DSCR_TM(r9)
-       mflr    r5
-       mfcr    r6
-       mfctr   r7
-       mfspr   r8, SPRN_AMR
-       mfspr   r10, SPRN_TAR
-       std     r5, VCPU_LR_TM(r9)
-       stw     r6, VCPU_CR_TM(r9)
-       std     r7, VCPU_CTR_TM(r9)
-       std     r8, VCPU_AMR_TM(r9)
-       std     r10, VCPU_TAR_TM(r9)
-       /* Restore r12 as trap number. */
-       lwz     r12, VCPU_TRAP(r9)
-       /* Save FP/VSX. */
-       addi    r3, r9, VCPU_FPRS_TM
-       bl      store_fp_state
-       addi    r3, r9, VCPU_VRS_TM
-       bl      store_vr_state
-       mfspr   r6, SPRN_VRSAVE
-       stw     r6, VCPU_VRSAVE_TM(r9)
- 1:
-       /*
-        * We need to save these SPRs after the treclaim so that the software
-        * error code is recorded correctly in the TEXASR.  Also the user may
-        * change these outside of a transaction, so they must always be
-        * context switched.
-        */
-       mfspr   r5, SPRN_TFHAR
-       mfspr   r6, SPRN_TFIAR
-       mfspr   r7, SPRN_TEXASR
-       std     r5, VCPU_TFHAR(r9)
-       std     r6, VCPU_TFIAR(r9)
-       std     r7, VCPU_TEXASR(r9)
- 2:
+       bl      kvmppc_save_tm
+ END_FTR_SECTION_IFSET(CPU_FTR_TM)
  #endif
  
        /* Increment yield count if they have a VPA */
@@@ -1683,6 -1509,23 +1509,23 @@@ BEGIN_FTR_SECTIO
        mtspr   SPRN_DPDES, r8
  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  
+       /* If HMI, call kvmppc_realmode_hmi_handler() */
+       cmpwi   r12, BOOK3S_INTERRUPT_HMI
+       bne     27f
+       bl      kvmppc_realmode_hmi_handler
+       nop
+       li      r12, BOOK3S_INTERRUPT_HMI
+       /*
+        * At this point kvmppc_realmode_hmi_handler has already resynced
+        * the timebase, so there is no need to subtract the guest timebase
+        * offset from the timebase here.
+        *
+        * Also, do not call kvmppc_subcore_exit_guest() here, because it
+        * has already been invoked from kvmppc_realmode_hmi_handler().
+        */
+       b       30f
+ 27:
        /* Subtract timebase offset from timebase */
        ld      r8,VCORE_TB_OFFSET(r5)
        cmpdi   r8,0
        addis   r8,r8,0x100             /* if so, increment upper 40 bits */
        mtspr   SPRN_TBU40,r8
  
+ 17:   bl      kvmppc_subcore_exit_guest
+       nop
+ 30:   ld      r5,HSTATE_KVM_VCORE(r13)
+       ld      r4,VCORE_KVM(r5)        /* pointer to struct kvm */
        /* Reset PCR */
- 17:   ld      r0, VCORE_PCR(r5)
+       ld      r0, VCORE_PCR(r5)
        cmpdi   r0, 0
        beq     18f
        li      r0, 0
@@@ -2245,6 -2093,13 +2093,13 @@@ _GLOBAL(kvmppc_h_cede)                /* r3 = vcpu po
        /* save FP state */
        bl      kvmppc_save_fp
  
+ #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ BEGIN_FTR_SECTION
+       ld      r9, HSTATE_KVM_VCPU(r13)
+       bl      kvmppc_save_tm
+ END_FTR_SECTION_IFSET(CPU_FTR_TM)
+ #endif
        /*
         * Set DEC to the smaller of DEC and HDEC, so that we wake
         * no later than the end of our timeslice (HDEC interrupts
@@@ -2321,6 -2176,12 +2176,12 @@@ kvm_end_cede
        bl      kvmhv_accumulate_time
  #endif
  
+ #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ BEGIN_FTR_SECTION
+       bl      kvmppc_restore_tm
+ END_FTR_SECTION_IFSET(CPU_FTR_TM)
+ #endif
        /* load up FP state */
        bl      kvmppc_load_fp
  
@@@ -2461,6 -2322,8 +2322,8 @@@ BEGIN_FTR_SECTIO
        cmpwi   r6, 3                   /* hypervisor doorbell? */
        beq     3f
  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       cmpwi   r6, 0xa                 /* Hypervisor maintenance? */
+       beq     4f
        li      r3, 1                   /* anything else, return 1 */
  0:    blr
  
        li      r3, -1
        blr
  
+       /* Woken up due to Hypervisor maintenance interrupt */
+ 4:    li      r12, BOOK3S_INTERRUPT_HMI
+       li      r3, 1
+       blr
  /*
   * Determine what sort of external interrupt is pending (if any).
   * Returns:
@@@ -2631,6 -2499,239 +2499,239 @@@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC
        mr      r4,r31
        blr
  
+ #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ /*
+  * Save transactional state and TM-related registers.
+  * Called with r9 pointing to the vcpu struct.
+  * This can modify all checkpointed registers, but
+  * restores r1, r2 and r9 (vcpu pointer) before exit.
+  */
+ kvmppc_save_tm:
+       mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+       /* Turn on TM. */
+       mfmsr   r8
+       li      r0, 1
+       rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+       mtmsrd  r8
+       ld      r5, VCPU_MSR(r9)
+       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+       beq     1f      /* TM not active in guest. */
+       std     r1, HSTATE_HOST_R1(r13)
+       li      r3, TM_CAUSE_KVM_RESCHED
+       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
+       li      r5, 0
+       mtmsrd  r5, 1
+       /* All GPRs are volatile at this point. */
+       TRECLAIM(R3)
+       /* Temporarily store r13 and r9 so we have some regs to play with */
+       SET_SCRATCH0(r13)
+       GET_PACA(r13)
+       std     r9, PACATMSCRATCH(r13)
+       ld      r9, HSTATE_KVM_VCPU(r13)
+       /* Get a few more GPRs free. */
+       std     r29, VCPU_GPRS_TM(29)(r9)
+       std     r30, VCPU_GPRS_TM(30)(r9)
+       std     r31, VCPU_GPRS_TM(31)(r9)
+       /* Save PPR and DSCR early so we don't run with user values. */
+       mfspr   r31, SPRN_PPR
+       HMT_MEDIUM
+       mfspr   r30, SPRN_DSCR
+       ld      r29, HSTATE_DSCR(r13)
+       mtspr   SPRN_DSCR, r29
+       /* Save all but r9, r13 & r29-r31 */
+       reg = 0
+       .rept   29
+       .if (reg != 9) && (reg != 13)
+       std     reg, VCPU_GPRS_TM(reg)(r9)
+       .endif
+       reg = reg + 1
+       .endr
+       /* ... now save r13 */
+       GET_SCRATCH0(r4)
+       std     r4, VCPU_GPRS_TM(13)(r9)
+       /* ... and save r9 */
+       ld      r4, PACATMSCRATCH(r13)
+       std     r4, VCPU_GPRS_TM(9)(r9)
+       /* Reload stack pointer and TOC. */
+       ld      r1, HSTATE_HOST_R1(r13)
+       ld      r2, PACATOC(r13)
+       /* Set MSR RI now we have r1 and r13 back. */
+       li      r5, MSR_RI
+       mtmsrd  r5, 1
+       /* Save away checkpointed SPRs. */
+       std     r31, VCPU_PPR_TM(r9)
+       std     r30, VCPU_DSCR_TM(r9)
+       mflr    r5
+       mfcr    r6
+       mfctr   r7
+       mfspr   r8, SPRN_AMR
+       mfspr   r10, SPRN_TAR
+       std     r5, VCPU_LR_TM(r9)
+       stw     r6, VCPU_CR_TM(r9)
+       std     r7, VCPU_CTR_TM(r9)
+       std     r8, VCPU_AMR_TM(r9)
+       std     r10, VCPU_TAR_TM(r9)
+       /* Restore r12 as trap number. */
+       lwz     r12, VCPU_TRAP(r9)
+       /* Save FP/VSX. */
+       addi    r3, r9, VCPU_FPRS_TM
+       bl      store_fp_state
+       addi    r3, r9, VCPU_VRS_TM
+       bl      store_vr_state
+       mfspr   r6, SPRN_VRSAVE
+       stw     r6, VCPU_VRSAVE_TM(r9)
+ 1:
+       /*
+        * We need to save these SPRs after the treclaim so that the software
+        * error code is recorded correctly in the TEXASR.  Also the user may
+        * change these outside of a transaction, so they must always be
+        * context switched.
+        */
+       mfspr   r5, SPRN_TFHAR
+       mfspr   r6, SPRN_TFIAR
+       mfspr   r7, SPRN_TEXASR
+       std     r5, VCPU_TFHAR(r9)
+       std     r6, VCPU_TFIAR(r9)
+       std     r7, VCPU_TEXASR(r9)
+       ld      r0, PPC_LR_STKOFF(r1)
+       mtlr    r0
+       blr
+ /*
+  * Restore transactional state and TM-related registers.
+  * Called with r4 pointing to the vcpu struct.
+  * This potentially modifies all checkpointed registers.
+  * It restores r1, r2, r4 from the PACA.
+  */
+ kvmppc_restore_tm:
+       mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+       /* Turn on TM/FP/VSX/VMX so we can restore them. */
+       mfmsr   r5
+       li      r6, MSR_TM >> 32
+       sldi    r6, r6, 32
+       or      r5, r5, r6
+       ori     r5, r5, MSR_FP
+       oris    r5, r5, (MSR_VEC | MSR_VSX)@h
+       mtmsrd  r5
+       /*
+        * The user may change these outside of a transaction, so they must
+        * always be context switched.
+        */
+       ld      r5, VCPU_TFHAR(r4)
+       ld      r6, VCPU_TFIAR(r4)
+       ld      r7, VCPU_TEXASR(r4)
+       mtspr   SPRN_TFHAR, r5
+       mtspr   SPRN_TFIAR, r6
+       mtspr   SPRN_TEXASR, r7
+       ld      r5, VCPU_MSR(r4)
+       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+       beqlr           /* TM not active in guest */
+       std     r1, HSTATE_HOST_R1(r13)
+       /* Make sure the failure summary is set; otherwise we'll take a
+        * program check when we trechkpt.  It's possible that this was not
+        * set by a kvmppc_set_one_reg() call, but we shouldn't let that
+        * crash the host.
+        */
+       oris    r7, r7, (TEXASR_FS)@h
+       mtspr   SPRN_TEXASR, r7
+       /*
+        * We need to load up the checkpointed state for the guest.
+        * We need to do this early as it will blow away any GPRs, VSRs and
+        * some SPRs.
+        */
+       mr      r31, r4
+       addi    r3, r31, VCPU_FPRS_TM
+       bl      load_fp_state
+       addi    r3, r31, VCPU_VRS_TM
+       bl      load_vr_state
+       mr      r4, r31
+       lwz     r7, VCPU_VRSAVE_TM(r4)
+       mtspr   SPRN_VRSAVE, r7
+       ld      r5, VCPU_LR_TM(r4)
+       lwz     r6, VCPU_CR_TM(r4)
+       ld      r7, VCPU_CTR_TM(r4)
+       ld      r8, VCPU_AMR_TM(r4)
+       ld      r9, VCPU_TAR_TM(r4)
+       mtlr    r5
+       mtcr    r6
+       mtctr   r7
+       mtspr   SPRN_AMR, r8
+       mtspr   SPRN_TAR, r9
+       /*
+        * Load up PPR and DSCR values but don't put them in the actual SPRs
+        * till the last moment to avoid running with userspace PPR and DSCR for
+        * too long.
+        */
+       ld      r29, VCPU_DSCR_TM(r4)
+       ld      r30, VCPU_PPR_TM(r4)
+       std     r2, PACATMSCRATCH(r13) /* Save TOC */
+       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
+       li      r5, 0
+       mtmsrd  r5, 1
+       /* Load GPRs r0-r28 */
+       reg = 0
+       .rept   29
+       ld      reg, VCPU_GPRS_TM(reg)(r31)
+       reg = reg + 1
+       .endr
+       mtspr   SPRN_DSCR, r29
+       mtspr   SPRN_PPR, r30
+       /* Load final GPRs */
+       ld      29, VCPU_GPRS_TM(29)(r31)
+       ld      30, VCPU_GPRS_TM(30)(r31)
+       ld      31, VCPU_GPRS_TM(31)(r31)
+       /* TM checkpointed state is now setup.  All GPRs are now volatile. */
+       TRECHKPT
+       /* Now let's get back the state we need. */
+       HMT_MEDIUM
+       GET_PACA(r13)
+       ld      r29, HSTATE_DSCR(r13)
+       mtspr   SPRN_DSCR, r29
+       ld      r4, HSTATE_KVM_VCPU(r13)
+       ld      r1, HSTATE_HOST_R1(r13)
+       ld      r2, PACATMSCRATCH(r13)
+       /* Set the MSR RI since we have our registers back. */
+       li      r5, MSR_RI
+       mtmsrd  r5, 1
+       ld      r0, PPC_LR_STKOFF(r1)
+       mtlr    r0
+       blr
+ #endif
  /*
   * We come here if we get any exception or interrupt while we are
   * executing host real mode code while in guest MMU context.
@@@ -35,7 -35,7 +35,7 @@@
  #include <asm/mmu_context.h>
  #include <asm/switch_to.h>
  #include <asm/firmware.h>
 -#include <asm/hvcall.h>
 +#include <asm/setup.h>
  #include <linux/gfp.h>
  #include <linux/sched.h>
  #include <linux/vmalloc.h>
@@@ -914,7 -914,7 +914,7 @@@ int kvmppc_handle_exit_pr(struct kvm_ru
        /* We get here with MSR.EE=1 */
  
        trace_kvm_exit(exit_nr, vcpu);
-       kvm_guest_exit();
+       guest_exit();
  
        switch (exit_nr) {
        case BOOK3S_INTERRUPT_INST_STORAGE:
                int emul;
  
  program_interrupt:
-               flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+               /*
+                * shadow_srr1 only contains valid flags if we came here via
+                * a program exception. The other exceptions (emulation assist,
+                * FP unavailable, etc.) do not provide flags in SRR1, so use
+                * an illegal-instruction exception when injecting a program
+                * interrupt into the guest.
+                */
+               if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
+                       flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+               else
+                       flags = SRR1_PROGILL;
  
                emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
                if (emul != EMULATE_DONE) {
@@@ -1531,7 -1541,7 +1541,7 @@@ static int kvmppc_vcpu_run_pr(struct kv
  
        kvmppc_clear_debug(vcpu);
  
-       /* No need for kvm_guest_exit. It's done in handle_exit.
+       /* No need for guest_exit. It's done in handle_exit.
           We also get here with interrupts enabled. */
  
        /* Make sure we save the guest FPU/Altivec/VSX state */
@@@ -1690,7 -1700,7 +1700,7 @@@ static int kvmppc_core_init_vm_pr(struc
        if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
                spin_lock(&kvm_global_user_count_lock);
                if (++kvm_global_user_count == 1)
 -                      pSeries_disable_reloc_on_exc();
 +                      pseries_disable_reloc_on_exc();
                spin_unlock(&kvm_global_user_count_lock);
        }
        return 0;
@@@ -1706,7 -1716,7 +1716,7 @@@ static void kvmppc_core_destroy_vm_pr(s
                spin_lock(&kvm_global_user_count_lock);
                BUG_ON(kvm_global_user_count == 0);
                if (--kvm_global_user_count == 0)
 -                      pSeries_enable_reloc_on_exc();
 +                      pseries_enable_reloc_on_exc();
                spin_unlock(&kvm_global_user_count_lock);
        }
  }
@@@ -59,12 -59,11 +59,11 @@@ END_FTR_SECTION(0, 1);                                             
  #define OPAL_CALL(name, token)                \
   _GLOBAL_TOC(name);                   \
        mflr    r0;                     \
 -      std     r0,16(r1);              \
 +      std     r0,PPC_LR_STKOFF(r1);   \
        li      r0,token;               \
        OPAL_BRANCH(opal_tracepoint_entry) \
        mfcr    r12;                    \
        stw     r12,8(r1);              \
-       std     r1,PACAR1(r13);         \
        li      r11,0;                  \
        mfmsr   r12;                    \
        ori     r11,r11,MSR_EE;         \
@@@ -92,7 -91,7 +91,7 @@@ opal_return
        FIXUP_ENDIAN
        ld      r2,PACATOC(r13);
        lwz     r4,8(r1);
 -      ld      r5,16(r1);
 +      ld      r5,PPC_LR_STKOFF(r1);
        ld      r6,PACASAVEDMSR(r13);
        mtspr   SPRN_SRR0,r5;
        mtspr   SPRN_SRR1,r6;
@@@ -127,7 -126,6 +126,6 @@@ opal_tracepoint_entry
        mfcr    r12
        std     r11,16(r1)
        stw     r12,8(r1)
-       std     r1,PACAR1(r13)
        li      r11,0
        mfmsr   r12
        ori     r11,r11,MSR_EE
@@@ -157,37 -155,43 +155,37 @@@ opal_tracepoint_return
        blr
  #endif
  
 -/*
 - * Make opal call in realmode. This is a generic function to be called
 - * from realmode. It handles endianness.
 - *
 - * r13 - paca pointer
 - * r1  - stack pointer
 - * r0  - opal token
 - */
 -_GLOBAL(opal_call_realmode)
 -      mflr    r12
 -      std     r12,PPC_LR_STKOFF(r1)
 -      ld      r2,PACATOC(r13)
 -      /* Set opal return address */
 -      LOAD_REG_ADDR(r12,return_from_opal_call)
 -      mtlr    r12
 -
 -      mfmsr   r12
 -#ifdef __LITTLE_ENDIAN__
 -      /* Handle endian-ness */
 -      li      r11,MSR_LE
 -      andc    r12,r12,r11
 -#endif
 -      mtspr   SPRN_HSRR1,r12
 -      LOAD_REG_ADDR(r11,opal)
 -      ld      r12,8(r11)
 -      ld      r2,0(r11)
 -      mtspr   SPRN_HSRR0,r12
 +#define OPAL_CALL_REAL(name, token)                   \
 + _GLOBAL_TOC(name);                                   \
 +      mflr    r0;                                     \
 +      std     r0,PPC_LR_STKOFF(r1);                   \
 +      li      r0,token;                               \
 +      mfcr    r12;                                    \
 +      stw     r12,8(r1);                              \
 +                                                      \
 +      /* Set opal return address */                   \
 +      LOAD_REG_ADDR(r11, opal_return_realmode);       \
 +      mtlr    r11;                                    \
 +      mfmsr   r12;                                    \
 +      li      r11,MSR_LE;                             \
 +      andc    r12,r12,r11;                            \
 +      mtspr   SPRN_HSRR1,r12;                         \
 +      LOAD_REG_ADDR(r11,opal);                        \
 +      ld      r12,8(r11);                             \
 +      ld      r2,0(r11);                              \
 +      mtspr   SPRN_HSRR0,r12;                         \
        hrfid
  
 -return_from_opal_call:
 -#ifdef __LITTLE_ENDIAN__
 +opal_return_realmode:
        FIXUP_ENDIAN
 -#endif
 +      ld      r2,PACATOC(r13);
 +      lwz     r11,8(r1);
        ld      r12,PPC_LR_STKOFF(r1)
 +      mtcr    r11;
        mtlr    r12
        blr
  
 +
  OPAL_CALL(opal_invalid_call,                  OPAL_INVALID_CALL);
  OPAL_CALL(opal_console_write,                 OPAL_CONSOLE_WRITE);
  OPAL_CALL(opal_console_read,                  OPAL_CONSOLE_READ);
@@@ -265,7 -269,6 +263,7 @@@ OPAL_CALL(opal_validate_flash,                     OPAL_F
  OPAL_CALL(opal_manage_flash,                  OPAL_FLASH_MANAGE);
  OPAL_CALL(opal_update_flash,                  OPAL_FLASH_UPDATE);
  OPAL_CALL(opal_resync_timebase,                       OPAL_RESYNC_TIMEBASE);
 +OPAL_CALL_REAL(opal_rm_resync_timebase,               OPAL_RESYNC_TIMEBASE);
  OPAL_CALL(opal_check_token,                   OPAL_CHECK_TOKEN);
  OPAL_CALL(opal_dump_init,                     OPAL_DUMP_INIT);
  OPAL_CALL(opal_dump_info,                     OPAL_DUMP_INFO);
@@@ -273,7 -276,6 +271,7 @@@ OPAL_CALL(opal_dump_info2,                 OPAL_DUMP_
  OPAL_CALL(opal_dump_read,                     OPAL_DUMP_READ);
  OPAL_CALL(opal_dump_ack,                      OPAL_DUMP_ACK);
  OPAL_CALL(opal_get_msg,                               OPAL_GET_MSG);
 +OPAL_CALL(opal_write_oppanel_async,           OPAL_WRITE_OPPANEL_ASYNC);
  OPAL_CALL(opal_check_completion,              OPAL_CHECK_ASYNC_COMPLETION);
  OPAL_CALL(opal_dump_resend_notification,      OPAL_DUMP_RESEND);
  OPAL_CALL(opal_sync_host_reboot,              OPAL_SYNC_HOST_REBOOT);
@@@ -281,9 -283,7 +279,9 @@@ OPAL_CALL(opal_sensor_read,                        OPAL_SENS
  OPAL_CALL(opal_get_param,                     OPAL_GET_PARAM);
  OPAL_CALL(opal_set_param,                     OPAL_SET_PARAM);
  OPAL_CALL(opal_handle_hmi,                    OPAL_HANDLE_HMI);
 +OPAL_CALL_REAL(opal_rm_handle_hmi,            OPAL_HANDLE_HMI);
  OPAL_CALL(opal_config_cpu_idle_state,         OPAL_CONFIG_CPU_IDLE_STATE);
 +OPAL_CALL_REAL(opal_rm_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE);
  OPAL_CALL(opal_slw_set_reg,                   OPAL_SLW_SET_REG);
  OPAL_CALL(opal_register_dump_region,          OPAL_REGISTER_DUMP_REGION);
  OPAL_CALL(opal_unregister_dump_region,                OPAL_UNREGISTER_DUMP_REGION);
@@@ -300,13 -300,3 +298,13 @@@ OPAL_CALL(opal_prd_msg,                          OPAL_PRD_MSG
  OPAL_CALL(opal_leds_get_ind,                  OPAL_LEDS_GET_INDICATOR);
  OPAL_CALL(opal_leds_set_ind,                  OPAL_LEDS_SET_INDICATOR);
  OPAL_CALL(opal_console_flush,                 OPAL_CONSOLE_FLUSH);
 +OPAL_CALL(opal_get_device_tree,                       OPAL_GET_DEVICE_TREE);
 +OPAL_CALL(opal_pci_get_presence_state,                OPAL_PCI_GET_PRESENCE_STATE);
 +OPAL_CALL(opal_pci_get_power_state,           OPAL_PCI_GET_POWER_STATE);
 +OPAL_CALL(opal_pci_set_power_state,           OPAL_PCI_SET_POWER_STATE);
 +OPAL_CALL(opal_int_get_xirr,                  OPAL_INT_GET_XIRR);
 +OPAL_CALL(opal_int_set_cppr,                  OPAL_INT_SET_CPPR);
 +OPAL_CALL(opal_int_eoi,                               OPAL_INT_EOI);
 +OPAL_CALL(opal_int_set_mfrr,                  OPAL_INT_SET_MFRR);
 +OPAL_CALL(opal_pci_tce_kill,                  OPAL_PCI_TCE_KILL);
 +OPAL_CALL_REAL(opal_rm_pci_tce_kill,          OPAL_PCI_TCE_KILL);
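
The OPAL_CALL_REAL() stubs above generate only the real-mode assembly entry points; C callers still need prototypes, which presumably sit next to the normal-mode declarations in asm/opal.h rather than in the hunks shown here. A minimal sketch of what such declarations could look like, restricted to the zero-argument calls to avoid guessing parameter lists:

#include <linux/types.h>

/* Assumed declarations, mirroring the asm stubs above; not taken from this diff. */
int64_t opal_rm_resync_timebase(void);
int64_t opal_rm_handle_hmi(void);
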
@@@ -49,7 -49,7 +49,7 @@@ static inline void diag10_range(unsigne
        diag_stat_inc(DIAG_STAT_X010);
        asm volatile(
                "0:     diag    %0,%1,0x10\n"
 -              "1:\n"
 +              "1:     nopr    %%r7\n"
                EX_TABLE(0b, 1b)
                EX_TABLE(1b, 1b)
                : : "a" (start_addr), "a" (end_addr));
@@@ -78,4 -78,153 +78,153 @@@ struct diag210 
  
  extern int diag210(struct diag210 *addr);
  
+ /* bit is set in flags when physical cpu info is included in diag 204 data */
+ #define DIAG204_LPAR_PHYS_FLG 0x80
+ #define DIAG204_LPAR_NAME_LEN 8               /* lpar name len in diag 204 data */
+ #define DIAG204_CPU_NAME_LEN 16               /* type name len of cpus in diag224 name table */
+ /* diag 204 subcodes */
+ enum diag204_sc {
+       DIAG204_SUBC_STIB4 = 4,
+       DIAG204_SUBC_RSI = 5,
+       DIAG204_SUBC_STIB6 = 6,
+       DIAG204_SUBC_STIB7 = 7
+ };
+ /* The two available diag 204 data formats */
+ enum diag204_format {
+       DIAG204_INFO_SIMPLE = 0,
+       DIAG204_INFO_EXT = 0x00010000
+ };
+ enum diag204_cpu_flags {
+       DIAG204_CPU_ONLINE = 0x20,
+       DIAG204_CPU_CAPPED = 0x40,
+ };
+ struct diag204_info_blk_hdr {
+       __u8  npar;
+       __u8  flags;
+       __u16 tslice;
+       __u16 phys_cpus;
+       __u16 this_part;
+       __u64 curtod;
+ } __packed;
+ struct diag204_x_info_blk_hdr {
+       __u8  npar;
+       __u8  flags;
+       __u16 tslice;
+       __u16 phys_cpus;
+       __u16 this_part;
+       __u64 curtod1;
+       __u64 curtod2;
+       char reserved[40];
+ } __packed;
+ struct diag204_part_hdr {
+       __u8 pn;
+       __u8 cpus;
+       char reserved[6];
+       char part_name[DIAG204_LPAR_NAME_LEN];
+ } __packed;
+ struct diag204_x_part_hdr {
+       __u8  pn;
+       __u8  cpus;
+       __u8  rcpus;
+       __u8  pflag;
+       __u32 mlu;
+       char  part_name[DIAG204_LPAR_NAME_LEN];
+       char  lpc_name[8];
+       char  os_name[8];
+       __u64 online_cs;
+       __u64 online_es;
+       __u8  upid;
+       __u8  reserved:3;
+       __u8  mtid:5;
+       char  reserved1[2];
+       __u32 group_mlu;
+       char  group_name[8];
+       char  hardware_group_name[8];
+       char  reserved2[24];
+ } __packed;
+ struct diag204_cpu_info {
+       __u16 cpu_addr;
+       char  reserved1[2];
+       __u8  ctidx;
+       __u8  cflag;
+       __u16 weight;
+       __u64 acc_time;
+       __u64 lp_time;
+ } __packed;
+ struct diag204_x_cpu_info {
+       __u16 cpu_addr;
+       char  reserved1[2];
+       __u8  ctidx;
+       __u8  cflag;
+       __u16 weight;
+       __u64 acc_time;
+       __u64 lp_time;
+       __u16 min_weight;
+       __u16 cur_weight;
+       __u16 max_weight;
+       char  reseved2[2];
+       __u64 online_time;
+       __u64 wait_time;
+       __u32 pma_weight;
+       __u32 polar_weight;
+       __u32 cpu_type_cap;
+       __u32 group_cpu_type_cap;
+       char  reserved3[32];
+ } __packed;
+ struct diag204_phys_hdr {
+       char reserved1[1];
+       __u8 cpus;
+       char reserved2[6];
+       char mgm_name[8];
+ } __packed;
+ struct diag204_x_phys_hdr {
+       char reserved1[1];
+       __u8 cpus;
+       char reserved2[6];
+       char mgm_name[8];
+       char reserved3[80];
+ } __packed;
+ struct diag204_phys_cpu {
+       __u16 cpu_addr;
+       char  reserved1[2];
+       __u8  ctidx;
+       char  reserved2[3];
+       __u64 mgm_time;
+       char  reserved3[8];
+ } __packed;
+ struct diag204_x_phys_cpu {
+       __u16 cpu_addr;
+       char  reserved1[2];
+       __u8  ctidx;
+       char  reserved2[1];
+       __u16 weight;
+       __u64 mgm_time;
+       char  reserved3[80];
+ } __packed;
+ struct diag204_x_part_block {
+       struct diag204_x_part_hdr hdr;
+       struct diag204_x_cpu_info cpus[];
+ } __packed;
+ struct diag204_x_phys_block {
+       struct diag204_x_phys_hdr hdr;
+       struct diag204_x_phys_cpu cpus[];
+ } __packed;
+ int diag204(unsigned long subcode, unsigned long size, void *addr);
+ int diag224(void *ptr);
  #endif /* _ASM_S390_DIAG_H */
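
For orientation, here is a minimal sketch (not from this patch) of how the new diag204()/diag224() wrappers can be driven. The OR-combination of subcode and data format follows the existing hypfs usage; the four-page buffer, the page-count unit for the size argument, and the example_* naming are assumptions:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <asm/diag.h>

static int example_read_lpar_data(void)
{
	void *buf, *cpu_names;
	int rc = -ENOMEM;

	buf = (void *)__get_free_pages(GFP_KERNEL, 2);	/* 4 pages, assumed sufficient */
	cpu_names = (void *)get_zeroed_page(GFP_KERNEL);
	if (!buf || !cpu_names)
		goto out;
	/* subcode and data format are OR'ed into the first argument */
	rc = diag204(DIAG204_SUBC_STIB6 | DIAG204_INFO_EXT, 4, buf);
	if (rc < 0)
		goto out;
	/* diag224 fills a one-page CPU-type name table, indexed by ctidx */
	rc = diag224(cpu_names);
out:
	free_pages((unsigned long)buf, 2);
	free_page((unsigned long)cpu_names);
	return rc;
}
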
@@@ -43,6 -43,7 +43,7 @@@
  /* s390-specific vcpu->requests bit members */
  #define KVM_REQ_ENABLE_IBS         8
  #define KVM_REQ_DISABLE_IBS        9
+ #define KVM_REQ_ICPT_OPEREXC       10
  
  #define SIGP_CTRL_C           0x80
  #define SIGP_CTRL_SCN_MASK    0x3f
@@@ -145,7 -146,7 +146,7 @@@ struct kvm_s390_sie_block 
        __u64   cputm;                  /* 0x0028 */
        __u64   ckc;                    /* 0x0030 */
        __u64   epoch;                  /* 0x0038 */
-       __u8    reserved40[4];          /* 0x0040 */
+       __u32   svcc;                   /* 0x0040 */
  #define LCTL_CR0      0x8000
  #define LCTL_CR6      0x0200
  #define LCTL_CR9      0x0040
  #define LCTL_CR14     0x0002
        __u16   lctl;                   /* 0x0044 */
        __s16   icpua;                  /* 0x0046 */
+ #define ICTL_OPEREXC  0x80000000
  #define ICTL_PINT     0x20000000
  #define ICTL_LPSW     0x00400000
  #define ICTL_STCTL    0x00040000
  #define ICPT_INST     0x04
  #define ICPT_PROGI    0x08
  #define ICPT_INSTPROGI        0x0C
+ #define ICPT_EXTINT   0x14
+ #define ICPT_VALIDITY 0x20
+ #define ICPT_STOP     0x28
  #define ICPT_OPEREXC  0x2C
  #define ICPT_PARTEXEC 0x38
  #define ICPT_IOINST   0x40
        __u32   scaol;                  /* 0x0064 */
        __u8    reserved68[4];          /* 0x0068 */
        __u32   todpr;                  /* 0x006c */
-       __u8    reserved70[32];         /* 0x0070 */
+       __u8    reserved70[16];         /* 0x0070 */
+       __u64   mso;                    /* 0x0080 */
+       __u64   msl;                    /* 0x0088 */
        psw_t   gpsw;                   /* 0x0090 */
        __u64   gg14;                   /* 0x00a0 */
        __u64   gg15;                   /* 0x00a8 */
        __u8    reserved1e6[2];         /* 0x01e6 */
        __u64   itdba;                  /* 0x01e8 */
        __u64   riccbd;                 /* 0x01f0 */
-       __u8    reserved1f8[8];         /* 0x01f8 */
+       __u64   gvrd;                   /* 0x01f8 */
  } __attribute__((packed));
  
  struct kvm_s390_itdb {
@@@ -245,7 -252,6 +252,7 @@@ struct kvm_vcpu_stat 
        u32 exit_stop_request;
        u32 exit_validity;
        u32 exit_instruction;
 +      u32 exit_pei;
        u32 halt_successful_poll;
        u32 halt_attempted_poll;
        u32 halt_poll_invalid;
        u32 instruction_stctg;
        u32 exit_program_interruption;
        u32 exit_instr_and_program;
+       u32 exit_operation_exception;
        u32 deliver_external_call;
        u32 deliver_emergency_signal;
        u32 deliver_service_signal;
        u32 instruction_stsi;
        u32 instruction_stfl;
        u32 instruction_tprot;
+       u32 instruction_sie;
        u32 instruction_essa;
+       u32 instruction_sthyi;
        u32 instruction_sigp_sense;
        u32 instruction_sigp_sense_running;
        u32 instruction_sigp_external_call;
@@@ -541,12 -550,16 +551,16 @@@ struct kvm_guestdbg_info_arch 
  
  struct kvm_vcpu_arch {
        struct kvm_s390_sie_block *sie_block;
+       /* if vsie is active, currently executed shadow sie control block */
+       struct kvm_s390_sie_block *vsie_block;
        unsigned int      host_acrs[NUM_ACRS];
        struct fpu        host_fpregs;
        struct kvm_s390_local_interrupt local_int;
        struct hrtimer    ckc_timer;
        struct kvm_s390_pgm_info pgm;
        struct gmap *gmap;
+       /* backup location for the currently enabled gmap when scheduled out */
+       struct gmap *enabled_gmap;
        struct kvm_guestdbg_info_arch guestdbg;
        unsigned long pfault_token;
        unsigned long pfault_select;
@@@ -631,6 -644,14 +645,14 @@@ struct sie_page2 
        u8 reserved900[0x1000 - 0x900];                 /* 0x0900 */
  } __packed;
  
+ struct kvm_s390_vsie {
+       struct mutex mutex;
+       struct radix_tree_root addr_to_page;
+       int page_count;
+       int next;
+       struct page *pages[KVM_MAX_VCPUS];
+ };
  struct kvm_arch{
        void *sca;
        int use_esca;
        int user_cpu_state_ctrl;
        int user_sigp;
        int user_stsi;
+       int user_instr0;
        struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
        wait_queue_head_t ipte_wq;
        int ipte_lock_count;
        struct mutex ipte_mutex;
+       struct ratelimit_state sthyi_limit;
        spinlock_t start_stop_lock;
        struct sie_page2 *sie_page2;
        struct kvm_s390_cpu_model model;
        struct kvm_s390_crypto crypto;
+       struct kvm_s390_vsie vsie;
        u64 epoch;
+       /* subset of available cpu features enabled by user space */
+       DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
  };
  
  #define KVM_HVA_ERR_BAD               (-1UL)
@@@ -6,10 -6,11 +6,11 @@@
  
  typedef struct {
        cpumask_t cpu_attach_mask;
 -      atomic_t attach_count;
 +      atomic_t flush_count;
        unsigned int flush_mm;
-       spinlock_t list_lock;
+       spinlock_t pgtable_lock;
        struct list_head pgtable_list;
+       spinlock_t gmap_lock;
        struct list_head gmap_list;
        unsigned long asce;
        unsigned long asce_limit;
        unsigned int use_skey:1;
  } mm_context_t;
  
- #define INIT_MM_CONTEXT(name)                                               \
-       .context.list_lock    = __SPIN_LOCK_UNLOCKED(name.context.list_lock), \
-       .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list),    \
+ #define INIT_MM_CONTEXT(name)                                            \
+       .context.pgtable_lock =                                            \
+                       __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock),   \
+       .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
+       .context.gmap_lock = __SPIN_LOCK_UNLOCKED(name.context.gmap_lock), \
        .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list),
  
  static inline int tprot(unsigned long addr)
  static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
  {
-       spin_lock_init(&mm->context.list_lock);
+       spin_lock_init(&mm->context.pgtable_lock);
        INIT_LIST_HEAD(&mm->context.pgtable_list);
+       spin_lock_init(&mm->context.gmap_lock);
        INIT_LIST_HEAD(&mm->context.gmap_list);
        cpumask_clear(&mm->context.cpu_attach_mask);
 -      atomic_set(&mm->context.attach_count, 0);
 +      atomic_set(&mm->context.flush_count, 0);
        mm->context.flush_mm = 0;
  #ifdef CONFIG_PGSTE
        mm->context.alloc_pgste = page_table_allocate_pgste;
@@@ -90,12 -91,15 +91,12 @@@ static inline void switch_mm(struct mm_
        S390_lowcore.user_asce = next->context.asce;
        if (prev == next)
                return;
 -      if (MACHINE_HAS_TLB_LC)
 -              cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
 +      cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
 +      cpumask_set_cpu(cpu, mm_cpumask(next));
        /* Clear old ASCE by loading the kernel ASCE. */
        __ctl_load(S390_lowcore.kernel_asce, 1, 1);
        __ctl_load(S390_lowcore.kernel_asce, 7, 7);
 -      atomic_inc(&next->context.attach_count);
 -      atomic_dec(&prev->context.attach_count);
 -      if (MACHINE_HAS_TLB_LC)
 -              cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask);
 +      cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask);
  }
  
  #define finish_arch_post_lock_switch finish_arch_post_lock_switch
@@@ -107,9 -111,10 +108,9 @@@ static inline void finish_arch_post_loc
        load_kernel_asce();
        if (mm) {
                preempt_disable();
 -              while (atomic_read(&mm->context.attach_count) >> 16)
 +              while (atomic_read(&mm->context.flush_count))
                        cpu_relax();
  
 -              cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
                if (mm->context.flush_mm)
                        __tlb_flush_mm(mm);
                preempt_enable();
@@@ -124,6 -129,7 +125,6 @@@ static inline void activate_mm(struct m
                                 struct mm_struct *next)
  {
        switch_mm(prev, next, current);
 -      cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
        set_user_asce(next);
  }
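
The single list_lock that used to protect both lists is split above into pgtable_lock for pgtable_list and gmap_lock for gmap_list, so gmap registration no longer contends with page-table list handling. A hypothetical illustration of the resulting locking pattern (the real gmap code may differ in detail):

#include <linux/list.h>
#include <linux/mm_types.h>
#include <linux/spinlock.h>
#include <asm/gmap.h>

/* Hypothetical: additions to gmap_list take only gmap_lock, leaving
 * pgtable_lock free for page-table list traffic.
 */
static void example_add_gmap(struct mm_struct *mm, struct gmap *gmap)
{
	spin_lock(&mm->context.gmap_lock);
	list_add(&gmap->list, &mm->context.gmap_list);
	spin_unlock(&mm->context.gmap_lock);
}
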
  
@@@ -21,7 -21,6 +21,7 @@@
  #define HPAGE_SIZE    (1UL << HPAGE_SHIFT)
  #define HPAGE_MASK    (~(HPAGE_SIZE - 1))
  #define HUGETLB_PAGE_ORDER    (HPAGE_SHIFT - PAGE_SHIFT)
 +#define HUGE_MAX_HSTATE               2
  
  #define ARCH_HAS_SETCLEAR_HUGE_PTE
  #define ARCH_HAS_HUGE_PTE_TYPE
  #include <asm/setup.h>
  #ifndef __ASSEMBLY__
  
 +void __storage_key_init_range(unsigned long start, unsigned long end);
 +
  static inline void storage_key_init_range(unsigned long start, unsigned long end)
  {
 -#if PAGE_DEFAULT_KEY
 -      __storage_key_init_range(start, end);
 -#endif
 +      if (PAGE_DEFAULT_KEY)
 +              __storage_key_init_range(start, end);
  }
  
  #define clear_page(page)      memset((page), 0, PAGE_SIZE)
@@@ -111,13 -109,14 +111,14 @@@ static inline unsigned char page_get_st
  
  static inline int page_reset_referenced(unsigned long addr)
  {
-       unsigned int ipm;
+       int cc;
  
        asm volatile(
                "       rrbe    0,%1\n"
                "       ipm     %0\n"
-               : "=d" (ipm) : "a" (addr) : "cc");
-       return !!(ipm & 0x20000000);
+               "       srl     %0,28\n"
+               : "=d" (cc) : "a" (addr) : "cc");
+       return cc;
  }
  
  /* Bits int the storage key */
@@@ -148,6 -147,8 +149,8 @@@ static inline int devmem_is_allowed(uns
  #define virt_to_page(kaddr)   pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
  #define page_to_phys(page)    (page_to_pfn(page) << PAGE_SHIFT)
  #define virt_addr_valid(kaddr)        pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+ #define pfn_to_virt(pfn)      __va((pfn) << PAGE_SHIFT)
+ #define page_to_virt(page)    pfn_to_virt(page_to_pfn(page))
  
  #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \
                                 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
  #include <linux/mm_types.h>
  #include <linux/page-flags.h>
  #include <linux/radix-tree.h>
 +#include <linux/atomic.h>
  #include <asm/bug.h>
  #include <asm/page.h>
  
 -extern pgd_t swapper_pg_dir[] __attribute__ ((aligned (4096)));
 +extern pgd_t swapper_pg_dir[];
  extern void paging_init(void);
  extern void vmem_map_init(void);
 +pmd_t *vmem_pmd_alloc(void);
 +pte_t *vmem_pte_alloc(void);
 +
 +enum {
 +      PG_DIRECT_MAP_4K = 0,
 +      PG_DIRECT_MAP_1M,
 +      PG_DIRECT_MAP_2G,
 +      PG_DIRECT_MAP_MAX
 +};
 +
 +extern atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX];
 +
 +static inline void update_page_count(int level, long count)
 +{
 +      if (IS_ENABLED(CONFIG_PROC_FS))
 +              atomic_long_add(count, &direct_pages_count[level]);
 +}
 +
 +struct seq_file;
 +void arch_report_meminfo(struct seq_file *m);
  
  /*
   * The S390 doesn't have any external MMU info: the kernel page
@@@ -242,8 -221,8 +242,8 @@@ static inline int is_module_addr(void *
   * swap                               .11..ttttt.0
   * prot-none, clean, old      .11.xx0000.1
   * prot-none, clean, young    .11.xx0001.1
 - * prot-none, dirty, old      .10.xx0010.1
 - * prot-none, dirty, young    .10.xx0011.1
 + * prot-none, dirty, old      .11.xx0010.1
 + * prot-none, dirty, young    .11.xx0011.1
   * read-only, clean, old      .11.xx0100.1
   * read-only, clean, young    .01.xx0101.1
   * read-only, dirty, old      .11.xx0110.1
  /* Bits in the region table entry */
  #define _REGION_ENTRY_ORIGIN  ~0xfffUL/* region/segment table origin      */
  #define _REGION_ENTRY_PROTECT 0x200   /* region protection bit            */
+ #define _REGION_ENTRY_OFFSET  0xc0    /* region table offset              */
  #define _REGION_ENTRY_INVALID 0x20    /* invalid region table entry       */
  #define _REGION_ENTRY_TYPE_MASK       0x0c    /* region/segment table type mask   */
  #define _REGION_ENTRY_TYPE_R1 0x0c    /* region first table type          */
  #define _REGION3_ENTRY                (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH)
  #define _REGION3_ENTRY_EMPTY  (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID)
  
 -#define _REGION3_ENTRY_LARGE  0x400   /* RTTE-format control, large page  */
 -#define _REGION3_ENTRY_RO     0x200   /* page protection bit              */
 +#define _REGION3_ENTRY_ORIGIN_LARGE ~0x7fffffffUL /* large page address            */
 +#define _REGION3_ENTRY_ORIGIN  ~0x7ffUL/* region third table origin        */
 +
 +#define _REGION3_ENTRY_DIRTY  0x2000  /* SW region dirty bit */
 +#define _REGION3_ENTRY_YOUNG  0x1000  /* SW region young bit */
 +#define _REGION3_ENTRY_LARGE  0x0400  /* RTTE-format control, large page  */
 +#define _REGION3_ENTRY_READ   0x0002  /* SW region read bit */
 +#define _REGION3_ENTRY_WRITE  0x0001  /* SW region write bit */
 +
 +#ifdef CONFIG_MEM_SOFT_DIRTY
 +#define _REGION3_ENTRY_SOFT_DIRTY 0x4000 /* SW region soft dirty bit */
 +#else
 +#define _REGION3_ENTRY_SOFT_DIRTY 0x0000 /* SW region soft dirty bit */
 +#endif
 +
 +#define _REGION_ENTRY_BITS     0xfffffffffffff227UL
 +#define _REGION_ENTRY_BITS_LARGE 0xffffffff8000fe27UL
  
  /* Bits in the segment table entry */
  #define _SEGMENT_ENTRY_BITS   0xfffffffffffffe33UL
  #define _SEGMENT_ENTRY_DIRTY  0x2000  /* SW segment dirty bit */
  #define _SEGMENT_ENTRY_YOUNG  0x1000  /* SW segment young bit */
  #define _SEGMENT_ENTRY_LARGE  0x0400  /* STE-format control, large page */
 -#define _SEGMENT_ENTRY_READ   0x0002  /* SW segment read bit */
 -#define _SEGMENT_ENTRY_WRITE  0x0001  /* SW segment write bit */
 +#define _SEGMENT_ENTRY_WRITE  0x0002  /* SW segment write bit */
 +#define _SEGMENT_ENTRY_READ   0x0001  /* SW segment read bit */
  
  #ifdef CONFIG_MEM_SOFT_DIRTY
  #define _SEGMENT_ENTRY_SOFT_DIRTY 0x4000 /* SW segment soft dirty bit */
  #endif
  
  /*
 - * Segment table entry encoding (R = read-only, I = invalid, y = young bit):
 - *                            dy..R...I...rw
 + * Segment table and region3 table entry encoding
 + * (R = read-only, I = invalid, y = young bit):
 + *                            dy..R...I...wr
   * prot-none, clean, old      00..1...1...00
   * prot-none, clean, young    01..1...1...00
   * prot-none, dirty, old      10..1...1...00
   * prot-none, dirty, young    11..1...1...00
 - * read-only, clean, old      00..1...1...10
 - * read-only, clean, young    01..1...0...10
 - * read-only, dirty, old      10..1...1...10
 - * read-only, dirty, young    11..1...0...10
 + * read-only, clean, old      00..1...1...01
 + * read-only, clean, young    01..1...0...01
 + * read-only, dirty, old      10..1...1...01
 + * read-only, dirty, young    11..1...0...01
   * read-write, clean, old     00..1...1...11
   * read-write, clean, young   01..1...0...11
   * read-write, dirty, old     10..0...1...11
  #define PGSTE_GC_BIT  0x0002000000000000UL
  #define PGSTE_UC_BIT  0x0000800000000000UL    /* user dirty (migration) */
  #define PGSTE_IN_BIT  0x0000400000000000UL    /* IPTE notify bit */
+ #define PGSTE_VSIE_BIT        0x0000200000000000UL    /* ref'd in a shadow table */
  
  /* Guest Page State used for virtualization */
  #define _PGSTE_GPS_ZERO               0x0000000080000000UL
  /*
   * Page protection definitions.
   */
 -#define PAGE_NONE     __pgprot(_PAGE_PRESENT | _PAGE_INVALID)
 +#define PAGE_NONE     __pgprot(_PAGE_PRESENT | _PAGE_INVALID | _PAGE_PROTECT)
  #define PAGE_READ     __pgprot(_PAGE_PRESENT | _PAGE_READ | \
                                 _PAGE_INVALID | _PAGE_PROTECT)
  #define PAGE_WRITE    __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \
                                 _SEGMENT_ENTRY_READ)
  #define SEGMENT_WRITE __pgprot(_SEGMENT_ENTRY_READ | \
                                 _SEGMENT_ENTRY_WRITE)
 +#define SEGMENT_KERNEL        __pgprot(_SEGMENT_ENTRY |       \
 +                               _SEGMENT_ENTRY_LARGE | \
 +                               _SEGMENT_ENTRY_READ |  \
 +                               _SEGMENT_ENTRY_WRITE | \
 +                               _SEGMENT_ENTRY_YOUNG | \
 +                               _SEGMENT_ENTRY_DIRTY)
 +#define SEGMENT_KERNEL_RO __pgprot(_SEGMENT_ENTRY |   \
 +                               _SEGMENT_ENTRY_LARGE | \
 +                               _SEGMENT_ENTRY_READ |  \
 +                               _SEGMENT_ENTRY_YOUNG | \
 +                               _SEGMENT_ENTRY_PROTECT)
 +
 +/*
 + * Region3 entry (large page) protection definitions.
 + */
 +
 +#define REGION3_KERNEL        __pgprot(_REGION_ENTRY_TYPE_R3 | \
 +                               _REGION3_ENTRY_LARGE |  \
 +                               _REGION3_ENTRY_READ |   \
 +                               _REGION3_ENTRY_WRITE |  \
 +                               _REGION3_ENTRY_YOUNG |  \
 +                               _REGION3_ENTRY_DIRTY)
 +#define REGION3_KERNEL_RO __pgprot(_REGION_ENTRY_TYPE_R3 | \
 +                                 _REGION3_ENTRY_LARGE |  \
 +                                 _REGION3_ENTRY_READ |   \
 +                                 _REGION3_ENTRY_YOUNG |  \
 +                                 _REGION_ENTRY_PROTECT)
  
  static inline int mm_has_pgste(struct mm_struct *mm)
  {
@@@ -488,53 -426,6 +490,53 @@@ static inline int mm_use_skey(struct mm
        return 0;
  }
  
 +static inline void csp(unsigned int *ptr, unsigned int old, unsigned int new)
 +{
 +      register unsigned long reg2 asm("2") = old;
 +      register unsigned long reg3 asm("3") = new;
 +      unsigned long address = (unsigned long)ptr | 1;
 +
 +      asm volatile(
 +              "       csp     %0,%3"
 +              : "+d" (reg2), "+m" (*ptr)
 +              : "d" (reg3), "d" (address)
 +              : "cc");
 +}
 +
 +static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new)
 +{
 +      register unsigned long reg2 asm("2") = old;
 +      register unsigned long reg3 asm("3") = new;
 +      unsigned long address = (unsigned long)ptr | 1;
 +
 +      asm volatile(
 +              "       .insn   rre,0xb98a0000,%0,%3"
 +              : "+d" (reg2), "+m" (*ptr)
 +              : "d" (reg3), "d" (address)
 +              : "cc");
 +}
 +
 +#define CRDTE_DTT_PAGE                0x00UL
 +#define CRDTE_DTT_SEGMENT     0x10UL
 +#define CRDTE_DTT_REGION3     0x14UL
 +#define CRDTE_DTT_REGION2     0x18UL
 +#define CRDTE_DTT_REGION1     0x1cUL
 +
 +static inline void crdte(unsigned long old, unsigned long new,
 +                       unsigned long table, unsigned long dtt,
 +                       unsigned long address, unsigned long asce)
 +{
 +      register unsigned long reg2 asm("2") = old;
 +      register unsigned long reg3 asm("3") = new;
 +      register unsigned long reg4 asm("4") = table | dtt;
 +      register unsigned long reg5 asm("5") = address;
 +
 +      asm volatile(".insn rrf,0xb98f0000,%0,%2,%4,0"
 +                   : "+d" (reg2)
 +                   : "d" (reg3), "d" (reg4), "d" (reg5), "a" (asce)
 +                   : "memory", "cc");
 +}
 +
  /*
   * pgd/pmd/pte query functions
   */
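
As a usage note, the csp() helper introduced above is exactly what the reworked __pmdp_csp() later in this patch reduces to; a minimal sketch of invalidating a segment-table entry with it (the second 32-bit word of the 8-byte entry carries the invalid bit, hence the +1):

/* Sketch: compare-and-swap-and-purge the entry into its invalid form,
 * the same pattern __pmdp_csp() uses further below.
 */
static inline void example_csp_invalidate(pmd_t *pmdp)
{
	csp((unsigned int *)pmdp + 1, pmd_val(*pmdp),
	    pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
}
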
@@@ -576,7 -467,7 +578,7 @@@ static inline int pud_none(pud_t pud
  {
        if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3)
                return 0;
 -      return (pud_val(pud) & _REGION_ENTRY_INVALID) != 0UL;
 +      return pud_val(pud) == _REGION3_ENTRY_EMPTY;
  }
  
  static inline int pud_large(pud_t pud)
        return !!(pud_val(pud) & _REGION3_ENTRY_LARGE);
  }
  
 +static inline unsigned long pud_pfn(pud_t pud)
 +{
 +      unsigned long origin_mask;
 +
 +      origin_mask = _REGION3_ENTRY_ORIGIN;
 +      if (pud_large(pud))
 +              origin_mask = _REGION3_ENTRY_ORIGIN_LARGE;
 +      return (pud_val(pud) & origin_mask) >> PAGE_SHIFT;
 +}
 +
 +static inline int pmd_large(pmd_t pmd)
 +{
 +      return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0;
 +}
 +
 +static inline int pmd_bad(pmd_t pmd)
 +{
 +      if (pmd_large(pmd))
 +              return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS_LARGE) != 0;
 +      return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
 +}
 +
  static inline int pud_bad(pud_t pud)
  {
 -      /*
 -       * With dynamic page table levels the pud can be a region table
 -       * entry or a segment table entry. Check for the bit that are
 -       * invalid for either table entry.
 -       */
 -      unsigned long mask =
 -              ~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INVALID &
 -              ~_REGION_ENTRY_TYPE_MASK & ~_REGION_ENTRY_LENGTH;
 -      return (pud_val(pud) & mask) != 0;
 +      if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3)
 +              return pmd_bad(__pmd(pud_val(pud)));
 +      if (pud_large(pud))
 +              return (pud_val(pud) & ~_REGION_ENTRY_BITS_LARGE) != 0;
 +      return (pud_val(pud) & ~_REGION_ENTRY_BITS) != 0;
  }
  
  static inline int pmd_present(pmd_t pmd)
@@@ -627,6 -500,11 +629,6 @@@ static inline int pmd_none(pmd_t pmd
        return pmd_val(pmd) == _SEGMENT_ENTRY_INVALID;
  }
  
 -static inline int pmd_large(pmd_t pmd)
 -{
 -      return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0;
 -}
 -
  static inline unsigned long pmd_pfn(pmd_t pmd)
  {
        unsigned long origin_mask;
        return (pmd_val(pmd) & origin_mask) >> PAGE_SHIFT;
  }
  
 -static inline int pmd_bad(pmd_t pmd)
 -{
 -      if (pmd_large(pmd))
 -              return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS_LARGE) != 0;
 -      return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
 -}
 -
  #define __HAVE_ARCH_PMD_WRITE
  static inline int pmd_write(pmd_t pmd)
  {
@@@ -1002,15 -887,26 +1004,26 @@@ static inline int ptep_set_access_flags
  void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t entry);
  void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
- void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+ void ptep_notify(struct mm_struct *mm, unsigned long addr,
+                pte_t *ptep, unsigned long bits);
+ int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr,
+                   pte_t *ptep, int prot, unsigned long bit);
  void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep , int reset);
  void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+                   pte_t *sptep, pte_t *tptep, pte_t pte);
+ void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
  
  bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
  int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
                          unsigned char key, bool nq);
- unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr);
+ int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+                              unsigned char key, unsigned char *oldkey,
+                              bool nq, bool mr, bool mc);
+ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr);
+ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+                         unsigned char *key);
  
  /*
   * Certain architectures need to do special things when PTEs
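
The storage-key helpers change calling convention here: get_guest_storage_key() now returns an error code and hands the key back through a pointer (it previously returned the key itself), and cond_set_guest_storage_key() exposes the SSKE-style nq/mr/mc controls. A minimal sketch of the new pattern, with hypothetical naming and arbitrary flag values:

#include <linux/mm_types.h>
#include <asm/pgtable.h>

static int example_update_guest_key(struct mm_struct *mm, unsigned long addr,
				    unsigned char new_key)
{
	unsigned char old_key;
	int rc;

	rc = get_guest_storage_key(mm, addr, &old_key);
	if (rc)
		return rc;
	/* nq/mr/mc mirror SSKE's controls; all false here is purely illustrative */
	return cond_set_guest_storage_key(mm, addr, new_key, &old_key,
					  false, false, false);
}
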
@@@ -1080,7 -976,6 +1093,7 @@@ static inline pmd_t *pmd_offset(pud_t *
  #define pte_page(x) pfn_to_page(pte_pfn(x))
  
  #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd))
 +#define pud_page(pud) pfn_to_page(pud_pfn(pud))
  
  /* Find an entry in the lowest level page table.. */
  #define pte_offset(pmd, addr) ((pte_t *) pmd_deref(*(pmd)) + pte_index(addr))
  #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address)
  #define pte_unmap(pte) do { } while (0)
  
 -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
 -static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot)
 -{
 -      /*
 -       * pgprot is PAGE_NONE, PAGE_READ, or PAGE_WRITE (see __Pxxx / __Sxxx)
 -       * Convert to segment table entry format.
 -       */
 -      if (pgprot_val(pgprot) == pgprot_val(PAGE_NONE))
 -              return pgprot_val(SEGMENT_NONE);
 -      if (pgprot_val(pgprot) == pgprot_val(PAGE_READ))
 -              return pgprot_val(SEGMENT_READ);
 -      return pgprot_val(SEGMENT_WRITE);
 -}
 -
  static inline pmd_t pmd_wrprotect(pmd_t pmd)
  {
        pmd_val(pmd) &= ~_SEGMENT_ENTRY_WRITE;
@@@ -1124,56 -1033,6 +1137,56 @@@ static inline pmd_t pmd_mkdirty(pmd_t p
        return pmd;
  }
  
 +static inline pud_t pud_wrprotect(pud_t pud)
 +{
 +      pud_val(pud) &= ~_REGION3_ENTRY_WRITE;
 +      pud_val(pud) |= _REGION_ENTRY_PROTECT;
 +      return pud;
 +}
 +
 +static inline pud_t pud_mkwrite(pud_t pud)
 +{
 +      pud_val(pud) |= _REGION3_ENTRY_WRITE;
 +      if (pud_large(pud) && !(pud_val(pud) & _REGION3_ENTRY_DIRTY))
 +              return pud;
 +      pud_val(pud) &= ~_REGION_ENTRY_PROTECT;
 +      return pud;
 +}
 +
 +static inline pud_t pud_mkclean(pud_t pud)
 +{
 +      if (pud_large(pud)) {
 +              pud_val(pud) &= ~_REGION3_ENTRY_DIRTY;
 +              pud_val(pud) |= _REGION_ENTRY_PROTECT;
 +      }
 +      return pud;
 +}
 +
 +static inline pud_t pud_mkdirty(pud_t pud)
 +{
 +      if (pud_large(pud)) {
 +              pud_val(pud) |= _REGION3_ENTRY_DIRTY |
 +                              _REGION3_ENTRY_SOFT_DIRTY;
 +              if (pud_val(pud) & _REGION3_ENTRY_WRITE)
 +                      pud_val(pud) &= ~_REGION_ENTRY_PROTECT;
 +      }
 +      return pud;
 +}
 +
 +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
 +static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot)
 +{
 +      /*
 +       * pgprot is PAGE_NONE, PAGE_READ, or PAGE_WRITE (see __Pxxx / __Sxxx)
 +       * Convert to segment table entry format.
 +       */
 +      if (pgprot_val(pgprot) == pgprot_val(PAGE_NONE))
 +              return pgprot_val(SEGMENT_NONE);
 +      if (pgprot_val(pgprot) == pgprot_val(PAGE_READ))
 +              return pgprot_val(SEGMENT_READ);
 +      return pgprot_val(SEGMENT_WRITE);
 +}
 +
  static inline pmd_t pmd_mkyoung(pmd_t pmd)
  {
        if (pmd_large(pmd)) {
@@@ -1222,8 -1081,15 +1235,8 @@@ static inline pmd_t mk_pmd_phys(unsigne
  
  static inline void __pmdp_csp(pmd_t *pmdp)
  {
 -      register unsigned long reg2 asm("2") = pmd_val(*pmdp);
 -      register unsigned long reg3 asm("3") = pmd_val(*pmdp) |
 -                                             _SEGMENT_ENTRY_INVALID;
 -      register unsigned long reg4 asm("4") = ((unsigned long) pmdp) + 5;
 -
 -      asm volatile(
 -              "       csp %1,%3"
 -              : "=m" (*pmdp)
 -              : "d" (reg2), "d" (reg3), "d" (reg4), "m" (*pmdp) : "cc");
 +      csp((unsigned int *)pmdp + 1, pmd_val(*pmdp),
 +          pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
  }
  
  static inline void __pmdp_idte(unsigned long address, pmd_t *pmdp)
                : "cc" );
  }
  
 +static inline void __pudp_idte(unsigned long address, pud_t *pudp)
 +{
 +      unsigned long r3o;
 +
 +      r3o = (unsigned long) pudp - pud_index(address) * sizeof(pud_t);
 +      r3o |= _ASCE_TYPE_REGION3;
 +      asm volatile(
 +              "       .insn   rrf,0xb98e0000,%2,%3,0,0"
 +              : "=m" (*pudp)
 +              : "m" (*pudp), "a" (r3o), "a" ((address & PUD_MASK))
 +              : "cc");
 +}
 +
  static inline void __pmdp_idte_local(unsigned long address, pmd_t *pmdp)
  {
        unsigned long sto;
                : "cc" );
  }
  
 +static inline void __pudp_idte_local(unsigned long address, pud_t *pudp)
 +{
 +      unsigned long r3o;
 +
 +      r3o = (unsigned long) pudp - pud_index(address) * sizeof(pud_t);
 +      r3o |= _ASCE_TYPE_REGION3;
 +      asm volatile(
 +              "       .insn   rrf,0xb98e0000,%2,%3,0,1"
 +              : "=m" (*pudp)
 +              : "m" (*pudp), "a" (r3o), "a" ((address & PUD_MASK))
 +              : "cc");
 +}
 +
  pmd_t pmdp_xchg_direct(struct mm_struct *, unsigned long, pmd_t *, pmd_t);
  pmd_t pmdp_xchg_lazy(struct mm_struct *, unsigned long, pmd_t *, pmd_t);
 +pud_t pudp_xchg_direct(struct mm_struct *, unsigned long, pud_t *, pud_t);
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  
@@@ -77,10 -77,7 +77,10 @@@ static inline void get_cpu_id(struct cp
        asm volatile("stidp %0" : "=Q" (*ptr));
  }
  
 -extern void s390_adjust_jiffies(void);
 +void s390_adjust_jiffies(void);
 +void s390_update_cpu_mhz(void);
 +void cpu_detect_mhz_feature(void);
 +
  extern const struct seq_operations cpuinfo_op;
  extern int sysctl_ieee_emulation_warnings;
  extern void execve_tail(void);
@@@ -112,6 -109,8 +112,8 @@@ struct thread_struct 
          unsigned long ksp;              /* kernel stack pointer             */
        mm_segment_t mm_segment;
        unsigned long gmap_addr;        /* address of last gmap fault. */
+       unsigned int gmap_write_flag;   /* gmap fault write indication */
+       unsigned int gmap_int_code;     /* int code of last gmap fault */
        unsigned int gmap_pfault;       /* signal of a pending guest pfault */
        struct per_regs per_user;       /* User specified PER registers */
        struct per_event per_event;     /* Cause of the last PER trap */
@@@ -236,18 -235,6 +238,18 @@@ void cpu_relax(void)
  
  #define cpu_relax_lowlatency()  barrier()
  
 +#define ECAG_CACHE_ATTRIBUTE  0
 +#define ECAG_CPU_ATTRIBUTE    1
 +
 +static inline unsigned long __ecag(unsigned int asi, unsigned char parm)
 +{
 +      unsigned long val;
 +
 +      asm volatile(".insn     rsy,0xeb000000004c,%0,0,0(%1)" /* ecag */
 +                   : "=d" (val) : "a" (asi << 8 | parm));
 +      return val;
 +}
 +
  static inline void psw_set_key(unsigned int key)
  {
        asm volatile("spka 0(%0)" : : "d" (key));
diff --combined arch/s390/kernel/diag.c
@@@ -162,6 -162,28 +162,30 @@@ int diag14(unsigned long rx, unsigned l
  }
  EXPORT_SYMBOL(diag14);
  
 -static inline int __diag204(unsigned long subcode, unsigned long size, void *addr)
++static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr)
+ {
 -      register unsigned long _subcode asm("0") = subcode;
++      register unsigned long _subcode asm("0") = *subcode;
+       register unsigned long _size asm("1") = size;
+       asm volatile(
+               "       diag    %2,%0,0x204\n"
 -              "0:\n"
++              "0:     nopr    %%r7\n"
+               EX_TABLE(0b,0b)
+               : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
 -      if (_subcode)
 -              return -1;
++      *subcode = _subcode;
+       return _size;
+ }
+ int diag204(unsigned long subcode, unsigned long size, void *addr)
+ {
+       diag_stat_inc(DIAG_STAT_X204);
 -      return __diag204(subcode, size, addr);
++      size = __diag204(&subcode, size, addr);
++      if (subcode)
++              return -1;
++      return size;
+ }
+ EXPORT_SYMBOL(diag204);
  /*
   * Diagnose 210: Get information about a virtual device
   */
@@@ -196,3 -218,18 +220,18 @@@ int diag210(struct diag210 *addr
        return ccode;
  }
  EXPORT_SYMBOL(diag210);
+ int diag224(void *ptr)
+ {
+       int rc = -EOPNOTSUPP;
+       diag_stat_inc(DIAG_STAT_X224);
+       asm volatile(
+               "       diag    %1,%2,0x224\n"
+               "0:     lhi     %0,0x0\n"
+               "1:\n"
+               EX_TABLE(0b,1b)
+               : "+d" (rc) :"d" (0), "d" (ptr) : "memory");
+       return rc;
+ }
+ EXPORT_SYMBOL(diag224);
@@@ -341,8 -341,6 +341,8 @@@ static int handle_mvpg_pei(struct kvm_v
  
  static int handle_partial_execution(struct kvm_vcpu *vcpu)
  {
 +      vcpu->stat.exit_pei++;
 +
        if (vcpu->arch.sie_block->ipa == 0xb254)        /* MVPG */
                return handle_mvpg_pei(vcpu);
        if (vcpu->arch.sie_block->ipa >> 8 == 0xae)     /* SIGP */
        return -EOPNOTSUPP;
  }
  
+ static int handle_operexc(struct kvm_vcpu *vcpu)
+ {
+       vcpu->stat.exit_operation_exception++;
+       trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa,
+                                     vcpu->arch.sie_block->ipb);
+       if (vcpu->arch.sie_block->ipa == 0xb256 &&
+           test_kvm_facility(vcpu->kvm, 74))
+               return handle_sthyi(vcpu);
+       if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0)
+               return -EOPNOTSUPP;
+       return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+ }
  int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
  {
+       int rc, per_rc = 0;
        if (kvm_is_ucontrol(vcpu->kvm))
                return -EOPNOTSUPP;
  
        case 0x18:
                return handle_noop(vcpu);
        case 0x04:
-               return handle_instruction(vcpu);
+               rc = handle_instruction(vcpu);
+               break;
        case 0x08:
                return handle_prog(vcpu);
        case 0x14:
                return handle_validity(vcpu);
        case 0x28:
                return handle_stop(vcpu);
+       case 0x2c:
+               rc = handle_operexc(vcpu);
+               break;
        case 0x38:
-               return handle_partial_execution(vcpu);
+               rc = handle_partial_execution(vcpu);
+               break;
        default:
                return -EOPNOTSUPP;
        }
+       /* process PER, also if the instruction is processed in user space */
+       if (vcpu->arch.sie_block->icptstatus & 0x02 &&
+           (!rc || rc == -EOPNOTSUPP))
+               per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu);
+       return per_rc ? per_rc : rc;
  }
diff --combined arch/s390/kvm/kvm-s390.c
  #include <linux/init.h>
  #include <linux/kvm.h>
  #include <linux/kvm_host.h>
+ #include <linux/mman.h>
  #include <linux/module.h>
  #include <linux/random.h>
  #include <linux/slab.h>
  #include <linux/timer.h>
  #include <linux/vmalloc.h>
+ #include <linux/bitmap.h>
  #include <asm/asm-offsets.h>
  #include <asm/lowcore.h>
 -#include <asm/etr.h>
 +#include <asm/stp.h>
  #include <asm/pgtable.h>
  #include <asm/gmap.h>
  #include <asm/nmi.h>
  #include <asm/switch_to.h>
  #include <asm/isc.h>
  #include <asm/sclp.h>
 -#include <asm/etr.h>
+ #include <asm/cpacf.h>
++#include <asm/timex.h>
  #include "kvm-s390.h"
  #include "gaccess.h"
  
@@@ -61,9 -65,9 +65,10 @@@ struct kvm_stats_debugfs_item debugfs_e
        { "exit_external_request", VCPU_STAT(exit_external_request) },
        { "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) },
        { "exit_instruction", VCPU_STAT(exit_instruction) },
 +      { "exit_pei", VCPU_STAT(exit_pei) },
        { "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
        { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
+       { "exit_operation_exception", VCPU_STAT(exit_operation_exception) },
        { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
        { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
        { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
@@@ -94,6 -98,8 +99,8 @@@
        { "instruction_stsi", VCPU_STAT(instruction_stsi) },
        { "instruction_stfl", VCPU_STAT(instruction_stfl) },
        { "instruction_tprot", VCPU_STAT(instruction_tprot) },
+       { "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
+       { "instruction_sie", VCPU_STAT(instruction_sie) },
        { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
        { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
        { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
        { NULL }
  };
  
+ /* allow nested virtualization in KVM (if enabled by user space) */
+ static int nested;
+ module_param(nested, int, S_IRUGO);
+ MODULE_PARM_DESC(nested, "Nested virtualization support");
  /* upper facilities limit for kvm */
  unsigned long kvm_s390_fac_list_mask[16] = {
        0xffe6000000000000UL,
@@@ -131,7 -142,13 +143,13 @@@ unsigned long kvm_s390_fac_list_mask_si
        return ARRAY_SIZE(kvm_s390_fac_list_mask);
  }
  
+ /* available cpu features supported by kvm */
+ static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+ /* available subfunctions indicated via query / "test bit" */
+ static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
  static struct gmap_notifier gmap_notifier;
+ static struct gmap_notifier vsie_gmap_notifier;
  debug_info_t *kvm_s390_dbf;
  
  /* Section: not file related */
@@@ -141,7 -158,8 +159,8 @@@ int kvm_arch_hardware_enable(void
        return 0;
  }
  
- static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address);
+ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+                             unsigned long end);
  
  /*
   * This callback is executed during stop_machine(). All CPUs are therefore
@@@ -163,6 -181,8 +182,8 @@@ static int kvm_clock_sync(struct notifi
                        vcpu->arch.sie_block->epoch -= *delta;
                        if (vcpu->arch.cputm_enabled)
                                vcpu->arch.cputm_start += *delta;
+                       if (vcpu->arch.vsie_block)
+                               vcpu->arch.vsie_block->epoch -= *delta;
                }
        }
        return NOTIFY_OK;
@@@ -175,7 -195,9 +196,9 @@@ static struct notifier_block kvm_clock_
  int kvm_arch_hardware_setup(void)
  {
        gmap_notifier.notifier_call = kvm_gmap_notifier;
-       gmap_register_ipte_notifier(&gmap_notifier);
+       gmap_register_pte_notifier(&gmap_notifier);
+       vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
+       gmap_register_pte_notifier(&vsie_gmap_notifier);
        atomic_notifier_chain_register(&s390_epoch_delta_notifier,
                                       &kvm_clock_notifier);
        return 0;
  
  void kvm_arch_hardware_unsetup(void)
  {
-       gmap_unregister_ipte_notifier(&gmap_notifier);
+       gmap_unregister_pte_notifier(&gmap_notifier);
+       gmap_unregister_pte_notifier(&vsie_gmap_notifier);
        atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
                                         &kvm_clock_notifier);
  }
  
 -              etr_ptff(kvm_s390_available_subfunc.ptff, ETR_PTFF_QAF);
+ static void allow_cpu_feat(unsigned long nr)
+ {
+       set_bit_inv(nr, kvm_s390_available_cpu_feat);
+ }
+ static inline int plo_test_bit(unsigned char nr)
+ {
+       register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
+       int cc = 3; /* subfunction not available */
+       asm volatile(
+               /* Parameter registers are ignored for "test bit" */
+               "       plo     0,0,0,0(0)\n"
+               "       ipm     %0\n"
+               "       srl     %0,28\n"
+               : "=d" (cc)
+               : "d" (r0)
+               : "cc");
+       return cc == 0;
+ }
+ static void kvm_s390_cpu_feat_init(void)
+ {
+       int i;
+       for (i = 0; i < 256; ++i) {
+               if (plo_test_bit(i))
+                       kvm_s390_available_subfunc.plo[i >> 3] |= 0x80 >> (i & 7);
+       }
+       if (test_facility(28)) /* TOD-clock steering */
++              ptff(kvm_s390_available_subfunc.ptff,
++                   sizeof(kvm_s390_available_subfunc.ptff),
++                   PTFF_QAF);
+       if (test_facility(17)) { /* MSA */
+               __cpacf_query(CPACF_KMAC, kvm_s390_available_subfunc.kmac);
+               __cpacf_query(CPACF_KMC, kvm_s390_available_subfunc.kmc);
+               __cpacf_query(CPACF_KM, kvm_s390_available_subfunc.km);
+               __cpacf_query(CPACF_KIMD, kvm_s390_available_subfunc.kimd);
+               __cpacf_query(CPACF_KLMD, kvm_s390_available_subfunc.klmd);
+       }
+       if (test_facility(76)) /* MSA3 */
+               __cpacf_query(CPACF_PCKMO, kvm_s390_available_subfunc.pckmo);
+       if (test_facility(77)) { /* MSA4 */
+               __cpacf_query(CPACF_KMCTR, kvm_s390_available_subfunc.kmctr);
+               __cpacf_query(CPACF_KMF, kvm_s390_available_subfunc.kmf);
+               __cpacf_query(CPACF_KMO, kvm_s390_available_subfunc.kmo);
+               __cpacf_query(CPACF_PCC, kvm_s390_available_subfunc.pcc);
+       }
+       if (test_facility(57)) /* MSA5 */
+               __cpacf_query(CPACF_PPNO, kvm_s390_available_subfunc.ppno);
+       if (MACHINE_HAS_ESOP)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
+       /*
+        * We need SIE support, ESOP (PROT_READ protection for gmap_shadow),
+        * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing).
+        */
+       if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao ||
+           !test_facility(3) || !nested)
+               return;
+       allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2);
+       if (sclp.has_64bscao)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO);
+       if (sclp.has_siif)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF);
+       if (sclp.has_gpere)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE);
+       if (sclp.has_gsls)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS);
+       if (sclp.has_ib)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB);
+       if (sclp.has_cei)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI);
+       if (sclp.has_ibs)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS);
+       /*
+        * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make
+        * all skey handling functions read/set the skey from the PGSTE
+        * instead of the real storage key.
+        *
+        * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will make
+        * pages being detected as preserved although they are resident.
+        *
+        * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will
+        * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY.
+        *
+        * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and
+        * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be
+        * correctly shadowed. We can do that for the PGSTE but not for PTE.I.
+        *
+        * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We
+        * cannot easily shadow the SCA because of the ipte lock.
+        */
+ }
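kvm_s390_cpu_feat_init() stores the PLO "test bit" results MSB-first, the same layout the CPACF query blocks use: subfunction i ends up in byte i >> 3 under mask 0x80 >> (i & 7). A small hedged helper that reads a bit back out of such a query bitmap; the function name is illustrative and not part of this merge:

/* Hedged sketch: test bit nr in an MSB-first query bitmap such as
 * kvm_s390_available_subfunc.plo. */
static inline int subfunc_bit_set(const unsigned char *query, int nr)
{
	return (query[nr >> 3] & (0x80 >> (nr & 7))) != 0;
}

/* e.g. subfunc_bit_set(kvm_s390_available_subfunc.plo, 3) */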
  int kvm_arch_init(void *opaque)
  {
        kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long));
                return -ENOMEM;
        }
  
+       kvm_s390_cpu_feat_init();
        /* Register floating interrupt controller interface. */
        return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC);
  }
@@@ -244,6 -364,7 +367,7 @@@ int kvm_vm_ioctl_check_extension(struc
        case KVM_CAP_S390_USER_STSI:
        case KVM_CAP_S390_SKEYS:
        case KVM_CAP_S390_IRQ_STATE:
+       case KVM_CAP_S390_USER_INSTR0:
                r = 1;
                break;
        case KVM_CAP_S390_MEM_OP:
                break;
        case KVM_CAP_NR_VCPUS:
        case KVM_CAP_MAX_VCPUS:
-               r = sclp.has_esca ? KVM_S390_ESCA_CPU_SLOTS
-                                 : KVM_S390_BSCA_CPU_SLOTS;
+               r = KVM_S390_BSCA_CPU_SLOTS;
+               if (sclp.has_esca && sclp.has_64bscao)
+                       r = KVM_S390_ESCA_CPU_SLOTS;
                break;
        case KVM_CAP_NR_MEMSLOTS:
                r = KVM_USER_MEM_SLOTS;
@@@ -335,6 -457,16 +460,16 @@@ out
        return r;
  }
  
+ static void icpt_operexc_on_all_vcpus(struct kvm *kvm)
+ {
+       unsigned int i;
+       struct kvm_vcpu *vcpu;
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC, vcpu);
+       }
+ }
  static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
  {
        int r;
                break;
        case KVM_CAP_S390_VECTOR_REGISTERS:
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus)) {
+               if (kvm->created_vcpus) {
                        r = -EBUSY;
                } else if (MACHINE_HAS_VX) {
                        set_kvm_facility(kvm->arch.model.fac_mask, 129);
        case KVM_CAP_S390_RI:
                r = -EINVAL;
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus)) {
+               if (kvm->created_vcpus) {
                        r = -EBUSY;
                } else if (test_facility(64)) {
                        set_kvm_facility(kvm->arch.model.fac_mask, 64);
                kvm->arch.user_stsi = 1;
                r = 0;
                break;
+       case KVM_CAP_S390_USER_INSTR0:
+               VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_INSTR0");
+               kvm->arch.user_instr0 = 1;
+               icpt_operexc_on_all_vcpus(kvm);
+               r = 0;
+               break;
        default:
                r = -EINVAL;
                break;
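KVM_CAP_S390_USER_INSTR0 is a VM-wide capability, so a VMM enables it with KVM_ENABLE_CAP on the VM file descriptor; icpt_operexc_on_all_vcpus() then queues KVM_REQ_ICPT_OPEREXC so that already-created vCPUs set ICTL_OPEREXC as well. A hedged user-space sketch, assuming the usual /dev/kvm VM fd and omitting error handling:

/* Hedged sketch: enabling the new capability from a VMM. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_user_instr0(int vm_fd)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_S390_USER_INSTR0 };

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}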
@@@ -418,21 -556,23 +559,23 @@@ static int kvm_s390_set_mem_control(str
        unsigned int idx;
        switch (attr->attr) {
        case KVM_S390_VM_MEM_ENABLE_CMMA:
-               /* enable CMMA only for z10 and later (EDAT_1) */
-               ret = -EINVAL;
-               if (!MACHINE_IS_LPAR || !MACHINE_HAS_EDAT1)
+               ret = -ENXIO;
+               if (!sclp.has_cmma)
                        break;
  
                ret = -EBUSY;
                VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support");
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus) == 0) {
+               if (!kvm->created_vcpus) {
                        kvm->arch.use_cmma = 1;
                        ret = 0;
                }
                mutex_unlock(&kvm->lock);
                break;
        case KVM_S390_VM_MEM_CLR_CMMA:
+               ret = -ENXIO;
+               if (!sclp.has_cmma)
+                       break;
                ret = -EINVAL;
                if (!kvm->arch.use_cmma)
                        break;
                if (!new_limit)
                        return -EINVAL;
  
-               /* gmap_alloc takes last usable address */
+               /* gmap_create takes last usable address */
                if (new_limit != KVM_S390_NO_MEM_LIMIT)
                        new_limit -= 1;
  
                ret = -EBUSY;
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus) == 0) {
-                       /* gmap_alloc will round the limit up */
-                       struct gmap *new = gmap_alloc(current->mm, new_limit);
+               if (!kvm->created_vcpus) {
+                       /* gmap_create will round the limit up */
+                       struct gmap *new = gmap_create(current->mm, new_limit);
  
                        if (!new) {
                                ret = -ENOMEM;
                        } else {
-                               gmap_free(kvm->arch.gmap);
+                               gmap_remove(kvm->arch.gmap);
                                new->private = kvm;
                                kvm->arch.gmap = new;
                                ret = 0;
@@@ -644,7 -784,7 +787,7 @@@ static int kvm_s390_set_processor(struc
        int ret = 0;
  
        mutex_lock(&kvm->lock);
-       if (atomic_read(&kvm->online_vcpus)) {
+       if (kvm->created_vcpus) {
                ret = -EBUSY;
                goto out;
        }
                kvm->arch.model.cpuid = proc->cpuid;
                lowest_ibc = sclp.ibc >> 16 & 0xfff;
                unblocked_ibc = sclp.ibc & 0xfff;
 -              if (lowest_ibc) {
 +              if (lowest_ibc && proc->ibc) {
                        if (proc->ibc > unblocked_ibc)
                                kvm->arch.model.ibc = unblocked_ibc;
                        else if (proc->ibc < lowest_ibc)
@@@ -676,6 -816,39 +819,39 @@@ out
        return ret;
  }
  
+ static int kvm_s390_set_processor_feat(struct kvm *kvm,
+                                      struct kvm_device_attr *attr)
+ {
+       struct kvm_s390_vm_cpu_feat data;
+       int ret = -EBUSY;
+       if (copy_from_user(&data, (void __user *)attr->addr, sizeof(data)))
+               return -EFAULT;
+       if (!bitmap_subset((unsigned long *) data.feat,
+                          kvm_s390_available_cpu_feat,
+                          KVM_S390_VM_CPU_FEAT_NR_BITS))
+               return -EINVAL;
+       mutex_lock(&kvm->lock);
+       if (!atomic_read(&kvm->online_vcpus)) {
+               bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat,
+                           KVM_S390_VM_CPU_FEAT_NR_BITS);
+               ret = 0;
+       }
+       mutex_unlock(&kvm->lock);
+       return ret;
+ }
+ static int kvm_s390_set_processor_subfunc(struct kvm *kvm,
+                                         struct kvm_device_attr *attr)
+ {
+       /*
+        * Once supported by kernel + hw, we have to store the subfunctions
+        * in kvm->arch and remember that user space configured them.
+        */
+       return -ENXIO;
+ }
  static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
  {
        int ret = -ENXIO;
        case KVM_S390_VM_CPU_PROCESSOR:
                ret = kvm_s390_set_processor(kvm, attr);
                break;
+       case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+               ret = kvm_s390_set_processor_feat(kvm, attr);
+               break;
+       case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+               ret = kvm_s390_set_processor_subfunc(kvm, attr);
+               break;
        }
        return ret;
  }
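The new processor-feature attribute travels over the VM's KVM_SET_DEVICE_ATTR ioctl in the KVM_S390_VM_CPU_MODEL group and is refused with -EBUSY once vCPUs exist, so user space is expected to read KVM_S390_VM_CPU_MACHINE_FEAT first, mask the bitmap down, and write it back before creating vCPUs. A hedged sketch of the setter half; the helper name is illustrative:

/* Hedged sketch: configuring guest CPU features before vCPU creation. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_cpu_feat(int vm_fd, struct kvm_s390_vm_cpu_feat *feat)
{
	struct kvm_device_attr attr = {
		.group = KVM_S390_VM_CPU_MODEL,
		.attr  = KVM_S390_VM_CPU_PROCESSOR_FEAT,
		.addr  = (__u64)(unsigned long)feat,
	};

	return ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
}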
@@@ -732,6 -911,50 +914,50 @@@ out
        return ret;
  }
  
+ static int kvm_s390_get_processor_feat(struct kvm *kvm,
+                                      struct kvm_device_attr *attr)
+ {
+       struct kvm_s390_vm_cpu_feat data;
+       bitmap_copy((unsigned long *) data.feat, kvm->arch.cpu_feat,
+                   KVM_S390_VM_CPU_FEAT_NR_BITS);
+       if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
+               return -EFAULT;
+       return 0;
+ }
+ static int kvm_s390_get_machine_feat(struct kvm *kvm,
+                                    struct kvm_device_attr *attr)
+ {
+       struct kvm_s390_vm_cpu_feat data;
+       bitmap_copy((unsigned long *) data.feat,
+                   kvm_s390_available_cpu_feat,
+                   KVM_S390_VM_CPU_FEAT_NR_BITS);
+       if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
+               return -EFAULT;
+       return 0;
+ }
+ static int kvm_s390_get_processor_subfunc(struct kvm *kvm,
+                                         struct kvm_device_attr *attr)
+ {
+       /*
+        * Once we can actually configure subfunctions (kernel + hw support),
+        * we have to check if they were already set by user space, if so copy
+        * them from kvm->arch.
+        */
+       return -ENXIO;
+ }
+ static int kvm_s390_get_machine_subfunc(struct kvm *kvm,
+                                       struct kvm_device_attr *attr)
+ {
+       if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc,
+           sizeof(struct kvm_s390_vm_cpu_subfunc)))
+               return -EFAULT;
+       return 0;
+ }
  static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
  {
        int ret = -ENXIO;
        case KVM_S390_VM_CPU_MACHINE:
                ret = kvm_s390_get_machine(kvm, attr);
                break;
+       case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+               ret = kvm_s390_get_processor_feat(kvm, attr);
+               break;
+       case KVM_S390_VM_CPU_MACHINE_FEAT:
+               ret = kvm_s390_get_machine_feat(kvm, attr);
+               break;
+       case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+               ret = kvm_s390_get_processor_subfunc(kvm, attr);
+               break;
+       case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
+               ret = kvm_s390_get_machine_subfunc(kvm, attr);
+               break;
        }
        return ret;
  }
@@@ -803,6 -1038,8 +1041,8 @@@ static int kvm_s390_vm_has_attr(struct 
                switch (attr->attr) {
                case KVM_S390_VM_MEM_ENABLE_CMMA:
                case KVM_S390_VM_MEM_CLR_CMMA:
+                       ret = sclp.has_cmma ? 0 : -ENXIO;
+                       break;
                case KVM_S390_VM_MEM_LIMIT_SIZE:
                        ret = 0;
                        break;
                switch (attr->attr) {
                case KVM_S390_VM_CPU_PROCESSOR:
                case KVM_S390_VM_CPU_MACHINE:
+               case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+               case KVM_S390_VM_CPU_MACHINE_FEAT:
+               case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
                        ret = 0;
                        break;
+               /* configuring subfunctions is not supported yet */
+               case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
                default:
                        ret = -ENXIO;
                        break;
@@@ -858,7 -1100,6 +1103,6 @@@ static long kvm_s390_get_skeys(struct k
  {
        uint8_t *keys;
        uint64_t hva;
-       unsigned long curkey;
        int i, r = 0;
  
        if (args->flags != 0)
        if (!keys)
                return -ENOMEM;
  
+       down_read(&current->mm->mmap_sem);
        for (i = 0; i < args->count; i++) {
                hva = gfn_to_hva(kvm, args->start_gfn + i);
                if (kvm_is_error_hva(hva)) {
                        r = -EFAULT;
-                       goto out;
+                       break;
                }
  
-               curkey = get_guest_storage_key(current->mm, hva);
-               if (IS_ERR_VALUE(curkey)) {
-                       r = curkey;
-                       goto out;
-               }
-               keys[i] = curkey;
+               r = get_guest_storage_key(current->mm, hva, &keys[i]);
+               if (r)
+                       break;
+       }
+       up_read(&current->mm->mmap_sem);
+       if (!r) {
+               r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
+                                sizeof(uint8_t) * args->count);
+               if (r)
+                       r = -EFAULT;
        }
  
-       r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
-                        sizeof(uint8_t) * args->count);
-       if (r)
-               r = -EFAULT;
- out:
        kvfree(keys);
        return r;
  }
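After this rework the keys are gathered under mmap_sem and copied out in one chunk, so a single KVM_S390_GET_SKEYS call fails with -EFAULT as soon as one guest frame in the range has no valid mapping. A hedged user-space sketch of the ioctl; the caller is responsible for sizing the key buffer and the helper name is illustrative:

/* Hedged sketch: fetching storage keys for guest frames
 * [start_gfn, start_gfn + count). */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int get_skeys(int vm_fd, __u64 start_gfn, __u64 count, __u8 *keys)
{
	struct kvm_s390_skeys args = {
		.start_gfn     = start_gfn,
		.count         = count,		/* bounded by KVM_S390_SKEYS_MAX */
		.skeydata_addr = (__u64)(unsigned long)keys,
	};

	return ioctl(vm_fd, KVM_S390_GET_SKEYS, &args);
}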
@@@ -935,24 -1177,25 +1180,25 @@@ static long kvm_s390_set_skeys(struct k
        if (r)
                goto out;
  
+       down_read(&current->mm->mmap_sem);
        for (i = 0; i < args->count; i++) {
                hva = gfn_to_hva(kvm, args->start_gfn + i);
                if (kvm_is_error_hva(hva)) {
                        r = -EFAULT;
-                       goto out;
+                       break;
                }
  
                /* Lowest order bit is reserved */
                if (keys[i] & 0x01) {
                        r = -EINVAL;
-                       goto out;
+                       break;
                }
  
-               r = set_guest_storage_key(current->mm, hva,
-                                         (unsigned long)keys[i], 0);
+               r = set_guest_storage_key(current->mm, hva, keys[i], 0);
                if (r)
-                       goto out;
+                       break;
        }
+       up_read(&current->mm->mmap_sem);
  out:
        kvfree(keys);
        return r;
@@@ -1129,6 -1372,7 +1375,7 @@@ static void sca_dispose(struct kvm *kvm
  
  int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
  {
+       gfp_t alloc_flags = GFP_KERNEL;
        int i, rc;
        char debug_name[16];
        static unsigned long sca_offset;
  
        rc = -ENOMEM;
  
+       ratelimit_state_init(&kvm->arch.sthyi_limit, 5 * HZ, 500);
        kvm->arch.use_esca = 0; /* start with basic SCA */
+       if (!sclp.has_64bscao)
+               alloc_flags |= GFP_DMA;
        rwlock_init(&kvm->arch.sca_lock);
-       kvm->arch.sca = (struct bsca_block *) get_zeroed_page(GFP_KERNEL);
+       kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
        if (!kvm->arch.sca)
                goto out_err;
        spin_lock(&kvm_lock);
        memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask,
               S390_ARCH_FAC_LIST_SIZE_BYTE);
  
+       set_kvm_facility(kvm->arch.model.fac_mask, 74);
+       set_kvm_facility(kvm->arch.model.fac_list, 74);
        kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid();
        kvm->arch.model.ibc = sclp.ibc & 0x0fff;
  
                else
                        kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE,
                                                    sclp.hamax + 1);
-               kvm->arch.gmap = gmap_alloc(current->mm, kvm->arch.mem_limit - 1);
+               kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1);
                if (!kvm->arch.gmap)
                        goto out_err;
                kvm->arch.gmap->private = kvm;
        kvm->arch.epoch = 0;
  
        spin_lock_init(&kvm->arch.start_stop_lock);
+       kvm_s390_vsie_init(kvm);
        KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
  
        return 0;
@@@ -1245,7 -1497,7 +1500,7 @@@ void kvm_arch_vcpu_destroy(struct kvm_v
                sca_del_vcpu(vcpu);
  
        if (kvm_is_ucontrol(vcpu->kvm))
-               gmap_free(vcpu->arch.gmap);
+               gmap_remove(vcpu->arch.gmap);
  
        if (vcpu->kvm->arch.use_cmma)
                kvm_s390_vcpu_unsetup_cmma(vcpu);
@@@ -1278,16 -1530,17 +1533,17 @@@ void kvm_arch_destroy_vm(struct kvm *kv
        debug_unregister(kvm->arch.dbf);
        free_page((unsigned long)kvm->arch.sie_page2);
        if (!kvm_is_ucontrol(kvm))
-               gmap_free(kvm->arch.gmap);
+               gmap_remove(kvm->arch.gmap);
        kvm_s390_destroy_adapters(kvm);
        kvm_s390_clear_float_irqs(kvm);
+       kvm_s390_vsie_destroy(kvm);
        KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
  }
  
  /* Section: vcpu related */
  static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
  {
-       vcpu->arch.gmap = gmap_alloc(current->mm, -1UL);
+       vcpu->arch.gmap = gmap_create(current->mm, -1UL);
        if (!vcpu->arch.gmap)
                return -ENOMEM;
        vcpu->arch.gmap->private = vcpu->kvm;
@@@ -1396,7 -1649,7 +1652,7 @@@ static int sca_can_add_vcpu(struct kvm 
  
        if (id < KVM_S390_BSCA_CPU_SLOTS)
                return true;
-       if (!sclp.has_esca)
+       if (!sclp.has_esca || !sclp.has_64bscao)
                return false;
  
        mutex_lock(&kvm->lock);
@@@ -1537,7 -1790,7 +1793,7 @@@ void kvm_arch_vcpu_load(struct kvm_vcp
  
        save_access_regs(vcpu->arch.host_acrs);
        restore_access_regs(vcpu->run->s.regs.acrs);
-       gmap_enable(vcpu->arch.gmap);
+       gmap_enable(vcpu->arch.enabled_gmap);
        atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
        if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
                __start_cpu_timer_accounting(vcpu);
@@@ -1550,7 -1803,8 +1806,8 @@@ void kvm_arch_vcpu_put(struct kvm_vcpu 
        if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
                __stop_cpu_timer_accounting(vcpu);
        atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
-       gmap_disable(vcpu->arch.gmap);
+       vcpu->arch.enabled_gmap = gmap_get_enabled();
+       gmap_disable(vcpu->arch.enabled_gmap);
  
        /* Save guest register state */
        save_fpu_regs();
@@@ -1599,7 -1853,10 +1856,10 @@@ void kvm_arch_vcpu_postcreate(struct kv
                vcpu->arch.gmap = vcpu->kvm->arch.gmap;
                sca_add_vcpu(vcpu);
        }
+       if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0)
+               vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
+       /* make vcpu_load load the right gmap on the first trigger */
+       vcpu->arch.enabled_gmap = vcpu->arch.gmap;
  }
  
  static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
@@@ -1658,15 -1915,21 +1918,21 @@@ int kvm_arch_vcpu_setup(struct kvm_vcp
  
        kvm_s390_vcpu_setup_model(vcpu);
  
-       vcpu->arch.sie_block->ecb = 0x02;
+       /* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */
+       if (MACHINE_HAS_ESOP)
+               vcpu->arch.sie_block->ecb |= 0x02;
        if (test_kvm_facility(vcpu->kvm, 9))
                vcpu->arch.sie_block->ecb |= 0x04;
-       if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73))
+       if (test_kvm_facility(vcpu->kvm, 73))
                vcpu->arch.sie_block->ecb |= 0x10;
  
-       if (test_kvm_facility(vcpu->kvm, 8))
+       if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi)
                vcpu->arch.sie_block->ecb2 |= 0x08;
-       vcpu->arch.sie_block->eca   = 0xC1002000U;
+       vcpu->arch.sie_block->eca = 0x1002000U;
+       if (sclp.has_cei)
+               vcpu->arch.sie_block->eca |= 0x80000000U;
+       if (sclp.has_ib)
+               vcpu->arch.sie_block->eca |= 0x40000000U;
        if (sclp.has_siif)
                vcpu->arch.sie_block->eca |= 1;
        if (sclp.has_sigpif)
@@@ -1716,6 -1979,10 +1982,10 @@@ struct kvm_vcpu *kvm_arch_vcpu_create(s
        vcpu->arch.sie_block = &sie_page->sie_block;
        vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb;
  
+       /* the real guest size will always be smaller than msl */
+       vcpu->arch.sie_block->mso = 0;
+       vcpu->arch.sie_block->msl = sclp.hamax;
        vcpu->arch.sie_block->icpua = id;
        spin_lock_init(&vcpu->arch.local_int.lock);
        vcpu->arch.local_int.float_int = &kvm->arch.float_int;
@@@ -1784,16 -2051,25 +2054,25 @@@ void kvm_s390_sync_request(int req, str
        kvm_s390_vcpu_request(vcpu);
  }
  
- static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
+ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+                             unsigned long end)
  {
-       int i;
        struct kvm *kvm = gmap->private;
        struct kvm_vcpu *vcpu;
+       unsigned long prefix;
+       int i;
  
+       if (gmap_is_shadow(gmap))
+               return;
+       if (start >= 1UL << 31)
+               /* We are only interested in prefix pages */
+               return;
        kvm_for_each_vcpu(i, vcpu, kvm) {
                /* match against both prefix pages */
-               if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) {
-                       VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
+               prefix = kvm_s390_get_prefix(vcpu);
+               if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) {
+                       VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx",
+                                  start, end);
                        kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu);
                }
        }
@@@ -2002,6 -2278,8 +2281,8 @@@ int kvm_arch_vcpu_ioctl_set_guest_debug
  
        if (dbg->control & ~VALID_GUESTDBG_FLAGS)
                return -EINVAL;
+       if (!sclp.has_gpere)
+               return -EINVAL;
  
        if (dbg->control & KVM_GUESTDBG_ENABLE) {
                vcpu->guest_debug = dbg->control;
@@@ -2070,16 -2348,16 +2351,16 @@@ retry
                return 0;
        /*
         * We use MMU_RELOAD just to re-arm the ipte notifier for the
-        * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
+        * guest prefix page. gmap_mprotect_notify will wait on the ptl lock.
         * This ensures that the ipte instruction for this request has
         * already finished. We might race against a second unmapper that
         * wants to set the blocking bit. Let's just retry the request loop.
         */
        if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
                int rc;
-               rc = gmap_ipte_notify(vcpu->arch.gmap,
-                                     kvm_s390_get_prefix(vcpu),
-                                     PAGE_SIZE * 2);
+               rc = gmap_mprotect_notify(vcpu->arch.gmap,
+                                         kvm_s390_get_prefix(vcpu),
+                                         PAGE_SIZE * 2, PROT_WRITE);
                if (rc)
                        return rc;
                goto retry;
                goto retry;
        }
  
+       if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) {
+               vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
+               goto retry;
+       }
        /* nothing to do, just clear the request */
        clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
  
@@@ -2362,14 -2645,14 +2648,14 @@@ static int __vcpu_run(struct kvm_vcpu *
                 * guest_enter and guest_exit should be no uaccess.
                 */
                local_irq_disable();
-               __kvm_guest_enter();
+               guest_enter_irqoff();
                __disable_cpu_timer_accounting(vcpu);
                local_irq_enable();
                exit_reason = sie64a(vcpu->arch.sie_block,
                                     vcpu->run->s.regs.gprs);
                local_irq_disable();
                __enable_cpu_timer_accounting(vcpu);
-               __kvm_guest_exit();
+               guest_exit_irqoff();
                local_irq_enable();
                vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
  
@@@ -2598,6 -2881,8 +2884,8 @@@ static void __disable_ibs_on_all_vcpus(
  
  static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
  {
+       if (!sclp.has_ibs)
+               return;
        kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu);
        kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu);
  }
diff --combined arch/s390/mm/fault.c
@@@ -418,6 -418,8 +418,8 @@@ static inline int do_exception(struct p
                (struct gmap *) S390_lowcore.gmap : NULL;
        if (gmap) {
                current->thread.gmap_addr = address;
+               current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
+               current->thread.gmap_int_code = regs->int_code & 0xffff;
                address = __gmap_translate(gmap, address);
                if (address == -EFAULT) {
                        fault = VM_FAULT_BADMAP;
@@@ -456,7 -458,7 +458,7 @@@ retry
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
 -      fault = handle_mm_fault(mm, vma, address, flags);
 +      fault = handle_mm_fault(vma, address, flags);
        /* No reason to continue if interrupted by SIGKILL. */
        if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
                fault = VM_FAULT_SIGNAL;
@@@ -624,7 -626,7 +626,7 @@@ void pfault_fini(void
        diag_stat_inc(DIAG_STAT_X258);
        asm volatile(
                "       diag    %0,0,0x258\n"
 -              "0:\n"
 +              "0:     nopr    %%r7\n"
                EX_TABLE(0b,0b)
                : : "a" (&refbk), "m" (refbk) : "cc");
  }
diff --combined arch/s390/mm/gmap.c
  #include <asm/gmap.h>
  #include <asm/tlb.h>
  
+ #define GMAP_SHADOW_FAKE_TABLE 1ULL
  /**
-  * gmap_alloc - allocate a guest address space
+  * gmap_alloc - allocate and initialize a guest address space
   * @mm: pointer to the parent mm_struct
   * @limit: maximum address of the gmap address space
   *
   * Returns a guest address space structure.
   */
- struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
+ static struct gmap *gmap_alloc(unsigned long limit)
  {
        struct gmap *gmap;
        struct page *page;
        if (!gmap)
                goto out;
        INIT_LIST_HEAD(&gmap->crst_list);
+       INIT_LIST_HEAD(&gmap->children);
+       INIT_LIST_HEAD(&gmap->pt_list);
        INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
        INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
+       INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
        spin_lock_init(&gmap->guest_table_lock);
-       gmap->mm = mm;
+       spin_lock_init(&gmap->shadow_lock);
+       atomic_set(&gmap->ref_count, 1);
        page = alloc_pages(GFP_KERNEL, 2);
        if (!page)
                goto out_free;
@@@ -70,9 -76,6 +76,6 @@@
        gmap->asce = atype | _ASCE_TABLE_LENGTH |
                _ASCE_USER_BITS | __pa(table);
        gmap->asce_end = limit;
-       down_write(&mm->mmap_sem);
-       list_add(&gmap->list, &mm->context.gmap_list);
-       up_write(&mm->mmap_sem);
        return gmap;
  
  out_free:
  out:
        return NULL;
  }
- EXPORT_SYMBOL_GPL(gmap_alloc);
+ /**
+  * gmap_create - create a guest address space
+  * @mm: pointer to the parent mm_struct
+  * @limit: maximum size of the gmap address space
+  *
+  * Returns a guest address space structure.
+  */
+ struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
+ {
+       struct gmap *gmap;
+       gmap = gmap_alloc(limit);
+       if (!gmap)
+               return NULL;
+       gmap->mm = mm;
+       spin_lock(&mm->context.gmap_lock);
+       list_add_rcu(&gmap->list, &mm->context.gmap_list);
+       spin_unlock(&mm->context.gmap_lock);
+       return gmap;
+ }
+ EXPORT_SYMBOL_GPL(gmap_create);
  
  static void gmap_flush_tlb(struct gmap *gmap)
  {
        if (MACHINE_HAS_IDTE)
 -              __tlb_flush_asce(gmap->mm, gmap->asce);
 +              __tlb_flush_idte(gmap->asce);
        else
                __tlb_flush_global();
  }
@@@ -114,31 -138,117 +138,117 @@@ static void gmap_radix_tree_free(struc
        } while (nr > 0);
  }
  
+ static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
+ {
+       struct gmap_rmap *rmap, *rnext, *head;
+       struct radix_tree_iter iter;
+       unsigned long indices[16];
+       unsigned long index;
+       void **slot;
+       int i, nr;
+       /* A radix tree is freed by deleting all of its entries */
+       index = 0;
+       do {
+               nr = 0;
+               radix_tree_for_each_slot(slot, root, &iter, index) {
+                       indices[nr] = iter.index;
+                       if (++nr == 16)
+                               break;
+               }
+               for (i = 0; i < nr; i++) {
+                       index = indices[i];
+                       head = radix_tree_delete(root, index);
+                       gmap_for_each_rmap_safe(rmap, rnext, head)
+                               kfree(rmap);
+               }
+       } while (nr > 0);
+ }
  /**
   * gmap_free - free a guest address space
   * @gmap: pointer to the guest address space structure
+  *
+  * No locks required. There are no references to this gmap anymore.
   */
- void gmap_free(struct gmap *gmap)
static void gmap_free(struct gmap *gmap)
  {
        struct page *page, *next;
  
-       /* Flush tlb. */
-       if (MACHINE_HAS_IDTE)
-               __tlb_flush_idte(gmap->asce);
-       else
-               __tlb_flush_global();
+       /* Flush tlb of all gmaps (if not already done for shadows) */
+       if (!(gmap_is_shadow(gmap) && gmap->removed))
+               gmap_flush_tlb(gmap);
        /* Free all segment & region tables. */
        list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
                __free_pages(page, 2);
        gmap_radix_tree_free(&gmap->guest_to_host);
        gmap_radix_tree_free(&gmap->host_to_guest);
-       down_write(&gmap->mm->mmap_sem);
-       list_del(&gmap->list);
-       up_write(&gmap->mm->mmap_sem);
+       /* Free additional data for a shadow gmap */
+       if (gmap_is_shadow(gmap)) {
+               /* Free all page tables. */
+               list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
+                       page_table_free_pgste(page);
+               gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
+               /* Release reference to the parent */
+               gmap_put(gmap->parent);
+       }
        kfree(gmap);
  }
- EXPORT_SYMBOL_GPL(gmap_free);
+ /**
+  * gmap_get - increase reference counter for guest address space
+  * @gmap: pointer to the guest address space structure
+  *
+  * Returns the gmap pointer
+  */
+ struct gmap *gmap_get(struct gmap *gmap)
+ {
+       atomic_inc(&gmap->ref_count);
+       return gmap;
+ }
+ EXPORT_SYMBOL_GPL(gmap_get);
+ /**
+  * gmap_put - decrease reference counter for guest address space
+  * @gmap: pointer to the guest address space structure
+  *
+  * If the reference counter reaches zero the guest address space is freed.
+  */
+ void gmap_put(struct gmap *gmap)
+ {
+       if (atomic_dec_return(&gmap->ref_count) == 0)
+               gmap_free(gmap);
+ }
+ EXPORT_SYMBOL_GPL(gmap_put);
+ /**
+  * gmap_remove - remove a guest address space but do not free it yet
+  * @gmap: pointer to the guest address space structure
+  */
+ void gmap_remove(struct gmap *gmap)
+ {
+       struct gmap *sg, *next;
+       /* Remove all shadow gmaps linked to this gmap */
+       if (!list_empty(&gmap->children)) {
+               spin_lock(&gmap->shadow_lock);
+               list_for_each_entry_safe(sg, next, &gmap->children, list) {
+                       list_del(&sg->list);
+                       gmap_put(sg);
+               }
+               spin_unlock(&gmap->shadow_lock);
+       }
+       /* Remove gmap from the pre-mm list */
+       spin_lock(&gmap->mm->context.gmap_lock);
+       list_del_rcu(&gmap->list);
+       spin_unlock(&gmap->mm->context.gmap_lock);
+       synchronize_rcu();
+       /* Put reference */
+       gmap_put(gmap);
+ }
+ EXPORT_SYMBOL_GPL(gmap_remove);
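Taken together, gmap_create()/gmap_remove() and gmap_get()/gmap_put() replace the old gmap_alloc()/gmap_free() pair: creation returns a structure holding one reference, gmap_remove() unlinks it from the mm (dropping any shadow children) and puts that reference, and gmap_free() only runs once every additional holder has called gmap_put(). A hedged lifecycle sketch built from the API above; the surrounding function is purely illustrative:

/* Hedged sketch of the gmap reference-counting lifecycle. */
static int example_gmap_lifecycle(struct mm_struct *mm, unsigned long limit)
{
	struct gmap *g;

	g = gmap_create(mm, limit);	/* returns with ref_count == 1 */
	if (!g)
		return -ENOMEM;
	gmap_get(g);			/* extra reference, e.g. held by a shadow user */
	gmap_remove(g);			/* unlink from the mm, drop the creation reference */
	gmap_put(g);			/* last reference gone: gmap_free() runs */
	return 0;
}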
  
  /**
   * gmap_enable - switch primary space to the guest address space
@@@ -160,6 -270,17 +270,17 @@@ void gmap_disable(struct gmap *gmap
  }
  EXPORT_SYMBOL_GPL(gmap_disable);
  
+ /**
+  * gmap_get_enabled - get a pointer to the currently enabled gmap
+  *
+  * Returns a pointer to the currently enabled gmap, or NULL if none is enabled.
+  */
+ struct gmap *gmap_get_enabled(void)
+ {
+       return (struct gmap *) S390_lowcore.gmap;
+ }
+ EXPORT_SYMBOL_GPL(gmap_get_enabled);
  /*
   * gmap_alloc_table is assumed to be called with mmap_sem held
   */
@@@ -175,7 -296,7 +296,7 @@@ static int gmap_alloc_table(struct gma
                return -ENOMEM;
        new = (unsigned long *) page_to_phys(page);
        crst_table_init(new, init);
-       spin_lock(&gmap->mm->page_table_lock);
+       spin_lock(&gmap->guest_table_lock);
        if (*table & _REGION_ENTRY_INVALID) {
                list_add(&page->lru, &gmap->crst_list);
                *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
                page->index = gaddr;
                page = NULL;
        }
-       spin_unlock(&gmap->mm->page_table_lock);
+       spin_unlock(&gmap->guest_table_lock);
        if (page)
                __free_pages(page, 2);
        return 0;
@@@ -219,6 -340,7 +340,7 @@@ static int __gmap_unlink_by_vmaddr(stru
        unsigned long *entry;
        int flush = 0;
  
+       BUG_ON(gmap_is_shadow(gmap));
        spin_lock(&gmap->guest_table_lock);
        entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
        if (entry) {
@@@ -258,6 -380,7 +380,7 @@@ int gmap_unmap_segment(struct gmap *gma
        unsigned long off;
        int flush;
  
+       BUG_ON(gmap_is_shadow(gmap));
        if ((to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || to + len < to)
@@@ -289,6 -412,7 +412,7 @@@ int gmap_map_segment(struct gmap *gmap
        unsigned long off;
        int flush;
  
+       BUG_ON(gmap_is_shadow(gmap));
        if ((from | to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || from + len < from || to + len < to ||
@@@ -326,6 -450,8 +450,8 @@@ EXPORT_SYMBOL_GPL(gmap_map_segment)
   * This function does not establish potentially missing page table entries.
   * The mmap_sem of the mm that belongs to the address space must be held
   * when this function gets called.
+  *
+  * Note: Can also be called for shadow gmaps.
   */
  unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
  {
  
        vmaddr = (unsigned long)
                radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
+       /* Note: guest_to_host is empty for a shadow gmap */
        return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
  }
  EXPORT_SYMBOL_GPL(__gmap_translate);
@@@ -369,11 -496,13 +496,13 @@@ void gmap_unlink(struct mm_struct *mm, 
        struct gmap *gmap;
        int flush;
  
-       list_for_each_entry(gmap, &mm->context.gmap_list, list) {
+       rcu_read_lock();
+       list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
                flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
                if (flush)
                        gmap_flush_tlb(gmap);
        }
+       rcu_read_unlock();
  }
  
  /**
@@@ -397,6 -526,7 +526,7 @@@ int __gmap_link(struct gmap *gmap, unsi
        pmd_t *pmd;
        int rc;
  
+       BUG_ON(gmap_is_shadow(gmap));
        /* Create higher level tables in the gmap page table */
        table = gmap->table;
        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
        VM_BUG_ON(pgd_none(*pgd));
        pud = pud_offset(pgd, vmaddr);
        VM_BUG_ON(pud_none(*pud));
 +      /* large puds cannot yet be handled */
 +      if (pud_large(*pud))
 +              return -EFAULT;
        pmd = pmd_offset(pud, vmaddr);
        VM_BUG_ON(pmd_none(*pmd));
        /* large pmds cannot yet be handled */
@@@ -552,116 -679,1412 +682,1412 @@@ static LIST_HEAD(gmap_notifier_list)
  static DEFINE_SPINLOCK(gmap_notifier_lock);
  
  /**
-  * gmap_register_ipte_notifier - register a pte invalidation callback
+  * gmap_register_pte_notifier - register a pte invalidation callback
   * @nb: pointer to the gmap notifier block
   */
- void gmap_register_ipte_notifier(struct gmap_notifier *nb)
+ void gmap_register_pte_notifier(struct gmap_notifier *nb)
  {
        spin_lock(&gmap_notifier_lock);
-       list_add(&nb->list, &gmap_notifier_list);
+       list_add_rcu(&nb->list, &gmap_notifier_list);
        spin_unlock(&gmap_notifier_lock);
  }
- EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
+ EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
  
  /**
-  * gmap_unregister_ipte_notifier - remove a pte invalidation callback
+  * gmap_unregister_pte_notifier - remove a pte invalidation callback
   * @nb: pointer to the gmap notifier block
   */
- void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
+ void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
  {
        spin_lock(&gmap_notifier_lock);
-       list_del_init(&nb->list);
+       list_del_rcu(&nb->list);
        spin_unlock(&gmap_notifier_lock);
+       synchronize_rcu();
+ }
+ EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
+ /**
+  * gmap_call_notifier - call all registered invalidation callbacks
+  * @gmap: pointer to guest mapping meta data structure
+  * @start: start virtual address in the guest address space
+  * @end: end virtual address in the guest address space
+  */
+ static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
+                              unsigned long end)
+ {
+       struct gmap_notifier *nb;
+       list_for_each_entry(nb, &gmap_notifier_list, list)
+               nb->notifier_call(gmap, start, end);
+ }
+ /**
+  * gmap_table_walk - walk the gmap page tables
+  * @gmap: pointer to guest mapping meta data structure
+  * @gaddr: virtual address in the guest address space
+  * @level: page table level to stop at
+  *
+  * Returns a table entry pointer for the given guest address and @level
+  * @level=0 : returns a pointer to a page table entry (or NULL)
+  * @level=1 : returns a pointer to a segment table entry (or NULL)
+  * @level=2 : returns a pointer to a region-3 table entry (or NULL)
+  * @level=3 : returns a pointer to a region-2 table entry (or NULL)
+  * @level=4 : returns a pointer to a region-1 table entry (or NULL)
+  *
+  * Returns NULL if the gmap page tables could not be walked to the
+  * requested level.
+  *
+  * Note: Can also be called for shadow gmaps.
+  */
+ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
+                                            unsigned long gaddr, int level)
+ {
+       unsigned long *table;
+       if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
+               return NULL;
+       if (gmap_is_shadow(gmap) && gmap->removed)
+               return NULL;
+       if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11)))
+               return NULL;
+       table = gmap->table;
+       switch (gmap->asce & _ASCE_TYPE_MASK) {
+       case _ASCE_TYPE_REGION1:
+               table += (gaddr >> 53) & 0x7ff;
+               if (level == 4)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               /* Fallthrough */
+       case _ASCE_TYPE_REGION2:
+               table += (gaddr >> 42) & 0x7ff;
+               if (level == 3)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               /* Fallthrough */
+       case _ASCE_TYPE_REGION3:
+               table += (gaddr >> 31) & 0x7ff;
+               if (level == 2)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               /* Fallthrough */
+       case _ASCE_TYPE_SEGMENT:
+               table += (gaddr >> 20) & 0x7ff;
+               if (level == 1)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+               table += (gaddr >> 12) & 0xff;
+       }
+       return table;
+ }
+ /**
+  * gmap_pte_op_walk - walk the gmap page table, get the page table lock
+  *                  and return the pte pointer
+  * @gmap: pointer to guest mapping meta data structure
+  * @gaddr: virtual address in the guest address space
+  * @ptl: pointer to the spinlock pointer
+  *
+  * Returns a pointer to the locked pte for a guest address, or NULL
+  *
+  * Note: Can also be called for shadow gmaps.
+  */
+ static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
+                              spinlock_t **ptl)
+ {
+       unsigned long *table;
+       if (gmap_is_shadow(gmap))
+               spin_lock(&gmap->guest_table_lock);
+       /* Walk the gmap page table, lock and get pte pointer */
+       table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
+       if (!table || *table & _SEGMENT_ENTRY_INVALID) {
+               if (gmap_is_shadow(gmap))
+                       spin_unlock(&gmap->guest_table_lock);
+               return NULL;
+       }
+       if (gmap_is_shadow(gmap)) {
+               *ptl = &gmap->guest_table_lock;
+               return pte_offset_map((pmd_t *) table, gaddr);
+       }
+       return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
+ }
+ /**
+  * gmap_pte_op_fixup - force a page in and connect the gmap page table
+  * @gmap: pointer to guest mapping meta data structure
+  * @gaddr: virtual address in the guest address space
+  * @vmaddr: address in the host process address space
+  * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+  *
+  * Returns 0 if the caller can retry __gmap_translate (might fail again),
+  * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
+  * up or connecting the gmap page table.
+  */
+ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
+                            unsigned long vmaddr, int prot)
+ {
+       struct mm_struct *mm = gmap->mm;
+       unsigned int fault_flags;
+       bool unlocked = false;
+       BUG_ON(gmap_is_shadow(gmap));
+       fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
+       if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
+               return -EFAULT;
+       if (unlocked)
+               /* lost mmap_sem, caller has to retry __gmap_translate */
+               return 0;
+       /* Connect the page tables */
+       return __gmap_link(gmap, gaddr, vmaddr);
  }
- EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
  
  /**
-  * gmap_ipte_notify - mark a range of ptes for invalidation notification
+  * gmap_pte_op_end - release the page table lock
+  * @ptl: pointer to the spinlock pointer
+  */
+ static void gmap_pte_op_end(spinlock_t *ptl)
+ {
+       spin_unlock(ptl);
+ }
+ /*
+  * gmap_protect_range - remove access rights to memory and set pgste bits
   * @gmap: pointer to guest mapping meta data structure
   * @gaddr: virtual address in the guest address space
   * @len: size of area
+  * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+  * @bits: pgste notification bits to set
+  *
+  * Returns 0 if successfully protected, -ENOMEM if out of memory and
+  * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
+  *
+  * Called with sg->mm->mmap_sem in read.
   *
-  * Returns 0 if for each page in the given range a gmap mapping exists and
-  * the invalidation notification could be set. If the gmap mapping is missing
-  * for one or more pages -EFAULT is returned. If no memory could be allocated
-  * -ENOMEM is returned. This function establishes missing page table entries.
+  * Note: Can also be called for shadow gmaps.
   */
- int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
+ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
+                             unsigned long len, int prot, unsigned long bits)
  {
-       unsigned long addr;
+       unsigned long vmaddr;
        spinlock_t *ptl;
        pte_t *ptep;
-       bool unlocked;
-       int rc = 0;
+       int rc;
+       while (len) {
+               rc = -EAGAIN;
+               ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+               if (ptep) {
+                       rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits);
+                       gmap_pte_op_end(ptl);
+               }
+               if (rc) {
+                       vmaddr = __gmap_translate(gmap, gaddr);
+                       if (IS_ERR_VALUE(vmaddr))
+                               return vmaddr;
+                       rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
+                       if (rc)
+                               return rc;
+                       continue;
+               }
+               gaddr += PAGE_SIZE;
+               len -= PAGE_SIZE;
+       }
+       return 0;
+ }
  
-       if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
+ /**
+  * gmap_mprotect_notify - change access rights for a range of ptes and
+  *                        call the notifier if any pte changes again
+  * @gmap: pointer to guest mapping meta data structure
+  * @gaddr: virtual address in the guest address space
+  * @len: size of area
+  * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+  *
+  * Returns 0 if for each page in the given range a gmap mapping exists,
+  * the new access rights could be set and the notifier could be armed.
+  * If the gmap mapping is missing for one or more pages -EFAULT is
+  * returned. If no memory could be allocated -ENOMEM is returned.
+  * This function establishes missing page table entries.
+  */
+ int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
+                        unsigned long len, int prot)
+ {
+       int rc;
+       if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
+               return -EINVAL;
+       if (!MACHINE_HAS_ESOP && prot == PROT_READ)
                return -EINVAL;
        down_read(&gmap->mm->mmap_sem);
-       while (len) {
-               unlocked = false;
-               /* Convert gmap address and connect the page tables */
-               addr = __gmap_translate(gmap, gaddr);
-               if (IS_ERR_VALUE(addr)) {
-                       rc = addr;
-                       break;
+       rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT);
+       up_read(&gmap->mm->mmap_sem);
+       return rc;
+ }
+ EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
+ /**
+  * gmap_read_table - get an unsigned long value from a guest page table using
+  *                   absolute addressing, without marking the page referenced.
+  * @gmap: pointer to guest mapping meta data structure
+  * @gaddr: virtual address in the guest address space
+  * @val: pointer to the unsigned long value to return
+  *
+  * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
+  * if reading using the virtual address failed.
+  *
+  * Called with gmap->mm->mmap_sem in read.
+  */
+ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
+ {
+       unsigned long address, vmaddr;
+       spinlock_t *ptl;
+       pte_t *ptep, pte;
+       int rc;
+       while (1) {
+               rc = -EAGAIN;
+               ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+               if (ptep) {
+                       pte = *ptep;
+                       if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
+                               address = pte_val(pte) & PAGE_MASK;
+                               address += gaddr & ~PAGE_MASK;
+                               *val = *(unsigned long *) address;
+                               pte_val(*ptep) |= _PAGE_YOUNG;
+                               /* Do *NOT* clear the _PAGE_INVALID bit! */
+                               rc = 0;
+                       }
+                       gmap_pte_op_end(ptl);
                }
-               /* Get the page mapped */
-               if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
-                                    &unlocked)) {
-                       rc = -EFAULT;
+               if (!rc)
+                       break;
+               vmaddr = __gmap_translate(gmap, gaddr);
+               if (IS_ERR_VALUE(vmaddr)) {
+                       rc = vmaddr;
                        break;
                }
-               /* While trying to map mmap_sem got unlocked. Let us retry */
-               if (unlocked)
-                       continue;
-               rc = __gmap_link(gmap, gaddr, addr);
+               rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
                if (rc)
                        break;
-               /* Walk the process page table, lock and get pte pointer */
-               ptep = get_locked_pte(gmap->mm, addr, &ptl);
-               VM_BUG_ON(!ptep);
-               /* Set notification bit in the pgste of the pte */
-               if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
-                       ptep_set_notify(gmap->mm, addr, ptep);
-                       gaddr += PAGE_SIZE;
-                       len -= PAGE_SIZE;
-               }
-               pte_unmap_unlock(ptep, ptl);
        }
-       up_read(&gmap->mm->mmap_sem);
        return rc;
  }
- EXPORT_SYMBOL_GPL(gmap_ipte_notify);
+ EXPORT_SYMBOL_GPL(gmap_read_table);
  
  /**
-  * ptep_notify - call all invalidation callbacks for a specific pte.
-  * @mm: pointer to the process mm_struct
-  * @addr: virtual address in the process address space
-  * @pte: pointer to the page table entry
+  * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
+  * @sg: pointer to the shadow guest address space structure
+  * @vmaddr: vm address associated with the rmap
+  * @rmap: pointer to the rmap structure
   *
-  * This function is assumed to be called with the page table lock held
-  * for the pte to notify.
+  * Called with the sg->guest_table_lock
   */
- void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
+ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
+                                   struct gmap_rmap *rmap)
  {
-       unsigned long offset, gaddr;
-       unsigned long *table;
-       struct gmap_notifier *nb;
-       struct gmap *gmap;
+       void **slot;
  
-       offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
-       offset = offset * (4096 / sizeof(pte_t));
-       spin_lock(&gmap_notifier_lock);
-       list_for_each_entry(gmap, &mm->context.gmap_list, list) {
-               table = radix_tree_lookup(&gmap->host_to_guest,
-                                         vmaddr >> PMD_SHIFT);
-               if (!table)
+       BUG_ON(!gmap_is_shadow(sg));
+       slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
+       if (slot) {
+               rmap->next = radix_tree_deref_slot_protected(slot,
+                                                       &sg->guest_table_lock);
+               radix_tree_replace_slot(slot, rmap);
+       } else {
+               rmap->next = NULL;
+               radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
+                                 rmap);
+       }
+ }
+ /**
+  * gmap_protect_rmap - modify access rights to memory and create an rmap
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow gmap
+  * @paddr: address in the parent guest address space
+  * @len: length of the memory area to protect
+  * @prot: indicates access rights: none, read-only or read-write
+  *
+  * Returns 0 if successfully protected and the rmap was created, -ENOMEM
+  * if out of memory and -EFAULT if paddr is invalid.
+  */
+ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
+                            unsigned long paddr, unsigned long len, int prot)
+ {
+       struct gmap *parent;
+       struct gmap_rmap *rmap;
+       unsigned long vmaddr;
+       spinlock_t *ptl;
+       pte_t *ptep;
+       int rc;
+       BUG_ON(!gmap_is_shadow(sg));
+       parent = sg->parent;
+       while (len) {
+               vmaddr = __gmap_translate(parent, paddr);
+               if (IS_ERR_VALUE(vmaddr))
+                       return vmaddr;
+               rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+               if (!rmap)
+                       return -ENOMEM;
+               rmap->raddr = raddr;
+               rc = radix_tree_preload(GFP_KERNEL);
+               if (rc) {
+                       kfree(rmap);
+                       return rc;
+               }
+               rc = -EAGAIN;
+               ptep = gmap_pte_op_walk(parent, paddr, &ptl);
+               if (ptep) {
+                       spin_lock(&sg->guest_table_lock);
+                       rc = ptep_force_prot(parent->mm, paddr, ptep, prot,
+                                            PGSTE_VSIE_BIT);
+                       if (!rc)
+                               gmap_insert_rmap(sg, vmaddr, rmap);
+                       spin_unlock(&sg->guest_table_lock);
+                       gmap_pte_op_end(ptl);
+               }
+               radix_tree_preload_end();
+               if (rc) {
+                       kfree(rmap);
+                       rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+                       if (rc)
+                               return rc;
                        continue;
-               gaddr = __gmap_segment_gaddr(table) + offset;
-               list_for_each_entry(nb, &gmap_notifier_list, list)
-                       nb->notifier_call(gmap, gaddr);
+               }
+               paddr += PAGE_SIZE;
+               len -= PAGE_SIZE;
        }
-       spin_unlock(&gmap_notifier_lock);
+       return 0;
+ }
+ #define _SHADOW_RMAP_MASK     0x7
+ #define _SHADOW_RMAP_REGION1  0x5
+ #define _SHADOW_RMAP_REGION2  0x4
+ #define _SHADOW_RMAP_REGION3  0x3
+ #define _SHADOW_RMAP_SEGMENT  0x2
+ #define _SHADOW_RMAP_PGTABLE  0x1
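
The low bits of an rmap address encode which shadow table level the rmap protects; gmap_shadow_notify() further below uses exactly this to dispatch to the matching unshadow function. A sketch of the convention (helper names are illustrative only):

/* raddr is table aligned, so the low bits are free to carry the level */
static inline unsigned long example_rmap_raddr(unsigned long raddr,
                                               unsigned long level)
{
        return raddr | (level & _SHADOW_RMAP_MASK);
}

static inline unsigned long example_rmap_level(unsigned long rmap_raddr)
{
        return rmap_raddr & _SHADOW_RMAP_MASK;  /* e.g. _SHADOW_RMAP_SEGMENT */
}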
+ /**
+  * gmap_idte_one - invalidate a single region or segment table entry
+  * @asce: region or segment table *origin* + table-type bits
+  * @vaddr: virtual address to identify the table entry to flush
+  *
+  * The invalid bit of a single region or segment table entry is set
+  * and the associated TLB entries depending on the entry are flushed.
+  * The table-type of the @asce identifies the portion of the @vaddr
+  * that is used as the invalidation index.
+  */
+ static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
+ {
+       asm volatile(
+               "       .insn   rrf,0xb98e0000,%0,%1,0,0"
+               : : "a" (asce), "a" (vaddr) : "cc", "memory");
+ }
+ /**
+  * gmap_unshadow_page - remove a page from a shadow page table
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow guest address space
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
+ {
+       unsigned long *table;
+       BUG_ON(!gmap_is_shadow(sg));
+       table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
+       if (!table || *table & _PAGE_INVALID)
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1);
+       ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
+ }
+ /**
+  * __gmap_unshadow_pgt - remove all entries from a shadow page table
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow guest address space
+  * @pgt: pointer to the start of a shadow page table
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
+                               unsigned long *pgt)
+ {
+       int i;
+       BUG_ON(!gmap_is_shadow(sg));
+       for (i = 0; i < 256; i++, raddr += 1UL << 12)
+               pgt[i] = _PAGE_INVALID;
+ }
+ /**
+  * gmap_unshadow_pgt - remove a shadow page table from a segment entry
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: address in the shadow guest address space
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
+ {
+       unsigned long sto, *ste, *pgt;
+       struct page *page;
+       BUG_ON(!gmap_is_shadow(sg));
+       ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
+       if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1);
+       sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff));
+       gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
+       pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
+       *ste = _SEGMENT_ENTRY_EMPTY;
+       __gmap_unshadow_pgt(sg, raddr, pgt);
+       /* Free page table */
+       page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       page_table_free_pgste(page);
+ }
+ /**
+  * __gmap_unshadow_sgt - remove all entries from a shadow segment table
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow guest address space
+  * @sgt: pointer to the start of a shadow segment table
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
+                               unsigned long *sgt)
+ {
+       unsigned long asce, *pgt;
+       struct page *page;
+       int i;
+       BUG_ON(!gmap_is_shadow(sg));
+       asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT;
+       for (i = 0; i < 2048; i++, raddr += 1UL << 20) {
+               if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
+                       continue;
+               pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
+               sgt[i] = _SEGMENT_ENTRY_EMPTY;
+               __gmap_unshadow_pgt(sg, raddr, pgt);
+               /* Free page table */
+               page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               page_table_free_pgste(page);
+       }
+ }
+ /**
+  * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow guest address space
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
+ {
+       unsigned long r3o, *r3e, *sgt;
+       struct page *page;
+       BUG_ON(!gmap_is_shadow(sg));
+       r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
+       if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1);
+       r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff));
+       gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
+       sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
+       *r3e = _REGION3_ENTRY_EMPTY;
+       __gmap_unshadow_sgt(sg, raddr, sgt);
+       /* Free segment table */
+       page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       __free_pages(page, 2);
+ }
+ /**
+  * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: address in the shadow guest address space
+  * @r3t: pointer to the start of a shadow region-3 table
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
+                               unsigned long *r3t)
+ {
+       unsigned long asce, *sgt;
+       struct page *page;
+       int i;
+       BUG_ON(!gmap_is_shadow(sg));
+       asce = (unsigned long) r3t | _ASCE_TYPE_REGION3;
+       for (i = 0; i < 2048; i++, raddr += 1UL << 31) {
+               if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
+                       continue;
+               sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
+               r3t[i] = _REGION3_ENTRY_EMPTY;
+               __gmap_unshadow_sgt(sg, raddr, sgt);
+               /* Free segment table */
+               page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               __free_pages(page, 2);
+       }
+ }
+ /**
+  * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow guest address space
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
+ {
+       unsigned long r2o, *r2e, *r3t;
+       struct page *page;
+       BUG_ON(!gmap_is_shadow(sg));
+       r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
+       if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1);
+       r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff));
+       gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
+       r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
+       *r2e = _REGION2_ENTRY_EMPTY;
+       __gmap_unshadow_r3t(sg, raddr, r3t);
+       /* Free region 3 table */
+       page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       __free_pages(page, 2);
+ }
+ /**
+  * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow guest address space
+  * @r2t: pointer to the start of a shadow region-2 table
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
+                               unsigned long *r2t)
+ {
+       unsigned long asce, *r3t;
+       struct page *page;
+       int i;
+       BUG_ON(!gmap_is_shadow(sg));
+       asce = (unsigned long) r2t | _ASCE_TYPE_REGION2;
+       for (i = 0; i < 2048; i++, raddr += 1UL << 42) {
+               if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
+                       continue;
+               r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
+               r2t[i] = _REGION2_ENTRY_EMPTY;
+               __gmap_unshadow_r3t(sg, raddr, r3t);
+               /* Free region 3 table */
+               page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               __free_pages(page, 2);
+       }
+ }
+ /**
+  * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow guest address space
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
+ {
+       unsigned long r1o, *r1e, *r2t;
+       struct page *page;
+       BUG_ON(!gmap_is_shadow(sg));
+       r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
+       if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1);
+       r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff));
+       gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
+       r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
+       *r1e = _REGION1_ENTRY_EMPTY;
+       __gmap_unshadow_r2t(sg, raddr, r2t);
+       /* Free region 2 table */
+       page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       __free_pages(page, 2);
+ }
+ /**
+  * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
+  * @sg: pointer to the shadow guest address space structure
+  * @raddr: rmap address in the shadow guest address space
+  * @r1t: pointer to the start of a shadow region-1 table
+  *
+  * Called with the sg->guest_table_lock
+  */
+ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
+                               unsigned long *r1t)
+ {
+       unsigned long asce, *r2t;
+       struct page *page;
+       int i;
+       BUG_ON(!gmap_is_shadow(sg));
+       asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
+       for (i = 0; i < 2048; i++, raddr += 1UL << 53) {
+               if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
+                       continue;
+               r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
+               __gmap_unshadow_r2t(sg, raddr, r2t);
+               /* Clear entry and flush translation r1t -> r2t */
+               gmap_idte_one(asce, raddr);
+               r1t[i] = _REGION1_ENTRY_EMPTY;
+               /* Free region 2 table */
+               page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               __free_pages(page, 2);
+       }
+ }
+ /**
+  * gmap_unshadow - remove a shadow page table completely
+  * @sg: pointer to the shadow guest address space structure
+  *
+  * Called with sg->guest_table_lock
+  */
+ static void gmap_unshadow(struct gmap *sg)
+ {
+       unsigned long *table;
+       BUG_ON(!gmap_is_shadow(sg));
+       if (sg->removed)
+               return;
+       sg->removed = 1;
+       gmap_call_notifier(sg, 0, -1UL);
+       gmap_flush_tlb(sg);
+       table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
+       switch (sg->asce & _ASCE_TYPE_MASK) {
+       case _ASCE_TYPE_REGION1:
+               __gmap_unshadow_r1t(sg, 0, table);
+               break;
+       case _ASCE_TYPE_REGION2:
+               __gmap_unshadow_r2t(sg, 0, table);
+               break;
+       case _ASCE_TYPE_REGION3:
+               __gmap_unshadow_r3t(sg, 0, table);
+               break;
+       case _ASCE_TYPE_SEGMENT:
+               __gmap_unshadow_sgt(sg, 0, table);
+               break;
+       }
+ }
+ /**
+  * gmap_find_shadow - find a specific asce in the list of shadow tables
+  * @parent: pointer to the parent gmap
+  * @asce: ASCE for which the shadow table is created
+  * @edat_level: edat level to be used for the shadow translation
+  *
+  * Returns the pointer to a gmap if a shadow table with the given asce is
+  * already available, ERR_PTR(-EAGAIN) if another one is just being created,
+  * otherwise NULL
+  */
+ static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
+                                    int edat_level)
+ {
+       struct gmap *sg;
+       list_for_each_entry(sg, &parent->children, list) {
+               if (sg->orig_asce != asce || sg->edat_level != edat_level ||
+                   sg->removed)
+                       continue;
+               if (!sg->initialized)
+                       return ERR_PTR(-EAGAIN);
+               atomic_inc(&sg->ref_count);
+               return sg;
+       }
+       return NULL;
+ }
+ /**
+  * gmap_shadow_valid - check if a shadow guest address space matches the
+  *                     given properties and is still valid
+  * @sg: pointer to the shadow guest address space structure
+  * @asce: ASCE for which the shadow table is requested
+  * @edat_level: edat level to be used for the shadow translation
+  *
+  * Returns 1 if the gmap shadow is still valid and matches the given
+  * properties; the caller can continue using it. Returns 0 otherwise; the
+  * caller has to request a new shadow gmap in this case.
+  *
+  */
+ int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
+ {
+       if (sg->removed)
+               return 0;
+       return sg->orig_asce == asce && sg->edat_level == edat_level;
+ }
+ EXPORT_SYMBOL_GPL(gmap_shadow_valid);
+ /**
+  * gmap_shadow - create/find a shadow guest address space
+  * @parent: pointer to the parent gmap
+  * @asce: ASCE for which the shadow table is created
+  * @edat_level: edat level to be used for the shadow translation
+  *
+  * The pages of the top level page table referred to by the asce parameter
+  * will be set to read-only and marked in the PGSTEs of the kvm process.
+  * The shadow table will be removed automatically on any change to the
+  * PTE mapping for the source table.
+  *
+  * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
+  * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
+  * parent gmap table could not be protected.
+  */
+ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+                        int edat_level)
+ {
+       struct gmap *sg, *new;
+       unsigned long limit;
+       int rc;
+       BUG_ON(gmap_is_shadow(parent));
+       spin_lock(&parent->shadow_lock);
+       sg = gmap_find_shadow(parent, asce, edat_level);
+       spin_unlock(&parent->shadow_lock);
+       if (sg)
+               return sg;
+       /* Create a new shadow gmap */
+       limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
+       if (asce & _ASCE_REAL_SPACE)
+               limit = -1UL;
+       new = gmap_alloc(limit);
+       if (!new)
+               return ERR_PTR(-ENOMEM);
+       new->mm = parent->mm;
+       new->parent = gmap_get(parent);
+       new->orig_asce = asce;
+       new->edat_level = edat_level;
+       new->initialized = false;
+       spin_lock(&parent->shadow_lock);
+       /* Recheck if another CPU created the same shadow */
+       sg = gmap_find_shadow(parent, asce, edat_level);
+       if (sg) {
+               spin_unlock(&parent->shadow_lock);
+               gmap_free(new);
+               return sg;
+       }
+       if (asce & _ASCE_REAL_SPACE) {
+               /* only allow one real-space gmap shadow */
+               list_for_each_entry(sg, &parent->children, list) {
+                       if (sg->orig_asce & _ASCE_REAL_SPACE) {
+                               spin_lock(&sg->guest_table_lock);
+                               gmap_unshadow(sg);
+                               spin_unlock(&sg->guest_table_lock);
+                               list_del(&sg->list);
+                               gmap_put(sg);
+                               break;
+                       }
+               }
+       }
+       atomic_set(&new->ref_count, 2);
+       list_add(&new->list, &parent->children);
+       if (asce & _ASCE_REAL_SPACE) {
+               /* nothing to protect, return right away */
+               new->initialized = true;
+               spin_unlock(&parent->shadow_lock);
+               return new;
+       }
+       spin_unlock(&parent->shadow_lock);
+       /* protect after insertion, so it will get properly invalidated */
+       down_read(&parent->mm->mmap_sem);
+       rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
+                               ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096,
+                               PROT_READ, PGSTE_VSIE_BIT);
+       up_read(&parent->mm->mmap_sem);
+       spin_lock(&parent->shadow_lock);
+       new->initialized = true;
+       if (rc) {
+               list_del(&new->list);
+               gmap_free(new);
+               new = ERR_PTR(rc);
+       }
+       spin_unlock(&parent->shadow_lock);
+       return new;
+ }
+ EXPORT_SYMBOL_GPL(gmap_shadow);
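
A hedged usage sketch of the create/find interface: a caller that caches a shadow gmap would first check it with gmap_shadow_valid() and otherwise ask gmap_shadow() for a fresh one. The helper name and the caching policy are illustrative, not taken from this series:

static struct gmap *example_get_shadow(struct gmap *parent, struct gmap *cached,
                                       unsigned long asce, int edat_level)
{
        if (cached && gmap_shadow_valid(cached, asce, edat_level))
                return cached;  /* still matches the guest ASCE and edat level */
        /* may return ERR_PTR(-ENOMEM), ERR_PTR(-EFAULT) or ERR_PTR(-EAGAIN) */
        return gmap_shadow(parent, asce, edat_level);
}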
+ /**
+  * gmap_shadow_r2t - create an empty shadow region 2 table
+  * @sg: pointer to the shadow guest address space structure
+  * @saddr: faulting address in the shadow gmap
+  * @r2t: parent gmap address of the region 2 table to get shadowed
+  * @fake: r2t references contiguous guest memory block, not a r2t
+  *
+  * The r2t parameter specifies the address of the source table. The
+  * four pages of the source table are made read-only in the parent gmap
+  * address space. A write to the source table area @r2t will automatically
+  * remove the shadow r2 table and all of its descendants.
+  *
+  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+  * shadow table structure is incomplete, -ENOMEM if out of memory and
+  * -EFAULT if an address in the parent gmap could not be resolved.
+  *
+  * Called with sg->mm->mmap_sem in read.
+  */
+ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+                   int fake)
+ {
+       unsigned long raddr, origin, offset, len;
+       unsigned long *s_r2t, *table;
+       struct page *page;
+       int rc;
+       BUG_ON(!gmap_is_shadow(sg));
+       /* Allocate a shadow region second table */
+       page = alloc_pages(GFP_KERNEL, 2);
+       if (!page)
+               return -ENOMEM;
+       page->index = r2t & _REGION_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_r2t = (unsigned long *) page_to_phys(page);
+       /* Install shadow region second table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _REGION_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _REGION_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
+                _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
+       if (sg->edat_level >= 1)
+               *table |= (r2t & _REGION_ENTRY_PROTECT);
+       list_add(&page->lru, &sg->crst_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_REGION_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make r2t read-only in parent gmap page table */
+       raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1;
+       origin = r2t & _REGION_ENTRY_ORIGIN;
+       offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+       len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 4);
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+                             (unsigned long) s_r2t)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_REGION_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_r2t(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+ out_free:
+       spin_unlock(&sg->guest_table_lock);
+       __free_pages(page, 2);
+       return rc;
+ }
+ EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
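
The protected byte range is derived from the origin, table-offset and table-length fields of the guest region-table entry; the same arithmetic recurs in gmap_shadow_r3t() and gmap_shadow_sgt() below. A sketch of that computation (the helper is illustrative only):

/* Byte range of the guest table described by a region-table entry. */
static inline void example_crst_range(unsigned long entry,
                                      unsigned long *start, unsigned long *len)
{
        unsigned long origin = entry & _REGION_ENTRY_ORIGIN;
        unsigned long offset = ((entry & _REGION_ENTRY_OFFSET) >> 6) * 4096;

        *start = origin + offset;
        *len = ((entry & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
}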
+ /**
+  * gmap_shadow_r3t - create a shadow region 3 table
+  * @sg: pointer to the shadow guest address space structure
+  * @saddr: faulting address in the shadow gmap
+  * @r3t: parent gmap address of the region 3 table to get shadowed
+  * @fake: r3t references contiguous guest memory block, not a r3t
+  *
+  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+  * shadow table structure is incomplete, -ENOMEM if out of memory and
+  * -EFAULT if an address in the parent gmap could not be resolved.
+  *
+  * Called with sg->mm->mmap_sem in read.
+  */
+ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+                   int fake)
+ {
+       unsigned long raddr, origin, offset, len;
+       unsigned long *s_r3t, *table;
+       struct page *page;
+       int rc;
+       BUG_ON(!gmap_is_shadow(sg));
+       /* Allocate a shadow region third table */
+       page = alloc_pages(GFP_KERNEL, 2);
+       if (!page)
+               return -ENOMEM;
+       page->index = r3t & _REGION_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_r3t = (unsigned long *) page_to_phys(page);
+       /* Install shadow region third table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _REGION_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _REGION_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
+                _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
+       if (sg->edat_level >= 1)
+               *table |= (r3t & _REGION_ENTRY_PROTECT);
+       list_add(&page->lru, &sg->crst_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_REGION_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make r3t read-only in parent gmap page table */
+       raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2;
+       origin = r3t & _REGION_ENTRY_ORIGIN;
+       offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+       len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 3);
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+                             (unsigned long) s_r3t)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_REGION_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_r3t(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+ out_free:
+       spin_unlock(&sg->guest_table_lock);
+       __free_pages(page, 2);
+       return rc;
+ }
+ EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
+ /**
+  * gmap_shadow_sgt - create a shadow segment table
+  * @sg: pointer to the shadow guest address space structure
+  * @saddr: faulting address in the shadow gmap
+  * @sgt: parent gmap address of the segment table to get shadowed
+  * @fake: sgt references contiguous guest memory block, not a sgt
+  *
+  * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
+  * shadow table structure is incomplete, -ENOMEM if out of memory and
+  * -EFAULT if an address in the parent gmap could not be resolved.
+  *
+  * Called with sg->mm->mmap_sem in read.
+  */
+ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+                   int fake)
+ {
+       unsigned long raddr, origin, offset, len;
+       unsigned long *s_sgt, *table;
+       struct page *page;
+       int rc;
+       BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
+       /* Allocate a shadow segment table */
+       page = alloc_pages(GFP_KERNEL, 2);
+       if (!page)
+               return -ENOMEM;
+       page->index = sgt & _REGION_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_sgt = (unsigned long *) page_to_phys(page);
+       /* Install shadow segment table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _REGION_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _REGION_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
+                _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
+       if (sg->edat_level >= 1)
+               *table |= sgt & _REGION_ENTRY_PROTECT;
+       list_add(&page->lru, &sg->crst_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_REGION_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make sgt read-only in parent gmap page table */
+       raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3;
+       origin = sgt & _REGION_ENTRY_ORIGIN;
+       offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+       len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 2);
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+                             (unsigned long) s_sgt)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_REGION_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_sgt(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+ out_free:
+       spin_unlock(&sg->guest_table_lock);
+       __free_pages(page, 2);
+       return rc;
+ }
+ EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
+ /**
+  * gmap_shadow_pgt_lookup - find a shadow page table
+  * @sg: pointer to the shadow guest address space structure
+  * @saddr: the address in the shadow guest address space
+  * @pgt: parent gmap address of the page table to get shadowed
+  * @dat_protection: if the pgtable is marked as protected by dat
+  * @fake: pgt references contiguous guest memory block, not a pgtable
+  *
+  * Returns 0 if the shadow page table was found and -EAGAIN if the page
+  * table was not found.
+  *
+  * Called with sg->mm->mmap_sem in read.
+  */
+ int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+                          unsigned long *pgt, int *dat_protection,
+                          int *fake)
+ {
+       unsigned long *table;
+       struct page *page;
+       int rc;
+       BUG_ON(!gmap_is_shadow(sg));
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+       if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
+               /* Shadow page tables are full pages (pte+pgste) */
+               page = pfn_to_page(*table >> PAGE_SHIFT);
+               *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
+               *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
+               *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
+               rc = 0;
+       } else  {
+               rc = -EAGAIN;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+ }
+ EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
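
A hedged sketch of how the lookup pairs with gmap_shadow_pgt() below during shadow fault handling: -EAGAIN from the lookup means the page table still has to be instantiated from the parent table the caller resolved. Helper name and parameters are illustrative:

static int example_ensure_shadow_pgt(struct gmap *sg, unsigned long saddr,
                                     unsigned long parent_pgt, int parent_fake)
{
        unsigned long pgt;
        int dat_protection, fake, rc;

        rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
        if (rc != -EAGAIN)
                return rc;      /* 0: a shadow page table already exists */
        /* not shadowed yet (or raced with unshadow): create it */
        return gmap_shadow_pgt(sg, saddr, parent_pgt, parent_fake);
}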
+ /**
+  * gmap_shadow_pgt - instantiate a shadow page table
+  * @sg: pointer to the shadow guest address space structure
+  * @saddr: faulting address in the shadow gmap
+  * @pgt: parent gmap address of the page table to get shadowed
+  * @fake: pgt references contiguous guest memory block, not a pgtable
+  *
+  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+  * shadow table structure is incomplete, -ENOMEM if out of memory and
+  * -EFAULT if an address in the parent gmap could not be resolved.
+  *
+  * Called with sg->mm->mmap_sem in read.
+  */
+ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+                   int fake)
+ {
+       unsigned long raddr, origin;
+       unsigned long *s_pgt, *table;
+       struct page *page;
+       int rc;
+       BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
+       /* Allocate a shadow page table */
+       page = page_table_alloc_pgste(sg->mm);
+       if (!page)
+               return -ENOMEM;
+       page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_pgt = (unsigned long *) page_to_phys(page);
+       /* Install shadow page table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _SEGMENT_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _SEGMENT_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
+                (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
+       list_add(&page->lru, &sg->pt_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_SEGMENT_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make pgt read-only in parent gmap page table (not the pgste) */
+       raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT;
+       origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
+       rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 1);
+               if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
+                             (unsigned long) s_pgt)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_SEGMENT_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_pgt(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+ out_free:
+       spin_unlock(&sg->guest_table_lock);
+       page_table_free_pgste(page);
+       return rc;
+ }
+ EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
+ /**
+  * gmap_shadow_page - create a shadow page mapping
+  * @sg: pointer to the shadow guest address space structure
+  * @saddr: faulting address in the shadow gmap
+  * @pte: pte in parent gmap address space to get shadowed
+  *
+  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+  * shadow table structure is incomplete, -ENOMEM if out of memory and
+  * -EFAULT if an address in the parent gmap could not be resolved.
+  *
+  * Called with sg->mm->mmap_sem in read.
+  */
+ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
+ {
+       struct gmap *parent;
+       struct gmap_rmap *rmap;
+       unsigned long vmaddr, paddr;
+       spinlock_t *ptl;
+       pte_t *sptep, *tptep;
+       int prot;
+       int rc;
+       BUG_ON(!gmap_is_shadow(sg));
+       parent = sg->parent;
+       prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
+       rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+       if (!rmap)
+               return -ENOMEM;
+       rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
+       while (1) {
+               paddr = pte_val(pte) & PAGE_MASK;
+               vmaddr = __gmap_translate(parent, paddr);
+               if (IS_ERR_VALUE(vmaddr)) {
+                       rc = vmaddr;
+                       break;
+               }
+               rc = radix_tree_preload(GFP_KERNEL);
+               if (rc)
+                       break;
+               rc = -EAGAIN;
+               sptep = gmap_pte_op_walk(parent, paddr, &ptl);
+               if (sptep) {
+                       spin_lock(&sg->guest_table_lock);
+                       /* Get page table pointer */
+                       tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
+                       if (!tptep) {
+                               spin_unlock(&sg->guest_table_lock);
+                               gmap_pte_op_end(ptl);
+                               radix_tree_preload_end();
+                               break;
+                       }
+                       rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
+                       if (rc > 0) {
+                               /* Success and a new mapping */
+                               gmap_insert_rmap(sg, vmaddr, rmap);
+                               rmap = NULL;
+                               rc = 0;
+                       }
+                       gmap_pte_op_end(ptl);
+                       spin_unlock(&sg->guest_table_lock);
+               }
+               radix_tree_preload_end();
+               if (!rc)
+                       break;
+               rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+               if (rc)
+                       break;
+       }
+       kfree(rmap);
+       return rc;
+ }
+ EXPORT_SYMBOL_GPL(gmap_shadow_page);
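
A shadow fault would typically end with a single gmap_shadow_page() call, passing the pte the caller obtained from walking the guest's own page table; a minimal sketch (the helper and the origin of guest_pte_val are assumptions):

static int example_shadow_one_page(struct gmap *sg, unsigned long saddr,
                                   unsigned long guest_pte_val)
{
        /* guest_pte_val was read from the guest page table by the caller */
        return gmap_shadow_page(sg, saddr, __pte(guest_pte_val));
        /* 0, -EAGAIN, -ENOMEM or -EFAULT as documented above */
}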
+ /**
+  * gmap_shadow_notify - handle notifications for shadow gmap
+  * @sg: pointer to the shadow guest address space structure
+  * @vmaddr: vm address of the changed parent page table entry
+  * @offset: guest address offset of the entry within its segment
+  * @pte: pointer to the invalidated page table entry
+  *
+  * Called with sg->parent->shadow_lock.
+  */
+ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
+                              unsigned long offset, pte_t *pte)
+ {
+       struct gmap_rmap *rmap, *rnext, *head;
+       unsigned long gaddr, start, end, bits, raddr;
+       unsigned long *table;
+       BUG_ON(!gmap_is_shadow(sg));
+       spin_lock(&sg->parent->guest_table_lock);
+       table = radix_tree_lookup(&sg->parent->host_to_guest,
+                                 vmaddr >> PMD_SHIFT);
+       gaddr = table ? __gmap_segment_gaddr(table) + offset : 0;
+       spin_unlock(&sg->parent->guest_table_lock);
+       if (!table)
+               return;
+       spin_lock(&sg->guest_table_lock);
+       if (sg->removed) {
+               spin_unlock(&sg->guest_table_lock);
+               return;
+       }
+       /* Check for top level table */
+       start = sg->orig_asce & _ASCE_ORIGIN;
+       end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096;
+       if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
+           gaddr < end) {
+               /* The complete shadow table has to go */
+               gmap_unshadow(sg);
+               spin_unlock(&sg->guest_table_lock);
+               list_del(&sg->list);
+               gmap_put(sg);
+               return;
+       }
+       /* Remove the page table tree for one specific entry */
+       head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12);
+       gmap_for_each_rmap_safe(rmap, rnext, head) {
+               bits = rmap->raddr & _SHADOW_RMAP_MASK;
+               raddr = rmap->raddr ^ bits;
+               switch (bits) {
+               case _SHADOW_RMAP_REGION1:
+                       gmap_unshadow_r2t(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_REGION2:
+                       gmap_unshadow_r3t(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_REGION3:
+                       gmap_unshadow_sgt(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_SEGMENT:
+                       gmap_unshadow_pgt(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_PGTABLE:
+                       gmap_unshadow_page(sg, raddr);
+                       break;
+               }
+               kfree(rmap);
+       }
+       spin_unlock(&sg->guest_table_lock);
+ }
+ /**
+  * ptep_notify - call all invalidation callbacks for a specific pte.
+  * @mm: pointer to the process mm_struct
+  * @addr: virtual address in the process address space
+  * @pte: pointer to the page table entry
+  * @bits: bits from the pgste that caused the notify call
+  *
+  * This function is assumed to be called with the page table lock held
+  * for the pte to notify.
+  */
+ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
+                pte_t *pte, unsigned long bits)
+ {
+       unsigned long offset, gaddr;
+       unsigned long *table;
+       struct gmap *gmap, *sg, *next;
+       offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
+       offset = offset * (4096 / sizeof(pte_t));
+       rcu_read_lock();
+       list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+               if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
+                       spin_lock(&gmap->shadow_lock);
+                       list_for_each_entry_safe(sg, next,
+                                                &gmap->children, list)
+                               gmap_shadow_notify(sg, vmaddr, offset, pte);
+                       spin_unlock(&gmap->shadow_lock);
+               }
+               if (!(bits & PGSTE_IN_BIT))
+                       continue;
+               spin_lock(&gmap->guest_table_lock);
+               table = radix_tree_lookup(&gmap->host_to_guest,
+                                         vmaddr >> PMD_SHIFT);
+               if (table)
+                       gaddr = __gmap_segment_gaddr(table) + offset;
+               spin_unlock(&gmap->guest_table_lock);
+               if (table)
+                       gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
+       }
+       rcu_read_unlock();
  }
  EXPORT_SYMBOL_GPL(ptep_notify);
  
diff --combined arch/s390/mm/pgalloc.c
@@@ -137,6 -137,29 +137,29 @@@ static inline unsigned int atomic_xor_b
        return new;
  }
  
+ #ifdef CONFIG_PGSTE
+ struct page *page_table_alloc_pgste(struct mm_struct *mm)
+ {
+       struct page *page;
+       unsigned long *table;
+       page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
+       if (page) {
+               table = (unsigned long *) page_to_phys(page);
+               clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+               clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
+       }
+       return page;
+ }
+ void page_table_free_pgste(struct page *page)
+ {
+       __free_page(page);
+ }
+ #endif /* CONFIG_PGSTE */
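
A pgste page table occupies a full 4K page: the lower half holds the 256 ptes, the upper half the matching pgstes, which is what the two clear_table() calls above initialize. The pgste slot of a pte is therefore always PTRS_PER_PTE entries above it, as the pgste_get_lock()/pgste_set_unlock() code in pgtable.c below relies on; a sketch of that addressing (helper name illustrative):

/* The pgste that belongs to a pte lives PTRS_PER_PTE entries above it. */
static inline unsigned long *example_pgste_of(pte_t *ptep)
{
        return (unsigned long *) (ptep + PTRS_PER_PTE);
}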
  /*
   * page table entry allocation/free routines.
   */
@@@ -149,7 -172,7 +172,7 @@@ unsigned long *page_table_alloc(struct 
        /* Try to get a fragment of a 4K page as a 2K page table */
        if (!mm_alloc_pgste(mm)) {
                table = NULL;
-               spin_lock_bh(&mm->context.list_lock);
+               spin_lock_bh(&mm->context.pgtable_lock);
                if (!list_empty(&mm->context.pgtable_list)) {
                        page = list_first_entry(&mm->context.pgtable_list,
                                                struct page, lru);
                                list_del(&page->lru);
                        }
                }
-               spin_unlock_bh(&mm->context.list_lock);
+               spin_unlock_bh(&mm->context.pgtable_lock);
                if (table)
                        return table;
        }
        /* Allocate a fresh page */
 -      page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
 +      page = alloc_page(GFP_KERNEL);
        if (!page)
                return NULL;
        if (!pgtable_page_ctor(page)) {
                /* Return the first 2K fragment of the page */
                atomic_set(&page->_mapcount, 1);
                clear_table(table, _PAGE_INVALID, PAGE_SIZE);
-               spin_lock_bh(&mm->context.list_lock);
+               spin_lock_bh(&mm->context.pgtable_lock);
                list_add(&page->lru, &mm->context.pgtable_list);
-               spin_unlock_bh(&mm->context.list_lock);
+               spin_unlock_bh(&mm->context.pgtable_lock);
        }
        return table;
  }
@@@ -203,13 -226,13 +226,13 @@@ void page_table_free(struct mm_struct *
        if (!mm_alloc_pgste(mm)) {
                /* Free 2K page table fragment of a 4K page */
                bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
-               spin_lock_bh(&mm->context.list_lock);
+               spin_lock_bh(&mm->context.pgtable_lock);
                mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
                if (mask & 3)
                        list_add(&page->lru, &mm->context.pgtable_list);
                else
                        list_del(&page->lru);
-               spin_unlock_bh(&mm->context.list_lock);
+               spin_unlock_bh(&mm->context.pgtable_lock);
                if (mask != 0)
                        return;
        }
@@@ -235,13 -258,13 +258,13 @@@ void page_table_free_rcu(struct mmu_gat
                return;
        }
        bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
-       spin_lock_bh(&mm->context.list_lock);
+       spin_lock_bh(&mm->context.pgtable_lock);
        mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
        if (mask & 3)
                list_add_tail(&page->lru, &mm->context.pgtable_list);
        else
                list_del(&page->lru);
-       spin_unlock_bh(&mm->context.list_lock);
+       spin_unlock_bh(&mm->context.pgtable_lock);
        table = (unsigned long *) (__pa(table) | (1U << bit));
        tlb_remove_table(tlb, table);
  }
diff --combined arch/s390/mm/pgtable.c
  static inline pte_t ptep_flush_direct(struct mm_struct *mm,
                                      unsigned long addr, pte_t *ptep)
  {
 -      int active, count;
        pte_t old;
  
        old = *ptep;
        if (unlikely(pte_val(old) & _PAGE_INVALID))
                return old;
 -      active = (mm == current->active_mm) ? 1 : 0;
 -      count = atomic_add_return(0x10000, &mm->context.attach_count);
 -      if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
 +      atomic_inc(&mm->context.flush_count);
 +      if (MACHINE_HAS_TLB_LC &&
            cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
                __ptep_ipte_local(addr, ptep);
        else
                __ptep_ipte(addr, ptep);
 -      atomic_sub(0x10000, &mm->context.attach_count);
 +      atomic_dec(&mm->context.flush_count);
        return old;
  }
  
  static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
                                    unsigned long addr, pte_t *ptep)
  {
 -      int active, count;
        pte_t old;
  
        old = *ptep;
        if (unlikely(pte_val(old) & _PAGE_INVALID))
                return old;
 -      active = (mm == current->active_mm) ? 1 : 0;
 -      count = atomic_add_return(0x10000, &mm->context.attach_count);
 -      if ((count & 0xffff) <= active) {
 +      atomic_inc(&mm->context.flush_count);
 +      if (cpumask_equal(&mm->context.cpu_attach_mask,
 +                        cpumask_of(smp_processor_id()))) {
                pte_val(*ptep) |= _PAGE_INVALID;
                mm->context.flush_mm = 1;
        } else
                __ptep_ipte(addr, ptep);
 -      atomic_sub(0x10000, &mm->context.attach_count);
 +      atomic_dec(&mm->context.flush_count);
        return old;
  }
  
@@@ -67,6 -70,7 +67,6 @@@ static inline pgste_t pgste_get_lock(pt
  #ifdef CONFIG_PGSTE
        unsigned long old;
  
 -      preempt_disable();
        asm(
                "       lg      %0,%2\n"
                "0:     lgr     %1,%0\n"
@@@ -89,6 -93,7 +89,6 @@@ static inline void pgste_set_unlock(pte
                : "=Q" (ptep[PTRS_PER_PTE])
                : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE])
                : "cc", "memory");
 -      preempt_enable();
  #endif
  }
  
@@@ -174,14 -179,17 +174,17 @@@ static inline pgste_t pgste_set_pte(pte
        return pgste;
  }
  
- static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
-                                       unsigned long addr,
-                                       pte_t *ptep, pgste_t pgste)
+ static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
+                                      unsigned long addr,
+                                      pte_t *ptep, pgste_t pgste)
  {
  #ifdef CONFIG_PGSTE
-       if (pgste_val(pgste) & PGSTE_IN_BIT) {
-               pgste_val(pgste) &= ~PGSTE_IN_BIT;
-               ptep_notify(mm, addr, ptep);
+       unsigned long bits;
+       bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
+       if (bits) {
+               pgste_val(pgste) ^= bits;
+               ptep_notify(mm, addr, ptep, bits);
        }
  #endif
        return pgste;
@@@ -194,7 -202,7 +197,7 @@@ static inline pgste_t ptep_xchg_start(s
  
        if (mm_has_pgste(mm)) {
                pgste = pgste_get_lock(ptep);
-               pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+               pgste = pgste_pte_notify(mm, addr, ptep, pgste);
        }
        return pgste;
  }
@@@ -225,11 -233,9 +228,11 @@@ pte_t ptep_xchg_direct(struct mm_struc
        pgste_t pgste;
        pte_t old;
  
 +      preempt_disable();
        pgste = ptep_xchg_start(mm, addr, ptep);
        old = ptep_flush_direct(mm, addr, ptep);
        ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
 +      preempt_enable();
        return old;
  }
  EXPORT_SYMBOL(ptep_xchg_direct);
@@@ -240,11 -246,9 +243,11 @@@ pte_t ptep_xchg_lazy(struct mm_struct *
        pgste_t pgste;
        pte_t old;
  
 +      preempt_disable();
        pgste = ptep_xchg_start(mm, addr, ptep);
        old = ptep_flush_lazy(mm, addr, ptep);
        ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
 +      preempt_enable();
        return old;
  }
  EXPORT_SYMBOL(ptep_xchg_lazy);
@@@ -255,7 -259,6 +258,7 @@@ pte_t ptep_modify_prot_start(struct mm_
        pgste_t pgste;
        pte_t old;
  
 +      preempt_disable();
        pgste = ptep_xchg_start(mm, addr, ptep);
        old = ptep_flush_lazy(mm, addr, ptep);
        if (mm_has_pgste(mm)) {
@@@ -279,13 -282,13 +282,13 @@@ void ptep_modify_prot_commit(struct mm_
        } else {
                *ptep = pte;
        }
 +      preempt_enable();
  }
  EXPORT_SYMBOL(ptep_modify_prot_commit);
  
  static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
                                      unsigned long addr, pmd_t *pmdp)
  {
 -      int active, count;
        pmd_t old;
  
        old = *pmdp;
                __pmdp_csp(pmdp);
                return old;
        }
 -      active = (mm == current->active_mm) ? 1 : 0;
 -      count = atomic_add_return(0x10000, &mm->context.attach_count);
 -      if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
 +      atomic_inc(&mm->context.flush_count);
 +      if (MACHINE_HAS_TLB_LC &&
            cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
                __pmdp_idte_local(addr, pmdp);
        else
                __pmdp_idte(addr, pmdp);
 -      atomic_sub(0x10000, &mm->context.attach_count);
 +      atomic_dec(&mm->context.flush_count);
        return old;
  }
  
  static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
                                    unsigned long addr, pmd_t *pmdp)
  {
 -      int active, count;
        pmd_t old;
  
        old = *pmdp;
        if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
                return old;
 -      active = (mm == current->active_mm) ? 1 : 0;
 -      count = atomic_add_return(0x10000, &mm->context.attach_count);
 -      if ((count & 0xffff) <= active) {
 +      atomic_inc(&mm->context.flush_count);
 +      if (cpumask_equal(&mm->context.cpu_attach_mask,
 +                        cpumask_of(smp_processor_id()))) {
                pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID;
                mm->context.flush_mm = 1;
        } else if (MACHINE_HAS_IDTE)
                __pmdp_idte(addr, pmdp);
        else
                __pmdp_csp(pmdp);
 -      atomic_sub(0x10000, &mm->context.attach_count);
 +      atomic_dec(&mm->context.flush_count);
        return old;
  }
  
@@@ -331,10 -336,8 +334,10 @@@ pmd_t pmdp_xchg_direct(struct mm_struc
  {
        pmd_t old;
  
 +      preempt_disable();
        old = pmdp_flush_direct(mm, addr, pmdp);
        *pmdp = new;
 +      preempt_enable();
        return old;
  }
  EXPORT_SYMBOL(pmdp_xchg_direct);
@@@ -344,53 -347,12 +347,53 @@@ pmd_t pmdp_xchg_lazy(struct mm_struct *
  {
        pmd_t old;
  
 +      preempt_disable();
        old = pmdp_flush_lazy(mm, addr, pmdp);
        *pmdp = new;
 +      preempt_enable();
        return old;
  }
  EXPORT_SYMBOL(pmdp_xchg_lazy);
  
 +static inline pud_t pudp_flush_direct(struct mm_struct *mm,
 +                                    unsigned long addr, pud_t *pudp)
 +{
 +      pud_t old;
 +
 +      old = *pudp;
 +      if (pud_val(old) & _REGION_ENTRY_INVALID)
 +              return old;
 +      if (!MACHINE_HAS_IDTE) {
 +              /*
 +               * Invalid bit position is the same for pmd and pud, so we can
 +               * re-use __pmdp_csp() here
 +               */
 +              __pmdp_csp((pmd_t *) pudp);
 +              return old;
 +      }
 +      atomic_inc(&mm->context.flush_count);
 +      if (MACHINE_HAS_TLB_LC &&
 +          cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
 +              __pudp_idte_local(addr, pudp);
 +      else
 +              __pudp_idte(addr, pudp);
 +      atomic_dec(&mm->context.flush_count);
 +      return old;
 +}
 +
 +pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
 +                     pud_t *pudp, pud_t new)
 +{
 +      pud_t old;
 +
 +      preempt_disable();
 +      old = pudp_flush_direct(mm, addr, pudp);
 +      *pudp = new;
 +      preempt_enable();
 +      return old;
 +}
 +EXPORT_SYMBOL(pudp_xchg_direct);
 +
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
@@@ -439,26 -401,106 +442,110 @@@ void ptep_set_pte_at(struct mm_struct *
        pgste_t pgste;
  
        /* the mm_has_pgste() check is done in set_pte_at() */
 +      preempt_disable();
        pgste = pgste_get_lock(ptep);
        pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
        pgste_set_key(ptep, pgste, entry, mm);
        pgste = pgste_set_pte(ptep, pgste, entry);
        pgste_set_unlock(ptep, pgste);
 +      preempt_enable();
  }
  
  void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
  {
        pgste_t pgste;
  
 +      preempt_disable();
        pgste = pgste_get_lock(ptep);
        pgste_val(pgste) |= PGSTE_IN_BIT;
        pgste_set_unlock(ptep, pgste);
 +      preempt_enable();
  }
  
+ /**
+  * ptep_force_prot - change access rights of a locked pte
+  * @mm: pointer to the process mm_struct
+  * @addr: virtual address in the guest address space
+  * @ptep: pointer to the page table entry
+  * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
+  * @bit: pgste bit to set (e.g. for notification)
+  *
+  * Returns 0 if the access rights were changed and -EAGAIN if the current
+  * and requested access rights are incompatible.
+  */
+ int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
+                   pte_t *ptep, int prot, unsigned long bit)
+ {
+       pte_t entry;
+       pgste_t pgste;
+       int pte_i, pte_p;
+       pgste = pgste_get_lock(ptep);
+       entry = *ptep;
+       /* Check pte entry after all locks have been acquired */
+       pte_i = pte_val(entry) & _PAGE_INVALID;
+       pte_p = pte_val(entry) & _PAGE_PROTECT;
+       if ((pte_i && (prot != PROT_NONE)) ||
+           (pte_p && (prot & PROT_WRITE))) {
+               pgste_set_unlock(ptep, pgste);
+               return -EAGAIN;
+       }
+       /* Change access rights and set pgste bit */
+       if (prot == PROT_NONE && !pte_i) {
+               ptep_flush_direct(mm, addr, ptep);
+               pgste = pgste_update_all(entry, pgste, mm);
+               pte_val(entry) |= _PAGE_INVALID;
+       }
+       if (prot == PROT_READ && !pte_p) {
+               ptep_flush_direct(mm, addr, ptep);
+               pte_val(entry) &= ~_PAGE_INVALID;
+               pte_val(entry) |= _PAGE_PROTECT;
+       }
+       pgste_val(pgste) |= bit;
+       pgste = pgste_set_pte(ptep, pgste, entry);
+       pgste_set_unlock(ptep, pgste);
+       return 0;
+ }
+ 
+ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+                   pte_t *sptep, pte_t *tptep, pte_t pte)
+ {
+       pgste_t spgste, tpgste;
+       pte_t spte, tpte;
+       int rc = -EAGAIN;
+       if (!(pte_val(*tptep) & _PAGE_INVALID))
+               return 0;       /* already shadowed */
+       spgste = pgste_get_lock(sptep);
+       spte = *sptep;
+       if (!(pte_val(spte) & _PAGE_INVALID) &&
+           !((pte_val(spte) & _PAGE_PROTECT) &&
+             !(pte_val(pte) & _PAGE_PROTECT))) {
+               pgste_val(spgste) |= PGSTE_VSIE_BIT;
+               tpgste = pgste_get_lock(tptep);
+               pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
+                               (pte_val(pte) & _PAGE_PROTECT);
+               /* don't touch the storage key - it belongs to parent pgste */
+               tpgste = pgste_set_pte(tptep, tpgste, tpte);
+               pgste_set_unlock(tptep, tpgste);
+               rc = 1;
+       }
+       pgste_set_unlock(sptep, spgste);
+       return rc;
+ }
+ 
+ void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
+ {
+       pgste_t pgste;
+       pgste = pgste_get_lock(ptep);
+       /* notifier is called by the caller */
+       ptep_flush_direct(mm, saddr, ptep);
+       /* don't touch the storage key - it belongs to parent pgste */
+       pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
+       pgste_set_unlock(ptep, pgste);
+ }
+ 
  static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
  {
        if (!non_swap_entry(entry))
@@@ -479,11 -521,10 +566,11 @@@ void ptep_zap_unused(struct mm_struct *
        pte_t pte;
  
        /* Zap unused and logically-zero pages */
 +      preempt_disable();
        pgste = pgste_get_lock(ptep);
        pgstev = pgste_val(pgste);
        pte = *ptep;
 -      if (pte_swap(pte) &&
 +      if (!reset && pte_swap(pte) &&
            ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
             (pgstev & _PGSTE_GPS_ZERO))) {
                ptep_zap_swap_entry(mm, pte_to_swp_entry(pte));
        if (reset)
                pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
        pgste_set_unlock(ptep, pgste);
 +      preempt_enable();
  }
  
  void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
        pgste_t pgste;
  
        /* Clear storage key */
 +      preempt_disable();
        pgste = pgste_get_lock(ptep);
        pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
                              PGSTE_GR_BIT | PGSTE_GC_BIT);
        if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
                page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
        pgste_set_unlock(ptep, pgste);
 +      preempt_enable();
  }
  
  /*
@@@ -532,7 -570,7 +619,7 @@@ bool test_and_clear_guest_dirty(struct 
        pgste_val(pgste) &= ~PGSTE_UC_BIT;
        pte = *ptep;
        if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
-               pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+               pgste = pgste_pte_notify(mm, addr, ptep, pgste);
                __ptep_ipte(addr, ptep);
                if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
                        pte_val(pte) |= _PAGE_PROTECT;
@@@ -555,12 -593,9 +642,9 @@@ int set_guest_storage_key(struct mm_str
        pgste_t old, new;
        pte_t *ptep;
  
-       down_read(&mm->mmap_sem);
        ptep = get_locked_pte(mm, addr, &ptl);
-       if (unlikely(!ptep)) {
-               up_read(&mm->mmap_sem);
+       if (unlikely(!ptep))
                return -EFAULT;
-       }
  
        new = old = pgste_get_lock(ptep);
        pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
  
        pgste_set_unlock(ptep, new);
        pte_unmap_unlock(ptep, ptl);
-       up_read(&mm->mmap_sem);
        return 0;
  }
  EXPORT_SYMBOL(set_guest_storage_key);
  
- unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
+ /**
+  * Conditionally set a guest storage key (handling csske).
+  * oldkey will be updated when either mr or mc is set and a pointer is given.
+  *
+  * Returns 0 if the guest's storage key update wasn't necessary, 1 if the guest
+  * storage key was updated and -EFAULT on access errors.
+  */
+ int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+                              unsigned char key, unsigned char *oldkey,
+                              bool nq, bool mr, bool mc)
+ {
+       unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
+       int rc;
+       /* we can drop the pgste lock between getting and setting the key */
+       if (mr | mc) {
+               rc = get_guest_storage_key(current->mm, addr, &tmp);
+               if (rc)
+                       return rc;
+               if (oldkey)
+                       *oldkey = tmp;
+               if (!mr)
+                       mask |= _PAGE_REFERENCED;
+               if (!mc)
+                       mask |= _PAGE_CHANGED;
+               if (!((tmp ^ key) & mask))
+                       return 0;
+       }
+       rc = set_guest_storage_key(current->mm, addr, key, nq);
+       return rc < 0 ? rc : 1;
+ }
+ EXPORT_SYMBOL(cond_set_guest_storage_key);
+ 
+ /**
+  * Reset a guest reference bit (rrbe), returning the reference and changed bit.
+  *
+  * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
+  */
+ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
  {
-       unsigned char key;
        spinlock_t *ptl;
-       pgste_t pgste;
+       pgste_t old, new;
        pte_t *ptep;
+       int cc = 0;
  
-       down_read(&mm->mmap_sem);
        ptep = get_locked_pte(mm, addr, &ptl);
-       if (unlikely(!ptep)) {
-               up_read(&mm->mmap_sem);
+       if (unlikely(!ptep))
                return -EFAULT;
-       }
-       pgste = pgste_get_lock(ptep);
  
-       if (pte_val(*ptep) & _PAGE_INVALID) {
-               key  = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
-               key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
-               key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
-               key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
-       } else {
-               key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+       new = old = pgste_get_lock(ptep);
+       /* Reset guest reference bit only */
+       pgste_val(new) &= ~PGSTE_GR_BIT;
  
-               /* Reflect guest's logical view, not physical */
-               if (pgste_val(pgste) & PGSTE_GR_BIT)
-                       key |= _PAGE_REFERENCED;
-               if (pgste_val(pgste) & PGSTE_GC_BIT)
-                       key |= _PAGE_CHANGED;
+       if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+               cc = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
+               /* Merge real referenced bit into host-set */
+               pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT;
        }
+       /* Reflect guest's logical view, not physical */
+       cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
+       /* Changing the guest storage key is considered a change of the page */
+       if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
+               pgste_val(new) |= PGSTE_UC_BIT;
+       pgste_set_unlock(ptep, new);
+       pte_unmap_unlock(ptep, ptl);
+       return 0;
+ }
+ EXPORT_SYMBOL(reset_guest_reference_bit);
+ 
+ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+                         unsigned char *key)
+ {
+       spinlock_t *ptl;
+       pgste_t pgste;
+       pte_t *ptep;
  
+       ptep = get_locked_pte(mm, addr, &ptl);
+       if (unlikely(!ptep))
+               return -EFAULT;
+       pgste = pgste_get_lock(ptep);
+       *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
+       if (!(pte_val(*ptep) & _PAGE_INVALID))
+               *key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+       /* Reflect guest's logical view, not physical */
+       *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
        pgste_set_unlock(ptep, pgste);
        pte_unmap_unlock(ptep, ptl);
-       up_read(&mm->mmap_sem);
-       return key;
+       return 0;
  }
  EXPORT_SYMBOL(get_guest_storage_key);
  #endif
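
For readers following the new s390 storage-key helpers above, here is a minimal standalone sketch of the conditional-SSKE decision they implement: when the MR/MC suppression flags are set, the reference/change bits are masked out before comparing the old and new key, and a redundant update is skipped. The demo_* names, the bit macros and the sample key values are illustrative only (they mirror the z/Architecture storage-key byte layout: ACC 0xf0, F 0x08, R 0x04, C 0x02); they are not code from this patch.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_ACC_BITS	0xf0	/* access-control bits of the storage key */
#define DEMO_FP_BIT	0x08	/* fetch-protection bit */
#define DEMO_REF_BIT	0x04	/* reference bit */
#define DEMO_CHG_BIT	0x02	/* change bit */

/* Mirrors the "!((tmp ^ key) & mask)" test in cond_set_guest_storage_key() */
static bool demo_key_update_needed(unsigned char old_key, unsigned char new_key,
				   bool mr, bool mc)
{
	unsigned char mask = DEMO_ACC_BITS | DEMO_FP_BIT;

	if (!mr)
		mask |= DEMO_REF_BIT;
	if (!mc)
		mask |= DEMO_CHG_BIT;
	return ((old_key ^ new_key) & mask) != 0;
}

int main(void)
{
	/* keys differ only in the reference bit (0x34 vs 0x30) */
	printf("MR set:   update needed = %d\n",
	       demo_key_update_needed(0x34, 0x30, true, false));
	printf("MR clear: update needed = %d\n",
	       demo_key_update_needed(0x34, 0x30, false, false));
	return 0;
}

Note also that, with the down_read()/up_read() pair dropped from set_guest_storage_key() and get_guest_storage_key() in this diff, callers of these helpers are expected to hold mmap_sem themselves.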
@@@ -27,7 -27,6 +27,7 @@@
  #include <linux/irqbypass.h>
  #include <linux/hyperv.h>
  
 +#include <asm/apic.h>
  #include <asm/pvclock-abi.h>
  #include <asm/desc.h>
  #include <asm/mtrr.h>
@@@ -35,8 -34,9 +35,9 @@@
  #include <asm/asm.h>
  #include <asm/kvm_page_track.h>
  
- #define KVM_MAX_VCPUS 255
- #define KVM_SOFT_MAX_VCPUS 160
+ #define KVM_MAX_VCPUS 288
+ #define KVM_SOFT_MAX_VCPUS 240
+ #define KVM_MAX_VCPU_ID 1023
  #define KVM_USER_MEM_SLOTS 509
  /* memory slots that are not exposed to userspace */
  #define KVM_PRIVATE_MEM_SLOTS 3
@@@ -599,6 -599,7 +600,7 @@@ struct kvm_vcpu_arch 
        u64 mcg_cap;
        u64 mcg_status;
        u64 mcg_ctl;
+       u64 mcg_ext_ctl;
        u64 *mce_banks;
  
        /* Cache MMIO info */
@@@ -682,9 -683,12 +684,12 @@@ struct kvm_arch_memory_slot 
  struct kvm_apic_map {
        struct rcu_head rcu;
        u8 mode;
-       struct kvm_lapic *phys_map[256];
-       /* first index is cluster id second is cpu id in a cluster */
-       struct kvm_lapic *logical_map[16][16];
+       u32 max_apic_id;
+       union {
+               struct kvm_lapic *xapic_flat_map[8];
+               struct kvm_lapic *xapic_cluster_map[16][4];
+       };
+       struct kvm_lapic *phys_map[];
  };
  
  /* Hyper-V emulation context */
@@@ -779,6 -783,9 +784,9 @@@ struct kvm_arch 
        u32 ldr_mode;
        struct page *avic_logical_id_table_page;
        struct page *avic_physical_id_table_page;
+       bool x2apic_format;
+       bool x2apic_broadcast_quirk_disabled;
  };
  
  struct kvm_vm_stat {
@@@ -1006,6 -1013,11 +1014,11 @@@ struct kvm_x86_ops 
        int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
                              uint32_t guest_irq, bool set);
        void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
+       int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc);
+       void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
+       void (*setup_mce)(struct kvm_vcpu *vcpu);
  };
  
  struct kvm_arch_async_pf {
@@@ -1026,7 -1038,7 +1039,7 @@@ void kvm_mmu_setup(struct kvm_vcpu *vcp
  void kvm_mmu_init_vm(struct kvm *kvm);
  void kvm_mmu_uninit_vm(struct kvm *kvm);
  void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask);
+               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask);
  
  void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
  void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@@ -1077,6 -1089,10 +1090,10 @@@ extern u32  kvm_max_guest_tsc_khz
  extern u8   kvm_tsc_scaling_ratio_frac_bits;
  /* maximum allowed value of TSC scaling ratio */
  extern u64  kvm_max_tsc_scaling_ratio;
+ /* 1ull << kvm_tsc_scaling_ratio_frac_bits */
+ extern u64  kvm_default_tsc_scaling_ratio;
+ extern u64 kvm_mce_cap_supported;
  
  enum emulation_result {
        EMULATE_DONE,         /* no further processing */
@@@ -1352,7 -1368,7 +1369,7 @@@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *v
  bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
                             struct kvm_vcpu **dest_vcpu);
  
- void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+ void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
                     struct kvm_lapic_irq *irq);
  
  static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
@@@ -1369,14 -1385,4 +1386,14 @@@ static inline void kvm_arch_vcpu_unbloc
  
  static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
  
 +static inline int kvm_cpu_get_apicid(int mps_cpu)
 +{
 +#ifdef CONFIG_X86_LOCAL_APIC
 +      return __default_cpu_present_to_apicid(mps_cpu);
 +#else
 +      WARN_ON_ONCE(1);
 +      return BAD_APICID;
 +#endif
 +}
 +
  #endif /* _ASM_X86_KVM_HOST_H */
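
The kvm_apic_map changes above replace the fixed 256-entry physical map with a flexible array sized by max_apic_id, which is needed once KVM_MAX_VCPUS grows to 288 and x2APIC IDs can exceed 255; in x2APIC mode the logical lookup simply indexes into phys_map, while xAPIC keeps its small flat/cluster tables in the union. Below is a standalone sketch of the resulting allocation-size rule; the demo_* types and the sample ID are made up, only the arithmetic matches the patch.

#include <stddef.h>
#include <stdio.h>

struct demo_lapic;			/* stand-in for struct kvm_lapic */

struct demo_apic_map {
	unsigned int max_apic_id;
	struct demo_lapic *phys_map[];	/* flexible array, indexed by APIC ID */
};

int main(void)
{
	unsigned int max_apic_id = 287;	/* e.g. a guest whose highest APIC ID is 287 */
	size_t sz = sizeof(struct demo_apic_map) +
		    sizeof(struct demo_lapic *) * ((size_t)max_apic_id + 1);

	printf("map covering %u APIC IDs: %zu bytes\n", max_apic_id + 1, sz);
	return 0;
}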
diff --combined arch/x86/kvm/iommu.c
  
  #include <linux/list.h>
  #include <linux/kvm_host.h>
 -#include <linux/module.h>
 +#include <linux/moduleparam.h>
  #include <linux/pci.h>
  #include <linux/stat.h>
- #include <linux/dmar.h>
  #include <linux/iommu.h>
- #include <linux/intel-iommu.h>
  #include "assigned-dev.h"
  
  static bool allow_unsafe_assigned_interrupts;
diff --combined arch/x86/kvm/lapic.c
@@@ -25,7 -25,7 +25,7 @@@
  #include <linux/smp.h>
  #include <linux/hrtimer.h>
  #include <linux/io.h>
 -#include <linux/module.h>
 +#include <linux/export.h>
  #include <linux/math64.h>
  #include <linux/slab.h>
  #include <asm/processor.h>
@@@ -115,26 -115,43 +115,43 @@@ static inline int apic_enabled(struct k
        (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
         APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
  
- /* The logical map is definitely wrong if we have multiple
-  * modes at the same time.  (Physical map is always right.)
-  */
- static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map)
- {
-       return !(map->mode & (map->mode - 1));
+ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
+               u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
+       switch (map->mode) {
+       case KVM_APIC_MODE_X2APIC: {
+               u32 offset = (dest_id >> 16) * 16;
+               u32 max_apic_id = map->max_apic_id;
+               if (offset <= max_apic_id) {
+                       u8 cluster_size = min(max_apic_id - offset + 1, 16U);
+                       *cluster = &map->phys_map[offset];
+                       *mask = dest_id & (0xffff >> (16 - cluster_size));
+               } else {
+                       *mask = 0;
+               }
+               return true;
+               }
+       case KVM_APIC_MODE_XAPIC_FLAT:
+               *cluster = map->xapic_flat_map;
+               *mask = dest_id & 0xff;
+               return true;
+       case KVM_APIC_MODE_XAPIC_CLUSTER:
+               *cluster = map->xapic_cluster_map[dest_id >> 4];
+               *mask = dest_id & 0xf;
+               return true;
+       default:
+               /* Not optimized. */
+               return false;
+       }
  }
  
- static inline void
- apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid)
+ static void kvm_apic_map_free(struct rcu_head *rcu)
  {
-       unsigned lid_bits;
+       struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu);
  
-       BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER !=  4);
-       BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT    !=  8);
-       BUILD_BUG_ON(KVM_APIC_MODE_X2APIC        != 16);
-       lid_bits = map->mode;
-       *cid = dest_id >> lid_bits;
-       *lid = dest_id & ((1 << lid_bits) - 1);
+       kvfree(map);
  }
  
  static void recalculate_apic_map(struct kvm *kvm)
        struct kvm_apic_map *new, *old = NULL;
        struct kvm_vcpu *vcpu;
        int i;
-       new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
+       u32 max_id = 255;
  
        mutex_lock(&kvm->arch.apic_map_lock);
  
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               if (kvm_apic_present(vcpu))
+                       max_id = max(max_id, kvm_apic_id(vcpu->arch.apic));
+       new = kvm_kvzalloc(sizeof(struct kvm_apic_map) +
+                          sizeof(struct kvm_lapic *) * ((u64)max_id + 1));
        if (!new)
                goto out;
  
+       new->max_apic_id = max_id;
        kvm_for_each_vcpu(i, vcpu, kvm) {
                struct kvm_lapic *apic = vcpu->arch.apic;
-               u16 cid, lid;
+               struct kvm_lapic **cluster;
+               u16 mask;
                u32 ldr, aid;
  
                if (!kvm_apic_present(vcpu))
                aid = kvm_apic_id(apic);
                ldr = kvm_lapic_get_reg(apic, APIC_LDR);
  
-               if (aid < ARRAY_SIZE(new->phys_map))
+               if (aid <= new->max_apic_id)
                        new->phys_map[aid] = apic;
  
                if (apic_x2apic_mode(apic)) {
                                new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
                }
  
-               if (!kvm_apic_logical_map_valid(new))
+               if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))
                        continue;
  
-               apic_logical_id(new, ldr, &cid, &lid);
-               if (lid && cid < ARRAY_SIZE(new->logical_map))
-                       new->logical_map[cid][ffs(lid) - 1] = apic;
+               if (mask)
+                       cluster[ffs(mask) - 1] = apic;
        }
  out:
        old = rcu_dereference_protected(kvm->arch.apic_map,
        mutex_unlock(&kvm->arch.apic_map_lock);
  
        if (old)
-               kfree_rcu(old, rcu);
+               call_rcu(&old->rcu, kvm_apic_map_free);
  
        kvm_make_scan_ioapic_request(kvm);
  }
@@@ -210,7 -234,7 +234,7 @@@ static inline void apic_set_spiv(struc
        }
  }
  
- static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
+ static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
  {
        kvm_lapic_set_reg(apic, APIC_ID, id << 24);
        recalculate_apic_map(apic->vcpu->kvm);
@@@ -222,11 -246,11 +246,11 @@@ static inline void kvm_apic_set_ldr(str
        recalculate_apic_map(apic->vcpu->kvm);
  }
  
- static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id)
+ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
  {
        u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
  
-       kvm_lapic_set_reg(apic, APIC_ID, id << 24);
+       kvm_lapic_set_reg(apic, APIC_ID, id);
        kvm_lapic_set_reg(apic, APIC_LDR, ldr);
        recalculate_apic_map(apic->vcpu->kvm);
  }
@@@ -599,17 -623,30 +623,30 @@@ static bool kvm_apic_match_logical_addr
        }
  }
  
- /* KVM APIC implementation has two quirks
-  *  - dest always begins at 0 while xAPIC MDA has offset 24,
-  *  - IOxAPIC messages have to be delivered (directly) to x2APIC.
+ /* The KVM local APIC implementation has two quirks:
+  *
+  *  - the xAPIC MDA stores the destination at bits 24-31, while this
+  *    is not true of struct kvm_lapic_irq's dest_id field.  This is
+  *    just a quirk in the API and is not problematic.
+  *
+  *  - in-kernel IOAPIC messages have to be delivered directly to
+  *    x2APIC, because the kernel does not support interrupt remapping.
+  *    In order to support broadcast without interrupt remapping, x2APIC
+  *    rewrites the destination of non-IPI messages from APIC_BROADCAST
+  *    to X2APIC_BROADCAST.
+  *
+  * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API.  This is
+  * important when userspace wants to use x2APIC-format MSIs, because
+  * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7".
   */
- static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source,
-                                               struct kvm_lapic *target)
+ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
+               struct kvm_lapic *source, struct kvm_lapic *target)
  {
        bool ipi = source != NULL;
        bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
  
-       if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda)
+       if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
+           !ipi && dest_id == APIC_BROADCAST && x2apic_mda)
                return X2APIC_BROADCAST;
  
        return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id);
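
As a standalone illustration of the MDA rules spelled out in the comment above: xAPIC destinations live in bits 31-24 of the MDA, x2APIC uses the raw 32-bit ID, and a non-IPI 0xff is rewritten to the x2APIC broadcast value only while the quirk is enabled. The demo_* names and constants are illustrative stand-ins for the APIC definitions, not the kernel's code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_APIC_BROADCAST	0xffU
#define DEMO_X2APIC_BROADCAST	0xffffffffU

static uint32_t demo_apic_mda(uint32_t dest_id, bool ipi, bool x2apic_mda,
			      bool broadcast_quirk_disabled)
{
	if (!broadcast_quirk_disabled &&
	    !ipi && dest_id == DEMO_APIC_BROADCAST && x2apic_mda)
		return DEMO_X2APIC_BROADCAST;

	return x2apic_mda ? dest_id : dest_id << 24;	/* xAPIC: bits 31-24 */
}

int main(void)
{
	/* IOAPIC message to 0xff is treated as broadcast while the quirk is on */
	printf("0x%x\n", (unsigned)demo_apic_mda(0xff, false, true, false));
	/* with KVM_CAP_X2APIC_API, 0xff means x2APIC cluster 0, CPUs 0-7 */
	printf("0x%x\n", (unsigned)demo_apic_mda(0xff, false, true, true));
	return 0;
}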
@@@ -619,7 -656,7 +656,7 @@@ bool kvm_apic_match_dest(struct kvm_vcp
                           int short_hand, unsigned int dest, int dest_mode)
  {
        struct kvm_lapic *target = vcpu->arch.apic;
-       u32 mda = kvm_apic_mda(dest, source, target);
+       u32 mda = kvm_apic_mda(vcpu, dest, source, target);
  
        apic_debug("target %p, source %p, dest 0x%x, "
                   "dest_mode 0x%x, short_hand 0x%x\n",
@@@ -671,102 -708,126 +708,126 @@@ static void kvm_apic_disabled_lapic_fou
        }
  }
  
- bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
-               struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+ static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src,
+               struct kvm_lapic_irq *irq, struct kvm_apic_map *map)
  {
-       struct kvm_apic_map *map;
-       unsigned long bitmap = 1;
-       struct kvm_lapic **dst;
-       int i;
-       bool ret, x2apic_ipi;
+       if (kvm->arch.x2apic_broadcast_quirk_disabled) {
+               if ((irq->dest_id == APIC_BROADCAST &&
+                               map->mode != KVM_APIC_MODE_X2APIC))
+                       return true;
+               if (irq->dest_id == X2APIC_BROADCAST)
+                       return true;
+       } else {
+               bool x2apic_ipi = src && *src && apic_x2apic_mode(*src);
+               if (irq->dest_id == (x2apic_ipi ?
+                                    X2APIC_BROADCAST : APIC_BROADCAST))
+                       return true;
+       }
  
-       *r = -1;
+       return false;
+ }
  
-       if (irq->shorthand == APIC_DEST_SELF) {
-               *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
-               return true;
-       }
+ /* Return true if the interrupt can be handled by using *bitmap as index mask
+  * for valid destinations in *dst array.
+  * Return false if kvm_apic_map_get_dest_lapic did nothing useful.
+  * Note: we may have zero kvm_lapic destinations when we return true, which
+  * means that the interrupt should be dropped.  In this case, *bitmap would be
+  * zero and *dst undefined.
+  */
+ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
+               struct kvm_lapic **src, struct kvm_lapic_irq *irq,
+               struct kvm_apic_map *map, struct kvm_lapic ***dst,
+               unsigned long *bitmap)
+ {
+       int i, lowest;
  
-       if (irq->shorthand)
+       if (irq->shorthand == APIC_DEST_SELF && src) {
+               *dst = src;
+               *bitmap = 1;
+               return true;
+       } else if (irq->shorthand)
                return false;
  
-       x2apic_ipi = src && apic_x2apic_mode(src);
-       if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
+       if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map))
                return false;
  
-       ret = true;
-       rcu_read_lock();
-       map = rcu_dereference(kvm->arch.apic_map);
-       if (!map) {
-               ret = false;
-               goto out;
+       if (irq->dest_mode == APIC_DEST_PHYSICAL) {
+               if (irq->dest_id > map->max_apic_id) {
+                       *bitmap = 0;
+               } else {
+                       *dst = &map->phys_map[irq->dest_id];
+                       *bitmap = 1;
+               }
+               return true;
        }
  
-       if (irq->dest_mode == APIC_DEST_PHYSICAL) {
-               if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
-                       goto out;
+       *bitmap = 0;
+       if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst,
+                               (u16 *)bitmap))
+               return false;
  
-               dst = &map->phys_map[irq->dest_id];
-       } else {
-               u16 cid;
+       if (!kvm_lowest_prio_delivery(irq))
+               return true;
  
-               if (!kvm_apic_logical_map_valid(map)) {
-                       ret = false;
-                       goto out;
+       if (!kvm_vector_hashing_enabled()) {
+               lowest = -1;
+               for_each_set_bit(i, bitmap, 16) {
+                       if (!(*dst)[i])
+                               continue;
+                       if (lowest < 0)
+                               lowest = i;
+                       else if (kvm_apic_compare_prio((*dst)[i]->vcpu,
+                                               (*dst)[lowest]->vcpu) < 0)
+                               lowest = i;
                }
+       } else {
+               if (!*bitmap)
+                       return true;
  
-               apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
+               lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap),
+                               bitmap, 16);
  
-               if (cid >= ARRAY_SIZE(map->logical_map))
-                       goto out;
+               if (!(*dst)[lowest]) {
+                       kvm_apic_disabled_lapic_found(kvm);
+                       *bitmap = 0;
+                       return true;
+               }
+       }
  
-               dst = map->logical_map[cid];
+       *bitmap = (lowest >= 0) ? 1 << lowest : 0;
  
-               if (!kvm_lowest_prio_delivery(irq))
-                       goto set_irq;
+       return true;
+ }
  
-               if (!kvm_vector_hashing_enabled()) {
-                       int l = -1;
-                       for_each_set_bit(i, &bitmap, 16) {
-                               if (!dst[i])
-                                       continue;
-                               if (l < 0)
-                                       l = i;
-                               else if (kvm_apic_compare_prio(dst[i]->vcpu,
-                                                       dst[l]->vcpu) < 0)
-                                       l = i;
-                       }
-                       bitmap = (l >= 0) ? 1 << l : 0;
-               } else {
-                       int idx;
-                       unsigned int dest_vcpus;
+ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+               struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+ {
+       struct kvm_apic_map *map;
+       unsigned long bitmap;
+       struct kvm_lapic **dst = NULL;
+       int i;
+       bool ret;
  
-                       dest_vcpus = hweight16(bitmap);
-                       if (dest_vcpus == 0)
-                               goto out;
+       *r = -1;
  
-                       idx = kvm_vector_to_index(irq->vector,
-                               dest_vcpus, &bitmap, 16);
+       if (irq->shorthand == APIC_DEST_SELF) {
+               *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
+               return true;
+       }
  
-                       if (!dst[idx]) {
-                               kvm_apic_disabled_lapic_found(kvm);
-                               goto out;
-                       }
+       rcu_read_lock();
+       map = rcu_dereference(kvm->arch.apic_map);
  
-                       bitmap = (idx >= 0) ? 1 << idx : 0;
+       ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
+       if (ret)
+               for_each_set_bit(i, &bitmap, 16) {
+                       if (!dst[i])
+                               continue;
+                       if (*r < 0)
+                               *r = 0;
+                       *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
                }
-       }
  
- set_irq:
-       for_each_set_bit(i, &bitmap, 16) {
-               if (!dst[i])
-                       continue;
-               if (*r < 0)
-                       *r = 0;
-               *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
-       }
- out:
        rcu_read_unlock();
        return ret;
  }
@@@ -789,8 -850,9 +850,9 @@@ bool kvm_intr_is_single_vcpu_fast(struc
                        struct kvm_vcpu **dest_vcpu)
  {
        struct kvm_apic_map *map;
+       unsigned long bitmap;
+       struct kvm_lapic **dst = NULL;
        bool ret = false;
-       struct kvm_lapic *dst = NULL;
  
        if (irq->shorthand)
                return false;
        rcu_read_lock();
        map = rcu_dereference(kvm->arch.apic_map);
  
-       if (!map)
-               goto out;
-       if (irq->dest_mode == APIC_DEST_PHYSICAL) {
-               if (irq->dest_id == 0xFF)
-                       goto out;
-               if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
-                       goto out;
-               dst = map->phys_map[irq->dest_id];
-               if (dst && kvm_apic_present(dst->vcpu))
-                       *dest_vcpu = dst->vcpu;
-               else
-                       goto out;
-       } else {
-               u16 cid;
-               unsigned long bitmap = 1;
-               int i, r = 0;
-               if (!kvm_apic_logical_map_valid(map))
-                       goto out;
-               apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
-               if (cid >= ARRAY_SIZE(map->logical_map))
-                       goto out;
-               if (kvm_vector_hashing_enabled() &&
-                               kvm_lowest_prio_delivery(irq)) {
-                       int idx;
-                       unsigned int dest_vcpus;
+       if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) &&
+                       hweight16(bitmap) == 1) {
+               unsigned long i = find_first_bit(&bitmap, 16);
  
-                       dest_vcpus = hweight16(bitmap);
-                       if (dest_vcpus == 0)
-                               goto out;
-                       idx = kvm_vector_to_index(irq->vector, dest_vcpus,
-                                                 &bitmap, 16);
-                       dst = map->logical_map[cid][idx];
-                       if (!dst) {
-                               kvm_apic_disabled_lapic_found(kvm);
-                               goto out;
-                       }
-                       *dest_vcpu = dst->vcpu;
-               } else {
-                       for_each_set_bit(i, &bitmap, 16) {
-                               dst = map->logical_map[cid][i];
-                               if (++r == 2)
-                                       goto out;
-                       }
-                       if (dst && kvm_apic_present(dst->vcpu))
-                               *dest_vcpu = dst->vcpu;
-                       else
-                               goto out;
+               if (dst[i]) {
+                       *dest_vcpu = dst[i]->vcpu;
+                       ret = true;
                }
        }
  
-       ret = true;
- out:
        rcu_read_unlock();
        return ret;
  }
@@@ -1127,12 -1136,6 +1136,6 @@@ static u32 __apic_read(struct kvm_lapi
                return 0;
  
        switch (offset) {
-       case APIC_ID:
-               if (apic_x2apic_mode(apic))
-                       val = kvm_apic_id(apic);
-               else
-                       val = kvm_apic_id(apic) << 24;
-               break;
        case APIC_ARBPRI:
                apic_debug("Access APIC ARBPRI register which is for P6\n");
                break;
@@@ -1310,10 -1313,111 +1313,112 @@@ void wait_lapic_expire(struct kvm_vcpu 
  
        /* __delay is delay_tsc whenever the hardware has TSC, thus always.  */
        if (guest_tsc < tsc_deadline)
 -              __delay(tsc_deadline - guest_tsc);
 +              __delay(min(tsc_deadline - guest_tsc,
 +                      nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
  }
  
+ static void start_sw_tscdeadline(struct kvm_lapic *apic)
+ {
+       u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+       u64 ns = 0;
+       ktime_t expire;
+       struct kvm_vcpu *vcpu = apic->vcpu;
+       unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+       unsigned long flags;
+       ktime_t now;
+       if (unlikely(!tscdeadline || !this_tsc_khz))
+               return;
+       local_irq_save(flags);
+       now = apic->lapic_timer.timer.base->get_time();
+       guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+       if (likely(tscdeadline > guest_tsc)) {
+               ns = (tscdeadline - guest_tsc) * 1000000ULL;
+               do_div(ns, this_tsc_khz);
+               expire = ktime_add_ns(now, ns);
+               expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
+               hrtimer_start(&apic->lapic_timer.timer,
+                               expire, HRTIMER_MODE_ABS_PINNED);
+       } else
+               apic_timer_expired(apic);
+       local_irq_restore(flags);
+ }
+ 
+ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
+ {
+       return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
+ }
+ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
+ 
+ static void cancel_hv_tscdeadline(struct kvm_lapic *apic)
+ {
+       kvm_x86_ops->cancel_hv_timer(apic->vcpu);
+       apic->lapic_timer.hv_timer_in_use = false;
+ }
+ 
+ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+ {
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+       WARN_ON(swait_active(&vcpu->wq));
+       cancel_hv_tscdeadline(apic);
+       apic_timer_expired(apic);
+ }
+ EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+ 
+ static bool start_hv_tscdeadline(struct kvm_lapic *apic)
+ {
+       u64 tscdeadline = apic->lapic_timer.tscdeadline;
+       if (atomic_read(&apic->lapic_timer.pending) ||
+               kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
+               if (apic->lapic_timer.hv_timer_in_use)
+                       cancel_hv_tscdeadline(apic);
+       } else {
+               apic->lapic_timer.hv_timer_in_use = true;
+               hrtimer_cancel(&apic->lapic_timer.timer);
+               /* In case the sw timer triggered in the window */
+               if (atomic_read(&apic->lapic_timer.pending))
+                       cancel_hv_tscdeadline(apic);
+       }
+       trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
+                       apic->lapic_timer.hv_timer_in_use);
+       return apic->lapic_timer.hv_timer_in_use;
+ }
+ 
+ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
+ {
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       WARN_ON(apic->lapic_timer.hv_timer_in_use);
+       if (apic_lvtt_tscdeadline(apic))
+               start_hv_tscdeadline(apic);
+ }
+ EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
+ 
+ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
+ {
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       /* Possibly the TSC deadline timer is not enabled yet */
+       if (!apic->lapic_timer.hv_timer_in_use)
+               return;
+       cancel_hv_tscdeadline(apic);
+       if (atomic_read(&apic->lapic_timer.pending))
+               return;
+       start_sw_tscdeadline(apic);
+ }
+ EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
+ 
  static void start_apic_timer(struct kvm_lapic *apic)
  {
        ktime_t now;
                           ktime_to_ns(ktime_add_ns(now,
                                        apic->lapic_timer.period)));
        } else if (apic_lvtt_tscdeadline(apic)) {
-               /* lapic timer in tsc deadline mode */
-               u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
-               u64 ns = 0;
-               ktime_t expire;
-               struct kvm_vcpu *vcpu = apic->vcpu;
-               unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
-               unsigned long flags;
-               if (unlikely(!tscdeadline || !this_tsc_khz))
-                       return;
-               local_irq_save(flags);
-               now = apic->lapic_timer.timer.base->get_time();
-               guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
-               if (likely(tscdeadline > guest_tsc)) {
-                       ns = (tscdeadline - guest_tsc) * 1000000ULL;
-                       do_div(ns, this_tsc_khz);
-                       expire = ktime_add_ns(now, ns);
-                       expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
-                       hrtimer_start(&apic->lapic_timer.timer,
-                                     expire, HRTIMER_MODE_ABS_PINNED);
-               } else
-                       apic_timer_expired(apic);
-               local_irq_restore(flags);
+               if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic)))
+                       start_sw_tscdeadline(apic);
        }
  }
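
When the hardware path above (the VMX preemption timer behind set_hv_timer) is unavailable or fails, start_sw_tscdeadline() falls back to an hrtimer and converts the remaining guest TSC ticks to nanoseconds as delta * 10^6 / tsc_khz. A tiny standalone sketch of that conversion, with made-up sample numbers:

#include <stdint.h>
#include <stdio.h>

/* kHz = 10^3 cycles per 10^-3 s, so ns = delta * 10^6 / kHz */
static uint64_t demo_tsc_delta_to_ns(uint64_t tsc_delta, uint64_t tsc_khz)
{
	return tsc_delta * 1000000ULL / tsc_khz;
}

int main(void)
{
	/* e.g. a 2.4 GHz guest TSC with the deadline 2,400,000 cycles away ~= 1 ms */
	printf("%llu ns\n",
	       (unsigned long long)demo_tsc_delta_to_ns(2400000, 2400000));
	return 0;
}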
  
@@@ -1413,7 -1493,7 +1494,7 @@@ int kvm_lapic_reg_write(struct kvm_lapi
        switch (reg) {
        case APIC_ID:           /* Local APIC ID */
                if (!apic_x2apic_mode(apic))
-                       kvm_apic_set_id(apic, val >> 24);
+                       kvm_apic_set_xapic_id(apic, val >> 24);
                else
                        ret = 1;
                break;
@@@ -1674,9 -1754,10 +1755,10 @@@ void kvm_lapic_set_base(struct kvm_vcp
  
        /* update jump label if enable bit changes */
        if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
-               if (value & MSR_IA32_APICBASE_ENABLE)
+               if (value & MSR_IA32_APICBASE_ENABLE) {
+                       kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
                        static_key_slow_dec_deferred(&apic_hw_disabled);
-               else
+               } else
                        static_key_slow_inc(&apic_hw_disabled.key);
                recalculate_apic_map(vcpu->kvm);
        }
@@@ -1716,8 -1797,11 +1798,11 @@@ void kvm_lapic_reset(struct kvm_vcpu *v
        /* Stop the timer in case it's a reset to an active apic */
        hrtimer_cancel(&apic->lapic_timer.timer);
  
-       if (!init_event)
-               kvm_apic_set_id(apic, vcpu->vcpu_id);
+       if (!init_event) {
+               kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE |
+                                        MSR_IA32_APICBASE_ENABLE);
+               kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+       }
        kvm_apic_set_version(apic->vcpu);
  
        for (i = 0; i < KVM_APIC_LVT_NUM; i++)
@@@ -1856,9 -1940,6 +1941,6 @@@ int kvm_create_lapic(struct kvm_vcpu *v
         * thinking that APIC state has changed.
         */
        vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
-       kvm_lapic_set_base(vcpu,
-                       APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
        static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
        kvm_lapic_reset(vcpu, false);
        kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
@@@ -1938,17 -2019,48 +2020,48 @@@ int kvm_get_apic_interrupt(struct kvm_v
        return vector;
  }
  
- void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
-               struct kvm_lapic_state *s)
+ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
+               struct kvm_lapic_state *s, bool set)
+ {
+       if (apic_x2apic_mode(vcpu->arch.apic)) {
+               u32 *id = (u32 *)(s->regs + APIC_ID);
+               if (vcpu->kvm->arch.x2apic_format) {
+                       if (*id != vcpu->vcpu_id)
+                               return -EINVAL;
+               } else {
+                       if (set)
+                               *id >>= 24;
+                       else
+                               *id <<= 24;
+               }
+       }
+       return 0;
+ }
+ 
+ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+ {
+       memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s));
+       return kvm_apic_state_fixup(vcpu, s, false);
+ }
+ 
+ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
  {
        struct kvm_lapic *apic = vcpu->arch.apic;
+       int r;
  
        kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
        /* set SPIV separately to get count of SW disabled APICs right */
        apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
+       r = kvm_apic_state_fixup(vcpu, s, true);
+       if (r)
+               return r;
        memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
-       /* call kvm_apic_set_id() to put apic into apic_map */
-       kvm_apic_set_id(apic, kvm_apic_id(apic));
+       recalculate_apic_map(vcpu->kvm);
        kvm_apic_set_version(vcpu);
  
        apic_update_ppr(apic);
                kvm_rtc_eoi_tracking_restore_one(vcpu);
  
        vcpu->arch.apic_arb_prio = 0;
+       return 0;
  }
  
  void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
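
The kvm_apic_state_fixup() helper above exists because, with the legacy KVM_GET_LAPIC/KVM_SET_LAPIC format, an x2APIC vCPU's 32-bit ID is squeezed into bits 31-24 of the APIC_ID register on the way out and shifted back on the way in, which only works for IDs up to 255; the KVM_CAP_X2APIC_API format keeps the full ID and requires it to match vcpu_id. A standalone sketch of the two directions (demo_* names and sample IDs are illustrative only):

#include <stdint.h>
#include <stdio.h>

static uint32_t demo_id_to_user(uint32_t id, int new_format)
{
	return new_format ? id : id << 24;	/* GET direction */
}

static uint32_t demo_id_from_user(uint32_t reg, int new_format)
{
	return new_format ? reg : reg >> 24;	/* SET direction */
}

int main(void)
{
	printf("legacy: id 5   -> reg 0x%x\n", (unsigned)demo_id_to_user(5, 0));
	printf("new:    id 300 -> reg 0x%x -> id %u\n",
	       (unsigned)demo_id_to_user(300, 1),
	       (unsigned)demo_id_from_user(300, 1));
	return 0;
}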
diff --combined arch/x86/kvm/mmu.c
@@@ -29,8 -29,7 +29,8 @@@
  #include <linux/string.h>
  #include <linux/mm.h>
  #include <linux/highmem.h>
 -#include <linux/module.h>
 +#include <linux/moduleparam.h>
 +#include <linux/export.h>
  #include <linux/swap.h>
  #include <linux/hugetlb.h>
  #include <linux/compiler.h>
@@@ -176,6 -175,7 +176,7 @@@ static u64 __read_mostly shadow_user_ma
  static u64 __read_mostly shadow_accessed_mask;
  static u64 __read_mostly shadow_dirty_mask;
  static u64 __read_mostly shadow_mmio_mask;
+ static u64 __read_mostly shadow_present_mask;
  
  static void mmu_spte_set(u64 *sptep, u64 spte);
  static void mmu_free_roots(struct kvm_vcpu *vcpu);
@@@ -283,13 -283,14 +284,14 @@@ static bool check_mmio_spte(struct kvm_
  }
  
  void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask)
+               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask)
  {
        shadow_user_mask = user_mask;
        shadow_accessed_mask = accessed_mask;
        shadow_dirty_mask = dirty_mask;
        shadow_nx_mask = nx_mask;
        shadow_x_mask = x_mask;
+       shadow_present_mask = p_mask;
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
  
@@@ -305,7 -306,7 +307,7 @@@ static int is_nx(struct kvm_vcpu *vcpu
  
  static int is_shadow_present_pte(u64 pte)
  {
-       return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
+       return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte);
  }
  
  static int is_large_pte(u64 pte)
@@@ -524,7 -525,7 +526,7 @@@ static void mmu_spte_set(u64 *sptep, u6
  }
  
  /* Rules for using mmu_spte_update:
-  * Update the state bits, it means the mapped pfn is not changged.
+  * Update the state bits, it means the mapped pfn is not changed.
   *
   * Whenever we overwrite a writable spte with a read-only one we
   * should flush remote TLBs. Otherwise rmap_write_protect
@@@ -2246,10 -2247,9 +2248,9 @@@ static void link_shadow_page(struct kvm
  {
        u64 spte;
  
-       BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
-                       VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+       BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
  
-       spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
+       spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
               shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
  
        mmu_spte_set(sptep, spte);
@@@ -2516,13 -2516,19 +2517,19 @@@ static int set_spte(struct kvm_vcpu *vc
                    gfn_t gfn, kvm_pfn_t pfn, bool speculative,
                    bool can_unsync, bool host_writable)
  {
-       u64 spte;
+       u64 spte = 0;
        int ret = 0;
  
        if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
                return 0;
  
-       spte = PT_PRESENT_MASK;
+       /*
+        * For the EPT case, shadow_present_mask is 0 if hardware
+        * supports exec-only page table entries.  In that case,
+        * ACC_USER_MASK and shadow_user_mask are used to represent
+        * read access.  See FNAME(gpte_access) in paging_tmpl.h.
+        */
+       spte |= shadow_present_mask;
        if (!speculative)
                spte |= shadow_accessed_mask;
  
@@@ -3190,7 -3196,7 +3197,7 @@@ static int mmu_alloc_shadow_roots(struc
                MMU_WARN_ON(VALID_PAGE(root));
                if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
                        pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
-                       if (!is_present_gpte(pdptr)) {
+                       if (!(pdptr & PT_PRESENT_MASK)) {
                                vcpu->arch.mmu.pae_root[i] = 0;
                                continue;
                        }
@@@ -3915,9 -3921,7 +3922,7 @@@ static void update_permission_bitmask(s
                                 *   clearer.
                                 */
                                smap = cr4_smap && u && !uf && !ff;
-                       } else
-                               /* Not really needed: no U/S accesses on ept  */
-                               u = 1;
+                       }
  
                        fault = (ff && !x) || (uf && !u) || (wf && !w) ||
                                (smapf && smap);
diff --combined arch/x86/kvm/svm.c
@@@ -238,9 -238,7 +238,9 @@@ module_param(nested, int, S_IRUGO)
  
  /* enable / disable AVIC */
  static int avic;
 +#ifdef CONFIG_X86_LOCAL_APIC
  module_param(avic, int, S_IRUGO);
 +#endif
  
  static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
  static void svm_flush_tlb(struct kvm_vcpu *vcpu);
@@@ -983,14 -981,11 +983,14 @@@ static __init int svm_hardware_setup(vo
        } else
                kvm_disable_tdp();
  
 -      if (avic && (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)))
 -              avic = false;
 -
 -      if (avic)
 -              pr_info("AVIC enabled\n");
 +      if (avic) {
 +              if (!npt_enabled ||
 +                  !boot_cpu_has(X86_FEATURE_AVIC) ||
 +                  !IS_ENABLED(CONFIG_X86_LOCAL_APIC))
 +                      avic = false;
 +              else
 +                      pr_info("AVIC enabled\n");
 +      }
  
        return 0;
  
@@@ -1329,7 -1324,7 +1329,7 @@@ free_avic
  static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
  {
        u64 entry;
 -      int h_physical_id = __default_cpu_present_to_apicid(vcpu->cpu);
 +      int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu);
        struct vcpu_svm *svm = to_svm(vcpu);
  
        if (!kvm_vcpu_apicv_active(vcpu))
@@@ -1354,7 -1349,7 +1354,7 @@@ static void avic_vcpu_load(struct kvm_v
  {
        u64 entry;
        /* ID = 0xff (broadcast), ID > 0xff (reserved) */
 -      int h_physical_id = __default_cpu_present_to_apicid(cpu);
 +      int h_physical_id = kvm_cpu_get_apicid(cpu);
        struct vcpu_svm *svm = to_svm(vcpu);
  
        if (!kvm_vcpu_apicv_active(vcpu))
@@@ -1577,7 -1572,7 +1577,7 @@@ static unsigned long svm_get_rflags(str
  static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
  {
         /*
-         * Any change of EFLAGS.VM is accompained by a reload of SS
+         * Any change of EFLAGS.VM is accompanied by a reload of SS
          * (caused by either a task switch or an inter-privilege IRET),
          * so we do not need to update the CPL here.
          */
@@@ -4241,7 -4236,7 +4241,7 @@@ static void svm_deliver_avic_intr(struc
  
        if (avic_vcpu_is_running(vcpu))
                wrmsrl(SVM_AVIC_DOORBELL,
 -                     __default_cpu_present_to_apicid(vcpu->cpu));
 +                     kvm_cpu_get_apicid(vcpu->cpu));
        else
                kvm_vcpu_wake_up(vcpu);
  }
  static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
  {
        local_irq_enable();
+       /*
+        * We must have an instruction with interrupts enabled, so
+        * the timer interrupt isn't delayed by the interrupt shadow.
+        */
+       asm("nop");
+       local_irq_disable();
  }
  
  static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
diff --combined arch/x86/kvm/vmx.c
@@@ -110,6 -110,13 +110,13 @@@ module_param_named(pml, enable_pml, boo
  
  #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
  
+ /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
+ static int __read_mostly cpu_preemption_timer_multi;
+ static bool __read_mostly enable_preemption_timer = 1;
+ #ifdef CONFIG_X86_64
+ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
+ #endif
+ 
  #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
  #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
  #define KVM_VM_CR0_ALWAYS_ON                                          \
@@@ -398,6 -405,12 +405,12 @@@ struct nested_vmx 
        /* The host-usable pointer to the above */
        struct page *current_vmcs12_page;
        struct vmcs12 *current_vmcs12;
+       /*
+        * Cache of the guest's VMCS, existing outside of guest memory.
+        * Loaded from guest memory during VMPTRLD. Flushed to guest
+        * memory during VMXOFF, VMCLEAR, VMPTRLD.
+        */
+       struct vmcs12 *cached_vmcs12;
        struct vmcs *current_shadow_vmcs;
        /*
         * Indicates if the shadow vmcs must be updated with the
        struct pi_desc *pi_desc;
        bool pi_pending;
        u16 posted_intr_nv;
-       u64 msr_ia32_feature_control;
  
        struct hrtimer preemption_timer;
        bool preemption_timer_expired;
@@@ -597,11 -609,22 +609,22 @@@ struct vcpu_vmx 
  #define PML_ENTITY_NUM                512
        struct page *pml_pg;
  
+       /* apic deadline value in host tsc */
+       u64 hv_deadline_tsc;
        u64 current_tsc_ratio;
  
        bool guest_pkru_valid;
        u32 guest_pkru;
        u32 host_pkru;
+       /*
+        * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
+        * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
+        * in msr_ia32_feature_control_valid_bits.
+        */
+       u64 msr_ia32_feature_control;
+       u64 msr_ia32_feature_control_valid_bits;
  };
  
  enum segment_cache_field {
@@@ -841,7 -864,7 +864,7 @@@ static inline short vmcs_field_to_offse
  
  static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
  {
-       return to_vmx(vcpu)->nested.current_vmcs12;
+       return to_vmx(vcpu)->nested.cached_vmcs12;
  }
  
  static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
@@@ -1056,6 -1079,58 +1079,58 @@@ static inline bool cpu_has_vmx_virtual_
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
  }
  
+ /*
+  * Comment's format: document - errata name - stepping - processor name.
+  * Taken from
+  * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
+  */
+ static u32 vmx_preemption_cpu_tfms[] = {
+ /* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
+ 0x000206E6,
+ /* 323056.pdf - AAX65  - C2 - Xeon L3406 */
+ /* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
+ /* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
+ 0x00020652,
+ /* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
+ 0x00020655,
+ /* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
+ /* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
+ /*
+  * 320767.pdf - AAP86  - B1 -
+  * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
+  */
+ 0x000106E5,
+ /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
+ 0x000106A0,
+ /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
+ 0x000106A1,
+ /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
+ 0x000106A4,
+  /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
+  /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
+  /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
+ 0x000106A5,
+ };
+ 
+ static inline bool cpu_has_broken_vmx_preemption_timer(void)
+ {
+       u32 eax = cpuid_eax(0x00000001), i;
+       /* Clear the reserved bits */
+       eax &= ~(0x3U << 14 | 0xfU << 28);
+       for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
+               if (eax == vmx_preemption_cpu_tfms[i])
+                       return true;
+       return false;
+ }
+ 
+ static inline bool cpu_has_vmx_preemption_timer(void)
+ {
+       return vmcs_config.pin_based_exec_ctrl &
+               PIN_BASED_VMX_PREEMPTION_TIMER;
+ }
+ 
  static inline bool cpu_has_vmx_posted_intr(void)
  {
        return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
@@@ -1603,6 -1678,11 +1678,11 @@@ static __always_inline void vmcs_set_bi
        __vmcs_writel(field, __vmcs_readl(field) | mask);
  }
  
+ static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
+ {
+       vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
+ }
+ 
  static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
  {
        vmcs_write32(VM_ENTRY_CONTROLS, val);
@@@ -1631,6 -1711,11 +1711,11 @@@ static inline void vm_entry_controls_cl
        vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
  }
  
+ static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
+ {
+       vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
+ }
+ 
  static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
  {
        vmcs_write32(VM_EXIT_CONTROLS, val);
@@@ -2072,8 -2157,7 +2157,8 @@@ static void vmx_vcpu_pi_load(struct kvm
        unsigned int dest;
  
        if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
 -              !irq_remapping_cap(IRQ_POSTING_CAP))
 +              !irq_remapping_cap(IRQ_POSTING_CAP)  ||
 +              !kvm_vcpu_apicv_active(vcpu))
                return;
  
        do {
@@@ -2121,22 -2205,14 +2206,14 @@@ static void vmx_vcpu_load(struct kvm_vc
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+       bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
  
        if (!vmm_exclusive)
                kvm_cpu_vmxon(phys_addr);
-       else if (vmx->loaded_vmcs->cpu != cpu)
+       else if (!already_loaded)
                loaded_vmcs_clear(vmx->loaded_vmcs);
  
-       if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
-               per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
-               vmcs_load(vmx->loaded_vmcs->vmcs);
-       }
-       if (vmx->loaded_vmcs->cpu != cpu) {
-               struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
-               unsigned long sysenter_esp;
-               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+       if (!already_loaded) {
                local_irq_disable();
                crash_disable_local_vmclear(cpu);
  
                         &per_cpu(loaded_vmcss_on_cpu, cpu));
                crash_enable_local_vmclear(cpu);
                local_irq_enable();
+       }
+       if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+               per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+               vmcs_load(vmx->loaded_vmcs->vmcs);
+       }
+       if (!already_loaded) {
+               struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
+               unsigned long sysenter_esp;
+               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
  
                /*
                 * Linux uses per-cpu TSS and GDT, so set these when switching
@@@ -2181,8 -2269,7 +2270,8 @@@ static void vmx_vcpu_pi_put(struct kvm_
        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
  
        if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
 -              !irq_remapping_cap(IRQ_POSTING_CAP))
 +              !irq_remapping_cap(IRQ_POSTING_CAP)  ||
 +              !kvm_vcpu_apicv_active(vcpu))
                return;
  
        /* Set SN when the vCPU is preempted */
@@@ -2707,7 -2794,8 +2796,7 @@@ static void nested_vmx_setup_ctls_msrs(
                SECONDARY_EXEC_APIC_REGISTER_VIRT |
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                SECONDARY_EXEC_WBINVD_EXITING |
 -              SECONDARY_EXEC_XSAVES |
 -              SECONDARY_EXEC_PCOMMIT;
 +              SECONDARY_EXEC_XSAVES;
  
        if (enable_ept) {
                /* nested EPT: emulate EPT also to L1 */
                vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
                         VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
                         VMX_EPT_INVEPT_BIT;
+               if (cpu_has_vmx_ept_execute_only())
+                       vmx->nested.nested_vmx_ept_caps |=
+                               VMX_EPT_EXECUTE_ONLY_BIT;
                vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
                /*
                 * For nested guests, we don't do anything specific
@@@ -2864,6 -2955,14 +2956,14 @@@ static int vmx_get_vmx_msr(struct kvm_v
        return 0;
  }
  
+ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
+                                                uint64_t val)
+ {
+       uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
+       return !(val & ~valid_bits);
+ }
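
The helper above reduces the IA32_FEATURE_CONTROL write check to a single mask
test: a value is acceptable only if it sets no bits outside the per-vCPU
msr_ia32_feature_control_valid_bits. A minimal userspace sketch of the same
test; the bit positions are assumptions taken from the architectural MSR
layout rather than from kernel headers.

#include <stdint.h>
#include <stdio.h>

/* Assumed IA32_FEATURE_CONTROL bit positions (architectural). */
#define FEATURE_CONTROL_LOCKED                     (1ULL << 0)
#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX  (1ULL << 2)
#define FEATURE_CONTROL_LMCE                       (1ULL << 20)

/* Same shape as vmx_feature_control_msr_valid() above. */
static int feature_control_valid(uint64_t val, uint64_t valid_bits)
{
        return !(val & ~valid_bits);
}

int main(void)
{
        uint64_t valid = FEATURE_CONTROL_LOCKED |
                         FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;

        printf("%d\n", feature_control_valid(FEATURE_CONTROL_LOCKED, valid)); /* 1 */
        printf("%d\n", feature_control_valid(FEATURE_CONTROL_LMCE, valid));   /* 0 */
        return 0;
}
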
  /*
   * Reads an msr value (of 'msr_index') into 'pdata'.
   * Returns 0 on success, non-0 otherwise.
@@@ -2905,10 -3004,15 +3005,15 @@@ static int vmx_get_msr(struct kvm_vcpu 
                        return 1;
                msr_info->data = vmcs_read64(GUEST_BNDCFGS);
                break;
-       case MSR_IA32_FEATURE_CONTROL:
-               if (!nested_vmx_allowed(vcpu))
+       case MSR_IA32_MCG_EXT_CTL:
+               if (!msr_info->host_initiated &&
+                   !(to_vmx(vcpu)->msr_ia32_feature_control &
+                     FEATURE_CONTROL_LMCE))
                        return 1;
-               msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+               msr_info->data = vcpu->arch.mcg_ext_ctl;
+               break;
+       case MSR_IA32_FEATURE_CONTROL:
+               msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
                break;
        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                if (!nested_vmx_allowed(vcpu))
@@@ -2998,12 -3102,20 +3103,20 @@@ static int vmx_set_msr(struct kvm_vcpu 
        case MSR_IA32_TSC_ADJUST:
                ret = kvm_set_msr_common(vcpu, msr_info);
                break;
+       case MSR_IA32_MCG_EXT_CTL:
+               if ((!msr_info->host_initiated &&
+                    !(to_vmx(vcpu)->msr_ia32_feature_control &
+                      FEATURE_CONTROL_LMCE)) ||
+                   (data & ~MCG_EXT_CTL_LMCE_EN))
+                       return 1;
+               vcpu->arch.mcg_ext_ctl = data;
+               break;
        case MSR_IA32_FEATURE_CONTROL:
-               if (!nested_vmx_allowed(vcpu) ||
-                   (to_vmx(vcpu)->nested.msr_ia32_feature_control &
+               if (!vmx_feature_control_msr_valid(vcpu, data) ||
+                   (to_vmx(vcpu)->msr_ia32_feature_control &
                     FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
                        return 1;
-               vmx->nested.msr_ia32_feature_control = data;
+               vmx->msr_ia32_feature_control = data;
                if (msr_info->host_initiated && data == 0)
                        vmx_leave_nested(vcpu);
                break;
@@@ -3269,6 -3381,7 +3382,6 @@@ static __init int setup_vmcs_config(str
                        SECONDARY_EXEC_SHADOW_VMCS |
                        SECONDARY_EXEC_XSAVES |
                        SECONDARY_EXEC_ENABLE_PML |
 -                      SECONDARY_EXEC_PCOMMIT |
                        SECONDARY_EXEC_TSC_SCALING;
                if (adjust_vmx_controls(min2, opt2,
                                        MSR_IA32_VMX_PROCBASED_CTLS2,
                      vmx_capability.ept, vmx_capability.vpid);
        }
  
-       min = VM_EXIT_SAVE_DEBUG_CONTROLS;
+       min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
  #ifdef CONFIG_X86_64
        min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
  #endif
        opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
-               VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;
+               VM_EXIT_CLEAR_BNDCFGS;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
                                &_vmexit_control) < 0)
                return -EIO;
  
        min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-       opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+       opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+                PIN_BASED_VMX_PREEMPTION_TIMER;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
                                &_pin_based_exec_control) < 0)
                return -EIO;
  
+       if (cpu_has_broken_vmx_preemption_timer())
+               _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
        if (!(_cpu_based_2nd_exec_control &
-               SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
-               !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
+               SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
                _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
  
        min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
  
        /*
         * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
-        * but due to arrata below it can't be used. Workaround is to use
+        * but due to errata below it can't be used. Workaround is to use
         * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
         *
         * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
@@@ -4781,6 -4896,8 +4896,8 @@@ static u32 vmx_pin_based_exec_ctrl(stru
  
        if (!kvm_vcpu_apicv_active(&vmx->vcpu))
                pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+       /* Enable the preemption timer dynamically */
+       pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
        return pin_based_exec_ctrl;
  }
  
@@@ -4856,6 -4973,9 +4973,6 @@@ static u32 vmx_secondary_exec_control(s
        if (!enable_pml)
                exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
  
 -      /* Currently, we allow L1 guest to directly run pcommit instruction. */
 -      exec_control &= ~SECONDARY_EXEC_PCOMMIT;
 -
        return exec_control;
  }
  
@@@ -4896,13 -5016,13 +5013,14 @@@ static int vmx_vcpu_setup(struct vcpu_v
  
        /* Control */
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+       vmx->hv_deadline_tsc = -1;
  
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
  
 -      if (cpu_has_secondary_exec_ctrls())
 +      if (cpu_has_secondary_exec_ctrls()) {
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
                                vmx_secondary_exec_control(vmx));
 +      }
  
        if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
                vmcs_write64(EOI_EXIT_BITMAP0, 0);
        if (vmx_xsaves_supported())
                vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
  
 +      if (enable_pml) {
 +              ASSERT(vmx->pml_pg);
 +              vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
 +              vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 +      }
 +
        return 0;
  }
  
@@@ -6016,12 -6130,14 +6134,14 @@@ static int handle_ept_violation(struct 
        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
        trace_kvm_page_fault(gpa, exit_qualification);
  
-       /* It is a write fault? */
-       error_code = exit_qualification & PFERR_WRITE_MASK;
+       /* Is it a read fault? */
+       error_code = (exit_qualification << 2) & PFERR_USER_MASK;
+       /* Is it a write fault? */
+       error_code |= exit_qualification & PFERR_WRITE_MASK;
        /* Is it a fetch fault? */
        error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK;
        /* Is the EPT paging-structure entry present? */
-       error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK;
+       error_code |= (exit_qualification & 0x38) != 0;
  
        vcpu->arch.exit_qualification = exit_qualification;
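
The decode above maps the EPT-violation exit qualification onto x86 page-fault
error-code bits: bit 0 (read access) is shifted onto PFERR_USER_MASK, which is
reused here to carry "read", bit 1 (write) maps straight onto PFERR_WRITE_MASK,
bit 2 (fetch) becomes PFERR_FETCH_MASK, and any of bits 5:3 being set is
treated as "the EPT entry was present". A standalone sketch of that bit
shuffling; the PFERR_* values are assumed from the usual error-code layout
rather than copied from KVM's headers.

#include <stdint.h>
#include <stdio.h>

#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK   (1U << 1)
#define PFERR_USER_MASK    (1U << 2)   /* reused to mean "read access" */
#define PFERR_FETCH_MASK   (1U << 4)

static uint32_t decode_ept_violation(uint64_t qual)
{
        uint32_t error_code;

        error_code  = (qual << 2) & PFERR_USER_MASK;    /* read access */
        error_code |=  qual       & PFERR_WRITE_MASK;   /* write access */
        error_code |= (qual << 2) & PFERR_FETCH_MASK;   /* instruction fetch */
        error_code |= (qual & 0x38) != 0;  /* any RWX bit -> PFERR_PRESENT_MASK */
        return error_code;
}

int main(void)
{
        /* Write to a readable+writable page: bit 1 (write) plus bits 3-4. */
        printf("%#x\n", decode_ept_violation(0x1a));    /* prints 0x3 */
        return 0;
}
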
  
@@@ -6355,9 -6471,6 +6475,6 @@@ static __init int hardware_setup(void
        for (msr = 0x800; msr <= 0x8ff; msr++)
                vmx_disable_intercept_msr_read_x2apic(msr);
  
-       /* According SDM, in x2apic mode, the whole id reg is used.  But in
-        * KVM, it only use the highest eight bits. Need to intercept it */
-       vmx_enable_intercept_msr_read_x2apic(0x802);
        /* TMCCT */
        vmx_enable_intercept_msr_read_x2apic(0x839);
        /* TPR */
        vmx_disable_intercept_msr_write_x2apic(0x83f);
  
        if (enable_ept) {
-               kvm_mmu_set_mask_ptes(0ull,
+               kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
                        (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
                        (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
-                       0ull, VMX_EPT_EXECUTABLE_MASK);
+                       0ull, VMX_EPT_EXECUTABLE_MASK,
+                       cpu_has_vmx_ept_execute_only() ?
+                                     0ull : VMX_EPT_READABLE_MASK);
                ept_set_mmio_spte_mask();
                kvm_enable_tdp();
        } else
                kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
        }
  
+       if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
+               u64 vmx_msr;
+               rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+               cpu_preemption_timer_multi =
+                        vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+       } else {
+               kvm_x86_ops->set_hv_timer = NULL;
+               kvm_x86_ops->cancel_hv_timer = NULL;
+       }
        kvm_set_posted_intr_wakeup_handler(wakeup_handler);
  
+       kvm_mce_cap_supported |= MCG_LMCE_P;
        return alloc_kvm_area();
  
  out8:
@@@ -6673,13 -6801,7 +6805,13 @@@ static int get_vmx_mem_address(struct k
  
        /* Checks for #GP/#SS exceptions. */
        exn = false;
 -      if (is_protmode(vcpu)) {
 +      if (is_long_mode(vcpu)) {
 +              /* Long mode: #GP(0)/#SS(0) if the memory address is in a
 +               * non-canonical form. This is the only check on the memory
 +               * destination for long mode!
 +               */
 +              exn = is_noncanonical_address(*ret);
 +      } else if (is_protmode(vcpu)) {
                /* Protected mode: apply checks for segment validity in the
                 * following order:
                 * - segment type check (#GP(0) may be thrown)
                         * execute-only code segment
                         */
                        exn = ((s.type & 0xa) == 8);
 -      }
 -      if (exn) {
 -              kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 -              return 1;
 -      }
 -      if (is_long_mode(vcpu)) {
 -              /* Long mode: #GP(0)/#SS(0) if the memory address is in a
 -               * non-canonical form. This is an only check for long mode.
 -               */
 -              exn = is_noncanonical_address(*ret);
 -      } else if (is_protmode(vcpu)) {
 +              if (exn) {
 +                      kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 +                      return 1;
 +              }
                /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
                 */
                exn = (s.unusable != 0);
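
With the reordering above, the canonical-address test is the first and only
memory-operand check applied in long mode, and #GP(0)/#SS(0) is raised as soon
as it fails. A canonical 64-bit address is one whose bits 63:47 all equal bit
47; a small sketch of such a test, hard-coding 48 implemented virtual-address
bits (an assumption; the real is_noncanonical_address() helper lives elsewhere
in KVM).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Canonical for 48 VA bits: the top 17 bits are all zero or all one,
 * i.e. bits 63:47 are a sign-extension of bit 47. */
static bool noncanonical_48(uint64_t la)
{
        uint64_t top = la >> 47;

        return top != 0 && top != 0x1ffff;
}

int main(void)
{
        printf("%d\n", noncanonical_48(0x00007fffffffffffULL)); /* 0 */
        printf("%d\n", noncanonical_48(0xffff800000000000ULL)); /* 0 */
        printf("%d\n", noncanonical_48(0x0000800000000000ULL)); /* 1 */
        return 0;
}
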
@@@ -6862,16 -6991,22 +6994,22 @@@ static int handle_vmon(struct kvm_vcpu 
                return 1;
        }
  
-       if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
+       if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
                        != VMXON_NEEDED_FEATURES) {
                kvm_inject_gp(vcpu, 0);
                return 1;
        }
  
+       vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+       if (!vmx->nested.cached_vmcs12)
+               return -ENOMEM;
        if (enable_shadow_vmcs) {
                shadow_vmcs = alloc_vmcs();
-               if (!shadow_vmcs)
+               if (!shadow_vmcs) {
+                       kfree(vmx->nested.cached_vmcs12);
                        return -ENOMEM;
+               }
                /* mark vmcs as shadow */
                shadow_vmcs->revision_id |= (1u << 31);
                /* init shadow vmcs */
@@@ -6942,6 -7077,11 +7080,11 @@@ static inline void nested_release_vmcs1
                vmcs_write64(VMCS_LINK_POINTER, -1ull);
        }
        vmx->nested.posted_intr_nv = -1;
+       /* Flush VMCS12 to guest memory */
+       memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12,
+              VMCS12_SIZE);
        kunmap(vmx->nested.current_vmcs12_page);
        nested_release_page(vmx->nested.current_vmcs12_page);
        vmx->nested.current_vmptr = -1ull;
@@@ -6962,6 -7102,7 +7105,7 @@@ static void free_nested(struct vcpu_vm
        nested_release_vmcs12(vmx);
        if (enable_shadow_vmcs)
                free_vmcs(vmx->nested.current_shadow_vmcs);
+       kfree(vmx->nested.cached_vmcs12);
        /* Unpin physical memory we referred to in current vmcs02 */
        if (vmx->nested.apic_access_page) {
                nested_release_page(vmx->nested.apic_access_page);
@@@ -7365,6 -7506,13 +7509,13 @@@ static int handle_vmptrld(struct kvm_vc
                vmx->nested.current_vmptr = vmptr;
                vmx->nested.current_vmcs12 = new_vmcs12;
                vmx->nested.current_vmcs12_page = page;
+               /*
+                * Load VMCS12 from guest memory since it is not already
+                * cached.
+                */
+               memcpy(vmx->nested.cached_vmcs12,
+                      vmx->nested.current_vmcs12, VMCS12_SIZE);
                if (enable_shadow_vmcs) {
                        vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
                                      SECONDARY_EXEC_SHADOW_VMCS);
@@@ -7560,6 -7708,19 +7711,12 @@@ static int handle_pml_full(struct kvm_v
        return 1;
  }
  
 -static int handle_pcommit(struct kvm_vcpu *vcpu)
 -{
 -      /* we never catch pcommit instruct for L1 guest. */
 -      WARN_ON(1);
 -      return 1;
 -}
 -
+ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+ {
+       kvm_lapic_expired_hv_timer(vcpu);
+       return 1;
+ }
  /*
   * The exit handlers return 1 if the exit was handled fully and guest execution
   * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@@ -7610,6 -7771,8 +7767,7 @@@ static int (*const kvm_vmx_exit_handler
        [EXIT_REASON_XSAVES]                  = handle_xsaves,
        [EXIT_REASON_XRSTORS]                 = handle_xrstors,
        [EXIT_REASON_PML_FULL]                = handle_pml_full,
 -      [EXIT_REASON_PCOMMIT]                 = handle_pcommit,
+       [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
  };
  
  static const int kvm_vmx_max_exit_handlers =
@@@ -7918,6 -8081,10 +8076,8 @@@ static bool nested_vmx_exit_handled(str
                 * the XSS exit bitmap in vmcs12.
                 */
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
 -      case EXIT_REASON_PCOMMIT:
 -              return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
+       case EXIT_REASON_PREEMPTION_TIMER:
+               return false;
        default:
                return true;
        }
@@@ -7929,6 -8096,22 +8089,6 @@@ static void vmx_get_exit_info(struct kv
        *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
  }
  
 -static int vmx_create_pml_buffer(struct vcpu_vmx *vmx)
 -{
 -      struct page *pml_pg;
 -
 -      pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
 -      if (!pml_pg)
 -              return -ENOMEM;
 -
 -      vmx->pml_pg = pml_pg;
 -
 -      vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
 -      vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 -
 -      return 0;
 -}
 -
  static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
  {
        if (vmx->pml_pg) {
@@@ -8200,7 -8383,6 +8360,7 @@@ static int vmx_handle_exit(struct kvm_v
        if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
                        (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
                        exit_reason != EXIT_REASON_EPT_VIOLATION &&
 +                      exit_reason != EXIT_REASON_PML_FULL &&
                        exit_reason != EXIT_REASON_TASK_SWITCH)) {
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
@@@ -8303,7 -8485,7 +8463,7 @@@ static void vmx_set_apic_access_page_ad
         * the next L2->L1 exit.
         */
        if (!is_guest_mode(vcpu) ||
-           !nested_cpu_has2(vmx->nested.current_vmcs12,
+           !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
                             SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
                vmcs_write64(APIC_ACCESS_ADDR, hpa);
  }
@@@ -8436,7 -8618,6 +8596,6 @@@ static void vmx_handle_external_intr(st
                        "push %[sp]\n\t"
  #endif
                        "pushf\n\t"
-                       "orl $0x200, (%%" _ASM_SP ")\n\t"
                        __ASM_SIZE(push) " $%c[cs]\n\t"
                        "call *%[entry]\n\t"
                        :
                        [ss]"i"(__KERNEL_DS),
                        [cs]"i"(__KERNEL_CS)
                        );
-       } else
-               local_irq_enable();
+       }
  }
  
  static bool vmx_has_high_real_mode_segbase(void)
@@@ -8601,6 -8781,26 +8759,26 @@@ static void atomic_switch_perf_msrs(str
                                        msrs[i].host);
  }
  
+ void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 tscl;
+       u32 delta_tsc;
+       if (vmx->hv_deadline_tsc == -1)
+               return;
+       tscl = rdtsc();
+       if (vmx->hv_deadline_tsc > tscl)
+       /* Guaranteed to fit in 32 bits; checked in vmx_set_hv_timer() */
+               delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+                       cpu_preemption_timer_multi);
+       else
+               delta_tsc = 0;
+       vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+ }
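
vmx_arm_hv_timer() above converts the absolute host-TSC deadline into what the
VMX preemption timer actually counts: the remaining TSC delta shifted right by
the rate read from IA32_VMX_MISC[4:0] (cpu_preemption_timer_multi). A hedged
sketch of that conversion; the rate value of 5 used in the example is purely
illustrative, real hardware reports its own.

#include <stdint.h>
#include <stdio.h>

/* The preemption timer ticks once every 2^rate TSC cycles. */
static uint32_t preemption_timer_value(uint64_t deadline_tsc, uint64_t now_tsc,
                                       unsigned int rate)
{
        if (deadline_tsc <= now_tsc)
                return 0;       /* deadline already passed: expire at once */
        /* The caller guarantees the shifted delta fits in 32 bits. */
        return (uint32_t)((deadline_tsc - now_tsc) >> rate);
}

int main(void)
{
        /* 3,200,000 TSC cycles away at rate 5 -> 100,000 timer ticks. */
        printf("%u\n", preemption_timer_value(13200000, 10000000, 5));
        return 0;
}
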
  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        atomic_switch_perf_msrs(vmx);
        debugctlmsr = get_debugctlmsr();
  
+       vmx_arm_hv_timer(vcpu);
        vmx->__launched = vmx->loaded_vmcs->launched;
        asm(
                /* Store host registers */
@@@ -8831,22 -9033,6 +9011,22 @@@ static void vmx_load_vmcs01(struct kvm_
        put_cpu();
  }
  
 +/*
 + * Ensure that the current vmcs of the logical processor is the
 + * vmcs01 of the vcpu before calling free_nested().
 + */
 +static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
 +{
 +       struct vcpu_vmx *vmx = to_vmx(vcpu);
 +       int r;
 +
 +       r = vcpu_load(vcpu);
 +       BUG_ON(r);
 +       vmx_load_vmcs01(vcpu);
 +       free_nested(vmx);
 +       vcpu_put(vcpu);
 +}
 +
  static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
                vmx_destroy_pml_buffer(vmx);
        free_vpid(vmx->vpid);
        leave_guest_mode(vcpu);
 -      vmx_load_vmcs01(vcpu);
 -      free_nested(vmx);
 +      vmx_free_vcpu_nested(vcpu);
        free_loaded_vmcs(vmx->loaded_vmcs);
        kfree(vmx->guest_msrs);
        kvm_vcpu_uninit(vcpu);
@@@ -8877,26 -9064,14 +9057,26 @@@ static struct kvm_vcpu *vmx_create_vcpu
        if (err)
                goto free_vcpu;
  
 +      err = -ENOMEM;
 +
 +      /*
 +       * If PML is turned on, a failure to enable PML just results in a
 +       * failure to create the vcpu, so we can simplify the PML logic (by
 +       * avoiding cases such as enabling PML on only some of the guest's
 +       * vcpus, etc.).
 +       */
 +      if (enable_pml) {
 +              vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
 +              if (!vmx->pml_pg)
 +                      goto uninit_vcpu;
 +      }
 +
        vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
        BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
                     > PAGE_SIZE);
  
 -      err = -ENOMEM;
 -      if (!vmx->guest_msrs) {
 -              goto uninit_vcpu;
 -      }
 +      if (!vmx->guest_msrs)
 +              goto free_pml;
  
        vmx->loaded_vmcs = &vmx->vmcs01;
        vmx->loaded_vmcs->vmcs = alloc_vmcs();
        vmx->nested.current_vmptr = -1ull;
        vmx->nested.current_vmcs12 = NULL;
  
 -      /*
 -       * If PML is turned on, failure on enabling PML just results in failure
 -       * of creating the vcpu, therefore we can simplify PML logic (by
 -       * avoiding dealing with cases, such as enabling PML partially on vcpus
 -       * for the guest, etc.
 -       */
 -      if (enable_pml) {
 -              err = vmx_create_pml_buffer(vmx);
 -              if (err)
 -                      goto free_vmcs;
 -      }
 -
+       vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
        return &vmx->vcpu;
  
  free_vmcs:
        free_loaded_vmcs(vmx->loaded_vmcs);
  free_msrs:
        kfree(vmx->guest_msrs);
 +free_pml:
 +      vmx_destroy_pml_buffer(vmx);
  uninit_vcpu:
        kvm_vcpu_uninit(&vmx->vcpu);
  free_vcpu:
@@@ -9080,6 -9267,22 +9262,13 @@@ static void vmx_cpuid_update(struct kvm
  
        if (cpu_has_secondary_exec_ctrls())
                vmcs_set_secondary_exec_control(secondary_exec_ctl);
 -      if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) {
 -              if (guest_cpuid_has_pcommit(vcpu))
 -                      vmx->nested.nested_vmx_secondary_ctls_high |=
 -                              SECONDARY_EXEC_PCOMMIT;
 -              else
 -                      vmx->nested.nested_vmx_secondary_ctls_high &=
 -                              ~SECONDARY_EXEC_PCOMMIT;
 -      }
 -
+       if (nested_vmx_allowed(vcpu))
+               to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+                       FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+       else
+               to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+                       ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
  }
  
  static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@@ -9636,9 -9839,14 +9825,14 @@@ static void prepare_vmcs02(struct kvm_v
        vmcs_write64(VMCS_LINK_POINTER, -1ull);
  
        exec_control = vmcs12->pin_based_vm_exec_control;
-       exec_control |= vmcs_config.pin_based_exec_ctrl;
+       /* Preemption timer setting is only taken from vmcs01.  */
        exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       exec_control |= vmcs_config.pin_based_exec_ctrl;
+       if (vmx->hv_deadline_tsc == -1)
+               exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
  
+       /* Posted interrupts setting is only taken from vmcs12.  */
        if (nested_cpu_has_posted_intr(vmcs12)) {
                /*
                 * Note that we use L0's vector here and in
                exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                                  SECONDARY_EXEC_RDTSCP |
                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 -                                SECONDARY_EXEC_APIC_REGISTER_VIRT |
 -                                SECONDARY_EXEC_PCOMMIT);
 +                                SECONDARY_EXEC_APIC_REGISTER_VIRT);
                if (nested_cpu_has(vmcs12,
                                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
                        exec_control |= vmcs12->secondary_vm_exec_control;
@@@ -10556,8 -10765,8 +10750,8 @@@ static void nested_vmx_vmexit(struct kv
                                       vmcs12->vm_exit_intr_error_code,
                                       KVM_ISA_VMX);
  
-       vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
-       vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
+       vm_entry_controls_reset_shadow(vmx);
+       vm_exit_controls_reset_shadow(vmx);
        vmx_segment_cache_clear(vmx);
  
        /* if no vmcs02 cache requested, remove the one we used */
  
        load_vmcs12_host_state(vcpu, vmcs12);
  
-       /* Update TSC_OFFSET if TSC was changed while L2 ran */
+       /* Update any VMCS fields that might have changed while L2 ran */
        vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+       if (vmx->hv_deadline_tsc == -1)
+               vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+                               PIN_BASED_VMX_PREEMPTION_TIMER);
+       else
+               vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+                             PIN_BASED_VMX_PREEMPTION_TIMER);
  
        /* This is needed for same reason as it was needed in prepare_vmcs02 */
        vmx->host_rsp = 0;
@@@ -10647,6 -10862,64 +10847,64 @@@ static int vmx_check_intercept(struct k
        return X86EMUL_CONTINUE;
  }
  
+ #ifdef CONFIG_X86_64
+ /* (a << shift) / divisor, return 1 if overflow otherwise 0 */
+ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
+                                 u64 divisor, u64 *result)
+ {
+       u64 low = a << shift, high = a >> (64 - shift);
+       /* Avoid overflowing divq: the quotient must fit in 64 bits */
+       if (high >= divisor)
+               return 1;
+       /* low holds the quotient, high holds the remainder (discarded) */
+       asm("divq %2\n\t" : "=a" (low), "=d" (high) :
+           "rm" (divisor), "0" (low), "1" (high));
+       *result = low;
+       return 0;
+ }
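
u64_shl_div_u64() above divides a 128-bit intermediate (a << shift) by a
64-bit divisor using divq on the high:low pair, and the high >= divisor
bail-out is precisely the condition under which divq would raise a divide
error because the quotient would not fit in 64 bits. A portable cross-check
using GCC/Clang's unsigned __int128 (an assumption: a compiler with 128-bit
integer support).

#include <stdint.h>
#include <stdio.h>

/* Same contract as u64_shl_div_u64(): 1 on overflow, else 0 with the
 * quotient stored in *result. */
static int shl_div_u64(uint64_t a, unsigned int shift, uint64_t divisor,
                       uint64_t *result)
{
        unsigned __int128 dividend = (unsigned __int128)a << shift;

        if ((uint64_t)(dividend >> 64) >= divisor)
                return 1;       /* quotient would not fit in 64 bits */
        *result = (uint64_t)(dividend / divisor);
        return 0;
}

int main(void)
{
        uint64_t q;

        if (!shl_div_u64(3, 48, 1000, &q))
                printf("(3 << 48) / 1000 = %llu\n", (unsigned long long)q);
        printf("overflow: %d\n", shl_div_u64(1, 64, 1, &q)); /* prints 1 */
        return 0;
}
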
+ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 tscl = rdtsc();
+       u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+       u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+       /* Convert to host delta tsc if tsc scaling is enabled */
+       if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+                       u64_shl_div_u64(delta_tsc,
+                               kvm_tsc_scaling_ratio_frac_bits,
+                               vcpu->arch.tsc_scaling_ratio,
+                               &delta_tsc))
+               return -ERANGE;
+       /*
+        * If the delta tsc doesn't fit in 32 bits after the preemption timer
+        * rate shift, we can't use the preemption timer at all.  It might fit
+        * on later vmentries, but checking on every vmentry is costly, so we
+        * just fall back to an hrtimer.
+        */
+       if (delta_tsc >> (cpu_preemption_timer_multi + 32))
+               return -ERANGE;
+       vmx->hv_deadline_tsc = tscl + delta_tsc;
+       vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+                       PIN_BASED_VMX_PREEMPTION_TIMER);
+       return 0;
+ }
+ static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       vmx->hv_deadline_tsc = -1;
+       vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+                       PIN_BASED_VMX_PREEMPTION_TIMER);
+ }
+ #endif
  static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
  {
        if (ple_gap)
@@@ -10691,7 -10964,7 +10949,7 @@@ static void vmx_enable_log_dirty_pt_mas
   *   this case, return 1, otherwise, return 0.
   *
   */
- static int vmx_pre_block(struct kvm_vcpu *vcpu)
+ static int pi_pre_block(struct kvm_vcpu *vcpu)
  {
        unsigned long flags;
        unsigned int dest;
        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
  
        if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
 -              !irq_remapping_cap(IRQ_POSTING_CAP))
 +              !irq_remapping_cap(IRQ_POSTING_CAP)  ||
 +              !kvm_vcpu_apicv_active(vcpu))
                return 0;
  
        vcpu->pre_pcpu = vcpu->cpu;
        return 0;
  }
  
- static void vmx_post_block(struct kvm_vcpu *vcpu)
+ static int vmx_pre_block(struct kvm_vcpu *vcpu)
+ {
+       if (pi_pre_block(vcpu))
+               return 1;
+       if (kvm_lapic_hv_timer_in_use(vcpu))
+               kvm_lapic_switch_to_sw_timer(vcpu);
+       return 0;
+ }
+ static void pi_post_block(struct kvm_vcpu *vcpu)
  {
        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
        struct pi_desc old, new;
        unsigned long flags;
  
        if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
 -              !irq_remapping_cap(IRQ_POSTING_CAP))
 +              !irq_remapping_cap(IRQ_POSTING_CAP)  ||
 +              !kvm_vcpu_apicv_active(vcpu))
                return;
  
        do {
        }
  }
  
+ static void vmx_post_block(struct kvm_vcpu *vcpu)
+ {
+       if (kvm_x86_ops->set_hv_timer)
+               kvm_lapic_switch_to_hv_timer(vcpu);
+       pi_post_block(vcpu);
+ }
  /*
   * vmx_update_pi_irte - set IRTE for Posted-Interrupts
   *
@@@ -10820,8 -11110,7 +11097,8 @@@ static int vmx_update_pi_irte(struct kv
        int idx, ret = -EINVAL;
  
        if (!kvm_arch_has_assigned_device(kvm) ||
 -              !irq_remapping_cap(IRQ_POSTING_CAP))
 +              !irq_remapping_cap(IRQ_POSTING_CAP) ||
 +              !kvm_vcpu_apicv_active(kvm->vcpus[0]))
                return 0;
  
        idx = srcu_read_lock(&kvm->irq_srcu);
                 * We will support full lowest-priority interrupt later.
                 */
  
-               kvm_set_msi_irq(e, &irq);
+               kvm_set_msi_irq(kvm, e, &irq);
                if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
                        /*
                         * Make sure the IRTE is in remapped mode if
        return ret;
  }
  
+ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+ {
+       if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+               to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+                       FEATURE_CONTROL_LMCE;
+       else
+               to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+                       ~FEATURE_CONTROL_LMCE;
+ }
  static struct kvm_x86_ops vmx_x86_ops = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
        .pmu_ops = &intel_pmu_ops,
  
        .update_pi_irte = vmx_update_pi_irte,
+ #ifdef CONFIG_X86_64
+       .set_hv_timer = vmx_set_hv_timer,
+       .cancel_hv_timer = vmx_cancel_hv_timer,
+ #endif
+       .setup_mce = vmx_setup_mce,
  };
  
  static int __init vmx_init(void)
diff --combined arch/x86/kvm/x86.c
@@@ -36,8 -36,7 +36,8 @@@
  #include <linux/kvm.h>
  #include <linux/fs.h>
  #include <linux/vmalloc.h>
 -#include <linux/module.h>
 +#include <linux/export.h>
 +#include <linux/moduleparam.h>
  #include <linux/mman.h>
  #include <linux/highmem.h>
  #include <linux/iommu.h>
@@@ -56,6 -55,9 +56,6 @@@
  #include <linux/irqbypass.h>
  #include <trace/events/kvm.h>
  
 -#define CREATE_TRACE_POINTS
 -#include "trace.h"
 -
  #include <asm/debugreg.h>
  #include <asm/msr.h>
  #include <asm/desc.h>
  #include <asm/div64.h>
  #include <asm/irq_remapping.h>
  
 +#define CREATE_TRACE_POINTS
 +#include "trace.h"
 +
  #define MAX_IO_MSRS 256
  #define KVM_MAX_MCE_BANKS 32
- #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
+ u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
+ EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
  
  #define emul_to_vcpu(ctxt) \
        container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
@@@ -90,8 -90,12 +91,12 @@@ static u64 __read_mostly efer_reserved_
  #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
  #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
  
+ #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
+                                     KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
  static void update_cr8_intercept(struct kvm_vcpu *vcpu);
  static void process_nmi(struct kvm_vcpu *vcpu);
+ static void enter_smm(struct kvm_vcpu *vcpu);
  static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
  
  struct kvm_x86_ops *kvm_x86_ops __read_mostly;
@@@ -114,7 -118,8 +119,8 @@@ u8   __read_mostly kvm_tsc_scaling_rati
  EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
  u64  __read_mostly kvm_max_tsc_scaling_ratio;
  EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
- static u64 __read_mostly kvm_default_tsc_scaling_ratio;
+ u64 __read_mostly kvm_default_tsc_scaling_ratio;
+ EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
  
  /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
  static u32 __read_mostly tsc_tolerance_ppm = 250;
@@@ -538,7 -543,7 +544,7 @@@ int load_pdptrs(struct kvm_vcpu *vcpu, 
                goto out;
        }
        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
-               if (is_present_gpte(pdpte[i]) &&
+               if ((pdpte[i] & PT_PRESENT_MASK) &&
                    (pdpte[i] &
                     vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
                        ret = 0;
@@@ -983,6 -988,7 +989,7 @@@ static u32 emulated_msrs[] = 
        MSR_IA32_MISC_ENABLE,
        MSR_IA32_MCG_STATUS,
        MSR_IA32_MCG_CTL,
+       MSR_IA32_MCG_EXT_CTL,
        MSR_IA32_SMBASE,
  };
  
@@@ -1162,7 -1168,7 +1169,7 @@@ static void kvm_write_wall_clock(struc
        int version;
        int r;
        struct pvclock_wall_clock wc;
-       struct timespec boot;
+       struct timespec64 boot;
  
        if (!wall_clock)
                return;
         * wall clock specified here.  guest system time equals host
         * system time for us, thus we must fill in host boot time here.
         */
-       getboottime(&boot);
+       getboottime64(&boot);
  
        if (kvm->arch.kvmclock_offset) {
-               struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
-               boot = timespec_sub(boot, ts);
+               struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
+               boot = timespec64_sub(boot, ts);
        }
-       wc.sec = boot.tv_sec;
+       wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
        wc.nsec = boot.tv_nsec;
        wc.version = version;
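
The switch to timespec64 above keeps the host-side arithmetic 2038-safe, but
the pvclock wall-clock structure still carries the boot time as a 32-bit
second count, which is what the new "overflow in 2106 guest time" note refers
to: an unsigned 32-bit seconds-since-1970 counter wraps early in 2106. A
trivial arithmetic check, with no kernel interfaces involved.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t wrap  = (uint64_t)UINT32_MAX + 1;        /* 2^32 seconds */
        uint64_t years = wrap / (365ULL * 24 * 60 * 60);  /* ignores leap days */

        printf("u32 seconds wrap ~%llu years after 1970, around year %llu\n",
               (unsigned long long)years, (unsigned long long)(1970 + years));
        return 0;
}
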
  
@@@ -1245,6 -1251,12 +1252,6 @@@ static atomic_t kvm_guest_has_master_cl
  static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
  static unsigned long max_tsc_khz;
  
 -static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 -{
 -      return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
 -                                 vcpu->arch.virtual_tsc_shift);
 -}
 -
  static u32 adjust_tsc_khz(u32 khz, s32 ppm)
  {
        u64 v = (u64)khz * (1000000 + ppm);
@@@ -2616,6 -2628,9 +2623,9 @@@ int kvm_vm_ioctl_check_extension(struc
        case KVM_CAP_TSC_CONTROL:
                r = kvm_has_tsc_control;
                break;
+       case KVM_CAP_X2APIC_API:
+               r = KVM_X2APIC_API_VALID_FLAGS;
+               break;
        default:
                r = 0;
                break;
@@@ -2678,11 -2693,9 +2688,9 @@@ long kvm_arch_dev_ioctl(struct file *fi
                break;
        }
        case KVM_X86_GET_MCE_CAP_SUPPORTED: {
-               u64 mce_cap;
-               mce_cap = KVM_MCE_CAP_SUPPORTED;
                r = -EFAULT;
-               if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+               if (copy_to_user(argp, &kvm_mce_cap_supported,
+                                sizeof(kvm_mce_cap_supported)))
                        goto out;
                r = 0;
                break;
@@@ -2734,6 -2747,11 +2742,11 @@@ void kvm_arch_vcpu_load(struct kvm_vcp
                                rdtsc() - vcpu->arch.last_host_tsc;
                if (tsc_delta < 0)
                        mark_tsc_unstable("KVM discovered backwards TSC");
+               if (kvm_lapic_hv_timer_in_use(vcpu) &&
+                               kvm_x86_ops->set_hv_timer(vcpu,
+                                       kvm_get_lapic_tscdeadline_msr(vcpu)))
+                       kvm_lapic_switch_to_sw_timer(vcpu);
                if (check_tsc_unstable()) {
                        u64 offset = kvm_compute_tsc_offset(vcpu,
                                                vcpu->arch.last_guest_tsc);
@@@ -2767,15 -2785,17 +2780,17 @@@ static int kvm_vcpu_ioctl_get_lapic(str
        if (vcpu->arch.apicv_active)
                kvm_x86_ops->sync_pir_to_irr(vcpu);
  
-       memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
-       return 0;
+       return kvm_apic_get_state(vcpu, s);
  }
  
  static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
                                    struct kvm_lapic_state *s)
  {
-       kvm_apic_post_state_restore(vcpu, s);
+       int r;
+       r = kvm_apic_set_state(vcpu, s);
+       if (r)
+               return r;
        update_cr8_intercept(vcpu);
  
        return 0;
@@@ -2860,7 -2880,7 +2875,7 @@@ static int kvm_vcpu_ioctl_x86_setup_mce
        r = -EINVAL;
        if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
                goto out;
-       if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+       if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
                goto out;
        r = 0;
        vcpu->arch.mcg_cap = mcg_cap;
        /* Init IA32_MCi_CTL to all 1s */
        for (bank = 0; bank < bank_num; bank++)
                vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+       if (kvm_x86_ops->setup_mce)
+               kvm_x86_ops->setup_mce(vcpu);
  out:
        return r;
  }
@@@ -3768,7 -3791,7 +3786,7 @@@ static int kvm_vm_ioctl_enable_cap(stru
                r = -EEXIST;
                if (irqchip_in_kernel(kvm))
                        goto split_irqchip_unlock;
-               if (atomic_read(&kvm->online_vcpus))
+               if (kvm->created_vcpus)
                        goto split_irqchip_unlock;
                r = kvm_setup_empty_irq_routing(kvm);
                if (r)
@@@ -3782,6 -3805,18 +3800,18 @@@ split_irqchip_unlock
                mutex_unlock(&kvm->lock);
                break;
        }
+       case KVM_CAP_X2APIC_API:
+               r = -EINVAL;
+               if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
+                       break;
+               if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
+                       kvm->arch.x2apic_format = true;
+               if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+                       kvm->arch.x2apic_broadcast_quirk_disabled = true;
+               r = 0;
+               break;
        default:
                r = -EINVAL;
                break;
@@@ -3833,7 -3868,7 +3863,7 @@@ long kvm_arch_vm_ioctl(struct file *fil
                if (kvm->arch.vpic)
                        goto create_irqchip_unlock;
                r = -EINVAL;
-               if (atomic_read(&kvm->online_vcpus))
+               if (kvm->created_vcpus)
                        goto create_irqchip_unlock;
                r = -ENOMEM;
                vpic = kvm_create_pic(kvm);
                                   sizeof(struct kvm_pit_config)))
                        goto out;
        create_pit:
-               mutex_lock(&kvm->slots_lock);
+               mutex_lock(&kvm->lock);
                r = -EEXIST;
                if (kvm->arch.vpit)
                        goto create_pit_unlock;
                if (kvm->arch.vpit)
                        r = 0;
        create_pit_unlock:
-               mutex_unlock(&kvm->slots_lock);
+               mutex_unlock(&kvm->lock);
                break;
        case KVM_GET_IRQCHIP: {
                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
        case KVM_SET_BOOT_CPU_ID:
                r = 0;
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus) != 0)
+               if (kvm->created_vcpus)
                        r = -EBUSY;
                else
                        kvm->arch.bsp_vcpu_id = arg;
@@@ -5297,13 -5332,8 +5327,8 @@@ static void kvm_smm_changed(struct kvm_
                /* This is a good place to trace that we are exiting SMM.  */
                trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
  
-               if (unlikely(vcpu->arch.smi_pending)) {
-                       kvm_make_request(KVM_REQ_SMI, vcpu);
-                       vcpu->arch.smi_pending = 0;
-               } else {
-                       /* Process a latched INIT, if any.  */
-                       kvm_make_request(KVM_REQ_EVENT, vcpu);
-               }
+               /* Process a latched INIT or SMI, if any.  */
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
        }
  
        kvm_mmu_reset_context(vcpu);
@@@ -5553,10 -5583,9 +5578,10 @@@ int kvm_fast_pio_out(struct kvm_vcpu *v
  }
  EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
  
 -static void tsc_bad(void *info)
 +static int kvmclock_cpu_down_prep(unsigned int cpu)
  {
        __this_cpu_write(cpu_tsc_khz, 0);
 +      return 0;
  }
  
  static void tsc_khz_changed(void *data)
@@@ -5661,18 -5690,35 +5686,18 @@@ static struct notifier_block kvmclock_c
        .notifier_call  = kvmclock_cpufreq_notifier
  };
  
 -static int kvmclock_cpu_notifier(struct notifier_block *nfb,
 -                                      unsigned long action, void *hcpu)
 +static int kvmclock_cpu_online(unsigned int cpu)
  {
 -      unsigned int cpu = (unsigned long)hcpu;
 -
 -      switch (action) {
 -              case CPU_ONLINE:
 -              case CPU_DOWN_FAILED:
 -                      smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
 -                      break;
 -              case CPU_DOWN_PREPARE:
 -                      smp_call_function_single(cpu, tsc_bad, NULL, 1);
 -                      break;
 -      }
 -      return NOTIFY_OK;
 +      tsc_khz_changed(NULL);
 +      return 0;
  }
  
 -static struct notifier_block kvmclock_cpu_notifier_block = {
 -      .notifier_call  = kvmclock_cpu_notifier,
 -      .priority = -INT_MAX
 -};
 -
  static void kvm_timer_init(void)
  {
        int cpu;
  
        max_tsc_khz = tsc_khz;
  
 -      cpu_notifier_register_begin();
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
  #ifdef CONFIG_CPU_FREQ
                struct cpufreq_policy policy;
                                          CPUFREQ_TRANSITION_NOTIFIER);
        }
        pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
 -      for_each_online_cpu(cpu)
 -              smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
 -
 -      __register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
 -      cpu_notifier_register_done();
  
 +      cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "AP_X86_KVM_CLK_ONLINE",
 +                        kvmclock_cpu_online, kvmclock_cpu_down_prep);
  }
  
  static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
@@@ -5849,8 -5898,8 +5874,8 @@@ int kvm_arch_init(void *opaque
        kvm_x86_ops = ops;
  
        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
-                       PT_DIRTY_MASK, PT64_NX_MASK, 0);
+                       PT_DIRTY_MASK, PT64_NX_MASK, 0,
+                       PT_PRESENT_MASK);
        kvm_timer_init();
  
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@@ -5878,7 -5927,7 +5903,7 @@@ void kvm_arch_exit(void
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
                                            CPUFREQ_TRANSITION_NOTIFIER);
 -      unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
 +      cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
  #ifdef CONFIG_X86_64
        pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
  #endif
@@@ -6084,7 -6133,10 +6109,10 @@@ static int inject_pending_event(struct 
        }
  
        /* try to inject new event if pending */
-       if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
+       if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+               vcpu->arch.smi_pending = false;
+               enter_smm(vcpu);
+       } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
                --vcpu->arch.nmi_pending;
                vcpu->arch.nmi_injected = true;
                kvm_x86_ops->set_nmi(vcpu);
                        kvm_x86_ops->set_irq(vcpu);
                }
        }
        return 0;
  }
  
@@@ -6130,7 -6183,7 +6159,7 @@@ static void process_nmi(struct kvm_vcp
  #define put_smstate(type, buf, offset, val)                     \
        *(type *)((buf) + (offset) - 0x7e00) = val
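
put_smstate() above stores one field of the SMM state-save map into a 512-byte
scratch buffer: the architectural offsets (0x7exx/0x7fxx, relative to
SMBASE + 0x8000) are rebased by subtracting 0x7e00, and the buffer is later
copied to guest memory at smbase + 0xfe00. A self-contained illustration of
that offset arithmetic; the 0x7ef8 slot (smbase) is the one used by the 32-bit
save code below, and the value written is arbitrary.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same shape as the kernel macro: rebase the architectural offset onto a
 * 512-byte buffer covering 0x7e00-0x7fff of the state-save map. */
#define put_smstate(type, buf, offset, val) \
        (*(type *)((buf) + (offset) - 0x7e00) = (val))

int main(void)
{
        unsigned char buf[512] = { 0 };
        uint32_t smbase;

        put_smstate(uint32_t, buf, 0x7ef8, 0x30000u);   /* smbase slot */

        memcpy(&smbase, buf + 0x7ef8 - 0x7e00, sizeof(smbase));
        printf("smbase %#x stored at buffer offset 0xf8\n", smbase);
        return 0;
}
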
  
- static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
+ static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
  {
        u32 flags = 0;
        flags |= seg->g       << 23;
        return flags;
  }
  
- static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
+ static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
  {
        struct kvm_segment seg;
        int offset;
  
        put_smstate(u32, buf, offset + 8, seg.base);
        put_smstate(u32, buf, offset + 4, seg.limit);
-       put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
+       put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
  }
  
  #ifdef CONFIG_X86_64
- static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
+ static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
  {
        struct kvm_segment seg;
        int offset;
        kvm_get_segment(vcpu, &seg, n);
        offset = 0x7e00 + n * 16;
  
-       flags = process_smi_get_segment_flags(&seg) >> 8;
+       flags = enter_smm_get_segment_flags(&seg) >> 8;
        put_smstate(u16, buf, offset, seg.selector);
        put_smstate(u16, buf, offset + 2, flags);
        put_smstate(u32, buf, offset + 4, seg.limit);
  }
  #endif
  
- static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
+ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
  {
        struct desc_ptr dt;
        struct kvm_segment seg;
        put_smstate(u32, buf, 0x7fc4, seg.selector);
        put_smstate(u32, buf, 0x7f64, seg.base);
        put_smstate(u32, buf, 0x7f60, seg.limit);
-       put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg));
+       put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
  
        kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
        put_smstate(u32, buf, 0x7fc0, seg.selector);
        put_smstate(u32, buf, 0x7f80, seg.base);
        put_smstate(u32, buf, 0x7f7c, seg.limit);
-       put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg));
+       put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
  
        kvm_x86_ops->get_gdt(vcpu, &dt);
        put_smstate(u32, buf, 0x7f74, dt.address);
        put_smstate(u32, buf, 0x7f54, dt.size);
  
        for (i = 0; i < 6; i++)
-               process_smi_save_seg_32(vcpu, buf, i);
+               enter_smm_save_seg_32(vcpu, buf, i);
  
        put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
  
        put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
  }
  
- static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
+ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
  {
  #ifdef CONFIG_X86_64
        struct desc_ptr dt;
  
        kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
        put_smstate(u16, buf, 0x7e90, seg.selector);
-       put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8);
+       put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
        put_smstate(u32, buf, 0x7e94, seg.limit);
        put_smstate(u64, buf, 0x7e98, seg.base);
  
  
        kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
        put_smstate(u16, buf, 0x7e70, seg.selector);
-       put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8);
+       put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
        put_smstate(u32, buf, 0x7e74, seg.limit);
        put_smstate(u64, buf, 0x7e78, seg.base);
  
        put_smstate(u64, buf, 0x7e68, dt.address);
  
        for (i = 0; i < 6; i++)
-               process_smi_save_seg_64(vcpu, buf, i);
+               enter_smm_save_seg_64(vcpu, buf, i);
  #else
        WARN_ON_ONCE(1);
  #endif
  }
  
- static void process_smi(struct kvm_vcpu *vcpu)
+ static void enter_smm(struct kvm_vcpu *vcpu)
  {
        struct kvm_segment cs, ds;
        struct desc_ptr dt;
        char buf[512];
        u32 cr0;
  
-       if (is_smm(vcpu)) {
-               vcpu->arch.smi_pending = true;
-               return;
-       }
        trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
        vcpu->arch.hflags |= HF_SMM_MASK;
        memset(buf, 0, 512);
        if (guest_cpuid_has_longmode(vcpu))
-               process_smi_save_state_64(vcpu, buf);
+               enter_smm_save_state_64(vcpu, buf);
        else
-               process_smi_save_state_32(vcpu, buf);
+               enter_smm_save_state_32(vcpu, buf);
  
        kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
  
        kvm_mmu_reset_context(vcpu);
  }
  
+ static void process_smi(struct kvm_vcpu *vcpu)
+ {
+       vcpu->arch.smi_pending = true;
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
  void kvm_make_scan_ioapic_request(struct kvm *kvm)
  {
        kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
@@@ -6555,8 -6609,18 +6585,18 @@@ static int vcpu_enter_guest(struct kvm_
  
                if (inject_pending_event(vcpu, req_int_win) != 0)
                        req_immediate_exit = true;
-               /* enable NMI/IRQ window open exits if needed */
                else {
+                       /* Enable NMI/IRQ window open exits if needed.
+                        *
+                        * SMIs have two cases: 1) they can be nested, and
+                        * then there is nothing to do here because RSM will
+                        * cause a vmexit anyway; 2) or the SMI can be pending
+                        * because inject_pending_event has completed the
+                        * injection of an IRQ or NMI from the previous vmexit,
+                        * and then we request an immediate exit to inject the SMI.
+                        */
+                       if (vcpu->arch.smi_pending && !is_smm(vcpu))
+                               req_immediate_exit = true;
                        if (vcpu->arch.nmi_pending)
                                kvm_x86_ops->enable_nmi_window(vcpu);
                        if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
  
        kvm_load_guest_xcr0(vcpu);
  
-       if (req_immediate_exit)
+       if (req_immediate_exit) {
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
                smp_send_reschedule(vcpu->cpu);
+       }
  
        trace_kvm_entry(vcpu->vcpu_id);
        wait_lapic_expire(vcpu);
-       __kvm_guest_enter();
+       guest_enter_irqoff();
  
        if (unlikely(vcpu->arch.switch_db_regs)) {
                set_debugreg(0, 7);
  
        ++vcpu->stat.exits;
  
-       /*
-        * We must have an instruction between local_irq_enable() and
-        * kvm_guest_exit(), so the timer interrupt isn't delayed by
-        * the interrupt shadow.  The stat.exits increment will do nicely.
-        * But we need to prevent reordering, hence this barrier():
-        */
-       barrier();
-       kvm_guest_exit();
+       guest_exit_irqoff();
  
+       local_irq_enable();
        preempt_enable();
  
        vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
@@@ -7409,6 -7468,7 +7444,7 @@@ void kvm_vcpu_reset(struct kvm_vcpu *vc
  {
        vcpu->arch.hflags = 0;
  
+       vcpu->arch.smi_pending = 0;
        atomic_set(&vcpu->arch.nmi_queued, 0);
        vcpu->arch.nmi_pending = 0;
        vcpu->arch.nmi_injected = false;
@@@ -7601,11 -7661,6 +7637,6 @@@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *v
        return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
  }
  
- bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
- {
-       return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
- }
  struct static_key kvm_no_apic_vcpu __read_mostly;
  EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
  
@@@ -7872,7 -7927,7 +7903,7 @@@ void kvm_arch_destroy_vm(struct kvm *kv
        kfree(kvm->arch.vpic);
        kfree(kvm->arch.vioapic);
        kvm_free_vcpus(kvm);
-       kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+       kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
        kvm_mmu_uninit_vm(kvm);
  }
  
@@@ -8380,7 -8435,7 +8411,7 @@@ void kvm_arch_irq_bypass_del_producer(s
        /*
         * When the producer of a consumer is unregistered, we change back to
         * remapped mode, so we can re-use the current implementation
-        * when the irq is masked/disabed or the consumer side (KVM
+        * when the irq is masked/disabled or the consumer side (KVM
         * in this case) doesn't want to receive the interrupts.
        */
        ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
@@@ -31,19 -31,6 +31,19 @@@ static inline void user_exit(void
                context_tracking_exit(CONTEXT_USER);
  }
  
 +/* Called with interrupts disabled.  */
 +static inline void user_enter_irqoff(void)
 +{
 +      if (context_tracking_is_enabled())
 +              __context_tracking_enter(CONTEXT_USER);
 +}
 +static inline void user_exit_irqoff(void)
 +{
 +      if (context_tracking_is_enabled())
 +              __context_tracking_exit(CONTEXT_USER);
 +}
 +
  static inline enum ctx_state exception_enter(void)
  {
        enum ctx_state prev_ctx;
@@@ -82,8 -69,6 +82,8 @@@ static inline enum ctx_state ct_state(v
  #else
  static inline void user_enter(void) { }
  static inline void user_exit(void) { }
 +static inline void user_enter_irqoff(void) { }
 +static inline void user_exit_irqoff(void) { }
  static inline enum ctx_state exception_enter(void) { return 0; }
  static inline void exception_exit(enum ctx_state prev_ctx) { }
  static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
@@@ -99,7 -84,8 +99,8 @@@ static inline void context_tracking_ini
  
  
  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- static inline void guest_enter(void)
+ /* must be called with irqs disabled */
+ static inline void guest_enter_irqoff(void)
  {
        if (vtime_accounting_cpu_enabled())
                vtime_guest_enter(current);
  
        if (context_tracking_is_enabled())
                __context_tracking_enter(CONTEXT_GUEST);
+       /* KVM does not hold any references to RCU-protected data when it
+        * switches the CPU into guest mode.  In fact, switching to guest mode
+        * is very similar to exiting to userspace from RCU's point of view.
+        * In addition, the CPU may stay in guest mode for quite a long time
+        * (up to one time slice).  Let's treat guest mode as a quiescent
+        * state, just like we do with user-mode execution.
+        */
+       if (!context_tracking_cpu_is_enabled())
+               rcu_virt_note_context_switch(smp_processor_id());
  }
  
- static inline void guest_exit(void)
+ static inline void guest_exit_irqoff(void)
  {
        if (context_tracking_is_enabled())
                __context_tracking_exit(CONTEXT_GUEST);
  }
  
  #else
- static inline void guest_enter(void)
+ static inline void guest_enter_irqoff(void)
  {
        /*
         * This is running in ioctl context so its safe
         */
        vtime_account_system(current);
        current->flags |= PF_VCPU;
+       rcu_virt_note_context_switch(smp_processor_id());
  }
  
- static inline void guest_exit(void)
+ static inline void guest_exit_irqoff(void)
  {
        /* Flush the guest cputime we spent on the guest */
        vtime_account_system(current);
  }
  #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
  
+ static inline void guest_enter(void)
+ {
+       unsigned long flags;
+       local_irq_save(flags);
+       guest_enter_irqoff();
+       local_irq_restore(flags);
+ }
+ static inline void guest_exit(void)
+ {
+       unsigned long flags;
+       local_irq_save(flags);
+       guest_exit_irqoff();
+       local_irq_restore(flags);
+ }
  #endif
  #define GICR_WAKER_ProcessorSleep     (1U << 1)
  #define GICR_WAKER_ChildrenAsleep     (1U << 2)
  
- #define GICR_PROPBASER_NonShareable   (0U << 10)
- #define GICR_PROPBASER_InnerShareable (1U << 10)
- #define GICR_PROPBASER_OuterShareable (2U << 10)
- #define GICR_PROPBASER_SHAREABILITY_MASK (3UL << 10)
- #define GICR_PROPBASER_nCnB           (0U << 7)
- #define GICR_PROPBASER_nC             (1U << 7)
- #define GICR_PROPBASER_RaWt           (2U << 7)
- #define GICR_PROPBASER_RaWb           (3U << 7)
- #define GICR_PROPBASER_WaWt           (4U << 7)
- #define GICR_PROPBASER_WaWb           (5U << 7)
- #define GICR_PROPBASER_RaWaWt         (6U << 7)
- #define GICR_PROPBASER_RaWaWb         (7U << 7)
- #define GICR_PROPBASER_CACHEABILITY_MASK (7U << 7)
- #define GICR_PROPBASER_IDBITS_MASK    (0x1f)
- #define GICR_PENDBASER_NonShareable   (0U << 10)
- #define GICR_PENDBASER_InnerShareable (1U << 10)
- #define GICR_PENDBASER_OuterShareable (2U << 10)
- #define GICR_PENDBASER_SHAREABILITY_MASK (3UL << 10)
- #define GICR_PENDBASER_nCnB           (0U << 7)
- #define GICR_PENDBASER_nC             (1U << 7)
- #define GICR_PENDBASER_RaWt           (2U << 7)
- #define GICR_PENDBASER_RaWb           (3U << 7)
- #define GICR_PENDBASER_WaWt           (4U << 7)
- #define GICR_PENDBASER_WaWb           (5U << 7)
- #define GICR_PENDBASER_RaWaWt         (6U << 7)
- #define GICR_PENDBASER_RaWaWb         (7U << 7)
- #define GICR_PENDBASER_CACHEABILITY_MASK (7U << 7)
+ #define GIC_BASER_CACHE_nCnB          0ULL
+ #define GIC_BASER_CACHE_SameAsInner   0ULL
+ #define GIC_BASER_CACHE_nC            1ULL
+ #define GIC_BASER_CACHE_RaWt          2ULL
+ #define GIC_BASER_CACHE_RaWb          3ULL
+ #define GIC_BASER_CACHE_WaWt          4ULL
+ #define GIC_BASER_CACHE_WaWb          5ULL
+ #define GIC_BASER_CACHE_RaWaWt                6ULL
+ #define GIC_BASER_CACHE_RaWaWb                7ULL
+ #define GIC_BASER_CACHE_MASK          7ULL
+ #define GIC_BASER_NonShareable                0ULL
+ #define GIC_BASER_InnerShareable      1ULL
+ #define GIC_BASER_OuterShareable      2ULL
+ #define GIC_BASER_SHAREABILITY_MASK   3ULL
+ #define GIC_BASER_CACHEABILITY(reg, inner_outer, type)                        \
+       (GIC_BASER_CACHE_##type << reg##_##inner_outer##_CACHEABILITY_SHIFT)
+ #define GIC_BASER_SHAREABILITY(reg, type)                             \
+       (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT)
+ #define GICR_PROPBASER_SHAREABILITY_SHIFT             (10)
+ #define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT               (7)
+ #define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT               (56)
+ #define GICR_PROPBASER_SHAREABILITY_MASK                              \
+       GIC_BASER_SHAREABILITY(GICR_PROPBASER, SHAREABILITY_MASK)
+ #define GICR_PROPBASER_INNER_CACHEABILITY_MASK                                \
+       GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, MASK)
+ #define GICR_PROPBASER_OUTER_CACHEABILITY_MASK                                \
+       GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, MASK)
+ #define GICR_PROPBASER_CACHEABILITY_MASK GICR_PROPBASER_INNER_CACHEABILITY_MASK
+ #define GICR_PROPBASER_InnerShareable                                 \
+       GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable)
+ #define GICR_PROPBASER_nCnB   GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nCnB)
+ #define GICR_PROPBASER_nC     GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC)
+ #define GICR_PROPBASER_RaWt   GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt)
+ #define GICR_PROPBASER_RaWb   GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb)
+ #define GICR_PROPBASER_WaWt   GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWt)
+ #define GICR_PROPBASER_WaWb   GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb)
+ #define GICR_PROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWt)
+ #define GICR_PROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb)
+ #define GICR_PROPBASER_IDBITS_MASK                    (0x1f)
+ #define GICR_PENDBASER_SHAREABILITY_SHIFT             (10)
+ #define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT               (7)
+ #define GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT               (56)
+ #define GICR_PENDBASER_SHAREABILITY_MASK                              \
+       GIC_BASER_SHAREABILITY(GICR_PENDBASER, SHAREABILITY_MASK)
+ #define GICR_PENDBASER_INNER_CACHEABILITY_MASK                                \
+       GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, MASK)
+ #define GICR_PENDBASER_OUTER_CACHEABILITY_MASK                                \
+       GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, MASK)
+ #define GICR_PENDBASER_CACHEABILITY_MASK GICR_PENDBASER_INNER_CACHEABILITY_MASK
+ #define GICR_PENDBASER_InnerShareable                                 \
+       GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)
+ #define GICR_PENDBASER_nCnB   GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nCnB)
+ #define GICR_PENDBASER_nC     GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC)
+ #define GICR_PENDBASER_RaWt   GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt)
+ #define GICR_PENDBASER_RaWb   GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb)
+ #define GICR_PENDBASER_WaWt   GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWt)
+ #define GICR_PENDBASER_WaWb   GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb)
+ #define GICR_PENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWt)
+ #define GICR_PENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWb)
+ #define GICR_PENDBASER_PTZ                            BIT_ULL(62)
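
The GIC_BASER_* helpers above compose a register value by shifting a named attribute to the field's own shift position. For illustration, a GICR_PROPBASER value built this way might look as follows (lpi_prop_pa and lpi_id_bits are placeholder parameters, not identifiers from this patch):

    /* Illustrative only: composing a GICR_PROPBASER value with the helpers
     * above.  lpi_prop_pa and lpi_id_bits are placeholders for driver-specific
     * values.
     */
    #include <linux/irqchip/arm-gic-v3.h>
    #include <linux/types.h>

    static u64 example_propbaser(phys_addr_t lpi_prop_pa, unsigned int lpi_id_bits)
    {
            return lpi_prop_pa |                            /* LPI property table base */
                   GICR_PROPBASER_InnerShareable |          /* expands to 1ULL << 10 */
                   GICR_PROPBASER_RaWaWb |                  /* expands to 7ULL << 7  */
                   ((lpi_id_bits - 1) & GICR_PROPBASER_IDBITS_MASK);
    }
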
  
  /*
   * Re-Distributor registers, offsets from SGI_base
  #define GITS_CWRITER                  0x0088
  #define GITS_CREADR                   0x0090
  #define GITS_BASER                    0x0100
+ #define GITS_IDREGS_BASE              0xffd0
+ #define GITS_PIDR0                    0xffe0
+ #define GITS_PIDR1                    0xffe4
  #define GITS_PIDR2                    GICR_PIDR2
+ #define GITS_PIDR4                    0xffd0
+ #define GITS_CIDR0                    0xfff0
+ #define GITS_CIDR1                    0xfff4
+ #define GITS_CIDR2                    0xfff8
+ #define GITS_CIDR3                    0xfffc
  
  #define GITS_TRANSLATER                       0x10040
  
  #define GITS_CTLR_ENABLE              (1U << 0)
  #define GITS_CTLR_QUIESCENT           (1U << 31)
  
+ #define GITS_TYPER_PLPIS              (1UL << 0)
+ #define GITS_TYPER_IDBITS_SHIFT               8
  #define GITS_TYPER_DEVBITS_SHIFT      13
  #define GITS_TYPER_DEVBITS(r)         ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1)
  #define GITS_TYPER_PTA                        (1UL << 19)
- #define GITS_CBASER_VALID             (1UL << 63)
- #define GITS_CBASER_nCnB              (0UL << 59)
- #define GITS_CBASER_nC                        (1UL << 59)
- #define GITS_CBASER_RaWt              (2UL << 59)
- #define GITS_CBASER_RaWb              (3UL << 59)
- #define GITS_CBASER_WaWt              (4UL << 59)
- #define GITS_CBASER_WaWb              (5UL << 59)
- #define GITS_CBASER_RaWaWt            (6UL << 59)
- #define GITS_CBASER_RaWaWb            (7UL << 59)
- #define GITS_CBASER_CACHEABILITY_MASK (7UL << 59)
- #define GITS_CBASER_NonShareable      (0UL << 10)
- #define GITS_CBASER_InnerShareable    (1UL << 10)
- #define GITS_CBASER_OuterShareable    (2UL << 10)
- #define GITS_CBASER_SHAREABILITY_MASK (3UL << 10)
+ #define GITS_TYPER_HWCOLLCNT_SHIFT    24
+ #define GITS_CBASER_VALID                     (1UL << 63)
+ #define GITS_CBASER_SHAREABILITY_SHIFT                (10)
+ #define GITS_CBASER_INNER_CACHEABILITY_SHIFT  (59)
+ #define GITS_CBASER_OUTER_CACHEABILITY_SHIFT  (53)
+ #define GITS_CBASER_SHAREABILITY_MASK                                 \
+       GIC_BASER_SHAREABILITY(GITS_CBASER, SHAREABILITY_MASK)
+ #define GITS_CBASER_INNER_CACHEABILITY_MASK                           \
+       GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, MASK)
+ #define GITS_CBASER_OUTER_CACHEABILITY_MASK                           \
+       GIC_BASER_CACHEABILITY(GITS_CBASER, OUTER, MASK)
+ #define GITS_CBASER_CACHEABILITY_MASK GITS_CBASER_INNER_CACHEABILITY_MASK
+ #define GITS_CBASER_InnerShareable                                    \
+       GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable)
+ #define GITS_CBASER_nCnB      GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nCnB)
+ #define GITS_CBASER_nC                GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC)
+ #define GITS_CBASER_RaWt      GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt)
+ #define GITS_CBASER_RaWb      GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWb)
+ #define GITS_CBASER_WaWt      GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWt)
+ #define GITS_CBASER_WaWb      GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb)
+ #define GITS_CBASER_RaWaWt    GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt)
+ #define GITS_CBASER_RaWaWb    GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb)
  
  #define GITS_BASER_NR_REGS            8
  
- #define GITS_BASER_VALID              (1UL << 63)
- #define GITS_BASER_INDIRECT           (1UL << 62)
- #define GITS_BASER_nCnB                       (0UL << 59)
- #define GITS_BASER_nC                 (1UL << 59)
- #define GITS_BASER_RaWt                       (2UL << 59)
- #define GITS_BASER_RaWb                       (3UL << 59)
- #define GITS_BASER_WaWt                       (4UL << 59)
- #define GITS_BASER_WaWb                       (5UL << 59)
- #define GITS_BASER_RaWaWt             (6UL << 59)
- #define GITS_BASER_RaWaWb             (7UL << 59)
- #define GITS_BASER_CACHEABILITY_MASK  (7UL << 59)
- #define GITS_BASER_TYPE_SHIFT         (56)
+ #define GITS_BASER_VALID                      (1UL << 63)
+ #define GITS_BASER_INDIRECT                   (1ULL << 62)
+ #define GITS_BASER_INNER_CACHEABILITY_SHIFT   (59)
+ #define GITS_BASER_OUTER_CACHEABILITY_SHIFT   (53)
+ #define GITS_BASER_INNER_CACHEABILITY_MASK                            \
+       GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK)
+ #define GITS_BASER_CACHEABILITY_MASK          GITS_BASER_INNER_CACHEABILITY_MASK
+ #define GITS_BASER_OUTER_CACHEABILITY_MASK                            \
+       GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK)
+ #define GITS_BASER_SHAREABILITY_MASK                                  \
+       GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK)
+ #define GITS_BASER_nCnB               GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nCnB)
+ #define GITS_BASER_nC         GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC)
+ #define GITS_BASER_RaWt               GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt)
+ #define GITS_BASER_RaWb               GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb)
+ #define GITS_BASER_WaWt               GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWt)
+ #define GITS_BASER_WaWb               GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb)
+ #define GITS_BASER_RaWaWt     GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWt)
+ #define GITS_BASER_RaWaWb     GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWb)
+ #define GITS_BASER_TYPE_SHIFT                 (56)
  #define GITS_BASER_TYPE(r)            (((r) >> GITS_BASER_TYPE_SHIFT) & 7)
- #define GITS_BASER_ENTRY_SIZE_SHIFT   (48)
+ #define GITS_BASER_ENTRY_SIZE_SHIFT           (48)
  #define GITS_BASER_ENTRY_SIZE(r)      ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0xff) + 1)
- #define GITS_BASER_NonShareable               (0UL << 10)
- #define GITS_BASER_InnerShareable     (1UL << 10)
- #define GITS_BASER_OuterShareable     (2UL << 10)
  #define GITS_BASER_SHAREABILITY_SHIFT (10)
- #define GITS_BASER_SHAREABILITY_MASK  (3UL << GITS_BASER_SHAREABILITY_SHIFT)
+ #define GITS_BASER_InnerShareable                                     \
+       GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable)
  #define GITS_BASER_PAGE_SIZE_SHIFT    (8)
  #define GITS_BASER_PAGE_SIZE_4K               (0UL << GITS_BASER_PAGE_SIZE_SHIFT)
  #define GITS_BASER_PAGE_SIZE_16K      (1UL << GITS_BASER_PAGE_SIZE_SHIFT)
  #define GITS_BASER_PAGE_SIZE_64K      (2UL << GITS_BASER_PAGE_SIZE_SHIFT)
  #define GITS_BASER_PAGE_SIZE_MASK     (3UL << GITS_BASER_PAGE_SIZE_SHIFT)
  #define GITS_BASER_PAGES_MAX          256
 +#define GITS_BASER_PAGES_SHIFT                (0)
+ #define GITS_BASER_NR_PAGES(r)                (((r) & 0xff) + 1)
  
  #define GITS_BASER_TYPE_NONE          0
  #define GITS_BASER_TYPE_DEVICE                1
  #define GITS_BASER_TYPE_RESERVED6     6
  #define GITS_BASER_TYPE_RESERVED7     7
  
 +#define GITS_LVL1_ENTRY_SIZE           (8UL)
 +
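
The field accessors above make decoding a GITS_BASER<n> register straightforward. A small sketch (the baser value is assumed to have been read from hardware; example_decode_baser is not a symbol from this patch):

    /* Sketch: decode one GITS_BASER<n> register with the accessors above. */
    #include <linux/irqchip/arm-gic-v3.h>
    #include <linux/printk.h>
    #include <linux/types.h>

    static void example_decode_baser(u64 baser)
    {
            unsigned int type       = GITS_BASER_TYPE(baser);        /* e.g. GITS_BASER_TYPE_DEVICE */
            unsigned int entry_size = GITS_BASER_ENTRY_SIZE(baser);  /* bytes per table entry */
            unsigned int nr_pages   = GITS_BASER_NR_PAGES(baser);    /* 1..256 pages */
            bool indirect           = !!(baser & GITS_BASER_INDIRECT);

            pr_info("ITS table: type %u, %u-byte entries, %u page(s), %s\n",
                    type, entry_size, nr_pages, indirect ? "indirect" : "flat");
    }
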
  /*
   * ITS commands
   */
  #define GITS_CMD_MAPD                 0x08
  #define GITS_CMD_MAPC                 0x09
- #define GITS_CMD_MAPVI                        0x0a
+ #define GITS_CMD_MAPTI                        0x0a
+ /* older GIC documentation used MAPVI for this command */
+ #define GITS_CMD_MAPVI                        GITS_CMD_MAPTI
+ #define GITS_CMD_MAPI                 0x0b
  #define GITS_CMD_MOVI                 0x01
  #define GITS_CMD_DISCARD              0x0f
  #define GITS_CMD_INV                  0x0c
  #define GITS_CMD_CLEAR                        0x04
  #define GITS_CMD_SYNC                 0x05
  
+ /*
+  * ITS error numbers
+  */
+ #define E_ITS_MOVI_UNMAPPED_INTERRUPT         0x010107
+ #define E_ITS_MOVI_UNMAPPED_COLLECTION                0x010109
+ #define E_ITS_CLEAR_UNMAPPED_INTERRUPT                0x010507
+ #define E_ITS_MAPD_DEVICE_OOR                 0x010801
+ #define E_ITS_MAPC_PROCNUM_OOR                        0x010902
+ #define E_ITS_MAPC_COLLECTION_OOR             0x010903
+ #define E_ITS_MAPTI_UNMAPPED_DEVICE           0x010a04
+ #define E_ITS_MAPTI_PHYSICALID_OOR            0x010a06
+ #define E_ITS_INV_UNMAPPED_INTERRUPT          0x010c07
+ #define E_ITS_INVALL_UNMAPPED_COLLECTION      0x010d09
+ #define E_ITS_MOVALL_PROCNUM_OOR              0x010e01
+ #define E_ITS_DISCARD_UNMAPPED_INTERRUPT      0x010f07
  /*
   * CPU interface registers
   */
  #define ICC_SGI1R_AFFINITY_1_SHIFT    16
  #define ICC_SGI1R_AFFINITY_1_MASK     (0xff << ICC_SGI1R_AFFINITY_1_SHIFT)
  #define ICC_SGI1R_SGI_ID_SHIFT                24
 -#define ICC_SGI1R_SGI_ID_MASK         (0xff << ICC_SGI1R_SGI_ID_SHIFT)
 +#define ICC_SGI1R_SGI_ID_MASK         (0xfULL << ICC_SGI1R_SGI_ID_SHIFT)
  #define ICC_SGI1R_AFFINITY_2_SHIFT    32
 -#define ICC_SGI1R_AFFINITY_2_MASK     (0xffULL << ICC_SGI1R_AFFINITY_1_SHIFT)
 +#define ICC_SGI1R_AFFINITY_2_MASK     (0xffULL << ICC_SGI1R_AFFINITY_2_SHIFT)
  #define ICC_SGI1R_IRQ_ROUTING_MODE_BIT        40
  #define ICC_SGI1R_AFFINITY_3_SHIFT    48
 -#define ICC_SGI1R_AFFINITY_3_MASK     (0xffULL << ICC_SGI1R_AFFINITY_1_SHIFT)
 +#define ICC_SGI1R_AFFINITY_3_MASK     (0xffULL << ICC_SGI1R_AFFINITY_3_SHIFT)
  
  #include <asm/arch_gicv3.h>
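
With the AFFINITY_2 and AFFINITY_3 masks now tied to their own shifts, a software-generated SGI register value can be assembled field by field. Roughly, and purely for illustration (all parameters are placeholders; the real driver derives them from MPIDR values and a CPU target list):

    /* Illustrative only: assembling an ICC_SGI1R_EL1 value from its fields. */
    #include <linux/irqchip/arm-gic-v3.h>
    #include <linux/types.h>

    static u64 example_sgi1r(u8 aff3, u8 aff2, u8 aff1, u8 sgi_id, u16 target_list)
    {
            return ((u64)aff3 << ICC_SGI1R_AFFINITY_3_SHIFT) |
                   ((u64)aff2 << ICC_SGI1R_AFFINITY_2_SHIFT) |
                   ((u64)sgi_id << ICC_SGI1R_SGI_ID_SHIFT)   |
                   ((u64)aff1 << ICC_SGI1R_AFFINITY_1_SHIFT) |
                   target_list;         /* bits [15:0]: one bit per CPU in the aff1 cluster */
    }
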
  
diff --combined mm/gup.c
+++ b/mm/gup.c
@@@ -279,8 -279,6 +279,8 @@@ struct page *follow_page_mask(struct vm
                        spin_unlock(ptl);
                        ret = 0;
                        split_huge_pmd(vma, pmd, address);
 +                      if (pmd_trans_unstable(pmd))
 +                              ret = -EBUSY;
                } else {
                        get_page(page);
                        spin_unlock(ptl);
                        ret = split_huge_page(page);
                        unlock_page(page);
                        put_page(page);
 +                      if (pmd_none(*pmd))
 +                              return no_page_table(vma, flags);
                }
  
                return ret ? ERR_PTR(ret) :
@@@ -354,6 -350,7 +354,6 @@@ unmap
  static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
                unsigned long address, unsigned int *flags, int *nonblocking)
  {
 -      struct mm_struct *mm = vma->vm_mm;
        unsigned int fault_flags = 0;
        int ret;
  
                fault_flags |= FAULT_FLAG_TRIED;
        }
  
 -      ret = handle_mm_fault(mm, vma, address, fault_flags);
 +      ret = handle_mm_fault(vma, address, fault_flags);
        if (ret & VM_FAULT_ERROR) {
                if (ret & VM_FAULT_OOM)
                        return -ENOMEM;
@@@ -693,7 -690,7 +693,7 @@@ retry
        if (!vma_permits_fault(vma, fault_flags))
                return -EFAULT;
  
 -      ret = handle_mm_fault(mm, vma, address, fault_flags);
 +      ret = handle_mm_fault(vma, address, fault_flags);
        major |= ret & VM_FAULT_MAJOR;
        if (ret & VM_FAULT_ERROR) {
                if (ret & VM_FAULT_OOM)
        }
        return 0;
  }
+ EXPORT_SYMBOL_GPL(fixup_user_fault);
  
  static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
                                                struct mm_struct *mm,
@@@ -157,6 -157,9 +157,9 @@@ static int kvm_vgic_dist_init(struct kv
        struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
        int i;
  
+       INIT_LIST_HEAD(&dist->lpi_list_head);
+       spin_lock_init(&dist->lpi_list_lock);
        dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL);
        if (!dist->spis)
                return  -ENOMEM;
                spin_lock_init(&irq->irq_lock);
                irq->vcpu = NULL;
                irq->target_vcpu = vcpu0;
+               kref_init(&irq->refcount);
                if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
                        irq->targets = 0;
                else
@@@ -211,6 -215,7 +215,7 @@@ static void kvm_vgic_vcpu_init(struct k
                irq->vcpu = NULL;
                irq->target_vcpu = vcpu;
                irq->targets = 1U << vcpu->vcpu_id;
+               kref_init(&irq->refcount);
                if (vgic_irq_is_sgi(i)) {
                        /* SGIs */
                        irq->enabled = 1;
@@@ -253,6 -258,9 +258,9 @@@ int vgic_init(struct kvm *kvm
        if (ret)
                goto out;
  
+       if (vgic_has_its(kvm))
+               dist->msis_require_devid = true;
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_vgic_vcpu_init(vcpu);
  
@@@ -271,7 -279,6 +279,6 @@@ static void kvm_vgic_dist_destroy(struc
        dist->initialized = false;
  
        kfree(dist->spis);
-       kfree(dist->redist_iodevs);
        dist->nr_spis = 0;
  
        mutex_unlock(&kvm->lock);
@@@ -353,19 -360,32 +360,19 @@@ out
  
  /* GENERIC PROBE */
  
 -static void vgic_init_maintenance_interrupt(void *info)
 +static int vgic_init_cpu_starting(unsigned int cpu)
  {
        enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0);
 +      return 0;
  }
  
 -static int vgic_cpu_notify(struct notifier_block *self,
 -                         unsigned long action, void *cpu)
 -{
 -      switch (action) {
 -      case CPU_STARTING:
 -      case CPU_STARTING_FROZEN:
 -              vgic_init_maintenance_interrupt(NULL);
 -              break;
 -      case CPU_DYING:
 -      case CPU_DYING_FROZEN:
 -              disable_percpu_irq(kvm_vgic_global_state.maint_irq);
 -              break;
 -      }
  
 -      return NOTIFY_OK;
 +static int vgic_init_cpu_dying(unsigned int cpu)
 +{
 +      disable_percpu_irq(kvm_vgic_global_state.maint_irq);
 +      return 0;
  }
  
 -static struct notifier_block vgic_cpu_nb = {
 -      .notifier_call = vgic_cpu_notify,
 -};
 -
  static irqreturn_t vgic_maintenance_handler(int irq, void *data)
  {
        /*
@@@ -421,14 -441,14 +428,14 @@@ int kvm_vgic_hyp_init(void
                return ret;
        }
  
 -      ret = __register_cpu_notifier(&vgic_cpu_nb);
 +      ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING,
 +                              "AP_KVM_ARM_VGIC_INIT_STARTING",
 +                              vgic_init_cpu_starting, vgic_init_cpu_dying);
        if (ret) {
                kvm_err("Cannot register vgic CPU notifier\n");
                goto out_free_irq;
        }
  
 -      on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1);
 -
        kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq);
        return 0;
  
diff --combined virt/kvm/kvm_main.c
@@@ -148,7 -148,6 +148,7 @@@ int vcpu_load(struct kvm_vcpu *vcpu
        put_cpu();
        return 0;
  }
 +EXPORT_SYMBOL_GPL(vcpu_load);
  
  void vcpu_put(struct kvm_vcpu *vcpu)
  {
        preempt_enable();
        mutex_unlock(&vcpu->mutex);
  }
 +EXPORT_SYMBOL_GPL(vcpu_put);
  
  static void ack_flush(void *_completed)
  {
@@@ -1444,6 -1442,52 +1444,52 @@@ static bool vma_is_valid(struct vm_area
        return true;
  }
  
+ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
+                              unsigned long addr, bool *async,
+                              bool write_fault, kvm_pfn_t *p_pfn)
+ {
+       unsigned long pfn;
+       int r;
+
+       r = follow_pfn(vma, addr, &pfn);
+       if (r) {
+               /*
+                * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
+                * not call the fault handler, so do it here.
+                */
+               bool unlocked = false;
+               r = fixup_user_fault(current, current->mm, addr,
+                                    (write_fault ? FAULT_FLAG_WRITE : 0),
+                                    &unlocked);
+               if (unlocked)
+                       return -EAGAIN;
+               if (r)
+                       return r;
+               r = follow_pfn(vma, addr, &pfn);
+               if (r)
+                       return r;
+       }
+       /*
+        * Get a reference here because callers of *hva_to_pfn* and
+        * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
+        * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
+        * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
+        * simply do nothing for reserved pfns.
+        *
+        * Whoever called remap_pfn_range is also going to call e.g.
+        * unmap_mapping_range before the underlying pages are freed,
+        * causing a call to our MMU notifier.
+        */ 
+       kvm_get_pfn(pfn);
+       *p_pfn = pfn;
+       return 0;
+ }
+
  /*
   * Pin guest page in memory and return its pfn.
   * @addr: host virtual address which maps memory to the guest
@@@ -1463,7 -1507,7 +1509,7 @@@ static kvm_pfn_t hva_to_pfn(unsigned lo
  {
        struct vm_area_struct *vma;
        kvm_pfn_t pfn = 0;
-       int npages;
+       int npages, r;
  
        /* we can do it either atomically or asynchronously, not both */
        BUG_ON(atomic && async);
                goto exit;
        }
  
+ retry:
        vma = find_vma_intersection(current->mm, addr, addr + 1);
  
        if (vma == NULL)
                pfn = KVM_PFN_ERR_FAULT;
-       else if ((vma->vm_flags & VM_PFNMAP)) {
-               pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
-                       vma->vm_pgoff;
-               BUG_ON(!kvm_is_reserved_pfn(pfn));
+       else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
+               r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn);
+               if (r == -EAGAIN)
+                       goto retry;
+               if (r < 0)
+                       pfn = KVM_PFN_ERR_FAULT;
        } else {
                if (async && vma_is_valid(vma, write_fault))
                        *async = true;
@@@ -2348,9 -2395,20 +2397,20 @@@ static int kvm_vm_ioctl_create_vcpu(str
        if (id >= KVM_MAX_VCPU_ID)
                return -EINVAL;
  
+       mutex_lock(&kvm->lock);
+       if (kvm->created_vcpus == KVM_MAX_VCPUS) {
+               mutex_unlock(&kvm->lock);
+               return -EINVAL;
+       }
+       kvm->created_vcpus++;
+       mutex_unlock(&kvm->lock);
        vcpu = kvm_arch_vcpu_create(kvm, id);
-       if (IS_ERR(vcpu))
-               return PTR_ERR(vcpu);
+       if (IS_ERR(vcpu)) {
+               r = PTR_ERR(vcpu);
+               goto vcpu_decrement;
+       }
  
        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
  
                goto vcpu_destroy;
  
        mutex_lock(&kvm->lock);
-       if (!kvm_vcpu_compatible(vcpu)) {
-               r = -EINVAL;
-               goto unlock_vcpu_destroy;
-       }
-       if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
-               r = -EINVAL;
-               goto unlock_vcpu_destroy;
-       }
        if (kvm_get_vcpu_by_id(kvm, id)) {
                r = -EEXIST;
                goto unlock_vcpu_destroy;
@@@ -2399,6 -2449,10 +2451,10 @@@ unlock_vcpu_destroy
        mutex_unlock(&kvm->lock);
  vcpu_destroy:
        kvm_arch_vcpu_destroy(vcpu);
+ vcpu_decrement:
+       mutex_lock(&kvm->lock);
+       kvm->created_vcpus--;
+       mutex_unlock(&kvm->lock);
        return r;
  }
  
@@@ -2943,7 -2997,7 +2999,7 @@@ static long kvm_vm_ioctl(struct file *f
                if (copy_from_user(&routing, argp, sizeof(routing)))
                        goto out;
                r = -EINVAL;
 -              if (routing.nr >= KVM_MAX_IRQ_ROUTES)
 +              if (routing.nr > KVM_MAX_IRQ_ROUTES)
                        goto out;
                if (routing.flags)
                        goto out;
@@@ -3050,7 -3104,6 +3106,7 @@@ static int kvm_dev_ioctl_create_vm(unsi
  {
        int r;
        struct kvm *kvm;
 +      struct file *file;
  
        kvm = kvm_create_vm(type);
        if (IS_ERR(kvm))
                return r;
        }
  #endif
 -      r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC);
 +      r = get_unused_fd_flags(O_CLOEXEC);
        if (r < 0) {
                kvm_put_kvm(kvm);
                return r;
        }
 +      file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
 +      if (IS_ERR(file)) {
 +              put_unused_fd(r);
 +              kvm_put_kvm(kvm);
 +              return PTR_ERR(file);
 +      }
  
        if (kvm_create_vm_debugfs(kvm, r) < 0) {
 -              kvm_put_kvm(kvm);
 +              put_unused_fd(r);
 +              fput(file);
                return -ENOMEM;
        }
  
 +      fd_install(r, file);
        return r;
  }
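
The KVM_CREATE_VM path above now reserves a descriptor first, creates the anon inode file, performs the remaining setup, and only then makes the fd visible with fd_install(), so the descriptor is never exposed to userspace before the debugfs setup has succeeded. The same generic pattern, sketched outside KVM (example_create_fd, my_fops and priv are placeholders):

    /* Generic sketch of the reserve-fd / create-file / fd_install pattern
     * used above; example_* names, my_fops and priv are placeholders.
     */
    #include <linux/anon_inodes.h>
    #include <linux/err.h>
    #include <linux/fcntl.h>
    #include <linux/file.h>
    #include <linux/fs.h>

    static int example_create_fd(const struct file_operations *my_fops, void *priv)
    {
            struct file *file;
            int fd;

            fd = get_unused_fd_flags(O_CLOEXEC);    /* reserve a descriptor number */
            if (fd < 0)
                    return fd;

            file = anon_inode_getfile("example", my_fops, priv, O_RDWR);
            if (IS_ERR(file)) {
                    put_unused_fd(fd);              /* release the unused number */
                    return PTR_ERR(file);
            }

            /* setup that may still fail goes here, before fd_install() */

            fd_install(fd, file);                   /* fd becomes visible to userspace */
            return fd;
    }
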
  
@@@ -3155,13 -3200,12 +3211,13 @@@ static void hardware_enable_nolock(voi
        }
  }
  
 -static void hardware_enable(void)
 +static int kvm_starting_cpu(unsigned int cpu)
  {
        raw_spin_lock(&kvm_count_lock);
        if (kvm_usage_count)
                hardware_enable_nolock(NULL);
        raw_spin_unlock(&kvm_count_lock);
 +      return 0;
  }
  
  static void hardware_disable_nolock(void *junk)
        kvm_arch_hardware_disable();
  }
  
 -static void hardware_disable(void)
 +static int kvm_dying_cpu(unsigned int cpu)
  {
        raw_spin_lock(&kvm_count_lock);
        if (kvm_usage_count)
                hardware_disable_nolock(NULL);
        raw_spin_unlock(&kvm_count_lock);
 +      return 0;
  }
  
  static void hardware_disable_all_nolock(void)
@@@ -3221,6 -3264,21 +3277,6 @@@ static int hardware_enable_all(void
        return r;
  }
  
 -static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 -                         void *v)
 -{
 -      val &= ~CPU_TASKS_FROZEN;
 -      switch (val) {
 -      case CPU_DYING:
 -              hardware_disable();
 -              break;
 -      case CPU_STARTING:
 -              hardware_enable();
 -              break;
 -      }
 -      return NOTIFY_OK;
 -}
 -
  static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
                      void *v)
  {
@@@ -3487,6 -3545,34 +3543,30 @@@ int kvm_io_bus_unregister_dev(struct kv
        return r;
  }
  
 -static struct notifier_block kvm_cpu_notifier = {
 -      .notifier_call = kvm_cpu_hotplug,
 -};
 -
+ struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+                                        gpa_t addr)
+ {
+       struct kvm_io_bus *bus;
+       int dev_idx, srcu_idx;
+       struct kvm_io_device *iodev = NULL;
+
+       srcu_idx = srcu_read_lock(&kvm->srcu);
+       bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+       dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
+       if (dev_idx < 0)
+               goto out_unlock;
+       iodev = bus->range[dev_idx].dev;
+ out_unlock:
+       srcu_read_unlock(&kvm->srcu, srcu_idx);
+       return iodev;
+ }
+ EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
+
  static int kvm_debugfs_open(struct inode *inode, struct file *file,
                           int (*get)(void *, u64 *), int (*set)(void *, u64),
                           const char *fmt)
@@@ -3737,8 -3823,7 +3817,8 @@@ int kvm_init(void *opaque, unsigned vcp
                        goto out_free_1;
        }
  
 -      r = register_cpu_notifier(&kvm_cpu_notifier);
 +      r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "AP_KVM_STARTING",
 +                                    kvm_starting_cpu, kvm_dying_cpu);
        if (r)
                goto out_free_2;
        register_reboot_notifier(&kvm_reboot_notifier);
@@@ -3792,7 -3877,7 +3872,7 @@@ out_free
        kmem_cache_destroy(kvm_vcpu_cache);
  out_free_3:
        unregister_reboot_notifier(&kvm_reboot_notifier);
 -      unregister_cpu_notifier(&kvm_cpu_notifier);
 +      cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
  out_free_2:
  out_free_1:
        kvm_arch_hardware_unsetup();
@@@ -3815,7 -3900,7 +3895,7 @@@ void kvm_exit(void
        kvm_async_pf_deinit();
        unregister_syscore_ops(&kvm_syscore_ops);
        unregister_reboot_notifier(&kvm_reboot_notifier);
 -      unregister_cpu_notifier(&kvm_cpu_notifier);
 +      cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
        on_each_cpu(hardware_disable_nolock, NULL, 1);
        kvm_arch_hardware_unsetup();
        kvm_arch_exit();