Merge tag 'kvm-4.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author    Linus Torvalds <torvalds@linux-foundation.org>    Thu, 6 Oct 2016 17:49:01 +0000 (10:49 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>    Thu, 6 Oct 2016 17:49:01 +0000 (10:49 -0700)
Pull KVM updates from Radim Krčmář:
 "All architectures:
   - move `make kvmconfig` stubs from x86
   - use 64 bits for debugfs stats

  ARM:
   - Important fixes for running without an in-kernel irqchip
   - handle SError exceptions and present them to guests if appropriate
   - proxying of GICV access at EL2 if guest mappings are unsafe
   - GICv3 on AArch32 on ARMv8
   - preparations for GICv3 save/restore, including ABI docs
   - cleanups and a bit of optimization

  MIPS:
   - A couple of fixes in preparation for supporting MIPS EVA host
     kernels
   - MIPS SMP host & TLB invalidation fixes

  PPC:
   - Fix the bug which caused guests to falsely report lockups
   - other minor fixes
   - a small optimization

  s390:
   - Lazy enablement of runtime instrumentation
   - up to 255 CPUs for nested guests
   - rework of machine check delivery
   - cleanups and fixes

  x86:
   - IOMMU part of AMD's AVIC for vmexit-less interrupt delivery
   - Hyper-V TSC page
   - per-vcpu tsc_offset in debugfs
   - accelerated INS/OUTS in nVMX
   - cleanups and fixes"

* tag 'kvm-4.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (140 commits)
  KVM: MIPS: Drop dubious EntryHi optimisation
  KVM: MIPS: Invalidate TLB by regenerating ASIDs
  KVM: MIPS: Split kernel/user ASID regeneration
  KVM: MIPS: Drop other CPU ASIDs on guest MMU changes
  KVM: arm/arm64: vgic: Don't flush/sync without a working vgic
  KVM: arm64: Require in-kernel irqchip for PMU support
  KVM: PPC: Book3s PR: Allow access to unprivileged MMCR2 register
  KVM: PPC: Book3S PR: Support 64kB page size on POWER8E and POWER8NVL
  KVM: PPC: Book3S: Remove duplicate setting of the B field in tlbie
  KVM: PPC: BookE: Fix a sanity check
  KVM: PPC: Book3S HV: Take out virtual core piggybacking code
  KVM: PPC: Book3S: Treat VTB as a per-subcore register, not per-thread
  ARM: gic-v3: Work around definition of gic_write_bpr1
  KVM: nVMX: Fix the NMI IDT-vectoring handling
  KVM: VMX: Enable MSR-BASED TPR shadow even if APICv is inactive
  KVM: nVMX: Fix reload apic access page warning
  kvmconfig: add virtio-gpu to config fragment
  config: move x86 kvm_guest.config to a common location
  arm64: KVM: Remove duplicating init code for setting VMID
  ARM: KVM: Support vgic-v3
  ...

14 files changed:
Documentation/kernel-parameters.txt
arch/arm/include/asm/arch_gicv3.h
arch/arm/include/asm/cputype.h
arch/arm/kvm/arm.c
arch/arm/kvm/mmu.c
arch/arm64/include/asm/arch_gicv3.h
arch/arm64/include/asm/kvm_mmu.h
arch/powerpc/platforms/powernv/pci-ioda.c
arch/s390/kvm/kvm-s390.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
drivers/iommu/amd_iommu.c
drivers/iommu/amd_iommu_types.h

@@@ -460,6 -460,15 +460,15 @@@ bytes respectively. Such letter suffixe
                        driver will print ACPI tables for AMD IOMMU during
                        IOMMU initialization.
  
+       amd_iommu_intr= [HW,X86-64]
+                       Specifies one of the following AMD IOMMU interrupt
+                       remapping modes:
+                       legacy     - Use legacy interrupt remapping mode.
+                       vapic      - Use virtual APIC mode, which allows the IOMMU
+                                    to inject interrupts directly into the guest.
+                                    This mode requires kvm-amd.avic=1.
+                                    (Default when IOMMU HW support is present.)
        amijoy.map=     [HW,JOY] Amiga joystick support
                        Map of devices attached to JOY0DAT and JOY1DAT
                        Format: <a>,<b>
                        loops can be debugged more effectively on production
                        systems.
  
 +      clocksource.arm_arch_timer.fsl-a008585=
 +                      [ARM64]
 +                      Format: <bool>
 +                      Enable/disable the workaround of Freescale/NXP
 +                      erratum A-008585.  This can be useful for KVM
 +                      guests, if the guest device tree doesn't show the
 +                      erratum.  If unspecified, the workaround is
 +                      enabled based on the device tree.
 +
        clearcpuid=BITNUM [X86]
                        Disable CPUID feature X for the kernel. See
                        arch/x86/include/asm/cpufeatures.h for the valid bit
                        determined by the stdout-path property in device
                        tree's chosen node.
  
 -              cdns,<addr>
 -                      Start an early, polled-mode console on a cadence serial
 -                      port at the specified address. The cadence serial port
 -                      must already be setup and configured. Options are not
 -                      yet supported.
 +              cdns,<addr>[,options]
 +                      Start an early, polled-mode console on a Cadence
 +                      (xuartps) serial port at the specified address. Only
 +                      supported option is baud rate. If baud rate is not
 +                      specified, the serial port must already be setup and
 +                      configured.
  
                uart[8250],io,<addr>[,options]
                uart[8250],mmio,<addr>[,options]
                        Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0.
                        Default: 1024
  
 +      gpio-mockup.gpio_mockup_ranges
 +                      [HW] Sets the ranges of the gpiochip for this device.
 +                      Format: <start1>,<end1>,<start2>,<end2>...
 +
        hardlockup_all_cpu_backtrace=
                        [KNL] Should the hard-lockup detector generate
                        backtraces on all cpus.
  
        intel_idle.max_cstate=  [KNL,HW,ACPI,X86]
                        0       disables intel_idle and fall back on acpi_idle.
 -                      1 to 6  specify maximum depth of C-state.
 +                      1 to 9  specify maximum depth of C-state.
  
        intel_pstate=  [X86]
                       disable
                        than or equal to this physical address is ignored.
  
        maxcpus=        [SMP] Maximum number of processors that an SMP kernel
 -                      should make use of.  maxcpus=n : n >= 0 limits the
 -                      kernel to using 'n' processors.  n=0 is a special case,
 -                      it is equivalent to "nosmp", which also disables
 -                      the IO APIC.
 +                      will bring up during bootup.  maxcpus=n : n >= 0 limits
 +                      the kernel to bringing up 'n' processors. After bootup
 +                      you can bring up the remaining plugged CPUs by executing
 +                      "echo 1 > /sys/devices/system/cpu/cpuX/online", so maxcpus
 +                      only takes effect during system bootup.
 +                      n=0 is a special case: it is equivalent to "nosmp",
 +                      which also disables the IO APIC.
  
        max_loop=       [LOOP] The number of loop block devices that get
        (loop.max_loop) unconditionally pre-created at init time. The default
  
        nodelayacct     [KNL] Disable per-task delay accounting
  
 -      nodisconnect    [HW,SCSI,M68K] Disables SCSI disconnects.
 -
        nodsp           [SH] Disable hardware DSP at boot time.
  
        noefi           Disable EFI runtime services support.
  
        nr_cpus=        [SMP] Maximum number of processors that an SMP kernel
                        could support.  nr_cpus=n : n >= 1 limits the kernel to
 -                      supporting 'n' processors. Later in runtime you can not
 -                      use hotplug cpu feature to put more cpu back to online.
 -                      just like you compile the kernel NR_CPUS=n
 +                      support 'n' processors. It may be larger than the
 +                      number of CPUs plugged in during bootup; later at
 +                      runtime you can physically add CPUs until the count
 +                      reaches n. Some boot-time memory for per-cpu variables
 +                      is therefore pre-allocated for later physical CPU
 +                      hotplugging.
  
        nr_uarts=       [SERIAL] maximum number of UARTs to be registered.
  
                                PAGE_SIZE is used as alignment.
                                PCI-PCI bridge can be specified, if resource
                                windows need to be expanded.
 +                              To specify the alignment for several
 +                              instances of a device, the PCI vendor,
 +                              device, subvendor, and subdevice may be
 +                              specified, e.g., 4096@pci:8086:9c22:103c:198f
                ecrc=           Enable/disable PCIe ECRC (transaction layer
                                end-to-end CRC checking).
                                bios: Use BIOS/firmware settings. This is the
                                u = IGNORE_UAS (don't bind to the uas driver);
                                w = NO_WP_DETECT (don't test whether the
                                        medium is write-protected).
 +                              y = ALWAYS_SYNC (issue a SYNCHRONIZE_CACHE
 +                                      even if the device claims no cache)
                        Example: quirks=0419:aaf5:rl,0421:0433:rc
  
        user_debug=     [KNL,ARM]
@@@ -22,9 -22,7 +22,7 @@@
  
  #include <linux/io.h>
  #include <asm/barrier.h>
- #define __ACCESS_CP15(CRn, Op1, CRm, Op2)     p15, Op1, %0, CRn, CRm, Op2
- #define __ACCESS_CP15_64(Op1, CRm)            p15, Op1, %Q0, %R0, CRm
+ #include <asm/cp15.h>
  
  #define ICC_EOIR1                     __ACCESS_CP15(c12, 0, c12, 1)
  #define ICC_DIR                               __ACCESS_CP15(c12, 0, c11, 1)
@@@ -34,7 -32,6 +32,7 @@@
  #define ICC_CTLR                      __ACCESS_CP15(c12, 0, c12, 4)
  #define ICC_SRE                               __ACCESS_CP15(c12, 0, c12, 5)
  #define ICC_IGRPEN1                   __ACCESS_CP15(c12, 0, c12, 7)
 +#define ICC_BPR1                      __ACCESS_CP15(c12, 0, c12, 3)
  
  #define ICC_HSRE                      __ACCESS_CP15(c12, 4, c9, 5)
  
  #define ICH_AP1R2                     __AP1Rx(2)
  #define ICH_AP1R3                     __AP1Rx(3)
  
+ /* A32-to-A64 mappings used by VGIC save/restore */
+ #define CPUIF_MAP(a32, a64)                   \
+ static inline void write_ ## a64(u32 val)     \
+ {                                             \
+       write_sysreg(val, a32);                 \
+ }                                             \
+ static inline u32 read_ ## a64(void)          \
+ {                                             \
+       return read_sysreg(a32);                \
+ }                                             \
+
+ #define CPUIF_MAP_LO_HI(a32lo, a32hi, a64)    \
+ static inline void write_ ## a64(u64 val)     \
+ {                                             \
+       write_sysreg(lower_32_bits(val), a32lo);\
+       write_sysreg(upper_32_bits(val), a32hi);\
+ }                                             \
+ static inline u64 read_ ## a64(void)          \
+ {                                             \
+       u64 val = read_sysreg(a32lo);           \
+                                               \
+       val |=  (u64)read_sysreg(a32hi) << 32;  \
+                                               \
+       return val;                             \
+ }
+ CPUIF_MAP(ICH_HCR, ICH_HCR_EL2)
+ CPUIF_MAP(ICH_VTR, ICH_VTR_EL2)
+ CPUIF_MAP(ICH_MISR, ICH_MISR_EL2)
+ CPUIF_MAP(ICH_EISR, ICH_EISR_EL2)
+ CPUIF_MAP(ICH_ELSR, ICH_ELSR_EL2)
+ CPUIF_MAP(ICH_VMCR, ICH_VMCR_EL2)
+ CPUIF_MAP(ICH_AP0R3, ICH_AP0R3_EL2)
+ CPUIF_MAP(ICH_AP0R2, ICH_AP0R2_EL2)
+ CPUIF_MAP(ICH_AP0R1, ICH_AP0R1_EL2)
+ CPUIF_MAP(ICH_AP0R0, ICH_AP0R0_EL2)
+ CPUIF_MAP(ICH_AP1R3, ICH_AP1R3_EL2)
+ CPUIF_MAP(ICH_AP1R2, ICH_AP1R2_EL2)
+ CPUIF_MAP(ICH_AP1R1, ICH_AP1R1_EL2)
+ CPUIF_MAP(ICH_AP1R0, ICH_AP1R0_EL2)
+ CPUIF_MAP(ICC_HSRE, ICC_SRE_EL2)
+ CPUIF_MAP(ICC_SRE, ICC_SRE_EL1)
+ CPUIF_MAP_LO_HI(ICH_LR15, ICH_LRC15, ICH_LR15_EL2)
+ CPUIF_MAP_LO_HI(ICH_LR14, ICH_LRC14, ICH_LR14_EL2)
+ CPUIF_MAP_LO_HI(ICH_LR13, ICH_LRC13, ICH_LR13_EL2)
+ CPUIF_MAP_LO_HI(ICH_LR12, ICH_LRC12, ICH_LR12_EL2)
+ CPUIF_MAP_LO_HI(ICH_LR11, ICH_LRC11, ICH_LR11_EL2)
+ CPUIF_MAP_LO_HI(ICH_LR10, ICH_LRC10, ICH_LR10_EL2)
+ CPUIF_MAP_LO_HI(ICH_LR9, ICH_LRC9, ICH_LR9_EL2)
+ CPUIF_MAP_LO_HI(ICH_LR8, ICH_LRC8, ICH_LR8_EL2)
+ CPUIF_MAP_LO_HI(ICH_LR7, ICH_LRC7, ICH_LR7_EL2)
+ CPUIF_MAP_LO_HI(ICH_LR6, ICH_LRC6, ICH_LR6_EL2)
+ CPUIF_MAP_LO_HI(ICH_LR5, ICH_LRC5, ICH_LR5_EL2)
+ CPUIF_MAP_LO_HI(ICH_LR4, ICH_LRC4, ICH_LR4_EL2)
+ CPUIF_MAP_LO_HI(ICH_LR3, ICH_LRC3, ICH_LR3_EL2)
+ CPUIF_MAP_LO_HI(ICH_LR2, ICH_LRC2, ICH_LR2_EL2)
+ CPUIF_MAP_LO_HI(ICH_LR1, ICH_LRC1, ICH_LR1_EL2)
+ CPUIF_MAP_LO_HI(ICH_LR0, ICH_LRC0, ICH_LR0_EL2)
+ #define read_gicreg(r)                 read_##r()
+ #define write_gicreg(v, r)             write_##r(v)
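For illustration only (not part of the patch), here is what two of the mappings above expand to by hand; this is what lets the shared VGIC save/restore code use arm64 register names through read_gicreg()/write_gicreg() on AArch32 as well:

    /* Hand expansion of CPUIF_MAP(ICH_HCR, ICH_HCR_EL2), illustration only */
    static inline void write_ICH_HCR_EL2(u32 val)
    {
            write_sysreg(val, ICH_HCR);
    }

    static inline u32 read_ICH_HCR_EL2(void)
    {
            return read_sysreg(ICH_HCR);
    }

    /*
     * Hand expansion of CPUIF_MAP_LO_HI(ICH_LR0, ICH_LRC0, ICH_LR0_EL2):
     * the 64-bit EL2 register is split across two 32-bit CP15 accesses.
     */
    static inline void write_ICH_LR0_EL2(u64 val)
    {
            write_sysreg(lower_32_bits(val), ICH_LR0);
            write_sysreg(upper_32_bits(val), ICH_LRC0);
    }

    static inline u64 read_ICH_LR0_EL2(void)
    {
            u64 val = read_sysreg(ICH_LR0);

            val |= (u64)read_sysreg(ICH_LRC0) << 32;
            return val;
    }

write_gicreg(v, ICH_HCR_EL2) and read_gicreg(ICH_HCR_EL2) then resolve to write_ICH_HCR_EL2(v) and read_ICH_HCR_EL2() respectively.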
  /* Low-level accessors */
  
  static inline void gic_write_eoir(u32 irq)
  {
-       asm volatile("mcr " __stringify(ICC_EOIR1) : : "r" (irq));
+       write_sysreg(irq, ICC_EOIR1);
        isb();
  }
  
  static inline void gic_write_dir(u32 val)
  {
-       asm volatile("mcr " __stringify(ICC_DIR) : : "r" (val));
+       write_sysreg(val, ICC_DIR);
        isb();
  }
  
  static inline u32 gic_read_iar(void)
  {
-       u32 irqstat;
+       u32 irqstat = read_sysreg(ICC_IAR1);
  
-       asm volatile("mrc " __stringify(ICC_IAR1) : "=r" (irqstat));
        dsb(sy);
        return irqstat;
  }
  
  static inline void gic_write_pmr(u32 val)
  {
-       asm volatile("mcr " __stringify(ICC_PMR) : : "r" (val));
+       write_sysreg(val, ICC_PMR);
  }
  
  static inline void gic_write_ctlr(u32 val)
  {
-       asm volatile("mcr " __stringify(ICC_CTLR) : : "r" (val));
+       write_sysreg(val, ICC_CTLR);
        isb();
  }
  
  static inline void gic_write_grpen1(u32 val)
  {
-       asm volatile("mcr " __stringify(ICC_IGRPEN1) : : "r" (val));
+       write_sysreg(val, ICC_IGRPEN1);
        isb();
  }
  
  static inline void gic_write_sgi1r(u64 val)
  {
-       asm volatile("mcrr " __stringify(ICC_SGI1R) : : "r" (val));
+       write_sysreg(val, ICC_SGI1R);
  }
  
  static inline u32 gic_read_sre(void)
  {
-       u32 val;
-       asm volatile("mrc " __stringify(ICC_SRE) : "=r" (val));
-       return val;
+       return read_sysreg(ICC_SRE);
  }
  
  static inline void gic_write_sre(u32 val)
  {
-       asm volatile("mcr " __stringify(ICC_SRE) : : "r" (val));
+       write_sysreg(val, ICC_SRE);
        isb();
  }
  
  static inline void gic_write_bpr1(u32 val)
  {
-       asm volatile("mcr " __stringify(ICC_BPR1) : : "r" (val));
 -#if defined(__write_sysreg) && defined(ICC_BPR1)
+       write_sysreg(val, ICC_BPR1);
 -#else
 -      asm volatile("mcr " __stringify(ICC_BPR1) : : "r" (val));
 -#endif
  }
  
  /*
  
  #define MPIDR_LEVEL_BITS 8
  #define MPIDR_LEVEL_MASK ((1 << MPIDR_LEVEL_BITS) - 1)
+ #define MPIDR_LEVEL_SHIFT(level) (MPIDR_LEVEL_BITS * level)
  
  #define MPIDR_AFFINITY_LEVEL(mpidr, level) \
        ((mpidr >> (MPIDR_LEVEL_BITS * level)) & MPIDR_LEVEL_MASK)
  
  #define ARM_CPU_IMP_ARM                       0x41
 +#define ARM_CPU_IMP_DEC                       0x44
  #define ARM_CPU_IMP_INTEL             0x69
  
  /* ARM implemented processors */
  #define ARM_CPU_PART_CORTEX_A15               0x4100c0f0
  #define ARM_CPU_PART_MASK             0xff00fff0
  
 +/* DEC implemented cores */
 +#define ARM_CPU_PART_SA1100           0x4400a110
 +
 +/* Intel implemented cores */
 +#define ARM_CPU_PART_SA1110           0x6900b110
 +#define ARM_CPU_REV_SA1110_A0         0
 +#define ARM_CPU_REV_SA1110_B0         4
 +#define ARM_CPU_REV_SA1110_B1         5
 +#define ARM_CPU_REV_SA1110_B2         6
 +#define ARM_CPU_REV_SA1110_B4         8
 +
  #define ARM_CPU_XSCALE_ARCH_MASK      0xe000
  #define ARM_CPU_XSCALE_ARCH_V1                0x2000
  #define ARM_CPU_XSCALE_ARCH_V2                0x4000
@@@ -164,11 -153,6 +165,11 @@@ static inline unsigned int __attribute_
        return read_cpuid(CPUID_ID);
  }
  
 +static inline unsigned int __attribute_const__ read_cpuid_cachetype(void)
 +{
 +      return read_cpuid(CPUID_CACHETYPE);
 +}
 +
  #elif defined(CONFIG_CPU_V7M)
  
  static inline unsigned int __attribute_const__ read_cpuid_id(void)
        return readl(BASEADDR_V7M_SCB + V7M_SCB_CPUID);
  }
  
 +static inline unsigned int __attribute_const__ read_cpuid_cachetype(void)
 +{
 +      return readl(BASEADDR_V7M_SCB + V7M_SCB_CTR);
 +}
 +
  #else /* ifdef CONFIG_CPU_CP15 / elif defined(CONFIG_CPU_V7M) */
  
  static inline unsigned int __attribute_const__ read_cpuid_id(void)
@@@ -195,11 -174,6 +196,11 @@@ static inline unsigned int __attribute_
        return (read_cpuid_id() & 0xFF000000) >> 24;
  }
  
 +static inline unsigned int __attribute_const__ read_cpuid_revision(void)
 +{
 +      return read_cpuid_id() & 0x0000000f;
 +}
 +
  /*
   * The CPU part number is meaningless without referring to the CPU
   * implementer: implementers are free to define their own part numbers
@@@ -220,6 -194,11 +221,6 @@@ static inline unsigned int __attribute_
        return read_cpuid_id() & ARM_CPU_XSCALE_ARCH_MASK;
  }
  
 -static inline unsigned int __attribute_const__ read_cpuid_cachetype(void)
 -{
 -      return read_cpuid(CPUID_CACHETYPE);
 -}
 -
  static inline unsigned int __attribute_const__ read_cpuid_tcmstatus(void)
  {
        return read_cpuid(CPUID_TCM);
@@@ -230,10 -209,6 +231,10 @@@ static inline unsigned int __attribute_
        return read_cpuid(CPUID_MPIDR);
  }
  
 +/* StrongARM-11x0 CPUs */
 +#define cpu_is_sa1100() (read_cpuid_part() == ARM_CPU_PART_SA1100)
 +#define cpu_is_sa1110() (read_cpuid_part() == ARM_CPU_PART_SA1110)
 +
  /*
   * Intel's XScale3 core supports some v6 features (supersections, L2)
   * but advertises itself as v5 as it does not support the v6 ISA.  For
diff --combined arch/arm/kvm/arm.c
@@@ -144,6 -144,16 +144,16 @@@ out_fail_alloc
        return ret;
  }
  
+ bool kvm_arch_has_vcpu_debugfs(void)
+ {
+       return false;
+ }
+ int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+ {
+       return 0;
+ }
  int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
  {
        return VM_FAULT_SIGBUS;
@@@ -158,6 -168,8 +168,6 @@@ void kvm_arch_destroy_vm(struct kvm *kv
  {
        int i;
  
 -      kvm_free_stage2_pgd(kvm);
 -
        for (i = 0; i < KVM_MAX_VCPUS; ++i) {
                if (kvm->vcpus[i]) {
                        kvm_arch_vcpu_free(kvm->vcpus[i]);
@@@ -1176,6 -1188,10 +1186,10 @@@ static int init_common_resources(void
                return -ENOMEM;
        }
  
+       /* set size of VMID supported by CPU */
+       kvm_vmid_bits = kvm_get_vmid_bits();
+       kvm_info("%d-bit VMID\n", kvm_vmid_bits);
        return 0;
  }
  
@@@ -1241,10 -1257,6 +1255,6 @@@ static void teardown_hyp_mode(void
  
  static int init_vhe_mode(void)
  {
-       /* set size of VMID supported by CPU */
-       kvm_vmid_bits = kvm_get_vmid_bits();
-       kvm_info("%d-bit VMID\n", kvm_vmid_bits);
        kvm_info("VHE mode initialized successfully\n");
        return 0;
  }
@@@ -1328,10 -1340,6 +1338,6 @@@ static int init_hyp_mode(void
                }
        }
  
-       /* set size of VMID supported by CPU */
-       kvm_vmid_bits = kvm_get_vmid_bits();
-       kvm_info("%d-bit VMID\n", kvm_vmid_bits);
        kvm_info("Hyp mode initialized successfully\n");
  
        return 0;
diff --combined arch/arm/kvm/mmu.c
@@@ -744,7 -744,6 +744,6 @@@ int kvm_alloc_stage2_pgd(struct kvm *kv
        if (!pgd)
                return -ENOMEM;
  
-       kvm_clean_pgd(pgd);
        kvm->arch.pgd = pgd;
        return 0;
  }
@@@ -936,7 -935,6 +935,6 @@@ static int stage2_set_pte(struct kvm *k
                if (!cache)
                        return 0; /* ignore calls from kvm_set_spte_hva */
                pte = mmu_memory_cache_alloc(cache);
-               kvm_clean_pte(pte);
                pmd_populate_kernel(NULL, pmd, pte);
                get_page(virt_to_page(pmd));
        }
@@@ -1434,6 -1432,11 +1432,11 @@@ int kvm_handle_guest_abort(struct kvm_v
        int ret, idx;
  
        is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
+       if (unlikely(!is_iabt && kvm_vcpu_dabt_isextabt(vcpu))) {
+               kvm_inject_vabt(vcpu);
+               return 1;
+       }
        fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
  
        trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
@@@ -1714,8 -1717,7 +1717,8 @@@ int kvm_mmu_init(void
                 kern_hyp_va(PAGE_OFFSET), kern_hyp_va(~0UL));
  
        if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
 -          hyp_idmap_start <  kern_hyp_va(~0UL)) {
 +          hyp_idmap_start <  kern_hyp_va(~0UL) &&
 +          hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
                /*
                 * The idmap page is intersecting with the VA space,
                 * it is not safe to continue further.
@@@ -1894,7 -1896,6 +1897,7 @@@ void kvm_arch_memslots_updated(struct k
  
  void kvm_arch_flush_shadow_all(struct kvm *kvm)
  {
 +      kvm_free_stage2_pgd(kvm);
  }
  
  void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
@@@ -28,7 -28,6 +28,7 @@@
  #define ICC_CTLR_EL1                  sys_reg(3, 0, 12, 12, 4)
  #define ICC_SRE_EL1                   sys_reg(3, 0, 12, 12, 5)
  #define ICC_GRPEN1_EL1                        sys_reg(3, 0, 12, 12, 7)
 +#define ICC_BPR1_EL1                  sys_reg(3, 0, 12, 12, 3)
  
  #define ICC_SRE_EL2                   sys_reg(3, 4, 12, 9, 5)
  
  #include <linux/stringify.h>
  #include <asm/barrier.h>
  
+ #define read_gicreg(r)                                                        \
+       ({                                                              \
+               u64 reg;                                                \
+               asm volatile("mrs_s %0, " __stringify(r) : "=r" (reg)); \
+               reg;                                                    \
+       })
+ #define write_gicreg(v,r)                                             \
+       do {                                                            \
+               u64 __val = (v);                                        \
+               asm volatile("msr_s " __stringify(r) ", %0" : : "r" (__val));\
+       } while (0)
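For comparison with the AArch32 wrappers earlier in this diff, a hand expansion of the arm64 accessors above (illustration only; mrs_s/msr_s are the assembler macros used for system registers the assembler may not know by name):

    /* read_gicreg(ICH_VTR_EL2) expands roughly to: */
    u64 vtr = ({
            u64 reg;
            asm volatile("mrs_s %0, " __stringify(ICH_VTR_EL2) : "=r" (reg));
            reg;
    });

    /* write_gicreg(val, ICH_VMCR_EL2) expands roughly to: */
    do {
            u64 __val = (val);
            asm volatile("msr_s " __stringify(ICH_VMCR_EL2) ", %0" : : "r" (__val));
    } while (0);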
  /*
   * Low-level accessors
   *
@@@ -166,11 -178,6 +179,11 @@@ static inline void gic_write_sre(u32 va
        isb();
  }
  
 +static inline void gic_write_bpr1(u32 val)
 +{
 +      asm volatile("msr_s " __stringify(ICC_BPR1_EL1) ", %0" : : "r" (val));
 +}
 +
  #define gic_read_typer(c)             readq_relaxed(c)
  #define gic_write_irouter(v, c)               writeq_relaxed(v, c)
  
  .macro kern_hyp_va    reg
  alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
        and     \reg, \reg, #HYP_PAGE_OFFSET_HIGH_MASK
 -alternative_else
 -      nop
 -alternative_endif
 -alternative_if_not ARM64_HYP_OFFSET_LOW
 -      nop
 -alternative_else
 +alternative_else_nop_endif
 +alternative_if ARM64_HYP_OFFSET_LOW
        and     \reg, \reg, #HYP_PAGE_OFFSET_LOW_MASK
 -alternative_endif
 +alternative_else_nop_endif
  .endm
  
  #else
@@@ -162,12 -166,6 +162,6 @@@ void kvm_clear_hyp_idmap(void)
  #define       kvm_set_pte(ptep, pte)          set_pte(ptep, pte)
  #define       kvm_set_pmd(pmdp, pmd)          set_pmd(pmdp, pmd)
  
- static inline void kvm_clean_pgd(pgd_t *pgd) {}
- static inline void kvm_clean_pmd(pmd_t *pmd) {}
- static inline void kvm_clean_pmd_entry(pmd_t *pmd) {}
- static inline void kvm_clean_pte(pte_t *pte) {}
- static inline void kvm_clean_pte_entry(pte_t *pte) {}
  static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
  {
        pte_val(pte) |= PTE_S2_RDWR;
@@@ -124,13 -124,6 +124,13 @@@ static inline bool pnv_pci_is_m64(struc
                r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
  }
  
 +static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags)
 +{
 +      unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
 +
 +      return (resource_flags & flags) == flags;
 +}
 +
  static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
  {
        phb->ioda.pe_array[pe_no].phb = phb;
@@@ -156,7 -149,7 +156,7 @@@ static void pnv_ioda_reserve_pe(struct 
  
  static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
  {
 -      unsigned long pe = phb->ioda.total_pe_num - 1;
 +      long pe;
  
        for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
                if (!test_and_set_bit(pe, phb->ioda.pe_alloc))
  static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
  {
        struct pnv_phb *phb = pe->phb;
 +      unsigned int pe_num = pe->pe_number;
  
        WARN_ON(pe->pdev);
  
        memset(pe, 0, sizeof(struct pnv_ioda_pe));
 -      clear_bit(pe->pe_number, phb->ioda.pe_alloc);
 +      clear_bit(pe_num, phb->ioda.pe_alloc);
  }
  
  /* The default M64 BAR is shared by all PEs */
@@@ -2224,7 -2216,7 +2224,7 @@@ static long pnv_pci_ioda2_set_window(st
  
        pnv_pci_link_table_and_group(phb->hose->node, num,
                        tbl, &pe->table_group);
 -      pnv_pci_phb3_tce_invalidate_pe(pe);
 +      pnv_pci_ioda2_tce_invalidate_pe(pe);
  
        return 0;
  }
@@@ -2362,7 -2354,7 +2362,7 @@@ static long pnv_pci_ioda2_unset_window(
        if (ret)
                pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
        else
 -              pnv_pci_phb3_tce_invalidate_pe(pe);
 +              pnv_pci_ioda2_tce_invalidate_pe(pe);
  
        pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
  
@@@ -2718,15 -2710,21 +2718,21 @@@ static void pnv_pci_ioda2_setup_dma_pe(
  }
  
  #ifdef CONFIG_PCI_MSI
- static void pnv_ioda2_msi_eoi(struct irq_data *d)
+ int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
  {
-       unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
-       struct irq_chip *chip = irq_data_get_irq_chip(d);
        struct pnv_phb *phb = container_of(chip, struct pnv_phb,
                                           ioda.irq_chip);
+       return opal_pci_msi_eoi(phb->opal_id, hw_irq);
+ }
+ static void pnv_ioda2_msi_eoi(struct irq_data *d)
+ {
        int64_t rc;
+       unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+       struct irq_chip *chip = irq_data_get_irq_chip(d);
  
-       rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
+       rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
        WARN_ON_ONCE(rc);
  
        icp_native_eoi(d);
@@@ -2756,6 -2754,16 +2762,16 @@@ void pnv_set_msi_irq_chip(struct pnv_ph
        irq_set_chip(virq, &phb->ioda.irq_chip);
  }
  
+ /*
+  * Returns true iff chip is something that we could call
+  * pnv_opal_pci_msi_eoi for.
+  */
+ bool is_pnv_opal_msi(struct irq_chip *chip)
+ {
+       return chip->irq_eoi == pnv_ioda2_msi_eoi;
+ }
+ EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
  static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
                                  unsigned int hwirq, unsigned int virq,
                                  unsigned int is_64, struct msi_msg *msg)
@@@ -2878,7 -2886,7 +2894,7 @@@ static void pnv_pci_ioda_fixup_iov_reso
                res = &pdev->resource[i + PCI_IOV_RESOURCES];
                if (!res->flags || res->parent)
                        continue;
 -              if (!pnv_pci_is_m64(phb, res)) {
 +              if (!pnv_pci_is_m64_flags(res->flags)) {
                        dev_warn(&pdev->dev, "Don't support SR-IOV with"
                                        " non M64 VF BAR%d: %pR. \n",
                                 i, res);
@@@ -3103,7 -3111,7 +3119,7 @@@ static resource_size_t pnv_pci_window_a
         * alignment for any 64-bit resource, PCIe doesn't care and
         * bridges only do 64-bit prefetchable anyway.
         */
 -      if (phb->ioda.m64_segsize && (type & IORESOURCE_MEM_64))
 +      if (phb->ioda.m64_segsize && pnv_pci_is_m64_flags(type))
                return phb->ioda.m64_segsize;
        if (type & IORESOURCE_MEM)
                return phb->ioda.m32_segsize;
@@@ -3410,6 -3418,12 +3426,6 @@@ static void pnv_ioda_release_pe(struct 
        struct pnv_phb *phb = pe->phb;
        struct pnv_ioda_pe *slave, *tmp;
  
 -      /* Release slave PEs in compound PE */
 -      if (pe->flags & PNV_IODA_PE_MASTER) {
 -              list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
 -                      pnv_ioda_release_pe(slave);
 -      }
 -
        list_del(&pe->list);
        switch (phb->type) {
        case PNV_PHB_IODA1:
  
        pnv_ioda_release_pe_seg(pe);
        pnv_ioda_deconfigure_pe(pe->phb, pe);
 -      pnv_ioda_free_pe(pe);
 +
 +      /* Release slave PEs in the compound PE */
 +      if (pe->flags & PNV_IODA_PE_MASTER) {
 +              list_for_each_entry_safe(slave, tmp, &pe->slaves, list) {
 +                      list_del(&slave->list);
 +                      pnv_ioda_free_pe(slave);
 +              }
 +      }
 +
 +      /*
 +       * The PE for root bus can be removed because of hotplug in EEH
 +       * recovery for fenced PHB error. We need to mark the PE dead so
 +       * that it can be populated again in PCI hot add path. The PE
 +       * shouldn't be destroyed as it's the global reserved resource.
 +       */
 +      if (phb->ioda.root_pe_populated &&
 +          phb->ioda.root_pe_idx == pe->pe_number)
 +              phb->ioda.root_pe_populated = false;
 +      else
 +              pnv_ioda_free_pe(pe);
  }
  
  static void pnv_pci_release_device(struct pci_dev *pdev)
        if (!pdn || pdn->pe_number == IODA_INVALID_PE)
                return;
  
 +      /*
 +       * PCI hotplug can happen as part of EEH error recovery. The @pdn
 +       * isn't removed and added afterwards in this scenario. We should
 +       * set the PE number in @pdn to an invalid one. Otherwise, the PE's
 +       * device count is decreased when removing devices but not
 +       * increased when adding them back, leaving the PE's device
 +       * count unbalanced and eventually breaking the normal PCI
 +       * hotplug path.
 +       */
        pe = &phb->ioda.pe_array[pdn->pe_number];
 +      pdn->pe_number = IODA_INVALID_PE;
 +
        WARN_ON(--pe->device_count < 0);
        if (pe->device_count == 0)
                pnv_ioda_release_pe(pe);
diff --combined arch/s390/kvm/kvm-s390.c
@@@ -245,33 -245,22 +245,33 @@@ static void kvm_s390_cpu_feat_init(void
                     PTFF_QAF);
  
        if (test_facility(17)) { /* MSA */
 -              __cpacf_query(CPACF_KMAC, kvm_s390_available_subfunc.kmac);
 -              __cpacf_query(CPACF_KMC, kvm_s390_available_subfunc.kmc);
 -              __cpacf_query(CPACF_KM, kvm_s390_available_subfunc.km);
 -              __cpacf_query(CPACF_KIMD, kvm_s390_available_subfunc.kimd);
 -              __cpacf_query(CPACF_KLMD, kvm_s390_available_subfunc.klmd);
 +              __cpacf_query(CPACF_KMAC, (cpacf_mask_t *)
 +                            kvm_s390_available_subfunc.kmac);
 +              __cpacf_query(CPACF_KMC, (cpacf_mask_t *)
 +                            kvm_s390_available_subfunc.kmc);
 +              __cpacf_query(CPACF_KM, (cpacf_mask_t *)
 +                            kvm_s390_available_subfunc.km);
 +              __cpacf_query(CPACF_KIMD, (cpacf_mask_t *)
 +                            kvm_s390_available_subfunc.kimd);
 +              __cpacf_query(CPACF_KLMD, (cpacf_mask_t *)
 +                            kvm_s390_available_subfunc.klmd);
        }
        if (test_facility(76)) /* MSA3 */
 -              __cpacf_query(CPACF_PCKMO, kvm_s390_available_subfunc.pckmo);
 +              __cpacf_query(CPACF_PCKMO, (cpacf_mask_t *)
 +                            kvm_s390_available_subfunc.pckmo);
        if (test_facility(77)) { /* MSA4 */
 -              __cpacf_query(CPACF_KMCTR, kvm_s390_available_subfunc.kmctr);
 -              __cpacf_query(CPACF_KMF, kvm_s390_available_subfunc.kmf);
 -              __cpacf_query(CPACF_KMO, kvm_s390_available_subfunc.kmo);
 -              __cpacf_query(CPACF_PCC, kvm_s390_available_subfunc.pcc);
 +              __cpacf_query(CPACF_KMCTR, (cpacf_mask_t *)
 +                            kvm_s390_available_subfunc.kmctr);
 +              __cpacf_query(CPACF_KMF, (cpacf_mask_t *)
 +                            kvm_s390_available_subfunc.kmf);
 +              __cpacf_query(CPACF_KMO, (cpacf_mask_t *)
 +                            kvm_s390_available_subfunc.kmo);
 +              __cpacf_query(CPACF_PCC, (cpacf_mask_t *)
 +                            kvm_s390_available_subfunc.pcc);
        }
        if (test_facility(57)) /* MSA5 */
 -              __cpacf_query(CPACF_PPNO, kvm_s390_available_subfunc.ppno);
 +              __cpacf_query(CPACF_PPNO, (cpacf_mask_t *)
 +                            kvm_s390_available_subfunc.ppno);
  
        if (MACHINE_HAS_ESOP)
                allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
@@@ -384,7 -373,9 +384,9 @@@ int kvm_vm_ioctl_check_extension(struc
        case KVM_CAP_NR_VCPUS:
        case KVM_CAP_MAX_VCPUS:
                r = KVM_S390_BSCA_CPU_SLOTS;
-               if (sclp.has_esca && sclp.has_64bscao)
+               if (!kvm_s390_use_sca_entries())
+                       r = KVM_MAX_VCPUS;
+               else if (sclp.has_esca && sclp.has_64bscao)
                        r = KVM_S390_ESCA_CPU_SLOTS;
                break;
        case KVM_CAP_NR_MEMSLOTS:
@@@ -1498,6 -1489,16 +1500,16 @@@ out_err
        return rc;
  }
  
+ bool kvm_arch_has_vcpu_debugfs(void)
+ {
+       return false;
+ }
+ int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+ {
+       return 0;
+ }
  void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
  {
        VCPU_EVENT(vcpu, 3, "%s", "free cpu");
@@@ -1561,6 -1562,8 +1573,8 @@@ static int __kvm_ucontrol_vcpu_init(str
  
  static void sca_del_vcpu(struct kvm_vcpu *vcpu)
  {
+       if (!kvm_s390_use_sca_entries())
+               return;
        read_lock(&vcpu->kvm->arch.sca_lock);
        if (vcpu->kvm->arch.use_esca) {
                struct esca_block *sca = vcpu->kvm->arch.sca;
  
  static void sca_add_vcpu(struct kvm_vcpu *vcpu)
  {
+       if (!kvm_s390_use_sca_entries()) {
+               struct bsca_block *sca = vcpu->kvm->arch.sca;
+               /* we still need the basic sca for the ipte control */
+               vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
+               vcpu->arch.sie_block->scaol = (__u32)(__u64)sca;
+       }
        read_lock(&vcpu->kvm->arch.sca_lock);
        if (vcpu->kvm->arch.use_esca) {
                struct esca_block *sca = vcpu->kvm->arch.sca;
@@@ -1658,6 -1668,11 +1679,11 @@@ static int sca_can_add_vcpu(struct kvm 
  {
        int rc;
  
+       if (!kvm_s390_use_sca_entries()) {
+               if (id < KVM_MAX_VCPUS)
+                       return true;
+               return false;
+       }
        if (id < KVM_S390_BSCA_CPU_SLOTS)
                return true;
        if (!sclp.has_esca || !sclp.has_64bscao)
@@@ -1946,8 -1961,6 +1972,6 @@@ int kvm_arch_vcpu_setup(struct kvm_vcp
                vcpu->arch.sie_block->eca |= 1;
        if (sclp.has_sigpif)
                vcpu->arch.sie_block->eca |= 0x10000000U;
-       if (test_kvm_facility(vcpu->kvm, 64))
-               vcpu->arch.sie_block->ecb3 |= 0x01;
        if (test_kvm_facility(vcpu->kvm, 129)) {
                vcpu->arch.sie_block->eca |= 0x00020000;
                vcpu->arch.sie_block->ecd |= 0x20000000;
@@@ -2239,10 -2252,9 +2263,10 @@@ int kvm_arch_vcpu_ioctl_set_fpu(struct 
                return -EINVAL;
        current->thread.fpu.fpc = fpu->fpc;
        if (MACHINE_HAS_VX)
 -              convert_fp_to_vx(current->thread.fpu.vxrs, (freg_t *)fpu->fprs);
 +              convert_fp_to_vx((__vector128 *) vcpu->run->s.regs.vrs,
 +                               (freg_t *) fpu->fprs);
        else
 -              memcpy(current->thread.fpu.fprs, &fpu->fprs, sizeof(fpu->fprs));
 +              memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs));
        return 0;
  }
  
@@@ -2251,10 -2263,9 +2275,10 @@@ int kvm_arch_vcpu_ioctl_get_fpu(struct 
        /* make sure we have the latest values */
        save_fpu_regs();
        if (MACHINE_HAS_VX)
 -              convert_vx_to_fp((freg_t *)fpu->fprs, current->thread.fpu.vxrs);
 +              convert_vx_to_fp((freg_t *) fpu->fprs,
 +                               (__vector128 *) vcpu->run->s.regs.vrs);
        else
 -              memcpy(fpu->fprs, current->thread.fpu.fprs, sizeof(fpu->fprs));
 +              memcpy(fpu->fprs, vcpu->run->s.regs.fprs, sizeof(fpu->fprs));
        fpu->fpc = current->thread.fpu.fpc;
        return 0;
  }
@@@ -2704,6 -2715,19 +2728,19 @@@ static void sync_regs(struct kvm_vcpu *
                if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
                        kvm_clear_async_pf_completion_queue(vcpu);
        }
+       /*
+        * If userspace sets the riccb (e.g. after migration) to a valid state,
+        * we should enable RI here instead of doing the lazy enablement.
+        */
+       if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) &&
+           test_kvm_facility(vcpu->kvm, 64)) {
+               struct runtime_instr_cb *riccb =
+                       (struct runtime_instr_cb *) &kvm_run->s.regs.riccb;
+               if (riccb->valid)
+                       vcpu->arch.sie_block->ecb3 |= 0x01;
+       }
        kvm_run->kvm_dirty_regs = 0;
  }
  
@@@ -2847,38 -2871,6 +2884,6 @@@ int kvm_s390_vcpu_store_status(struct k
        return kvm_s390_store_status_unloaded(vcpu, addr);
  }
  
- /*
-  * store additional status at address
-  */
- int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
-                                       unsigned long gpa)
- {
-       /* Only bits 0-53 are used for address formation */
-       if (!(gpa & ~0x3ff))
-               return 0;
-       return write_guest_abs(vcpu, gpa & ~0x3ff,
-                              (void *)&vcpu->run->s.regs.vrs, 512);
- }
- int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr)
- {
-       if (!test_kvm_facility(vcpu->kvm, 129))
-               return 0;
-       /*
-        * The guest VXRS are in the host VXRs due to the lazy
-        * copying in vcpu load/put. We can simply call save_fpu_regs()
-        * to save the current register state because we are in the
-        * middle of a load/put cycle.
-        *
-        * Let's update our copies before we save it into the save area.
-        */
-       save_fpu_regs();
-       return kvm_s390_store_adtl_status_unloaded(vcpu, addr);
- }
  static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
  {
        kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu);
diff --combined arch/x86/kvm/svm.c
@@@ -34,6 -34,8 +34,8 @@@
  #include <linux/sched.h>
  #include <linux/trace_events.h>
  #include <linux/slab.h>
+ #include <linux/amd-iommu.h>
+ #include <linux/hashtable.h>
  
  #include <asm/apic.h>
  #include <asm/perf_event.h>
@@@ -41,6 -43,7 +43,7 @@@
  #include <asm/desc.h>
  #include <asm/debugreg.h>
  #include <asm/kvm_para.h>
+ #include <asm/irq_remapping.h>
  
  #include <asm/virtext.h>
  #include "trace.h"
@@@ -96,6 -99,19 +99,19 @@@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id)
  #define AVIC_UNACCEL_ACCESS_OFFSET_MASK               0xFF0
  #define AVIC_UNACCEL_ACCESS_VECTOR_MASK               0xFFFFFFFF
  
+ /* AVIC GATAG is encoded using VM and VCPU IDs */
+ #define AVIC_VCPU_ID_BITS             8
+ #define AVIC_VCPU_ID_MASK             ((1 << AVIC_VCPU_ID_BITS) - 1)
+ #define AVIC_VM_ID_BITS                       24
+ #define AVIC_VM_ID_NR                 (1 << AVIC_VM_ID_BITS)
+ #define AVIC_VM_ID_MASK                       ((1 << AVIC_VM_ID_BITS) - 1)
+ #define AVIC_GATAG(x, y)              (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
+                                               (y & AVIC_VCPU_ID_MASK))
+ #define AVIC_GATAG_TO_VMID(x)         ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
+ #define AVIC_GATAG_TO_VCPUID(x)               (x & AVIC_VCPU_ID_MASK)
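As a quick illustration (not part of the patch; the values are invented for the example), the GA tag round-trips like this, which is how avic_ga_log_notifier() further down recovers the VM and vCPU from the tag the IOMMU logs:

    u32 vm_id   = 0x000123;                   /* 24-bit AVIC VM ID (example) */
    u32 vcpu_id = 0x05;                       /* 8-bit vCPU ID (example)     */
    u32 ga_tag  = AVIC_GATAG(vm_id, vcpu_id); /* 0x00012305                  */

    WARN_ON(AVIC_GATAG_TO_VMID(ga_tag)   != vm_id);   /* recovers 0x000123 */
    WARN_ON(AVIC_GATAG_TO_VCPUID(ga_tag) != vcpu_id); /* recovers 0x05     */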
  static bool erratum_383_found __read_mostly;
  
  static const u32 host_save_user_msrs[] = {
@@@ -185,6 -201,23 +201,23 @@@ struct vcpu_svm 
        struct page *avic_backing_page;
        u64 *avic_physical_id_cache;
        bool avic_is_running;
+       /*
+        * Per-vcpu list of struct amd_svm_iommu_ir:
+        * This is used mainly to store interrupt remapping information used
+        * when updating the vcpu affinity. This avoids the need to scan the
+        * IRTEs and try to match the ga_tag in the IOMMU driver.
+        */
+       struct list_head ir_list;
+       spinlock_t ir_list_lock;
+ };
+ /*
+  * This is a wrapper of struct amd_iommu_ir_data.
+  */
+ struct amd_svm_iommu_ir {
+       struct list_head node;  /* Used by SVM for per-vcpu ir_list */
+       void *data;             /* Storing pointer to struct amd_ir_data */
  };
  
  #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK  (0xFF)
@@@ -242,6 -275,10 +275,10 @@@ static int avic
  module_param(avic, int, S_IRUGO);
  #endif
  
+ /* AVIC VM ID bit masks and lock */
+ static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR);
+ static DEFINE_SPINLOCK(avic_vm_id_lock);
  static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
  static void svm_flush_tlb(struct kvm_vcpu *vcpu);
  static void svm_complete_interrupts(struct vcpu_svm *svm);
@@@ -928,6 -965,55 +965,55 @@@ static void svm_disable_lbrv(struct vcp
        set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
  }
  
+ /* Note:
+  * This hash table is used to map VM_ID to a struct kvm_arch,
+  * when handling AMD IOMMU GALOG notification to schedule in
+  * a particular vCPU.
+  */
+ #define SVM_VM_DATA_HASH_BITS 8
+ DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+ static spinlock_t svm_vm_data_hash_lock;
+ /* Note:
+  * This function is called from IOMMU driver to notify
+  * SVM to schedule in a particular vCPU of a particular VM.
+  */
+ static int avic_ga_log_notifier(u32 ga_tag)
+ {
+       unsigned long flags;
+       struct kvm_arch *ka = NULL;
+       struct kvm_vcpu *vcpu = NULL;
+       u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
+       u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+       pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
+       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+       hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
+               struct kvm *kvm = container_of(ka, struct kvm, arch);
+               struct kvm_arch *vm_data = &kvm->arch;
+               if (vm_data->avic_vm_id != vm_id)
+                       continue;
+               vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+               break;
+       }
+       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+       if (!vcpu)
+               return 0;
+       /* Note:
+        * At this point, the IOMMU should have already set the pending
+        * bit in the vAPIC backing page. So, we just need to schedule
+        * in the vcpu.
+        */
+       if (vcpu->mode == OUTSIDE_GUEST_MODE)
+               kvm_vcpu_wake_up(vcpu);
+       return 0;
+ }
  static __init int svm_hardware_setup(void)
  {
        int cpu;
        if (avic) {
                if (!npt_enabled ||
                    !boot_cpu_has(X86_FEATURE_AVIC) ||
-                   !IS_ENABLED(CONFIG_X86_LOCAL_APIC))
+                   !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
                        avic = false;
-               else
+               } else {
                        pr_info("AVIC enabled\n");
+                       hash_init(svm_vm_data_hash);
+                       spin_lock_init(&svm_vm_data_hash_lock);
+                       amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+               }
        }
  
        return 0;
@@@ -1028,13 -1119,6 +1119,6 @@@ static void init_sys_seg(struct vmcb_se
        seg->base = 0;
  }
  
- static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
- {
-       struct vcpu_svm *svm = to_svm(vcpu);
-       return svm->vmcb->control.tsc_offset;
- }
  static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
@@@ -1280,19 -1364,55 +1364,55 @@@ static int avic_init_backing_page(struc
        return 0;
  }
  
+ static inline int avic_get_next_vm_id(void)
+ {
+       int id;
+       spin_lock(&avic_vm_id_lock);
+       /* AVIC VM ID is one-based. */
+       id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1);
+       if (id <= AVIC_VM_ID_MASK)
+               __set_bit(id, avic_vm_id_bitmap);
+       else
+               id = -EAGAIN;
+       spin_unlock(&avic_vm_id_lock);
+       return id;
+ }
+ static inline int avic_free_vm_id(int id)
+ {
+       if (id <= 0 || id > AVIC_VM_ID_MASK)
+               return -EINVAL;
+       spin_lock(&avic_vm_id_lock);
+       __clear_bit(id, avic_vm_id_bitmap);
+       spin_unlock(&avic_vm_id_lock);
+       return 0;
+ }
  static void avic_vm_destroy(struct kvm *kvm)
  {
+       unsigned long flags;
        struct kvm_arch *vm_data = &kvm->arch;
  
+       avic_free_vm_id(vm_data->avic_vm_id);
        if (vm_data->avic_logical_id_table_page)
                __free_page(vm_data->avic_logical_id_table_page);
        if (vm_data->avic_physical_id_table_page)
                __free_page(vm_data->avic_physical_id_table_page);
+       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+       hash_del(&vm_data->hnode);
+       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
  }
  
  static int avic_vm_init(struct kvm *kvm)
  {
-       int err = -ENOMEM;
+       unsigned long flags;
+       int vm_id, err = -ENOMEM;
        struct kvm_arch *vm_data = &kvm->arch;
        struct page *p_page;
        struct page *l_page;
        if (!avic)
                return 0;
  
+       vm_id = avic_get_next_vm_id();
+       if (vm_id < 0)
+               return vm_id;
+       vm_data->avic_vm_id = (u32)vm_id;
        /* Allocating physical APIC ID table (4KB) */
        p_page = alloc_page(GFP_KERNEL);
        if (!p_page)
        vm_data->avic_logical_id_table_page = l_page;
        clear_page(page_address(l_page));
  
+       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+       hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id);
+       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
        return 0;
  
  free_avic:
        return err;
  }
  
- /**
-  * This function is called during VCPU halt/unhalt.
-  */
- static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+ static inline int
+ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
  {
-       u64 entry;
-       int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu);
+       int ret = 0;
+       unsigned long flags;
+       struct amd_svm_iommu_ir *ir;
        struct vcpu_svm *svm = to_svm(vcpu);
  
-       if (!kvm_vcpu_apicv_active(vcpu))
-               return;
-       svm->avic_is_running = is_run;
+       if (!kvm_arch_has_assigned_device(vcpu->kvm))
+               return 0;
  
-       /* ID = 0xff (broadcast), ID > 0xff (reserved) */
-       if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
-               return;
+       /*
+        * Here, we go through the per-vcpu ir_list to update all existing
+        * interrupt remapping table entries targeting this vcpu.
+        */
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
  
-       entry = READ_ONCE(*(svm->avic_physical_id_cache));
-       WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
+       if (list_empty(&svm->ir_list))
+               goto out;
  
-       entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-       if (is_run)
-               entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-       WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+       list_for_each_entry(ir, &svm->ir_list, node) {
+               ret = amd_iommu_update_ga(cpu, r, ir->data);
+               if (ret)
+                       break;
+       }
+ out:
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+       return ret;
  }
  
  static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
  
        WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+       avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
+                                       svm->avic_is_running);
  }
  
  static void avic_vcpu_put(struct kvm_vcpu *vcpu)
                return;
  
        entry = READ_ONCE(*(svm->avic_physical_id_cache));
+       if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+               avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
        entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
        WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
  }
  
+ /**
+  * This function is called during VCPU halt/unhalt.
+  */
+ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+       svm->avic_is_running = is_run;
+       if (is_run)
+               avic_vcpu_load(vcpu, vcpu->cpu);
+       else
+               avic_vcpu_put(vcpu);
+ }
  static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
@@@ -1450,6 -1601,9 +1601,9 @@@ static struct kvm_vcpu *svm_create_vcpu
                err = avic_init_backing_page(&svm->vcpu);
                if (err)
                        goto free_page4;
+               INIT_LIST_HEAD(&svm->ir_list);
+               spin_lock_init(&svm->ir_list_lock);
        }
  
        /* We initialize this flag to true to make sure that the is_running
@@@ -4246,6 -4400,209 +4400,209 @@@ static void svm_deliver_avic_intr(struc
                kvm_vcpu_wake_up(vcpu);
  }
  
+ static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+ {
+       unsigned long flags;
+       struct amd_svm_iommu_ir *cur;
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
+       list_for_each_entry(cur, &svm->ir_list, node) {
+               if (cur->data != pi->ir_data)
+                       continue;
+               list_del(&cur->node);
+               kfree(cur);
+               break;
+       }
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ }
+ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+ {
+       int ret = 0;
+       unsigned long flags;
+       struct amd_svm_iommu_ir *ir;
+       /**
+        * In some cases, the existing irte is updated and re-set,
+        * so we need to check here if it's already been added
+        * to the ir_list.
+        */
+       if (pi->ir_data && (pi->prev_ga_tag != 0)) {
+               struct kvm *kvm = svm->vcpu.kvm;
+               u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
+               struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+               struct vcpu_svm *prev_svm;
+               if (!prev_vcpu) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               prev_svm = to_svm(prev_vcpu);
+               svm_ir_list_del(prev_svm, pi);
+       }
+       /**
+        * Allocate a new amd_svm_iommu_ir, which will get
+        * added to the per-vcpu ir_list.
+        */
+       ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
+       if (!ir) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       ir->data = pi->ir_data;
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
+       list_add(&ir->node, &svm->ir_list);
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ out:
+       return ret;
+ }
+ /**
+  * Note:
+  * The HW cannot support posting multicast/broadcast
+  * interrupts to a vCPU. So, we still use legacy interrupt
+  * remapping for these kinds of interrupts.
+  *
+  * For lowest-priority interrupts, we only support
+  * those with single CPU as the destination, e.g. user
+  * configures the interrupts via /proc/irq or uses
+  * irqbalance to make the interrupts single-CPU.
+  */
+ static int
+ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+                struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
+ {
+       struct kvm_lapic_irq irq;
+       struct kvm_vcpu *vcpu = NULL;
+       kvm_set_msi_irq(kvm, e, &irq);
+       if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+               pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
+                        __func__, irq.vector);
+               return -1;
+       }
+       pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
+                irq.vector);
+       *svm = to_svm(vcpu);
+       vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page);
+       vcpu_info->vector = irq.vector;
+       return 0;
+ }
+ /*
+  * svm_update_pi_irte - set IRTE for Posted-Interrupts
+  *
+  * @kvm: kvm
+  * @host_irq: host irq of the interrupt
+  * @guest_irq: gsi of the interrupt
+  * @set: set or unset PI
+  * returns 0 on success, < 0 on failure
+  */
+ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+                             uint32_t guest_irq, bool set)
+ {
+       struct kvm_kernel_irq_routing_entry *e;
+       struct kvm_irq_routing_table *irq_rt;
+       int idx, ret = -EINVAL;
+       if (!kvm_arch_has_assigned_device(kvm) ||
+           !irq_remapping_cap(IRQ_POSTING_CAP))
+               return 0;
+       pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
+                __func__, host_irq, guest_irq, set);
+       idx = srcu_read_lock(&kvm->irq_srcu);
+       irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+       WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+       hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+               struct vcpu_data vcpu_info;
+               struct vcpu_svm *svm = NULL;
+               if (e->type != KVM_IRQ_ROUTING_MSI)
+                       continue;
+               /**
+                * Here, we set up legacy mode in the following cases:
+                * 1. When the interrupt cannot be targeted to a specific vcpu.
+                * 2. Unsetting the posted interrupt.
+                * 3. APIC virtualization is disabled for the vcpu.
+                */
+               if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
+                   kvm_vcpu_apicv_active(&svm->vcpu)) {
+                       struct amd_iommu_pi_data pi;
+                       /* Try to enable guest_mode in IRTE */
+                       pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK;
+                       pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
+                                                    svm->vcpu.vcpu_id);
+                       pi.is_guest_mode = true;
+                       pi.vcpu_data = &vcpu_info;
+                       ret = irq_set_vcpu_affinity(host_irq, &pi);
+                       /**
+                        * Here, we have successfully set up vcpu affinity in
+                        * IOMMU guest mode. Now, we need to store the posted
+                        * interrupt information in a per-vcpu ir_list so that
+                        * we can reference it directly when we update the vcpu
+                        * scheduling information in the IOMMU irte.
+                        */
+                       if (!ret && pi.is_guest_mode)
+                               svm_ir_list_add(svm, &pi);
+               } else {
+                       /* Use legacy mode in IRTE */
+                       struct amd_iommu_pi_data pi;
+                       /**
+                        * Here, pi is used to:
+                        * - Tell IOMMU to use legacy mode for this interrupt.
+                        * - Retrieve ga_tag of prior interrupt remapping data.
+                        */
+                       pi.is_guest_mode = false;
+                       ret = irq_set_vcpu_affinity(host_irq, &pi);
+                       /**
+                        * Check if the posted interrupt was previously
+                        * setup with the guest_mode by checking if the ga_tag
+                        * was cached. If so, we need to clean up the per-vcpu
+                        * ir_list.
+                        */
+                       if (!ret && pi.prev_ga_tag) {
+                               int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+                               struct kvm_vcpu *vcpu;
+                               vcpu = kvm_get_vcpu_by_id(kvm, id);
+                               if (vcpu)
+                                       svm_ir_list_del(to_svm(vcpu), &pi);
+                       }
+               }
+               if (!ret && svm) {
+                       trace_kvm_pi_irte_update(svm->vcpu.vcpu_id,
+                                                host_irq, e->gsi,
+                                                vcpu_info.vector,
+                                                vcpu_info.pi_desc_addr, set);
+               }
+               if (ret < 0) {
+                       pr_err("%s: failed to update PI IRTE\n", __func__);
+                       goto out;
+               }
+       }
+       ret = 0;
+ out:
+       srcu_read_unlock(&kvm->irq_srcu, idx);
+       return ret;
+ }
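For context, a rough sketch of how the generic irqbypass code in x86.c (not shown in this diff) is expected to reach the new update_pi_irte hook; it mirrors the existing VT-d posted-interrupt wiring, and the exact field names here are quoted from memory rather than from this patch:

    int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
                                         struct irq_bypass_producer *prod)
    {
            struct kvm_kernel_irqfd *irqfd =
                    container_of(cons, struct kvm_kernel_irqfd, consumer);

            irqfd->producer = prod;

            /* prod->irq is the host IRQ from VFIO, irqfd->gsi the guest GSI */
            return kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq,
                                               irqfd->gsi, 1);
    }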
  static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
@@@ -4961,7 -5318,7 +5318,7 @@@ static inline void avic_post_state_rest
        avic_handle_ldr_update(vcpu);
  }
  
 -static struct kvm_x86_ops svm_x86_ops = {
 +static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .cpu_has_kvm_support = has_svm,
        .disabled_by_bios = is_disabled,
        .hardware_setup = svm_hardware_setup,
  
        .has_wbinvd_exit = svm_has_wbinvd_exit,
  
-       .read_tsc_offset = svm_read_tsc_offset,
        .write_tsc_offset = svm_write_tsc_offset,
        .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest,
        .read_l1_tsc = svm_read_l1_tsc,
  
        .pmu_ops = &amd_pmu_ops,
        .deliver_posted_interrupt = svm_deliver_avic_intr,
+       .update_pi_irte = svm_update_pi_irte,
  };
  
  static int __init svm_init(void)
diff --combined arch/x86/kvm/vmx.c
@@@ -927,6 -927,8 +927,8 @@@ static unsigned long *vmx_msr_bitmap_le
  static unsigned long *vmx_msr_bitmap_longmode;
  static unsigned long *vmx_msr_bitmap_legacy_x2apic;
  static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+ static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+ static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
  static unsigned long *vmx_vmread_bitmap;
  static unsigned long *vmx_vmwrite_bitmap;
  
@@@ -939,6 -941,7 +941,7 @@@ static DEFINE_SPINLOCK(vmx_vpid_lock)
  static struct vmcs_config {
        int size;
        int order;
+       u32 basic_cap;
        u32 revision_id;
        u32 pin_based_exec_ctrl;
        u32 cpu_based_exec_ctrl;
@@@ -1215,6 -1218,11 +1218,11 @@@ static inline bool cpu_has_vmx_ple(void
                SECONDARY_EXEC_PAUSE_LOOP_EXITING;
  }
  
+ static inline bool cpu_has_vmx_basic_inout(void)
+ {
+       return  (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
+ }
  static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
  {
        return flexpriority_enabled && lapic_in_kernel(vcpu);
@@@ -2518,10 -2526,17 +2526,17 @@@ static void vmx_set_msr_bitmap(struct k
        else if (cpu_has_secondary_exec_ctrls() &&
                 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
                  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
-               if (is_long_mode(vcpu))
-                       msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
-               else
-                       msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+               if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
+                       if (is_long_mode(vcpu))
+                               msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+                       else
+                               msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+               } else {
+                       if (is_long_mode(vcpu))
+                               msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
+                       else
+                               msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+               }
        } else {
                if (is_long_mode(vcpu))
                        msr_bitmap = vmx_msr_bitmap_longmode;
@@@ -2603,11 -2618,6 +2618,6 @@@ static u64 vmx_read_l1_tsc(struct kvm_v
        return host_tsc + tsc_offset;
  }
  
- static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
- {
-       return vmcs_read64(TSC_OFFSET);
- }
  /*
   * writes 'offset' into guest's timestamp counter offset register
   */
@@@ -2877,6 -2887,8 +2887,8 @@@ static int vmx_get_vmx_msr(struct kvm_v
                *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
                           ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
                           (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+               if (cpu_has_vmx_basic_inout())
+                       *pdata |= VMX_BASIC_INOUT;
                break;
        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
        case MSR_IA32_VMX_PINBASED_CTLS:
@@@ -3457,7 -3469,8 +3469,8 @@@ static __init int setup_vmcs_config(str
                return -EIO;
  
        vmcs_conf->size = vmx_msr_high & 0x1fff;
-       vmcs_conf->order = get_order(vmcs_config.size);
+       vmcs_conf->order = get_order(vmcs_conf->size);
+       vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
        vmcs_conf->revision_id = vmx_msr_low;
  
        vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
@@@ -4678,28 -4691,49 +4691,49 @@@ static void vmx_disable_intercept_for_m
                                                msr, MSR_TYPE_R | MSR_TYPE_W);
  }
  
- static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
+ static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
  {
-       __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-                       msr, MSR_TYPE_R);
-       __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-                       msr, MSR_TYPE_R);
+       if (apicv_active) {
+               __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+                               msr, MSR_TYPE_R);
+               __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+                               msr, MSR_TYPE_R);
+       } else {
+               __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_R);
+               __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_R);
+       }
  }
  
- static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
+ static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
  {
-       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-                       msr, MSR_TYPE_R);
-       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-                       msr, MSR_TYPE_R);
+       if (apicv_active) {
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+                               msr, MSR_TYPE_R);
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+                               msr, MSR_TYPE_R);
+       } else {
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_R);
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_R);
+       }
  }
  
- static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
+ static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active)
  {
-       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-                       msr, MSR_TYPE_W);
-       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-                       msr, MSR_TYPE_W);
+       if (apicv_active) {
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+                               msr, MSR_TYPE_W);
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+                               msr, MSR_TYPE_W);
+       } else {
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_W);
+               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+                               msr, MSR_TYPE_W);
+       }
  }
  
  static bool vmx_get_enable_apicv(void)
@@@ -5279,29 -5313,30 +5313,30 @@@ static void vmx_inject_nmi(struct kvm_v
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
  
-       if (is_guest_mode(vcpu))
-               return;
+       if (!is_guest_mode(vcpu)) {
+               if (!cpu_has_virtual_nmis()) {
+                       /*
+                        * Tracking the NMI-blocked state in software is built upon
+                        * finding the next open IRQ window. This, in turn, depends on
+                        * well-behaving guests: They have to keep IRQs disabled at
+                        * least as long as the NMI handler runs. Otherwise we may
+                        * cause NMI nesting, maybe breaking the guest. But as this is
+                        * highly unlikely, we can live with the residual risk.
+                        */
+                       vmx->soft_vnmi_blocked = 1;
+                       vmx->vnmi_blocked_time = 0;
+               }
  
-       if (!cpu_has_virtual_nmis()) {
-               /*
-                * Tracking the NMI-blocked state in software is built upon
-                * finding the next open IRQ window. This, in turn, depends on
-                * well-behaving guests: They have to keep IRQs disabled at
-                * least as long as the NMI handler runs. Otherwise we may
-                * cause NMI nesting, maybe breaking the guest. But as this is
-                * highly unlikely, we can live with the residual risk.
-                */
-               vmx->soft_vnmi_blocked = 1;
-               vmx->vnmi_blocked_time = 0;
+               ++vcpu->stat.nmi_injections;
+               vmx->nmi_known_unmasked = false;
        }
  
-       ++vcpu->stat.nmi_injections;
-       vmx->nmi_known_unmasked = false;
        if (vmx->rmode.vm86_active) {
                if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
                        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
                return;
        }
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
  }
@@@ -6109,7 -6144,7 +6144,7 @@@ static int handle_ept_violation(struct 
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
  
        gla_validity = (exit_qualification >> 7) & 0x3;
-       if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
+       if (gla_validity == 0x2) {
                printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
                printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
                        (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
@@@ -6360,22 -6395,32 +6395,32 @@@ static __init int hardware_setup(void
        if (!vmx_msr_bitmap_legacy_x2apic)
                goto out2;
  
+       vmx_msr_bitmap_legacy_x2apic_apicv_inactive =
+                               (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive)
+               goto out3;
        vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_msr_bitmap_longmode)
-               goto out3;
+               goto out4;
  
        vmx_msr_bitmap_longmode_x2apic =
                                (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_msr_bitmap_longmode_x2apic)
-               goto out4;
+               goto out5;
+       vmx_msr_bitmap_longmode_x2apic_apicv_inactive =
+                               (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive)
+               goto out6;
  
        vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_vmread_bitmap)
-               goto out6;
+               goto out7;
  
        vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_vmwrite_bitmap)
-               goto out7;
+               goto out8;
  
        memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
        memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
  
        if (setup_vmcs_config(&vmcs_config) < 0) {
                r = -EIO;
-               goto out8;
+               goto out9;
        }
  
        if (boot_cpu_has(X86_FEATURE_NX))
                        vmx_msr_bitmap_legacy, PAGE_SIZE);
        memcpy(vmx_msr_bitmap_longmode_x2apic,
                        vmx_msr_bitmap_longmode, PAGE_SIZE);
+       memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+                       vmx_msr_bitmap_legacy, PAGE_SIZE);
+       memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+                       vmx_msr_bitmap_longmode, PAGE_SIZE);
  
        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
  
+       /*
+        * These bitmaps are used when
+        * enable_apicv && kvm_vcpu_apicv_active().
+        */
        for (msr = 0x800; msr <= 0x8ff; msr++)
-               vmx_disable_intercept_msr_read_x2apic(msr);
+               vmx_disable_intercept_msr_read_x2apic(msr, true);
  
        /* TMCCT */
-       vmx_enable_intercept_msr_read_x2apic(0x839);
+       vmx_enable_intercept_msr_read_x2apic(0x839, true);
        /* TPR */
-       vmx_disable_intercept_msr_write_x2apic(0x808);
+       vmx_disable_intercept_msr_write_x2apic(0x808, true);
        /* EOI */
-       vmx_disable_intercept_msr_write_x2apic(0x80b);
+       vmx_disable_intercept_msr_write_x2apic(0x80b, true);
        /* SELF-IPI */
-       vmx_disable_intercept_msr_write_x2apic(0x83f);
+       vmx_disable_intercept_msr_write_x2apic(0x83f, true);
+       /*
+        * The *_apicv_inactive bitmaps are used when
+        * (enable_apicv && !kvm_vcpu_apicv_active()) || !enable_apicv.
+        */
+       /* TPR */
+       vmx_disable_intercept_msr_read_x2apic(0x808, false);
+       vmx_disable_intercept_msr_write_x2apic(0x808, false);
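For reference, the hard-coded MSR numbers above follow the architectural x2APIC mapping: each xAPIC MMIO register at offset R is exposed as MSR 0x800 + (R >> 4). An illustrative helper (not part of the patch):

    #define X2APIC_MSR(mmio_offset)        (0x800 + ((mmio_offset) >> 4))

    /* X2APIC_MSR(0x080) == 0x808   TPR      */
    /* X2APIC_MSR(0x0b0) == 0x80b   EOI      */
    /* X2APIC_MSR(0x390) == 0x839   TMCCT    */
    /* X2APIC_MSR(0x3f0) == 0x83f   SELF-IPI */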
  
        if (enable_ept) {
                kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
  
        return alloc_kvm_area();
  
- out8:
+ out9:
        free_page((unsigned long)vmx_vmwrite_bitmap);
- out7:
+ out8:
        free_page((unsigned long)vmx_vmread_bitmap);
+ out7:
+       free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
  out6:
        free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
- out4:
+ out5:
        free_page((unsigned long)vmx_msr_bitmap_longmode);
+ out4:
+       free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
  out3:
        free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
  out2:
@@@ -6544,7 -6608,9 +6608,9 @@@ out
  static __exit void hardware_unsetup(void)
  {
        free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+       free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
        free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+       free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
        free_page((unsigned long)vmx_msr_bitmap_legacy);
        free_page((unsigned long)vmx_msr_bitmap_longmode);
        free_page((unsigned long)vmx_io_bitmap_b);
@@@ -6726,7 -6792,7 +6792,7 @@@ static void nested_vmx_abort(struct kvm
  {
        /* TODO: not to reset guest simply here. */
        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
-       pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
+       pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
  }
  
  static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
@@@ -7013,7 -7079,7 +7079,7 @@@ static int handle_vmon(struct kvm_vcpu 
        vmx->nested.vmcs02_num = 0;
  
        hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
-                    HRTIMER_MODE_REL);
+                    HRTIMER_MODE_REL_PINNED);
        vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
  
        vmx->nested.vmxon = true;
@@@ -8435,12 -8501,7 +8501,7 @@@ static void vmx_set_virtual_x2apic_mode
                return;
        }
  
-       /*
-        * There is not point to enable virtualize x2apic without enable
-        * apicv
-        */
-       if (!cpu_has_vmx_virtualize_x2apic_mode() ||
-                               !kvm_vcpu_apicv_active(vcpu))
+       if (!cpu_has_vmx_virtualize_x2apic_mode())
                return;
  
        if (!cpu_need_tpr_shadow(vcpu))
@@@ -9598,7 -9659,7 +9659,7 @@@ static int nested_vmx_check_msr_switch(
        maxphyaddr = cpuid_maxphyaddr(vcpu);
        if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
            (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
-               pr_warn_ratelimited(
+               pr_debug_ratelimited(
                        "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
                        addr_field, maxphyaddr, count, addr);
                return -EINVAL;
@@@ -9671,13 -9732,13 +9732,13 @@@ static u32 nested_vmx_load_msr(struct k
        for (i = 0; i < count; i++) {
                if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
                                        &e, sizeof(e))) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot read MSR entry (%u, 0x%08llx)\n",
                                __func__, i, gpa + i * sizeof(e));
                        goto fail;
                }
                if (nested_vmx_load_msr_check(vcpu, &e)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s check failed (%u, 0x%x, 0x%x)\n",
                                __func__, i, e.index, e.reserved);
                        goto fail;
                msr.index = e.index;
                msr.data = e.value;
                if (kvm_set_msr(vcpu, &msr)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
                                __func__, i, e.index, e.value);
                        goto fail;
@@@ -9706,13 -9767,13 +9767,13 @@@ static int nested_vmx_store_msr(struct 
                if (kvm_vcpu_read_guest(vcpu,
                                        gpa + i * sizeof(e),
                                        &e, 2 * sizeof(u32))) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot read MSR entry (%u, 0x%08llx)\n",
                                __func__, i, gpa + i * sizeof(e));
                        return -EINVAL;
                }
                if (nested_vmx_store_msr_check(vcpu, &e)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s check failed (%u, 0x%x, 0x%x)\n",
                                __func__, i, e.index, e.reserved);
                        return -EINVAL;
                msr_info.host_initiated = false;
                msr_info.index = e.index;
                if (kvm_get_msr(vcpu, &msr_info)) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot read MSR (%u, 0x%x)\n",
                                __func__, i, e.index);
                        return -EINVAL;
                                         gpa + i * sizeof(e) +
                                             offsetof(struct vmx_msr_entry, value),
                                         &msr_info.data, sizeof(msr_info.data))) {
-                       pr_warn_ratelimited(
+                       pr_debug_ratelimited(
                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
                                __func__, i, e.index, msr_info.data);
                        return -EINVAL;
@@@ -10500,6 -10561,9 +10561,9 @@@ static void prepare_vmcs12(struct kvm_v
                vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
        }
  
+       if (nested_cpu_has_ept(vmcs12))
+               vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
        if (nested_cpu_has_vid(vmcs12))
                vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
  
@@@ -10793,7 -10857,7 +10857,7 @@@ static void nested_vmx_vmexit(struct kv
         * We are now running in L2, mmu_notifier will force to reload the
         * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
         */
-       kvm_vcpu_reload_apic_access_page(vcpu);
+       kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
  
        /*
         * Exiting from L2 to L1, we're now back to L1 which thinks it just
@@@ -11177,7 -11241,7 +11241,7 @@@ static void vmx_setup_mce(struct kvm_vc
                        ~FEATURE_CONTROL_LMCE;
  }
  
 -static struct kvm_x86_ops vmx_x86_ops = {
 +static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
        .hardware_setup = hardware_setup,
  
        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
  
-       .read_tsc_offset = vmx_read_tsc_offset,
        .write_tsc_offset = vmx_write_tsc_offset,
        .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest,
        .read_l1_tsc = vmx_read_l1_tsc,
diff --combined arch/x86/kvm/x86.c
@@@ -1367,7 -1367,7 +1367,7 @@@ static void kvm_track_tsc_matching(stru
  
  static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
  {
-       u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
+       u64 curr_offset = vcpu->arch.tsc_offset;
        vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
  }
  
@@@ -1413,6 -1413,12 +1413,12 @@@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vc
  }
  EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
  
+ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+ {
+       kvm_x86_ops->write_tsc_offset(vcpu, offset);
+       vcpu->arch.tsc_offset = offset;
+ }
  void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
  {
        struct kvm *kvm = vcpu->kvm;
  
        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
        offset = kvm_compute_tsc_offset(vcpu, data);
-       ns = get_kernel_ns();
+       ns = ktime_get_boot_ns();
        elapsed = ns - kvm->arch.last_tsc_nsec;
  
        if (vcpu->arch.virtual_tsc_khz) {
  
        if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
                update_ia32_tsc_adjust_msr(vcpu, offset);
-       kvm_x86_ops->write_tsc_offset(vcpu, offset);
+       kvm_vcpu_write_tsc_offset(vcpu, offset);
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
  
        spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
@@@ -1716,6 -1722,88 +1722,88 @@@ static void kvm_gen_update_masterclock(
  #endif
  }
  
+ static u64 __get_kvmclock_ns(struct kvm *kvm)
+ {
+       struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
+       struct kvm_arch *ka = &kvm->arch;
+       s64 ns;
+       if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
+               u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+               ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
+       } else {
+               ns = ktime_get_boot_ns() + ka->kvmclock_offset;
+       }
+       return ns;
+ }
+ u64 get_kvmclock_ns(struct kvm *kvm)
+ {
+       unsigned long flags;
+       s64 ns;
+       local_irq_save(flags);
+       ns = __get_kvmclock_ns(kvm);
+       local_irq_restore(flags);
+       return ns;
+ }
+ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+ {
+       struct kvm_vcpu_arch *vcpu = &v->arch;
+       struct pvclock_vcpu_time_info guest_hv_clock;
+       if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+               &guest_hv_clock, sizeof(guest_hv_clock))))
+               return;
+       /* This VCPU is paused, but it's legal for a guest to read another
+        * VCPU's kvmclock, so we really have to follow the specification where
+        * it says that version is odd if data is being modified, and even after
+        * it is consistent.
+        *
+        * Version field updates must be kept separate.  This is because
+        * kvm_write_guest_cached might use a "rep movs" instruction, and
+        * writes within a string instruction are weakly ordered.  So there
+        * are three writes overall.
+        *
+        * As a small optimization, only write the version field in the first
+        * and third write.  The vcpu->pv_time cache is still valid, because the
+        * version field is the first in the struct.
+        */
+       BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+       vcpu->hv_clock.version = guest_hv_clock.version + 1;
+       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                               &vcpu->hv_clock,
+                               sizeof(vcpu->hv_clock.version));
+       smp_wmb();
+       /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+       vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+       if (vcpu->pvclock_set_guest_stopped_request) {
+               vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+               vcpu->pvclock_set_guest_stopped_request = false;
+       }
+       trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                               &vcpu->hv_clock,
+                               sizeof(vcpu->hv_clock));
+       smp_wmb();
+       vcpu->hv_clock.version++;
+       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                               &vcpu->hv_clock,
+                               sizeof(vcpu->hv_clock.version));
+ }
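The odd/even version discipline above exists for the benefit of guest readers. Schematically, a guest reading another vCPU's kvmclock page retries until it observes a stable, even version; this is an illustrative reader loop, not the exact guest pvclock code:

    static u64 read_peer_system_time(struct pvclock_vcpu_time_info *hv_clock)
    {
            u32 version;
            u64 system_time;

            do {
                    version = READ_ONCE(hv_clock->version);
                    smp_rmb();
                    system_time = hv_clock->system_time;
                    smp_rmb();
            } while ((version & 1) ||
                     version != READ_ONCE(hv_clock->version));

            return system_time;
    }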
  static int kvm_guest_time_update(struct kvm_vcpu *v)
  {
        unsigned long flags, tgt_tsc_khz;
        struct kvm_arch *ka = &v->kvm->arch;
        s64 kernel_ns;
        u64 tsc_timestamp, host_tsc;
-       struct pvclock_vcpu_time_info guest_hv_clock;
        u8 pvclock_flags;
        bool use_master_clock;
  
        }
        if (!use_master_clock) {
                host_tsc = rdtsc();
-               kernel_ns = get_kernel_ns();
+               kernel_ns = ktime_get_boot_ns();
        }
  
        tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
  
        local_irq_restore(flags);
  
-       if (!vcpu->pv_time_enabled)
-               return 0;
+       /* With all the info we got, fill in the values */
  
        if (kvm_has_tsc_control)
                tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
                vcpu->hw_tsc_khz = tgt_tsc_khz;
        }
  
-       /* With all the info we got, fill in the values */
        vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
        vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
        vcpu->last_guest_tsc = tsc_timestamp;
  
-       if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
-               &guest_hv_clock, sizeof(guest_hv_clock))))
-               return 0;
-       /* This VCPU is paused, but it's legal for a guest to read another
-        * VCPU's kvmclock, so we really have to follow the specification where
-        * it says that version is odd if data is being modified, and even after
-        * it is consistent.
-        *
-        * Version field updates must be kept separate.  This is because
-        * kvm_write_guest_cached might use a "rep movs" instruction, and
-        * writes within a string instruction are weakly ordered.  So there
-        * are three writes overall.
-        *
-        * As a small optimization, only write the version field in the first
-        * and third write.  The vcpu->pv_time cache is still valid, because the
-        * version field is the first in the struct.
-        */
-       BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-       vcpu->hv_clock.version = guest_hv_clock.version + 1;
-       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-                               &vcpu->hv_clock,
-                               sizeof(vcpu->hv_clock.version));
-       smp_wmb();
-       /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
-       pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
-       if (vcpu->pvclock_set_guest_stopped_request) {
-               pvclock_flags |= PVCLOCK_GUEST_STOPPED;
-               vcpu->pvclock_set_guest_stopped_request = false;
-       }
        /* If the host uses TSC clocksource, then it is stable */
+       pvclock_flags = 0;
        if (use_master_clock)
                pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
  
        vcpu->hv_clock.flags = pvclock_flags;
  
-       trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
-       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-                               &vcpu->hv_clock,
-                               sizeof(vcpu->hv_clock));
-       smp_wmb();
-       vcpu->hv_clock.version++;
-       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-                               &vcpu->hv_clock,
-                               sizeof(vcpu->hv_clock.version));
+       if (vcpu->pv_time_enabled)
+               kvm_setup_pvclock_page(v);
+       if (v == kvm_get_vcpu(v->kvm, 0))
+               kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
        return 0;
  }
  
@@@ -2743,16 -2786,16 +2786,16 @@@ void kvm_arch_vcpu_load(struct kvm_vcp
                if (tsc_delta < 0)
                        mark_tsc_unstable("KVM discovered backwards TSC");
  
 -              if (kvm_lapic_hv_timer_in_use(vcpu) &&
 -                              kvm_x86_ops->set_hv_timer(vcpu,
 -                                      kvm_get_lapic_tscdeadline_msr(vcpu)))
 -                      kvm_lapic_switch_to_sw_timer(vcpu);
                if (check_tsc_unstable()) {
                        u64 offset = kvm_compute_tsc_offset(vcpu,
                                                vcpu->arch.last_guest_tsc);
-                       kvm_x86_ops->write_tsc_offset(vcpu, offset);
+                       kvm_vcpu_write_tsc_offset(vcpu, offset);
                        vcpu->arch.tsc_catchup = 1;
                }
 +              if (kvm_lapic_hv_timer_in_use(vcpu) &&
 +                              kvm_x86_ops->set_hv_timer(vcpu,
 +                                      kvm_get_lapic_tscdeadline_msr(vcpu)))
 +                      kvm_lapic_switch_to_sw_timer(vcpu);
                /*
                 * On a host with synchronized TSC, there is no need to update
                 * kvmclock on vcpu->cpu migration
@@@ -4039,7 -4082,6 +4082,6 @@@ long kvm_arch_vm_ioctl(struct file *fil
        case KVM_SET_CLOCK: {
                struct kvm_clock_data user_ns;
                u64 now_ns;
-               s64 delta;
  
                r = -EFAULT;
                if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
  
                r = 0;
                local_irq_disable();
-               now_ns = get_kernel_ns();
-               delta = user_ns.clock - now_ns;
+               now_ns = __get_kvmclock_ns(kvm);
+               kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
                local_irq_enable();
-               kvm->arch.kvmclock_offset = delta;
                kvm_gen_update_masterclock(kvm);
                break;
        }
                struct kvm_clock_data user_ns;
                u64 now_ns;
  
-               local_irq_disable();
-               now_ns = get_kernel_ns();
-               user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
-               local_irq_enable();
+               now_ns = get_kvmclock_ns(kvm);
+               user_ns.clock = now_ns;
                user_ns.flags = 0;
                memset(&user_ns.pad, 0, sizeof(user_ns.pad));
  
@@@ -6700,7 -6739,6 +6739,6 @@@ static int vcpu_enter_guest(struct kvm_
  
        kvm_put_guest_xcr0(vcpu);
  
-       /* Interrupt is enabled by handle_external_intr() */
        kvm_x86_ops->handle_external_intr(vcpu);
  
        ++vcpu->stat.exits;
@@@ -7530,7 -7568,7 +7568,7 @@@ int kvm_arch_hardware_enable(void
         * before any KVM threads can be running.  Unfortunately, we can't
         * bring the TSCs fully up to date with real time, as we aren't yet far
         * enough into CPU bringup that we know how much real time has actually
-        * elapsed; our helper function, get_kernel_ns() will be using boot
+        * elapsed; our helper function, ktime_get_boot_ns() will be using boot
         * variables that haven't been updated yet.
         *
         * So we simply find the maximum observed TSC above, then record the
@@@ -7765,6 -7803,7 +7803,7 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
        mutex_init(&kvm->arch.apic_map_lock);
        spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
  
+       kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
        pvclock_update_vm_gtod_copy(kvm);
  
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
@@@ -137,6 -137,7 +137,7 @@@ struct iommu_dev_data 
        bool pri_tlp;                     /* PASID TLB required for
                                             PPR completions */
        u32 errata;                       /* Bitmap for errata to apply */
+       bool use_vapic;                   /* Enable device to use vapic mode */
  };
  
  /*
@@@ -707,14 -708,74 +708,74 @@@ static void iommu_poll_ppr_log(struct a
        }
  }
  
+ #ifdef CONFIG_IRQ_REMAP
+ static int (*iommu_ga_log_notifier)(u32);
+ int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
+ {
+       iommu_ga_log_notifier = notifier;
+       return 0;
+ }
+ EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
+ static void iommu_poll_ga_log(struct amd_iommu *iommu)
+ {
+       u32 head, tail, cnt = 0;
+       if (iommu->ga_log == NULL)
+               return;
+       head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
+       tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
+       while (head != tail) {
+               volatile u64 *raw;
+               u64 log_entry;
+               raw = (u64 *)(iommu->ga_log + head);
+               cnt++;
+               /* Avoid memcpy function-call overhead */
+               log_entry = *raw;
+               /* Update head pointer of hardware ring-buffer */
+               head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
+               writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
+               /* Handle GA entry */
+               switch (GA_REQ_TYPE(log_entry)) {
+               case GA_GUEST_NR:
+                       if (!iommu_ga_log_notifier)
+                               break;
+                       pr_debug("AMD-Vi: %s: devid=%#x, ga_tag=%#x\n",
+                                __func__, GA_DEVID(log_entry),
+                                GA_TAG(log_entry));
+                       if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
+                               pr_err("AMD-Vi: GA log notifier failed.\n");
+                       break;
+               default:
+                       break;
+               }
+       }
+ }
+ #endif /* CONFIG_IRQ_REMAP */
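The notifier invoked above is whatever was registered through amd_iommu_register_ga_log_notifier(); on the KVM/SVM side this is expected to be a small handler that decodes the GA tag and kicks the target vcpu. A hedged sketch (the tag-to-vcpu lookup helper is hypothetical; AVIC_GATAG_TO_VCPUID appears in the SVM hunk earlier in this diff):

    static int avic_ga_log_notifier(u32 ga_tag)
    {
            struct kvm_vcpu *vcpu;

            /* find_vcpu_by_ga_tag() is a placeholder for the VM-hash +
             * AVIC_GATAG_TO_VCPUID() lookup done on the SVM side. */
            vcpu = find_vcpu_by_ga_tag(ga_tag);
            if (vcpu)
                    kvm_vcpu_wake_up(vcpu);

            return 0;
    }

    /* registered once during SVM hardware setup:
     *   amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
     */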
+ #define AMD_IOMMU_INT_MASK    \
+       (MMIO_STATUS_EVT_INT_MASK | \
+        MMIO_STATUS_PPR_INT_MASK | \
+        MMIO_STATUS_GALOG_INT_MASK)
  irqreturn_t amd_iommu_int_thread(int irq, void *data)
  {
        struct amd_iommu *iommu = (struct amd_iommu *) data;
        u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
  
-       while (status & (MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK)) {
-               /* Enable EVT and PPR interrupts again */
-               writel((MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK),
+       while (status & AMD_IOMMU_INT_MASK) {
+               /* Enable EVT and PPR and GA interrupts again */
+               writel(AMD_IOMMU_INT_MASK,
                        iommu->mmio_base + MMIO_STATUS_OFFSET);
  
                if (status & MMIO_STATUS_EVT_INT_MASK) {
                        iommu_poll_ppr_log(iommu);
                }
  
+ #ifdef CONFIG_IRQ_REMAP
+               if (status & MMIO_STATUS_GALOG_INT_MASK) {
+                       pr_devel("AMD-Vi: Processing IOMMU GA Log\n");
+                       iommu_poll_ga_log(iommu);
+               }
+ #endif
                /*
                 * Hardware bug: ERBT1312
                 * When re-enabling interrupt (by writing 1
@@@ -940,13 -1008,15 +1008,13 @@@ static void build_inv_irt(struct iommu_
   * Writes the command to the IOMMUs command buffer and informs the
   * hardware about the new command.
   */
 -static int iommu_queue_command_sync(struct amd_iommu *iommu,
 -                                  struct iommu_cmd *cmd,
 -                                  bool sync)
 +static int __iommu_queue_command_sync(struct amd_iommu *iommu,
 +                                    struct iommu_cmd *cmd,
 +                                    bool sync)
  {
        u32 left, tail, head, next_tail;
 -      unsigned long flags;
  
  again:
 -      spin_lock_irqsave(&iommu->lock, flags);
  
        head      = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
        tail      = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
  
        if (left <= 2) {
                struct iommu_cmd sync_cmd;
 -              volatile u64 sem = 0;
                int ret;
  
 -              build_completion_wait(&sync_cmd, (u64)&sem);
 -              copy_cmd_to_buffer(iommu, &sync_cmd, tail);
 +              iommu->cmd_sem = 0;
  
 -              spin_unlock_irqrestore(&iommu->lock, flags);
 +              build_completion_wait(&sync_cmd, (u64)&iommu->cmd_sem);
 +              copy_cmd_to_buffer(iommu, &sync_cmd, tail);
  
 -              if ((ret = wait_on_sem(&sem)) != 0)
 +              if ((ret = wait_on_sem(&iommu->cmd_sem)) != 0)
                        return ret;
  
                goto again;
        /* We need to sync now to make sure all commands are processed */
        iommu->need_sync = sync;
  
 +      return 0;
 +}
 +
 +static int iommu_queue_command_sync(struct amd_iommu *iommu,
 +                                  struct iommu_cmd *cmd,
 +                                  bool sync)
 +{
 +      unsigned long flags;
 +      int ret;
 +
 +      spin_lock_irqsave(&iommu->lock, flags);
 +      ret = __iommu_queue_command_sync(iommu, cmd, sync);
        spin_unlock_irqrestore(&iommu->lock, flags);
  
 -      return 0;
 +      return ret;
  }
  
  static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
  static int iommu_completion_wait(struct amd_iommu *iommu)
  {
        struct iommu_cmd cmd;
 -      volatile u64 sem = 0;
 +      unsigned long flags;
        int ret;
  
        if (!iommu->need_sync)
                return 0;
  
 -      build_completion_wait(&cmd, (u64)&sem);
  
 -      ret = iommu_queue_command_sync(iommu, &cmd, false);
 +      build_completion_wait(&cmd, (u64)&iommu->cmd_sem);
 +
 +      spin_lock_irqsave(&iommu->lock, flags);
 +
 +      iommu->cmd_sem = 0;
 +
 +      ret = __iommu_queue_command_sync(iommu, &cmd, false);
        if (ret)
 -              return ret;
 +              goto out_unlock;
 +
 +      ret = wait_on_sem(&iommu->cmd_sem);
 +
 +out_unlock:
 +      spin_unlock_irqrestore(&iommu->lock, flags);
  
 -      return wait_on_sem(&sem);
 +      return ret;
  }
  
  static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
@@@ -2967,6 -3016,12 +3035,12 @@@ static void amd_iommu_detach_device(str
        if (!iommu)
                return;
  
+ #ifdef CONFIG_IRQ_REMAP
+       if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
+           (dom->type == IOMMU_DOMAIN_UNMANAGED))
+               dev_data->use_vapic = 0;
+ #endif
        iommu_completion_wait(iommu);
  }
  
@@@ -2992,6 -3047,15 +3066,15 @@@ static int amd_iommu_attach_device(stru
  
        ret = attach_device(dev, domain);
  
+ #ifdef CONFIG_IRQ_REMAP
+       if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
+               if (dom->type == IOMMU_DOMAIN_UNMANAGED)
+                       dev_data->use_vapic = 1;
+               else
+                       dev_data->use_vapic = 0;
+       }
+ #endif
        iommu_completion_wait(iommu);
  
        return ret;
@@@ -3530,34 -3594,6 +3613,6 @@@ EXPORT_SYMBOL(amd_iommu_device_info)
   *
   *****************************************************************************/
  
- union irte {
-       u32 val;
-       struct {
-               u32 valid       : 1,
-                   no_fault    : 1,
-                   int_type    : 3,
-                   rq_eoi      : 1,
-                   dm          : 1,
-                   rsvd_1      : 1,
-                   destination : 8,
-                   vector      : 8,
-                   rsvd_2      : 8;
-       } fields;
- };
- struct irq_2_irte {
-       u16 devid; /* Device ID for IRTE table */
-       u16 index; /* Index into IRTE table*/
- };
- struct amd_ir_data {
-       struct irq_2_irte                       irq_2_irte;
-       union irte                              irte_entry;
-       union {
-               struct msi_msg                  msi_entry;
-       };
- };
  static struct irq_chip amd_ir_chip;
  
  #define DTE_IRQ_PHYS_ADDR_MASK        (((1ULL << 45)-1) << 6)
@@@ -3579,8 -3615,6 +3634,6 @@@ static void set_dte_irq_entry(u16 devid
        amd_iommu_dev_table[devid].data[2] = dte;
  }
  
- #define IRTE_ALLOCATED (~1U)
  static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
  {
        struct irq_remap_table *table = NULL;
                goto out;
        }
  
-       memset(table->table, 0, MAX_IRQS_PER_TABLE * sizeof(u32));
+       if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+               memset(table->table, 0,
+                      MAX_IRQS_PER_TABLE * sizeof(u32));
+       else
+               memset(table->table, 0,
+                      (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
  
        if (ioapic) {
                int i;
  
                for (i = 0; i < 32; ++i)
-                       table->table[i] = IRTE_ALLOCATED;
+                       iommu->irte_ops->set_allocated(table, i);
        }
  
        irq_lookup_table[devid] = table;
@@@ -3658,6 -3697,10 +3716,10 @@@ static int alloc_irq_index(u16 devid, i
        struct irq_remap_table *table;
        unsigned long flags;
        int index, c;
+       struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+       if (!iommu)
+               return -ENODEV;
  
        table = get_irq_table(devid, false);
        if (!table)
        for (c = 0, index = table->min_index;
             index < MAX_IRQS_PER_TABLE;
             ++index) {
-               if (table->table[index] == 0)
+               if (!iommu->irte_ops->is_allocated(table, index))
                        c += 1;
                else
                        c = 0;
  
                if (c == count) {
                        for (; c != 0; --c)
-                               table->table[index - c + 1] = IRTE_ALLOCATED;
+                               iommu->irte_ops->set_allocated(table, index - c + 1);
  
                        index -= count - 1;
                        goto out;
        return index;
  }
  
- static int modify_irte(u16 devid, int index, union irte irte)
+ static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte,
+                         struct amd_ir_data *data)
+ {
+       struct irq_remap_table *table;
+       struct amd_iommu *iommu;
+       unsigned long flags;
+       struct irte_ga *entry;
+       iommu = amd_iommu_rlookup_table[devid];
+       if (iommu == NULL)
+               return -EINVAL;
+       table = get_irq_table(devid, false);
+       if (!table)
+               return -ENOMEM;
+       spin_lock_irqsave(&table->lock, flags);
+       entry = (struct irte_ga *)table->table;
+       entry = &entry[index];
+       entry->lo.fields_remap.valid = 0;
+       entry->hi.val = irte->hi.val;
+       entry->lo.val = irte->lo.val;
+       entry->lo.fields_remap.valid = 1;
+       if (data)
+               data->ref = entry;
+       spin_unlock_irqrestore(&table->lock, flags);
+       iommu_flush_irt(iommu, devid);
+       iommu_completion_wait(iommu);
+       return 0;
+ }
+ static int modify_irte(u16 devid, int index, union irte *irte)
  {
        struct irq_remap_table *table;
        struct amd_iommu *iommu;
                return -ENOMEM;
  
        spin_lock_irqsave(&table->lock, flags);
-       table->table[index] = irte.val;
+       table->table[index] = irte->val;
        spin_unlock_irqrestore(&table->lock, flags);
  
        iommu_flush_irt(iommu, devid);
@@@ -3730,13 -3808,146 +3827,146 @@@ static void free_irte(u16 devid, int in
                return;
  
        spin_lock_irqsave(&table->lock, flags);
-       table->table[index] = 0;
+       iommu->irte_ops->clear_allocated(table, index);
        spin_unlock_irqrestore(&table->lock, flags);
  
        iommu_flush_irt(iommu, devid);
        iommu_completion_wait(iommu);
  }
  
+ static void irte_prepare(void *entry,
+                        u32 delivery_mode, u32 dest_mode,
+                        u8 vector, u32 dest_apicid, int devid)
+ {
+       union irte *irte = (union irte *) entry;
+       irte->val                = 0;
+       irte->fields.vector      = vector;
+       irte->fields.int_type    = delivery_mode;
+       irte->fields.destination = dest_apicid;
+       irte->fields.dm          = dest_mode;
+       irte->fields.valid       = 1;
+ }
+ static void irte_ga_prepare(void *entry,
+                           u32 delivery_mode, u32 dest_mode,
+                           u8 vector, u32 dest_apicid, int devid)
+ {
+       struct irte_ga *irte = (struct irte_ga *) entry;
+       struct iommu_dev_data *dev_data = search_dev_data(devid);
+       irte->lo.val                      = 0;
+       irte->hi.val                      = 0;
+       irte->lo.fields_remap.guest_mode  = dev_data ? dev_data->use_vapic : 0;
+       irte->lo.fields_remap.int_type    = delivery_mode;
+       irte->lo.fields_remap.dm          = dest_mode;
+       irte->hi.fields.vector            = vector;
+       irte->lo.fields_remap.destination = dest_apicid;
+       irte->lo.fields_remap.valid       = 1;
+ }
+ static void irte_activate(void *entry, u16 devid, u16 index)
+ {
+       union irte *irte = (union irte *) entry;
+       irte->fields.valid = 1;
+       modify_irte(devid, index, irte);
+ }
+ static void irte_ga_activate(void *entry, u16 devid, u16 index)
+ {
+       struct irte_ga *irte = (struct irte_ga *) entry;
+       irte->lo.fields_remap.valid = 1;
+       modify_irte_ga(devid, index, irte, NULL);
+ }
+ static void irte_deactivate(void *entry, u16 devid, u16 index)
+ {
+       union irte *irte = (union irte *) entry;
+       irte->fields.valid = 0;
+       modify_irte(devid, index, irte);
+ }
+ static void irte_ga_deactivate(void *entry, u16 devid, u16 index)
+ {
+       struct irte_ga *irte = (struct irte_ga *) entry;
+       irte->lo.fields_remap.valid = 0;
+       modify_irte_ga(devid, index, irte, NULL);
+ }
+ static void irte_set_affinity(void *entry, u16 devid, u16 index,
+                             u8 vector, u32 dest_apicid)
+ {
+       union irte *irte = (union irte *) entry;
+       irte->fields.vector = vector;
+       irte->fields.destination = dest_apicid;
+       modify_irte(devid, index, irte);
+ }
+ static void irte_ga_set_affinity(void *entry, u16 devid, u16 index,
+                                u8 vector, u32 dest_apicid)
+ {
+       struct irte_ga *irte = (struct irte_ga *) entry;
+       struct iommu_dev_data *dev_data = search_dev_data(devid);
+       if (!dev_data || !dev_data->use_vapic) {
+               irte->hi.fields.vector = vector;
+               irte->lo.fields_remap.destination = dest_apicid;
+               irte->lo.fields_remap.guest_mode = 0;
+               modify_irte_ga(devid, index, irte, NULL);
+       }
+ }
+ #define IRTE_ALLOCATED (~1U)
+ static void irte_set_allocated(struct irq_remap_table *table, int index)
+ {
+       table->table[index] = IRTE_ALLOCATED;
+ }
+ static void irte_ga_set_allocated(struct irq_remap_table *table, int index)
+ {
+       struct irte_ga *ptr = (struct irte_ga *)table->table;
+       struct irte_ga *irte = &ptr[index];
+       memset(&irte->lo.val, 0, sizeof(u64));
+       memset(&irte->hi.val, 0, sizeof(u64));
+       irte->hi.fields.vector = 0xff;
+ }
+ static bool irte_is_allocated(struct irq_remap_table *table, int index)
+ {
+       union irte *ptr = (union irte *)table->table;
+       union irte *irte = &ptr[index];
+       return irte->val != 0;
+ }
+ static bool irte_ga_is_allocated(struct irq_remap_table *table, int index)
+ {
+       struct irte_ga *ptr = (struct irte_ga *)table->table;
+       struct irte_ga *irte = &ptr[index];
+       return irte->hi.fields.vector != 0;
+ }
+ static void irte_clear_allocated(struct irq_remap_table *table, int index)
+ {
+       table->table[index] = 0;
+ }
+ static void irte_ga_clear_allocated(struct irq_remap_table *table, int index)
+ {
+       struct irte_ga *ptr = (struct irte_ga *)table->table;
+       struct irte_ga *irte = &ptr[index];
+       memset(&irte->lo.val, 0, sizeof(u64));
+       memset(&irte->hi.val, 0, sizeof(u64));
+ }
  static int get_devid(struct irq_alloc_info *info)
  {
        int devid = -1;
@@@ -3821,19 -4032,17 +4051,17 @@@ static void irq_remapping_prepare_irte(
  {
        struct irq_2_irte *irte_info = &data->irq_2_irte;
        struct msi_msg *msg = &data->msi_entry;
-       union irte *irte = &data->irte_entry;
        struct IO_APIC_route_entry *entry;
+       struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+       if (!iommu)
+               return;
  
        data->irq_2_irte.devid = devid;
        data->irq_2_irte.index = index + sub_handle;
-       /* Setup IRTE for IOMMU */
-       irte->val = 0;
-       irte->fields.vector      = irq_cfg->vector;
-       irte->fields.int_type    = apic->irq_delivery_mode;
-       irte->fields.destination = irq_cfg->dest_apicid;
-       irte->fields.dm          = apic->irq_dest_mode;
-       irte->fields.valid       = 1;
+       iommu->irte_ops->prepare(data->entry, apic->irq_delivery_mode,
+                                apic->irq_dest_mode, irq_cfg->vector,
+                                irq_cfg->dest_apicid, devid);
  
        switch (info->type) {
        case X86_IRQ_ALLOC_TYPE_IOAPIC:
        }
  }
  
+ struct amd_irte_ops irte_32_ops = {
+       .prepare = irte_prepare,
+       .activate = irte_activate,
+       .deactivate = irte_deactivate,
+       .set_affinity = irte_set_affinity,
+       .set_allocated = irte_set_allocated,
+       .is_allocated = irte_is_allocated,
+       .clear_allocated = irte_clear_allocated,
+ };
+ struct amd_irte_ops irte_128_ops = {
+       .prepare = irte_ga_prepare,
+       .activate = irte_ga_activate,
+       .deactivate = irte_ga_deactivate,
+       .set_affinity = irte_ga_set_affinity,
+       .set_allocated = irte_ga_set_allocated,
+       .is_allocated = irte_ga_is_allocated,
+       .clear_allocated = irte_ga_clear_allocated,
+ };
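Which ops table a given IOMMU uses is presumably decided once at init time, keyed off the same amd_iommu_guest_ir mode checked in get_irq_table() above; the actual assignment lives in the init code outside this diff, roughly:

    iommu->irte_ops = AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)
                            ? &irte_128_ops : &irte_32_ops;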
  static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
                               unsigned int nr_irqs, void *arg)
  {
        struct irq_alloc_info *info = arg;
        struct irq_data *irq_data;
-       struct amd_ir_data *data;
+       struct amd_ir_data *data = NULL;
        struct irq_cfg *cfg;
        int i, ret, devid;
        int index = -1;
                if (!data)
                        goto out_free_data;
  
+               if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+                       data->entry = kzalloc(sizeof(union irte), GFP_KERNEL);
+               else
+                       data->entry = kzalloc(sizeof(struct irte_ga),
+                                                    GFP_KERNEL);
+               if (!data->entry) {
+                       kfree(data);
+                       goto out_free_data;
+               }
                irq_data->hwirq = (devid << 16) + i;
                irq_data->chip_data = data;
                irq_data->chip = &amd_ir_chip;
@@@ -3957,6 -4196,7 +4215,7 @@@ static void irq_remapping_free(struct i
                        data = irq_data->chip_data;
                        irte_info = &data->irq_2_irte;
                        free_irte(irte_info->devid, irte_info->index);
+                       kfree(data->entry);
                        kfree(data);
                }
        }
@@@ -3968,8 -4208,11 +4227,11 @@@ static void irq_remapping_activate(stru
  {
        struct amd_ir_data *data = irq_data->chip_data;
        struct irq_2_irte *irte_info = &data->irq_2_irte;
+       struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
  
-       modify_irte(irte_info->devid, irte_info->index, data->irte_entry);
+       if (iommu)
+               iommu->irte_ops->activate(data->entry, irte_info->devid,
+                                         irte_info->index);
  }
  
  static void irq_remapping_deactivate(struct irq_domain *domain,
  {
        struct amd_ir_data *data = irq_data->chip_data;
        struct irq_2_irte *irte_info = &data->irq_2_irte;
-       union irte entry;
+       struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
  
-       entry.val = 0;
-       modify_irte(irte_info->devid, irte_info->index, data->irte_entry);
+       if (iommu)
+               iommu->irte_ops->deactivate(data->entry, irte_info->devid,
+                                           irte_info->index);
  }
  
  static struct irq_domain_ops amd_ir_domain_ops = {
        .deactivate = irq_remapping_deactivate,
  };
  
+ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
+ {
+       struct amd_iommu *iommu;
+       struct amd_iommu_pi_data *pi_data = vcpu_info;
+       struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
+       struct amd_ir_data *ir_data = data->chip_data;
+       struct irte_ga *irte = (struct irte_ga *) ir_data->entry;
+       struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
+       struct iommu_dev_data *dev_data = search_dev_data(irte_info->devid);
+       /* Note:
+        * If this device has never been set up for guest mode,
+        * we should not modify the IRTE.
+        */
+       if (!dev_data || !dev_data->use_vapic)
+               return 0;
+       pi_data->ir_data = ir_data;
+       /* Note:
+        * SVM tries to set up for VAPIC mode, but the IOMMU is running in
+        * legacy mode, so we force legacy mode instead.
+        */
+       if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
+               pr_debug("AMD-Vi: %s: Fall back to using intr legacy remap\n",
+                        __func__);
+               pi_data->is_guest_mode = false;
+       }
+       iommu = amd_iommu_rlookup_table[irte_info->devid];
+       if (iommu == NULL)
+               return -EINVAL;
+       pi_data->prev_ga_tag = ir_data->cached_ga_tag;
+       if (pi_data->is_guest_mode) {
+               /* Setting */
+               irte->hi.fields.ga_root_ptr = (pi_data->base >> 12);
+               irte->hi.fields.vector = vcpu_pi_info->vector;
+               irte->lo.fields_vapic.guest_mode = 1;
+               irte->lo.fields_vapic.ga_tag = pi_data->ga_tag;
+               ir_data->cached_ga_tag = pi_data->ga_tag;
+       } else {
+               /* Un-Setting */
+               struct irq_cfg *cfg = irqd_cfg(data);
+               irte->hi.val = 0;
+               irte->lo.val = 0;
+               irte->hi.fields.vector = cfg->vector;
+               irte->lo.fields_remap.guest_mode = 0;
+               irte->lo.fields_remap.destination = cfg->dest_apicid;
+               irte->lo.fields_remap.int_type = apic->irq_delivery_mode;
+               irte->lo.fields_remap.dm = apic->irq_dest_mode;
+               /*
+                * This communicates the ga_tag back to the caller
+                * so that it can do all the necessary clean up.
+                */
+               ir_data->cached_ga_tag = 0;
+       }
+       return modify_irte_ga(irte_info->devid, irte_info->index, irte, ir_data);
+ }
  static int amd_ir_set_affinity(struct irq_data *data,
                               const struct cpumask *mask, bool force)
  {
        struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
        struct irq_cfg *cfg = irqd_cfg(data);
        struct irq_data *parent = data->parent_data;
+       struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
        int ret;
  
+       if (!iommu)
+               return -ENODEV;
        ret = parent->chip->irq_set_affinity(parent, mask, force);
        if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
                return ret;
         * Atomically updates the IRTE with the new destination, vector
         * and flushes the interrupt entry cache.
         */
-       ir_data->irte_entry.fields.vector = cfg->vector;
-       ir_data->irte_entry.fields.destination = cfg->dest_apicid;
-       modify_irte(irte_info->devid, irte_info->index, ir_data->irte_entry);
+       iommu->irte_ops->set_affinity(ir_data->entry, irte_info->devid,
+                           irte_info->index, cfg->vector, cfg->dest_apicid);
  
        /*
         * After this point, all the interrupts will start arriving
@@@ -4031,6 -4342,7 +4361,7 @@@ static void ir_compose_msi_msg(struct i
  static struct irq_chip amd_ir_chip = {
        .irq_ack = ir_ack_apic_edge,
        .irq_set_affinity = amd_ir_set_affinity,
+       .irq_set_vcpu_affinity = amd_ir_set_vcpu_affinity,
        .irq_compose_msi_msg = ir_compose_msi_msg,
  };
  
@@@ -4045,4 -4357,43 +4376,43 @@@ int amd_iommu_create_irq_domain(struct 
  
        return 0;
  }
+
+ int amd_iommu_update_ga(int cpu, bool is_run, void *data)
+ {
+       unsigned long flags;
+       struct amd_iommu *iommu;
+       struct irq_remap_table *irt;
+       struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
+       int devid = ir_data->irq_2_irte.devid;
+       struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
+       struct irte_ga *ref = (struct irte_ga *) ir_data->ref;
+
+       if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
+           !ref || !entry || !entry->lo.fields_vapic.guest_mode)
+               return 0;
+
+       iommu = amd_iommu_rlookup_table[devid];
+       if (!iommu)
+               return -ENODEV;
+
+       irt = get_irq_table(devid, false);
+       if (!irt)
+               return -ENODEV;
+
+       spin_lock_irqsave(&irt->lock, flags);
+
+       if (ref->lo.fields_vapic.guest_mode) {
+               if (cpu >= 0)
+                       ref->lo.fields_vapic.destination = cpu;
+               ref->lo.fields_vapic.is_run = is_run;
+               barrier();
+       }
+
+       spin_unlock_irqrestore(&irt->lock, flags);
+
+       iommu_flush_irt(iommu, devid);
+       iommu_completion_wait(iommu);
+       return 0;
+ }
+ EXPORT_SYMBOL(amd_iommu_update_ga);
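amd_iommu_update_ga() is exported so the hypervisor can refresh the running-state hint and destination CPU of a guest-mode IRTE whenever a vCPU is scheduled in or out. A minimal sketch of such a caller, assuming it kept the ir_data cookie handed back through pi_data->ir_data above; the function name is illustrative:

/* Sketch only: "cached_ir_data" is whatever the caller stashed from pi_data->ir_data. */
static void vcpu_sched_update_ga_sketch(void *cached_ir_data, int cpu, bool is_run)
{
	/* A negative cpu leaves the destination untouched; only is_run changes. */
	int ret = amd_iommu_update_ga(cpu, is_run, cached_ir_data);

	if (ret)
		pr_warn("amd_iommu_update_ga() failed: %d\n", ret);
}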
  #endif
@@@ -22,6 -22,7 +22,7 @@@
  
  #include <linux/types.h>
  #include <linux/mutex.h>
+ #include <linux/msi.h>
  #include <linux/list.h>
  #include <linux/spinlock.h>
  #include <linux/pci.h>
@@@ -69,6 -70,8 +70,8 @@@
  #define MMIO_EXCL_LIMIT_OFFSET  0x0028
  #define MMIO_EXT_FEATURES     0x0030
  #define MMIO_PPR_LOG_OFFSET   0x0038
+ #define MMIO_GA_LOG_BASE_OFFSET       0x00e0
+ #define MMIO_GA_LOG_TAIL_OFFSET       0x00e8
  #define MMIO_CMD_HEAD_OFFSET  0x2000
  #define MMIO_CMD_TAIL_OFFSET  0x2008
  #define MMIO_EVT_HEAD_OFFSET  0x2010
@@@ -76,6 -79,8 +79,8 @@@
  #define MMIO_STATUS_OFFSET    0x2020
  #define MMIO_PPR_HEAD_OFFSET  0x2030
  #define MMIO_PPR_TAIL_OFFSET  0x2038
+ #define MMIO_GA_HEAD_OFFSET   0x2040
+ #define MMIO_GA_TAIL_OFFSET   0x2048
  #define MMIO_CNTR_CONF_OFFSET 0x4000
  #define MMIO_CNTR_REG_OFFSET  0x40000
  #define MMIO_REG_END_OFFSET   0x80000
@@@ -92,6 -97,7 +97,7 @@@
  #define FEATURE_GA            (1ULL<<7)
  #define FEATURE_HE            (1ULL<<8)
  #define FEATURE_PC            (1ULL<<9)
+ #define FEATURE_GAM_VAPIC     (1ULL<<21)
  
  #define FEATURE_PASID_SHIFT   32
  #define FEATURE_PASID_MASK    (0x1fULL << FEATURE_PASID_SHIFT)
  #define MMIO_STATUS_EVT_INT_MASK      (1 << 1)
  #define MMIO_STATUS_COM_WAIT_INT_MASK (1 << 2)
  #define MMIO_STATUS_PPR_INT_MASK      (1 << 6)
+ #define MMIO_STATUS_GALOG_RUN_MASK    (1 << 8)
+ #define MMIO_STATUS_GALOG_OVERFLOW_MASK       (1 << 9)
+ #define MMIO_STATUS_GALOG_INT_MASK    (1 << 10)
  
  /* event logging constants */
  #define EVENT_ENTRY_SIZE      0x10
  #define CONTROL_PPFINT_EN       0x0eULL
  #define CONTROL_PPR_EN          0x0fULL
  #define CONTROL_GT_EN           0x10ULL
+ #define CONTROL_GA_EN           0x11ULL
+ #define CONTROL_GAM_EN          0x19ULL
+ #define CONTROL_GALOG_EN        0x1CULL
+ #define CONTROL_GAINT_EN        0x1DULL
  
  #define CTRL_INV_TO_MASK      (7 << CONTROL_INV_TIMEOUT)
  #define CTRL_INV_TO_NONE      0
  
  #define PPR_REQ_FAULT         0x01
  
+ /* Constants for GA Log handling */
+ #define GA_LOG_ENTRIES                512
+ #define GA_LOG_SIZE_SHIFT     56
+ #define GA_LOG_SIZE_512               (0x8ULL << GA_LOG_SIZE_SHIFT)
+ #define GA_ENTRY_SIZE         8
+ #define GA_LOG_SIZE           (GA_ENTRY_SIZE * GA_LOG_ENTRIES)
+ #define GA_TAG(x)             (u32)(x & 0xffffffffULL)
+ #define GA_DEVID(x)           (u16)(((x) >> 32) & 0xffffULL)
+ #define GA_REQ_TYPE(x)                (((x) >> 60) & 0xfULL)
+ #define GA_GUEST_NR           0x1
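Per the accessors above, a raw 64-bit GA log entry carries the ga_tag in its low 32 bits, the device ID in bits 32-47, and the request type in bits 60-63. A hedged sketch of how a GA-log poller might consume one entry; the notifier callback is an assumption, not an interface defined in this header:

/* Sketch: decode one raw GA log entry; the notifier hook is illustrative. */
static void decode_ga_log_entry_sketch(u64 log_entry, int (*ga_notifier)(u32 ga_tag))
{
	if (GA_REQ_TYPE(log_entry) != GA_GUEST_NR)
		return;		/* only guest requests carry a meaningful ga_tag */

	pr_debug("AMD-Vi: GA log: devid=%#x ga_tag=%#x\n",
		 GA_DEVID(log_entry), GA_TAG(log_entry));

	if (ga_notifier)
		ga_notifier(GA_TAG(log_entry));
}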
  #define PAGE_MODE_NONE    0x00
  #define PAGE_MODE_1_LEVEL 0x01
  #define PAGE_MODE_2_LEVEL 0x02
  #define IOMMU_CAP_NPCACHE 26
  #define IOMMU_CAP_EFR     27
  
+ /* IOMMU Feature Reporting Field (for IVHD type 10h) */
+ #define IOMMU_FEAT_GASUP_SHIFT        6
+
+ /* IOMMU Extended Feature Register (EFR) */
+ #define IOMMU_EFR_GASUP_SHIFT 7
+
  #define MAX_DOMAIN_ID 65536
  
  /* Protection domain flags */
@@@ -400,6 -432,7 +432,7 @@@ struct amd_iommu_fault 
  
  struct iommu_domain;
  struct irq_domain;
+ struct amd_irte_ops;
  
  /*
   * This structure contains generic data for  IOMMU protection domains
@@@ -490,6 -523,12 +523,12 @@@ struct amd_iommu 
        /* Base of the PPR log, if present */
        u8 *ppr_log;
  
+       /* Base of the GA log, if present */
+       u8 *ga_log;
+
+       /* Tail of the GA log, if present */
+       u8 *ga_log_tail;
+
        /* true if interrupts for this IOMMU are already enabled */
        bool int_enabled;
  
  #ifdef CONFIG_IRQ_REMAP
        struct irq_domain *ir_domain;
        struct irq_domain *msi_domain;
+       struct amd_irte_ops *irte_ops;
  #endif
 +
 +      volatile u64 __aligned(8) cmd_sem;
  };
  
  #define ACPIHID_UID_LEN 256
@@@ -683,4 -722,112 +724,112 @@@ static inline int get_hpet_devid(int id
        return -EINVAL;
  }
  
+ enum amd_iommu_intr_mode_type {
+       AMD_IOMMU_GUEST_IR_LEGACY,
+
+       /* This mode is not visible to users. It is used when
+        * we cannot fully enable vAPIC and must fall back to supporting
+        * only legacy interrupt remapping via 128-bit IRTEs.
+        */
+       AMD_IOMMU_GUEST_IR_LEGACY_GA,
+       AMD_IOMMU_GUEST_IR_VAPIC,
+ };
+
+ #define AMD_IOMMU_GUEST_IR_GA(x)      ((x) == AMD_IOMMU_GUEST_IR_VAPIC || \
+                                        (x) == AMD_IOMMU_GUEST_IR_LEGACY_GA)
+ #define AMD_IOMMU_GUEST_IR_VAPIC(x)   ((x) == AMD_IOMMU_GUEST_IR_VAPIC)
+
+ union irte {
+       u32 val;
+       struct {
+               u32 valid       : 1,
+                   no_fault    : 1,
+                   int_type    : 3,
+                   rq_eoi      : 1,
+                   dm          : 1,
+                   rsvd_1      : 1,
+                   destination : 8,
+                   vector      : 8,
+                   rsvd_2      : 8;
+       } fields;
+ };
+
+ union irte_ga_lo {
+       u64 val;
+       /* For int remapping */
+       struct {
+               u64 valid       : 1,
+                   no_fault    : 1,
+                   /* ------ */
+                   int_type    : 3,
+                   rq_eoi      : 1,
+                   dm          : 1,
+                   /* ------ */
+                   guest_mode  : 1,
+                   destination : 8,
+                   rsvd        : 48;
+       } fields_remap;
+       /* For guest vAPIC */
+       struct {
+               u64 valid       : 1,
+                   no_fault    : 1,
+                   /* ------ */
+                   ga_log_intr : 1,
+                   rsvd1       : 3,
+                   is_run      : 1,
+                   /* ------ */
+                   guest_mode  : 1,
+                   destination : 8,
+                   rsvd2       : 16,
+                   ga_tag      : 32;
+       } fields_vapic;
+ };
+
+ union irte_ga_hi {
+       u64 val;
+       struct {
+               u64 vector      : 8,
+                   rsvd_1      : 4,
+                   ga_root_ptr : 40,
+                   rsvd_2      : 12;
+       } fields;
+ };
+
+ struct irte_ga {
+       union irte_ga_lo lo;
+       union irte_ga_hi hi;
+ };
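The GA-mode IRTE is 128 bits wide, with the two "lo" views overlaying the same 64 bits and selected by guest_mode, in contrast to the 32-bit legacy union irte above. A compile-time sanity check one might add; this is a sketch, not part of this header:

/* Sketch: layout assumptions the remapping code relies on. */
static inline void irte_layout_checks_sketch(void)
{
	BUILD_BUG_ON(sizeof(union irte) != sizeof(u32));		/* legacy, 32-bit  */
	BUILD_BUG_ON(sizeof(struct irte_ga) != 2 * sizeof(u64));	/* GA mode, 128-bit */
	BUILD_BUG_ON(sizeof(union irte_ga_lo) != sizeof(union irte_ga_hi));
}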
+
+ struct irq_2_irte {
+       u16 devid; /* Device ID for IRTE table */
+       u16 index; /* Index into IRTE table */
+ };
+
+ struct amd_ir_data {
+       u32 cached_ga_tag;
+       struct irq_2_irte irq_2_irte;
+       struct msi_msg msi_entry;
+       void *entry;    /* Pointer to union irte or struct irte_ga */
+       void *ref;      /* Pointer to the actual IRTE in the remapping table */
+ };
+
+ struct amd_irte_ops {
+       void (*prepare)(void *, u32, u32, u8, u32, int);
+       void (*activate)(void *, u16, u16);
+       void (*deactivate)(void *, u16, u16);
+       void (*set_affinity)(void *, u16, u16, u8, u32);
+       void *(*get)(struct irq_remap_table *, int);
+       void (*set_allocated)(struct irq_remap_table *, int);
+       bool (*is_allocated)(struct irq_remap_table *, int);
+       void (*clear_allocated)(struct irq_remap_table *, int);
+ };
+
+ #ifdef CONFIG_IRQ_REMAP
+ extern struct amd_irte_ops irte_32_ops;
+ extern struct amd_irte_ops irte_128_ops;
+ #endif
  #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
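Taken together, the mode helpers and the two amd_irte_ops tables let the rest of the driver stay IRTE-format-agnostic: GA-capable modes use the 128-bit ops, plain legacy mode the 32-bit ops. A hedged sketch of the selection an init path might perform; the function below is illustrative, the real wiring lives in the IOMMU init code:

/* Sketch only: how a per-IOMMU irte_ops pointer could be chosen from the mode. */
static inline void set_irte_ops_sketch(struct amd_iommu *iommu, int guest_ir_mode)
{
#ifdef CONFIG_IRQ_REMAP
	if (AMD_IOMMU_GUEST_IR_GA(guest_ir_mode))
		iommu->irte_ops = &irte_128_ops;	/* struct irte_ga entries */
	else
		iommu->irte_ops = &irte_32_ops;		/* union irte entries */
#endif
}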