Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 2 Aug 2016 20:11:27 +0000 (16:11 -0400)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 2 Aug 2016 20:11:27 +0000 (16:11 -0400)
Pull KVM updates from Paolo Bonzini:

 - ARM: GICv3 ITS emulation and various fixes.  Removal of the
   old VGIC implementation.

 - s390: support for trapping software breakpoints, nested
   virtualization (vSIE), the STHYI opcode, initial extensions
   for CPU model support.

 - MIPS: support for MIPS64 hosts (32-bit guests only) and lots
   of cleanups, preliminary to this and the upcoming support for
   hardware virtualization extensions.

 - x86: support for execute-only mappings in nested EPT; reduced
   vmexit latency for TSC deadline timer (by about 30%) on Intel
   hosts; support for more than 255 vCPUs.

 - PPC: bugfixes.

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (302 commits)
  KVM: PPC: Introduce KVM_CAP_PPC_HTM
  MIPS: Select HAVE_KVM for MIPS64_R{2,6}
  MIPS: KVM: Reset CP0_PageMask during host TLB flush
  MIPS: KVM: Fix ptr->int cast via KVM_GUEST_KSEGX()
  MIPS: KVM: Sign extend MFC0/RDHWR results
  MIPS: KVM: Fix 64-bit big endian dynamic translation
  MIPS: KVM: Fail if ebase doesn't fit in CP0_EBase
  MIPS: KVM: Use 64-bit CP0_EBase when appropriate
  MIPS: KVM: Set CP0_Status.KX on MIPS64
  MIPS: KVM: Make entry code MIPS64 friendly
  MIPS: KVM: Use kmap instead of CKSEG0ADDR()
  MIPS: KVM: Use virt_to_phys() to get commpage PFN
  MIPS: Fix definition of KSEGX() for 64-bit
  KVM: VMX: Add VMCS to CPU's loaded VMCSs before VMPTRLD
  kvm: x86: nVMX: maintain internal copy of current VMCS
  KVM: PPC: Book3S HV: Save/restore TM state in H_CEDE
  KVM: PPC: Book3S HV: Pull out TM state save/restore into separate procedures
  KVM: arm64: vgic-its: Simplify MAPI error handling
  KVM: arm64: vgic-its: Make vgic_its_cmd_handle_mapi similar to other handlers
  KVM: arm64: vgic-its: Turn device_id validation into generic ID validation
  ...

167 files changed:
Documentation/virtual/kvm/api.txt
Documentation/virtual/kvm/devices/arm-vgic.txt
Documentation/virtual/kvm/devices/vm.txt
Documentation/virtual/kvm/locking.txt
arch/arm/include/asm/kvm_asm.h
arch/arm/include/asm/kvm_host.h
arch/arm/include/asm/kvm_hyp.h
arch/arm/include/asm/kvm_mmu.h
arch/arm/include/asm/pgtable.h
arch/arm/include/asm/virt.h
arch/arm/kvm/Kconfig
arch/arm/kvm/Makefile
arch/arm/kvm/arm.c
arch/arm/kvm/emulate.c
arch/arm/kvm/guest.c
arch/arm/kvm/init.S
arch/arm/kvm/mmu.c
arch/arm/kvm/reset.c
arch/arm64/include/asm/cpufeature.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_hyp.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/include/asm/pgtable-hwdef.h
arch/arm64/include/asm/pgtable-prot.h
arch/arm64/include/asm/virt.h
arch/arm64/include/uapi/asm/kvm.h
arch/arm64/kernel/cpufeature.c
arch/arm64/kvm/Kconfig
arch/arm64/kvm/Makefile
arch/arm64/kvm/guest.c
arch/arm64/kvm/hyp-init.S
arch/arm64/kvm/hyp/entry.S
arch/arm64/kvm/hyp/hyp-entry.S
arch/arm64/kvm/hyp/switch.c
arch/arm64/kvm/reset.c
arch/arm64/kvm/sys_regs.c
arch/mips/Kconfig
arch/mips/include/asm/addrspace.h
arch/mips/include/asm/kvm_host.h
arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h
arch/mips/include/asm/mipsregs.h
arch/mips/include/asm/setup.h
arch/mips/include/asm/uasm.h
arch/mips/include/uapi/asm/inst.h
arch/mips/kernel/asm-offsets.c
arch/mips/kernel/branch.c
arch/mips/kernel/traps.c
arch/mips/kvm/Kconfig
arch/mips/kvm/Makefile
arch/mips/kvm/commpage.c
arch/mips/kvm/dyntrans.c
arch/mips/kvm/emulate.c
arch/mips/kvm/entry.c [new file with mode: 0644]
arch/mips/kvm/fpu.S
arch/mips/kvm/interrupt.c
arch/mips/kvm/interrupt.h
arch/mips/kvm/locore.S [deleted file]
arch/mips/kvm/mips.c
arch/mips/kvm/mmu.c [new file with mode: 0644]
arch/mips/kvm/stats.c
arch/mips/kvm/tlb.c
arch/mips/kvm/trace.h
arch/mips/kvm/trap_emul.c
arch/mips/math-emu/cp1emu.c
arch/mips/mm/c-r4k.c
arch/mips/mm/uasm-micromips.c
arch/mips/mm/uasm-mips.c
arch/mips/mm/uasm.c
arch/powerpc/include/asm/hmi.h [new file with mode: 0644]
arch/powerpc/include/asm/paca.h
arch/powerpc/kernel/Makefile
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/hmi.c [new file with mode: 0644]
arch/powerpc/kernel/idle_book3s.S
arch/powerpc/kernel/traps.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_ras.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/emulate.c
arch/powerpc/kvm/mpic.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/platforms/powernv/opal-wrappers.S
arch/s390/hypfs/hypfs_diag.c
arch/s390/include/asm/cpacf.h
arch/s390/include/asm/diag.h
arch/s390/include/asm/gmap.h
arch/s390/include/asm/kvm_host.h
arch/s390/include/asm/mmu.h
arch/s390/include/asm/mmu_context.h
arch/s390/include/asm/page.h
arch/s390/include/asm/pgalloc.h
arch/s390/include/asm/pgtable.h
arch/s390/include/asm/processor.h
arch/s390/include/asm/sclp.h
arch/s390/include/uapi/asm/kvm.h
arch/s390/include/uapi/asm/sie.h
arch/s390/kernel/diag.c
arch/s390/kvm/Makefile
arch/s390/kvm/diag.c
arch/s390/kvm/gaccess.c
arch/s390/kvm/gaccess.h
arch/s390/kvm/guestdbg.c
arch/s390/kvm/intercept.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/priv.c
arch/s390/kvm/sigp.c
arch/s390/kvm/sthyi.c [new file with mode: 0644]
arch/s390/kvm/trace.h
arch/s390/kvm/vsie.c [new file with mode: 0644]
arch/s390/mm/fault.c
arch/s390/mm/gmap.c
arch/s390/mm/pgalloc.c
arch/s390/mm/pgtable.c
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/svm.h
arch/x86/include/asm/virtext.h
arch/x86/kvm/Kconfig
arch/x86/kvm/i8254.c
arch/x86/kvm/iommu.c
arch/x86/kvm/irq_comm.c
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu.h
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/pmu_intel.c
arch/x86/kvm/svm.c
arch/x86/kvm/trace.h
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
drivers/s390/char/sclp_early.c
drivers/s390/char/sclp_ocf.c
include/kvm/arm_vgic.h
include/kvm/vgic/vgic.h [deleted file]
include/linux/context_tracking.h
include/linux/irqchip/arm-gic-v3.h
include/linux/kvm_host.h
include/linux/page_ref.h
include/trace/events/kvm.h
include/uapi/linux/kvm.h
mm/gup.c
virt/kvm/Kconfig
virt/kvm/arm/hyp/vgic-v2-sr.c
virt/kvm/arm/vgic-v2-emul.c [deleted file]
virt/kvm/arm/vgic-v2.c [deleted file]
virt/kvm/arm/vgic-v3-emul.c [deleted file]
virt/kvm/arm/vgic-v3.c [deleted file]
virt/kvm/arm/vgic.c [deleted file]
virt/kvm/arm/vgic.h [deleted file]
virt/kvm/arm/vgic/vgic-init.c
virt/kvm/arm/vgic/vgic-its.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-kvm-device.c
virt/kvm/arm/vgic/vgic-mmio-v2.c
virt/kvm/arm/vgic/vgic-mmio-v3.c
virt/kvm/arm/vgic/vgic-mmio.c
virt/kvm/arm/vgic/vgic-mmio.h
virt/kvm/arm/vgic/vgic-v2.c
virt/kvm/arm/vgic/vgic-v3.c
virt/kvm/arm/vgic/vgic.c
virt/kvm/arm/vgic/vgic.h
virt/kvm/irqchip.c
virt/kvm/kvm_main.c

index a4482cc..5237e1b 100644 (file)
@@ -1482,6 +1482,11 @@ struct kvm_irq_routing_msi {
        __u32 pad;
 };
 
+On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS
+feature of KVM_CAP_X2APIC_API capability is enabled.  If it is enabled,
+address_hi bits 31-8 provide bits 31-8 of the destination id.  Bits 7-0 of
+address_hi must be zero.
+
 struct kvm_irq_routing_s390_adapter {
        __u64 ind_addr;
        __u64 summary_addr;
@@ -1583,6 +1588,17 @@ struct kvm_lapic_state {
 Reads the Local APIC registers and copies them into the input argument.  The
 data format and layout are the same as documented in the architecture manual.
 
+If KVM_X2APIC_API_USE_32BIT_IDS feature of KVM_CAP_X2APIC_API is
+enabled, then the format of APIC_ID register depends on the APIC mode
+(reported by MSR_IA32_APICBASE) of its VCPU.  x2APIC stores APIC ID in
+the APIC_ID register (bytes 32-35).  xAPIC only allows an 8-bit APIC ID
+which is stored in bits 31-24 of the APIC_ID register, or equivalently in
+byte 35 of struct kvm_lapic_state's regs field.  KVM_GET_LAPIC must then
+be called after MSR_IA32_APICBASE has been set with KVM_SET_MSR.
+
+If KVM_X2APIC_API_USE_32BIT_IDS feature is disabled, struct kvm_lapic_state
+always uses xAPIC format.
+
 
 4.58 KVM_SET_LAPIC
 
@@ -1600,6 +1616,10 @@ struct kvm_lapic_state {
 Copies the input argument into the Local APIC registers.  The data format
 and layout are the same as documented in the architecture manual.
 
+The format of the APIC ID register (bytes 32-35 of struct kvm_lapic_state's
+regs field) depends on the state of the KVM_CAP_X2APIC_API capability.
+See the note in KVM_GET_LAPIC.
+
 
 4.59 KVM_IOEVENTFD
 
@@ -2032,6 +2052,12 @@ registers, find a list below:
   MIPS  | KVM_REG_MIPS_CP0_CONFIG5      | 32
   MIPS  | KVM_REG_MIPS_CP0_CONFIG7      | 32
   MIPS  | KVM_REG_MIPS_CP0_ERROREPC     | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH1    | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH2    | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH3    | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH4    | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH5    | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH6    | 64
   MIPS  | KVM_REG_MIPS_COUNT_CTL        | 64
   MIPS  | KVM_REG_MIPS_COUNT_RESUME     | 64
   MIPS  | KVM_REG_MIPS_COUNT_HZ         | 64
@@ -2156,7 +2182,7 @@ after pausing the vcpu, but before it is resumed.
 4.71 KVM_SIGNAL_MSI
 
 Capability: KVM_CAP_SIGNAL_MSI
-Architectures: x86
+Architectures: x86 arm64
 Type: vm ioctl
 Parameters: struct kvm_msi (in)
 Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error
@@ -2169,10 +2195,22 @@ struct kvm_msi {
        __u32 address_hi;
        __u32 data;
        __u32 flags;
-       __u8  pad[16];
+       __u32 devid;
+       __u8  pad[12];
 };
 
-No flags are defined so far. The corresponding field must be 0.
+flags: KVM_MSI_VALID_DEVID: devid contains a valid value
+devid: If KVM_MSI_VALID_DEVID is set, contains a unique device identifier
+       for the device that wrote the MSI message.
+       For PCI, this is usually a BDF identifier in the lower 16 bits.
+
+The per-VM KVM_CAP_MSI_DEVID capability advertises the need to provide
+the device ID. If this capability is not set, userland cannot rely on
+the kernel to allow the KVM_MSI_VALID_DEVID flag to be set.
+
+On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS feature
+of KVM_CAP_X2APIC_API is enabled.  If it is enabled, address_hi bits 31-8
+provide bits 31-8 of the destination id.  Bits 7-0 of address_hi must be zero.
 
 
 4.71 KVM_CREATE_PIT2
@@ -2520,6 +2558,7 @@ Parameters: struct kvm_device_attr
 Returns: 0 on success, -1 on error
 Errors:
   ENXIO:  The group or attribute is unknown/unsupported for this device
+          or hardware support is missing.
   EPERM:  The attribute cannot (currently) be accessed this way
           (e.g. read-only attribute, or attribute that only makes
           sense when the device is in a different state)
@@ -2547,6 +2586,7 @@ Parameters: struct kvm_device_attr
 Returns: 0 on success, -1 on error
 Errors:
   ENXIO:  The group or attribute is unknown/unsupported for this device
+          or hardware support is missing.
 
 Tests whether a device supports a particular attribute.  A successful
 return indicates the attribute is implemented.  It does not necessarily
@@ -3803,6 +3843,42 @@ Allows use of runtime-instrumentation introduced with zEC12 processor.
 Will return -EINVAL if the machine does not support runtime-instrumentation.
 Will return -EBUSY if a VCPU has already been created.
 
+7.7 KVM_CAP_X2APIC_API
+
+Architectures: x86
+Parameters: args[0] - features that should be enabled
+Returns: 0 on success, -EINVAL when args[0] contains invalid features
+
+Valid feature flags in args[0] are
+
+#define KVM_X2APIC_API_USE_32BIT_IDS            (1ULL << 0)
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK  (1ULL << 1)
+
+Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of
+KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC,
+allowing the use of 32-bit APIC IDs.  See KVM_CAP_X2APIC_API in their
+respective sections.
+
+KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK must be enabled for x2APIC to work
+in logical mode or with more than 255 VCPUs.  Otherwise, KVM treats 0xff
+as a broadcast even in x2APIC mode in order to support physical x2APIC
+without interrupt remapping.  This is undesirable in logical mode,
+where 0xff represents CPUs 0-7 in cluster 0.
+
+7.8 KVM_CAP_S390_USER_INSTR0
+
+Architectures: s390
+Parameters: none
+
+With this capability enabled, all illegal instructions 0x0000 (2 bytes) will
+be intercepted and forwarded to user space. User space can use this
+mechanism e.g. to realize 2-byte software breakpoints. The kernel will
+not inject an operation exception for these instructions; user space has
+to take care of that.
+
+This capability can be enabled dynamically even if VCPUs were already
+created and are running.
+
 8. Other capabilities.
 ----------------------
 
index 59541d4..89182f8 100644 (file)
@@ -4,16 +4,22 @@ ARM Virtual Generic Interrupt Controller (VGIC)
 Device types supported:
   KVM_DEV_TYPE_ARM_VGIC_V2     ARM Generic Interrupt Controller v2.0
   KVM_DEV_TYPE_ARM_VGIC_V3     ARM Generic Interrupt Controller v3.0
+  KVM_DEV_TYPE_ARM_VGIC_ITS    ARM Interrupt Translation Service Controller
 
-Only one VGIC instance may be instantiated through either this API or the
-legacy KVM_CREATE_IRQCHIP api.  The created VGIC will act as the VM interrupt
-controller, requiring emulated user-space devices to inject interrupts to the
-VGIC instead of directly to CPUs.
+Only one VGIC instance of the V2/V3 types above may be instantiated through
+either this API or the legacy KVM_CREATE_IRQCHIP api.  The created VGIC will
+act as the VM interrupt controller, requiring emulated user-space devices to
+inject interrupts to the VGIC instead of directly to CPUs.
 
 Creating a guest GICv3 device requires a host GICv3 as well.
 GICv3 implementations with hardware compatibility support allow a guest GICv2
 as well.
 
+Creating a virtual ITS controller requires a host GICv3 (but does not depend
+on having physical ITS controllers).
+There can be multiple ITS controllers per guest; each of them must have
+a separate, non-overlapping MMIO region.
+
 Groups:
   KVM_DEV_ARM_VGIC_GRP_ADDR
   Attributes:
@@ -39,6 +45,13 @@ Groups:
       Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
       This address needs to be 64K aligned.
 
+    KVM_VGIC_V3_ADDR_TYPE_ITS (rw, 64-bit)
+      Base address in the guest physical address space of the GICv3 ITS
+      control register frame. The ITS allows MSI(-X) interrupts to be
+      injected into guests. This extension is optional. If the kernel
+      does not support the ITS, the call returns -ENODEV.
+      Only valid for KVM_DEV_TYPE_ARM_VGIC_ITS.
+      This address needs to be 64K aligned and the region covers 128K.
 
   KVM_DEV_ARM_VGIC_GRP_DIST_REGS
   Attributes:
@@ -109,8 +122,8 @@ Groups:
   KVM_DEV_ARM_VGIC_GRP_CTRL
   Attributes:
     KVM_DEV_ARM_VGIC_CTRL_INIT
-      request the initialization of the VGIC, no additional parameter in
-      kvm_device_attr.addr.
+      request the initialization of the VGIC or ITS, no additional parameter
+      in kvm_device_attr.addr.
   Errors:
     -ENXIO: VGIC not properly configured as required prior to calling
      this attribute
index a9ea877..b6cda49 100644 (file)
@@ -20,7 +20,8 @@ Enables Collaborative Memory Management Assist (CMMA) for the virtual machine.
 
 1.2. ATTRIBUTE: KVM_S390_VM_MEM_CLR_CMMA
 Parameters: none
-Returns: 0
+Returns: -EINVAL if CMMA was not enabled
+         0 otherwise
 
 Clear the CMMA status for all guest pages, so any pages the guest marked
 as unused are again used and may not be reclaimed by the host.
@@ -85,6 +86,90 @@ Returns:    -EBUSY in case 1 or more vcpus are already activated (only in write
            -ENOMEM if not enough memory is available to process the ioctl
            0 in case of success
 
+2.3. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_FEAT (r/o)
+
+Allows user space to retrieve available cpu features. A feature is available if
+provided by the hardware and supported by kvm. In theory, cpu features could
+even be completely emulated by kvm.
+
+struct kvm_s390_vm_cpu_feat {
+        __u64 feat[16]; # Bitmap (1 = feature available), MSB 0 bit numbering
+};
+
+Parameters: address of a buffer to load the feature list from.
+Returns:    -EFAULT if the given address is not accessible from kernel space.
+           0 in case of success.
+
+2.4. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_FEAT (r/w)
+
+Allows user space to retrieve or change enabled cpu features for all VCPUs of a
+VM. Features that are not available cannot be enabled.
+
+See 2.3. for a description of the parameter struct.
+
+Parameters: address of a buffer to store/load the feature list from.
+Returns:    -EFAULT if the given address is not accessible from kernel space.
+           -EINVAL if a cpu feature that is not available is to be enabled.
+           -EBUSY if at least one VCPU has already been defined.
+           0 in case of success.
+
+2.5. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_SUBFUNC (r/o)
+
+Allows user space to retrieve available cpu subfunctions without any filtering
+done by a set IBC. These subfunctions are indicated to the guest VCPU via
+query or "test bit" subfunctions and used e.g. by cpacf functions, plo and ptff.
+
+A subfunction block is only valid if KVM_S390_VM_CPU_MACHINE contains the
+STFL(E) bit introducing the affected instruction. If the affected instruction
+indicates subfunctions via a "query subfunction", the response block is
+contained in the returned struct. If the affected instruction
+indicates subfunctions via a "test bit" mechanism, the subfunction codes are
+contained in the returned struct in MSB 0 bit numbering.
+
+struct kvm_s390_vm_cpu_subfunc {
+       u8 plo[32];           # always valid (ESA/390 feature)
+       u8 ptff[16];          # valid with TOD-clock steering
+       u8 kmac[16];          # valid with Message-Security-Assist
+       u8 kmc[16];           # valid with Message-Security-Assist
+       u8 km[16];            # valid with Message-Security-Assist
+       u8 kimd[16];          # valid with Message-Security-Assist
+       u8 klmd[16];          # valid with Message-Security-Assist
+       u8 pckmo[16];         # valid with Message-Security-Assist-Extension 3
+       u8 kmctr[16];         # valid with Message-Security-Assist-Extension 4
+       u8 kmf[16];           # valid with Message-Security-Assist-Extension 4
+       u8 kmo[16];           # valid with Message-Security-Assist-Extension 4
+       u8 pcc[16];           # valid with Message-Security-Assist-Extension 4
+       u8 ppno[16];          # valid with Message-Security-Assist-Extension 5
+       u8 reserved[1824];    # reserved for future instructions
+};
+
+Parameters: address of a buffer to load the subfunction blocks from.
+Returns:    -EFAULT if the given address is not accessible from kernel space.
+           0 in case of success.
+
+2.6. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_SUBFUNC (r/w)
+
+Allows user space to retrieve or change cpu subfunctions to be indicated for
+all VCPUs of a VM. This attribute will only be available if kernel and
+hardware support are in place.
+
+The kernel uses the configured subfunction blocks for indication to
+the guest. A subfunction block will only be used if the associated STFL(E) bit
+has not been disabled by user space (so the instruction to be queried is
+actually available for the guest).
+
+As long as no data has been written, a read will fail. The IBC will be used
+to determine available subfunctions in this case; this guarantees backward
+compatibility.
+
+See 2.5. for a description of the parameter struct.
+
+Parameters: address of a buffer to store/load the subfunction blocks from.
+Returns:    -EFAULT if the given address is not accessible from kernel space.
+           -EINVAL when reading, if there was no write yet.
+           -EBUSY if at least one VCPU has already been defined.
+           0 in case of success.
+
 3. GROUP: KVM_S390_VM_TOD
 Architectures: s390
 
index 19f94a6..f2491a8 100644 (file)
@@ -89,7 +89,7 @@ In mmu_spte_clear_track_bits():
    old_spte = *spte;
 
    /* 'if' condition is satisfied. */
-   if (old_spte.Accssed == 1 &&
+   if (old_spte.Accessed == 1 &&
         old_spte.W == 0)
       spte = 0ull;
                                          on fast page fault path:
@@ -102,7 +102,7 @@ In mmu_spte_clear_track_bits():
       old_spte = xchg(spte, 0ull)
 
 
-   if (old_spte.Accssed == 1)
+   if (old_spte.Accessed == 1)
       kvm_set_pfn_accessed(spte.pfn);
    if (old_spte.Dirty == 1)
       kvm_set_pfn_dirty(spte.pfn);
index 3d5a5cd..58faff5 100644 (file)
@@ -66,6 +66,8 @@ extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
 extern void __init_stage2_translation(void);
+
+extern void __kvm_hyp_reset(unsigned long);
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */
index 96387d4..de338d9 100644 (file)
@@ -241,8 +241,7 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
 int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
                int exception_index);
 
-static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
-                                      phys_addr_t pgd_ptr,
+static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
                                       unsigned long hyp_stack_ptr,
                                       unsigned long vector_ptr)
 {
@@ -251,18 +250,13 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
         * code. The init code doesn't need to preserve these
         * registers as r0-r3 are already callee saved according to
         * the AAPCS.
-        * Note that we slightly misuse the prototype by casing the
+        * Note that we slightly misuse the prototype by casting the
         * stack pointer to a void *.
-        *
-        * We don't have enough registers to perform the full init in
-        * one go.  Install the boot PGD first, and then install the
-        * runtime PGD, stack pointer and vectors. The PGDs are always
-        * passed as the third argument, in order to be passed into
-        * r2-r3 to the init code (yes, this is compliant with the
-        * PCS!).
-        */
 
-       kvm_call_hyp(NULL, 0, boot_pgd_ptr);
+        * The PGDs are always passed as the third argument, in order
+        * to be passed into r2-r3 to the init code (yes, this is
+        * compliant with the PCS!).
+        */
 
        kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr);
 }
@@ -272,16 +266,13 @@ static inline void __cpu_init_stage2(void)
        kvm_call_hyp(__init_stage2_translation);
 }
 
-static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
+static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr,
                                        phys_addr_t phys_idmap_start)
 {
-       /*
-        * TODO
-        * kvm_call_reset(boot_pgd_ptr, phys_idmap_start);
-        */
+       kvm_call_hyp((void *)virt_to_idmap(__kvm_hyp_reset), vector_ptr);
 }
 
-static inline int kvm_arch_dev_ioctl_check_extension(long ext)
+static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
 {
        return 0;
 }
index f0e8607..6eaff28 100644 (file)
@@ -25,9 +25,6 @@
 
 #define __hyp_text __section(.hyp.text) notrace
 
-#define kern_hyp_va(v) (v)
-#define hyp_kern_va(v) (v)
-
 #define __ACCESS_CP15(CRn, Op1, CRm, Op2)      \
        "mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32
 #define __ACCESS_CP15_64(Op1, CRm)             \
index f9a6506..3bb803d 100644 (file)
  * We directly use the kernel VA for the HYP, as we can directly share
  * the mapping (HTTBR "covers" TTBR1).
  */
-#define HYP_PAGE_OFFSET_MASK   UL(~0)
-#define HYP_PAGE_OFFSET                PAGE_OFFSET
-#define KERN_TO_HYP(kva)       (kva)
-
-/*
- * Our virtual mapping for the boot-time MMU-enable code. Must be
- * shared across all the page-tables. Conveniently, we use the vectors
- * page, where no kernel data will ever be shared with HYP.
- */
-#define TRAMPOLINE_VA          UL(CONFIG_VECTORS_BASE)
+#define kern_hyp_va(kva)       (kva)
 
 /*
  * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels.
@@ -49,9 +40,8 @@
 #include <asm/pgalloc.h>
 #include <asm/stage2_pgtable.h>
 
-int create_hyp_mappings(void *from, void *to);
+int create_hyp_mappings(void *from, void *to, pgprot_t prot);
 int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
-void free_boot_hyp_pgd(void);
 void free_hyp_pgds(void);
 
 void stage2_unmap_vm(struct kvm *kvm);
@@ -65,7 +55,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
 
 phys_addr_t kvm_mmu_get_httbr(void);
-phys_addr_t kvm_mmu_get_boot_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
 phys_addr_t kvm_get_idmap_start(void);
 int kvm_mmu_init(void);
index d622040..a8d656d 100644 (file)
@@ -97,7 +97,9 @@ extern pgprot_t               pgprot_s2_device;
 #define PAGE_READONLY_EXEC     _MOD_PROT(pgprot_user, L_PTE_USER | L_PTE_RDONLY)
 #define PAGE_KERNEL            _MOD_PROT(pgprot_kernel, L_PTE_XN)
 #define PAGE_KERNEL_EXEC       pgprot_kernel
-#define PAGE_HYP               _MOD_PROT(pgprot_kernel, L_PTE_HYP)
+#define PAGE_HYP               _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_XN)
+#define PAGE_HYP_EXEC          _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY)
+#define PAGE_HYP_RO            _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY | L_PTE_XN)
 #define PAGE_HYP_DEVICE                _MOD_PROT(pgprot_hyp_device, L_PTE_HYP)
 #define PAGE_S2                        _MOD_PROT(pgprot_s2, L_PTE_S2_RDONLY)
 #define PAGE_S2_DEVICE         _MOD_PROT(pgprot_s2_device, L_PTE_S2_RDONLY)
index d4ceaf5..a2e75b8 100644 (file)
@@ -80,6 +80,10 @@ static inline bool is_kernel_in_hyp_mode(void)
        return false;
 }
 
+/* The section containing the hypervisor idmap text */
+extern char __hyp_idmap_text_start[];
+extern char __hyp_idmap_text_end[];
+
 /* The section containing the hypervisor text */
 extern char __hyp_text_start[];
 extern char __hyp_text_end[];
index 02abfff..95a0005 100644 (file)
@@ -46,13 +46,6 @@ config KVM_ARM_HOST
        ---help---
          Provides host support for ARM processors.
 
-config KVM_NEW_VGIC
-       bool "New VGIC implementation"
-       depends on KVM
-       default y
-       ---help---
-         uses the new VGIC implementation
-
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
index a596b58..5e28df8 100644 (file)
@@ -22,7 +22,6 @@ obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
 obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o
 
-ifeq ($(CONFIG_KVM_NEW_VGIC),y)
 obj-y += $(KVM)/arm/vgic/vgic.o
 obj-y += $(KVM)/arm/vgic/vgic-init.o
 obj-y += $(KVM)/arm/vgic/vgic-irqfd.o
@@ -30,9 +29,4 @@ obj-y += $(KVM)/arm/vgic/vgic-v2.o
 obj-y += $(KVM)/arm/vgic/vgic-mmio.o
 obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o
 obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o
-else
-obj-y += $(KVM)/arm/vgic.o
-obj-y += $(KVM)/arm/vgic-v2.o
-obj-y += $(KVM)/arm/vgic-v2-emul.o
-endif
 obj-y += $(KVM)/arm/arch_timer.o
index f1bde7c..d94bb90 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kvm_host.h>
+#include <linux/list.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/fs.h>
@@ -122,7 +123,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        if (ret)
                goto out_fail_alloc;
 
-       ret = create_hyp_mappings(kvm, kvm + 1);
+       ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
        if (ret)
                goto out_free_stage2_pgd;
 
@@ -201,7 +202,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = KVM_MAX_VCPUS;
                break;
        default:
-               r = kvm_arch_dev_ioctl_check_extension(ext);
+               r = kvm_arch_dev_ioctl_check_extension(kvm, ext);
                break;
        }
        return r;
@@ -239,7 +240,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
        if (err)
                goto free_vcpu;
 
-       err = create_hyp_mappings(vcpu, vcpu + 1);
+       err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
        if (err)
                goto vcpu_uninit;
 
@@ -377,7 +378,7 @@ void force_vm_exit(const cpumask_t *mask)
 
 /**
  * need_new_vmid_gen - check that the VMID is still valid
- * @kvm: The VM's VMID to checkt
+ * @kvm: The VM's VMID to check
  *
  * return true if there is a new generation of VMIDs being used
  *
@@ -616,7 +617,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                 * Enter the guest
                 */
                trace_kvm_entry(*vcpu_pc(vcpu));
-               __kvm_guest_enter();
+               guest_enter_irqoff();
                vcpu->mode = IN_GUEST_MODE;
 
                ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
@@ -642,14 +643,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                local_irq_enable();
 
                /*
-                * We do local_irq_enable() before calling kvm_guest_exit() so
+                * We do local_irq_enable() before calling guest_exit() so
                 * that if a timer interrupt hits while running the guest we
                 * account that tick as being spent in the guest.  We enable
-                * preemption after calling kvm_guest_exit() so that if we get
+                * preemption after calling guest_exit() so that if we get
                 * preempted we make sure ticks after that is not counted as
                 * guest time.
                 */
-               kvm_guest_exit();
+               guest_exit();
                trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
 
                /*
@@ -1039,7 +1040,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 static void cpu_init_hyp_mode(void *dummy)
 {
-       phys_addr_t boot_pgd_ptr;
        phys_addr_t pgd_ptr;
        unsigned long hyp_stack_ptr;
        unsigned long stack_page;
@@ -1048,13 +1048,12 @@ static void cpu_init_hyp_mode(void *dummy)
        /* Switch from the HYP stub to our own HYP init vector */
        __hyp_set_vectors(kvm_get_idmap_vector());
 
-       boot_pgd_ptr = kvm_mmu_get_boot_httbr();
        pgd_ptr = kvm_mmu_get_httbr();
        stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
        hyp_stack_ptr = stack_page + PAGE_SIZE;
        vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector);
 
-       __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+       __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
        __cpu_init_stage2();
 
        kvm_arm_init_debug();
@@ -1076,15 +1075,9 @@ static void cpu_hyp_reinit(void)
 
 static void cpu_hyp_reset(void)
 {
-       phys_addr_t boot_pgd_ptr;
-       phys_addr_t phys_idmap_start;
-
-       if (!is_kernel_in_hyp_mode()) {
-               boot_pgd_ptr = kvm_mmu_get_boot_httbr();
-               phys_idmap_start = kvm_get_idmap_start();
-
-               __cpu_reset_hyp_mode(boot_pgd_ptr, phys_idmap_start);
-       }
+       if (!is_kernel_in_hyp_mode())
+               __cpu_reset_hyp_mode(hyp_default_vectors,
+                                    kvm_get_idmap_start());
 }
 
 static void _kvm_arch_hardware_enable(void *discard)
@@ -1294,14 +1287,14 @@ static int init_hyp_mode(void)
         * Map the Hyp-code called directly from the host
         */
        err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
-                                 kvm_ksym_ref(__hyp_text_end));
+                                 kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
        if (err) {
                kvm_err("Cannot map world-switch code\n");
                goto out_err;
        }
 
        err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
-                                 kvm_ksym_ref(__end_rodata));
+                                 kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
        if (err) {
                kvm_err("Cannot map rodata section\n");
                goto out_err;
@@ -1312,7 +1305,8 @@ static int init_hyp_mode(void)
         */
        for_each_possible_cpu(cpu) {
                char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
-               err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE);
+               err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
+                                         PAGE_HYP);
 
                if (err) {
                        kvm_err("Cannot map hyp stack\n");
@@ -1324,7 +1318,7 @@ static int init_hyp_mode(void)
                kvm_cpu_context_t *cpu_ctxt;
 
                cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu);
-               err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1);
+               err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP);
 
                if (err) {
                        kvm_err("Cannot map host CPU state: %d\n", err);
@@ -1332,10 +1326,6 @@ static int init_hyp_mode(void)
                }
        }
 
-#ifndef CONFIG_HOTPLUG_CPU
-       free_boot_hyp_pgd();
-#endif
-
        /* set size of VMID supported by CPU */
        kvm_vmid_bits = kvm_get_vmid_bits();
        kvm_info("%d-bit VMID\n", kvm_vmid_bits);
index a494def..af93e3f 100644 (file)
@@ -210,7 +210,7 @@ bool kvm_condition_valid(struct kvm_vcpu *vcpu)
  * @vcpu:      The VCPU pointer
  *
  * When exceptions occur while instructions are executed in Thumb IF-THEN
- * blocks, the ITSTATE field of the CPSR is not advanved (updated), so we have
+ * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
  * to do this little bit of work manually. The fields map like this:
  *
  * IT[7:0] -> CPSR[26:25],CPSR[15:10]
index 9093ed0..9aca920 100644 (file)
@@ -182,7 +182,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
 /**
  * kvm_arm_copy_reg_indices - get indices of all registers.
  *
- * We do core registers right here, then we apppend coproc regs.
+ * We do core registers right here, then we append coproc regs.
  */
 int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
 {
index 1f9ae17..bf89c91 100644 (file)
  *       r2,r3 = Hypervisor pgd pointer
  *
  * The init scenario is:
- * - We jump in HYP with four parameters: boot HYP pgd, runtime HYP pgd,
- *   runtime stack, runtime vectors
- * - Enable the MMU with the boot pgd
- * - Jump to a target into the trampoline page (remember, this is the same
- *   physical page!)
- * - Now switch to the runtime pgd (same VA, and still the same physical
- *   page!)
+ * - We jump in HYP with 3 parameters: runtime HYP pgd, runtime stack,
+ *   runtime vectors
  * - Invalidate TLBs
  * - Set stack and vectors
+ * - Set up the page tables
+ * - Enable the MMU
  * - Profit! (or eret, if you only care about the code).
- *
- * As we only have four registers available to pass parameters (and we
- * need six), we split the init in two phases:
- * - Phase 1: r0 = 0, r1 = 0, r2,r3 contain the boot PGD.
- *   Provides the basic HYP init, and enable the MMU.
- * - Phase 2: r0 = ToS, r1 = vectors, r2,r3 contain the runtime PGD.
- *   Switches to the runtime PGD, set stack and vectors.
  */
 
        .text
@@ -68,8 +58,11 @@ __kvm_hyp_init:
        W(b)    .
 
 __do_hyp_init:
-       cmp     r0, #0                  @ We have a SP?
-       bne     phase2                  @ Yes, second stage init
+       @ Set stack pointer
+       mov     sp, r0
+
+       @ Set HVBAR to point to the HYP vectors
+       mcr     p15, 4, r1, c12, c0, 0  @ HVBAR
 
        @ Set the HTTBR to point to the hypervisor PGD pointer passed
        mcrr    p15, 4, rr_lo_hi(r2, r3), c2
@@ -114,34 +107,25 @@ __do_hyp_init:
  THUMB(        ldr     r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE)          )
        orr     r1, r1, r2
        orr     r0, r0, r1
-       isb
        mcr     p15, 4, r0, c1, c0, 0   @ HSCR
+       isb
 
-       @ End of init phase-1
        eret
 
-phase2:
-       @ Set stack pointer
-       mov     sp, r0
-
-       @ Set HVBAR to point to the HYP vectors
-       mcr     p15, 4, r1, c12, c0, 0  @ HVBAR
-
-       @ Jump to the trampoline page
-       ldr     r0, =TRAMPOLINE_VA
-       adr     r1, target
-       bfi     r0, r1, #0, #PAGE_SHIFT
-       ret     r0
+       @ r0 : stub vectors address
+ENTRY(__kvm_hyp_reset)
+       /* We're now in idmap, disable MMU */
+       mrc     p15, 4, r1, c1, c0, 0   @ HSCTLR
+       ldr     r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_C | HSCTLR_I)
+       bic     r1, r1, r2
+       mcr     p15, 4, r1, c1, c0, 0   @ HSCTLR
 
-target:        @ We're now in the trampoline code, switch page tables
-       mcrr    p15, 4, rr_lo_hi(r2, r3), c2
+       /* Install stub vectors */
+       mcr     p15, 4, r0, c12, c0, 0  @ HVBAR
        isb
 
-       @ Invalidate the old TLBs
-       mcr     p15, 4, r0, c8, c7, 0   @ TLBIALLH
-       dsb     ish
-
        eret
+ENDPROC(__kvm_hyp_reset)
 
        .ltorg
 
index 45c43ae..bda27b6 100644 (file)
@@ -32,8 +32,6 @@
 
 #include "trace.h"
 
-extern char  __hyp_idmap_text_start[], __hyp_idmap_text_end[];
-
 static pgd_t *boot_hyp_pgd;
 static pgd_t *hyp_pgd;
 static pgd_t *merged_hyp_pgd;
@@ -483,28 +481,6 @@ static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
        } while (pgd++, addr = next, addr != end);
 }
 
-/**
- * free_boot_hyp_pgd - free HYP boot page tables
- *
- * Free the HYP boot page tables. The bounce page is also freed.
- */
-void free_boot_hyp_pgd(void)
-{
-       mutex_lock(&kvm_hyp_pgd_mutex);
-
-       if (boot_hyp_pgd) {
-               unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
-               unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
-               free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
-               boot_hyp_pgd = NULL;
-       }
-
-       if (hyp_pgd)
-               unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
-
-       mutex_unlock(&kvm_hyp_pgd_mutex);
-}
-
 /**
  * free_hyp_pgds - free Hyp-mode page tables
  *
@@ -519,15 +495,20 @@ void free_hyp_pgds(void)
 {
        unsigned long addr;
 
-       free_boot_hyp_pgd();
-
        mutex_lock(&kvm_hyp_pgd_mutex);
 
+       if (boot_hyp_pgd) {
+               unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
+               free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
+               boot_hyp_pgd = NULL;
+       }
+
        if (hyp_pgd) {
+               unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE);
                for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
-                       unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+                       unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE);
                for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
-                       unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+                       unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE);
 
                free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
                hyp_pgd = NULL;
@@ -679,17 +660,18 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
  * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
  * @from:      The virtual kernel start address of the range
  * @to:                The virtual kernel end address of the range (exclusive)
+ * @prot:      The protection to be applied to this range
  *
  * The same virtual address as the kernel virtual address is also used
  * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
  * physical pages.
  */
-int create_hyp_mappings(void *from, void *to)
+int create_hyp_mappings(void *from, void *to, pgprot_t prot)
 {
        phys_addr_t phys_addr;
        unsigned long virt_addr;
-       unsigned long start = KERN_TO_HYP((unsigned long)from);
-       unsigned long end = KERN_TO_HYP((unsigned long)to);
+       unsigned long start = kern_hyp_va((unsigned long)from);
+       unsigned long end = kern_hyp_va((unsigned long)to);
 
        if (is_kernel_in_hyp_mode())
                return 0;
@@ -704,7 +686,7 @@ int create_hyp_mappings(void *from, void *to)
                err = __create_hyp_mappings(hyp_pgd, virt_addr,
                                            virt_addr + PAGE_SIZE,
                                            __phys_to_pfn(phys_addr),
-                                           PAGE_HYP);
+                                           prot);
                if (err)
                        return err;
        }
@@ -723,8 +705,8 @@ int create_hyp_mappings(void *from, void *to)
  */
 int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
 {
-       unsigned long start = KERN_TO_HYP((unsigned long)from);
-       unsigned long end = KERN_TO_HYP((unsigned long)to);
+       unsigned long start = kern_hyp_va((unsigned long)from);
+       unsigned long end = kern_hyp_va((unsigned long)to);
 
        if (is_kernel_in_hyp_mode())
                return 0;
@@ -1687,14 +1669,6 @@ phys_addr_t kvm_mmu_get_httbr(void)
                return virt_to_phys(hyp_pgd);
 }
 
-phys_addr_t kvm_mmu_get_boot_httbr(void)
-{
-       if (__kvm_cpu_uses_extended_idmap())
-               return virt_to_phys(merged_hyp_pgd);
-       else
-               return virt_to_phys(boot_hyp_pgd);
-}
-
 phys_addr_t kvm_get_idmap_vector(void)
 {
        return hyp_idmap_vector;
@@ -1705,6 +1679,22 @@ phys_addr_t kvm_get_idmap_start(void)
        return hyp_idmap_start;
 }
 
+static int kvm_map_idmap_text(pgd_t *pgd)
+{
+       int err;
+
+       /* Create the idmap in the page tables passed in @pgd */
+       err =   __create_hyp_mappings(pgd,
+                                     hyp_idmap_start, hyp_idmap_end,
+                                     __phys_to_pfn(hyp_idmap_start),
+                                     PAGE_HYP_EXEC);
+       if (err)
+               kvm_err("Failed to idmap %lx-%lx\n",
+                       hyp_idmap_start, hyp_idmap_end);
+
+       return err;
+}
+
 int kvm_mmu_init(void)
 {
        int err;
@@ -1719,28 +1709,41 @@ int kvm_mmu_init(void)
         */
        BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
 
-       hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
-       boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
+       kvm_info("IDMAP page: %lx\n", hyp_idmap_start);
+       kvm_info("HYP VA range: %lx:%lx\n",
+                kern_hyp_va(PAGE_OFFSET), kern_hyp_va(~0UL));
 
-       if (!hyp_pgd || !boot_hyp_pgd) {
-               kvm_err("Hyp mode PGD not allocated\n");
-               err = -ENOMEM;
+       if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
+           hyp_idmap_start <  kern_hyp_va(~0UL)) {
+               /*
+                * The idmap page intersects with the HYP VA space;
+                * it is not safe to continue any further.
+                */
+               kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
+               err = -EINVAL;
                goto out;
        }
 
-       /* Create the idmap in the boot page tables */
-       err =   __create_hyp_mappings(boot_hyp_pgd,
-                                     hyp_idmap_start, hyp_idmap_end,
-                                     __phys_to_pfn(hyp_idmap_start),
-                                     PAGE_HYP);
-
-       if (err) {
-               kvm_err("Failed to idmap %lx-%lx\n",
-                       hyp_idmap_start, hyp_idmap_end);
+       hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
+       if (!hyp_pgd) {
+               kvm_err("Hyp mode PGD not allocated\n");
+               err = -ENOMEM;
                goto out;
        }
 
        if (__kvm_cpu_uses_extended_idmap()) {
+               boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+                                                        hyp_pgd_order);
+               if (!boot_hyp_pgd) {
+                       kvm_err("Hyp boot PGD not allocated\n");
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               err = kvm_map_idmap_text(boot_hyp_pgd);
+               if (err)
+                       goto out;
+
                merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
                if (!merged_hyp_pgd) {
                        kvm_err("Failed to allocate extra HYP pgd\n");
@@ -1748,29 +1751,10 @@ int kvm_mmu_init(void)
                }
                __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
                                    hyp_idmap_start);
-               return 0;
-       }
-
-       /* Map the very same page at the trampoline VA */
-       err =   __create_hyp_mappings(boot_hyp_pgd,
-                                     TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
-                                     __phys_to_pfn(hyp_idmap_start),
-                                     PAGE_HYP);
-       if (err) {
-               kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
-                       TRAMPOLINE_VA);
-               goto out;
-       }
-
-       /* Map the same page again into the runtime page tables */
-       err =   __create_hyp_mappings(hyp_pgd,
-                                     TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
-                                     __phys_to_pfn(hyp_idmap_start),
-                                     PAGE_HYP);
-       if (err) {
-               kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
-                       TRAMPOLINE_VA);
-               goto out;
+       } else {
+               err = kvm_map_idmap_text(hyp_pgd);
+               if (err)
+                       goto out;
        }
 
        return 0;
index 0048b5a..4b5e802 100644 (file)
@@ -52,7 +52,7 @@ static const struct kvm_irq_level cortexa_vtimer_irq = {
  * @vcpu: The VCPU pointer
  *
  * This function finds the right table above and sets the registers on the
- * virtual CPU struct to their architectually defined reset values.
+ * virtual CPU struct to their architecturally defined reset values.
  */
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 {
index 49dd1bd..7099f26 100644 (file)
@@ -36,8 +36,9 @@
 #define ARM64_HAS_VIRT_HOST_EXTN               11
 #define ARM64_WORKAROUND_CAVIUM_27456          12
 #define ARM64_HAS_32BIT_EL0                    13
+#define ARM64_HYP_OFFSET_LOW                   14
 
-#define ARM64_NCAPS                            14
+#define ARM64_NCAPS                            15
 
 #ifndef __ASSEMBLY__
 
index 2cdb6b5..4b5c977 100644 (file)
 /* Hyp System Trap Register */
 #define HSTR_EL2_T(x)  (1 << x)
 
-/* Hyp Coproccessor Trap Register Shifts */
+/* Hyp Coprocessor Trap Register Shifts */
 #define CPTR_EL2_TFP_SHIFT 10
 
 /* Hyp Coprocessor Trap Register */
index 49095fc..3eda975 100644 (file)
@@ -47,8 +47,7 @@
 
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
-int kvm_arch_dev_ioctl_check_extension(long ext);
-unsigned long kvm_hyp_reset_entry(void);
+int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext);
 void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
 
 struct kvm_arch {
@@ -348,8 +347,7 @@ int kvm_perf_teardown(void);
 
 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
 
-static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
-                                      phys_addr_t pgd_ptr,
+static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
                                       unsigned long hyp_stack_ptr,
                                       unsigned long vector_ptr)
 {
@@ -357,19 +355,14 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
         * Call initialization code, and switch to the full blown
         * HYP code.
         */
-       __kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr,
-                      hyp_stack_ptr, vector_ptr);
+       __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr);
 }
 
-static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
+void __kvm_hyp_teardown(void);
+static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr,
                                        phys_addr_t phys_idmap_start)
 {
-       /*
-        * Call reset code, and switch back to stub hyp vectors.
-        * Uses __kvm_call_hyp() to avoid kaslr's kvm_ksym_ref() translation.
-        */
-       __kvm_call_hyp((void *)kvm_hyp_reset_entry(),
-                      boot_pgd_ptr, phys_idmap_start);
+       kvm_call_hyp(__kvm_hyp_teardown, phys_idmap_start);
 }
 
 static inline void kvm_arch_hardware_unsetup(void) {}
index 44eaff7..cff5105 100644 (file)
 
 #define __hyp_text __section(.hyp.text) notrace
 
-static inline unsigned long __kern_hyp_va(unsigned long v)
-{
-       asm volatile(ALTERNATIVE("and %0, %0, %1",
-                                "nop",
-                                ARM64_HAS_VIRT_HOST_EXTN)
-                    : "+r" (v) : "i" (HYP_PAGE_OFFSET_MASK));
-       return v;
-}
-
-#define kern_hyp_va(v) (typeof(v))(__kern_hyp_va((unsigned long)(v)))
-
-static inline unsigned long __hyp_kern_va(unsigned long v)
-{
-       u64 offset = PAGE_OFFSET - HYP_PAGE_OFFSET;
-       asm volatile(ALTERNATIVE("add %0, %0, %1",
-                                "nop",
-                                ARM64_HAS_VIRT_HOST_EXTN)
-                    : "+r" (v) : "r" (offset));
-       return v;
-}
-
-#define hyp_kern_va(v) (typeof(v))(__hyp_kern_va((unsigned long)(v)))
-
 #define read_sysreg_elx(r,nvh,vh)                                      \
        ({                                                              \
                u64 reg;                                                \
index f05ac27..b6bb834 100644 (file)
  *
  * Instead, give the HYP mode its own VA region at a fixed offset from
  * the kernel by just masking the top bits (which are all ones for a
- * kernel address).
+ * kernel address). We need to find out how many bits to mask.
  *
- * ARMv8.1 (using VHE) does have a TTBR1_EL2, and doesn't use these
- * macros (the entire kernel runs at EL2).
+ * We want to build a set of page tables that cover both parts of the
+ * idmap (the trampoline page used to initialize EL2), and our normal
+ * runtime VA space, at the same time.
+ *
+ * Given that the kernel uses VA_BITS for its entire address space,
+ * and that half of that space (VA_BITS - 1) is used for the linear
+ * mapping, we can also limit the EL2 space to (VA_BITS - 1).
+ *
+ * The main question is "Within the VA_BITS space, does EL2 use the
+ * top or the bottom half of that space to shadow the kernel's linear
+ * mapping?". As we need to idmap the trampoline page, this is
+ * determined by the range in which this page lives.
+ *
+ * If the page is in the bottom half, we have to use the top half. If
+ * the page is in the top half, we have to use the bottom half:
+ *
+ * T = __virt_to_phys(__hyp_idmap_text_start)
+ * if (T & BIT(VA_BITS - 1))
+ *     HYP_VA_MIN = 0  //idmap in upper half
+ * else
+ *     HYP_VA_MIN = 1 << (VA_BITS - 1)
+ * HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1
+ *
+ * This of course assumes that the trampoline page exists within the
+ * VA_BITS range. If it doesn't, then it means we're in the odd case
+ * where the kernel idmap (as well as HYP) uses more levels than the
+ * kernel runtime page tables (as seen when the kernel is configured
+ * for 4k pages, 39bits VA, and yet memory lives just above that
+ * limit, forcing the idmap to use 4 levels of page tables while the
+ * kernel itself only uses 3). In this particular case, it doesn't
+ * matter which side of VA_BITS we use, as we're guaranteed not to
+ * conflict with anything.
+ *
+ * When using VHE, there are no separate hyp mappings and all KVM
+ * functionality is already mapped as part of the main kernel
+ * mappings, and none of this applies in that case.
  */
-#define HYP_PAGE_OFFSET_SHIFT  VA_BITS
-#define HYP_PAGE_OFFSET_MASK   ((UL(1) << HYP_PAGE_OFFSET_SHIFT) - 1)
-#define HYP_PAGE_OFFSET                (PAGE_OFFSET & HYP_PAGE_OFFSET_MASK)
 
-/*
- * Our virtual mapping for the idmap-ed MMU-enable code. Must be
- * shared across all the page-tables. Conveniently, we use the last
- * possible page, where no kernel mapping will ever exist.
- */
-#define TRAMPOLINE_VA          (HYP_PAGE_OFFSET_MASK & PAGE_MASK)
+#define HYP_PAGE_OFFSET_HIGH_MASK      ((UL(1) << VA_BITS) - 1)
+#define HYP_PAGE_OFFSET_LOW_MASK       ((UL(1) << (VA_BITS - 1)) - 1)
 
 #ifdef __ASSEMBLY__
 
 /*
  * Convert a kernel VA into a HYP VA.
  * reg: VA to be converted.
+ *
+ * This generates the following sequences:
+ * - High mask:
+ *             and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK
+ *             nop
+ * - Low mask:
+ *             and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK
+ *             and x0, x0, #HYP_PAGE_OFFSET_LOW_MASK
+ * - VHE:
+ *             nop
+ *             nop
+ *
+ * The "low mask" version works because the low mask is a strict subset
+ * of the "high mask", so the first AND is redundant but harmless.
+ * Should be completely invisible on any viable CPU.
  */
 .macro kern_hyp_va     reg
-alternative_if_not ARM64_HAS_VIRT_HOST_EXTN    
-       and     \reg, \reg, #HYP_PAGE_OFFSET_MASK
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
+       and     \reg, \reg, #HYP_PAGE_OFFSET_HIGH_MASK
 alternative_else
        nop
 alternative_endif
+alternative_if_not ARM64_HYP_OFFSET_LOW
+       nop
+alternative_else
+       and     \reg, \reg, #HYP_PAGE_OFFSET_LOW_MASK
+alternative_endif
 .endm
 
 #else
@@ -70,7 +117,22 @@ alternative_endif
 #include <asm/mmu_context.h>
 #include <asm/pgtable.h>
 
-#define KERN_TO_HYP(kva)       ((unsigned long)kva - PAGE_OFFSET + HYP_PAGE_OFFSET)
+static inline unsigned long __kern_hyp_va(unsigned long v)
+{
+       asm volatile(ALTERNATIVE("and %0, %0, %1",
+                                "nop",
+                                ARM64_HAS_VIRT_HOST_EXTN)
+                    : "+r" (v)
+                    : "i" (HYP_PAGE_OFFSET_HIGH_MASK));
+       asm volatile(ALTERNATIVE("nop",
+                                "and %0, %0, %1",
+                                ARM64_HYP_OFFSET_LOW)
+                    : "+r" (v)
+                    : "i" (HYP_PAGE_OFFSET_LOW_MASK));
+       return v;
+}
+
+#define kern_hyp_va(v)         (typeof(v))(__kern_hyp_va((unsigned long)(v)))
 
 /*
  * We currently only support a 40bit IPA.
@@ -81,9 +143,8 @@ alternative_endif
 
 #include <asm/stage2_pgtable.h>
 
-int create_hyp_mappings(void *from, void *to);
+int create_hyp_mappings(void *from, void *to, pgprot_t prot);
 int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
-void free_boot_hyp_pgd(void);
 void free_hyp_pgds(void);
 
 void stage2_unmap_vm(struct kvm *kvm);
@@ -97,7 +158,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
 
 phys_addr_t kvm_mmu_get_httbr(void);
-phys_addr_t kvm_mmu_get_boot_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
 phys_addr_t kvm_get_idmap_start(void);
 int kvm_mmu_init(void);
index 2813748..c3ae239 100644 (file)
 #define PTE_CONT               (_AT(pteval_t, 1) << 52)        /* Contiguous range */
 #define PTE_PXN                        (_AT(pteval_t, 1) << 53)        /* Privileged XN */
 #define PTE_UXN                        (_AT(pteval_t, 1) << 54)        /* User XN */
+#define PTE_HYP_XN             (_AT(pteval_t, 1) << 54)        /* HYP XN */
 
 /*
  * AttrIndx[2:0] encoding (mapping attributes defined in the MAIR* registers).
index 29fcb33..39f5252 100644 (file)
@@ -55,7 +55,9 @@
 #define PAGE_KERNEL_EXEC       __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE)
 #define PAGE_KERNEL_EXEC_CONT  __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT)
 
-#define PAGE_HYP               __pgprot(_PAGE_DEFAULT | PTE_HYP)
+#define PAGE_HYP               __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN)
+#define PAGE_HYP_EXEC          __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY)
+#define PAGE_HYP_RO            __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
 #define PAGE_HYP_DEVICE                __pgprot(PROT_DEVICE_nGnRE | PTE_HYP)
 
 #define PAGE_S2                        __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY)
index bbc6a8c..1788545 100644 (file)
@@ -87,6 +87,10 @@ extern void verify_cpu_run_el(void);
 static inline void verify_cpu_run_el(void) {}
 #endif
 
+/* The section containing the hypervisor idmap text */
+extern char __hyp_idmap_text_start[];
+extern char __hyp_idmap_text_end[];
+
 /* The section containing the hypervisor text */
 extern char __hyp_text_start[];
 extern char __hyp_text_end[];
index f209ea1..3051f86 100644 (file)
@@ -87,9 +87,11 @@ struct kvm_regs {
 /* Supported VGICv3 address types  */
 #define KVM_VGIC_V3_ADDR_TYPE_DIST     2
 #define KVM_VGIC_V3_ADDR_TYPE_REDIST   3
+#define KVM_VGIC_ITS_ADDR_TYPE         4
 
 #define KVM_VGIC_V3_DIST_SIZE          SZ_64K
 #define KVM_VGIC_V3_REDIST_SIZE                (2 * SZ_64K)
+#define KVM_VGIC_V3_ITS_SIZE           (2 * SZ_64K)
 
 #define KVM_ARM_VCPU_POWER_OFF         0 /* CPU is started in OFF state */
 #define KVM_ARM_VCPU_EL1_32BIT         1 /* CPU running a 32bit VM */
index 916d27a..62272ea 100644 (file)
@@ -726,6 +726,19 @@ static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused
        return is_kernel_in_hyp_mode();
 }
 
+static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry,
+                          int __unused)
+{
+       phys_addr_t idmap_addr = virt_to_phys(__hyp_idmap_text_start);
+
+       /*
+        * Activate the lower HYP offset only if:
+        * - the idmap doesn't clash with it,
+        * - the kernel is not running at EL2.
+        */
+       return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode();
+}
+
 static const struct arm64_cpu_capabilities arm64_features[] = {
        {
                .desc = "GIC system register CPU interface",
@@ -803,6 +816,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
                .field_pos = ID_AA64PFR0_EL0_SHIFT,
                .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT,
        },
+       {
+               .desc = "Reduced HYP mapping offset",
+               .capability = ARM64_HYP_OFFSET_LOW,
+               .def_scope = SCOPE_SYSTEM,
+               .matches = hyp_offset_low,
+       },
        {},
 };
 
index c4f26ef..9d2eff0 100644 (file)
@@ -36,6 +36,7 @@ config KVM
        select HAVE_KVM_IRQFD
        select KVM_ARM_VGIC_V3
        select KVM_ARM_PMU if HW_PERF_EVENTS
+       select HAVE_KVM_MSI
        ---help---
          Support hosting virtualized guest machines.
          We don't support KVM with 16K page tables yet, due to the multiple
@@ -54,13 +55,6 @@ config KVM_ARM_PMU
          Adds support for a virtual Performance Monitoring Unit (PMU) in
          virtual machines.
 
-config KVM_NEW_VGIC
-       bool "New VGIC implementation"
-       depends on KVM
-       default y
-        ---help---
-          uses the new VGIC implementation
-
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
index a7a958c..a5b9664 100644 (file)
@@ -20,7 +20,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
 kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
 kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
 
-ifeq ($(CONFIG_KVM_NEW_VGIC),y)
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-irqfd.o
@@ -30,12 +29,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o
-else
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o
-endif
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-its.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
 kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o
index 32fad75..3f9e157 100644 (file)
@@ -211,7 +211,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
 /**
  * kvm_arm_copy_reg_indices - get indices of all registers.
  *
- * We do core registers right here, then we apppend system regs.
+ * We do core registers right here, then we append system regs.
  */
 int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
 {
index a873a6d..6b29d3d 100644 (file)
@@ -53,10 +53,9 @@ __invalid:
        b       .
 
        /*
-        * x0: HYP boot pgd
-        * x1: HYP pgd
-        * x2: HYP stack
-        * x3: HYP vectors
+        * x0: HYP pgd
+        * x1: HYP stack
+        * x2: HYP vectors
         */
 __do_hyp_init:
 
@@ -110,71 +109,27 @@ __do_hyp_init:
        msr     sctlr_el2, x4
        isb
 
-       /* Skip the trampoline dance if we merged the boot and runtime PGDs */
-       cmp     x0, x1
-       b.eq    merged
-
-       /* MMU is now enabled. Get ready for the trampoline dance */
-       ldr     x4, =TRAMPOLINE_VA
-       adr     x5, target
-       bfi     x4, x5, #0, #PAGE_SHIFT
-       br      x4
-
-target: /* We're now in the trampoline code, switch page tables */
-       msr     ttbr0_el2, x1
-       isb
-
-       /* Invalidate the old TLBs */
-       tlbi    alle2
-       dsb     sy
-
-merged:
        /* Set the stack and new vectors */
+       kern_hyp_va     x1
+       mov     sp, x1
        kern_hyp_va     x2
-       mov     sp, x2
-       kern_hyp_va     x3
-       msr     vbar_el2, x3
+       msr     vbar_el2, x2
 
        /* Hello, World! */
        eret
 ENDPROC(__kvm_hyp_init)
 
        /*
-        * Reset kvm back to the hyp stub. This is the trampoline dance in
-        * reverse. If kvm used an extended idmap, __extended_idmap_trampoline
-        * calls this code directly in the idmap. In this case switching to the
-        * boot tables is a no-op.
-        *
-        * x0: HYP boot pgd
-        * x1: HYP phys_idmap_start
+        * Reset kvm back to the hyp stub.
         */
 ENTRY(__kvm_hyp_reset)
-       /* We're in trampoline code in VA, switch back to boot page tables */
-       msr     ttbr0_el2, x0
-       isb
-
-       /* Ensure the PA branch doesn't find a stale tlb entry or stale code. */
-       ic      iallu
-       tlbi    alle2
-       dsb     sy
-       isb
-
-       /* Branch into PA space */
-       adr     x0, 1f
-       bfi     x1, x0, #0, #PAGE_SHIFT
-       br      x1
-
        /* We're now in idmap, disable MMU */
-1:     mrs     x0, sctlr_el2
+       mrs     x0, sctlr_el2
        ldr     x1, =SCTLR_ELx_FLAGS
        bic     x0, x0, x1              // Clear SCTL_M and etc
        msr     sctlr_el2, x0
        isb
 
-       /* Invalidate the old TLBs */
-       tlbi    alle2
-       dsb     sy
-
        /* Install stub vectors */
        adr_l   x0, __hyp_stub_vectors
        msr     vbar_el2, x0
index 70254a6..ce9e5e5 100644 (file)
@@ -164,22 +164,3 @@ alternative_endif
 
        eret
 ENDPROC(__fpsimd_guest_restore)
-
-/*
- * When using the extended idmap, we don't have a trampoline page we can use
- * while we switch pages tables during __kvm_hyp_reset. Accessing the idmap
- * directly would be ideal, but if we're using the extended idmap then the
- * idmap is located above HYP_PAGE_OFFSET, and the address will be masked by
- * kvm_call_hyp using kern_hyp_va.
- *
- * x0: HYP boot pgd
- * x1: HYP phys_idmap_start
- */
-ENTRY(__extended_idmap_trampoline)
-       mov     x4, x1
-       adr_l   x3, __kvm_hyp_reset
-
-       /* insert __kvm_hyp_reset()s offset into phys_idmap_start */
-       bfi     x4, x3, #0, #PAGE_SHIFT
-       br      x4
-ENDPROC(__extended_idmap_trampoline)
index 2d87f36..f6d9694 100644 (file)
@@ -62,6 +62,21 @@ ENTRY(__vhe_hyp_call)
        isb
        ret
 ENDPROC(__vhe_hyp_call)
+
+/*
+ * Compute the idmap address of __kvm_hyp_reset based on the idmap
+ * start passed as a parameter, and jump there.
+ *
+ * x0: HYP phys_idmap_start
+ */
+ENTRY(__kvm_hyp_teardown)
+       mov     x4, x0
+       adr_l   x3, __kvm_hyp_reset
+
+       /* insert __kvm_hyp_reset()'s offset into phys_idmap_start */
+       bfi     x4, x3, #0, #PAGE_SHIFT
+       br      x4
+ENDPROC(__kvm_hyp_teardown)
        
 el1_sync:                              // Guest trapped into EL2
        save_x0_to_x3
index 4373997..ae7855f 100644 (file)
@@ -299,9 +299,16 @@ static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%
 
 static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par)
 {
-       unsigned long str_va = (unsigned long)__hyp_panic_string;
+       unsigned long str_va;
 
-       __hyp_do_panic(hyp_kern_va(str_va),
+       /*
+        * Force the panic string to be loaded from the literal pool,
+        * making sure it is a kernel address and not a PC-relative
+        * reference.
+        */
+       asm volatile("ldr %0, =__hyp_panic_string" : "=r" (str_va));
+
+       __hyp_do_panic(str_va,
                       spsr,  elr,
                       read_sysreg(esr_el2),   read_sysreg_el2(far),
                       read_sysreg(hpfar_el2), par,
index b1ad730..5bc4608 100644 (file)
@@ -65,7 +65,7 @@ static bool cpu_has_32bit_el1(void)
  * We currently assume that the number of HW registers is uniform
  * across all CPUs (see cpuinfo_sanity_check).
  */
-int kvm_arch_dev_ioctl_check_extension(long ext)
+int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
 {
        int r;
 
@@ -86,6 +86,12 @@ int kvm_arch_dev_ioctl_check_extension(long ext)
        case KVM_CAP_VCPU_ATTRIBUTES:
                r = 1;
                break;
+       case KVM_CAP_MSI_DEVID:
+               if (!kvm)
+                       r = -EINVAL;
+               else
+                       r = kvm->arch.vgic.msis_require_devid;
+               break;
        default:
                r = 0;
        }
@@ -98,7 +104,7 @@ int kvm_arch_dev_ioctl_check_extension(long ext)
  * @vcpu: The VCPU pointer
  *
  * This function finds the right table above and sets the registers on
- * the virtual CPU struct to their architectually defined reset
+ * the virtual CPU struct to their architecturally defined reset
  * values.
  */
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
@@ -132,31 +138,3 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
        /* Reset timer */
        return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
 }
-
-extern char __hyp_idmap_text_start[];
-
-unsigned long kvm_hyp_reset_entry(void)
-{
-       if (!__kvm_cpu_uses_extended_idmap()) {
-               unsigned long offset;
-
-               /*
-                * Find the address of __kvm_hyp_reset() in the trampoline page.
-                * This is present in the running page tables, and the boot page
-                * tables, so we call the code here to start the trampoline
-                * dance in reverse.
-                */
-               offset = (unsigned long)__kvm_hyp_reset
-                        - ((unsigned long)__hyp_idmap_text_start & PAGE_MASK);
-
-               return TRAMPOLINE_VA + offset;
-       } else {
-               /*
-                * KVM is running with merged page tables, which don't have the
-                * trampoline page mapped. We know the idmap is still mapped,
-                * but can't be called into directly. Use
-                * __extended_idmap_trampoline to do the call.
-                */
-               return (unsigned long)kvm_ksym_ref(__extended_idmap_trampoline);
-       }
-}
index a57d650..b0b225c 100644 (file)
@@ -1546,7 +1546,7 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu,
                                struct sys_reg_params *params)
 {
        u8 hsr_ec = kvm_vcpu_trap_get_class(vcpu);
-       int cp;
+       int cp = -1;
 
        switch(hsr_ec) {
        case ESR_ELx_EC_CP15_32:
@@ -1558,7 +1558,7 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu,
                cp = 14;
                break;
        default:
-               WARN_ON((cp = -1));
+               WARN_ON(1);
        }
 
        kvm_err("Unsupported guest CP%d access at: %08lx\n",
index ac91939..2986713 100644 (file)
@@ -1488,6 +1488,7 @@ config CPU_MIPS64_R2
        select CPU_SUPPORTS_HIGHMEM
        select CPU_SUPPORTS_HUGEPAGES
        select CPU_SUPPORTS_MSA
+       select HAVE_KVM
        help
          Choose this option to build a kernel for release 2 or later of the
          MIPS64 architecture.  Many modern embedded systems with a 64-bit
@@ -1505,6 +1506,7 @@ config CPU_MIPS64_R6
        select CPU_SUPPORTS_MSA
        select GENERIC_CSUM
        select MIPS_O32_FP64_SUPPORT if MIPS32_O32
+       select HAVE_KVM
        help
          Choose this option to build a kernel for release 6 or later of the
          MIPS64 architecture.  New MIPS processors, starting with the Warrior
index 3b0e51d..c5b04e7 100644 (file)
@@ -45,7 +45,7 @@
 /*
  * Returns the kernel segment base of a given address
  */
-#define KSEGX(a)               ((_ACAST32_ (a)) & 0xe0000000)
+#define KSEGX(a)               ((_ACAST32_(a)) & _ACAST32_(0xe0000000))
 
 /*
  * Returns the physical address of a CKSEGx / XKPHYS address
index 36a391d..b54bcad 100644 (file)
@@ -19,6 +19,9 @@
 #include <linux/threads.h>
 #include <linux/spinlock.h>
 
+#include <asm/inst.h>
+#include <asm/mipsregs.h>
+
 /* MIPS KVM register ids */
 #define MIPS_CP0_32(_R, _S)                                    \
        (KVM_REG_MIPS_CP0 | KVM_REG_SIZE_U32 | (8 * (_R) + (_S)))
 #define KVM_REG_MIPS_CP0_CONFIG7       MIPS_CP0_32(16, 7)
 #define KVM_REG_MIPS_CP0_XCONTEXT      MIPS_CP0_64(20, 0)
 #define KVM_REG_MIPS_CP0_ERROREPC      MIPS_CP0_64(30, 0)
+#define KVM_REG_MIPS_CP0_KSCRATCH1     MIPS_CP0_64(31, 2)
+#define KVM_REG_MIPS_CP0_KSCRATCH2     MIPS_CP0_64(31, 3)
+#define KVM_REG_MIPS_CP0_KSCRATCH3     MIPS_CP0_64(31, 4)
+#define KVM_REG_MIPS_CP0_KSCRATCH4     MIPS_CP0_64(31, 5)
+#define KVM_REG_MIPS_CP0_KSCRATCH5     MIPS_CP0_64(31, 6)
+#define KVM_REG_MIPS_CP0_KSCRATCH6     MIPS_CP0_64(31, 7)
 
 
 #define KVM_MAX_VCPUS          1
 
 
 
-/* Special address that contains the comm page, used for reducing # of traps */
-#define KVM_GUEST_COMMPAGE_ADDR                0x0
+/*
+ * Special address that contains the comm page, used for reducing # of traps.
+ * This needs to be within 32KB of 0x0 (so the zero register can be used), but
+ * preferably not at 0x0 so that most kernel NULL pointer dereferences can be
+ * caught.
+ */
+#define KVM_GUEST_COMMPAGE_ADDR                ((PAGE_SIZE > 0x8000) ? 0 : \
+                                        (0x8000 - PAGE_SIZE))
 
 #define KVM_GUEST_KERNEL_MODE(vcpu)    ((kvm_read_c0_guest_status(vcpu->arch.cop0) & (ST0_EXL | ST0_ERL)) || \
                                        ((kvm_read_c0_guest_status(vcpu->arch.cop0) & KSU_USER) == 0))
 #define KVM_INVALID_ADDR               0xdeadbeef
 
 extern atomic_t kvm_mips_instance;
-extern kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
-extern void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
-extern bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
 
 struct kvm_vm_stat {
        u32 remote_tlb_flush;
@@ -126,28 +138,6 @@ struct kvm_vcpu_stat {
        u32 halt_wakeup;
 };
 
-enum kvm_mips_exit_types {
-       WAIT_EXITS,
-       CACHE_EXITS,
-       SIGNAL_EXITS,
-       INT_EXITS,
-       COP_UNUSABLE_EXITS,
-       TLBMOD_EXITS,
-       TLBMISS_LD_EXITS,
-       TLBMISS_ST_EXITS,
-       ADDRERR_ST_EXITS,
-       ADDRERR_LD_EXITS,
-       SYSCALL_EXITS,
-       RESVD_INST_EXITS,
-       BREAK_INST_EXITS,
-       TRAP_INST_EXITS,
-       MSA_FPE_EXITS,
-       FPE_EXITS,
-       MSA_DISABLED_EXITS,
-       FLUSH_DCACHE_EXITS,
-       MAX_KVM_MIPS_EXIT_TYPES
-};
-
 struct kvm_arch_memory_slot {
 };
 
@@ -215,73 +205,6 @@ struct mips_coproc {
 #define MIPS_CP0_CONFIG4_SEL   4
 #define MIPS_CP0_CONFIG5_SEL   5
 
-/* Config0 register bits */
-#define CP0C0_M                        31
-#define CP0C0_K23              28
-#define CP0C0_KU               25
-#define CP0C0_MDU              20
-#define CP0C0_MM               17
-#define CP0C0_BM               16
-#define CP0C0_BE               15
-#define CP0C0_AT               13
-#define CP0C0_AR               10
-#define CP0C0_MT               7
-#define CP0C0_VI               3
-#define CP0C0_K0               0
-
-/* Config1 register bits */
-#define CP0C1_M                        31
-#define CP0C1_MMU              25
-#define CP0C1_IS               22
-#define CP0C1_IL               19
-#define CP0C1_IA               16
-#define CP0C1_DS               13
-#define CP0C1_DL               10
-#define CP0C1_DA               7
-#define CP0C1_C2               6
-#define CP0C1_MD               5
-#define CP0C1_PC               4
-#define CP0C1_WR               3
-#define CP0C1_CA               2
-#define CP0C1_EP               1
-#define CP0C1_FP               0
-
-/* Config2 Register bits */
-#define CP0C2_M                        31
-#define CP0C2_TU               28
-#define CP0C2_TS               24
-#define CP0C2_TL               20
-#define CP0C2_TA               16
-#define CP0C2_SU               12
-#define CP0C2_SS               8
-#define CP0C2_SL               4
-#define CP0C2_SA               0
-
-/* Config3 Register bits */
-#define CP0C3_M                        31
-#define CP0C3_ISA_ON_EXC       16
-#define CP0C3_ULRI             13
-#define CP0C3_DSPP             10
-#define CP0C3_LPA              7
-#define CP0C3_VEIC             6
-#define CP0C3_VInt             5
-#define CP0C3_SP               4
-#define CP0C3_MT               2
-#define CP0C3_SM               1
-#define CP0C3_TL               0
-
-/* MMU types, the first four entries have the same layout as the
-   CP0C0_MT field.  */
-enum mips_mmu_types {
-       MMU_TYPE_NONE,
-       MMU_TYPE_R4000,
-       MMU_TYPE_RESERVED,
-       MMU_TYPE_FMT,
-       MMU_TYPE_R3000,
-       MMU_TYPE_R6000,
-       MMU_TYPE_R8000
-};
-
 /* Resume Flags */
 #define RESUME_FLAG_DR         (1<<0)  /* Reload guest nonvolatile state? */
 #define RESUME_FLAG_HOST       (1<<1)  /* Resume host? */
@@ -298,11 +221,6 @@ enum emulation_result {
        EMULATE_PRIV_FAIL,
 };
 
-#define MIPS3_PG_G     0x00000001 /* Global; ignore ASID if in lo0 & lo1 */
-#define MIPS3_PG_V     0x00000002 /* Valid */
-#define MIPS3_PG_NV    0x00000000
-#define MIPS3_PG_D     0x00000004 /* Dirty */
-
 #define mips3_paddr_to_tlbpfn(x) \
        (((unsigned long)(x) >> MIPS3_PG_SHIFT) & MIPS3_PG_FRAME)
 #define mips3_tlbpfn_to_paddr(x) \
@@ -313,13 +231,11 @@ enum emulation_result {
 
 #define VPN2_MASK              0xffffe000
 #define KVM_ENTRYHI_ASID       MIPS_ENTRYHI_ASID
-#define TLB_IS_GLOBAL(x)       (((x).tlb_lo0 & MIPS3_PG_G) &&          \
-                                ((x).tlb_lo1 & MIPS3_PG_G))
+#define TLB_IS_GLOBAL(x)       ((x).tlb_lo[0] & (x).tlb_lo[1] & ENTRYLO_G)
 #define TLB_VPN2(x)            ((x).tlb_hi & VPN2_MASK)
 #define TLB_ASID(x)            ((x).tlb_hi & KVM_ENTRYHI_ASID)
-#define TLB_IS_VALID(x, va)    (((va) & (1 << PAGE_SHIFT))             \
-                                ? ((x).tlb_lo1 & MIPS3_PG_V)           \
-                                : ((x).tlb_lo0 & MIPS3_PG_V))
+#define TLB_LO_IDX(x, va)      (((va) >> PAGE_SHIFT) & 1)
+#define TLB_IS_VALID(x, va)    ((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_V)
 #define TLB_HI_VPN2_HIT(x, y)  ((TLB_VPN2(x) & ~(x).tlb_mask) ==       \
                                 ((y) & VPN2_MASK & ~(x).tlb_mask))
 #define TLB_HI_ASID_HIT(x, y)  (TLB_IS_GLOBAL(x) ||                    \
@@ -328,26 +244,23 @@ enum emulation_result {
 struct kvm_mips_tlb {
        long tlb_mask;
        long tlb_hi;
-       long tlb_lo0;
-       long tlb_lo1;
+       long tlb_lo[2];
 };
 
-#define KVM_MIPS_FPU_FPU       0x1
-#define KVM_MIPS_FPU_MSA       0x2
+#define KVM_MIPS_AUX_FPU       0x1
+#define KVM_MIPS_AUX_MSA       0x2
 
 #define KVM_MIPS_GUEST_TLB_SIZE        64
 struct kvm_vcpu_arch {
-       void *host_ebase, *guest_ebase;
+       void *guest_ebase;
        int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
        unsigned long host_stack;
        unsigned long host_gp;
 
        /* Host CP0 registers used when handling exits from guest */
        unsigned long host_cp0_badvaddr;
-       unsigned long host_cp0_cause;
        unsigned long host_cp0_epc;
-       unsigned long host_cp0_entryhi;
-       uint32_t guest_inst;
+       u32 host_cp0_cause;
 
        /* GPRS */
        unsigned long gprs[32];
@@ -357,8 +270,8 @@ struct kvm_vcpu_arch {
 
        /* FPU State */
        struct mips_fpu_struct fpu;
-       /* Which FPU state is loaded (KVM_MIPS_FPU_*) */
-       unsigned int fpu_inuse;
+       /* Which auxiliary state is loaded (KVM_MIPS_AUX_*) */
+       unsigned int aux_inuse;
 
        /* COP0 State */
        struct mips_coproc *cop0;
@@ -370,11 +283,11 @@ struct kvm_vcpu_arch {
 
        struct hrtimer comparecount_timer;
        /* Count timer control KVM register */
-       uint32_t count_ctl;
+       u32 count_ctl;
        /* Count bias from the raw time */
-       uint32_t count_bias;
+       u32 count_bias;
        /* Frequency of timer in Hz */
-       uint32_t count_hz;
+       u32 count_hz;
        /* Dynamic nanosecond bias (multiple of count_period) to avoid overflow */
        s64 count_dyn_bias;
        /* Resume time */
@@ -388,7 +301,7 @@ struct kvm_vcpu_arch {
        /* Bitmask of pending exceptions to be cleared */
        unsigned long pending_exceptions_clr;
 
-       unsigned long pending_load_cause;
+       u32 pending_load_cause;
 
        /* Save/Restore the entryhi register when are are preempted/scheduled back in */
        unsigned long preempt_entryhi;
@@ -397,8 +310,8 @@ struct kvm_vcpu_arch {
        struct kvm_mips_tlb guest_tlb[KVM_MIPS_GUEST_TLB_SIZE];
 
        /* Cached guest kernel/user ASIDs */
-       uint32_t guest_user_asid[NR_CPUS];
-       uint32_t guest_kernel_asid[NR_CPUS];
+       u32 guest_user_asid[NR_CPUS];
+       u32 guest_kernel_asid[NR_CPUS];
        struct mm_struct guest_kernel_mm, guest_user_mm;
 
        int last_sched_cpu;
@@ -408,6 +321,7 @@ struct kvm_vcpu_arch {
 
        u8 fpu_enabled;
        u8 msa_enabled;
+       u8 kscratch_enabled;
 };
 
 
@@ -461,6 +375,18 @@ struct kvm_vcpu_arch {
 #define kvm_write_c0_guest_config7(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][7] = (val))
 #define kvm_read_c0_guest_errorepc(cop0)       (cop0->reg[MIPS_CP0_ERROR_PC][0])
 #define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val))
+#define kvm_read_c0_guest_kscratch1(cop0)      (cop0->reg[MIPS_CP0_DESAVE][2])
+#define kvm_read_c0_guest_kscratch2(cop0)      (cop0->reg[MIPS_CP0_DESAVE][3])
+#define kvm_read_c0_guest_kscratch3(cop0)      (cop0->reg[MIPS_CP0_DESAVE][4])
+#define kvm_read_c0_guest_kscratch4(cop0)      (cop0->reg[MIPS_CP0_DESAVE][5])
+#define kvm_read_c0_guest_kscratch5(cop0)      (cop0->reg[MIPS_CP0_DESAVE][6])
+#define kvm_read_c0_guest_kscratch6(cop0)      (cop0->reg[MIPS_CP0_DESAVE][7])
+#define kvm_write_c0_guest_kscratch1(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][2] = (val))
+#define kvm_write_c0_guest_kscratch2(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][3] = (val))
+#define kvm_write_c0_guest_kscratch3(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][4] = (val))
+#define kvm_write_c0_guest_kscratch4(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][5] = (val))
+#define kvm_write_c0_guest_kscratch5(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][6] = (val))
+#define kvm_write_c0_guest_kscratch6(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][7] = (val))
 
 /*
  * Some of the guest registers may be modified asynchronously (e.g. from a
@@ -474,7 +400,7 @@ static inline void _kvm_atomic_set_c0_guest_reg(unsigned long *reg,
        unsigned long temp;
        do {
                __asm__ __volatile__(
-               "       .set    mips3                           \n"
+               "       .set    "MIPS_ISA_ARCH_LEVEL"           \n"
                "       " __LL "%0, %1                          \n"
                "       or      %0, %2                          \n"
                "       " __SC  "%0, %1                         \n"
@@ -490,7 +416,7 @@ static inline void _kvm_atomic_clear_c0_guest_reg(unsigned long *reg,
        unsigned long temp;
        do {
                __asm__ __volatile__(
-               "       .set    mips3                           \n"
+               "       .set    "MIPS_ISA_ARCH_LEVEL"           \n"
                "       " __LL "%0, %1                          \n"
                "       and     %0, %2                          \n"
                "       " __SC  "%0, %1                         \n"
@@ -507,7 +433,7 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg,
        unsigned long temp;
        do {
                __asm__ __volatile__(
-               "       .set    mips3                           \n"
+               "       .set    "MIPS_ISA_ARCH_LEVEL"           \n"
                "       " __LL "%0, %1                          \n"
                "       and     %0, %2                          \n"
                "       or      %0, %3                          \n"
@@ -542,7 +468,7 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg,
 
 static inline bool kvm_mips_guest_can_have_fpu(struct kvm_vcpu_arch *vcpu)
 {
-       return (!__builtin_constant_p(cpu_has_fpu) || cpu_has_fpu) &&
+       return (!__builtin_constant_p(raw_cpu_has_fpu) || raw_cpu_has_fpu) &&
                vcpu->fpu_enabled;
 }
 
@@ -589,9 +515,11 @@ struct kvm_mips_callbacks {
        void (*dequeue_io_int)(struct kvm_vcpu *vcpu,
                               struct kvm_mips_interrupt *irq);
        int (*irq_deliver)(struct kvm_vcpu *vcpu, unsigned int priority,
-                          uint32_t cause);
+                          u32 cause);
        int (*irq_clear)(struct kvm_vcpu *vcpu, unsigned int priority,
-                        uint32_t cause);
+                        u32 cause);
+       unsigned long (*num_regs)(struct kvm_vcpu *vcpu);
+       int (*copy_reg_indices)(struct kvm_vcpu *vcpu, u64 __user *indices);
        int (*get_one_reg)(struct kvm_vcpu *vcpu,
                           const struct kvm_one_reg *reg, s64 *v);
        int (*set_one_reg)(struct kvm_vcpu *vcpu,
@@ -605,8 +533,13 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks);
 /* Debug: dump vcpu state */
 int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu);
 
-/* Trampoline ASM routine to start running in "Guest" context */
-extern int __kvm_mips_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu);
+extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu);
+
+/* Building of entry/exception code */
+int kvm_mips_entry_setup(void);
+void *kvm_mips_build_vcpu_run(void *addr);
+void *kvm_mips_build_exception(void *addr, void *handler);
+void *kvm_mips_build_exit(void *addr);
 
 /* FPU/MSA context management */
 void __kvm_save_fpu(struct kvm_vcpu_arch *vcpu);
@@ -622,11 +555,11 @@ void kvm_drop_fpu(struct kvm_vcpu *vcpu);
 void kvm_lose_fpu(struct kvm_vcpu *vcpu);
 
 /* TLB handling */
-uint32_t kvm_get_kernel_asid(struct kvm_vcpu *vcpu);
+u32 kvm_get_kernel_asid(struct kvm_vcpu *vcpu);
 
-uint32_t kvm_get_user_asid(struct kvm_vcpu *vcpu);
+u32 kvm_get_user_asid(struct kvm_vcpu *vcpu);
 
-uint32_t kvm_get_commpage_asid (struct kvm_vcpu *vcpu);
+u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu);
 
 extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr,
                                           struct kvm_vcpu *vcpu);
@@ -635,22 +568,24 @@ extern int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
                                              struct kvm_vcpu *vcpu);
 
 extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
-                                               struct kvm_mips_tlb *tlb,
-                                               unsigned long *hpa0,
-                                               unsigned long *hpa1);
+                                               struct kvm_mips_tlb *tlb);
 
-extern enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
-                                                    uint32_t *opc,
+extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
+                                                    u32 *opc,
                                                     struct kvm_run *run,
                                                     struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause,
-                                                   uint32_t *opc,
+extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause,
+                                                   u32 *opc,
                                                    struct kvm_run *run,
                                                    struct kvm_vcpu *vcpu);
 
 extern void kvm_mips_dump_host_tlbs(void);
 extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu);
+extern int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
+                                  unsigned long entrylo0,
+                                  unsigned long entrylo1,
+                                  int flush_dcache_mask);
 extern void kvm_mips_flush_host_tlb(int skip_kseg0);
 extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi);
 
@@ -667,90 +602,90 @@ extern void kvm_mips_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 extern void kvm_mips_vcpu_put(struct kvm_vcpu *vcpu);
 
 /* Emulation */
-uint32_t kvm_get_inst(uint32_t *opc, struct kvm_vcpu *vcpu);
-enum emulation_result update_pc(struct kvm_vcpu *vcpu, uint32_t cause);
+u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu);
+enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause);
 
-extern enum emulation_result kvm_mips_emulate_inst(unsigned long cause,
-                                                  uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_inst(u32 cause,
+                                                  u32 *opc,
                                                   struct kvm_run *run,
                                                   struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
-                                                     uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_syscall(u32 cause,
+                                                     u32 *opc,
                                                      struct kvm_run *run,
                                                      struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
-                                                        uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
+                                                        u32 *opc,
                                                         struct kvm_run *run,
                                                         struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
-                                                       uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
+                                                       u32 *opc,
                                                        struct kvm_run *run,
                                                        struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
-                                                        uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
+                                                        u32 *opc,
                                                         struct kvm_run *run,
                                                         struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
-                                                       uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
+                                                       u32 *opc,
                                                        struct kvm_run *run,
                                                        struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
-                                                    uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
+                                                    u32 *opc,
                                                     struct kvm_run *run,
                                                     struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
-                                                     uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause,
+                                                     u32 *opc,
                                                      struct kvm_run *run,
                                                      struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_handle_ri(unsigned long cause,
-                                               uint32_t *opc,
+extern enum emulation_result kvm_mips_handle_ri(u32 cause,
+                                               u32 *opc,
                                                struct kvm_run *run,
                                                struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
-                                                    uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_ri_exc(u32 cause,
+                                                    u32 *opc,
                                                     struct kvm_run *run,
                                                     struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
-                                                    uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_bp_exc(u32 cause,
+                                                    u32 *opc,
                                                     struct kvm_run *run,
                                                     struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
-                                                      uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_trap_exc(u32 cause,
+                                                      u32 *opc,
                                                       struct kvm_run *run,
                                                       struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
-                                                        uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause,
+                                                        u32 *opc,
                                                         struct kvm_run *run,
                                                         struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
-                                                     uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause,
+                                                     u32 *opc,
                                                      struct kvm_run *run,
                                                      struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
-                                                        uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause,
+                                                        u32 *opc,
                                                         struct kvm_run *run,
                                                         struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
                                                         struct kvm_run *run);
 
-uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu);
-void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count);
-void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack);
+u32 kvm_mips_read_count(struct kvm_vcpu *vcpu);
+void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count);
+void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack);
 void kvm_mips_init_count(struct kvm_vcpu *vcpu);
 int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl);
 int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume);
@@ -759,27 +694,27 @@ void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu);
 void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu);
 enum hrtimer_restart kvm_mips_count_timeout(struct kvm_vcpu *vcpu);
 
-enum emulation_result kvm_mips_check_privilege(unsigned long cause,
-                                              uint32_t *opc,
+enum emulation_result kvm_mips_check_privilege(u32 cause,
+                                              u32 *opc,
                                               struct kvm_run *run,
                                               struct kvm_vcpu *vcpu);
 
-enum emulation_result kvm_mips_emulate_cache(uint32_t inst,
-                                            uint32_t *opc,
-                                            uint32_t cause,
+enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
+                                            u32 *opc,
+                                            u32 cause,
                                             struct kvm_run *run,
                                             struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_CP0(uint32_t inst,
-                                          uint32_t *opc,
-                                          uint32_t cause,
+enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
+                                          u32 *opc,
+                                          u32 cause,
                                           struct kvm_run *run,
                                           struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_store(uint32_t inst,
-                                            uint32_t cause,
+enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
+                                            u32 cause,
                                             struct kvm_run *run,
                                             struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_load(uint32_t inst,
-                                           uint32_t cause,
+enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
+                                           u32 cause,
                                            struct kvm_run *run,
                                            struct kvm_vcpu *vcpu);
 
@@ -789,13 +724,13 @@ unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu);
 unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu);
 
 /* Dynamic binary translation */
-extern int kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc,
-                                     struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc,
+extern int kvm_mips_trans_cache_index(union mips_instruction inst,
+                                     u32 *opc, struct kvm_vcpu *vcpu);
+extern int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc,
                                   struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_mfc0(uint32_t inst, uint32_t *opc,
+extern int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc,
                               struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_mtc0(uint32_t inst, uint32_t *opc,
+extern int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc,
                               struct kvm_vcpu *vcpu);
 
 /* Misc */
index d68e685..bd8b9bb 100644 (file)
@@ -55,7 +55,7 @@
 #define cpu_has_mipsmt         0
 #define cpu_has_vint           0
 #define cpu_has_veic           0
-#define cpu_hwrena_impl_bits   0xc0000000
+#define cpu_hwrena_impl_bits   (MIPS_HWRENA_IMPL1 | MIPS_HWRENA_IMPL2)
 #define cpu_has_wsbh            1
 
 #define cpu_has_rixi           (cpu_data[0].cputype != CPU_CAVIUM_OCTEON)
index e1ca65c..def9d8d 100644 (file)
@@ -53,7 +53,7 @@
 #define CP0_SEGCTL2 $5, 4
 #define CP0_WIRED $6
 #define CP0_INFO $7
-#define CP0_HWRENA $7, 0
+#define CP0_HWRENA $7
 #define CP0_BADVADDR $8
 #define CP0_BADINSTR $8, 1
 #define CP0_COUNT $9
 #define TX49_CONF_CWFON                (_ULCAST_(1) << 27)
 
 /* Bits specific to the MIPS32/64 PRA. */
+#define MIPS_CONF_VI           (_ULCAST_(1) <<  3)
 #define MIPS_CONF_MT           (_ULCAST_(7) <<  7)
 #define MIPS_CONF_MT_TLB       (_ULCAST_(1) <<  7)
 #define MIPS_CONF_MT_FTLB      (_ULCAST_(4) <<  7)
 #define MIPS_CDMMBASE_ADDR_SHIFT 11
 #define MIPS_CDMMBASE_ADDR_START 15
 
+/* RDHWR register numbers */
+#define MIPS_HWR_CPUNUM                0       /* CPU number */
+#define MIPS_HWR_SYNCISTEP     1       /* SYNCI step size */
+#define MIPS_HWR_CC            2       /* Cycle counter */
+#define MIPS_HWR_CCRES         3       /* Cycle counter resolution */
+#define MIPS_HWR_ULR           29      /* UserLocal */
+#define MIPS_HWR_IMPL1         30      /* Implementation dependent */
+#define MIPS_HWR_IMPL2         31      /* Implementation dependent */
+
+/* Bits in HWREna register */
+#define MIPS_HWRENA_CPUNUM     (_ULCAST_(1) << MIPS_HWR_CPUNUM)
+#define MIPS_HWRENA_SYNCISTEP  (_ULCAST_(1) << MIPS_HWR_SYNCISTEP)
+#define MIPS_HWRENA_CC         (_ULCAST_(1) << MIPS_HWR_CC)
+#define MIPS_HWRENA_CCRES      (_ULCAST_(1) << MIPS_HWR_CCRES)
+#define MIPS_HWRENA_ULR                (_ULCAST_(1) << MIPS_HWR_ULR)
+#define MIPS_HWRENA_IMPL1      (_ULCAST_(1) << MIPS_HWR_IMPL1)
+#define MIPS_HWRENA_IMPL2      (_ULCAST_(1) << MIPS_HWR_IMPL2)
+
 /*
  * Bitfields in the TX39 family CP0 Configuration Register 3
  */
index d7bfdeb..4f5279a 100644 (file)
@@ -21,6 +21,7 @@ extern void *set_vi_handler(int n, vi_handler_t addr);
 
 extern void *set_except_vector(int n, void *addr);
 extern unsigned long ebase;
+extern unsigned int hwrena;
 extern void per_cpu_trap_init(bool);
 extern void cpu_cache_init(void);
 
index b6ecfee..f7929f6 100644 (file)
@@ -104,8 +104,13 @@ Ip_u1s2(_bltz);
 Ip_u1s2(_bltzl);
 Ip_u1u2s3(_bne);
 Ip_u2s3u1(_cache);
+Ip_u1u2(_cfc1);
+Ip_u2u1(_cfcmsa);
+Ip_u1u2(_ctc1);
+Ip_u2u1(_ctcmsa);
 Ip_u2u1s3(_daddiu);
 Ip_u3u1u2(_daddu);
+Ip_u1(_di);
 Ip_u2u1msbu3(_dins);
 Ip_u2u1msbu3(_dinsm);
 Ip_u1u2(_divu);
@@ -141,6 +146,8 @@ Ip_u1(_mfhi);
 Ip_u1(_mflo);
 Ip_u1u2u3(_mtc0);
 Ip_u1u2u3(_mthc0);
+Ip_u1(_mthi);
+Ip_u1(_mtlo);
 Ip_u3u1u2(_mul);
 Ip_u3u1u2(_or);
 Ip_u2u1u3(_ori);
index 8051f9a..77429d1 100644 (file)
 enum major_op {
        spec_op, bcond_op, j_op, jal_op,
        beq_op, bne_op, blez_op, bgtz_op,
-       addi_op, cbcond0_op = addi_op, addiu_op, slti_op, sltiu_op,
+       addi_op, pop10_op = addi_op, addiu_op, slti_op, sltiu_op, /* pop10_op == addi_op == 8 (octal 10) */
        andi_op, ori_op, xori_op, lui_op,
        cop0_op, cop1_op, cop2_op, cop1x_op,
        beql_op, bnel_op, blezl_op, bgtzl_op,
-       daddi_op, cbcond1_op = daddi_op, daddiu_op, ldl_op, ldr_op,
+       daddi_op, pop30_op = daddi_op, daddiu_op, ldl_op, ldr_op, /* pop30_op == daddi_op == 24 (octal 30) */
        spec2_op, jalx_op, mdmx_op, msa_op = mdmx_op, spec3_op,
        lb_op, lh_op, lwl_op, lw_op,
        lbu_op, lhu_op, lwr_op, lwu_op,
        sb_op, sh_op, swl_op, sw_op,
        sdl_op, sdr_op, swr_op, cache_op,
        ll_op, lwc1_op, lwc2_op, bc6_op = lwc2_op, pref_op,
-       lld_op, ldc1_op, ldc2_op, beqzcjic_op = ldc2_op, ld_op,
+       lld_op, ldc1_op, ldc2_op, pop66_op = ldc2_op, ld_op, /* pop66_op == ldc2_op == 54 (octal 66) */
        sc_op, swc1_op, swc2_op, balc6_op = swc2_op, major_3b_op,
-       scd_op, sdc1_op, sdc2_op, bnezcjialc_op = sdc2_op, sd_op
+       scd_op, sdc1_op, sdc2_op, pop76_op = sdc2_op, sd_op /* pop76_op == sdc2_op == 62 (octal 76) */
 };
 
 /*
@@ -92,6 +92,50 @@ enum spec3_op {
        rdhwr_op  = 0x3b
 };
 
+/*
+ * Bits 10-6 minor opcode for r6 spec mult/div encodings
+ *
+ * One enum is provided per mult/div family (mult, multu, div, divu and
+ * their d-prefixed counterparts). Every family uses the same three values:
+ * 0x0 for the entry named after the family itself, and 0x2 / 0x3 for the
+ * two further entries (e.g. mul/muh for mult, div6/mod for div).
+ * NOTE(review): presumably 0x0 is the pre-R6 encoding and 0x2/0x3 select
+ * the R6 variants - confirm against the MIPS R6 instruction set manual.
+ */
+enum mult_op {
+       mult_mult_op = 0x0,
+       mult_mul_op = 0x2,
+       mult_muh_op = 0x3,
+};
+enum multu_op {
+       multu_multu_op = 0x0,
+       multu_mulu_op = 0x2,
+       multu_muhu_op = 0x3,
+};
+enum div_op {
+       div_div_op = 0x0,
+       div_div6_op = 0x2,
+       div_mod_op = 0x3,
+};
+enum divu_op {
+       divu_divu_op = 0x0,
+       divu_divu6_op = 0x2,
+       divu_modu_op = 0x3,
+};
+enum dmult_op {
+       dmult_dmult_op = 0x0,
+       dmult_dmul_op = 0x2,
+       dmult_dmuh_op = 0x3,
+};
+enum dmultu_op {
+       dmultu_dmultu_op = 0x0,
+       dmultu_dmulu_op = 0x2,
+       dmultu_dmuhu_op = 0x3,
+};
+enum ddiv_op {
+       ddiv_ddiv_op = 0x0,
+       ddiv_ddiv6_op = 0x2,
+       ddiv_dmod_op = 0x3,
+};
+enum ddivu_op {
+       ddivu_ddivu_op = 0x0,
+       ddivu_ddivu6_op = 0x2,
+       ddivu_dmodu_op = 0x3,
+};
+
 /*
  * rt field of bcond opcodes.
  */
@@ -103,7 +147,7 @@ enum rt_op {
        bltzal_op, bgezal_op, bltzall_op, bgezall_op,
        rt_op_0x14, rt_op_0x15, rt_op_0x16, rt_op_0x17,
        rt_op_0x18, rt_op_0x19, rt_op_0x1a, rt_op_0x1b,
-       bposge32_op, rt_op_0x1d, rt_op_0x1e, rt_op_0x1f
+       bposge32_op, rt_op_0x1d, rt_op_0x1e, synci_op
 };
 
 /*
@@ -237,6 +281,21 @@ enum bshfl_func {
        seh_op  = 0x18,
 };
 
+/*
+ * MSA minor opcodes.
+ */
+enum msa_func {
+       msa_elm_op = 0x19,
+};
+
+/*
+ * MSA ELM opcodes.
+ */
+enum msa_elm {
+       msa_ctc_op = 0x3e,
+       msa_cfc_op = 0x7e,
+};
+
 /*
  * func field for MSA MI10 format.
  */
@@ -264,7 +323,7 @@ enum mm_major_op {
        mm_pool32b_op, mm_pool16b_op, mm_lhu16_op, mm_andi16_op,
        mm_addiu32_op, mm_lhu32_op, mm_sh32_op, mm_lh32_op,
        mm_pool32i_op, mm_pool16c_op, mm_lwsp16_op, mm_pool16d_op,
-       mm_ori32_op, mm_pool32f_op, mm_reserved1_op, mm_reserved2_op,
+       mm_ori32_op, mm_pool32f_op, mm_pool32s_op, mm_reserved2_op,
        mm_pool32c_op, mm_lwgp16_op, mm_lw16_op, mm_pool16e_op,
        mm_xori32_op, mm_jals32_op, mm_addiupc_op, mm_reserved3_op,
        mm_reserved4_op, mm_pool16f_op, mm_sb16_op, mm_beqz16_op,
@@ -360,7 +419,10 @@ enum mm_32axf_minor_op {
        mm_mflo32_op = 0x075,
        mm_jalrhb_op = 0x07c,
        mm_tlbwi_op = 0x08d,
+       mm_mthi32_op = 0x0b5,
        mm_tlbwr_op = 0x0cd,
+       mm_mtlo32_op = 0x0f5,
+       mm_di_op = 0x11d,
        mm_jalrs_op = 0x13c,
        mm_jalrshb_op = 0x17c,
        mm_sync_op = 0x1ad,
@@ -478,6 +540,13 @@ enum mm_32f_73_minor_op {
        mm_fcvts1_op = 0xed,
 };
 
+/*
+ * (microMIPS) POOL32S minor opcodes.
+ */
+enum mm_32s_minor_op {
+       mm_32s_elm_op = 0x16,
+};
+
 /*
  * (microMIPS) POOL16C minor opcodes.
  */
@@ -586,6 +655,36 @@ struct r_format {                  /* Register format */
        ;))))))
 };
 
+struct c0r_format {                    /* C0 register format */
+       __BITFIELD_FIELD(unsigned int opcode : 6, /* major opcode */
+       __BITFIELD_FIELD(unsigned int rs : 5,
+       __BITFIELD_FIELD(unsigned int rt : 5, /* GPR operand; KVM dyntrans copies it into the replacement insn */
+       __BITFIELD_FIELD(unsigned int rd : 5, /* CP0 register number; KVM dyntrans indexes cop0.reg[rd][sel] */
+       __BITFIELD_FIELD(unsigned int z: 8, /* NOTE(review): presumably must-be-zero bits - confirm */
+       __BITFIELD_FIELD(unsigned int sel : 3, /* CP0 register select; field widths total 32 bits */
+       ;))))))
+};
+
+struct mfmc0_format {                  /* MFMC0 register format */
+       __BITFIELD_FIELD(unsigned int opcode : 6, /* major opcode */
+       __BITFIELD_FIELD(unsigned int rs : 5,
+       __BITFIELD_FIELD(unsigned int rt : 5,
+       __BITFIELD_FIELD(unsigned int rd : 5,
+       __BITFIELD_FIELD(unsigned int re : 5,
+       __BITFIELD_FIELD(unsigned int sc : 1, /* NOTE(review): presumably the set/clear bit (EI vs DI) - confirm */
+       __BITFIELD_FIELD(unsigned int : 2, /* unnamed bit-field: padding only */
+       __BITFIELD_FIELD(unsigned int sel : 3, /* field widths total 32 bits */
+       ;))))))))
+};
+
+struct co_format {                     /* C0 CO format */
+       __BITFIELD_FIELD(unsigned int opcode : 6, /* major opcode */
+       __BITFIELD_FIELD(unsigned int co : 1, /* single-bit CO flag */
+       __BITFIELD_FIELD(unsigned int code : 19,
+       __BITFIELD_FIELD(unsigned int func : 6, /* field widths total 32 bits */
+       ;))))
+};
+
 struct p_format {              /* Performance counter format (R10000) */
        __BITFIELD_FIELD(unsigned int opcode : 6,
        __BITFIELD_FIELD(unsigned int rs : 5,
@@ -937,6 +1036,9 @@ union mips_instruction {
        struct u_format u_format;
        struct c_format c_format;
        struct r_format r_format;
+       struct c0r_format c0r_format;
+       struct mfmc0_format mfmc0_format;
+       struct co_format co_format;
        struct p_format p_format;
        struct f_format f_format;
        struct ma_format ma_format;
index 1ea973b..fae2f94 100644 (file)
@@ -339,71 +339,9 @@ void output_pm_defines(void)
 }
 #endif
 
-void output_cpuinfo_defines(void)
-{
-       COMMENT(" MIPS cpuinfo offsets. ");
-       DEFINE(CPUINFO_SIZE, sizeof(struct cpuinfo_mips));
-#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
-       OFFSET(CPUINFO_ASID_MASK, cpuinfo_mips, asid_mask);
-#endif
-}
-
 void output_kvm_defines(void)
 {
        COMMENT(" KVM/MIPS Specfic offsets. ");
-       DEFINE(VCPU_ARCH_SIZE, sizeof(struct kvm_vcpu_arch));
-       OFFSET(VCPU_RUN, kvm_vcpu, run);
-       OFFSET(VCPU_HOST_ARCH, kvm_vcpu, arch);
-
-       OFFSET(VCPU_HOST_EBASE, kvm_vcpu_arch, host_ebase);
-       OFFSET(VCPU_GUEST_EBASE, kvm_vcpu_arch, guest_ebase);
-
-       OFFSET(VCPU_HOST_STACK, kvm_vcpu_arch, host_stack);
-       OFFSET(VCPU_HOST_GP, kvm_vcpu_arch, host_gp);
-
-       OFFSET(VCPU_HOST_CP0_BADVADDR, kvm_vcpu_arch, host_cp0_badvaddr);
-       OFFSET(VCPU_HOST_CP0_CAUSE, kvm_vcpu_arch, host_cp0_cause);
-       OFFSET(VCPU_HOST_EPC, kvm_vcpu_arch, host_cp0_epc);
-       OFFSET(VCPU_HOST_ENTRYHI, kvm_vcpu_arch, host_cp0_entryhi);
-
-       OFFSET(VCPU_GUEST_INST, kvm_vcpu_arch, guest_inst);
-
-       OFFSET(VCPU_R0, kvm_vcpu_arch, gprs[0]);
-       OFFSET(VCPU_R1, kvm_vcpu_arch, gprs[1]);
-       OFFSET(VCPU_R2, kvm_vcpu_arch, gprs[2]);
-       OFFSET(VCPU_R3, kvm_vcpu_arch, gprs[3]);
-       OFFSET(VCPU_R4, kvm_vcpu_arch, gprs[4]);
-       OFFSET(VCPU_R5, kvm_vcpu_arch, gprs[5]);
-       OFFSET(VCPU_R6, kvm_vcpu_arch, gprs[6]);
-       OFFSET(VCPU_R7, kvm_vcpu_arch, gprs[7]);
-       OFFSET(VCPU_R8, kvm_vcpu_arch, gprs[8]);
-       OFFSET(VCPU_R9, kvm_vcpu_arch, gprs[9]);
-       OFFSET(VCPU_R10, kvm_vcpu_arch, gprs[10]);
-       OFFSET(VCPU_R11, kvm_vcpu_arch, gprs[11]);
-       OFFSET(VCPU_R12, kvm_vcpu_arch, gprs[12]);
-       OFFSET(VCPU_R13, kvm_vcpu_arch, gprs[13]);
-       OFFSET(VCPU_R14, kvm_vcpu_arch, gprs[14]);
-       OFFSET(VCPU_R15, kvm_vcpu_arch, gprs[15]);
-       OFFSET(VCPU_R16, kvm_vcpu_arch, gprs[16]);
-       OFFSET(VCPU_R17, kvm_vcpu_arch, gprs[17]);
-       OFFSET(VCPU_R18, kvm_vcpu_arch, gprs[18]);
-       OFFSET(VCPU_R19, kvm_vcpu_arch, gprs[19]);
-       OFFSET(VCPU_R20, kvm_vcpu_arch, gprs[20]);
-       OFFSET(VCPU_R21, kvm_vcpu_arch, gprs[21]);
-       OFFSET(VCPU_R22, kvm_vcpu_arch, gprs[22]);
-       OFFSET(VCPU_R23, kvm_vcpu_arch, gprs[23]);
-       OFFSET(VCPU_R24, kvm_vcpu_arch, gprs[24]);
-       OFFSET(VCPU_R25, kvm_vcpu_arch, gprs[25]);
-       OFFSET(VCPU_R26, kvm_vcpu_arch, gprs[26]);
-       OFFSET(VCPU_R27, kvm_vcpu_arch, gprs[27]);
-       OFFSET(VCPU_R28, kvm_vcpu_arch, gprs[28]);
-       OFFSET(VCPU_R29, kvm_vcpu_arch, gprs[29]);
-       OFFSET(VCPU_R30, kvm_vcpu_arch, gprs[30]);
-       OFFSET(VCPU_R31, kvm_vcpu_arch, gprs[31]);
-       OFFSET(VCPU_LO, kvm_vcpu_arch, lo);
-       OFFSET(VCPU_HI, kvm_vcpu_arch, hi);
-       OFFSET(VCPU_PC, kvm_vcpu_arch, pc);
-       BLANK();
 
        OFFSET(VCPU_FPR0, kvm_vcpu_arch, fpu.fpr[0]);
        OFFSET(VCPU_FPR1, kvm_vcpu_arch, fpu.fpr[1]);
@@ -441,14 +379,6 @@ void output_kvm_defines(void)
        OFFSET(VCPU_FCR31, kvm_vcpu_arch, fpu.fcr31);
        OFFSET(VCPU_MSA_CSR, kvm_vcpu_arch, fpu.msacsr);
        BLANK();
-
-       OFFSET(VCPU_COP0, kvm_vcpu_arch, cop0);
-       OFFSET(VCPU_GUEST_KERNEL_ASID, kvm_vcpu_arch, guest_kernel_asid);
-       OFFSET(VCPU_GUEST_USER_ASID, kvm_vcpu_arch, guest_user_asid);
-
-       OFFSET(COP0_TLB_HI, mips_coproc, reg[MIPS_CP0_TLB_HI][0]);
-       OFFSET(COP0_STATUS, mips_coproc, reg[MIPS_CP0_STATUS][0]);
-       BLANK();
 }
 
 #ifdef CONFIG_MIPS_CPS
index 6dc3f1f..46c227f 100644 (file)
@@ -790,7 +790,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
                epc += 4 + (insn.i_format.simmediate << 2);
                regs->cp0_epc = epc;
                break;
-       case beqzcjic_op:
+       case pop66_op:
                if (!cpu_has_mips_r6) {
                        ret = -SIGILL;
                        break;
@@ -798,7 +798,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
                /* Compact branch: BEQZC || JIC */
                regs->cp0_epc += 8;
                break;
-       case bnezcjialc_op:
+       case pop76_op:
                if (!cpu_has_mips_r6) {
                        ret = -SIGILL;
                        break;
@@ -809,8 +809,8 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
                regs->cp0_epc += 8;
                break;
 #endif
-       case cbcond0_op:
-       case cbcond1_op:
+       case pop10_op:
+       case pop30_op:
                /* Only valid for MIPS R6 */
                if (!cpu_has_mips_r6) {
                        ret = -SIGILL;
index 4a1712b..6fb4704 100644 (file)
@@ -619,17 +619,17 @@ static int simulate_rdhwr(struct pt_regs *regs, int rd, int rt)
        perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS,
                        1, regs, 0);
        switch (rd) {
-       case 0:         /* CPU number */
+       case MIPS_HWR_CPUNUM:           /* CPU number */
                regs->regs[rt] = smp_processor_id();
                return 0;
-       case 1:         /* SYNCI length */
+       case MIPS_HWR_SYNCISTEP:        /* SYNCI length */
                regs->regs[rt] = min(current_cpu_data.dcache.linesz,
                                     current_cpu_data.icache.linesz);
                return 0;
-       case 2:         /* Read count register */
+       case MIPS_HWR_CC:               /* Read count register */
                regs->regs[rt] = read_c0_count();
                return 0;
-       case 3:         /* Count register resolution */
+       case MIPS_HWR_CCRES:            /* Count register resolution */
                switch (current_cpu_type()) {
                case CPU_20KC:
                case CPU_25KF:
@@ -639,7 +639,7 @@ static int simulate_rdhwr(struct pt_regs *regs, int rd, int rt)
                        regs->regs[rt] = 2;
                }
                return 0;
-       case 29:
+       case MIPS_HWR_ULR:              /* Read UserLocal register */
                regs->regs[rt] = ti->tp_value;
                return 0;
        default:
@@ -1859,6 +1859,7 @@ void __noreturn nmi_exception_handler(struct pt_regs *regs)
 #define VECTORSPACING 0x100    /* for EI/VI mode */
 
 unsigned long ebase;
+EXPORT_SYMBOL_GPL(ebase);
 unsigned long exception_handlers[32];
 unsigned long vi_handlers[64];
 
@@ -2063,16 +2064,22 @@ static void configure_status(void)
                         status_set);
 }
 
+unsigned int hwrena;
+EXPORT_SYMBOL_GPL(hwrena);
+
 /* configure HWRENA register */
 static void configure_hwrena(void)
 {
-       unsigned int hwrena = cpu_hwrena_impl_bits;
+       hwrena = cpu_hwrena_impl_bits;
 
        if (cpu_has_mips_r2_r6)
-               hwrena |= 0x0000000f;
+               hwrena |= MIPS_HWRENA_CPUNUM |
+                         MIPS_HWRENA_SYNCISTEP |
+                         MIPS_HWRENA_CC |
+                         MIPS_HWRENA_CCRES;
 
        if (!noulri && cpu_has_userlocal)
-               hwrena |= (1 << 29);
+               hwrena |= MIPS_HWRENA_ULR;
 
        if (hwrena)
                write_c0_hwrena(hwrena);
index 2ae1282..7c56d6b 100644 (file)
@@ -17,6 +17,7 @@ if VIRTUALIZATION
 config KVM
        tristate "Kernel-based Virtual Machine (KVM) support"
        depends on HAVE_KVM
+       select EXPORT_UASM
        select PREEMPT_NOTIFIERS
        select ANON_INODES
        select KVM_MMIO
index 637ebbe..847429d 100644 (file)
@@ -7,9 +7,10 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm
 
 common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o
 
-kvm-objs := $(common-objs-y) mips.o emulate.o locore.o \
+kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \
            interrupt.o stats.o commpage.o \
            dyntrans.o trap_emul.o fpu.o
+kvm-objs += mmu.o
 
 obj-$(CONFIG_KVM)      += kvm.o
 obj-y                  += callback.o tlb.o
index 2d6e976..a36b77e 100644 (file)
@@ -4,7 +4,7 @@
  * for more details.
  *
  * commpage, currently used for Virtual COP0 registers.
- * Mapped into the guest kernel @ 0x0.
+ * Mapped into the guest kernel @ KVM_GUEST_COMMPAGE_ADDR.
  *
  * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
  * Authors: Sanjay Lal <sanjayl@kymasys.com>
index f1527a4..d280894 100644 (file)
@@ -11,6 +11,7 @@
 
 #include <linux/errno.h>
 #include <linux/err.h>
+#include <linux/highmem.h>
 #include <linux/kvm_host.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 
 #include "commpage.h"
 
-#define SYNCI_TEMPLATE  0x041f0000
-#define SYNCI_BASE(x)   (((x) >> 21) & 0x1f)
-#define SYNCI_OFFSET    ((x) & 0xffff)
+/**
+ * kvm_mips_trans_replace() - Replace trapping instruction in guest memory.
+ * @vcpu:      Virtual CPU.
+ * @opc:       PC of instruction to replace.
+ * @replace:   Instruction to write
+ *
+ * Overwrites the 32-bit instruction at guest address @opc with @replace and
+ * then flushes the local instruction cache for the 32 bytes starting at the
+ * written location. How guest memory is reached depends on the segment that
+ * @opc falls in:
+ *
+ * - KVM_GUEST_KSEG0: @opc is translated to a host physical address, the
+ *   page containing it is temporarily mapped with kmap_atomic(), and the
+ *   write goes through that mapping.
+ * - KVM_GUEST_KSEG23: the write goes straight through @opc, with local
+ *   interrupts disabled around the copy and the cache flush.
+ *
+ * Return: 0 on success, or -EFAULT (after logging an error) if @opc lies in
+ *         neither of the two segments.
+ */
+static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc,
+                                 union mips_instruction replace)
+{
+       unsigned long paddr, flags;
+       void *vaddr;
+
+       if (KVM_GUEST_KSEGX((unsigned long)opc) == KVM_GUEST_KSEG0) {
+               paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu,
+                                                           (unsigned long)opc);
+               vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr)));
+               vaddr += paddr & ~PAGE_MASK; /* add the offset of the instruction within its page */
+               memcpy(vaddr, (void *)&replace, sizeof(u32));
+               local_flush_icache_range((unsigned long)vaddr,
+                                        (unsigned long)vaddr + 32);
+               kunmap_atomic(vaddr);
+       } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
+               local_irq_save(flags);
+               memcpy((void *)opc, (void *)&replace, sizeof(u32));
+               local_flush_icache_range((unsigned long)opc,
+                                        (unsigned long)opc + 32);
+               local_irq_restore(flags);
+       } else {
+               kvm_err("%s: Invalid address: %p\n", __func__, opc);
+               return -EFAULT;
+       }
 
-#define LW_TEMPLATE     0x8c000000
-#define CLEAR_TEMPLATE  0x00000020
-#define SW_TEMPLATE     0xac000000
+       return 0;
+}
 
-int kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc,
+int kvm_mips_trans_cache_index(union mips_instruction inst, u32 *opc,
                                struct kvm_vcpu *vcpu)
 {
-       int result = 0;
-       unsigned long kseg0_opc;
-       uint32_t synci_inst = 0x0;
+       union mips_instruction nop_inst = { 0 }; /* zero-initialised replacement; inst itself is not consulted */
 
        /* Replace the CACHE instruction, with a NOP */
-       kseg0_opc =
-           CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
-                      (vcpu, (unsigned long) opc));
-       memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t));
-       local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
-
-       return result;
+       return kvm_mips_trans_replace(vcpu, opc, nop_inst); /* 0, or -EFAULT if opc is in neither guest KSEG0 nor KSEG23 */
 }
 
 /*
  * Address based CACHE instructions are transformed into synci(s). A little
  * heavy for just D-cache invalidates, but avoids an expensive trap
  */
-int kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc,
+int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc,
                             struct kvm_vcpu *vcpu)
 {
-       int result = 0;
-       unsigned long kseg0_opc;
-       uint32_t synci_inst = SYNCI_TEMPLATE, base, offset;
-
-       base = (inst >> 21) & 0x1f;
-       offset = inst & 0xffff;
-       synci_inst |= (base << 21);
-       synci_inst |= offset;
-
-       kseg0_opc =
-           CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
-                      (vcpu, (unsigned long) opc));
-       memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t));
-       local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
-
-       return result;
+       union mips_instruction synci_inst = { 0 };
+
+       synci_inst.i_format.opcode = bcond_op; /* SYNCI is built as a bcond_op instruction ... */
+       synci_inst.i_format.rs = inst.i_format.rs; /* keep the base register of the CACHE instruction */
+       synci_inst.i_format.rt = synci_op; /* ... selected by synci_op in the rt field */
+       if (cpu_has_mips_r6) /* on R6 the offset is read through the spec3_format view of inst */
+               synci_inst.i_format.simmediate = inst.spec3_format.simmediate;
+       else
+               synci_inst.i_format.simmediate = inst.i_format.simmediate;
+
+       return kvm_mips_trans_replace(vcpu, opc, synci_inst); /* 0, or -EFAULT if opc is not patchable */
 }
 
-int kvm_mips_trans_mfc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu)
+int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc,
+                       struct kvm_vcpu *vcpu)
 {
-       int32_t rt, rd, sel;
-       uint32_t mfc0_inst;
-       unsigned long kseg0_opc, flags;
-
-       rt = (inst >> 16) & 0x1f;
-       rd = (inst >> 11) & 0x1f;
-       sel = inst & 0x7;
+       union mips_instruction mfc0_inst = { 0 };
+       u32 rd, sel;
 
-       if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) {
-               mfc0_inst = CLEAR_TEMPLATE;
-               mfc0_inst |= ((rt & 0x1f) << 16);
-       } else {
-               mfc0_inst = LW_TEMPLATE;
-               mfc0_inst |= ((rt & 0x1f) << 16);
-               mfc0_inst |= offsetof(struct kvm_mips_commpage,
-                                     cop0.reg[rd][sel]);
-       }
+       rd = inst.c0r_format.rd; /* CP0 register number being read */
+       sel = inst.c0r_format.sel; /* CP0 register select */
 
-       if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
-               kseg0_opc =
-                   CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
-                              (vcpu, (unsigned long) opc));
-               memcpy((void *)kseg0_opc, (void *)&mfc0_inst, sizeof(uint32_t));
-               local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
-       } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
-               local_irq_save(flags);
-               memcpy((void *)opc, (void *)&mfc0_inst, sizeof(uint32_t));
-               local_flush_icache_range((unsigned long)opc,
-                                        (unsigned long)opc + 32);
-               local_irq_restore(flags);
+       if (rd == MIPS_CP0_ERRCTL && sel == 0) { /* ErrCtl, select 0: emit an add (old CLEAR_TEMPLATE) instead of a load */
+               mfc0_inst.r_format.opcode = spec_op;
+               mfc0_inst.r_format.rd = inst.c0r_format.rt; /* destination is the rt of the original instruction */
+               mfc0_inst.r_format.func = add_op;
        } else {
-               kvm_err("%s: Invalid address: %p\n", __func__, opc);
-               return -EFAULT;
+               mfc0_inst.i_format.opcode = lw_op; /* otherwise: lw from the commpage slot of cop0.reg[rd][sel] */
+               mfc0_inst.i_format.rt = inst.c0r_format.rt;
+               mfc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR |
+                       offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
+#ifdef CONFIG_CPU_BIG_ENDIAN
+               if (sizeof(vcpu->arch.cop0->reg[0][0]) == 8) /* 8-byte slots: offset the 32-bit access by 4 on big endian */
+                       mfc0_inst.i_format.simmediate |= 4;
+#endif
        }
 
-       return 0;
+       return kvm_mips_trans_replace(vcpu, opc, mfc0_inst); /* 0, or -EFAULT if opc is not patchable */
 }
 
-int kvm_mips_trans_mtc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu)
+int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc,
+                       struct kvm_vcpu *vcpu)
 {
-       int32_t rt, rd, sel;
-       uint32_t mtc0_inst = SW_TEMPLATE;
-       unsigned long kseg0_opc, flags;
-
-       rt = (inst >> 16) & 0x1f;
-       rd = (inst >> 11) & 0x1f;
-       sel = inst & 0x7;
-
-       mtc0_inst |= ((rt & 0x1f) << 16);
-       mtc0_inst |= offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
-
-       if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
-               kseg0_opc =
-                   CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
-                              (vcpu, (unsigned long) opc));
-               memcpy((void *)kseg0_opc, (void *)&mtc0_inst, sizeof(uint32_t));
-               local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
-       } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
-               local_irq_save(flags);
-               memcpy((void *)opc, (void *)&mtc0_inst, sizeof(uint32_t));
-               local_flush_icache_range((unsigned long)opc,
-                                        (unsigned long)opc + 32);
-               local_irq_restore(flags);
-       } else {
-               kvm_err("%s: Invalid address: %p\n", __func__, opc);
-               return -EFAULT;
-       }
-
-       return 0;
+       union mips_instruction mtc0_inst = { 0 };
+       u32 rd, sel;
+
+       rd = inst.c0r_format.rd; /* CP0 register number being written */
+       sel = inst.c0r_format.sel; /* CP0 register select */
+
+       mtc0_inst.i_format.opcode = sw_op; /* replacement: sw of rt into the commpage slot of cop0.reg[rd][sel] */
+       mtc0_inst.i_format.rt = inst.c0r_format.rt;
+       mtc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR |
+               offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
+#ifdef CONFIG_CPU_BIG_ENDIAN
+       if (sizeof(vcpu->arch.cop0->reg[0][0]) == 8) /* 8-byte slots: offset the 32-bit access by 4 on big endian */
+               mtc0_inst.i_format.simmediate |= 4;
+#endif
+
+       return kvm_mips_trans_replace(vcpu, opc, mtc0_inst); /* 0, or -EFAULT if opc is not patchable */
 }
index 645c8a1..6eb52b9 100644 (file)
@@ -52,7 +52,7 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
                goto unaligned;
 
        /* Read the instruction */
-       insn.word = kvm_get_inst((uint32_t *) epc, vcpu);
+       insn.word = kvm_get_inst((u32 *) epc, vcpu);
 
        if (insn.word == KVM_INVALID_INST)
                return KVM_INVALID_INST;
@@ -161,9 +161,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
                nextpc = epc;
                break;
 
-       case blez_op:           /* not really i_format */
-       case blezl_op:
-               /* rt field assumed to be zero */
+       case blez_op:   /* POP06 */
+#ifndef CONFIG_CPU_MIPSR6
+       case blezl_op:  /* removed in R6 */
+#endif
+               if (insn.i_format.rt != 0)
+                       goto compact_branch;
                if ((long)arch->gprs[insn.i_format.rs] <= 0)
                        epc = epc + 4 + (insn.i_format.simmediate << 2);
                else
@@ -171,9 +174,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
                nextpc = epc;
                break;
 
-       case bgtz_op:
-       case bgtzl_op:
-               /* rt field assumed to be zero */
+       case bgtz_op:   /* POP07 */
+#ifndef CONFIG_CPU_MIPSR6
+       case bgtzl_op:  /* removed in R6 */
+#endif
+               if (insn.i_format.rt != 0)
+                       goto compact_branch;
                if ((long)arch->gprs[insn.i_format.rs] > 0)
                        epc = epc + 4 + (insn.i_format.simmediate << 2);
                else
@@ -185,6 +191,40 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
        case cop1_op:
                kvm_err("%s: unsupported cop1_op\n", __func__);
                break;
+
+#ifdef CONFIG_CPU_MIPSR6
+       /* R6 added the following compact branches with forbidden slots */
+       case blezl_op:  /* POP26 */
+       case bgtzl_op:  /* POP27 */
+               /* only rt == 0 isn't a compact branch */
+               if (insn.i_format.rt != 0)
+                       goto compact_branch;
+               break;
+       case pop10_op:
+       case pop30_op:
+               /* only rs == rt == 0 is reserved, rest are compact branches */
+               if (insn.i_format.rs != 0 || insn.i_format.rt != 0)
+                       goto compact_branch;
+               break;
+       case pop66_op:
+       case pop76_op:
+               /* only rs == 0 isn't a compact branch */
+               if (insn.i_format.rs != 0)
+                       goto compact_branch;
+               break;
+compact_branch:
+               /*
+                * If we've hit an exception on the forbidden slot, then
+                * the branch must not have been taken.
+                */
+               epc += 8;
+               nextpc = epc;
+               break;
+#else
+compact_branch:
+               /* Compact branches not supported before R6 */
+               break;
+#endif
        }
 
        return nextpc;
@@ -198,7 +238,7 @@ sigill:
        return nextpc;
 }
 
-enum emulation_result update_pc(struct kvm_vcpu *vcpu, uint32_t cause)
+enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause)
 {
        unsigned long branch_pc;
        enum emulation_result er = EMULATE_DONE;
@@ -243,7 +283,7 @@ static inline int kvm_mips_count_disabled(struct kvm_vcpu *vcpu)
  *
  * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running).
  */
-static uint32_t kvm_mips_ktime_to_count(struct kvm_vcpu *vcpu, ktime_t now)
+static u32 kvm_mips_ktime_to_count(struct kvm_vcpu *vcpu, ktime_t now)
 {
        s64 now_ns, periods;
        u64 delta;
@@ -300,11 +340,11 @@ static inline ktime_t kvm_mips_count_time(struct kvm_vcpu *vcpu)
  *
  * Returns:    The current value of the guest CP0_Count register.
  */
-static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
+static u32 kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        ktime_t expires, threshold;
-       uint32_t count, compare;
+       u32 count, compare;
        int running;
 
        /* Calculate the biased and scaled guest CP0_Count */
@@ -315,7 +355,7 @@ static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
         * Find whether CP0_Count has reached the closest timer interrupt. If
         * not, we shouldn't inject it.
         */
-       if ((int32_t)(count - compare) < 0)
+       if ((s32)(count - compare) < 0)
                return count;
 
        /*
@@ -360,7 +400,7 @@ static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
  *
  * Returns:    The current guest CP0_Count value.
  */
-uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu)
+u32 kvm_mips_read_count(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
 
@@ -387,8 +427,7 @@ uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu)
  *
  * Returns:    The ktime at the point of freeze.
  */
-static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu,
-                                      uint32_t *count)
+static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count)
 {
        ktime_t now;
 
@@ -419,16 +458,16 @@ static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu,
  * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running).
  */
 static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu,
-                                   ktime_t now, uint32_t count)
+                                   ktime_t now, u32 count)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
-       uint32_t compare;
+       u32 compare;
        u64 delta;
        ktime_t expire;
 
        /* Calculate timeout (wrap 0 to 2^32) */
        compare = kvm_read_c0_guest_compare(cop0);
-       delta = (u64)(uint32_t)(compare - count - 1) + 1;
+       delta = (u64)(u32)(compare - count - 1) + 1;
        delta = div_u64(delta * NSEC_PER_SEC, vcpu->arch.count_hz);
        expire = ktime_add_ns(now, delta);
 
@@ -444,7 +483,7 @@ static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu,
  *
  * Sets the CP0_Count value and updates the timer accordingly.
  */
-void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count)
+void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        ktime_t now;
@@ -538,13 +577,13 @@ int kvm_mips_set_count_hz(struct kvm_vcpu *vcpu, s64 count_hz)
  * If @ack, atomically acknowledge any pending timer interrupt, otherwise ensure
  * any pending timer interrupt is preserved.
  */
-void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack)
+void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        int dc;
        u32 old_compare = kvm_read_c0_guest_compare(cop0);
        ktime_t now;
-       uint32_t count;
+       u32 count;
 
        /* if unchanged, must just be an ack */
        if (old_compare == compare) {
@@ -585,7 +624,7 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack)
 static ktime_t kvm_mips_count_disable(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
-       uint32_t count;
+       u32 count;
        ktime_t now;
 
        /* Stop hrtimer */
@@ -632,7 +671,7 @@ void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu)
 void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
-       uint32_t count;
+       u32 count;
 
        kvm_clear_c0_guest_cause(cop0, CAUSEF_DC);
 
@@ -661,7 +700,7 @@ int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl)
        s64 changed = count_ctl ^ vcpu->arch.count_ctl;
        s64 delta;
        ktime_t expire, now;
-       uint32_t count, compare;
+       u32 count, compare;
 
        /* Only allow defined bits to be changed */
        if (changed & ~(s64)(KVM_REG_MIPS_COUNT_CTL_DC))
@@ -687,7 +726,7 @@ int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl)
                         */
                        count = kvm_read_c0_guest_count(cop0);
                        compare = kvm_read_c0_guest_compare(cop0);
-                       delta = (u64)(uint32_t)(compare - count - 1) + 1;
+                       delta = (u64)(u32)(compare - count - 1) + 1;
                        delta = div_u64(delta * NSEC_PER_SEC,
                                        vcpu->arch.count_hz);
                        expire = ktime_add_ns(vcpu->arch.count_resume, delta);
@@ -776,7 +815,7 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu)
                  vcpu->arch.pending_exceptions);
 
        ++vcpu->stat.wait_exits;
-       trace_kvm_exit(vcpu, WAIT_EXITS);
+       trace_kvm_exit(vcpu, KVM_TRACE_EXIT_WAIT);
        if (!vcpu->arch.pending_exceptions) {
                vcpu->arch.wait = 1;
                kvm_vcpu_block(vcpu);
@@ -801,9 +840,9 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu)
 enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
-       uint32_t pc = vcpu->arch.pc;
+       unsigned long pc = vcpu->arch.pc;
 
-       kvm_err("[%#x] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0));
+       kvm_err("[%#lx] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0));
        return EMULATE_FAIL;
 }
 
@@ -813,11 +852,11 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu)
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        int index = kvm_read_c0_guest_index(cop0);
        struct kvm_mips_tlb *tlb = NULL;
-       uint32_t pc = vcpu->arch.pc;
+       unsigned long pc = vcpu->arch.pc;
 
        if (index < 0 || index >= KVM_MIPS_GUEST_TLB_SIZE) {
                kvm_debug("%s: illegal index: %d\n", __func__, index);
-               kvm_debug("[%#x] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
+               kvm_debug("[%#lx] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
                          pc, index, kvm_read_c0_guest_entryhi(cop0),
                          kvm_read_c0_guest_entrylo0(cop0),
                          kvm_read_c0_guest_entrylo1(cop0),
@@ -834,10 +873,10 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu)
 
        tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0);
        tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0);
-       tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0);
-       tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0);
+       tlb->tlb_lo[0] = kvm_read_c0_guest_entrylo0(cop0);
+       tlb->tlb_lo[1] = kvm_read_c0_guest_entrylo1(cop0);
 
-       kvm_debug("[%#x] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
+       kvm_debug("[%#lx] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
                  pc, index, kvm_read_c0_guest_entryhi(cop0),
                  kvm_read_c0_guest_entrylo0(cop0),
                  kvm_read_c0_guest_entrylo1(cop0),
@@ -851,7 +890,7 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        struct kvm_mips_tlb *tlb = NULL;
-       uint32_t pc = vcpu->arch.pc;
+       unsigned long pc = vcpu->arch.pc;
        int index;
 
        get_random_bytes(&index, sizeof(index));
@@ -867,10 +906,10 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu)
 
        tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0);
        tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0);
-       tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0);
-       tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0);
+       tlb->tlb_lo[0] = kvm_read_c0_guest_entrylo0(cop0);
+       tlb->tlb_lo[1] = kvm_read_c0_guest_entrylo1(cop0);
 
-       kvm_debug("[%#x] COP0_TLBWR[%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx)\n",
+       kvm_debug("[%#lx] COP0_TLBWR[%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx)\n",
                  pc, index, kvm_read_c0_guest_entryhi(cop0),
                  kvm_read_c0_guest_entrylo0(cop0),
                  kvm_read_c0_guest_entrylo1(cop0));
@@ -882,14 +921,14 @@ enum emulation_result kvm_mips_emul_tlbp(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        long entryhi = kvm_read_c0_guest_entryhi(cop0);
-       uint32_t pc = vcpu->arch.pc;
+       unsigned long pc = vcpu->arch.pc;
        int index = -1;
 
        index = kvm_mips_guest_tlb_lookup(vcpu, entryhi);
 
        kvm_write_c0_guest_index(cop0, index);
 
-       kvm_debug("[%#x] COP0_TLBP (entryhi: %#lx), index: %d\n", pc, entryhi,
+       kvm_debug("[%#lx] COP0_TLBP (entryhi: %#lx), index: %d\n", pc, entryhi,
                  index);
 
        return EMULATE_DONE;
@@ -922,8 +961,8 @@ unsigned int kvm_mips_config1_wrmask(struct kvm_vcpu *vcpu)
  */
 unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu)
 {
-       /* Config4 is optional */
-       unsigned int mask = MIPS_CONF_M;
+       /* Config4 and ULRI are optional */
+       unsigned int mask = MIPS_CONF_M | MIPS_CONF3_ULRI;
 
        /* Permit MSA to be present if MSA is supported */
        if (kvm_mips_guest_can_have_msa(&vcpu->arch))
@@ -942,7 +981,12 @@ unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu)
 unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu)
 {
        /* Config5 is optional */
-       return MIPS_CONF_M;
+       unsigned int mask = MIPS_CONF_M;
+
+       /* KScrExist */
+       mask |= (unsigned int)vcpu->arch.kscratch_enabled << 16;
+
+       return mask;
 }
 
 /**
@@ -973,14 +1017,14 @@ unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu)
        return mask;
 }
 
-enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
-                                          uint32_t cause, struct kvm_run *run,
+enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
+                                          u32 *opc, u32 cause,
+                                          struct kvm_run *run,
                                           struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        enum emulation_result er = EMULATE_DONE;
-       int32_t rt, rd, copz, sel, co_bit, op;
-       uint32_t pc = vcpu->arch.pc;
+       u32 rt, rd, sel;
        unsigned long curr_pc;
 
        /*
@@ -992,16 +1036,8 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
        if (er == EMULATE_FAIL)
                return er;
 
-       copz = (inst >> 21) & 0x1f;
-       rt = (inst >> 16) & 0x1f;
-       rd = (inst >> 11) & 0x1f;
-       sel = inst & 0x7;
-       co_bit = (inst >> 25) & 1;
-
-       if (co_bit) {
-               op = (inst) & 0xff;
-
-               switch (op) {
+       if (inst.co_format.co) {
+               switch (inst.co_format.func) {
                case tlbr_op:   /*  Read indexed TLB entry  */
                        er = kvm_mips_emul_tlbr(vcpu);
                        break;
@@ -1020,47 +1056,58 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                case eret_op:
                        er = kvm_mips_emul_eret(vcpu);
                        goto dont_update_pc;
-                       break;
                case wait_op:
                        er = kvm_mips_emul_wait(vcpu);
                        break;
                }
        } else {
-               switch (copz) {
+               rt = inst.c0r_format.rt;
+               rd = inst.c0r_format.rd;
+               sel = inst.c0r_format.sel;
+
+               switch (inst.c0r_format.rs) {
                case mfc_op:
 #ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS
                        cop0->stat[rd][sel]++;
 #endif
                        /* Get reg */
                        if ((rd == MIPS_CP0_COUNT) && (sel == 0)) {
-                               vcpu->arch.gprs[rt] = kvm_mips_read_count(vcpu);
+                               vcpu->arch.gprs[rt] =
+                                   (s32)kvm_mips_read_count(vcpu);
                        } else if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) {
                                vcpu->arch.gprs[rt] = 0x0;
 #ifdef CONFIG_KVM_MIPS_DYN_TRANS
                                kvm_mips_trans_mfc0(inst, opc, vcpu);
 #endif
                        } else {
-                               vcpu->arch.gprs[rt] = cop0->reg[rd][sel];
+                               vcpu->arch.gprs[rt] = (s32)cop0->reg[rd][sel];
 
 #ifdef CONFIG_KVM_MIPS_DYN_TRANS
                                kvm_mips_trans_mfc0(inst, opc, vcpu);
 #endif
                        }
 
-                       kvm_debug
-                           ("[%#x] MFCz[%d][%d], vcpu->arch.gprs[%d]: %#lx\n",
-                            pc, rd, sel, rt, vcpu->arch.gprs[rt]);
-
+                       trace_kvm_hwr(vcpu, KVM_TRACE_MFC0,
+                                     KVM_TRACE_COP0(rd, sel),
+                                     vcpu->arch.gprs[rt]);
                        break;
 
                case dmfc_op:
                        vcpu->arch.gprs[rt] = cop0->reg[rd][sel];
+
+                       trace_kvm_hwr(vcpu, KVM_TRACE_DMFC0,
+                                     KVM_TRACE_COP0(rd, sel),
+                                     vcpu->arch.gprs[rt]);
                        break;
 
                case mtc_op:
 #ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS
                        cop0->stat[rd][sel]++;
 #endif
+                       trace_kvm_hwr(vcpu, KVM_TRACE_MTC0,
+                                     KVM_TRACE_COP0(rd, sel),
+                                     vcpu->arch.gprs[rt]);
+
                        if ((rd == MIPS_CP0_TLB_INDEX)
                            && (vcpu->arch.gprs[rt] >=
                                KVM_MIPS_GUEST_TLB_SIZE)) {
@@ -1078,16 +1125,15 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                kvm_err("MTCz, cop0->reg[EBASE]: %#lx\n",
                                        kvm_read_c0_guest_ebase(cop0));
                        } else if (rd == MIPS_CP0_TLB_HI && sel == 0) {
-                               uint32_t nasid =
+                               u32 nasid =
                                        vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID;
                                if ((KSEGX(vcpu->arch.gprs[rt]) != CKSEG0) &&
                                    ((kvm_read_c0_guest_entryhi(cop0) &
                                      KVM_ENTRYHI_ASID) != nasid)) {
-                                       kvm_debug("MTCz, change ASID from %#lx to %#lx\n",
+                                       trace_kvm_asid_change(vcpu,
                                                kvm_read_c0_guest_entryhi(cop0)
-                                               & KVM_ENTRYHI_ASID,
-                                               vcpu->arch.gprs[rt]
-                                               & KVM_ENTRYHI_ASID);
+                                                       & KVM_ENTRYHI_ASID,
+                                               nasid);
 
                                        /* Blow away the shadow host TLBs */
                                        kvm_mips_flush_host_tlb(1);
@@ -1100,10 +1146,6 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                kvm_mips_write_count(vcpu, vcpu->arch.gprs[rt]);
                                goto done;
                        } else if ((rd == MIPS_CP0_COMPARE) && (sel == 0)) {
-                               kvm_debug("[%#x] MTCz, COMPARE %#lx <- %#lx\n",
-                                         pc, kvm_read_c0_guest_compare(cop0),
-                                         vcpu->arch.gprs[rt]);
-
                                /* If we are writing to COMPARE */
                                /* Clear pending timer interrupt, if any */
                                kvm_mips_write_compare(vcpu,
@@ -1155,7 +1197,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                 * it first.
                                 */
                                if (change & ST0_CU1 && !(val & ST0_FR) &&
-                                   vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+                                   vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
                                        kvm_lose_fpu(vcpu);
 
                                /*
@@ -1166,7 +1208,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                 * the near future.
                                 */
                                if (change & ST0_CU1 &&
-                                   vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)
+                                   vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)
                                        change_c0_status(ST0_CU1, val);
 
                                preempt_enable();
@@ -1201,7 +1243,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                 * context is already loaded.
                                 */
                                if (change & MIPS_CONF5_FRE &&
-                                   vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)
+                                   vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)
                                        change_c0_config5(MIPS_CONF5_FRE, val);
 
                                /*
@@ -1211,7 +1253,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                 * quickly enabled again in the near future.
                                 */
                                if (change & MIPS_CONF5_MSAEN &&
-                                   vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+                                   vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
                                        change_c0_config5(MIPS_CONF5_MSAEN,
                                                          val);
 
@@ -1219,7 +1261,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
 
                                kvm_write_c0_guest_config5(cop0, val);
                        } else if ((rd == MIPS_CP0_CAUSE) && (sel == 0)) {
-                               uint32_t old_cause, new_cause;
+                               u32 old_cause, new_cause;
 
                                old_cause = kvm_read_c0_guest_cause(cop0);
                                new_cause = vcpu->arch.gprs[rt];
@@ -1233,20 +1275,30 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                        else
                                                kvm_mips_count_enable_cause(vcpu);
                                }
+                       } else if ((rd == MIPS_CP0_HWRENA) && (sel == 0)) {
+                               u32 mask = MIPS_HWRENA_CPUNUM |
+                                          MIPS_HWRENA_SYNCISTEP |
+                                          MIPS_HWRENA_CC |
+                                          MIPS_HWRENA_CCRES;
+
+                               if (kvm_read_c0_guest_config3(cop0) &
+                                   MIPS_CONF3_ULRI)
+                                       mask |= MIPS_HWRENA_ULR;
+                               cop0->reg[rd][sel] = vcpu->arch.gprs[rt] & mask;
                        } else {
                                cop0->reg[rd][sel] = vcpu->arch.gprs[rt];
 #ifdef CONFIG_KVM_MIPS_DYN_TRANS
                                kvm_mips_trans_mtc0(inst, opc, vcpu);
 #endif
                        }
-
-                       kvm_debug("[%#x] MTCz, cop0->reg[%d][%d]: %#lx\n", pc,
-                                 rd, sel, cop0->reg[rd][sel]);
                        break;
 
                case dmtc_op:
                        kvm_err("!!!!!!![%#lx]dmtc_op: rt: %d, rd: %d, sel: %d!!!!!!\n",
                                vcpu->arch.pc, rt, rd, sel);
+                       trace_kvm_hwr(vcpu, KVM_TRACE_DMTC0,
+                                     KVM_TRACE_COP0(rd, sel),
+                                     vcpu->arch.gprs[rt]);
                        er = EMULATE_FAIL;
                        break;
 
@@ -1258,7 +1310,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                vcpu->arch.gprs[rt] =
                                    kvm_read_c0_guest_status(cop0);
                        /* EI */
-                       if (inst & 0x20) {
+                       if (inst.mfmc0_format.sc) {
                                kvm_debug("[%#lx] mfmc0_op: EI\n",
                                          vcpu->arch.pc);
                                kvm_set_c0_guest_status(cop0, ST0_IE);
@@ -1272,9 +1324,8 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
 
                case wrpgpr_op:
                        {
-                               uint32_t css =
-                                   cop0->reg[MIPS_CP0_STATUS][2] & 0xf;
-                               uint32_t pss =
+                               u32 css = cop0->reg[MIPS_CP0_STATUS][2] & 0xf;
+                               u32 pss =
                                    (cop0->reg[MIPS_CP0_STATUS][2] >> 6) & 0xf;
                                /*
                                 * We don't support any shadow register sets, so
@@ -1291,7 +1342,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                        break;
                default:
                        kvm_err("[%#lx]MachEmulateCP0: unsupported COP0, copz: 0x%x\n",
-                               vcpu->arch.pc, copz);
+                               vcpu->arch.pc, inst.c0r_format.rs);
                        er = EMULATE_FAIL;
                        break;
                }
@@ -1312,13 +1363,14 @@ dont_update_pc:
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
+enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
+                                            u32 cause,
                                             struct kvm_run *run,
                                             struct kvm_vcpu *vcpu)
 {
        enum emulation_result er = EMULATE_DO_MMIO;
-       int32_t op, base, rt, offset;
-       uint32_t bytes;
+       u32 rt;
+       u32 bytes;
        void *data = run->mmio.data;
        unsigned long curr_pc;
 
@@ -1331,12 +1383,9 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
        if (er == EMULATE_FAIL)
                return er;
 
-       rt = (inst >> 16) & 0x1f;
-       base = (inst >> 21) & 0x1f;
-       offset = inst & 0xffff;
-       op = (inst >> 26) & 0x3f;
+       rt = inst.i_format.rt;
 
-       switch (op) {
+       switch (inst.i_format.opcode) {
        case sb_op:
                bytes = 1;
                if (bytes > sizeof(run->mmio.data)) {
@@ -1357,7 +1406,7 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
                *(u8 *) data = vcpu->arch.gprs[rt];
                kvm_debug("OP_SB: eaddr: %#lx, gpr: %#lx, data: %#x\n",
                          vcpu->arch.host_cp0_badvaddr, vcpu->arch.gprs[rt],
-                         *(uint8_t *) data);
+                         *(u8 *) data);
 
                break;
 
@@ -1379,11 +1428,11 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
                run->mmio.is_write = 1;
                vcpu->mmio_needed = 1;
                vcpu->mmio_is_write = 1;
-               *(uint32_t *) data = vcpu->arch.gprs[rt];
+               *(u32 *) data = vcpu->arch.gprs[rt];
 
                kvm_debug("[%#lx] OP_SW: eaddr: %#lx, gpr: %#lx, data: %#x\n",
                          vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
-                         vcpu->arch.gprs[rt], *(uint32_t *) data);
+                         vcpu->arch.gprs[rt], *(u32 *) data);
                break;
 
        case sh_op:
@@ -1404,15 +1453,16 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
                run->mmio.is_write = 1;
                vcpu->mmio_needed = 1;
                vcpu->mmio_is_write = 1;
-               *(uint16_t *) data = vcpu->arch.gprs[rt];
+               *(u16 *) data = vcpu->arch.gprs[rt];
 
                kvm_debug("[%#lx] OP_SH: eaddr: %#lx, gpr: %#lx, data: %#x\n",
                          vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
-                         vcpu->arch.gprs[rt], *(uint32_t *) data);
+                         vcpu->arch.gprs[rt], *(u32 *) data);
                break;
 
        default:
-               kvm_err("Store not yet supported");
+               kvm_err("Store not yet supported (inst=0x%08x)\n",
+                       inst.word);
                er = EMULATE_FAIL;
                break;
        }
@@ -1424,18 +1474,16 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause,
-                                           struct kvm_run *run,
+enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
+                                           u32 cause, struct kvm_run *run,
                                            struct kvm_vcpu *vcpu)
 {
        enum emulation_result er = EMULATE_DO_MMIO;
-       int32_t op, base, rt, offset;
-       uint32_t bytes;
+       u32 op, rt;
+       u32 bytes;
 
-       rt = (inst >> 16) & 0x1f;
-       base = (inst >> 21) & 0x1f;
-       offset = inst & 0xffff;
-       op = (inst >> 26) & 0x3f;
+       rt = inst.i_format.rt;
+       op = inst.i_format.opcode;
 
        vcpu->arch.pending_load_cause = cause;
        vcpu->arch.io_gpr = rt;
@@ -1521,7 +1569,8 @@ enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause,
                break;
 
        default:
-               kvm_err("Load not yet supported");
+               kvm_err("Load not yet supported (inst=0x%08x)\n",
+                       inst.word);
                er = EMULATE_FAIL;
                break;
        }
@@ -1529,40 +1578,15 @@ enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause,
        return er;
 }
 
-int kvm_mips_sync_icache(unsigned long va, struct kvm_vcpu *vcpu)
-{
-       unsigned long offset = (va & ~PAGE_MASK);
-       struct kvm *kvm = vcpu->kvm;
-       unsigned long pa;
-       gfn_t gfn;
-       kvm_pfn_t pfn;
-
-       gfn = va >> PAGE_SHIFT;
-
-       if (gfn >= kvm->arch.guest_pmap_npages) {
-               kvm_err("%s: Invalid gfn: %#llx\n", __func__, gfn);
-               kvm_mips_dump_host_tlbs();
-               kvm_arch_vcpu_dump_regs(vcpu);
-               return -1;
-       }
-       pfn = kvm->arch.guest_pmap[gfn];
-       pa = (pfn << PAGE_SHIFT) | offset;
-
-       kvm_debug("%s: va: %#lx, unmapped: %#x\n", __func__, va,
-                 CKSEG0ADDR(pa));
-
-       local_flush_icache_range(CKSEG0ADDR(pa), 32);
-       return 0;
-}
-
-enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
-                                            uint32_t cause,
+enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
+                                            u32 *opc, u32 cause,
                                             struct kvm_run *run,
                                             struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        enum emulation_result er = EMULATE_DONE;
-       int32_t offset, cache, op_inst, op, base;
+       u32 cache, op_inst, op, base;
+       s16 offset;
        struct kvm_vcpu_arch *arch = &vcpu->arch;
        unsigned long va;
        unsigned long curr_pc;
@@ -1576,9 +1600,12 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
        if (er == EMULATE_FAIL)
                return er;
 
-       base = (inst >> 21) & 0x1f;
-       op_inst = (inst >> 16) & 0x1f;
-       offset = (int16_t)inst;
+       base = inst.i_format.rs;
+       op_inst = inst.i_format.rt;
+       if (cpu_has_mips_r6)
+               offset = inst.spec3_format.simmediate;
+       else
+               offset = inst.i_format.simmediate;
        cache = op_inst & CacheOp_Cache;
        op = op_inst & CacheOp_Op;
 
@@ -1634,7 +1661,6 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
                                                   (cop0) & KVM_ENTRYHI_ASID));
 
                if (index < 0) {
-                       vcpu->arch.host_cp0_entryhi = (va & VPN2_MASK);
                        vcpu->arch.host_cp0_badvaddr = va;
                        vcpu->arch.pc = curr_pc;
                        er = kvm_mips_emulate_tlbmiss_ld(cause, NULL, run,
@@ -1659,9 +1685,7 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
                                 * We fault an entry from the guest tlb to the
                                 * shadow host TLB
                                 */
-                               kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb,
-                                                                    NULL,
-                                                                    NULL);
+                               kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb);
                        }
                }
        } else {
@@ -1714,20 +1738,20 @@ dont_update_pc:
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc,
+enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc,
                                            struct kvm_run *run,
                                            struct kvm_vcpu *vcpu)
 {
+       union mips_instruction inst;
        enum emulation_result er = EMULATE_DONE;
-       uint32_t inst;
 
        /* Fetch the instruction. */
        if (cause & CAUSEF_BD)
                opc += 1;
 
-       inst = kvm_get_inst(opc, vcpu);
+       inst.word = kvm_get_inst(opc, vcpu);
 
-       switch (((union mips_instruction)inst).r_format.opcode) {
+       switch (inst.r_format.opcode) {
        case cop0_op:
                er = kvm_mips_emulate_CP0(inst, opc, cause, run, vcpu);
                break;
@@ -1744,15 +1768,31 @@ enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc,
                er = kvm_mips_emulate_load(inst, cause, run, vcpu);
                break;
 
+#ifndef CONFIG_CPU_MIPSR6
        case cache_op:
                ++vcpu->stat.cache_exits;
-               trace_kvm_exit(vcpu, CACHE_EXITS);
+               trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
                er = kvm_mips_emulate_cache(inst, opc, cause, run, vcpu);
                break;
+#else
+       case spec3_op:
+               switch (inst.spec3_format.func) {
+               case cache6_op:
+                       ++vcpu->stat.cache_exits;
+                       trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
+                       er = kvm_mips_emulate_cache(inst, opc, cause, run,
+                                                   vcpu);
+                       break;
+               default:
+                       goto unknown;
+               };
+               break;
+unknown:
+#endif
 
        default:
                kvm_err("Instruction emulation not supported (%p/%#x)\n", opc,
-                       inst);
+                       inst.word);
                kvm_arch_vcpu_dump_regs(vcpu);
                er = EMULATE_FAIL;
                break;
@@ -1761,8 +1801,8 @@ enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
-                                              uint32_t *opc,
+enum emulation_result kvm_mips_emulate_syscall(u32 cause,
+                                              u32 *opc,
                                               struct kvm_run *run,
                                               struct kvm_vcpu *vcpu)
 {
@@ -1796,8 +1836,8 @@ enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
-                                                 uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
+                                                 u32 *opc,
                                                  struct kvm_run *run,
                                                  struct kvm_vcpu *vcpu)
 {
@@ -1842,8 +1882,8 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
        return EMULATE_DONE;
 }
 
-enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
-                                                uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
+                                                u32 *opc,
                                                 struct kvm_run *run,
                                                 struct kvm_vcpu *vcpu)
 {
@@ -1888,8 +1928,8 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
        return EMULATE_DONE;
 }
 
-enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
-                                                 uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
+                                                 u32 *opc,
                                                  struct kvm_run *run,
                                                  struct kvm_vcpu *vcpu)
 {
@@ -1932,8 +1972,8 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
        return EMULATE_DONE;
 }
 
-enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
-                                                uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
+                                                u32 *opc,
                                                 struct kvm_run *run,
                                                 struct kvm_vcpu *vcpu)
 {
@@ -1977,7 +2017,7 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
 }
 
 /* TLBMOD: store into address matching TLB with Dirty bit off */
-enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, uint32_t *opc,
+enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc,
                                             struct kvm_run *run,
                                             struct kvm_vcpu *vcpu)
 {
@@ -2005,8 +2045,8 @@ enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, uint32_t *opc,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
-                                             uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
+                                             u32 *opc,
                                              struct kvm_run *run,
                                              struct kvm_vcpu *vcpu)
 {
@@ -2048,8 +2088,8 @@ enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
        return EMULATE_DONE;
 }
 
-enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
-                                              uint32_t *opc,
+enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause,
+                                              u32 *opc,
                                               struct kvm_run *run,
                                               struct kvm_vcpu *vcpu)
 {
@@ -2077,8 +2117,8 @@ enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
        return EMULATE_DONE;
 }
 
-enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
-                                             uint32_t *opc,
+enum emulation_result kvm_mips_emulate_ri_exc(u32 cause,
+                                             u32 *opc,
                                              struct kvm_run *run,
                                              struct kvm_vcpu *vcpu)
 {
@@ -2112,8 +2152,8 @@ enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
-                                             uint32_t *opc,
+enum emulation_result kvm_mips_emulate_bp_exc(u32 cause,
+                                             u32 *opc,
                                              struct kvm_run *run,
                                              struct kvm_vcpu *vcpu)
 {
@@ -2147,8 +2187,8 @@ enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
-                                               uint32_t *opc,
+enum emulation_result kvm_mips_emulate_trap_exc(u32 cause,
+                                               u32 *opc,
                                                struct kvm_run *run,
                                                struct kvm_vcpu *vcpu)
 {
@@ -2182,8 +2222,8 @@ enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
-                                                 uint32_t *opc,
+enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause,
+                                                 u32 *opc,
                                                  struct kvm_run *run,
                                                  struct kvm_vcpu *vcpu)
 {
@@ -2217,8 +2257,8 @@ enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
-                                              uint32_t *opc,
+enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause,
+                                              u32 *opc,
                                               struct kvm_run *run,
                                               struct kvm_vcpu *vcpu)
 {
@@ -2252,8 +2292,8 @@ enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
        return er;
 }
 
-enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
-                                                 uint32_t *opc,
+enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause,
+                                                 u32 *opc,
                                                  struct kvm_run *run,
                                                  struct kvm_vcpu *vcpu)
 {
@@ -2287,22 +2327,7 @@ enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
        return er;
 }
 
-/* ll/sc, rdhwr, sync emulation */
-
-#define OPCODE 0xfc000000
-#define BASE   0x03e00000
-#define RT     0x001f0000
-#define OFFSET 0x0000ffff
-#define LL     0xc0000000
-#define SC     0xe0000000
-#define SPEC0  0x00000000
-#define SPEC3  0x7c000000
-#define RD     0x0000f800
-#define FUNC   0x0000003f
-#define SYNC   0x0000000f
-#define RDHWR  0x0000003b
-
-enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
+enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
                                         struct kvm_run *run,
                                         struct kvm_vcpu *vcpu)
 {
@@ -2310,7 +2335,7 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
        struct kvm_vcpu_arch *arch = &vcpu->arch;
        enum emulation_result er = EMULATE_DONE;
        unsigned long curr_pc;
-       uint32_t inst;
+       union mips_instruction inst;
 
        /*
         * Update PC and hold onto current PC in case there is
@@ -2325,17 +2350,22 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
        if (cause & CAUSEF_BD)
                opc += 1;
 
-       inst = kvm_get_inst(opc, vcpu);
+       inst.word = kvm_get_inst(opc, vcpu);
 
-       if (inst == KVM_INVALID_INST) {
+       if (inst.word == KVM_INVALID_INST) {
                kvm_err("%s: Cannot get inst @ %p\n", __func__, opc);
                return EMULATE_FAIL;
        }
 
-       if ((inst & OPCODE) == SPEC3 && (inst & FUNC) == RDHWR) {
+       if (inst.r_format.opcode == spec3_op &&
+           inst.r_format.func == rdhwr_op &&
+           inst.r_format.rs == 0 &&
+           (inst.r_format.re >> 3) == 0) {
                int usermode = !KVM_GUEST_KERNEL_MODE(vcpu);
-               int rd = (inst & RD) >> 11;
-               int rt = (inst & RT) >> 16;
+               int rd = inst.r_format.rd;
+               int rt = inst.r_format.rt;
+               int sel = inst.r_format.re & 0x7;
+
                /* If usermode, check RDHWR rd is allowed by guest HWREna */
                if (usermode && !(kvm_read_c0_guest_hwrena(cop0) & BIT(rd))) {
                        kvm_debug("RDHWR %#x disallowed by HWREna @ %p\n",
@@ -2343,17 +2373,17 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
                        goto emulate_ri;
                }
                switch (rd) {
-               case 0: /* CPU number */
-                       arch->gprs[rt] = 0;
+               case MIPS_HWR_CPUNUM:           /* CPU number */
+                       arch->gprs[rt] = vcpu->vcpu_id;
                        break;
-               case 1: /* SYNCI length */
+               case MIPS_HWR_SYNCISTEP:        /* SYNCI length */
                        arch->gprs[rt] = min(current_cpu_data.dcache.linesz,
                                             current_cpu_data.icache.linesz);
                        break;
-               case 2: /* Read count register */
-                       arch->gprs[rt] = kvm_mips_read_count(vcpu);
+               case MIPS_HWR_CC:               /* Read count register */
+                       arch->gprs[rt] = (s32)kvm_mips_read_count(vcpu);
                        break;
-               case 3: /* Count register resolution */
+               case MIPS_HWR_CCRES:            /* Count register resolution */
                        switch (current_cpu_data.cputype) {
                        case CPU_20KC:
                        case CPU_25KF:
@@ -2363,7 +2393,7 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
                                arch->gprs[rt] = 2;
                        }
                        break;
-               case 29:
+               case MIPS_HWR_ULR:              /* Read UserLocal register */
                        arch->gprs[rt] = kvm_read_c0_guest_userlocal(cop0);
                        break;
 
@@ -2371,8 +2401,12 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
                        kvm_debug("RDHWR %#x not supported @ %p\n", rd, opc);
                        goto emulate_ri;
                }
+
+               trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR, KVM_TRACE_HWR(rd, sel),
+                             vcpu->arch.gprs[rt]);
        } else {
-               kvm_debug("Emulate RI not supported @ %p: %#x\n", opc, inst);
+               kvm_debug("Emulate RI not supported @ %p: %#x\n",
+                         opc, inst.word);
                goto emulate_ri;
        }
 
@@ -2405,19 +2439,19 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
 
        switch (run->mmio.len) {
        case 4:
-               *gpr = *(int32_t *) run->mmio.data;
+               *gpr = *(s32 *) run->mmio.data;
                break;
 
        case 2:
                if (vcpu->mmio_needed == 2)
-                       *gpr = *(int16_t *) run->mmio.data;
+                       *gpr = *(s16 *) run->mmio.data;
                else
-                       *gpr = *(uint16_t *)run->mmio.data;
+                       *gpr = *(u16 *)run->mmio.data;
 
                break;
        case 1:
                if (vcpu->mmio_needed == 2)
-                       *gpr = *(int8_t *) run->mmio.data;
+                       *gpr = *(s8 *) run->mmio.data;
                else
                        *gpr = *(u8 *) run->mmio.data;
                break;
@@ -2432,12 +2466,12 @@ done:
        return er;
 }
 
-static enum emulation_result kvm_mips_emulate_exc(unsigned long cause,
-                                                 uint32_t *opc,
+static enum emulation_result kvm_mips_emulate_exc(u32 cause,
+                                                 u32 *opc,
                                                  struct kvm_run *run,
                                                  struct kvm_vcpu *vcpu)
 {
-       uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+       u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        struct kvm_vcpu_arch *arch = &vcpu->arch;
        enum emulation_result er = EMULATE_DONE;
@@ -2470,13 +2504,13 @@ static enum emulation_result kvm_mips_emulate_exc(unsigned long cause,
        return er;
 }
 
-enum emulation_result kvm_mips_check_privilege(unsigned long cause,
-                                              uint32_t *opc,
+enum emulation_result kvm_mips_check_privilege(u32 cause,
+                                              u32 *opc,
                                               struct kvm_run *run,
                                               struct kvm_vcpu *vcpu)
 {
        enum emulation_result er = EMULATE_DONE;
-       uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+       u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
        unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
 
        int usermode = !KVM_GUEST_KERNEL_MODE(vcpu);
@@ -2566,18 +2600,18 @@ enum emulation_result kvm_mips_check_privilege(unsigned long cause,
  * (2) TLB entry is present in the Guest TLB but not in the shadow, in this
  *     case we inject the TLB from the Guest TLB into the shadow host TLB
  */
-enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
-                                             uint32_t *opc,
+enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
+                                             u32 *opc,
                                              struct kvm_run *run,
                                              struct kvm_vcpu *vcpu)
 {
        enum emulation_result er = EMULATE_DONE;
-       uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+       u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
        unsigned long va = vcpu->arch.host_cp0_badvaddr;
        int index;
 
-       kvm_debug("kvm_mips_handle_tlbmiss: badvaddr: %#lx, entryhi: %#lx\n",
-                 vcpu->arch.host_cp0_badvaddr, vcpu->arch.host_cp0_entryhi);
+       kvm_debug("kvm_mips_handle_tlbmiss: badvaddr: %#lx\n",
+                 vcpu->arch.host_cp0_badvaddr);
 
        /*
         * KVM would not have got the exception if this entry was valid in the
@@ -2620,13 +2654,12 @@ enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
                        }
                } else {
                        kvm_debug("Injecting hi: %#lx, lo0: %#lx, lo1: %#lx into shadow host TLB\n",
-                                 tlb->tlb_hi, tlb->tlb_lo0, tlb->tlb_lo1);
+                                 tlb->tlb_hi, tlb->tlb_lo[0], tlb->tlb_lo[1]);
                        /*
                         * OK we have a Guest TLB entry, now inject it into the
                         * shadow host TLB
                         */
-                       kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, NULL,
-                                                            NULL);
+                       kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb);
                }
        }
 
diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
new file mode 100644 (file)
index 0000000..6a02b3a
--- /dev/null
@@ -0,0 +1,701 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Generation of main entry point for the guest, exception handling.
+ *
+ * Copyright (C) 2012  MIPS Technologies, Inc.
+ * Authors: Sanjay Lal <sanjayl@kymasys.com>
+ *
+ * Copyright (C) 2016 Imagination Technologies Ltd.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/msa.h>
+#include <asm/setup.h>
+#include <asm/uasm.h>
+
+/* GPR numbers by register name; T0-T3 numbering depends on the ABI */
+#define ZERO           0
+#define AT             1
+#define V0             2
+#define V1             3
+#define A0             4
+#define A1             5
+
+#if _MIPS_SIM == _MIPS_SIM_ABI32
+#define T0             8
+#define T1             9
+#define T2             10
+#define T3             11
+#endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */
+
+#if _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32
+#define T0             12
+#define T1             13
+#define T2             14
+#define T3             15
+#endif /* _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32 */
+
+#define S0             16
+#define S1             17
+#define T9             25
+#define K0             26
+#define K1             27
+#define GP             28
+#define SP             29
+#define RA             31
+
+/* Some CP0 registers, each a "register number, select" argument pair */
+#define C0_HWRENA      7, 0
+#define C0_BADVADDR    8, 0
+#define C0_ENTRYHI     10, 0
+#define C0_STATUS      12, 0
+#define C0_CAUSE       13, 0
+#define C0_EPC         14, 0
+#define C0_EBASE       15, 1
+#define C0_CONFIG5     16, 5
+#define C0_DDATA_LO    28, 3
+#define C0_ERROREPC    30, 0
+
+#define CALLFRAME_SIZ   32
+
+#ifdef CONFIG_64BIT
+#define ST0_KX_IF_64   ST0_KX
+#else
+#define ST0_KX_IF_64   0
+#endif
+
+static unsigned int scratch_vcpu[2] = { C0_DDATA_LO }; /* VCPU pointer */
+static unsigned int scratch_tmp[2] = { C0_ERROREPC };  /* guest k1 temp */
+
+enum label_id {
+       label_fpu_1 = 1,
+       label_msa_1,
+       label_return_to_host,
+       label_kernel_asid,
+       label_exit_common,
+};
+
+UASM_L_LA(_fpu_1)
+UASM_L_LA(_msa_1)
+UASM_L_LA(_return_to_host)
+UASM_L_LA(_kernel_asid)
+UASM_L_LA(_exit_common)
+
+static void *kvm_mips_build_enter_guest(void *addr);
+static void *kvm_mips_build_ret_from_exit(void *addr);
+static void *kvm_mips_build_ret_to_guest(void *addr);
+static void *kvm_mips_build_ret_to_host(void *addr);
+
+/**
+ * kvm_mips_entry_setup() - Perform global setup for entry code.
+ *
+ * Perform global setup for entry code, such as choosing a scratch register.
+ *
+ * Returns:    0 on success.
+ *             -errno on failure (this version has no failure path).
+ */
+int kvm_mips_entry_setup(void)
+{
+       /*
+        * Prefer KScratchN registers (selects 2-7 only, hence the 0xfc mask)
+        * over the defaults above, which may not work on all cores.
+        */
+       unsigned int kscratch_mask = cpu_data[0].kscratch_mask & 0xfc;
+
+       /* Pick VCPU scratch: CP0 register 31, lowest available select */
+       if (kscratch_mask) {
+               scratch_vcpu[0] = 31;
+               scratch_vcpu[1] = ffs(kscratch_mask) - 1;
+               kscratch_mask &= ~BIT(scratch_vcpu[1]);
+       }
+
+       /* Pick the next free one as a temp for saving state, if any is left */
+       if (kscratch_mask) {
+               scratch_tmp[0] = 31;
+               scratch_tmp[1] = ffs(kscratch_mask) - 1;
+               kscratch_mask &= ~BIT(scratch_tmp[1]);
+       }
+
+       return 0;
+}
+
+static void kvm_mips_build_save_scratch(u32 **p, unsigned int tmp,
+                                       unsigned int frame)
+{
+       /* Save VCPU scratch register (via GPR tmp) in cp0_epc slot of frame */
+       UASM_i_MFC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]);
+       UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
+
+       /* If the temp scratch is CP0 register 31, save it in cp0_cause slot */
+       if (scratch_tmp[0] == 31) {
+               UASM_i_MFC0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
+               UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
+       }
+}
+
+static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp,
+                                          unsigned int frame)
+{
+       /*
+        * Reload the host scratch values stashed in the pt_regs cp0_epc and
+        * cp0_cause slots by kvm_mips_build_save_scratch(); tmp is clobbered.
+        */
+       UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
+       UASM_i_MTC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]);
+
+       if (scratch_tmp[0] == 31) {     /* same condition as the save side */
+               UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
+               UASM_i_MTC0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
+       }
+}
+
+/**
+ * build_set_exc_base() - Assemble code to write exception base address.
+ * @p:         Code buffer pointer.
+ * @reg:       Source register (generated code may set WG bit in @reg).
+ *
+ * Assemble code to modify the exception base address in the EBase register,
+ * using the appropriately sized access and setting the WG bit if necessary.
+ */
+static inline void build_set_exc_base(u32 **p, unsigned int reg)
+{
+       if (cpu_has_ebase_wg) {
+               /* Set WG so that all the bits get written (UASM_i_ width) */
+               uasm_i_ori(p, reg, reg, MIPS_EBASE_WG);
+               UASM_i_MTC0(p, reg, C0_EBASE);
+       } else {
+               uasm_i_mtc0(p, reg, C0_EBASE);  /* no WG: plain mtc0 */
+       }
+}
+
+/**
+ * kvm_mips_build_vcpu_run() - Assemble function to start running a guest VCPU.
+ * @addr:      Address to start writing code.
+ *
+ * Assemble the start of the vcpu_run function to run a guest VCPU. The function
+ * conforms to the following prototype:
+ *
+ * int vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu);
+ *
+ * The exit from the guest and return to the caller is handled by the code
+ * generated by kvm_mips_build_ret_to_host().
+ *
+ * Returns:    Next address after end of written function.
+ */
+void *kvm_mips_build_vcpu_run(void *addr)
+{
+       u32 *p = addr;
+       unsigned int i;
+
+       /*
+        * A0: run
+        * A1: vcpu
+        */
+
+       /* k0/k1 unused in host kernel context: k1 = pt_regs below sp */
+       UASM_i_ADDIU(&p, K1, SP, -(int)sizeof(struct pt_regs));
+       for (i = 16; i < 32; ++i) {
+               if (i == 24)
+                       i = 28; /* skip 24-27 (includes t9, k0, k1) */
+               UASM_i_SW(&p, i, offsetof(struct pt_regs, regs[i]), K1);
+       }
+
+       /* Save host status (V0 is reused for the interrupt mask below) */
+       uasm_i_mfc0(&p, V0, C0_STATUS);
+       UASM_i_SW(&p, V0, offsetof(struct pt_regs, cp0_status), K1);
+
+       /* Save scratch registers, will be used to store pointer to vcpu etc */
+       kvm_mips_build_save_scratch(&p, V1, K1);
+
+       /* VCPU scratch register has pointer to vcpu */
+       UASM_i_MTC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
+
+       /* Offset into vcpu->arch: K1 = &vcpu->arch from here on */
+       UASM_i_ADDIU(&p, K1, A1, offsetof(struct kvm_vcpu, arch));
+
+       /*
+        * Save the host stack to VCPU, used for exception processing
+        * when we exit from the Guest
+        */
+       UASM_i_SW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1);
+
+       /* Save the kernel gp as well */
+       UASM_i_SW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1);
+
+       /*
+        * Setup status register for running the guest in UM, interrupts
+        * are disabled
+        */
+       UASM_i_LA(&p, K0, ST0_EXL | KSU_USER | ST0_BEV | ST0_KX_IF_64);
+       uasm_i_mtc0(&p, K0, C0_STATUS);
+       uasm_i_ehb(&p);
+
+       /* load up the new EBASE */
+       UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1);
+       build_set_exc_base(&p, K0);
+
+       /*
+        * Now that the new EBASE has been loaded, unset BEV, set
+        * interrupt mask as it was but make sure that timer interrupts
+        * are enabled
+        */
+       uasm_i_addiu(&p, K0, ZERO, ST0_EXL | KSU_USER | ST0_IE | ST0_KX_IF_64);
+       uasm_i_andi(&p, V0, V0, ST0_IM);
+       uasm_i_or(&p, K0, K0, V0);
+       uasm_i_mtc0(&p, K0, C0_STATUS);
+       uasm_i_ehb(&p);
+
+       p = kvm_mips_build_enter_guest(p);
+
+       return p;
+}
+
+/**
+ * kvm_mips_build_enter_guest() - Assemble code to resume guest execution.
+ * @addr:      Address to start writing code.
+ *
+ * Assemble the code to resume guest execution. This code is common between the
+ * initial entry into the guest from the host, and returning from the exit
+ * handler back to the guest.
+ *
+ * Returns:    Next address after end of written function.
+ */
+static void *kvm_mips_build_enter_guest(void *addr)
+{
+       u32 *p = addr;
+       unsigned int i;
+       struct uasm_label labels[2];
+       struct uasm_reloc relocs[2];
+       struct uasm_label *l = labels;
+       struct uasm_reloc *r = relocs;
+
+       memset(labels, 0, sizeof(labels));
+       memset(relocs, 0, sizeof(relocs));
+
+       /* Set Guest EPC */
+       UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, pc), K1);
+       UASM_i_MTC0(&p, T0, C0_EPC);
+
+       /* Pick guest kernel or user ASID array from guest CP0_Status bits */
+       UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, cop0), K1);
+       UASM_i_LW(&p, T0, offsetof(struct mips_coproc, reg[MIPS_CP0_STATUS][0]),
+                 T0);
+       uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL);
+       uasm_i_xori(&p, T0, T0, KSU_USER);
+       uasm_il_bnez(&p, &r, T0, label_kernel_asid);
+        UASM_i_ADDIU(&p, T1, K1,
+                     offsetof(struct kvm_vcpu_arch, guest_kernel_asid));
+       /* else user: overrides the kernel ASID base set in the delay slot */
+       UASM_i_ADDIU(&p, T1, K1,
+                    offsetof(struct kvm_vcpu_arch, guest_user_asid));
+       uasm_l_kernel_asid(&l, p);
+
+       /* t1: contains the base of the ASID array, need to get the cpu id  */
+       /* smp_processor_id */
+       uasm_i_lw(&p, T2, offsetof(struct thread_info, cpu), GP);
+       /* x4: byte offset of this CPU's 32-bit entry in the ASID array */
+       uasm_i_sll(&p, T2, T2, 2);
+       UASM_i_ADDU(&p, T3, T1, T2);
+       uasm_i_lw(&p, K0, 0, T3);
+#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
+       /* T2 is cpu * 4, so x sizeof(struct cpuinfo_mips)/4 */
+       uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/4);
+       uasm_i_mul(&p, T2, T2, T3);
+
+       UASM_i_LA_mostly(&p, AT, (long)&cpu_data[0].asid_mask);
+       UASM_i_ADDU(&p, AT, AT, T2);
+       UASM_i_LW(&p, T2, uasm_rel_lo((long)&cpu_data[0].asid_mask), AT);
+       uasm_i_and(&p, K0, K0, T2);
+#else
+       uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID);
+#endif
+       uasm_i_mtc0(&p, K0, C0_ENTRYHI);
+       uasm_i_ehb(&p);
+
+       /* Disable RDHWR access */
+       uasm_i_mtc0(&p, ZERO, C0_HWRENA);
+
+       /* load the guest context from VCPU and return */
+       for (i = 1; i < 32; ++i) {
+               /* Guest k0/k1 loaded later */
+               if (i == K0 || i == K1)
+                       continue;
+               UASM_i_LW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1);
+       }
+
+#ifndef CONFIG_CPU_MIPSR6
+       /* Restore hi/lo */
+       UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, hi), K1);
+       uasm_i_mthi(&p, K0);
+
+       UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, lo), K1);
+       uasm_i_mtlo(&p, K0);
+#endif
+
+       /* Restore guest k0/k1; k1 last, as it is the base for both loads */
+       UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1);
+       UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1);
+
+       /* Jump to guest */
+       uasm_i_eret(&p);
+
+       uasm_resolve_relocs(relocs, labels);
+
+       return p;
+}
+
+/**
+ * kvm_mips_build_exception() - Assemble first level guest exception handler.
+ * @addr:      Address to start writing code.
+ * @handler:   Address of common handler (within range of @addr).
+ *
+ * Assemble exception vector code for guest execution. The generated vector will
+ * branch to the common exception handler generated by kvm_mips_build_exit().
+ *
+ * Returns:    Next address after end of written function.
+ */
+void *kvm_mips_build_exception(void *addr, void *handler)
+{
+       u32 *p = addr;
+       struct uasm_label labels[2];
+       struct uasm_reloc relocs[2];
+       struct uasm_label *l = labels;
+       struct uasm_reloc *r = relocs;
+
+       memset(labels, 0, sizeof(labels));
+       memset(relocs, 0, sizeof(relocs));
+
+       /* Save guest k1 into scratch register */
+       UASM_i_MTC0(&p, K1, scratch_tmp[0], scratch_tmp[1]);
+
+       /* Get the VCPU pointer from the VCPU scratch register */
+       UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]);
+       UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
+
+       /* Save guest k0 into VCPU structure */
+       UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1);
+
+       /* Branch to the common handler (label is bound to @handler below) */
+       uasm_il_b(&p, &r, label_exit_common);
+        uasm_i_nop(&p);        /* branch delay slot */
+
+       uasm_l_exit_common(&l, handler);
+       uasm_resolve_relocs(relocs, labels);
+
+       return p;
+}
+
+/**
+ * kvm_mips_build_exit() - Assemble common guest exit handler.
+ * @addr:      Address to start writing code.
+ *
+ * Assemble the generic guest exit handling code. This is called by the
+ * exception vectors (generated by kvm_mips_build_exception()), and calls
+ * kvm_mips_handle_exit(), then either resumes the guest or returns to the host
+ * depending on the return value.
+ *
+ * Returns:    Next address after end of written function.
+ */
+void *kvm_mips_build_exit(void *addr)
+{
+       u32 *p = addr;
+       unsigned int i;
+       struct uasm_label labels[3];
+       struct uasm_reloc relocs[3];
+       struct uasm_label *l = labels;
+       struct uasm_reloc *r = relocs;
+
+       memset(labels, 0, sizeof(labels));
+       memset(relocs, 0, sizeof(relocs));
+
+       /*
+        * Generic Guest exception handler. We end up here when the guest
+        * does something that causes a trap to kernel mode.
+        *
+        * Both k0/k1 registers will have already been saved (k0 into the vcpu
+        * structure, and k1 into the scratch_tmp register).
+        *
+        * The k1 register will already contain the kvm_vcpu_arch pointer.
+        */
+
+       /* Start saving Guest context to VCPU */
+       for (i = 0; i < 32; ++i) {
+               /* Guest k0/k1 saved later */
+               if (i == K0 || i == K1)
+                       continue;
+               UASM_i_SW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1);
+       }
+
+#ifndef CONFIG_CPU_MIPSR6
+       /* We need to save hi/lo and restore them on the way out */
+       uasm_i_mfhi(&p, T0);
+       UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, hi), K1);
+
+       uasm_i_mflo(&p, T0);
+       UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, lo), K1);
+#endif
+
+       /* Finally save guest k1 (parked in scratch_tmp by the vector) to VCPU */
+       uasm_i_ehb(&p);
+       UASM_i_MFC0(&p, T0, scratch_tmp[0], scratch_tmp[1]);
+       UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1);
+
+       /* Now that context has been saved, we can use other registers */
+
+       /* Restore vcpu into a1, keeping a copy in s1 */
+       UASM_i_MFC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
+       uasm_i_move(&p, S1, A1);
+
+       /* Restore run (vcpu->run) */
+       UASM_i_LW(&p, A0, offsetof(struct kvm_vcpu, run), A1);
+       /* Save pointer to run in s0, will be saved by the compiler */
+       uasm_i_move(&p, S0, A0);
+
+       /*
+        * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process
+        * the exception
+        */
+       UASM_i_MFC0(&p, K0, C0_EPC);
+       UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, pc), K1);
+
+       UASM_i_MFC0(&p, K0, C0_BADVADDR);
+       UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_badvaddr),
+                 K1);
+
+       uasm_i_mfc0(&p, K0, C0_CAUSE);
+       uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_cause), K1);
+
+       /* Now restore the host state just enough to run the handlers */
+
+       /* Switch EBASE to the one used by Linux */
+       /* Set BEV first (V0 keeps the Status value as read), then load EBASE */
+       uasm_i_mfc0(&p, V0, C0_STATUS);
+
+       uasm_i_lui(&p, AT, ST0_BEV >> 16);
+       uasm_i_or(&p, K0, V0, AT);
+
+       uasm_i_mtc0(&p, K0, C0_STATUS);
+       uasm_i_ehb(&p);
+
+       UASM_i_LA_mostly(&p, K0, (long)&ebase);
+       UASM_i_LW(&p, K0, uasm_rel_lo((long)&ebase), K0);
+       build_set_exc_base(&p, K0);
+
+       if (raw_cpu_has_fpu) {
+               /*
+                * If FPU is enabled, save FCR31 and clear it so that later
+                * ctc1's don't trigger FPE for pending exceptions.
+                */
+               uasm_i_lui(&p, AT, ST0_CU1 >> 16);
+               uasm_i_and(&p, V1, V0, AT);
+               uasm_il_beqz(&p, &r, V1, label_fpu_1);
+                uasm_i_nop(&p);
+               uasm_i_cfc1(&p, T0, 31);
+               uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.fcr31),
+                         K1);
+               uasm_i_ctc1(&p, ZERO, 31);
+               uasm_l_fpu_1(&l, p);
+       }
+
+       if (cpu_has_msa) {
+               /*
+                * If MSA is enabled, save MSACSR and clear it so that later
+                * instructions don't trigger MSAFPE for pending exceptions.
+                */
+               uasm_i_mfc0(&p, T0, C0_CONFIG5);
+               uasm_i_ext(&p, T0, T0, 27, 1); /* MIPS_CONF5_MSAEN */
+               uasm_il_beqz(&p, &r, T0, label_msa_1);
+                uasm_i_nop(&p);
+               uasm_i_cfcmsa(&p, T0, MSA_CSR);
+               uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.msacsr),
+                         K1);
+               uasm_i_ctcmsa(&p, MSA_CSR, ZERO);
+               uasm_l_msa_1(&l, p);
+       }
+
+       /* EBASE loaded: write V0 (pre-BEV) less EXL/KSU_USER/IE, plus CU0 */
+       uasm_i_addiu(&p, AT, ZERO, ~(ST0_EXL | KSU_USER | ST0_IE));
+       uasm_i_and(&p, V0, V0, AT);
+       uasm_i_lui(&p, AT, ST0_CU0 >> 16);
+       uasm_i_or(&p, V0, V0, AT);
+       uasm_i_mtc0(&p, V0, C0_STATUS);
+       uasm_i_ehb(&p);
+
+       /* Load up host GP */
+       UASM_i_LW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1);
+
+       /* Need a stack before we can jump to "C" */
+       UASM_i_LW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1);
+
+       /* Saved host state: SP = host_stack - sizeof(struct pt_regs) */
+       UASM_i_ADDIU(&p, SP, SP, -(int)sizeof(struct pt_regs));
+
+       /*
+        * XXXKYMA do we need to load the host ASID, maybe not because the
+        * kernel entries are marked GLOBAL, need to verify
+        */
+
+       /* Restore host scratch registers, as we'll have clobbered them */
+       kvm_mips_build_restore_scratch(&p, K0, SP);
+
+       /* Restore RDHWR access */
+       UASM_i_LA_mostly(&p, K0, (long)&hwrena);
+       uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0);
+       uasm_i_mtc0(&p, K0, C0_HWRENA);
+
+       /* Jump to handler */
+       /*
+        * XXXKYMA: not sure if this is safe, how large is the stack??
+        * Now jump to the kvm_mips_handle_exit() to see if we can deal
+        * with this in the kernel
+        */
+       UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit);
+       uasm_i_jalr(&p, RA, T9);
+        UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ);      /* delay slot */
+
+       uasm_resolve_relocs(relocs, labels);
+
+       p = kvm_mips_build_ret_from_exit(p);
+
+       return p;
+}
+
+/**
+ * kvm_mips_build_ret_from_exit() - Assemble guest exit return handler.
+ * @addr:      Address to start writing code.
+ *
+ * Assemble the code to handle the return from kvm_mips_handle_exit(), either
+ * resuming the guest or returning to the host depending on the return value.
+ *
+ * The emitted sequence disables interrupts, recomputes k1 from s1, and tests
+ * the RESUME_HOST bit of v0. With the bit clear, execution falls through into
+ * the code emitted by kvm_mips_build_ret_to_guest(); with the bit set, it
+ * branches to label_return_to_host, which is bound to the start of the code
+ * emitted by kvm_mips_build_ret_to_host(). Both sequences are emitted here,
+ * back to back, so the returned address lies after the host-return code.
+ *
+ * The forward branch is recorded in the local relocs[] table and patched by
+ * uasm_resolve_relocs() once the label address is known.
+ *
+ * NOTE(review): only one label and one reloc are used; the second, zeroed
+ * entry of each array presumably terminates the table for
+ * uasm_resolve_relocs() - confirm against the uasm implementation.
+ *
+ * Returns:    Next address after end of written function.
+ */
+static void *kvm_mips_build_ret_from_exit(void *addr)
+{
+       u32 *p = addr;
+       struct uasm_label labels[2];
+       struct uasm_reloc relocs[2];
+       struct uasm_label *l = labels;
+       struct uasm_reloc *r = relocs;
+
+       memset(labels, 0, sizeof(labels));
+       memset(relocs, 0, sizeof(relocs));
+
+       /*
+        * Return from handler. Make sure interrupts are disabled.
+        * The di targets ZERO, so the previous Status value is discarded;
+        * the ehb that follows is the hazard barrier for the Status change.
+        */
+       uasm_i_di(&p, ZERO);
+       uasm_i_ehb(&p);
+
+       /*
+        * XXXKYMA: k0/k1 could have been blown away if we processed
+        * an exception while we were handling the exception from the
+        * guest, reload k1.
+        *
+        * s1 is used as the vcpu pointer here, so after the two instructions
+        * below k1 = s1 + offsetof(struct kvm_vcpu, arch), i.e. &vcpu->arch.
+        */
+
+       uasm_i_move(&p, K1, S1);
+       UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
+
+       /*
+        * Check return value, should tell us if we are returning to the
+        * host (handle I/O etc.) or resuming the guest. The nop fills the
+        * delay slot of the branch.
+        */
+       uasm_i_andi(&p, T0, V0, RESUME_HOST);
+       uasm_il_bnez(&p, &r, T0, label_return_to_host);
+        uasm_i_nop(&p);
+
+       p = kvm_mips_build_ret_to_guest(p);
+
+       uasm_l_return_to_host(&l, p);
+       p = kvm_mips_build_ret_to_host(p);
+
+       uasm_resolve_relocs(relocs, labels);
+
+       return p;
+}
+
+/**
+ * kvm_mips_build_ret_to_guest() - Assemble code to return to the guest.
+ * @addr:      Address to start writing code.
+ *
+ * Assemble the code to handle return from the guest exit handler
+ * (kvm_mips_handle_exit()) back to the guest.
+ *
+ * The emitted code reads s1 as the vcpu pointer and uses k1 as the base for
+ * struct kvm_vcpu_arch offsets; k1 is recomputed by the code emitted in
+ * kvm_mips_build_ret_from_exit() just before this sequence. It switches the
+ * exception base to the value loaded from guest_ebase, prepares Status for
+ * the guest, and then continues with the sequence emitted by
+ * kvm_mips_build_enter_guest().
+ *
+ * Returns:    Next address after end of written function.
+ */
+static void *kvm_mips_build_ret_to_guest(void *addr)
+{
+       u32 *p = addr;
+
+       /*
+        * Put the saved pointer to vcpu (s1) back into the scratch register.
+        * NOTE(review): scratch_vcpu[0]/[1] look like the CP0 register number
+        * and select of that scratch register - confirm where they are set.
+        */
+       UASM_i_MTC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]);
+
+       /* Load up the Guest EBASE to minimize the window where BEV is set */
+       UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1);
+
+       /*
+        * Switch EBASE back to the one used by KVM. Status is read into v1
+        * and a copy with ST0_BEV ORed in (built in k0; the mask is formed in
+        * at by lui from its upper 16 bits) is written back before the
+        * exception base is changed to t0. v1 itself is left unmodified for
+        * the Status update below.
+        */
+       uasm_i_mfc0(&p, V1, C0_STATUS);
+       uasm_i_lui(&p, AT, ST0_BEV >> 16);
+       uasm_i_or(&p, K0, V1, AT);
+       uasm_i_mtc0(&p, K0, C0_STATUS);
+       uasm_i_ehb(&p);
+       build_set_exc_base(&p, T0);
+
+       /*
+        * Setup status register for running guest in UM: set EXL, KSU_USER
+        * and IE, clear CU0 and MX. This is applied to the v1 value read
+        * above, so the BEV bit ORed into k0 is not carried into this write.
+        */
+       uasm_i_ori(&p, V1, V1, ST0_EXL | KSU_USER | ST0_IE);
+       UASM_i_LA(&p, AT, ~(ST0_CU0 | ST0_MX));
+       uasm_i_and(&p, V1, V1, AT);
+       uasm_i_mtc0(&p, V1, C0_STATUS);
+       uasm_i_ehb(&p);
+
+       p = kvm_mips_build_enter_guest(p);
+
+       return p;
+}
+
+/**
+ * kvm_mips_build_ret_to_host() - Assemble code to return to the host.
+ * @addr:      Address to start writing code.
+ *
+ * Assemble the code to handle return from the guest exit handler
+ * (kvm_mips_handle_exit()) back to the host, i.e. to the caller of the vcpu_run
+ * function generated by kvm_mips_build_vcpu_run().
+ *
+ * On entry the emitted code expects k1 to be the base for struct
+ * kvm_vcpu_arch offsets and v0 to hold the kvm_mips_handle_exit() return
+ * value. It reloads registers 16-23 and 28-30 plus ra from the struct pt_regs
+ * sized area directly below the address loaded from host_stack, rewrites
+ * HWREna from the hwrena variable, and returns through ra with v0
+ * arithmetically shifted right by 2.
+ *
+ * NOTE(review): the pt_regs area is presumably filled by the code from
+ * kvm_mips_build_vcpu_run(), which is not visible here - confirm.
+ *
+ * Returns:    Next address after end of written function.
+ */
+static void *kvm_mips_build_ret_to_host(void *addr)
+{
+       u32 *p = addr;
+       unsigned int i;
+
+       /*
+        * EBASE is already pointing to Linux.
+        * k1 = host_stack - sizeof(struct pt_regs); from here on k1 addresses
+        * the saved host registers and no longer the vcpu arch state.
+        */
+       UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, host_stack), K1);
+       UASM_i_ADDIU(&p, K1, K1, -(int)sizeof(struct pt_regs));
+
+       /*
+        * r2/v0 is the return code, shift it down by 2 (arithmetic)
+        * to recover the err code. The shift result is formed in k0 and
+        * then moved back into v0.
+        */
+       uasm_i_sra(&p, K0, V0, 2);
+       uasm_i_move(&p, V0, K0);
+
+       /*
+        * Load context saved on the host stack: registers 16-23 and 28-30.
+        * The loop index is bumped from 24 to 28 inside the body, so
+        * registers 24-27 are skipped; ra is reloaded separately below.
+        */
+       for (i = 16; i < 31; ++i) {
+               if (i == 24)
+                       i = 28;
+               UASM_i_LW(&p, i, offsetof(struct pt_regs, regs[i]), K1);
+       }
+
+       /*
+        * Restore RDHWR access: load the hwrena variable (address formed by
+        * UASM_i_LA_mostly() plus the uasm_rel_lo() offset of the lw) and
+        * write it to HWREna.
+        */
+       UASM_i_LA_mostly(&p, K0, (long)&hwrena);
+       uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0);
+       uasm_i_mtc0(&p, K0, C0_HWRENA);
+
+       /*
+        * Restore RA, which is the address we will return to. The nop fills
+        * the delay slot of the jr.
+        */
+       UASM_i_LW(&p, RA, offsetof(struct pt_regs, regs[RA]), K1);
+       uasm_i_jr(&p, RA);
+        uasm_i_nop(&p);
+
+       return p;
+}
+
index 531fbf5..16f17c6 100644 (file)
 #include <asm/mipsregs.h>
 #include <asm/regdef.h>
 
+/* preprocessor replaces the fp in ".set fp=64" with $30 otherwise */
+#undef fp
+
        .set    noreorder
        .set    noat
 
 LEAF(__kvm_save_fpu)
        .set    push
-       .set    mips64r2
        SET_HARDFLOAT
+       .set    fp=64
        mfc0    t0, CP0_STATUS
        sll     t0, t0, 5                       # is Status.FR set?
        bgez    t0, 1f                          # no: skip odd doubles
@@ -63,8 +66,8 @@ LEAF(__kvm_save_fpu)
 
 LEAF(__kvm_restore_fpu)
        .set    push
-       .set    mips64r2
        SET_HARDFLOAT
+       .set    fp=64
        mfc0    t0, CP0_STATUS
        sll     t0, t0, 5                       # is Status.FR set?
        bgez    t0, 1f                          # no: skip odd doubles
index 95f7906..ad28dac 100644 (file)
 
 #include "interrupt.h"
 
-void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, uint32_t priority)
+void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority)
 {
        set_bit(priority, &vcpu->arch.pending_exceptions);
 }
 
-void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, uint32_t priority)
+void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority)
 {
        clear_bit(priority, &vcpu->arch.pending_exceptions);
 }
@@ -114,10 +114,10 @@ void kvm_mips_dequeue_io_int_cb(struct kvm_vcpu *vcpu,
 
 /* Deliver the interrupt of the corresponding priority, if possible. */
 int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
-                           uint32_t cause)
+                           u32 cause)
 {
        int allowed = 0;
-       uint32_t exccode;
+       u32 exccode;
 
        struct kvm_vcpu_arch *arch = &vcpu->arch;
        struct mips_coproc *cop0 = vcpu->arch.cop0;
@@ -196,12 +196,12 @@ int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
 }
 
 int kvm_mips_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority,
-                         uint32_t cause)
+                         u32 cause)
 {
        return 1;
 }
 
-void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, uint32_t cause)
+void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, u32 cause)
 {
        unsigned long *pending = &vcpu->arch.pending_exceptions;
        unsigned long *pending_clr = &vcpu->arch.pending_exceptions_clr;
index 2143884..fb118a2 100644 (file)
 #define MIPS_EXC_MAX                12
 /* XXXSL More to follow */
 
-extern char __kvm_mips_vcpu_run_end[];
-extern char mips32_exception[], mips32_exceptionEnd[];
-extern char mips32_GuestException[], mips32_GuestExceptionEnd[];
-
 #define C_TI        (_ULCAST_(1) << 30)
 
 #define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (0)
 #define KVM_MIPS_IRQ_CLEAR_ALL_AT_ONCE   (0)
 
-void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, uint32_t priority);
-void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, uint32_t priority);
+void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority);
+void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority);
 int kvm_mips_pending_timer(struct kvm_vcpu *vcpu);
 
 void kvm_mips_queue_timer_int_cb(struct kvm_vcpu *vcpu);
@@ -48,7 +44,7 @@ void kvm_mips_queue_io_int_cb(struct kvm_vcpu *vcpu,
 void kvm_mips_dequeue_io_int_cb(struct kvm_vcpu *vcpu,
                                struct kvm_mips_interrupt *irq);
 int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
-                           uint32_t cause);
+                           u32 cause);
 int kvm_mips_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority,
-                         uint32_t cause);
-void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, uint32_t cause);
+                         u32 cause);
+void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, u32 cause);
diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S
deleted file mode 100644 (file)
index 828fcfc..0000000
+++ /dev/null
@@ -1,605 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Main entry point for the guest, exception handling.
- *
- * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
- * Authors: Sanjay Lal <sanjayl@kymasys.com>
- */
-
-#include <asm/asm.h>
-#include <asm/asmmacro.h>
-#include <asm/regdef.h>
-#include <asm/mipsregs.h>
-#include <asm/stackframe.h>
-#include <asm/asm-offsets.h>
-
-#define _C_LABEL(x)     x
-#define MIPSX(name)     mips32_ ## name
-#define CALLFRAME_SIZ   32
-
-/*
- * VECTOR
- *  exception vector entrypoint
- */
-#define VECTOR(x, regmask)      \
-    .ent    _C_LABEL(x),0;      \
-    EXPORT(x);
-
-#define VECTOR_END(x)      \
-    EXPORT(x);
-
-/* Overload, Danger Will Robinson!! */
-#define PT_HOST_USERLOCAL   PT_EPC
-
-#define CP0_DDATA_LO        $28,3
-
-/* Resume Flags */
-#define RESUME_FLAG_HOST        (1<<1)  /* Resume host? */
-
-#define RESUME_GUEST            0
-#define RESUME_HOST             RESUME_FLAG_HOST
-
-/*
- * __kvm_mips_vcpu_run: entry point to the guest
- * a0: run
- * a1: vcpu
- */
-       .set    noreorder
-
-FEXPORT(__kvm_mips_vcpu_run)
-       /* k0/k1 not being used in host kernel context */
-       INT_ADDIU k1, sp, -PT_SIZE
-       LONG_S  $16, PT_R16(k1)
-       LONG_S  $17, PT_R17(k1)
-       LONG_S  $18, PT_R18(k1)
-       LONG_S  $19, PT_R19(k1)
-       LONG_S  $20, PT_R20(k1)
-       LONG_S  $21, PT_R21(k1)
-       LONG_S  $22, PT_R22(k1)
-       LONG_S  $23, PT_R23(k1)
-
-       LONG_S  $28, PT_R28(k1)
-       LONG_S  $29, PT_R29(k1)
-       LONG_S  $30, PT_R30(k1)
-       LONG_S  $31, PT_R31(k1)
-
-       /* Save hi/lo */
-       mflo    v0
-       LONG_S  v0, PT_LO(k1)
-       mfhi    v1
-       LONG_S  v1, PT_HI(k1)
-
-       /* Save host status */
-       mfc0    v0, CP0_STATUS
-       LONG_S  v0, PT_STATUS(k1)
-
-       /* Save DDATA_LO, will be used to store pointer to vcpu */
-       mfc0    v1, CP0_DDATA_LO
-       LONG_S  v1, PT_HOST_USERLOCAL(k1)
-
-       /* DDATA_LO has pointer to vcpu */
-       mtc0    a1, CP0_DDATA_LO
-
-       /* Offset into vcpu->arch */
-       INT_ADDIU k1, a1, VCPU_HOST_ARCH
-
-       /*
-        * Save the host stack to VCPU, used for exception processing
-        * when we exit from the Guest
-        */
-       LONG_S  sp, VCPU_HOST_STACK(k1)
-
-       /* Save the kernel gp as well */
-       LONG_S  gp, VCPU_HOST_GP(k1)
-
-       /*
-        * Setup status register for running the guest in UM, interrupts
-        * are disabled
-        */
-       li      k0, (ST0_EXL | KSU_USER | ST0_BEV)
-       mtc0    k0, CP0_STATUS
-       ehb
-
-       /* load up the new EBASE */
-       LONG_L  k0, VCPU_GUEST_EBASE(k1)
-       mtc0    k0, CP0_EBASE
-
-       /*
-        * Now that the new EBASE has been loaded, unset BEV, set
-        * interrupt mask as it was but make sure that timer interrupts
-        * are enabled
-        */
-       li      k0, (ST0_EXL | KSU_USER | ST0_IE)
-       andi    v0, v0, ST0_IM
-       or      k0, k0, v0
-       mtc0    k0, CP0_STATUS
-       ehb
-
-       /* Set Guest EPC */
-       LONG_L  t0, VCPU_PC(k1)
-       mtc0    t0, CP0_EPC
-
-FEXPORT(__kvm_mips_load_asid)
-       /* Set the ASID for the Guest Kernel */
-       PTR_L   t0, VCPU_COP0(k1)
-       LONG_L  t0, COP0_STATUS(t0)
-       andi    t0, KSU_USER | ST0_ERL | ST0_EXL
-       xori    t0, KSU_USER
-       bnez    t0, 1f          /* If kernel */
-        INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
-       INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID    /* else user */
-1:
-       /* t1: contains the base of the ASID array, need to get the cpu id */
-       LONG_L  t2, TI_CPU($28)             /* smp_processor_id */
-       INT_SLL t2, t2, 2                   /* x4 */
-       REG_ADDU t3, t1, t2
-       LONG_L  k0, (t3)
-#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
-       li      t3, CPUINFO_SIZE/4
-       mul     t2, t2, t3              /* x sizeof(struct cpuinfo_mips)/4 */
-       LONG_L  t2, (cpu_data + CPUINFO_ASID_MASK)(t2)
-       and     k0, k0, t2
-#else
-       andi    k0, k0, MIPS_ENTRYHI_ASID
-#endif
-       mtc0    k0, CP0_ENTRYHI
-       ehb
-
-       /* Disable RDHWR access */
-       mtc0    zero, CP0_HWRENA
-
-       .set    noat
-       /* Now load up the Guest Context from VCPU */
-       LONG_L  $1, VCPU_R1(k1)
-       LONG_L  $2, VCPU_R2(k1)
-       LONG_L  $3, VCPU_R3(k1)
-
-       LONG_L  $4, VCPU_R4(k1)
-       LONG_L  $5, VCPU_R5(k1)
-       LONG_L  $6, VCPU_R6(k1)
-       LONG_L  $7, VCPU_R7(k1)
-
-       LONG_L  $8, VCPU_R8(k1)
-       LONG_L  $9, VCPU_R9(k1)
-       LONG_L  $10, VCPU_R10(k1)
-       LONG_L  $11, VCPU_R11(k1)
-       LONG_L  $12, VCPU_R12(k1)
-       LONG_L  $13, VCPU_R13(k1)
-       LONG_L  $14, VCPU_R14(k1)
-       LONG_L  $15, VCPU_R15(k1)
-       LONG_L  $16, VCPU_R16(k1)
-       LONG_L  $17, VCPU_R17(k1)
-       LONG_L  $18, VCPU_R18(k1)
-       LONG_L  $19, VCPU_R19(k1)
-       LONG_L  $20, VCPU_R20(k1)
-       LONG_L  $21, VCPU_R21(k1)
-       LONG_L  $22, VCPU_R22(k1)
-       LONG_L  $23, VCPU_R23(k1)
-       LONG_L  $24, VCPU_R24(k1)
-       LONG_L  $25, VCPU_R25(k1)
-
-       /* k0/k1 loaded up later */
-
-       LONG_L  $28, VCPU_R28(k1)
-       LONG_L  $29, VCPU_R29(k1)
-       LONG_L  $30, VCPU_R30(k1)
-       LONG_L  $31, VCPU_R31(k1)
-
-       /* Restore hi/lo */
-       LONG_L  k0, VCPU_LO(k1)
-       mtlo    k0
-
-       LONG_L  k0, VCPU_HI(k1)
-       mthi    k0
-
-FEXPORT(__kvm_mips_load_k0k1)
-       /* Restore the guest's k0/k1 registers */
-       LONG_L  k0, VCPU_R26(k1)
-       LONG_L  k1, VCPU_R27(k1)
-
-       /* Jump to guest */
-       eret
-EXPORT(__kvm_mips_vcpu_run_end)
-
-VECTOR(MIPSX(exception), unknown)
-/* Find out what mode we came from and jump to the proper handler. */
-       mtc0    k0, CP0_ERROREPC        #01: Save guest k0
-       ehb                             #02:
-
-       mfc0    k0, CP0_EBASE           #02: Get EBASE
-       INT_SRL k0, k0, 10              #03: Get rid of CPUNum
-       INT_SLL k0, k0, 10              #04
-       LONG_S  k1, 0x3000(k0)          #05: Save k1 @ offset 0x3000
-       INT_ADDIU k0, k0, 0x2000        #06: Exception handler is
-                                       #    installed @ offset 0x2000
-       j       k0                      #07: jump to the function
-        nop                            #08: branch delay slot
-VECTOR_END(MIPSX(exceptionEnd))
-.end MIPSX(exception)
-
-/*
- * Generic Guest exception handler. We end up here when the guest
- * does something that causes a trap to kernel mode.
- */
-NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra)
-       /* Get the VCPU pointer from DDTATA_LO */
-       mfc0    k1, CP0_DDATA_LO
-       INT_ADDIU k1, k1, VCPU_HOST_ARCH
-
-       /* Start saving Guest context to VCPU */
-       LONG_S  $0, VCPU_R0(k1)
-       LONG_S  $1, VCPU_R1(k1)
-       LONG_S  $2, VCPU_R2(k1)
-       LONG_S  $3, VCPU_R3(k1)
-       LONG_S  $4, VCPU_R4(k1)
-       LONG_S  $5, VCPU_R5(k1)
-       LONG_S  $6, VCPU_R6(k1)
-       LONG_S  $7, VCPU_R7(k1)
-       LONG_S  $8, VCPU_R8(k1)
-       LONG_S  $9, VCPU_R9(k1)
-       LONG_S  $10, VCPU_R10(k1)
-       LONG_S  $11, VCPU_R11(k1)
-       LONG_S  $12, VCPU_R12(k1)
-       LONG_S  $13, VCPU_R13(k1)
-       LONG_S  $14, VCPU_R14(k1)
-       LONG_S  $15, VCPU_R15(k1)
-       LONG_S  $16, VCPU_R16(k1)
-       LONG_S  $17, VCPU_R17(k1)
-       LONG_S  $18, VCPU_R18(k1)
-       LONG_S  $19, VCPU_R19(k1)
-       LONG_S  $20, VCPU_R20(k1)
-       LONG_S  $21, VCPU_R21(k1)
-       LONG_S  $22, VCPU_R22(k1)
-       LONG_S  $23, VCPU_R23(k1)
-       LONG_S  $24, VCPU_R24(k1)
-       LONG_S  $25, VCPU_R25(k1)
-
-       /* Guest k0/k1 saved later */
-
-       LONG_S  $28, VCPU_R28(k1)
-       LONG_S  $29, VCPU_R29(k1)
-       LONG_S  $30, VCPU_R30(k1)
-       LONG_S  $31, VCPU_R31(k1)
-
-       .set at
-
-       /* We need to save hi/lo and restore them on the way out */
-       mfhi    t0
-       LONG_S  t0, VCPU_HI(k1)
-
-       mflo    t0
-       LONG_S  t0, VCPU_LO(k1)
-
-       /* Finally save guest k0/k1 to VCPU */
-       mfc0    t0, CP0_ERROREPC
-       LONG_S  t0, VCPU_R26(k1)
-
-       /* Get GUEST k1 and save it in VCPU */
-       PTR_LI  t1, ~0x2ff
-       mfc0    t0, CP0_EBASE
-       and     t0, t0, t1
-       LONG_L  t0, 0x3000(t0)
-       LONG_S  t0, VCPU_R27(k1)
-
-       /* Now that context has been saved, we can use other registers */
-
-       /* Restore vcpu */
-       mfc0    a1, CP0_DDATA_LO
-       move    s1, a1
-
-       /* Restore run (vcpu->run) */
-       LONG_L  a0, VCPU_RUN(a1)
-       /* Save pointer to run in s0, will be saved by the compiler */
-       move    s0, a0
-
-       /*
-        * Save Host level EPC, BadVaddr and Cause to VCPU, useful to
-        * process the exception
-        */
-       mfc0    k0,CP0_EPC
-       LONG_S  k0, VCPU_PC(k1)
-
-       mfc0    k0, CP0_BADVADDR
-       LONG_S  k0, VCPU_HOST_CP0_BADVADDR(k1)
-
-       mfc0    k0, CP0_CAUSE
-       LONG_S  k0, VCPU_HOST_CP0_CAUSE(k1)
-
-       mfc0    k0, CP0_ENTRYHI
-       LONG_S  k0, VCPU_HOST_ENTRYHI(k1)
-
-       /* Now restore the host state just enough to run the handlers */
-
-       /* Switch EBASE to the one used by Linux */
-       /* load up the host EBASE */
-       mfc0    v0, CP0_STATUS
-
-       or      k0, v0, ST0_BEV
-
-       mtc0    k0, CP0_STATUS
-       ehb
-
-       LONG_L  k0, VCPU_HOST_EBASE(k1)
-       mtc0    k0,CP0_EBASE
-
-       /*
-        * If FPU is enabled, save FCR31 and clear it so that later ctc1's don't
-        * trigger FPE for pending exceptions.
-        */
-       and     v1, v0, ST0_CU1
-       beqz    v1, 1f
-        nop
-       .set    push
-       SET_HARDFLOAT
-       cfc1    t0, fcr31
-       sw      t0, VCPU_FCR31(k1)
-       ctc1    zero,fcr31
-       .set    pop
-1:
-
-#ifdef CONFIG_CPU_HAS_MSA
-       /*
-        * If MSA is enabled, save MSACSR and clear it so that later
-        * instructions don't trigger MSAFPE for pending exceptions.
-        */
-       mfc0    t0, CP0_CONFIG3
-       ext     t0, t0, 28, 1 /* MIPS_CONF3_MSAP */
-       beqz    t0, 1f
-        nop
-       mfc0    t0, CP0_CONFIG5
-       ext     t0, t0, 27, 1 /* MIPS_CONF5_MSAEN */
-       beqz    t0, 1f
-        nop
-       _cfcmsa t0, MSA_CSR
-       sw      t0, VCPU_MSA_CSR(k1)
-       _ctcmsa MSA_CSR, zero
-1:
-#endif
-
-       /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
-       and     v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE)
-       or      v0, v0, ST0_CU0
-       mtc0    v0, CP0_STATUS
-       ehb
-
-       /* Load up host GP */
-       LONG_L  gp, VCPU_HOST_GP(k1)
-
-       /* Need a stack before we can jump to "C" */
-       LONG_L  sp, VCPU_HOST_STACK(k1)
-
-       /* Saved host state */
-       INT_ADDIU sp, sp, -PT_SIZE
-
-       /*
-        * XXXKYMA do we need to load the host ASID, maybe not because the
-        * kernel entries are marked GLOBAL, need to verify
-        */
-
-       /* Restore host DDATA_LO */
-       LONG_L  k0, PT_HOST_USERLOCAL(sp)
-       mtc0    k0, CP0_DDATA_LO
-
-       /* Restore RDHWR access */
-       PTR_LI  k0, 0x2000000F
-       mtc0    k0, CP0_HWRENA
-
-       /* Jump to handler */
-FEXPORT(__kvm_mips_jump_to_handler)
-       /*
-        * XXXKYMA: not sure if this is safe, how large is the stack??
-        * Now jump to the kvm_mips_handle_exit() to see if we can deal
-        * with this in the kernel
-        */
-       PTR_LA  t9, kvm_mips_handle_exit
-       jalr.hb t9
-        INT_ADDIU sp, sp, -CALLFRAME_SIZ           /* BD Slot */
-
-       /* Return from handler Make sure interrupts are disabled */
-       di
-       ehb
-
-       /*
-        * XXXKYMA: k0/k1 could have been blown away if we processed
-        * an exception while we were handling the exception from the
-        * guest, reload k1
-        */
-
-       move    k1, s1
-       INT_ADDIU k1, k1, VCPU_HOST_ARCH
-
-       /*
-        * Check return value, should tell us if we are returning to the
-        * host (handle I/O etc)or resuming the guest
-        */
-       andi    t0, v0, RESUME_HOST
-       bnez    t0, __kvm_mips_return_to_host
-        nop
-
-__kvm_mips_return_to_guest:
-       /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */
-       mtc0    s1, CP0_DDATA_LO
-
-       /* Load up the Guest EBASE to minimize the window where BEV is set */
-       LONG_L  t0, VCPU_GUEST_EBASE(k1)
-
-       /* Switch EBASE back to the one used by KVM */
-       mfc0    v1, CP0_STATUS
-       or      k0, v1, ST0_BEV
-       mtc0    k0, CP0_STATUS
-       ehb
-       mtc0    t0, CP0_EBASE
-
-       /* Setup status register for running guest in UM */
-       or      v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
-       and     v1, v1, ~(ST0_CU0 | ST0_MX)
-       mtc0    v1, CP0_STATUS
-       ehb
-
-       /* Set Guest EPC */
-       LONG_L  t0, VCPU_PC(k1)
-       mtc0    t0, CP0_EPC
-
-       /* Set the ASID for the Guest Kernel */
-       PTR_L   t0, VCPU_COP0(k1)
-       LONG_L  t0, COP0_STATUS(t0)
-       andi    t0, KSU_USER | ST0_ERL | ST0_EXL
-       xori    t0, KSU_USER
-       bnez    t0, 1f          /* If kernel */
-        INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
-       INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID    /* else user */
-1:
-       /* t1: contains the base of the ASID array, need to get the cpu id  */
-       LONG_L  t2, TI_CPU($28)         /* smp_processor_id */
-       INT_SLL t2, t2, 2               /* x4 */
-       REG_ADDU t3, t1, t2
-       LONG_L  k0, (t3)
-#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
-       li      t3, CPUINFO_SIZE/4
-       mul     t2, t2, t3              /* x sizeof(struct cpuinfo_mips)/4 */
-       LONG_L  t2, (cpu_data + CPUINFO_ASID_MASK)(t2)
-       and     k0, k0, t2
-#else
-       andi    k0, k0, MIPS_ENTRYHI_ASID
-#endif
-       mtc0    k0, CP0_ENTRYHI
-       ehb
-
-       /* Disable RDHWR access */
-       mtc0    zero, CP0_HWRENA
-
-       .set    noat
-       /* load the guest context from VCPU and return */
-       LONG_L  $0, VCPU_R0(k1)
-       LONG_L  $1, VCPU_R1(k1)
-       LONG_L  $2, VCPU_R2(k1)
-       LONG_L  $3, VCPU_R3(k1)
-       LONG_L  $4, VCPU_R4(k1)
-       LONG_L  $5, VCPU_R5(k1)
-       LONG_L  $6, VCPU_R6(k1)
-       LONG_L  $7, VCPU_R7(k1)
-       LONG_L  $8, VCPU_R8(k1)
-       LONG_L  $9, VCPU_R9(k1)
-       LONG_L  $10, VCPU_R10(k1)
-       LONG_L  $11, VCPU_R11(k1)
-       LONG_L  $12, VCPU_R12(k1)
-       LONG_L  $13, VCPU_R13(k1)
-       LONG_L  $14, VCPU_R14(k1)
-       LONG_L  $15, VCPU_R15(k1)
-       LONG_L  $16, VCPU_R16(k1)
-       LONG_L  $17, VCPU_R17(k1)
-       LONG_L  $18, VCPU_R18(k1)
-       LONG_L  $19, VCPU_R19(k1)
-       LONG_L  $20, VCPU_R20(k1)
-       LONG_L  $21, VCPU_R21(k1)
-       LONG_L  $22, VCPU_R22(k1)
-       LONG_L  $23, VCPU_R23(k1)
-       LONG_L  $24, VCPU_R24(k1)
-       LONG_L  $25, VCPU_R25(k1)
-
-       /* $/k1 loaded later */
-       LONG_L  $28, VCPU_R28(k1)
-       LONG_L  $29, VCPU_R29(k1)
-       LONG_L  $30, VCPU_R30(k1)
-       LONG_L  $31, VCPU_R31(k1)
-
-FEXPORT(__kvm_mips_skip_guest_restore)
-       LONG_L  k0, VCPU_HI(k1)
-       mthi    k0
-
-       LONG_L  k0, VCPU_LO(k1)
-       mtlo    k0
-
-       LONG_L  k0, VCPU_R26(k1)
-       LONG_L  k1, VCPU_R27(k1)
-
-       eret
-       .set    at
-
-__kvm_mips_return_to_host:
-       /* EBASE is already pointing to Linux */
-       LONG_L  k1, VCPU_HOST_STACK(k1)
-       INT_ADDIU k1,k1, -PT_SIZE
-
-       /* Restore host DDATA_LO */
-       LONG_L  k0, PT_HOST_USERLOCAL(k1)
-       mtc0    k0, CP0_DDATA_LO
-
-       /*
-        * r2/v0 is the return code, shift it down by 2 (arithmetic)
-        * to recover the err code
-        */
-       INT_SRA k0, v0, 2
-       move    $2, k0
-
-       /* Load context saved on the host stack */
-       LONG_L  $16, PT_R16(k1)
-       LONG_L  $17, PT_R17(k1)
-       LONG_L  $18, PT_R18(k1)
-       LONG_L  $19, PT_R19(k1)
-       LONG_L  $20, PT_R20(k1)
-       LONG_L  $21, PT_R21(k1)
-       LONG_L  $22, PT_R22(k1)
-       LONG_L  $23, PT_R23(k1)
-
-       LONG_L  $28, PT_R28(k1)
-       LONG_L  $29, PT_R29(k1)
-       LONG_L  $30, PT_R30(k1)
-
-       LONG_L  k0, PT_HI(k1)
-       mthi    k0
-
-       LONG_L  k0, PT_LO(k1)
-       mtlo    k0
-
-       /* Restore RDHWR access */
-       PTR_LI  k0, 0x2000000F
-       mtc0    k0, CP0_HWRENA
-
-       /* Restore RA, which is the address we will return to */
-       LONG_L  ra, PT_R31(k1)
-       j       ra
-        nop
-
-VECTOR_END(MIPSX(GuestExceptionEnd))
-.end MIPSX(GuestException)
-
-MIPSX(exceptions):
-       ####
-       ##### The exception handlers.
-       #####
-       .word _C_LABEL(MIPSX(GuestException))   #  0
-       .word _C_LABEL(MIPSX(GuestException))   #  1
-       .word _C_LABEL(MIPSX(GuestException))   #  2
-       .word _C_LABEL(MIPSX(GuestException))   #  3
-       .word _C_LABEL(MIPSX(GuestException))   #  4
-       .word _C_LABEL(MIPSX(GuestException))   #  5
-       .word _C_LABEL(MIPSX(GuestException))   #  6
-       .word _C_LABEL(MIPSX(GuestException))   #  7
-       .word _C_LABEL(MIPSX(GuestException))   #  8
-       .word _C_LABEL(MIPSX(GuestException))   #  9
-       .word _C_LABEL(MIPSX(GuestException))   # 10
-       .word _C_LABEL(MIPSX(GuestException))   # 11
-       .word _C_LABEL(MIPSX(GuestException))   # 12
-       .word _C_LABEL(MIPSX(GuestException))   # 13
-       .word _C_LABEL(MIPSX(GuestException))   # 14
-       .word _C_LABEL(MIPSX(GuestException))   # 15
-       .word _C_LABEL(MIPSX(GuestException))   # 16
-       .word _C_LABEL(MIPSX(GuestException))   # 17
-       .word _C_LABEL(MIPSX(GuestException))   # 18
-       .word _C_LABEL(MIPSX(GuestException))   # 19
-       .word _C_LABEL(MIPSX(GuestException))   # 20
-       .word _C_LABEL(MIPSX(GuestException))   # 21
-       .word _C_LABEL(MIPSX(GuestException))   # 22
-       .word _C_LABEL(MIPSX(GuestException))   # 23
-       .word _C_LABEL(MIPSX(GuestException))   # 24
-       .word _C_LABEL(MIPSX(GuestException))   # 25
-       .word _C_LABEL(MIPSX(GuestException))   # 26
-       .word _C_LABEL(MIPSX(GuestException))   # 27
-       .word _C_LABEL(MIPSX(GuestException))   # 28
-       .word _C_LABEL(MIPSX(GuestException))   # 29
-       .word _C_LABEL(MIPSX(GuestException))   # 30
-       .word _C_LABEL(MIPSX(GuestException))   # 31
index 44da525..a6ea084 100644 (file)
@@ -9,6 +9,7 @@
  * Authors: Sanjay Lal <sanjayl@kymasys.com>
  */
 
+#include <linux/bitops.h>
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kdebug.h>
@@ -147,7 +148,7 @@ void kvm_mips_free_vcpus(struct kvm *kvm)
        /* Put the pages we reserved for the guest pmap */
        for (i = 0; i < kvm->arch.guest_pmap_npages; i++) {
                if (kvm->arch.guest_pmap[i] != KVM_INVALID_PAGE)
-                       kvm_mips_release_pfn_clean(kvm->arch.guest_pmap[i]);
+                       kvm_release_pfn_clean(kvm->arch.guest_pmap[i]);
        }
        kfree(kvm->arch.guest_pmap);
 
@@ -244,10 +245,27 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
        }
 }
 
+static inline void dump_handler(const char *symbol, void *start, void *end)
+{
+       u32 *p;
+       /*
+        * Print the 32-bit words in [start, end) through pr_debug() as an
+        * assembler-style listing: a LEAF(symbol) line, the .set push and
+        * .set noreorder directives, one .word line per word followed by
+        * that word's address after a '#', then .set pop and END(symbol).
+        *
+        * symbol: name printed in the LEAF() and END() lines
+        * start:  address of the first word to print
+        * end:    address at which printing stops (not printed itself)
+        */
+       pr_debug("LEAF(%s)\n", symbol);
+
+       pr_debug("\t.set push\n");
+       pr_debug("\t.set noreorder\n");
+
+       for (p = start; p < (u32 *)end; ++p)
+               pr_debug("\t.word\t0x%08x\t\t# %p\n", *p, p);
+
+       pr_debug("\t.set\tpop\n");
+
+       pr_debug("\tEND(%s)\n", symbol);
+}
+
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 {
-       int err, size, offset;
-       void *gebase;
+       int err, size;
+       void *gebase, *p, *handler;
        int i;
 
        struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
@@ -273,9 +291,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
        else
                size = 0x4000;
 
-       /* Save Linux EBASE */
-       vcpu->arch.host_ebase = (void *)read_c0_ebase();
-
        gebase = kzalloc(ALIGN(size, PAGE_SIZE), GFP_KERNEL);
 
        if (!gebase) {
@@ -285,44 +300,53 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
        kvm_debug("Allocated %d bytes for KVM Exception Handlers @ %p\n",
                  ALIGN(size, PAGE_SIZE), gebase);
 
+       /*
+        * Check new ebase actually fits in CP0_EBase. The lack of a write gate
+        * limits us to the low 512MB of physical address space. If the memory
+        * we allocate is out of range, just give up now.
+        */
+       if (!cpu_has_ebase_wg && virt_to_phys(gebase) >= 0x20000000) {
+               kvm_err("CP0_EBase.WG required for guest exception base %pK\n",
+                       gebase);
+               err = -ENOMEM;
+               goto out_free_gebase;
+       }
+
        /* Save new ebase */
        vcpu->arch.guest_ebase = gebase;
 
-       /* Copy L1 Guest Exception handler to correct offset */
+       /* Build guest exception vectors dynamically in unmapped memory */
+       handler = gebase + 0x2000;
 
        /* TLB Refill, EXL = 0 */
-       memcpy(gebase, mips32_exception,
-              mips32_exceptionEnd - mips32_exception);
+       kvm_mips_build_exception(gebase, handler);
 
        /* General Exception Entry point */
-       memcpy(gebase + 0x180, mips32_exception,
-              mips32_exceptionEnd - mips32_exception);
+       kvm_mips_build_exception(gebase + 0x180, handler);
 
        /* For vectored interrupts poke the exception code @ all offsets 0-7 */
        for (i = 0; i < 8; i++) {
                kvm_debug("L1 Vectored handler @ %p\n",
                          gebase + 0x200 + (i * VECTORSPACING));
-               memcpy(gebase + 0x200 + (i * VECTORSPACING), mips32_exception,
-                      mips32_exceptionEnd - mips32_exception);
+               kvm_mips_build_exception(gebase + 0x200 + i * VECTORSPACING,
+                                        handler);
        }
 
-       /* General handler, relocate to unmapped space for sanity's sake */
-       offset = 0x2000;
-       kvm_debug("Installing KVM Exception handlers @ %p, %#x bytes\n",
-                 gebase + offset,
-                 mips32_GuestExceptionEnd - mips32_GuestException);
+       /* General exit handler */
+       p = handler;
+       p = kvm_mips_build_exit(p);
 
-       memcpy(gebase + offset, mips32_GuestException,
-              mips32_GuestExceptionEnd - mips32_GuestException);
+       /* Guest entry routine */
+       vcpu->arch.vcpu_run = p;
+       p = kvm_mips_build_vcpu_run(p);
 
-#ifdef MODULE
-       offset += mips32_GuestExceptionEnd - mips32_GuestException;
-       memcpy(gebase + offset, (char *)__kvm_mips_vcpu_run,
-              __kvm_mips_vcpu_run_end - (char *)__kvm_mips_vcpu_run);
-       vcpu->arch.vcpu_run = gebase + offset;
-#else
-       vcpu->arch.vcpu_run = __kvm_mips_vcpu_run;
-#endif
+       /* Dump the generated code */
+       pr_debug("#include <asm/asm.h>\n");
+       pr_debug("#include <asm/regdef.h>\n");
+       pr_debug("\n");
+       dump_handler("kvm_vcpu_run", vcpu->arch.vcpu_run, p);
+       dump_handler("kvm_gen_exc", gebase + 0x180, gebase + 0x200);
+       dump_handler("kvm_exit", gebase + 0x2000, vcpu->arch.vcpu_run);
 
        /* Invalidate the icache for these ranges */
        local_flush_icache_range((unsigned long)gebase,
@@ -408,17 +432,19 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
        kvm_mips_deliver_interrupts(vcpu,
                                    kvm_read_c0_guest_cause(vcpu->arch.cop0));
 
-       __kvm_guest_enter();
+       guest_enter_irqoff();
 
        /* Disable hardware page table walking while in guest */
        htw_stop();
 
+       trace_kvm_enter(vcpu);
        r = vcpu->arch.vcpu_run(run, vcpu);
+       trace_kvm_out(vcpu);
 
        /* Re-enable HTW before enabling interrupts */
        htw_start();
 
-       __kvm_guest_exit();
+       guest_exit_irqoff();
        local_irq_enable();
 
        if (vcpu->sigset_active)
@@ -507,8 +533,10 @@ static u64 kvm_mips_get_one_regs[] = {
        KVM_REG_MIPS_R30,
        KVM_REG_MIPS_R31,
 
+#ifndef CONFIG_CPU_MIPSR6
        KVM_REG_MIPS_HI,
        KVM_REG_MIPS_LO,
+#endif
        KVM_REG_MIPS_PC,
 
        KVM_REG_MIPS_CP0_INDEX,
@@ -539,6 +567,104 @@ static u64 kvm_mips_get_one_regs[] = {
        KVM_REG_MIPS_COUNT_HZ,
 };
 
+static u64 kvm_mips_get_one_regs_fpu[] = {     /* copied when the guest can have an FPU */
+       KVM_REG_MIPS_FCR_IR,
+       KVM_REG_MIPS_FCR_CSR,
+};
+/* Copied by kvm_mips_copy_reg_indices() when the guest can have MSA */
+static u64 kvm_mips_get_one_regs_msa[] = {
+       KVM_REG_MIPS_MSA_IR,
+       KVM_REG_MIPS_MSA_CSR,
+};
+/* Entry i is copied when bit (i + 2) of vcpu->arch.kscratch_enabled is set */
+static u64 kvm_mips_get_one_regs_kscratch[] = {
+       KVM_REG_MIPS_CP0_KSCRATCH1,
+       KVM_REG_MIPS_CP0_KSCRATCH2,
+       KVM_REG_MIPS_CP0_KSCRATCH3,
+       KVM_REG_MIPS_CP0_KSCRATCH4,
+       KVM_REG_MIPS_CP0_KSCRATCH5,
+       KVM_REG_MIPS_CP0_KSCRATCH6,
+};
+/* Number of IDs kvm_mips_copy_reg_indices() emits; sizes KVM_GET_REG_LIST */
+static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu)
+{
+       unsigned long ret;
+
+       ret = ARRAY_SIZE(kvm_mips_get_one_regs);
+       if (kvm_mips_guest_can_have_fpu(&vcpu->arch)) {
+               ret += ARRAY_SIZE(kvm_mips_get_one_regs_fpu) + 48; /* 32 FPR_32 + 16 FPR_64 */
+               /* FPR_64 IDs for the odd indices, only reported with F64 */
+               if (boot_cpu_data.fpu_id & MIPS_FPIR_F64)
+                       ret += 16;
+       }
+       if (kvm_mips_guest_can_have_msa(&vcpu->arch))
+               ret += ARRAY_SIZE(kvm_mips_get_one_regs_msa) + 32; /* + 32 VEC_128 IDs */
+       ret += __arch_hweight8(vcpu->arch.kscratch_enabled); /* set bits; copy loop tests bits 2..7 */
+       ret += kvm_mips_callbacks->num_regs(vcpu);
+
+       return ret;
+}
+/* Copy to user the ID list sized by kvm_mips_num_regs(); -EFAULT on a fault */
+static int kvm_mips_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
+{
+       u64 index;
+       unsigned int i;
+
+       if (copy_to_user(indices, kvm_mips_get_one_regs,
+                        sizeof(kvm_mips_get_one_regs)))
+               return -EFAULT;
+       indices += ARRAY_SIZE(kvm_mips_get_one_regs);
+       /* FPU: control IDs, then FPR_32/FPR_64 IDs for indices 0..31 */
+       if (kvm_mips_guest_can_have_fpu(&vcpu->arch)) {
+               if (copy_to_user(indices, kvm_mips_get_one_regs_fpu,
+                                sizeof(kvm_mips_get_one_regs_fpu)))
+                       return -EFAULT;
+               indices += ARRAY_SIZE(kvm_mips_get_one_regs_fpu);
+
+               for (i = 0; i < 32; ++i) {
+                       index = KVM_REG_MIPS_FPR_32(i);
+                       if (copy_to_user(indices, &index, sizeof(index)))
+                               return -EFAULT;
+                       ++indices;
+
+                       /* skip odd doubles if no F64 */
+                       if (i & 1 && !(boot_cpu_data.fpu_id & MIPS_FPIR_F64))
+                               continue;
+
+                       index = KVM_REG_MIPS_FPR_64(i);
+                       if (copy_to_user(indices, &index, sizeof(index)))
+                               return -EFAULT;
+                       ++indices;
+               }
+       }
+       /* MSA: control IDs, then one VEC_128 ID for each index 0..31 */
+       if (kvm_mips_guest_can_have_msa(&vcpu->arch)) {
+               if (copy_to_user(indices, kvm_mips_get_one_regs_msa,
+                                sizeof(kvm_mips_get_one_regs_msa)))
+                       return -EFAULT;
+               indices += ARRAY_SIZE(kvm_mips_get_one_regs_msa);
+
+               for (i = 0; i < 32; ++i) {
+                       index = KVM_REG_MIPS_VEC_128(i);
+                       if (copy_to_user(indices, &index, sizeof(index)))
+                               return -EFAULT;
+                       ++indices;
+               }
+       }
+       /* KScratch: table entry i is gated by bit (i + 2) of kscratch_enabled */
+       for (i = 0; i < 6; ++i) {
+               if (!(vcpu->arch.kscratch_enabled & BIT(i + 2)))
+                       continue;
+
+               if (copy_to_user(indices, &kvm_mips_get_one_regs_kscratch[i],
+                                sizeof(kvm_mips_get_one_regs_kscratch[i])))
+                       return -EFAULT;
+               ++indices;
+       }
+       /* The callback receives the advanced pointer; its result is returned */
+       return kvm_mips_callbacks->copy_reg_indices(vcpu, indices);
+}
+
 static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
                            const struct kvm_one_reg *reg)
 {
@@ -554,12 +680,14 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
        case KVM_REG_MIPS_R0 ... KVM_REG_MIPS_R31:
                v = (long)vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0];
                break;
+#ifndef CONFIG_CPU_MIPSR6
        case KVM_REG_MIPS_HI:
                v = (long)vcpu->arch.hi;
                break;
        case KVM_REG_MIPS_LO:
                v = (long)vcpu->arch.lo;
                break;
+#endif
        case KVM_REG_MIPS_PC:
                v = (long)vcpu->arch.pc;
                break;
@@ -688,17 +816,37 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
        case KVM_REG_MIPS_CP0_ERROREPC:
                v = (long)kvm_read_c0_guest_errorepc(cop0);
                break;
+       case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
+               idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
+               if (!(vcpu->arch.kscratch_enabled & BIT(idx)))
+                       return -EINVAL;
+               switch (idx) {
+               case 2:
+                       v = (long)kvm_read_c0_guest_kscratch1(cop0);
+                       break;
+               case 3:
+                       v = (long)kvm_read_c0_guest_kscratch2(cop0);
+                       break;
+               case 4:
+                       v = (long)kvm_read_c0_guest_kscratch3(cop0);
+                       break;
+               case 5:
+                       v = (long)kvm_read_c0_guest_kscratch4(cop0);
+                       break;
+               case 6:
+                       v = (long)kvm_read_c0_guest_kscratch5(cop0);
+                       break;
+               case 7:
+                       v = (long)kvm_read_c0_guest_kscratch6(cop0);
+                       break;
+               }
+               break;
        /* registers to be handled specially */
-       case KVM_REG_MIPS_CP0_COUNT:
-       case KVM_REG_MIPS_COUNT_CTL:
-       case KVM_REG_MIPS_COUNT_RESUME:
-       case KVM_REG_MIPS_COUNT_HZ:
+       default:
                ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v);
                if (ret)
                        return ret;
                break;
-       default:
-               return -EINVAL;
        }
        if ((reg->id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U64) {
                u64 __user *uaddr64 = (u64 __user *)(long)reg->addr;
@@ -755,12 +903,14 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
        case KVM_REG_MIPS_R1 ... KVM_REG_MIPS_R31:
                vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0] = v;
                break;
+#ifndef CONFIG_CPU_MIPSR6
        case KVM_REG_MIPS_HI:
                vcpu->arch.hi = v;
                break;
        case KVM_REG_MIPS_LO:
                vcpu->arch.lo = v;
                break;
+#endif
        case KVM_REG_MIPS_PC:
                vcpu->arch.pc = v;
                break;
@@ -859,22 +1009,34 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
        case KVM_REG_MIPS_CP0_ERROREPC:
                kvm_write_c0_guest_errorepc(cop0, v);
                break;
+       case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
+               idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
+               if (!(vcpu->arch.kscratch_enabled & BIT(idx)))
+                       return -EINVAL;
+               switch (idx) {
+               case 2:
+                       kvm_write_c0_guest_kscratch1(cop0, v);
+                       break;
+               case 3:
+                       kvm_write_c0_guest_kscratch2(cop0, v);
+                       break;
+               case 4:
+                       kvm_write_c0_guest_kscratch3(cop0, v);
+                       break;
+               case 5:
+                       kvm_write_c0_guest_kscratch4(cop0, v);
+                       break;
+               case 6:
+                       kvm_write_c0_guest_kscratch5(cop0, v);
+                       break;
+               case 7:
+                       kvm_write_c0_guest_kscratch6(cop0, v);
+                       break;
+               }
+               break;
        /* registers to be handled specially */
-       case KVM_REG_MIPS_CP0_COUNT:
-       case KVM_REG_MIPS_CP0_COMPARE:
-       case KVM_REG_MIPS_CP0_CAUSE:
-       case KVM_REG_MIPS_CP0_CONFIG:
-       case KVM_REG_MIPS_CP0_CONFIG1:
-       case KVM_REG_MIPS_CP0_CONFIG2:
-       case KVM_REG_MIPS_CP0_CONFIG3:
-       case KVM_REG_MIPS_CP0_CONFIG4:
-       case KVM_REG_MIPS_CP0_CONFIG5:
-       case KVM_REG_MIPS_COUNT_CTL:
-       case KVM_REG_MIPS_COUNT_RESUME:
-       case KVM_REG_MIPS_COUNT_HZ:
-               return kvm_mips_callbacks->set_one_reg(vcpu, reg, v);
        default:
-               return -EINVAL;
+               return kvm_mips_callbacks->set_one_reg(vcpu, reg, v);
        }
        return 0;
 }
@@ -927,23 +1089,18 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
        }
        case KVM_GET_REG_LIST: {
                struct kvm_reg_list __user *user_list = argp;
-               u64 __user *reg_dest;
                struct kvm_reg_list reg_list;
                unsigned n;
 
                if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
                        return -EFAULT;
                n = reg_list.n;
-               reg_list.n = ARRAY_SIZE(kvm_mips_get_one_regs);
+               reg_list.n = kvm_mips_num_regs(vcpu);
                if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
                        return -EFAULT;
                if (n < reg_list.n)
                        return -E2BIG;
-               reg_dest = user_list->reg;
-               if (copy_to_user(reg_dest, kvm_mips_get_one_regs,
-                                sizeof(kvm_mips_get_one_regs)))
-                       return -EFAULT;
-               return 0;
+               return kvm_mips_copy_reg_indices(vcpu, user_list->reg);
        }
        case KVM_NMI:
                /* Treat the NMI as a CPU reset */
@@ -1222,7 +1379,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
 static void kvm_mips_set_c0_status(void)
 {
-       uint32_t status = read_c0_status();
+       u32 status = read_c0_status();
 
        if (cpu_has_dsp)
                status |= (ST0_MX);
@@ -1236,9 +1393,9 @@ static void kvm_mips_set_c0_status(void)
  */
 int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
-       uint32_t cause = vcpu->arch.host_cp0_cause;
-       uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
+       u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
        unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
@@ -1260,6 +1417,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 
        kvm_debug("kvm_mips_handle_exit: cause: %#x, PC: %p, kvm_run: %p, kvm_vcpu: %p\n",
                        cause, opc, run, vcpu);
+       trace_kvm_exit(vcpu, exccode);
 
        /*
         * Do a privilege check, if in UM most of these exit conditions end up
@@ -1279,7 +1437,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
                kvm_debug("[%d]EXCCODE_INT @ %p\n", vcpu->vcpu_id, opc);
 
                ++vcpu->stat.int_exits;
-               trace_kvm_exit(vcpu, INT_EXITS);
 
                if (need_resched())
                        cond_resched();
@@ -1291,7 +1448,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
                kvm_debug("EXCCODE_CPU: @ PC: %p\n", opc);
 
                ++vcpu->stat.cop_unusable_exits;
-               trace_kvm_exit(vcpu, COP_UNUSABLE_EXITS);
                ret = kvm_mips_callbacks->handle_cop_unusable(vcpu);
                /* XXXKYMA: Might need to return to user space */
                if (run->exit_reason == KVM_EXIT_IRQ_WINDOW_OPEN)
@@ -1300,7 +1456,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 
        case EXCCODE_MOD:
                ++vcpu->stat.tlbmod_exits;
-               trace_kvm_exit(vcpu, TLBMOD_EXITS);
                ret = kvm_mips_callbacks->handle_tlb_mod(vcpu);
                break;
 
@@ -1310,7 +1465,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
                          badvaddr);
 
                ++vcpu->stat.tlbmiss_st_exits;
-               trace_kvm_exit(vcpu, TLBMISS_ST_EXITS);
                ret = kvm_mips_callbacks->handle_tlb_st_miss(vcpu);
                break;
 
@@ -1319,61 +1473,51 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
                          cause, opc, badvaddr);
 
                ++vcpu->stat.tlbmiss_ld_exits;
-               trace_kvm_exit(vcpu, TLBMISS_LD_EXITS);
                ret = kvm_mips_callbacks->handle_tlb_ld_miss(vcpu);
                break;
 
        case EXCCODE_ADES:
                ++vcpu->stat.addrerr_st_exits;
-               trace_kvm_exit(vcpu, ADDRERR_ST_EXITS);
                ret = kvm_mips_callbacks->handle_addr_err_st(vcpu);
                break;
 
        case EXCCODE_ADEL:
                ++vcpu->stat.addrerr_ld_exits;
-               trace_kvm_exit(vcpu, ADDRERR_LD_EXITS);
                ret = kvm_mips_callbacks->handle_addr_err_ld(vcpu);
                break;
 
        case EXCCODE_SYS:
                ++vcpu->stat.syscall_exits;
-               trace_kvm_exit(vcpu, SYSCALL_EXITS);
                ret = kvm_mips_callbacks->handle_syscall(vcpu);
                break;
 
        case EXCCODE_RI:
                ++vcpu->stat.resvd_inst_exits;
-               trace_kvm_exit(vcpu, RESVD_INST_EXITS);
                ret = kvm_mips_callbacks->handle_res_inst(vcpu);
                break;
 
        case EXCCODE_BP:
                ++vcpu->stat.break_inst_exits;
-               trace_kvm_exit(vcpu, BREAK_INST_EXITS);
                ret = kvm_mips_callbacks->handle_break(vcpu);
                break;
 
        case EXCCODE_TR:
                ++vcpu->stat.trap_inst_exits;
-               trace_kvm_exit(vcpu, TRAP_INST_EXITS);
                ret = kvm_mips_callbacks->handle_trap(vcpu);
                break;
 
        case EXCCODE_MSAFPE:
                ++vcpu->stat.msa_fpe_exits;
-               trace_kvm_exit(vcpu, MSA_FPE_EXITS);
                ret = kvm_mips_callbacks->handle_msa_fpe(vcpu);
                break;
 
        case EXCCODE_FPE:
                ++vcpu->stat.fpe_exits;
-               trace_kvm_exit(vcpu, FPE_EXITS);
                ret = kvm_mips_callbacks->handle_fpe(vcpu);
                break;
 
        case EXCCODE_MSADIS:
                ++vcpu->stat.msa_disabled_exits;
-               trace_kvm_exit(vcpu, MSA_DISABLED_EXITS);
                ret = kvm_mips_callbacks->handle_msa_disabled(vcpu);
                break;
 
@@ -1400,11 +1544,13 @@ skip_emul:
                        run->exit_reason = KVM_EXIT_INTR;
                        ret = (-EINTR << 2) | RESUME_HOST;
                        ++vcpu->stat.signal_exits;
-                       trace_kvm_exit(vcpu, SIGNAL_EXITS);
+                       trace_kvm_exit(vcpu, KVM_TRACE_EXIT_SIGNAL);
                }
        }
 
        if (ret == RESUME_GUEST) {
+               trace_kvm_reenter(vcpu);
+
                /*
                 * If FPU / MSA are enabled (i.e. the guest's FPU / MSA context
                 * is live), restore FCR31 / MSACSR.
@@ -1450,7 +1596,7 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu)
         * not to clobber the status register directly via the commpage.
         */
        if (cpu_has_msa && sr & ST0_CU1 && !(sr & ST0_FR) &&
-           vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+           vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
                kvm_lose_fpu(vcpu);
 
        /*
@@ -1465,9 +1611,12 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu)
        enable_fpu_hazard();
 
        /* If guest FPU state not active, restore it now */
-       if (!(vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)) {
+       if (!(vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)) {
                __kvm_restore_fpu(&vcpu->arch);
-               vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_FPU;
+               vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU;
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_FPU);
+       } else {
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_ENABLE, KVM_TRACE_AUX_FPU);
        }
 
        preempt_enable();
@@ -1494,8 +1643,8 @@ void kvm_own_msa(struct kvm_vcpu *vcpu)
                 * interacts with MSA state, so play it safe and save it first.
                 */
                if (!(sr & ST0_FR) &&
-                   (vcpu->arch.fpu_inuse & (KVM_MIPS_FPU_FPU |
-                               KVM_MIPS_FPU_MSA)) == KVM_MIPS_FPU_FPU)
+                   (vcpu->arch.aux_inuse & (KVM_MIPS_AUX_FPU |
+                               KVM_MIPS_AUX_MSA)) == KVM_MIPS_AUX_FPU)
                        kvm_lose_fpu(vcpu);
 
                change_c0_status(ST0_CU1 | ST0_FR, sr);
@@ -1509,22 +1658,26 @@ void kvm_own_msa(struct kvm_vcpu *vcpu)
        set_c0_config5(MIPS_CONF5_MSAEN);
        enable_fpu_hazard();
 
-       switch (vcpu->arch.fpu_inuse & (KVM_MIPS_FPU_FPU | KVM_MIPS_FPU_MSA)) {
-       case KVM_MIPS_FPU_FPU:
+       switch (vcpu->arch.aux_inuse & (KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA)) {
+       case KVM_MIPS_AUX_FPU:
                /*
                 * Guest FPU state already loaded, only restore upper MSA state
                 */
                __kvm_restore_msa_upper(&vcpu->arch);
-               vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_MSA;
+               vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA;
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_MSA);
                break;
        case 0:
                /* Neither FPU or MSA already active, restore full MSA state */
                __kvm_restore_msa(&vcpu->arch);
-               vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_MSA;
+               vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA;
                if (kvm_mips_guest_has_fpu(&vcpu->arch))
-                       vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_FPU;
+                       vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU;
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE,
+                             KVM_TRACE_AUX_FPU_MSA);
                break;
        default:
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_ENABLE, KVM_TRACE_AUX_MSA);
                break;
        }
 
@@ -1536,13 +1689,15 @@ void kvm_own_msa(struct kvm_vcpu *vcpu)
 void kvm_drop_fpu(struct kvm_vcpu *vcpu)
 {
        preempt_disable();
-       if (cpu_has_msa && vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) {
+       if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) {
                disable_msa();
-               vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_MSA;
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_DISCARD, KVM_TRACE_AUX_MSA);
+               vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_MSA;
        }
-       if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) {
+       if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
                clear_c0_status(ST0_CU1 | ST0_FR);
-               vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_FPU;
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_DISCARD, KVM_TRACE_AUX_FPU);
+               vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU;
        }
        preempt_enable();
 }
@@ -1558,25 +1713,27 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu)
         */
 
        preempt_disable();
-       if (cpu_has_msa && vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) {
+       if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) {
                set_c0_config5(MIPS_CONF5_MSAEN);
                enable_fpu_hazard();
 
                __kvm_save_msa(&vcpu->arch);
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU_MSA);
 
                /* Disable MSA & FPU */
                disable_msa();
-               if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) {
+               if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
                        clear_c0_status(ST0_CU1 | ST0_FR);
                        disable_fpu_hazard();
                }
-               vcpu->arch.fpu_inuse &= ~(KVM_MIPS_FPU_FPU | KVM_MIPS_FPU_MSA);
-       } else if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) {
+               vcpu->arch.aux_inuse &= ~(KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA);
+       } else if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
                set_c0_status(ST0_CU1);
                enable_fpu_hazard();
 
                __kvm_save_fpu(&vcpu->arch);
-               vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_FPU;
+               vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU;
+               trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU);
 
                /* Disable FPU */
                clear_c0_status(ST0_CU1 | ST0_FR);
@@ -1638,6 +1795,10 @@ static int __init kvm_mips_init(void)
 {
        int ret;
 
+       ret = kvm_mips_entry_setup();
+       if (ret)
+               return ret;
+
        ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
 
        if (ret)
@@ -1645,18 +1806,6 @@ static int __init kvm_mips_init(void)
 
        register_die_notifier(&kvm_mips_csr_die_notifier);
 
-       /*
-        * On MIPS, kernel modules are executed from "mapped space", which
-        * requires TLBs. The TLB handling code is statically linked with
-        * the rest of the kernel (tlb.c) to avoid the possibility of
-        * double faulting. The issue is that the TLB code references
-        * routines that are part of the the KVM module, which are only
-        * available once the module is loaded.
-        */
-       kvm_mips_gfn_to_pfn = gfn_to_pfn;
-       kvm_mips_release_pfn_clean = kvm_release_pfn_clean;
-       kvm_mips_is_error_pfn = is_error_pfn;
-
        return 0;
 }
 
@@ -1664,10 +1813,6 @@ static void __exit kvm_mips_exit(void)
 {
        kvm_exit();
 
-       kvm_mips_gfn_to_pfn = NULL;
-       kvm_mips_release_pfn_clean = NULL;
-       kvm_mips_is_error_pfn = NULL;
-
        unregister_die_notifier(&kvm_mips_csr_die_notifier);
 }
 
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
new file mode 100644 (file)
index 0000000..57319ee
--- /dev/null
@@ -0,0 +1,375 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * KVM/MIPS MMU handling in the KVM module.
+ *
+ * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+ * Authors: Sanjay Lal <sanjayl@kymasys.com>
+ */
+
+#include <linux/highmem.h>
+#include <linux/kvm_host.h>
+#include <asm/mmu_context.h>
+
+static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
+{
+       int cpu = smp_processor_id();
+
+       return vcpu->arch.guest_kernel_asid[cpu] &
+                       cpu_asid_mask(&cpu_data[cpu]);
+}
+
+static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
+{
+       int cpu = smp_processor_id();
+
+       return vcpu->arch.guest_user_asid[cpu] &
+                       cpu_asid_mask(&cpu_data[cpu]);
+}
+
+static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
+{
+       int srcu_idx, err = 0;
+       kvm_pfn_t pfn;
+
+       if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)
+               return 0;
+
+       srcu_idx = srcu_read_lock(&kvm->srcu);
+       pfn = gfn_to_pfn(kvm, gfn);
+
+       if (is_error_pfn(pfn)) {
+               kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn);
+               err = -EFAULT;
+               goto out;
+       }
+
+       kvm->arch.guest_pmap[gfn] = pfn;
+out:
+       srcu_read_unlock(&kvm->srcu, srcu_idx);
+       return err;
+}
+
+/* Translate guest KSEG0 addresses to Host PA */
+unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu,
+                                                   unsigned long gva)
+{
+       gfn_t gfn;
+       unsigned long offset = gva & ~PAGE_MASK;
+       struct kvm *kvm = vcpu->kvm;
+
+       if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) {
+               kvm_err("%s/%p: Invalid gva: %#lx\n", __func__,
+                       __builtin_return_address(0), gva);
+               return KVM_INVALID_PAGE;
+       }
+
+       gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT);
+
+       if (gfn >= kvm->arch.guest_pmap_npages) {
+               kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn,
+                       gva);
+               return KVM_INVALID_PAGE;
+       }
+
+       if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
+               return KVM_INVALID_ADDR;
+
+       return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset;
+}
+
+/* XXXKYMA: Must be called with interrupts disabled */
+int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
+                                   struct kvm_vcpu *vcpu)
+{
+       gfn_t gfn;
+       kvm_pfn_t pfn0, pfn1;
+       unsigned long vaddr = 0;
+       unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
+       struct kvm *kvm = vcpu->kvm;
+       const int flush_dcache_mask = 0;
+       int ret;
+
+       if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) {
+               kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr);
+               kvm_mips_dump_host_tlbs();
+               return -1;
+       }
+
+       gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT);
+       if (gfn >= kvm->arch.guest_pmap_npages) {
+               kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__,
+                       gfn, badvaddr);
+               kvm_mips_dump_host_tlbs();
+               return -1;
+       }
+       vaddr = badvaddr & (PAGE_MASK << 1);
+
+       if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
+               return -1;
+
+       if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0)
+               return -1;
+
+       pfn0 = kvm->arch.guest_pmap[gfn & ~0x1];
+       pfn1 = kvm->arch.guest_pmap[gfn | 0x1];
+
+       entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
+               ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+               ENTRYLO_D | ENTRYLO_V;
+       entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
+               ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+               ENTRYLO_D | ENTRYLO_V;
+
+       preempt_disable();
+       entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
+       ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
+                                     flush_dcache_mask);
+       preempt_enable();
+
+       return ret;
+}
+
+int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
+                                        struct kvm_mips_tlb *tlb)
+{
+       unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
+       struct kvm *kvm = vcpu->kvm;
+       kvm_pfn_t pfn0, pfn1;
+       int ret;
+
+       if ((tlb->tlb_hi & VPN2_MASK) == 0) {
+               pfn0 = 0;
+               pfn1 = 0;
+       } else {
+               if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[0])
+                                          >> PAGE_SHIFT) < 0)
+                       return -1;
+
+               if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[1])
+                                          >> PAGE_SHIFT) < 0)
+                       return -1;
+
+               pfn0 = kvm->arch.guest_pmap[
+                       mips3_tlbpfn_to_paddr(tlb->tlb_lo[0]) >> PAGE_SHIFT];
+               pfn1 = kvm->arch.guest_pmap[
+                       mips3_tlbpfn_to_paddr(tlb->tlb_lo[1]) >> PAGE_SHIFT];
+       }
+
+       /* Get attributes from the Guest TLB */
+       entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
+               ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+               (tlb->tlb_lo[0] & ENTRYLO_D) |
+               (tlb->tlb_lo[0] & ENTRYLO_V);
+       entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
+               ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+               (tlb->tlb_lo[1] & ENTRYLO_D) |
+               (tlb->tlb_lo[1] & ENTRYLO_V);
+
+       kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
+                 tlb->tlb_lo[0], tlb->tlb_lo[1]);
+
+       preempt_disable();
+       entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
+                                              kvm_mips_get_kernel_asid(vcpu) :
+                                              kvm_mips_get_user_asid(vcpu));
+       ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
+                                     tlb->tlb_mask);
+       preempt_enable();
+
+       return ret;
+}
+
+void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu,
+                            struct kvm_vcpu *vcpu)
+{
+       unsigned long asid = asid_cache(cpu);
+
+       asid += cpu_asid_inc();
+       if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) {
+               if (cpu_has_vtag_icache)
+                       flush_icache_all();
+
+               kvm_local_flush_tlb_all();      /* start new asid cycle */
+
+               if (!asid)      /* fix version if needed */
+                       asid = asid_first_version(cpu);
+       }
+
+       cpu_context(cpu, mm) = asid_cache(cpu) = asid;
+}
+
+/**
+ * kvm_mips_migrate_count() - Migrate timer.
+ * @vcpu:      Virtual CPU.
+ *
+ * Migrate CP0_Count hrtimer to the current CPU by cancelling and restarting it
+ * if it was running prior to being cancelled.
+ *
+ * Must be called when the VCPU is migrated to a different CPU to ensure that
+ * timer expiry during guest execution interrupts the guest and causes the
+ * interrupt to be delivered in a timely manner.
+ */
+static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu)
+{
+       if (hrtimer_cancel(&vcpu->arch.comparecount_timer))
+               hrtimer_restart(&vcpu->arch.comparecount_timer);
+}
+
+/* Restore ASID once we are scheduled back after preemption */
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]);
+       unsigned long flags;
+       int newasid = 0;
+
+       kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu);
+
+       /* Allocate new kernel and user ASIDs if needed */
+
+       local_irq_save(flags);
+
+       if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) &
+                                               asid_version_mask(cpu)) {
+               kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
+               vcpu->arch.guest_kernel_asid[cpu] =
+                   vcpu->arch.guest_kernel_mm.context.asid[cpu];
+               kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu);
+               vcpu->arch.guest_user_asid[cpu] =
+                   vcpu->arch.guest_user_mm.context.asid[cpu];
+               newasid++;
+
+               kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
+                         cpu_context(cpu, current->mm));
+               kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
+                         cpu, vcpu->arch.guest_kernel_asid[cpu]);
+               kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
+                         vcpu->arch.guest_user_asid[cpu]);
+       }
+
+       if (vcpu->arch.last_sched_cpu != cpu) {
+               kvm_debug("[%d->%d]KVM VCPU[%d] switch\n",
+                         vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id);
+               /*
+                * Migrate the timer interrupt to the current CPU so that it
+                * always interrupts the guest and synchronously triggers a
+                * guest timer interrupt.
+                */
+               kvm_mips_migrate_count(vcpu);
+       }
+
+       if (!newasid) {
+               /*
+                * If we preempted while the guest was executing, then reload
+                * the pre-empted ASID
+                */
+               if (current->flags & PF_VCPU) {
+                       write_c0_entryhi(vcpu->arch.
+                                        preempt_entryhi & asid_mask);
+                       ehb();
+               }
+       } else {
+               /* New ASIDs were allocated for the VM */
+
+               /*
+                * Were we in guest context? If so then the pre-empted ASID is
+                * no longer valid, we need to set it to what it should be based
+                * on the mode of the Guest (Kernel/User)
+                */
+               if (current->flags & PF_VCPU) {
+                       if (KVM_GUEST_KERNEL_MODE(vcpu))
+                               write_c0_entryhi(vcpu->arch.
+                                                guest_kernel_asid[cpu] &
+                                                asid_mask);
+                       else
+                               write_c0_entryhi(vcpu->arch.
+                                                guest_user_asid[cpu] &
+                                                asid_mask);
+                       ehb();
+               }
+       }
+
+       /* restore guest state to registers */
+       kvm_mips_callbacks->vcpu_set_regs(vcpu);
+
+       local_irq_restore(flags);
+
+}
+
+/* ASID can change if another task is scheduled during preemption */
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+       unsigned long flags;
+       int cpu;
+
+       local_irq_save(flags);
+
+       cpu = smp_processor_id();
+
+       vcpu->arch.preempt_entryhi = read_c0_entryhi();
+       vcpu->arch.last_sched_cpu = cpu;
+
+       /* save guest state in registers */
+       kvm_mips_callbacks->vcpu_get_regs(vcpu);
+
+       if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
+            asid_version_mask(cpu))) {
+               kvm_debug("%s: Dropping MMU Context:  %#lx\n", __func__,
+                         cpu_context(cpu, current->mm));
+               drop_mmu_context(current->mm, cpu);
+       }
+       write_c0_entryhi(cpu_asid(cpu, current->mm));
+       ehb();
+
+       local_irq_restore(flags);
+}
+
+u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       unsigned long paddr, flags, vpn2, asid;
+       unsigned long va = (unsigned long)opc;
+       void *vaddr;
+       u32 inst;
+       int index;
+
+       if (KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0 ||
+           KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) {
+               local_irq_save(flags);
+               index = kvm_mips_host_tlb_lookup(vcpu, va);
+               if (index >= 0) {
+                       inst = *(opc);
+               } else {
+                       vpn2 = va & VPN2_MASK;
+                       asid = kvm_read_c0_guest_entryhi(cop0) &
+                                               KVM_ENTRYHI_ASID;
+                       index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid);
+                       if (index < 0) {
+                               kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n",
+                                       __func__, opc, vcpu, read_c0_entryhi());
+                               kvm_mips_dump_host_tlbs();
+                               kvm_mips_dump_guest_tlbs(vcpu);
+                               local_irq_restore(flags);
+                               return KVM_INVALID_INST;
+                       }
+                       kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
+                                                            &vcpu->arch.
+                                                            guest_tlb[index]);
+                       inst = *(opc);
+               }
+               local_irq_restore(flags);
+       } else if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) {
+               paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, va);
+               vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr)));
+               vaddr += paddr & ~PAGE_MASK;
+               inst = *(u32 *)vaddr;
+               kunmap_atomic(vaddr);
+       } else {
+               kvm_err("%s: illegal address: %p\n", __func__, opc);
+               return KVM_INVALID_INST;
+       }
+
+       return inst;
+}
index 888bb67..53f851a 100644 (file)
 
 #include <linux/kvm_host.h>
 
-char *kvm_mips_exit_types_str[MAX_KVM_MIPS_EXIT_TYPES] = {
-       "WAIT",
-       "CACHE",
-       "Signal",
-       "Interrupt",
-       "COP0/1 Unusable",
-       "TLB Mod",
-       "TLB Miss (LD)",
-       "TLB Miss (ST)",
-       "Address Err (ST)",
-       "Address Error (LD)",
-       "System Call",
-       "Reserved Inst",
-       "Break Inst",
-       "Trap Inst",
-       "MSA FPE",
-       "FPE",
-       "MSA Disabled",
-       "D-Cache Flushes",
-};
-
 char *kvm_cop0_str[N_MIPS_COPROC_REGS] = {
        "Index",
        "Random",
index ed021ae..254377d 100644 (file)
@@ -14,7 +14,7 @@
 #include <linux/smp.h>
 #include <linux/mm.h>
 #include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/kvm_host.h>
 #include <linux/srcu.h>
 
@@ -24,6 +24,7 @@
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
 #include <asm/tlb.h>
+#include <asm/tlbdebug.h>
 
 #undef CONFIG_MIPS_MT
 #include <asm/r4kcache.h>
 #define KVM_GUEST_PC_TLB    0
 #define KVM_GUEST_SP_TLB    1
 
-#define PRIx64 "llx"
-
 atomic_t kvm_mips_instance;
 EXPORT_SYMBOL_GPL(kvm_mips_instance);
 
-/* These function pointers are initialized once the KVM module is loaded */
-kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
-EXPORT_SYMBOL_GPL(kvm_mips_gfn_to_pfn);
-
-void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
-EXPORT_SYMBOL_GPL(kvm_mips_release_pfn_clean);
-
-bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
-EXPORT_SYMBOL_GPL(kvm_mips_is_error_pfn);
-
-uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
+static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
 {
        int cpu = smp_processor_id();
 
@@ -55,7 +44,7 @@ uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
                        cpu_asid_mask(&cpu_data[cpu]);
 }
 
-uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
+static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
 {
        int cpu = smp_processor_id();
 
@@ -63,7 +52,7 @@ uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
                        cpu_asid_mask(&cpu_data[cpu]);
 }
 
-inline uint32_t kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
+inline u32 kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
 {
        return vcpu->kvm->arch.commpage_tlb;
 }
@@ -72,50 +61,15 @@ inline uint32_t kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
 
 void kvm_mips_dump_host_tlbs(void)
 {
-       unsigned long old_entryhi;
-       unsigned long old_pagemask;
-       struct kvm_mips_tlb tlb;
        unsigned long flags;
-       int i;
 
        local_irq_save(flags);
 
-       old_entryhi = read_c0_entryhi();
-       old_pagemask = read_c0_pagemask();
-
        kvm_info("HOST TLBs:\n");
-       kvm_info("ASID: %#lx\n", read_c0_entryhi() &
-                cpu_asid_mask(&current_cpu_data));
-
-       for (i = 0; i < current_cpu_data.tlbsize; i++) {
-               write_c0_index(i);
-               mtc0_tlbw_hazard();
-
-               tlb_read();
-               tlbw_use_hazard();
+       dump_tlb_regs();
+       pr_info("\n");
+       dump_tlb_all();
 
-               tlb.tlb_hi = read_c0_entryhi();
-               tlb.tlb_lo0 = read_c0_entrylo0();
-               tlb.tlb_lo1 = read_c0_entrylo1();
-               tlb.tlb_mask = read_c0_pagemask();
-
-               kvm_info("TLB%c%3d Hi 0x%08lx ",
-                        (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*',
-                        i, tlb.tlb_hi);
-               kvm_info("Lo0=0x%09" PRIx64 " %c%c attr %lx ",
-                        (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo0),
-                        (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ',
-                        (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ',
-                        (tlb.tlb_lo0 >> 3) & 7);
-               kvm_info("Lo1=0x%09" PRIx64 " %c%c attr %lx sz=%lx\n",
-                        (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo1),
-                        (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ',
-                        (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ',
-                        (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask);
-       }
-       write_c0_entryhi(old_entryhi);
-       write_c0_pagemask(old_pagemask);
-       mtc0_tlbw_hazard();
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(kvm_mips_dump_host_tlbs);
@@ -132,74 +86,24 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu)
        for (i = 0; i < KVM_MIPS_GUEST_TLB_SIZE; i++) {
                tlb = vcpu->arch.guest_tlb[i];
                kvm_info("TLB%c%3d Hi 0x%08lx ",
-                        (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*',
+                        (tlb.tlb_lo[0] | tlb.tlb_lo[1]) & ENTRYLO_V
+                                                       ? ' ' : '*',
                         i, tlb.tlb_hi);
-               kvm_info("Lo0=0x%09" PRIx64 " %c%c attr %lx ",
-                        (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo0),
-                        (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ',
-                        (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ',
-                        (tlb.tlb_lo0 >> 3) & 7);
-               kvm_info("Lo1=0x%09" PRIx64 " %c%c attr %lx sz=%lx\n",
-                        (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo1),
-                        (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ',
-                        (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ',
-                        (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask);
+               kvm_info("Lo0=0x%09llx %c%c attr %lx ",
+                        (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[0]),
+                        (tlb.tlb_lo[0] & ENTRYLO_D) ? 'D' : ' ',
+                        (tlb.tlb_lo[0] & ENTRYLO_G) ? 'G' : ' ',
+                        (tlb.tlb_lo[0] & ENTRYLO_C) >> ENTRYLO_C_SHIFT);
+               kvm_info("Lo1=0x%09llx %c%c attr %lx sz=%lx\n",
+                        (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[1]),
+                        (tlb.tlb_lo[1] & ENTRYLO_D) ? 'D' : ' ',
+                        (tlb.tlb_lo[1] & ENTRYLO_G) ? 'G' : ' ',
+                        (tlb.tlb_lo[1] & ENTRYLO_C) >> ENTRYLO_C_SHIFT,
+                        tlb.tlb_mask);
        }
 }
 EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs);
 
-static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
-{
-       int srcu_idx, err = 0;
-       kvm_pfn_t pfn;
-
-       if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)
-               return 0;
-
-       srcu_idx = srcu_read_lock(&kvm->srcu);
-       pfn = kvm_mips_gfn_to_pfn(kvm, gfn);
-
-       if (kvm_mips_is_error_pfn(pfn)) {
-               kvm_err("Couldn't get pfn for gfn %#" PRIx64 "!\n", gfn);
-               err = -EFAULT;
-               goto out;
-       }
-
-       kvm->arch.guest_pmap[gfn] = pfn;
-out:
-       srcu_read_unlock(&kvm->srcu, srcu_idx);
-       return err;
-}
-
-/* Translate guest KSEG0 addresses to Host PA */
-unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu,
-                                                   unsigned long gva)
-{
-       gfn_t gfn;
-       uint32_t offset = gva & ~PAGE_MASK;
-       struct kvm *kvm = vcpu->kvm;
-
-       if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) {
-               kvm_err("%s/%p: Invalid gva: %#lx\n", __func__,
-                       __builtin_return_address(0), gva);
-               return KVM_INVALID_PAGE;
-       }
-
-       gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT);
-
-       if (gfn >= kvm->arch.guest_pmap_npages) {
-               kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn,
-                       gva);
-               return KVM_INVALID_PAGE;
-       }
-
-       if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
-               return KVM_INVALID_ADDR;
-
-       return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_translate_guest_kseg0_to_hpa);
-
 /* XXXKYMA: Must be called with interrupts disabled */
 /* set flush_dcache_mask == 0 if no dcache flush required */
 int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
@@ -243,12 +147,12 @@ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
 
        /* Flush D-cache */
        if (flush_dcache_mask) {
-               if (entrylo0 & MIPS3_PG_V) {
+               if (entrylo0 & ENTRYLO_V) {
                        ++vcpu->stat.flush_dcache_exits;
                        flush_data_cache_page((entryhi & VPN2_MASK) &
                                              ~flush_dcache_mask);
                }
-               if (entrylo1 & MIPS3_PG_V) {
+               if (entrylo1 & ENTRYLO_V) {
                        ++vcpu->stat.flush_dcache_exits;
                        flush_data_cache_page(((entryhi & VPN2_MASK) &
                                               ~flush_dcache_mask) |
@@ -259,96 +163,35 @@ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
        /* Restore old ASID */
        write_c0_entryhi(old_entryhi);
        mtc0_tlbw_hazard();
-       tlbw_use_hazard();
        local_irq_restore(flags);
        return 0;
 }
-
-/* XXXKYMA: Must be called with interrupts disabled */
-int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
-                                   struct kvm_vcpu *vcpu)
-{
-       gfn_t gfn;
-       kvm_pfn_t pfn0, pfn1;
-       unsigned long vaddr = 0;
-       unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
-       int even;
-       struct kvm *kvm = vcpu->kvm;
-       const int flush_dcache_mask = 0;
-       int ret;
-
-       if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) {
-               kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr);
-               kvm_mips_dump_host_tlbs();
-               return -1;
-       }
-
-       gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT);
-       if (gfn >= kvm->arch.guest_pmap_npages) {
-               kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__,
-                       gfn, badvaddr);
-               kvm_mips_dump_host_tlbs();
-               return -1;
-       }
-       even = !(gfn & 0x1);
-       vaddr = badvaddr & (PAGE_MASK << 1);
-
-       if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
-               return -1;
-
-       if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0)
-               return -1;
-
-       if (even) {
-               pfn0 = kvm->arch.guest_pmap[gfn];
-               pfn1 = kvm->arch.guest_pmap[gfn ^ 0x1];
-       } else {
-               pfn0 = kvm->arch.guest_pmap[gfn ^ 0x1];
-               pfn1 = kvm->arch.guest_pmap[gfn];
-       }
-
-       entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
-                  (1 << 2) | (0x1 << 1);
-       entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
-                  (1 << 2) | (0x1 << 1);
-
-       preempt_disable();
-       entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
-       ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
-                                     flush_dcache_mask);
-       preempt_enable();
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_handle_kseg0_tlb_fault);
+EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_write);
 
 int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
        struct kvm_vcpu *vcpu)
 {
-       kvm_pfn_t pfn0, pfn1;
+       kvm_pfn_t pfn;
        unsigned long flags, old_entryhi = 0, vaddr = 0;
-       unsigned long entrylo0 = 0, entrylo1 = 0;
+       unsigned long entrylo[2] = { 0, 0 };
+       unsigned int pair_idx;
 
-       pfn0 = CPHYSADDR(vcpu->arch.kseg0_commpage) >> PAGE_SHIFT;
-       pfn1 = 0;
-       entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
-                  (1 << 2) | (0x1 << 1);
-       entrylo1 = 0;
+       pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage));
+       pair_idx = (badvaddr >> PAGE_SHIFT) & 1;
+       entrylo[pair_idx] = mips3_paddr_to_tlbpfn(pfn << PAGE_SHIFT) |
+               ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+               ENTRYLO_D | ENTRYLO_V;
 
        local_irq_save(flags);
 
        old_entryhi = read_c0_entryhi();
        vaddr = badvaddr & (PAGE_MASK << 1);
        write_c0_entryhi(vaddr | kvm_mips_get_kernel_asid(vcpu));
-       mtc0_tlbw_hazard();
-       write_c0_entrylo0(entrylo0);
-       mtc0_tlbw_hazard();
-       write_c0_entrylo1(entrylo1);
-       mtc0_tlbw_hazard();
+       write_c0_entrylo0(entrylo[0]);
+       write_c0_entrylo1(entrylo[1]);
        write_c0_index(kvm_mips_get_commpage_asid(vcpu));
        mtc0_tlbw_hazard();
        tlb_write_indexed();
-       mtc0_tlbw_hazard();
        tlbw_use_hazard();
 
        kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0 (R): 0x%08lx, entrylo1(R): 0x%08lx\n",
@@ -358,68 +201,12 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
        /* Restore old ASID */
        write_c0_entryhi(old_entryhi);
        mtc0_tlbw_hazard();
-       tlbw_use_hazard();
        local_irq_restore(flags);
 
        return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_mips_handle_commpage_tlb_fault);
 
-int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
-                                        struct kvm_mips_tlb *tlb,
-                                        unsigned long *hpa0,
-                                        unsigned long *hpa1)
-{
-       unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
-       struct kvm *kvm = vcpu->kvm;
-       kvm_pfn_t pfn0, pfn1;
-       int ret;
-
-       if ((tlb->tlb_hi & VPN2_MASK) == 0) {
-               pfn0 = 0;
-               pfn1 = 0;
-       } else {
-               if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo0)
-                                          >> PAGE_SHIFT) < 0)
-                       return -1;
-
-               if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo1)
-                                          >> PAGE_SHIFT) < 0)
-                       return -1;
-
-               pfn0 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo0)
-                                           >> PAGE_SHIFT];
-               pfn1 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo1)
-                                           >> PAGE_SHIFT];
-       }
-
-       if (hpa0)
-               *hpa0 = pfn0 << PAGE_SHIFT;
-
-       if (hpa1)
-               *hpa1 = pfn1 << PAGE_SHIFT;
-
-       /* Get attributes from the Guest TLB */
-       entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
-                  (tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V);
-       entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
-                  (tlb->tlb_lo1 & MIPS3_PG_D) | (tlb->tlb_lo1 & MIPS3_PG_V);
-
-       kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
-                 tlb->tlb_lo0, tlb->tlb_lo1);
-
-       preempt_disable();
-       entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
-                                              kvm_mips_get_kernel_asid(vcpu) :
-                                              kvm_mips_get_user_asid(vcpu));
-       ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
-                                     tlb->tlb_mask);
-       preempt_enable();
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_handle_mapped_seg_tlb_fault);
-
 int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi)
 {
        int i;
@@ -435,7 +222,7 @@ int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi)
        }
 
        kvm_debug("%s: entryhi: %#lx, index: %d lo0: %#lx, lo1: %#lx\n",
-                 __func__, entryhi, index, tlb[i].tlb_lo0, tlb[i].tlb_lo1);
+                 __func__, entryhi, index, tlb[i].tlb_lo[0], tlb[i].tlb_lo[1]);
 
        return index;
 }
@@ -467,7 +254,6 @@ int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr)
        /* Restore old ASID */
        write_c0_entryhi(old_entryhi);
        mtc0_tlbw_hazard();
-       tlbw_use_hazard();
 
        local_irq_restore(flags);
 
@@ -498,21 +284,16 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va)
 
        if (idx > 0) {
                write_c0_entryhi(UNIQUE_ENTRYHI(idx));
-               mtc0_tlbw_hazard();
-
                write_c0_entrylo0(0);
-               mtc0_tlbw_hazard();
-
                write_c0_entrylo1(0);
                mtc0_tlbw_hazard();
 
                tlb_write_indexed();
-               mtc0_tlbw_hazard();
+               tlbw_use_hazard();
        }
 
        write_c0_entryhi(old_entryhi);
        mtc0_tlbw_hazard();
-       tlbw_use_hazard();
 
        local_irq_restore(flags);
 
@@ -540,61 +321,39 @@ void kvm_mips_flush_host_tlb(int skip_kseg0)
        /* Blast 'em all away. */
        for (entry = 0; entry < maxentry; entry++) {
                write_c0_index(entry);
-               mtc0_tlbw_hazard();
 
                if (skip_kseg0) {
+                       mtc0_tlbr_hazard();
                        tlb_read();
-                       tlbw_use_hazard();
+                       tlb_read_hazard();
 
                        entryhi = read_c0_entryhi();
 
                        /* Don't blow away guest kernel entries */
                        if (KVM_GUEST_KSEGX(entryhi) == KVM_GUEST_KSEG0)
                                continue;
+
+                       write_c0_pagemask(old_pagemask);
                }
 
                /* Make sure all entries differ. */
                write_c0_entryhi(UNIQUE_ENTRYHI(entry));
-               mtc0_tlbw_hazard();
                write_c0_entrylo0(0);
-               mtc0_tlbw_hazard();
                write_c0_entrylo1(0);
                mtc0_tlbw_hazard();
 
                tlb_write_indexed();
-               mtc0_tlbw_hazard();
+               tlbw_use_hazard();
        }
 
-       tlbw_use_hazard();
-
        write_c0_entryhi(old_entryhi);
        write_c0_pagemask(old_pagemask);
        mtc0_tlbw_hazard();
-       tlbw_use_hazard();
 
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(kvm_mips_flush_host_tlb);
 
-void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu,
-                            struct kvm_vcpu *vcpu)
-{
-       unsigned long asid = asid_cache(cpu);
-
-       asid += cpu_asid_inc();
-       if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) {
-               if (cpu_has_vtag_icache)
-                       flush_icache_all();
-
-               kvm_local_flush_tlb_all();      /* start new asid cycle */
-
-               if (!asid)      /* fix version if needed */
-                       asid = asid_first_version(cpu);
-       }
-
-       cpu_context(cpu, mm) = asid_cache(cpu) = asid;
-}
-
 void kvm_local_flush_tlb_all(void)
 {
        unsigned long flags;
@@ -614,185 +373,12 @@ void kvm_local_flush_tlb_all(void)
                write_c0_index(entry);
                mtc0_tlbw_hazard();
                tlb_write_indexed();
+               tlbw_use_hazard();
                entry++;
        }
-       tlbw_use_hazard();
        write_c0_entryhi(old_ctx);
        mtc0_tlbw_hazard();
 
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(kvm_local_flush_tlb_all);
-
-/**
- * kvm_mips_migrate_count() - Migrate timer.
- * @vcpu:      Virtual CPU.
- *
- * Migrate CP0_Count hrtimer to the current CPU by cancelling and restarting it
- * if it was running prior to being cancelled.
- *
- * Must be called when the VCPU is migrated to a different CPU to ensure that
- * timer expiry during guest execution interrupts the guest and causes the
- * interrupt to be delivered in a timely manner.
- */
-static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu)
-{
-       if (hrtimer_cancel(&vcpu->arch.comparecount_timer))
-               hrtimer_restart(&vcpu->arch.comparecount_timer);
-}
-
-/* Restore ASID once we are scheduled back after preemption */
-void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-       unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]);
-       unsigned long flags;
-       int newasid = 0;
-
-       kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu);
-
-       /* Allocate new kernel and user ASIDs if needed */
-
-       local_irq_save(flags);
-
-       if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) &
-                                               asid_version_mask(cpu)) {
-               kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
-               vcpu->arch.guest_kernel_asid[cpu] =
-                   vcpu->arch.guest_kernel_mm.context.asid[cpu];
-               kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu);
-               vcpu->arch.guest_user_asid[cpu] =
-                   vcpu->arch.guest_user_mm.context.asid[cpu];
-               newasid++;
-
-               kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
-                         cpu_context(cpu, current->mm));
-               kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
-                         cpu, vcpu->arch.guest_kernel_asid[cpu]);
-               kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
-                         vcpu->arch.guest_user_asid[cpu]);
-       }
-
-       if (vcpu->arch.last_sched_cpu != cpu) {
-               kvm_debug("[%d->%d]KVM VCPU[%d] switch\n",
-                         vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id);
-               /*
-                * Migrate the timer interrupt to the current CPU so that it
-                * always interrupts the guest and synchronously triggers a
-                * guest timer interrupt.
-                */
-               kvm_mips_migrate_count(vcpu);
-       }
-
-       if (!newasid) {
-               /*
-                * If we preempted while the guest was executing, then reload
-                * the pre-empted ASID
-                */
-               if (current->flags & PF_VCPU) {
-                       write_c0_entryhi(vcpu->arch.
-                                        preempt_entryhi & asid_mask);
-                       ehb();
-               }
-       } else {
-               /* New ASIDs were allocated for the VM */
-
-               /*
-                * Were we in guest context? If so then the pre-empted ASID is
-                * no longer valid, we need to set it to what it should be based
-                * on the mode of the Guest (Kernel/User)
-                */
-               if (current->flags & PF_VCPU) {
-                       if (KVM_GUEST_KERNEL_MODE(vcpu))
-                               write_c0_entryhi(vcpu->arch.
-                                                guest_kernel_asid[cpu] &
-                                                asid_mask);
-                       else
-                               write_c0_entryhi(vcpu->arch.
-                                                guest_user_asid[cpu] &
-                                                asid_mask);
-                       ehb();
-               }
-       }
-
-       /* restore guest state to registers */
-       kvm_mips_callbacks->vcpu_set_regs(vcpu);
-
-       local_irq_restore(flags);
-
-}
-EXPORT_SYMBOL_GPL(kvm_arch_vcpu_load);
-
-/* ASID can change if another task is scheduled during preemption */
-void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
-{
-       unsigned long flags;
-       uint32_t cpu;
-
-       local_irq_save(flags);
-
-       cpu = smp_processor_id();
-
-       vcpu->arch.preempt_entryhi = read_c0_entryhi();
-       vcpu->arch.last_sched_cpu = cpu;
-
-       /* save guest state in registers */
-       kvm_mips_callbacks->vcpu_get_regs(vcpu);
-
-       if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
-            asid_version_mask(cpu))) {
-               kvm_debug("%s: Dropping MMU Context:  %#lx\n", __func__,
-                         cpu_context(cpu, current->mm));
-               drop_mmu_context(current->mm, cpu);
-       }
-       write_c0_entryhi(cpu_asid(cpu, current->mm));
-       ehb();
-
-       local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_vcpu_put);
-
-uint32_t kvm_get_inst(uint32_t *opc, struct kvm_vcpu *vcpu)
-{
-       struct mips_coproc *cop0 = vcpu->arch.cop0;
-       unsigned long paddr, flags, vpn2, asid;
-       uint32_t inst;
-       int index;
-
-       if (KVM_GUEST_KSEGX((unsigned long) opc) < KVM_GUEST_KSEG0 ||
-           KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
-               local_irq_save(flags);
-               index = kvm_mips_host_tlb_lookup(vcpu, (unsigned long) opc);
-               if (index >= 0) {
-                       inst = *(opc);
-               } else {
-                       vpn2 = (unsigned long) opc & VPN2_MASK;
-                       asid = kvm_read_c0_guest_entryhi(cop0) &
-                                               KVM_ENTRYHI_ASID;
-                       index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid);
-                       if (index < 0) {
-                               kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n",
-                                       __func__, opc, vcpu, read_c0_entryhi());
-                               kvm_mips_dump_host_tlbs();
-                               local_irq_restore(flags);
-                               return KVM_INVALID_INST;
-                       }
-                       kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
-                                                            &vcpu->arch.
-                                                            guest_tlb[index],
-                                                            NULL, NULL);
-                       inst = *(opc);
-               }
-               local_irq_restore(flags);
-       } else if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
-               paddr =
-                   kvm_mips_translate_guest_kseg0_to_hpa(vcpu,
-                                                         (unsigned long) opc);
-               inst = *(uint32_t *) CKSEG0ADDR(paddr);
-       } else {
-               kvm_err("%s: illegal address: %p\n", __func__, opc);
-               return KVM_INVALID_INST;
-       }
-
-       return inst;
-}
-EXPORT_SYMBOL_GPL(kvm_get_inst);
index bd6437f..c858cf1 100644 (file)
 #define TRACE_INCLUDE_PATH .
 #define TRACE_INCLUDE_FILE trace
 
-/* Tracepoints for VM eists */
-extern char *kvm_mips_exit_types_str[MAX_KVM_MIPS_EXIT_TYPES];
+/*
+ * Tracepoints for VM enters
+ */
+DECLARE_EVENT_CLASS(kvm_transition,
+       TP_PROTO(struct kvm_vcpu *vcpu),
+       TP_ARGS(vcpu),
+       TP_STRUCT__entry(
+               __field(unsigned long, pc)
+       ),
+
+       TP_fast_assign(
+               __entry->pc = vcpu->arch.pc;
+       ),
+
+       TP_printk("PC: 0x%08lx",
+                 __entry->pc)
+);
+
+DEFINE_EVENT(kvm_transition, kvm_enter,
+            TP_PROTO(struct kvm_vcpu *vcpu),
+            TP_ARGS(vcpu));
+
+DEFINE_EVENT(kvm_transition, kvm_reenter,
+            TP_PROTO(struct kvm_vcpu *vcpu),
+            TP_ARGS(vcpu));
+
+DEFINE_EVENT(kvm_transition, kvm_out,
+            TP_PROTO(struct kvm_vcpu *vcpu),
+            TP_ARGS(vcpu));
+
+/* The first 32 exit reasons correspond to Cause.ExcCode */
+#define KVM_TRACE_EXIT_INT              0
+#define KVM_TRACE_EXIT_TLBMOD           1
+#define KVM_TRACE_EXIT_TLBMISS_LD       2
+#define KVM_TRACE_EXIT_TLBMISS_ST       3
+#define KVM_TRACE_EXIT_ADDRERR_LD       4
+#define KVM_TRACE_EXIT_ADDRERR_ST       5
+#define KVM_TRACE_EXIT_SYSCALL          8
+#define KVM_TRACE_EXIT_BREAK_INST       9
+#define KVM_TRACE_EXIT_RESVD_INST      10
+#define KVM_TRACE_EXIT_COP_UNUSABLE    11
+#define KVM_TRACE_EXIT_TRAP_INST       13
+#define KVM_TRACE_EXIT_MSA_FPE         14
+#define KVM_TRACE_EXIT_FPE             15
+#define KVM_TRACE_EXIT_MSA_DISABLED    21
+/* Further exit reasons */
+#define KVM_TRACE_EXIT_WAIT            32
+#define KVM_TRACE_EXIT_CACHE           33
+#define KVM_TRACE_EXIT_SIGNAL          34
+
+/* Tracepoints for VM exits */
+#define kvm_trace_symbol_exit_types                            \
+       { KVM_TRACE_EXIT_INT,           "Interrupt" },          \
+       { KVM_TRACE_EXIT_TLBMOD,        "TLB Mod" },            \
+       { KVM_TRACE_EXIT_TLBMISS_LD,    "TLB Miss (LD)" },      \
+       { KVM_TRACE_EXIT_TLBMISS_ST,    "TLB Miss (ST)" },      \
+       { KVM_TRACE_EXIT_ADDRERR_LD,    "Address Error (LD)" }, \
+       { KVM_TRACE_EXIT_ADDRERR_ST,    "Address Err (ST)" },   \
+       { KVM_TRACE_EXIT_SYSCALL,       "System Call" },        \
+       { KVM_TRACE_EXIT_BREAK_INST,    "Break Inst" },         \
+       { KVM_TRACE_EXIT_RESVD_INST,    "Reserved Inst" },      \
+       { KVM_TRACE_EXIT_COP_UNUSABLE,  "COP0/1 Unusable" },    \
+       { KVM_TRACE_EXIT_TRAP_INST,     "Trap Inst" },          \
+       { KVM_TRACE_EXIT_MSA_FPE,       "MSA FPE" },            \
+       { KVM_TRACE_EXIT_FPE,           "FPE" },                \
+       { KVM_TRACE_EXIT_MSA_DISABLED,  "MSA Disabled" },       \
+       { KVM_TRACE_EXIT_WAIT,          "WAIT" },               \
+       { KVM_TRACE_EXIT_CACHE,         "CACHE" },              \
+       { KVM_TRACE_EXIT_SIGNAL,        "Signal" }
 
 TRACE_EVENT(kvm_exit,
            TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason),
@@ -34,10 +101,173 @@ TRACE_EVENT(kvm_exit,
            ),
 
            TP_printk("[%s]PC: 0x%08lx",
-                     kvm_mips_exit_types_str[__entry->reason],
+                     __print_symbolic(__entry->reason,
+                                      kvm_trace_symbol_exit_types),
                      __entry->pc)
 );
 
+#define KVM_TRACE_MFC0         0
+#define KVM_TRACE_MTC0         1
+#define KVM_TRACE_DMFC0                2
+#define KVM_TRACE_DMTC0                3
+#define KVM_TRACE_RDHWR                4
+
+#define KVM_TRACE_HWR_COP0     0
+#define KVM_TRACE_HWR_HWR      1
+
+#define KVM_TRACE_COP0(REG, SEL)       ((KVM_TRACE_HWR_COP0 << 8) |    \
+                                        ((REG) << 3) | (SEL))
+#define KVM_TRACE_HWR(REG, SEL)                ((KVM_TRACE_HWR_HWR  << 8) |    \
+                                        ((REG) << 3) | (SEL))
+
+#define kvm_trace_symbol_hwr_ops                               \
+       { KVM_TRACE_MFC0,               "MFC0" },               \
+       { KVM_TRACE_MTC0,               "MTC0" },               \
+       { KVM_TRACE_DMFC0,              "DMFC0" },              \
+       { KVM_TRACE_DMTC0,              "DMTC0" },              \
+       { KVM_TRACE_RDHWR,              "RDHWR" }
+
+#define kvm_trace_symbol_hwr_cop                               \
+       { KVM_TRACE_HWR_COP0,           "COP0" },               \
+       { KVM_TRACE_HWR_HWR,            "HWR" }
+
+#define kvm_trace_symbol_hwr_regs                              \
+       { KVM_TRACE_COP0( 0, 0),        "Index" },              \
+       { KVM_TRACE_COP0( 2, 0),        "EntryLo0" },           \
+       { KVM_TRACE_COP0( 3, 0),        "EntryLo1" },           \
+       { KVM_TRACE_COP0( 4, 0),        "Context" },            \
+       { KVM_TRACE_COP0( 4, 2),        "UserLocal" },          \
+       { KVM_TRACE_COP0( 5, 0),        "PageMask" },           \
+       { KVM_TRACE_COP0( 6, 0),        "Wired" },              \
+       { KVM_TRACE_COP0( 7, 0),        "HWREna" },             \
+       { KVM_TRACE_COP0( 8, 0),        "BadVAddr" },           \
+       { KVM_TRACE_COP0( 9, 0),        "Count" },              \
+       { KVM_TRACE_COP0(10, 0),        "EntryHi" },            \
+       { KVM_TRACE_COP0(11, 0),        "Compare" },            \
+       { KVM_TRACE_COP0(12, 0),        "Status" },             \
+       { KVM_TRACE_COP0(12, 1),        "IntCtl" },             \
+       { KVM_TRACE_COP0(12, 2),        "SRSCtl" },             \
+       { KVM_TRACE_COP0(13, 0),        "Cause" },              \
+       { KVM_TRACE_COP0(14, 0),        "EPC" },                \
+       { KVM_TRACE_COP0(15, 0),        "PRId" },               \
+       { KVM_TRACE_COP0(15, 1),        "EBase" },              \
+       { KVM_TRACE_COP0(16, 0),        "Config" },             \
+       { KVM_TRACE_COP0(16, 1),        "Config1" },            \
+       { KVM_TRACE_COP0(16, 2),        "Config2" },            \
+       { KVM_TRACE_COP0(16, 3),        "Config3" },            \
+       { KVM_TRACE_COP0(16, 4),        "Config4" },            \
+       { KVM_TRACE_COP0(16, 5),        "Config5" },            \
+       { KVM_TRACE_COP0(16, 7),        "Config7" },            \
+       { KVM_TRACE_COP0(26, 0),        "ECC" },                \
+       { KVM_TRACE_COP0(30, 0),        "ErrorEPC" },           \
+       { KVM_TRACE_COP0(31, 2),        "KScratch1" },          \
+       { KVM_TRACE_COP0(31, 3),        "KScratch2" },          \
+       { KVM_TRACE_COP0(31, 4),        "KScratch3" },          \
+       { KVM_TRACE_COP0(31, 5),        "KScratch4" },          \
+       { KVM_TRACE_COP0(31, 6),        "KScratch5" },          \
+       { KVM_TRACE_COP0(31, 7),        "KScratch6" },          \
+       { KVM_TRACE_HWR( 0, 0),         "CPUNum" },             \
+       { KVM_TRACE_HWR( 1, 0),         "SYNCI_Step" },         \
+       { KVM_TRACE_HWR( 2, 0),         "CC" },                 \
+       { KVM_TRACE_HWR( 3, 0),         "CCRes" },              \
+       { KVM_TRACE_HWR(29, 0),         "ULR" }
+
+TRACE_EVENT(kvm_hwr,
+           TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op, unsigned int reg,
+                    unsigned long val),
+           TP_ARGS(vcpu, op, reg, val),
+           TP_STRUCT__entry(
+                       __field(unsigned long, val)
+                       __field(u16, reg)
+                       __field(u8, op)
+           ),
+
+           TP_fast_assign(
+                       __entry->val = val;
+                       __entry->reg = reg;
+                       __entry->op = op;
+           ),
+
+           TP_printk("%s %s (%s:%u:%u) 0x%08lx",
+                     __print_symbolic(__entry->op,
+                                      kvm_trace_symbol_hwr_ops),
+                     __print_symbolic(__entry->reg,
+                                      kvm_trace_symbol_hwr_regs),
+                     __print_symbolic(__entry->reg >> 8,
+                                      kvm_trace_symbol_hwr_cop),
+                     (__entry->reg >> 3) & 0x1f,
+                     __entry->reg & 0x7,
+                     __entry->val)
+);
+
+#define KVM_TRACE_AUX_RESTORE          0
+#define KVM_TRACE_AUX_SAVE             1
+#define KVM_TRACE_AUX_ENABLE           2
+#define KVM_TRACE_AUX_DISABLE          3
+#define KVM_TRACE_AUX_DISCARD          4
+
+#define KVM_TRACE_AUX_FPU              1
+#define KVM_TRACE_AUX_MSA              2
+#define KVM_TRACE_AUX_FPU_MSA          3
+
+#define kvm_trace_symbol_aux_op                \
+       { KVM_TRACE_AUX_RESTORE, "restore" },   \
+       { KVM_TRACE_AUX_SAVE,    "save" },      \
+       { KVM_TRACE_AUX_ENABLE,  "enable" },    \
+       { KVM_TRACE_AUX_DISABLE, "disable" },   \
+       { KVM_TRACE_AUX_DISCARD, "discard" }
+
+#define kvm_trace_symbol_aux_state             \
+       { KVM_TRACE_AUX_FPU,     "FPU" },       \
+       { KVM_TRACE_AUX_MSA,     "MSA" },       \
+       { KVM_TRACE_AUX_FPU_MSA, "FPU & MSA" }
+
+TRACE_EVENT(kvm_aux,
+           TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op,
+                    unsigned int state),
+           TP_ARGS(vcpu, op, state),
+           TP_STRUCT__entry(
+                       __field(unsigned long, pc)
+                       __field(u8, op)
+                       __field(u8, state)
+           ),
+
+           TP_fast_assign(
+                       __entry->pc = vcpu->arch.pc;
+                       __entry->op = op;
+                       __entry->state = state;
+           ),
+
+           TP_printk("%s %s PC: 0x%08lx",
+                     __print_symbolic(__entry->op,
+                                      kvm_trace_symbol_aux_op),
+                     __print_symbolic(__entry->state,
+                                      kvm_trace_symbol_aux_state),
+                     __entry->pc)
+);
+
+TRACE_EVENT(kvm_asid_change,
+           TP_PROTO(struct kvm_vcpu *vcpu, unsigned int old_asid,
+                    unsigned int new_asid),
+           TP_ARGS(vcpu, old_asid, new_asid),
+           TP_STRUCT__entry(
+                       __field(unsigned long, pc)
+                       __field(u8, old_asid)
+                       __field(u8, new_asid)
+           ),
+
+           TP_fast_assign(
+                       __entry->pc = vcpu->arch.pc;
+                       __entry->old_asid = old_asid;
+                       __entry->new_asid = new_asid;
+           ),
+
+           TP_printk("PC: 0x%08lx old: 0x%02x new: 0x%02x",
+                     __entry->pc,
+                     __entry->old_asid,
+                     __entry->new_asid)
+);
+
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */
index 6ba0faf..0915539 100644 (file)
@@ -21,7 +21,7 @@
 static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva)
 {
        gpa_t gpa;
-       uint32_t kseg = KSEGX(gva);
+       gva_t kseg = KSEGX(gva);
 
        if ((kseg == CKSEG0) || (kseg == CKSEG1))
                gpa = CPHYSADDR(gva);
@@ -40,8 +40,8 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -87,15 +87,15 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
        unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
        if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
            || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
-               kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n",
+               kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#x, PC: %p, BadVaddr: %#lx\n",
                          cause, opc, badvaddr);
                er = kvm_mips_handle_tlbmod(cause, opc, run, vcpu);
 
@@ -111,14 +111,14 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
                 * when we are not using HIGHMEM. Need to address this in a
                 * HIGHMEM kernel
                 */
-               kvm_err("TLB MOD fault not handled, cause %#lx, PC: %p, BadVaddr: %#lx\n",
+               kvm_err("TLB MOD fault not handled, cause %#x, PC: %p, BadVaddr: %#lx\n",
                        cause, opc, badvaddr);
                kvm_mips_dump_host_tlbs();
                kvm_arch_vcpu_dump_regs(vcpu);
                run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                ret = RESUME_HOST;
        } else {
-               kvm_err("Illegal TLB Mod fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n",
+               kvm_err("Illegal TLB Mod fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
                        cause, opc, badvaddr);
                kvm_mips_dump_host_tlbs();
                kvm_arch_vcpu_dump_regs(vcpu);
@@ -128,59 +128,12 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
        return ret;
 }
 
-static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
-{
-       struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
-       unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
-       enum emulation_result er = EMULATE_DONE;
-       int ret = RESUME_GUEST;
-
-       if (((badvaddr & PAGE_MASK) == KVM_GUEST_COMMPAGE_ADDR)
-           && KVM_GUEST_KERNEL_MODE(vcpu)) {
-               if (kvm_mips_handle_commpage_tlb_fault(badvaddr, vcpu) < 0) {
-                       run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-                       ret = RESUME_HOST;
-               }
-       } else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
-                  || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
-               kvm_debug("USER ADDR TLB LD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n",
-                         cause, opc, badvaddr);
-               er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu);
-               if (er == EMULATE_DONE)
-                       ret = RESUME_GUEST;
-               else {
-                       run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-                       ret = RESUME_HOST;
-               }
-       } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) {
-               /*
-                * All KSEG0 faults are handled by KVM, as the guest kernel does
-                * not expect to ever get them
-                */
-               if (kvm_mips_handle_kseg0_tlb_fault
-                   (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) {
-                       run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-                       ret = RESUME_HOST;
-               }
-       } else {
-               kvm_err("Illegal TLB LD fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n",
-                       cause, opc, badvaddr);
-               kvm_mips_dump_host_tlbs();
-               kvm_arch_vcpu_dump_regs(vcpu);
-               run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-               ret = RESUME_HOST;
-       }
-       return ret;
-}
-
-static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
+static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
        unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -192,8 +145,8 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
                }
        } else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
                   || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
-               kvm_debug("USER ADDR TLB ST fault: PC: %#lx, BadVaddr: %#lx\n",
-                         vcpu->arch.pc, badvaddr);
+               kvm_debug("USER ADDR TLB %s fault: cause %#x, PC: %p, BadVaddr: %#lx\n",
+                         store ? "ST" : "LD", cause, opc, badvaddr);
 
                /*
                 * User Address (UA) fault, this could happen if
@@ -213,14 +166,18 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
                        ret = RESUME_HOST;
                }
        } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) {
+               /*
+                * All KSEG0 faults are handled by KVM, as the guest kernel does
+                * not expect to ever get them
+                */
                if (kvm_mips_handle_kseg0_tlb_fault
                    (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) {
                        run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                        ret = RESUME_HOST;
                }
        } else {
-               kvm_err("Illegal TLB ST fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n",
-                       cause, opc, badvaddr);
+               kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
+                       store ? "ST" : "LD", cause, opc, badvaddr);
                kvm_mips_dump_host_tlbs();
                kvm_arch_vcpu_dump_regs(vcpu);
                run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -229,12 +186,22 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
        return ret;
 }
 
+static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
+{
+       return kvm_trap_emul_handle_tlb_miss(vcpu, true);
+}
+
+static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
+{
+       return kvm_trap_emul_handle_tlb_miss(vcpu, false);
+}
+
 static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
        unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -251,7 +218,7 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
                        ret = RESUME_HOST;
                }
        } else {
-               kvm_err("Address Error (STORE): cause %#lx, PC: %p, BadVaddr: %#lx\n",
+               kvm_err("Address Error (STORE): cause %#x, PC: %p, BadVaddr: %#lx\n",
                        cause, opc, badvaddr);
                run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                ret = RESUME_HOST;
@@ -262,9 +229,9 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
        unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -280,7 +247,7 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
                        ret = RESUME_HOST;
                }
        } else {
-               kvm_err("Address Error (LOAD): cause %#lx, PC: %p, BadVaddr: %#lx\n",
+               kvm_err("Address Error (LOAD): cause %#x, PC: %p, BadVaddr: %#lx\n",
                        cause, opc, badvaddr);
                run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                ret = RESUME_HOST;
@@ -292,8 +259,8 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -310,8 +277,8 @@ static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -328,8 +295,8 @@ static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -346,8 +313,8 @@ static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 __user *opc = (u32 __user *)vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -364,8 +331,8 @@ static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 __user *opc = (u32 __user *)vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -382,8 +349,8 @@ static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_fpe(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 __user *opc = (u32 __user *)vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -407,8 +374,8 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        struct kvm_run *run = vcpu->run;
-       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
-       unsigned long cause = vcpu->arch.host_cp0_cause;
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
@@ -451,24 +418,41 @@ static int kvm_trap_emul_vm_init(struct kvm *kvm)
 
 static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu)
 {
+       vcpu->arch.kscratch_enabled = 0xfc;
+
        return 0;
 }
 
 static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
-       uint32_t config1;
+       u32 config, config1;
        int vcpu_id = vcpu->vcpu_id;
 
        /*
         * Arch specific stuff, set up config registers properly so that the
-        * guest will come up as expected, for now we simulate a MIPS 24kc
+        * guest will come up as expected
         */
+#ifndef CONFIG_CPU_MIPSR6
+       /* r2-r5, simulate a MIPS 24kc */
        kvm_write_c0_guest_prid(cop0, 0x00019300);
-       /* Have config1, Cacheable, noncoherent, write-back, write allocate */
-       kvm_write_c0_guest_config(cop0, MIPS_CONF_M | (0x3 << CP0C0_K0) |
-                                 (0x1 << CP0C0_AR) |
-                                 (MMU_TYPE_R4000 << CP0C0_MT));
+#else
+       /* r6+, simulate a generic QEMU machine */
+       kvm_write_c0_guest_prid(cop0, 0x00010000);
+#endif
+       /*
+        * Have config1, Cacheable, noncoherent, write-back, write allocate.
+        * Endianness, arch revision & virtually tagged icache should match
+        * host.
+        */
+       config = read_c0_config() & MIPS_CONF_AR;
+       config |= MIPS_CONF_M | CONF_CM_CACHABLE_NONCOHERENT | MIPS_CONF_MT_TLB;
+#ifdef CONFIG_CPU_BIG_ENDIAN
+       config |= CONF_BE;
+#endif
+       if (cpu_has_vtag_icache)
+               config |= MIPS_CONF_VI;
+       kvm_write_c0_guest_config(cop0, config);
 
        /* Read the cache characteristics from the host Config1 Register */
        config1 = (read_c0_config1() & ~0x7f);
@@ -478,9 +462,8 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
        config1 |= ((KVM_MIPS_GUEST_TLB_SIZE - 1) << 25);
 
        /* We unset some bits that we aren't emulating */
-       config1 &=
-           ~((1 << CP0C1_C2) | (1 << CP0C1_MD) | (1 << CP0C1_PC) |
-             (1 << CP0C1_WR) | (1 << CP0C1_CA));
+       config1 &= ~(MIPS_CONF1_C2 | MIPS_CONF1_MD | MIPS_CONF1_PC |
+                    MIPS_CONF1_WR | MIPS_CONF1_CA);
        kvm_write_c0_guest_config1(cop0, config1);
 
        /* Have config3, no tertiary/secondary caches implemented */
@@ -511,6 +494,17 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+/*
+ * .num_regs callback of kvm_trap_emul_callbacks.
+ *
+ * Always returns 0; @vcpu is not examined.
+ * NOTE(review): presumably the count of implementation-specific registers
+ * reported on top of the common set - confirm against the caller.
+ */
+static unsigned long kvm_trap_emul_num_regs(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
+/*
+ * .copy_reg_indices callback of kvm_trap_emul_callbacks.
+ *
+ * Writes nothing through @indices and always returns 0, matching the
+ * zero count returned by kvm_trap_emul_num_regs().
+ */
+static int kvm_trap_emul_copy_reg_indices(struct kvm_vcpu *vcpu,
+                                         u64 __user *indices)
+{
+       return 0;
+}
+
 static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu,
                                     const struct kvm_one_reg *reg,
                                     s64 *v)
@@ -660,6 +654,8 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
        .dequeue_io_int = kvm_mips_dequeue_io_int_cb,
        .irq_deliver = kvm_mips_irq_deliver_cb,
        .irq_clear = kvm_mips_irq_clear_cb,
+       .num_regs = kvm_trap_emul_num_regs,
+       .copy_reg_indices = kvm_trap_emul_copy_reg_indices,
        .get_one_reg = kvm_trap_emul_get_one_reg,
        .set_one_reg = kvm_trap_emul_set_one_reg,
        .vcpu_get_regs = kvm_trap_emul_vcpu_get_regs,
index d96e912..6dc07fb 100644 (file)
@@ -627,8 +627,8 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
                                dec_insn.pc_inc +
                                dec_insn.next_pc_inc;
                return 1;
-       case cbcond0_op:
-       case cbcond1_op:
+       case pop10_op:
+       case pop30_op:
                if (!cpu_has_mips_r6)
                        break;
                if (insn.i_format.rt && !insn.i_format.rs)
@@ -683,14 +683,14 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
                        dec_insn.next_pc_inc;
 
                return 1;
-       case beqzcjic_op:
+       case pop66_op:
                if (!cpu_has_mips_r6)
                        break;
                *contpc = regs->cp0_epc + dec_insn.pc_inc +
                        dec_insn.next_pc_inc;
 
                return 1;
-       case bnezcjialc_op:
+       case pop76_op:
                if (!cpu_has_mips_r6)
                        break;
                if (!insn.i_format.rs)
index ef7f925..7a9c345 100644 (file)
@@ -1206,7 +1206,7 @@ static void probe_pcache(void)
                              c->icache.linesz;
                c->icache.waybit = __ffs(icache_size/c->icache.ways);
 
-               if (config & 0x8)               /* VI bit */
+               if (config & MIPS_CONF_VI)
                        c->icache.flags |= MIPS_CACHE_VTAG;
 
                /*
index d78178d..277cf52 100644 (file)
@@ -53,8 +53,13 @@ static struct insn insn_table_MM[] = {
        { insn_bltzl, 0, 0 },
        { insn_bne, M(mm_bne32_op, 0, 0, 0, 0, 0), RT | RS | BIMM },
        { insn_cache, M(mm_pool32b_op, 0, 0, mm_cache_func, 0, 0), RT | RS | SIMM },
+       { insn_cfc1, M(mm_pool32f_op, 0, 0, 0, mm_cfc1_op, mm_32f_73_op), RT | RS },
+       { insn_cfcmsa, M(mm_pool32s_op, 0, msa_cfc_op, 0, 0, mm_32s_elm_op), RD | RE },
+       { insn_ctc1, M(mm_pool32f_op, 0, 0, 0, mm_ctc1_op, mm_32f_73_op), RT | RS },
+       { insn_ctcmsa, M(mm_pool32s_op, 0, msa_ctc_op, 0, 0, mm_32s_elm_op), RD | RE },
        { insn_daddu, 0, 0 },
        { insn_daddiu, 0, 0 },
+       { insn_di, M(mm_pool32a_op, 0, 0, 0, mm_di_op, mm_pool32axf_op), RS },
        { insn_divu, M(mm_pool32a_op, 0, 0, 0, mm_divu_op, mm_pool32axf_op), RT | RS },
        { insn_dmfc0, 0, 0 },
        { insn_dmtc0, 0, 0 },
@@ -84,6 +89,8 @@ static struct insn insn_table_MM[] = {
        { insn_mfhi, M(mm_pool32a_op, 0, 0, 0, mm_mfhi32_op, mm_pool32axf_op), RS },
        { insn_mflo, M(mm_pool32a_op, 0, 0, 0, mm_mflo32_op, mm_pool32axf_op), RS },
        { insn_mtc0, M(mm_pool32a_op, 0, 0, 0, mm_mtc0_op, mm_pool32axf_op), RT | RS | RD },
+       { insn_mthi, M(mm_pool32a_op, 0, 0, 0, mm_mthi32_op, mm_pool32axf_op), RS },
+       { insn_mtlo, M(mm_pool32a_op, 0, 0, 0, mm_mtlo32_op, mm_pool32axf_op), RS },
        { insn_mul, M(mm_pool32a_op, 0, 0, 0, 0, mm_mul_op), RT | RS | RD },
        { insn_or, M(mm_pool32a_op, 0, 0, 0, 0, mm_or32_op), RT | RS | RD },
        { insn_ori, M(mm_ori32_op, 0, 0, 0, 0, 0), RT | RS | UIMM },
@@ -166,13 +173,15 @@ static void build_insn(u32 **buf, enum opcode opc, ...)
        op = ip->match;
        va_start(ap, opc);
        if (ip->fields & RS) {
-               if (opc == insn_mfc0 || opc == insn_mtc0)
+               if (opc == insn_mfc0 || opc == insn_mtc0 ||
+                   opc == insn_cfc1 || opc == insn_ctc1)
                        op |= build_rt(va_arg(ap, u32));
                else
                        op |= build_rs(va_arg(ap, u32));
        }
        if (ip->fields & RT) {
-               if (opc == insn_mfc0 || opc == insn_mtc0)
+               if (opc == insn_mfc0 || opc == insn_mtc0 ||
+                   opc == insn_cfc1 || opc == insn_ctc1)
                        op |= build_rs(va_arg(ap, u32));
                else
                        op |= build_rt(va_arg(ap, u32));
index 9c2220a..cec5241 100644 (file)
@@ -67,9 +67,14 @@ static struct insn insn_table[] = {
 #else
        { insn_cache,  M6(cache_op, 0, 0, 0, cache6_op),  RS | RT | SIMM9 },
 #endif
+       { insn_cfc1, M(cop1_op, cfc_op, 0, 0, 0, 0), RT | RD },
+       { insn_cfcmsa, M(msa_op, 0, msa_cfc_op, 0, 0, msa_elm_op), RD | RE },
+       { insn_ctc1, M(cop1_op, ctc_op, 0, 0, 0, 0), RT | RD },
+       { insn_ctcmsa, M(msa_op, 0, msa_ctc_op, 0, 0, msa_elm_op), RD | RE },
        { insn_daddiu, M(daddiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM },
        { insn_daddu, M(spec_op, 0, 0, 0, 0, daddu_op), RS | RT | RD },
        { insn_dinsm, M(spec3_op, 0, 0, 0, 0, dinsm_op), RS | RT | RD | RE },
+       { insn_di, M(cop0_op, mfmc0_op, 0, 12, 0, 0), RT },
        { insn_dins, M(spec3_op, 0, 0, 0, 0, dins_op), RS | RT | RD | RE },
        { insn_divu, M(spec_op, 0, 0, 0, 0, divu_op), RS | RT },
        { insn_dmfc0, M(cop0_op, dmfc_op, 0, 0, 0, 0), RT | RD | SET},
@@ -114,7 +119,13 @@ static struct insn insn_table[] = {
        { insn_mflo,  M(spec_op, 0, 0, 0, 0, mflo_op), RD },
        { insn_mtc0,  M(cop0_op, mtc_op, 0, 0, 0, 0),  RT | RD | SET},
        { insn_mthc0,  M(cop0_op, mthc0_op, 0, 0, 0, 0),  RT | RD | SET},
+       { insn_mthi,  M(spec_op, 0, 0, 0, 0, mthi_op), RS },
+       { insn_mtlo,  M(spec_op, 0, 0, 0, 0, mtlo_op), RS },
+#ifndef CONFIG_CPU_MIPSR6
        { insn_mul, M(spec2_op, 0, 0, 0, 0, mul_op), RS | RT | RD},
+#else
+       { insn_mul, M(spec_op, 0, 0, 0, mult_mul_op, mult_op), RS | RT | RD},
+#endif
        { insn_ori,  M(ori_op, 0, 0, 0, 0, 0),  RS | RT | UIMM },
        { insn_or,  M(spec_op, 0, 0, 0, 0, or_op),  RS | RT | RD },
 #ifndef CONFIG_CPU_MIPSR6
index ad718de..3e0282d 100644 (file)
@@ -49,18 +49,19 @@ enum opcode {
        insn_invalid,
        insn_addiu, insn_addu, insn_and, insn_andi, insn_bbit0, insn_bbit1,
        insn_beq, insn_beql, insn_bgez, insn_bgezl, insn_bltz, insn_bltzl,
-       insn_bne, insn_cache, insn_daddiu, insn_daddu, insn_dins, insn_dinsm,
-       insn_divu, insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll,
+       insn_bne, insn_cache, insn_cfc1, insn_cfcmsa, insn_ctc1, insn_ctcmsa,
+       insn_daddiu, insn_daddu, insn_di, insn_dins, insn_dinsm, insn_divu,
+       insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll,
        insn_dsll32, insn_dsra, insn_dsrl, insn_dsrl32, insn_dsubu, insn_eret,
        insn_ext, insn_ins, insn_j, insn_jal, insn_jalr, insn_jr, insn_lb,
        insn_ld, insn_ldx, insn_lh, insn_ll, insn_lld, insn_lui, insn_lw,
        insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi, insn_mflo, insn_mtc0,
-       insn_mthc0, insn_mul, insn_or, insn_ori, insn_pref, insn_rfe,
-       insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll, insn_sllv, insn_slt,
-       insn_sltiu, insn_sltu, insn_sra, insn_srl, insn_srlv, insn_subu,
-       insn_sw, insn_sync, insn_syscall, insn_tlbp, insn_tlbr, insn_tlbwi,
-       insn_tlbwr, insn_wait, insn_wsbh, insn_xor, insn_xori, insn_yield,
-       insn_lddir, insn_ldpte,
+       insn_mthc0, insn_mthi, insn_mtlo, insn_mul, insn_or, insn_ori,
+       insn_pref, insn_rfe, insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll,
+       insn_sllv, insn_slt, insn_sltiu, insn_sltu, insn_sra, insn_srl,
+       insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, insn_tlbp,
+       insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, insn_xor,
+       insn_xori, insn_yield, insn_lddir, insn_ldpte,
 };
 
 struct insn {
@@ -268,10 +269,15 @@ I_u1s2(_bltz)
 I_u1s2(_bltzl)
 I_u1u2s3(_bne)
 I_u2s3u1(_cache)
+I_u1u2(_cfc1)
+I_u2u1(_cfcmsa)
+I_u1u2(_ctc1)
+I_u2u1(_ctcmsa)
 I_u1u2u3(_dmfc0)
 I_u1u2u3(_dmtc0)
 I_u2u1s3(_daddiu)
 I_u3u1u2(_daddu)
+I_u1(_di);
 I_u1u2(_divu)
 I_u2u1u3(_dsll)
 I_u2u1u3(_dsll32)
@@ -301,6 +307,8 @@ I_u1(_mfhi)
 I_u1(_mflo)
 I_u1u2u3(_mtc0)
 I_u1u2u3(_mthc0)
+I_u1(_mthi)
+I_u1(_mtlo)
 I_u3u1u2(_mul)
 I_u2u1u3(_ori)
 I_u3u1u2(_or)
diff --git a/arch/powerpc/include/asm/hmi.h b/arch/powerpc/include/asm/hmi.h
new file mode 100644 (file)
index 0000000..88b4901
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * Hypervisor Maintenance Interrupt header file.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.
+ *
+ * Copyright 2015 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#ifndef __ASM_PPC64_HMI_H__
+#define __ASM_PPC64_HMI_H__
+
+#ifdef CONFIG_PPC_BOOK3S_64
+
+#define        CORE_TB_RESYNC_REQ_BIT          63
+#define MAX_SUBCORE_PER_CORE           4
+
+/*
+ * sibling_subcore_state structure is used to co-ordinate all threads
+ * during HMI to avoid TB corruption. This structure is allocated once
+ * per each core and shared by all threads on that core.
+ */
+struct sibling_subcore_state {
+       /* Bit CORE_TB_RESYNC_REQ_BIT is set while a TB resync is pending. */
+       unsigned long   flags;
+       /* Indexed by subcore id; non-zero while that subcore is in a guest. */
+       u8              in_guest[MAX_SUBCORE_PER_CORE];
+};
+
+extern void wait_for_subcore_guest_exit(void);
+extern void wait_for_tb_resync(void);
+#else
+static inline void wait_for_subcore_guest_exit(void) { }
+static inline void wait_for_tb_resync(void) { }
+#endif
+#endif /* __ASM_PPC64_HMI_H__ */
index ad171e9..148303e 100644 (file)
@@ -26,6 +26,7 @@
 #include <asm/kvm_book3s_asm.h>
 #endif
 #include <asm/accounting.h>
+#include <asm/hmi.h>
 
 register struct paca_struct *local_paca asm("r13");
 
@@ -182,6 +183,11 @@ struct paca_struct {
         */
        u16 in_mce;
        u8 hmi_event_available;          /* HMI event is available */
+       /*
+        * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for
+        * more details
+        */
+       struct sibling_subcore_state *sibling_subcore_state;
 #endif
 
        /* Stuff for accurate time accounting */
index fe4c075..b2027a5 100644 (file)
@@ -41,7 +41,7 @@ obj-$(CONFIG_VDSO32)          += vdso32/
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)       += hw_breakpoint.o
 obj-$(CONFIG_PPC_BOOK3S_64)    += cpu_setup_ppc970.o cpu_setup_pa6t.o
 obj-$(CONFIG_PPC_BOOK3S_64)    += cpu_setup_power.o
-obj-$(CONFIG_PPC_BOOK3S_64)    += mce.o mce_power.o
+obj-$(CONFIG_PPC_BOOK3S_64)    += mce.o mce_power.o hmi.o
 obj-$(CONFIG_PPC_BOOK3E_64)    += exceptions-64e.o idle_book3e.o
 obj-$(CONFIG_PPC64)            += vdso64/
 obj-$(CONFIG_ALTIVEC)          += vecemu.o
index 6200e49..694def6 100644 (file)
@@ -671,6 +671,8 @@ BEGIN_FTR_SECTION
        beq     h_doorbell_common
        cmpwi   r3,0xea0
        beq     h_virt_irq_common
+       cmpwi   r3,0xe60
+       beq     hmi_exception_common
 FTR_SECTION_ELSE
        cmpwi   r3,0xa00
        beq     doorbell_super_common
@@ -1172,7 +1174,7 @@ fwnmi_data_area:
 
        .globl hmi_exception_early
 hmi_exception_early:
-       EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60)
+       EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, 0xe62)
        mr      r10,r1                  /* Save r1                      */
        ld      r1,PACAEMERGSP(r13)     /* Use emergency stack          */
        subi    r1,r1,INT_FRAME_SIZE    /* alloc stack frame            */
diff --git a/arch/powerpc/kernel/hmi.c b/arch/powerpc/kernel/hmi.c
new file mode 100644 (file)
index 0000000..e3f738e
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * Hypervisor Maintenance Interrupt (HMI) handling.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.
+ *
+ * Copyright 2015 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <asm/paca.h>
+#include <asm/hmi.h>
+
+/*
+ * Busy-wait until every in_guest[] entry of the calling CPU's
+ * sibling_subcore_state reads zero. Returns immediately when the paca has
+ * no sibling_subcore_state attached.
+ */
+void wait_for_subcore_guest_exit(void)
+{
+       int i;
+
+       /*
+        * NULL bitmap pointer indicates that KVM module hasn't
+        * been loaded yet and hence no guests are running.
+        * If no KVM is in use, no need to co-ordinate among threads
+        * as all of them will always be in host and no one is going
+        * to modify TB other than the opal hmi handler.
+        * Hence, just return from here.
+        */
+       if (!local_paca->sibling_subcore_state)
+               return;
+
+       /* Entries are polled in index order, one subcore at a time. */
+       for (i = 0; i < MAX_SUBCORE_PER_CORE; i++)
+               while (local_paca->sibling_subcore_state->in_guest[i])
+                       cpu_relax();
+}
+
+/*
+ * Busy-wait until CORE_TB_RESYNC_REQ_BIT is clear in the flags word of the
+ * calling CPU's sibling_subcore_state. Returns immediately when the paca
+ * has no sibling_subcore_state attached.
+ */
+void wait_for_tb_resync(void)
+{
+       /* No shared state allocated: nothing to wait for. */
+       if (!local_paca->sibling_subcore_state)
+               return;
+
+       while (test_bit(CORE_TB_RESYNC_REQ_BIT,
+                               &local_paca->sibling_subcore_state->flags))
+               cpu_relax();
+}
index 335eb6c..8a56a51 100644 (file)
@@ -336,7 +336,9 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);            \
        ld      r2,PACATOC(r13);                                        \
        ld      r1,PACAR1(r13);                                         \
        std     r3,ORIG_GPR3(r1);       /* Save original r3 */          \
-       bl      opal_rm_handle_hmi;                                     \
+       li      r3,0;                   /* NULL argument */             \
+       bl      hmi_exception_realmode;                                 \
+       nop;                                                            \
        ld      r3,ORIG_GPR3(r1);       /* Restore original r3 */       \
 20:    nop;
 
index f7e2f2e..2cb5892 100644 (file)
@@ -61,6 +61,7 @@
 #include <asm/tm.h>
 #include <asm/debug.h>
 #include <asm/asm-prototypes.h>
+#include <asm/hmi.h>
 #include <sysdev/fsl_pci.h>
 
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
@@ -308,9 +309,13 @@ long hmi_exception_realmode(struct pt_regs *regs)
 {
        __this_cpu_inc(irq_stat.hmi_exceptions);
 
+       wait_for_subcore_guest_exit();
+
        if (ppc_md.hmi_exception_early)
                ppc_md.hmi_exception_early(regs);
 
+       wait_for_tb_resync();
+
        return 0;
 }
 
index e20beae..2fd5580 100644 (file)
@@ -52,6 +52,7 @@
 #include <asm/switch_to.h>
 #include <asm/smp.h>
 #include <asm/dbell.h>
+#include <asm/hmi.h>
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
@@ -2522,7 +2523,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list)
                        spin_unlock(&pvc->lock);
 
-       kvm_guest_enter();
+       guest_enter();
 
        srcu_idx = srcu_read_lock(&vc->kvm->srcu);
 
@@ -2570,7 +2571,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 
        /* make sure updates to secondary vcpu structs are visible now */
        smp_mb();
-       kvm_guest_exit();
+       guest_exit();
 
        for (sub = 0; sub < core_info.n_subcores; ++sub)
                list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub],
@@ -3401,6 +3402,38 @@ static struct kvmppc_ops kvm_ops_hv = {
        .hcall_implemented = kvmppc_hcall_impl_hv,
 };
 
+/*
+ * Allocate one zeroed sibling_subcore_state per core, on the NUMA node of
+ * the core's first thread, and point the paca of every thread of that core
+ * at it.
+ *
+ * Cores whose first thread already has a state pointer are skipped, so a
+ * repeated call (including one after an earlier partial failure) only
+ * fills in what is still missing.
+ *
+ * Returns 0 on success, or -ENOMEM if an allocation fails; state already
+ * attached to earlier cores is left in place.
+ */
+static int kvm_init_subcore_bitmap(void)
+{
+       int i, j;
+       int nr_cores = cpu_nr_cores();
+       struct sibling_subcore_state *sibling_subcore_state;
+
+       for (i = 0; i < nr_cores; i++) {
+               int first_cpu = i * threads_per_core;
+               int node = cpu_to_node(first_cpu);
+
+               /* Ignore if it is already allocated. */
+               if (paca[first_cpu].sibling_subcore_state)
+                       continue;
+
+               /* kzalloc_node() returns zeroed memory, no memset() needed. */
+               sibling_subcore_state =
+                       kzalloc_node(sizeof(*sibling_subcore_state),
+                                    GFP_KERNEL, node);
+               if (!sibling_subcore_state)
+                       return -ENOMEM;
+
+               /* Every thread of the core shares the same state object. */
+               for (j = 0; j < threads_per_core; j++) {
+                       int cpu = first_cpu + j;
+
+                       paca[cpu].sibling_subcore_state = sibling_subcore_state;
+               }
+       }
+       return 0;
+}
+
 static int kvmppc_book3s_init_hv(void)
 {
        int r;
@@ -3411,6 +3444,10 @@ static int kvmppc_book3s_init_hv(void)
        if (r < 0)
                return -ENODEV;
 
+       r = kvm_init_subcore_bitmap();
+       if (r)
+               return r;
+
        kvm_ops_hv.owner = THIS_MODULE;
        kvmppc_hv_ops = &kvm_ops_hv;
 
index 93b5f5c..0fa70a9 100644 (file)
@@ -13,6 +13,9 @@
 #include <linux/kernel.h>
 #include <asm/opal.h>
 #include <asm/mce.h>
+#include <asm/machdep.h>
+#include <asm/cputhreads.h>
+#include <asm/hmi.h>
 
 /* SRR1 bits for machine check on POWER7 */
 #define SRR1_MC_LDSTERR                (1ul << (63-42))
@@ -140,3 +143,176 @@ long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
 {
        return kvmppc_realmode_mc_power7(vcpu);
 }
+
+/*
+ * Subcore size currently in effect: the size recorded in the paca's
+ * kvm_split_mode when that pointer is set, threads_per_subcore otherwise.
+ */
+static inline int kvmppc_cur_subcore_size(void)
+{
+       return local_paca->kvm_hstate.kvm_split_mode ?
+               local_paca->kvm_hstate.kvm_split_mode->subcore_size :
+               threads_per_subcore;
+}
+
+/* Flag the calling thread's subcore as being inside the guest. */
+void kvmppc_subcore_enter_guest(void)
+{
+       int tid = cpu_thread_in_core(local_paca->paca_index);
+       int subcore = tid / kvmppc_cur_subcore_size();
+
+       local_paca->sibling_subcore_state->in_guest[subcore] = 1;
+}
+
+/* Flag the calling thread's subcore as having left the guest. */
+void kvmppc_subcore_exit_guest(void)
+{
+       int tid = cpu_thread_in_core(local_paca->paca_index);
+       int subcore = tid / kvmppc_cur_subcore_size();
+
+       local_paca->sibling_subcore_state->in_guest[subcore] = 0;
+}
+
+/*
+ * Try to claim the core-wide TB resync. Returns true only for the caller
+ * that finds CORE_TB_RESYNC_REQ_BIT clear and sets it; returns false when
+ * the bit was already set.
+ */
+static bool kvmppc_tb_resync_required(void)
+{
+       return !test_and_set_bit(CORE_TB_RESYNC_REQ_BIT,
+                                &local_paca->sibling_subcore_state->flags);
+}
+
+/*
+ * Clear CORE_TB_RESYNC_REQ_BIT in the shared flags word; this is the bit
+ * that wait_for_tb_resync() spins on.
+ */
+static void kvmppc_tb_resync_done(void)
+{
+       clear_bit(CORE_TB_RESYNC_REQ_BIT,
+                       &local_paca->sibling_subcore_state->flags);
+}
+
+/*
+ * kvmppc_realmode_hmi_handler() is called only by primary thread during
+ * guest exit path.
+ *
+ * There are multiple reasons why HMI could occur, one of them is
+ * Timebase (TB) error. If this HMI is due to TB error, then TB would
+ * have been in stopped state. The opal hmi handler will fix it and
+ * restore the TB value with host timebase value. For HMI caused due
+ * to non-TB errors, opal hmi handler will not touch/restore TB register
+ * and hence there won't be any change in TB value.
+ *
+ * Since we are not sure about the cause of this HMI, we can't be sure
+ * about the content of TB register whether it holds guest or host timebase
+ * value. Hence the idea is to resync the TB on every HMI, so that we
+ * know about the exact state of the TB value. Resync TB call will
+ * restore TB to host timebase.
+ *
+ * Things to consider:
+ * - On TB error, HMI interrupt is reported on all the threads of the core
+ *   that has encountered TB error irrespective of split-core mode.
+ * - The very first thread on the core that gets a chance to fix the TB
+ *   error would resync the TB with the local chipTOD value.
+ * - The resync TB is a core level action i.e. it will sync all the TBs
+ *   in that core independent of split-core mode. This means if we trigger
+ *   TB sync from a thread from one subcore, it would affect TB values of
+ *   sibling subcores of the same core.
+ *
+ * All threads need to co-ordinate before making the opal hmi handler call.
+ * All threads will use sibling_subcore_state->in_guest[] (shared by all
+ * threads in the core) in paca which holds information about whether
+ * sibling subcores are in Guest mode or host mode. The in_guest[] array
+ * is of size MAX_SUBCORE_PER_CORE=4, indexed using subcore id to set/unset
+ * subcore status. Only the primary thread of each subcore is responsible
+ * for setting/unsetting its designated array element while entering/exiting
+ * the guest.
+ *
+ * After invoking the opal hmi handler, one of the threads (of the entire core)
+ * will need to resync the TB. Bit 63 from subcore state bitmap flags
+ * (sibling_subcore_state->flags) will be used to co-ordinate between
+ * primary threads to decide who takes up the responsibility.
+ *
+ * This is what we do:
+ * - Primary thread from each subcore tries to set resync required bit[63]
+ *   of paca->sibling_subcore_state->flags.
+ * - The first primary thread that is able to set the flag takes the
+ *   responsibility of TB resync. (Let us call it as thread leader)
+ * - All other threads which are in host will call
+ *   wait_for_subcore_guest_exit() and wait for in_guest[0-3] from
+ *   paca->sibling_subcore_state to get cleared.
+ * - Each primary thread will clear its own subcore's status in the
+ *   subcore state in_guest[] array.
+ * - Once all primary threads clear in_guest[0-3], all of them will invoke
+ *   opal hmi handler.
+ * - Now all threads will wait for TB resync to complete by invoking
+ *   wait_for_tb_resync() except the thread leader.
+ * - Thread leader will do a TB resync by invoking opal_resync_timebase()
+ *   call and then it will clear the resync required bit.
+ * - All other threads will now come out of resync wait loop and proceed
+ *   with individual execution.
+ * - On return of this function, primary thread will signal all
+ *   secondary threads to proceed.
+ * - All secondary threads will eventually call opal hmi handler on
+ *   their exit path.
+ */
+
+/*
+ * Real-mode HMI handling for a primary thread on the guest exit path.
+ * BUGs if the calling thread's kvm_hstate.ptid is not 0. Always returns 0.
+ */
+long kvmppc_realmode_hmi_handler(void)
+{
+       int ptid = local_paca->kvm_hstate.ptid;
+       bool resync_req;
+
+       /* This is only called on the primary thread. */
+       BUG_ON(ptid != 0);
+       __this_cpu_inc(irq_stat.hmi_exceptions);
+
+       /*
+        * By now the primary thread has already completed the guest->host
+        * partition switch but hasn't signaled the secondaries yet.
+        * All the secondary threads on this subcore are waiting
+        * for the primary thread to signal them to go ahead.
+        *
+        * Threads from a subcore which isn't in a guest will all
+        * wait until all other subcores on this core exit the guest.
+        *
+        * Now set the resync required bit. If you are the first to
+        * set this bit then kvmppc_tb_resync_required() will
+        * return true. For all other subcores
+        * kvmppc_tb_resync_required() will return false.
+        *
+        * If resync_req == true, then this thread is responsible for
+        * initiating the TB resync after the hmi handler has completed.
+        * All other threads on this core will wait until this thread
+        * clears the resync required bit flag.
+        */
+       resync_req = kvmppc_tb_resync_required();
+
+       /* Reset the subcore status to indicate it has exited guest */
+       kvmppc_subcore_exit_guest();
+
+       /*
+        * Wait for other subcores on this core to exit the guest.
+        * All the primary threads and threads from subcores that are
+        * not in a guest will wait here until all subcores are out
+        * of guest context.
+        */
+       wait_for_subcore_guest_exit();
+
+       /*
+        * At this point we are sure that primary threads from each
+        * subcore on this core have completed guest->host partition
+        * switch. Now it is safe to call HMI handler.
+        */
+       if (ppc_md.hmi_exception_early)
+               ppc_md.hmi_exception_early(NULL);
+
+       /*
+        * Check if this thread is responsible for resyncing the TB.
+        * All other threads will wait until this thread completes the
+        * TB resync.
+        */
+       if (resync_req) {
+               opal_resync_timebase();
+               /* Reset TB resync req bit */
+               kvmppc_tb_resync_done();
+       } else {
+               wait_for_tb_resync();
+       }
+       return 0;
+}
index 86f0cae..9756555 100644 (file)
@@ -29,6 +29,7 @@
 #include <asm/kvm_book3s_asm.h>
 #include <asm/book3s/64/mmu-hash.h>
 #include <asm/tm.h>
+#include <asm/opal.h>
 
 #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
 
@@ -373,6 +374,18 @@ kvm_secondary_got_guest:
        lwsync
        std     r0, HSTATE_KVM_VCORE(r13)
 
+       /*
+        * All secondaries exiting guest will fall through this path.
+        * Before proceeding, just check for HMI interrupt and
+        * invoke opal hmi handler. By now we are sure that the
+        * primary thread on this core/subcore has already made partition
+        * switch/TB resync and we are good to call opal hmi handler.
+        */
+       cmpwi   r12, BOOK3S_INTERRUPT_HMI
+       bne     kvm_no_guest
+
+       li      r3,0                    /* NULL argument */
+       bl      hmi_exception_realmode
 /*
  * At this point we have finished executing in the guest.
  * We need to wait for hwthread_req to become zero, since
@@ -427,6 +440,22 @@ kvm_no_guest:
  * whole-core mode, so we need to nap.
  */
 kvm_unsplit_nap:
+       /*
+        * When secondaries are napping in kvm_unsplit_nap() with
+        * hwthread_req = 1, the HMI is ignored even though the subcores have
+        * already exited the guest. Hence the HMI keeps waking up secondaries
+        * from nap in a loop and secondaries always go back to nap since
+        * no vcore is assigned to them. This makes it impossible for the
+        * primary thread to get hold of the secondary threads, resulting in
+        * a soft lockup in the KVM path.
+        *
+        * Let us check if HMI is pending and handle it before we go to nap.
+        */
+       cmpwi   r12, BOOK3S_INTERRUPT_HMI
+       bne     55f
+       li      r3, 0                   /* NULL argument */
+       bl      hmi_exception_realmode
+55:
        /*
         * Ensure that secondary doesn't nap when it has
         * its vcore pointer set.
@@ -601,6 +630,11 @@ BEGIN_FTR_SECTION
        mtspr   SPRN_DPDES, r8
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
+       /* Mark the subcore state as inside guest */
+       bl      kvmppc_subcore_enter_guest
+       nop
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       ld      r4, HSTATE_KVM_VCPU(r13)
        li      r0,1
        stb     r0,VCORE_IN_GUEST(r5)   /* signal secondaries to continue */
 
@@ -655,112 +689,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
-       b       skip_tm
-END_FTR_SECTION_IFCLR(CPU_FTR_TM)
-
-       /* Turn on TM/FP/VSX/VMX so we can restore them. */
-       mfmsr   r5
-       li      r6, MSR_TM >> 32
-       sldi    r6, r6, 32
-       or      r5, r5, r6
-       ori     r5, r5, MSR_FP
-       oris    r5, r5, (MSR_VEC | MSR_VSX)@h
-       mtmsrd  r5
-
-       /*
-        * The user may change these outside of a transaction, so they must
-        * always be context switched.
-        */
-       ld      r5, VCPU_TFHAR(r4)
-       ld      r6, VCPU_TFIAR(r4)
-       ld      r7, VCPU_TEXASR(r4)
-       mtspr   SPRN_TFHAR, r5
-       mtspr   SPRN_TFIAR, r6
-       mtspr   SPRN_TEXASR, r7
-
-       ld      r5, VCPU_MSR(r4)
-       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
-       beq     skip_tm /* TM not active in guest */
-
-       /* Make sure the failure summary is set, otherwise we'll program check
-        * when we trechkpt.  It's possible that this might have been not set
-        * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
-        * host.
-        */
-       oris    r7, r7, (TEXASR_FS)@h
-       mtspr   SPRN_TEXASR, r7
-
-       /*
-        * We need to load up the checkpointed state for the guest.
-        * We need to do this early as it will blow away any GPRs, VSRs and
-        * some SPRs.
-        */
-
-       mr      r31, r4
-       addi    r3, r31, VCPU_FPRS_TM
-       bl      load_fp_state
-       addi    r3, r31, VCPU_VRS_TM
-       bl      load_vr_state
-       mr      r4, r31
-       lwz     r7, VCPU_VRSAVE_TM(r4)
-       mtspr   SPRN_VRSAVE, r7
-
-       ld      r5, VCPU_LR_TM(r4)
-       lwz     r6, VCPU_CR_TM(r4)
-       ld      r7, VCPU_CTR_TM(r4)
-       ld      r8, VCPU_AMR_TM(r4)
-       ld      r9, VCPU_TAR_TM(r4)
-       mtlr    r5
-       mtcr    r6
-       mtctr   r7
-       mtspr   SPRN_AMR, r8
-       mtspr   SPRN_TAR, r9
-
-       /*
-        * Load up PPR and DSCR values but don't put them in the actual SPRs
-        * till the last moment to avoid running with userspace PPR and DSCR for
-        * too long.
-        */
-       ld      r29, VCPU_DSCR_TM(r4)
-       ld      r30, VCPU_PPR_TM(r4)
-
-       std     r2, PACATMSCRATCH(r13) /* Save TOC */
-
-       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
-       li      r5, 0
-       mtmsrd  r5, 1
-
-       /* Load GPRs r0-r28 */
-       reg = 0
-       .rept   29
-       ld      reg, VCPU_GPRS_TM(reg)(r31)
-       reg = reg + 1
-       .endr
-
-       mtspr   SPRN_DSCR, r29
-       mtspr   SPRN_PPR, r30
-
-       /* Load final GPRs */
-       ld      29, VCPU_GPRS_TM(29)(r31)
-       ld      30, VCPU_GPRS_TM(30)(r31)
-       ld      31, VCPU_GPRS_TM(31)(r31)
-
-       /* TM checkpointed state is now setup.  All GPRs are now volatile. */
-       TRECHKPT
-
-       /* Now let's get back the state we need. */
-       HMT_MEDIUM
-       GET_PACA(r13)
-       ld      r29, HSTATE_DSCR(r13)
-       mtspr   SPRN_DSCR, r29
-       ld      r4, HSTATE_KVM_VCPU(r13)
-       ld      r1, HSTATE_HOST_R1(r13)
-       ld      r2, PACATMSCRATCH(r13)
-
-       /* Set the MSR RI since we have our registers back. */
-       li      r5, MSR_RI
-       mtmsrd  r5, 1
-skip_tm:
+       bl      kvmppc_restore_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
 
        /* Load guest PMU registers */
@@ -841,12 +771,6 @@ BEGIN_FTR_SECTION
        /* Skip next section on POWER7 */
        b       8f
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
-       /* Turn on TM so we can access TFHAR/TFIAR/TEXASR */
-       mfmsr   r8
-       li      r0, 1
-       rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
-       mtmsrd  r8
-
        /* Load up POWER8-specific registers */
        ld      r5, VCPU_IAMR(r4)
        lwz     r6, VCPU_PSPB(r4)
@@ -1436,106 +1360,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
-       b       2f
-END_FTR_SECTION_IFCLR(CPU_FTR_TM)
-       /* Turn on TM. */
-       mfmsr   r8
-       li      r0, 1
-       rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
-       mtmsrd  r8
-
-       ld      r5, VCPU_MSR(r9)
-       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
-       beq     1f      /* TM not active in guest. */
-
-       li      r3, TM_CAUSE_KVM_RESCHED
-
-       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
-       li      r5, 0
-       mtmsrd  r5, 1
-
-       /* All GPRs are volatile at this point. */
-       TRECLAIM(R3)
-
-       /* Temporarily store r13 and r9 so we have some regs to play with */
-       SET_SCRATCH0(r13)
-       GET_PACA(r13)
-       std     r9, PACATMSCRATCH(r13)
-       ld      r9, HSTATE_KVM_VCPU(r13)
-
-       /* Get a few more GPRs free. */
-       std     r29, VCPU_GPRS_TM(29)(r9)
-       std     r30, VCPU_GPRS_TM(30)(r9)
-       std     r31, VCPU_GPRS_TM(31)(r9)
-
-       /* Save away PPR and DSCR soon so don't run with user values. */
-       mfspr   r31, SPRN_PPR
-       HMT_MEDIUM
-       mfspr   r30, SPRN_DSCR
-       ld      r29, HSTATE_DSCR(r13)
-       mtspr   SPRN_DSCR, r29
-
-       /* Save all but r9, r13 & r29-r31 */
-       reg = 0
-       .rept   29
-       .if (reg != 9) && (reg != 13)
-       std     reg, VCPU_GPRS_TM(reg)(r9)
-       .endif
-       reg = reg + 1
-       .endr
-       /* ... now save r13 */
-       GET_SCRATCH0(r4)
-       std     r4, VCPU_GPRS_TM(13)(r9)
-       /* ... and save r9 */
-       ld      r4, PACATMSCRATCH(r13)
-       std     r4, VCPU_GPRS_TM(9)(r9)
-
-       /* Reload stack pointer and TOC. */
-       ld      r1, HSTATE_HOST_R1(r13)
-       ld      r2, PACATOC(r13)
-
-       /* Set MSR RI now we have r1 and r13 back. */
-       li      r5, MSR_RI
-       mtmsrd  r5, 1
-
-       /* Save away checkpinted SPRs. */
-       std     r31, VCPU_PPR_TM(r9)
-       std     r30, VCPU_DSCR_TM(r9)
-       mflr    r5
-       mfcr    r6
-       mfctr   r7
-       mfspr   r8, SPRN_AMR
-       mfspr   r10, SPRN_TAR
-       std     r5, VCPU_LR_TM(r9)
-       stw     r6, VCPU_CR_TM(r9)
-       std     r7, VCPU_CTR_TM(r9)
-       std     r8, VCPU_AMR_TM(r9)
-       std     r10, VCPU_TAR_TM(r9)
-
-       /* Restore r12 as trap number. */
-       lwz     r12, VCPU_TRAP(r9)
-
-       /* Save FP/VSX. */
-       addi    r3, r9, VCPU_FPRS_TM
-       bl      store_fp_state
-       addi    r3, r9, VCPU_VRS_TM
-       bl      store_vr_state
-       mfspr   r6, SPRN_VRSAVE
-       stw     r6, VCPU_VRSAVE_TM(r9)
-1:
-       /*
-        * We need to save these SPRs after the treclaim so that the software
-        * error code is recorded correctly in the TEXASR.  Also the user may
-        * change these outside of a transaction, so they must always be
-        * context switched.
-        */
-       mfspr   r5, SPRN_TFHAR
-       mfspr   r6, SPRN_TFIAR
-       mfspr   r7, SPRN_TEXASR
-       std     r5, VCPU_TFHAR(r9)
-       std     r6, VCPU_TFIAR(r9)
-       std     r7, VCPU_TEXASR(r9)
-2:
+       bl      kvmppc_save_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
 
        /* Increment yield count if they have a VPA */
@@ -1683,6 +1509,23 @@ BEGIN_FTR_SECTION
        mtspr   SPRN_DPDES, r8
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
+       /* If HMI, call kvmppc_realmode_hmi_handler() */
+       cmpwi   r12, BOOK3S_INTERRUPT_HMI
+       bne     27f
+       bl      kvmppc_realmode_hmi_handler
+       nop
+       li      r12, BOOK3S_INTERRUPT_HMI
+       /*
+        * At this point kvmppc_realmode_hmi_handler would have resync-ed
+        * the TB. Hence it is not required to subtract guest timebase
+        * offset from timebase. So, skip it.
+        *
+        * Also, do not call kvmppc_subcore_exit_guest() because it has
+        * been invoked as part of kvmppc_realmode_hmi_handler().
+        */
+       b       30f
+
+27:
        /* Subtract timebase offset from timebase */
        ld      r8,VCORE_TB_OFFSET(r5)
        cmpdi   r8,0
@@ -1698,8 +1541,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        addis   r8,r8,0x100             /* if so, increment upper 40 bits */
        mtspr   SPRN_TBU40,r8
 
+17:    bl      kvmppc_subcore_exit_guest
+       nop
+30:    ld      r5,HSTATE_KVM_VCORE(r13)
+       ld      r4,VCORE_KVM(r5)        /* pointer to struct kvm */
+
        /* Reset PCR */
-17:    ld      r0, VCORE_PCR(r5)
+       ld      r0, VCORE_PCR(r5)
        cmpdi   r0, 0
        beq     18f
        li      r0, 0
@@ -2245,6 +2093,13 @@ _GLOBAL(kvmppc_h_cede)           /* r3 = vcpu pointer, r11 = msr, r13 = paca */
        /* save FP state */
        bl      kvmppc_save_fp
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+       ld      r9, HSTATE_KVM_VCPU(r13)
+       bl      kvmppc_save_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
+#endif
+
        /*
         * Set DEC to the smaller of DEC and HDEC, so that we wake
         * no later than the end of our timeslice (HDEC interrupts
@@ -2321,6 +2176,12 @@ kvm_end_cede:
        bl      kvmhv_accumulate_time
 #endif
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+       bl      kvmppc_restore_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
+#endif
+
        /* load up FP state */
        bl      kvmppc_load_fp
 
@@ -2461,6 +2322,8 @@ BEGIN_FTR_SECTION
        cmpwi   r6, 3                   /* hypervisor doorbell? */
        beq     3f
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       cmpwi   r6, 0xa                 /* Hypervisor maintenance ? */
+       beq     4f
        li      r3, 1                   /* anything else, return 1 */
 0:     blr
 
@@ -2482,6 +2345,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        li      r3, -1
        blr
 
+       /* Woken up due to Hypervisor maintenance interrupt */
+4:     li      r12, BOOK3S_INTERRUPT_HMI
+       li      r3, 1
+       blr
+
 /*
  * Determine what sort of external interrupt is pending (if any).
  * Returns:
@@ -2631,6 +2499,239 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
        mr      r4,r31
        blr
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * Save transactional state and TM-related registers.
+ * Called with r9 pointing to the vcpu struct.
+ * This can modify all checkpointed registers, but
+ * restores r1, r2 and r9 (vcpu pointer) before exit.
+ */
+kvmppc_save_tm:
+       mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+
+       /* Turn on TM. */
+       mfmsr   r8
+       li      r0, 1
+       rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+       mtmsrd  r8
+
+       ld      r5, VCPU_MSR(r9)
+       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+       beq     1f      /* TM not active in guest. */
+
+       std     r1, HSTATE_HOST_R1(r13)
+       li      r3, TM_CAUSE_KVM_RESCHED
+
+       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
+       li      r5, 0
+       mtmsrd  r5, 1
+
+       /* All GPRs are volatile at this point. */
+       TRECLAIM(R3)
+
+       /* Temporarily store r13 and r9 so we have some regs to play with */
+       SET_SCRATCH0(r13)
+       GET_PACA(r13)
+       std     r9, PACATMSCRATCH(r13)
+       ld      r9, HSTATE_KVM_VCPU(r13)
+
+       /* Get a few more GPRs free. */
+       std     r29, VCPU_GPRS_TM(29)(r9)
+       std     r30, VCPU_GPRS_TM(30)(r9)
+       std     r31, VCPU_GPRS_TM(31)(r9)
+
+       /* Save away PPR and DSCR soon so we don't run with user values. */
+       mfspr   r31, SPRN_PPR
+       HMT_MEDIUM
+       mfspr   r30, SPRN_DSCR
+       ld      r29, HSTATE_DSCR(r13)
+       mtspr   SPRN_DSCR, r29
+
+       /* Save all but r9, r13 & r29-r31 */
+       reg = 0
+       .rept   29
+       .if (reg != 9) && (reg != 13)
+       std     reg, VCPU_GPRS_TM(reg)(r9)
+       .endif
+       reg = reg + 1
+       .endr
+       /* ... now save r13 */
+       GET_SCRATCH0(r4)
+       std     r4, VCPU_GPRS_TM(13)(r9)
+       /* ... and save r9 */
+       ld      r4, PACATMSCRATCH(r13)
+       std     r4, VCPU_GPRS_TM(9)(r9)
+
+       /* Reload stack pointer and TOC. */
+       ld      r1, HSTATE_HOST_R1(r13)
+       ld      r2, PACATOC(r13)
+
+       /* Set MSR RI now we have r1 and r13 back. */
+       li      r5, MSR_RI
+       mtmsrd  r5, 1
+
+       /* Save away checkpointed SPRs. */
+       std     r31, VCPU_PPR_TM(r9)
+       std     r30, VCPU_DSCR_TM(r9)
+       mflr    r5
+       mfcr    r6
+       mfctr   r7
+       mfspr   r8, SPRN_AMR
+       mfspr   r10, SPRN_TAR
+       std     r5, VCPU_LR_TM(r9)
+       stw     r6, VCPU_CR_TM(r9)
+       std     r7, VCPU_CTR_TM(r9)
+       std     r8, VCPU_AMR_TM(r9)
+       std     r10, VCPU_TAR_TM(r9)
+
+       /* Restore r12 as trap number. */
+       lwz     r12, VCPU_TRAP(r9)
+
+       /* Save FP/VSX. */
+       addi    r3, r9, VCPU_FPRS_TM
+       bl      store_fp_state
+       addi    r3, r9, VCPU_VRS_TM
+       bl      store_vr_state
+       mfspr   r6, SPRN_VRSAVE
+       stw     r6, VCPU_VRSAVE_TM(r9)
+1:
+       /*
+        * We need to save these SPRs after the treclaim so that the software
+        * error code is recorded correctly in the TEXASR.  Also the user may
+        * change these outside of a transaction, so they must always be
+        * context switched.
+        */
+       mfspr   r5, SPRN_TFHAR
+       mfspr   r6, SPRN_TFIAR
+       mfspr   r7, SPRN_TEXASR
+       std     r5, VCPU_TFHAR(r9)
+       std     r6, VCPU_TFIAR(r9)
+       std     r7, VCPU_TEXASR(r9)
+
+       ld      r0, PPC_LR_STKOFF(r1)
+       mtlr    r0
+       blr
+
+/*
+ * Restore transactional state and TM-related registers.
+ * Called with r4 pointing to the vcpu struct.
+ * This potentially modifies all checkpointed registers.
+ * It restores r1, r2, r4 from the PACA.
+ */
+kvmppc_restore_tm:
+       mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+
+       /* Turn on TM/FP/VSX/VMX so we can restore them. */
+       mfmsr   r5
+       li      r6, MSR_TM >> 32
+       sldi    r6, r6, 32
+       or      r5, r5, r6
+       ori     r5, r5, MSR_FP
+       oris    r5, r5, (MSR_VEC | MSR_VSX)@h
+       mtmsrd  r5
+
+       /*
+        * The user may change these outside of a transaction, so they must
+        * always be context switched.
+        */
+       ld      r5, VCPU_TFHAR(r4)
+       ld      r6, VCPU_TFIAR(r4)
+       ld      r7, VCPU_TEXASR(r4)
+       mtspr   SPRN_TFHAR, r5
+       mtspr   SPRN_TFIAR, r6
+       mtspr   SPRN_TEXASR, r7
+
+       ld      r5, VCPU_MSR(r4)
+       rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+       beqlr           /* TM not active in guest */
+       std     r1, HSTATE_HOST_R1(r13)
+
+       /* Make sure the failure summary is set, otherwise we'll program check
+        * when we trechkpt.  It's possible that this might have been not set
+        * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
+        * host.
+        */
+       oris    r7, r7, (TEXASR_FS)@h
+       mtspr   SPRN_TEXASR, r7
+
+       /*
+        * We need to load up the checkpointed state for the guest.
+        * We need to do this early as it will blow away any GPRs, VSRs and
+        * some SPRs.
+        */
+
+       mr      r31, r4
+       addi    r3, r31, VCPU_FPRS_TM
+       bl      load_fp_state
+       addi    r3, r31, VCPU_VRS_TM
+       bl      load_vr_state
+       mr      r4, r31
+       lwz     r7, VCPU_VRSAVE_TM(r4)
+       mtspr   SPRN_VRSAVE, r7
+
+       ld      r5, VCPU_LR_TM(r4)
+       lwz     r6, VCPU_CR_TM(r4)
+       ld      r7, VCPU_CTR_TM(r4)
+       ld      r8, VCPU_AMR_TM(r4)
+       ld      r9, VCPU_TAR_TM(r4)
+       mtlr    r5
+       mtcr    r6
+       mtctr   r7
+       mtspr   SPRN_AMR, r8
+       mtspr   SPRN_TAR, r9
+
+       /*
+        * Load up PPR and DSCR values but don't put them in the actual SPRs
+        * till the last moment to avoid running with userspace PPR and DSCR for
+        * too long.
+        */
+       ld      r29, VCPU_DSCR_TM(r4)
+       ld      r30, VCPU_PPR_TM(r4)
+
+       std     r2, PACATMSCRATCH(r13) /* Save TOC */
+
+       /* Clear the MSR RI since r1, r13 are all going to be foobar. */
+       li      r5, 0
+       mtmsrd  r5, 1
+
+       /* Load GPRs r0-r28 */
+       reg = 0
+       .rept   29
+       ld      reg, VCPU_GPRS_TM(reg)(r31)
+       reg = reg + 1
+       .endr
+
+       mtspr   SPRN_DSCR, r29
+       mtspr   SPRN_PPR, r30
+
+       /* Load final GPRs */
+       ld      29, VCPU_GPRS_TM(29)(r31)
+       ld      30, VCPU_GPRS_TM(30)(r31)
+       ld      31, VCPU_GPRS_TM(31)(r31)
+
+       /* TM checkpointed state is now setup.  All GPRs are now volatile. */
+       TRECHKPT
+
+       /* Now let's get back the state we need. */
+       HMT_MEDIUM
+       GET_PACA(r13)
+       ld      r29, HSTATE_DSCR(r13)
+       mtspr   SPRN_DSCR, r29
+       ld      r4, HSTATE_KVM_VCPU(r13)
+       ld      r1, HSTATE_HOST_R1(r13)
+       ld      r2, PACATMSCRATCH(r13)
+
+       /* Set the MSR RI since we have our registers back. */
+       li      r5, MSR_RI
+       mtmsrd  r5, 1
+
+       ld      r0, PPC_LR_STKOFF(r1)
+       mtlr    r0
+       blr
+#endif
+
 /*
  * We come here if we get any exception or interrupt while we are
  * executing host real mode code while in guest MMU context.
index c4f7d6b..e76f79a 100644 (file)
@@ -914,7 +914,7 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
        /* We get here with MSR.EE=1 */
 
        trace_kvm_exit(exit_nr, vcpu);
-       kvm_guest_exit();
+       guest_exit();
 
        switch (exit_nr) {
        case BOOK3S_INTERRUPT_INST_STORAGE:
@@ -1049,7 +1049,17 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
                int emul;
 
 program_interrupt:
-               flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+               /*
+                * shadow_srr1 only contains valid flags if we came here via
+                * a program exception. The other exceptions (emulation assist,
+                * FP unavailable, etc.) do not provide flags in SRR1, so use
+                * an illegal-instruction exception when injecting a program
+                * interrupt into the guest.
+                */
+               if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
+                       flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+               else
+                       flags = SRR1_PROGILL;
 
                emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
                if (emul != EMULATE_DONE) {
@@ -1531,7 +1541,7 @@ static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
        kvmppc_clear_debug(vcpu);
 
-       /* No need for kvm_guest_exit. It's done in handle_exit.
+       /* No need for guest_exit. It's done in handle_exit.
           We also get here with interrupts enabled. */
 
        /* Make sure we save the guest FPU/Altivec/VSX state */
index 4afae69..02b4672 100644 (file)
@@ -776,7 +776,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
        ret = __kvmppc_vcpu_run(kvm_run, vcpu);
 
-       /* No need for kvm_guest_exit. It's done in handle_exit.
+       /* No need for guest_exit. It's done in handle_exit.
           We also get here with interrupts enabled. */
 
        /* Switch back to user space debug context */
@@ -1012,7 +1012,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
        }
 
        trace_kvm_exit(exit_nr, vcpu);
-       __kvm_guest_exit();
+       guest_exit_irqoff();
 
        local_irq_enable();
 
index 5cc2e7a..b379146 100644 (file)
@@ -302,7 +302,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                        advance = 0;
                        printk(KERN_ERR "Couldn't emulate instruction 0x%08x "
                               "(op %d xop %d)\n", inst, get_op(inst), get_xop(inst));
-                       kvmppc_core_queue_program(vcpu, 0);
                }
        }
 
index 6249cdc..ed38f81 100644 (file)
@@ -1823,7 +1823,8 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
        return 0;
 }
 
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+                         struct kvm_kernel_irq_routing_entry *e,
                          const struct kvm_irq_routing_entry *ue)
 {
        int r = -EINVAL;
index 02416fe..6ce40dd 100644 (file)
@@ -119,7 +119,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
                        continue;
                }
 
-               __kvm_guest_enter();
+               guest_enter_irqoff();
                return 1;
        }
 
@@ -588,6 +588,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = 1;
                break;
 #endif
+       case KVM_CAP_PPC_HTM:
+               r = cpu_has_feature(CPU_FTR_TM_COMP) &&
+                   is_kvmppc_hv_enabled(kvm);
+               break;
        default:
                r = 0;
                break;
index cf928bb..3d29d40 100644 (file)
@@ -64,7 +64,6 @@ END_FTR_SECTION(0, 1);                                                \
        OPAL_BRANCH(opal_tracepoint_entry) \
        mfcr    r12;                    \
        stw     r12,8(r1);              \
-       std     r1,PACAR1(r13);         \
        li      r11,0;                  \
        mfmsr   r12;                    \
        ori     r11,r11,MSR_EE;         \
@@ -127,7 +126,6 @@ opal_tracepoint_entry:
        mfcr    r12
        std     r11,16(r1)
        stw     r12,8(r1)
-       std     r1,PACAR1(r13)
        li      r11,0
        mfmsr   r12
        ori     r11,r11,MSR_EE
index 67d43a0..28f03ca 100644 (file)
 #include <asm/ebcdic.h>
 #include "hypfs.h"
 
-#define LPAR_NAME_LEN 8                /* lpar name len in diag 204 data */
-#define CPU_NAME_LEN 16                /* type name len of cpus in diag224 name table */
 #define TMP_SIZE 64            /* size of temporary buffers */
 
 #define DBFS_D204_HDR_VERSION  0
 
-/* diag 204 subcodes */
-enum diag204_sc {
-       SUBC_STIB4 = 4,
-       SUBC_RSI = 5,
-       SUBC_STIB6 = 6,
-       SUBC_STIB7 = 7
-};
-
-/* The two available diag 204 data formats */
-enum diag204_format {
-       INFO_SIMPLE = 0,
-       INFO_EXT = 0x00010000
-};
-
-/* bit is set in flags, when physical cpu info is included in diag 204 data */
-#define LPAR_PHYS_FLG  0x80
-
 static char *diag224_cpu_names;                        /* diag 224 name table */
 static enum diag204_sc diag204_store_sc;       /* used subcode for store */
 static enum diag204_format diag204_info_type;  /* used diag 204 data format */
@@ -53,7 +34,7 @@ static int diag204_buf_pages;         /* number of pages for diag204 data */
 static struct dentry *dbfs_d204_file;
 
 /*
- * DIAG 204 data structures and member access functions.
+ * DIAG 204 member access functions.
  *
  * Since we have two different diag 204 data formats for old and new s390
  * machines, we do not access the structs directly, but use getter functions for
@@ -62,304 +43,173 @@ static struct dentry *dbfs_d204_file;
 
 /* Time information block */
 
-struct info_blk_hdr {
-       __u8  npar;
-       __u8  flags;
-       __u16 tslice;
-       __u16 phys_cpus;
-       __u16 this_part;
-       __u64 curtod;
-} __attribute__ ((packed));
-
-struct x_info_blk_hdr {
-       __u8  npar;
-       __u8  flags;
-       __u16 tslice;
-       __u16 phys_cpus;
-       __u16 this_part;
-       __u64 curtod1;
-       __u64 curtod2;
-       char reserved[40];
-} __attribute__ ((packed));
-
 static inline int info_blk_hdr__size(enum diag204_format type)
 {
-       if (type == INFO_SIMPLE)
-               return sizeof(struct info_blk_hdr);
-       else /* INFO_EXT */
-               return sizeof(struct x_info_blk_hdr);
+       if (type == DIAG204_INFO_SIMPLE)
+               return sizeof(struct diag204_info_blk_hdr);
+       else /* DIAG204_INFO_EXT */
+               return sizeof(struct diag204_x_info_blk_hdr);
 }
 
 static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct info_blk_hdr *)hdr)->npar;
-       else /* INFO_EXT */
-               return ((struct x_info_blk_hdr *)hdr)->npar;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_info_blk_hdr *)hdr)->npar;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_info_blk_hdr *)hdr)->npar;
 }
 
 static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct info_blk_hdr *)hdr)->flags;
-       else /* INFO_EXT */
-               return ((struct x_info_blk_hdr *)hdr)->flags;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_info_blk_hdr *)hdr)->flags;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_info_blk_hdr *)hdr)->flags;
 }
 
 static inline __u16 info_blk_hdr__pcpus(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct info_blk_hdr *)hdr)->phys_cpus;
-       else /* INFO_EXT */
-               return ((struct x_info_blk_hdr *)hdr)->phys_cpus;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_info_blk_hdr *)hdr)->phys_cpus;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_info_blk_hdr *)hdr)->phys_cpus;
 }
 
 /* Partition header */
 
-struct part_hdr {
-       __u8 pn;
-       __u8 cpus;
-       char reserved[6];
-       char part_name[LPAR_NAME_LEN];
-} __attribute__ ((packed));
-
-struct x_part_hdr {
-       __u8  pn;
-       __u8  cpus;
-       __u8  rcpus;
-       __u8  pflag;
-       __u32 mlu;
-       char  part_name[LPAR_NAME_LEN];
-       char  lpc_name[8];
-       char  os_name[8];
-       __u64 online_cs;
-       __u64 online_es;
-       __u8  upid;
-       char  reserved1[3];
-       __u32 group_mlu;
-       char  group_name[8];
-       char  reserved2[32];
-} __attribute__ ((packed));
-
 static inline int part_hdr__size(enum diag204_format type)
 {
-       if (type == INFO_SIMPLE)
-               return sizeof(struct part_hdr);
-       else /* INFO_EXT */
-               return sizeof(struct x_part_hdr);
+       if (type == DIAG204_INFO_SIMPLE)
+               return sizeof(struct diag204_part_hdr);
+       else /* DIAG204_INFO_EXT */
+               return sizeof(struct diag204_x_part_hdr);
 }
 
 static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct part_hdr *)hdr)->cpus;
-       else /* INFO_EXT */
-               return ((struct x_part_hdr *)hdr)->rcpus;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_part_hdr *)hdr)->cpus;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_part_hdr *)hdr)->rcpus;
 }
 
 static inline void part_hdr__part_name(enum diag204_format type, void *hdr,
                                       char *name)
 {
-       if (type == INFO_SIMPLE)
-               memcpy(name, ((struct part_hdr *)hdr)->part_name,
-                      LPAR_NAME_LEN);
-       else /* INFO_EXT */
-               memcpy(name, ((struct x_part_hdr *)hdr)->part_name,
-                      LPAR_NAME_LEN);
-       EBCASC(name, LPAR_NAME_LEN);
-       name[LPAR_NAME_LEN] = 0;
+       if (type == DIAG204_INFO_SIMPLE)
+               memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name,
+                      DIAG204_LPAR_NAME_LEN);
+       else /* DIAG204_INFO_EXT */
+               memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name,
+                      DIAG204_LPAR_NAME_LEN);
+       EBCASC(name, DIAG204_LPAR_NAME_LEN);
+       name[DIAG204_LPAR_NAME_LEN] = 0;
        strim(name);
 }
 
-struct cpu_info {
-       __u16 cpu_addr;
-       char  reserved1[2];
-       __u8  ctidx;
-       __u8  cflag;
-       __u16 weight;
-       __u64 acc_time;
-       __u64 lp_time;
-} __attribute__ ((packed));
-
-struct x_cpu_info {
-       __u16 cpu_addr;
-       char  reserved1[2];
-       __u8  ctidx;
-       __u8  cflag;
-       __u16 weight;
-       __u64 acc_time;
-       __u64 lp_time;
-       __u16 min_weight;
-       __u16 cur_weight;
-       __u16 max_weight;
-       char  reseved2[2];
-       __u64 online_time;
-       __u64 wait_time;
-       __u32 pma_weight;
-       __u32 polar_weight;
-       char  reserved3[40];
-} __attribute__ ((packed));
-
 /* CPU info block */
 
 static inline int cpu_info__size(enum diag204_format type)
 {
-       if (type == INFO_SIMPLE)
-               return sizeof(struct cpu_info);
-       else /* INFO_EXT */
-               return sizeof(struct x_cpu_info);
+       if (type == DIAG204_INFO_SIMPLE)
+               return sizeof(struct diag204_cpu_info);
+       else /* DIAG204_INFO_EXT */
+               return sizeof(struct diag204_x_cpu_info);
 }
 
 static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct cpu_info *)hdr)->ctidx;
-       else /* INFO_EXT */
-               return ((struct x_cpu_info *)hdr)->ctidx;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_cpu_info *)hdr)->ctidx;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_cpu_info *)hdr)->ctidx;
 }
 
 static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct cpu_info *)hdr)->cpu_addr;
-       else /* INFO_EXT */
-               return ((struct x_cpu_info *)hdr)->cpu_addr;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_cpu_info *)hdr)->cpu_addr;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_cpu_info *)hdr)->cpu_addr;
 }
 
 static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct cpu_info *)hdr)->acc_time;
-       else /* INFO_EXT */
-               return ((struct x_cpu_info *)hdr)->acc_time;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_cpu_info *)hdr)->acc_time;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_cpu_info *)hdr)->acc_time;
 }
 
 static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct cpu_info *)hdr)->lp_time;
-       else /* INFO_EXT */
-               return ((struct x_cpu_info *)hdr)->lp_time;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_cpu_info *)hdr)->lp_time;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_cpu_info *)hdr)->lp_time;
 }
 
 static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
+       if (type == DIAG204_INFO_SIMPLE)
                return 0;       /* online_time not available in simple info */
-       else /* INFO_EXT */
-               return ((struct x_cpu_info *)hdr)->online_time;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_cpu_info *)hdr)->online_time;
 }
 
 /* Physical header */
 
-struct phys_hdr {
-       char reserved1[1];
-       __u8 cpus;
-       char reserved2[6];
-       char mgm_name[8];
-} __attribute__ ((packed));
-
-struct x_phys_hdr {
-       char reserved1[1];
-       __u8 cpus;
-       char reserved2[6];
-       char mgm_name[8];
-       char reserved3[80];
-} __attribute__ ((packed));
-
 static inline int phys_hdr__size(enum diag204_format type)
 {
-       if (type == INFO_SIMPLE)
-               return sizeof(struct phys_hdr);
-       else /* INFO_EXT */
-               return sizeof(struct x_phys_hdr);
+       if (type == DIAG204_INFO_SIMPLE)
+               return sizeof(struct diag204_phys_hdr);
+       else /* DIAG204_INFO_EXT */
+               return sizeof(struct diag204_x_phys_hdr);
 }
 
 static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct phys_hdr *)hdr)->cpus;
-       else /* INFO_EXT */
-               return ((struct x_phys_hdr *)hdr)->cpus;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_phys_hdr *)hdr)->cpus;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_phys_hdr *)hdr)->cpus;
 }
 
 /* Physical CPU info block */
 
-struct phys_cpu {
-       __u16 cpu_addr;
-       char  reserved1[2];
-       __u8  ctidx;
-       char  reserved2[3];
-       __u64 mgm_time;
-       char  reserved3[8];
-} __attribute__ ((packed));
-
-struct x_phys_cpu {
-       __u16 cpu_addr;
-       char  reserved1[2];
-       __u8  ctidx;
-       char  reserved2[3];
-       __u64 mgm_time;
-       char  reserved3[80];
-} __attribute__ ((packed));
-
 static inline int phys_cpu__size(enum diag204_format type)
 {
-       if (type == INFO_SIMPLE)
-               return sizeof(struct phys_cpu);
-       else /* INFO_EXT */
-               return sizeof(struct x_phys_cpu);
+       if (type == DIAG204_INFO_SIMPLE)
+               return sizeof(struct diag204_phys_cpu);
+       else /* DIAG204_INFO_EXT */
+               return sizeof(struct diag204_x_phys_cpu);
 }
 
 static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct phys_cpu *)hdr)->cpu_addr;
-       else /* INFO_EXT */
-               return ((struct x_phys_cpu *)hdr)->cpu_addr;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_phys_cpu *)hdr)->cpu_addr;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr;
 }
 
 static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct phys_cpu *)hdr)->mgm_time;
-       else /* INFO_EXT */
-               return ((struct x_phys_cpu *)hdr)->mgm_time;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_phys_cpu *)hdr)->mgm_time;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_phys_cpu *)hdr)->mgm_time;
 }
 
 static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr)
 {
-       if (type == INFO_SIMPLE)
-               return ((struct phys_cpu *)hdr)->ctidx;
-       else /* INFO_EXT */
-               return ((struct x_phys_cpu *)hdr)->ctidx;
+       if (type == DIAG204_INFO_SIMPLE)
+               return ((struct diag204_phys_cpu *)hdr)->ctidx;
+       else /* DIAG204_INFO_EXT */
+               return ((struct diag204_x_phys_cpu *)hdr)->ctidx;
 }
 
 /* Diagnose 204 functions */
-
-static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr)
-{
-       register unsigned long _subcode asm("0") = *subcode;
-       register unsigned long _size asm("1") = size;
-
-       asm volatile(
-               "       diag    %2,%0,0x204\n"
-               "0:     nopr    %%r7\n"
-               EX_TABLE(0b,0b)
-               : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
-       *subcode = _subcode;
-       return _size;
-}
-
-static int diag204(unsigned long subcode, unsigned long size, void *addr)
-{
-       diag_stat_inc(DIAG_STAT_X204);
-       size = __diag204(&subcode, size, addr);
-       if (subcode)
-               return -1;
-       return size;
-}
-
 /*
  * For the old diag subcode 4 with simple data format we have to use real
  * memory. If we use subcode 6 or 7 with extended data format, we can (and
@@ -411,12 +261,12 @@ static void *diag204_get_buffer(enum diag204_format fmt, int *pages)
                *pages = diag204_buf_pages;
                return diag204_buf;
        }
-       if (fmt == INFO_SIMPLE) {
+       if (fmt == DIAG204_INFO_SIMPLE) {
                *pages = 1;
                return diag204_alloc_rbuf();
-       } else {/* INFO_EXT */
-               *pages = diag204((unsigned long)SUBC_RSI |
-                                (unsigned long)INFO_EXT, 0, NULL);
+       } else {/* DIAG204_INFO_EXT */
+               *pages = diag204((unsigned long)DIAG204_SUBC_RSI |
+                                (unsigned long)DIAG204_INFO_EXT, 0, NULL);
                if (*pages <= 0)
                        return ERR_PTR(-ENOSYS);
                else
@@ -443,18 +293,18 @@ static int diag204_probe(void)
        void *buf;
        int pages, rc;
 
-       buf = diag204_get_buffer(INFO_EXT, &pages);
+       buf = diag204_get_buffer(DIAG204_INFO_EXT, &pages);
        if (!IS_ERR(buf)) {
-               if (diag204((unsigned long)SUBC_STIB7 |
-                           (unsigned long)INFO_EXT, pages, buf) >= 0) {
-                       diag204_store_sc = SUBC_STIB7;
-                       diag204_info_type = INFO_EXT;
+               if (diag204((unsigned long)DIAG204_SUBC_STIB7 |
+                           (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) {
+                       diag204_store_sc = DIAG204_SUBC_STIB7;
+                       diag204_info_type = DIAG204_INFO_EXT;
                        goto out;
                }
-               if (diag204((unsigned long)SUBC_STIB6 |
-                           (unsigned long)INFO_EXT, pages, buf) >= 0) {
-                       diag204_store_sc = SUBC_STIB6;
-                       diag204_info_type = INFO_EXT;
+               if (diag204((unsigned long)DIAG204_SUBC_STIB6 |
+                           (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) {
+                       diag204_store_sc = DIAG204_SUBC_STIB6;
+                       diag204_info_type = DIAG204_INFO_EXT;
                        goto out;
                }
                diag204_free_buffer();
@@ -462,15 +312,15 @@ static int diag204_probe(void)
 
        /* subcodes 6 and 7 failed, now try subcode 4 */
 
-       buf = diag204_get_buffer(INFO_SIMPLE, &pages);
+       buf = diag204_get_buffer(DIAG204_INFO_SIMPLE, &pages);
        if (IS_ERR(buf)) {
                rc = PTR_ERR(buf);
                goto fail_alloc;
        }
-       if (diag204((unsigned long)SUBC_STIB4 |
-                   (unsigned long)INFO_SIMPLE, pages, buf) >= 0) {
-               diag204_store_sc = SUBC_STIB4;
-               diag204_info_type = INFO_SIMPLE;
+       if (diag204((unsigned long)DIAG204_SUBC_STIB4 |
+                   (unsigned long)DIAG204_INFO_SIMPLE, pages, buf) >= 0) {
+               diag204_store_sc = DIAG204_SUBC_STIB4;
+               diag204_info_type = DIAG204_INFO_SIMPLE;
                goto out;
        } else {
                rc = -ENOSYS;
@@ -510,20 +360,6 @@ out:
 
 /* Diagnose 224 functions */
 
-static int diag224(void *ptr)
-{
-       int rc = -EOPNOTSUPP;
-
-       diag_stat_inc(DIAG_STAT_X224);
-       asm volatile(
-               "       diag    %1,%2,0x224\n"
-               "0:     lhi     %0,0x0\n"
-               "1:\n"
-               EX_TABLE(0b,1b)
-               : "+d" (rc) :"d" (0), "d" (ptr) : "memory");
-       return rc;
-}
-
 static int diag224_get_name_table(void)
 {
        /* memory must be below 2GB */
@@ -545,9 +381,9 @@ static void diag224_delete_name_table(void)
 
 static int diag224_idx2name(int index, char *name)
 {
-       memcpy(name, diag224_cpu_names + ((index + 1) * CPU_NAME_LEN),
-               CPU_NAME_LEN);
-       name[CPU_NAME_LEN] = 0;
+       memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN),
+              DIAG204_CPU_NAME_LEN);
+       name[DIAG204_CPU_NAME_LEN] = 0;
        strim(name);
        return 0;
 }
@@ -603,7 +439,7 @@ __init int hypfs_diag_init(void)
                pr_err("The hardware system does not support hypfs\n");
                return -ENODATA;
        }
-       if (diag204_info_type == INFO_EXT) {
+       if (diag204_info_type == DIAG204_INFO_EXT) {
                rc = hypfs_dbfs_create_file(&dbfs_file_d204);
                if (rc)
                        return rc;
@@ -651,7 +487,7 @@ static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info)
                              cpu_info__lp_time(diag204_info_type, cpu_info));
        if (IS_ERR(rc))
                return PTR_ERR(rc);
-       if (diag204_info_type == INFO_EXT) {
+       if (diag204_info_type == DIAG204_INFO_EXT) {
                rc = hypfs_create_u64(cpu_dir, "onlinetime",
                                      cpu_info__online_time(diag204_info_type,
                                                            cpu_info));
@@ -667,12 +503,12 @@ static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr)
 {
        struct dentry *cpus_dir;
        struct dentry *lpar_dir;
-       char lpar_name[LPAR_NAME_LEN + 1];
+       char lpar_name[DIAG204_LPAR_NAME_LEN + 1];
        void *cpu_info;
        int i;
 
        part_hdr__part_name(diag204_info_type, part_hdr, lpar_name);
-       lpar_name[LPAR_NAME_LEN] = 0;
+       lpar_name[DIAG204_LPAR_NAME_LEN] = 0;
        lpar_dir = hypfs_mkdir(systems_dir, lpar_name);
        if (IS_ERR(lpar_dir))
                return lpar_dir;
@@ -755,7 +591,8 @@ int hypfs_diag_create_files(struct dentry *root)
                        goto err_out;
                }
        }
-       if (info_blk_hdr__flags(diag204_info_type, time_hdr) & LPAR_PHYS_FLG) {
+       if (info_blk_hdr__flags(diag204_info_type, time_hdr) &
+           DIAG204_LPAR_PHYS_FLG) {
                ptr = hypfs_create_phys_files(root, part_hdr);
                if (IS_ERR(ptr)) {
                        rc = PTR_ERR(ptr);
index 1a82cf2..d28621d 100644 (file)
@@ -20,6 +20,9 @@
 #define CPACF_KMC              0xb92f          /* MSA  */
 #define CPACF_KIMD             0xb93e          /* MSA  */
 #define CPACF_KLMD             0xb93f          /* MSA  */
+#define CPACF_PCKMO            0xb928          /* MSA3 */
+#define CPACF_KMF              0xb92a          /* MSA4 */
+#define CPACF_KMO              0xb92b          /* MSA4 */
 #define CPACF_PCC              0xb92c          /* MSA4 */
 #define CPACF_KMCTR            0xb92d          /* MSA4 */
 #define CPACF_PPNO             0xb93c          /* MSA5 */
@@ -136,6 +139,7 @@ static inline void __cpacf_query(unsigned int opcode, unsigned char *status)
        register unsigned long r1 asm("1") = (unsigned long) status;
 
        asm volatile(
+               "       spm 0\n" /* pckmo doesn't change the cc */
                /* Parameter registers are ignored, but may not be 0 */
                "0:     .insn   rrf,%[opc] << 16,2,2,2,0\n"
                "       brc     1,0b\n" /* handle partial completion */
@@ -157,6 +161,12 @@ static inline int cpacf_query(unsigned int opcode, unsigned int func)
                if (!test_facility(17)) /* check for MSA */
                        return 0;
                break;
+       case CPACF_PCKMO:
+               if (!test_facility(76)) /* check for MSA3 */
+                       return 0;
+               break;
+       case CPACF_KMF:
+       case CPACF_KMO:
        case CPACF_PCC:
        case CPACF_KMCTR:
                if (!test_facility(77)) /* check for MSA4 */
index 86cae09..8acf482 100644 (file)
@@ -78,4 +78,153 @@ struct diag210 {
 
 extern int diag210(struct diag210 *addr);
 
+/* bit is set in flags when physical cpu info is included in diag 204 data */
+#define DIAG204_LPAR_PHYS_FLG 0x80
+#define DIAG204_LPAR_NAME_LEN 8                /* lpar name len in diag 204 data */
+#define DIAG204_CPU_NAME_LEN 16                /* type name len of cpus in diag224 name table */
+
+/* diag 204 subcodes */
+enum diag204_sc {
+       DIAG204_SUBC_STIB4 = 4,
+       DIAG204_SUBC_RSI = 5,
+       DIAG204_SUBC_STIB6 = 6,
+       DIAG204_SUBC_STIB7 = 7
+};
+
+/* The two available diag 204 data formats */
+enum diag204_format {
+       DIAG204_INFO_SIMPLE = 0,
+       DIAG204_INFO_EXT = 0x00010000
+};
+
+enum diag204_cpu_flags {
+       DIAG204_CPU_ONLINE = 0x20,
+       DIAG204_CPU_CAPPED = 0x40,
+};
+
+struct diag204_info_blk_hdr {
+       __u8  npar;
+       __u8  flags;
+       __u16 tslice;
+       __u16 phys_cpus;
+       __u16 this_part;
+       __u64 curtod;
+} __packed;
+
+struct diag204_x_info_blk_hdr {
+       __u8  npar;
+       __u8  flags;
+       __u16 tslice;
+       __u16 phys_cpus;
+       __u16 this_part;
+       __u64 curtod1;
+       __u64 curtod2;
+       char reserved[40];
+} __packed;
+
+struct diag204_part_hdr {
+       __u8 pn;
+       __u8 cpus;
+       char reserved[6];
+       char part_name[DIAG204_LPAR_NAME_LEN];
+} __packed;
+
+struct diag204_x_part_hdr {
+       __u8  pn;
+       __u8  cpus;
+       __u8  rcpus;
+       __u8  pflag;
+       __u32 mlu;
+       char  part_name[DIAG204_LPAR_NAME_LEN];
+       char  lpc_name[8];
+       char  os_name[8];
+       __u64 online_cs;
+       __u64 online_es;
+       __u8  upid;
+       __u8  reserved:3;
+       __u8  mtid:5;
+       char  reserved1[2];
+       __u32 group_mlu;
+       char  group_name[8];
+       char  hardware_group_name[8];
+       char  reserved2[24];
+} __packed;
+
+struct diag204_cpu_info {
+       __u16 cpu_addr;
+       char  reserved1[2];
+       __u8  ctidx;
+       __u8  cflag;
+       __u16 weight;
+       __u64 acc_time;
+       __u64 lp_time;
+} __packed;
+
+struct diag204_x_cpu_info {
+       __u16 cpu_addr;
+       char  reserved1[2];
+       __u8  ctidx;
+       __u8  cflag;
+       __u16 weight;
+       __u64 acc_time;
+       __u64 lp_time;
+       __u16 min_weight;
+       __u16 cur_weight;
+       __u16 max_weight;
+       char  reseved2[2];
+       __u64 online_time;
+       __u64 wait_time;
+       __u32 pma_weight;
+       __u32 polar_weight;
+       __u32 cpu_type_cap;
+       __u32 group_cpu_type_cap;
+       char  reserved3[32];
+} __packed;
+
+struct diag204_phys_hdr {
+       char reserved1[1];
+       __u8 cpus;
+       char reserved2[6];
+       char mgm_name[8];
+} __packed;
+
+struct diag204_x_phys_hdr {
+       char reserved1[1];
+       __u8 cpus;
+       char reserved2[6];
+       char mgm_name[8];
+       char reserved3[80];
+} __packed;
+
+struct diag204_phys_cpu {
+       __u16 cpu_addr;
+       char  reserved1[2];
+       __u8  ctidx;
+       char  reserved2[3];
+       __u64 mgm_time;
+       char  reserved3[8];
+} __packed;
+
+struct diag204_x_phys_cpu {
+       __u16 cpu_addr;
+       char  reserved1[2];
+       __u8  ctidx;
+       char  reserved2[1];
+       __u16 weight;
+       __u64 mgm_time;
+       char  reserved3[80];
+} __packed;
+
+struct diag204_x_part_block {
+       struct diag204_x_part_hdr hdr;
+       struct diag204_x_cpu_info cpus[];
+} __packed;
+
+struct diag204_x_phys_block {
+       struct diag204_x_phys_hdr hdr;
+       struct diag204_x_phys_cpu cpus[];
+} __packed;
+
+int diag204(unsigned long subcode, unsigned long size, void *addr);
+int diag224(void *ptr);
 #endif /* _ASM_S390_DIAG_H */
index d054c1b..741ddba 100644 (file)
 
 /**
  * struct gmap - guest address space
+ * @list: list head for the mm->context gmap list
  * @crst_list: list of all crst tables used in the guest address space
  * @mm: pointer to the parent mm_struct
  * @guest_to_host: radix tree with guest to host address translation
  * @host_to_guest: radix tree with pointer to segment table entries
  * @guest_table_lock: spinlock to protect all entries in the guest page table
+ * @ref_count: reference counter for the gmap structure
  * @table: pointer to the page directory
  * @asce: address space control element for gmap page table
  * @pfault_enabled: defines if pfaults are applicable for the guest
+ * @host_to_rmap: radix tree with gmap_rmap lists
+ * @children: list of shadow gmap structures
+ * @pt_list: list of all page tables used in the shadow guest address space
+ * @shadow_lock: spinlock to protect the shadow gmap list
+ * @parent: pointer to the parent gmap for shadow guest address spaces
+ * @orig_asce: ASCE for which the shadow page table has been created
+ * @edat_level: edat level to be used for the shadow translation
+ * @removed: flag to indicate if a shadow guest address space has been removed
+ * @initialized: flag to indicate if a shadow guest address space can be used
  */
 struct gmap {
        struct list_head list;
@@ -26,26 +37,64 @@ struct gmap {
        struct radix_tree_root guest_to_host;
        struct radix_tree_root host_to_guest;
        spinlock_t guest_table_lock;
+       atomic_t ref_count;
        unsigned long *table;
        unsigned long asce;
        unsigned long asce_end;
        void *private;
        bool pfault_enabled;
+       /* Additional data for shadow guest address spaces */
+       struct radix_tree_root host_to_rmap;
+       struct list_head children;
+       struct list_head pt_list;
+       spinlock_t shadow_lock;
+       struct gmap *parent;
+       unsigned long orig_asce;
+       int edat_level;
+       bool removed;
+       bool initialized;
 };
 
+/**
+ * struct gmap_rmap - reverse mapping for shadow page table entries
+ * @next: pointer to next rmap in the list
+ * @raddr: virtual rmap address in the shadow guest address space
+ */
+struct gmap_rmap {
+       struct gmap_rmap *next;
+       unsigned long raddr;
+};
+
+#define gmap_for_each_rmap(pos, head) \
+       for (pos = (head); pos; pos = pos->next)
+
+#define gmap_for_each_rmap_safe(pos, n, head) \
+       for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n)
+
 /**
  * struct gmap_notifier - notify function block for page invalidation
  * @notifier_call: address of callback function
  */
 struct gmap_notifier {
        struct list_head list;
-       void (*notifier_call)(struct gmap *gmap, unsigned long gaddr);
+       struct rcu_head rcu;
+       void (*notifier_call)(struct gmap *gmap, unsigned long start,
+                             unsigned long end);
 };
 
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit);
-void gmap_free(struct gmap *gmap);
+static inline int gmap_is_shadow(struct gmap *gmap)
+{
+       return !!gmap->parent;
+}
+
+struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit);
+void gmap_remove(struct gmap *gmap);
+struct gmap *gmap_get(struct gmap *gmap);
+void gmap_put(struct gmap *gmap);
+
 void gmap_enable(struct gmap *gmap);
 void gmap_disable(struct gmap *gmap);
+struct gmap *gmap_get_enabled(void);
 int gmap_map_segment(struct gmap *gmap, unsigned long from,
                     unsigned long to, unsigned long len);
 int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len);
@@ -57,8 +106,29 @@ void gmap_discard(struct gmap *, unsigned long from, unsigned long to);
 void __gmap_zap(struct gmap *, unsigned long gaddr);
 void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr);
 
-void gmap_register_ipte_notifier(struct gmap_notifier *);
-void gmap_unregister_ipte_notifier(struct gmap_notifier *);
-int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len);
+int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val);
+
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+                        int edat_level);
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level);
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+                   int fake);
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+                   int fake);
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+                   int fake);
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+                   int fake);
+int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+                          unsigned long *pgt, int *dat_protection, int *fake);
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte);
+
+void gmap_register_pte_notifier(struct gmap_notifier *);
+void gmap_unregister_pte_notifier(struct gmap_notifier *);
+void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *,
+                    unsigned long bits);
+
+int gmap_mprotect_notify(struct gmap *, unsigned long start,
+                        unsigned long len, int prot);
 
 #endif /* _ASM_S390_GMAP_H */
index ac82e8e..8e5daf7 100644 (file)
@@ -43,6 +43,7 @@
 /* s390-specific vcpu->requests bit members */
 #define KVM_REQ_ENABLE_IBS         8
 #define KVM_REQ_DISABLE_IBS        9
+#define KVM_REQ_ICPT_OPEREXC       10
 
 #define SIGP_CTRL_C            0x80
 #define SIGP_CTRL_SCN_MASK     0x3f
@@ -145,7 +146,7 @@ struct kvm_s390_sie_block {
        __u64   cputm;                  /* 0x0028 */
        __u64   ckc;                    /* 0x0030 */
        __u64   epoch;                  /* 0x0038 */
-       __u8    reserved40[4];          /* 0x0040 */
+       __u32   svcc;                   /* 0x0040 */
 #define LCTL_CR0       0x8000
 #define LCTL_CR6       0x0200
 #define LCTL_CR9       0x0040
@@ -154,6 +155,7 @@ struct kvm_s390_sie_block {
 #define LCTL_CR14      0x0002
        __u16   lctl;                   /* 0x0044 */
        __s16   icpua;                  /* 0x0046 */
+#define ICTL_OPEREXC   0x80000000
 #define ICTL_PINT      0x20000000
 #define ICTL_LPSW      0x00400000
 #define ICTL_STCTL     0x00040000
@@ -166,6 +168,9 @@ struct kvm_s390_sie_block {
 #define ICPT_INST      0x04
 #define ICPT_PROGI     0x08
 #define ICPT_INSTPROGI 0x0C
+#define ICPT_EXTINT    0x14
+#define ICPT_VALIDITY  0x20
+#define ICPT_STOP      0x28
 #define ICPT_OPEREXC   0x2C
 #define ICPT_PARTEXEC  0x38
 #define ICPT_IOINST    0x40
@@ -185,7 +190,9 @@ struct kvm_s390_sie_block {
        __u32   scaol;                  /* 0x0064 */
        __u8    reserved68[4];          /* 0x0068 */
        __u32   todpr;                  /* 0x006c */
-       __u8    reserved70[32];         /* 0x0070 */
+       __u8    reserved70[16];         /* 0x0070 */
+       __u64   mso;                    /* 0x0080 */
+       __u64   msl;                    /* 0x0088 */
        psw_t   gpsw;                   /* 0x0090 */
        __u64   gg14;                   /* 0x00a0 */
        __u64   gg15;                   /* 0x00a8 */
@@ -223,7 +230,7 @@ struct kvm_s390_sie_block {
        __u8    reserved1e6[2];         /* 0x01e6 */
        __u64   itdba;                  /* 0x01e8 */
        __u64   riccbd;                 /* 0x01f0 */
-       __u8    reserved1f8[8];         /* 0x01f8 */
+       __u64   gvrd;                   /* 0x01f8 */
 } __attribute__((packed));
 
 struct kvm_s390_itdb {
@@ -256,6 +263,7 @@ struct kvm_vcpu_stat {
        u32 instruction_stctg;
        u32 exit_program_interruption;
        u32 exit_instr_and_program;
+       u32 exit_operation_exception;
        u32 deliver_external_call;
        u32 deliver_emergency_signal;
        u32 deliver_service_signal;
@@ -278,7 +286,9 @@ struct kvm_vcpu_stat {
        u32 instruction_stsi;
        u32 instruction_stfl;
        u32 instruction_tprot;
+       u32 instruction_sie;
        u32 instruction_essa;
+       u32 instruction_sthyi;
        u32 instruction_sigp_sense;
        u32 instruction_sigp_sense_running;
        u32 instruction_sigp_external_call;
@@ -541,12 +551,16 @@ struct kvm_guestdbg_info_arch {
 
 struct kvm_vcpu_arch {
        struct kvm_s390_sie_block *sie_block;
+       /* if vsie is active, currently executed shadow sie control block */
+       struct kvm_s390_sie_block *vsie_block;
        unsigned int      host_acrs[NUM_ACRS];
        struct fpu        host_fpregs;
        struct kvm_s390_local_interrupt local_int;
        struct hrtimer    ckc_timer;
        struct kvm_s390_pgm_info pgm;
        struct gmap *gmap;
+       /* backup location for the currently enabled gmap when scheduled out */
+       struct gmap *enabled_gmap;
        struct kvm_guestdbg_info_arch guestdbg;
        unsigned long pfault_token;
        unsigned long pfault_select;
@@ -631,6 +645,14 @@ struct sie_page2 {
        u8 reserved900[0x1000 - 0x900];                 /* 0x0900 */
 } __packed;
 
+struct kvm_s390_vsie {
+       struct mutex mutex;
+       struct radix_tree_root addr_to_page;
+       int page_count;
+       int next;
+       struct page *pages[KVM_MAX_VCPUS];
+};
+
 struct kvm_arch{
        void *sca;
        int use_esca;
@@ -646,15 +668,20 @@ struct kvm_arch{
        int user_cpu_state_ctrl;
        int user_sigp;
        int user_stsi;
+       int user_instr0;
        struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
        wait_queue_head_t ipte_wq;
        int ipte_lock_count;
        struct mutex ipte_mutex;
+       struct ratelimit_state sthyi_limit;
        spinlock_t start_stop_lock;
        struct sie_page2 *sie_page2;
        struct kvm_s390_cpu_model model;
        struct kvm_s390_crypto crypto;
+       struct kvm_s390_vsie vsie;
        u64 epoch;
+       /* subset of available cpu features enabled by user space */
+       DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
 };
 
 #define KVM_HVA_ERR_BAD                (-1UL)
index 1822643..6d39329 100644 (file)
@@ -8,8 +8,9 @@ typedef struct {
        cpumask_t cpu_attach_mask;
        atomic_t flush_count;
        unsigned int flush_mm;
-       spinlock_t list_lock;
+       spinlock_t pgtable_lock;
        struct list_head pgtable_list;
+       spinlock_t gmap_lock;
        struct list_head gmap_list;
        unsigned long asce;
        unsigned long asce_limit;
@@ -22,9 +23,11 @@ typedef struct {
        unsigned int use_skey:1;
 } mm_context_t;
 
-#define INIT_MM_CONTEXT(name)                                                \
-       .context.list_lock    = __SPIN_LOCK_UNLOCKED(name.context.list_lock), \
-       .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list),    \
+#define INIT_MM_CONTEXT(name)                                             \
+       .context.pgtable_lock =                                            \
+                       __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock),   \
+       .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
+       .context.gmap_lock = __SPIN_LOCK_UNLOCKED(name.context.gmap_lock), \
        .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list),
 
 static inline int tprot(unsigned long addr)
index f77c638..c6a088c 100644 (file)
@@ -15,8 +15,9 @@
 static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
 {
-       spin_lock_init(&mm->context.list_lock);
+       spin_lock_init(&mm->context.pgtable_lock);
        INIT_LIST_HEAD(&mm->context.pgtable_list);
+       spin_lock_init(&mm->context.gmap_lock);
        INIT_LIST_HEAD(&mm->context.gmap_list);
        cpumask_clear(&mm->context.cpu_attach_mask);
        atomic_set(&mm->context.flush_count, 0);
index b2146c4..69b8a41 100644 (file)
@@ -111,13 +111,14 @@ static inline unsigned char page_get_storage_key(unsigned long addr)
 
 static inline int page_reset_referenced(unsigned long addr)
 {
-       unsigned int ipm;
+       int cc;
 
        asm volatile(
                "       rrbe    0,%1\n"
                "       ipm     %0\n"
-               : "=d" (ipm) : "a" (addr) : "cc");
-       return !!(ipm & 0x20000000);
+               "       srl     %0,28\n"
+               : "=d" (cc) : "a" (addr) : "cc");
+       return cc;
 }
 
 /* Bits in the storage key */
@@ -148,6 +149,8 @@ static inline int devmem_is_allowed(unsigned long pfn)
 #define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
 #define page_to_phys(page)     (page_to_pfn(page) << PAGE_SHIFT)
 #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+#define pfn_to_virt(pfn)       __va((pfn) << PAGE_SHIFT)
+#define page_to_virt(page)     pfn_to_virt(page_to_pfn(page))
 
 #define VM_DATA_DEFAULT_FLAGS  (VM_READ | VM_WRITE | \
                                 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
index da34cb6..f4eb984 100644 (file)
@@ -19,8 +19,10 @@ unsigned long *crst_table_alloc(struct mm_struct *);
 void crst_table_free(struct mm_struct *, unsigned long *);
 
 unsigned long *page_table_alloc(struct mm_struct *);
+struct page *page_table_alloc_pgste(struct mm_struct *mm);
 void page_table_free(struct mm_struct *, unsigned long *);
 void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long);
+void page_table_free_pgste(struct page *page);
 extern int page_table_allocate_pgste;
 
 static inline void clear_table(unsigned long *s, unsigned long val, size_t n)
index 48d383a..72c7f60 100644 (file)
@@ -277,6 +277,7 @@ static inline int is_module_addr(void *addr)
 /* Bits in the region table entry */
 #define _REGION_ENTRY_ORIGIN   ~0xfffUL/* region/segment table origin      */
 #define _REGION_ENTRY_PROTECT  0x200   /* region protection bit            */
+#define _REGION_ENTRY_OFFSET   0xc0    /* region table offset              */
 #define _REGION_ENTRY_INVALID  0x20    /* invalid region table entry       */
 #define _REGION_ENTRY_TYPE_MASK        0x0c    /* region/segment table type mask   */
 #define _REGION_ENTRY_TYPE_R1  0x0c    /* region first table type          */
@@ -364,6 +365,7 @@ static inline int is_module_addr(void *addr)
 #define PGSTE_GC_BIT   0x0002000000000000UL
 #define PGSTE_UC_BIT   0x0000800000000000UL    /* user dirty (migration) */
 #define PGSTE_IN_BIT   0x0000400000000000UL    /* IPTE notify bit */
+#define PGSTE_VSIE_BIT 0x0000200000000000UL    /* ref'd in a shadow table */
 
 /* Guest Page State used for virtualization */
 #define _PGSTE_GPS_ZERO                0x0000000080000000UL
@@ -1002,15 +1004,26 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
 void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t entry);
 void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void ptep_notify(struct mm_struct *mm, unsigned long addr,
+                pte_t *ptep, unsigned long bits);
+int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr,
+                   pte_t *ptep, int prot, unsigned long bit);
 void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep , int reset);
 void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+                   pte_t *sptep, pte_t *tptep, pte_t pte);
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
 
 bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
                          unsigned char key, bool nq);
-unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr);
+int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+                              unsigned char key, unsigned char *oldkey,
+                              bool nq, bool mr, bool mc);
+int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr);
+int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+                         unsigned char *key);
 
 /*
  * Certain architectures need to do special things when PTEs
index 0952920..0332317 100644 (file)
@@ -112,6 +112,8 @@ struct thread_struct {
         unsigned long ksp;              /* kernel stack pointer             */
        mm_segment_t mm_segment;
        unsigned long gmap_addr;        /* address of last gmap fault. */
+       unsigned int gmap_write_flag;   /* gmap fault write indication */
+       unsigned int gmap_int_code;     /* int code of last gmap fault */
        unsigned int gmap_pfault;       /* signal of a pending guest pfault */
        struct per_regs per_user;       /* User specified PER registers */
        struct per_event per_event;     /* Cause of the last PER trap */
index e4f6f73..2ad9c20 100644 (file)
@@ -32,12 +32,19 @@ struct sclp_core_entry {
        u8 reserved0;
        u8 : 4;
        u8 sief2 : 1;
-       u8 : 3;
-       u8 : 3;
+       u8 skey : 1;
+       u8 : 2;
+       u8 : 2;
+       u8 gpere : 1;
        u8 siif : 1;
        u8 sigpif : 1;
        u8 : 3;
-       u8 reserved2[10];
+       u8 reserved2[3];
+       u8 : 2;
+       u8 ib : 1;
+       u8 cei : 1;
+       u8 : 4;
+       u8 reserved3[6];
        u8 type;
        u8 reserved1;
 } __attribute__((packed));
@@ -59,6 +66,15 @@ struct sclp_info {
        unsigned char has_hvs : 1;
        unsigned char has_esca : 1;
        unsigned char has_sief2 : 1;
+       unsigned char has_64bscao : 1;
+       unsigned char has_gpere : 1;
+       unsigned char has_cmma : 1;
+       unsigned char has_gsls : 1;
+       unsigned char has_ib : 1;
+       unsigned char has_cei : 1;
+       unsigned char has_pfmfi : 1;
+       unsigned char has_ibs : 1;
+       unsigned char has_skey : 1;
        unsigned int ibc;
        unsigned int mtid;
        unsigned int mtid_cp;
@@ -101,5 +117,6 @@ int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count);
 int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count);
 void sclp_early_detect(void);
 void _sclp_print_early(const char *);
+void sclp_ocf_cpc_name_copy(char *dst);
 
 #endif /* _ASM_S390_SCLP_H */
index 3b8e99e..a2ffec4 100644 (file)
@@ -93,6 +93,47 @@ struct kvm_s390_vm_cpu_machine {
        __u64 fac_list[256];
 };
 
+#define KVM_S390_VM_CPU_PROCESSOR_FEAT 2
+#define KVM_S390_VM_CPU_MACHINE_FEAT   3
+
+#define KVM_S390_VM_CPU_FEAT_NR_BITS   1024
+#define KVM_S390_VM_CPU_FEAT_ESOP      0
+#define KVM_S390_VM_CPU_FEAT_SIEF2     1
+#define KVM_S390_VM_CPU_FEAT_64BSCAO   2
+#define KVM_S390_VM_CPU_FEAT_SIIF      3
+#define KVM_S390_VM_CPU_FEAT_GPERE     4
+#define KVM_S390_VM_CPU_FEAT_GSLS      5
+#define KVM_S390_VM_CPU_FEAT_IB                6
+#define KVM_S390_VM_CPU_FEAT_CEI       7
+#define KVM_S390_VM_CPU_FEAT_IBS       8
+#define KVM_S390_VM_CPU_FEAT_SKEY      9
+#define KVM_S390_VM_CPU_FEAT_CMMA      10
+#define KVM_S390_VM_CPU_FEAT_PFMFI     11
+#define KVM_S390_VM_CPU_FEAT_SIGPIF    12
+struct kvm_s390_vm_cpu_feat {
+       __u64 feat[16];
+};
+
+#define KVM_S390_VM_CPU_PROCESSOR_SUBFUNC      4
+#define KVM_S390_VM_CPU_MACHINE_SUBFUNC                5
+/* for "test bit" instructions MSB 0 bit ordering, for "query" raw blocks */
+struct kvm_s390_vm_cpu_subfunc {
+       __u8 plo[32];           /* always */
+       __u8 ptff[16];          /* with TOD-clock steering */
+       __u8 kmac[16];          /* with MSA */
+       __u8 kmc[16];           /* with MSA */
+       __u8 km[16];            /* with MSA */
+       __u8 kimd[16];          /* with MSA */
+       __u8 klmd[16];          /* with MSA */
+       __u8 pckmo[16];         /* with MSA3 */
+       __u8 kmctr[16];         /* with MSA4 */
+       __u8 kmf[16];           /* with MSA4 */
+       __u8 kmo[16];           /* with MSA4 */
+       __u8 pcc[16];           /* with MSA4 */
+       __u8 ppno[16];          /* with MSA5 */
+       __u8 reserved[1824];
+};
+
 /* kvm attributes for crypto */
 #define KVM_S390_VM_CRYPTO_ENABLE_AES_KW       0
 #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW       1
index 8fb5d4a..3ac6343 100644 (file)
        exit_code_ipa0(0xB2, 0x4c, "TAR"),      \
        exit_code_ipa0(0xB2, 0x50, "CSP"),      \
        exit_code_ipa0(0xB2, 0x54, "MVPG"),     \
+       exit_code_ipa0(0xB2, 0x56, "STHYI"),    \
        exit_code_ipa0(0xB2, 0x58, "BSG"),      \
        exit_code_ipa0(0xB2, 0x5a, "BSA"),      \
        exit_code_ipa0(0xB2, 0x5f, "CHSC"),     \
index 48b37b8..a97354c 100644 (file)
@@ -162,6 +162,30 @@ int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode)
 }
 EXPORT_SYMBOL(diag14);
 
+/*
+ * Issue diagnose 0x204 with *@subcode in register 0, @size in register 1
+ * and @addr as the first register operand of the instruction.
+ *
+ * Registers 0 and 1 are in/out operands: the value left in register 0 is
+ * stored back through @subcode and the value left in register 1 is
+ * returned (converted to int).
+ *
+ * NOTE(review): EX_TABLE(0b,0b) names label 0 (the nopr after the diag) as
+ * both fault and fixup address - presumably so that an exception raised by
+ * the diag just continues with unchanged registers; confirm.
+ */
+static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr)
+{
+       register unsigned long _subcode asm("0") = *subcode;
+       register unsigned long _size asm("1") = size;
+
+       asm volatile(
+               "       diag    %2,%0,0x204\n"
+               "0:     nopr    %%r7\n"
+               EX_TABLE(0b,0b)
+               : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
+       *subcode = _subcode;
+       return _size;
+}
+
+/*
+ * Counted wrapper around __diag204(): bumps the DIAG_STAT_X204 statistic,
+ * issues the diagnose and returns -1 if the subcode is non-zero afterwards,
+ * otherwise the value returned by __diag204().
+ */
+int diag204(unsigned long subcode, unsigned long size, void *addr)
+{
+       unsigned long result;
+
+       diag_stat_inc(DIAG_STAT_X204);
+       result = __diag204(&subcode, size, addr);
+
+       /* a subcode that is still non-zero is reported as failure */
+       return subcode ? -1 : (int) result;
+}
+EXPORT_SYMBOL(diag204);
+
 /*
  * Diagnose 210: Get information about a virtual device
  */
@@ -196,3 +220,18 @@ int diag210(struct diag210 *addr)
        return ccode;
 }
 EXPORT_SYMBOL(diag210);
+
+/*
+ * Issue diagnose 0x224 with 0 and @ptr as register operands.
+ *
+ * Returns 0 when execution continues with the lhi following the diag.
+ * If an exception is fixed up through the exception table entry, control
+ * resumes at label 1 behind the lhi, so the initial -EOPNOTSUPP is returned.
+ */
+int diag224(void *ptr)
+{
+       int rc = -EOPNOTSUPP;
+
+       diag_stat_inc(DIAG_STAT_X224);
+       asm volatile(
+               "       diag    %1,%2,0x224\n"
+               "0:     lhi     %0,0x0\n"
+               "1:\n"
+               EX_TABLE(0b,1b)
+               : "+d" (rc) :"d" (0), "d" (ptr) : "memory");
+       return rc;
+}
+EXPORT_SYMBOL(diag224);
index d42fa38..09a9e6d 100644 (file)
@@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o $(KVM)/irqch
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
 kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-objs += diag.o gaccess.o guestdbg.o
+kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o vsie.o
 
 obj-$(CONFIG_KVM) += kvm.o
index 1ea4095..ce865bd 100644 (file)
@@ -212,6 +212,11 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
            (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY))
                return -EOPNOTSUPP;
 
+       VCPU_EVENT(vcpu, 4, "diag 0x500 schid 0x%8.8x queue 0x%x cookie 0x%llx",
+                           (u32) vcpu->run->s.regs.gprs[2],
+                           (u32) vcpu->run->s.regs.gprs[3],
+                           vcpu->run->s.regs.gprs[4]);
+
        /*
         * The layout is as follows:
         * - gpr 2 contains the subchannel id (passed as addr)
index 66938d2..5420020 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/vmalloc.h>
 #include <linux/err.h>
 #include <asm/pgtable.h>
+#include <asm/gmap.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 #include <asm/switch_to.h>
@@ -476,18 +477,73 @@ enum {
        FSI_FETCH   = 2  /* Exception was due to fetch operation */
 };
 
-static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
-                        ar_t ar, enum gacc_mode mode)
+enum prot_type {
+       PROT_TYPE_LA   = 0,
+       PROT_TYPE_KEYC = 1,
+       PROT_TYPE_ALC  = 2,
+       PROT_TYPE_DAT  = 3,
+};
+
+/*
+ * Prepare vcpu->arch.pgm for program exception @code and return @code.
+ *
+ * The pgm info is cleared first. Depending on @code (and, for
+ * PGM_PROTECTION, on @prot) the translation-exception code bits and
+ * exc_access_id are then filled in from @gva, @mode, @ar and the guest PSW.
+ * Codes that are not listed in the switch only get pgm->code set.
+ */
+static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
+                    ar_t ar, enum gacc_mode mode, enum prot_type prot)
 {
-       int rc;
-       struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
        struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
-       struct trans_exc_code_bits *tec_bits;
+       struct trans_exc_code_bits *tec;
 
        memset(pgm, 0, sizeof(*pgm));
-       tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
-       tec_bits->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
-       tec_bits->as = psw.as;
+       pgm->code = code;
+       tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
+
+       switch (code) {
+       case PGM_ASCE_TYPE:
+       case PGM_PAGE_TRANSLATION:
+       case PGM_REGION_FIRST_TRANS:
+       case PGM_REGION_SECOND_TRANS:
+       case PGM_REGION_THIRD_TRANS:
+       case PGM_SEGMENT_TRANSLATION:
+               /*
+                * op_access_id only applies to MOVE_PAGE -> set bit 61
+                * exc_access_id has to be set to 0 for some instructions. Both
+                * cases have to be handled by the caller. We can always store
+                * exc_access_id, as it is undefined for non-ar cases.
+                */
+               tec->addr = gva >> PAGE_SHIFT;
+               tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
+               tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
+               /* FALL THROUGH */
+       case PGM_ALEN_TRANSLATION:
+       case PGM_ALE_SEQUENCE:
+       case PGM_ASTE_VALIDITY:
+       case PGM_ASTE_SEQUENCE:
+       case PGM_EXTENDED_AUTHORITY:
+               pgm->exc_access_id = ar;
+               break;
+       case PGM_PROTECTION:
+               switch (prot) {
+               case PROT_TYPE_ALC:
+                       tec->b60 = 1;
+                       /* FALL THROUGH */
+               case PROT_TYPE_DAT:
+                       tec->b61 = 1;
+                       tec->addr = gva >> PAGE_SHIFT;
+                       tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
+                       tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
+                       /* exc_access_id is undefined for most cases */
+                       pgm->exc_access_id = ar;
+                       break;
+               default: /* LA and KEYC set b61 to 0, other params undefined */
+                       break;
+               }
+               break;
+       }
+       return code;
+}
+
+static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
+                        unsigned long ga, ar_t ar, enum gacc_mode mode)
+{
+       int rc;
+       struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
 
        if (!psw.t) {
                asce->val = 0;
@@ -510,21 +566,8 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
                return 0;
        case PSW_AS_ACCREG:
                rc = ar_translation(vcpu, asce, ar, mode);
-               switch (rc) {
-               case PGM_ALEN_TRANSLATION:
-               case PGM_ALE_SEQUENCE:
-               case PGM_ASTE_VALIDITY:
-               case PGM_ASTE_SEQUENCE:
-               case PGM_EXTENDED_AUTHORITY:
-                       vcpu->arch.pgm.exc_access_id = ar;
-                       break;
-               case PGM_PROTECTION:
-                       tec_bits->b60 = 1;
-                       tec_bits->b61 = 1;
-                       break;
-               }
                if (rc > 0)
-                       pgm->code = rc;
+                       return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_ALC);
                return rc;
        }
        return 0;
@@ -729,40 +772,31 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
        return 1;
 }
 
-static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
+static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar,
                            unsigned long *pages, unsigned long nr_pages,
                            const union asce asce, enum gacc_mode mode)
 {
-       struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
        psw_t *psw = &vcpu->arch.sie_block->gpsw;
-       struct trans_exc_code_bits *tec_bits;
-       int lap_enabled, rc;
+       int lap_enabled, rc = 0;
 
-       tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
        lap_enabled = low_address_protection_enabled(vcpu, asce);
        while (nr_pages) {
                ga = kvm_s390_logical_to_effective(vcpu, ga);
-               tec_bits->addr = ga >> PAGE_SHIFT;
-               if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) {
-                       pgm->code = PGM_PROTECTION;
-                       return pgm->code;
-               }
+               if (mode == GACC_STORE && lap_enabled && is_low_address(ga))
+                       return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode,
+                                        PROT_TYPE_LA);
                ga &= PAGE_MASK;
                if (psw_bits(*psw).t) {
                        rc = guest_translate(vcpu, ga, pages, asce, mode);
                        if (rc < 0)
                                return rc;
-                       if (rc == PGM_PROTECTION)
-                               tec_bits->b61 = 1;
-                       if (rc)
-                               pgm->code = rc;
                } else {
                        *pages = kvm_s390_real_to_abs(vcpu, ga);
                        if (kvm_is_error_gpa(vcpu->kvm, *pages))
-                               pgm->code = PGM_ADDRESSING;
+                               rc = PGM_ADDRESSING;
                }
-               if (pgm->code)
-                       return pgm->code;
+               if (rc)
+                       return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_DAT);
                ga += PAGE_SIZE;
                pages++;
                nr_pages--;
@@ -783,7 +817,8 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
 
        if (!len)
                return 0;
-       rc = get_vcpu_asce(vcpu, &asce, ar, mode);
+       ga = kvm_s390_logical_to_effective(vcpu, ga);
+       rc = get_vcpu_asce(vcpu, &asce, ga, ar, mode);
        if (rc)
                return rc;
        nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1;
@@ -795,7 +830,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
        need_ipte_lock = psw_bits(*psw).t && !asce.r;
        if (need_ipte_lock)
                ipte_lock(vcpu);
-       rc = guest_page_range(vcpu, ga, pages, nr_pages, asce, mode);
+       rc = guest_page_range(vcpu, ga, ar, pages, nr_pages, asce, mode);
        for (idx = 0; idx < nr_pages && !rc; idx++) {
                gpa = *(pages + idx) + (ga & ~PAGE_MASK);
                _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
@@ -846,37 +881,28 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
 int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
                            unsigned long *gpa, enum gacc_mode mode)
 {
-       struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
        psw_t *psw = &vcpu->arch.sie_block->gpsw;
-       struct trans_exc_code_bits *tec;
        union asce asce;
        int rc;
 
        gva = kvm_s390_logical_to_effective(vcpu, gva);
-       tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
-       rc = get_vcpu_asce(vcpu, &asce, ar, mode);
-       tec->addr = gva >> PAGE_SHIFT;
+       rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode);
        if (rc)
                return rc;
        if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) {
-               if (mode == GACC_STORE) {
-                       rc = pgm->code = PGM_PROTECTION;
-                       return rc;
-               }
+               if (mode == GACC_STORE)
+                       return trans_exc(vcpu, PGM_PROTECTION, gva, 0,
+                                        mode, PROT_TYPE_LA);
        }
 
        if (psw_bits(*psw).t && !asce.r) {      /* Use DAT? */
                rc = guest_translate(vcpu, gva, gpa, asce, mode);
-               if (rc > 0) {
-                       if (rc == PGM_PROTECTION)
-                               tec->b61 = 1;
-                       pgm->code = rc;
-               }
+               if (rc > 0)
+                       return trans_exc(vcpu, rc, gva, 0, mode, PROT_TYPE_DAT);
        } else {
-               rc = 0;
                *gpa = kvm_s390_real_to_abs(vcpu, gva);
                if (kvm_is_error_gpa(vcpu->kvm, *gpa))
-                       rc = pgm->code = PGM_ADDRESSING;
+                       return trans_exc(vcpu, rc, gva, PGM_ADDRESSING, mode, 0);
        }
 
        return rc;
@@ -915,20 +941,247 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
  */
 int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
 {
-       struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
-       psw_t *psw = &vcpu->arch.sie_block->gpsw;
-       struct trans_exc_code_bits *tec_bits;
        union ctlreg0 ctlreg0 = {.val = vcpu->arch.sie_block->gcr[0]};
 
        if (!ctlreg0.lap || !is_low_address(gra))
                return 0;
+       return trans_exc(vcpu, PGM_PROTECTION, gra, 0, GACC_STORE, PROT_TYPE_LA);
+}
 
-       memset(pgm, 0, sizeof(*pgm));
-       tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
-       tec_bits->fsi = FSI_STORE;
-       tec_bits->as = psw_bits(*psw).as;
-       tec_bits->addr = gra >> PAGE_SHIFT;
-       pgm->code = PGM_PROTECTION;
+/**
+ * kvm_s390_shadow_tables - walk the guest page table and create shadow tables
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: pointer to the page table address result
+ * @dat_protection: set to the OR of the protection bits picked up on the walk
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ *
+ * Returns 0 on success, a positive program exception code if the walk hits
+ * an invalid or out-of-range entry, or the non-zero return value of
+ * gmap_read_table() / gmap_shadow_*() if one of them fails.
+ */
+static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
+                                 unsigned long *pgt, int *dat_protection,
+                                 int *fake)
+{
+       struct gmap *parent;
+       union asce asce;
+       union vaddress vaddr;
+       unsigned long ptr;
+       int rc;
+
+       *fake = 0;
+       *dat_protection = 0;
+       parent = sg->parent;
+       vaddr.addr = saddr;
+       asce.val = sg->orig_asce;
+       ptr = asce.origin * 4096;
+       if (asce.r) {
+               *fake = 1;
+               /*
+                * Real-space designation: the origin field is only a token
+                * and must not be used as a base for storage references.
+                */
+               ptr = 0;
+               asce.dt = ASCE_TYPE_REGION1;
+       }
+       switch (asce.dt) {
+       case ASCE_TYPE_REGION1:
+               if (vaddr.rfx01 > asce.tl && !asce.r)
+                       return PGM_REGION_FIRST_TRANS;
+               break;
+       case ASCE_TYPE_REGION2:
+               if (vaddr.rfx)
+                       return PGM_ASCE_TYPE;
+               if (vaddr.rsx01 > asce.tl)
+                       return PGM_REGION_SECOND_TRANS;
+               break;
+       case ASCE_TYPE_REGION3:
+               if (vaddr.rfx || vaddr.rsx)
+                       return PGM_ASCE_TYPE;
+               if (vaddr.rtx01 > asce.tl)
+                       return PGM_REGION_THIRD_TRANS;
+               break;
+       case ASCE_TYPE_SEGMENT:
+               if (vaddr.rfx || vaddr.rsx || vaddr.rtx)
+                       return PGM_ASCE_TYPE;
+               if (vaddr.sx01 > asce.tl)
+                       return PGM_SEGMENT_TRANSLATION;
+               break;
+       }
+
+       switch (asce.dt) {
+       case ASCE_TYPE_REGION1: {
+               union region1_table_entry rfte;
 
-       return pgm->code;
+               if (*fake) {
+                       /*
+                        * offset in 16EB guest memory block: the region-first
+                        * index selects the 8PB block
+                        */
+                       ptr = ptr + ((unsigned long) vaddr.rfx << 53UL);
+                       rfte.val = ptr;
+                       goto shadow_r2t;
+               }
+               rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
+               if (rc)
+                       return rc;
+               if (rfte.i)
+                       return PGM_REGION_FIRST_TRANS;
+               if (rfte.tt != TABLE_TYPE_REGION1)
+                       return PGM_TRANSLATION_SPEC;
+               if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl)
+                       return PGM_REGION_SECOND_TRANS;
+               if (sg->edat_level >= 1)
+                       *dat_protection |= rfte.p;
+               ptr = rfte.rto << 12UL;
+shadow_r2t:
+               rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
+               if (rc)
+                       return rc;
+               /* fallthrough */
+       }
+       case ASCE_TYPE_REGION2: {
+               union region2_table_entry rste;
+
+               if (*fake) {
+                       /*
+                        * offset in 8PB guest memory block: the region-second
+                        * index selects the 4TB block
+                        */
+                       ptr = ptr + ((unsigned long) vaddr.rsx << 42UL);
+                       rste.val = ptr;
+                       goto shadow_r3t;
+               }
+               rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
+               if (rc)
+                       return rc;
+               if (rste.i)
+                       return PGM_REGION_SECOND_TRANS;
+               if (rste.tt != TABLE_TYPE_REGION2)
+                       return PGM_TRANSLATION_SPEC;
+               if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl)
+                       return PGM_REGION_THIRD_TRANS;
+               if (sg->edat_level >= 1)
+                       *dat_protection |= rste.p;
+               ptr = rste.rto << 12UL;
+shadow_r3t:
+               rste.p |= *dat_protection;
+               rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
+               if (rc)
+                       return rc;
+               /* fallthrough */
+       }
+       case ASCE_TYPE_REGION3: {
+               union region3_table_entry rtte;
+
+               if (*fake) {
+                       /*
+                        * offset in 4TB guest memory block: the region-third
+                        * index selects the 2GB block
+                        */
+                       ptr = ptr + ((unsigned long) vaddr.rtx << 31UL);
+                       rtte.val = ptr;
+                       goto shadow_sgt;
+               }
+               rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
+               if (rc)
+                       return rc;
+               if (rtte.i)
+                       return PGM_REGION_THIRD_TRANS;
+               if (rtte.tt != TABLE_TYPE_REGION3)
+                       return PGM_TRANSLATION_SPEC;
+               if (rtte.cr && asce.p && sg->edat_level >= 2)
+                       return PGM_TRANSLATION_SPEC;
+               if (rtte.fc && sg->edat_level >= 2) {
+                       *dat_protection |= rtte.fc0.p;
+                       *fake = 1;
+                       ptr = rtte.fc1.rfaa << 31UL;
+                       rtte.val = ptr;
+                       goto shadow_sgt;
+               }
+               if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl)
+                       return PGM_SEGMENT_TRANSLATION;
+               if (sg->edat_level >= 1)
+                       *dat_protection |= rtte.fc0.p;
+               ptr = rtte.fc0.sto << 12UL;
+shadow_sgt:
+               rtte.fc0.p |= *dat_protection;
+               rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
+               if (rc)
+                       return rc;
+               /* fallthrough */
+       }
+       case ASCE_TYPE_SEGMENT: {
+               union segment_table_entry ste;
+
+               if (*fake) {
+                       /* offset in 2G guest memory block */
+                       ptr = ptr + ((unsigned long) vaddr.sx << 20UL);
+                       ste.val = ptr;
+                       goto shadow_pgt;
+               }
+               rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
+               if (rc)
+                       return rc;
+               if (ste.i)
+                       return PGM_SEGMENT_TRANSLATION;
+               if (ste.tt != TABLE_TYPE_SEGMENT)
+                       return PGM_TRANSLATION_SPEC;
+               if (ste.cs && asce.p)
+                       return PGM_TRANSLATION_SPEC;
+               *dat_protection |= ste.fc0.p;
+               if (ste.fc && sg->edat_level >= 1) {
+                       *fake = 1;
+                       ptr = ste.fc1.sfaa << 20UL;
+                       ste.val = ptr;
+                       goto shadow_pgt;
+               }
+               ptr = ste.fc0.pto << 11UL;
+shadow_pgt:
+               ste.fc0.p |= *dat_protection;
+               rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake);
+               if (rc)
+                       return rc;
+       }
+       }
+       /* Return the parent address of the page table */
+       *pgt = ptr;
+       return 0;
+}
+
+/**
+ * kvm_s390_shadow_fault - handle fault on a shadow page table
+ * @vcpu: virtual cpu
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ *
+ * Returns: - 0 if the shadow fault was successfully resolved
+ *         - > 0 (pgm exception code) on exceptions while faulting
+ *         - -EAGAIN if the caller can retry immediately
+ *         - -EFAULT when accessing invalid guest addresses
+ *         - -ENOMEM if out of memory
+ */
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
+                         unsigned long saddr)
+{
+       union vaddress vaddr;
+       union page_table_entry pte;
+       unsigned long pgt;
+       int dat_protection, fake;
+       int rc;
+
+       down_read(&sg->mm->mmap_sem);
+       /*
+        * We don't want any guest-2 tables to change - so the parent
+        * tables/pointers we read stay valid - unshadowing is however
+        * always possible - only guest_table_lock protects us.
+        */
+       ipte_lock(vcpu);
+
+       /* try the lookup first, walk/create the shadow tables only on failure */
+       rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
+       if (rc)
+               rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection,
+                                           &fake);
+
+       vaddr.addr = saddr;
+       /*
+        * fake: pgt is the start of a contiguous guest memory block, so the
+        * pte value is computed from the page index instead of being read.
+        * NOTE(review): kvm_s390_shadow_tables() can fail after setting *fake
+        * without storing *pgt; the value computed here is then only used
+        * below if rc is 0 - confirm pgt cannot be read unset.
+        */
+       if (fake) {
+               /* offset in 1MB guest memory block */
+               pte.val = pgt + ((unsigned long) vaddr.px << 12UL);
+               goto shadow_page;
+       }
+       /* each following step only runs while no error has been recorded */
+       if (!rc)
+               rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+       if (!rc && pte.i)
+               rc = PGM_PAGE_TRANSLATION;
+       if (!rc && (pte.z || (pte.co && sg->edat_level < 1)))
+               rc = PGM_TRANSLATION_SPEC;
+shadow_page:
+       /* carry the protection collected on the upper levels into the pte */
+       pte.p |= dat_protection;
+       if (!rc)
+               rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
+       ipte_unlock(vcpu);
+       up_read(&sg->mm->mmap_sem);
+       return rc;
 }
index df0a79d..8756569 100644 (file)
@@ -361,4 +361,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
 int ipte_lock_held(struct kvm_vcpu *vcpu);
 int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
 
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
+                         unsigned long saddr);
+
 #endif /* __KVM_S390_GACCESS_H */
index e8c6843..31a0533 100644 (file)
@@ -439,6 +439,23 @@ exit_required:
 #define guest_per_enabled(vcpu) \
                             (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER)
 
+/*
+ * Inject a PER instruction-fetch program interrupt for the intercepted
+ * instruction and return the result of the injection.
+ */
+int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
+{
+       struct kvm_s390_pgm_info per_info = { .code = PGM_PER };
+       const u8 insn_len = kvm_s390_get_ilen(vcpu);
+
+       per_info.per_code = PER_EVENT_IFETCH >> 24;
+       /*
+        * The guest PSW already addresses the next instruction, so the
+        * intercepted instruction is the one that raised the PER i-fetch
+        * event. Rewind by its length to obtain the PER address (which
+        * could be that of an EXECUTE function).
+        */
+       per_info.per_address = __rewind_psw(vcpu->arch.sie_block->gpsw,
+                                           insn_len);
+
+       return kvm_s390_inject_prog_irq(vcpu, &per_info);
+}
+
 static void filter_guest_per_event(struct kvm_vcpu *vcpu)
 {
        u32 perc = vcpu->arch.sie_block->perc << 24;
@@ -465,7 +482,7 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu)
                guest_perc &= ~PER_EVENT_IFETCH;
 
        /* All other PER events will be given to the guest */
-       /* TODO: Check alterated address/address space */
+       /* TODO: Check altered address/address space */
 
        vcpu->arch.sie_block->perc = guest_perc >> 24;
 
index 2521571..dfd0ca2 100644 (file)
@@ -351,8 +351,26 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
        return -EOPNOTSUPP;
 }
 
+/*
+ * Handle an operation exception intercept: count and trace it, then either
+ * emulate STHYI (ipa 0xb256 with facility 74), return -EOPNOTSUPP for
+ * instruction 0 when user_instr0 is set, or inject an operation exception.
+ */
+static int handle_operexc(struct kvm_vcpu *vcpu)
+{
+       vcpu->stat.exit_operation_exception++;
+       trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa,
+                                     vcpu->arch.sie_block->ipb);
+
+       if (vcpu->arch.sie_block->ipa == 0xb256) {
+               if (test_kvm_facility(vcpu->kvm, 74))
+                       return handle_sthyi(vcpu);
+       } else if (vcpu->arch.sie_block->ipa == 0) {
+               if (vcpu->kvm->arch.user_instr0)
+                       return -EOPNOTSUPP;
+       }
+
+       /* everything else is reflected to the guest */
+       return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+}
+
 int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
 {
+       int rc, per_rc = 0;
+
        if (kvm_is_ucontrol(vcpu->kvm))
                return -EOPNOTSUPP;
 
@@ -361,7 +379,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
        case 0x18:
                return handle_noop(vcpu);
        case 0x04:
-               return handle_instruction(vcpu);
+               rc = handle_instruction(vcpu);
+               break;
        case 0x08:
                return handle_prog(vcpu);
        case 0x14:
@@ -372,9 +391,19 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
                return handle_validity(vcpu);
        case 0x28:
                return handle_stop(vcpu);
+       case 0x2c:
+               rc = handle_operexc(vcpu);
+               break;
        case 0x38:
-               return handle_partial_execution(vcpu);
+               rc = handle_partial_execution(vcpu);
+               break;
        default:
                return -EOPNOTSUPP;
        }
+
+       /* process PER, also if the instruction is processed in user space */
+       if (vcpu->arch.sie_block->icptstatus & 0x02 &&
+           (!rc || rc == -EOPNOTSUPP))
+               per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu);
+       return per_rc ? per_rc : rc;
 }
index 5a80af7..24524c0 100644 (file)
@@ -28,9 +28,6 @@
 #include "gaccess.h"
 #include "trace-s390.h"
 
-#define IOINT_SCHID_MASK 0x0000ffff
-#define IOINT_SSID_MASK 0x00030000
-#define IOINT_CSSID_MASK 0x03fc0000
 #define PFAULT_INIT 0x0600
 #define PFAULT_DONE 0x0680
 #define VIRTIO_PARAM 0x0d00
@@ -821,7 +818,14 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
                                        struct kvm_s390_interrupt_info,
                                        list);
        if (inti) {
-               VCPU_EVENT(vcpu, 4, "deliver: I/O 0x%llx", inti->type);
+               if (inti->type & KVM_S390_INT_IO_AI_MASK)
+                       VCPU_EVENT(vcpu, 4, "%s", "deliver: I/O (AI)");
+               else
+                       VCPU_EVENT(vcpu, 4, "deliver: I/O %x ss %x schid %04x",
+                       inti->io.subchannel_id >> 8,
+                       inti->io.subchannel_id >> 1 & 0x3,
+                       inti->io.subchannel_nr);
+
                vcpu->stat.deliver_io_int++;
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
                                inti->type,
@@ -991,6 +995,11 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
                swake_up(&vcpu->wq);
                vcpu->stat.halt_wakeup++;
        }
+       /*
+        * The VCPU might not be sleeping but is executing the VSIE. Let's
+        * kick it, so it leaves the SIE to process the request.
+        */
+       kvm_s390_vsie_kick(vcpu);
 }
 
 enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
@@ -1415,6 +1424,13 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
        }
        fi->counters[FIRQ_CNTR_IO] += 1;
 
+       if (inti->type & KVM_S390_INT_IO_AI_MASK)
+               VM_EVENT(kvm, 4, "%s", "inject: I/O (AI)");
+       else
+               VM_EVENT(kvm, 4, "inject: I/O %x ss %x schid %04x",
+                       inti->io.subchannel_id >> 8,
+                       inti->io.subchannel_id >> 1 & 0x3,
+                       inti->io.subchannel_nr);
        isc = int_word_to_isc(inti->io.io_int_word);
        list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc];
        list_add_tail(&inti->list, list);
@@ -1531,13 +1547,6 @@ int kvm_s390_inject_vm(struct kvm *kvm,
                inti->mchk.mcic = s390int->parm64;
                break;
        case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
-               if (inti->type & KVM_S390_INT_IO_AI_MASK)
-                       VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
-               else
-                       VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
-                                s390int->type & IOINT_CSSID_MASK,
-                                s390int->type & IOINT_SSID_MASK,
-                                s390int->type & IOINT_SCHID_MASK);
                inti->io.subchannel_id = s390int->parm >> 16;
                inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
                inti->io.io_int_parm = s390int->parm64 >> 32;
@@ -2237,7 +2246,8 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
        return ret;
 }
 
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+                         struct kvm_kernel_irq_routing_entry *e,
                          const struct kvm_irq_routing_entry *ue)
 {
        int ret;
index 6f5c344..3f3ae48 100644 (file)
 #include <linux/init.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
+#include <linux/mman.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/slab.h>
 #include <linux/timer.h>
 #include <linux/vmalloc.h>
+#include <linux/bitmap.h>
 #include <asm/asm-offsets.h>
 #include <asm/lowcore.h>
 #include <asm/stp.h>
@@ -35,6 +37,8 @@
 #include <asm/switch_to.h>
 #include <asm/isc.h>
 #include <asm/sclp.h>
+#include <asm/cpacf.h>
+#include <asm/timex.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 
@@ -64,6 +68,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "exit_pei", VCPU_STAT(exit_pei) },
        { "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
        { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
+       { "exit_operation_exception", VCPU_STAT(exit_operation_exception) },
        { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
        { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
        { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
@@ -94,6 +99,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "instruction_stsi", VCPU_STAT(instruction_stsi) },
        { "instruction_stfl", VCPU_STAT(instruction_stfl) },
        { "instruction_tprot", VCPU_STAT(instruction_tprot) },
+       { "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
+       { "instruction_sie", VCPU_STAT(instruction_sie) },
        { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
        { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
        { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
@@ -119,6 +126,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { NULL }
 };
 
+/* allow nested virtualization in KVM (if enabled by user space) */
+static int nested;
+module_param(nested, int, S_IRUGO);
+MODULE_PARM_DESC(nested, "Nested virtualization support");
+
 /* upper facilities limit for kvm */
 unsigned long kvm_s390_fac_list_mask[16] = {
        0xffe6000000000000UL,
@@ -131,7 +143,13 @@ unsigned long kvm_s390_fac_list_mask_size(void)
        return ARRAY_SIZE(kvm_s390_fac_list_mask);
 }
 
+/* available cpu features supported by kvm */
+static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+/* available subfunctions indicated via query / "test bit" */
+static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
+
 static struct gmap_notifier gmap_notifier;
+static struct gmap_notifier vsie_gmap_notifier;
 debug_info_t *kvm_s390_dbf;
 
 /* Section: not file related */
@@ -141,7 +159,8 @@ int kvm_arch_hardware_enable(void)
        return 0;
 }
 
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address);
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+                             unsigned long end);
 
 /*
  * This callback is executed during stop_machine(). All CPUs are therefore
@@ -163,6 +182,8 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val,
                        vcpu->arch.sie_block->epoch -= *delta;
                        if (vcpu->arch.cputm_enabled)
                                vcpu->arch.cputm_start += *delta;
+                       if (vcpu->arch.vsie_block)
+                               vcpu->arch.vsie_block->epoch -= *delta;
                }
        }
        return NOTIFY_OK;
@@ -175,7 +196,9 @@ static struct notifier_block kvm_clock_notifier = {
 int kvm_arch_hardware_setup(void)
 {
        gmap_notifier.notifier_call = kvm_gmap_notifier;
-       gmap_register_ipte_notifier(&gmap_notifier);
+       gmap_register_pte_notifier(&gmap_notifier);
+       vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
+       gmap_register_pte_notifier(&vsie_gmap_notifier);
        atomic_notifier_chain_register(&s390_epoch_delta_notifier,
                                       &kvm_clock_notifier);
        return 0;
@@ -183,11 +206,109 @@ int kvm_arch_hardware_setup(void)
 
 void kvm_arch_hardware_unsetup(void)
 {
-       gmap_unregister_ipte_notifier(&gmap_notifier);
+       gmap_unregister_pte_notifier(&gmap_notifier);
+       gmap_unregister_pte_notifier(&vsie_gmap_notifier);
        atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
                                         &kvm_clock_notifier);
 }
 
+/*
+ * Mark CPU feature @nr as available by setting its bit (set_bit_inv
+ * numbering) in the kvm_s390_available_cpu_feat bitmap.
+ */
+static void allow_cpu_feat(unsigned long nr)
+{
+       set_bit_inv(nr, kvm_s390_available_cpu_feat);
+}
+
+/*
+ * Ask the PLO instruction whether subfunction @nr is available.
+ *
+ * Register 0 is loaded with @nr | 0x100 and PLO is issued in its "test bit"
+ * form; the resulting condition code is then extracted with ipm/srl.
+ *
+ * Returns 1 if the condition code is 0, otherwise 0.
+ *
+ * NOTE(review): cc is a write-only asm operand ("=d"), so the initializer
+ * below is overwritten by the asm and only documents the "not available"
+ * value.
+ */
+static inline int plo_test_bit(unsigned char nr)
+{
+       register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
+       int cc = 3; /* subfunction not available */
+
+       asm volatile(
+               /* Parameter registers are ignored for "test bit" */
+               "       plo     0,0,0,0(0)\n"
+               "       ipm     %0\n"
+               "       srl     %0,28\n"
+               : "=d" (cc)
+               : "d" (r0)
+               : "cc");
+       return cc == 0;
+}
+
+/*
+ * Probe the host and record what KVM can offer to guests:
+ *  - PLO subfunctions, via plo_test_bit() for each of the 256 codes
+ *  - PTFF and CPACF (MSA, MSA3, MSA4, MSA5) query results, each only when
+ *    the matching facility bit is installed
+ *  - CPU features, in kvm_s390_available_cpu_feat
+ *
+ * ESOP is offered whenever the machine has it. All other features are only
+ * offered when the "nested" module parameter is set and the host has SIEF2,
+ * ESOP, 64-bit SCAO and facility 3.
+ */
+static void kvm_s390_cpu_feat_init(void)
+{
+       int i;
+
+       for (i = 0; i < 256; ++i) {
+               /* bit i is stored MSB-first: byte i / 8, mask 0x80 >> (i % 8) */
+               if (plo_test_bit(i))
+                       kvm_s390_available_subfunc.plo[i >> 3] |= 0x80 >> (i & 7);
+       }
+
+       if (test_facility(28)) /* TOD-clock steering */
+               ptff(kvm_s390_available_subfunc.ptff,
+                    sizeof(kvm_s390_available_subfunc.ptff),
+                    PTFF_QAF);
+
+       if (test_facility(17)) { /* MSA */
+               __cpacf_query(CPACF_KMAC, kvm_s390_available_subfunc.kmac);
+               __cpacf_query(CPACF_KMC, kvm_s390_available_subfunc.kmc);
+               __cpacf_query(CPACF_KM, kvm_s390_available_subfunc.km);
+               __cpacf_query(CPACF_KIMD, kvm_s390_available_subfunc.kimd);
+               __cpacf_query(CPACF_KLMD, kvm_s390_available_subfunc.klmd);
+       }
+       if (test_facility(76)) /* MSA3 */
+               __cpacf_query(CPACF_PCKMO, kvm_s390_available_subfunc.pckmo);
+       if (test_facility(77)) { /* MSA4 */
+               __cpacf_query(CPACF_KMCTR, kvm_s390_available_subfunc.kmctr);
+               __cpacf_query(CPACF_KMF, kvm_s390_available_subfunc.kmf);
+               __cpacf_query(CPACF_KMO, kvm_s390_available_subfunc.kmo);
+               __cpacf_query(CPACF_PCC, kvm_s390_available_subfunc.pcc);
+       }
+       if (test_facility(57)) /* MSA5 */
+               __cpacf_query(CPACF_PPNO, kvm_s390_available_subfunc.ppno);
+
+       if (MACHINE_HAS_ESOP)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
+       /*
+        * We need SIE support, ESOP (PROT_READ protection for gmap_shadow),
+        * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing).
+        */
+       if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao ||
+           !test_facility(3) || !nested)
+               return;
+       allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2);
+       /* NOTE(review): always true, !has_64bscao returned early above */
+       if (sclp.has_64bscao)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO);
+       if (sclp.has_siif)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF);
+       if (sclp.has_gpere)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE);
+       if (sclp.has_gsls)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS);
+       if (sclp.has_ib)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB);
+       if (sclp.has_cei)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI);
+       if (sclp.has_ibs)
+               allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS);
+       /*
+        * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make
+        * all skey handling functions read/set the skey from the PGSTE
+        * instead of the real storage key.
+        *
+        * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will make
+        * pages being detected as preserved although they are resident.
+        *
+        * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will
+        * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY.
+        *
+        * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and
+        * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be
+        * correctly shadowed. We can do that for the PGSTE but not for PTE.I.
+        *
+        * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We
+        * cannot easily shadow the SCA because of the ipte lock.
+        */
+}
+
 int kvm_arch_init(void *opaque)
 {
        kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long));
@@ -199,6 +320,8 @@ int kvm_arch_init(void *opaque)
                return -ENOMEM;
        }
 
+       kvm_s390_cpu_feat_init();
+
        /* Register floating interrupt controller interface. */
        return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC);
 }
@@ -244,6 +367,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_S390_USER_STSI:
        case KVM_CAP_S390_SKEYS:
        case KVM_CAP_S390_IRQ_STATE:
+       case KVM_CAP_S390_USER_INSTR0:
                r = 1;
                break;
        case KVM_CAP_S390_MEM_OP:
@@ -251,8 +375,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                break;
        case KVM_CAP_NR_VCPUS:
        case KVM_CAP_MAX_VCPUS:
-               r = sclp.has_esca ? KVM_S390_ESCA_CPU_SLOTS
-                                 : KVM_S390_BSCA_CPU_SLOTS;
+               r = KVM_S390_BSCA_CPU_SLOTS;
+               if (sclp.has_esca && sclp.has_64bscao)
+                       r = KVM_S390_ESCA_CPU_SLOTS;
                break;
        case KVM_CAP_NR_MEMSLOTS:
                r = KVM_USER_MEM_SLOTS;
@@ -335,6 +460,16 @@ out:
        return r;
 }
 
+/* Issue kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC) for every vcpu of @kvm. */
+static void icpt_operexc_on_all_vcpus(struct kvm *kvm)
+{
+       struct kvm_vcpu *cur;
+       unsigned int idx;
+
+       kvm_for_each_vcpu(idx, cur, kvm)
+               kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC, cur);
+}
+
 static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 {
        int r;
@@ -355,7 +490,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
                break;
        case KVM_CAP_S390_VECTOR_REGISTERS:
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus)) {
+               if (kvm->created_vcpus) {
                        r = -EBUSY;
                } else if (MACHINE_HAS_VX) {
                        set_kvm_facility(kvm->arch.model.fac_mask, 129);
@@ -370,7 +505,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
        case KVM_CAP_S390_RI:
                r = -EINVAL;
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus)) {
+               if (kvm->created_vcpus) {
                        r = -EBUSY;
                } else if (test_facility(64)) {
                        set_kvm_facility(kvm->arch.model.fac_mask, 64);
@@ -386,6 +521,12 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
                kvm->arch.user_stsi = 1;
                r = 0;
                break;
+       case KVM_CAP_S390_USER_INSTR0:
+               VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_INSTR0");
+               kvm->arch.user_instr0 = 1;
+               icpt_operexc_on_all_vcpus(kvm);
+               r = 0;
+               break;
        default:
                r = -EINVAL;
                break;
@@ -418,21 +559,23 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
        unsigned int idx;
        switch (attr->attr) {
        case KVM_S390_VM_MEM_ENABLE_CMMA:
-               /* enable CMMA only for z10 and later (EDAT_1) */
-               ret = -EINVAL;
-               if (!MACHINE_IS_LPAR || !MACHINE_HAS_EDAT1)
+               ret = -ENXIO;
+               if (!sclp.has_cmma)
                        break;
 
                ret = -EBUSY;
                VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support");
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus) == 0) {
+               if (!kvm->created_vcpus) {
                        kvm->arch.use_cmma = 1;
                        ret = 0;
                }
                mutex_unlock(&kvm->lock);
                break;
        case KVM_S390_VM_MEM_CLR_CMMA:
+               ret = -ENXIO;
+               if (!sclp.has_cmma)
+                       break;
                ret = -EINVAL;
                if (!kvm->arch.use_cmma)
                        break;
@@ -461,20 +604,20 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
                if (!new_limit)
                        return -EINVAL;
 
-               /* gmap_alloc takes last usable address */
+               /* gmap_create takes last usable address */
                if (new_limit != KVM_S390_NO_MEM_LIMIT)
                        new_limit -= 1;
 
                ret = -EBUSY;
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus) == 0) {
-                       /* gmap_alloc will round the limit up */
-                       struct gmap *new = gmap_alloc(current->mm, new_limit);
+               if (!kvm->created_vcpus) {
+                       /* gmap_create will round the limit up */
+                       struct gmap *new = gmap_create(current->mm, new_limit);
 
                        if (!new) {
                                ret = -ENOMEM;
                        } else {
-                               gmap_free(kvm->arch.gmap);
+                               gmap_remove(kvm->arch.gmap);
                                new->private = kvm;
                                kvm->arch.gmap = new;
                                ret = 0;
@@ -644,7 +787,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
        int ret = 0;
 
        mutex_lock(&kvm->lock);
-       if (atomic_read(&kvm->online_vcpus)) {
+       if (kvm->created_vcpus) {
                ret = -EBUSY;
                goto out;
        }
@@ -676,6 +819,39 @@ out:
        return ret;
 }
 
+/*
+ * Set the CPU features of @kvm from the struct kvm_s390_vm_cpu_feat that
+ * user space placed at attr->addr.
+ *
+ * Returns 0 on success, -EFAULT if the user buffer cannot be read, -EINVAL
+ * if a requested feature is not part of kvm_s390_available_cpu_feat, and
+ * -EBUSY once vcpus have been created for this VM.
+ */
+static int kvm_s390_set_processor_feat(struct kvm *kvm,
+                                      struct kvm_device_attr *attr)
+{
+       struct kvm_s390_vm_cpu_feat data;
+       int ret = -EBUSY;
+
+       if (copy_from_user(&data, (void __user *)attr->addr, sizeof(data)))
+               return -EFAULT;
+       if (!bitmap_subset((unsigned long *) data.feat,
+                          kvm_s390_available_cpu_feat,
+                          KVM_S390_VM_CPU_FEAT_NR_BITS))
+               return -EINVAL;
+
+       mutex_lock(&kvm->lock);
+       /*
+        * Use created_vcpus, like every other "no vcpus yet" check taken
+        * under kvm->lock in this file. online_vcpus is only bumped at the
+        * end of vcpu creation, so it can miss a vcpu that is still being
+        * set up.
+        */
+       if (!kvm->created_vcpus) {
+               bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat,
+                           KVM_S390_VM_CPU_FEAT_NR_BITS);
+               ret = 0;
+       }
+       mutex_unlock(&kvm->lock);
+       return ret;
+}
+
+/*
+ * Placeholder for configuring the subfunctions of @kvm: nothing is stored
+ * and neither @kvm nor @attr is looked at. Always returns -ENXIO.
+ */
+static int kvm_s390_set_processor_subfunc(struct kvm *kvm,
+                                         struct kvm_device_attr *attr)
+{
+       /*
+        * Once supported by kernel + hw, we have to store the subfunctions
+        * in kvm->arch and remember that user space configured them.
+        */
+       return -ENXIO;
+}
+
 static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
 {
        int ret = -ENXIO;
@@ -684,6 +860,12 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
        case KVM_S390_VM_CPU_PROCESSOR:
                ret = kvm_s390_set_processor(kvm, attr);
                break;
+       case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+               ret = kvm_s390_set_processor_feat(kvm, attr);
+               break;
+       case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+               ret = kvm_s390_set_processor_subfunc(kvm, attr);
+               break;
        }
        return ret;
 }
@@ -732,6 +914,50 @@ out:
        return ret;
 }
 
+/*
+ * Copy the CPU feature bitmap configured for @kvm (kvm->arch.cpu_feat) to
+ * the user buffer at attr->addr. Returns 0, or -EFAULT if the copy fails.
+ */
+static int kvm_s390_get_processor_feat(struct kvm *kvm,
+                                      struct kvm_device_attr *attr)
+{
+       void __user *uaddr = (void __user *)attr->addr;
+       struct kvm_s390_vm_cpu_feat feat;
+
+       bitmap_copy((unsigned long *) feat.feat, kvm->arch.cpu_feat,
+                   KVM_S390_VM_CPU_FEAT_NR_BITS);
+       return copy_to_user(uaddr, &feat, sizeof(feat)) ? -EFAULT : 0;
+}
+
+/*
+ * Copy the bitmap of CPU features KVM can offer (kvm_s390_available_cpu_feat)
+ * to the user buffer at attr->addr. Returns 0, or -EFAULT if the copy fails.
+ */
+static int kvm_s390_get_machine_feat(struct kvm *kvm,
+                                    struct kvm_device_attr *attr)
+{
+       void __user *uaddr = (void __user *)attr->addr;
+       struct kvm_s390_vm_cpu_feat feat;
+
+       bitmap_copy((unsigned long *) feat.feat, kvm_s390_available_cpu_feat,
+                   KVM_S390_VM_CPU_FEAT_NR_BITS);
+       return copy_to_user(uaddr, &feat, sizeof(feat)) ? -EFAULT : 0;
+}
+
+/*
+ * Placeholder for reading back the subfunctions configured for @kvm:
+ * nothing is copied and neither @kvm nor @attr is looked at. Always
+ * returns -ENXIO.
+ */
+static int kvm_s390_get_processor_subfunc(struct kvm *kvm,
+                                         struct kvm_device_attr *attr)
+{
+       /*
+        * Once we can actually configure subfunctions (kernel + hw support),
+        * we have to check if they were already set by user space, if so copy
+        * them from kvm->arch.
+        */
+       return -ENXIO;
+}
+
+/*
+ * Copy the subfunction block kvm_s390_available_subfunc to the user buffer
+ * at attr->addr. Returns 0, or -EFAULT if the copy fails.
+ */
+static int kvm_s390_get_machine_subfunc(struct kvm *kvm,
+                                       struct kvm_device_attr *attr)
+{
+       void __user *uaddr = (void __user *)attr->addr;
+       size_t len = sizeof(struct kvm_s390_vm_cpu_subfunc);
+
+       if (copy_to_user(uaddr, &kvm_s390_available_subfunc, len))
+               return -EFAULT;
+       return 0;
+}
 static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
 {
        int ret = -ENXIO;
@@ -743,6 +969,18 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
        case KVM_S390_VM_CPU_MACHINE:
                ret = kvm_s390_get_machine(kvm, attr);
                break;
+       case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+               ret = kvm_s390_get_processor_feat(kvm, attr);
+               break;
+       case KVM_S390_VM_CPU_MACHINE_FEAT:
+               ret = kvm_s390_get_machine_feat(kvm, attr);
+               break;
+       case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+               ret = kvm_s390_get_processor_subfunc(kvm, attr);
+               break;
+       case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
+               ret = kvm_s390_get_machine_subfunc(kvm, attr);
+               break;
        }
        return ret;
 }
@@ -803,6 +1041,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
                switch (attr->attr) {
                case KVM_S390_VM_MEM_ENABLE_CMMA:
                case KVM_S390_VM_MEM_CLR_CMMA:
+                       ret = sclp.has_cmma ? 0 : -ENXIO;
+                       break;
                case KVM_S390_VM_MEM_LIMIT_SIZE:
                        ret = 0;
                        break;
@@ -826,8 +1066,13 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
                switch (attr->attr) {
                case KVM_S390_VM_CPU_PROCESSOR:
                case KVM_S390_VM_CPU_MACHINE:
+               case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+               case KVM_S390_VM_CPU_MACHINE_FEAT:
+               case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
                        ret = 0;
                        break;
+               /* configuring subfunctions is not supported yet */
+               case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
                default:
                        ret = -ENXIO;
                        break;
@@ -858,7 +1103,6 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 {
        uint8_t *keys;
        uint64_t hva;
-       unsigned long curkey;
        int i, r = 0;
 
        if (args->flags != 0)
@@ -879,26 +1123,27 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
        if (!keys)
                return -ENOMEM;
 
+       down_read(&current->mm->mmap_sem);
        for (i = 0; i < args->count; i++) {
                hva = gfn_to_hva(kvm, args->start_gfn + i);
                if (kvm_is_error_hva(hva)) {
                        r = -EFAULT;
-                       goto out;
+                       break;
                }
 
-               curkey = get_guest_storage_key(current->mm, hva);
-               if (IS_ERR_VALUE(curkey)) {
-                       r = curkey;
-                       goto out;
-               }
-               keys[i] = curkey;
+               r = get_guest_storage_key(current->mm, hva, &keys[i]);
+               if (r)
+                       break;
+       }
+       up_read(&current->mm->mmap_sem);
+
+       if (!r) {
+               r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
+                                sizeof(uint8_t) * args->count);
+               if (r)
+                       r = -EFAULT;
        }
 
-       r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
-                        sizeof(uint8_t) * args->count);
-       if (r)
-               r = -EFAULT;
-out:
        kvfree(keys);
        return r;
 }
@@ -935,24 +1180,25 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
        if (r)
                goto out;
 
+       down_read(&current->mm->mmap_sem);
        for (i = 0; i < args->count; i++) {
                hva = gfn_to_hva(kvm, args->start_gfn + i);
                if (kvm_is_error_hva(hva)) {
                        r = -EFAULT;
-                       goto out;
+                       break;
                }
 
                /* Lowest order bit is reserved */
                if (keys[i] & 0x01) {
                        r = -EINVAL;
-                       goto out;
+                       break;
                }
 
-               r = set_guest_storage_key(current->mm, hva,
-                                         (unsigned long)keys[i], 0);
+               r = set_guest_storage_key(current->mm, hva, keys[i], 0);
                if (r)
-                       goto out;
+                       break;
        }
+       up_read(&current->mm->mmap_sem);
 out:
        kvfree(keys);
        return r;
@@ -1129,6 +1375,7 @@ static void sca_dispose(struct kvm *kvm)
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
+       gfp_t alloc_flags = GFP_KERNEL;
        int i, rc;
        char debug_name[16];
        static unsigned long sca_offset;
@@ -1150,9 +1397,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        rc = -ENOMEM;
 
+       ratelimit_state_init(&kvm->arch.sthyi_limit, 5 * HZ, 500);
+
        kvm->arch.use_esca = 0; /* start with basic SCA */
+       if (!sclp.has_64bscao)
+               alloc_flags |= GFP_DMA;
        rwlock_init(&kvm->arch.sca_lock);
-       kvm->arch.sca = (struct bsca_block *) get_zeroed_page(GFP_KERNEL);
+       kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
        if (!kvm->arch.sca)
                goto out_err;
        spin_lock(&kvm_lock);
@@ -1189,6 +1440,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask,
               S390_ARCH_FAC_LIST_SIZE_BYTE);
 
+       set_kvm_facility(kvm->arch.model.fac_mask, 74);
+       set_kvm_facility(kvm->arch.model.fac_list, 74);
+
        kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid();
        kvm->arch.model.ibc = sclp.ibc & 0x0fff;
 
@@ -1212,7 +1466,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
                else
                        kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE,
                                                    sclp.hamax + 1);
-               kvm->arch.gmap = gmap_alloc(current->mm, kvm->arch.mem_limit - 1);
+               kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1);
                if (!kvm->arch.gmap)
                        goto out_err;
                kvm->arch.gmap->private = kvm;
@@ -1224,6 +1478,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        kvm->arch.epoch = 0;
 
        spin_lock_init(&kvm->arch.start_stop_lock);
+       kvm_s390_vsie_init(kvm);
        KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
 
        return 0;
@@ -1245,7 +1500,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
                sca_del_vcpu(vcpu);
 
        if (kvm_is_ucontrol(vcpu->kvm))
-               gmap_free(vcpu->arch.gmap);
+               gmap_remove(vcpu->arch.gmap);
 
        if (vcpu->kvm->arch.use_cmma)
                kvm_s390_vcpu_unsetup_cmma(vcpu);
@@ -1278,16 +1533,17 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        debug_unregister(kvm->arch.dbf);
        free_page((unsigned long)kvm->arch.sie_page2);
        if (!kvm_is_ucontrol(kvm))
-               gmap_free(kvm->arch.gmap);
+               gmap_remove(kvm->arch.gmap);
        kvm_s390_destroy_adapters(kvm);
        kvm_s390_clear_float_irqs(kvm);
+       kvm_s390_vsie_destroy(kvm);
        KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
 }
 
 /* Section: vcpu related */
 static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
 {
-       vcpu->arch.gmap = gmap_alloc(current->mm, -1UL);
+       vcpu->arch.gmap = gmap_create(current->mm, -1UL);
        if (!vcpu->arch.gmap)
                return -ENOMEM;
        vcpu->arch.gmap->private = vcpu->kvm;
@@ -1396,7 +1652,7 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
 
        if (id < KVM_S390_BSCA_CPU_SLOTS)
                return true;
-       if (!sclp.has_esca)
+       if (!sclp.has_esca || !sclp.has_64bscao)
                return false;
 
        mutex_lock(&kvm->lock);
@@ -1537,7 +1793,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
        save_access_regs(vcpu->arch.host_acrs);
        restore_access_regs(vcpu->run->s.regs.acrs);
-       gmap_enable(vcpu->arch.gmap);
+       gmap_enable(vcpu->arch.enabled_gmap);
        atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
        if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
                __start_cpu_timer_accounting(vcpu);
@@ -1550,7 +1806,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
        if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
                __stop_cpu_timer_accounting(vcpu);
        atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
-       gmap_disable(vcpu->arch.gmap);
+       vcpu->arch.enabled_gmap = gmap_get_enabled();
+       gmap_disable(vcpu->arch.enabled_gmap);
 
        /* Save guest register state */
        save_fpu_regs();
@@ -1599,7 +1856,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
                vcpu->arch.gmap = vcpu->kvm->arch.gmap;
                sca_add_vcpu(vcpu);
        }
-
+       if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0)
+               vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
+       /* make vcpu_load load the right gmap on the first trigger */
+       vcpu->arch.enabled_gmap = vcpu->arch.gmap;
 }
 
 static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
@@ -1658,15 +1918,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
        kvm_s390_vcpu_setup_model(vcpu);
 
-       vcpu->arch.sie_block->ecb = 0x02;
+       /* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */
+       if (MACHINE_HAS_ESOP)
+               vcpu->arch.sie_block->ecb |= 0x02;
        if (test_kvm_facility(vcpu->kvm, 9))
                vcpu->arch.sie_block->ecb |= 0x04;
-       if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73))
+       if (test_kvm_facility(vcpu->kvm, 73))
                vcpu->arch.sie_block->ecb |= 0x10;
 
-       if (test_kvm_facility(vcpu->kvm, 8))
+       if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi)
                vcpu->arch.sie_block->ecb2 |= 0x08;
-       vcpu->arch.sie_block->eca   = 0xC1002000U;
+       vcpu->arch.sie_block->eca = 0x1002000U;
+       if (sclp.has_cei)
+               vcpu->arch.sie_block->eca |= 0x80000000U;
+       if (sclp.has_ib)
+               vcpu->arch.sie_block->eca |= 0x40000000U;
        if (sclp.has_siif)
                vcpu->arch.sie_block->eca |= 1;
        if (sclp.has_sigpif)
@@ -1716,6 +1982,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
        vcpu->arch.sie_block = &sie_page->sie_block;
        vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb;
 
+       /* the real guest size will always be smaller than msl */
+       vcpu->arch.sie_block->mso = 0;
+       vcpu->arch.sie_block->msl = sclp.hamax;
+
        vcpu->arch.sie_block->icpua = id;
        spin_lock_init(&vcpu->arch.local_int.lock);
        vcpu->arch.local_int.float_int = &kvm->arch.float_int;
@@ -1784,16 +2054,25 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
        kvm_s390_vcpu_request(vcpu);
 }
 
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+                             unsigned long end)
 {
-       int i;
        struct kvm *kvm = gmap->private;
        struct kvm_vcpu *vcpu;
+       unsigned long prefix;
+       int i;
 
+       if (gmap_is_shadow(gmap))
+               return;
+       if (start >= 1UL << 31)
+               /* We are only interested in prefix pages */
+               return;
        kvm_for_each_vcpu(i, vcpu, kvm) {
                /* match against both prefix pages */
-               if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) {
-                       VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
+               prefix = kvm_s390_get_prefix(vcpu);
+               if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) {
+                       VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx",
+                                  start, end);
                        kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu);
                }
        }
@@ -2002,6 +2281,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 
        if (dbg->control & ~VALID_GUESTDBG_FLAGS)
                return -EINVAL;
+       if (!sclp.has_gpere)
+               return -EINVAL;
 
        if (dbg->control & KVM_GUESTDBG_ENABLE) {
                vcpu->guest_debug = dbg->control;
@@ -2070,16 +2351,16 @@ retry:
                return 0;
        /*
         * We use MMU_RELOAD just to re-arm the ipte notifier for the
-        * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
+        * guest prefix page. gmap_mprotect_notify will wait on the ptl lock.
         * This ensures that the ipte instruction for this request has
         * already finished. We might race against a second unmapper that
         * wants to set the blocking bit. Lets just retry the request loop.
         */
        if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
                int rc;
-               rc = gmap_ipte_notify(vcpu->arch.gmap,
-                                     kvm_s390_get_prefix(vcpu),
-                                     PAGE_SIZE * 2);
+               rc = gmap_mprotect_notify(vcpu->arch.gmap,
+                                         kvm_s390_get_prefix(vcpu),
+                                         PAGE_SIZE * 2, PROT_WRITE);
                if (rc)
                        return rc;
                goto retry;
@@ -2108,6 +2389,11 @@ retry:
                goto retry;
        }
 
+       if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) {
+               vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
+               goto retry;
+       }
+
        /* nothing to do, just clear the request */
        clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 
@@ -2362,14 +2648,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                 * guest_enter and guest_exit should be no uaccess.
                 */
                local_irq_disable();
-               __kvm_guest_enter();
+               guest_enter_irqoff();
                __disable_cpu_timer_accounting(vcpu);
                local_irq_enable();
                exit_reason = sie64a(vcpu->arch.sie_block,
                                     vcpu->run->s.regs.gprs);
                local_irq_disable();
                __enable_cpu_timer_accounting(vcpu);
-               __kvm_guest_exit();
+               guest_exit_irqoff();
                local_irq_enable();
                vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 
@@ -2598,6 +2884,8 @@ static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
 
 static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
 {
+       if (!sclp.has_ibs)
+               return;
        kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu);
        kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu);
 }
index 8621ab0..b843286 100644 (file)
@@ -56,7 +56,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
 
 static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
 {
-       return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_WAIT;
+       return test_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
 }
 
 static inline int kvm_is_ucontrol(struct kvm *kvm)
@@ -175,6 +175,12 @@ static inline int set_kvm_facility(u64 *fac_list, unsigned long nr)
        return 0;
 }
 
+/*
+ * Test whether CPU feature @nr is set in kvm->arch.cpu_feat (test_bit_inv
+ * numbering). Warns once if @nr is not below KVM_S390_VM_CPU_FEAT_NR_BITS,
+ * but still performs the lookup.
+ */
+static inline int test_kvm_cpu_feat(struct kvm *kvm, unsigned long nr)
+{
+       WARN_ON_ONCE(nr >= KVM_S390_VM_CPU_FEAT_NR_BITS);
+       return test_bit_inv(nr, kvm->arch.cpu_feat);
+}
+
 /* are cpu states controlled by user space */
 static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
 {
@@ -232,6 +238,8 @@ static inline void kvm_s390_forward_psw(struct kvm_vcpu *vcpu, int ilen)
 }
 static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu)
 {
+       /* don't inject PER events if we re-execute the instruction */
+       vcpu->arch.sie_block->icptstatus &= ~0x02;
        kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu));
 }
 
@@ -246,10 +254,21 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_eb(struct kvm_vcpu *vcpu);
 
+/* implemented in vsie.c */
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+                                unsigned long end);
+void kvm_s390_vsie_init(struct kvm *kvm);
+void kvm_s390_vsie_destroy(struct kvm *kvm);
+
 /* implemented in sigp.c */
 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
 
+/* implemented in sthyi.c */
+int handle_sthyi(struct kvm_vcpu *vcpu);
+
 /* implemented in kvm-s390.c */
 void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
 long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
@@ -360,6 +379,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
                            struct kvm_guest_debug *dbg);
 void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu);
 void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu);
 void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu);
 
 /* support for Basic/Extended SCA handling */
index 95916fa..4616038 100644 (file)
@@ -27,6 +27,7 @@
 #include <asm/io.h>
 #include <asm/ptrace.h>
 #include <asm/compat.h>
+#include <asm/sclp.h>
 #include "gaccess.h"
 #include "kvm-s390.h"
 #include "trace.h"
@@ -152,30 +153,166 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
 static int __skey_check_enable(struct kvm_vcpu *vcpu)
 {
        int rc = 0;
+
+       trace_kvm_s390_skey_related_inst(vcpu);
        if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)))
                return rc;
 
        rc = s390_enable_skey();
-       VCPU_EVENT(vcpu, 3, "%s", "enabling storage keys for guest");
-       trace_kvm_s390_skey_related_inst(vcpu);
-       vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+       VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc);
+       if (!rc)
+               vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
        return rc;
 }
 
-
-static int handle_skey(struct kvm_vcpu *vcpu)
+static int try_handle_skey(struct kvm_vcpu *vcpu)
 {
-       int rc = __skey_check_enable(vcpu);
+       int rc;
 
+       vcpu->stat.instruction_storage_key++;
+       rc = __skey_check_enable(vcpu);
        if (rc)
                return rc;
-       vcpu->stat.instruction_storage_key++;
-
+       if (sclp.has_skey) {
+               /* with storage-key facility, SIE interprets it for us */
+               kvm_s390_retry_instr(vcpu);
+               VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
+               return -EAGAIN;
+       }
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+       return 0;
+}
 
-       kvm_s390_retry_instr(vcpu);
-       VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
+static int handle_iske(struct kvm_vcpu *vcpu)
+{
+       unsigned long addr;
+       unsigned char key;
+       int reg1, reg2;
+       int rc;
+
+       rc = try_handle_skey(vcpu);
+       if (rc)
+               return rc != -EAGAIN ? rc : 0;
+
+       kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+       addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+       addr = kvm_s390_logical_to_effective(vcpu, addr);
+       addr = kvm_s390_real_to_abs(vcpu, addr);
+       addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
+       if (kvm_is_error_hva(addr))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+       down_read(&current->mm->mmap_sem);
+       rc = get_guest_storage_key(current->mm, addr, &key);
+       up_read(&current->mm->mmap_sem);
+       if (rc)
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       vcpu->run->s.regs.gprs[reg1] &= ~0xff;
+       vcpu->run->s.regs.gprs[reg1] |= key;
+       return 0;
+}
+
+static int handle_rrbe(struct kvm_vcpu *vcpu)
+{
+       unsigned long addr;
+       int reg1, reg2;
+       int rc;
+
+       rc = try_handle_skey(vcpu);
+       if (rc)
+               return rc != -EAGAIN ? rc : 0;
+
+       kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+       addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+       addr = kvm_s390_logical_to_effective(vcpu, addr);
+       addr = kvm_s390_real_to_abs(vcpu, addr);
+       addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
+       if (kvm_is_error_hva(addr))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+       down_read(&current->mm->mmap_sem);
+       rc = reset_guest_reference_bit(current->mm, addr);
+       up_read(&current->mm->mmap_sem);
+       if (rc < 0)
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+       kvm_s390_set_psw_cc(vcpu, rc);
+       return 0;
+}
+
+#define SSKE_NQ 0x8
+#define SSKE_MR 0x4
+#define SSKE_MC 0x2
+#define SSKE_MB 0x1
+static int handle_sske(struct kvm_vcpu *vcpu)
+{
+       unsigned char m3 = vcpu->arch.sie_block->ipb >> 28;
+       unsigned long start, end;
+       unsigned char key, oldkey;
+       int reg1, reg2;
+       int rc;
+
+       rc = try_handle_skey(vcpu);
+       if (rc)
+               return rc != -EAGAIN ? rc : 0;
+
+       if (!test_kvm_facility(vcpu->kvm, 8))
+               m3 &= ~SSKE_MB;
+       if (!test_kvm_facility(vcpu->kvm, 10))
+               m3 &= ~(SSKE_MC | SSKE_MR);
+       if (!test_kvm_facility(vcpu->kvm, 14))
+               m3 &= ~SSKE_NQ;
+
+       kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+       key = vcpu->run->s.regs.gprs[reg1] & 0xfe;
+       start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+       start = kvm_s390_logical_to_effective(vcpu, start);
+       if (m3 & SSKE_MB) {
+               /* start already designates an absolute address */
+               end = (start + (1UL << 20)) & ~((1UL << 20) - 1);
+       } else {
+               start = kvm_s390_real_to_abs(vcpu, start);
+               end = start + PAGE_SIZE;
+       }
+
+       while (start != end) {
+               unsigned long addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
+
+               if (kvm_is_error_hva(addr))
+                       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+               down_read(&current->mm->mmap_sem);
+               rc = cond_set_guest_storage_key(current->mm, addr, key, &oldkey,
+                                               m3 & SSKE_NQ, m3 & SSKE_MR,
+                                               m3 & SSKE_MC);
+               up_read(&current->mm->mmap_sem);
+               if (rc < 0)
+                       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+               start += PAGE_SIZE;
+       };
+
+       if (m3 & (SSKE_MC | SSKE_MR)) {
+               if (m3 & SSKE_MB) {
+                       /* skey in reg1 is unpredictable */
+                       kvm_s390_set_psw_cc(vcpu, 3);
+               } else {
+                       kvm_s390_set_psw_cc(vcpu, rc);
+                       vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL;
+                       vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8;
+               }
+       }
+       if (m3 & SSKE_MB) {
+               if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT)
+                       vcpu->run->s.regs.gprs[reg2] &= ~PAGE_MASK;
+               else
+                       vcpu->run->s.regs.gprs[reg2] &= ~0xfffff000UL;
+               end = kvm_s390_logical_to_effective(vcpu, end);
+               vcpu->run->s.regs.gprs[reg2] |= end;
+       }
        return 0;
 }
 
@@ -582,10 +719,11 @@ static const intercept_handler_t b2_handlers[256] = {
        [0x10] = handle_set_prefix,
        [0x11] = handle_store_prefix,
        [0x12] = handle_store_cpu_address,
+       [0x14] = kvm_s390_handle_vsie,
        [0x21] = handle_ipte_interlock,
-       [0x29] = handle_skey,
-       [0x2a] = handle_skey,
-       [0x2b] = handle_skey,
+       [0x29] = handle_iske,
+       [0x2a] = handle_rrbe,
+       [0x2b] = handle_sske,
        [0x2c] = handle_test_block,
        [0x30] = handle_io_inst,
        [0x31] = handle_io_inst,
@@ -654,8 +792,10 @@ static int handle_epsw(struct kvm_vcpu *vcpu)
 
 static int handle_pfmf(struct kvm_vcpu *vcpu)
 {
+       bool mr = false, mc = false, nq;
        int reg1, reg2;
        unsigned long start, end;
+       unsigned char key;
 
        vcpu->stat.instruction_pfmf++;
 
@@ -675,15 +815,27 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
            !test_kvm_facility(vcpu->kvm, 14))
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-       /* No support for conditional-SSKE */
-       if (vcpu->run->s.regs.gprs[reg1] & (PFMF_MR | PFMF_MC))
-               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+       /* Only provide conditional-SSKE support if enabled for the guest */
+       if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK &&
+           test_kvm_facility(vcpu->kvm, 10)) {
+               mr = vcpu->run->s.regs.gprs[reg1] & PFMF_MR;
+               mc = vcpu->run->s.regs.gprs[reg1] & PFMF_MC;
+       }
 
+       nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ;
+       key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY;
        start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
        start = kvm_s390_logical_to_effective(vcpu, start);
 
+       if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
+               if (kvm_s390_check_low_addr_prot_real(vcpu, start))
+                       return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
+       }
+
        switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
        case 0x00000000:
+               /* only 4k frames specify a real address */
+               start = kvm_s390_real_to_abs(vcpu, start);
                end = (start + (1UL << 12)) & ~((1UL << 12) - 1);
                break;
        case 0x00001000:
@@ -701,20 +853,11 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
        }
 
-       if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
-               if (kvm_s390_check_low_addr_prot_real(vcpu, start))
-                       return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
-       }
-
-       while (start < end) {
-               unsigned long useraddr, abs_addr;
+       while (start != end) {
+               unsigned long useraddr;
 
                /* Translate guest address to host address */
-               if ((vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) == 0)
-                       abs_addr = kvm_s390_real_to_abs(vcpu, start);
-               else
-                       abs_addr = start;
-               useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(abs_addr));
+               useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
                if (kvm_is_error_hva(useraddr))
                        return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 
@@ -728,16 +871,25 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 
                        if (rc)
                                return rc;
-                       if (set_guest_storage_key(current->mm, useraddr,
-                                       vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
-                                       vcpu->run->s.regs.gprs[reg1] & PFMF_NQ))
+                       down_read(&current->mm->mmap_sem);
+                       rc = cond_set_guest_storage_key(current->mm, useraddr,
+                                                       key, NULL, nq, mr, mc);
+                       up_read(&current->mm->mmap_sem);
+                       if (rc < 0)
                                return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
                }
 
                start += PAGE_SIZE;
        }
-       if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC)
-               vcpu->run->s.regs.gprs[reg2] = end;
+       if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
+               if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT) {
+                       vcpu->run->s.regs.gprs[reg2] = end;
+               } else {
+                       vcpu->run->s.regs.gprs[reg2] &= ~0xffffffffUL;
+                       end = kvm_s390_logical_to_effective(vcpu, end);
+                       vcpu->run->s.regs.gprs[reg2] |= end;
+               }
+       }
        return 0;
 }
 
@@ -1033,7 +1185,15 @@ static int handle_sckpf(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static int handle_ptff(struct kvm_vcpu *vcpu)
+{
+       /* we don't emulate any control instructions yet */
+       kvm_s390_set_psw_cc(vcpu, 3);
+       return 0;
+}
+
 static const intercept_handler_t x01_handlers[256] = {
+       [0x04] = handle_ptff,
        [0x07] = handle_sckpf,
 };
 
index 28ea0ca..1a252f5 100644 (file)
@@ -77,18 +77,18 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu,
        const u64 psw_int_mask = PSW_MASK_IO | PSW_MASK_EXT;
        u16 p_asn, s_asn;
        psw_t *psw;
-       u32 flags;
+       bool idle;
 
-       flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags);
+       idle = is_vcpu_idle(vcpu);
        psw = &dst_vcpu->arch.sie_block->gpsw;
        p_asn = dst_vcpu->arch.sie_block->gcr[4] & 0xffff;  /* Primary ASN */
        s_asn = dst_vcpu->arch.sie_block->gcr[3] & 0xffff;  /* Secondary ASN */
 
        /* Inject the emergency signal? */
-       if (!(flags & CPUSTAT_STOPPED)
+       if (!is_vcpu_stopped(vcpu)
            || (psw->mask & psw_int_mask) != psw_int_mask
-           || ((flags & CPUSTAT_WAIT) && psw->addr != 0)
-           || (!(flags & CPUSTAT_WAIT) && (asn == p_asn || asn == s_asn))) {
+           || (idle && psw->addr != 0)
+           || (!idle && (asn == p_asn || asn == s_asn))) {
                return __inject_sigp_emergency(vcpu, dst_vcpu);
        } else {
                *reg &= 0xffffffff00000000UL;
diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c
new file mode 100644 (file)
index 0000000..bd98b7d
--- /dev/null
@@ -0,0 +1,471 @@
+/*
+ * store hypervisor information instruction emulation functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Janosch Frank <frankja@linux.vnet.ibm.com>
+ */
+#include <linux/kvm_host.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/ratelimit.h>
+
+#include <asm/kvm_host.h>
+#include <asm/asm-offsets.h>
+#include <asm/sclp.h>
+#include <asm/diag.h>
+#include <asm/sysinfo.h>
+#include <asm/ebcdic.h>
+
+#include "kvm-s390.h"
+#include "gaccess.h"
+#include "trace.h"
+
+#define DED_WEIGHT 0xffff
+/*
+ * CP and IFL as EBCDIC strings, SP/0x40 determines the end of string
+ * as they are justified with spaces.
+ */
+#define CP  0xc3d7404040404040UL
+#define IFL 0xc9c6d34040404040UL
+
+enum hdr_flags {
+       HDR_NOT_LPAR   = 0x10,
+       HDR_STACK_INCM = 0x20,
+       HDR_STSI_UNAV  = 0x40,
+       HDR_PERF_UNAV  = 0x80,
+};
+
+enum mac_validity {
+       MAC_NAME_VLD = 0x20,
+       MAC_ID_VLD   = 0x40,
+       MAC_CNT_VLD  = 0x80,
+};
+
+enum par_flag {
+       PAR_MT_EN = 0x80,
+};
+
+enum par_validity {
+       PAR_GRP_VLD  = 0x08,
+       PAR_ID_VLD   = 0x10,
+       PAR_ABS_VLD  = 0x20,
+       PAR_WGHT_VLD = 0x40,
+       PAR_PCNT_VLD  = 0x80,
+};
+
+struct hdr_sctn {
+       u8 infhflg1;
+       u8 infhflg2; /* reserved */
+       u8 infhval1; /* reserved */
+       u8 infhval2; /* reserved */
+       u8 reserved[3];
+       u8 infhygct;
+       u16 infhtotl;
+       u16 infhdln;
+       u16 infmoff;
+       u16 infmlen;
+       u16 infpoff;
+       u16 infplen;
+       u16 infhoff1;
+       u16 infhlen1;
+       u16 infgoff1;
+       u16 infglen1;
+       u16 infhoff2;
+       u16 infhlen2;
+       u16 infgoff2;
+       u16 infglen2;
+       u16 infhoff3;
+       u16 infhlen3;
+       u16 infgoff3;
+       u16 infglen3;
+       u8 reserved2[4];
+} __packed;
+
+struct mac_sctn {
+       u8 infmflg1; /* reserved */
+       u8 infmflg2; /* reserved */
+       u8 infmval1;
+       u8 infmval2; /* reserved */
+       u16 infmscps;
+       u16 infmdcps;
+       u16 infmsifl;
+       u16 infmdifl;
+       char infmname[8];
+       char infmtype[4];
+       char infmmanu[16];
+       char infmseq[16];
+       char infmpman[4];
+       u8 reserved[4];
+} __packed;
+
+struct par_sctn {
+       u8 infpflg1;
+       u8 infpflg2; /* reserved */
+       u8 infpval1;
+       u8 infpval2; /* reserved */
+       u16 infppnum;
+       u16 infpscps;
+       u16 infpdcps;
+       u16 infpsifl;
+       u16 infpdifl;
+       u16 reserved;
+       char infppnam[8];
+       u32 infpwbcp;
+       u32 infpabcp;
+       u32 infpwbif;
+       u32 infpabif;
+       char infplgnm[8];
+       u32 infplgcp;
+       u32 infplgif;
+} __packed;
+
+struct sthyi_sctns {
+       struct hdr_sctn hdr;
+       struct mac_sctn mac;
+       struct par_sctn par;
+} __packed;
+
+struct cpu_inf {
+       u64 lpar_cap;
+       u64 lpar_grp_cap;
+       u64 lpar_weight;
+       u64 all_weight;
+       int cpu_num_ded;
+       int cpu_num_shd;
+};
+
+struct lpar_cpu_inf {
+       struct cpu_inf cp;
+       struct cpu_inf ifl;
+};
+
+static inline u64 cpu_id(u8 ctidx, void *diag224_buf)
+{
+       return *((u64 *)(diag224_buf + (ctidx + 1) * DIAG204_CPU_NAME_LEN));
+}
+
+/*
+ * Scales the cpu capping from the lpar range to the one expected in
+ * sthyi data.
+ *
+ * diag204 reports a cap in hundredths of processor units.
+ * z/VM's range for one core is 0 - 0x10000.
+ */
+static u32 scale_cap(u32 in)
+{
+       return (0x10000 * in) / 100;
+}
+
+static void fill_hdr(struct sthyi_sctns *sctns)
+{
+       sctns->hdr.infhdln = sizeof(sctns->hdr);
+       sctns->hdr.infmoff = sizeof(sctns->hdr);
+       sctns->hdr.infmlen = sizeof(sctns->mac);
+       sctns->hdr.infplen = sizeof(sctns->par);
+       sctns->hdr.infpoff = sctns->hdr.infhdln + sctns->hdr.infmlen;
+       sctns->hdr.infhtotl = sctns->hdr.infpoff + sctns->hdr.infplen;
+}
+
+static void fill_stsi_mac(struct sthyi_sctns *sctns,
+                         struct sysinfo_1_1_1 *sysinfo)
+{
+       if (stsi(sysinfo, 1, 1, 1))
+               return;
+
+       sclp_ocf_cpc_name_copy(sctns->mac.infmname);
+
+       memcpy(sctns->mac.infmtype, sysinfo->type, sizeof(sctns->mac.infmtype));
+       memcpy(sctns->mac.infmmanu, sysinfo->manufacturer, sizeof(sctns->mac.infmmanu));
+       memcpy(sctns->mac.infmpman, sysinfo->plant, sizeof(sctns->mac.infmpman));
+       memcpy(sctns->mac.infmseq, sysinfo->sequence, sizeof(sctns->mac.infmseq));
+
+       sctns->mac.infmval1 |= MAC_ID_VLD | MAC_NAME_VLD;
+}
+
+static void fill_stsi_par(struct sthyi_sctns *sctns,
+                         struct sysinfo_2_2_2 *sysinfo)
+{
+       if (stsi(sysinfo, 2, 2, 2))
+               return;
+
+       sctns->par.infppnum = sysinfo->lpar_number;
+       memcpy(sctns->par.infppnam, sysinfo->name, sizeof(sctns->par.infppnam));
+
+       sctns->par.infpval1 |= PAR_ID_VLD;
+}
+
+static void fill_stsi(struct sthyi_sctns *sctns)
+{
+       void *sysinfo;
+
+       /* Errors are handled through the validity bits in the response. */
+       sysinfo = (void *)__get_free_page(GFP_KERNEL);
+       if (!sysinfo)
+               return;
+
+       fill_stsi_mac(sctns, sysinfo);
+       fill_stsi_par(sctns, sysinfo);
+
+       free_pages((unsigned long)sysinfo, 0);
+}
+
+static void fill_diag_mac(struct sthyi_sctns *sctns,
+                         struct diag204_x_phys_block *block,
+                         void *diag224_buf)
+{
+       int i;
+
+       for (i = 0; i < block->hdr.cpus; i++) {
+               switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) {
+               case CP:
+                       if (block->cpus[i].weight == DED_WEIGHT)
+                               sctns->mac.infmdcps++;
+                       else
+                               sctns->mac.infmscps++;
+                       break;
+               case IFL:
+                       if (block->cpus[i].weight == DED_WEIGHT)
+                               sctns->mac.infmdifl++;
+                       else
+                               sctns->mac.infmsifl++;
+                       break;
+               }
+       }
+       sctns->mac.infmval1 |= MAC_CNT_VLD;
+}
+
+/* Returns a pointer to the next partition block. */
+static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf,
+                                                bool this_lpar,
+                                                void *diag224_buf,
+                                                struct diag204_x_part_block *block)
+{
+       int i, capped = 0, weight_cp = 0, weight_ifl = 0;
+       struct cpu_inf *cpu_inf;
+
+       for (i = 0; i < block->hdr.rcpus; i++) {
+               if (!(block->cpus[i].cflag & DIAG204_CPU_ONLINE))
+                       continue;
+
+               switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) {
+               case CP:
+                       cpu_inf = &part_inf->cp;
+                       if (block->cpus[i].cur_weight < DED_WEIGHT)
+                               weight_cp |= block->cpus[i].cur_weight;
+                       break;
+               case IFL:
+                       cpu_inf = &part_inf->ifl;
+                       if (block->cpus[i].cur_weight < DED_WEIGHT)
+                               weight_ifl |= block->cpus[i].cur_weight;
+                       break;
+               default:
+                       continue;
+               }
+
+               if (!this_lpar)
+                       continue;
+
+               capped |= block->cpus[i].cflag & DIAG204_CPU_CAPPED;
+               cpu_inf->lpar_cap |= block->cpus[i].cpu_type_cap;
+               cpu_inf->lpar_grp_cap |= block->cpus[i].group_cpu_type_cap;
+
+               if (block->cpus[i].weight == DED_WEIGHT)
+                       cpu_inf->cpu_num_ded += 1;
+               else
+                       cpu_inf->cpu_num_shd += 1;
+       }
+
+       if (this_lpar && capped) {
+               part_inf->cp.lpar_weight = weight_cp;
+               part_inf->ifl.lpar_weight = weight_ifl;
+       }
+       part_inf->cp.all_weight += weight_cp;
+       part_inf->ifl.all_weight += weight_ifl;
+       return (struct diag204_x_part_block *)&block->cpus[i];
+}
+
+static void fill_diag(struct sthyi_sctns *sctns)
+{
+       int i, r, pages;
+       bool this_lpar;
+       void *diag204_buf;
+       void *diag224_buf = NULL;
+       struct diag204_x_info_blk_hdr *ti_hdr;
+       struct diag204_x_part_block *part_block;
+       struct diag204_x_phys_block *phys_block;
+       struct lpar_cpu_inf lpar_inf = {};
+
+       /* Errors are handled through the validity bits in the response. */
+       pages = diag204((unsigned long)DIAG204_SUBC_RSI |
+                       (unsigned long)DIAG204_INFO_EXT, 0, NULL);
+       if (pages <= 0)
+               return;
+
+       diag204_buf = vmalloc(PAGE_SIZE * pages);
+       if (!diag204_buf)
+               return;
+
+       r = diag204((unsigned long)DIAG204_SUBC_STIB7 |
+                   (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf);
+       if (r < 0)
+               goto out;
+
+       diag224_buf = kmalloc(PAGE_SIZE, GFP_KERNEL | GFP_DMA);
+       if (!diag224_buf || diag224(diag224_buf))
+               goto out;
+
+       ti_hdr = diag204_buf;
+       part_block = diag204_buf + sizeof(*ti_hdr);
+
+       for (i = 0; i < ti_hdr->npar; i++) {
+               /*
+                * For the calling lpar we also need to get the cpu
+                * caps and weights. The time information block header
+                * specifies the offset to the partition block of the
+                * caller lpar, so we know when we process its data.
+                */
+               this_lpar = (void *)part_block - diag204_buf == ti_hdr->this_part;
+               part_block = lpar_cpu_inf(&lpar_inf, this_lpar, diag224_buf,
+                                         part_block);
+       }
+
+       phys_block = (struct diag204_x_phys_block *)part_block;
+       part_block = diag204_buf + ti_hdr->this_part;
+       if (part_block->hdr.mtid)
+               sctns->par.infpflg1 = PAR_MT_EN;
+
+       sctns->par.infpval1 |= PAR_GRP_VLD;
+       sctns->par.infplgcp = scale_cap(lpar_inf.cp.lpar_grp_cap);
+       sctns->par.infplgif = scale_cap(lpar_inf.ifl.lpar_grp_cap);
+       memcpy(sctns->par.infplgnm, part_block->hdr.hardware_group_name,
+              sizeof(sctns->par.infplgnm));
+
+       sctns->par.infpscps = lpar_inf.cp.cpu_num_shd;
+       sctns->par.infpdcps = lpar_inf.cp.cpu_num_ded;
+       sctns->par.infpsifl = lpar_inf.ifl.cpu_num_shd;
+       sctns->par.infpdifl = lpar_inf.ifl.cpu_num_ded;
+       sctns->par.infpval1 |= PAR_PCNT_VLD;
+
+       sctns->par.infpabcp = scale_cap(lpar_inf.cp.lpar_cap);
+       sctns->par.infpabif = scale_cap(lpar_inf.ifl.lpar_cap);
+       sctns->par.infpval1 |= PAR_ABS_VLD;
+
+       /*
+        * Everything below needs global performance data to be
+        * meaningful.
+        */
+       if (!(ti_hdr->flags & DIAG204_LPAR_PHYS_FLG)) {
+               sctns->hdr.infhflg1 |= HDR_PERF_UNAV;
+               goto out;
+       }
+
+       fill_diag_mac(sctns, phys_block, diag224_buf);
+
+       if (lpar_inf.cp.lpar_weight) {
+               sctns->par.infpwbcp = sctns->mac.infmscps * 0x10000 *
+                       lpar_inf.cp.lpar_weight / lpar_inf.cp.all_weight;
+       }
+
+       if (lpar_inf.ifl.lpar_weight) {
+               sctns->par.infpwbif = sctns->mac.infmsifl * 0x10000 *
+                       lpar_inf.ifl.lpar_weight / lpar_inf.ifl.all_weight;
+       }
+       sctns->par.infpval1 |= PAR_WGHT_VLD;
+
+out:
+       kfree(diag224_buf);
+       vfree(diag204_buf);
+}
+
+static int sthyi(u64 vaddr)
+{
+       register u64 code asm("0") = 0;
+       register u64 addr asm("2") = vaddr;
+       int cc;
+
+       asm volatile(
+               ".insn   rre,0xB2560000,%[code],%[addr]\n"
+               "ipm     %[cc]\n"
+               "srl     %[cc],28\n"
+               : [cc] "=d" (cc)
+               : [code] "d" (code), [addr] "a" (addr)
+               : "memory", "cc");
+       return cc;
+}
+
+int handle_sthyi(struct kvm_vcpu *vcpu)
+{
+       int reg1, reg2, r = 0;
+       u64 code, addr, cc = 0;
+       struct sthyi_sctns *sctns = NULL;
+
+       /*
+        * STHYI requires extensive locking in the higher hypervisors
+        * and is very expensive in computation and memory. Therefore we
+        * ratelimit the executions per VM.
+        */
+       if (!__ratelimit(&vcpu->kvm->arch.sthyi_limit)) {
+               kvm_s390_retry_instr(vcpu);
+               return 0;
+       }
+
+       kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+       code = vcpu->run->s.regs.gprs[reg1];
+       addr = vcpu->run->s.regs.gprs[reg2];
+
+       vcpu->stat.instruction_sthyi++;
+       VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr);
+       trace_kvm_s390_handle_sthyi(vcpu, code, addr);
+
+       if (reg1 == reg2 || reg1 & 1 || reg2 & 1 || addr & ~PAGE_MASK)
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+       if (code & 0xffff) {
+               cc = 3;
+               goto out;
+       }
+
+       /*
+        * If the page has not yet been faulted in, we want to do that
+        * now and not after all the expensive calculations.
+        */
+       r = write_guest(vcpu, addr, reg2, &cc, 1);
+       if (r)
+               return kvm_s390_inject_prog_cond(vcpu, r);
+
+       sctns = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!sctns)
+               return -ENOMEM;
+
+       /*
+        * If we are a guest, we don't want to emulate an emulated
+        * instruction. We ask the hypervisor to provide the data.
+        */
+       if (test_facility(74)) {
+               cc = sthyi((u64)sctns);
+               goto out;
+       }
+
+       fill_hdr(sctns);
+       fill_stsi(sctns);
+       fill_diag(sctns);
+
+out:
+       if (!cc) {
+               r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE);
+               if (r) {
+                       free_page((unsigned long)sctns);
+                       return kvm_s390_inject_prog_cond(vcpu, r);
+               }
+       }
+
+       free_page((unsigned long)sctns);
+       vcpu->run->s.regs.gprs[reg2 + 1] = cc ? 4 : 0;
+       kvm_s390_set_psw_cc(vcpu, cc);
+       return r;
+}
index 916834d..4fc9d4e 100644 (file)
@@ -41,7 +41,7 @@ TRACE_EVENT(kvm_s390_skey_related_inst,
            TP_fast_assign(
                    VCPU_ASSIGN_COMMON
                    ),
-           VCPU_TP_PRINTK("%s", "first instruction related to skeys on vcpu")
+           VCPU_TP_PRINTK("%s", "storage key related instruction")
        );
 
 TRACE_EVENT(kvm_s390_major_guest_pfault,
@@ -185,8 +185,10 @@ TRACE_EVENT(kvm_s390_intercept_prog,
                    __entry->code = code;
                    ),
 
-           VCPU_TP_PRINTK("intercepted program interruption %04x",
-                          __entry->code)
+           VCPU_TP_PRINTK("intercepted program interruption %04x (%s)",
+                          __entry->code,
+                          __print_symbolic(__entry->code,
+                                           icpt_prog_codes))
        );
 
 /*
@@ -412,6 +414,47 @@ TRACE_EVENT(kvm_s390_handle_stsi,
                           __entry->addr)
        );
 
+TRACE_EVENT(kvm_s390_handle_operexc,
+           TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb),
+           TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(__u64, instruction)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->instruction = ((__u64)ipa << 48) |
+                   ((__u64)ipb << 16);
+                   ),
+
+           VCPU_TP_PRINTK("operation exception on instruction %016llx (%s)",
+                          __entry->instruction,
+                          __print_symbolic(icpt_insn_decoder(__entry->instruction),
+                                           icpt_insn_codes))
+       );
+
+TRACE_EVENT(kvm_s390_handle_sthyi,
+           TP_PROTO(VCPU_PROTO_COMMON, u64 code, u64 addr),
+           TP_ARGS(VCPU_ARGS_COMMON, code, addr),
+
+           TP_STRUCT__entry(
+                   VCPU_FIELD_COMMON
+                   __field(u64, code)
+                   __field(u64, addr)
+                   ),
+
+           TP_fast_assign(
+                   VCPU_ASSIGN_COMMON
+                   __entry->code = code;
+                   __entry->addr = addr;
+                   ),
+
+           VCPU_TP_PRINTK("STHYI fc: %llu addr: %016llx",
+                          __entry->code, __entry->addr)
+       );
+
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
new file mode 100644 (file)
index 0000000..c106488
--- /dev/null
@@ -0,0 +1,1091 @@
+/*
+ * kvm nested virtualization support for s390x
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com>
+ */
+#include <linux/vmalloc.h>
+#include <linux/kvm_host.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/bitmap.h>
+#include <asm/gmap.h>
+#include <asm/mmu_context.h>
+#include <asm/sclp.h>
+#include <asm/nmi.h>
+#include <asm/dis.h>
+#include "kvm-s390.h"
+#include "gaccess.h"
+
+/*
+ * Bookkeeping for one shadow SIE control block (scb_s) together with the
+ * data it references. The offset comments document the intended layout; the
+ * reserved area pads the structure so that crycb is placed at 0x0700.
+ */
+struct vsie_page {
+       struct kvm_s390_sie_block scb_s;        /* 0x0000 */
+       /* the pinned original scb */
+       struct kvm_s390_sie_block *scb_o;       /* 0x0200 */
+       /* the shadow gmap in use by the vsie_page */
+       struct gmap *gmap;                      /* 0x0208 */
+       /* address of the last reported fault to guest2 */
+       unsigned long fault_addr;               /* 0x0210 */
+       __u8 reserved[0x0700 - 0x0218];         /* 0x0218 */
+       /* shadow crycb, filled by shadow_crycb() */
+       struct kvm_s390_crypto_cb crycb;        /* 0x0700 */
+       /* shadow facility list, filled by handle_stfle() */
+       __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */
+} __packed;
+
+/*
+ * Prepare a validity interception with the given reason code in the scb.
+ *
+ * Returns: always 1, so callers can directly signal that control has to be
+ *          given to guest 2
+ */
+static int set_validity_icpt(struct kvm_s390_sie_block *scb,
+                            __u16 reason_code)
+{
+       const __u32 reason = reason_code;
+
+       scb->ipa = 0x1000;
+       scb->ipb = reason << 16;
+       scb->icptcode = ICPT_VALIDITY;
+       return 1;
+}
+
+/*
+ * Mark the prefix as unmapped by setting PROG_REQUEST in prog20 of the
+ * shadow scb, this will block the VSIE.
+ */
+static void prefix_unmapped(struct vsie_page *vsie_page)
+{
+       atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+/*
+ * Mark the prefix as unmapped and wait until the VSIE has been left.
+ *
+ * If the shadow scb is flagged as being in SIE (PROG_IN_SIE), a stop
+ * intervention is requested and we busy wait until the flag disappears.
+ */
+static void prefix_unmapped_sync(struct vsie_page *vsie_page)
+{
+       prefix_unmapped(vsie_page);
+       if (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+               atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags);
+       while (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+               cpu_relax();
+}
+
+/*
+ * Mark the prefix as mapped by clearing PROG_REQUEST in prog20 of the
+ * shadow scb, this will allow the VSIE to run.
+ */
+static void prefix_mapped(struct vsie_page *vsie_page)
+{
+       atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+/* check whether the prefix is marked as mapped (PROG_REQUEST not set) */
+static int prefix_is_mapped(struct vsie_page *vsie_page)
+{
+       const int prog20 = atomic_read(&vsie_page->scb_s.prog20);
+
+       return (prog20 & PROG_REQUEST) == 0;
+}
+
+/* refresh the intervention request bits of the shadow scb from the original */
+static void update_intervention_requests(struct vsie_page *vsie_page)
+{
+       const int mask = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT;
+       int requested;
+
+       requested = atomic_read(&vsie_page->scb_o->cpuflags) & mask;
+       /* drop the stale bits first, then take over the requested ones */
+       atomic_andnot(mask, &vsie_page->scb_s.cpuflags);
+       atomic_or(requested, &vsie_page->scb_s.cpuflags);
+}
+
+/*
+ * Build the cpuflags of the shadow scb from the ones provided by guest 2,
+ * rejecting unsupported combinations and dropping unavailable features.
+ *
+ * Returns: - 0 if the cpuflags have been shadowed
+ *          - > 0 if control has to be given to guest 2
+ */
+static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       const int cpuflags = atomic_read(&vsie_page->scb_o->cpuflags);
+       struct kvm *kvm = vcpu->kvm;
+       int newflags = CPUSTAT_ZARCH;
+
+       /* we don't allow ESA/390 guests, neither RRF nor MCDS */
+       if (!(cpuflags & CPUSTAT_ZARCH) ||
+           (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS)))
+               return set_validity_icpt(scb_s, 0x0001U);
+       if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR))
+               return set_validity_icpt(scb_s, 0x0007U);
+
+       /* intervention requests will be set later */
+       if ((cpuflags & CPUSTAT_GED) && test_kvm_facility(kvm, 8))
+               newflags |= CPUSTAT_GED;
+       if ((cpuflags & CPUSTAT_GED2) && test_kvm_facility(kvm, 78)) {
+               /* GED and GED2 must not be requested together */
+               if (cpuflags & CPUSTAT_GED)
+                       return set_validity_icpt(scb_s, 0x0001U);
+               newflags |= CPUSTAT_GED2;
+       }
+       if (test_kvm_cpu_feat(kvm, KVM_S390_VM_CPU_FEAT_GPERE))
+               newflags |= cpuflags & CPUSTAT_P;
+       if (test_kvm_cpu_feat(kvm, KVM_S390_VM_CPU_FEAT_GSLS))
+               newflags |= cpuflags & CPUSTAT_SM;
+       if (test_kvm_cpu_feat(kvm, KVM_S390_VM_CPU_FEAT_IBS))
+               newflags |= cpuflags & CPUSTAT_IBS;
+
+       atomic_set(&scb_s->cpuflags, newflags);
+       return 0;
+}
+
+/*
+ * Create a shadow copy of the crycb block and setup key wrapping, if
+ * requested for guest 3 and enabled for guest 2.
+ *
+ * We only accept format-1 (no AP in g2), but convert it into format-2
+ * There is nothing to do for format-0.
+ *
+ * Returns: - 0 if shadowed or nothing to do
+ *          - > 0 if control has to be given to guest 2
+ */
+static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U;
+       unsigned long *b1, *b2;
+       u8 ecb3_flags;
+
+       /* start without a crycb, it is only set up once all checks passed */
+       scb_s->crycbd = 0;
+       /* format-1 has to be indicated by guest 2 for itself and for guest 3 */
+       if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1))
+               return 0;
+       /* format-1 is supported with message-security-assist extension 3 */
+       if (!test_kvm_facility(vcpu->kvm, 76))
+               return 0;
+       /* we may only allow it if enabled for guest 2 */
+       ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
+                    (ECB3_AES | ECB3_DEA);
+       if (!ecb3_flags)
+               return 0;
+
+       /* the crycb must not cross a page boundary and must not be at 0 */
+       if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK))
+               return set_validity_icpt(scb_s, 0x003CU);
+       else if (!crycb_addr)
+               return set_validity_icpt(scb_s, 0x0039U);
+
+       /* copy only the wrapping keys */
+       if (read_guest_real(vcpu, crycb_addr + 72, &vsie_page->crycb, 56))
+               return set_validity_icpt(scb_s, 0x0035U);
+
+       scb_s->ecb3 |= ecb3_flags;
+       scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 |
+                       CRYCB_FORMAT2;
+
+       /* xor both blocks in one run */
+       b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask;
+       b2 = (unsigned long *)
+                           vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask;
+       /* as 56%8 == 0, bitmap_xor won't overwrite any data */
+       bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56);
+       return 0;
+}
+
+/* shadow the ibc, rounding it into the supported range to avoid validity icpt */
+static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       const __u64 lowest = (sclp.ibc >> 16) & 0x0fffU;
+
+       scb_s->ibc = 0;
+       /* nothing to do unless installed in g2 and requested for g3 */
+       if (!vcpu->kvm->arch.model.ibc || !(scb_o->ibc & 0x0fffU))
+               return;
+
+       scb_s->ibc = scb_o->ibc & 0x0fffU;
+       /* never go below the minimum ibc level of the machine */
+       if (scb_s->ibc < lowest)
+               scb_s->ibc = lowest;
+       /* never exceed the maximum ibc level set for the guest */
+       if (scb_s->ibc > vcpu->kvm->arch.model.ibc)
+               scb_s->ibc = vcpu->kvm->arch.model.ibc;
+}
+
+/*
+ * Unshadow the scb, copying parameters back to the real scb.
+ *
+ * Copies the interception information, the timer fields and the guest state
+ * from the shadow scb (scb_s) to the original scb (scb_o). Depending on the
+ * interception code, a part of the area starting at offset 0xc0 is copied
+ * as well.
+ */
+static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+
+       /* interception */
+       scb_o->icptcode = scb_s->icptcode;
+       scb_o->icptstatus = scb_s->icptstatus;
+       scb_o->ipa = scb_s->ipa;
+       scb_o->ipb = scb_s->ipb;
+       scb_o->gbea = scb_s->gbea;
+
+       /* timer */
+       scb_o->cputm = scb_s->cputm;
+       scb_o->ckc = scb_s->ckc;
+       scb_o->todpr = scb_s->todpr;
+
+       /* guest state */
+       scb_o->gpsw = scb_s->gpsw;
+       scb_o->gg14 = scb_s->gg14;
+       scb_o->gg15 = scb_s->gg15;
+       memcpy(scb_o->gcr, scb_s->gcr, 128);
+       scb_o->pp = scb_s->pp;
+
+       /* interrupt intercept */
+       switch (scb_s->icptcode) {
+       case ICPT_PROGI:
+       case ICPT_INSTPROGI:
+       case ICPT_EXTINT:
+               /* copy the scb bytes 0xc0 - 0xef */
+               memcpy((void *)((u64)scb_o + 0xc0),
+                      (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
+               break;
+       case ICPT_PARTEXEC:
+               /* MVPG only */
+               memcpy((void *)((u64)scb_o + 0xc0),
+                      (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
+               break;
+       }
+
+       /* 0xffff is not propagated to the original scb */
+       if (scb_s->ihcpu != 0xffffU)
+               scb_o->ihcpu = scb_s->ihcpu;
+}
+
+/*
+ * Setup the shadow scb by copying and checking the relevant parts of the g2
+ * provided scb.
+ *
+ * If shadowing fails, the shadow scb is unshadowed right away, so the
+ * prepared intercept is copied back into the g2 provided scb.
+ *
+ * Returns: - 0 if the scb has been shadowed
+ *          - > 0 if control has to be given to guest 2
+ */
+static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       /* remember the old tx setting before ecb is cleared below */
+       bool had_tx = scb_s->ecb & 0x10U;
+       unsigned long new_mso = 0;
+       int rc;
+
+       /* make sure we don't have any leftovers when reusing the scb */
+       scb_s->icptcode = 0;
+       scb_s->eca = 0;
+       scb_s->ecb = 0;
+       scb_s->ecb2 = 0;
+       scb_s->ecb3 = 0;
+       scb_s->ecd = 0;
+       scb_s->fac = 0;
+
+       rc = prepare_cpuflags(vcpu, vsie_page);
+       if (rc)
+               goto out;
+
+       /* timer */
+       scb_s->cputm = scb_o->cputm;
+       scb_s->ckc = scb_o->ckc;
+       scb_s->todpr = scb_o->todpr;
+       scb_s->epoch = scb_o->epoch;
+
+       /* guest state */
+       scb_s->gpsw = scb_o->gpsw;
+       scb_s->gg14 = scb_o->gg14;
+       scb_s->gg15 = scb_o->gg15;
+       memcpy(scb_s->gcr, scb_o->gcr, 128);
+       scb_s->pp = scb_o->pp;
+
+       /* interception / execution handling */
+       scb_s->gbea = scb_o->gbea;
+       scb_s->lctl = scb_o->lctl;
+       scb_s->svcc = scb_o->svcc;
+       scb_s->ictl = scb_o->ictl;
+       /*
+        * SKEY handling functions can't deal with false setting of PTE invalid
+        * bits. Therefore we cannot provide interpretation and would later
+        * have to provide own emulation handlers.
+        */
+       scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
+       scb_s->icpua = scb_o->icpua;
+
+       /* the mso stays 0 if CPUSTAT_SM made it into the shadow cpuflags */
+       if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM))
+               new_mso = scb_o->mso & 0xfffffffffff00000UL;
+       /* if the hva of the prefix changes, we have to remap the prefix */
+       if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix)
+               prefix_unmapped(vsie_page);
+       /* SIE will do mso/msl validity and exception checks for us */
+       scb_s->msl = scb_o->msl & 0xfffffffffff00000UL;
+       scb_s->mso = new_mso;
+       scb_s->prefix = scb_o->prefix;
+
+       /* We have to definitely flush the tlb if this scb never ran */
+       if (scb_s->ihcpu != 0xffffU)
+               scb_s->ihcpu = scb_o->ihcpu;
+
+       /* MVPG and Protection Exception Interpretation are always available */
+       scb_s->eca |= scb_o->eca & 0x01002000U;
+       /* Host-protection-interruption introduced with ESOP */
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
+               scb_s->ecb |= scb_o->ecb & 0x02U;
+       /* transactional execution */
+       if (test_kvm_facility(vcpu->kvm, 73)) {
+               /* remap the prefix if tx is toggled on */
+               if ((scb_o->ecb & 0x10U) && !had_tx)
+                       prefix_unmapped(vsie_page);
+               scb_s->ecb |= scb_o->ecb & 0x10U;
+       }
+       /* SIMD */
+       if (test_kvm_facility(vcpu->kvm, 129)) {
+               scb_s->eca |= scb_o->eca & 0x00020000U;
+               scb_s->ecd |= scb_o->ecd & 0x20000000U;
+       }
+       /* Run-time-Instrumentation */
+       if (test_kvm_facility(vcpu->kvm, 64))
+               scb_s->ecb3 |= scb_o->ecb3 & 0x01U;
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF))
+               scb_s->eca |= scb_o->eca & 0x00000001U;
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB))
+               scb_s->eca |= scb_o->eca & 0x40000000U;
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
+               scb_s->eca |= scb_o->eca & 0x80000000U;
+
+       prepare_ibc(vcpu, vsie_page);
+       rc = shadow_crycb(vcpu, vsie_page);
+out:
+       /* on failure, hand the prepared intercept back to the original scb */
+       if (rc)
+               unshadow_scb(vcpu, vsie_page);
+       return rc;
+}
+
+/*
+ * Notifier for the range [start, end] of a gmap.
+ *
+ * Only shadow gmaps and ranges starting below 2 GB are of interest. For every
+ * vsie page using the given gmap whose two prefix pages overlap the range,
+ * the prefix is marked as unmapped and we wait until the VSIE has been left.
+ */
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+                                unsigned long end)
+{
+       struct kvm *kvm = gmap->private;
+       struct vsie_page *cur;
+       unsigned long prefix;
+       struct page *page;
+       int i;
+
+       if (!gmap_is_shadow(gmap))
+               return;
+       if (start >= 1UL << 31)
+               /* We are only interested in prefix pages */
+               return;
+
+       /*
+        * Only new shadow blocks are added to the list during runtime,
+        * therefore we can safely reference them all the time.
+        */
+       for (i = 0; i < kvm->arch.vsie.page_count; i++) {
+               page = READ_ONCE(kvm->arch.vsie.pages[i]);
+               if (!page)
+                       continue;
+               cur = page_to_virt(page);
+               /* skip vsie pages that are using a different gmap */
+               if (READ_ONCE(cur->gmap) != gmap)
+                       continue;
+               prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
+               /* with mso/msl, the prefix lies at an offset */
+               prefix += cur->scb_s.mso;
+               /* does [start, end] overlap the two prefix pages? */
+               if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1)
+                       prefix_unmapped_sync(cur);
+       }
+}
+
+/*
+ * Map the first prefix page and if tx is enabled also the second prefix page.
+ *
+ * The prefix will be protected, a gmap notifier will inform about unmaps.
+ * The shadow scb must not be executed until the prefix is remapped, this is
+ * guaranteed by properly handling PROG_REQUEST.
+ *
+ * Returns: - 0 if successfully mapped or already mapped
+ *          - > 0 if control has to be given to guest 2
+ *          - -EAGAIN if the caller can retry immediately
+ *          - -ENOMEM if out of memory
+ */
+static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
+       int rc;
+
+       if (prefix_is_mapped(vsie_page))
+               return 0;
+
+       /* mark it as mapped so we can catch any concurrent unmappers */
+       prefix_mapped(vsie_page);
+
+       /* with mso/msl, the prefix lies at offset *mso* */
+       prefix += scb_s->mso;
+
+       rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+       /* the second prefix page is only needed with tx (ecb & 0x10) */
+       if (!rc && (scb_s->ecb & 0x10U))
+               rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+                                          prefix + PAGE_SIZE);
+       /*
+        * We don't have to mprotect, we will be called for all unshadows.
+        * SIE will detect if protection applies and trigger a validity.
+        */
+       if (rc)
+               prefix_unmapped(vsie_page);
+       /* faults that cannot be resolved are reported as validity icpt */
+       if (rc > 0 || rc == -EFAULT)
+               rc = set_validity_icpt(scb_s, 0x0037U);
+       return rc;
+}
+
+/*
+ * Pin the guest page containing gpa and store the matching host address,
+ * including the offset of gpa within the page, in hpa.
+ * The page is always pinned writable.
+ *
+ * Returns: - 0 on success
+ *          - -EINVAL if the gpa is not valid guest storage
+ *          - -ENOMEM if the page could not be pinned
+ *          - any negative error code reported by get_user_pages_fast()
+ */
+static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
+{
+       struct page *pinned;
+       hva_t hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+       int nr;
+
+       if (kvm_is_error_hva(hva))
+               return -EINVAL;
+       nr = get_user_pages_fast(hva, 1, 1, &pinned);
+       if (nr < 0)
+               return nr;
+       if (nr != 1)
+               return -ENOMEM;
+       *hpa = (hpa_t) page_to_virt(pinned) + (gpa & ~PAGE_MASK);
+       return 0;
+}
+
+/*
+ * Release a page that was pinned via pin_guest_page(). The page is marked
+ * as dirty, both for the host and in the dirty log of the guest.
+ */
+static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
+{
+       struct page *pinned = virt_to_page(hpa);
+
+       set_page_dirty_lock(pinned);
+       put_page(pinned);
+       /* mark the page always as dirty for migration */
+       mark_page_dirty(kvm, gpa_to_gfn(gpa));
+}
+
+/*
+ * Unpin all blocks previously pinned by pin_blocks(), marking them dirty.
+ *
+ * A block counts as pinned if the shadow scb holds a non-zero host address
+ * for it. The guest address is recomputed from the original scb and the
+ * shadow scb field is reset to 0 afterwards.
+ */
+static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       struct kvm *kvm = vcpu->kvm;
+       hpa_t host;
+       gpa_t guest;
+
+       /* sca */
+       host = (u64) scb_s->scaoh << 32 | scb_s->scaol;
+       if (host) {
+               guest = scb_o->scaol & ~0xfUL;
+               if (test_kvm_cpu_feat(kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+                       guest |= (u64) scb_o->scaoh << 32;
+               unpin_guest_page(kvm, guest, host);
+               scb_s->scaol = 0;
+               scb_s->scaoh = 0;
+       }
+
+       /* itdba */
+       host = scb_s->itdba;
+       if (host) {
+               guest = scb_o->itdba & ~0xffUL;
+               unpin_guest_page(kvm, guest, host);
+               scb_s->itdba = 0;
+       }
+
+       /* gvrd */
+       host = scb_s->gvrd;
+       if (host) {
+               guest = scb_o->gvrd & ~0x1ffUL;
+               unpin_guest_page(kvm, guest, host);
+               scb_s->gvrd = 0;
+       }
+
+       /* riccbd */
+       host = scb_s->riccbd;
+       if (host) {
+               guest = scb_o->riccbd & ~0x3fUL;
+               unpin_guest_page(kvm, guest, host);
+               scb_s->riccbd = 0;
+       }
+}
+
+/*
+ * Instead of shadowing some blocks, we can simply forward them because the
+ * addresses in the scb are 64 bit long.
+ *
+ * This works as long as the data lies in one page. If blocks ever exceed one
+ * page, we have to fall back to shadowing.
+ *
+ * As we reuse the sca, the vcpu pointers contained in it are invalid. We must
+ * therefore not enable any facilities that access these pointers (e.g. SIGPIF).
+ *
+ * Every pinned block is stored at its own field of the shadow scb (scaoh/scaol,
+ * itdba, gvrd, riccbd), as unpin_blocks() relies on these fields to find the
+ * pinned pages again. On any failure, all blocks pinned so far are unpinned.
+ *
+ * Returns: - 0 if all blocks were pinned.
+ *          - > 0 if control has to be given to guest 2
+ *          - -ENOMEM if out of memory
+ */
+static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       hpa_t hpa;
+       gpa_t gpa;
+       int rc = 0;
+
+       gpa = scb_o->scaol & ~0xfUL;
+       if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+               gpa |= (u64) scb_o->scaoh << 32;
+       if (gpa) {
+               if (!(gpa & ~0x1fffUL))
+                       rc = set_validity_icpt(scb_s, 0x0038U);
+               else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu))
+                       rc = set_validity_icpt(scb_s, 0x0011U);
+               else if ((gpa & PAGE_MASK) !=
+                        ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK))
+                       rc = set_validity_icpt(scb_s, 0x003bU);
+               if (!rc) {
+                       rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+                       if (rc == -EINVAL)
+                               rc = set_validity_icpt(scb_s, 0x0034U);
+               }
+               if (rc)
+                       goto unpin;
+               scb_s->scaoh = (u32)((u64)hpa >> 32);
+               scb_s->scaol = (u32)(u64)hpa;
+       }
+
+       gpa = scb_o->itdba & ~0xffUL;
+       if (gpa && (scb_s->ecb & 0x10U)) {
+               /* 64 bit mask: addresses above 4 GB must not look like 0 */
+               if (!(gpa & ~0x1fffUL)) {
+                       rc = set_validity_icpt(scb_s, 0x0080U);
+                       goto unpin;
+               }
+               /* 256 bytes cannot cross page boundaries */
+               rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+               if (rc == -EINVAL)
+                       rc = set_validity_icpt(scb_s, 0x0080U);
+               if (rc)
+                       goto unpin;
+               scb_s->itdba = hpa;
+       }
+
+       gpa = scb_o->gvrd & ~0x1ffUL;
+       if (gpa && (scb_s->eca & 0x00020000U) &&
+           !(scb_s->ecd & 0x20000000U)) {
+               if (!(gpa & ~0x1fffUL)) {
+                       rc = set_validity_icpt(scb_s, 0x1310U);
+                       goto unpin;
+               }
+               /*
+                * 512 bytes vector registers cannot cross page boundaries
+                * if this block gets bigger, we have to shadow it.
+                */
+               rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+               if (rc == -EINVAL)
+                       rc = set_validity_icpt(scb_s, 0x1310U);
+               if (rc)
+                       goto unpin;
+               scb_s->gvrd = hpa;
+       }
+
+       gpa = scb_o->riccbd & ~0x3fUL;
+       if (gpa && (scb_s->ecb3 & 0x01U)) {
+               if (!(gpa & ~0x1fffUL)) {
+                       rc = set_validity_icpt(scb_s, 0x0043U);
+                       goto unpin;
+               }
+               /* 64 bytes cannot cross page boundaries */
+               rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+               if (rc == -EINVAL)
+                       rc = set_validity_icpt(scb_s, 0x0043U);
+               /* Validity 0x0044 will be checked by SIE */
+               if (rc)
+                       goto unpin;
+               /* must go to riccbd, gvrd already holds the vector block */
+               scb_s->riccbd = hpa;
+       }
+       return 0;
+unpin:
+       unpin_blocks(vcpu, vsie_page);
+       return rc;
+}
+
+/*
+ * Unpin the scb provided by guest 2 (if one is pinned), marking it as dirty,
+ * and forget about it in the vsie page.
+ */
+static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+                     gpa_t gpa)
+{
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+
+       if (scb_o)
+               unpin_guest_page(vcpu->kvm, gpa, (hpa_t) scb_o);
+       vsie_page->scb_o = NULL;
+}
+
+/*
+ * Pin the scb located at the guest 2 address gpa and remember its host
+ * address in vsie_page->scb_o.
+ *
+ * If gpa is not valid guest storage, an addressing exception is injected
+ * into guest 2.
+ *
+ * Returns: - 0 if the scb was pinned.
+ *          - > 0 if control has to be given to guest 2
+ *          - < 0 if pinning or the injection failed (e.g. -ENOMEM)
+ */
+static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+                  gpa_t gpa)
+{
+       hpa_t hpa;
+       int rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+
+       if (!rc) {
+               vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
+               return 0;
+       }
+       if (rc != -EINVAL)
+               return rc;
+       /* not valid guest storage, let guest 2 see an addressing exception */
+       rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       return rc ? rc : 1;
+}
+
+/*
+ * Inject a program interrupt with the given code for the fault at vaddr
+ * into guest 2.
+ *
+ * Returns: - > 0 if control has to be given to guest 2
+ *            < 0 if an error occurred during injection.
+ */
+static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
+                       bool write_flag)
+{
+       struct kvm_s390_pgm_info pgm = {
+               .code = code,
+               .exc_access_id = 0, /* always primary */
+               .op_access_id = 0, /* not MVPG */
+       };
+       int rc;
+
+       /* 0-51: virtual address */
+       pgm.trans_exc_code = vaddr & 0xfffffffffffff000UL;
+       /* 52-53: store / fetch */
+       pgm.trans_exc_code |= (write_flag ? 1U : 2U) << 10;
+       /* 62-63: asce id stays 0 (always primary) */
+       if (code == PGM_PROTECTION)
+               pgm.trans_exc_code |= 0x4UL;
+
+       rc = kvm_s390_inject_prog_irq(vcpu, &pgm);
+       if (rc)
+               return rc;
+       return 1;
+}
+
+/*
+ * Handle a fault during vsie execution on a gmap shadow.
+ *
+ * The fault information is taken from current->thread (gmap_int_code,
+ * gmap_addr, gmap_write_flag). If a fault had to be injected into guest 2,
+ * its address is remembered in vsie_page->fault_addr.
+ *
+ * Returns: - 0 if the fault was resolved
+ *          - > 0 if control has to be given to guest 2
+ *          - < 0 if an error occurred
+ */
+static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       int rc;
+
+       if (current->thread.gmap_int_code == PGM_PROTECTION)
+               /* we can directly forward all protection exceptions */
+               return inject_fault(vcpu, PGM_PROTECTION,
+                                   current->thread.gmap_addr, 1);
+
+       rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+                                  current->thread.gmap_addr);
+       if (rc > 0) {
+               /* a positive rc is used as the program interruption code */
+               rc = inject_fault(vcpu, rc,
+                                 current->thread.gmap_addr,
+                                 current->thread.gmap_write_flag);
+               if (rc >= 0)
+                       vsie_page->fault_addr = current->thread.gmap_addr;
+       }
+       return rc;
+}
+
+/*
+ * Try to resolve the fault that was last reported to guest 2 up front, so
+ * we don't need one superfluous SIE entry followed by a direct exit. The
+ * remembered fault address is consumed by this.
+ *
+ * Errors are ignored on purpose. The next SIE fault will do proper fault
+ * handling.
+ */
+static void handle_last_fault(struct kvm_vcpu *vcpu,
+                             struct vsie_page *vsie_page)
+{
+       const unsigned long addr = vsie_page->fault_addr;
+
+       if (addr)
+               kvm_s390_shadow_fault(vcpu, vsie_page->gmap, addr);
+       vsie_page->fault_addr = 0;
+}
+
+/* forget about the intercept of the shadow scb by resetting its icptcode */
+static inline void clear_vsie_icpt(struct vsie_page *vsie_page)
+{
+       vsie_page->scb_s.icptcode = 0;
+}
+
+/*
+ * Rewind the guest 3 psw by the length of the intercepted instruction and
+ * clear the vsie icpt, so execution can be retried.
+ */
+static void retry_vsie_icpt(struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       int ilen;
+
+       if (scb_s->icptstatus & 1) {
+               /* EXECUTE instruction: take the length from icptstatus */
+               ilen = (scb_s->icptstatus >> 4) & 0x6;
+               if (!ilen)
+                       ilen = 4;
+       } else {
+               ilen = insn_length(scb_s->ipa >> 8);
+       }
+       scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen);
+       clear_vsie_icpt(vsie_page);
+}
+
+/*
+ * Try to shadow + enable the guest 2 provided facility list.
+ * Retry instruction execution if enabled for and provided by guest 2.
+ *
+ * The facility list is copied into vsie_page->fac, which is then referenced
+ * by the shadow scb.
+ *
+ * Returns: - 0 if handled (retry or guest 2 icpt)
+ *          - > 0 if control has to be given to guest 2
+ */
+static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       __u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U;
+
+       if (fac && test_kvm_facility(vcpu->kvm, 7)) {
+               /* rewind first, a validity set below must not be cleared */
+               retry_vsie_icpt(vsie_page);
+               if (read_guest_real(vcpu, fac, &vsie_page->fac,
+                                   sizeof(vsie_page->fac)))
+                       return set_validity_icpt(scb_s, 0x1090U);
+               scb_s->fac = (__u32)(__u64) &vsie_page->fac;
+       }
+       return 0;
+}
+
+/*
+ * Run the vsie on a shadow scb and a shadow gmap, without any further
+ * sanity checks, handling SIE faults.
+ *
+ * The srcu read lock of the vcpu is dropped while running in SIE and
+ * reacquired afterwards.
+ *
+ * Returns: - 0 everything went fine
+ *          - > 0 if control has to be given to guest 2
+ *          - < 0 if an error occurred
+ */
+static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+       int rc;
+
+       handle_last_fault(vcpu, vsie_page);
+
+       if (need_resched())
+               schedule();
+       if (test_cpu_flag(CIF_MCCK_PENDING))
+               s390_handle_mcck();
+
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+       local_irq_disable();
+       guest_enter_irqoff();
+       local_irq_enable();
+
+       rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
+
+       local_irq_disable();
+       guest_exit_irqoff();
+       local_irq_enable();
+       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+       if (rc > 0)
+               rc = 0; /* we could still have an icpt */
+       else if (rc == -EFAULT)
+               return handle_fault(vcpu, vsie_page);
+
+       switch (scb_s->icptcode) {
+       case ICPT_INST:
+               /* ipa 0xb2b0 is handed to the STFLE handler */
+               if (scb_s->ipa == 0xb2b0)
+                       rc = handle_stfle(vcpu, vsie_page);
+               break;
+       case ICPT_STOP:
+               /* stop not requested by g2 - must have been a kick */
+               if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT))
+                       clear_vsie_icpt(vsie_page);
+               break;
+       case ICPT_VALIDITY:
+               /*
+                * Increment the topmost nibble of the ipa unless it is
+                * already 0xf.
+                * NOTE(review): presumably this encodes the nesting level
+                * of the validity icpt - confirm against the architecture.
+                */
+               if ((scb_s->ipa & 0xf000) != 0xf000)
+                       scb_s->ipa += 0x1000;
+               break;
+       }
+       return rc;
+}
+
+/*
+ * Drop the reference to the shadow gmap of the vsie page (if any) and mark
+ * the prefix as unmapped.
+ */
+static void release_gmap_shadow(struct vsie_page *vsie_page)
+{
+       if (vsie_page->gmap)
+               gmap_put(vsie_page->gmap);
+       /* the gmap pointer is read via READ_ONCE() by the gmap notifier */
+       WRITE_ONCE(vsie_page->gmap, NULL);
+       prefix_unmapped(vsie_page);
+}
+
+/*
+ * Make sure the vsie page holds a shadow gmap matching the current ASCE
+ * (control register 1 of guest 2) and edat level. A still valid shadow gmap
+ * is reused, otherwise the old one is released and a new one is acquired.
+ *
+ * Returns: - 0 if a valid shadow gmap is in place
+ *          - the error reported by gmap_shadow() otherwise
+ */
+static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
+                              struct vsie_page *vsie_page)
+{
+       unsigned long asce;
+       union ctlreg0 cr0;
+       struct gmap *gmap;
+       int edat;
+
+       asce = vcpu->arch.sie_block->gcr[1];
+       cr0.val = vcpu->arch.sie_block->gcr[0];
+       /* edat is 0, 1 (cr0.edat + facility 8) or 2 (also facility 78) */
+       edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
+       edat += edat && test_kvm_facility(vcpu->kvm, 78);
+
+       /*
+        * ASCE or EDAT could have changed since last icpt, or the gmap
+        * we're holding has been unshadowed. If the gmap is still valid,
+        * we can safely reuse it.
+        */
+       if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat))
+               return 0;
+
+       /* release the old shadow - if any, and mark the prefix as unmapped */
+       release_gmap_shadow(vsie_page);
+       gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
+       if (IS_ERR(gmap))
+               return PTR_ERR(gmap);
+       /* the gmap notifier looks up the kvm via gmap->private */
+       gmap->private = vcpu->kvm;
+       WRITE_ONCE(vsie_page->gmap, gmap);
+       return 0;
+}
+
+/*
+ * Register the shadow scb at the VCPU, e.g. for kicking out of vsie.
+ *
+ * Besides publishing the shadow scb in vcpu->arch.vsie_block, CPUSTAT_WAIT is
+ * set for the VCPU and the guest 2 epoch is added to the shadow scb epoch.
+ */
+static void register_shadow_scb(struct kvm_vcpu *vcpu,
+                               struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+
+       WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s);
+       /*
+        * External calls have to lead to a kick of the vcpu and
+        * therefore the vsie -> Simulate Wait state.
+        */
+       atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+       /*
+        * We have to adjust the g3 epoch by the g2 epoch. The epoch will
+        * automatically be adjusted on tod clock changes via kvm_sync_clock.
+        */
+       preempt_disable();
+       scb_s->epoch += vcpu->kvm->arch.epoch;
+       preempt_enable();
+}
+
+/*
+ * Unregister a shadow scb from a VCPU: clear the simulated wait state
+ * (CPUSTAT_WAIT) and reset vcpu->arch.vsie_block to NULL.
+ */
+static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
+{
+       atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+       WRITE_ONCE(vcpu->arch.vsie_block, NULL);
+}
+
+/*
+ * Run the vsie on a shadowed scb, managing the gmap shadow, handling
+ * prefix pages and faults.
+ *
+ * The vsie is re-entered until an error occurred, an intercept has to be
+ * handled, a signal is pending for the current task or the VCPU has a
+ * pending interrupt.
+ *
+ * Returns: - 0 if no errors occurred
+ *          - > 0 if control has to be given to guest 2
+ *          - -ENOMEM if out of memory
+ *          - -EFAULT after an addressing exception intercept has been
+ *            prepared in the shadow scb
+ */
+static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       int rc = 0;
+
+       while (1) {
+               rc = acquire_gmap_shadow(vcpu, vsie_page);
+               if (!rc)
+                       rc = map_prefix(vcpu, vsie_page);
+               if (!rc) {
+                       /* run guest 3 on the shadow gmap, then switch back */
+                       gmap_enable(vsie_page->gmap);
+                       update_intervention_requests(vsie_page);
+                       rc = do_vsie_run(vcpu, vsie_page);
+                       gmap_enable(vcpu->arch.gmap);
+               }
+               atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
+
+               /* -EAGAIN only asks for an immediate retry */
+               if (rc == -EAGAIN)
+                       rc = 0;
+               if (rc || scb_s->icptcode || signal_pending(current) ||
+                   kvm_s390_vcpu_has_irq(vcpu, 0))
+                       break;
+       }
+
+       if (rc == -EFAULT) {
+               /*
+                * Addressing exceptions are always presented as intercepts.
+                * As addressing exceptions are suppressing and our guest 3 PSW
+                * points at the responsible instruction, we have to
+                * forward the PSW and set the ilc. If we can't read guest 3
+                * instruction, we can use an arbitrary ilc. Let's always use
+                * ilen = 4 for now, so we can avoid reading in guest 3 virtual
+                * memory. (we could also fake the shadow so the hardware
+                * handles it).
+                */
+               scb_s->icptcode = ICPT_PROGI;
+               scb_s->iprcc = PGM_ADDRESSING;
+               scb_s->pgmilc = 4;
+               scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4);
+       }
+       return rc;
+}
+
+/*
+ * Get or create a vsie page for a scb address.
+ *
+ * A page is owned by the caller once its reference count has been raised
+ * to 2 (1 == cached and unused). page->index holds the scb address under
+ * which the page is stored in the addr_to_page radix tree, or ULONG_MAX
+ * while the page is not part of the tree.
+ *
+ * Returns: - address of a vsie page (cached or new one)
+ *          - NULL if the same scb address is already used by another VCPU
+ *            (or the radix tree insertion failed for another reason)
+ *          - ERR_PTR(-ENOMEM) if out of memory
+ */
+static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
+{
+       struct vsie_page *vsie_page;
+       struct page *page;
+       int nr_vcpus;
+
+       rcu_read_lock();
+       page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9);
+       rcu_read_unlock();
+       if (page) {
+               /*
+                * Between the lookup and grabbing the reference, the page
+                * might have been reused for another scb address and put
+                * again. Only use the cached page if it still belongs to
+                * the requested address.
+                */
+               if (page_ref_inc_return(page) == 2 && page->index == addr)
+                       return page_to_virt(page);
+               page_ref_dec(page);
+       }
+
+       /*
+        * We want at least #online_vcpus shadows, so every VCPU can execute
+        * the VSIE in parallel.
+        */
+       nr_vcpus = atomic_read(&kvm->online_vcpus);
+
+       mutex_lock(&kvm->arch.vsie.mutex);
+       if (kvm->arch.vsie.page_count < nr_vcpus) {
+               page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA);
+               if (!page) {
+                       mutex_unlock(&kvm->arch.vsie.mutex);
+                       return ERR_PTR(-ENOMEM);
+               }
+               page_ref_inc(page);
+               kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page;
+               kvm->arch.vsie.page_count++;
+       } else {
+               /* reuse an existing entry that belongs to nobody */
+               while (true) {
+                       page = kvm->arch.vsie.pages[kvm->arch.vsie.next];
+                       if (page_ref_inc_return(page) == 2)
+                               break;
+                       page_ref_dec(page);
+                       kvm->arch.vsie.next++;
+                       kvm->arch.vsie.next %= nr_vcpus;
+               }
+               /*
+                * Only remove the tree entry if the page really resides in
+                * the tree. Otherwise the entry at that index belongs to a
+                * different page.
+                */
+               if (page->index != ULONG_MAX)
+                       radix_tree_delete(&kvm->arch.vsie.addr_to_page,
+                                         page->index >> 9);
+       }
+       /* mark the page as not being part of the tree until it is inserted */
+       page->index = ULONG_MAX;
+       /* double use of the same address (or the insertion failed) */
+       if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) {
+               page_ref_dec(page);
+               mutex_unlock(&kvm->arch.vsie.mutex);
+               return NULL;
+       }
+       page->index = addr;
+       mutex_unlock(&kvm->arch.vsie.mutex);
+
+       vsie_page = page_to_virt(page);
+       memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
+       release_gmap_shadow(vsie_page);
+       vsie_page->fault_addr = 0;
+       vsie_page->scb_s.ihcpu = 0xffffU;
+       return vsie_page;
+}
+
+/* Drop the reference on a vsie page that was taken via get_vsie_page(). */
+static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page)
+{
+       unsigned long pfn = __pa(vsie_page) >> PAGE_SHIFT;
+
+       page_ref_dec(pfn_to_page(pfn));
+}
+
+/*
+ * Handle a SIE instruction of the guest: validate the request, grab a vsie
+ * page for the scb address, pin and shadow the scb and run the vsie.
+ *
+ * Returns: - -EOPNOTSUPP if the SIEF2 cpu feature is not available
+ *          - the result of the program interrupt injection for a set
+ *            PSW_MASK_PSTATE bit or a scb address that is not 512 byte
+ *            aligned
+ *          - 0 if nothing was done (pending signal/irq, double use of the
+ *            scb) or if the vsie was left without an error
+ *          - < 0 if any step failed
+ */
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
+{
+       struct vsie_page *vsie_page;
+       unsigned long scb_addr;
+       int rc;
+
+       vcpu->stat.instruction_sie++;
+       if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2))
+               return -EOPNOTSUPP;
+       if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+               return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+       BUILD_BUG_ON(sizeof(struct vsie_page) != 4096);
+       scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL);
+
+       /* the scb has to be aligned to 512 bytes */
+       if (unlikely(scb_addr & 0x1ffUL))
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+       if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0))
+               return 0;
+
+       vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
+       if (!vsie_page)
+               /* double use of sie control block - simply do nothing */
+               return 0;
+       if (IS_ERR(vsie_page))
+               return PTR_ERR(vsie_page);
+
+       rc = pin_scb(vcpu, vsie_page, scb_addr);
+       if (rc)
+               goto put_page;
+       rc = shadow_scb(vcpu, vsie_page);
+       if (rc)
+               goto unpin;
+       rc = pin_blocks(vcpu, vsie_page);
+       if (rc)
+               goto unshadow;
+       register_shadow_scb(vcpu, vsie_page);
+       rc = vsie_run(vcpu, vsie_page);
+       unregister_shadow_scb(vcpu);
+       unpin_blocks(vcpu, vsie_page);
+unshadow:
+       unshadow_scb(vcpu, vsie_page);
+unpin:
+       unpin_scb(vcpu, vsie_page, scb_addr);
+put_page:
+       put_vsie_page(vcpu->kvm, vsie_page);
+
+       /* positive values only matter internally, report them as success */
+       if (rc < 0)
+               return rc;
+       return 0;
+}
+
+/*
+ * Init the vsie data structures (the vsie mutex and the addr_to_page radix
+ * tree). To be called when a vm is initialized.
+ */
+void kvm_s390_vsie_init(struct kvm *kvm)
+{
+       mutex_init(&kvm->arch.vsie.mutex);
+       INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL);
+}
+
+/*
+ * Destroy the vsie data structures. To be called when a vm is destroyed.
+ *
+ * Under the vsie mutex, every page recorded in kvm->arch.vsie.pages has its
+ * gmap shadow released, its radix tree entry deleted and is then freed.
+ * Finally the page count is reset to 0.
+ */
+void kvm_s390_vsie_destroy(struct kvm *kvm)
+{
+       struct vsie_page *vsie_page;
+       struct page *page;
+       int i;
+
+       mutex_lock(&kvm->arch.vsie.mutex);
+       for (i = 0; i < kvm->arch.vsie.page_count; i++) {
+               page = kvm->arch.vsie.pages[i];
+               kvm->arch.vsie.pages[i] = NULL;
+               vsie_page = page_to_virt(page);
+               release_gmap_shadow(vsie_page);
+               /* free the radix tree entry */
+               radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
+               __free_page(page);
+       }
+       kvm->arch.vsie.page_count = 0;
+       mutex_unlock(&kvm->arch.vsie.mutex);
+}
+
+/*
+ * Kick a VCPU that has a shadow scb registered: PROG_BLOCK_SIE is set in the
+ * shadow scb and, if PROG_IN_SIE is set there, CPUSTAT_STOP_INT is set in
+ * its cpuflags. Does nothing if no shadow scb is registered.
+ */
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu)
+{
+       struct kvm_s390_sie_block *shadow = READ_ONCE(vcpu->arch.vsie_block);
+
+       if (!shadow)
+               return;
+       /*
+        * The shadow sie block stays valid in the cache, even if the VCPU
+        * drops its reference in the meantime. Kicking it is therefore safe.
+        */
+       atomic_or(PROG_BLOCK_SIE, &shadow->prog20);
+       if (shadow->prog0c & PROG_IN_SIE)
+               atomic_or(CPUSTAT_STOP_INT, &shadow->cpuflags);
+}
index 25783dc..a58bca6 100644 (file)
@@ -418,6 +418,8 @@ static inline int do_exception(struct pt_regs *regs, int access)
                (struct gmap *) S390_lowcore.gmap : NULL;
        if (gmap) {
                current->thread.gmap_addr = address;
+               current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
+               current->thread.gmap_int_code = regs->int_code & 0xffff;
                address = __gmap_translate(gmap, address);
                if (address == -EFAULT) {
                        fault = VM_FAULT_BADMAP;
index 063c721..2ce6bb3 100644 (file)
 #include <asm/gmap.h>
 #include <asm/tlb.h>
 
+#define GMAP_SHADOW_FAKE_TABLE 1ULL
+
 /**
- * gmap_alloc - allocate a guest address space
+ * gmap_alloc - allocate and initialize a guest address space
  * @mm: pointer to the parent mm_struct
  * @limit: maximum address of the gmap address space
  *
  * Returns a guest address space structure.
  */
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
+static struct gmap *gmap_alloc(unsigned long limit)
 {
        struct gmap *gmap;
        struct page *page;
@@ -55,10 +57,14 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
        if (!gmap)
                goto out;
        INIT_LIST_HEAD(&gmap->crst_list);
+       INIT_LIST_HEAD(&gmap->children);
+       INIT_LIST_HEAD(&gmap->pt_list);
        INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
        INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
+       INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
        spin_lock_init(&gmap->guest_table_lock);
-       gmap->mm = mm;
+       spin_lock_init(&gmap->shadow_lock);
+       atomic_set(&gmap->ref_count, 1);
        page = alloc_pages(GFP_KERNEL, 2);
        if (!page)
                goto out_free;
@@ -70,9 +76,6 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
        gmap->asce = atype | _ASCE_TABLE_LENGTH |
                _ASCE_USER_BITS | __pa(table);
        gmap->asce_end = limit;
-       down_write(&mm->mmap_sem);
-       list_add(&gmap->list, &mm->context.gmap_list);
-       up_write(&mm->mmap_sem);
        return gmap;
 
 out_free:
@@ -80,7 +83,28 @@ out_free:
 out:
        return NULL;
 }
-EXPORT_SYMBOL_GPL(gmap_alloc);
+
+/**
+ * gmap_create - create a guest address space
+ * @mm: pointer to the parent mm_struct
+ * @limit: maximum size of the gmap address space
+ *
+ * Allocates a gmap via gmap_alloc(), binds it to @mm and adds it to the
+ * gmap list of @mm under the gmap_lock.
+ *
+ * Returns a guest address space structure, or NULL if the allocation failed.
+ */
+struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
+{
+       struct gmap *new = gmap_alloc(limit);
+
+       if (new) {
+               new->mm = mm;
+               spin_lock(&mm->context.gmap_lock);
+               list_add_rcu(&new->list, &mm->context.gmap_list);
+               spin_unlock(&mm->context.gmap_lock);
+       }
+       return new;
+}
+EXPORT_SYMBOL_GPL(gmap_create);
 
 static void gmap_flush_tlb(struct gmap *gmap)
 {
@@ -114,31 +138,117 @@ static void gmap_radix_tree_free(struct radix_tree_root *root)
        } while (nr > 0);
 }
 
+/*
+ * Free all entries of a radix tree of rmap chains.
+ *
+ * The tree is emptied in batches: up to 16 slot indices are collected per
+ * pass, then each of them is deleted from the tree and every rmap of the
+ * chain that was stored there is freed with kfree(). The next pass starts
+ * at the index that was deleted last. The loop ends once a pass finds no
+ * more slots.
+ */
+static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
+{
+       struct gmap_rmap *rmap, *rnext, *head;
+       struct radix_tree_iter iter;
+       unsigned long indices[16];
+       unsigned long index;
+       void **slot;
+       int i, nr;
+
+       /* A radix tree is freed by deleting all of its entries */
+       index = 0;
+       do {
+               nr = 0;
+               radix_tree_for_each_slot(slot, root, &iter, index) {
+                       indices[nr] = iter.index;
+                       if (++nr == 16)
+                               break;
+               }
+               for (i = 0; i < nr; i++) {
+                       index = indices[i];
+                       head = radix_tree_delete(root, index);
+                       gmap_for_each_rmap_safe(rmap, rnext, head)
+                               kfree(rmap);
+               }
+       } while (nr > 0);
+}
+
 /**
  * gmap_free - free a guest address space
  * @gmap: pointer to the guest address space structure
+ *
+ * No locks required. There are no references to this gmap anymore.
  */
-void gmap_free(struct gmap *gmap)
+static void gmap_free(struct gmap *gmap)
 {
        struct page *page, *next;
 
-       /* Flush tlb. */
-       if (MACHINE_HAS_IDTE)
-               __tlb_flush_idte(gmap->asce);
-       else
-               __tlb_flush_global();
-
+       /* Flush tlb of all gmaps (if not already done for shadows) */
+       if (!(gmap_is_shadow(gmap) && gmap->removed))
+               gmap_flush_tlb(gmap);
        /* Free all segment & region tables. */
        list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
                __free_pages(page, 2);
        gmap_radix_tree_free(&gmap->guest_to_host);
        gmap_radix_tree_free(&gmap->host_to_guest);
-       down_write(&gmap->mm->mmap_sem);
-       list_del(&gmap->list);
-       up_write(&gmap->mm->mmap_sem);
+
+       /* Free additional data for a shadow gmap */
+       if (gmap_is_shadow(gmap)) {
+               /* Free all page tables. */
+               list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
+                       page_table_free_pgste(page);
+               gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
+               /* Release reference to the parent */
+               gmap_put(gmap->parent);
+       }
+
        kfree(gmap);
 }
-EXPORT_SYMBOL_GPL(gmap_free);
+
+/**
+ * gmap_get - increase reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * Each reference taken here has to be dropped again with gmap_put().
+ *
+ * Returns the gmap pointer
+ */
+struct gmap *gmap_get(struct gmap *gmap)
+{
+       atomic_inc(&gmap->ref_count);
+       return gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get);
+
+/**
+ * gmap_put - decrease reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * The guest address space is freed via gmap_free() as soon as the last
+ * reference is dropped.
+ */
+void gmap_put(struct gmap *gmap)
+{
+       /* somebody else still holds a reference */
+       if (atomic_dec_return(&gmap->ref_count) != 0)
+               return;
+       gmap_free(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_put);
+
+/**
+ * gmap_remove - remove a guest address space but do not free it yet
+ * @gmap: pointer to the guest address space structure
+ *
+ * Unlinks all shadow gmaps from the children list and drops one reference
+ * on each of them, removes the gmap from the gmap list of its mm, waits
+ * for an RCU grace period and finally drops one reference on @gmap itself.
+ * The gmap is freed by gmap_put() once the last reference is gone.
+ */
+void gmap_remove(struct gmap *gmap)
+{
+       struct gmap *sg, *next;
+
+       /* Remove all shadow gmaps linked to this gmap */
+       if (!list_empty(&gmap->children)) {
+               spin_lock(&gmap->shadow_lock);
+               list_for_each_entry_safe(sg, next, &gmap->children, list) {
+                       list_del(&sg->list);
+                       gmap_put(sg);
+               }
+               spin_unlock(&gmap->shadow_lock);
+       }
+       /* Remove gmap from the per-mm list */
+       spin_lock(&gmap->mm->context.gmap_lock);
+       list_del_rcu(&gmap->list);
+       spin_unlock(&gmap->mm->context.gmap_lock);
+       synchronize_rcu();
+       /* Put reference */
+       gmap_put(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_remove);
 
 /**
  * gmap_enable - switch primary space to the guest address space
@@ -160,6 +270,17 @@ void gmap_disable(struct gmap *gmap)
 }
 EXPORT_SYMBOL_GPL(gmap_disable);
 
+/**
+ * gmap_get_enabled - get a pointer to the currently enabled gmap
+ *
+ * Returns the gmap field of the lowcore as a pointer to the currently
+ * enabled gmap. 0 (NULL) if none is enabled.
+ */
+struct gmap *gmap_get_enabled(void)
+{
+       return (struct gmap *) S390_lowcore.gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get_enabled);
+
 /*
  * gmap_alloc_table is assumed to be called with mmap_sem held
  */
@@ -175,7 +296,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
                return -ENOMEM;
        new = (unsigned long *) page_to_phys(page);
        crst_table_init(new, init);
-       spin_lock(&gmap->mm->page_table_lock);
+       spin_lock(&gmap->guest_table_lock);
        if (*table & _REGION_ENTRY_INVALID) {
                list_add(&page->lru, &gmap->crst_list);
                *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
@@ -183,7 +304,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
                page->index = gaddr;
                page = NULL;
        }
-       spin_unlock(&gmap->mm->page_table_lock);
+       spin_unlock(&gmap->guest_table_lock);
        if (page)
                __free_pages(page, 2);
        return 0;
@@ -219,6 +340,7 @@ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
        unsigned long *entry;
        int flush = 0;
 
+       BUG_ON(gmap_is_shadow(gmap));
        spin_lock(&gmap->guest_table_lock);
        entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
        if (entry) {
@@ -258,6 +380,7 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
        unsigned long off;
        int flush;
 
+       BUG_ON(gmap_is_shadow(gmap));
        if ((to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || to + len < to)
@@ -289,6 +412,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
        unsigned long off;
        int flush;
 
+       BUG_ON(gmap_is_shadow(gmap));
        if ((from | to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || from + len < from || to + len < to ||
@@ -326,6 +450,8 @@ EXPORT_SYMBOL_GPL(gmap_map_segment);
  * This function does not establish potentially missing page table entries.
  * The mmap_sem of the mm that belongs to the address space must be held
  * when this function gets called.
+ *
+ * Note: Can also be called for shadow gmaps.
  */
 unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
 {
@@ -333,6 +459,7 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
 
        vmaddr = (unsigned long)
                radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
+       /* Note: guest_to_host is empty for a shadow gmap */
        return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
 }
 EXPORT_SYMBOL_GPL(__gmap_translate);
@@ -369,11 +496,13 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table,
        struct gmap *gmap;
        int flush;
 
-       list_for_each_entry(gmap, &mm->context.gmap_list, list) {
+       rcu_read_lock();
+       list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
                flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
                if (flush)
                        gmap_flush_tlb(gmap);
        }
+       rcu_read_unlock();
 }
 
 /**
@@ -397,6 +526,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
        pmd_t *pmd;
        int rc;
 
+       BUG_ON(gmap_is_shadow(gmap));
        /* Create higher level tables in the gmap page table */
        table = gmap->table;
        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
@@ -552,116 +682,1412 @@ static LIST_HEAD(gmap_notifier_list);
 static DEFINE_SPINLOCK(gmap_notifier_lock);
 
 /**
- * gmap_register_ipte_notifier - register a pte invalidation callback
+ * gmap_register_pte_notifier - register a pte invalidation callback
  * @nb: pointer to the gmap notifier block
  */
-void gmap_register_ipte_notifier(struct gmap_notifier *nb)
+void gmap_register_pte_notifier(struct gmap_notifier *nb)
 {
        spin_lock(&gmap_notifier_lock);
-       list_add(&nb->list, &gmap_notifier_list);
+       list_add_rcu(&nb->list, &gmap_notifier_list);
        spin_unlock(&gmap_notifier_lock);
 }
-EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
+EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
 
 /**
- * gmap_unregister_ipte_notifier - remove a pte invalidation callback
+ * gmap_unregister_pte_notifier - remove a pte invalidation callback
  * @nb: pointer to the gmap notifier block
  */
-void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
+void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
 {
        spin_lock(&gmap_notifier_lock);
-       list_del_init(&nb->list);
+       list_del_rcu(&nb->list);
        spin_unlock(&gmap_notifier_lock);
+       synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
+
+/**
+ * gmap_call_notifier - call all registered invalidation callbacks
+ * @gmap: pointer to guest mapping meta data structure
+ * @start: start virtual address in the guest address space
+ * @end: end virtual address in the guest address space
+ *
+ * Invokes the notifier_call callback of every entry in gmap_notifier_list.
+ *
+ * NOTE(review): the list is modified with list_add_rcu()/list_del_rcu() but
+ * walked here with the plain list_for_each_entry(). Presumably all callers
+ * run inside an RCU read-side critical section - TODO confirm.
+ */
+static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
+                              unsigned long end)
+{
+       struct gmap_notifier *nb;
+
+       list_for_each_entry(nb, &gmap_notifier_list, list)
+               nb->notifier_call(gmap, start, end);
+}
+
+/**
+ * gmap_table_walk - walk the gmap page tables
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @level: page table level to stop at
+ *
+ * Returns a table entry pointer for the given guest address and @level
+ * @level=0 : returns a pointer to a page table entry (or NULL)
+ * @level=1 : returns a pointer to a segment table entry (or NULL)
+ * @level=2 : returns a pointer to a region-3 table entry (or NULL)
+ * @level=3 : returns a pointer to a region-2 table entry (or NULL)
+ * @level=4 : returns a pointer to a region-1 table entry (or NULL)
+ *
+ * Returns NULL if the gmap page tables could not be walked to the
+ * requested level, if @gaddr is outside of the range covered by the
+ * ASCE type of the gmap, or if the gmap is a removed shadow gmap.
+ *
+ * Note: Can also be called for shadow gmaps.
+ */
+static inline unsigned long *gmap_table_walk(struct gmap *gmap,
+                                            unsigned long gaddr, int level)
+{
+       const unsigned long asce_type = gmap->asce & _ASCE_TYPE_MASK;
+       unsigned long *table;
+
+       if (asce_type + 4 < (level * 4))
+               return NULL;
+       if (gmap_is_shadow(gmap) && gmap->removed)
+               return NULL;
+       /*
+        * A region-1 table is indexed with the topmost address bits
+        * (gaddr >> 53, 11 bits), so every 64 bit address is in range.
+        * Skipping the check for this type also avoids a shift of -1UL by
+        * 64 bits, which is undefined behaviour.
+        */
+       if (asce_type != _ASCE_TYPE_REGION1 &&
+           gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
+               return NULL;
+       table = gmap->table;
+       switch (asce_type) {
+       case _ASCE_TYPE_REGION1:
+               table += (gaddr >> 53) & 0x7ff;
+               if (level == 4)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               /* Fallthrough */
+       case _ASCE_TYPE_REGION2:
+               table += (gaddr >> 42) & 0x7ff;
+               if (level == 3)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               /* Fallthrough */
+       case _ASCE_TYPE_REGION3:
+               table += (gaddr >> 31) & 0x7ff;
+               if (level == 2)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+               /* Fallthrough */
+       case _ASCE_TYPE_SEGMENT:
+               table += (gaddr >> 20) & 0x7ff;
+               if (level == 1)
+                       break;
+               if (*table & _REGION_ENTRY_INVALID)
+                       return NULL;
+               table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+               table += (gaddr >> 12) & 0xff;
+       }
+       return table;
+}
+
+/**
+ * gmap_pte_op_walk - walk the gmap page table, get the page table lock
+ *                   and return the pte pointer
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @ptl: pointer to the spinlock pointer
+ *
+ * For a shadow gmap the guest_table_lock of the gmap is taken before the
+ * walk and handed back via @ptl. For all other gmaps the lock is obtained
+ * together with the pte via pte_alloc_map_lock() on gmap->mm. No lock is
+ * held and @ptl is not written if NULL is returned because the segment
+ * table entry is missing or invalid.
+ *
+ * Returns a pointer to the locked pte for a guest address, or NULL
+ *
+ * Note: Can also be called for shadow gmaps.
+ */
+static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
+                              spinlock_t **ptl)
+{
+       unsigned long *table;
+
+       if (gmap_is_shadow(gmap))
+               spin_lock(&gmap->guest_table_lock);
+       /* Walk the gmap page table, lock and get pte pointer */
+       table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
+       if (!table || *table & _SEGMENT_ENTRY_INVALID) {
+               if (gmap_is_shadow(gmap))
+                       spin_unlock(&gmap->guest_table_lock);
+               return NULL;
+       }
+       if (gmap_is_shadow(gmap)) {
+               *ptl = &gmap->guest_table_lock;
+               return pte_offset_map((pmd_t *) table, gaddr);
+       }
+       return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
+}
+
+/**
+ * gmap_pte_op_fixup - force a page in and connect the gmap page table
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @vmaddr: address in the host process address space
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if the caller can retry __gmap_translate (might fail again),
+ * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
+ * up or connecting the gmap page table.
+ */
+static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
+                            unsigned long vmaddr, int prot)
+{
+       struct mm_struct *mm = gmap->mm;
+       unsigned int fault_flags;
+       bool unlocked = false;
+
+       BUG_ON(gmap_is_shadow(gmap));
+       fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
+       if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
+               return -EFAULT;
+       if (unlocked)
+               /* lost mmap_sem, caller has to retry __gmap_translate */
+               return 0;
+       /* Connect the page tables */
+       return __gmap_link(gmap, gaddr, vmaddr);
 }
-EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
 
 /**
- * gmap_ipte_notify - mark a range of ptes for invalidation notification
+ * gmap_pte_op_end - release the page table lock
+ * @ptl: pointer to the spinlock pointer
+ */
+static void gmap_pte_op_end(spinlock_t *ptl)
+{
+       spin_unlock(ptl);
+}
+
+/*
+ * gmap_protect_range - remove access rights to memory and set pgste bits
  * @gmap: pointer to guest mapping meta data structure
  * @gaddr: virtual address in the guest address space
  * @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bits: pgste notification bits to set
  *
- * Returns 0 if for each page in the given range a gmap mapping exists and
- * the invalidation notification could be set. If the gmap mapping is missing
- * for one or more pages -EFAULT is returned. If no memory could be allocated
- * -ENOMEM is returned. This function establishes missing page table entries.
+ * Returns 0 if successfully protected, -ENOMEM if out of memory and
+ * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
+ *
+ * Called with gmap->mm->mmap_sem in read.
+ *
+ * Note: Can also be called for shadow gmaps.
  */
-int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
+static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
+                             unsigned long len, int prot, unsigned long bits)
 {
-       unsigned long addr;
+       unsigned long vmaddr;
        spinlock_t *ptl;
        pte_t *ptep;
-       bool unlocked;
-       int rc = 0;
+       int rc;
+
+       while (len) {
+               rc = -EAGAIN;
+               ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+               if (ptep) {
+                       rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits);
+                       gmap_pte_op_end(ptl);
+               }
+               if (rc) {
+                       vmaddr = __gmap_translate(gmap, gaddr);
+                       if (IS_ERR_VALUE(vmaddr))
+                               return vmaddr;
+                       rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
+                       if (rc)
+                               return rc;
+                       continue;
+               }
+               gaddr += PAGE_SIZE;
+               len -= PAGE_SIZE;
+       }
+       return 0;
+}
+
+/**
+ * gmap_mprotect_notify - change access rights for a range of ptes and
+ *                        call the notifier if any pte changes again
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if for each page in the given range a gmap mapping exists,
+ * the new access rights could be set and the notifier could be armed.
+ * If the gmap mapping is missing for one or more pages -EFAULT is
+ * returned. If no memory could be allocated -ENOMEM is returned.
+ * This function establishes missing page table entries.
+ */
+int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
+                        unsigned long len, int prot)
+{
+       int rc;
 
-       if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
+       if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
+               return -EINVAL;
+       if (!MACHINE_HAS_ESOP && prot == PROT_READ)
                return -EINVAL;
        down_read(&gmap->mm->mmap_sem);
-       while (len) {
-               unlocked = false;
-               /* Convert gmap address and connect the page tables */
-               addr = __gmap_translate(gmap, gaddr);
-               if (IS_ERR_VALUE(addr)) {
-                       rc = addr;
-                       break;
+       rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT);
+       up_read(&gmap->mm->mmap_sem);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
+
+/**
+ * gmap_read_table - get an unsigned long value from a guest page table using
+ *                   absolute addressing, without marking the page referenced.
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @val: pointer to the unsigned long value to return
+ *
+ * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
+ * if reading using the virtual address failed.
+ *
+ * Called with gmap->mm->mmap_sem in read.
+ */
+int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
+{
+       unsigned long address, vmaddr;
+       spinlock_t *ptl;
+       pte_t *ptep, pte;
+       int rc;
+
+       while (1) {
+               rc = -EAGAIN;
+               ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+               if (ptep) {
+                       pte = *ptep;
+                       if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
+                               address = pte_val(pte) & PAGE_MASK;
+                               address += gaddr & ~PAGE_MASK;
+                               *val = *(unsigned long *) address;
+                               pte_val(*ptep) |= _PAGE_YOUNG;
+                               /* Do *NOT* clear the _PAGE_INVALID bit! */
+                               rc = 0;
+                       }
+                       gmap_pte_op_end(ptl);
                }
-               /* Get the page mapped */
-               if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
-                                    &unlocked)) {
-                       rc = -EFAULT;
+               if (!rc)
+                       break;
+               vmaddr = __gmap_translate(gmap, gaddr);
+               if (IS_ERR_VALUE(vmaddr)) {
+                       rc = vmaddr;
                        break;
                }
-               /* While trying to map mmap_sem got unlocked. Let us retry */
-               if (unlocked)
-                       continue;
-               rc = __gmap_link(gmap, gaddr, addr);
+               rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
                if (rc)
                        break;
-               /* Walk the process page table, lock and get pte pointer */
-               ptep = get_locked_pte(gmap->mm, addr, &ptl);
-               VM_BUG_ON(!ptep);
-               /* Set notification bit in the pgste of the pte */
-               if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
-                       ptep_set_notify(gmap->mm, addr, ptep);
-                       gaddr += PAGE_SIZE;
-                       len -= PAGE_SIZE;
-               }
-               pte_unmap_unlock(ptep, ptl);
        }
-       up_read(&gmap->mm->mmap_sem);
        return rc;
 }
-EXPORT_SYMBOL_GPL(gmap_ipte_notify);
+EXPORT_SYMBOL_GPL(gmap_read_table);
 
 /**
- * ptep_notify - call all invalidation callbacks for a specific pte.
- * @mm: pointer to the process mm_struct
- * @addr: virtual address in the process address space
- * @pte: pointer to the page table entry
+ * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
+ * @sg: pointer to the shadow guest address space structure
+ * @vmaddr: vm address associated with the rmap
+ * @rmap: pointer to the rmap structure
+ *
+ * The tree is indexed by vmaddr >> PAGE_SHIFT. If that index already holds
+ * an rmap, the new one is put in front of it and chained through
+ * rmap->next; otherwise it is inserted with rmap->next set to NULL.
  *
- * This function is assumed to be called with the page table lock held
- * for the pte to notify.
+ * Called with the sg->guest_table_lock
  */
-void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
+static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
+                                   struct gmap_rmap *rmap)
 {
-       unsigned long offset, gaddr;
-       unsigned long *table;
-       struct gmap_notifier *nb;
-       struct gmap *gmap;
+       void **slot;
 
-       offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
-       offset = offset * (4096 / sizeof(pte_t));
-       spin_lock(&gmap_notifier_lock);
-       list_for_each_entry(gmap, &mm->context.gmap_list, list) {
-               table = radix_tree_lookup(&gmap->host_to_guest,
-                                         vmaddr >> PMD_SHIFT);
-               if (!table)
+       BUG_ON(!gmap_is_shadow(sg));
+       slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
+       if (slot) {     /* index already populated: prepend to the chain */
+               rmap->next = radix_tree_deref_slot_protected(slot,
+                                                       &sg->guest_table_lock);
+               radix_tree_replace_slot(slot, rmap);
+       } else {        /* first rmap for this index */
+               rmap->next = NULL;
+               radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
+                                 rmap); /* NOTE(review): return value is ignored; presumably relies on the caller's radix_tree_preload() - confirm */
+       }
+}
+
+/**
+ * gmap_protect_rmap - modify access rights to memory and create an rmap
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow gmap
+ * @paddr: address in the parent guest address space
+ * @len: length of the memory area to protect
+ * @prot: indicates access rights: none, read-only or read-write
+ *
+ * Works page by page: each page of the parent range gets its protection
+ * forced via ptep_force_prot() and one rmap carrying @raddr is inserted for
+ * the vmaddr of that page. If a page cannot be protected right away, it is
+ * fixed up with gmap_pte_op_fixup() and the same page is tried again.
+ *
+ * Returns 0 if successfully protected and the rmap was created, -ENOMEM
+ * if out of memory and -EFAULT if paddr is invalid.
+ */
+static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
+                            unsigned long paddr, unsigned long len, int prot)
+{
+       struct gmap *parent;
+       struct gmap_rmap *rmap;
+       unsigned long vmaddr;
+       spinlock_t *ptl;
+       pte_t *ptep;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       parent = sg->parent;
+       while (len) {   /* NOTE(review): only terminates if len is a multiple of PAGE_SIZE */
+               vmaddr = __gmap_translate(parent, paddr);
+               if (IS_ERR_VALUE(vmaddr))
+                       return vmaddr;
+               rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+               if (!rmap)
+                       return -ENOMEM;
+               rmap->raddr = raddr;    /* same raddr for every page of the range */
+               rc = radix_tree_preload(GFP_KERNEL);
+               if (rc) {
+                       kfree(rmap);
+                       return rc;
+               }
+               rc = -EAGAIN;   /* stays set if the pte walk yields nothing */
+               ptep = gmap_pte_op_walk(parent, paddr, &ptl);
+               if (ptep) {
+                       spin_lock(&sg->guest_table_lock);
+                       rc = ptep_force_prot(parent->mm, paddr, ptep, prot,
+                                            PGSTE_VSIE_BIT);
+                       if (!rc)        /* rmap is owned by the radix tree from here on */
+                               gmap_insert_rmap(sg, vmaddr, rmap);
+                       spin_unlock(&sg->guest_table_lock);
+                       gmap_pte_op_end(ptl);
+               }
+               radix_tree_preload_end();
+               if (rc) {
+                       kfree(rmap);    /* rmap was not inserted, drop it */
+                       rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+                       if (rc)
+                               return rc;
                        continue;
-               gaddr = __gmap_segment_gaddr(table) + offset;
-               list_for_each_entry(nb, &gmap_notifier_list, list)
-                       nb->notifier_call(gmap, gaddr);
+               }
+               paddr += PAGE_SIZE;
+               len -= PAGE_SIZE;
        }
-       spin_unlock(&gmap_notifier_lock);
+       return 0;
+}
+
+#define _SHADOW_RMAP_MASK      0x7     /* covers every tag value below */
+#define _SHADOW_RMAP_REGION1   0x5     /* ORed into raddr by gmap_shadow_r2t() */
+#define _SHADOW_RMAP_REGION2   0x4     /* ORed into raddr by gmap_shadow_r3t() */
+#define _SHADOW_RMAP_REGION3   0x3     /* ORed into raddr by gmap_shadow_sgt() */
+#define _SHADOW_RMAP_SEGMENT   0x2
+#define _SHADOW_RMAP_PGTABLE   0x1
+
+/**
+ * gmap_idte_one - invalidate a single region or segment table entry
+ * @asce: region or segment table *origin* + table-type bits
+ * @vaddr: virtual address to identify the table entry to flush
+ *
+ * The invalid bit of a single region or segment table entry is set
+ * and the associated TLB entries depending on the entry are flushed.
+ * The table-type of the @asce identifies the portion of the @vaddr
+ * that is used as the invalidation index.
+ *
+ * The instruction is emitted through .insn with opcode 0xb98e. Both
+ * operands use the "a" constraint, and the condition code and memory are
+ * declared as clobbered so that the compiler does not keep table contents
+ * cached across the instruction.
+ */
+static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
+{
+       asm volatile(
+               "       .insn   rrf,0xb98e0000,%0,%1,0,0"
+               : : "a" (asce), "a" (vaddr) : "cc", "memory");
+}
+
+/**
+ * gmap_unshadow_page - remove a page from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Does nothing if no page table entry can be found for @raddr or if the
+ * entry is already invalid. Otherwise the notifiers are called for the
+ * 4K range starting at @raddr before the pte is unshadowed.
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
+{
+       unsigned long *table;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
+       if (!table || *table & _PAGE_INVALID)
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1); /* one 4K page */
+       ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
+}
+
+/**
+ * __gmap_unshadow_pgt - remove all entries from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @pgt: pointer to the start of a shadow page table
+ *
+ * Every one of the 256 entries of @pgt is overwritten with _PAGE_INVALID.
+ * @raddr does not influence the result.
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
+                               unsigned long *pgt)
+{
+       unsigned long *entry;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       for (entry = pgt; entry < pgt + 256; entry++)
+               *entry = _PAGE_INVALID;
+}
+
+/**
+ * gmap_unshadow_pgt - remove a shadow page table from a segment entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ *
+ * Does nothing if no segment entry can be found for @raddr or if the
+ * entry has no origin. Otherwise the notifiers are called for the 1M range,
+ * the entry is invalidated with gmap_idte_one() and reset to empty, and
+ * only then the page table is wiped, unlinked and freed.
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
+{
+       unsigned long sto, *ste, *pgt;
+       struct page *page;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
+       if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1);
+       sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff)); /* step back by the entry index to the table start */
+       gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
+       pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); /* remember the table before the entry is cleared */
+       *ste = _SEGMENT_ENTRY_EMPTY;
+       __gmap_unshadow_pgt(sg, raddr, pgt);
+       /* Free page table */
+       page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       page_table_free_pgste(page);
+}
+
+/**
+ * __gmap_unshadow_sgt - remove all entries from a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @sgt: pointer to the start of a shadow segment table
+ *
+ * Walks all 2048 entries of @sgt. Every entry that has an origin is reset
+ * to _SEGMENT_ENTRY_EMPTY, the page table it referenced is wiped and the
+ * page backing that page table is unlinked and freed. No TLB invalidation
+ * is issued here.
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
+                               unsigned long *sgt)
+{
+       unsigned long *pgt;
+       struct page *page;
+       int i;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       for (i = 0; i < 2048; i++, raddr += 1UL << 20) {
+               if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
+                       continue;
+               /*
+                * NOTE(review): the origin is extracted with the region mask
+                * although this is a segment entry; gmap_unshadow_pgt() uses
+                * _SEGMENT_ENTRY_ORIGIN for the same purpose - confirm.
+                */
+               pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
+               sgt[i] = _SEGMENT_ENTRY_EMPTY;
+               __gmap_unshadow_pgt(sg, raddr, pgt);
+               /* Free page table */
+               page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               page_table_free_pgste(page);
+       }
+}
+
+/**
+ * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Does nothing if no region-3 entry can be found for @raddr or if the
+ * entry has no origin. Otherwise the notifiers are called for the 2G range,
+ * the entry is invalidated with gmap_idte_one() and reset to empty, and
+ * only then the segment table is torn down, unlinked and freed.
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
+{
+       unsigned long r3o, *r3e, *sgt;
+       struct page *page;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
+       if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1);
+       r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff)); /* step back by the entry index to the table start */
+       gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
+       sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); /* remember the table before the entry is cleared */
+       *r3e = _REGION3_ENTRY_EMPTY;
+       __gmap_unshadow_sgt(sg, raddr, sgt);
+       /* Free segment table */
+       page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       __free_pages(page, 2); /* order 2: four pages */
+}
+
+/**
+ * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ * @r3t: pointer to the start of a shadow region-3 table
+ *
+ * Walks all 2048 entries of @r3t. Every entry that has an origin is reset
+ * to _REGION3_ENTRY_EMPTY, the segment table it referenced is torn down
+ * and its pages are unlinked and freed. No TLB invalidation is issued
+ * here.
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
+                               unsigned long *r3t)
+{
+       unsigned long *sgt;
+       struct page *page;
+       int i;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       for (i = 0; i < 2048; i++, raddr += 1UL << 31) {
+               if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
+                       continue;
+               sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
+               r3t[i] = _REGION3_ENTRY_EMPTY;
+               __gmap_unshadow_sgt(sg, raddr, sgt);
+               /* Free segment table */
+               page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               __free_pages(page, 2);
+       }
+}
+
+/**
+ * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Does nothing if no region-2 entry can be found for @raddr or if the
+ * entry has no origin. Otherwise the notifiers are called for the range of
+ * 1UL << 42 bytes, the entry is invalidated with gmap_idte_one() and reset
+ * to empty, and only then the region-3 table is torn down and freed.
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
+{
+       unsigned long r2o, *r2e, *r3t;
+       struct page *page;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
+       if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1);
+       r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff)); /* step back by the entry index to the table start */
+       gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
+       r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); /* remember the table before the entry is cleared */
+       *r2e = _REGION2_ENTRY_EMPTY;
+       __gmap_unshadow_r3t(sg, raddr, r3t);
+       /* Free region 3 table */
+       page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       __free_pages(page, 2); /* order 2: four pages */
+}
+
+/**
+ * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r2t: pointer to the start of a shadow region-2 table
+ *
+ * Walks all 2048 entries of @r2t. Every entry that has an origin is reset
+ * to _REGION2_ENTRY_EMPTY, the region-3 table it referenced is torn down
+ * and its pages are unlinked and freed. No TLB invalidation is issued
+ * here.
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
+                               unsigned long *r2t)
+{
+       unsigned long *r3t;
+       struct page *page;
+       int i;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       for (i = 0; i < 2048; i++, raddr += 1UL << 42) {
+               if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
+                       continue;
+               r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
+               r2t[i] = _REGION2_ENTRY_EMPTY;
+               __gmap_unshadow_r3t(sg, raddr, r3t);
+               /* Free region 3 table */
+               page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               __free_pages(page, 2);
+       }
+}
+
+/**
+ * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Does nothing if no region-1 entry can be found for @raddr or if the
+ * entry has no origin. Otherwise the notifiers are called for the range of
+ * 1UL << 53 bytes, the entry is invalidated with gmap_idte_one() and reset
+ * to empty, and only then the region-2 table is torn down and freed.
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
+{
+       unsigned long r1o, *r1e, *r2t;
+       struct page *page;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
+       if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
+               return;
+       gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1);
+       r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff)); /* step back by the entry index to the table start */
+       gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
+       r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); /* remember the table before the entry is cleared */
+       *r1e = _REGION1_ENTRY_EMPTY;
+       __gmap_unshadow_r2t(sg, raddr, r2t);
+       /* Free region 2 table */
+       page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+       list_del(&page->lru);
+       __free_pages(page, 2); /* order 2: four pages */
+}
+
+/**
+ * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r1t: pointer to the start of a shadow region-1 table
+ *
+ * Walks all 2048 entries of @r1t. For every entry that has an origin the
+ * referenced region-2 table is torn down first, then the entry is
+ * invalidated with gmap_idte_one() and reset to _REGION1_ENTRY_EMPTY, and
+ * finally the pages of the region-2 table are unlinked and freed.
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
+                               unsigned long *r1t)
+{
+       unsigned long asce, *r2t;
+       struct page *page;
+       int i;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; /* operand for gmap_idte_one() below */
+       for (i = 0; i < 2048; i++, raddr += 1UL << 53) {
+               if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
+                       continue;
+               r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
+               __gmap_unshadow_r2t(sg, raddr, r2t);
+               /* Clear entry and flush translation r1t -> r2t */
+               gmap_idte_one(asce, raddr);
+               r1t[i] = _REGION1_ENTRY_EMPTY;
+               /* Free region 2 table */
+               page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+               list_del(&page->lru);
+               __free_pages(page, 2); /* order 2: four pages */
+       }
+}
+
+/**
+ * gmap_unshadow - remove a shadow page table completely
+ * @sg: pointer to the shadow guest address space structure
+ *
+ * Marks @sg as removed, calls the notifiers for the whole address range,
+ * flushes the TLB and then tears down the table hierarchy below the top
+ * level table named by sg->asce. A shadow that is already marked as
+ * removed is left untouched, so calling this twice is harmless.
+ *
+ * Called with sg->guest_table_lock
+ */
+static void gmap_unshadow(struct gmap *sg)
+{
+       unsigned long *table;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       if (sg->removed)
+               return;
+       sg->removed = 1;
+       gmap_call_notifier(sg, 0, -1UL); /* whole address space */
+       gmap_flush_tlb(sg);
+       table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
+       switch (sg->asce & _ASCE_TYPE_MASK) { /* no default: other type values do nothing */
+       case _ASCE_TYPE_REGION1:
+               __gmap_unshadow_r1t(sg, 0, table);
+               break;
+       case _ASCE_TYPE_REGION2:
+               __gmap_unshadow_r2t(sg, 0, table);
+               break;
+       case _ASCE_TYPE_REGION3:
+               __gmap_unshadow_r3t(sg, 0, table);
+               break;
+       case _ASCE_TYPE_SEGMENT:
+               __gmap_unshadow_sgt(sg, 0, table);
+               break;
+       }
+}
+
+/**
+ * gmap_find_shadow - look up an existing shadow gmap of a parent
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Only children that are not marked as removed and whose original ASCE and
+ * edat level both match are considered. The first such child decides the
+ * result: if it is initialized, its reference count is raised and it is
+ * returned; if it is still being set up, ERR_PTR(-EAGAIN) is returned.
+ *
+ * Returns the matching gmap, ERR_PTR(-EAGAIN) if a matching one is just
+ * being created, otherwise NULL.
+ */
+static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
+                                    int edat_level)
+{
+       struct gmap *child;
+
+       list_for_each_entry(child, &parent->children, list) {
+               if (child->removed)
+                       continue;
+               if (child->orig_asce == asce &&
+                   child->edat_level == edat_level) {
+                       if (child->initialized) {
+                               atomic_inc(&child->ref_count);
+                               return child;
+                       }
+                       return ERR_PTR(-EAGAIN);
+               }
+       }
+       return NULL;
+}
+
+/**
+ * gmap_shadow_valid - tell whether a shadow gmap can still be used
+ * @sg: pointer to the shadow guest address space structure
+ * @asce: ASCE for which the shadow table is requested
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns 1 if @sg has not been removed and both its original ASCE and its
+ * edat level equal the requested ones, so the caller may keep using it.
+ * Returns 0 otherwise; the caller then has to request a new shadow gmap.
+ */
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
+{
+       return !sg->removed &&
+              sg->orig_asce == asce &&
+              sg->edat_level == edat_level;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_valid);
+
+/**
+ * gmap_shadow - create/find a shadow guest address space
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * The pages of the top level page table referred by the asce parameter
+ * will be set to read-only and marked in the PGSTEs of the kvm process.
+ * The shadow table will be removed automatically on any change to the
+ * PTE mapping for the source table.
+ *
+ * The lookup is done twice under parent->shadow_lock: once up front and
+ * once more after the new gmap has been allocated outside of the lock, so
+ * that a shadow created concurrently by another CPU is reused. The new
+ * gmap is added to the list of children before the source table gets
+ * protected and is only marked as initialized afterwards.
+ *
+ * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
+ * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
+ * parent gmap table could not be protected.
+ */
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+                        int edat_level)
+{
+       struct gmap *sg, *new;
+       unsigned long limit;
+       int rc;
+
+       BUG_ON(gmap_is_shadow(parent)); /* shadows of shadows are not allowed */
+       spin_lock(&parent->shadow_lock);
+       sg = gmap_find_shadow(parent, asce, edat_level);
+       spin_unlock(&parent->shadow_lock);
+       if (sg) /* existing shadow or ERR_PTR(-EAGAIN) */
+               return sg;
+       /* Create a new shadow gmap */
+       limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); /* 11 address bits more per table type step */
+       if (asce & _ASCE_REAL_SPACE)
+               limit = -1UL;
+       new = gmap_alloc(limit);
+       if (!new)
+               return ERR_PTR(-ENOMEM);
+       new->mm = parent->mm;
+       new->parent = gmap_get(parent);
+       new->orig_asce = asce;
+       new->edat_level = edat_level;
+       new->initialized = false;
+       spin_lock(&parent->shadow_lock);
+       /* Recheck if another CPU created the same shadow */
+       sg = gmap_find_shadow(parent, asce, edat_level);
+       if (sg) {
+               spin_unlock(&parent->shadow_lock);
+               gmap_free(new);
+               return sg;
+       }
+       if (asce & _ASCE_REAL_SPACE) {
+               /* only allow one real-space gmap shadow */
+               list_for_each_entry(sg, &parent->children, list) {
+                       if (sg->orig_asce & _ASCE_REAL_SPACE) {
+                               spin_lock(&sg->guest_table_lock);
+                               gmap_unshadow(sg);
+                               spin_unlock(&sg->guest_table_lock);
+                               list_del(&sg->list);
+                               gmap_put(sg);
+                               break; /* list was modified, stop iterating */
+                       }
+               }
+       }
+       atomic_set(&new->ref_count, 2); /* NOTE(review): presumably one for the children list and one for the caller - confirm */
+       list_add(&new->list, &parent->children);
+       if (asce & _ASCE_REAL_SPACE) {
+               /* nothing to protect, return right away */
+               new->initialized = true;
+               spin_unlock(&parent->shadow_lock);
+               return new;
+       }
+       spin_unlock(&parent->shadow_lock);
+       /* protect after insertion, so it will get properly invalidated */
+       down_read(&parent->mm->mmap_sem);
+       rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
+                               ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096,
+                               PROT_READ, PGSTE_VSIE_BIT);
+       up_read(&parent->mm->mmap_sem);
+       spin_lock(&parent->shadow_lock);
+       new->initialized = true;
+       if (rc) { /* protection failed: take the new shadow off the list again */
+               list_del(&new->list);
+               gmap_free(new);
+               new = ERR_PTR(rc);
+       }
+       spin_unlock(&parent->shadow_lock);
+       return new;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow);
+
+/**
+ * gmap_shadow_r2t - create an empty shadow region 2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r2t: parent gmap address of the region 2 table to get shadowed
+ * @fake: r2t references contiguous guest memory block, not a r2t
+ *
+ * The r2t parameter specifies the address of the source table. The
+ * four pages of the source table are made read-only in the parent gmap
+ * address space. A write to the source table area @r2t will automatically
+ * remove the shadow r2 table and all of its descendants.
+ *
+ * The new table is first installed with the invalid bit set and is only
+ * validated once the source table has been protected. If the protection
+ * fails, the shadow r2 table is removed again.
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+                   int fake)
+{
+       unsigned long raddr, origin, offset, len;
+       unsigned long *s_r2t, *table;
+       struct page *page;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       /* Allocate a shadow region second table */
+       page = alloc_pages(GFP_KERNEL, 2);
+       if (!page)
+               return -ENOMEM;
+       page->index = r2t & _REGION_ENTRY_ORIGIN; /* remember the source table origin */
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_r2t = (unsigned long *) page_to_phys(page);
+       /* Install shadow region second table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _REGION_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _REGION_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
+                _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
+       if (sg->edat_level >= 1)
+               *table |= (r2t & _REGION_ENTRY_PROTECT);
+       list_add(&page->lru, &sg->crst_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_REGION_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make r2t read-only in parent gmap page table */
+       raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1;
+       origin = r2t & _REGION_ENTRY_ORIGIN;
+       offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+       len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 4); /* lock was dropped, look the entry up again */
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+                             (unsigned long) s_r2t)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_REGION_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_r2t(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+out_free: /* table was never installed: drop the lock and release the pages */
+       spin_unlock(&sg->guest_table_lock);
+       __free_pages(page, 2);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
+
+/**
+ * gmap_shadow_r3t - create a shadow region 3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r3t: parent gmap address of the region 3 table to get shadowed
+ * @fake: r3t references contiguous guest memory block, not a r3t
+ *
+ * A freshly allocated table is installed, still marked invalid, in the
+ * region-2 entry for @saddr. Unless @fake is set, the source table is then
+ * made read-only in the parent gmap and only afterwards the entry is
+ * validated. If the region-2 entry already references a table that is
+ * still invalid, that table is in the middle of being shadowed elsewhere;
+ * the new table is released and -EAGAIN is returned, so the existing entry
+ * is never overwritten.
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+                   int fake)
+{
+       unsigned long raddr, origin, offset, len;
+       unsigned long *s_r3t, *table;
+       struct page *page;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       /* Allocate a shadow region third table */
+       page = alloc_pages(GFP_KERNEL, 2);
+       if (!page)
+               return -ENOMEM;
+       page->index = r3t & _REGION_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_r3t = (unsigned long *) page_to_phys(page);
+       /* Install shadow region third table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _REGION_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _REGION_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
+                _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
+       if (sg->edat_level >= 1)
+               *table |= (r3t & _REGION_ENTRY_PROTECT);
+       list_add(&page->lru, &sg->crst_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_REGION_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make r3t read-only in parent gmap page table */
+       raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2;
+       origin = r3t & _REGION_ENTRY_ORIGIN;
+       offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+       len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               /* The lock was dropped, so look the entry up again */
+               table = gmap_table_walk(sg, saddr, 3);
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+                             (unsigned long) s_r3t)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_REGION_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_r3t(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+out_free:
+       /* The new table was never installed: unlock and release its pages */
+       spin_unlock(&sg->guest_table_lock);
+       __free_pages(page, 2);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
+
+/**
+ * gmap_shadow_sgt - create a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @sgt: parent gmap address of the segment table to get shadowed
+ * @fake: sgt references contiguous guest memory block, not a sgt
+ *
+ * The new table is first installed with the invalid bit set and is only
+ * validated once the source table has been protected. If the protection
+ * fails, the shadow segment table is removed again.
+ *
+ * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+                   int fake)
+{
+       unsigned long raddr, origin, offset, len;
+       unsigned long *s_sgt, *table;
+       struct page *page;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
+       /* Allocate a shadow segment table */
+       page = alloc_pages(GFP_KERNEL, 2);
+       if (!page)
+               return -ENOMEM;
+       page->index = sgt & _REGION_ENTRY_ORIGIN; /* remember the source table origin */
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_sgt = (unsigned long *) page_to_phys(page);
+       /* Install shadow segment table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _REGION_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _REGION_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
+                _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
+       if (sg->edat_level >= 1)
+               *table |= sgt & _REGION_ENTRY_PROTECT;
+       list_add(&page->lru, &sg->crst_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_REGION_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make sgt read-only in parent gmap page table */
+       raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3;
+       origin = sgt & _REGION_ENTRY_ORIGIN;
+       offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+       len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+       rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 2); /* lock was dropped, look the entry up again */
+               if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+                             (unsigned long) s_sgt)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_REGION_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_sgt(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+out_free: /* table was never installed: drop the lock and release the pages */
+       spin_unlock(&sg->guest_table_lock);
+       __free_pages(page, 2);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
+
+/**
+ * gmap_shadow_pgt_lookup - find a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: the address in the shadow guest address space
+ * @pgt: set to the parent gmap address of the shadowed page table
+ * @dat_protection: set to 1 if the segment entry is marked as protected,
+ *                  0 otherwise
+ * @fake: set to 1 if pgt references a contiguous guest memory block and
+ *        not a pgtable, 0 otherwise
+ *
+ * @pgt, @dat_protection and @fake are only written if a valid segment
+ * entry was found for @saddr.
+ *
+ * Returns 0 if the shadow page table was found and -EAGAIN if the page
+ * table was not found.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+                          unsigned long *pgt, int *dat_protection,
+                          int *fake)
+{
+       unsigned long *table;
+       struct page *page;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+       if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
+               /* Shadow page tables are full pages (pte+pgste) */
+               page = pfn_to_page(*table >> PAGE_SHIFT);
+               *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; /* source origin without the fake marker */
+               *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
+               *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
+               rc = 0;
+       } else  {
+               rc = -EAGAIN;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
+
+/**
+ * gmap_shadow_pgt - instantiate a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: parent gmap address of the page table to get shadowed
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+                   int fake)
+{
+       unsigned long raddr, origin;
+       unsigned long *s_pgt, *table;
+       struct page *page;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
+       /* Allocate a shadow page table */
+       page = page_table_alloc_pgste(sg->mm);
+       if (!page)
+               return -ENOMEM;
+       page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
+       if (fake)
+               page->index |= GMAP_SHADOW_FAKE_TABLE;
+       s_pgt = (unsigned long *) page_to_phys(page);
+       /* Install shadow page table */
+       spin_lock(&sg->guest_table_lock);
+       table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+       if (!table) {
+               rc = -EAGAIN;           /* Race with unshadow */
+               goto out_free;
+       }
+       if (!(*table & _SEGMENT_ENTRY_INVALID)) {
+               rc = 0;                 /* Already established */
+               goto out_free;
+       } else if (*table & _SEGMENT_ENTRY_ORIGIN) {
+               rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
+       }
+       /* mark as invalid as long as the parent table is not protected */
+       *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
+                (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
+       list_add(&page->lru, &sg->pt_list);
+       if (fake) {
+               /* nothing to protect for fake tables */
+               *table &= ~_SEGMENT_ENTRY_INVALID;
+               spin_unlock(&sg->guest_table_lock);
+               return 0;
+       }
+       spin_unlock(&sg->guest_table_lock);
+       /* Make pgt read-only in parent gmap page table (not the pgste) */
+       raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT;
+       origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
+       rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ);
+       spin_lock(&sg->guest_table_lock);
+       if (!rc) {
+               table = gmap_table_walk(sg, saddr, 1);
+               if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
+                             (unsigned long) s_pgt)
+                       rc = -EAGAIN;           /* Race with unshadow */
+               else
+                       *table &= ~_SEGMENT_ENTRY_INVALID;
+       } else {
+               gmap_unshadow_pgt(sg, raddr);
+       }
+       spin_unlock(&sg->guest_table_lock);
+       return rc;
+out_free:
+       spin_unlock(&sg->guest_table_lock);
+       page_table_free_pgste(page);
+       return rc;
+
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
+
+/**
+ * gmap_shadow_page - create a shadow page mapping
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pte: pte in parent gmap address space to get shadowed
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
+{
+       struct gmap *parent;
+       struct gmap_rmap *rmap;
+       unsigned long vmaddr, paddr;
+       spinlock_t *ptl;
+       pte_t *sptep, *tptep;
+       int prot;
+       int rc;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       parent = sg->parent;
+       prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
+
+       rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+       if (!rmap)
+               return -ENOMEM;
+       rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
+
+       while (1) {
+               paddr = pte_val(pte) & PAGE_MASK;
+               vmaddr = __gmap_translate(parent, paddr);
+               if (IS_ERR_VALUE(vmaddr)) {
+                       rc = vmaddr;
+                       break;
+               }
+               rc = radix_tree_preload(GFP_KERNEL);
+               if (rc)
+                       break;
+               rc = -EAGAIN;
+               sptep = gmap_pte_op_walk(parent, paddr, &ptl);
+               if (sptep) {
+                       spin_lock(&sg->guest_table_lock);
+                       /* Get page table pointer */
+                       tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
+                       if (!tptep) {
+                               spin_unlock(&sg->guest_table_lock);
+                               gmap_pte_op_end(ptl);
+                               radix_tree_preload_end();
+                               break;
+                       }
+                       rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
+                       if (rc > 0) {
+                               /* Success and a new mapping */
+                               gmap_insert_rmap(sg, vmaddr, rmap);
+                               rmap = NULL;
+                               rc = 0;
+                       }
+                       gmap_pte_op_end(ptl);
+                       spin_unlock(&sg->guest_table_lock);
+               }
+               radix_tree_preload_end();
+               if (!rc)
+                       break;
+               rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+               if (rc)
+                       break;
+       }
+       kfree(rmap);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_page);
+
+/**
+ * gmap_shadow_notify - handle notifications for shadow gmap
+ *
+ * Called with sg->parent->shadow_lock.
+ */
+static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
+                              unsigned long offset, pte_t *pte)
+{
+       struct gmap_rmap *rmap, *rnext, *head;
+       unsigned long gaddr, start, end, bits, raddr;
+       unsigned long *table;
+
+       BUG_ON(!gmap_is_shadow(sg));
+       spin_lock(&sg->parent->guest_table_lock);
+       table = radix_tree_lookup(&sg->parent->host_to_guest,
+                                 vmaddr >> PMD_SHIFT);
+       gaddr = table ? __gmap_segment_gaddr(table) + offset : 0;
+       spin_unlock(&sg->parent->guest_table_lock);
+       if (!table)
+               return;
+
+       spin_lock(&sg->guest_table_lock);
+       if (sg->removed) {
+               spin_unlock(&sg->guest_table_lock);
+               return;
+       }
+       /* Check for top level table */
+       start = sg->orig_asce & _ASCE_ORIGIN;
+       end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096;
+       if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
+           gaddr < end) {
+               /* The complete shadow table has to go */
+               gmap_unshadow(sg);
+               spin_unlock(&sg->guest_table_lock);
+               list_del(&sg->list);
+               gmap_put(sg);
+               return;
+       }
+       /* Remove the shadow tables hanging off this one specific entry */
+       head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12);
+       gmap_for_each_rmap_safe(rmap, rnext, head) {
+               bits = rmap->raddr & _SHADOW_RMAP_MASK;
+               raddr = rmap->raddr ^ bits;
+               switch (bits) {
+               case _SHADOW_RMAP_REGION1:
+                       gmap_unshadow_r2t(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_REGION2:
+                       gmap_unshadow_r3t(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_REGION3:
+                       gmap_unshadow_sgt(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_SEGMENT:
+                       gmap_unshadow_pgt(sg, raddr);
+                       break;
+               case _SHADOW_RMAP_PGTABLE:
+                       gmap_unshadow_page(sg, raddr);
+                       break;
+               }
+               kfree(rmap);
+       }
+       spin_unlock(&sg->guest_table_lock);
+}
+
+/**
+ * ptep_notify - call all invalidation callbacks for a specific pte.
+ * @mm: pointer to the process mm_struct
+ * @vmaddr: virtual address in the process address space
+ * @pte: pointer to the page table entry
+ * @bits: bits from the pgste that caused the notify call
+ *
+ * This function is assumed to be called with the page table lock held
+ * for the pte to notify.
+ */
+void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
+                pte_t *pte, unsigned long bits)
+{
+       unsigned long offset, gaddr;
+       unsigned long *table;
+       struct gmap *gmap, *sg, *next;
+
+       offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
+       offset = offset * (4096 / sizeof(pte_t));
+       rcu_read_lock();
+       list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+               if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
+                       spin_lock(&gmap->shadow_lock);
+                       list_for_each_entry_safe(sg, next,
+                                                &gmap->children, list)
+                               gmap_shadow_notify(sg, vmaddr, offset, pte);
+                       spin_unlock(&gmap->shadow_lock);
+               }
+               if (!(bits & PGSTE_IN_BIT))
+                       continue;
+               spin_lock(&gmap->guest_table_lock);
+               table = radix_tree_lookup(&gmap->host_to_guest,
+                                         vmaddr >> PMD_SHIFT);
+               if (table)
+                       gaddr = __gmap_segment_gaddr(table) + offset;
+               spin_unlock(&gmap->guest_table_lock);
+               if (table)
+                       gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
+       }
+       rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(ptep_notify);
 
index e2565d2..995f785 100644 (file)
@@ -137,6 +137,29 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
        return new;
 }
 
+#ifdef CONFIG_PGSTE
+
+struct page *page_table_alloc_pgste(struct mm_struct *mm)
+{
+       struct page *page;
+       unsigned long *table;
+
+       page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
+       if (page) {
+               table = (unsigned long *) page_to_phys(page);
+               clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+               clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
+       }
+       return page;
+}
+
+void page_table_free_pgste(struct page *page)
+{
+       __free_page(page);
+}
+
+#endif /* CONFIG_PGSTE */
+
 /*
  * page table entry allocation/free routines.
  */
@@ -149,7 +172,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
        /* Try to get a fragment of a 4K page as a 2K page table */
        if (!mm_alloc_pgste(mm)) {
                table = NULL;
-               spin_lock_bh(&mm->context.list_lock);
+               spin_lock_bh(&mm->context.pgtable_lock);
                if (!list_empty(&mm->context.pgtable_list)) {
                        page = list_first_entry(&mm->context.pgtable_list,
                                                struct page, lru);
@@ -164,7 +187,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
                                list_del(&page->lru);
                        }
                }
-               spin_unlock_bh(&mm->context.list_lock);
+               spin_unlock_bh(&mm->context.pgtable_lock);
                if (table)
                        return table;
        }
@@ -187,9 +210,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
                /* Return the first 2K fragment of the page */
                atomic_set(&page->_mapcount, 1);
                clear_table(table, _PAGE_INVALID, PAGE_SIZE);
-               spin_lock_bh(&mm->context.list_lock);
+               spin_lock_bh(&mm->context.pgtable_lock);
                list_add(&page->lru, &mm->context.pgtable_list);
-               spin_unlock_bh(&mm->context.list_lock);
+               spin_unlock_bh(&mm->context.pgtable_lock);
        }
        return table;
 }
@@ -203,13 +226,13 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
        if (!mm_alloc_pgste(mm)) {
                /* Free 2K page table fragment of a 4K page */
                bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
-               spin_lock_bh(&mm->context.list_lock);
+               spin_lock_bh(&mm->context.pgtable_lock);
                mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
                if (mask & 3)
                        list_add(&page->lru, &mm->context.pgtable_list);
                else
                        list_del(&page->lru);
-               spin_unlock_bh(&mm->context.list_lock);
+               spin_unlock_bh(&mm->context.pgtable_lock);
                if (mask != 0)
                        return;
        }
@@ -235,13 +258,13 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
                return;
        }
        bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
-       spin_lock_bh(&mm->context.list_lock);
+       spin_lock_bh(&mm->context.pgtable_lock);
        mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
        if (mask & 3)
                list_add_tail(&page->lru, &mm->context.pgtable_list);
        else
                list_del(&page->lru);
-       spin_unlock_bh(&mm->context.list_lock);
+       spin_unlock_bh(&mm->context.pgtable_lock);
        table = (unsigned long *) (__pa(table) | (1U << bit));
        tlb_remove_table(tlb, table);
 }
index b98d1a1..5f09201 100644 (file)
@@ -174,14 +174,17 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
        return pgste;
 }
 
-static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
-                                       unsigned long addr,
-                                       pte_t *ptep, pgste_t pgste)
+static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
+                                      unsigned long addr,
+                                      pte_t *ptep, pgste_t pgste)
 {
 #ifdef CONFIG_PGSTE
-       if (pgste_val(pgste) & PGSTE_IN_BIT) {
-               pgste_val(pgste) &= ~PGSTE_IN_BIT;
-               ptep_notify(mm, addr, ptep);
+       unsigned long bits;
+
+       bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
+       if (bits) {
+               pgste_val(pgste) ^= bits;
+               ptep_notify(mm, addr, ptep, bits);
        }
 #endif
        return pgste;
@@ -194,7 +197,7 @@ static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
 
        if (mm_has_pgste(mm)) {
                pgste = pgste_get_lock(ptep);
-               pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+               pgste = pgste_pte_notify(mm, addr, ptep, pgste);
        }
        return pgste;
 }
@@ -459,6 +462,90 @@ void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
        preempt_enable();
 }
 
+/**
+ * ptep_force_prot - change access rights of a locked pte
+ * @mm: pointer to the process mm_struct
+ * @addr: virtual address in the guest address space
+ * @ptep: pointer to the page table entry
+ * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bit: pgste bit to set (e.g. for notification)
+ *
+ * Returns 0 if the access rights were changed and -EAGAIN if the current
+ * and requested access rights are incompatible.
+ */
+int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
+                   pte_t *ptep, int prot, unsigned long bit)
+{
+       pte_t entry;
+       pgste_t pgste;
+       int pte_i, pte_p;
+
+       pgste = pgste_get_lock(ptep);
+       entry = *ptep;
+       /* Check pte entry after all locks have been acquired */
+       pte_i = pte_val(entry) & _PAGE_INVALID;
+       pte_p = pte_val(entry) & _PAGE_PROTECT;
+       if ((pte_i && (prot != PROT_NONE)) ||
+           (pte_p && (prot & PROT_WRITE))) {
+               pgste_set_unlock(ptep, pgste);
+               return -EAGAIN;
+       }
+       /* Change access rights and set pgste bit */
+       if (prot == PROT_NONE && !pte_i) {
+               ptep_flush_direct(mm, addr, ptep);
+               pgste = pgste_update_all(entry, pgste, mm);
+               pte_val(entry) |= _PAGE_INVALID;
+       }
+       if (prot == PROT_READ && !pte_p) {
+               ptep_flush_direct(mm, addr, ptep);
+               pte_val(entry) &= ~_PAGE_INVALID;
+               pte_val(entry) |= _PAGE_PROTECT;
+       }
+       pgste_val(pgste) |= bit;
+       pgste = pgste_set_pte(ptep, pgste, entry);
+       pgste_set_unlock(ptep, pgste);
+       return 0;
+}
+
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+                   pte_t *sptep, pte_t *tptep, pte_t pte)
+{
+       pgste_t spgste, tpgste;
+       pte_t spte, tpte;
+       int rc = -EAGAIN;
+
+       if (!(pte_val(*tptep) & _PAGE_INVALID))
+               return 0;       /* already shadowed */
+       spgste = pgste_get_lock(sptep);
+       spte = *sptep;
+       if (!(pte_val(spte) & _PAGE_INVALID) &&
+           !((pte_val(spte) & _PAGE_PROTECT) &&
+             !(pte_val(pte) & _PAGE_PROTECT))) {
+               pgste_val(spgste) |= PGSTE_VSIE_BIT;
+               tpgste = pgste_get_lock(tptep);
+               pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
+                               (pte_val(pte) & _PAGE_PROTECT);
+               /* don't touch the storage key - it belongs to parent pgste */
+               tpgste = pgste_set_pte(tptep, tpgste, tpte);
+               pgste_set_unlock(tptep, tpgste);
+               rc = 1;
+       }
+       pgste_set_unlock(sptep, spgste);
+       return rc;
+}
+
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
+{
+       pgste_t pgste;
+
+       pgste = pgste_get_lock(ptep);
+       /* notifier is called by the caller */
+       ptep_flush_direct(mm, saddr, ptep);
+       /* don't touch the storage key - it belongs to parent pgste */
+       pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
+       pgste_set_unlock(ptep, pgste);
+}
+
 static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
 {
        if (!non_swap_entry(entry))
@@ -532,7 +619,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
        pgste_val(pgste) &= ~PGSTE_UC_BIT;
        pte = *ptep;
        if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
-               pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+               pgste = pgste_pte_notify(mm, addr, ptep, pgste);
                __ptep_ipte(addr, ptep);
                if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
                        pte_val(pte) |= _PAGE_PROTECT;
@@ -555,12 +642,9 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
        pgste_t old, new;
        pte_t *ptep;
 
-       down_read(&mm->mmap_sem);
        ptep = get_locked_pte(mm, addr, &ptl);
-       if (unlikely(!ptep)) {
-               up_read(&mm->mmap_sem);
+       if (unlikely(!ptep))
                return -EFAULT;
-       }
 
        new = old = pgste_get_lock(ptep);
        pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
@@ -587,45 +671,100 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 
        pgste_set_unlock(ptep, new);
        pte_unmap_unlock(ptep, ptl);
-       up_read(&mm->mmap_sem);
        return 0;
 }
 EXPORT_SYMBOL(set_guest_storage_key);
 
-unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
+/**
+ * Conditionally set a guest storage key (handling csske).
+ * oldkey will be updated when either mr or mc is set and a pointer is given.
+ *
+ * Returns 0 if a guest's storage key update wasn't necessary, 1 if the guest
+ * storage key was updated and -EFAULT on access errors.
+ */
+int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+                              unsigned char key, unsigned char *oldkey,
+                              bool nq, bool mr, bool mc)
+{
+       unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
+       int rc;
+
+       /* we can drop the pgste lock between getting and setting the key */
+       if (mr | mc) {
+               rc = get_guest_storage_key(mm, addr, &tmp);
+               if (rc)
+                       return rc;
+               if (oldkey)
+                       *oldkey = tmp;
+               if (!mr)
+                       mask |= _PAGE_REFERENCED;
+               if (!mc)
+                       mask |= _PAGE_CHANGED;
+               if (!((tmp ^ key) & mask))
+                       return 0;
+       }
+       rc = set_guest_storage_key(mm, addr, key, nq);
+       return rc < 0 ? rc : 1;
+}
+EXPORT_SYMBOL(cond_set_guest_storage_key);
+
+/**
+ * Reset a guest reference bit (rrbe), returning the reference and changed bit.
+ *
+ * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
+ */
+int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
 {
-       unsigned char key;
        spinlock_t *ptl;
-       pgste_t pgste;
+       pgste_t old, new;
        pte_t *ptep;
+       int cc = 0;
 
-       down_read(&mm->mmap_sem);
        ptep = get_locked_pte(mm, addr, &ptl);
-       if (unlikely(!ptep)) {
-               up_read(&mm->mmap_sem);
+       if (unlikely(!ptep))
                return -EFAULT;
-       }
-       pgste = pgste_get_lock(ptep);
 
-       if (pte_val(*ptep) & _PAGE_INVALID) {
-               key  = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
-               key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
-               key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
-               key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
-       } else {
-               key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+       new = old = pgste_get_lock(ptep);
+       /* Reset guest reference bit only */
+       pgste_val(new) &= ~PGSTE_GR_BIT;
 
-               /* Reflect guest's logical view, not physical */
-               if (pgste_val(pgste) & PGSTE_GR_BIT)
-                       key |= _PAGE_REFERENCED;
-               if (pgste_val(pgste) & PGSTE_GC_BIT)
-                       key |= _PAGE_CHANGED;
+       if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+               cc = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
+               /* Merge real referenced bit into host-set */
+               pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT;
        }
+       /* Reflect guest's logical view, not physical */
+       cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
+       /* Changing the guest storage key is considered a change of the page */
+       if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
+               pgste_val(new) |= PGSTE_UC_BIT;
+
+       pgste_set_unlock(ptep, new);
+       pte_unmap_unlock(ptep, ptl);
+       return cc;
+}
+EXPORT_SYMBOL(reset_guest_reference_bit);
+
+int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+                         unsigned char *key)
+{
+       spinlock_t *ptl;
+       pgste_t pgste;
+       pte_t *ptep;
 
+       ptep = get_locked_pte(mm, addr, &ptl);
+       if (unlikely(!ptep))
+               return -EFAULT;
+
+       pgste = pgste_get_lock(ptep);
+       *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
+       if (!(pte_val(*ptep) & _PAGE_INVALID))
+               *key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+       /* Reflect guest's logical view, not physical */
+       *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
        pgste_set_unlock(ptep, pgste);
        pte_unmap_unlock(ptep, ptl);
-       up_read(&mm->mmap_sem);
-       return key;
+       return 0;
 }
 EXPORT_SYMBOL(get_guest_storage_key);
 #endif
index 69e6286..33ae3a4 100644 (file)
@@ -35,8 +35,9 @@
 #include <asm/asm.h>
 #include <asm/kvm_page_track.h>
 
-#define KVM_MAX_VCPUS 255
-#define KVM_SOFT_MAX_VCPUS 160
+#define KVM_MAX_VCPUS 288
+#define KVM_SOFT_MAX_VCPUS 240
+#define KVM_MAX_VCPU_ID 1023
 #define KVM_USER_MEM_SLOTS 509
 /* memory slots that are not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 3
@@ -599,6 +600,7 @@ struct kvm_vcpu_arch {
        u64 mcg_cap;
        u64 mcg_status;
        u64 mcg_ctl;
+       u64 mcg_ext_ctl;
        u64 *mce_banks;
 
        /* Cache MMIO info */
@@ -682,9 +684,12 @@ struct kvm_arch_memory_slot {
 struct kvm_apic_map {
        struct rcu_head rcu;
        u8 mode;
-       struct kvm_lapic *phys_map[256];
-       /* first index is cluster id second is cpu id in a cluster */
-       struct kvm_lapic *logical_map[16][16];
+       u32 max_apic_id;
+       union {
+               struct kvm_lapic *xapic_flat_map[8];
+               struct kvm_lapic *xapic_cluster_map[16][4];
+       };
+       struct kvm_lapic *phys_map[];
 };
 
 /* Hyper-V emulation context */
@@ -779,6 +784,9 @@ struct kvm_arch {
        u32 ldr_mode;
        struct page *avic_logical_id_table_page;
        struct page *avic_physical_id_table_page;
+
+       bool x2apic_format;
+       bool x2apic_broadcast_quirk_disabled;
 };
 
 struct kvm_vm_stat {
@@ -1006,6 +1014,11 @@ struct kvm_x86_ops {
        int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
                              uint32_t guest_irq, bool set);
        void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
+
+       int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc);
+       void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
+
+       void (*setup_mce)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_arch_async_pf {
@@ -1026,7 +1039,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask);
+               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -1077,6 +1090,10 @@ extern u32  kvm_max_guest_tsc_khz;
 extern u8   kvm_tsc_scaling_ratio_frac_bits;
 /* maximum allowed value of TSC scaling ratio */
 extern u64  kvm_max_tsc_scaling_ratio;
+/* 1ull << kvm_tsc_scaling_ratio_frac_bits */
+extern u64  kvm_default_tsc_scaling_ratio;
+
+extern u64 kvm_mce_cap_supported;
 
 enum emulation_result {
        EMULATE_DONE,         /* no further processing */
@@ -1352,7 +1369,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
 bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
                             struct kvm_vcpu **dest_vcpu);
 
-void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
                     struct kvm_lapic_irq *irq);
 
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
index d0fe23e..14824fc 100644 (file)
@@ -193,7 +193,6 @@ struct __attribute__ ((__packed__)) vmcb {
        struct vmcb_save_area save;
 };
 
-#define SVM_CPUID_FEATURE_SHIFT 2
 #define SVM_CPUID_FUNC 0x8000000a
 
 #define SVM_VM_CR_SVM_DISABLE 4
index cce9ee6..0116b2e 100644 (file)
@@ -83,23 +83,19 @@ static inline void cpu_emergency_vmxoff(void)
  */
 static inline int cpu_has_svm(const char **msg)
 {
-       uint32_t eax, ebx, ecx, edx;
-
        if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
                if (msg)
                        *msg = "not amd";
                return 0;
        }
 
-       cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
-       if (eax < SVM_CPUID_FUNC) {
+       if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) {
                if (msg)
                        *msg = "can't execute cpuid_8000000a";
                return 0;
        }
 
-       cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
-       if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
+       if (!boot_cpu_has(X86_FEATURE_SVM)) {
                if (msg)
                        *msg = "svm not available";
                return 0;
index 639a6e3..ab8e32f 100644 (file)
@@ -32,7 +32,6 @@ config KVM
        select HAVE_KVM_IRQ_BYPASS
        select HAVE_KVM_IRQ_ROUTING
        select HAVE_KVM_EVENTFD
-       select KVM_APIC_ARCHITECTURE
        select KVM_ASYNC_PF
        select USER_RETURN_NOTIFIER
        select KVM_MMIO
index a4bf5b4..5fb6c62 100644 (file)
@@ -645,7 +645,6 @@ static const struct kvm_io_device_ops speaker_dev_ops = {
        .write    = speaker_ioport_write,
 };
 
-/* Caller must hold slots_lock */
 struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 {
        struct kvm_pit *pit;
@@ -690,6 +689,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 
        kvm_pit_set_reinject(pit, true);
 
+       mutex_lock(&kvm->slots_lock);
        kvm_iodevice_init(&pit->dev, &pit_dev_ops);
        ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
                                      KVM_PIT_MEM_LENGTH, &pit->dev);
@@ -704,12 +704,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
                if (ret < 0)
                        goto fail_register_speaker;
        }
+       mutex_unlock(&kvm->slots_lock);
 
        return pit;
 
 fail_register_speaker:
        kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
 fail_register_pit:
+       mutex_unlock(&kvm->slots_lock);
        kvm_pit_set_reinject(pit, false);
        kthread_stop(pit->worker_task);
 fail_kthread:
index 95e0e64..b181426 100644 (file)
@@ -28,9 +28,7 @@
 #include <linux/moduleparam.h>
 #include <linux/pci.h>
 #include <linux/stat.h>
-#include <linux/dmar.h>
 #include <linux/iommu.h>
-#include <linux/intel-iommu.h>
 #include "assigned-dev.h"
 
 static bool allow_unsafe_assigned_interrupts;
index dfb4c64..25810b1 100644 (file)
@@ -110,13 +110,17 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
        return r;
 }
 
-void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
                     struct kvm_lapic_irq *irq)
 {
-       trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
+       trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ?
+                                            (u64)e->msi.address_hi << 32 : 0),
+                             e->msi.data);
 
        irq->dest_id = (e->msi.address_lo &
                        MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
+       if (kvm->arch.x2apic_format)
+               irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi);
        irq->vector = (e->msi.data &
                        MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
        irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
@@ -129,15 +133,24 @@ void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
 }
 EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
 
+static inline bool kvm_msi_route_invalid(struct kvm *kvm,
+               struct kvm_kernel_irq_routing_entry *e)
+{
+       return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
+}
+
 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
                struct kvm *kvm, int irq_source_id, int level, bool line_status)
 {
        struct kvm_lapic_irq irq;
 
+       if (kvm_msi_route_invalid(kvm, e))
+               return -EINVAL;
+
        if (!level)
                return -1;
 
-       kvm_set_msi_irq(e, &irq);
+       kvm_set_msi_irq(kvm, e, &irq);
 
        return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
 }
@@ -153,7 +166,10 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
        if (unlikely(e->type != KVM_IRQ_ROUTING_MSI))
                return -EWOULDBLOCK;
 
-       kvm_set_msi_irq(e, &irq);
+       if (kvm_msi_route_invalid(kvm, e))
+               return -EINVAL;
+
+       kvm_set_msi_irq(kvm, e, &irq);
 
        if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
                return r;
@@ -248,7 +264,8 @@ static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
        return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
 }
 
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+                         struct kvm_kernel_irq_routing_entry *e,
                          const struct kvm_irq_routing_entry *ue)
 {
        int r = -EINVAL;
@@ -285,6 +302,9 @@ int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
                e->msi.address_lo = ue->u.msi.address_lo;
                e->msi.address_hi = ue->u.msi.address_hi;
                e->msi.data = ue->u.msi.data;
+
+               if (kvm_msi_route_invalid(kvm, e))
+                       goto out;
                break;
        case KVM_IRQ_ROUTING_HV_SINT:
                e->set = kvm_hv_set_sint;
@@ -388,21 +408,16 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
                               kvm->arch.nr_reserved_ioapic_pins);
        for (i = 0; i < nr_ioapic_pins; ++i) {
                hlist_for_each_entry(entry, &table->map[i], link) {
-                       u32 dest_id, dest_mode;
-                       bool level;
+                       struct kvm_lapic_irq irq;
 
                        if (entry->type != KVM_IRQ_ROUTING_MSI)
                                continue;
-                       dest_id = (entry->msi.address_lo >> 12) & 0xff;
-                       dest_mode = (entry->msi.address_lo >> 2) & 0x1;
-                       level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL;
-                       if (level && kvm_apic_match_dest(vcpu, NULL, 0,
-                                               dest_id, dest_mode)) {
-                               u32 vector = entry->msi.data & 0xff;
-
-                               __set_bit(vector,
-                                         ioapic_handled_vectors);
-                       }
+
+                       kvm_set_msi_irq(vcpu->kvm, entry, &irq);
+
+                       if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0,
+                                               irq.dest_id, irq.dest_mode))
+                               __set_bit(irq.vector, ioapic_handled_vectors);
                }
        }
        srcu_read_unlock(&kvm->irq_srcu, idx);
index 57549ed..730cf17 100644 (file)
@@ -115,26 +115,43 @@ static inline int apic_enabled(struct kvm_lapic *apic)
        (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
         APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
 
-/* The logical map is definitely wrong if we have multiple
- * modes at the same time.  (Physical map is always right.)
- */
-static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map)
-{
-       return !(map->mode & (map->mode - 1));
+static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
+               u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
+       switch (map->mode) {
+       case KVM_APIC_MODE_X2APIC: {
+               u32 offset = (dest_id >> 16) * 16;
+               u32 max_apic_id = map->max_apic_id;
+
+               if (offset <= max_apic_id) {
+                       u8 cluster_size = min(max_apic_id - offset + 1, 16U);
+
+                       *cluster = &map->phys_map[offset];
+                       *mask = dest_id & (0xffff >> (16 - cluster_size));
+               } else {
+                       *mask = 0;
+               }
+
+               return true;
+               }
+       case KVM_APIC_MODE_XAPIC_FLAT:
+               *cluster = map->xapic_flat_map;
+               *mask = dest_id & 0xff;
+               return true;
+       case KVM_APIC_MODE_XAPIC_CLUSTER:
+               *cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf];
+               *mask = dest_id & 0xf;
+               return true;
+       default:
+               /* Not optimized. */
+               return false;
+       }
 }
 
-static inline void
-apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid)
+static void kvm_apic_map_free(struct rcu_head *rcu)
 {
-       unsigned lid_bits;
+       struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu);
 
-       BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER !=  4);
-       BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT    !=  8);
-       BUILD_BUG_ON(KVM_APIC_MODE_X2APIC        != 16);
-       lid_bits = map->mode;
-
-       *cid = dest_id >> lid_bits;
-       *lid = dest_id & ((1 << lid_bits) - 1);
+       kvfree(map);
 }
 
 static void recalculate_apic_map(struct kvm *kvm)
@@ -142,17 +159,26 @@ static void recalculate_apic_map(struct kvm *kvm)
        struct kvm_apic_map *new, *old = NULL;
        struct kvm_vcpu *vcpu;
        int i;
-
-       new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
+       u32 max_id = 255;
 
        mutex_lock(&kvm->arch.apic_map_lock);
 
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               if (kvm_apic_present(vcpu))
+                       max_id = max(max_id, kvm_apic_id(vcpu->arch.apic));
+
+       new = kvm_kvzalloc(sizeof(struct kvm_apic_map) +
+                          sizeof(struct kvm_lapic *) * ((u64)max_id + 1));
+
        if (!new)
                goto out;
 
+       new->max_apic_id = max_id;
+
        kvm_for_each_vcpu(i, vcpu, kvm) {
                struct kvm_lapic *apic = vcpu->arch.apic;
-               u16 cid, lid;
+               struct kvm_lapic **cluster;
+               u16 mask;
                u32 ldr, aid;
 
                if (!kvm_apic_present(vcpu))
@@ -161,7 +187,7 @@ static void recalculate_apic_map(struct kvm *kvm)
                aid = kvm_apic_id(apic);
                ldr = kvm_lapic_get_reg(apic, APIC_LDR);
 
-               if (aid < ARRAY_SIZE(new->phys_map))
+               if (aid <= new->max_apic_id)
                        new->phys_map[aid] = apic;
 
                if (apic_x2apic_mode(apic)) {
@@ -174,13 +200,11 @@ static void recalculate_apic_map(struct kvm *kvm)
                                new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
                }
 
-               if (!kvm_apic_logical_map_valid(new))
+               if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))
                        continue;
 
-               apic_logical_id(new, ldr, &cid, &lid);
-
-               if (lid && cid < ARRAY_SIZE(new->logical_map))
-                       new->logical_map[cid][ffs(lid) - 1] = apic;
+               if (mask)
+                       cluster[ffs(mask) - 1] = apic;
        }
 out:
        old = rcu_dereference_protected(kvm->arch.apic_map,
@@ -189,7 +213,7 @@ out:
        mutex_unlock(&kvm->arch.apic_map_lock);
 
        if (old)
-               kfree_rcu(old, rcu);
+               call_rcu(&old->rcu, kvm_apic_map_free);
 
        kvm_make_scan_ioapic_request(kvm);
 }
@@ -210,7 +234,7 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
        }
 }
 
-static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
+static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
 {
        kvm_lapic_set_reg(apic, APIC_ID, id << 24);
        recalculate_apic_map(apic->vcpu->kvm);
@@ -222,11 +246,11 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
        recalculate_apic_map(apic->vcpu->kvm);
 }
 
-static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id)
+static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
 {
        u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
 
-       kvm_lapic_set_reg(apic, APIC_ID, id << 24);
+       kvm_lapic_set_reg(apic, APIC_ID, id);
        kvm_lapic_set_reg(apic, APIC_LDR, ldr);
        recalculate_apic_map(apic->vcpu->kvm);
 }
@@ -599,17 +623,30 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
        }
 }
 
-/* KVM APIC implementation has two quirks
- *  - dest always begins at 0 while xAPIC MDA has offset 24,
- *  - IOxAPIC messages have to be delivered (directly) to x2APIC.
+/* The KVM local APIC implementation has two quirks:
+ *
+ *  - the xAPIC MDA stores the destination at bits 24-31, while this
+ *    is not true of struct kvm_lapic_irq's dest_id field.  This is
+ *    just a quirk in the API and is not problematic.
+ *
+ *  - in-kernel IOAPIC messages have to be delivered directly to
+ *    x2APIC, because the kernel does not support interrupt remapping.
+ *    In order to support broadcast without interrupt remapping, x2APIC
+ *    rewrites the destination of non-IPI messages from APIC_BROADCAST
+ *    to X2APIC_BROADCAST.
+ *
+ * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API.  This is
+ * important when userspace wants to use x2APIC-format MSIs, because
+ * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7".
  */
-static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source,
-                                              struct kvm_lapic *target)
+static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
+               struct kvm_lapic *source, struct kvm_lapic *target)
 {
        bool ipi = source != NULL;
        bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
 
-       if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda)
+       if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
+           !ipi && dest_id == APIC_BROADCAST && x2apic_mda)
                return X2APIC_BROADCAST;
 
        return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id);
@@ -619,7 +656,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
                           int short_hand, unsigned int dest, int dest_mode)
 {
        struct kvm_lapic *target = vcpu->arch.apic;
-       u32 mda = kvm_apic_mda(dest, source, target);
+       u32 mda = kvm_apic_mda(vcpu, dest, source, target);
 
        apic_debug("target %p, source %p, dest 0x%x, "
                   "dest_mode 0x%x, short_hand 0x%x\n",
@@ -671,102 +708,126 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
        }
 }
 
-bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
-               struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src,
+               struct kvm_lapic_irq *irq, struct kvm_apic_map *map)
 {
-       struct kvm_apic_map *map;
-       unsigned long bitmap = 1;
-       struct kvm_lapic **dst;
-       int i;
-       bool ret, x2apic_ipi;
+       if (kvm->arch.x2apic_broadcast_quirk_disabled) {
+               if ((irq->dest_id == APIC_BROADCAST &&
+                               map->mode != KVM_APIC_MODE_X2APIC))
+                       return true;
+               if (irq->dest_id == X2APIC_BROADCAST)
+                       return true;
+       } else {
+               bool x2apic_ipi = src && *src && apic_x2apic_mode(*src);
+               if (irq->dest_id == (x2apic_ipi ?
+                                    X2APIC_BROADCAST : APIC_BROADCAST))
+                       return true;
+       }
 
-       *r = -1;
+       return false;
+}
 
-       if (irq->shorthand == APIC_DEST_SELF) {
-               *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
-               return true;
-       }
+/* Return true if the interrupt can be handled by using *bitmap as index mask
+ * for valid destinations in *dst array.
+ * Return false if kvm_apic_map_get_dest_lapic did nothing useful.
+ * Note: we may have zero kvm_lapic destinations when we return true, which
+ * means that the interrupt should be dropped.  In this case, *bitmap would be
+ * zero and *dst undefined.
+ */
+static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
+               struct kvm_lapic **src, struct kvm_lapic_irq *irq,
+               struct kvm_apic_map *map, struct kvm_lapic ***dst,
+               unsigned long *bitmap)
+{
+       int i, lowest;
 
-       if (irq->shorthand)
+       if (irq->shorthand == APIC_DEST_SELF && src) {
+               *dst = src;
+               *bitmap = 1;
+               return true;
+       } else if (irq->shorthand)
                return false;
 
-       x2apic_ipi = src && apic_x2apic_mode(src);
-       if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
+       if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map))
                return false;
 
-       ret = true;
-       rcu_read_lock();
-       map = rcu_dereference(kvm->arch.apic_map);
-
-       if (!map) {
-               ret = false;
-               goto out;
+       if (irq->dest_mode == APIC_DEST_PHYSICAL) {
+               if (irq->dest_id > map->max_apic_id) {
+                       *bitmap = 0;
+               } else {
+                       *dst = &map->phys_map[irq->dest_id];
+                       *bitmap = 1;
+               }
+               return true;
        }
 
-       if (irq->dest_mode == APIC_DEST_PHYSICAL) {
-               if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
-                       goto out;
+       *bitmap = 0;
+       if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst,
+                               (u16 *)bitmap))
+               return false;
 
-               dst = &map->phys_map[irq->dest_id];
-       } else {
-               u16 cid;
+       if (!kvm_lowest_prio_delivery(irq))
+               return true;
 
-               if (!kvm_apic_logical_map_valid(map)) {
-                       ret = false;
-                       goto out;
+       if (!kvm_vector_hashing_enabled()) {
+               lowest = -1;
+               for_each_set_bit(i, bitmap, 16) {
+                       if (!(*dst)[i])
+                               continue;
+                       if (lowest < 0)
+                               lowest = i;
+                       else if (kvm_apic_compare_prio((*dst)[i]->vcpu,
+                                               (*dst)[lowest]->vcpu) < 0)
+                               lowest = i;
                }
+       } else {
+               if (!*bitmap)
+                       return true;
 
-               apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
+               lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap),
+                               bitmap, 16);
 
-               if (cid >= ARRAY_SIZE(map->logical_map))
-                       goto out;
+               if (!(*dst)[lowest]) {
+                       kvm_apic_disabled_lapic_found(kvm);
+                       *bitmap = 0;
+                       return true;
+               }
+       }
 
-               dst = map->logical_map[cid];
+       *bitmap = (lowest >= 0) ? 1 << lowest : 0;
 
-               if (!kvm_lowest_prio_delivery(irq))
-                       goto set_irq;
+       return true;
+}
 
-               if (!kvm_vector_hashing_enabled()) {
-                       int l = -1;
-                       for_each_set_bit(i, &bitmap, 16) {
-                               if (!dst[i])
-                                       continue;
-                               if (l < 0)
-                                       l = i;
-                               else if (kvm_apic_compare_prio(dst[i]->vcpu,
-                                                       dst[l]->vcpu) < 0)
-                                       l = i;
-                       }
-                       bitmap = (l >= 0) ? 1 << l : 0;
-               } else {
-                       int idx;
-                       unsigned int dest_vcpus;
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+               struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+{
+       struct kvm_apic_map *map;
+       unsigned long bitmap;
+       struct kvm_lapic **dst = NULL;
+       int i;
+       bool ret;
 
-                       dest_vcpus = hweight16(bitmap);
-                       if (dest_vcpus == 0)
-                               goto out;
+       *r = -1;
 
-                       idx = kvm_vector_to_index(irq->vector,
-                               dest_vcpus, &bitmap, 16);
+       if (irq->shorthand == APIC_DEST_SELF) {
+               *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
+               return true;
+       }
 
-                       if (!dst[idx]) {
-                               kvm_apic_disabled_lapic_found(kvm);
-                               goto out;
-                       }
+       rcu_read_lock();
+       map = rcu_dereference(kvm->arch.apic_map);
 
-                       bitmap = (idx >= 0) ? 1 << idx : 0;
+       ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
+       if (ret)
+               for_each_set_bit(i, &bitmap, 16) {
+                       if (!dst[i])
+                               continue;
+                       if (*r < 0)
+                               *r = 0;
+                       *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
                }
-       }
 
-set_irq:
-       for_each_set_bit(i, &bitmap, 16) {
-               if (!dst[i])
-                       continue;
-               if (*r < 0)
-                       *r = 0;
-               *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
-       }
-out:
        rcu_read_unlock();
        return ret;
 }
@@ -789,8 +850,9 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
                        struct kvm_vcpu **dest_vcpu)
 {
        struct kvm_apic_map *map;
+       unsigned long bitmap;
+       struct kvm_lapic **dst = NULL;
        bool ret = false;
-       struct kvm_lapic *dst = NULL;
 
        if (irq->shorthand)
                return false;
@@ -798,69 +860,16 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
        rcu_read_lock();
        map = rcu_dereference(kvm->arch.apic_map);
 
-       if (!map)
-               goto out;
-
-       if (irq->dest_mode == APIC_DEST_PHYSICAL) {
-               if (irq->dest_id == 0xFF)
-                       goto out;
-
-               if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
-                       goto out;
-
-               dst = map->phys_map[irq->dest_id];
-               if (dst && kvm_apic_present(dst->vcpu))
-                       *dest_vcpu = dst->vcpu;
-               else
-                       goto out;
-       } else {
-               u16 cid;
-               unsigned long bitmap = 1;
-               int i, r = 0;
-
-               if (!kvm_apic_logical_map_valid(map))
-                       goto out;
-
-               apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
-
-               if (cid >= ARRAY_SIZE(map->logical_map))
-                       goto out;
-
-               if (kvm_vector_hashing_enabled() &&
-                               kvm_lowest_prio_delivery(irq)) {
-                       int idx;
-                       unsigned int dest_vcpus;
+       if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) &&
+                       hweight16(bitmap) == 1) {
+               unsigned long i = find_first_bit(&bitmap, 16);
 
-                       dest_vcpus = hweight16(bitmap);
-                       if (dest_vcpus == 0)
-                               goto out;
-
-                       idx = kvm_vector_to_index(irq->vector, dest_vcpus,
-                                                 &bitmap, 16);
-
-                       dst = map->logical_map[cid][idx];
-                       if (!dst) {
-                               kvm_apic_disabled_lapic_found(kvm);
-                               goto out;
-                       }
-
-                       *dest_vcpu = dst->vcpu;
-               } else {
-                       for_each_set_bit(i, &bitmap, 16) {
-                               dst = map->logical_map[cid][i];
-                               if (++r == 2)
-                                       goto out;
-                       }
-
-                       if (dst && kvm_apic_present(dst->vcpu))
-                               *dest_vcpu = dst->vcpu;
-                       else
-                               goto out;
+               if (dst[i]) {
+                       *dest_vcpu = dst[i]->vcpu;
+                       ret = true;
                }
        }
 
-       ret = true;
-out:
        rcu_read_unlock();
        return ret;
 }
@@ -1127,12 +1136,6 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
                return 0;
 
        switch (offset) {
-       case APIC_ID:
-               if (apic_x2apic_mode(apic))
-                       val = kvm_apic_id(apic);
-               else
-                       val = kvm_apic_id(apic) << 24;
-               break;
        case APIC_ARBPRI:
                apic_debug("Access APIC ARBPRI register which is for P6\n");
                break;
@@ -1314,6 +1317,108 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
                        nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
 }
 
+static void start_sw_tscdeadline(struct kvm_lapic *apic)
+{
+       u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+       u64 ns = 0;
+       ktime_t expire;
+       struct kvm_vcpu *vcpu = apic->vcpu;
+       unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+       unsigned long flags;
+       ktime_t now;
+
+       if (unlikely(!tscdeadline || !this_tsc_khz))
+               return;
+
+       local_irq_save(flags);
+
+       now = apic->lapic_timer.timer.base->get_time();
+       guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+       if (likely(tscdeadline > guest_tsc)) {
+               ns = (tscdeadline - guest_tsc) * 1000000ULL;
+               do_div(ns, this_tsc_khz);
+               expire = ktime_add_ns(now, ns);
+               expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
+               hrtimer_start(&apic->lapic_timer.timer,
+                               expire, HRTIMER_MODE_ABS_PINNED);
+       } else
+               apic_timer_expired(apic);
+
+       local_irq_restore(flags);
+}
+
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
+
+static void cancel_hv_tscdeadline(struct kvm_lapic *apic)
+{
+       kvm_x86_ops->cancel_hv_timer(apic->vcpu);
+       apic->lapic_timer.hv_timer_in_use = false;
+}
+
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+       WARN_ON(swait_active(&vcpu->wq));
+       cancel_hv_tscdeadline(apic);
+       apic_timer_expired(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+
+static bool start_hv_tscdeadline(struct kvm_lapic *apic)
+{
+       u64 tscdeadline = apic->lapic_timer.tscdeadline;
+
+       if (atomic_read(&apic->lapic_timer.pending) ||
+               kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
+               if (apic->lapic_timer.hv_timer_in_use)
+                       cancel_hv_tscdeadline(apic);
+       } else {
+               apic->lapic_timer.hv_timer_in_use = true;
+               hrtimer_cancel(&apic->lapic_timer.timer);
+
+               /* In case the sw timer triggered in the window */
+               if (atomic_read(&apic->lapic_timer.pending))
+                       cancel_hv_tscdeadline(apic);
+       }
+       trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
+                       apic->lapic_timer.hv_timer_in_use);
+       return apic->lapic_timer.hv_timer_in_use;
+}
+
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       WARN_ON(apic->lapic_timer.hv_timer_in_use);
+
+       if (apic_lvtt_tscdeadline(apic))
+               start_hv_tscdeadline(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
+
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       /* Possibly the TSC deadline timer is not enabled yet */
+       if (!apic->lapic_timer.hv_timer_in_use)
+               return;
+
+       cancel_hv_tscdeadline(apic);
+
+       if (atomic_read(&apic->lapic_timer.pending))
+               return;
+
+       start_sw_tscdeadline(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
+
 static void start_apic_timer(struct kvm_lapic *apic)
 {
        ktime_t now;
@@ -1360,32 +1465,8 @@ static void start_apic_timer(struct kvm_lapic *apic)
                           ktime_to_ns(ktime_add_ns(now,
                                        apic->lapic_timer.period)));
        } else if (apic_lvtt_tscdeadline(apic)) {
-               /* lapic timer in tsc deadline mode */
-               u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
-               u64 ns = 0;
-               ktime_t expire;
-               struct kvm_vcpu *vcpu = apic->vcpu;
-               unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
-               unsigned long flags;
-
-               if (unlikely(!tscdeadline || !this_tsc_khz))
-                       return;
-
-               local_irq_save(flags);
-
-               now = apic->lapic_timer.timer.base->get_time();
-               guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
-               if (likely(tscdeadline > guest_tsc)) {
-                       ns = (tscdeadline - guest_tsc) * 1000000ULL;
-                       do_div(ns, this_tsc_khz);
-                       expire = ktime_add_ns(now, ns);
-                       expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
-                       hrtimer_start(&apic->lapic_timer.timer,
-                                     expire, HRTIMER_MODE_ABS_PINNED);
-               } else
-                       apic_timer_expired(apic);
-
-               local_irq_restore(flags);
+               if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic)))
+                       start_sw_tscdeadline(apic);
        }
 }
 
@@ -1413,7 +1494,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
        switch (reg) {
        case APIC_ID:           /* Local APIC ID */
                if (!apic_x2apic_mode(apic))
-                       kvm_apic_set_id(apic, val >> 24);
+                       kvm_apic_set_xapic_id(apic, val >> 24);
                else
                        ret = 1;
                break;
@@ -1674,9 +1755,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 
        /* update jump label if enable bit changes */
        if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
-               if (value & MSR_IA32_APICBASE_ENABLE)
+               if (value & MSR_IA32_APICBASE_ENABLE) {
+                       kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
                        static_key_slow_dec_deferred(&apic_hw_disabled);
-               else
+               } else
                        static_key_slow_inc(&apic_hw_disabled.key);
                recalculate_apic_map(vcpu->kvm);
        }
@@ -1716,8 +1798,11 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
        /* Stop the timer in case it's a reset to an active apic */
        hrtimer_cancel(&apic->lapic_timer.timer);
 
-       if (!init_event)
-               kvm_apic_set_id(apic, vcpu->vcpu_id);
+       if (!init_event) {
+               kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE |
+                                        MSR_IA32_APICBASE_ENABLE);
+               kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+       }
        kvm_apic_set_version(apic->vcpu);
 
        for (i = 0; i < KVM_APIC_LVT_NUM; i++)
@@ -1856,9 +1941,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
         * thinking that APIC satet has changed.
         */
        vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
-       kvm_lapic_set_base(vcpu,
-                       APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
-
        static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
        kvm_lapic_reset(vcpu, false);
        kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
@@ -1938,17 +2020,48 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
        return vector;
 }
 
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
-               struct kvm_lapic_state *s)
+static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
+               struct kvm_lapic_state *s, bool set)
+{
+       if (apic_x2apic_mode(vcpu->arch.apic)) {
+               u32 *id = (u32 *)(s->regs + APIC_ID);
+
+               if (vcpu->kvm->arch.x2apic_format) {
+                       if (*id != vcpu->vcpu_id)
+                               return -EINVAL;
+               } else {
+                       if (set)
+                               *id >>= 24;
+                       else
+                               *id <<= 24;
+               }
+       }
+
+       return 0;
+}
+
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+{
+       memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s));
+       return kvm_apic_state_fixup(vcpu, s, false);
+}
+
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
+       int r;
+
 
        kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
        /* set SPIV separately to get count of SW disabled APICs right */
        apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
+
+       r = kvm_apic_state_fixup(vcpu, s, true);
+       if (r)
+               return r;
        memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
-       /* call kvm_apic_set_id() to put apic into apic_map */
-       kvm_apic_set_id(apic, kvm_apic_id(apic));
+
+       recalculate_apic_map(vcpu->kvm);
        kvm_apic_set_version(vcpu);
 
        apic_update_ppr(apic);
@@ -1974,6 +2087,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
                kvm_rtc_eoi_tracking_restore_one(vcpu);
 
        vcpu->arch.apic_arb_prio = 0;
+
+       return 0;
 }
 
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
index 891c6da..f60d01c 100644 (file)
@@ -20,6 +20,7 @@ struct kvm_timer {
        u64 tscdeadline;
        u64 expired_tscdeadline;
        atomic_t pending;                       /* accumulated triggered timers */
+       bool hv_timer_in_use;
 };
 
 struct kvm_lapic {
@@ -80,8 +81,8 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
-               struct kvm_lapic_state *s);
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 
 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
@@ -199,9 +200,15 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
        return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
 }
 
-static inline int kvm_apic_id(struct kvm_lapic *apic)
+static inline u32 kvm_apic_id(struct kvm_lapic *apic)
 {
-       return (kvm_lapic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+       /* To avoid a race between apic_base and following APIC_ID update when
+        * switching to x2apic_mode, the x2apic mode returns initial x2apic id.
+        */
+       if (apic_x2apic_mode(apic))
+               return apic->vcpu->vcpu_id;
+
+       return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
 }
 
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
@@ -212,4 +219,8 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
                        struct kvm_vcpu **dest_vcpu);
 int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
                        const unsigned long *bitmap, u32 bitmap_size);
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
 #endif
index 745a5f4..3d4cc8c 100644 (file)
@@ -176,6 +176,7 @@ static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mmio_mask;
+static u64 __read_mostly shadow_present_mask;
 
 static void mmu_spte_set(u64 *sptep, u64 spte);
 static void mmu_free_roots(struct kvm_vcpu *vcpu);
@@ -283,13 +284,14 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 }
 
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask)
+               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask)
 {
        shadow_user_mask = user_mask;
        shadow_accessed_mask = accessed_mask;
        shadow_dirty_mask = dirty_mask;
        shadow_nx_mask = nx_mask;
        shadow_x_mask = x_mask;
+       shadow_present_mask = p_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
@@ -305,7 +307,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-       return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
+       return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte);
 }
 
 static int is_large_pte(u64 pte)
@@ -524,7 +526,7 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
 }
 
 /* Rules for using mmu_spte_update:
- * Update the state bits, it means the mapped pfn is not changged.
+ * Update the state bits, it means the mapped pfn is not changed.
  *
  * Whenever we overwrite a writable spte with a read-only one we
  * should flush remote TLBs. Otherwise rmap_write_protect
@@ -2246,10 +2248,9 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
 {
        u64 spte;
 
-       BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
-                       VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+       BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
 
-       spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
+       spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
               shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
 
        mmu_spte_set(sptep, spte);
@@ -2516,13 +2517,19 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                    gfn_t gfn, kvm_pfn_t pfn, bool speculative,
                    bool can_unsync, bool host_writable)
 {
-       u64 spte;
+       u64 spte = 0;
        int ret = 0;
 
        if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
                return 0;
 
-       spte = PT_PRESENT_MASK;
+       /*
+        * For the EPT case, shadow_present_mask is 0 if hardware
+        * supports exec-only page table entries.  In that case,
+        * ACC_USER_MASK and shadow_user_mask are used to represent
+        * read access.  See FNAME(gpte_access) in paging_tmpl.h.
+        */
+       spte |= shadow_present_mask;
        if (!speculative)
                spte |= shadow_accessed_mask;
 
@@ -3190,7 +3197,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                MMU_WARN_ON(VALID_PAGE(root));
                if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
                        pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
-                       if (!is_present_gpte(pdptr)) {
+                       if (!(pdptr & PT_PRESENT_MASK)) {
                                vcpu->arch.mmu.pae_root[i] = 0;
                                continue;
                        }
@@ -3915,9 +3922,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
                                 *   clearer.
                                 */
                                smap = cr4_smap && u && !uf && !ff;
-                       } else
-                               /* Not really needed: no U/S accesses on ept  */
-                               u = 1;
+                       }
 
                        fault = (ff && !x) || (uf && !u) || (wf && !w) ||
                                (smapf && smap);
index 66b33b9..ddc56e9 100644 (file)
@@ -93,11 +93,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
        return kvm_mmu_load(vcpu);
 }
 
-static inline int is_present_gpte(unsigned long pte)
-{
-       return pte & PT_PRESENT_MASK;
-}
-
 /*
  * Currently, we have two sorts of write-protection, a) the first one
  * write-protects guest page to sync the guest modification, b) another one is
index bc019f7..a011054 100644 (file)
@@ -131,7 +131,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
 static inline int FNAME(is_present_gpte)(unsigned long pte)
 {
 #if PTTYPE != PTTYPE_EPT
-       return is_present_gpte(pte);
+       return pte & PT_PRESENT_MASK;
 #else
        return pte & 7;
 #endif
@@ -181,13 +181,19 @@ no_present:
        return true;
 }
 
+/*
+ * For PTTYPE_EPT, a page table can be executable but not readable
+ * on supported processors. Therefore, set_spte does not automatically
+ * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
+ * to signify readability since it isn't used in the EPT case
+ */
 static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
 {
        unsigned access;
 #if PTTYPE == PTTYPE_EPT
        access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
                ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
-               ACC_USER_MASK;
+               ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
 #else
        BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
        BUILD_BUG_ON(ACC_EXEC_MASK != 1);
index ab38af4..9d4a850 100644 (file)
@@ -93,7 +93,7 @@ static unsigned intel_find_fixed_event(int idx)
        return intel_arch_events[fixed_pmc_events[idx]].event_type;
 }
 
-/* check if a PMC is enabled by comparising it with globl_ctrl bits. */
+/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
 static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
 {
        struct kvm_pmu *pmu = pmc_to_pmu(pmc);
index 16ef31b..af523d8 100644 (file)
@@ -1577,7 +1577,7 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
        /*
-        * Any change of EFLAGS.VM is accompained by a reload of SS
+        * Any change of EFLAGS.VM is accompanied by a reload of SS
         * (caused by either a task switch or an inter-privilege IRET),
         * so we do not need to update the CPL here.
         */
@@ -4940,6 +4940,12 @@ out:
 static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
 {
        local_irq_enable();
+       /*
+        * We must have an instruction with interrupts enabled, so
+        * the timer interrupt isn't delayed by the interrupt shadow.
+        */
+       asm("nop");
+       local_irq_disable();
 }
 
 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
index 8de9250..0a6cc67 100644 (file)
@@ -1348,6 +1348,21 @@ TRACE_EVENT(kvm_avic_unaccelerated_access,
                  __entry->vec)
 );
 
+/*
+ * Tracepoint recording a vcpu id together with its hv_timer_in_use
+ * value.  Both fields are printed in hexadecimal.
+ *
+ * The format string deliberately has no trailing newline: the trace
+ * output code terminates each event line itself, so an explicit "\n"
+ * would leave a blank line after every event in the trace buffer.
+ */
+TRACE_EVENT(kvm_hv_timer_state,
+               TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
+               TP_ARGS(vcpu_id, hv_timer_in_use),
+               TP_STRUCT__entry(
+                       __field(unsigned int, vcpu_id)
+                       __field(unsigned int, hv_timer_in_use)
+                       ),
+               TP_fast_assign(
+                       __entry->vcpu_id = vcpu_id;
+                       __entry->hv_timer_in_use = hv_timer_in_use;
+                       ),
+               TP_printk("vcpu_id %x hv_timer %x",
+                       __entry->vcpu_id,
+                       __entry->hv_timer_in_use)
+);
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
index df07a0a..bc354f0 100644 (file)
@@ -110,6 +110,13 @@ module_param_named(pml, enable_pml, bool, S_IRUGO);
 
 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
 
+/*
+ * VMX preemption timer knobs.
+ *
+ * cpu_preemption_timer_multi is read from MSR_IA32_VMX_MISC in
+ * hardware_setup() and is used as the right-shift count that scales a
+ * host TSC delta into preemption timer units.
+ *
+ * Guest_tsc -> host_tsc conversion requires 64-bit division, which is
+ * presumably why the "preemption_timer" module parameter is only
+ * registered under CONFIG_X86_64.
+ */
+static int __read_mostly cpu_preemption_timer_multi;
+static bool __read_mostly enable_preemption_timer = 1;
+#ifdef CONFIG_X86_64
+module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
+#endif
+
 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
 #define KVM_VM_CR0_ALWAYS_ON                                           \
@@ -398,6 +405,12 @@ struct nested_vmx {
        /* The host-usable pointer to the above */
        struct page *current_vmcs12_page;
        struct vmcs12 *current_vmcs12;
+       /*
+        * Cache of the guest's VMCS, existing outside of guest memory.
+        * Loaded from guest memory during VMPTRLD. Flushed to guest
+        * memory during VMXOFF, VMCLEAR, VMPTRLD.
+        */
+       struct vmcs12 *cached_vmcs12;
        struct vmcs *current_shadow_vmcs;
        /*
         * Indicates if the shadow vmcs must be updated with the
@@ -421,7 +434,6 @@ struct nested_vmx {
        struct pi_desc *pi_desc;
        bool pi_pending;
        u16 posted_intr_nv;
-       u64 msr_ia32_feature_control;
 
        struct hrtimer preemption_timer;
        bool preemption_timer_expired;
@@ -597,11 +609,22 @@ struct vcpu_vmx {
 #define PML_ENTITY_NUM         512
        struct page *pml_pg;
 
+       /* apic deadline value in host tsc */
+       u64 hv_deadline_tsc;
+
        u64 current_tsc_ratio;
 
        bool guest_pkru_valid;
        u32 guest_pkru;
        u32 host_pkru;
+
+       /*
+        * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
+        * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
+        * in msr_ia32_feature_control_valid_bits.
+        */
+       u64 msr_ia32_feature_control;
+       u64 msr_ia32_feature_control_valid_bits;
 };
 
 enum segment_cache_field {
@@ -841,7 +864,7 @@ static inline short vmcs_field_to_offset(unsigned long field)
 
 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
 {
-       return to_vmx(vcpu)->nested.current_vmcs12;
+       return to_vmx(vcpu)->nested.cached_vmcs12;
 }
 
 static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
@@ -1056,6 +1079,58 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 }
 
+/*
+ * CPU signatures that cpu_has_broken_vmx_preemption_timer() compares
+ * against CPUID.01H:EAX (with the reserved bits cleared).  The table is
+ * only ever read, so it is const.
+ *
+ * Format of the per-entry comments:
+ *   document - erratum name - stepping - processor name.
+ * The list is taken from
+ * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
+ */
+static const u32 vmx_preemption_cpu_tfms[] = {
+/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
+0x000206E6,
+/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
+/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
+/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020652,
+/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020655,
+/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
+/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
+/*
+ * 320767.pdf - AAP86  - B1 -
+ * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
+ */
+0x000106E5,
+/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
+0x000106A0,
+/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
+0x000106A1,
+/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
+0x000106A4,
+/* 321333.pdf - AAM126 - D0 - Xeon 3500 */
+/* 321324.pdf - AAK139 - D0 - Xeon 5500 */
+/* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
+0x000106A5,
+};
+
+/*
+ * Return true when the CPUID.01H:EAX signature of the running CPU, with
+ * bits 15:14 and 31:28 masked off as reserved, appears in
+ * vmx_preemption_cpu_tfms[].
+ */
+static inline bool cpu_has_broken_vmx_preemption_timer(void)
+{
+       const u32 reserved_bits = (0x3U << 14) | (0xfU << 28);
+       u32 signature = cpuid_eax(0x00000001) & ~reserved_bits;
+       u32 idx = 0;
+
+       while (idx < ARRAY_SIZE(vmx_preemption_cpu_tfms)) {
+               if (vmx_preemption_cpu_tfms[idx] == signature)
+                       return true;
+               idx++;
+       }
+
+       return false;
+}
+
+/*
+ * Return true when PIN_BASED_VMX_PREEMPTION_TIMER is set in the
+ * pin-based execution controls recorded in vmcs_config.
+ */
+static inline bool cpu_has_vmx_preemption_timer(void)
+{
+       return (vmcs_config.pin_based_exec_ctrl &
+               PIN_BASED_VMX_PREEMPTION_TIMER) != 0;
+}
+
 static inline bool cpu_has_vmx_posted_intr(void)
 {
        return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
@@ -1603,6 +1678,11 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
        __vmcs_writel(field, __vmcs_readl(field) | mask);
 }
 
+/*
+ * Refresh the cached copy of VM_ENTRY_CONTROLS from the VMCS, without
+ * writing anything back to the VMCS.
+ */
+static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
+{
+       u32 entry_ctls = vmcs_read32(VM_ENTRY_CONTROLS);
+
+       vmx->vm_entry_controls_shadow = entry_ctls;
+}
+
 static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
 {
        vmcs_write32(VM_ENTRY_CONTROLS, val);
@@ -1631,6 +1711,11 @@ static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
        vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
 }
 
+/*
+ * Refresh the cached copy of VM_EXIT_CONTROLS from the VMCS, without
+ * writing anything back to the VMCS.
+ */
+static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
+{
+       u32 exit_ctls = vmcs_read32(VM_EXIT_CONTROLS);
+
+       vmx->vm_exit_controls_shadow = exit_ctls;
+}
+
 static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
 {
        vmcs_write32(VM_EXIT_CONTROLS, val);
@@ -2121,22 +2206,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+       bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
 
        if (!vmm_exclusive)
                kvm_cpu_vmxon(phys_addr);
-       else if (vmx->loaded_vmcs->cpu != cpu)
+       else if (!already_loaded)
                loaded_vmcs_clear(vmx->loaded_vmcs);
 
-       if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
-               per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
-               vmcs_load(vmx->loaded_vmcs->vmcs);
-       }
-
-       if (vmx->loaded_vmcs->cpu != cpu) {
-               struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
-               unsigned long sysenter_esp;
-
-               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+       if (!already_loaded) {
                local_irq_disable();
                crash_disable_local_vmclear(cpu);
 
@@ -2151,6 +2228,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                         &per_cpu(loaded_vmcss_on_cpu, cpu));
                crash_enable_local_vmclear(cpu);
                local_irq_enable();
+       }
+
+       if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+               per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+               vmcs_load(vmx->loaded_vmcs->vmcs);
+       }
+
+       if (!already_loaded) {
+               struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
+               unsigned long sysenter_esp;
+
+               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 
                /*
                 * Linux uses per-cpu TSS and GDT, so set these when switching
@@ -2716,6 +2805,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
                vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
                         VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
                         VMX_EPT_INVEPT_BIT;
+               if (cpu_has_vmx_ept_execute_only())
+                       vmx->nested.nested_vmx_ept_caps |=
+                               VMX_EPT_EXECUTE_ONLY_BIT;
                vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
                /*
                 * For nested guests, we don't do anything specific
@@ -2864,6 +2956,14 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
        return 0;
 }
 
+/*
+ * Return true when @val sets no bit outside the vcpu's
+ * msr_ia32_feature_control_valid_bits mask.
+ */
+static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
+                                                uint64_t val)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       uint64_t invalid = val & ~vmx->msr_ia32_feature_control_valid_bits;
+
+       return invalid == 0;
+}
+
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
@@ -2905,10 +3005,15 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                msr_info->data = vmcs_read64(GUEST_BNDCFGS);
                break;
-       case MSR_IA32_FEATURE_CONTROL:
-               if (!nested_vmx_allowed(vcpu))
+       case MSR_IA32_MCG_EXT_CTL:
+               if (!msr_info->host_initiated &&
+                   !(to_vmx(vcpu)->msr_ia32_feature_control &
+                     FEATURE_CONTROL_LMCE))
                        return 1;
-               msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+               msr_info->data = vcpu->arch.mcg_ext_ctl;
+               break;
+       case MSR_IA32_FEATURE_CONTROL:
+               msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
                break;
        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                if (!nested_vmx_allowed(vcpu))
@@ -2998,12 +3103,20 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_TSC_ADJUST:
                ret = kvm_set_msr_common(vcpu, msr_info);
                break;
+       case MSR_IA32_MCG_EXT_CTL:
+               if ((!msr_info->host_initiated &&
+                    !(to_vmx(vcpu)->msr_ia32_feature_control &
+                      FEATURE_CONTROL_LMCE)) ||
+                   (data & ~MCG_EXT_CTL_LMCE_EN))
+                       return 1;
+               vcpu->arch.mcg_ext_ctl = data;
+               break;
        case MSR_IA32_FEATURE_CONTROL:
-               if (!nested_vmx_allowed(vcpu) ||
-                   (to_vmx(vcpu)->nested.msr_ia32_feature_control &
+               if (!vmx_feature_control_msr_valid(vcpu, data) ||
+                   (to_vmx(vcpu)->msr_ia32_feature_control &
                     FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
                        return 1;
-               vmx->nested.msr_ia32_feature_control = data;
+               vmx->msr_ia32_feature_control = data;
                if (msr_info->host_initiated && data == 0)
                        vmx_leave_nested(vcpu);
                break;
@@ -3297,25 +3410,27 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                      vmx_capability.ept, vmx_capability.vpid);
        }
 
-       min = VM_EXIT_SAVE_DEBUG_CONTROLS;
+       min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
 #ifdef CONFIG_X86_64
        min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #endif
        opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
-               VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;
+               VM_EXIT_CLEAR_BNDCFGS;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
                                &_vmexit_control) < 0)
                return -EIO;
 
        min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-       opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+       opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+                PIN_BASED_VMX_PREEMPTION_TIMER;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
                                &_pin_based_exec_control) < 0)
                return -EIO;
 
+       if (cpu_has_broken_vmx_preemption_timer())
+               _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
        if (!(_cpu_based_2nd_exec_control &
-               SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
-               !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
+               SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
                _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
 
        min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
@@ -3364,7 +3479,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 
        /*
         * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
-        * but due to arrata below it can't be used. Workaround is to use
+        * but due to errata below it can't be used. Workaround is to use
         * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
         *
         * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
@@ -4781,6 +4896,8 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 
        if (!kvm_vcpu_apicv_active(&vmx->vcpu))
                pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+       /* Enable the preemption timer dynamically */
+       pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
        return pin_based_exec_ctrl;
 }
 
@@ -4896,6 +5013,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
        /* Control */
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+       vmx->hv_deadline_tsc = -1;
 
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
@@ -6016,12 +6134,14 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
        trace_kvm_page_fault(gpa, exit_qualification);
 
-       /* It is a write fault? */
-       error_code = exit_qualification & PFERR_WRITE_MASK;
+       /* it is a read fault? */
+       error_code = (exit_qualification << 2) & PFERR_USER_MASK;
+       /* it is a write fault? */
+       error_code |= exit_qualification & PFERR_WRITE_MASK;
        /* It is a fetch fault? */
        error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK;
        /* ept page table is present? */
-       error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK;
+       error_code |= (exit_qualification & 0x38) != 0;
 
        vcpu->arch.exit_qualification = exit_qualification;
 
@@ -6355,9 +6475,6 @@ static __init int hardware_setup(void)
        for (msr = 0x800; msr <= 0x8ff; msr++)
                vmx_disable_intercept_msr_read_x2apic(msr);
 
-       /* According SDM, in x2apic mode, the whole id reg is used.  But in
-        * KVM, it only use the highest eight bits. Need to intercept it */
-       vmx_enable_intercept_msr_read_x2apic(0x802);
        /* TMCCT */
        vmx_enable_intercept_msr_read_x2apic(0x839);
        /* TPR */
@@ -6368,10 +6485,12 @@ static __init int hardware_setup(void)
        vmx_disable_intercept_msr_write_x2apic(0x83f);
 
        if (enable_ept) {
-               kvm_mmu_set_mask_ptes(0ull,
+               kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
                        (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
                        (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
-                       0ull, VMX_EPT_EXECUTABLE_MASK);
+                       0ull, VMX_EPT_EXECUTABLE_MASK,
+                       cpu_has_vmx_ept_execute_only() ?
+                                     0ull : VMX_EPT_READABLE_MASK);
                ept_set_mmio_spte_mask();
                kvm_enable_tdp();
        } else
@@ -6393,8 +6512,21 @@ static __init int hardware_setup(void)
                kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
        }
 
+       if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
+               u64 vmx_msr;
+
+               rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+               cpu_preemption_timer_multi =
+                        vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+       } else {
+               kvm_x86_ops->set_hv_timer = NULL;
+               kvm_x86_ops->cancel_hv_timer = NULL;
+       }
+
        kvm_set_posted_intr_wakeup_handler(wakeup_handler);
 
+       kvm_mce_cap_supported |= MCG_LMCE_P;
+
        return alloc_kvm_area();
 
 out8:
@@ -6862,16 +6994,22 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
                return 1;
        }
 
-       if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
+       if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
                        != VMXON_NEEDED_FEATURES) {
                kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
+       vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+       if (!vmx->nested.cached_vmcs12)
+               return -ENOMEM;
+
        if (enable_shadow_vmcs) {
                shadow_vmcs = alloc_vmcs();
-               if (!shadow_vmcs)
+               if (!shadow_vmcs) {
+                       kfree(vmx->nested.cached_vmcs12);
                        return -ENOMEM;
+               }
                /* mark vmcs as shadow */
                shadow_vmcs->revision_id |= (1u << 31);
                /* init shadow vmcs */
@@ -6942,6 +7080,11 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
                vmcs_write64(VMCS_LINK_POINTER, -1ull);
        }
        vmx->nested.posted_intr_nv = -1;
+
+       /* Flush VMCS12 to guest memory */
+       memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12,
+              VMCS12_SIZE);
+
        kunmap(vmx->nested.current_vmcs12_page);
        nested_release_page(vmx->nested.current_vmcs12_page);
        vmx->nested.current_vmptr = -1ull;
@@ -6962,6 +7105,7 @@ static void free_nested(struct vcpu_vmx *vmx)
        nested_release_vmcs12(vmx);
        if (enable_shadow_vmcs)
                free_vmcs(vmx->nested.current_shadow_vmcs);
+       kfree(vmx->nested.cached_vmcs12);
        /* Unpin physical memory we referred to in current vmcs02 */
        if (vmx->nested.apic_access_page) {
                nested_release_page(vmx->nested.apic_access_page);
@@ -7365,6 +7509,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
                vmx->nested.current_vmptr = vmptr;
                vmx->nested.current_vmcs12 = new_vmcs12;
                vmx->nested.current_vmcs12_page = page;
+               /*
+                * Load VMCS12 from guest memory since it is not already
+                * cached.
+                */
+               memcpy(vmx->nested.cached_vmcs12,
+                      vmx->nested.current_vmcs12, VMCS12_SIZE);
+
                if (enable_shadow_vmcs) {
                        vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
                                      SECONDARY_EXEC_SHADOW_VMCS);
@@ -7560,6 +7711,12 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+/*
+ * Exit handler registered for EXIT_REASON_PREEMPTION_TIMER: hands the
+ * event to kvm_lapic_expired_hv_timer() and returns 1, i.e. the exit is
+ * fully handled and guest execution may resume.
+ */
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+       kvm_lapic_expired_hv_timer(vcpu);
+       return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -7610,6 +7767,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_XSAVES]                  = handle_xsaves,
        [EXIT_REASON_XRSTORS]                 = handle_xrstors,
        [EXIT_REASON_PML_FULL]                = handle_pml_full,
+       [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -7918,6 +8076,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                 * the XSS exit bitmap in vmcs12.
                 */
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+       case EXIT_REASON_PREEMPTION_TIMER:
+               return false;
        default:
                return true;
        }
@@ -8303,7 +8463,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
         * the next L2->L1 exit.
         */
        if (!is_guest_mode(vcpu) ||
-           !nested_cpu_has2(vmx->nested.current_vmcs12,
+           !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
                             SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
                vmcs_write64(APIC_ACCESS_ADDR, hpa);
 }
@@ -8436,7 +8596,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
                        "push %[sp]\n\t"
 #endif
                        "pushf\n\t"
-                       "orl $0x200, (%%" _ASM_SP ")\n\t"
                        __ASM_SIZE(push) " $%c[cs]\n\t"
                        "call *%[entry]\n\t"
                        :
@@ -8449,8 +8608,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
                        [ss]"i"(__KERNEL_DS),
                        [cs]"i"(__KERNEL_CS)
                        );
-       } else
-               local_irq_enable();
+       }
 }
 
 static bool vmx_has_high_real_mode_segbase(void)
@@ -8601,6 +8759,26 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
                                        msrs[i].host);
 }
 
+/*
+ * Load VMX_PREEMPTION_TIMER_VALUE from vmx->hv_deadline_tsc.
+ *
+ * A deadline of -1 is treated as "no deadline" and nothing is written.
+ * Otherwise the host TSC ticks remaining until the deadline are shifted
+ * right by cpu_preemption_timer_multi and written to the VMCS; a
+ * deadline that is not in the future is written as 0.
+ *
+ * The function has no prototype outside this file and its caller is
+ * vmx_vcpu_run(), so it is given internal linkage.
+ */
+static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 tscl;
+       u32 delta_tsc;
+
+       if (vmx->hv_deadline_tsc == -1)
+               return;
+
+       tscl = rdtsc();
+       if (vmx->hv_deadline_tsc > tscl) {
+               /* sure to be 32 bit only because checked on set_hv_timer */
+               delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+                       cpu_preemption_timer_multi);
+       } else {
+               delta_tsc = 0;
+       }
+
+       vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+}
+
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8650,6 +8828,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        atomic_switch_perf_msrs(vmx);
        debugctlmsr = get_debugctlmsr();
 
+       vmx_arm_hv_timer(vcpu);
+
        vmx->__launched = vmx->loaded_vmcs->launched;
        asm(
                /* Store host registers */
@@ -8940,6 +9120,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
        vmx->nested.current_vmptr = -1ull;
        vmx->nested.current_vmcs12 = NULL;
 
+       vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
+
        return &vmx->vcpu;
 
 free_vmcs:
@@ -9080,6 +9262,13 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 
        if (cpu_has_secondary_exec_ctrls())
                vmcs_set_secondary_exec_control(secondary_exec_ctl);
+
+       if (nested_vmx_allowed(vcpu))
+               to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+                       FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+       else
+               to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+                       ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
 }
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -9636,9 +9825,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        vmcs_write64(VMCS_LINK_POINTER, -1ull);
 
        exec_control = vmcs12->pin_based_vm_exec_control;
-       exec_control |= vmcs_config.pin_based_exec_ctrl;
+
+       /* Preemption timer setting is only taken from vmcs01.  */
        exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       exec_control |= vmcs_config.pin_based_exec_ctrl;
+       if (vmx->hv_deadline_tsc == -1)
+               exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
 
+       /* Posted interrupts setting is only taken from vmcs12.  */
        if (nested_cpu_has_posted_intr(vmcs12)) {
                /*
                 * Note that we use L0's vector here and in
@@ -10556,8 +10750,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
                                       vmcs12->vm_exit_intr_error_code,
                                       KVM_ISA_VMX);
 
-       vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
-       vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
+       vm_entry_controls_reset_shadow(vmx);
+       vm_exit_controls_reset_shadow(vmx);
        vmx_segment_cache_clear(vmx);
 
        /* if no vmcs02 cache requested, remove the one we used */
@@ -10566,8 +10760,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
        load_vmcs12_host_state(vcpu, vmcs12);
 
-       /* Update TSC_OFFSET if TSC was changed while L2 ran */
+       /* Update any VMCS fields that might have changed while L2 ran */
        vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+       if (vmx->hv_deadline_tsc == -1)
+               vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+                               PIN_BASED_VMX_PREEMPTION_TIMER);
+       else
+               vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+                             PIN_BASED_VMX_PREEMPTION_TIMER);
 
        /* This is needed for same reason as it was needed in prepare_vmcs02 */
        vmx->host_rsp = 0;
@@ -10647,6 +10847,64 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
        return X86EMUL_CONTINUE;
 }
 
+#ifdef CONFIG_X86_64
+/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
+static inline int u64_shl_div_u64(u64 a, unsigned int shift,
+                                 u64 divisor, u64 *result)
+{
+       u64 low = a << shift, high = a >> (64 - shift);
+
+       /* To avoid the overflow on divq */
+       if (high >= divisor)
+               return 1;
+
+       /* Low hold the result, high hold rem which is discarded */
+       asm("divq %2\n\t" : "=a" (low), "=d" (high) :
+           "rm" (divisor), "0" (low), "1" (high));
+       *result = low;
+
+       return 0;
+}
+
+static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 tscl = rdtsc();
+       u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+       u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+
+       /* Convert to host delta tsc if tsc scaling is enabled */
+       if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+                       u64_shl_div_u64(delta_tsc,
+                               kvm_tsc_scaling_ratio_frac_bits,
+                               vcpu->arch.tsc_scaling_ratio,
+                               &delta_tsc))
+               return -ERANGE;
+
+       /*
+        * If the delta tsc can't fit in the 32 bit after the multi shift,
+        * we can't use the preemption timer.
+        * It's possible that it fits on later vmentries, but checking
+        * on every vmentry is costly so we just use an hrtimer.
+        */
+       if (delta_tsc >> (cpu_preemption_timer_multi + 32))
+               return -ERANGE;
+
+       vmx->hv_deadline_tsc = tscl + delta_tsc;
+       vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+                       PIN_BASED_VMX_PREEMPTION_TIMER);
+       return 0;
+}
+
+static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       vmx->hv_deadline_tsc = -1;
+       vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+                       PIN_BASED_VMX_PREEMPTION_TIMER);
+}
+#endif
+
 static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
        if (ple_gap)
@@ -10691,7 +10949,7 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
  *   this case, return 1, otherwise, return 0.
  *
  */
-static int vmx_pre_block(struct kvm_vcpu *vcpu)
+static int pi_pre_block(struct kvm_vcpu *vcpu)
 {
        unsigned long flags;
        unsigned int dest;
@@ -10758,7 +11016,18 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static void vmx_post_block(struct kvm_vcpu *vcpu)
+static int vmx_pre_block(struct kvm_vcpu *vcpu)
+{
+       if (pi_pre_block(vcpu))
+               return 1;
+
+       if (kvm_lapic_hv_timer_in_use(vcpu))
+               kvm_lapic_switch_to_sw_timer(vcpu);
+
+       return 0;
+}
+
+static void pi_post_block(struct kvm_vcpu *vcpu)
 {
        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
        struct pi_desc old, new;
@@ -10800,6 +11069,14 @@ static void vmx_post_block(struct kvm_vcpu *vcpu)
        }
 }
 
+static void vmx_post_block(struct kvm_vcpu *vcpu)
+{
+       if (kvm_x86_ops->set_hv_timer)
+               kvm_lapic_switch_to_hv_timer(vcpu);
+
+       pi_post_block(vcpu);
+}
+
 /*
  * vmx_update_pi_irte - set IRTE for Posted-Interrupts
  *
@@ -10844,7 +11121,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
                 * We will support full lowest-priority interrupt later.
                 */
 
-               kvm_set_msi_irq(e, &irq);
+               kvm_set_msi_irq(kvm, e, &irq);
                if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
                        /*
                         * Make sure the IRTE is in remapped mode if
@@ -10889,6 +11166,16 @@ out:
        return ret;
 }
 
+static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+               to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+                       FEATURE_CONTROL_LMCE;
+       else
+               to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+                       ~FEATURE_CONTROL_LMCE;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
@@ -11013,6 +11300,13 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .pmu_ops = &intel_pmu_ops,
 
        .update_pi_irte = vmx_update_pi_irte,
+
+#ifdef CONFIG_X86_64
+       .set_hv_timer = vmx_set_hv_timer,
+       .cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
+
+       .setup_mce = vmx_setup_mce,
 };
 
 static int __init vmx_init(void)
index 9c496c7..19f9f9e 100644 (file)
@@ -71,7 +71,8 @@
 
 #define MAX_IO_MSRS 256
 #define KVM_MAX_MCE_BANKS 32
-#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
+u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
+EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
 
 #define emul_to_vcpu(ctxt) \
        container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
@@ -90,8 +91,12 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
+#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
+                                    KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+
 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 static void process_nmi(struct kvm_vcpu *vcpu);
+static void enter_smm(struct kvm_vcpu *vcpu);
 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
 
 struct kvm_x86_ops *kvm_x86_ops __read_mostly;
@@ -114,7 +119,8 @@ u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
 EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
 u64  __read_mostly kvm_max_tsc_scaling_ratio;
 EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
-static u64 __read_mostly kvm_default_tsc_scaling_ratio;
+u64 __read_mostly kvm_default_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
 
 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 static u32 __read_mostly tsc_tolerance_ppm = 250;
@@ -538,7 +544,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
                goto out;
        }
        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
-               if (is_present_gpte(pdpte[i]) &&
+               if ((pdpte[i] & PT_PRESENT_MASK) &&
                    (pdpte[i] &
                     vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
                        ret = 0;
@@ -983,6 +989,7 @@ static u32 emulated_msrs[] = {
        MSR_IA32_MISC_ENABLE,
        MSR_IA32_MCG_STATUS,
        MSR_IA32_MCG_CTL,
+       MSR_IA32_MCG_EXT_CTL,
        MSR_IA32_SMBASE,
 };
 
@@ -1162,7 +1169,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
        int version;
        int r;
        struct pvclock_wall_clock wc;
-       struct timespec boot;
+       struct timespec64 boot;
 
        if (!wall_clock)
                return;
@@ -1185,13 +1192,13 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
         * wall clock specified here.  guest system time equals host
         * system time for us, thus we must fill in host boot time here.
         */
-       getboottime(&boot);
+       getboottime64(&boot);
 
        if (kvm->arch.kvmclock_offset) {
-               struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
-               boot = timespec_sub(boot, ts);
+               struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
+               boot = timespec64_sub(boot, ts);
        }
-       wc.sec = boot.tv_sec;
+       wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
        wc.nsec = boot.tv_nsec;
        wc.version = version;
 
@@ -2616,6 +2623,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_TSC_CONTROL:
                r = kvm_has_tsc_control;
                break;
+       case KVM_CAP_X2APIC_API:
+               r = KVM_X2APIC_API_VALID_FLAGS;
+               break;
        default:
                r = 0;
                break;
@@ -2678,11 +2688,9 @@ long kvm_arch_dev_ioctl(struct file *filp,
                break;
        }
        case KVM_X86_GET_MCE_CAP_SUPPORTED: {
-               u64 mce_cap;
-
-               mce_cap = KVM_MCE_CAP_SUPPORTED;
                r = -EFAULT;
-               if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+               if (copy_to_user(argp, &kvm_mce_cap_supported,
+                                sizeof(kvm_mce_cap_supported)))
                        goto out;
                r = 0;
                break;
@@ -2734,6 +2742,11 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                                rdtsc() - vcpu->arch.last_host_tsc;
                if (tsc_delta < 0)
                        mark_tsc_unstable("KVM discovered backwards TSC");
+
+               if (kvm_lapic_hv_timer_in_use(vcpu) &&
+                               kvm_x86_ops->set_hv_timer(vcpu,
+                                       kvm_get_lapic_tscdeadline_msr(vcpu)))
+                       kvm_lapic_switch_to_sw_timer(vcpu);
                if (check_tsc_unstable()) {
                        u64 offset = kvm_compute_tsc_offset(vcpu,
                                                vcpu->arch.last_guest_tsc);
@@ -2767,15 +2780,17 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
        if (vcpu->arch.apicv_active)
                kvm_x86_ops->sync_pir_to_irr(vcpu);
 
-       memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
-
-       return 0;
+       return kvm_apic_get_state(vcpu, s);
 }
 
 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
                                    struct kvm_lapic_state *s)
 {
-       kvm_apic_post_state_restore(vcpu, s);
+       int r;
+
+       r = kvm_apic_set_state(vcpu, s);
+       if (r)
+               return r;
        update_cr8_intercept(vcpu);
 
        return 0;
@@ -2860,7 +2875,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
        r = -EINVAL;
        if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
                goto out;
-       if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+       if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
                goto out;
        r = 0;
        vcpu->arch.mcg_cap = mcg_cap;
@@ -2870,6 +2885,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
        /* Init IA32_MCi_CTL to all 1s */
        for (bank = 0; bank < bank_num; bank++)
                vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+
+       if (kvm_x86_ops->setup_mce)
+               kvm_x86_ops->setup_mce(vcpu);
 out:
        return r;
 }
@@ -3768,7 +3786,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
                r = -EEXIST;
                if (irqchip_in_kernel(kvm))
                        goto split_irqchip_unlock;
-               if (atomic_read(&kvm->online_vcpus))
+               if (kvm->created_vcpus)
                        goto split_irqchip_unlock;
                r = kvm_setup_empty_irq_routing(kvm);
                if (r)
@@ -3782,6 +3800,18 @@ split_irqchip_unlock:
                mutex_unlock(&kvm->lock);
                break;
        }
+       case KVM_CAP_X2APIC_API:
+               r = -EINVAL;
+               if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
+                       break;
+
+               if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
+                       kvm->arch.x2apic_format = true;
+               if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+                       kvm->arch.x2apic_broadcast_quirk_disabled = true;
+
+               r = 0;
+               break;
        default:
                r = -EINVAL;
                break;
@@ -3833,7 +3863,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (kvm->arch.vpic)
                        goto create_irqchip_unlock;
                r = -EINVAL;
-               if (atomic_read(&kvm->online_vcpus))
+               if (kvm->created_vcpus)
                        goto create_irqchip_unlock;
                r = -ENOMEM;
                vpic = kvm_create_pic(kvm);
@@ -3873,7 +3903,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                                   sizeof(struct kvm_pit_config)))
                        goto out;
        create_pit:
-               mutex_lock(&kvm->slots_lock);
+               mutex_lock(&kvm->lock);
                r = -EEXIST;
                if (kvm->arch.vpit)
                        goto create_pit_unlock;
@@ -3882,7 +3912,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (kvm->arch.vpit)
                        r = 0;
        create_pit_unlock:
-               mutex_unlock(&kvm->slots_lock);
+               mutex_unlock(&kvm->lock);
                break;
        case KVM_GET_IRQCHIP: {
                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
@@ -3989,7 +4019,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
        case KVM_SET_BOOT_CPU_ID:
                r = 0;
                mutex_lock(&kvm->lock);
-               if (atomic_read(&kvm->online_vcpus) != 0)
+               if (kvm->created_vcpus)
                        r = -EBUSY;
                else
                        kvm->arch.bsp_vcpu_id = arg;
@@ -5297,13 +5327,8 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu)
                /* This is a good place to trace that we are exiting SMM.  */
                trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
 
-               if (unlikely(vcpu->arch.smi_pending)) {
-                       kvm_make_request(KVM_REQ_SMI, vcpu);
-                       vcpu->arch.smi_pending = 0;
-               } else {
-                       /* Process a latched INIT, if any.  */
-                       kvm_make_request(KVM_REQ_EVENT, vcpu);
-               }
+               /* Process a latched INIT or SMI, if any.  */
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
        }
 
        kvm_mmu_reset_context(vcpu);
@@ -5849,8 +5874,8 @@ int kvm_arch_init(void *opaque)
        kvm_x86_ops = ops;
 
        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
-                       PT_DIRTY_MASK, PT64_NX_MASK, 0);
-
+                       PT_DIRTY_MASK, PT64_NX_MASK, 0,
+                       PT_PRESENT_MASK);
        kvm_timer_init();
 
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -6084,7 +6109,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
        }
 
        /* try to inject new event if pending */
-       if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
+       if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+               vcpu->arch.smi_pending = false;
+               enter_smm(vcpu);
+       } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
                --vcpu->arch.nmi_pending;
                vcpu->arch.nmi_injected = true;
                kvm_x86_ops->set_nmi(vcpu);
@@ -6107,6 +6135,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
                        kvm_x86_ops->set_irq(vcpu);
                }
        }
+
        return 0;
 }
 
@@ -6130,7 +6159,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
 #define put_smstate(type, buf, offset, val)                      \
        *(type *)((buf) + (offset) - 0x7e00) = val
 
-static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
+static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
 {
        u32 flags = 0;
        flags |= seg->g       << 23;
@@ -6144,7 +6173,7 @@ static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
        return flags;
 }
 
-static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
+static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
 {
        struct kvm_segment seg;
        int offset;
@@ -6159,11 +6188,11 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
 
        put_smstate(u32, buf, offset + 8, seg.base);
        put_smstate(u32, buf, offset + 4, seg.limit);
-       put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
+       put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
 }
 
 #ifdef CONFIG_X86_64
-static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
+static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
 {
        struct kvm_segment seg;
        int offset;
@@ -6172,7 +6201,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
        kvm_get_segment(vcpu, &seg, n);
        offset = 0x7e00 + n * 16;
 
-       flags = process_smi_get_segment_flags(&seg) >> 8;
+       flags = enter_smm_get_segment_flags(&seg) >> 8;
        put_smstate(u16, buf, offset, seg.selector);
        put_smstate(u16, buf, offset + 2, flags);
        put_smstate(u32, buf, offset + 4, seg.limit);
@@ -6180,7 +6209,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
 }
 #endif
 
-static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
+static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
 {
        struct desc_ptr dt;
        struct kvm_segment seg;
@@ -6204,13 +6233,13 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
        put_smstate(u32, buf, 0x7fc4, seg.selector);
        put_smstate(u32, buf, 0x7f64, seg.base);
        put_smstate(u32, buf, 0x7f60, seg.limit);
-       put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg));
+       put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
 
        kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
        put_smstate(u32, buf, 0x7fc0, seg.selector);
        put_smstate(u32, buf, 0x7f80, seg.base);
        put_smstate(u32, buf, 0x7f7c, seg.limit);
-       put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg));
+       put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
 
        kvm_x86_ops->get_gdt(vcpu, &dt);
        put_smstate(u32, buf, 0x7f74, dt.address);
@@ -6221,7 +6250,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
        put_smstate(u32, buf, 0x7f54, dt.size);
 
        for (i = 0; i < 6; i++)
-               process_smi_save_seg_32(vcpu, buf, i);
+               enter_smm_save_seg_32(vcpu, buf, i);
 
        put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
 
@@ -6230,7 +6259,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
        put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
 }
 
-static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
+static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
 {
 #ifdef CONFIG_X86_64
        struct desc_ptr dt;
@@ -6262,7 +6291,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
 
        kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
        put_smstate(u16, buf, 0x7e90, seg.selector);
-       put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8);
+       put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
        put_smstate(u32, buf, 0x7e94, seg.limit);
        put_smstate(u64, buf, 0x7e98, seg.base);
 
@@ -6272,7 +6301,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
 
        kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
        put_smstate(u16, buf, 0x7e70, seg.selector);
-       put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8);
+       put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
        put_smstate(u32, buf, 0x7e74, seg.limit);
        put_smstate(u64, buf, 0x7e78, seg.base);
 
@@ -6281,31 +6310,26 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
        put_smstate(u64, buf, 0x7e68, dt.address);
 
        for (i = 0; i < 6; i++)
-               process_smi_save_seg_64(vcpu, buf, i);
+               enter_smm_save_seg_64(vcpu, buf, i);
 #else
        WARN_ON_ONCE(1);
 #endif
 }
 
-static void process_smi(struct kvm_vcpu *vcpu)
+static void enter_smm(struct kvm_vcpu *vcpu)
 {
        struct kvm_segment cs, ds;
        struct desc_ptr dt;
        char buf[512];
        u32 cr0;
 
-       if (is_smm(vcpu)) {
-               vcpu->arch.smi_pending = true;
-               return;
-       }
-
        trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
        vcpu->arch.hflags |= HF_SMM_MASK;
        memset(buf, 0, 512);
        if (guest_cpuid_has_longmode(vcpu))
-               process_smi_save_state_64(vcpu, buf);
+               enter_smm_save_state_64(vcpu, buf);
        else
-               process_smi_save_state_32(vcpu, buf);
+               enter_smm_save_state_32(vcpu, buf);
 
        kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
 
@@ -6361,6 +6385,12 @@ static void process_smi(struct kvm_vcpu *vcpu)
        kvm_mmu_reset_context(vcpu);
 }
 
+static void process_smi(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.smi_pending = true;
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
 void kvm_make_scan_ioapic_request(struct kvm *kvm)
 {
        kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
@@ -6555,8 +6585,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
                if (inject_pending_event(vcpu, req_int_win) != 0)
                        req_immediate_exit = true;
-               /* enable NMI/IRQ window open exits if needed */
                else {
+                       /* Enable NMI/IRQ window open exits if needed.
+                        *
+                        * SMIs have two cases: 1) they can be nested, and
+                        * then there is nothing to do here because RSM will
+                        * cause a vmexit anyway; 2) or the SMI can be pending
+                        * because inject_pending_event has completed the
+                        * injection of an IRQ or NMI from the previous vmexit,
+                        * and then we request an immediate exit to inject the SMI.
+                        */
+                       if (vcpu->arch.smi_pending && !is_smm(vcpu))
+                               req_immediate_exit = true;
                        if (vcpu->arch.nmi_pending)
                                kvm_x86_ops->enable_nmi_window(vcpu);
                        if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
@@ -6607,12 +6647,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
        kvm_load_guest_xcr0(vcpu);
 
-       if (req_immediate_exit)
+       if (req_immediate_exit) {
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
                smp_send_reschedule(vcpu->cpu);
+       }
 
        trace_kvm_entry(vcpu->vcpu_id);
        wait_lapic_expire(vcpu);
-       __kvm_guest_enter();
+       guest_enter_irqoff();
 
        if (unlikely(vcpu->arch.switch_db_regs)) {
                set_debugreg(0, 7);
@@ -6663,16 +6705,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
        ++vcpu->stat.exits;
 
-       /*
-        * We must have an instruction between local_irq_enable() and
-        * kvm_guest_exit(), so the timer interrupt isn't delayed by
-        * the interrupt shadow.  The stat.exits increment will do nicely.
-        * But we need to prevent reordering, hence this barrier():
-        */
-       barrier();
-
-       kvm_guest_exit();
+       guest_exit_irqoff();
 
+       local_irq_enable();
        preempt_enable();
 
        vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
@@ -7409,6 +7444,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
        vcpu->arch.hflags = 0;
 
+       vcpu->arch.smi_pending = 0;
        atomic_set(&vcpu->arch.nmi_queued, 0);
        vcpu->arch.nmi_pending = 0;
        vcpu->arch.nmi_injected = false;
@@ -7601,11 +7637,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
        return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
 }
 
-bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
-{
-       return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
-}
-
 struct static_key kvm_no_apic_vcpu __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
 
@@ -7872,7 +7903,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        kfree(kvm->arch.vpic);
        kfree(kvm->arch.vioapic);
        kvm_free_vcpus(kvm);
-       kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+       kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
        kvm_mmu_uninit_vm(kvm);
 }
 
@@ -8380,7 +8411,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
        /*
         * When producer of consumer is unregistered, we change back to
         * remapped mode, so we can re-use the current implementation
-        * when the irq is masked/disabed or the consumer side (KVM
+        * when the irq is masked/disabled or the consumer side (KVM
         * int this case doesn't want to receive the interrupts.
        */
        ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
index 0ac520d..c71df0c 100644 (file)
@@ -46,7 +46,8 @@ struct read_info_sccb {
        u64     rnmax2;                 /* 104-111 */
        u8      _pad_112[116 - 112];    /* 112-115 */
        u8      fac116;                 /* 116 */
-       u8      _pad_117[119 - 117];    /* 117-118 */
+       u8      fac117;                 /* 117 */
+       u8      _pad_118;               /* 118 */
        u8      fac119;                 /* 119 */
        u16     hcpua;                  /* 120-121 */
        u8      _pad_122[124 - 122];    /* 122-123 */
@@ -114,7 +115,12 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
        sclp.facilities = sccb->facilities;
        sclp.has_sprp = !!(sccb->fac84 & 0x02);
        sclp.has_core_type = !!(sccb->fac84 & 0x01);
+       sclp.has_gsls = !!(sccb->fac85 & 0x80);
+       sclp.has_64bscao = !!(sccb->fac116 & 0x80);
+       sclp.has_cmma = !!(sccb->fac116 & 0x40);
        sclp.has_esca = !!(sccb->fac116 & 0x08);
+       sclp.has_pfmfi = !!(sccb->fac117 & 0x40);
+       sclp.has_ibs = !!(sccb->fac117 & 0x20);
        sclp.has_hvs = !!(sccb->fac119 & 0x80);
        if (sccb->fac85 & 0x02)
                S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP;
@@ -145,6 +151,10 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
                sclp.has_siif = cpue->siif;
                sclp.has_sigpif = cpue->sigpif;
                sclp.has_sief2 = cpue->sief2;
+               sclp.has_gpere = cpue->gpere;
+               sclp.has_ib = cpue->ib;
+               sclp.has_cei = cpue->cei;
+               sclp.has_skey = cpue->skey;
                break;
        }
 
index 2553db0..f59b717 100644 (file)
@@ -26,7 +26,7 @@
 #define OCF_LENGTH_CPC_NAME 8UL
 
 static char hmc_network[OCF_LENGTH_HMC_NETWORK + 1];
-static char cpc_name[OCF_LENGTH_CPC_NAME + 1];
+static char cpc_name[OCF_LENGTH_CPC_NAME]; /* in EBCDIC */
 
 static DEFINE_SPINLOCK(sclp_ocf_lock);
 static struct work_struct sclp_ocf_change_work;
@@ -72,9 +72,8 @@ static void sclp_ocf_handler(struct evbuf_header *evbuf)
        }
        if (cpc) {
                size = min(OCF_LENGTH_CPC_NAME, (size_t) cpc->length);
+               memset(cpc_name, 0, OCF_LENGTH_CPC_NAME);
                memcpy(cpc_name, cpc + 1, size);
-               EBCASC(cpc_name, size);
-               cpc_name[size] = 0;
        }
        spin_unlock(&sclp_ocf_lock);
        schedule_work(&sclp_ocf_change_work);
@@ -85,15 +84,23 @@ static struct sclp_register sclp_ocf_event = {
        .receiver_fn = sclp_ocf_handler,
 };
 
+void sclp_ocf_cpc_name_copy(char *dst)
+{
+       spin_lock_irq(&sclp_ocf_lock);
+       memcpy(dst, cpc_name, OCF_LENGTH_CPC_NAME);
+       spin_unlock_irq(&sclp_ocf_lock);
+}
+EXPORT_SYMBOL(sclp_ocf_cpc_name_copy);
+
 static ssize_t cpc_name_show(struct kobject *kobj,
                             struct kobj_attribute *attr, char *page)
 {
-       int rc;
+       char name[OCF_LENGTH_CPC_NAME + 1];
 
-       spin_lock_irq(&sclp_ocf_lock);
-       rc = snprintf(page, PAGE_SIZE, "%s\n", cpc_name);
-       spin_unlock_irq(&sclp_ocf_lock);
-       return rc;
+       sclp_ocf_cpc_name_copy(name);
+       name[OCF_LENGTH_CPC_NAME] = 0;
+       EBCASC(name, OCF_LENGTH_CPC_NAME);
+       return snprintf(page, PAGE_SIZE, "%s\n", name);
 }
 
 static struct kobj_attribute cpc_name_attr =
index da0a524..540da51 100644 (file)
@@ -1,6 +1,5 @@
 /*
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
+ * Copyright (C) 2015, 2016 ARM Ltd.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-
-#ifndef __ASM_ARM_KVM_VGIC_H
-#define __ASM_ARM_KVM_VGIC_H
-
-#ifdef CONFIG_KVM_NEW_VGIC
-#include <kvm/vgic/vgic.h>
-#else
+#ifndef __KVM_ARM_VGIC_H
+#define __KVM_ARM_VGIC_H
 
 #include <linux/kernel.h>
 #include <linux/kvm.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
 #include <kvm/iodev.h>
-#include <linux/irqchip/arm-gic-common.h>
+#include <linux/list.h>
 
-#define VGIC_NR_IRQS_LEGACY    256
+#define VGIC_V3_MAX_CPUS       255
+#define VGIC_V2_MAX_CPUS       8
+#define VGIC_NR_IRQS_LEGACY     256
 #define VGIC_NR_SGIS           16
 #define VGIC_NR_PPIS           16
 #define VGIC_NR_PRIVATE_IRQS   (VGIC_NR_SGIS + VGIC_NR_PPIS)
+#define VGIC_MAX_PRIVATE       (VGIC_NR_PRIVATE_IRQS - 1)
+#define VGIC_MAX_SPI           1019
+#define VGIC_MAX_RESERVED      1023
+#define VGIC_MIN_LPI           8192
 
-#define VGIC_V2_MAX_LRS                (1 << 6)
-#define VGIC_V3_MAX_LRS                16
-#define VGIC_MAX_IRQS          1024
-#define VGIC_V2_MAX_CPUS       8
-#define VGIC_V3_MAX_CPUS       255
+enum vgic_type {
+       VGIC_V2,                /* Good ol' GICv2 */
+       VGIC_V3,                /* New fancy GICv3 */
+};
 
-#if (VGIC_NR_IRQS_LEGACY & 31)
-#error "VGIC_NR_IRQS must be a multiple of 32"
-#endif
+/* same for all guests, as depending only on the _host's_ GIC model */
+struct vgic_global {
+       /* type of the host GIC */
+       enum vgic_type          type;
 
-#if (VGIC_NR_IRQS_LEGACY > VGIC_MAX_IRQS)
-#error "VGIC_NR_IRQS must be <= 1024"
-#endif
+       /* Physical address of vgic virtual cpu interface */
+       phys_addr_t             vcpu_base;
 
-/*
- * The GIC distributor registers describing interrupts have two parts:
- * - 32 per-CPU interrupts (SGI + PPI)
- * - a bunch of shared interrupts (SPI)
- */
-struct vgic_bitmap {
-       /*
-        * - One UL per VCPU for private interrupts (assumes UL is at
-        *   least 32 bits)
-        * - As many UL as necessary for shared interrupts.
-        *
-        * The private interrupts are accessed via the "private"
-        * field, one UL per vcpu (the state for vcpu n is in
-        * private[n]). The shared interrupts are accessed via the
-        * "shared" pointer (IRQn state is at bit n-32 in the bitmap).
-        */
-       unsigned long *private;
-       unsigned long *shared;
-};
+       /* virtual control interface mapping */
+       void __iomem            *vctrl_base;
 
-struct vgic_bytemap {
-       /*
-        * - 8 u32 per VCPU for private interrupts
-        * - As many u32 as necessary for shared interrupts.
-        *
-        * The private interrupts are accessed via the "private"
-        * field, (the state for vcpu n is in private[n*8] to
-        * private[n*8 + 7]). The shared interrupts are accessed via
-        * the "shared" pointer (IRQn state is at byte (n-32)%4 of the
-        * shared[(n-32)/4] word).
-        */
-       u32 *private;
-       u32 *shared;
-};
+       /* Number of implemented list registers */
+       int                     nr_lr;
 
-struct kvm_vcpu;
+       /* Maintenance IRQ number */
+       unsigned int            maint_irq;
 
-enum vgic_type {
-       VGIC_V2,                /* Good ol' GICv2 */
-       VGIC_V3,                /* New fancy GICv3 */
+       /* maximum number of VCPUs allowed (GICv2 limits us to 8) */
+       int                     max_gic_vcpus;
+
+       /* Only needed for the legacy KVM_CREATE_IRQCHIP */
+       bool                    can_emulate_gicv2;
 };
 
-#define LR_STATE_PENDING       (1 << 0)
-#define LR_STATE_ACTIVE                (1 << 1)
-#define LR_STATE_MASK          (3 << 0)
-#define LR_EOI_INT             (1 << 2)
-#define LR_HW                  (1 << 3)
+extern struct vgic_global kvm_vgic_global_state;
 
-struct vgic_lr {
-       unsigned irq:10;
-       union {
-               unsigned hwirq:10;
-               unsigned source:3;
-       };
-       unsigned state:4;
-};
+#define VGIC_V2_MAX_LRS                (1 << 6)
+#define VGIC_V3_MAX_LRS                16
+#define VGIC_V3_LR_INDEX(lr)   (VGIC_V3_MAX_LRS - 1 - lr)
 
-struct vgic_vmcr {
-       u32     ctlr;
-       u32     abpr;
-       u32     bpr;
-       u32     pmr;
+enum vgic_irq_config {
+       VGIC_CONFIG_EDGE = 0,
+       VGIC_CONFIG_LEVEL
 };
 
-struct vgic_ops {
-       struct vgic_lr  (*get_lr)(const struct kvm_vcpu *, int);
-       void    (*set_lr)(struct kvm_vcpu *, int, struct vgic_lr);
-       u64     (*get_elrsr)(const struct kvm_vcpu *vcpu);
-       u64     (*get_eisr)(const struct kvm_vcpu *vcpu);
-       void    (*clear_eisr)(struct kvm_vcpu *vcpu);
-       u32     (*get_interrupt_status)(const struct kvm_vcpu *vcpu);
-       void    (*enable_underflow)(struct kvm_vcpu *vcpu);
-       void    (*disable_underflow)(struct kvm_vcpu *vcpu);
-       void    (*get_vmcr)(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-       void    (*set_vmcr)(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-       void    (*enable)(struct kvm_vcpu *vcpu);
+struct vgic_irq {
+       spinlock_t irq_lock;            /* Protects the content of the struct */
+       struct list_head lpi_list;      /* Used to link all LPIs together */
+       struct list_head ap_list;
+
+       struct kvm_vcpu *vcpu;          /* SGIs and PPIs: The VCPU
+                                        * SPIs and LPIs: The VCPU whose ap_list
+                                        * this is queued on.
+                                        */
+
+       struct kvm_vcpu *target_vcpu;   /* The VCPU that this interrupt should
+                                        * be sent to, as a result of the
+                                        * targets reg (v2) or the
+                                        * affinity reg (v3).
+                                        */
+
+       u32 intid;                      /* Guest visible INTID */
+       bool pending;
+       bool line_level;                /* Level only */
+       bool soft_pending;              /* Level only */
+       bool active;                    /* not used for LPIs */
+       bool enabled;
+       bool hw;                        /* Tied to HW IRQ */
+       struct kref refcount;           /* Used for LPIs */
+       u32 hwintid;                    /* HW INTID number */
+       union {
+               u8 targets;                     /* GICv2 target VCPUs mask */
+               u32 mpidr;                      /* GICv3 target VCPU */
+       };
+       u8 source;                      /* GICv2 SGIs only */
+       u8 priority;
+       enum vgic_irq_config config;    /* Level or edge */
 };
 
-struct vgic_params {
-       /* vgic type */
-       enum vgic_type  type;
-       /* Physical address of vgic virtual cpu interface */
-       phys_addr_t     vcpu_base;
-       /* Number of list registers */
-       u32             nr_lr;
-       /* Interrupt number */
-       unsigned int    maint_irq;
-       /* Virtual control interface base address */
-       void __iomem    *vctrl_base;
-       int             max_gic_vcpus;
-       /* Only needed for the legacy KVM_CREATE_IRQCHIP */
-       bool            can_emulate_gicv2;
-};
+struct vgic_register_region;
+struct vgic_its;
 
-struct vgic_vm_ops {
-       bool    (*queue_sgi)(struct kvm_vcpu *, int irq);
-       void    (*add_sgi_source)(struct kvm_vcpu *, int irq, int source);
-       int     (*init_model)(struct kvm *);
-       int     (*map_resources)(struct kvm *, const struct vgic_params *);
+enum iodev_type {
+       IODEV_CPUIF,
+       IODEV_DIST,
+       IODEV_REDIST,
+       IODEV_ITS
 };
 
 struct vgic_io_device {
-       gpa_t addr;
-       int len;
-       const struct vgic_io_range *reg_ranges;
-       struct kvm_vcpu *redist_vcpu;
+       gpa_t base_addr;
+       union {
+               struct kvm_vcpu *redist_vcpu;
+               struct vgic_its *its;
+       };
+       const struct vgic_register_region *regions;
+       enum iodev_type iodev_type;
+       int nr_regions;
        struct kvm_io_device dev;
 };
 
-struct irq_phys_map {
-       u32                     virt_irq;
-       u32                     phys_irq;
-};
-
-struct irq_phys_map_entry {
-       struct list_head        entry;
-       struct rcu_head         rcu;
-       struct irq_phys_map     map;
+struct vgic_its {
+       /* The base address of the ITS control register frame */
+       gpa_t                   vgic_its_base;
+
+       bool                    enabled;
+       bool                    initialized;
+       struct vgic_io_device   iodev;
+       struct kvm_device       *dev;
+
+       /* These registers correspond to GITS_BASER{0,1} */
+       u64                     baser_device_table;
+       u64                     baser_coll_table;
+
+       /* Protects the command queue */
+       struct mutex            cmd_lock;
+       u64                     cbaser;
+       u32                     creadr;
+       u32                     cwriter;
+
+       /* Protects the device and collection lists */
+       struct mutex            its_lock;
+       struct list_head        device_list;
+       struct list_head        collection_list;
 };
 
 struct vgic_dist {
-       spinlock_t              lock;
        bool                    in_kernel;
        bool                    ready;
+       bool                    initialized;
 
        /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */
        u32                     vgic_model;
 
-       int                     nr_cpus;
-       int                     nr_irqs;
+       /* Do injected MSIs require an additional device ID? */
+       bool                    msis_require_devid;
+
+       int                     nr_spis;
 
+       /* TODO: Consider moving to global state */
        /* Virtual control interface mapping */
        void __iomem            *vctrl_base;
 
-       /* Distributor and vcpu interface mapping in the guest */
-       phys_addr_t             vgic_dist_base;
-       /* GICv2 and GICv3 use different mapped register blocks */
+       /* base addresses in guest physical address space: */
+       gpa_t                   vgic_dist_base;         /* distributor */
        union {
-               phys_addr_t             vgic_cpu_base;
-               phys_addr_t             vgic_redist_base;
+               /* either a GICv2 CPU interface */
+               gpa_t                   vgic_cpu_base;
+               /* or a number of GICv3 redistributor regions */
+               gpa_t                   vgic_redist_base;
        };
 
-       /* Distributor enabled */
-       u32                     enabled;
-
-       /* Interrupt enabled (one bit per IRQ) */
-       struct vgic_bitmap      irq_enabled;
-
-       /* Level-triggered interrupt external input is asserted */
-       struct vgic_bitmap      irq_level;
-
-       /*
-        * Interrupt state is pending on the distributor
-        */
-       struct vgic_bitmap      irq_pending;
-
-       /*
-        * Tracks writes to GICD_ISPENDRn and GICD_ICPENDRn for level-triggered
-        * interrupts.  Essentially holds the state of the flip-flop in
-        * Figure 4-10 on page 4-101 in ARM IHI 0048B.b.
-        * Once set, it is only cleared for level-triggered interrupts on
-        * guest ACKs (when we queue it) or writes to GICD_ICPENDRn.
-        */
-       struct vgic_bitmap      irq_soft_pend;
-
-       /* Level-triggered interrupt queued on VCPU interface */
-       struct vgic_bitmap      irq_queued;
-
-       /* Interrupt was active when unqueue from VCPU interface */
-       struct vgic_bitmap      irq_active;
-
-       /* Interrupt priority. Not used yet. */
-       struct vgic_bytemap     irq_priority;
+       /* distributor enabled */
+       bool                    enabled;
 
-       /* Level/edge triggered */
-       struct vgic_bitmap      irq_cfg;
+       struct vgic_irq         *spis;
 
-       /*
-        * Source CPU per SGI and target CPU:
-        *
-        * Each byte represent a SGI observable on a VCPU, each bit of
-        * this byte indicating if the corresponding VCPU has
-        * generated this interrupt. This is a GICv2 feature only.
-        *
-        * For VCPUn (n < 8), irq_sgi_sources[n*16] to [n*16 + 15] are
-        * the SGIs observable on VCPUn.
-        */
-       u8                      *irq_sgi_sources;
+       struct vgic_io_device   dist_iodev;
 
-       /*
-        * Target CPU for each SPI:
-        *
-        * Array of available SPI, each byte indicating the target
-        * VCPU for SPI. IRQn (n >=32) is at irq_spi_cpu[n-32].
-        */
-       u8                      *irq_spi_cpu;
+       bool                    has_its;
 
        /*
-        * Reverse lookup of irq_spi_cpu for faster compute pending:
-        *
-        * Array of bitmaps, one per VCPU, describing if IRQn is
-        * routed to a particular VCPU.
+        * Contains the attributes and gpa of the LPI configuration table.
+        * Since we report GICR_TYPER.CommonLPIAff as 0b00, we can share
+        * one address across all redistributors.
+        * GICv3 spec: 6.1.2 "LPI Configuration tables"
         */
-       struct vgic_bitmap      *irq_spi_target;
-
-       /* Target MPIDR for each IRQ (needed for GICv3 IROUTERn) only */
-       u32                     *irq_spi_mpidr;
+       u64                     propbaser;
 
-       /* Bitmap indicating which CPU has something pending */
-       unsigned long           *irq_pending_on_cpu;
-
-       /* Bitmap indicating which CPU has active IRQs */
-       unsigned long           *irq_active_on_cpu;
-
-       struct vgic_vm_ops      vm_ops;
-       struct vgic_io_device   dist_iodev;
-       struct vgic_io_device   *redist_iodevs;
-
-       /* Virtual irq to hwirq mapping */
-       spinlock_t              irq_phys_map_lock;
-       struct list_head        irq_phys_map_list;
+       /* Protects the lpi_list and the count value below. */
+       spinlock_t              lpi_list_lock;
+       struct list_head        lpi_list_head;
+       int                     lpi_list_count;
 };
 
 struct vgic_v2_cpu_if {
@@ -298,78 +230,88 @@ struct vgic_v3_cpu_if {
 };
 
 struct vgic_cpu {
-       /* Pending/active/both interrupts on this VCPU */
-       DECLARE_BITMAP(pending_percpu, VGIC_NR_PRIVATE_IRQS);
-       DECLARE_BITMAP(active_percpu, VGIC_NR_PRIVATE_IRQS);
-       DECLARE_BITMAP(pend_act_percpu, VGIC_NR_PRIVATE_IRQS);
-
-       /* Pending/active/both shared interrupts, dynamically sized */
-       unsigned long   *pending_shared;
-       unsigned long   *active_shared;
-       unsigned long   *pend_act_shared;
-
        /* CPU vif control registers for world switch */
        union {
                struct vgic_v2_cpu_if   vgic_v2;
                struct vgic_v3_cpu_if   vgic_v3;
        };
 
-       /* Protected by the distributor's irq_phys_map_lock */
-       struct list_head        irq_phys_map_list;
+       unsigned int used_lrs;
+       struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS];
 
-       u64             live_lrs;
-};
+       spinlock_t ap_list_lock;        /* Protects the ap_list */
+
+       /*
+        * List of IRQs that this VCPU should consider because they are either
+        * Active or Pending (hence the name; AP list), or because they recently
+        * were one of the two and need to be migrated off this list to another
+        * VCPU.
+        */
+       struct list_head ap_list_head;
 
-#define LR_EMPTY       0xff
+       u64 live_lrs;
 
-#define INT_STATUS_EOI         (1 << 0)
-#define INT_STATUS_UNDERFLOW   (1 << 1)
+       /*
+        * Members below are used with GICv3 emulation only and represent
+        * parts of the redistributor.
+        */
+       struct vgic_io_device   rd_iodev;
+       struct vgic_io_device   sgi_iodev;
 
-struct kvm;
-struct kvm_vcpu;
+       /* Contains the attributes and gpa of the LPI pending tables. */
+       u64 pendbaser;
+
+       bool lpis_enabled;
+};
 
 int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
-int kvm_vgic_hyp_init(void);
-int kvm_vgic_map_resources(struct kvm *kvm);
-int kvm_vgic_get_max_vcpus(void);
 void kvm_vgic_early_init(struct kvm *kvm);
 int kvm_vgic_create(struct kvm *kvm, u32 type);
 void kvm_vgic_destroy(struct kvm *kvm);
 void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu);
 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
+int kvm_vgic_map_resources(struct kvm *kvm);
+int kvm_vgic_hyp_init(void);
+
+int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
                        bool level);
-int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
-                              unsigned int virt_irq, bool level);
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq);
+int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid,
+                              bool level);
+int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq);
 int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq);
 bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq);
 
+int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
+
 #define irqchip_in_kernel(k)   (!!((k)->arch.vgic.in_kernel))
-#define vgic_initialized(k)    (!!((k)->arch.vgic.nr_cpus))
+#define vgic_initialized(k)    ((k)->arch.vgic.initialized)
 #define vgic_ready(k)          ((k)->arch.vgic.ready)
 #define vgic_valid_spi(k, i)   (((i) >= VGIC_NR_PRIVATE_IRQS) && \
-                                ((i) < (k)->arch.vgic.nr_irqs))
+                       ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS))
+
+bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
+void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
+void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
 
-int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info,
-                 const struct vgic_ops **ops,
-                 const struct vgic_params **params);
 #ifdef CONFIG_KVM_ARM_VGIC_V3
-int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
-                 const struct vgic_ops **ops,
-                 const struct vgic_params **params);
+void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
 #else
-static inline int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
-                               const struct vgic_ops **ops,
-                               const struct vgic_params **params)
+static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
 {
-       return -ENODEV;
 }
 #endif
 
-#endif /* old VGIC include */
-#endif
+/**
+ * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
+ *
+ * The host's GIC naturally limits the maximum number of VCPUs a guest
+ * can use.
+ */
+static inline int kvm_vgic_get_max_vcpus(void)
+{
+       return kvm_vgic_global_state.max_gic_vcpus;
+}
+
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
+
+#endif /* __KVM_ARM_VGIC_H */
diff --git a/include/kvm/vgic/vgic.h b/include/kvm/vgic/vgic.h
deleted file mode 100644 (file)
index 3fbd175..0000000
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef __ASM_ARM_KVM_VGIC_VGIC_H
-#define __ASM_ARM_KVM_VGIC_VGIC_H
-
-#include <linux/kernel.h>
-#include <linux/kvm.h>
-#include <linux/irqreturn.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
-#include <kvm/iodev.h>
-
-#define VGIC_V3_MAX_CPUS       255
-#define VGIC_V2_MAX_CPUS       8
-#define VGIC_NR_IRQS_LEGACY     256
-#define VGIC_NR_SGIS           16
-#define VGIC_NR_PPIS           16
-#define VGIC_NR_PRIVATE_IRQS   (VGIC_NR_SGIS + VGIC_NR_PPIS)
-#define VGIC_MAX_PRIVATE       (VGIC_NR_PRIVATE_IRQS - 1)
-#define VGIC_MAX_SPI           1019
-#define VGIC_MAX_RESERVED      1023
-#define VGIC_MIN_LPI           8192
-
-enum vgic_type {
-       VGIC_V2,                /* Good ol' GICv2 */
-       VGIC_V3,                /* New fancy GICv3 */
-};
-
-/* same for all guests, as depending only on the _host's_ GIC model */
-struct vgic_global {
-       /* type of the host GIC */
-       enum vgic_type          type;
-
-       /* Physical address of vgic virtual cpu interface */
-       phys_addr_t             vcpu_base;
-
-       /* virtual control interface mapping */
-       void __iomem            *vctrl_base;
-
-       /* Number of implemented list registers */
-       int                     nr_lr;
-
-       /* Maintenance IRQ number */
-       unsigned int            maint_irq;
-
-       /* maximum number of VCPUs allowed (GICv2 limits us to 8) */
-       int                     max_gic_vcpus;
-
-       /* Only needed for the legacy KVM_CREATE_IRQCHIP */
-       bool                    can_emulate_gicv2;
-};
-
-extern struct vgic_global kvm_vgic_global_state;
-
-#define VGIC_V2_MAX_LRS                (1 << 6)
-#define VGIC_V3_MAX_LRS                16
-#define VGIC_V3_LR_INDEX(lr)   (VGIC_V3_MAX_LRS - 1 - lr)
-
-enum vgic_irq_config {
-       VGIC_CONFIG_EDGE = 0,
-       VGIC_CONFIG_LEVEL
-};
-
-struct vgic_irq {
-       spinlock_t irq_lock;            /* Protects the content of the struct */
-       struct list_head ap_list;
-
-       struct kvm_vcpu *vcpu;          /* SGIs and PPIs: The VCPU
-                                        * SPIs and LPIs: The VCPU whose ap_list
-                                        * this is queued on.
-                                        */
-
-       struct kvm_vcpu *target_vcpu;   /* The VCPU that this interrupt should
-                                        * be sent to, as a result of the
-                                        * targets reg (v2) or the
-                                        * affinity reg (v3).
-                                        */
-
-       u32 intid;                      /* Guest visible INTID */
-       bool pending;
-       bool line_level;                /* Level only */
-       bool soft_pending;              /* Level only */
-       bool active;                    /* not used for LPIs */
-       bool enabled;
-       bool hw;                        /* Tied to HW IRQ */
-       u32 hwintid;                    /* HW INTID number */
-       union {
-               u8 targets;                     /* GICv2 target VCPUs mask */
-               u32 mpidr;                      /* GICv3 target VCPU */
-       };
-       u8 source;                      /* GICv2 SGIs only */
-       u8 priority;
-       enum vgic_irq_config config;    /* Level or edge */
-};
-
-struct vgic_register_region;
-
-struct vgic_io_device {
-       gpa_t base_addr;
-       struct kvm_vcpu *redist_vcpu;
-       const struct vgic_register_region *regions;
-       int nr_regions;
-       struct kvm_io_device dev;
-};
-
-struct vgic_dist {
-       bool                    in_kernel;
-       bool                    ready;
-       bool                    initialized;
-
-       /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */
-       u32                     vgic_model;
-
-       int                     nr_spis;
-
-       /* TODO: Consider moving to global state */
-       /* Virtual control interface mapping */
-       void __iomem            *vctrl_base;
-
-       /* base addresses in guest physical address space: */
-       gpa_t                   vgic_dist_base;         /* distributor */
-       union {
-               /* either a GICv2 CPU interface */
-               gpa_t                   vgic_cpu_base;
-               /* or a number of GICv3 redistributor regions */
-               gpa_t                   vgic_redist_base;
-       };
-
-       /* distributor enabled */
-       bool                    enabled;
-
-       struct vgic_irq         *spis;
-
-       struct vgic_io_device   dist_iodev;
-       struct vgic_io_device   *redist_iodevs;
-};
-
-struct vgic_v2_cpu_if {
-       u32             vgic_hcr;
-       u32             vgic_vmcr;
-       u32             vgic_misr;      /* Saved only */
-       u64             vgic_eisr;      /* Saved only */
-       u64             vgic_elrsr;     /* Saved only */
-       u32             vgic_apr;
-       u32             vgic_lr[VGIC_V2_MAX_LRS];
-};
-
-struct vgic_v3_cpu_if {
-#ifdef CONFIG_KVM_ARM_VGIC_V3
-       u32             vgic_hcr;
-       u32             vgic_vmcr;
-       u32             vgic_sre;       /* Restored only, change ignored */
-       u32             vgic_misr;      /* Saved only */
-       u32             vgic_eisr;      /* Saved only */
-       u32             vgic_elrsr;     /* Saved only */
-       u32             vgic_ap0r[4];
-       u32             vgic_ap1r[4];
-       u64             vgic_lr[VGIC_V3_MAX_LRS];
-#endif
-};
-
-struct vgic_cpu {
-       /* CPU vif control registers for world switch */
-       union {
-               struct vgic_v2_cpu_if   vgic_v2;
-               struct vgic_v3_cpu_if   vgic_v3;
-       };
-
-       unsigned int used_lrs;
-       struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS];
-
-       spinlock_t ap_list_lock;        /* Protects the ap_list */
-
-       /*
-        * List of IRQs that this VCPU should consider because they are either
-        * Active or Pending (hence the name; AP list), or because they recently
-        * were one of the two and need to be migrated off this list to another
-        * VCPU.
-        */
-       struct list_head ap_list_head;
-
-       u64 live_lrs;
-};
-
-int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
-void kvm_vgic_early_init(struct kvm *kvm);
-int kvm_vgic_create(struct kvm *kvm, u32 type);
-void kvm_vgic_destroy(struct kvm *kvm);
-void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu);
-void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
-int kvm_vgic_map_resources(struct kvm *kvm);
-int kvm_vgic_hyp_init(void);
-
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
-                       bool level);
-int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid,
-                              bool level);
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq);
-int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq);
-bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq);
-
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
-
-#define irqchip_in_kernel(k)   (!!((k)->arch.vgic.in_kernel))
-#define vgic_initialized(k)    ((k)->arch.vgic.initialized)
-#define vgic_ready(k)          ((k)->arch.vgic.ready)
-#define vgic_valid_spi(k, i)   (((i) >= VGIC_NR_PRIVATE_IRQS) && \
-                       ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS))
-
-bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
-
-#ifdef CONFIG_KVM_ARM_VGIC_V3
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
-#else
-static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
-{
-}
-#endif
-
-/**
- * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
- *
- * The host's GIC naturally limits the maximum amount of VCPUs a guest
- * can use.
- */
-static inline int kvm_vgic_get_max_vcpus(void)
-{
-       return kvm_vgic_global_state.max_gic_vcpus;
-}
-
-#endif /* __ASM_ARM_KVM_VGIC_VGIC_H */
index d9aef2a..c78fc27 100644 (file)
@@ -99,7 +99,8 @@ static inline void context_tracking_init(void) { }
 
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static inline void guest_enter(void)
+/* must be called with irqs disabled */
+static inline void guest_enter_irqoff(void)
 {
        if (vtime_accounting_cpu_enabled())
                vtime_guest_enter(current);
@@ -108,9 +109,19 @@ static inline void guest_enter(void)
 
        if (context_tracking_is_enabled())
                __context_tracking_enter(CONTEXT_GUEST);
+
+       /* KVM does not hold any references to rcu protected data when it
+        * switches CPU into a guest mode. In fact switching to a guest mode
+        * is very similar to exiting to userspace from rcu point of view. In
+        * addition CPU may stay in a guest mode for quite a long time (up to
+        * one time slice). Let's treat guest mode as a quiescent state, just like
+        * we do with user-mode execution.
+        */
+       if (!context_tracking_cpu_is_enabled())
+               rcu_virt_note_context_switch(smp_processor_id());
 }
 
-static inline void guest_exit(void)
+static inline void guest_exit_irqoff(void)
 {
        if (context_tracking_is_enabled())
                __context_tracking_exit(CONTEXT_GUEST);
@@ -122,7 +133,7 @@ static inline void guest_exit(void)
 }
 
 #else
-static inline void guest_enter(void)
+static inline void guest_enter_irqoff(void)
 {
        /*
         * This is running in ioctl context so its safe
@@ -131,9 +142,10 @@ static inline void guest_enter(void)
         */
        vtime_account_system(current);
        current->flags |= PF_VCPU;
+       rcu_virt_note_context_switch(smp_processor_id());
 }
 
-static inline void guest_exit(void)
+static inline void guest_exit_irqoff(void)
 {
        /* Flush the guest cputime we spent on the guest */
        vtime_account_system(current);
@@ -141,4 +153,22 @@ static inline void guest_exit(void)
 }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
 
+static inline void guest_enter(void)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       guest_enter_irqoff();
+       local_irq_restore(flags);
+}
+
+static inline void guest_exit(void)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       guest_exit_irqoff();
+       local_irq_restore(flags);
+}
+
 #endif
index 107eed4..56b0b7e 100644 (file)
 #define GICR_WAKER_ProcessorSleep      (1U << 1)
 #define GICR_WAKER_ChildrenAsleep      (1U << 2)
 
-#define GICR_PROPBASER_NonShareable    (0U << 10)
-#define GICR_PROPBASER_InnerShareable  (1U << 10)
-#define GICR_PROPBASER_OuterShareable  (2U << 10)
-#define GICR_PROPBASER_SHAREABILITY_MASK (3UL << 10)
-#define GICR_PROPBASER_nCnB            (0U << 7)
-#define GICR_PROPBASER_nC              (1U << 7)
-#define GICR_PROPBASER_RaWt            (2U << 7)
-#define GICR_PROPBASER_RaWb            (3U << 7)
-#define GICR_PROPBASER_WaWt            (4U << 7)
-#define GICR_PROPBASER_WaWb            (5U << 7)
-#define GICR_PROPBASER_RaWaWt          (6U << 7)
-#define GICR_PROPBASER_RaWaWb          (7U << 7)
-#define GICR_PROPBASER_CACHEABILITY_MASK (7U << 7)
-#define GICR_PROPBASER_IDBITS_MASK     (0x1f)
-
-#define GICR_PENDBASER_NonShareable    (0U << 10)
-#define GICR_PENDBASER_InnerShareable  (1U << 10)
-#define GICR_PENDBASER_OuterShareable  (2U << 10)
-#define GICR_PENDBASER_SHAREABILITY_MASK (3UL << 10)
-#define GICR_PENDBASER_nCnB            (0U << 7)
-#define GICR_PENDBASER_nC              (1U << 7)
-#define GICR_PENDBASER_RaWt            (2U << 7)
-#define GICR_PENDBASER_RaWb            (3U << 7)
-#define GICR_PENDBASER_WaWt            (4U << 7)
-#define GICR_PENDBASER_WaWb            (5U << 7)
-#define GICR_PENDBASER_RaWaWt          (6U << 7)
-#define GICR_PENDBASER_RaWaWb          (7U << 7)
-#define GICR_PENDBASER_CACHEABILITY_MASK (7U << 7)
+#define GIC_BASER_CACHE_nCnB           0ULL
+#define GIC_BASER_CACHE_SameAsInner    0ULL
+#define GIC_BASER_CACHE_nC             1ULL
+#define GIC_BASER_CACHE_RaWt           2ULL
+#define GIC_BASER_CACHE_RaWb           3ULL
+#define GIC_BASER_CACHE_WaWt           4ULL
+#define GIC_BASER_CACHE_WaWb           5ULL
+#define GIC_BASER_CACHE_RaWaWt         6ULL
+#define GIC_BASER_CACHE_RaWaWb         7ULL
+#define GIC_BASER_CACHE_MASK           7ULL
+#define GIC_BASER_NonShareable         0ULL
+#define GIC_BASER_InnerShareable       1ULL
+#define GIC_BASER_OuterShareable       2ULL
+#define GIC_BASER_SHAREABILITY_MASK    3ULL
+
+#define GIC_BASER_CACHEABILITY(reg, inner_outer, type)                 \
+       (GIC_BASER_CACHE_##type << reg##_##inner_outer##_CACHEABILITY_SHIFT)
+
+#define GIC_BASER_SHAREABILITY(reg, type)                              \
+       (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT)
+
+#define GICR_PROPBASER_SHAREABILITY_SHIFT              (10)
+#define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT                (7)
+#define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT                (56)
+#define GICR_PROPBASER_SHAREABILITY_MASK                               \
+       GIC_BASER_SHAREABILITY(GICR_PROPBASER, SHAREABILITY_MASK)
+#define GICR_PROPBASER_INNER_CACHEABILITY_MASK                         \
+       GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, MASK)
+#define GICR_PROPBASER_OUTER_CACHEABILITY_MASK                         \
+       GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, MASK)
+#define GICR_PROPBASER_CACHEABILITY_MASK GICR_PROPBASER_INNER_CACHEABILITY_MASK
+
+#define GICR_PROPBASER_InnerShareable                                  \
+       GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable)
+
+/* Inner cacheability encodings, shifted into place for GICR_PROPBASER. */
+#define GICR_PROPBASER_nCnB    GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nCnB)
+#define GICR_PROPBASER_nC      GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC)
+#define GICR_PROPBASER_RaWt    GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt)
+#define GICR_PROPBASER_RaWb    GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb)
+#define GICR_PROPBASER_WaWt    GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWt)
+#define GICR_PROPBASER_WaWb    GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb)
+#define GICR_PROPBASER_RaWaWt  GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWt)
+#define GICR_PROPBASER_RaWaWb  GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb)
+
+#define GICR_PROPBASER_IDBITS_MASK                     (0x1f)
+
+#define GICR_PENDBASER_SHAREABILITY_SHIFT              (10)
+#define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT                (7)
+#define GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT                (56)
+#define GICR_PENDBASER_SHAREABILITY_MASK                               \
+       GIC_BASER_SHAREABILITY(GICR_PENDBASER, SHAREABILITY_MASK)
+#define GICR_PENDBASER_INNER_CACHEABILITY_MASK                         \
+       GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, MASK)
+#define GICR_PENDBASER_OUTER_CACHEABILITY_MASK                         \
+       GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, MASK)
+#define GICR_PENDBASER_CACHEABILITY_MASK GICR_PENDBASER_INNER_CACHEABILITY_MASK
+
+#define GICR_PENDBASER_InnerShareable                                  \
+       GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)
+
+/* Inner cacheability encodings, shifted into place for GICR_PENDBASER. */
+#define GICR_PENDBASER_nCnB    GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nCnB)
+#define GICR_PENDBASER_nC      GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC)
+#define GICR_PENDBASER_RaWt    GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt)
+#define GICR_PENDBASER_RaWb    GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb)
+#define GICR_PENDBASER_WaWt    GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWt)
+#define GICR_PENDBASER_WaWb    GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb)
+#define GICR_PENDBASER_RaWaWt  GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWt)
+#define GICR_PENDBASER_RaWaWb  GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWb)
+
+#define GICR_PENDBASER_PTZ                             BIT_ULL(62)
 
 /*
  * Re-Distributor registers, offsets from SGI_base
 #define GITS_CWRITER                   0x0088
 #define GITS_CREADR                    0x0090
 #define GITS_BASER                     0x0100
+#define GITS_IDREGS_BASE               0xffd0
+#define GITS_PIDR0                     0xffe0
+#define GITS_PIDR1                     0xffe4
 #define GITS_PIDR2                     GICR_PIDR2
+#define GITS_PIDR4                     0xffd0
+#define GITS_CIDR0                     0xfff0
+#define GITS_CIDR1                     0xfff4
+#define GITS_CIDR2                     0xfff8
+#define GITS_CIDR3                     0xfffc
 
 #define GITS_TRANSLATER                        0x10040
 
 #define GITS_CTLR_ENABLE               (1U << 0)
 #define GITS_CTLR_QUIESCENT            (1U << 31)
 
+#define GITS_TYPER_PLPIS               (1UL << 0)
+#define GITS_TYPER_IDBITS_SHIFT                8
 #define GITS_TYPER_DEVBITS_SHIFT       13
 #define GITS_TYPER_DEVBITS(r)          ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1)
 #define GITS_TYPER_PTA                 (1UL << 19)
-
-#define GITS_CBASER_VALID              (1UL << 63)
-#define GITS_CBASER_nCnB               (0UL << 59)
-#define GITS_CBASER_nC                 (1UL << 59)
-#define GITS_CBASER_RaWt               (2UL << 59)
-#define GITS_CBASER_RaWb               (3UL << 59)
-#define GITS_CBASER_WaWt               (4UL << 59)
-#define GITS_CBASER_WaWb               (5UL << 59)
-#define GITS_CBASER_RaWaWt             (6UL << 59)
-#define GITS_CBASER_RaWaWb             (7UL << 59)
-#define GITS_CBASER_CACHEABILITY_MASK  (7UL << 59)
-#define GITS_CBASER_NonShareable       (0UL << 10)
-#define GITS_CBASER_InnerShareable     (1UL << 10)
-#define GITS_CBASER_OuterShareable     (2UL << 10)
-#define GITS_CBASER_SHAREABILITY_MASK  (3UL << 10)
+#define GITS_TYPER_HWCOLLCNT_SHIFT     24
+
+/* Bit 63 needs a 64-bit constant: 1UL << 63 overflows where long is 32 bits. */
+#define GITS_CBASER_VALID                      (1ULL << 63)
+#define GITS_CBASER_SHAREABILITY_SHIFT         (10)
+#define GITS_CBASER_INNER_CACHEABILITY_SHIFT   (59)
+#define GITS_CBASER_OUTER_CACHEABILITY_SHIFT   (53)
+#define GITS_CBASER_SHAREABILITY_MASK                                  \
+       GIC_BASER_SHAREABILITY(GITS_CBASER, SHAREABILITY_MASK)
+#define GITS_CBASER_INNER_CACHEABILITY_MASK                            \
+       GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, MASK)
+#define GITS_CBASER_OUTER_CACHEABILITY_MASK                            \
+       GIC_BASER_CACHEABILITY(GITS_CBASER, OUTER, MASK)
+#define GITS_CBASER_CACHEABILITY_MASK GITS_CBASER_INNER_CACHEABILITY_MASK
+
+#define GITS_CBASER_InnerShareable                                     \
+       GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable)
+
+/* Inner cacheability encodings, shifted into place for GITS_CBASER. */
+#define GITS_CBASER_nCnB       GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nCnB)
+#define GITS_CBASER_nC         GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC)
+#define GITS_CBASER_RaWt       GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt)
+#define GITS_CBASER_RaWb       GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWb)
+#define GITS_CBASER_WaWt       GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWt)
+#define GITS_CBASER_WaWb       GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb)
+#define GITS_CBASER_RaWaWt     GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt)
+#define GITS_CBASER_RaWaWb     GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb)
 
 #define GITS_BASER_NR_REGS             8
 
-#define GITS_BASER_VALID               (1UL << 63)
-#define GITS_BASER_INDIRECT            (1UL << 62)
-#define GITS_BASER_nCnB                        (0UL << 59)
-#define GITS_BASER_nC                  (1UL << 59)
-#define GITS_BASER_RaWt                        (2UL << 59)
-#define GITS_BASER_RaWb                        (3UL << 59)
-#define GITS_BASER_WaWt                        (4UL << 59)
-#define GITS_BASER_WaWb                        (5UL << 59)
-#define GITS_BASER_RaWaWt              (6UL << 59)
-#define GITS_BASER_RaWaWb              (7UL << 59)
-#define GITS_BASER_CACHEABILITY_MASK   (7UL << 59)
-#define GITS_BASER_TYPE_SHIFT          (56)
+/* Bit 63 needs a 64-bit constant: 1UL << 63 overflows where long is 32 bits. */
+#define GITS_BASER_VALID                       (1ULL << 63)
+#define GITS_BASER_INDIRECT                    (1ULL << 62)
+
+#define GITS_BASER_INNER_CACHEABILITY_SHIFT    (59)
+#define GITS_BASER_OUTER_CACHEABILITY_SHIFT    (53)
+#define GITS_BASER_INNER_CACHEABILITY_MASK                             \
+       GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK)
+#define GITS_BASER_CACHEABILITY_MASK           GITS_BASER_INNER_CACHEABILITY_MASK
+#define GITS_BASER_OUTER_CACHEABILITY_MASK                             \
+       GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK)
+#define GITS_BASER_SHAREABILITY_MASK                                   \
+       GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK)
+
+/* Inner cacheability encodings, shifted into place for GITS_BASER. */
+#define GITS_BASER_nCnB                GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nCnB)
+#define GITS_BASER_nC          GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC)
+#define GITS_BASER_RaWt                GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt)
+#define GITS_BASER_RaWb                GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb)
+#define GITS_BASER_WaWt                GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWt)
+#define GITS_BASER_WaWb                GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb)
+#define GITS_BASER_RaWaWt      GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWt)
+#define GITS_BASER_RaWaWb      GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWb)
+
+#define GITS_BASER_TYPE_SHIFT                  (56)
 #define GITS_BASER_TYPE(r)             (((r) >> GITS_BASER_TYPE_SHIFT) & 7)
-#define GITS_BASER_ENTRY_SIZE_SHIFT    (48)
+#define GITS_BASER_ENTRY_SIZE_SHIFT            (48)
 #define GITS_BASER_ENTRY_SIZE(r)       ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0xff) + 1)
-#define GITS_BASER_NonShareable                (0UL << 10)
-#define GITS_BASER_InnerShareable      (1UL << 10)
-#define GITS_BASER_OuterShareable      (2UL << 10)
 #define GITS_BASER_SHAREABILITY_SHIFT  (10)
-#define GITS_BASER_SHAREABILITY_MASK   (3UL << GITS_BASER_SHAREABILITY_SHIFT)
+#define GITS_BASER_InnerShareable                                      \
+       GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable)
 #define GITS_BASER_PAGE_SIZE_SHIFT     (8)
 #define GITS_BASER_PAGE_SIZE_4K                (0UL << GITS_BASER_PAGE_SIZE_SHIFT)
 #define GITS_BASER_PAGE_SIZE_16K       (1UL << GITS_BASER_PAGE_SIZE_SHIFT)
 #define GITS_BASER_PAGE_SIZE_MASK      (3UL << GITS_BASER_PAGE_SIZE_SHIFT)
 #define GITS_BASER_PAGES_MAX           256
 #define GITS_BASER_PAGES_SHIFT         (0)
+#define GITS_BASER_NR_PAGES(r)         (((r) & 0xff) + 1)
 
 #define GITS_BASER_TYPE_NONE           0
 #define GITS_BASER_TYPE_DEVICE         1
  */
 #define GITS_CMD_MAPD                  0x08
 #define GITS_CMD_MAPC                  0x09
-#define GITS_CMD_MAPVI                 0x0a
+#define GITS_CMD_MAPTI                 0x0a
+/* older GIC documentation used MAPVI for this command */
+#define GITS_CMD_MAPVI                 GITS_CMD_MAPTI
+#define GITS_CMD_MAPI                  0x0b
 #define GITS_CMD_MOVI                  0x01
 #define GITS_CMD_DISCARD               0x0f
 #define GITS_CMD_INV                   0x0c
 #define GITS_CMD_CLEAR                 0x04
 #define GITS_CMD_SYNC                  0x05
 
+/*
+ * ITS error numbers
+ */
+#define E_ITS_MOVI_UNMAPPED_INTERRUPT          0x010107
+#define E_ITS_MOVI_UNMAPPED_COLLECTION         0x010109
+#define E_ITS_CLEAR_UNMAPPED_INTERRUPT         0x010507
+#define E_ITS_MAPD_DEVICE_OOR                  0x010801
+#define E_ITS_MAPC_PROCNUM_OOR                 0x010902
+#define E_ITS_MAPC_COLLECTION_OOR              0x010903
+#define E_ITS_MAPTI_UNMAPPED_DEVICE            0x010a04
+#define E_ITS_MAPTI_PHYSICALID_OOR             0x010a06
+#define E_ITS_INV_UNMAPPED_INTERRUPT           0x010c07
+#define E_ITS_INVALL_UNMAPPED_COLLECTION       0x010d09
+#define E_ITS_MOVALL_PROCNUM_OOR               0x010e01
+#define E_ITS_DISCARD_UNMAPPED_INTERRUPT       0x010f07
+
 /*
  * CPU interface registers
  */
index 1c9c973..aafd702 100644 (file)
@@ -164,6 +164,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                            int len, struct kvm_io_device *dev);
 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                              struct kvm_io_device *dev);
+struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+                                        gpa_t addr);
 
 #ifdef CONFIG_KVM_ASYNC_PF
 struct kvm_async_pf {
@@ -371,7 +373,15 @@ struct kvm {
        struct srcu_struct srcu;
        struct srcu_struct irq_srcu;
        struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+
+       /*
+        * created_vcpus is protected by kvm->lock, and is incremented
+        * at the beginning of KVM_CREATE_VCPU.  online_vcpus is only
+        * incremented after storing the kvm_vcpu pointer in vcpus,
+        * and is accessed atomically.
+        */
        atomic_t online_vcpus;
+       int created_vcpus;
        int last_boosted_vcpu;
        struct list_head vm_list;
        struct mutex lock;
@@ -867,45 +877,6 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
 }
 #endif
 
-/* must be called with irqs disabled */
-static inline void __kvm_guest_enter(void)
-{
-       guest_enter();
-       /* KVM does not hold any references to rcu protected data when it
-        * switches CPU into a guest mode. In fact switching to a guest mode
-        * is very similar to exiting to userspace from rcu point of view. In
-        * addition CPU may stay in a guest mode for quite a long time (up to
-        * one time slice). Lets treat guest mode as quiescent state, just like
-        * we do with user-mode execution.
-        */
-       if (!context_tracking_cpu_is_enabled())
-               rcu_virt_note_context_switch(smp_processor_id());
-}
-
-/* must be called with irqs disabled */
-static inline void __kvm_guest_exit(void)
-{
-       guest_exit();
-}
-
-static inline void kvm_guest_enter(void)
-{
-       unsigned long flags;
-
-       local_irq_save(flags);
-       __kvm_guest_enter();
-       local_irq_restore(flags);
-}
-
-static inline void kvm_guest_exit(void)
-{
-       unsigned long flags;
-
-       local_irq_save(flags);
-       __kvm_guest_exit();
-       local_irq_restore(flags);
-}
-
 /*
  * search_memslots() and __gfn_to_memslot() are here because they are
  * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c.
@@ -1042,7 +1013,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
                        const struct kvm_irq_routing_entry *entries,
                        unsigned nr,
                        unsigned flags);
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+                         struct kvm_kernel_irq_routing_entry *e,
                          const struct kvm_irq_routing_entry *ue);
 void kvm_free_irq_routing(struct kvm *kvm);
 
@@ -1097,12 +1069,6 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 
 #endif /* CONFIG_HAVE_KVM_EVENTFD */
 
-#ifdef CONFIG_KVM_APIC_ARCHITECTURE
-bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu);
-#else
-static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; }
-#endif
-
 static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
 {
        /*
index 8b5e0a9..610e132 100644 (file)
@@ -124,6 +124,15 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
        return ret;
 }
 
+static inline int page_ref_inc_return(struct page *page)
+{
+       int ret = atomic_inc_return(&page->_refcount);
+
+       if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return))
+               __page_ref_mod_and_return(page, 1, ret);
+       return ret;
+}
+
 static inline int page_ref_dec_and_test(struct page *page)
 {
        int ret = atomic_dec_and_test(&page->_refcount);
index f28292d..8ade3eb 100644 (file)
@@ -151,8 +151,9 @@ TRACE_EVENT(kvm_msi_set_irq,
                __entry->data           = data;
        ),
 
-       TP_printk("dst %u vec %u (%s|%s|%s%s)",
-                 (u8)(__entry->address >> 12), (u8)__entry->data,
+       TP_printk("dst %llx vec %u (%s|%s|%s%s)",
+                 (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
+                 (u8)__entry->data,
                  __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
                  (__entry->address & (1<<2)) ? "logical" : "physical",
                  (__entry->data & (1<<15)) ? "level" : "edge",
index 05ebf47..e98bb4c 100644 (file)
@@ -866,6 +866,10 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_ARM_PMU_V3 126
 #define KVM_CAP_VCPU_ATTRIBUTES 127
 #define KVM_CAP_MAX_VCPU_ID 128
+#define KVM_CAP_X2APIC_API 129
+#define KVM_CAP_S390_USER_INSTR0 130
+#define KVM_CAP_MSI_DEVID 131
+#define KVM_CAP_PPC_HTM 132
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1024,12 +1028,14 @@ struct kvm_one_reg {
        __u64 addr;
 };
 
+#define KVM_MSI_VALID_DEVID    (1U << 0)
 struct kvm_msi {
        __u32 address_lo;
        __u32 address_hi;
        __u32 data;
        __u32 flags;
-       __u8  pad[16];
+       __u32 devid;
+       __u8  pad[12];
 };
 
 struct kvm_arm_device_addr {
@@ -1074,6 +1080,8 @@ enum kvm_device_type {
 #define KVM_DEV_TYPE_FLIC              KVM_DEV_TYPE_FLIC
        KVM_DEV_TYPE_ARM_VGIC_V3,
 #define KVM_DEV_TYPE_ARM_VGIC_V3       KVM_DEV_TYPE_ARM_VGIC_V3
+       KVM_DEV_TYPE_ARM_VGIC_ITS,
+#define KVM_DEV_TYPE_ARM_VGIC_ITS      KVM_DEV_TYPE_ARM_VGIC_ITS
        KVM_DEV_TYPE_MAX,
 };
 
@@ -1313,4 +1321,7 @@ struct kvm_assigned_msix_entry {
        __u16 padding[3];
 };
 
+#define KVM_X2APIC_API_USE_32BIT_IDS            (1ULL << 0)
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK  (1ULL << 1)
+
 #endif /* __LINUX_KVM_H */
index 547741f..96b2b2f 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -723,6 +723,7 @@ retry:
        }
        return 0;
 }
+EXPORT_SYMBOL_GPL(fixup_user_fault);
 
 static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
                                                struct mm_struct *mm,
index e5d6108..b0cc1a3 100644 (file)
@@ -16,9 +16,6 @@ config HAVE_KVM_EVENTFD
        bool
        select EVENTFD
 
-config KVM_APIC_ARCHITECTURE
-       bool
-
 config KVM_MMIO
        bool
 
index 3a3a699..7cffd93 100644 (file)
 
 #include <asm/kvm_hyp.h>
 
-#ifdef CONFIG_KVM_NEW_VGIC
-extern struct vgic_global kvm_vgic_global_state;
-#define vgic_v2_params kvm_vgic_global_state
-#else
-extern struct vgic_params vgic_v2_params;
-#endif
-
 static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu,
                                            void __iomem *base)
 {
        struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-       int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+       int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
        u32 eisr0, eisr1;
        int i;
        bool expect_mi;
@@ -74,7 +67,7 @@ static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu,
 static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
 {
        struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-       int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+       int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
        u32 elrsr0, elrsr1;
 
        elrsr0 = readl_relaxed(base + GICH_ELRSR0);
@@ -93,7 +86,7 @@ static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
 static void __hyp_text save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
 {
        struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-       int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+       int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
        int i;
 
        for (i = 0; i < nr_lr; i++) {
@@ -147,7 +140,7 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu)
        struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
        struct vgic_dist *vgic = &kvm->arch.vgic;
        void __iomem *base = kern_hyp_va(vgic->vctrl_base);
-       int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+       int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
        int i;
        u64 live_lrs = 0;
 
diff --git a/virt/kvm/arm/vgic-v2-emul.c b/virt/kvm/arm/vgic-v2-emul.c
deleted file mode 100644 (file)
index 1b0bee0..0000000
+++ /dev/null
@@ -1,856 +0,0 @@
-/*
- * Contains GICv2 specific emulation code, was in vgic.c before.
- *
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/uaccess.h>
-
-#include <linux/irqchip/arm-gic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-
-#define GICC_ARCH_VERSION_V2           0x2
-
-static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg);
-static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi)
-{
-       return dist->irq_sgi_sources + vcpu_id * VGIC_NR_SGIS + sgi;
-}
-
-static bool handle_mmio_misc(struct kvm_vcpu *vcpu,
-                            struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       u32 reg;
-       u32 word_offset = offset & 3;
-
-       switch (offset & ~3) {
-       case 0:                 /* GICD_CTLR */
-               reg = vcpu->kvm->arch.vgic.enabled;
-               vgic_reg_access(mmio, &reg, word_offset,
-                               ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-               if (mmio->is_write) {
-                       vcpu->kvm->arch.vgic.enabled = reg & 1;
-                       vgic_update_state(vcpu->kvm);
-                       return true;
-               }
-               break;
-
-       case 4:                 /* GICD_TYPER */
-               reg  = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
-               reg |= (vcpu->kvm->arch.vgic.nr_irqs >> 5) - 1;
-               vgic_reg_access(mmio, &reg, word_offset,
-                               ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-               break;
-
-       case 8:                 /* GICD_IIDR */
-               reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
-               vgic_reg_access(mmio, &reg, word_offset,
-                               ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-               break;
-       }
-
-       return false;
-}
-
-static bool handle_mmio_set_enable_reg(struct kvm_vcpu *vcpu,
-                                      struct kvm_exit_mmio *mmio,
-                                      phys_addr_t offset)
-{
-       return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-                                     vcpu->vcpu_id, ACCESS_WRITE_SETBIT);
-}
-
-static bool handle_mmio_clear_enable_reg(struct kvm_vcpu *vcpu,
-                                        struct kvm_exit_mmio *mmio,
-                                        phys_addr_t offset)
-{
-       return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-                                     vcpu->vcpu_id, ACCESS_WRITE_CLEARBIT);
-}
-
-static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu,
-                                       struct kvm_exit_mmio *mmio,
-                                       phys_addr_t offset)
-{
-       return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
-                                          vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu,
-                                         struct kvm_exit_mmio *mmio,
-                                         phys_addr_t offset)
-{
-       return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
-                                            vcpu->vcpu_id);
-}
-
-static bool handle_mmio_set_active_reg(struct kvm_vcpu *vcpu,
-                                      struct kvm_exit_mmio *mmio,
-                                      phys_addr_t offset)
-{
-       return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
-                                         vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_active_reg(struct kvm_vcpu *vcpu,
-                                        struct kvm_exit_mmio *mmio,
-                                        phys_addr_t offset)
-{
-       return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
-                                           vcpu->vcpu_id);
-}
-
-static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu,
-                                    struct kvm_exit_mmio *mmio,
-                                    phys_addr_t offset)
-{
-       u32 *reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
-                                       vcpu->vcpu_id, offset);
-       vgic_reg_access(mmio, reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-       return false;
-}
-
-#define GICD_ITARGETSR_SIZE    32
-#define GICD_CPUTARGETS_BITS   8
-#define GICD_IRQS_PER_ITARGETSR        (GICD_ITARGETSR_SIZE / GICD_CPUTARGETS_BITS)
-static u32 vgic_get_target_reg(struct kvm *kvm, int irq)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       int i;
-       u32 val = 0;
-
-       irq -= VGIC_NR_PRIVATE_IRQS;
-
-       for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
-               val |= 1 << (dist->irq_spi_cpu[irq + i] + i * 8);
-
-       return val;
-}
-
-static void vgic_set_target_reg(struct kvm *kvm, u32 val, int irq)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct kvm_vcpu *vcpu;
-       int i, c;
-       unsigned long *bmap;
-       u32 target;
-
-       irq -= VGIC_NR_PRIVATE_IRQS;
-
-       /*
-        * Pick the LSB in each byte. This ensures we target exactly
-        * one vcpu per IRQ. If the byte is null, assume we target
-        * CPU0.
-        */
-       for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) {
-               int shift = i * GICD_CPUTARGETS_BITS;
-
-               target = ffs((val >> shift) & 0xffU);
-               target = target ? (target - 1) : 0;
-               dist->irq_spi_cpu[irq + i] = target;
-               kvm_for_each_vcpu(c, vcpu, kvm) {
-                       bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]);
-                       if (c == target)
-                               set_bit(irq + i, bmap);
-                       else
-                               clear_bit(irq + i, bmap);
-               }
-       }
-}
-
-static bool handle_mmio_target_reg(struct kvm_vcpu *vcpu,
-                                  struct kvm_exit_mmio *mmio,
-                                  phys_addr_t offset)
-{
-       u32 reg;
-
-       /* We treat the banked interrupts targets as read-only */
-       if (offset < 32) {
-               u32 roreg;
-
-               roreg = 1 << vcpu->vcpu_id;
-               roreg |= roreg << 8;
-               roreg |= roreg << 16;
-
-               vgic_reg_access(mmio, &roreg, offset,
-                               ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-               return false;
-       }
-
-       reg = vgic_get_target_reg(vcpu->kvm, offset & ~3U);
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-       if (mmio->is_write) {
-               vgic_set_target_reg(vcpu->kvm, reg, offset & ~3U);
-               vgic_update_state(vcpu->kvm);
-               return true;
-       }
-
-       return false;
-}
-
-static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu,
-                               struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       u32 *reg;
-
-       reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
-                                 vcpu->vcpu_id, offset >> 1);
-
-       return vgic_handle_cfg_reg(reg, mmio, offset);
-}
-
-static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu,
-                               struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       u32 reg;
-
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_VALUE);
-       if (mmio->is_write) {
-               vgic_dispatch_sgi(vcpu, reg);
-               vgic_update_state(vcpu->kvm);
-               return true;
-       }
-
-       return false;
-}
-
-/* Handle reads of GICD_CPENDSGIRn and GICD_SPENDSGIRn */
-static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
-                                       struct kvm_exit_mmio *mmio,
-                                       phys_addr_t offset)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       int sgi;
-       int min_sgi = (offset & ~0x3);
-       int max_sgi = min_sgi + 3;
-       int vcpu_id = vcpu->vcpu_id;
-       u32 reg = 0;
-
-       /* Copy source SGIs from distributor side */
-       for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
-               u8 sources = *vgic_get_sgi_sources(dist, vcpu_id, sgi);
-
-               reg |= ((u32)sources) << (8 * (sgi - min_sgi));
-       }
-
-       mmio_data_write(mmio, ~0, reg);
-       return false;
-}
-
-static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
-                                        struct kvm_exit_mmio *mmio,
-                                        phys_addr_t offset, bool set)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       int sgi;
-       int min_sgi = (offset & ~0x3);
-       int max_sgi = min_sgi + 3;
-       int vcpu_id = vcpu->vcpu_id;
-       u32 reg;
-       bool updated = false;
-
-       reg = mmio_data_read(mmio, ~0);
-
-       /* Clear pending SGIs on the distributor */
-       for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
-               u8 mask = reg >> (8 * (sgi - min_sgi));
-               u8 *src = vgic_get_sgi_sources(dist, vcpu_id, sgi);
-
-               if (set) {
-                       if ((*src & mask) != mask)
-                               updated = true;
-                       *src |= mask;
-               } else {
-                       if (*src & mask)
-                               updated = true;
-                       *src &= ~mask;
-               }
-       }
-
-       if (updated)
-               vgic_update_state(vcpu->kvm);
-
-       return updated;
-}
-
-static bool handle_mmio_sgi_set(struct kvm_vcpu *vcpu,
-                               struct kvm_exit_mmio *mmio,
-                               phys_addr_t offset)
-{
-       if (!mmio->is_write)
-               return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
-       else
-               return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, true);
-}
-
-static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu,
-                                 struct kvm_exit_mmio *mmio,
-                                 phys_addr_t offset)
-{
-       if (!mmio->is_write)
-               return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
-       else
-               return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, false);
-}
-
-static const struct vgic_io_range vgic_dist_ranges[] = {
-       {
-               .base           = GIC_DIST_SOFTINT,
-               .len            = 4,
-               .handle_mmio    = handle_mmio_sgi_reg,
-       },
-       {
-               .base           = GIC_DIST_CTRL,
-               .len            = 12,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_misc,
-       },
-       {
-               .base           = GIC_DIST_IGROUP,
-               .len            = VGIC_MAX_IRQS / 8,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               .base           = GIC_DIST_ENABLE_SET,
-               .len            = VGIC_MAX_IRQS / 8,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_enable_reg,
-       },
-       {
-               .base           = GIC_DIST_ENABLE_CLEAR,
-               .len            = VGIC_MAX_IRQS / 8,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_enable_reg,
-       },
-       {
-               .base           = GIC_DIST_PENDING_SET,
-               .len            = VGIC_MAX_IRQS / 8,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_pending_reg,
-       },
-       {
-               .base           = GIC_DIST_PENDING_CLEAR,
-               .len            = VGIC_MAX_IRQS / 8,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_pending_reg,
-       },
-       {
-               .base           = GIC_DIST_ACTIVE_SET,
-               .len            = VGIC_MAX_IRQS / 8,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_active_reg,
-       },
-       {
-               .base           = GIC_DIST_ACTIVE_CLEAR,
-               .len            = VGIC_MAX_IRQS / 8,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_active_reg,
-       },
-       {
-               .base           = GIC_DIST_PRI,
-               .len            = VGIC_MAX_IRQS,
-               .bits_per_irq   = 8,
-               .handle_mmio    = handle_mmio_priority_reg,
-       },
-       {
-               .base           = GIC_DIST_TARGET,
-               .len            = VGIC_MAX_IRQS,
-               .bits_per_irq   = 8,
-               .handle_mmio    = handle_mmio_target_reg,
-       },
-       {
-               .base           = GIC_DIST_CONFIG,
-               .len            = VGIC_MAX_IRQS / 4,
-               .bits_per_irq   = 2,
-               .handle_mmio    = handle_mmio_cfg_reg,
-       },
-       {
-               .base           = GIC_DIST_SGI_PENDING_CLEAR,
-               .len            = VGIC_NR_SGIS,
-               .handle_mmio    = handle_mmio_sgi_clear,
-       },
-       {
-               .base           = GIC_DIST_SGI_PENDING_SET,
-               .len            = VGIC_NR_SGIS,
-               .handle_mmio    = handle_mmio_sgi_set,
-       },
-       {}
-};
-
-static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg)
-{
-       struct kvm *kvm = vcpu->kvm;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       int nrcpus = atomic_read(&kvm->online_vcpus);
-       u8 target_cpus;
-       int sgi, mode, c, vcpu_id;
-
-       vcpu_id = vcpu->vcpu_id;
-
-       sgi = reg & 0xf;
-       target_cpus = (reg >> 16) & 0xff;
-       mode = (reg >> 24) & 3;
-
-       switch (mode) {
-       case 0:
-               if (!target_cpus)
-                       return;
-               break;
-
-       case 1:
-               target_cpus = ((1 << nrcpus) - 1) & ~(1 << vcpu_id) & 0xff;
-               break;
-
-       case 2:
-               target_cpus = 1 << vcpu_id;
-               break;
-       }
-
-       kvm_for_each_vcpu(c, vcpu, kvm) {
-               if (target_cpus & 1) {
-                       /* Flag the SGI as pending */
-                       vgic_dist_irq_set_pending(vcpu, sgi);
-                       *vgic_get_sgi_sources(dist, c, sgi) |= 1 << vcpu_id;
-                       kvm_debug("SGI%d from CPU%d to CPU%d\n",
-                                 sgi, vcpu_id, c);
-               }
-
-               target_cpus >>= 1;
-       }
-}
-
-static bool vgic_v2_queue_sgi(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       unsigned long sources;
-       int vcpu_id = vcpu->vcpu_id;
-       int c;
-
-       sources = *vgic_get_sgi_sources(dist, vcpu_id, irq);
-
-       for_each_set_bit(c, &sources, dist->nr_cpus) {
-               if (vgic_queue_irq(vcpu, c, irq))
-                       clear_bit(c, &sources);
-       }
-
-       *vgic_get_sgi_sources(dist, vcpu_id, irq) = sources;
-
-       /*
-        * If the sources bitmap has been cleared it means that we
-        * could queue all the SGIs onto link registers (see the
-        * clear_bit above), and therefore we are done with them in
-        * our emulated gic and can get rid of them.
-        */
-       if (!sources) {
-               vgic_dist_irq_clear_pending(vcpu, irq);
-               vgic_cpu_irq_clear(vcpu, irq);
-               return true;
-       }
-
-       return false;
-}
-
-/**
- * kvm_vgic_map_resources - Configure global VGIC state before running any VCPUs
- * @kvm: pointer to the kvm struct
- *
- * Map the virtual CPU interface into the VM before running any VCPUs.  We
- * can't do this at creation time, because user space must first set the
- * virtual CPU interface address in the guest physical address space.
- */
-static int vgic_v2_map_resources(struct kvm *kvm,
-                                const struct vgic_params *params)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       int ret = 0;
-
-       if (!irqchip_in_kernel(kvm))
-               return 0;
-
-       mutex_lock(&kvm->lock);
-
-       if (vgic_ready(kvm))
-               goto out;
-
-       if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
-           IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) {
-               kvm_err("Need to set vgic cpu and dist addresses first\n");
-               ret = -ENXIO;
-               goto out;
-       }
-
-       vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base,
-                                KVM_VGIC_V2_DIST_SIZE,
-                                vgic_dist_ranges, -1, &dist->dist_iodev);
-
-       /*
-        * Initialize the vgic if this hasn't already been done on demand by
-        * accessing the vgic state from userspace.
-        */
-       ret = vgic_init(kvm);
-       if (ret) {
-               kvm_err("Unable to allocate maps\n");
-               goto out_unregister;
-       }
-
-       ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
-                                   params->vcpu_base, KVM_VGIC_V2_CPU_SIZE,
-                                   true);
-       if (ret) {
-               kvm_err("Unable to remap VGIC CPU to VCPU\n");
-               goto out_unregister;
-       }
-
-       dist->ready = true;
-       goto out;
-
-out_unregister:
-       kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev);
-
-out:
-       if (ret)
-               kvm_vgic_destroy(kvm);
-       mutex_unlock(&kvm->lock);
-       return ret;
-}
-
-static void vgic_v2_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       *vgic_get_sgi_sources(dist, vcpu->vcpu_id, irq) |= 1 << source;
-}
-
-static int vgic_v2_init_model(struct kvm *kvm)
-{
-       int i;
-
-       for (i = VGIC_NR_PRIVATE_IRQS; i < kvm->arch.vgic.nr_irqs; i += 4)
-               vgic_set_target_reg(kvm, 0, i);
-
-       return 0;
-}
-
-void vgic_v2_init_emulation(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-
-       dist->vm_ops.queue_sgi = vgic_v2_queue_sgi;
-       dist->vm_ops.add_sgi_source = vgic_v2_add_sgi_source;
-       dist->vm_ops.init_model = vgic_v2_init_model;
-       dist->vm_ops.map_resources = vgic_v2_map_resources;
-
-       kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
-}
-
-static bool handle_cpu_mmio_misc(struct kvm_vcpu *vcpu,
-                                struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       bool updated = false;
-       struct vgic_vmcr vmcr;
-       u32 *vmcr_field;
-       u32 reg;
-
-       vgic_get_vmcr(vcpu, &vmcr);
-
-       switch (offset & ~0x3) {
-       case GIC_CPU_CTRL:
-               vmcr_field = &vmcr.ctlr;
-               break;
-       case GIC_CPU_PRIMASK:
-               vmcr_field = &vmcr.pmr;
-               break;
-       case GIC_CPU_BINPOINT:
-               vmcr_field = &vmcr.bpr;
-               break;
-       case GIC_CPU_ALIAS_BINPOINT:
-               vmcr_field = &vmcr.abpr;
-               break;
-       default:
-               BUG();
-       }
-
-       if (!mmio->is_write) {
-               reg = *vmcr_field;
-               mmio_data_write(mmio, ~0, reg);
-       } else {
-               reg = mmio_data_read(mmio, ~0);
-               if (reg != *vmcr_field) {
-                       *vmcr_field = reg;
-                       vgic_set_vmcr(vcpu, &vmcr);
-                       updated = true;
-               }
-       }
-       return updated;
-}
-
-static bool handle_mmio_abpr(struct kvm_vcpu *vcpu,
-                            struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       return handle_cpu_mmio_misc(vcpu, mmio, GIC_CPU_ALIAS_BINPOINT);
-}
-
-static bool handle_cpu_mmio_ident(struct kvm_vcpu *vcpu,
-                                 struct kvm_exit_mmio *mmio,
-                                 phys_addr_t offset)
-{
-       u32 reg;
-
-       if (mmio->is_write)
-               return false;
-
-       /* GICC_IIDR */
-       reg = (PRODUCT_ID_KVM << 20) |
-             (GICC_ARCH_VERSION_V2 << 16) |
-             (IMPLEMENTER_ARM << 0);
-       mmio_data_write(mmio, ~0, reg);
-       return false;
-}
-
-/*
- * CPU Interface Register accesses - these are not accessed by the VM, but by
- * user space for saving and restoring VGIC state.
- */
-static const struct vgic_io_range vgic_cpu_ranges[] = {
-       {
-               .base           = GIC_CPU_CTRL,
-               .len            = 12,
-               .handle_mmio    = handle_cpu_mmio_misc,
-       },
-       {
-               .base           = GIC_CPU_ALIAS_BINPOINT,
-               .len            = 4,
-               .handle_mmio    = handle_mmio_abpr,
-       },
-       {
-               .base           = GIC_CPU_ACTIVEPRIO,
-               .len            = 16,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               .base           = GIC_CPU_IDENT,
-               .len            = 4,
-               .handle_mmio    = handle_cpu_mmio_ident,
-       },
-};
-
-static int vgic_attr_regs_access(struct kvm_device *dev,
-                                struct kvm_device_attr *attr,
-                                u32 *reg, bool is_write)
-{
-       const struct vgic_io_range *r = NULL, *ranges;
-       phys_addr_t offset;
-       int ret, cpuid, c;
-       struct kvm_vcpu *vcpu, *tmp_vcpu;
-       struct vgic_dist *vgic;
-       struct kvm_exit_mmio mmio;
-       u32 data;
-
-       offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
-       cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
-               KVM_DEV_ARM_VGIC_CPUID_SHIFT;
-
-       mutex_lock(&dev->kvm->lock);
-
-       ret = vgic_init(dev->kvm);
-       if (ret)
-               goto out;
-
-       if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) {
-               ret = -EINVAL;
-               goto out;
-       }
-
-       vcpu = kvm_get_vcpu(dev->kvm, cpuid);
-       vgic = &dev->kvm->arch.vgic;
-
-       mmio.len = 4;
-       mmio.is_write = is_write;
-       mmio.data = &data;
-       if (is_write)
-               mmio_data_write(&mmio, ~0, *reg);
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-               mmio.phys_addr = vgic->vgic_dist_base + offset;
-               ranges = vgic_dist_ranges;
-               break;
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-               mmio.phys_addr = vgic->vgic_cpu_base + offset;
-               ranges = vgic_cpu_ranges;
-               break;
-       default:
-               BUG();
-       }
-       r = vgic_find_range(ranges, 4, offset);
-
-       if (unlikely(!r || !r->handle_mmio)) {
-               ret = -ENXIO;
-               goto out;
-       }
-
-
-       spin_lock(&vgic->lock);
-
-       /*
-        * Ensure that no other VCPU is running by checking the vcpu->cpu
-        * field.  If no other VPCUs are running we can safely access the VGIC
-        * state, because even if another VPU is run after this point, that
-        * VCPU will not touch the vgic state, because it will block on
-        * getting the vgic->lock in kvm_vgic_sync_hwstate().
-        */
-       kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) {
-               if (unlikely(tmp_vcpu->cpu != -1)) {
-                       ret = -EBUSY;
-                       goto out_vgic_unlock;
-               }
-       }
-
-       /*
-        * Move all pending IRQs from the LRs on all VCPUs so the pending
-        * state can be properly represented in the register state accessible
-        * through this API.
-        */
-       kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm)
-               vgic_unqueue_irqs(tmp_vcpu);
-
-       offset -= r->base;
-       r->handle_mmio(vcpu, &mmio, offset);
-
-       if (!is_write)
-               *reg = mmio_data_read(&mmio, ~0);
-
-       ret = 0;
-out_vgic_unlock:
-       spin_unlock(&vgic->lock);
-out:
-       mutex_unlock(&dev->kvm->lock);
-       return ret;
-}
-
-static int vgic_v2_create(struct kvm_device *dev, u32 type)
-{
-       return kvm_vgic_create(dev->kvm, type);
-}
-
-static void vgic_v2_destroy(struct kvm_device *dev)
-{
-       kfree(dev);
-}
-
-static int vgic_v2_set_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       int ret;
-
-       ret = vgic_set_common_attr(dev, attr);
-       if (ret != -ENXIO)
-               return ret;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
-               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-               u32 reg;
-
-               if (get_user(reg, uaddr))
-                       return -EFAULT;
-
-               return vgic_attr_regs_access(dev, attr, &reg, true);
-       }
-
-       }
-
-       return -ENXIO;
-}
-
-static int vgic_v2_get_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       int ret;
-
-       ret = vgic_get_common_attr(dev, attr);
-       if (ret != -ENXIO)
-               return ret;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
-               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-               u32 reg = 0;
-
-               ret = vgic_attr_regs_access(dev, attr, &reg, false);
-               if (ret)
-                       return ret;
-               return put_user(reg, uaddr);
-       }
-
-       }
-
-       return -ENXIO;
-}
-
-static int vgic_v2_has_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       phys_addr_t offset;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_ADDR:
-               switch (attr->attr) {
-               case KVM_VGIC_V2_ADDR_TYPE_DIST:
-               case KVM_VGIC_V2_ADDR_TYPE_CPU:
-                       return 0;
-               }
-               break;
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-               offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
-               return vgic_has_attr_regs(vgic_dist_ranges, offset);
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-               offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
-               return vgic_has_attr_regs(vgic_cpu_ranges, offset);
-       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
-               return 0;
-       case KVM_DEV_ARM_VGIC_GRP_CTRL:
-               switch (attr->attr) {
-               case KVM_DEV_ARM_VGIC_CTRL_INIT:
-                       return 0;
-               }
-       }
-       return -ENXIO;
-}
-
-struct kvm_device_ops kvm_arm_vgic_v2_ops = {
-       .name = "kvm-arm-vgic-v2",
-       .create = vgic_v2_create,
-       .destroy = vgic_v2_destroy,
-       .set_attr = vgic_v2_set_attr,
-       .get_attr = vgic_v2_get_attr,
-       .has_attr = vgic_v2_has_attr,
-};
diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
deleted file mode 100644 (file)
index 334cd7a..0000000
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (C) 2012,2013 ARM Limited, All Rights Reserved.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-
-#include <linux/irqchip/arm-gic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr)
-{
-       struct vgic_lr lr_desc;
-       u32 val = vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr];
-
-       lr_desc.irq     = val & GICH_LR_VIRTUALID;
-       if (lr_desc.irq <= 15)
-               lr_desc.source  = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7;
-       else
-               lr_desc.source = 0;
-       lr_desc.state   = 0;
-
-       if (val & GICH_LR_PENDING_BIT)
-               lr_desc.state |= LR_STATE_PENDING;
-       if (val & GICH_LR_ACTIVE_BIT)
-               lr_desc.state |= LR_STATE_ACTIVE;
-       if (val & GICH_LR_EOI)
-               lr_desc.state |= LR_EOI_INT;
-       if (val & GICH_LR_HW) {
-               lr_desc.state |= LR_HW;
-               lr_desc.hwirq = (val & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT;
-       }
-
-       return lr_desc;
-}
-
-static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
-                          struct vgic_lr lr_desc)
-{
-       u32 lr_val;
-
-       lr_val = lr_desc.irq;
-
-       if (lr_desc.state & LR_STATE_PENDING)
-               lr_val |= GICH_LR_PENDING_BIT;
-       if (lr_desc.state & LR_STATE_ACTIVE)
-               lr_val |= GICH_LR_ACTIVE_BIT;
-       if (lr_desc.state & LR_EOI_INT)
-               lr_val |= GICH_LR_EOI;
-
-       if (lr_desc.state & LR_HW) {
-               lr_val |= GICH_LR_HW;
-               lr_val |= (u32)lr_desc.hwirq << GICH_LR_PHYSID_CPUID_SHIFT;
-       }
-
-       if (lr_desc.irq < VGIC_NR_SGIS)
-               lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT);
-
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
-
-       if (!(lr_desc.state & LR_STATE_MASK))
-               vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr);
-       else
-               vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr &= ~(1ULL << lr);
-}
-
-static u64 vgic_v2_get_elrsr(const struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr;
-}
-
-static u64 vgic_v2_get_eisr(const struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr;
-}
-
-static void vgic_v2_clear_eisr(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr = 0;
-}
-
-static u32 vgic_v2_get_interrupt_status(const struct kvm_vcpu *vcpu)
-{
-       u32 misr = vcpu->arch.vgic_cpu.vgic_v2.vgic_misr;
-       u32 ret = 0;
-
-       if (misr & GICH_MISR_EOI)
-               ret |= INT_STATUS_EOI;
-       if (misr & GICH_MISR_U)
-               ret |= INT_STATUS_UNDERFLOW;
-
-       return ret;
-}
-
-static void vgic_v2_enable_underflow(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr |= GICH_HCR_UIE;
-}
-
-static void vgic_v2_disable_underflow(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr &= ~GICH_HCR_UIE;
-}
-
-static void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-       u32 vmcr = vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr;
-
-       vmcrp->ctlr = (vmcr & GICH_VMCR_CTRL_MASK) >> GICH_VMCR_CTRL_SHIFT;
-       vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >> GICH_VMCR_ALIAS_BINPOINT_SHIFT;
-       vmcrp->bpr  = (vmcr & GICH_VMCR_BINPOINT_MASK) >> GICH_VMCR_BINPOINT_SHIFT;
-       vmcrp->pmr  = (vmcr & GICH_VMCR_PRIMASK_MASK) >> GICH_VMCR_PRIMASK_SHIFT;
-}
-
-static void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-       u32 vmcr;
-
-       vmcr  = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK;
-       vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) & GICH_VMCR_ALIAS_BINPOINT_MASK;
-       vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & GICH_VMCR_BINPOINT_MASK;
-       vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK;
-
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr;
-}
-
-static void vgic_v2_enable(struct kvm_vcpu *vcpu)
-{
-       /*
-        * By forcing VMCR to zero, the GIC will restore the binary
-        * points to their reset values. Anything else resets to zero
-        * anyway.
-        */
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0;
-
-       /* Get the show on the road... */
-       vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
-}
-
-static const struct vgic_ops vgic_v2_ops = {
-       .get_lr                 = vgic_v2_get_lr,
-       .set_lr                 = vgic_v2_set_lr,
-       .get_elrsr              = vgic_v2_get_elrsr,
-       .get_eisr               = vgic_v2_get_eisr,
-       .clear_eisr             = vgic_v2_clear_eisr,
-       .get_interrupt_status   = vgic_v2_get_interrupt_status,
-       .enable_underflow       = vgic_v2_enable_underflow,
-       .disable_underflow      = vgic_v2_disable_underflow,
-       .get_vmcr               = vgic_v2_get_vmcr,
-       .set_vmcr               = vgic_v2_set_vmcr,
-       .enable                 = vgic_v2_enable,
-};
-
-struct vgic_params __section(.hyp.text) vgic_v2_params;
-
-static void vgic_cpu_init_lrs(void *params)
-{
-       struct vgic_params *vgic = params;
-       int i;
-
-       for (i = 0; i < vgic->nr_lr; i++)
-               writel_relaxed(0, vgic->vctrl_base + GICH_LR0 + (i * 4));
-}
-
-/**
- * vgic_v2_probe - probe for a GICv2 compatible interrupt controller
- * @gic_kvm_info:      pointer to the GIC description
- * @ops:               address of a pointer to the GICv2 operations
- * @params:            address of a pointer to HW-specific parameters
- *
- * Returns 0 if a GICv2 has been found, with the low level operations
- * in *ops and the HW parameters in *params. Returns an error code
- * otherwise.
- */
-int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info,
-                  const struct vgic_ops **ops,
-                  const struct vgic_params **params)
-{
-       int ret;
-       struct vgic_params *vgic = &vgic_v2_params;
-       const struct resource *vctrl_res = &gic_kvm_info->vctrl;
-       const struct resource *vcpu_res = &gic_kvm_info->vcpu;
-
-       memset(vgic, 0, sizeof(*vgic));
-
-       if (!gic_kvm_info->maint_irq) {
-               kvm_err("error getting vgic maintenance irq\n");
-               ret = -ENXIO;
-               goto out;
-       }
-       vgic->maint_irq = gic_kvm_info->maint_irq;
-
-       if (!gic_kvm_info->vctrl.start) {
-               kvm_err("GICH not present in the firmware table\n");
-               ret = -ENXIO;
-               goto out;
-       }
-
-       vgic->vctrl_base = ioremap(gic_kvm_info->vctrl.start,
-                                  resource_size(&gic_kvm_info->vctrl));
-       if (!vgic->vctrl_base) {
-               kvm_err("Cannot ioremap GICH\n");
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       vgic->nr_lr = readl_relaxed(vgic->vctrl_base + GICH_VTR);
-       vgic->nr_lr = (vgic->nr_lr & 0x3f) + 1;
-
-       ret = create_hyp_io_mappings(vgic->vctrl_base,
-                                    vgic->vctrl_base + resource_size(vctrl_res),
-                                    vctrl_res->start);
-       if (ret) {
-               kvm_err("Cannot map VCTRL into hyp\n");
-               goto out_unmap;
-       }
-
-       if (!PAGE_ALIGNED(vcpu_res->start)) {
-               kvm_err("GICV physical address 0x%llx not page aligned\n",
-                       (unsigned long long)vcpu_res->start);
-               ret = -ENXIO;
-               goto out_unmap;
-       }
-
-       if (!PAGE_ALIGNED(resource_size(vcpu_res))) {
-               kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n",
-                       (unsigned long long)resource_size(vcpu_res),
-                       PAGE_SIZE);
-               ret = -ENXIO;
-               goto out_unmap;
-       }
-
-       vgic->can_emulate_gicv2 = true;
-       kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2);
-
-       vgic->vcpu_base = vcpu_res->start;
-
-       kvm_info("GICH base=0x%llx, GICV base=0x%llx, IRQ=%d\n",
-                gic_kvm_info->vctrl.start, vgic->vcpu_base, vgic->maint_irq);
-
-       vgic->type = VGIC_V2;
-       vgic->max_gic_vcpus = VGIC_V2_MAX_CPUS;
-
-       on_each_cpu(vgic_cpu_init_lrs, vgic, 1);
-
-       *ops = &vgic_v2_ops;
-       *params = vgic;
-       goto out;
-
-out_unmap:
-       iounmap(vgic->vctrl_base);
-out:
-       return ret;
-}
diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
deleted file mode 100644 (file)
index e661e7f..0000000
+++ /dev/null
@@ -1,1074 +0,0 @@
-/*
- * GICv3 distributor and redistributor emulation
- *
- * GICv3 emulation is currently only supported on a GICv3 host (because
- * we rely on the hardware's CPU interface virtualization support), but
- * supports both hardware with or without the optional GICv2 backwards
- * compatibility features.
- *
- * Limitations of the emulation:
- * (RAZ/WI: read as zero, write ignore, RAO/WI: read as one, write ignore)
- * - We do not support LPIs (yet). TYPER.LPIS is reported as 0 and is RAZ/WI.
- * - We do not support the message based interrupts (MBIs) triggered by
- *   writes to the GICD_{SET,CLR}SPI_* registers. TYPER.MBIS is reported as 0.
- * - We do not support the (optional) backwards compatibility feature.
- *   GICD_CTLR.ARE resets to 1 and is RAO/WI. If the _host_ GIC supports
- *   the compatiblity feature, you can use a GICv2 in the guest, though.
- * - We only support a single security state. GICD_CTLR.DS is 1 and is RAO/WI.
- * - Priorities are not emulated (same as the GICv2 emulation). Linux
- *   as a guest is fine with this, because it does not use priorities.
- * - We only support Group1 interrupts. Again Linux uses only those.
- *
- * Copyright (C) 2014 ARM Ltd.
- * Author: Andre Przywara <andre.przywara@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-
-#include <linux/irqchip/arm-gic-v3.h>
-#include <kvm/arm_vgic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-
-static bool handle_mmio_rao_wi(struct kvm_vcpu *vcpu,
-                              struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       u32 reg = 0xffffffff;
-
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
-       return false;
-}
-
-static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu,
-                            struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       u32 reg = 0;
-
-       /*
-        * Force ARE and DS to 1, the guest cannot change this.
-        * For the time being we only support Group1 interrupts.
-        */
-       if (vcpu->kvm->arch.vgic.enabled)
-               reg = GICD_CTLR_ENABLE_SS_G1;
-       reg |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
-
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-       if (mmio->is_write) {
-               vcpu->kvm->arch.vgic.enabled = !!(reg & GICD_CTLR_ENABLE_SS_G1);
-               vgic_update_state(vcpu->kvm);
-               return true;
-       }
-       return false;
-}
-
-/*
- * As this implementation does not provide compatibility
- * with GICv2 (ARE==1), we report zero CPUs in bits [5..7].
- * Also LPIs and MBIs are not supported, so we set the respective bits to 0.
- * Also we report at most 2**10=1024 interrupt IDs (to match 1024 SPIs).
- */
-#define INTERRUPT_ID_BITS 10
-static bool handle_mmio_typer(struct kvm_vcpu *vcpu,
-                             struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       u32 reg;
-
-       reg = (min(vcpu->kvm->arch.vgic.nr_irqs, 1024) >> 5) - 1;
-
-       reg |= (INTERRUPT_ID_BITS - 1) << 19;
-
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
-       return false;
-}
-
-static bool handle_mmio_iidr(struct kvm_vcpu *vcpu,
-                            struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-       u32 reg;
-
-       reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
-       return false;
-}
-
-static bool handle_mmio_set_enable_reg_dist(struct kvm_vcpu *vcpu,
-                                           struct kvm_exit_mmio *mmio,
-                                           phys_addr_t offset)
-{
-       if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-               return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-                                             vcpu->vcpu_id,
-                                             ACCESS_WRITE_SETBIT);
-
-       vgic_reg_access(mmio, NULL, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static bool handle_mmio_clear_enable_reg_dist(struct kvm_vcpu *vcpu,
-                                             struct kvm_exit_mmio *mmio,
-                                             phys_addr_t offset)
-{
-       if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-               return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-                                             vcpu->vcpu_id,
-                                             ACCESS_WRITE_CLEARBIT);
-
-       vgic_reg_access(mmio, NULL, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static bool handle_mmio_set_pending_reg_dist(struct kvm_vcpu *vcpu,
-                                            struct kvm_exit_mmio *mmio,
-                                            phys_addr_t offset)
-{
-       if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-               return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
-                                                  vcpu->vcpu_id);
-
-       vgic_reg_access(mmio, NULL, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static bool handle_mmio_clear_pending_reg_dist(struct kvm_vcpu *vcpu,
-                                              struct kvm_exit_mmio *mmio,
-                                              phys_addr_t offset)
-{
-       if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-               return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
-                                                    vcpu->vcpu_id);
-
-       vgic_reg_access(mmio, NULL, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static bool handle_mmio_set_active_reg_dist(struct kvm_vcpu *vcpu,
-                                           struct kvm_exit_mmio *mmio,
-                                           phys_addr_t offset)
-{
-       if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-               return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
-                                                  vcpu->vcpu_id);
-
-       vgic_reg_access(mmio, NULL, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static bool handle_mmio_clear_active_reg_dist(struct kvm_vcpu *vcpu,
-                                             struct kvm_exit_mmio *mmio,
-                                             phys_addr_t offset)
-{
-       if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-               return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
-                                                   vcpu->vcpu_id);
-
-       vgic_reg_access(mmio, NULL, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu,
-                                         struct kvm_exit_mmio *mmio,
-                                         phys_addr_t offset)
-{
-       u32 *reg;
-
-       if (unlikely(offset < VGIC_NR_PRIVATE_IRQS)) {
-               vgic_reg_access(mmio, NULL, offset,
-                               ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-               return false;
-       }
-
-       reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
-                                  vcpu->vcpu_id, offset);
-       vgic_reg_access(mmio, reg, offset,
-               ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-       return false;
-}
-
-static bool handle_mmio_cfg_reg_dist(struct kvm_vcpu *vcpu,
-                                    struct kvm_exit_mmio *mmio,
-                                    phys_addr_t offset)
-{
-       u32 *reg;
-
-       if (unlikely(offset < VGIC_NR_PRIVATE_IRQS / 4)) {
-               vgic_reg_access(mmio, NULL, offset,
-                               ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-               return false;
-       }
-
-       reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
-                                 vcpu->vcpu_id, offset >> 1);
-
-       return vgic_handle_cfg_reg(reg, mmio, offset);
-}
-
-/*
- * We use a compressed version of the MPIDR (all 32 bits in one 32-bit word)
- * when we store the target MPIDR written by the guest.
- */
-static u32 compress_mpidr(unsigned long mpidr)
-{
-       u32 ret;
-
-       ret = MPIDR_AFFINITY_LEVEL(mpidr, 0);
-       ret |= MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8;
-       ret |= MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16;
-       ret |= MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24;
-
-       return ret;
-}
-
-static unsigned long uncompress_mpidr(u32 value)
-{
-       unsigned long mpidr;
-
-       mpidr  = ((value >>  0) & 0xFF) << MPIDR_LEVEL_SHIFT(0);
-       mpidr |= ((value >>  8) & 0xFF) << MPIDR_LEVEL_SHIFT(1);
-       mpidr |= ((value >> 16) & 0xFF) << MPIDR_LEVEL_SHIFT(2);
-       mpidr |= (u64)((value >> 24) & 0xFF) << MPIDR_LEVEL_SHIFT(3);
-
-       return mpidr;
-}
-
-/*
- * Lookup the given MPIDR value to get the vcpu_id (if there is one)
- * and store that in the irq_spi_cpu[] array.
- * This limits the number of VCPUs to 255 for now, extending the data
- * type (or storing kvm_vcpu pointers) should lift the limit.
- * Store the original MPIDR value in an extra array to support read-as-written.
- * Unallocated MPIDRs are translated to a special value and caught
- * before any array accesses.
- */
-static bool handle_mmio_route_reg(struct kvm_vcpu *vcpu,
-                                 struct kvm_exit_mmio *mmio,
-                                 phys_addr_t offset)
-{
-       struct kvm *kvm = vcpu->kvm;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       int spi;
-       u32 reg;
-       int vcpu_id;
-       unsigned long *bmap, mpidr;
-
-       /*
-        * The upper 32 bits of each 64 bit register are zero,
-        * as we don't support Aff3.
-        */
-       if ((offset & 4)) {
-               vgic_reg_access(mmio, NULL, offset,
-                               ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-               return false;
-       }
-
-       /* This region only covers SPIs, so no handling of private IRQs here. */
-       spi = offset / 8;
-
-       /* get the stored MPIDR for this IRQ */
-       mpidr = uncompress_mpidr(dist->irq_spi_mpidr[spi]);
-       reg = mpidr;
-
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-
-       if (!mmio->is_write)
-               return false;
-
-       /*
-        * Now clear the currently assigned vCPU from the map, making room
-        * for the new one to be written below
-        */
-       vcpu = kvm_mpidr_to_vcpu(kvm, mpidr);
-       if (likely(vcpu)) {
-               vcpu_id = vcpu->vcpu_id;
-               bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
-               __clear_bit(spi, bmap);
-       }
-
-       dist->irq_spi_mpidr[spi] = compress_mpidr(reg);
-       vcpu = kvm_mpidr_to_vcpu(kvm, reg & MPIDR_HWID_BITMASK);
-
-       /*
-        * The spec says that non-existent MPIDR values should not be
-        * forwarded to any existent (v)CPU, but should be able to become
-        * pending anyway. We simply keep the irq_spi_target[] array empty, so
-        * the interrupt will never be injected.
-        * irq_spi_cpu[irq] gets a magic value in this case.
-        */
-       if (likely(vcpu)) {
-               vcpu_id = vcpu->vcpu_id;
-               dist->irq_spi_cpu[spi] = vcpu_id;
-               bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
-               __set_bit(spi, bmap);
-       } else {
-               dist->irq_spi_cpu[spi] = VCPU_NOT_ALLOCATED;
-       }
-
-       vgic_update_state(kvm);
-
-       return true;
-}
-
-/*
- * We should be careful about promising too much when a guest reads
- * this register. Don't claim to be like any hardware implementation,
- * but just report the GIC as version 3 - which is what a Linux guest
- * would check.
- */
-static bool handle_mmio_idregs(struct kvm_vcpu *vcpu,
-                              struct kvm_exit_mmio *mmio,
-                              phys_addr_t offset)
-{
-       u32 reg = 0;
-
-       switch (offset + GICD_IDREGS) {
-       case GICD_PIDR2:
-               reg = 0x3b;
-               break;
-       }
-
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
-       return false;
-}
-
-static const struct vgic_io_range vgic_v3_dist_ranges[] = {
-       {
-               .base           = GICD_CTLR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_ctlr,
-       },
-       {
-               .base           = GICD_TYPER,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_typer,
-       },
-       {
-               .base           = GICD_IIDR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_iidr,
-       },
-       {
-               /* this register is optional, it is RAZ/WI if not implemented */
-               .base           = GICD_STATUSR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               /* this write only register is WI when TYPER.MBIS=0 */
-               .base           = GICD_SETSPI_NSR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               /* this write only register is WI when TYPER.MBIS=0 */
-               .base           = GICD_CLRSPI_NSR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               /* this is RAZ/WI when DS=1 */
-               .base           = GICD_SETSPI_SR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               /* this is RAZ/WI when DS=1 */
-               .base           = GICD_CLRSPI_SR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               .base           = GICD_IGROUPR,
-               .len            = 0x80,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_rao_wi,
-       },
-       {
-               .base           = GICD_ISENABLER,
-               .len            = 0x80,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_enable_reg_dist,
-       },
-       {
-               .base           = GICD_ICENABLER,
-               .len            = 0x80,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_enable_reg_dist,
-       },
-       {
-               .base           = GICD_ISPENDR,
-               .len            = 0x80,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_pending_reg_dist,
-       },
-       {
-               .base           = GICD_ICPENDR,
-               .len            = 0x80,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_pending_reg_dist,
-       },
-       {
-               .base           = GICD_ISACTIVER,
-               .len            = 0x80,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_active_reg_dist,
-       },
-       {
-               .base           = GICD_ICACTIVER,
-               .len            = 0x80,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_active_reg_dist,
-       },
-       {
-               .base           = GICD_IPRIORITYR,
-               .len            = 0x400,
-               .bits_per_irq   = 8,
-               .handle_mmio    = handle_mmio_priority_reg_dist,
-       },
-       {
-               /* TARGETSRn is RES0 when ARE=1 */
-               .base           = GICD_ITARGETSR,
-               .len            = 0x400,
-               .bits_per_irq   = 8,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               .base           = GICD_ICFGR,
-               .len            = 0x100,
-               .bits_per_irq   = 2,
-               .handle_mmio    = handle_mmio_cfg_reg_dist,
-       },
-       {
-               /* this is RAZ/WI when DS=1 */
-               .base           = GICD_IGRPMODR,
-               .len            = 0x80,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               /* this is RAZ/WI when DS=1 */
-               .base           = GICD_NSACR,
-               .len            = 0x100,
-               .bits_per_irq   = 2,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               /* this is RAZ/WI when ARE=1 */
-               .base           = GICD_SGIR,
-               .len            = 0x04,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               /* this is RAZ/WI when ARE=1 */
-               .base           = GICD_CPENDSGIR,
-               .len            = 0x10,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               /* this is RAZ/WI when ARE=1 */
-               .base           = GICD_SPENDSGIR,
-               .len            = 0x10,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               .base           = GICD_IROUTER + 0x100,
-               .len            = 0x1ee0,
-               .bits_per_irq   = 64,
-               .handle_mmio    = handle_mmio_route_reg,
-       },
-       {
-               .base           = GICD_IDREGS,
-               .len            = 0x30,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_idregs,
-       },
-       {},
-};
-
-static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu,
-                                   struct kvm_exit_mmio *mmio,
-                                   phys_addr_t offset)
-{
-       /* since we don't support LPIs, this register is zero for now */
-       vgic_reg_access(mmio, NULL, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static bool handle_mmio_typer_redist(struct kvm_vcpu *vcpu,
-                                    struct kvm_exit_mmio *mmio,
-                                    phys_addr_t offset)
-{
-       u32 reg;
-       u64 mpidr;
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-       int target_vcpu_id = redist_vcpu->vcpu_id;
-
-       /* the upper 32 bits contain the affinity value */
-       if ((offset & ~3) == 4) {
-               mpidr = kvm_vcpu_get_mpidr_aff(redist_vcpu);
-               reg = compress_mpidr(mpidr);
-
-               vgic_reg_access(mmio, &reg, offset,
-                               ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-               return false;
-       }
-
-       reg = redist_vcpu->vcpu_id << 8;
-       if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
-               reg |= GICR_TYPER_LAST;
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static bool handle_mmio_set_enable_reg_redist(struct kvm_vcpu *vcpu,
-                                             struct kvm_exit_mmio *mmio,
-                                             phys_addr_t offset)
-{
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-
-       return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-                                     redist_vcpu->vcpu_id,
-                                     ACCESS_WRITE_SETBIT);
-}
-
-static bool handle_mmio_clear_enable_reg_redist(struct kvm_vcpu *vcpu,
-                                               struct kvm_exit_mmio *mmio,
-                                               phys_addr_t offset)
-{
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-
-       return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-                                     redist_vcpu->vcpu_id,
-                                     ACCESS_WRITE_CLEARBIT);
-}
-
-static bool handle_mmio_set_active_reg_redist(struct kvm_vcpu *vcpu,
-                                             struct kvm_exit_mmio *mmio,
-                                             phys_addr_t offset)
-{
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-
-       return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
-                                         redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_active_reg_redist(struct kvm_vcpu *vcpu,
-                                               struct kvm_exit_mmio *mmio,
-                                               phys_addr_t offset)
-{
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-
-       return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
-                                            redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu,
-                                              struct kvm_exit_mmio *mmio,
-                                              phys_addr_t offset)
-{
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-
-       return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
-                                          redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_pending_reg_redist(struct kvm_vcpu *vcpu,
-                                                struct kvm_exit_mmio *mmio,
-                                                phys_addr_t offset)
-{
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-
-       return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
-                                            redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_priority_reg_redist(struct kvm_vcpu *vcpu,
-                                           struct kvm_exit_mmio *mmio,
-                                           phys_addr_t offset)
-{
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-       u32 *reg;
-
-       reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
-                                  redist_vcpu->vcpu_id, offset);
-       vgic_reg_access(mmio, reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-       return false;
-}
-
-static bool handle_mmio_cfg_reg_redist(struct kvm_vcpu *vcpu,
-                                      struct kvm_exit_mmio *mmio,
-                                      phys_addr_t offset)
-{
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-
-       u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
-                                      redist_vcpu->vcpu_id, offset >> 1);
-
-       return vgic_handle_cfg_reg(reg, mmio, offset);
-}
-
-#define SGI_base(x) ((x) + SZ_64K)
-
-static const struct vgic_io_range vgic_redist_ranges[] = {
-       {
-               .base           = GICR_CTLR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_ctlr_redist,
-       },
-       {
-               .base           = GICR_TYPER,
-               .len            = 0x08,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_typer_redist,
-       },
-       {
-               .base           = GICR_IIDR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_iidr,
-       },
-       {
-               .base           = GICR_WAKER,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               .base           = GICR_IDREGS,
-               .len            = 0x30,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_idregs,
-       },
-       {
-               .base           = SGI_base(GICR_IGROUPR0),
-               .len            = 0x04,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_rao_wi,
-       },
-       {
-               .base           = SGI_base(GICR_ISENABLER0),
-               .len            = 0x04,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_enable_reg_redist,
-       },
-       {
-               .base           = SGI_base(GICR_ICENABLER0),
-               .len            = 0x04,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_enable_reg_redist,
-       },
-       {
-               .base           = SGI_base(GICR_ISPENDR0),
-               .len            = 0x04,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_pending_reg_redist,
-       },
-       {
-               .base           = SGI_base(GICR_ICPENDR0),
-               .len            = 0x04,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_pending_reg_redist,
-       },
-       {
-               .base           = SGI_base(GICR_ISACTIVER0),
-               .len            = 0x04,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_set_active_reg_redist,
-       },
-       {
-               .base           = SGI_base(GICR_ICACTIVER0),
-               .len            = 0x04,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_clear_active_reg_redist,
-       },
-       {
-               .base           = SGI_base(GICR_IPRIORITYR0),
-               .len            = 0x20,
-               .bits_per_irq   = 8,
-               .handle_mmio    = handle_mmio_priority_reg_redist,
-       },
-       {
-               .base           = SGI_base(GICR_ICFGR0),
-               .len            = 0x08,
-               .bits_per_irq   = 2,
-               .handle_mmio    = handle_mmio_cfg_reg_redist,
-       },
-       {
-               .base           = SGI_base(GICR_IGRPMODR0),
-               .len            = 0x04,
-               .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               .base           = SGI_base(GICR_NSACR),
-               .len            = 0x04,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {},
-};
-
-static bool vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, int irq)
-{
-       if (vgic_queue_irq(vcpu, 0, irq)) {
-               vgic_dist_irq_clear_pending(vcpu, irq);
-               vgic_cpu_irq_clear(vcpu, irq);
-               return true;
-       }
-
-       return false;
-}
-
-static int vgic_v3_map_resources(struct kvm *kvm,
-                                const struct vgic_params *params)
-{
-       int ret = 0;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       gpa_t rdbase = dist->vgic_redist_base;
-       struct vgic_io_device *iodevs = NULL;
-       int i;
-
-       if (!irqchip_in_kernel(kvm))
-               return 0;
-
-       mutex_lock(&kvm->lock);
-
-       if (vgic_ready(kvm))
-               goto out;
-
-       if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
-           IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
-               kvm_err("Need to set vgic distributor addresses first\n");
-               ret = -ENXIO;
-               goto out;
-       }
-
-       /*
-        * For a VGICv3 we require the userland to explicitly initialize
-        * the VGIC before we need to use it.
-        */
-       if (!vgic_initialized(kvm)) {
-               ret = -EBUSY;
-               goto out;
-       }
-
-       ret = vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base,
-                                      GIC_V3_DIST_SIZE, vgic_v3_dist_ranges,
-                                      -1, &dist->dist_iodev);
-       if (ret)
-               goto out;
-
-       iodevs = kcalloc(dist->nr_cpus, sizeof(iodevs[0]), GFP_KERNEL);
-       if (!iodevs) {
-               ret = -ENOMEM;
-               goto out_unregister;
-       }
-
-       for (i = 0; i < dist->nr_cpus; i++) {
-               ret = vgic_register_kvm_io_dev(kvm, rdbase,
-                                              SZ_128K, vgic_redist_ranges,
-                                              i, &iodevs[i]);
-               if (ret)
-                       goto out_unregister;
-               rdbase += GIC_V3_REDIST_SIZE;
-       }
-
-       dist->redist_iodevs = iodevs;
-       dist->ready = true;
-       goto out;
-
-out_unregister:
-       kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev);
-       if (iodevs) {
-               for (i = 0; i < dist->nr_cpus; i++) {
-                       if (iodevs[i].dev.ops)
-                               kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
-                                                         &iodevs[i].dev);
-               }
-       }
-
-out:
-       if (ret)
-               kvm_vgic_destroy(kvm);
-       mutex_unlock(&kvm->lock);
-       return ret;
-}
-
-static int vgic_v3_init_model(struct kvm *kvm)
-{
-       int i;
-       u32 mpidr;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       int nr_spis = dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
-
-       dist->irq_spi_mpidr = kcalloc(nr_spis, sizeof(dist->irq_spi_mpidr[0]),
-                                     GFP_KERNEL);
-
-       if (!dist->irq_spi_mpidr)
-               return -ENOMEM;
-
-       /* Initialize the target VCPUs for each IRQ to VCPU 0 */
-       mpidr = compress_mpidr(kvm_vcpu_get_mpidr_aff(kvm_get_vcpu(kvm, 0)));
-       for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i++) {
-               dist->irq_spi_cpu[i - VGIC_NR_PRIVATE_IRQS] = 0;
-               dist->irq_spi_mpidr[i - VGIC_NR_PRIVATE_IRQS] = mpidr;
-               vgic_bitmap_set_irq_val(dist->irq_spi_target, 0, i, 1);
-       }
-
-       return 0;
-}
-
-/* GICv3 does not keep track of SGI sources anymore. */
-static void vgic_v3_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
-{
-}
-
-void vgic_v3_init_emulation(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-
-       dist->vm_ops.queue_sgi = vgic_v3_queue_sgi;
-       dist->vm_ops.add_sgi_source = vgic_v3_add_sgi_source;
-       dist->vm_ops.init_model = vgic_v3_init_model;
-       dist->vm_ops.map_resources = vgic_v3_map_resources;
-
-       kvm->arch.max_vcpus = KVM_MAX_VCPUS;
-}
-
-/*
- * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI
- * generation register ICC_SGI1R_EL1) with a given VCPU.
- * If the VCPU's MPIDR matches, return the level0 affinity, otherwise
- * return -1.
- */
-static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
-{
-       unsigned long affinity;
-       int level0;
-
-       /*
-        * Split the current VCPU's MPIDR into affinity level 0 and the
-        * rest as this is what we have to compare against.
-        */
-       affinity = kvm_vcpu_get_mpidr_aff(vcpu);
-       level0 = MPIDR_AFFINITY_LEVEL(affinity, 0);
-       affinity &= ~MPIDR_LEVEL_MASK;
-
-       /* bail out if the upper three levels don't match */
-       if (sgi_aff != affinity)
-               return -1;
-
-       /* Is this VCPU's bit set in the mask ? */
-       if (!(sgi_cpu_mask & BIT(level0)))
-               return -1;
-
-       return level0;
-}
-
-#define SGI_AFFINITY_LEVEL(reg, level) \
-       ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
-       >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
-
-/**
- * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
- * @vcpu: The VCPU requesting a SGI
- * @reg: The value written into the ICC_SGI1R_EL1 register by that VCPU
- *
- * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register.
- * This will trap in sys_regs.c and call this function.
- * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the
- * target processors as well as a bitmask of 16 Aff0 CPUs.
- * If the interrupt routing mode bit is not set, we iterate over all VCPUs to
- * check for matching ones. If this bit is set, we signal all, but not the
- * calling VCPU.
- */
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
-{
-       struct kvm *kvm = vcpu->kvm;
-       struct kvm_vcpu *c_vcpu;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       u16 target_cpus;
-       u64 mpidr;
-       int sgi, c;
-       int vcpu_id = vcpu->vcpu_id;
-       bool broadcast;
-       int updated = 0;
-
-       sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
-       broadcast = reg & BIT(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
-       target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
-       mpidr = SGI_AFFINITY_LEVEL(reg, 3);
-       mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
-       mpidr |= SGI_AFFINITY_LEVEL(reg, 1);
-
-       /*
-        * We take the dist lock here, because we come from the sysregs
-        * code path and not from the MMIO one (which already takes the lock).
-        */
-       spin_lock(&dist->lock);
-
-       /*
-        * We iterate over all VCPUs to find the MPIDRs matching the request.
-        * If we have handled one CPU, we clear it's bit to detect early
-        * if we are already finished. This avoids iterating through all
-        * VCPUs when most of the times we just signal a single VCPU.
-        */
-       kvm_for_each_vcpu(c, c_vcpu, kvm) {
-
-               /* Exit early if we have dealt with all requested CPUs */
-               if (!broadcast && target_cpus == 0)
-                       break;
-
-                /* Don't signal the calling VCPU */
-               if (broadcast && c == vcpu_id)
-                       continue;
-
-               if (!broadcast) {
-                       int level0;
-
-                       level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
-                       if (level0 == -1)
-                               continue;
-
-                       /* remove this matching VCPU from the mask */
-                       target_cpus &= ~BIT(level0);
-               }
-
-               /* Flag the SGI as pending */
-               vgic_dist_irq_set_pending(c_vcpu, sgi);
-               updated = 1;
-               kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c);
-       }
-       if (updated)
-               vgic_update_state(vcpu->kvm);
-       spin_unlock(&dist->lock);
-       if (updated)
-               vgic_kick_vcpus(vcpu->kvm);
-}
-
-static int vgic_v3_create(struct kvm_device *dev, u32 type)
-{
-       return kvm_vgic_create(dev->kvm, type);
-}
-
-static void vgic_v3_destroy(struct kvm_device *dev)
-{
-       kfree(dev);
-}
-
-static int vgic_v3_set_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       int ret;
-
-       ret = vgic_set_common_attr(dev, attr);
-       if (ret != -ENXIO)
-               return ret;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-               return -ENXIO;
-       }
-
-       return -ENXIO;
-}
-
-static int vgic_v3_get_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       int ret;
-
-       ret = vgic_get_common_attr(dev, attr);
-       if (ret != -ENXIO)
-               return ret;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-               return -ENXIO;
-       }
-
-       return -ENXIO;
-}
-
-static int vgic_v3_has_attr(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
-{
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_ADDR:
-               switch (attr->attr) {
-               case KVM_VGIC_V2_ADDR_TYPE_DIST:
-               case KVM_VGIC_V2_ADDR_TYPE_CPU:
-                       return -ENXIO;
-               case KVM_VGIC_V3_ADDR_TYPE_DIST:
-               case KVM_VGIC_V3_ADDR_TYPE_REDIST:
-                       return 0;
-               }
-               break;
-       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-               return -ENXIO;
-       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
-               return 0;
-       case KVM_DEV_ARM_VGIC_GRP_CTRL:
-               switch (attr->attr) {
-               case KVM_DEV_ARM_VGIC_CTRL_INIT:
-                       return 0;
-               }
-       }
-       return -ENXIO;
-}
-
-struct kvm_device_ops kvm_arm_vgic_v3_ops = {
-       .name = "kvm-arm-vgic-v3",
-       .create = vgic_v3_create,
-       .destroy = vgic_v3_destroy,
-       .set_attr = vgic_v3_set_attr,
-       .get_attr = vgic_v3_get_attr,
-       .has_attr = vgic_v3_has_attr,
-};
diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
deleted file mode 100644 (file)
index 75b02fa..0000000
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * Copyright (C) 2013 ARM Limited, All Rights Reserved.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-
-#include <linux/irqchip/arm-gic-v3.h>
-#include <linux/irqchip/arm-gic-common.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
-
-static u32 ich_vtr_el2;
-
-static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
-{
-       struct vgic_lr lr_desc;
-       u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr];
-
-       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
-               lr_desc.irq = val & ICH_LR_VIRTUAL_ID_MASK;
-       else
-               lr_desc.irq = val & GICH_LR_VIRTUALID;
-
-       lr_desc.source = 0;
-       if (lr_desc.irq <= 15 &&
-           vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
-               lr_desc.source = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7;
-
-       lr_desc.state = 0;
-
-       if (val & ICH_LR_PENDING_BIT)
-               lr_desc.state |= LR_STATE_PENDING;
-       if (val & ICH_LR_ACTIVE_BIT)
-               lr_desc.state |= LR_STATE_ACTIVE;
-       if (val & ICH_LR_EOI)
-               lr_desc.state |= LR_EOI_INT;
-       if (val & ICH_LR_HW) {
-               lr_desc.state |= LR_HW;
-               lr_desc.hwirq = (val >> ICH_LR_PHYS_ID_SHIFT) & GENMASK(9, 0);
-       }
-
-       return lr_desc;
-}
-
-static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
-                          struct vgic_lr lr_desc)
-{
-       u64 lr_val;
-
-       lr_val = lr_desc.irq;
-
-       /*
-        * Currently all guest IRQs are Group1, as Group0 would result
-        * in a FIQ in the guest, which it wouldn't expect.
-        * Eventually we want to make this configurable, so we may revisit
-        * this in the future.
-        */
-       switch (vcpu->kvm->arch.vgic.vgic_model) {
-       case KVM_DEV_TYPE_ARM_VGIC_V3:
-               lr_val |= ICH_LR_GROUP;
-               break;
-       case  KVM_DEV_TYPE_ARM_VGIC_V2:
-               if (lr_desc.irq < VGIC_NR_SGIS)
-                       lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT;
-               break;
-       default:
-               BUG();
-       }
-
-       if (lr_desc.state & LR_STATE_PENDING)
-               lr_val |= ICH_LR_PENDING_BIT;
-       if (lr_desc.state & LR_STATE_ACTIVE)
-               lr_val |= ICH_LR_ACTIVE_BIT;
-       if (lr_desc.state & LR_EOI_INT)
-               lr_val |= ICH_LR_EOI;
-       if (lr_desc.state & LR_HW) {
-               lr_val |= ICH_LR_HW;
-               lr_val |= ((u64)lr_desc.hwirq) << ICH_LR_PHYS_ID_SHIFT;
-       }
-
-       vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = lr_val;
-
-       if (!(lr_desc.state & LR_STATE_MASK))
-               vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr);
-       else
-               vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr &= ~(1U << lr);
-}
-
-static u64 vgic_v3_get_elrsr(const struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr;
-}
-
-static u64 vgic_v3_get_eisr(const struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr;
-}
-
-static void vgic_v3_clear_eisr(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr = 0;
-}
-
-static u32 vgic_v3_get_interrupt_status(const struct kvm_vcpu *vcpu)
-{
-       u32 misr = vcpu->arch.vgic_cpu.vgic_v3.vgic_misr;
-       u32 ret = 0;
-
-       if (misr & ICH_MISR_EOI)
-               ret |= INT_STATUS_EOI;
-       if (misr & ICH_MISR_U)
-               ret |= INT_STATUS_UNDERFLOW;
-
-       return ret;
-}
-
-static void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-       u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr;
-
-       vmcrp->ctlr = (vmcr & ICH_VMCR_CTLR_MASK) >> ICH_VMCR_CTLR_SHIFT;
-       vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
-       vmcrp->bpr  = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
-       vmcrp->pmr  = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
-}
-
-static void vgic_v3_enable_underflow(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr |= ICH_HCR_UIE;
-}
-
-static void vgic_v3_disable_underflow(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr &= ~ICH_HCR_UIE;
-}
-
-static void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-       u32 vmcr;
-
-       vmcr  = (vmcrp->ctlr << ICH_VMCR_CTLR_SHIFT) & ICH_VMCR_CTLR_MASK;
-       vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
-       vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
-       vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
-
-       vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr;
-}
-
-static void vgic_v3_enable(struct kvm_vcpu *vcpu)
-{
-       struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
-
-       /*
-        * By forcing VMCR to zero, the GIC will restore the binary
-        * points to their reset values. Anything else resets to zero
-        * anyway.
-        */
-       vgic_v3->vgic_vmcr = 0;
-       vgic_v3->vgic_elrsr = ~0;
-
-       /*
-        * If we are emulating a GICv3, we do it in an non-GICv2-compatible
-        * way, so we force SRE to 1 to demonstrate this to the guest.
-        * This goes with the spec allowing the value to be RAO/WI.
-        */
-       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
-               vgic_v3->vgic_sre = ICC_SRE_EL1_SRE;
-       else
-               vgic_v3->vgic_sre = 0;
-
-       /* Get the show on the road... */
-       vgic_v3->vgic_hcr = ICH_HCR_EN;
-}
-
-static const struct vgic_ops vgic_v3_ops = {
-       .get_lr                 = vgic_v3_get_lr,
-       .set_lr                 = vgic_v3_set_lr,
-       .get_elrsr              = vgic_v3_get_elrsr,
-       .get_eisr               = vgic_v3_get_eisr,
-       .clear_eisr             = vgic_v3_clear_eisr,
-       .get_interrupt_status   = vgic_v3_get_interrupt_status,
-       .enable_underflow       = vgic_v3_enable_underflow,
-       .disable_underflow      = vgic_v3_disable_underflow,
-       .get_vmcr               = vgic_v3_get_vmcr,
-       .set_vmcr               = vgic_v3_set_vmcr,
-       .enable                 = vgic_v3_enable,
-};
-
-static struct vgic_params vgic_v3_params;
-
-static void vgic_cpu_init_lrs(void *params)
-{
-       kvm_call_hyp(__vgic_v3_init_lrs);
-}
-
-/**
- * vgic_v3_probe - probe for a GICv3 compatible interrupt controller
- * @gic_kvm_info:      pointer to the GIC description
- * @ops:               address of a pointer to the GICv3 operations
- * @params:            address of a pointer to HW-specific parameters
- *
- * Returns 0 if a GICv3 has been found, with the low level operations
- * in *ops and the HW parameters in *params. Returns an error code
- * otherwise.
- */
-int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
-                 const struct vgic_ops **ops,
-                 const struct vgic_params **params)
-{
-       int ret = 0;
-       struct vgic_params *vgic = &vgic_v3_params;
-       const struct resource *vcpu_res = &gic_kvm_info->vcpu;
-
-       vgic->maint_irq = gic_kvm_info->maint_irq;
-
-       ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2);
-
-       /*
-        * The ListRegs field is 5 bits, but there is a architectural
-        * maximum of 16 list registers. Just ignore bit 4...
-        */
-       vgic->nr_lr = (ich_vtr_el2 & 0xf) + 1;
-       vgic->can_emulate_gicv2 = false;
-
-       if (!vcpu_res->start) {
-               kvm_info("GICv3: no GICV resource entry\n");
-               vgic->vcpu_base = 0;
-       } else if (!PAGE_ALIGNED(vcpu_res->start)) {
-               pr_warn("GICV physical address 0x%llx not page aligned\n",
-                       (unsigned long long)vcpu_res->start);
-               vgic->vcpu_base = 0;
-       } else if (!PAGE_ALIGNED(resource_size(vcpu_res))) {
-               pr_warn("GICV size 0x%llx not a multiple of page size 0x%lx\n",
-                       (unsigned long long)resource_size(vcpu_res),
-                       PAGE_SIZE);
-       } else {
-               vgic->vcpu_base = vcpu_res->start;
-               vgic->can_emulate_gicv2 = true;
-               kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
-                                       KVM_DEV_TYPE_ARM_VGIC_V2);
-       }
-       if (vgic->vcpu_base == 0)
-               kvm_info("disabling GICv2 emulation\n");
-       kvm_register_device_ops(&kvm_arm_vgic_v3_ops, KVM_DEV_TYPE_ARM_VGIC_V3);
-
-       vgic->vctrl_base = NULL;
-       vgic->type = VGIC_V3;
-       vgic->max_gic_vcpus = VGIC_V3_MAX_CPUS;
-
-       kvm_info("GICV base=0x%llx, IRQ=%d\n",
-                vgic->vcpu_base, vgic->maint_irq);
-
-       on_each_cpu(vgic_cpu_init_lrs, vgic, 1);
-
-       *ops = &vgic_v3_ops;
-       *params = vgic;
-
-       return ret;
-}
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
deleted file mode 100644 (file)
index 67cb5e9..0000000
+++ /dev/null
@@ -1,2417 +0,0 @@
-/*
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/irq.h>
-#include <linux/rculist.h>
-#include <linux/uaccess.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-#include <trace/events/kvm.h>
-#include <asm/kvm.h>
-#include <kvm/iodev.h>
-#include <linux/irqchip/arm-gic-common.h>
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-/*
- * How the whole thing works (courtesy of Christoffer Dall):
- *
- * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if
- *   something is pending on the CPU interface.
- * - Interrupts that are pending on the distributor are stored on the
- *   vgic.irq_pending vgic bitmap (this bitmap is updated by both user land
- *   ioctls and guest mmio ops, and other in-kernel peripherals such as the
- *   arch. timers).
- * - Every time the bitmap changes, the irq_pending_on_cpu oracle is
- *   recalculated
- * - To calculate the oracle, we need info for each cpu from
- *   compute_pending_for_cpu, which considers:
- *   - PPI: dist->irq_pending & dist->irq_enable
- *   - SPI: dist->irq_pending & dist->irq_enable & dist->irq_spi_target
- *   - irq_spi_target is a 'formatted' version of the GICD_ITARGETSRn
- *     registers, stored on each vcpu. We only keep one bit of
- *     information per interrupt, making sure that only one vcpu can
- *     accept the interrupt.
- * - If any of the above state changes, we must recalculate the oracle.
- * - The same is true when injecting an interrupt, except that we only
- *   consider a single interrupt at a time. The irq_spi_cpu array
- *   contains the target CPU for each SPI.
- *
- * The handling of level interrupts adds some extra complexity. We
- * need to track when the interrupt has been EOIed, so we can sample
- * the 'line' again. This is achieved as such:
- *
- * - When a level interrupt is moved onto a vcpu, the corresponding
- *   bit in irq_queued is set. As long as this bit is set, the line
- *   will be ignored for further interrupts. The interrupt is injected
- *   into the vcpu with the GICH_LR_EOI bit set (generate a
- *   maintenance interrupt on EOI).
- * - When the interrupt is EOIed, the maintenance interrupt fires,
- *   and clears the corresponding bit in irq_queued. This allows the
- *   interrupt line to be sampled again.
- * - Note that level-triggered interrupts can also be set to pending from
- *   writes to GICD_ISPENDRn and lowering the external input line does not
- *   cause the interrupt to become inactive in such a situation.
- *   Conversely, writes to GICD_ICPENDRn do not cause the interrupt to become
- *   inactive as long as the external input line is held high.
- *
- *
- * Initialization rules: there are multiple stages to the vgic
- * initialization, both for the distributor and the CPU interfaces.
- *
- * Distributor:
- *
- * - kvm_vgic_early_init(): initialization of static data that doesn't
- *   depend on any sizing information or emulation type. No allocation
- *   is allowed there.
- *
- * - vgic_init(): allocation and initialization of the generic data
- *   structures that depend on sizing information (number of CPUs,
- *   number of interrupts). Also initializes the vcpu specific data
- *   structures. Can be executed lazily for GICv2.
- *   [to be renamed to kvm_vgic_init??]
- *
- * CPU Interface:
- *
- * - kvm_vgic_cpu_early_init(): initialization of static data that
- *   doesn't depend on any sizing information or emulation type. No
- *   allocation is allowed there.
- */
-
-#include "vgic.h"
-
-static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
-static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu);
-static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
-static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
-static u64 vgic_get_elrsr(struct kvm_vcpu *vcpu);
-static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
-                                               int virt_irq);
-static int compute_pending_for_cpu(struct kvm_vcpu *vcpu);
-
-static const struct vgic_ops *vgic_ops;
-static const struct vgic_params *vgic;
-
-static void add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
-{
-       vcpu->kvm->arch.vgic.vm_ops.add_sgi_source(vcpu, irq, source);
-}
-
-static bool queue_sgi(struct kvm_vcpu *vcpu, int irq)
-{
-       return vcpu->kvm->arch.vgic.vm_ops.queue_sgi(vcpu, irq);
-}
-
-int kvm_vgic_map_resources(struct kvm *kvm)
-{
-       return kvm->arch.vgic.vm_ops.map_resources(kvm, vgic);
-}
-
-/*
- * struct vgic_bitmap contains a bitmap made of unsigned longs, but
- * extracts u32s out of them.
- *
- * This does not work on 64-bit BE systems, because the bitmap access
- * will store two consecutive 32-bit words with the higher-addressed
- * register's bits at the lower index and the lower-addressed register's
- * bits at the higher index.
- *
- * Therefore, swizzle the register index when accessing the 32-bit word
- * registers to access the right register's value.
- */
-#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 64
-#define REG_OFFSET_SWIZZLE     1
-#else
-#define REG_OFFSET_SWIZZLE     0
-#endif
-
-static int vgic_init_bitmap(struct vgic_bitmap *b, int nr_cpus, int nr_irqs)
-{
-       int nr_longs;
-
-       nr_longs = nr_cpus + BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS);
-
-       b->private = kzalloc(sizeof(unsigned long) * nr_longs, GFP_KERNEL);
-       if (!b->private)
-               return -ENOMEM;
-
-       b->shared = b->private + nr_cpus;
-
-       return 0;
-}
-
-static void vgic_free_bitmap(struct vgic_bitmap *b)
-{
-       kfree(b->private);
-       b->private = NULL;
-       b->shared = NULL;
-}
-
-/*
- * Call this function to convert a u64 value to an unsigned long * bitmask
- * in a way that works on both 32-bit and 64-bit LE and BE platforms.
- *
- * Warning: Calling this function may modify *val.
- */
-static unsigned long *u64_to_bitmask(u64 *val)
-{
-#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 32
-       *val = (*val >> 32) | (*val << 32);
-#endif
-       return (unsigned long *)val;
-}
-
-u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset)
-{
-       offset >>= 2;
-       if (!offset)
-               return (u32 *)(x->private + cpuid) + REG_OFFSET_SWIZZLE;
-       else
-               return (u32 *)(x->shared) + ((offset - 1) ^ REG_OFFSET_SWIZZLE);
-}
-
-static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x,
-                                  int cpuid, int irq)
-{
-       if (irq < VGIC_NR_PRIVATE_IRQS)
-               return test_bit(irq, x->private + cpuid);
-
-       return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared);
-}
-
-void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
-                            int irq, int val)
-{
-       unsigned long *reg;
-
-       if (irq < VGIC_NR_PRIVATE_IRQS) {
-               reg = x->private + cpuid;
-       } else {
-               reg = x->shared;
-               irq -= VGIC_NR_PRIVATE_IRQS;
-       }
-
-       if (val)
-               set_bit(irq, reg);
-       else
-               clear_bit(irq, reg);
-}
-
-static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid)
-{
-       return x->private + cpuid;
-}
-
-unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x)
-{
-       return x->shared;
-}
-
-static int vgic_init_bytemap(struct vgic_bytemap *x, int nr_cpus, int nr_irqs)
-{
-       int size;
-
-       size  = nr_cpus * VGIC_NR_PRIVATE_IRQS;
-       size += nr_irqs - VGIC_NR_PRIVATE_IRQS;
-
-       x->private = kzalloc(size, GFP_KERNEL);
-       if (!x->private)
-               return -ENOMEM;
-
-       x->shared = x->private + nr_cpus * VGIC_NR_PRIVATE_IRQS / sizeof(u32);
-       return 0;
-}
-
-static void vgic_free_bytemap(struct vgic_bytemap *b)
-{
-       kfree(b->private);
-       b->private = NULL;
-       b->shared = NULL;
-}
-
-u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset)
-{
-       u32 *reg;
-
-       if (offset < VGIC_NR_PRIVATE_IRQS) {
-               reg = x->private;
-               offset += cpuid * VGIC_NR_PRIVATE_IRQS;
-       } else {
-               reg = x->shared;
-               offset -= VGIC_NR_PRIVATE_IRQS;
-       }
-
-       return reg + (offset / sizeof(u32));
-}
-
-#define VGIC_CFG_LEVEL 0
-#define VGIC_CFG_EDGE  1
-
-static bool vgic_irq_is_edge(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       int irq_val;
-
-       irq_val = vgic_bitmap_get_irq_val(&dist->irq_cfg, vcpu->vcpu_id, irq);
-       return irq_val == VGIC_CFG_EDGE;
-}
-
-static int vgic_irq_is_enabled(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       return vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu->vcpu_id, irq);
-}
-
-static int vgic_irq_is_queued(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       return vgic_bitmap_get_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq);
-}
-
-static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq);
-}
-
-static void vgic_irq_set_queued(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 1);
-}
-
-static void vgic_irq_clear_queued(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 0);
-}
-
-static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1);
-}
-
-static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0);
-}
-
-static int vgic_dist_irq_get_level(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       return vgic_bitmap_get_irq_val(&dist->irq_level, vcpu->vcpu_id, irq);
-}
-
-static void vgic_dist_irq_set_level(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 1);
-}
-
-static void vgic_dist_irq_clear_level(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 0);
-}
-
-static int vgic_dist_irq_soft_pend(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       return vgic_bitmap_get_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq);
-}
-
-static void vgic_dist_irq_clear_soft_pend(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0);
-       if (!vgic_dist_irq_get_level(vcpu, irq)) {
-               vgic_dist_irq_clear_pending(vcpu, irq);
-               if (!compute_pending_for_cpu(vcpu))
-                       clear_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
-       }
-}
-
-static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       return vgic_bitmap_get_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq);
-}
-
-void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 1);
-}
-
-void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 0);
-}
-
-static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq)
-{
-       if (irq < VGIC_NR_PRIVATE_IRQS)
-               set_bit(irq, vcpu->arch.vgic_cpu.pending_percpu);
-       else
-               set_bit(irq - VGIC_NR_PRIVATE_IRQS,
-                       vcpu->arch.vgic_cpu.pending_shared);
-}
-
-void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq)
-{
-       if (irq < VGIC_NR_PRIVATE_IRQS)
-               clear_bit(irq, vcpu->arch.vgic_cpu.pending_percpu);
-       else
-               clear_bit(irq - VGIC_NR_PRIVATE_IRQS,
-                         vcpu->arch.vgic_cpu.pending_shared);
-}
-
-static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq)
-{
-       return !vgic_irq_is_queued(vcpu, irq);
-}
-
-/**
- * vgic_reg_access - access vgic register
- * @mmio:   pointer to the data describing the mmio access
- * @reg:    pointer to the virtual backing of vgic distributor data
- * @offset: least significant 2 bits used for word offset
- * @mode:   ACCESS_ mode (see defines above)
- *
- * Helper to make vgic register access easier using one of the access
- * modes defined for vgic register access
- * (read,raz,write-ignored,setbit,clearbit,write)
- */
-void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
-                    phys_addr_t offset, int mode)
-{
-       int word_offset = (offset & 3) * 8;
-       u32 mask = (1UL << (mmio->len * 8)) - 1;
-       u32 regval;
-
-       /*
-        * Any alignment fault should have been delivered to the guest
-        * directly (ARM ARM B3.12.7 "Prioritization of aborts").
-        */
-
-       if (reg) {
-               regval = *reg;
-       } else {
-               BUG_ON(mode != (ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED));
-               regval = 0;
-       }
-
-       if (mmio->is_write) {
-               u32 data = mmio_data_read(mmio, mask) << word_offset;
-               switch (ACCESS_WRITE_MASK(mode)) {
-               case ACCESS_WRITE_IGNORED:
-                       return;
-
-               case ACCESS_WRITE_SETBIT:
-                       regval |= data;
-                       break;
-
-               case ACCESS_WRITE_CLEARBIT:
-                       regval &= ~data;
-                       break;
-
-               case ACCESS_WRITE_VALUE:
-                       regval = (regval & ~(mask << word_offset)) | data;
-                       break;
-               }
-               *reg = regval;
-       } else {
-               switch (ACCESS_READ_MASK(mode)) {
-               case ACCESS_READ_RAZ:
-                       regval = 0;
-                       /* fall through */
-
-               case ACCESS_READ_VALUE:
-                       mmio_data_write(mmio, mask, regval >> word_offset);
-               }
-       }
-}
-
-bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
-                       phys_addr_t offset)
-{
-       vgic_reg_access(mmio, NULL, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-/*
- * Common handler for the interrupt enable registers.
- *
- * Applies the MMIO access to the word of kvm->arch.vgic.irq_enabled selected
- * by @vcpu_id and @offset. @access supplies the write behaviour and is OR'ed
- * with ACCESS_READ_VALUE to form the vgic_reg_access() mode.
- *
- * Returns true for a write (after calling vgic_update_state()), false for
- * a read.
- */
-bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
-                           phys_addr_t offset, int vcpu_id, int access)
-{
-       u32 *reg;
-       int mode = ACCESS_READ_VALUE | access;
-       struct kvm_vcpu *target_vcpu = kvm_get_vcpu(kvm, vcpu_id);
-
-       reg = vgic_bitmap_get_reg(&kvm->arch.vgic.irq_enabled, vcpu_id, offset);
-       vgic_reg_access(mmio, reg, offset, mode);
-       if (mmio->is_write) {
-               if (access & ACCESS_WRITE_CLEARBIT) {
-                       /* Low 16 bits of the first word are turned back on */
-                       if (offset < 4) /* Force SGI enabled */
-                               *reg |= 0xffff;
-                       vgic_retire_disabled_irqs(target_vcpu);
-               }
-               vgic_update_state(kvm);
-               return true;
-       }
-
-       return false;
-}
-
-/*
- * Common handler for the set-pending registers.
- *
- * The access is applied to the irq_pending word selected by @vcpu_id and
- * @offset in ACCESS_WRITE_SETBIT mode. On a write the same data is also
- * applied to the matching irq_soft_pend word, which is then masked with the
- * inverted irq_cfg word so that only level-triggered interrupts keep a
- * soft-pending bit.
- *
- * Returns true for a write (after vgic_update_state()), false for a read.
- */
-bool vgic_handle_set_pending_reg(struct kvm *kvm,
-                                struct kvm_exit_mmio *mmio,
-                                phys_addr_t offset, int vcpu_id)
-{
-       u32 *reg, orig;
-       u32 level_mask;
-       int mode = ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-
-       /* A cleared cfg bit is treated as level-triggered, hence the inversion */
-       reg = vgic_bitmap_get_reg(&dist->irq_cfg, vcpu_id, offset);
-       level_mask = (~(*reg));
-
-       /* Mark both level and edge triggered irqs as pending */
-       reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
-       orig = *reg;
-       vgic_reg_access(mmio, reg, offset, mode);
-
-       if (mmio->is_write) {
-               /* Set the soft-pending flag only for level-triggered irqs */
-               reg = vgic_bitmap_get_reg(&dist->irq_soft_pend,
-                                         vcpu_id, offset);
-               vgic_reg_access(mmio, reg, offset, mode);
-               *reg &= level_mask;
-
-               /*
-                * Ignore writes to SGIs.
-                * NOTE(review): reg points at the irq_soft_pend word here,
-                * while orig was sampled from irq_pending - confirm that
-                * this is the word meant to be restored.
-                */
-               if (offset < 2) {
-                       *reg &= ~0xffff;
-                       *reg |= orig & 0xffff;
-               }
-
-               vgic_update_state(kvm);
-               return true;
-       }
-
-       return false;
-}
-
-/*
- * Common handler for the clear-pending registers.
- *
- * The access is applied to the irq_pending word selected by @vcpu_id and
- * @offset in ACCESS_WRITE_CLEARBIT mode. On a write, bits whose irq_level
- * bit is set are made pending again, the low 16 bits are restored from the
- * pre-access value when @offset < 2, and the same clear is applied to the
- * irq_soft_pend word.
- *
- * Returns true for a write (after vgic_update_state()), false for a read.
- */
-bool vgic_handle_clear_pending_reg(struct kvm *kvm,
-                                  struct kvm_exit_mmio *mmio,
-                                  phys_addr_t offset, int vcpu_id)
-{
-       u32 *level_active;
-       u32 *reg, orig;
-       int mode = ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-
-       reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
-       /* Remember the pre-access value so the SGI bits can be restored */
-       orig = *reg;
-       vgic_reg_access(mmio, reg, offset, mode);
-       if (mmio->is_write) {
-               /* Re-set level triggered level-active interrupts */
-               level_active = vgic_bitmap_get_reg(&dist->irq_level,
-                                         vcpu_id, offset);
-               reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
-               *reg |= *level_active;
-
-               /* Ignore writes to SGIs */
-               if (offset < 2) {
-                       *reg &= ~0xffff;
-                       *reg |= orig & 0xffff;
-               }
-
-               /* Clear soft-pending flags */
-               reg = vgic_bitmap_get_reg(&dist->irq_soft_pend,
-                                         vcpu_id, offset);
-               vgic_reg_access(mmio, reg, offset, mode);
-
-               vgic_update_state(kvm);
-               return true;
-       }
-       return false;
-}
-
-/*
- * Common handler for the set-active registers: applies the access to the
- * irq_active word selected by @vcpu_id and @offset in ACCESS_WRITE_SETBIT
- * mode. Returns true for a write (after vgic_update_state()), false for a
- * read.
- */
-bool vgic_handle_set_active_reg(struct kvm *kvm,
-                               struct kvm_exit_mmio *mmio,
-                               phys_addr_t offset, int vcpu_id)
-{
-       u32 *reg;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-
-       reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset);
-       vgic_reg_access(mmio, reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
-
-       if (mmio->is_write) {
-               vgic_update_state(kvm);
-               return true;
-       }
-
-       return false;
-}
-
-/*
- * Common handler for the clear-active registers: applies the access to the
- * irq_active word selected by @vcpu_id and @offset in ACCESS_WRITE_CLEARBIT
- * mode. Returns true for a write (after vgic_update_state()), false for a
- * read.
- */
-bool vgic_handle_clear_active_reg(struct kvm *kvm,
-                                 struct kvm_exit_mmio *mmio,
-                                 phys_addr_t offset, int vcpu_id)
-{
-       u32 *reg;
-       struct vgic_dist *dist = &kvm->arch.vgic;
-
-       reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset);
-       vgic_reg_access(mmio, reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
-
-       if (mmio->is_write) {
-               vgic_update_state(kvm);
-               return true;
-       }
-
-       return false;
-}
-
-/*
- * Spread a 16-bit compressed configuration value over a 32-bit word: for
- * each i in 0..15, (val >> i) & VGIC_CFG_EDGE is placed at bit 2 * i + 1 of
- * the result. Presumably VGIC_CFG_EDGE is the single-bit value 1, so the even
- * bits of the result stay zero - confirm against its definition.
- */
-static u32 vgic_cfg_expand(u16 val)
-{
-       u32 res = 0;
-       int i;
-
-       /*
-        * Turn a 16bit value like abcd...mnop into a 32bit word
-        * a0b0c0d0...m0n0o0p0, which is what the HW cfg register is.
-        */
-       for (i = 0; i < 16; i++)
-               res |= ((val >> i) & VGIC_CFG_EDGE) << (2 * i + 1);
-
-       return res;
-}
-
-/*
- * Inverse of vgic_cfg_expand(): for each i in 0..15, bit 2 * i + 1 of @val
- * (masked with VGIC_CFG_EDGE) becomes bit i of the 16-bit result. The even
- * bits of @val are not looked at.
- */
-static u16 vgic_cfg_compress(u32 val)
-{
-       u16 res = 0;
-       int i;
-
-       /*
-        * Turn a 32bit word a0b0c0d0...m0n0o0p0 into 16bit value like
-        * abcd...mnop which is what we really care about.
-        */
-       for (i = 0; i < 16; i++)
-               res |= ((val >> (i * 2 + 1)) & VGIC_CFG_EDGE) << i;
-
-       return res;
-}
-
-/*
- * The distributor uses 2 bits per IRQ for the CFG register, but the
- * LSB is always 0. As such, we only keep the upper bit, and use the
- * two above functions to compress/expand the bits.
- *
- * @reg holds the compressed state for two consecutive 32-bit guest
- * registers: bit 2 of @offset selects the upper (set) or lower (clear)
- * 16-bit half. The selected half is expanded into a temporary, the MMIO
- * access is applied to that temporary, and for writes at @offset >= 8 the
- * result is compressed back into the same half of @reg.
- *
- * Always returns false.
- */
-bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
-                        phys_addr_t offset)
-{
-       u32 val;
-
-       if (offset & 4)
-               val = *reg >> 16;
-       else
-               val = *reg & 0xffff;
-
-       val = vgic_cfg_expand(val);
-       vgic_reg_access(mmio, &val, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-       if (mmio->is_write) {
-               /* Ignore writes to read-only SGI and PPI bits */
-               if (offset < 8)
-                       return false;
-
-               val = vgic_cfg_compress(val);
-               /* Replace only the selected half, keep the other one intact */
-               if (offset & 4) {
-                       *reg &= 0xffff;
-                       *reg |= val << 16;
-               } else {
-                       *reg &= 0xffff << 16;
-                       *reg |= val;
-               }
-       }
-
-       return false;
-}
-
-/**
- * vgic_unqueue_irqs - move pending/active IRQs from LRs to the distributor
- * @vcpu: Pointer to the VCPU whose list registers (LRs) are emptied
- *
- * Move any IRQs that have already been assigned to LRs back to the
- * emulated distributor state so that the complete emulated state can be read
- * from the main emulation structures without investigating the LRs.
- */
-void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
-{
-       u64 elrsr = vgic_get_elrsr(vcpu);
-       unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
-       int i;
-
-       /* Visit every LR whose bit is clear in the ELRSR snapshot taken above */
-       for_each_clear_bit(i, elrsr_ptr, vgic->nr_lr) {
-               struct vgic_lr lr = vgic_get_lr(vcpu, i);
-
-               /*
-                * There are three options for the state bits:
-                *
-                * 01: pending
-                * 10: active
-                * 11: pending and active
-                */
-               BUG_ON(!(lr.state & LR_STATE_MASK));
-
-               /* Reestablish SGI source for pending and active IRQs */
-               if (lr.irq < VGIC_NR_SGIS)
-                       add_sgi_source(vcpu, lr.irq, lr.source);
-
-               /*
-                * If the LR holds an active (10) or a pending and active (11)
-                * interrupt then move the active state to the
-                * distributor tracking bit.
-                */
-               if (lr.state & LR_STATE_ACTIVE)
-                       vgic_irq_set_active(vcpu, lr.irq);
-
-               /*
-                * Reestablish the pending state on the distributor and the
-                * CPU interface and mark the LR as free for other use.
-                */
-               vgic_retire_lr(i, vcpu);
-
-               /* Finally update the VGIC state. */
-               vgic_update_state(vcpu->kvm);
-       }
-}
-
-/*
- * Look up the register range covering an access of @len bytes at @offset.
- *
- * @ranges is walked until an entry with a zero ->len is reached, which acts
- * as the terminator. Returns the first entry that fully contains
- * [@offset, @offset + @len), or NULL if there is none.
- */
-const
-struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges,
-                                     int len, gpa_t offset)
-{
-       while (ranges->len) {
-               if (offset >= ranges->base &&
-                   (offset + len) <= (ranges->base + ranges->len))
-                       return ranges;
-               ranges++;
-       }
-
-       return NULL;
-}
-
-/*
- * Check that a range-relative @offset addresses an existing interrupt.
- *
- * Ranges with a zero bits_per_irq are not per-IRQ registers and are always
- * accepted. Otherwise the IRQ number at @offset is computed from
- * bits_per_irq, and the access is rejected when it is >= dist->nr_irqs.
- */
-static bool vgic_validate_access(const struct vgic_dist *dist,
-                                const struct vgic_io_range *range,
-                                unsigned long offset)
-{
-       int irq;
-
-       if (!range->bits_per_irq)
-               return true;    /* Not an irq-based access */
-
-       /* offset is in bytes: convert to bits, then to an IRQ index */
-       irq = offset * 8 / range->bits_per_irq;
-       if (irq >= dist->nr_irqs)
-               return false;
-
-       return true;
-}
-
-/*
- * Call the respective handler function for the given range.
- * We split up any 64 bit accesses into two consecutive 32 bit
- * handler calls and merge the result afterwards.
- * We do this in a little endian fashion regardless of the host's
- * or guest's endianness, because the GIC is always LE and the rest of
- * the code (vgic_reg_access) also puts it in a LE fashion already.
- * At this point we have already identified the handle function, so
- * range points to that one entry and offset is relative to this.
- *
- * Returns the handler's return value, or the OR of both return values
- * for a split access.
- */
-static bool call_range_handler(struct kvm_vcpu *vcpu,
-                              struct kvm_exit_mmio *mmio,
-                              unsigned long offset,
-                              const struct vgic_io_range *range)
-{
-       struct kvm_exit_mmio mmio32;
-       bool ret;
-
-       if (likely(mmio->len <= 4))
-               return range->handle_mmio(vcpu, mmio, offset);
-
-       /*
-        * Any access bigger than 4 bytes (that we currently handle in KVM)
-        * is actually 8 bytes long, caused by a 64-bit access
-        */
-
-       mmio32.len = 4;
-       mmio32.is_write = mmio->is_write;
-       mmio32.private = mmio->private;
-
-       /* Upper word first: second u32 of the data buffer, offset + 4 */
-       mmio32.phys_addr = mmio->phys_addr + 4;
-       mmio32.data = &((u32 *)mmio->data)[1];
-       ret = range->handle_mmio(vcpu, &mmio32, offset + 4);
-
-       /* Then the lower word: first u32 of the data buffer */
-       mmio32.phys_addr = mmio->phys_addr;
-       mmio32.data = &((u32 *)mmio->data)[0];
-       ret |= range->handle_mmio(vcpu, &mmio32, offset);
-
-       return ret;
-}
-
-/**
- * vgic_handle_mmio_access - handle an in-kernel MMIO access
- * This is called by the read/write KVM IO device wrappers below.
- * @vcpu:      pointer to the vcpu performing the access
- * @this:      pointer to the KVM IO device in charge
- * @addr:      guest physical address of the access
- * @len:       size of the access
- * @val:       pointer to the data region
- * @is_write:  read or write access
- *
- * Returns 0 if the access was consumed, or -ENXIO if no register range
- * with a handler covers it. An access that fails vgic_validate_access()
- * is consumed as well: a write is dropped and a read returns zeroes.
- */
-static int vgic_handle_mmio_access(struct kvm_vcpu *vcpu,
-                                  struct kvm_io_device *this, gpa_t addr,
-                                  int len, void *val, bool is_write)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       struct vgic_io_device *iodev = container_of(this,
-                                                   struct vgic_io_device, dev);
-       const struct vgic_io_range *range;
-       struct kvm_exit_mmio mmio;
-       bool updated_state;
-       gpa_t offset;
-
-       /* offset is first relative to the register frame ... */
-       offset = addr - iodev->addr;
-       range = vgic_find_range(iodev->reg_ranges, len, offset);
-       if (unlikely(!range || !range->handle_mmio)) {
-               pr_warn("Unhandled access %d %08llx %d\n", is_write, addr, len);
-               return -ENXIO;
-       }
-
-       mmio.phys_addr = addr;
-       mmio.len = len;
-       mmio.is_write = is_write;
-       mmio.data = val;
-       mmio.private = iodev->redist_vcpu;
-
-       spin_lock(&dist->lock);
-       /* ... and from here on relative to the start of the matched range */
-       offset -= range->base;
-       if (vgic_validate_access(dist, range, offset)) {
-               updated_state = call_range_handler(vcpu, &mmio, offset, range);
-       } else {
-               if (!is_write)
-                       memset(val, 0, len);
-               updated_state = false;
-       }
-       spin_unlock(&dist->lock);
-
-       /* Kick VCPUs only after the distributor lock has been dropped */
-       if (updated_state)
-               vgic_kick_vcpus(vcpu->kvm);
-
-       return 0;
-}
-
-/* Read wrapper: calls vgic_handle_mmio_access() with is_write = false. */
-static int vgic_handle_mmio_read(struct kvm_vcpu *vcpu,
-                                struct kvm_io_device *this,
-                                gpa_t addr, int len, void *val)
-{
-       return vgic_handle_mmio_access(vcpu, this, addr, len, val, false);
-}
-
-/*
- * Write wrapper: calls vgic_handle_mmio_access() with is_write = true. The
- * const qualifier on @val is cast away because the common function takes a
- * single non-const buffer pointer for both directions.
- */
-static int vgic_handle_mmio_write(struct kvm_vcpu *vcpu,
-                                 struct kvm_io_device *this,
-                                 gpa_t addr, int len, const void *val)
-{
-       return vgic_handle_mmio_access(vcpu, this, addr, len, (void *)val,
-                                      true);
-}
-
-/* KVM I/O device callbacks; both end up in vgic_handle_mmio_access(). */
-static struct kvm_io_device_ops vgic_io_ops = {
-       .read   = vgic_handle_mmio_read,
-       .write  = vgic_handle_mmio_write,
-};
-
-/**
- * vgic_register_kvm_io_dev - register VGIC register frame on the KVM I/O bus
- * @kvm:            The VM structure pointer
- * @base:           The (guest) base address for the register frame
- * @len:            Length of the register frame window
- * @ranges:         Describing the handler functions for each register
- * @redist_vcpu_id: The VCPU ID to pass on to the handlers on call
- * @iodev:          Points to memory to be passed on to the handler
- *
- * @iodev stores the parameters of this function to be usable by the handler
- * respectively the dispatcher function (since the KVM I/O bus framework lacks
- * an opaque parameter). Initialization is done in this function, but the
- * reference should be valid and unique for the whole VGIC lifetime.
- * If the register frame is not mapped for a specific VCPU, pass -1 to
- * @redist_vcpu_id.
- *
- * Returns the result of kvm_io_bus_register_dev(). On a non-zero result
- * @iodev->dev.ops is reset to NULL.
- */
-int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len,
-                            const struct vgic_io_range *ranges,
-                            int redist_vcpu_id,
-                            struct vgic_io_device *iodev)
-{
-       struct kvm_vcpu *vcpu = NULL;
-       int ret;
-
-       /* A negative ID leaves redist_vcpu NULL */
-       if (redist_vcpu_id >= 0)
-               vcpu = kvm_get_vcpu(kvm, redist_vcpu_id);
-
-       iodev->addr             = base;
-       iodev->len              = len;
-       iodev->reg_ranges       = ranges;
-       iodev->redist_vcpu      = vcpu;
-
-       kvm_iodevice_init(&iodev->dev, &vgic_io_ops);
-
-       /* The bus registration is done with kvm->slots_lock held */
-       mutex_lock(&kvm->slots_lock);
-
-       ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, base, len,
-                                     &iodev->dev);
-       mutex_unlock(&kvm->slots_lock);
-
-       /* Mark the iodev as invalid if registration fails. */
-       if (ret)
-               iodev->dev.ops = NULL;
-
-       return ret;
-}
-
-/* Number of interrupts beyond the first VGIC_NR_PRIVATE_IRQS ones. */
-static int vgic_nr_shared_irqs(struct vgic_dist *dist)
-{
-       return dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
-}
-
-/*
- * Recompute the per-VCPU active bitmaps of @vcpu.
- *
- * active_percpu becomes (active & enabled) over the private interrupts;
- * active_shared becomes (active & enabled & irq_spi_target[vcpu_id]) over
- * the shared ones. Returns non-zero if either bitmap has a bit set.
- */
-static int compute_active_for_cpu(struct kvm_vcpu *vcpu)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       unsigned long *active, *enabled, *act_percpu, *act_shared;
-       unsigned long active_private, active_shared;
-       int nr_shared = vgic_nr_shared_irqs(dist);
-       int vcpu_id;
-
-       vcpu_id = vcpu->vcpu_id;
-       act_percpu = vcpu->arch.vgic_cpu.active_percpu;
-       act_shared = vcpu->arch.vgic_cpu.active_shared;
-
-       active = vgic_bitmap_get_cpu_map(&dist->irq_active, vcpu_id);
-       enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id);
-       bitmap_and(act_percpu, active, enabled, VGIC_NR_PRIVATE_IRQS);
-
-       active = vgic_bitmap_get_shared_map(&dist->irq_active);
-       enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled);
-       bitmap_and(act_shared, active, enabled, nr_shared);
-       /* Keep only the shared interrupts targeted at this VCPU */
-       bitmap_and(act_shared, act_shared,
-                  vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]),
-                  nr_shared);
-
-       active_private = find_first_bit(act_percpu, VGIC_NR_PRIVATE_IRQS);
-       active_shared = find_first_bit(act_shared, nr_shared);
-
-       return (active_private < VGIC_NR_PRIVATE_IRQS ||
-               active_shared < nr_shared);
-}
-
-/*
- * Recompute the per-VCPU pending bitmaps of @vcpu.
- *
- * With the distributor disabled both bitmaps are zeroed and 0 is returned.
- * Otherwise pending_percpu becomes (pending & enabled) over the private
- * interrupts and pending_shared becomes
- * (pending & enabled & irq_spi_target[vcpu_id]) over the shared ones.
- * Returns non-zero if either bitmap has a bit set.
- */
-static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       unsigned long *pending, *enabled, *pend_percpu, *pend_shared;
-       unsigned long pending_private, pending_shared;
-       int nr_shared = vgic_nr_shared_irqs(dist);
-       int vcpu_id;
-
-       vcpu_id = vcpu->vcpu_id;
-       pend_percpu = vcpu->arch.vgic_cpu.pending_percpu;
-       pend_shared = vcpu->arch.vgic_cpu.pending_shared;
-
-       if (!dist->enabled) {
-               bitmap_zero(pend_percpu, VGIC_NR_PRIVATE_IRQS);
-               bitmap_zero(pend_shared, nr_shared);
-               return 0;
-       }
-
-       pending = vgic_bitmap_get_cpu_map(&dist->irq_pending, vcpu_id);
-       enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id);
-       bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS);
-
-       pending = vgic_bitmap_get_shared_map(&dist->irq_pending);
-       enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled);
-       bitmap_and(pend_shared, pending, enabled, nr_shared);
-       /* Keep only the shared interrupts targeted at this VCPU */
-       bitmap_and(pend_shared, pend_shared,
-                  vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]),
-                  nr_shared);
-
-       pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS);
-       pending_shared = find_first_bit(pend_shared, nr_shared);
-       return (pending_private < VGIC_NR_PRIVATE_IRQS ||
-               pending_shared < vgic_nr_shared_irqs(dist));
-}
-
-/*
- * Update the interrupt state and determine which CPUs have pending
- * or active interrupts. Must be called with distributor lock held.
- *
- * Note the asymmetry: a VCPU's bit in irq_active_on_cpu is set or cleared
- * to match the freshly computed state, whereas its bit in
- * irq_pending_on_cpu is only ever set here, never cleared.
- */
-void vgic_update_state(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct kvm_vcpu *vcpu;
-       int c;
-
-       kvm_for_each_vcpu(c, vcpu, kvm) {
-               if (compute_pending_for_cpu(vcpu))
-                       set_bit(c, dist->irq_pending_on_cpu);
-
-               if (compute_active_for_cpu(vcpu))
-                       set_bit(c, dist->irq_active_on_cpu);
-               else
-                       clear_bit(c, dist->irq_active_on_cpu);
-       }
-}
-
-/* Return list register @lr of @vcpu via the get_lr callback in vgic_ops. */
-static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr)
-{
-       return vgic_ops->get_lr(vcpu, lr);
-}
-
-/* Store @vlr into list register @lr of @vcpu via the set_lr callback. */
-static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr,
-                              struct vgic_lr vlr)
-{
-       vgic_ops->set_lr(vcpu, lr, vlr);
-}
-
-/*
- * Return the ELRSR value of @vcpu via the get_elrsr callback. Callers in
- * this file treat a set bit as a free list register and a clear bit as one
- * in use.
- */
-static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu)
-{
-       return vgic_ops->get_elrsr(vcpu);
-}
-
-/* Return the EISR value of @vcpu via the get_eisr callback in vgic_ops. */
-static inline u64 vgic_get_eisr(struct kvm_vcpu *vcpu)
-{
-       return vgic_ops->get_eisr(vcpu);
-}
-
-/* Invoke the clear_eisr callback in vgic_ops for @vcpu. */
-static inline void vgic_clear_eisr(struct kvm_vcpu *vcpu)
-{
-       vgic_ops->clear_eisr(vcpu);
-}
-
-/*
- * Return the interrupt status of @vcpu via the get_interrupt_status
- * callback; callers test it against the INT_STATUS_* flags.
- */
-static inline u32 vgic_get_interrupt_status(struct kvm_vcpu *vcpu)
-{
-       return vgic_ops->get_interrupt_status(vcpu);
-}
-
-/* Invoke the enable_underflow callback in vgic_ops for @vcpu. */
-static inline void vgic_enable_underflow(struct kvm_vcpu *vcpu)
-{
-       vgic_ops->enable_underflow(vcpu);
-}
-
-/* Invoke the disable_underflow callback in vgic_ops for @vcpu. */
-static inline void vgic_disable_underflow(struct kvm_vcpu *vcpu)
-{
-       vgic_ops->disable_underflow(vcpu);
-}
-
-/* Pass @vcpu and @vmcr to the get_vmcr callback in vgic_ops. */
-void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-       vgic_ops->get_vmcr(vcpu, vmcr);
-}
-
-/* Pass @vcpu and @vmcr to the set_vmcr callback in vgic_ops. */
-void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-       vgic_ops->set_vmcr(vcpu, vmcr);
-}
-
-/* Invoke the enable callback in vgic_ops for @vcpu. */
-static inline void vgic_enable(struct kvm_vcpu *vcpu)
-{
-       vgic_ops->enable(vcpu);
-}
-
-/*
- * Release list register @lr_nr of @vcpu: clear the queued flag of the IRQ it
- * holds, hand a pending state back to the distributor, and write the LR back
- * with its state bits cleared.
- */
-static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu)
-{
-       struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr);
-
-       vgic_irq_clear_queued(vcpu, vlr.irq);
-
-       /*
-        * We must transfer the pending state back to the distributor before
-        * retiring the LR, otherwise we may lose edge-triggered interrupts.
-        */
-       if (vlr.state & LR_STATE_PENDING) {
-               vgic_dist_irq_set_pending(vcpu, vlr.irq);
-               vlr.hwirq = 0;
-       }
-
-       vlr.state = 0;
-       vgic_set_lr(vcpu, lr_nr, vlr);
-}
-
-/* True if the bit of @vcpu is set in the distributor's irq_active_on_cpu. */
-static bool dist_active_irq(struct kvm_vcpu *vcpu)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       return test_bit(vcpu->vcpu_id, dist->irq_active_on_cpu);
-}
-
-/*
- * Report whether @virt_irq is active for @vcpu. All list registers are
- * scanned first for an entry holding @virt_irq with LR_STATE_ACTIVE set;
- * if none matches, the result of vgic_irq_is_active() is returned.
- */
-bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq)
-{
-       int i;
-
-       for (i = 0; i < vgic->nr_lr; i++) {
-               struct vgic_lr vlr = vgic_get_lr(vcpu, i);
-
-               if (vlr.irq == virt_irq && vlr.state & LR_STATE_ACTIVE)
-                       return true;
-       }
-
-       return vgic_irq_is_active(vcpu, virt_irq);
-}
-
-/*
- * An interrupt may have been disabled after being made pending on the
- * CPU interface (the classic case is a timer running while we're
- * rebooting the guest - the interrupt would kick as soon as the CPU
- * interface gets enabled, with deadly consequences).
- *
- * The solution is to examine the LRs that are currently in use, and check
- * that the interrupt is still enabled. If not, just retire it.
- */
-static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
-{
-       u64 elrsr = vgic_get_elrsr(vcpu);
-       unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
-       int lr;
-
-       /* A clear ELRSR bit denotes a list register that is in use */
-       for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
-               struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
-
-               if (!vgic_irq_is_enabled(vcpu, vlr.irq))
-                       vgic_retire_lr(lr, vcpu);
-       }
-}
-
-/*
- * Finish the contents of @vlr for @irq and write it to list register @lr_nr.
- *
- * If the distributor tracks @irq as active, the LR is marked active and the
- * distributor's active state is cleared; otherwise the LR is marked pending.
- * Non-edge interrupts get LR_EOI_INT. For non-SGIs with a physical mapping
- * the LR is turned into a HW one instead (LR_HW set, LR_EOI_INT cleared) and
- * the interrupt is flagged as queued.
- */
-static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
-                                int lr_nr, struct vgic_lr vlr)
-{
-       if (vgic_irq_is_active(vcpu, irq)) {
-               vlr.state |= LR_STATE_ACTIVE;
-               kvm_debug("Set active, clear distributor: 0x%x\n", vlr.state);
-               vgic_irq_clear_active(vcpu, irq);
-               vgic_update_state(vcpu->kvm);
-       } else {
-               WARN_ON(!vgic_dist_irq_is_pending(vcpu, irq));
-               vlr.state |= LR_STATE_PENDING;
-               kvm_debug("Set pending: 0x%x\n", vlr.state);
-       }
-
-       if (!vgic_irq_is_edge(vcpu, irq))
-               vlr.state |= LR_EOI_INT;
-
-       if (vlr.irq >= VGIC_NR_SGIS) {
-               struct irq_phys_map *map;
-               map = vgic_irq_map_search(vcpu, irq);
-
-               if (map) {
-                       vlr.hwirq = map->phys_irq;
-                       vlr.state |= LR_HW;
-                       vlr.state &= ~LR_EOI_INT;
-
-                       /*
-                        * Make sure we're not going to sample this
-                        * again, as a HW-backed interrupt cannot be
-                        * in the PENDING_ACTIVE stage.
-                        */
-                       vgic_irq_set_queued(vcpu, irq);
-               }
-       }
-
-       vgic_set_lr(vcpu, lr_nr, vlr);
-}
-
-/*
- * Queue an interrupt to a CPU virtual interface. Return true on success,
- * or false if it wasn't possible to queue it.
- * sgi_source must be zero for any non-SGI interrupts.
- *
- * A list register that already holds the same irq/source pair is reused;
- * otherwise the first free list register is taken. False is returned only
- * when no list register is free.
- */
-bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       u64 elrsr = vgic_get_elrsr(vcpu);
-       unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
-       struct vgic_lr vlr;
-       int lr;
-
-       /* Sanitize the input... */
-       BUG_ON(sgi_source_id & ~7);
-       BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS);
-       BUG_ON(irq >= dist->nr_irqs);
-
-       kvm_debug("Queue IRQ%d\n", irq);
-
-       /* Do we have an active interrupt for the same CPUID? */
-       for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
-               vlr = vgic_get_lr(vcpu, lr);
-               if (vlr.irq == irq && vlr.source == sgi_source_id) {
-                       kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq);
-                       vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
-                       return true;
-               }
-       }
-
-       /* Try to use another LR for this interrupt */
-       lr = find_first_bit(elrsr_ptr, vgic->nr_lr);
-       if (lr >= vgic->nr_lr)
-               return false;
-
-       kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id);
-
-       /* Only irq, source and state are (re)initialised for the new LR */
-       vlr.irq = irq;
-       vlr.source = sgi_source_id;
-       vlr.state = 0;
-       vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
-
-       return true;
-}
-
-/*
- * Queue non-SGI interrupt @irq (source ID 0) to @vcpu.
- *
- * Returns true if the interrupt was queued or cannot be sampled right now
- * (treated as already queued), false if vgic_queue_irq() failed. After a
- * successful queue, an edge interrupt has its pending state cleared on the
- * distributor and CPU side, while any other interrupt is flagged as queued.
- */
-static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq)
-{
-       if (!vgic_can_sample_irq(vcpu, irq))
-               return true; /* level interrupt, already queued */
-
-       if (vgic_queue_irq(vcpu, 0, irq)) {
-               if (vgic_irq_is_edge(vcpu, irq)) {
-                       vgic_dist_irq_clear_pending(vcpu, irq);
-                       vgic_cpu_irq_clear(vcpu, irq);
-               } else {
-                       vgic_irq_set_queued(vcpu, irq);
-               }
-
-               return true;
-       }
-
-       return false;
-}
-
-/*
- * Fill the list registers with pending interrupts before running the
- * guest.
- *
- * The pending and active per-VCPU bitmaps are OR'ed into pend_act_percpu and
- * pend_act_shared, and every interrupt set there is queued. If any queueing
- * attempt fails, the underflow interrupt is enabled; otherwise it is disabled
- * and the VCPU's bit in irq_pending_on_cpu is cleared.
- * kvm_vgic_flush_hwstate() calls this with dist->lock held.
- */
-static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       unsigned long *pa_percpu, *pa_shared;
-       int i, vcpu_id;
-       int overflow = 0;
-       int nr_shared = vgic_nr_shared_irqs(dist);
-
-       vcpu_id = vcpu->vcpu_id;
-
-       pa_percpu = vcpu->arch.vgic_cpu.pend_act_percpu;
-       pa_shared = vcpu->arch.vgic_cpu.pend_act_shared;
-
-       bitmap_or(pa_percpu, vgic_cpu->pending_percpu, vgic_cpu->active_percpu,
-                 VGIC_NR_PRIVATE_IRQS);
-       bitmap_or(pa_shared, vgic_cpu->pending_shared, vgic_cpu->active_shared,
-                 nr_shared);
-       /*
-        * We may not have any pending interrupt, or the interrupts
-        * may have been serviced from another vcpu. In all cases,
-        * move along.
-        */
-       if (!kvm_vgic_vcpu_pending_irq(vcpu) && !dist_active_irq(vcpu))
-               goto epilog;
-
-       /* SGIs */
-       for_each_set_bit(i, pa_percpu, VGIC_NR_SGIS) {
-               if (!queue_sgi(vcpu, i))
-                       overflow = 1;
-       }
-
-       /* PPIs: the scan resumes from the value of i left by the SGI loop */
-       for_each_set_bit_from(i, pa_percpu, VGIC_NR_PRIVATE_IRQS) {
-               if (!vgic_queue_hwirq(vcpu, i))
-                       overflow = 1;
-       }
-
-       /* SPIs: bit i of the shared map is IRQ i + VGIC_NR_PRIVATE_IRQS */
-       for_each_set_bit(i, pa_shared, nr_shared) {
-               if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS))
-                       overflow = 1;
-       }
-
-
-
-
-epilog:
-       if (overflow) {
-               vgic_enable_underflow(vcpu);
-       } else {
-               vgic_disable_underflow(vcpu);
-               /*
-                * We're about to run this VCPU, and we've consumed
-                * everything the distributor had in store for
-                * us. Claim we don't have anything pending. We'll
-                * adjust that if needed while exiting.
-                */
-               clear_bit(vcpu_id, dist->irq_pending_on_cpu);
-       }
-}
-
-/*
- * Handle the completion of the interrupt held in list register @lr (whose
- * contents are passed in @vlr): clear its soft-pending and queued flags,
- * work out whether it is still pending, and write the LR back with state and
- * hwirq cleared. Both callers in this file hold dist->lock around the call.
- *
- * Returns non-zero if the interrupt is still pending.
- */
-static int process_queued_irq(struct kvm_vcpu *vcpu,
-                                  int lr, struct vgic_lr vlr)
-{
-       int pending = 0;
-
-       /*
-        * If the IRQ was EOIed (called from vgic_process_maintenance) or it
-        * went from active to non-active (called from vgic_sync_hwirq) it was
-        * also ACKed and we therefore assume we can clear the soft pending
-        * state (should it have been set) for this interrupt.
-        *
-        * Note: if the IRQ soft pending state was set after the IRQ was
-        * acked, it actually shouldn't be cleared, but we have no way of
-        * knowing that unless we start trapping ACKs when the soft-pending
-        * state is set.
-        */
-       vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq);
-
-       /*
-        * Tell the gic to start sampling this interrupt again.
-        */
-       vgic_irq_clear_queued(vcpu, vlr.irq);
-
-       /* Any additional pending interrupt? */
-       if (vgic_irq_is_edge(vcpu, vlr.irq)) {
-               /* An edge interrupt is only expected here for a HW LR */
-               BUG_ON(!(vlr.state & LR_HW));
-               pending = vgic_dist_irq_is_pending(vcpu, vlr.irq);
-       } else {
-               /* Level interrupt: pending follows the current line level */
-               if (vgic_dist_irq_get_level(vcpu, vlr.irq)) {
-                       vgic_cpu_irq_set(vcpu, vlr.irq);
-                       pending = 1;
-               } else {
-                       vgic_dist_irq_clear_pending(vcpu, vlr.irq);
-                       vgic_cpu_irq_clear(vcpu, vlr.irq);
-               }
-       }
-
-       /*
-        * Despite being EOIed, the LR may not have
-        * been marked as empty.
-        */
-       vlr.state = 0;
-       vlr.hwirq = 0;
-       vgic_set_lr(vcpu, lr, vlr);
-
-       return pending;
-}
-
-/*
- * Process the maintenance interrupt status of @vcpu.
- *
- * For INT_STATUS_EOI, every list register flagged in EISR is notified as
- * acked and run through process_queued_irq(). For INT_STATUS_UNDERFLOW the
- * underflow interrupt is disabled. EISR is cleared before returning.
- *
- * Returns true if any of the processed interrupts is still pending.
- */
-static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
-{
-       u32 status = vgic_get_interrupt_status(vcpu);
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       struct kvm *kvm = vcpu->kvm;
-       int level_pending = 0;
-
-       kvm_debug("STATUS = %08x\n", status);
-
-       if (status & INT_STATUS_EOI) {
-               /*
-                * Some level interrupts have been EOIed. Clear their
-                * active bit.
-                */
-               u64 eisr = vgic_get_eisr(vcpu);
-               unsigned long *eisr_ptr = u64_to_bitmask(&eisr);
-               int lr;
-
-               for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) {
-                       struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
-
-                       /* An EOIed LR should be level-triggered and empty */
-                       WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq));
-                       WARN_ON(vlr.state & LR_STATE_MASK);
-
-
-                       /*
-                        * kvm_notify_acked_irq calls kvm_set_irq()
-                        * to reset the IRQ level, which grabs the dist->lock
-                        * so we call this before taking the dist->lock.
-                        */
-                       kvm_notify_acked_irq(kvm, 0,
-                                            vlr.irq - VGIC_NR_PRIVATE_IRQS);
-
-                       spin_lock(&dist->lock);
-                       level_pending |= process_queued_irq(vcpu, lr, vlr);
-                       spin_unlock(&dist->lock);
-               }
-       }
-
-       if (status & INT_STATUS_UNDERFLOW)
-               vgic_disable_underflow(vcpu);
-
-       /*
-        * In the next iterations of the vcpu loop, if we sync the vgic state
-        * after flushing it, but before entering the guest (this happens for
-        * pending signals and vmid rollovers), then make sure we don't pick
-        * up any old maintenance interrupts here.
-        */
-       vgic_clear_eisr(vcpu);
-
-       return level_pending;
-}
-
-/*
- * Save the physical active state, and reset it to inactive.
- *
- * Only list registers with LR_HW set and LR_STATE_ACTIVE clear are
- * processed; any other LR is left untouched and false is returned.
- *
- * Return true if there's a pending forwarded interrupt to queue.
- */
-static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       bool level_pending;
-
-       if (!(vlr.state & LR_HW))
-               return false;
-
-       if (vlr.state & LR_STATE_ACTIVE)
-               return false;
-
-       spin_lock(&dist->lock);
-       level_pending = process_queued_irq(vcpu, lr, vlr);
-       spin_unlock(&dist->lock);
-       return level_pending;
-}
-
-/*
- * Sync back the VGIC state after a guest run.
- *
- * Maintenance status is processed first, then every list register is passed
- * to vgic_sync_hwirq(). The VCPU's bit in irq_pending_on_cpu is set when an
- * interrupt is still pending or when at least one list register is still in
- * use according to ELRSR.
- */
-static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       u64 elrsr;
-       unsigned long *elrsr_ptr;
-       int lr, pending;
-       bool level_pending;
-
-       level_pending = vgic_process_maintenance(vcpu);
-
-       /* Deal with HW interrupts, and clear mappings for empty LRs */
-       for (lr = 0; lr < vgic->nr_lr; lr++) {
-               struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
-
-               level_pending |= vgic_sync_hwirq(vcpu, lr, vlr);
-               BUG_ON(vlr.irq >= dist->nr_irqs);
-       }
-
-       /* Check if we still have something up our sleeve... */
-       elrsr = vgic_get_elrsr(vcpu);
-       elrsr_ptr = u64_to_bitmask(&elrsr);
-       pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr);
-       if (level_pending || pending < vgic->nr_lr)
-               set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
-}
-
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       if (!irqchip_in_kernel(vcpu->kvm))
-               return;
-
-       spin_lock(&dist->lock);
-       __kvm_vgic_flush_hwstate(vcpu);
-       spin_unlock(&dist->lock);
-}
-
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
-{
-       if (!irqchip_in_kernel(vcpu->kvm))
-               return;
-
-       __kvm_vgic_sync_hwstate(vcpu);
-}
-
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-       if (!irqchip_in_kernel(vcpu->kvm))
-               return 0;
-
-       return test_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
-}
-
-void vgic_kick_vcpus(struct kvm *kvm)
-{
-       struct kvm_vcpu *vcpu;
-       int c;
-
-       /*
-        * We've injected an interrupt, time to find out who deserves
-        * a good kick...
-        */
-       kvm_for_each_vcpu(c, vcpu, kvm) {
-               if (kvm_vgic_vcpu_pending_irq(vcpu))
-                       kvm_vcpu_kick(vcpu);
-       }
-}
-
-static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level)
-{
-       int edge_triggered = vgic_irq_is_edge(vcpu, irq);
-
-       /*
-        * Only inject an interrupt if:
-        * - edge triggered and we have a rising edge
-        * - level triggered and we change level
-        */
-       if (edge_triggered) {
-               int state = vgic_dist_irq_is_pending(vcpu, irq);
-               return level > state;
-       } else {
-               int state = vgic_dist_irq_get_level(vcpu, irq);
-               return level != state;
-       }
-}
-
-static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
-                                  unsigned int irq_num, bool level)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct kvm_vcpu *vcpu;
-       int edge_triggered, level_triggered;
-       int enabled;
-       bool ret = true, can_inject = true;
-
-       trace_vgic_update_irq_pending(cpuid, irq_num, level);
-
-       if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
-               return -EINVAL;
-
-       spin_lock(&dist->lock);
-
-       vcpu = kvm_get_vcpu(kvm, cpuid);
-       edge_triggered = vgic_irq_is_edge(vcpu, irq_num);
-       level_triggered = !edge_triggered;
-
-       if (!vgic_validate_injection(vcpu, irq_num, level)) {
-               ret = false;
-               goto out;
-       }
-
-       if (irq_num >= VGIC_NR_PRIVATE_IRQS) {
-               cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS];
-               if (cpuid == VCPU_NOT_ALLOCATED) {
-                       /* Pretend we use CPU0, and prevent injection */
-                       cpuid = 0;
-                       can_inject = false;
-               }
-               vcpu = kvm_get_vcpu(kvm, cpuid);
-       }
-
-       kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid);
-
-       if (level) {
-               if (level_triggered)
-                       vgic_dist_irq_set_level(vcpu, irq_num);
-               vgic_dist_irq_set_pending(vcpu, irq_num);
-       } else {
-               if (level_triggered) {
-                       vgic_dist_irq_clear_level(vcpu, irq_num);
-                       if (!vgic_dist_irq_soft_pend(vcpu, irq_num)) {
-                               vgic_dist_irq_clear_pending(vcpu, irq_num);
-                               vgic_cpu_irq_clear(vcpu, irq_num);
-                               if (!compute_pending_for_cpu(vcpu))
-                                       clear_bit(cpuid, dist->irq_pending_on_cpu);
-                       }
-               }
-
-               ret = false;
-               goto out;
-       }
-
-       enabled = vgic_irq_is_enabled(vcpu, irq_num);
-
-       if (!enabled || !can_inject) {
-               ret = false;
-               goto out;
-       }
-
-       if (!vgic_can_sample_irq(vcpu, irq_num)) {
-               /*
-                * Level interrupt in progress, will be picked up
-                * when EOId.
-                */
-               ret = false;
-               goto out;
-       }
-
-       if (level) {
-               vgic_cpu_irq_set(vcpu, irq_num);
-               set_bit(cpuid, dist->irq_pending_on_cpu);
-       }
-
-out:
-       spin_unlock(&dist->lock);
-
-       if (ret) {
-               /* kick the specified vcpu */
-               kvm_vcpu_kick(kvm_get_vcpu(kvm, cpuid));
-       }
-
-       return 0;
-}
-
-static int vgic_lazy_init(struct kvm *kvm)
-{
-       int ret = 0;
-
-       if (unlikely(!vgic_initialized(kvm))) {
-               /*
-                * We only provide the automatic initialization of the VGIC
-                * for the legacy case of a GICv2. Any other type must
-                * be explicitly initialized once setup with the respective
-                * KVM device call.
-                */
-               if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
-                       return -EBUSY;
-
-               mutex_lock(&kvm->lock);
-               ret = vgic_init(kvm);
-               mutex_unlock(&kvm->lock);
-       }
-
-       return ret;
-}
-
-/**
- * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
- * @kvm:     The VM structure pointer
- * @cpuid:   The CPU for PPIs
- * @irq_num: The IRQ number that is assigned to the device. This IRQ
- *           must not be mapped to a HW interrupt.
- * @level:   Edge-triggered:  true:  to trigger the interrupt
- *                           false: to ignore the call
- *          Level-sensitive  true:  raise the input signal
- *                           false: lower the input signal
- *
- * The GIC is not concerned with devices being active-LOW or active-HIGH for
- * level-sensitive interrupts.  You can think of the level parameter as 1
- * being HIGH and 0 being LOW and all devices being active-HIGH.
- */
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
-                       bool level)
-{
-       struct irq_phys_map *map;
-       int ret;
-
-       ret = vgic_lazy_init(kvm);
-       if (ret)
-               return ret;
-
-       map = vgic_irq_map_search(kvm_get_vcpu(kvm, cpuid), irq_num);
-       if (map)
-               return -EINVAL;
-
-       return vgic_update_irq_pending(kvm, cpuid, irq_num, level);
-}
-
-/**
- * kvm_vgic_inject_mapped_irq - Inject a physically mapped IRQ to the vgic
- * @kvm:     The VM structure pointer
- * @cpuid:   The CPU for PPIs
- * @virt_irq: The virtual IRQ to be injected
- * @level:   Edge-triggered:  true:  to trigger the interrupt
- *                           false: to ignore the call
- *          Level-sensitive  true:  raise the input signal
- *                           false: lower the input signal
- *
- * The GIC is not concerned with devices being active-LOW or active-HIGH for
- * level-sensitive interrupts.  You can think of the level parameter as 1
- * being HIGH and 0 being LOW and all devices being active-HIGH.
- */
-int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
-                              unsigned int virt_irq, bool level)
-{
-       int ret;
-
-       ret = vgic_lazy_init(kvm);
-       if (ret)
-               return ret;
-
-       return vgic_update_irq_pending(kvm, cpuid, virt_irq, level);
-}
-
-static irqreturn_t vgic_maintenance_handler(int irq, void *data)
-{
-       /*
-        * We cannot rely on the vgic maintenance interrupt to be
-        * delivered synchronously. This means we can only use it to
-        * exit the VM, and we perform the handling of EOIed
-        * interrupts on the exit path (see vgic_process_maintenance).
-        */
-       return IRQ_HANDLED;
-}
-
-static struct list_head *vgic_get_irq_phys_map_list(struct kvm_vcpu *vcpu,
-                                                   int virt_irq)
-{
-       if (virt_irq < VGIC_NR_PRIVATE_IRQS)
-               return &vcpu->arch.vgic_cpu.irq_phys_map_list;
-       else
-               return &vcpu->kvm->arch.vgic.irq_phys_map_list;
-}
-
-/**
- * kvm_vgic_map_phys_irq - map a virtual IRQ to a physical IRQ
- * @vcpu: The VCPU pointer
- * @virt_irq: The virtual IRQ number for the guest
- * @phys_irq: The hardware IRQ number of the host
- *
- * Establish a mapping between a guest visible irq (@virt_irq) and a
- * hardware irq (@phys_irq). On injection, @virt_irq will be associated with
- * the physical interrupt represented by @phys_irq. This mapping can be
- * established multiple times as long as the parameters are the same.
- *
- * Returns 0 on success or an error value otherwise.
- */
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
-       struct irq_phys_map *map;
-       struct irq_phys_map_entry *entry;
-       int ret = 0;
-
-       /* Create a new mapping */
-       entry = kzalloc(sizeof(*entry), GFP_KERNEL);
-       if (!entry)
-               return -ENOMEM;
-
-       spin_lock(&dist->irq_phys_map_lock);
-
-       /* Try to match an existing mapping */
-       map = vgic_irq_map_search(vcpu, virt_irq);
-       if (map) {
-               /* Make sure this mapping matches */
-               if (map->phys_irq != phys_irq)
-                       ret = -EINVAL;
-
-               /* Found an existing, valid mapping */
-               goto out;
-       }
-
-       map           = &entry->map;
-       map->virt_irq = virt_irq;
-       map->phys_irq = phys_irq;
-
-       list_add_tail_rcu(&entry->entry, root);
-
-out:
-       spin_unlock(&dist->irq_phys_map_lock);
-       /* If we've found a hit in the existing list, free the useless
-        * entry */
-       if (ret || map != &entry->map)
-               kfree(entry);
-       return ret;
-}
-
-static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
-                                               int virt_irq)
-{
-       struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
-       struct irq_phys_map_entry *entry;
-       struct irq_phys_map *map;
-
-       rcu_read_lock();
-
-       list_for_each_entry_rcu(entry, root, entry) {
-               map = &entry->map;
-               if (map->virt_irq == virt_irq) {
-                       rcu_read_unlock();
-                       return map;
-               }
-       }
-
-       rcu_read_unlock();
-
-       return NULL;
-}
-
-static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu)
-{
-       struct irq_phys_map_entry *entry;
-
-       entry = container_of(rcu, struct irq_phys_map_entry, rcu);
-       kfree(entry);
-}
-
-/**
- * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping
- * @vcpu: The VCPU pointer
- * @virt_irq: The virtual IRQ number to be unmapped
- *
- * Remove an existing mapping between virtual and physical interrupts.
- */
-int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       struct irq_phys_map_entry *entry;
-       struct list_head *root;
-
-       root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
-
-       spin_lock(&dist->irq_phys_map_lock);
-
-       list_for_each_entry(entry, root, entry) {
-               if (entry->map.virt_irq == virt_irq) {
-                       list_del_rcu(&entry->entry);
-                       call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
-                       break;
-               }
-       }
-
-       spin_unlock(&dist->irq_phys_map_lock);
-
-       return 0;
-}
-
-static void vgic_destroy_irq_phys_map(struct kvm *kvm, struct list_head *root)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct irq_phys_map_entry *entry;
-
-       spin_lock(&dist->irq_phys_map_lock);
-
-       list_for_each_entry(entry, root, entry) {
-               list_del_rcu(&entry->entry);
-               call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
-       }
-
-       spin_unlock(&dist->irq_phys_map_lock);
-}
-
-void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-       kfree(vgic_cpu->pending_shared);
-       kfree(vgic_cpu->active_shared);
-       kfree(vgic_cpu->pend_act_shared);
-       vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list);
-       vgic_cpu->pending_shared = NULL;
-       vgic_cpu->active_shared = NULL;
-       vgic_cpu->pend_act_shared = NULL;
-}
-
-static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       int nr_longs = BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS);
-       int sz = nr_longs * sizeof(unsigned long);
-       vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL);
-       vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL);
-       vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL);
-
-       if (!vgic_cpu->pending_shared
-               || !vgic_cpu->active_shared
-               || !vgic_cpu->pend_act_shared) {
-               kvm_vgic_vcpu_destroy(vcpu);
-               return -ENOMEM;
-       }
-
-       return 0;
-}
-
-/**
- * kvm_vgic_vcpu_early_init - Earliest possible per-vcpu vgic init stage
- *
- * No memory allocation should be performed here, only static init.
- */
-void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu)
-{
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       INIT_LIST_HEAD(&vgic_cpu->irq_phys_map_list);
-}
-
-/**
- * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
- *
- * The host's GIC naturally limits the maximum amount of VCPUs a guest
- * can use.
- */
-int kvm_vgic_get_max_vcpus(void)
-{
-       return vgic->max_gic_vcpus;
-}
-
-void kvm_vgic_destroy(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct kvm_vcpu *vcpu;
-       int i;
-
-       kvm_for_each_vcpu(i, vcpu, kvm)
-               kvm_vgic_vcpu_destroy(vcpu);
-
-       vgic_free_bitmap(&dist->irq_enabled);
-       vgic_free_bitmap(&dist->irq_level);
-       vgic_free_bitmap(&dist->irq_pending);
-       vgic_free_bitmap(&dist->irq_soft_pend);
-       vgic_free_bitmap(&dist->irq_queued);
-       vgic_free_bitmap(&dist->irq_cfg);
-       vgic_free_bytemap(&dist->irq_priority);
-       if (dist->irq_spi_target) {
-               for (i = 0; i < dist->nr_cpus; i++)
-                       vgic_free_bitmap(&dist->irq_spi_target[i]);
-       }
-       kfree(dist->irq_sgi_sources);
-       kfree(dist->irq_spi_cpu);
-       kfree(dist->irq_spi_mpidr);
-       kfree(dist->irq_spi_target);
-       kfree(dist->irq_pending_on_cpu);
-       kfree(dist->irq_active_on_cpu);
-       vgic_destroy_irq_phys_map(kvm, &dist->irq_phys_map_list);
-       dist->irq_sgi_sources = NULL;
-       dist->irq_spi_cpu = NULL;
-       dist->irq_spi_target = NULL;
-       dist->irq_pending_on_cpu = NULL;
-       dist->irq_active_on_cpu = NULL;
-       dist->nr_cpus = 0;
-}
-
-/*
- * Allocate and initialize the various data structures. Must be called
- * with kvm->lock held!
- */
-int vgic_init(struct kvm *kvm)
-{
-       struct vgic_dist *dist = &kvm->arch.vgic;
-       struct kvm_vcpu *vcpu;
-       int nr_cpus, nr_irqs;
-       int ret, i, vcpu_id;
-
-       if (vgic_initialized(kvm))
-               return 0;
-
-       nr_cpus = dist->nr_cpus = atomic_read(&kvm->online_vcpus);
-       if (!nr_cpus)           /* No vcpus? Can't be good... */
-               return -ENODEV;
-
-       /*
-        * If nobody configured the number of interrupts, use the
-        * legacy one.
-        */
-       if (!dist->nr_irqs)
-               dist->nr_irqs = VGIC_NR_IRQS_LEGACY;
-
-       nr_irqs = dist->nr_irqs;
-
-       ret  = vgic_init_bitmap(&dist->irq_enabled, nr_cpus, nr_irqs);
-       ret |= vgic_init_bitmap(&dist->irq_level, nr_cpus, nr_irqs);
-       ret |= vgic_init_bitmap(&dist->irq_pending, nr_cpus, nr_irqs);
-       ret |= vgic_init_bitmap(&dist->irq_soft_pend, nr_cpus, nr_irqs);
-       ret |= vgic_init_bitmap(&dist->irq_queued, nr_cpus, nr_irqs);
-       ret |= vgic_init_bitmap(&dist->irq_active, nr_cpus, nr_irqs);
-       ret |= vgic_init_bitmap(&dist->irq_cfg, nr_cpus, nr_irqs);
-       ret |= vgic_init_bytemap(&dist->irq_priority, nr_cpus, nr_irqs);
-
-       if (ret)
-               goto out;
-
-       dist->irq_sgi_sources = kzalloc(nr_cpus * VGIC_NR_SGIS, GFP_KERNEL);
-       dist->irq_spi_cpu = kzalloc(nr_irqs - VGIC_NR_PRIVATE_IRQS, GFP_KERNEL);
-       dist->irq_spi_target = kzalloc(sizeof(*dist->irq_spi_target) * nr_cpus,
-                                      GFP_KERNEL);
-       dist->irq_pending_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long),
-                                          GFP_KERNEL);
-       dist->irq_active_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long),
-                                          GFP_KERNEL);
-       if (!dist->irq_sgi_sources ||
-           !dist->irq_spi_cpu ||
-           !dist->irq_spi_target ||
-           !dist->irq_pending_on_cpu ||
-           !dist->irq_active_on_cpu) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       for (i = 0; i < nr_cpus; i++)
-               ret |= vgic_init_bitmap(&dist->irq_spi_target[i],
-                                       nr_cpus, nr_irqs);
-
-       if (ret)
-               goto out;
-
-       ret = kvm->arch.vgic.vm_ops.init_model(kvm);
-       if (ret)
-               goto out;
-
-       kvm_for_each_vcpu(vcpu_id, vcpu, kvm) {
-               ret = vgic_vcpu_init_maps(vcpu, nr_irqs);
-               if (ret) {
-                       kvm_err("VGIC: Failed to allocate vcpu memory\n");
-                       break;
-               }
-
-               /*
-                * Enable and configure all SGIs to be edge-triggere and
-                * configure all PPIs as level-triggered.
-                */
-               for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
-                       if (i < VGIC_NR_SGIS) {
-                               /* SGIs */
-                               vgic_bitmap_set_irq_val(&dist->irq_enabled,
-                                                       vcpu->vcpu_id, i, 1);
-                               vgic_bitmap_set_irq_val(&dist->irq_cfg,
-                                                       vcpu->vcpu_id, i,
-                                                       VGIC_CFG_EDGE);
-                       } else if (i < VGIC_NR_PRIVATE_IRQS) {
-                               /* PPIs */
-                               vgic_bitmap_set_irq_val(&dist->irq_cfg,
-                                                       vcpu->vcpu_id, i,
-                                                       VGIC_CFG_LEVEL);
-                       }
-               }
-
-               vgic_enable(vcpu);
-       }
-
-out:
-       if (ret)
-               kvm_vgic_destroy(kvm);
-
-       return ret;
-}
-
-static int init_vgic_model(struct kvm *kvm, int type)
-{
-       switch (type) {
-       case KVM_DEV_TYPE_ARM_VGIC_V2:
-               vgic_v2_init_emulation(kvm);
-               break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
-       case KVM_DEV_TYPE_ARM_VGIC_V3:
-               vgic_v3_init_emulation(kvm);
-               break;
-#endif
-       default:
-               return -ENODEV;
-       }
-
-       if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus)
-               return -E2BIG;
-
-       return 0;
-}
-
-/**
- * kvm_vgic_early_init - Earliest possible vgic initialization stage
- *
- * No memory allocation should be performed here, only static init.
- */
-void kvm_vgic_early_init(struct kvm *kvm)
-{
-       spin_lock_init(&kvm->arch.vgic.lock);
-       spin_lock_init(&kvm->arch.vgic.irq_phys_map_lock);
-       INIT_LIST_HEAD(&kvm->arch.vgic.irq_phys_map_list);
-}
-
-int kvm_vgic_create(struct kvm *kvm, u32 type)
-{
-       int i, vcpu_lock_idx = -1, ret;
-       struct kvm_vcpu *vcpu;
-
-       mutex_lock(&kvm->lock);
-
-       if (irqchip_in_kernel(kvm)) {
-               ret = -EEXIST;
-               goto out;
-       }
-
-       /*
-        * This function is also called by the KVM_CREATE_IRQCHIP handler,
-        * which had no chance yet to check the availability of the GICv2
-        * emulation. So check this here again. KVM_CREATE_DEVICE does
-        * the proper checks already.
-        */
-       if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && !vgic->can_emulate_gicv2) {
-               ret = -ENODEV;
-               goto out;
-       }
-
-       /*
-        * Any time a vcpu is run, vcpu_load is called which tries to grab the
-        * vcpu->mutex.  By grabbing the vcpu->mutex of all VCPUs we ensure
-        * that no other VCPUs are run while we create the vgic.
-        */
-       ret = -EBUSY;
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (!mutex_trylock(&vcpu->mutex))
-                       goto out_unlock;
-               vcpu_lock_idx = i;
-       }
-
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (vcpu->arch.has_run_once)
-                       goto out_unlock;
-       }
-       ret = 0;
-
-       ret = init_vgic_model(kvm, type);
-       if (ret)
-               goto out_unlock;
-
-       kvm->arch.vgic.in_kernel = true;
-       kvm->arch.vgic.vgic_model = type;
-       kvm->arch.vgic.vctrl_base = vgic->vctrl_base;
-       kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
-       kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
-       kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF;
-
-out_unlock:
-       for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
-               vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
-               mutex_unlock(&vcpu->mutex);
-       }
-
-out:
-       mutex_unlock(&kvm->lock);
-       return ret;
-}
-
-static int vgic_ioaddr_overlap(struct kvm *kvm)
-{
-       phys_addr_t dist = kvm->arch.vgic.vgic_dist_base;
-       phys_addr_t cpu = kvm->arch.vgic.vgic_cpu_base;
-
-       if (IS_VGIC_ADDR_UNDEF(dist) || IS_VGIC_ADDR_UNDEF(cpu))
-               return 0;
-       if ((dist <= cpu && dist + KVM_VGIC_V2_DIST_SIZE > cpu) ||
-           (cpu <= dist && cpu + KVM_VGIC_V2_CPU_SIZE > dist))
-               return -EBUSY;
-       return 0;
-}
-
-static int vgic_ioaddr_assign(struct kvm *kvm, phys_addr_t *ioaddr,
-                             phys_addr_t addr, phys_addr_t size)
-{
-       int ret;
-
-       if (addr & ~KVM_PHYS_MASK)
-               return -E2BIG;
-
-       if (addr & (SZ_4K - 1))
-               return -EINVAL;
-
-       if (!IS_VGIC_ADDR_UNDEF(*ioaddr))
-               return -EEXIST;
-       if (addr + size < addr)
-               return -EINVAL;
-
-       *ioaddr = addr;
-       ret = vgic_ioaddr_overlap(kvm);
-       if (ret)
-               *ioaddr = VGIC_ADDR_UNDEF;
-
-       return ret;
-}
-
-/**
- * kvm_vgic_addr - set or get vgic VM base addresses
- * @kvm:   pointer to the vm struct
- * @type:  the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX
- * @addr:  pointer to address value
- * @write: if true set the address in the VM address space, if false read the
- *          address
- *
- * Set or get the vgic base addresses for the distributor and the virtual CPU
- * interface in the VM physical address space.  These addresses are properties
- * of the emulated core/SoC and therefore user space initially knows this
- * information.
- */
-int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
-{
-       int r = 0;
-       struct vgic_dist *vgic = &kvm->arch.vgic;
-       int type_needed;
-       phys_addr_t *addr_ptr, block_size;
-       phys_addr_t alignment;
-
-       mutex_lock(&kvm->lock);
-       switch (type) {
-       case KVM_VGIC_V2_ADDR_TYPE_DIST:
-               type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
-               addr_ptr = &vgic->vgic_dist_base;
-               block_size = KVM_VGIC_V2_DIST_SIZE;
-               alignment = SZ_4K;
-               break;
-       case KVM_VGIC_V2_ADDR_TYPE_CPU:
-               type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
-               addr_ptr = &vgic->vgic_cpu_base;
-               block_size = KVM_VGIC_V2_CPU_SIZE;
-               alignment = SZ_4K;
-               break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
-       case KVM_VGIC_V3_ADDR_TYPE_DIST:
-               type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
-               addr_ptr = &vgic->vgic_dist_base;
-               block_size = KVM_VGIC_V3_DIST_SIZE;
-               alignment = SZ_64K;
-               break;
-       case KVM_VGIC_V3_ADDR_TYPE_REDIST:
-               type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
-               addr_ptr = &vgic->vgic_redist_base;
-               block_size = KVM_VGIC_V3_REDIST_SIZE;
-               alignment = SZ_64K;
-               break;
-#endif
-       default:
-               r = -ENODEV;
-               goto out;
-       }
-
-       if (vgic->vgic_model != type_needed) {
-               r = -ENODEV;
-               goto out;
-       }
-
-       if (write) {
-               if (!IS_ALIGNED(*addr, alignment))
-                       r = -EINVAL;
-               else
-                       r = vgic_ioaddr_assign(kvm, addr_ptr, *addr,
-                                              block_size);
-       } else {
-               *addr = *addr_ptr;
-       }
-
-out:
-       mutex_unlock(&kvm->lock);
-       return r;
-}
-
-int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
-{
-       int r;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-               u64 addr;
-               unsigned long type = (unsigned long)attr->attr;
-
-               if (copy_from_user(&addr, uaddr, sizeof(addr)))
-                       return -EFAULT;
-
-               r = kvm_vgic_addr(dev->kvm, type, &addr, true);
-               return (r == -ENODEV) ? -ENXIO : r;
-       }
-       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
-               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-               u32 val;
-               int ret = 0;
-
-               if (get_user(val, uaddr))
-                       return -EFAULT;
-
-               /*
-                * We require:
-                * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
-                * - at most 1024 interrupts
-                * - a multiple of 32 interrupts
-                */
-               if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
-                   val > VGIC_MAX_IRQS ||
-                   (val & 31))
-                       return -EINVAL;
-
-               mutex_lock(&dev->kvm->lock);
-
-               if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_irqs)
-                       ret = -EBUSY;
-               else
-                       dev->kvm->arch.vgic.nr_irqs = val;
-
-               mutex_unlock(&dev->kvm->lock);
-
-               return ret;
-       }
-       case KVM_DEV_ARM_VGIC_GRP_CTRL: {
-               switch (attr->attr) {
-               case KVM_DEV_ARM_VGIC_CTRL_INIT:
-                       r = vgic_init(dev->kvm);
-                       return r;
-               }
-               break;
-       }
-       }
-
-       return -ENXIO;
-}
-
-int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
-{
-       int r = -ENXIO;
-
-       switch (attr->group) {
-       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-               u64 addr;
-               unsigned long type = (unsigned long)attr->attr;
-
-               r = kvm_vgic_addr(dev->kvm, type, &addr, false);
-               if (r)
-                       return (r == -ENODEV) ? -ENXIO : r;
-
-               if (copy_to_user(uaddr, &addr, sizeof(addr)))
-                       return -EFAULT;
-               break;
-       }
-       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
-               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-
-               r = put_user(dev->kvm->arch.vgic.nr_irqs, uaddr);
-               break;
-       }
-
-       }
-
-       return r;
-}
-
-int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset)
-{
-       if (vgic_find_range(ranges, 4, offset))
-               return 0;
-       else
-               return -ENXIO;
-}
-
-static int vgic_starting_cpu(unsigned int cpu)
-{
-       enable_percpu_irq(vgic->maint_irq, 0);
-       return 0;
-}
-
-static int vgic_dying_cpu(unsigned int cpu)
-{
-       disable_percpu_irq(vgic->maint_irq);
-       return 0;
-}
-
-static int kvm_vgic_probe(void)
-{
-       const struct gic_kvm_info *gic_kvm_info;
-       int ret;
-
-       gic_kvm_info = gic_get_kvm_info();
-       if (!gic_kvm_info)
-               return -ENODEV;
-
-       switch (gic_kvm_info->type) {
-       case GIC_V2:
-               ret = vgic_v2_probe(gic_kvm_info, &vgic_ops, &vgic);
-               break;
-       case GIC_V3:
-               ret = vgic_v3_probe(gic_kvm_info, &vgic_ops, &vgic);
-               break;
-       default:
-               ret = -ENODEV;
-       }
-
-       return ret;
-}
-
-int kvm_vgic_hyp_init(void)
-{
-       int ret;
-
-       ret = kvm_vgic_probe();
-       if (ret) {
-               kvm_err("error: KVM vGIC probing failed\n");
-               return ret;
-       }
-
-       ret = request_percpu_irq(vgic->maint_irq, vgic_maintenance_handler,
-                                "vgic", kvm_get_running_vcpus());
-       if (ret) {
-               kvm_err("Cannot register interrupt %d\n", vgic->maint_irq);
-               return ret;
-       }
-
-       cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_STARTING,
-                         "AP_KVM_ARM_VGIC_STARTING", vgic_starting_cpu,
-                         vgic_dying_cpu);
-       return 0;
-}
-
-int kvm_irq_map_gsi(struct kvm *kvm,
-                   struct kvm_kernel_irq_routing_entry *entries,
-                   int gsi)
-{
-       return 0;
-}
-
-int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
-{
-       return pin;
-}
-
-int kvm_set_irq(struct kvm *kvm, int irq_source_id,
-               u32 irq, int level, bool line_status)
-{
-       unsigned int spi = irq + VGIC_NR_PRIVATE_IRQS;
-
-       trace_kvm_set_irq(irq, level, irq_source_id);
-
-       BUG_ON(!vgic_initialized(kvm));
-
-       return kvm_vgic_inject_irq(kvm, 0, spi, level);
-}
-
-/* MSI not implemented yet */
-int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
-               struct kvm *kvm, int irq_source_id,
-               int level, bool line_status)
-{
-       return 0;
-}
diff --git a/virt/kvm/arm/vgic.h b/virt/kvm/arm/vgic.h
deleted file mode 100644 (file)
index 0df74cb..0000000
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (C) 2012-2014 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * Derived from virt/kvm/arm/vgic.c
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __KVM_VGIC_H__
-#define __KVM_VGIC_H__
-
-#include <kvm/iodev.h>
-
-#define VGIC_ADDR_UNDEF                (-1)
-#define IS_VGIC_ADDR_UNDEF(_x)  ((_x) == VGIC_ADDR_UNDEF)
-
-#define PRODUCT_ID_KVM         0x4b    /* ASCII code K */
-#define IMPLEMENTER_ARM                0x43b
-
-#define ACCESS_READ_VALUE      (1 << 0)
-#define ACCESS_READ_RAZ                (0 << 0)
-#define ACCESS_READ_MASK(x)    ((x) & (1 << 0))
-#define ACCESS_WRITE_IGNORED   (0 << 1)
-#define ACCESS_WRITE_SETBIT    (1 << 1)
-#define ACCESS_WRITE_CLEARBIT  (2 << 1)
-#define ACCESS_WRITE_VALUE     (3 << 1)
-#define ACCESS_WRITE_MASK(x)   ((x) & (3 << 1))
-
-#define VCPU_NOT_ALLOCATED     ((u8)-1)
-
-unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x);
-
-void vgic_update_state(struct kvm *kvm);
-int vgic_init_common_maps(struct kvm *kvm);
-
-u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset);
-u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset);
-
-void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq);
-void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq);
-void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq);
-void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
-                            int irq, int val);
-
-void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-
-bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq);
-void vgic_unqueue_irqs(struct kvm_vcpu *vcpu);
-
-struct kvm_exit_mmio {
-       phys_addr_t     phys_addr;
-       void            *data;
-       u32             len;
-       bool            is_write;
-       void            *private;
-};
-
-void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
-                    phys_addr_t offset, int mode);
-bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
-                       phys_addr_t offset);
-
-static inline
-u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask)
-{
-       return le32_to_cpu(*((u32 *)mmio->data)) & mask;
-}
-
-static inline
-void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value)
-{
-       *((u32 *)mmio->data) = cpu_to_le32(value) & mask;
-}
-
-struct vgic_io_range {
-       phys_addr_t base;
-       unsigned long len;
-       int bits_per_irq;
-       bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
-                           phys_addr_t offset);
-};
-
-int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len,
-                            const struct vgic_io_range *ranges,
-                            int redist_id,
-                            struct vgic_io_device *iodev);
-
-static inline bool is_in_range(phys_addr_t addr, unsigned long len,
-                              phys_addr_t baseaddr, unsigned long size)
-{
-       return (addr >= baseaddr) && (addr + len <= baseaddr + size);
-}
-
-const
-struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges,
-                                     int len, gpa_t offset);
-
-bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
-                           phys_addr_t offset, int vcpu_id, int access);
-
-bool vgic_handle_set_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
-                                phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_clear_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
-                                  phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_set_active_reg(struct kvm *kvm,
-                               struct kvm_exit_mmio *mmio,
-                               phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_clear_active_reg(struct kvm *kvm,
-                                 struct kvm_exit_mmio *mmio,
-                                 phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
-                        phys_addr_t offset);
-
-void vgic_kick_vcpus(struct kvm *kvm);
-
-int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset);
-int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
-int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
-
-int vgic_init(struct kvm *kvm);
-void vgic_v2_init_emulation(struct kvm *kvm);
-void vgic_v3_init_emulation(struct kvm *kvm);
-
-#endif
index 2c7f0d5..1e30ce0 100644 (file)
@@ -157,6 +157,9 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
        struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
        int i;
 
+       INIT_LIST_HEAD(&dist->lpi_list_head);
+       spin_lock_init(&dist->lpi_list_lock);
+
        dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL);
        if (!dist->spis)
                return  -ENOMEM;
@@ -177,6 +180,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
                spin_lock_init(&irq->irq_lock);
                irq->vcpu = NULL;
                irq->target_vcpu = vcpu0;
+               kref_init(&irq->refcount);
                if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
                        irq->targets = 0;
                else
@@ -211,6 +215,7 @@ static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
                irq->vcpu = NULL;
                irq->target_vcpu = vcpu;
                irq->targets = 1U << vcpu->vcpu_id;
+               kref_init(&irq->refcount);
                if (vgic_irq_is_sgi(i)) {
                        /* SGIs */
                        irq->enabled = 1;
@@ -253,6 +258,9 @@ int vgic_init(struct kvm *kvm)
        if (ret)
                goto out;
 
+       if (vgic_has_its(kvm))
+               dist->msis_require_devid = true;
+
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_vgic_vcpu_init(vcpu);
 
@@ -271,7 +279,6 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
        dist->initialized = false;
 
        kfree(dist->spis);
-       kfree(dist->redist_iodevs);
        dist->nr_spis = 0;
 
        mutex_unlock(&kvm->lock);
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
new file mode 100644 (file)
index 0000000..07411cf
--- /dev/null
@@ -0,0 +1,1500 @@
+/*
+ * GICv3 ITS emulation
+ *
+ * Copyright (C) 2015,2016 ARM Ltd.
+ * Author: Andre Przywara <andre.przywara@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/uaccess.h>
+
+#include <linux/irqchip/arm-gic-v3.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+/*
+ * Creates a new (reference to a) struct vgic_irq for a given LPI.
+ * If this LPI is already mapped on another ITS, we increase its refcount
+ * and return a pointer to the existing structure.
+ * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq.
+ * This function returns a pointer to the _unlocked_ structure, or NULL if
+ * the allocation of a new structure failed. Callers must check for NULL.
+ */
+static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq;
+
+       /* In this case there is no put, since we keep the reference. */
+       if (irq)
+               return irq;
+
+       /* Allocate outside of the spinlock, GFP_KERNEL may sleep. */
+       irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL);
+       if (!irq)
+               return NULL;
+
+       INIT_LIST_HEAD(&irq->lpi_list);
+       INIT_LIST_HEAD(&irq->ap_list);
+       spin_lock_init(&irq->irq_lock);
+
+       /* New LPIs start out edge-configured with a freshly set up kref. */
+       irq->config = VGIC_CONFIG_EDGE;
+       kref_init(&irq->refcount);
+       irq->intid = intid;
+
+       spin_lock(&dist->lpi_list_lock);
+
+       /*
+        * There could be a race with another vgic_add_lpi(), so we need to
+        * check that we don't add a second list entry with the same LPI.
+        */
+       list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) {
+               if (oldirq->intid != intid)
+                       continue;
+
+               /* Someone was faster with adding this LPI, lets use that. */
+               kfree(irq);
+               irq = oldirq;
+
+               /*
+                * This increases the refcount, the caller is expected to
+                * call vgic_put_irq() on the returned pointer once it's
+                * finished with the IRQ.
+                */
+               vgic_get_irq_kref(irq);
+
+               goto out_unlock;
+       }
+
+       /* No duplicate found: publish our new entry on the per-VM LPI list. */
+       list_add_tail(&irq->lpi_list, &dist->lpi_list_head);
+       dist->lpi_list_count++;
+
+out_unlock:
+       spin_unlock(&dist->lpi_list_lock);
+
+       return irq;
+}
+
+/* One device mapped on an ITS (created by MAPD), identified by its device ID. */
+struct its_device {
+       /* link in the owning ITS' device_list */
+       struct list_head dev_list;
+
+       /* the head for the list of ITTEs */
+       struct list_head itt_head;
+       /* the device ID this entry was mapped with */
+       u32 device_id;
+};
+
+/* Marker stored in target_addr while a collection has no target assigned. */
+#define COLLECTION_NOT_MAPPED ((u32)~0)
+
+/* One collection known to an ITS, identified by its collection ID. */
+struct its_collection {
+       /* link in the owning ITS' collection_list */
+       struct list_head coll_list;
+
+       u32 collection_id;
+       /* used as a VCPU index for kvm_get_vcpu(), or COLLECTION_NOT_MAPPED */
+       u32 target_addr;
+};
+
+/* True if coll is non-NULL and has a target assigned. */
+#define its_is_collection_mapped(coll) ((coll) && \
+                               ((coll)->target_addr != COLLECTION_NOT_MAPPED))
+
+/* An interrupt translation table entry: maps an event ID of a device to an LPI. */
+struct its_itte {
+       /* link in the owning device's itt_head list */
+       struct list_head itte_list;
+
+       /* the LPI's struct vgic_irq; holds a reference obtained by vgic_add_lpi() */
+       struct vgic_irq *irq;
+       /* the collection this entry is routed through, may be NULL */
+       struct its_collection *collection;
+       /* the LPI number this event translates to */
+       u32 lpi;
+       /* the event ID within the device */
+       u32 event_id;
+};
+
+/*
+ * Look up the device with the given device ID in the device list of an ITS.
+ * Returns NULL if no such device is mapped.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_device *find_its_device(struct vgic_its *its, u32 device_id)
+{
+       struct its_device *dev;
+
+       list_for_each_entry(dev, &its->device_list, dev_list) {
+               if (dev->device_id == device_id)
+                       return dev;
+       }
+
+       return NULL;
+}
+
+/*
+ * Look up the interrupt translation table entry (ITTE) that belongs to a
+ * Device ID/Event ID pair on an ITS. Returns NULL if either the device or
+ * the event is not mapped.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_itte *find_itte(struct vgic_its *its, u32 device_id,
+                                 u32 event_id)
+{
+       struct its_device *dev = find_its_device(its, device_id);
+       struct its_itte *entry;
+
+       if (!dev)
+               return NULL;
+
+       list_for_each_entry(entry, &dev->itt_head, itte_list) {
+               if (entry->event_id == event_id)
+                       return entry;
+       }
+
+       return NULL;
+}
+
+/*
+ * Iterates over every ITTE of every device mapped on an ITS.
+ * As this is meant to be used as a loop head (followed by the loop body),
+ * the expansion is deliberately not wrapped in parentheses or braces.
+ * Note that this expands to two nested loops, so a "break" in the body only
+ * leaves the inner (per-device) loop.
+ */
+#define for_each_lpi_its(dev, itte, its) \
+       list_for_each_entry(dev, &(its)->device_list, dev_list) \
+               list_for_each_entry(itte, &(dev)->itt_head, itte_list)
+
+/*
+ * We only implement 48 bits of PA at the moment, although the ITS
+ * supports more. Let's be restrictive here.
+ * Each macro extracts the address bits from the respective register value:
+ * bits [47:16] for BASER and PENDBASER, bits [47:12] for CBASER and
+ * PROPBASER.
+ */
+#define BASER_ADDRESS(x)       ((x) & GENMASK_ULL(47, 16))
+#define CBASER_ADDRESS(x)      ((x) & GENMASK_ULL(47, 12))
+#define PENDBASER_ADDRESS(x)   ((x) & GENMASK_ULL(47, 16))
+#define PROPBASER_ADDRESS(x)   ((x) & GENMASK_ULL(47, 12))
+
+/*
+ * Lowest interrupt ID accepted as an LPI in this file; also the INTID that
+ * corresponds to the first byte of the LPI property table.
+ */
+#define GIC_LPI_OFFSET 8192
+
+/*
+ * Look up a collection by its ID in the collection list of an ITS.
+ * Returns NULL if the ITS does not know this collection.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_collection *find_collection(struct vgic_its *its, int coll_id)
+{
+       struct its_collection *coll;
+
+       list_for_each_entry(coll, &its->collection_list, coll_list)
+               if (coll->collection_id == coll_id)
+                       return coll;
+
+       return NULL;
+}
+
+/*
+ * Decode one byte of the LPI property table: the enable bit and the
+ * priority, which is kept in place in bits [7:2] (mask 0xfc).
+ */
+#define LPI_PROP_ENABLE_BIT(p) ((p) & LPI_PROP_ENABLED)
+#define LPI_PROP_PRIORITY(p)   ((p) & 0xfc)
+
+/*
+ * Fetches the one-byte property table entry of an LPI from guest memory
+ * and copies priority and enable state into its struct vgic_irq.
+ * With a non-NULL filter_vcpu the update is only applied if the IRQ
+ * currently targets that VCPU; with a NULL filter_vcpu it always applies.
+ * Returns 0 on success or the error reported by kvm_read_guest().
+ */
+static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
+                            struct kvm_vcpu *filter_vcpu)
+{
+       u64 propbase = PROPBASER_ADDRESS(kvm->arch.vgic.propbaser);
+       gpa_t entry_addr = propbase + irq->intid - GIC_LPI_OFFSET;
+       u8 cfg;
+       int err;
+
+       err = kvm_read_guest(kvm, entry_addr, &cfg, 1);
+       if (err)
+               return err;
+
+       spin_lock(&irq->irq_lock);
+
+       /* Not for this VCPU: leave the IRQ untouched. */
+       if (filter_vcpu && filter_vcpu != irq->target_vcpu) {
+               spin_unlock(&irq->irq_lock);
+               return 0;
+       }
+
+       irq->priority = LPI_PROP_PRIORITY(cfg);
+       irq->enabled = LPI_PROP_ENABLE_BIT(cfg);
+
+       /* Expected to drop irq_lock for us, as the name implies. */
+       vgic_queue_irq_unlock(kvm, irq);
+
+       return 0;
+}
+
+/*
+ * Create a snapshot of the current LPI list, so that we can enumerate all
+ * LPIs without holding any lock.
+ * Returns the number of INTIDs actually stored in the array (which may be
+ * less than the list length sampled up front) or -ENOMEM, and puts the
+ * kmalloc'ed array into intid_ptr. The caller is responsible for kfree()ing
+ * that array.
+ */
+static int vgic_copy_lpi_list(struct kvm *kvm, u32 **intid_ptr)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct vgic_irq *irq;
+       u32 *intids;
+       int irq_count = dist->lpi_list_count, i = 0;
+
+       /*
+        * We use the current value of the list length, which may change
+        * after the kmalloc. We don't care, because the guest shouldn't
+        * change anything while the command handling is still running,
+        * and in the worst case we would miss a new IRQ, which one wouldn't
+        * expect to be covered by this command anyway.
+        */
+       intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL);
+       if (!intids)
+               return -ENOMEM;
+
+       spin_lock(&dist->lpi_list_lock);
+       list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+               /*
+                * Check the bound _before_ storing: the list may have grown
+                * since we sampled its length (which may even have been 0),
+                * and we must never write beyond the allocated array.
+                */
+               if (i == irq_count)
+                       break;
+
+               /* We don't need to "get" the IRQ, as we hold the list lock. */
+               intids[i++] = irq->intid;
+       }
+       spin_unlock(&dist->lpi_list_lock);
+
+       *intid_ptr = intids;
+
+       /*
+        * Report what we really copied: if the list shrank in the meantime,
+        * the tail of the array is uninitialized and must not be used.
+        */
+       return i;
+}
+
+/*
+ * Translates the ITS notion of affinity for an ITTE (the collection and its
+ * target) into the VGIC's notion, which is the target VCPU of the IRQ.
+ * Has to be called whenever an LPI is assigned to a different collection
+ * or when the collection itself is retargeted.
+ * Does nothing if the ITTE's collection is not mapped.
+ */
+static void update_affinity_itte(struct kvm *kvm, struct its_itte *itte)
+{
+       struct vgic_irq *irq = itte->irq;
+       struct kvm_vcpu *target;
+
+       if (!its_is_collection_mapped(itte->collection))
+               return;
+
+       target = kvm_get_vcpu(kvm, itte->collection->target_addr);
+
+       spin_lock(&irq->irq_lock);
+       irq->target_vcpu = target;
+       spin_unlock(&irq->irq_lock);
+}
+
+/*
+ * Re-evaluates the target VCPU of every LPI on this ITS that is routed
+ * through the given collection.
+ * Must be called with the its_lock mutex held.
+ */
+static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its,
+                                      struct its_collection *coll)
+{
+       struct its_device *dev;
+       struct its_itte *entry;
+
+       for_each_lpi_its(dev, entry, its) {
+               if (entry->collection && entry->collection == coll)
+                       update_affinity_itte(kvm, entry);
+       }
+}
+
+/*
+ * Returns the exclusive upper bound for interrupt IDs as described by the
+ * low five bits of PROPBASER (number of ID bits minus one), capped at
+ * INTERRUPT_ID_BITS_ITS bits.
+ */
+static u32 max_lpis_propbaser(u64 propbaser)
+{
+       int id_bits = (propbaser & 0x1f) + 1;
+
+       if (id_bits > INTERRUPT_ID_BITS_ITS)
+               id_bits = INTERRUPT_ID_BITS_ITS;
+
+       return 1U << id_bits;
+}
+
+/*
+ * Scan the whole LPI pending table and sync the pending bit in there
+ * with our own data structures. This relies on the LPI being
+ * mapped before.
+ * Returns 0 on success, -ENOMEM if the LPI snapshot could not be allocated,
+ * or the error reported by kvm_read_guest().
+ */
+static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
+{
+       gpa_t pendbase = PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
+       struct vgic_irq *irq;
+       int last_byte_offset = -1;
+       int ret = 0;
+       u32 *intids;
+       int nr_irqs, i;
+       /*
+        * Lives across loop iterations on purpose: the byte read for one LPI
+        * is reused for the following LPIs that share the same byte.
+        */
+       u8 pendmask = 0;
+
+       nr_irqs = vgic_copy_lpi_list(vcpu->kvm, &intids);
+       if (nr_irqs < 0)
+               return nr_irqs;
+
+       for (i = 0; i < nr_irqs; i++) {
+               int byte_offset, bit_nr;
+
+               byte_offset = intids[i] / BITS_PER_BYTE;
+               bit_nr = intids[i] % BITS_PER_BYTE;
+
+               /*
+                * For contiguously allocated LPIs chances are we just read
+                * this very same byte in the last iteration. Reuse that.
+                */
+               if (byte_offset != last_byte_offset) {
+                       ret = kvm_read_guest(vcpu->kvm, pendbase + byte_offset,
+                                            &pendmask, 1);
+                       if (ret)
+                               break;
+                       last_byte_offset = byte_offset;
+               }
+
+               /*
+                * We work on a lockless snapshot, so the LPI may have been
+                * unmapped since it was taken. Nothing to sync in that case.
+                */
+               irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
+               if (!irq)
+                       continue;
+
+               spin_lock(&irq->irq_lock);
+               irq->pending = pendmask & (1U << bit_nr);
+               vgic_queue_irq_unlock(vcpu->kvm, irq);
+               /* Drop the reference taken by vgic_get_irq() above. */
+               vgic_put_irq(vcpu->kvm, irq);
+       }
+
+       kfree(intids);
+
+       return ret;
+}
+
+/*
+ * MMIO read handler for GITS_CTLR: reports the enable bit and flags the ITS
+ * as quiescent when the command queue read and write pointers match.
+ * Both are sampled under the cmd_lock.
+ */
+static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu,
+                                            struct vgic_its *its,
+                                            gpa_t addr, unsigned int len)
+{
+       u32 ctlr = 0;
+
+       mutex_lock(&its->cmd_lock);
+       if (its->enabled)
+               ctlr |= GITS_CTLR_ENABLE;
+       if (its->creadr == its->cwriter)
+               ctlr |= GITS_CTLR_QUIESCENT;
+       mutex_unlock(&its->cmd_lock);
+
+       return ctlr;
+}
+
+/*
+ * MMIO write handler for GITS_CTLR: only the enable bit is evaluated,
+ * everything else in the written value is ignored.
+ */
+static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       its->enabled = (val & GITS_CTLR_ENABLE) != 0;
+}
+
+/*
+ * MMIO read handler for GITS_TYPER. Returns the bytes of the (constant)
+ * register value that are selected by the access offset and length.
+ */
+static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm,
+                                             struct vgic_its *its,
+                                             gpa_t addr, unsigned int len)
+{
+       /*
+        * We use linear CPU numbers for redistributor addressing,
+        * so GITS_TYPER.PTA is 0.
+        * Also we force all PROPBASER registers to be the same, so
+        * CommonLPIAff is 0 as well.
+        * To avoid memory waste in the guest, we keep the number of IDBits and
+        * DevBits low - at least for the time being.
+        */
+       u64 typer = GITS_TYPER_PLPIS;
+
+       typer |= 0x0f << GITS_TYPER_IDBITS_SHIFT;
+       typer |= 0x0f << GITS_TYPER_DEVBITS_SHIFT;
+
+       return extract_bytes(typer, addr & 7, len);
+}
+
+/*
+ * MMIO read handler for GITS_IIDR: returns a constant made of the KVM
+ * product ID in bits [31:24] and the ARM implementer code in the low bits.
+ */
+static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm,
+                                            struct vgic_its *its,
+                                            gpa_t addr, unsigned int len)
+{
+       return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+}
+
+/*
+ * MMIO read handler for the ITS identification registers. The register is
+ * selected by the low 16 bits of the address; registers not listed here
+ * read as zero.
+ */
+static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
+                                              struct vgic_its *its,
+                                              gpa_t addr, unsigned int len)
+{
+       unsigned long val = 0;
+
+       switch (addr & 0xffff) {
+       case GITS_PIDR0:
+               val = 0x92;     /* part number, bits[7:0] */
+               break;
+       case GITS_PIDR1:
+               val = 0xb4;     /* part number, bits[11:8] */
+               break;
+       case GITS_PIDR2:
+               val = GIC_PIDR2_ARCH_GICv3 | 0x0b;
+               break;
+       case GITS_PIDR4:
+               val = 0x40;     /* This is a 64K software visible page */
+               break;
+       /* The following are the ID registers for (any) GIC. */
+       case GITS_CIDR0:
+               val = 0x0d;
+               break;
+       case GITS_CIDR1:
+               val = 0xf0;
+               break;
+       case GITS_CIDR2:
+               val = 0x05;
+               break;
+       case GITS_CIDR3:
+               val = 0xb1;
+               break;
+       }
+
+       return val;
+}
+
+/*
+ * Translate a devid/eventid pair into its LPI and target VCPU, mark that
+ * LPI as pending and hand it to the VGIC for queueing.
+ * The MSI is silently dropped if the ITS is disabled, the event or its
+ * collection is not mapped, or the target VCPU has LPIs disabled.
+ * Must be called with the its_lock mutex held.
+ */
+static void vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
+                                u32 devid, u32 eventid)
+{
+       struct its_itte *itte;
+       struct kvm_vcpu *vcpu;
+
+       if (!its->enabled)
+               return;
+
+       /* Triggering an unmapped IRQ gets silently dropped. */
+       itte = find_itte(its, devid, eventid);
+       if (!itte || !its_is_collection_mapped(itte->collection))
+               return;
+
+       vcpu = kvm_get_vcpu(kvm, itte->collection->target_addr);
+       if (!vcpu || !vcpu->arch.vgic_cpu.lpis_enabled)
+               return;
+
+       spin_lock(&itte->irq->irq_lock);
+       itte->irq->pending = true;
+       vgic_queue_irq_unlock(kvm, itte->irq);
+}
+
+/*
+ * Queries the KVM IO bus framework to get the ITS pointer from the given
+ * doorbell address.
+ * We then call vgic_its_trigger_msi() with the decoded data.
+ *
+ * Returns -ENODEV if the VM has no ITS or no device is registered at the
+ * doorbell address, -EINVAL if the MSI does not carry a valid device ID,
+ * and 0 otherwise (even if the MSI ends up being dropped).
+ */
+int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+       u64 address;
+       struct kvm_io_device *kvm_io_dev;
+       struct vgic_io_device *iodev;
+
+       if (!vgic_has_its(kvm))
+               return -ENODEV;
+
+       if (!(msi->flags & KVM_MSI_VALID_DEVID))
+               return -EINVAL;
+
+       /* Assemble the 64-bit doorbell address from its two halves. */
+       address = (u64)msi->address_hi << 32 | msi->address_lo;
+
+       kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address);
+       if (!kvm_io_dev)
+               return -ENODEV;
+
+       /*
+        * NOTE(review): this assumes that whatever device sits at the given
+        * address is an ITS frame - confirm that non-ITS MMIO devices cannot
+        * be returned here before iodev->its is dereferenced.
+        */
+       iodev = container_of(kvm_io_dev, struct vgic_io_device, dev);
+
+       /* The translation tables are protected by the its_lock. */
+       mutex_lock(&iodev->its->its_lock);
+       vgic_its_trigger_msi(kvm, iodev->its, msi->devid, msi->data);
+       mutex_unlock(&iodev->its->its_lock);
+
+       return 0;
+}
+
+/*
+ * Unlinks an ITTE from its device's list, drops the reference it holds on
+ * the LPI and frees the ITTE itself.
+ * Requires the its_lock to be held.
+ */
+static void its_free_itte(struct kvm *kvm, struct its_itte *itte)
+{
+       list_del(&itte->itte_list);
+
+       /* This put matches the get in vgic_add_lpi. */
+       vgic_put_irq(kvm, itte->irq);
+
+       kfree(itte);
+}
+
+/*
+ * Extracts a bit field from an ITS command: takes the little-endian 64-bit
+ * word number "word" of the command, shifts it right by "shift" bits and
+ * keeps the lowest "size" bits. "size" must be smaller than 64.
+ */
+static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size)
+{
+       return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1);
+}
+
+/* Accessors for the command fields, given as (word, shift, size). */
+#define its_cmd_get_command(cmd)       its_cmd_mask_field(cmd, 0,  0,  8)
+#define its_cmd_get_deviceid(cmd)      its_cmd_mask_field(cmd, 0, 32, 32)
+#define its_cmd_get_id(cmd)            its_cmd_mask_field(cmd, 1,  0, 32)
+#define its_cmd_get_physical_id(cmd)   its_cmd_mask_field(cmd, 1, 32, 32)
+#define its_cmd_get_collection(cmd)    its_cmd_mask_field(cmd, 2,  0, 16)
+#define its_cmd_get_target_addr(cmd)   its_cmd_mask_field(cmd, 2, 16, 32)
+#define its_cmd_get_validbit(cmd)      its_cmd_mask_field(cmd, 2, 63,  1)
+
+/*
+ * The DISCARD command frees an Interrupt Translation Table Entry (ITTE).
+ * Returns 0 on success or E_ITS_DISCARD_UNMAPPED_INTERRUPT if there is no
+ * ITTE for the device ID/event ID pair or it has no collection assigned.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its,
+                                      u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       u32 event_id = its_cmd_get_id(its_cmd);
+       struct its_itte *itte = find_itte(its, device_id, event_id);
+
+       if (!itte || !itte->collection)
+               return E_ITS_DISCARD_UNMAPPED_INTERRUPT;
+
+       /*
+        * Though the spec talks about removing the pending state, we
+        * don't bother here since we clear the ITTE anyway and the
+        * pending state is a property of the ITTE struct.
+        */
+       its_free_itte(kvm, itte);
+
+       return 0;
+}
+
+/*
+ * The MOVI command moves an ITTE to a different collection.
+ * Both the ITTE's current collection and the new one have to be mapped,
+ * otherwise an ITS error code is returned and nothing is changed.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
+                                   u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       u32 event_id = its_cmd_get_id(its_cmd);
+       u32 coll_id = its_cmd_get_collection(its_cmd);
+       struct its_collection *target;
+       struct its_itte *itte;
+
+       itte = find_itte(its, device_id, event_id);
+       if (!itte)
+               return E_ITS_MOVI_UNMAPPED_INTERRUPT;
+
+       if (!its_is_collection_mapped(itte->collection))
+               return E_ITS_MOVI_UNMAPPED_COLLECTION;
+
+       target = find_collection(its, coll_id);
+       if (!its_is_collection_mapped(target))
+               return E_ITS_MOVI_UNMAPPED_COLLECTION;
+
+       /* Switch the collection and propagate its target VCPU to the IRQ. */
+       itte->collection = target;
+       update_affinity_itte(kvm, itte);
+
+       return 0;
+}
+
+/*
+ * Check whether an ID can be stored into the corresponding guest table.
+ * For a direct table this is pretty easy, but gets a bit nasty for
+ * indirect tables. We check whether the resulting guest physical address
+ * is actually valid (covered by a memslot and guest accessible).
+ * For this we have to read the respective first level entry.
+ *
+ * "baser" is the value of the GITS_BASER register describing the table.
+ * Returns true if the entry for "id" lies within the table and its guest
+ * page is visible, false otherwise.
+ */
+static bool vgic_its_check_id(struct vgic_its *its, u64 baser, int id)
+{
+       int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
+       int index;
+       u64 indirect_ptr;
+       gfn_t gfn;
+
+       /* Flat table: a range check plus a memslot lookup is all we need. */
+       if (!(baser & GITS_BASER_INDIRECT)) {
+               phys_addr_t addr;
+
+               if (id >= (l1_tbl_size / GITS_BASER_ENTRY_SIZE(baser)))
+                       return false;
+
+               addr = BASER_ADDRESS(baser) + id * GITS_BASER_ENTRY_SIZE(baser);
+               gfn = addr >> PAGE_SHIFT;
+
+               return kvm_is_visible_gfn(its->dev->kvm, gfn);
+       }
+
+       /* calculate and check the index into the 1st level */
+       index = id / (SZ_64K / GITS_BASER_ENTRY_SIZE(baser));
+       if (index >= (l1_tbl_size / sizeof(u64)))
+               return false;
+
+       /* Each 1st level entry is represented by a 64-bit value. */
+       if (kvm_read_guest(its->dev->kvm,
+                          BASER_ADDRESS(baser) + index * sizeof(indirect_ptr),
+                          &indirect_ptr, sizeof(indirect_ptr)))
+               return false;
+
+       /* The guest stores the table entries in little-endian format. */
+       indirect_ptr = le64_to_cpu(indirect_ptr);
+
+       /* check the valid bit of the first level entry */
+       if (!(indirect_ptr & BIT_ULL(63)))
+               return false;
+
+       /*
+        * Mask the guest physical address and calculate the frame number.
+        * Any address beyond our supported 48 bits of PA will be caught
+        * by the actual check in the final step.
+        */
+       indirect_ptr &= GENMASK_ULL(51, 16);
+
+       /* Find the address of the actual entry */
+       index = id % (SZ_64K / GITS_BASER_ENTRY_SIZE(baser));
+       indirect_ptr += index * GITS_BASER_ENTRY_SIZE(baser);
+       gfn = indirect_ptr >> PAGE_SHIFT;
+
+       return kvm_is_visible_gfn(its->dev->kvm, gfn);
+}
+
+/*
+ * Allocates a new, not yet mapped collection with the given ID and adds it
+ * to the collection list of the ITS. On success the new collection is
+ * returned in *colp.
+ * Returns 0 on success, E_ITS_MAPC_COLLECTION_OOR if the ID does not fit
+ * into the guest's collection table, or -ENOMEM if the allocation failed.
+ * In the error cases *colp is left untouched.
+ */
+static int vgic_its_alloc_collection(struct vgic_its *its,
+                                    struct its_collection **colp,
+                                    u32 coll_id)
+{
+       struct its_collection *collection;
+
+       if (!vgic_its_check_id(its, its->baser_coll_table, coll_id))
+               return E_ITS_MAPC_COLLECTION_OOR;
+
+       collection = kzalloc(sizeof(*collection), GFP_KERNEL);
+       if (!collection)
+               return -ENOMEM;
+
+       collection->collection_id = coll_id;
+       collection->target_addr = COLLECTION_NOT_MAPPED;
+
+       list_add_tail(&collection->coll_list, &its->collection_list);
+       *colp = collection;
+
+       return 0;
+}
+
+/*
+ * Removes the collection with the given ID from the ITS: every ITTE that
+ * still refers to a collection with this ID gets its collection pointer
+ * cleared, then the collection is unlinked and freed.
+ */
+static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id)
+{
+       struct its_collection *coll = find_collection(its, coll_id);
+       struct its_device *dev;
+       struct its_itte *entry;
+
+       /*
+        * Clearing the mapping for that collection ID removes the
+        * entry from the list. If there wasn't any before, we can
+        * go home early.
+        */
+       if (!coll)
+               return;
+
+       for_each_lpi_its(dev, entry, its) {
+               if (!entry->collection)
+                       continue;
+
+               if (entry->collection->collection_id == coll_id)
+                       entry->collection = NULL;
+       }
+
+       list_del(&coll->coll_list);
+       kfree(coll);
+}
+
+/*
+ * The MAPTI and MAPI commands map LPIs to ITTEs.
+ * For MAPTI the LPI number is taken from the command's physical ID field,
+ * for MAPI the event ID doubles as the LPI number.
+ * Returns 0 on success, an ITS error code if the device is not mapped or
+ * the LPI number or collection ID is out of range, or -ENOMEM if one of
+ * the allocations failed. On failure nothing allocated here is left behind.
+ * Must be called with its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
+                                   u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       u32 event_id = its_cmd_get_id(its_cmd);
+       u32 coll_id = its_cmd_get_collection(its_cmd);
+       struct its_itte *itte, *new_itte = NULL;
+       struct its_device *device;
+       struct its_collection *collection, *new_coll = NULL;
+       struct vgic_irq *irq;
+       int lpi_nr;
+
+       device = find_its_device(its, device_id);
+       if (!device)
+               return E_ITS_MAPTI_UNMAPPED_DEVICE;
+
+       if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI)
+               lpi_nr = its_cmd_get_physical_id(its_cmd);
+       else
+               lpi_nr = event_id;
+       if (lpi_nr < GIC_LPI_OFFSET ||
+           lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser))
+               return E_ITS_MAPTI_PHYSICALID_OOR;
+
+       collection = find_collection(its, coll_id);
+       if (!collection) {
+               int ret = vgic_its_alloc_collection(its, &collection, coll_id);
+               if (ret)
+                       return ret;
+               new_coll = collection;
+       }
+
+       itte = find_itte(its, device_id, event_id);
+       if (!itte) {
+               new_itte = kzalloc(sizeof(struct its_itte), GFP_KERNEL);
+               if (!new_itte)
+                       goto out_free_coll;
+
+               new_itte->event_id = event_id;
+               itte = new_itte;
+       }
+
+       /*
+        * vgic_add_lpi() returns NULL if it cannot allocate the IRQ, so get
+        * hold of the IRQ before touching any state that is visible to
+        * others. A new ITTE is only added to the device once we know that
+        * it will have a valid IRQ pointer.
+        */
+       irq = vgic_add_lpi(kvm, lpi_nr);
+       if (!irq) {
+               kfree(new_itte);
+               goto out_free_coll;
+       }
+
+       if (new_itte)
+               list_add_tail(&new_itte->itte_list, &device->itt_head);
+
+       /*
+        * NOTE(review): if the ITTE existed before, the reference it held
+        * on its previous IRQ looks like it is never dropped - confirm and
+        * either put it here or reject remapping an already mapped event.
+        */
+       itte->collection = collection;
+       itte->lpi = lpi_nr;
+       itte->irq = irq;
+       update_affinity_itte(kvm, itte);
+
+       /*
+        * We "cache" the configuration table entries in our struct vgic_irq's.
+        * However we only have those structs for mapped IRQs, so we read in
+        * the respective config data from memory here upon mapping the LPI.
+        */
+       update_lpi_config(kvm, itte->irq, NULL);
+
+       return 0;
+
+out_free_coll:
+       /* Only undo the collection if this very command created it. */
+       if (new_coll)
+               vgic_its_free_collection(its, coll_id);
+
+       return -ENOMEM;
+}
+
+/*
+ * Frees all ITTEs of a device, then unlinks the device from the ITS'
+ * device list and frees it.
+ * Requires the its_lock to be held.
+ */
+static void vgic_its_unmap_device(struct kvm *kvm, struct its_device *device)
+{
+       struct its_itte *itte, *temp;
+
+       /*
+        * The spec says that unmapping a device with still valid
+        * ITTEs associated is UNPREDICTABLE. We remove all ITTEs,
+        * since we cannot leave the memory unreferenced.
+        */
+       /* The _safe iterator is needed as its_free_itte() unlinks the entry. */
+       list_for_each_entry_safe(itte, temp, &device->itt_head, itte_list)
+               its_free_itte(kvm, itte);
+
+       list_del(&device->dev_list);
+       kfree(device);
+}
+
+/*
+ * MAPD: map a device ID to Interrupt Translation Tables (ITTs), or unmap it.
+ * The caller must hold the its_lock mutex.
+ */
+static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
+                                   u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       bool valid = its_cmd_get_validbit(its_cmd);
+       struct its_device *old_dev, *new_dev;
+
+       if (!vgic_its_check_id(its, its->baser_device_table, device_id))
+               return E_ITS_MAPD_DEVICE_OOR;
+
+       /*
+        * According to the spec, a MAPD on an already mapped device
+        * invalidates all data cached for that device. This is done by
+        * dropping the existing mapping first and creating a new one below.
+        */
+       old_dev = find_its_device(its, device_id);
+       if (old_dev)
+               vgic_its_unmap_device(kvm, old_dev);
+
+       /*
+        * For an unmap request the work ends here. The spec does not say
+        * whether unmapping a device that was not mapped is an error, so
+        * that case succeeds as well.
+        */
+       if (!valid)
+               return 0;
+
+       new_dev = kzalloc(sizeof(*new_dev), GFP_KERNEL);
+       if (!new_dev)
+               return -ENOMEM;
+
+       INIT_LIST_HEAD(&new_dev->itt_head);
+       new_dev->device_id = device_id;
+
+       list_add_tail(&new_dev->dev_list, &its->device_list);
+
+       return 0;
+}
+
+/*
+ * MAPC: map a collection ID to a redistributor, or release the collection
+ * when the valid bit of the command is clear.
+ * The caller must hold the its_lock mutex.
+ */
+static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
+                                   u64 *its_cmd)
+{
+       bool valid = its_cmd_get_validbit(its_cmd);
+       u16 coll_id = its_cmd_get_collection(its_cmd);
+       u32 target_addr = its_cmd_get_target_addr(its_cmd);
+       struct its_collection *coll;
+       int err;
+
+       /* The target is range-checked for map and unmap requests alike. */
+       if (target_addr >= atomic_read(&kvm->online_vcpus))
+               return E_ITS_MAPC_PROCNUM_OOR;
+
+       if (!valid) {
+               vgic_its_free_collection(its, coll_id);
+               return 0;
+       }
+
+       coll = find_collection(its, coll_id);
+       if (coll) {
+               /* Known collection: retarget it, then update the affinity. */
+               coll->target_addr = target_addr;
+               update_affinity_collection(kvm, its, coll);
+               return 0;
+       }
+
+       /* Unknown collection: allocate it and record the target. */
+       err = vgic_its_alloc_collection(its, &coll, coll_id);
+       if (err)
+               return err;
+
+       coll->target_addr = target_addr;
+
+       return 0;
+}
+
+/*
+ * CLEAR: drop the pending state of the LPI mapped to a DevID/EvID pair.
+ * The caller must hold the its_lock mutex.
+ */
+static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its,
+                                    u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       u32 event_id = its_cmd_get_id(its_cmd);
+       struct its_itte *entry = find_itte(its, device_id, event_id);
+
+       /* No translation entry means the interrupt was never mapped. */
+       if (!entry)
+               return E_ITS_CLEAR_UNMAPPED_INTERRUPT;
+
+       entry->irq->pending = false;
+
+       return 0;
+}
+
+/*
+ * INV: re-read the configuration bits of one LPI from the memory table.
+ * The caller must hold the its_lock mutex.
+ */
+static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its,
+                                  u64 *its_cmd)
+{
+       u32 device_id = its_cmd_get_deviceid(its_cmd);
+       u32 event_id = its_cmd_get_id(its_cmd);
+       struct its_itte *entry = find_itte(its, device_id, event_id);
+
+       if (entry)
+               return update_lpi_config(kvm, entry->irq, NULL);
+
+       return E_ITS_INV_UNMAPPED_INTERRUPT;
+}
+
+/*
+ * INVALL: flush all IRQ data of one collection.
+ * The VCPU the collection is mapped to is looked up first. Then a copy of
+ * the VM's list of mapped LPIs is walked and update_lpi_config() is called
+ * for each of them with that VCPU as filter, so that the configuration is
+ * read again from the in-memory configuration table.
+ * The caller must hold the its_lock mutex.
+ */
+static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its,
+                                     u64 *its_cmd)
+{
+       u32 coll_id = its_cmd_get_collection(its_cmd);
+       struct its_collection *coll = find_collection(its, coll_id);
+       struct kvm_vcpu *vcpu;
+       u32 *lpi_ids;
+       int nr_lpis, idx;
+
+       if (!its_is_collection_mapped(coll))
+               return E_ITS_INVALL_UNMAPPED_COLLECTION;
+
+       vcpu = kvm_get_vcpu(kvm, coll->target_addr);
+
+       /* A negative count is an error code and is passed on unchanged. */
+       nr_lpis = vgic_copy_lpi_list(kvm, &lpi_ids);
+       if (nr_lpis < 0)
+               return nr_lpis;
+
+       for (idx = 0; idx < nr_lpis; idx++) {
+               struct vgic_irq *irq = vgic_get_irq(kvm, NULL, lpi_ids[idx]);
+
+               if (irq) {
+                       update_lpi_config(kvm, irq, vcpu);
+                       vgic_put_irq(kvm, irq);
+               }
+       }
+
+       kfree(lpi_ids);
+
+       return 0;
+}
+
+/*
+ * MOVALL: move the pending state of all IRQs from one redistributor to
+ * another. The pending state is kept in the IRQs and not in the VCPUs, so
+ * there is little to do. The spec however demands that no IRQ targets the
+ * old redistributor afterwards, so every LPI in the system which uses the
+ * first target VCPU is switched over to the second one.
+ */
+static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
+                                     u64 *its_cmd)
+{
+       u32 target1_addr = its_cmd_get_target_addr(its_cmd);
+       u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32);
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct kvm_vcpu *from, *to;
+       struct vgic_irq *lpi;
+
+       if (target1_addr >= atomic_read(&kvm->online_vcpus))
+               return E_ITS_MOVALL_PROCNUM_OOR;
+       if (target2_addr >= atomic_read(&kvm->online_vcpus))
+               return E_ITS_MOVALL_PROCNUM_OOR;
+
+       /* Source and destination are identical: nothing would change. */
+       if (target1_addr == target2_addr)
+               return 0;
+
+       from = kvm_get_vcpu(kvm, target1_addr);
+       to = kvm_get_vcpu(kvm, target2_addr);
+
+       spin_lock(&dist->lpi_list_lock);
+       list_for_each_entry(lpi, &dist->lpi_list_head, lpi_list) {
+               spin_lock(&lpi->irq_lock);
+               if (lpi->target_vcpu == from)
+                       lpi->target_vcpu = to;
+               spin_unlock(&lpi->irq_lock);
+       }
+       spin_unlock(&dist->lpi_list_lock);
+
+       return 0;
+}
+
+/*
+ * INT: inject the LPI which is associated with the DevID/EvID pair.
+ * The caller must hold the its_lock mutex.
+ */
+static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its,
+                                  u64 *its_cmd)
+{
+       u64 devid = its_cmd_get_deviceid(its_cmd);
+       u32 eventid = its_cmd_get_id(its_cmd);
+
+       /* The result of the injection is not looked at; INT reports 0. */
+       vgic_its_trigger_msi(kvm, its, devid, eventid);
+
+       return 0;
+}
+
+/*
+ * Dispatch a single ITS command to its handler.
+ * This is called with the its_cmd lock held, but with the ITS data structure
+ * lock dropped; that lock is taken here around the handler call.
+ * Returns what the handler returned, or -ENODEV if no case matches the
+ * command.
+ */
+static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its,
+                                  u64 *its_cmd)
+{
+       int ret;
+
+       mutex_lock(&its->its_lock);
+       switch (its_cmd_get_command(its_cmd)) {
+       case GITS_CMD_MAPD:
+               ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_MAPC:
+               ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_MAPI:
+       case GITS_CMD_MAPTI:
+               /* One handler serves both; it checks for MAPTI itself. */
+               ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_MOVI:
+               ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_DISCARD:
+               ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_CLEAR:
+               ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_MOVALL:
+               ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_INT:
+               ret = vgic_its_cmd_handle_int(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_INV:
+               ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_INVALL:
+               ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd);
+               break;
+       case GITS_CMD_SYNC:
+               /* SYNC is ignored: we are in sync all of the time. */
+               ret = 0;
+               break;
+       default:
+               ret = -ENODEV;
+               break;
+       }
+       mutex_unlock(&its->its_lock);
+
+       return ret;
+}
+
+/*
+ * Sanitise a GITS_BASER value: run the shareability and both cacheability
+ * fields through their sanitising callbacks, clear bits 15:12 and force the
+ * page size field to 64K.
+ */
+static u64 vgic_sanitise_its_baser(u64 reg)
+{
+       u64 val = reg;
+
+       val = vgic_sanitise_field(val, GITS_BASER_SHAREABILITY_MASK,
+                                 GITS_BASER_SHAREABILITY_SHIFT,
+                                 vgic_sanitise_shareability);
+       val = vgic_sanitise_field(val, GITS_BASER_INNER_CACHEABILITY_MASK,
+                                 GITS_BASER_INNER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_inner_cacheability);
+       val = vgic_sanitise_field(val, GITS_BASER_OUTER_CACHEABILITY_MASK,
+                                 GITS_BASER_OUTER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_outer_cacheability);
+
+       /* Bits 15:12 hold bits 51:48 of the PA, which are not supported. */
+       val &= ~GENMASK_ULL(15, 12);
+
+       /* 64K is the only (ITS) page size on offer. */
+       val &= ~GITS_BASER_PAGE_SIZE_MASK;
+       val |= GITS_BASER_PAGE_SIZE_64K;
+
+       return val;
+}
+
+/*
+ * Sanitise a GITS_CBASER value: run the shareability and both cacheability
+ * fields through their sanitising callbacks and clear the address bits
+ * which are not supported.
+ */
+static u64 vgic_sanitise_its_cbaser(u64 reg)
+{
+       u64 val = reg;
+
+       val = vgic_sanitise_field(val, GITS_CBASER_SHAREABILITY_MASK,
+                                 GITS_CBASER_SHAREABILITY_SHIFT,
+                                 vgic_sanitise_shareability);
+       val = vgic_sanitise_field(val, GITS_CBASER_INNER_CACHEABILITY_MASK,
+                                 GITS_CBASER_INNER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_inner_cacheability);
+       val = vgic_sanitise_field(val, GITS_CBASER_OUTER_CACHEABILITY_MASK,
+                                 GITS_CBASER_OUTER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_outer_cacheability);
+
+       /* Clearing bits 15:12 makes the physical address 64k aligned. */
+       val &= ~GENMASK_ULL(15, 12);
+       /* Clearing bits 51:48 limits the physical address to 48 bits. */
+       val &= ~GENMASK_ULL(51, 48);
+
+       return val;
+}
+
+/* MMIO read of GITS_CBASER: return the accessed bytes of the stored value. */
+static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm,
+                                              struct vgic_its *its,
+                                              gpa_t addr, unsigned int len)
+{
+       unsigned int byte_off = addr & 7;
+
+       return extract_bytes(its->cbaser, byte_off, len);
+}
+
+/*
+ * MMIO write to GITS_CBASER: merge the written bytes into the register,
+ * sanitise the result and reset both command queue pointers.
+ */
+static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its,
+                                      gpa_t addr, unsigned int len,
+                                      unsigned long val)
+{
+       u64 merged;
+
+       /* The register is RO while GITS_CTLR.Enable is 1. */
+       if (its->enabled)
+               return;
+
+       mutex_lock(&its->cmd_lock);
+
+       merged = update_64bit_reg(its->cbaser, addr & 7, len, val);
+       its->cbaser = vgic_sanitise_its_cbaser(merged);
+
+       /*
+        * CWRITER is architecturally UNKNOWN on reset. It is set to the same
+        * value as CREADR here, so that we start with an empty command
+        * buffer.
+        */
+       its->creadr = 0;
+       its->cwriter = 0;
+
+       mutex_unlock(&its->cmd_lock);
+}
+
+#define ITS_CMD_BUFFER_SIZE(baser)     ((((baser) & 0xff) + 1) << 12)
+#define ITS_CMD_SIZE                   32
+#define ITS_CMD_OFFSET(reg)            ((reg) & GENMASK(19, 5))
+
+/*
+ * With a write to CWRITER the guest announces new commands to be processed.
+ * All commands between CREADR and the new CWRITER value are handled right
+ * here. The its_cmd lock, which protects the ring buffer variables, is held
+ * meanwhile, so there is only one user per ITS handling commands at a given
+ * time and races are avoided in the first place.
+ */
+static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its,
+                                       gpa_t addr, unsigned int len,
+                                       unsigned long val)
+{
+       u64 cmd[4];
+       gpa_t queue_base;
+       u32 new_writer;
+
+       if (!its)
+               return;
+
+       mutex_lock(&its->cmd_lock);
+
+       new_writer = update_64bit_reg(its->cwriter, addr & 7, len, val);
+       new_writer = ITS_CMD_OFFSET(new_writer);
+
+       /* An offset outside of the command buffer is dropped silently. */
+       if (new_writer >= ITS_CMD_BUFFER_SIZE(its->cbaser))
+               goto out_unlock;
+
+       its->cwriter = new_writer;
+       queue_base = CBASER_ADDRESS(its->cbaser);
+
+       while (its->creadr != its->cwriter) {
+               /*
+                * kvm_read_guest() may fail, for instance because the guest
+                * programmed a bogus value into CBASER, or because something
+                * else went wrong that we cannot easily recover from.
+                * Section 6.3.2 of the GICv3 spec allows us to just ignore
+                * the command in that case.
+                */
+               if (!kvm_read_guest(kvm, queue_base + its->creadr, cmd,
+                                   ITS_CMD_SIZE))
+                       vgic_its_handle_command(kvm, its, cmd);
+
+               /* Step to the next slot, wrapping at the end of the buffer. */
+               its->creadr += ITS_CMD_SIZE;
+               if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser))
+                       its->creadr = 0;
+       }
+
+out_unlock:
+       mutex_unlock(&its->cmd_lock);
+}
+
+/* MMIO read of GITS_CWRITER: return the accessed bytes of the stored value. */
+static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm,
+                                               struct vgic_its *its,
+                                               gpa_t addr, unsigned int len)
+{
+       unsigned int byte_off = addr & 0x7;
+
+       return extract_bytes(its->cwriter, byte_off, len);
+}
+
+/* MMIO read of GITS_CREADR: return the accessed bytes of the stored value. */
+static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm,
+                                              struct vgic_its *its,
+                                              gpa_t addr, unsigned int len)
+{
+       unsigned int byte_off = addr & 0x7;
+
+       return extract_bytes(its->creadr, byte_off, len);
+}
+
+#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7)
+/*
+ * MMIO read of one of the GITS_BASER<n> registers. Index 0 is the device
+ * table register and index 1 the collection table register; every other
+ * index reads as zero.
+ */
+static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm,
+                                             struct vgic_its *its,
+                                             gpa_t addr, unsigned int len)
+{
+       u64 value;
+
+       if (BASER_INDEX(addr) == 0)
+               value = its->baser_device_table;
+       else if (BASER_INDEX(addr) == 1)
+               value = its->baser_coll_table;
+       else
+               value = 0;
+
+       return extract_bytes(value, addr & 7, len);
+}
+
+#define GITS_BASER_RO_MASK     (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56))
+/*
+ * MMIO write to one of the GITS_BASER<n> registers. Only index 0 (device
+ * table) and index 1 (collection table) are backed by storage. The bits of
+ * GITS_BASER_RO_MASK are replaced with the fixed entry size and table type,
+ * and the result is sanitised before it is stored.
+ */
+static void vgic_mmio_write_its_baser(struct kvm *kvm,
+                                     struct vgic_its *its,
+                                     gpa_t addr, unsigned int len,
+                                     unsigned long val)
+{
+       u64 entry_size = 8, table_type, drop_bits = 0;
+       u64 *regptr;
+       u64 reg;
+
+       /* Write accesses are ignored while GITS_CTLR.Enable is 1. */
+       if (its->enabled)
+               return;
+
+       switch (BASER_INDEX(addr)) {
+       case 0:
+               regptr = &its->baser_device_table;
+               table_type = GITS_BASER_TYPE_DEVICE;
+               break;
+       case 1:
+               regptr = &its->baser_coll_table;
+               table_type = GITS_BASER_TYPE_COLLECTION;
+               /* The indirect bit is cleared for the collection table. */
+               drop_bits = GITS_BASER_INDIRECT;
+               break;
+       default:
+               return;
+       }
+
+       reg = update_64bit_reg(*regptr, addr & 7, len, val);
+       reg &= ~(GITS_BASER_RO_MASK | drop_bits);
+
+       reg |= table_type << GITS_BASER_TYPE_SHIFT;
+       reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT;
+
+       *regptr = vgic_sanitise_its_baser(reg);
+}
+
+/*
+ * Initialiser for one ITS register region: offset, read and write handler,
+ * length of the region and the allowed access flags.
+ */
+#define REGISTER_ITS_DESC(off, rd, wr, length, acc)            \
+{                                                              \
+       .reg_offset = off,                                      \
+       .its_read = rd,                                         \
+       .its_write = wr,                                        \
+       .len = length,                                          \
+       .access_flags = acc,                                    \
+}
+
+/* Write handler for registers whose writes are ignored (WI). */
+static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its,
+                             gpa_t addr, unsigned int len, unsigned long val)
+{
+       /* Nothing to do: the written value is dropped. */
+}
+
+/* The register regions of the emulated ITS frame and their handlers. */
+static struct vgic_register_region its_registers[] = {
+       /* 32-bit control and identification registers */
+       REGISTER_ITS_DESC(GITS_CTLR,
+                         vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr,
+                         4, VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC(GITS_IIDR,
+                         vgic_mmio_read_its_iidr, its_mmio_write_wi,
+                         4, VGIC_ACCESS_32bit),
+       /* 64-bit registers, accessible as a whole or in 32-bit halves */
+       REGISTER_ITS_DESC(GITS_TYPER,
+                         vgic_mmio_read_its_typer, its_mmio_write_wi,
+                         8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC(GITS_CBASER,
+                         vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser,
+                         8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC(GITS_CWRITER,
+                         vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter,
+                         8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_ITS_DESC(GITS_CREADR,
+                         vgic_mmio_read_its_creadr, its_mmio_write_wi,
+                         8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       /* 0x40 bytes of GITS_BASER<n> registers */
+       REGISTER_ITS_DESC(GITS_BASER,
+                         vgic_mmio_read_its_baser, vgic_mmio_write_its_baser,
+                         0x40, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       /* 0x30 bytes of ID registers, writes ignored */
+       REGISTER_ITS_DESC(GITS_IDREGS_BASE,
+                         vgic_mmio_read_its_idregs, its_mmio_write_wi,
+                         0x30, VGIC_ACCESS_32bit),
+};
+
+/*
+ * Called when the LPI enable bit is set in the redistributor.
+ * The pending table is synced unless GICR_PENDBASER.PTZ is set.
+ */
+void vgic_enable_lpis(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ)
+               return;
+
+       its_sync_lpi_pending_table(vcpu);
+}
+
+/*
+ * Register the MMIO region of an ITS on the KVM MMIO bus.
+ * Returns 0 if the ITS is initialized already or the registration worked,
+ * -ENXIO if no base address was set, and otherwise the error returned by
+ * kvm_io_bus_register_dev().
+ */
+static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its)
+{
+       struct vgic_io_device *iodev = &its->iodev;
+       int err;
+
+       if (its->initialized)
+               return 0;
+
+       if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base))
+               return -ENXIO;
+
+       iodev->its = its;
+       iodev->iodev_type = IODEV_ITS;
+       iodev->base_addr = its->vgic_its_base;
+       iodev->regions = its_registers;
+       iodev->nr_regions = ARRAY_SIZE(its_registers);
+       kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops);
+
+       /* The registration on the bus is done under the slots_lock. */
+       mutex_lock(&kvm->slots_lock);
+       err = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr,
+                                     KVM_VGIC_V3_ITS_SIZE, &iodev->dev);
+       mutex_unlock(&kvm->slots_lock);
+       if (err)
+               return err;
+
+       its->initialized = true;
+
+       return 0;
+}
+
+/* Initial GITS_BASER<n> value, the table type field is not included. */
+#define INITIAL_BASER_VALUE                                            \
+       (GITS_BASER_PAGE_SIZE_64K |                                     \
+        ((8ULL - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) |                  \
+        GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) |           \
+        GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) |       \
+        GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb))
+
+/* Initial GICR_PROPBASER value: only the memory attribute fields are set. */
+#define INITIAL_PROPBASER_VALUE                                                \
+       (GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable) |       \
+        GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner) |   \
+        GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb))
+
+/*
+ * Create callback of the ITS KVM device: allocate the ITS state, set up
+ * its locks, lists and initial register values, and attach it to @dev.
+ * Returns -ENODEV for a wrong device type and -ENOMEM if the allocation
+ * fails.
+ */
+static int vgic_its_create(struct kvm_device *dev, u32 type)
+{
+       struct vgic_dist *dist = &dev->kvm->arch.vgic;
+       struct vgic_its *its;
+
+       if (type != KVM_DEV_TYPE_ARM_VGIC_ITS)
+               return -ENODEV;
+
+       its = kzalloc(sizeof(*its), GFP_KERNEL);
+       if (!its)
+               return -ENOMEM;
+
+       mutex_init(&its->its_lock);
+       mutex_init(&its->cmd_lock);
+
+       INIT_LIST_HEAD(&its->device_list);
+       INIT_LIST_HEAD(&its->collection_list);
+
+       /* There is no base address until userspace provides one. */
+       its->vgic_its_base = VGIC_ADDR_UNDEF;
+       its->initialized = false;
+       its->enabled = false;
+       its->dev = dev;
+
+       its->baser_device_table = INITIAL_BASER_VALUE |
+               ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT);
+       its->baser_coll_table = INITIAL_BASER_VALUE |
+               ((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT);
+
+       dist->has_its = true;
+       dist->propbaser = INITIAL_PROPBASER_VALUE;
+
+       dev->private = its;
+
+       return 0;
+}
+
+/*
+ * Destroy callback of the ITS KVM device: release all devices together
+ * with their ITTEs, then all collections, and finally the ITS itself.
+ */
+static void vgic_its_destroy(struct kvm_device *kvm_dev)
+{
+       struct kvm *kvm = kvm_dev->kvm;
+       struct vgic_its *its = kvm_dev->private;
+       struct its_device *dev, *dev_tmp;
+       struct its_itte *itte, *itte_tmp;
+       struct its_collection *coll, *coll_tmp;
+
+       /*
+        * It is possible to get here without the lists ever having been
+        * initialized. Bail out early in that case, so that no NULL pointer
+        * gets dereferenced.
+        */
+       if (!its->device_list.next)
+               return;
+
+       mutex_lock(&its->its_lock);
+
+       list_for_each_entry_safe(dev, dev_tmp, &its->device_list, dev_list) {
+               list_for_each_entry_safe(itte, itte_tmp, &dev->itt_head,
+                                        itte_list)
+                       its_free_itte(kvm, itte);
+
+               list_del(&dev->dev_list);
+               kfree(dev);
+       }
+
+       list_for_each_entry_safe(coll, coll_tmp, &its->collection_list,
+                                coll_list) {
+               list_del(&coll->coll_list);
+               kfree(coll);
+       }
+
+       mutex_unlock(&its->its_lock);
+
+       kfree(its);
+}
+
+/*
+ * Tell userspace whether an attribute is supported: 0 for the ITS address
+ * attribute and for the init control attribute, -ENXIO for everything else.
+ */
+static int vgic_its_has_attr(struct kvm_device *dev,
+                            struct kvm_device_attr *attr)
+{
+       if (attr->group == KVM_DEV_ARM_VGIC_GRP_ADDR &&
+           attr->attr == KVM_VGIC_ITS_ADDR_TYPE)
+               return 0;
+
+       if (attr->group == KVM_DEV_ARM_VGIC_GRP_CTRL &&
+           attr->attr == KVM_DEV_ARM_VGIC_CTRL_INIT)
+               return 0;
+
+       return -ENXIO;
+}
+
+/*
+ * Set an ITS device attribute.
+ * In the address group the ITS base address is taken from userspace, which
+ * is refused with -EBUSY once the ITS is initialized. In the control group
+ * the init attribute runs vgic_its_init_its(). Everything else gives -ENXIO.
+ */
+static int vgic_its_set_attr(struct kvm_device *dev,
+                            struct kvm_device_attr *attr)
+{
+       struct vgic_its *its = dev->private;
+
+       if (attr->group == KVM_DEV_ARM_VGIC_GRP_ADDR) {
+               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+               unsigned long type = (unsigned long)attr->attr;
+               u64 addr;
+               int ret;
+
+               if (type != KVM_VGIC_ITS_ADDR_TYPE)
+                       return -ENODEV;
+
+               if (its->initialized)
+                       return -EBUSY;
+
+               if (copy_from_user(&addr, uaddr, sizeof(addr)))
+                       return -EFAULT;
+
+               /* The address is only stored if the check accepts it. */
+               ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base,
+                                       addr, SZ_64K);
+               if (ret)
+                       return ret;
+
+               its->vgic_its_base = addr;
+
+               return 0;
+       }
+
+       if (attr->group == KVM_DEV_ARM_VGIC_GRP_CTRL &&
+           attr->attr == KVM_DEV_ARM_VGIC_CTRL_INIT)
+               return vgic_its_init_its(dev->kvm, its);
+
+       return -ENXIO;
+}
+
+/*
+ * Read an ITS device attribute back to userspace.
+ * Only the ITS base address (group KVM_DEV_ARM_VGIC_GRP_ADDR, attribute
+ * KVM_VGIC_ITS_ADDR_TYPE) can be read.
+ * Returns 0 on success, -ENODEV for any other attribute of that group,
+ * -EFAULT if the copy to userspace fails and -ENXIO for any other group.
+ */
+static int vgic_its_get_attr(struct kvm_device *dev,
+                            struct kvm_device_attr *attr)
+{
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+               struct vgic_its *its = dev->private;
+               u64 addr = its->vgic_its_base;
+               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+               unsigned long type = (unsigned long)attr->attr;
+
+               if (type != KVM_VGIC_ITS_ADDR_TYPE)
+                       return -ENODEV;
+
+               if (copy_to_user(uaddr, &addr, sizeof(addr)))
+                       return -EFAULT;
+               break;
+       }
+       /* The default label belongs to the switch, not to the case block. */
+       default:
+               return -ENXIO;
+       }
+
+       return 0;
+}
+
+/* The KVM device callbacks of the ITS device. */
+static struct kvm_device_ops kvm_arm_vgic_its_ops = {
+       .name = "kvm-arm-vgic-its",
+       /* life cycle */
+       .create = vgic_its_create,
+       .destroy = vgic_its_destroy,
+       /* attribute access */
+       .has_attr = vgic_its_has_attr,
+       .get_attr = vgic_its_get_attr,
+       .set_attr = vgic_its_set_attr,
+};
+
+/* Register the ITS device ops under the KVM_DEV_TYPE_ARM_VGIC_ITS type. */
+int kvm_vgic_register_its_device(void)
+{
+       int ret;
+
+       ret = kvm_register_device_ops(&kvm_arm_vgic_its_ops,
+                                     KVM_DEV_TYPE_ARM_VGIC_ITS);
+
+       return ret;
+}
index 0130c4b..1813f93 100644 (file)
@@ -21,8 +21,8 @@
 
 /* common helpers */
 
-static int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
-                            phys_addr_t addr, phys_addr_t alignment)
+int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+                     phys_addr_t addr, phys_addr_t alignment)
 {
        if (addr & ~KVM_PHYS_MASK)
                return -E2BIG;
@@ -210,20 +210,27 @@ static void vgic_destroy(struct kvm_device *dev)
        kfree(dev);
 }
 
-void kvm_register_vgic_device(unsigned long type)
+int kvm_register_vgic_device(unsigned long type)
 {
+       int ret = -ENODEV;
+
        switch (type) {
        case KVM_DEV_TYPE_ARM_VGIC_V2:
-               kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
-                                       KVM_DEV_TYPE_ARM_VGIC_V2);
+               ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
+                                             KVM_DEV_TYPE_ARM_VGIC_V2);
                break;
 #ifdef CONFIG_KVM_ARM_VGIC_V3
        case KVM_DEV_TYPE_ARM_VGIC_V3:
-               kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
-                                       KVM_DEV_TYPE_ARM_VGIC_V3);
+               ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
+                                             KVM_DEV_TYPE_ARM_VGIC_V3);
+               if (ret)
+                       break;
+               ret = kvm_vgic_register_its_device();
                break;
 #endif
        }
+
+       return ret;
 }
 
 /** vgic_attr_regs_access: allows user space to read/write VGIC registers
@@ -428,4 +435,3 @@ struct kvm_device_ops kvm_arm_vgic_v3_ops = {
 };
 
 #endif /* CONFIG_KVM_ARM_VGIC_V3 */
-
index a213936..b44b359 100644 (file)
@@ -102,6 +102,7 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
                irq->source |= 1U << source_vcpu->vcpu_id;
 
                vgic_queue_irq_unlock(source_vcpu->kvm, irq);
+               vgic_put_irq(source_vcpu->kvm, irq);
        }
 }
 
@@ -116,6 +117,8 @@ static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu,
                struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
                val |= (u64)irq->targets << (i * 8);
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
 
        return val;
@@ -143,6 +146,7 @@ static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
                irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target);
 
                spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -157,6 +161,8 @@ static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu,
                struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
                val |= (u64)irq->source << (i * 8);
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
        return val;
 }
@@ -178,6 +184,7 @@ static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
                        irq->pending = false;
 
                spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -201,6 +208,7 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
                } else {
                        spin_unlock(&irq->irq_lock);
                }
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -429,6 +437,7 @@ int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
        struct vgic_io_device dev = {
                .regions = vgic_v2_cpu_registers,
                .nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers),
+               .iodev_type = IODEV_CPUIF,
        };
 
        return vgic_uaccess(vcpu, &dev, is_write, offset, val);
@@ -440,6 +449,7 @@ int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
        struct vgic_io_device dev = {
                .regions = vgic_v2_dist_registers,
                .nr_regions = ARRAY_SIZE(vgic_v2_dist_registers),
+               .iodev_type = IODEV_DIST,
        };
 
        return vgic_uaccess(vcpu, &dev, is_write, offset, val);
index a0c515a..ff668e0 100644 (file)
 #include "vgic-mmio.h"
 
 /* extract @num bytes at @offset bytes offset in data */
-static unsigned long extract_bytes(unsigned long data, unsigned int offset,
-                                  unsigned int num)
+unsigned long extract_bytes(unsigned long data, unsigned int offset,
+                           unsigned int num)
 {
        return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
 }
 
+/* merge @len bytes of @val into @reg at bit 0 or 32 (bit 2 of @offset) */
+u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+                    unsigned long val)
+{
+       int shift = (offset & 4) * 8;
+       int top_bit = shift + 8 * len - 1;
+       u64 field = val & GENMASK_ULL(len * 8 - 1, 0);
+
+       reg &= ~GENMASK_ULL(top_bit, shift);
+
+       return reg | (field << shift);
+}
+
+/* true only if the VGIC model is GICv3 and the has_its flag is set */
+bool vgic_has_its(struct kvm *kvm)
+{
+       struct vgic_dist *vgic = &kvm->arch.vgic;
+       bool is_v3 = vgic->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3;
+
+       /* has_its is only looked at for the GICv3 model */
+       return is_v3 && vgic->has_its;
+}
+
 static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
                                            gpa_t addr, unsigned int len)
 {
@@ -43,7 +66,12 @@ static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
        case GICD_TYPER:
                value = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
                value = (value >> 5) - 1;
-               value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
+               if (vgic_has_its(vcpu->kvm)) {
+                       value |= (INTERRUPT_ID_BITS_ITS - 1) << 19;
+                       value |= GICD_TYPER_LPIS;
+               } else {
+                       value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
+               }
                break;
        case GICD_IIDR:
                value = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
@@ -80,15 +108,17 @@ static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu,
 {
        int intid = VGIC_ADDR_TO_INTID(addr, 64);
        struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+       unsigned long ret = 0;
 
        if (!irq)
                return 0;
 
        /* The upper word is RAZ for us. */
-       if (addr & 4)
-               return 0;
+       if (!(addr & 4))
+               ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
 
-       return extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
+       vgic_put_irq(vcpu->kvm, irq);
+       return ret;
 }
 
 static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
@@ -96,15 +126,17 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
                                    unsigned long val)
 {
        int intid = VGIC_ADDR_TO_INTID(addr, 64);
-       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
-
-       if (!irq)
-               return;
+       struct vgic_irq *irq;
 
        /* The upper word is WI for us since we don't implement Aff3. */
        if (addr & 4)
                return;
 
+       irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+
+       if (!irq)
+               return;
+
        spin_lock(&irq->irq_lock);
 
        /* We only care about and preserve Aff0, Aff1 and Aff2. */
@@ -112,6 +144,32 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
        irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr);
 
        spin_unlock(&irq->irq_lock);
+       vgic_put_irq(vcpu->kvm, irq);
+}
+
+/*
+ * MMIO read handler for GICR_CTLR. The only bit ever reported is
+ * GICR_CTLR_ENABLE_LPIS, which mirrors this VCPU's lpis_enabled flag.
+ * @addr and @len are not used.
+ */
+static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu,
+                                            gpa_t addr, unsigned int len)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+       return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0;
+}
+
+
+/*
+ * MMIO write handler for GICR_CTLR. The write is ignored altogether when
+ * the VM has no ITS. Otherwise the GICR_CTLR_ENABLE_LPIS bit of @val is
+ * latched into this VCPU's lpis_enabled flag, and vgic_enable_lpis() is
+ * called when the flag goes from clear to set. @addr and @len are not used.
+ */
+static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       bool was_enabled = vgic_cpu->lpis_enabled;
+
+       if (!vgic_has_its(vcpu->kvm))
+               return;
+
+       vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
+
+       /* Only act on the disabled -> enabled transition. */
+       if (!was_enabled && vgic_cpu->lpis_enabled)
+               vgic_enable_lpis(vcpu);
 }
 
 static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
@@ -125,6 +183,8 @@ static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
        value |= ((target_vcpu_id & 0xffff) << 8);
        if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
                value |= GICR_TYPER_LAST;
+       if (vgic_has_its(vcpu->kvm))
+               value |= GICR_TYPER_PLPIS;
 
        return extract_bytes(value, addr & 7, len);
 }
@@ -147,6 +207,142 @@ static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
        return 0;
 }
 
+/*
+ * We want to avoid outer shareable: that encoding is turned into inner
+ * shareable, every other value of the (already right-shifted) field is
+ * returned unchanged.
+ */
+u64 vgic_sanitise_shareability(u64 field)
+{
+       switch (field) {
+       case GIC_BASER_OuterShareable:
+               return GIC_BASER_InnerShareable;
+       default:
+               return field;
+       }
+}
+
+/*
+ * Avoid any inner non-cacheable mapping: the nCnB and nC encodings are
+ * replaced by RaWb, every other value of the (already right-shifted) field
+ * is returned unchanged.
+ */
+u64 vgic_sanitise_inner_cacheability(u64 field)
+{
+       switch (field) {
+       case GIC_BASER_CACHE_nCnB:
+       case GIC_BASER_CACHE_nC:
+               return GIC_BASER_CACHE_RaWb;
+       default:
+               return field;
+       }
+}
+
+/*
+ * Non-cacheable or same-as-inner are OK and are returned as they are; any
+ * other value of the (already right-shifted) field is forced to nC.
+ */
+u64 vgic_sanitise_outer_cacheability(u64 field)
+{
+       switch (field) {
+       case GIC_BASER_CACHE_SameAsInner:
+       case GIC_BASER_CACHE_nC:
+               return field;
+       default:
+               return GIC_BASER_CACHE_nC;
+       }
+}
+
+/*
+ * Run one bit field of a register value through a sanitising callback.
+ *
+ * @reg:         full register value
+ * @field_mask:  mask selecting the field within @reg
+ * @field_shift: bit position of the field's least significant bit
+ * @sanitise_fn: callback receiving the field value shifted down to bit 0
+ *               and returning the value to use instead
+ *
+ * Returns @reg with the field replaced by the callback's result. The
+ * result is shifted back into place without being masked again, so the
+ * callback must return a value that fits into the field.
+ */
+u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
+                       u64 (*sanitise_fn)(u64))
+{
+       u64 field = (reg & field_mask) >> field_shift;
+
+       field = sanitise_fn(field) << field_shift;
+       return (reg & ~field_mask) | field;
+}
+
+#define PROPBASER_RES0_MASK                                            \
+       (GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5))
+#define PENDBASER_RES0_MASK                                            \
+       (BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) |      \
+        GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0))
+
+/*
+ * Turn a guest-provided GICR_PENDBASER value into the value that gets
+ * stored: shareability and inner/outer cacheability are passed through
+ * their sanitising helpers, then the bits in PENDBASER_RES0_MASK and
+ * bits [51:48] are cleared.
+ */
+static u64 vgic_sanitise_pendbaser(u64 reg)
+{
+       reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK,
+                                 GICR_PENDBASER_SHAREABILITY_SHIFT,
+                                 vgic_sanitise_shareability);
+       reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK,
+                                 GICR_PENDBASER_INNER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_inner_cacheability);
+       reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK,
+                                 GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_outer_cacheability);
+
+       reg &= ~PENDBASER_RES0_MASK;
+       /* NOTE(review): presumably address bits we do not support - confirm. */
+       reg &= ~GENMASK_ULL(51, 48);
+
+       return reg;
+}
+
+/*
+ * Turn a guest-provided GICR_PROPBASER value into the value that gets
+ * stored: shareability and inner/outer cacheability are passed through
+ * their sanitising helpers, then the bits in PROPBASER_RES0_MASK and
+ * bits [51:48] are cleared.
+ */
+static u64 vgic_sanitise_propbaser(u64 reg)
+{
+       reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK,
+                                 GICR_PROPBASER_SHAREABILITY_SHIFT,
+                                 vgic_sanitise_shareability);
+       reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK,
+                                 GICR_PROPBASER_INNER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_inner_cacheability);
+       reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK,
+                                 GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT,
+                                 vgic_sanitise_outer_cacheability);
+
+       reg &= ~PROPBASER_RES0_MASK;
+       /* NOTE(review): presumably address bits we do not support - confirm. */
+       reg &= ~GENMASK_ULL(51, 48);
+       return reg;
+}
+
+/*
+ * MMIO read handler for GICR_PROPBASER. The value is kept once per VM in
+ * struct vgic_dist, so every redistributor returns the same contents.
+ * Returns the @len bytes at byte offset (@addr & 7) of that value.
+ */
+static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu,
+                                            gpa_t addr, unsigned int len)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+       return extract_bytes(dist->propbaser, addr & 7, len);
+}
+
+/*
+ * MMIO write handler for GICR_PROPBASER. The written bytes are merged into
+ * the VM-wide copy held in struct vgic_dist and the result is sanitised
+ * before being stored. The write is dropped when LPIs are already enabled
+ * on the VCPU owning this redistributor; note that only this VCPU's
+ * lpis_enabled flag is checked although the stored value is shared.
+ */
+static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       u64 propbaser = dist->propbaser;
+
+       /* Storing a value with LPIs already enabled is undefined */
+       if (vgic_cpu->lpis_enabled)
+               return;
+
+       /* Read-modify-write on a local copy, so 32-bit halves can be set. */
+       propbaser = update_64bit_reg(propbaser, addr & 4, len, val);
+       propbaser = vgic_sanitise_propbaser(propbaser);
+
+       dist->propbaser = propbaser;
+}
+
+/*
+ * MMIO read handler for GICR_PENDBASER, which is kept per VCPU in
+ * struct vgic_cpu. Returns the @len bytes at byte offset (@addr & 7) of
+ * that value.
+ */
+static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu,
+                                            gpa_t addr, unsigned int len)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+       return extract_bytes(vgic_cpu->pendbaser, addr & 7, len);
+}
+
+/*
+ * MMIO write handler for GICR_PENDBASER. The written bytes are merged into
+ * this VCPU's copy of the register and the result is sanitised before
+ * being stored. The write is dropped when LPIs are already enabled on
+ * this VCPU.
+ */
+static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       u64 pendbaser = vgic_cpu->pendbaser;
+
+       /* Storing a value with LPIs already enabled is undefined */
+       if (vgic_cpu->lpis_enabled)
+               return;
+
+       /* Read-modify-write on a local copy, so 32-bit halves can be set. */
+       pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val);
+       pendbaser = vgic_sanitise_pendbaser(pendbaser);
+
+       vgic_cpu->pendbaser = pendbaser;
+}
+
 /*
  * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the
  * redistributors, while SPIs are covered by registers in the distributor
@@ -218,7 +414,7 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = {
 
 static const struct vgic_register_region vgic_v3_rdbase_registers[] = {
        REGISTER_DESC_WITH_LENGTH(GICR_CTLR,
-               vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+               vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
                vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
@@ -227,10 +423,10 @@ static const struct vgic_register_region vgic_v3_rdbase_registers[] = {
                vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER,
-               vgic_mmio_read_raz, vgic_mmio_write_wi, 8,
+               vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER,
-               vgic_mmio_read_raz, vgic_mmio_write_wi, 8,
+               vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_IDREGS,
                vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
@@ -285,24 +481,18 @@ unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev)
 
 int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
 {
-       int nr_vcpus = atomic_read(&kvm->online_vcpus);
        struct kvm_vcpu *vcpu;
-       struct vgic_io_device *devices;
        int c, ret = 0;
 
-       devices = kmalloc(sizeof(struct vgic_io_device) * nr_vcpus * 2,
-                         GFP_KERNEL);
-       if (!devices)
-               return -ENOMEM;
-
        kvm_for_each_vcpu(c, vcpu, kvm) {
                gpa_t rd_base = redist_base_address + c * SZ_64K * 2;
                gpa_t sgi_base = rd_base + SZ_64K;
-               struct vgic_io_device *rd_dev = &devices[c * 2];
-               struct vgic_io_device *sgi_dev = &devices[c * 2 + 1];
+               struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
+               struct vgic_io_device *sgi_dev = &vcpu->arch.vgic_cpu.sgi_iodev;
 
                kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops);
                rd_dev->base_addr = rd_base;
+               rd_dev->iodev_type = IODEV_REDIST;
                rd_dev->regions = vgic_v3_rdbase_registers;
                rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers);
                rd_dev->redist_vcpu = vcpu;
@@ -317,6 +507,7 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
 
                kvm_iodevice_init(&sgi_dev->dev, &kvm_io_gic_ops);
                sgi_dev->base_addr = sgi_base;
+               sgi_dev->iodev_type = IODEV_REDIST;
                sgi_dev->regions = vgic_v3_sgibase_registers;
                sgi_dev->nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers);
                sgi_dev->redist_vcpu = vcpu;
@@ -335,14 +526,15 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
        if (ret) {
                /* The current c failed, so we start with the previous one. */
                for (c--; c >= 0; c--) {
+                       struct vgic_cpu *vgic_cpu;
+
+                       vcpu = kvm_get_vcpu(kvm, c);
+                       vgic_cpu = &vcpu->arch.vgic_cpu;
                        kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
-                                                 &devices[c * 2].dev);
+                                                 &vgic_cpu->rd_iodev.dev);
                        kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
-                                                 &devices[c * 2 + 1].dev);
+                                                 &vgic_cpu->sgi_iodev.dev);
                }
-               kfree(devices);
-       } else {
-               kvm->arch.vgic.redist_iodevs = devices;
        }
 
        return ret;
@@ -451,5 +643,6 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
                irq->pending = true;
 
                vgic_queue_irq_unlock(vcpu->kvm, irq);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
index 9f6fab7..3bad3c5 100644 (file)
@@ -56,6 +56,8 @@ unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
 
                if (irq->enabled)
                        value |= (1U << i);
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
 
        return value;
@@ -74,6 +76,8 @@ void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
                spin_lock(&irq->irq_lock);
                irq->enabled = true;
                vgic_queue_irq_unlock(vcpu->kvm, irq);
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -92,6 +96,7 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
                irq->enabled = false;
 
                spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -108,6 +113,8 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
 
                if (irq->pending)
                        value |= (1U << i);
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
 
        return value;
@@ -129,6 +136,7 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
                        irq->soft_pending = true;
 
                vgic_queue_irq_unlock(vcpu->kvm, irq);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -152,6 +160,7 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
                }
 
                spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -168,6 +177,8 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
 
                if (irq->active)
                        value |= (1U << i);
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
 
        return value;
@@ -242,6 +253,7 @@ void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
        for_each_set_bit(i, &val, len * 8) {
                struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
                vgic_mmio_change_active(vcpu, irq, false);
+               vgic_put_irq(vcpu->kvm, irq);
        }
        vgic_change_active_finish(vcpu, intid);
 }
@@ -257,6 +269,7 @@ void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
        for_each_set_bit(i, &val, len * 8) {
                struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
                vgic_mmio_change_active(vcpu, irq, true);
+               vgic_put_irq(vcpu->kvm, irq);
        }
        vgic_change_active_finish(vcpu, intid);
 }
@@ -272,6 +285,8 @@ unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
                struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
                val |= (u64)irq->priority << (i * 8);
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
 
        return val;
@@ -298,6 +313,8 @@ void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
                /* Narrow the priority range to what we actually support */
                irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS);
                spin_unlock(&irq->irq_lock);
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -313,6 +330,8 @@ unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
 
                if (irq->config == VGIC_CONFIG_EDGE)
                        value |= (2U << (i * 2));
+
+               vgic_put_irq(vcpu->kvm, irq);
        }
 
        return value;
@@ -326,7 +345,7 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
        int i;
 
        for (i = 0; i < len * 4; i++) {
-               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+               struct vgic_irq *irq;
 
                /*
                 * The configuration cannot be changed for SGIs in general,
@@ -337,14 +356,18 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
                if (intid + i < VGIC_NR_PRIVATE_IRQS)
                        continue;
 
+               irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
                spin_lock(&irq->irq_lock);
+
                if (test_bit(i * 2 + 1, &val)) {
                        irq->config = VGIC_CONFIG_EDGE;
                } else {
                        irq->config = VGIC_CONFIG_LEVEL;
                        irq->pending = irq->line_level | irq->soft_pending;
                }
+
                spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -450,8 +473,7 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
 {
        struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
        const struct vgic_register_region *region;
-       struct kvm_vcpu *r_vcpu;
-       unsigned long data;
+       unsigned long data = 0;
 
        region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
                                       addr - iodev->base_addr);
@@ -460,8 +482,21 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
                return 0;
        }
 
-       r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
-       data = region->read(r_vcpu, addr, len);
+       switch (iodev->iodev_type) {
+       case IODEV_CPUIF:
+               data = region->read(vcpu, addr, len);
+               break;
+       case IODEV_DIST:
+               data = region->read(vcpu, addr, len);
+               break;
+       case IODEV_REDIST:
+               data = region->read(iodev->redist_vcpu, addr, len);
+               break;
+       case IODEV_ITS:
+               data = region->its_read(vcpu->kvm, iodev->its, addr, len);
+               break;
+       }
+
        vgic_data_host_to_mmio_bus(val, len, data);
        return 0;
 }
@@ -471,7 +506,6 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
 {
        struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
        const struct vgic_register_region *region;
-       struct kvm_vcpu *r_vcpu;
        unsigned long data = vgic_data_mmio_bus_to_host(val, len);
 
        region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
@@ -482,8 +516,21 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
        if (!check_region(region, addr, len))
                return 0;
 
-       r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
-       region->write(r_vcpu, addr, len, data);
+       switch (iodev->iodev_type) {
+       case IODEV_CPUIF:
+               region->write(vcpu, addr, len, data);
+               break;
+       case IODEV_DIST:
+               region->write(vcpu, addr, len, data);
+               break;
+       case IODEV_REDIST:
+               region->write(iodev->redist_vcpu, addr, len, data);
+               break;
+       case IODEV_ITS:
+               region->its_write(vcpu->kvm, iodev->its, addr, len, data);
+               break;
+       }
+
        return 0;
 }
 
@@ -513,6 +560,7 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
        }
 
        io_device->base_addr = dist_base_address;
+       io_device->iodev_type = IODEV_DIST;
        io_device->redist_vcpu = NULL;
 
        mutex_lock(&kvm->slots_lock);
index 8509014..0b3ecf9 100644 (file)
@@ -21,10 +21,19 @@ struct vgic_register_region {
        unsigned int len;
        unsigned int bits_per_irq;
        unsigned int access_flags;
-       unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
-                             unsigned int len);
-       void (*write)(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len,
-                     unsigned long val);
+       union {
+               unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
+                                     unsigned int len);
+               unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its,
+                                         gpa_t addr, unsigned int len);
+       };
+       union {
+               void (*write)(struct kvm_vcpu *vcpu, gpa_t addr,
+                             unsigned int len, unsigned long val);
+               void (*its_write)(struct kvm *kvm, struct vgic_its *its,
+                                 gpa_t addr, unsigned int len,
+                                 unsigned long val);
+       };
 };
 
 extern struct kvm_io_device_ops kvm_io_gic_ops;
@@ -87,6 +96,12 @@ unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len);
 void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
                                unsigned long data);
 
+unsigned long extract_bytes(unsigned long data, unsigned int offset,
+                           unsigned int num);
+
+u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+                    unsigned long val);
+
 unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
                                 gpa_t addr, unsigned int len);
 
@@ -147,4 +162,12 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
 
 unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
 
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+u64 vgic_sanitise_outer_cacheability(u64 reg);
+u64 vgic_sanitise_inner_cacheability(u64 reg);
+u64 vgic_sanitise_shareability(u64 reg);
+u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
+                       u64 (*sanitise_fn)(u64));
+#endif
+
 #endif
index e31405e..0bf6709 100644 (file)
@@ -124,6 +124,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
                }
 
                spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -332,20 +333,25 @@ int vgic_v2_probe(const struct gic_kvm_info *info)
        vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
        kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
 
+       ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+       if (ret) {
+               kvm_err("Cannot register GICv2 KVM device\n");
+               iounmap(kvm_vgic_global_state.vctrl_base);
+               return ret;
+       }
+
        ret = create_hyp_io_mappings(kvm_vgic_global_state.vctrl_base,
                                     kvm_vgic_global_state.vctrl_base +
                                         resource_size(&info->vctrl),
                                     info->vctrl.start);
-
        if (ret) {
                kvm_err("Cannot map VCTRL into hyp\n");
+               kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
                iounmap(kvm_vgic_global_state.vctrl_base);
                return ret;
        }
 
        kvm_vgic_global_state.can_emulate_gicv2 = true;
-       kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
-
        kvm_vgic_global_state.vcpu_base = info->vcpu.start;
        kvm_vgic_global_state.type = VGIC_V2;
        kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS;
index 346b4ad..0506543 100644 (file)
@@ -81,6 +81,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
                else
                        intid = val & GICH_LR_VIRTUALID;
                irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+               if (!irq)       /* An LPI could have been unmapped. */
+                       continue;
 
                spin_lock(&irq->irq_lock);
 
@@ -113,6 +115,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
                }
 
                spin_unlock(&irq->irq_lock);
+               vgic_put_irq(vcpu->kvm, irq);
        }
 }
 
@@ -190,6 +193,11 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
        vmcrp->pmr  = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
 }
 
+#define INITIAL_PENDBASER_VALUE                                                  \
+       (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb)            | \
+       GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner)      | \
+       GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable))
+
 void vgic_v3_enable(struct kvm_vcpu *vcpu)
 {
        struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
@@ -207,10 +215,12 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu)
         * way, so we force SRE to 1 to demonstrate this to the guest.
         * This goes with the spec allowing the value to be RAO/WI.
         */
-       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
                vgic_v3->vgic_sre = ICC_SRE_EL1_SRE;
-       else
+               vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE;
+       } else {
                vgic_v3->vgic_sre = 0;
+       }
 
        /* Get the show on the road... */
        vgic_v3->vgic_hcr = ICH_HCR_EN;
@@ -296,6 +306,7 @@ out:
 int vgic_v3_probe(const struct gic_kvm_info *info)
 {
        u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2);
+       int ret;
 
        /*
         * The ListRegs field is 5 bits, but there is a architectural
@@ -319,12 +330,22 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
        } else {
                kvm_vgic_global_state.vcpu_base = info->vcpu.start;
                kvm_vgic_global_state.can_emulate_gicv2 = true;
-               kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+               ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+               if (ret) {
+                       kvm_err("Cannot register GICv2 KVM device.\n");
+                       return ret;
+               }
                kvm_info("vgic-v2@%llx\n", info->vcpu.start);
        }
+       ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
+       if (ret) {
+               kvm_err("Cannot register GICv3 KVM device.\n");
+               kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
+               return ret;
+       }
+
        if (kvm_vgic_global_state.vcpu_base == 0)
                kvm_info("disabling GICv2 emulation\n");
-       kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
 
        kvm_vgic_global_state.vctrl_base = NULL;
        kvm_vgic_global_state.type = VGIC_V3;
index 69b61ab..39f3358 100644 (file)
@@ -33,10 +33,17 @@ struct vgic_global __section(.hyp.text) kvm_vgic_global_state;
 
 /*
  * Locking order is always:
- *   vgic_cpu->ap_list_lock
- *     vgic_irq->irq_lock
+ * its->cmd_lock (mutex)
+ *   its->its_lock (mutex)
+ *     vgic_cpu->ap_list_lock
+ *       kvm->lpi_list_lock
+ *         vgic_irq->irq_lock
  *
- * (that is, always take the ap_list_lock before the struct vgic_irq lock).
+ * If you need to take multiple locks, always take the upper lock first,
+ * then the lower ones, e.g. first take the its_lock, then the irq_lock.
+ * If you are already holding a lock and need to take a higher one, you
+ * have to drop the lower ranking lock first and re-acquire it after having
+ * taken the upper one.
  *
  * When taking more than one ap_list_lock at the same time, always take the
  * lowest numbered VCPU's ap_list_lock first, so:
@@ -45,6 +52,41 @@ struct vgic_global __section(.hyp.text) kvm_vgic_global_state;
  *     spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
  */
 
+/*
+ * Iterate over the VM's list of mapped LPIs to find the one with a
+ * matching interrupt ID and return a reference to the IRQ structure.
+ *
+ * Both the list walk and the refcount increase are done with the
+ * distributor's lpi_list_lock held. Returns NULL when no LPI with this
+ * @intid is on the list.
+ */
+static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct vgic_irq *irq = NULL;
+
+       spin_lock(&dist->lpi_list_lock);
+
+       list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+               if (irq->intid != intid)
+                       continue;
+
+               /*
+                * This increases the refcount, the caller is expected to
+                * call vgic_put_irq() later once it's finished with the IRQ.
+                */
+               vgic_get_irq_kref(irq);
+               goto out_unlock;
+       }
+       /* No match: the loop cursor is not a usable entry, report NULL. */
+       irq = NULL;
+
+out_unlock:
+       spin_unlock(&dist->lpi_list_lock);
+
+       return irq;
+}
+
+/*
+ * This looks up the virtual interrupt ID to get the corresponding
+ * struct vgic_irq. It also increases the refcount, so any caller is expected
+ * to call vgic_put_irq() once it's finished with this IRQ.
+ */
 struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
                              u32 intid)
 {
@@ -56,14 +98,43 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
        if (intid <= VGIC_MAX_SPI)
                return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
 
-       /* LPIs are not yet covered */
+       /* LPIs */
        if (intid >= VGIC_MIN_LPI)
-               return NULL;
+               return vgic_get_lpi(kvm, intid);
 
        WARN(1, "Looking up struct vgic_irq for reserved INTID");
        return NULL;
 }
 
+/*
+ * kref release callback for struct vgic_irq, passed to kref_put() by
+ * vgic_put_irq().
+ *
+ * We can't do anything in here, because we lack the kvm pointer to
+ * lock and remove the item from the lpi_list. So we keep this function
+ * empty and use the return value of kref_put() to trigger the freeing.
+ */
+static void vgic_irq_release(struct kref *ref)
+{
+}
+
+/*
+ * Drop a reference on an IRQ obtained through vgic_get_irq() or
+ * vgic_get_irq_kref().
+ *
+ * Only LPIs (intid >= VGIC_MIN_LPI) are reference counted; for any other
+ * interrupt this is a no-op. When the last reference on an LPI goes away,
+ * the IRQ is removed from the VM's lpi_list and freed.
+ *
+ * The reference is dropped with lpi_list_lock held. vgic_get_lpi() walks
+ * the list and takes new references under that same lock, so dropping the
+ * count outside of it would allow a concurrent lookup to grab a reference
+ * on an IRQ whose count has already reached zero and which is about to be
+ * unlinked and freed.
+ *
+ * As lpi_list_lock ranks above irq_lock in the locking order, this must
+ * not be called with irq->irq_lock held.
+ */
+void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+
+       if (irq->intid < VGIC_MIN_LPI)
+               return;
+
+       spin_lock(&dist->lpi_list_lock);
+       if (!kref_put(&irq->refcount, vgic_irq_release)) {
+               /* Still referenced elsewhere, nothing to tear down. */
+               spin_unlock(&dist->lpi_list_lock);
+               return;
+       }
+
+       /* Last reference gone: nobody can look this IRQ up any more. */
+       list_del(&irq->lpi_list);
+       dist->lpi_list_count--;
+       spin_unlock(&dist->lpi_list_lock);
+
+       kfree(irq);
+}
+
 /**
  * kvm_vgic_target_oracle - compute the target vcpu for an irq
  *
@@ -236,6 +307,11 @@ retry:
                goto retry;
        }
 
+       /*
+        * Grab a reference to the irq to reflect the fact that it is
+        * now in the ap_list.
+        */
+       vgic_get_irq_kref(irq);
        list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
        irq->vcpu = vcpu;
 
@@ -269,14 +345,17 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
        if (!irq)
                return -EINVAL;
 
-       if (irq->hw != mapped_irq)
+       if (irq->hw != mapped_irq) {
+               vgic_put_irq(kvm, irq);
                return -EINVAL;
+       }
 
        spin_lock(&irq->irq_lock);
 
        if (!vgic_validate_injection(irq, level)) {
                /* Nothing to see here, move along... */
                spin_unlock(&irq->irq_lock);
+               vgic_put_irq(kvm, irq);
                return 0;
        }
 
@@ -288,6 +367,7 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
        }
 
        vgic_queue_irq_unlock(kvm, irq);
+       vgic_put_irq(kvm, irq);
 
        return 0;
 }
@@ -330,25 +410,28 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq)
        irq->hwintid = phys_irq;
 
        spin_unlock(&irq->irq_lock);
+       vgic_put_irq(vcpu->kvm, irq);
 
        return 0;
 }
 
 int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
 {
-       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
-
-       BUG_ON(!irq);
+       struct vgic_irq *irq;
 
        if (!vgic_initialized(vcpu->kvm))
                return -EAGAIN;
 
+       irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
+       BUG_ON(!irq);
+
        spin_lock(&irq->irq_lock);
 
        irq->hw = false;
        irq->hwintid = 0;
 
        spin_unlock(&irq->irq_lock);
+       vgic_put_irq(vcpu->kvm, irq);
 
        return 0;
 }
@@ -386,6 +469,15 @@ retry:
                        list_del(&irq->ap_list);
                        irq->vcpu = NULL;
                        spin_unlock(&irq->irq_lock);
+
+                       /*
+                        * This vgic_put_irq call matches the
+                        * vgic_get_irq_kref in vgic_queue_irq_unlock,
+                        * where we added the LPI to the ap_list. As
+                        * we remove the irq from the list, we also
+                        * drop the refcount.
+                        */
+                       vgic_put_irq(vcpu->kvm, irq);
                        continue;
                }
 
@@ -614,6 +706,15 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq)
        spin_lock(&irq->irq_lock);
        map_is_active = irq->hw && irq->active;
        spin_unlock(&irq->irq_lock);
+       vgic_put_irq(vcpu->kvm, irq);
 
        return map_is_active;
 }
+
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+       if (vgic_has_its(kvm))
+               return vgic_its_inject_msi(kvm, msi);
+       else
+               return -ENODEV;
+}
index 7b300ca..1d8e21d 100644 (file)
@@ -25,6 +25,7 @@
 #define IS_VGIC_ADDR_UNDEF(_x)  ((_x) == VGIC_ADDR_UNDEF)
 
 #define INTERRUPT_ID_BITS_SPIS 10
+#define INTERRUPT_ID_BITS_ITS  16
 #define VGIC_PRI_BITS          5
 
 #define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
@@ -38,9 +39,13 @@ struct vgic_vmcr {
 
 struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
                              u32 intid);
+void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
 bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq);
 void vgic_kick_vcpus(struct kvm *kvm);
 
+int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+                     phys_addr_t addr, phys_addr_t alignment);
+
 void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu);
 void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
 void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
@@ -59,6 +64,14 @@ int vgic_v2_map_resources(struct kvm *kvm);
 int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
                             enum vgic_type);
 
+static inline void vgic_get_irq_kref(struct vgic_irq *irq)
+{
+       if (irq->intid < VGIC_MIN_LPI)
+               return;
+
+       kref_get(&irq->refcount);
+}
+
 #ifdef CONFIG_KVM_ARM_VGIC_V3
 void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu);
 void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
@@ -71,6 +84,10 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu);
 int vgic_v3_probe(const struct gic_kvm_info *info);
 int vgic_v3_map_resources(struct kvm *kvm);
 int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address);
+bool vgic_has_its(struct kvm *kvm);
+int kvm_vgic_register_its_device(void);
+void vgic_enable_lpis(struct kvm_vcpu *vcpu);
+int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
 #else
 static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu)
 {
@@ -122,9 +139,28 @@ static inline int vgic_register_redist_iodevs(struct kvm *kvm,
 {
        return -ENODEV;
 }
+
+static inline bool vgic_has_its(struct kvm *kvm)
+{
+       return false;
+}
+
+static inline int kvm_vgic_register_its_device(void)
+{
+       return -ENODEV;
+}
+
+static inline void vgic_enable_lpis(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+       return -ENODEV;
+}
 #endif
 
-void kvm_register_vgic_device(unsigned long type);
+int kvm_register_vgic_device(unsigned long type);
 int vgic_lazy_init(struct kvm *kvm);
 int vgic_init(struct kvm *kvm);
 
index 8db197b..df99e9c 100644 (file)
@@ -135,7 +135,8 @@ void kvm_free_irq_routing(struct kvm *kvm)
        free_irq_routing_table(rt);
 }
 
-static int setup_routing_entry(struct kvm_irq_routing_table *rt,
+static int setup_routing_entry(struct kvm *kvm,
+                              struct kvm_irq_routing_table *rt,
                               struct kvm_kernel_irq_routing_entry *e,
                               const struct kvm_irq_routing_entry *ue)
 {
@@ -154,7 +155,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
 
        e->gsi = ue->gsi;
        e->type = ue->type;
-       r = kvm_set_routing_entry(e, ue);
+       r = kvm_set_routing_entry(kvm, e, ue);
        if (r)
                goto out;
        if (e->type == KVM_IRQ_ROUTING_IRQCHIP)
@@ -211,7 +212,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
                        kfree(e);
                        goto out;
                }
-               r = setup_routing_entry(new, e, ue);
+               r = setup_routing_entry(kvm, new, e, ue);
                if (r) {
                        kfree(e);
                        goto out;
index 2e79136..cc081cc 100644 (file)
@@ -1444,6 +1444,52 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
        return true;
 }
 
+static int hva_to_pfn_remapped(struct vm_area_struct *vma,
+                              unsigned long addr, bool *async,
+                              bool write_fault, kvm_pfn_t *p_pfn)
+{
+       unsigned long pfn;
+       int r;
+
+       r = follow_pfn(vma, addr, &pfn);
+       if (r) {
+               /*
+                * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
+                * not call the fault handler, so do it here.
+                */
+               bool unlocked = false;
+               r = fixup_user_fault(current, current->mm, addr,
+                                    (write_fault ? FAULT_FLAG_WRITE : 0),
+                                    &unlocked);
+               if (unlocked)
+                       return -EAGAIN;
+               if (r)
+                       return r;
+
+               r = follow_pfn(vma, addr, &pfn);
+               if (r)
+                       return r;
+
+       }
+
+
+       /*
+        * Get a reference here because callers of *hva_to_pfn* and
+        * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
+        * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
+        * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
+        * simply do nothing for reserved pfns.
+        *
+        * Whoever called remap_pfn_range is also going to call e.g.
+        * unmap_mapping_range before the underlying pages are freed,
+        * causing a call to our MMU notifier.
+        */ 
+       kvm_get_pfn(pfn);
+
+       *p_pfn = pfn;
+       return 0;
+}
+
 /*
  * Pin guest page in memory and return its pfn.
  * @addr: host virtual address which maps memory to the guest
@@ -1463,7 +1509,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 {
        struct vm_area_struct *vma;
        kvm_pfn_t pfn = 0;
-       int npages;
+       int npages, r;
 
        /* we can do it either atomically or asynchronously, not both */
        BUG_ON(atomic && async);
@@ -1485,14 +1531,17 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
                goto exit;
        }
 
+retry:
        vma = find_vma_intersection(current->mm, addr, addr + 1);
 
        if (vma == NULL)
                pfn = KVM_PFN_ERR_FAULT;
-       else if ((vma->vm_flags & VM_PFNMAP)) {
-               pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
-                       vma->vm_pgoff;
-               BUG_ON(!kvm_is_reserved_pfn(pfn));
+       else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
+               r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn);
+               if (r == -EAGAIN)
+                       goto retry;
+               if (r < 0)
+                       pfn = KVM_PFN_ERR_FAULT;
        } else {
                if (async && vma_is_valid(vma, write_fault))
                        *async = true;
@@ -2348,9 +2397,20 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
        if (id >= KVM_MAX_VCPU_ID)
                return -EINVAL;
 
+       mutex_lock(&kvm->lock);
+       if (kvm->created_vcpus == KVM_MAX_VCPUS) {
+               mutex_unlock(&kvm->lock);
+               return -EINVAL;
+       }
+
+       kvm->created_vcpus++;
+       mutex_unlock(&kvm->lock);
+
        vcpu = kvm_arch_vcpu_create(kvm, id);
-       if (IS_ERR(vcpu))
-               return PTR_ERR(vcpu);
+       if (IS_ERR(vcpu)) {
+               r = PTR_ERR(vcpu);
+               goto vcpu_decrement;
+       }
 
        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
 
@@ -2359,14 +2419,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
                goto vcpu_destroy;
 
        mutex_lock(&kvm->lock);
-       if (!kvm_vcpu_compatible(vcpu)) {
-               r = -EINVAL;
-               goto unlock_vcpu_destroy;
-       }
-       if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
-               r = -EINVAL;
-               goto unlock_vcpu_destroy;
-       }
        if (kvm_get_vcpu_by_id(kvm, id)) {
                r = -EEXIST;
                goto unlock_vcpu_destroy;
@@ -2399,6 +2451,10 @@ unlock_vcpu_destroy:
        mutex_unlock(&kvm->lock);
 vcpu_destroy:
        kvm_arch_vcpu_destroy(vcpu);
+vcpu_decrement:
+       mutex_lock(&kvm->lock);
+       kvm->created_vcpus--;
+       mutex_unlock(&kvm->lock);
        return r;
 }
 
@@ -3487,6 +3543,30 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
        return r;
 }
 
+struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+                                        gpa_t addr)
+{
+       struct kvm_io_bus *bus;
+       int dev_idx, srcu_idx;
+       struct kvm_io_device *iodev = NULL;
+
+       srcu_idx = srcu_read_lock(&kvm->srcu);
+
+       bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+
+       dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
+       if (dev_idx < 0)
+               goto out_unlock;
+
+       iodev = bus->range[dev_idx].dev;
+
+out_unlock:
+       srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+       return iodev;
+}
+EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
+
 static int kvm_debugfs_open(struct inode *inode, struct file *file,
                           int (*get)(void *, u64 *), int (*set)(void *, u64),
                           const char *fmt)