Merge tag 'kvm-arm-for-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm...
author Paolo Bonzini <pbonzini@redhat.com>
Wed, 9 Mar 2016 10:50:42 +0000 (11:50 +0100)
committer Paolo Bonzini <pbonzini@redhat.com>
Wed, 9 Mar 2016 10:50:42 +0000 (11:50 +0100)
KVM/ARM updates for 4.6

- VHE support so that we can run the kernel at EL2 on ARMv8.1 systems
- PMU support for guests
- 32bit world switch rewritten in C
- Various optimizations to the vgic save/restore code

Conflicts:
include/uapi/linux/kvm.h

64 files changed:
Documentation/virtual/kvm/api.txt
Documentation/virtual/kvm/devices/s390_flic.txt
Documentation/virtual/kvm/devices/vm.txt
Documentation/virtual/kvm/mmu.txt
arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/pgtable.h
arch/powerpc/include/asm/smp.h
arch/powerpc/include/asm/xics.h
arch/powerpc/include/uapi/asm/kvm.h
arch/powerpc/kernel/smp.c
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_64_vio.c
arch/powerpc/kvm/book3s_64_vio_hv.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_builtin.c
arch/powerpc/kvm/book3s_hv_rm_xics.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_pr_papr.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/mm/pgtable.c
arch/powerpc/perf/hv-24x7.c
arch/powerpc/sysdev/xics/icp-native.c
arch/s390/include/asm/kvm_host.h
arch/s390/include/uapi/asm/kvm.h
arch/s390/kvm/gaccess.c
arch/s390/kvm/gaccess.h
arch/s390/kvm/intercept.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/priv.c
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/kvm_page_track.h [new file with mode: 0644]
arch/x86/include/uapi/asm/hyperv.h
arch/x86/kvm/Makefile
arch/x86/kvm/assigned-dev.c
arch/x86/kvm/hyperv.c
arch/x86/kvm/i8254.c
arch/x86/kvm/i8254.h
arch/x86/kvm/ioapic.c
arch/x86/kvm/ioapic.h
arch/x86/kvm/irq.c
arch/x86/kvm/irq.h
arch/x86/kvm/irq_comm.c
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu.h
arch/x86/kvm/page_track.c [new file with mode: 0644]
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/pmu.c
arch/x86/kvm/svm.c
arch/x86/kvm/trace.h
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
drivers/hv/hyperv_vmbus.h
include/trace/events/kvm.h
include/uapi/linux/kvm.h
virt/kvm/async_pf.c
virt/kvm/kvm_main.c

index cb2ef0b..4d0542c 100644 (file)
@@ -3039,6 +3039,87 @@ Returns: 0 on success, -1 on error
 
 Queues an SMI on the thread's vcpu.
 
+4.97 KVM_CAP_PPC_MULTITCE
+
+Capability: KVM_CAP_PPC_MULTITCE
+Architectures: ppc
+Type: vm
+
+This capability means the kernel is capable of handling the hypercalls
+H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing them to user
+space. This significantly accelerates DMA operations for PPC KVM guests.
+User space should expect that its handlers for these hypercalls
+are not going to be called if user space previously registered the LIOBN
+in KVM (via KVM_CREATE_SPAPR_TCE or similar calls).
+
+In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
+user space might have to advertise it to the guest. For example, an
+IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is
+present in the "ibm,hypertas-functions" device-tree property.
+
+The hypercalls mentioned above may or may not be processed successfully
+in the kernel-based fast path. If they cannot be handled by the kernel,
+they will get passed on to user space. So user space still has to have
+an implementation for these despite the in-kernel acceleration.
+
+This capability is always enabled.
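+
+A minimal user-space sketch of probing this capability (illustrative only;
+fd setup and error handling are omitted):
+
+	int kvm_fd = open("/dev/kvm", O_RDWR);
+	int multitce = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_MULTITCE);
+
+	/*
+	 * multitce > 0 means the kernel accelerates H_PUT_TCE_INDIRECT and
+	 * H_STUFF_TCE; the user space handlers remain as a fallback only.
+	 */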
+
+4.98 KVM_CREATE_SPAPR_TCE_64
+
+Capability: KVM_CAP_SPAPR_TCE_64
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce_64 (in)
+Returns: file descriptor for manipulating the created TCE table
+
+This is an extension of KVM_CAP_SPAPR_TCE, which only supports 32bit
+windows and is described in 4.62 KVM_CREATE_SPAPR_TCE.
+
+This capability uses an extended struct in the ioctl interface:
+
+/* for KVM_CAP_SPAPR_TCE_64 */
+struct kvm_create_spapr_tce_64 {
+       __u64 liobn;
+       __u32 page_shift;
+       __u32 flags;
+       __u64 offset;   /* in pages */
+       __u64 size;     /* in pages */
+};
+
+The aim of this extension is to support an additional, bigger DMA window
+with a variable page size.
+KVM_CREATE_SPAPR_TCE_64 receives a 64bit window size, an IOMMU page shift
+and a bus offset of the corresponding DMA window; @size and @offset are
+numbers of IOMMU pages.
+
+@flags is not used at the moment.
+
+The rest of the functionality is identical to KVM_CREATE_SPAPR_TCE.
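+
+An illustrative sketch of creating a 64bit window (the LIOBN, page shift
+and window geometry below are made-up example values):
+
+	struct kvm_create_spapr_tce_64 args = {
+		.liobn      = 0x80000001,	/* example LIOBN */
+		.page_shift = 16,		/* 64K IOMMU pages */
+		.flags      = 0,		/* must be zero */
+		.offset     = 0,		/* bus offset, in pages */
+		.size       = 1ULL << 15,	/* 2GB window of 64K pages */
+	};
+	int table_fd = ioctl(vm_fd, KVM_CREATE_SPAPR_TCE_64, &args);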
+
+4.99 KVM_REINJECT_CONTROL
+
+Capability: KVM_CAP_REINJECT_CONTROL
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_reinject_control (in)
+Returns: 0 on success,
+         -EFAULT if struct kvm_reinject_control cannot be read,
+         -ENXIO if KVM_CREATE_PIT or KVM_CREATE_PIT2 didn't succeed earlier.
+
+i8254 (PIT) has two modes, reinject and !reinject.  The default is reinject,
+where KVM queues elapsed i8254 ticks and monitors completion of interrupt from
+vector(s) that i8254 injects.  Reinject mode dequeues a tick and injects its
+interrupt whenever there isn't a pending interrupt from i8254.
+!reinject mode injects an interrupt as soon as a tick arrives.
+
+struct kvm_reinject_control {
+       __u8 pit_reinject;
+       __u8 reserved[31];
+};
+
+pit_reinject = 0 (!reinject mode) is recommended, unless running an old
+operating system that uses the PIT for timing (e.g. Linux 2.4.x).
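+
+For example (a sketch; vm_fd is assumed to be a VM file descriptor on which
+KVM_CREATE_PIT2 succeeded earlier):
+
+	struct kvm_reinject_control control = { .pit_reinject = 0 };
+
+	ioctl(vm_fd, KVM_REINJECT_CONTROL, &control);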
+
 5. The kvm_run structure
 ------------------------
 
@@ -3343,6 +3424,7 @@ EOI was received.
 
                struct kvm_hyperv_exit {
 #define KVM_EXIT_HYPERV_SYNIC          1
+#define KVM_EXIT_HYPERV_HCALL          2
                        __u32 type;
                        union {
                                struct {
@@ -3351,6 +3433,11 @@ EOI was received.
                                        __u64 evt_page;
                                        __u64 msg_page;
                                } synic;
+                               struct {
+                                       __u64 input;
+                                       __u64 result;
+                                       __u64 params[2];
+                               } hcall;
                        } u;
                };
                /* KVM_EXIT_HYPERV */
index d1ad9d5..e3e314c 100644 (file)
@@ -88,6 +88,8 @@ struct kvm_s390_io_adapter_req {
       perform a gmap translation for the guest address provided in addr,
       pin a userspace page for the translated address and add it to the
       list of mappings
+      Note: A new mapping will be created unconditionally; therefore,
+            the calling code should avoid making duplicate mappings.
 
     KVM_S390_IO_ADAPTER_UNMAP
       release a userspace page for the translated address specified in addr
index f083a16..a9ea877 100644 (file)
@@ -84,3 +84,55 @@ Returns:    -EBUSY in case 1 or more vcpus are already activated (only in write
            -EFAULT if the given address is not accessible from kernel space
            -ENOMEM if not enough memory is available to process the ioctl
            0 in case of success
+
+3. GROUP: KVM_S390_VM_TOD
+Architectures: s390
+
+3.1. ATTRIBUTE: KVM_S390_VM_TOD_HIGH
+
+Allows user space to set/get the TOD clock extension (u8).
+
+Parameters: address of a buffer in user space to store the data (u8) to
+Returns:    -EFAULT if the given address is not accessible from kernel space
+           -EINVAL if setting the TOD clock extension to != 0 is not supported
+
+3.2. ATTRIBUTE: KVM_S390_VM_TOD_LOW
+
+Allows user space to set/get bits 0-63 of the TOD clock register as defined in
+the POP (u64).
+
+Parameters: address of a buffer in user space to store the data (u64) to
+Returns:    -EFAULT if the given address is not accessible from kernel space
+
+4. GROUP: KVM_S390_VM_CRYPTO
+Architectures: s390
+
+4.1. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_AES_KW (w/o)
+
+Allows user space to enable aes key wrapping, including generating a new
+wrapping key.
+
+Parameters: none
+Returns:    0
+
+4.2. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_DEA_KW (w/o)
+
+Allows user space to enable dea key wrapping, including generating a new
+wrapping key.
+
+Parameters: none
+Returns:    0
+
+4.3. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_AES_KW (w/o)
+
+Allows user space to disable aes key wrapping, clearing the wrapping key.
+
+Parameters: none
+Returns:    0
+
+4.4. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_DEA_KW (w/o)
+
+Allows user space to disable dea key wrapping, clearing the wrapping key.
+
+Parameters: none
+Returns:    0
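+
+A minimal sketch of setting one of these attributes from user space (vm_fd
+is an assumed VM file descriptor; error handling omitted):
+
+	struct kvm_device_attr attr = {
+		.group = KVM_S390_VM_CRYPTO,
+		.attr  = KVM_S390_VM_CRYPTO_ENABLE_AES_KW,
+	};
+
+	ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);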
index daf9c0f..dda2e93 100644 (file)
@@ -391,11 +391,11 @@ To instantiate a large spte, four constraints must be satisfied:
   write-protected pages
 - the guest page must be wholly contained by a single memory slot
 
-To check the last two conditions, the mmu maintains a ->write_count set of
+To check the last two conditions, the mmu maintains a ->disallow_lpage set of
 arrays for each memory slot and large page size.  Every write protected page
-causes its write_count to be incremented, thus preventing instantiation of
+causes its disallow_lpage to be incremented, thus preventing instantiation of
 a large spte.  The frames at the end of an unaligned memory slot have
-artificially inflated ->write_counts so they can never be instantiated.
+artificially inflated ->disallow_lpages so they can never be instantiated.
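+
+As an illustration, the check boils down to something like the following
+sketch (simplified; the actual mmu code differs in detail):
+
+	if (lpage_info_slot(gfn, slot, level)->disallow_lpage)
+		return false;	/* must map this range with 4k sptes */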
 
 Zapping all pages (page generation count)
 =========================================
index 2aa79c8..7529aab 100644 (file)
@@ -33,8 +33,6 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
 }
 #endif
 
-#define SPAPR_TCE_SHIFT                12
-
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 #define KVM_DEFAULT_HPT_ORDER  24      /* 16MB HPT by default */
 #endif
index 9d08d8c..2e7c791 100644 (file)
@@ -182,7 +182,10 @@ struct kvmppc_spapr_tce_table {
        struct list_head list;
        struct kvm *kvm;
        u64 liobn;
-       u32 window_size;
+       struct rcu_head rcu;
+       u32 page_shift;
+       u64 offset;             /* in pages */
+       u64 size;               /* window size in pages */
        struct page *pages[0];
 };
 
index 2241d53..2544eda 100644 (file)
@@ -165,9 +165,25 @@ extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
-                               struct kvm_create_spapr_tce *args);
+                               struct kvm_create_spapr_tce_64 *args);
+extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
+               struct kvm_vcpu *vcpu, unsigned long liobn);
+extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+               unsigned long ioba, unsigned long npages);
+extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt,
+               unsigned long tce);
+extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+               unsigned long *ua, unsigned long **prmap);
+extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt,
+               unsigned long idx, unsigned long tce);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
                             unsigned long ioba, unsigned long tce);
+extern long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+               unsigned long liobn, unsigned long ioba,
+               unsigned long tce_list, unsigned long npages);
+extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
+               unsigned long liobn, unsigned long ioba,
+               unsigned long tce_value, unsigned long npages);
 extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
                             unsigned long ioba);
 extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
@@ -437,6 +453,8 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
 }
+extern void kvmppc_alloc_host_rm_ops(void);
+extern void kvmppc_free_host_rm_ops(void);
 extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
 extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
@@ -445,7 +463,11 @@ extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
 extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
                        struct kvm_vcpu *vcpu, u32 cpu);
+extern void kvmppc_xics_ipi_action(void);
+extern int h_ipi_redirect;
 #else
+static inline void kvmppc_alloc_host_rm_ops(void) {};
+static inline void kvmppc_free_host_rm_ops(void) {};
 static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
        { return 0; }
 static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
@@ -459,6 +481,33 @@ static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
        { return 0; }
 #endif
 
+/*
+ * Host-side operations we want to set up while running in real
+ * mode in the guest, operating on the XICS.
+ * Currently only VCPU wakeup is supported.
+ */
+
+union kvmppc_rm_state {
+       unsigned long raw;
+       struct {
+               u32 in_host;
+               u32 rm_action;
+       };
+};
+
+struct kvmppc_host_rm_core {
+       union kvmppc_rm_state rm_state;
+       void *rm_data;
+       char pad[112];
+};
+
+struct kvmppc_host_rm_ops {
+       struct kvmppc_host_rm_core      *rm_core;
+       void            (*vcpu_kick)(struct kvm_vcpu *vcpu);
+};
+
+extern struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
+
 static inline unsigned long kvmppc_get_epr(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_KVM_BOOKE_HV
index ac9fb11..47897a3 100644 (file)
@@ -78,6 +78,9 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
        }
        return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift);
 }
+
+unsigned long vmalloc_to_phys(void *vmalloc_addr);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_PGTABLE_H */
index 825663c..78083ed 100644 (file)
@@ -114,6 +114,9 @@ extern int cpu_to_core_id(int cpu);
 #define PPC_MSG_TICK_BROADCAST 2
 #define PPC_MSG_DEBUGGER_BREAK  3
 
+/* This is only used by the powernv kernel */
+#define PPC_MSG_RM_HOST_ACTION 4
+
 /* for irq controllers that have dedicated ipis per message (4) */
 extern int smp_request_message_ipi(int virq, int message);
 extern const char *smp_ipi_name[];
@@ -121,6 +124,7 @@ extern const char *smp_ipi_name[];
 /* for irq controllers with only a single ipi */
 extern void smp_muxed_ipi_set_data(int cpu, unsigned long data);
 extern void smp_muxed_ipi_message_pass(int cpu, int msg);
+extern void smp_muxed_ipi_set_message(int cpu, int msg);
 extern irqreturn_t smp_ipi_demux(void);
 
 void smp_init_pSeries(void);
index 0e25bdb..2546048 100644 (file)
@@ -30,6 +30,7 @@
 #ifdef CONFIG_PPC_ICP_NATIVE
 extern int icp_native_init(void);
 extern void icp_native_flush_interrupt(void);
+extern void icp_native_cause_ipi_rm(int cpu);
 #else
 static inline int icp_native_init(void) { return -ENODEV; }
 #endif
index ab4d473..c93cf35 100644 (file)
@@ -333,6 +333,15 @@ struct kvm_create_spapr_tce {
        __u32 window_size;
 };
 
+/* for KVM_CAP_SPAPR_TCE_64 */
+struct kvm_create_spapr_tce_64 {
+       __u64 liobn;
+       __u32 page_shift;
+       __u32 flags;
+       __u64 offset;   /* in pages */
+       __u64 size;     /* in pages */
+};
+
 /* for KVM_ALLOCATE_RMA */
 struct kvm_allocate_rma {
        __u64 rma_size;
index ec9ec20..cb8be5d 100644 (file)
@@ -206,7 +206,7 @@ int smp_request_message_ipi(int virq, int msg)
 
 #ifdef CONFIG_PPC_SMP_MUXED_IPI
 struct cpu_messages {
-       int messages;                   /* current messages */
+       long messages;                  /* current messages */
        unsigned long data;             /* data for cause ipi */
 };
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message);
@@ -218,7 +218,7 @@ void smp_muxed_ipi_set_data(int cpu, unsigned long data)
        info->data = data;
 }
 
-void smp_muxed_ipi_message_pass(int cpu, int msg)
+void smp_muxed_ipi_set_message(int cpu, int msg)
 {
        struct cpu_messages *info = &per_cpu(ipi_message, cpu);
        char *message = (char *)&info->messages;
@@ -228,6 +228,13 @@ void smp_muxed_ipi_message_pass(int cpu, int msg)
         */
        smp_mb();
        message[msg] = 1;
+}
+
+void smp_muxed_ipi_message_pass(int cpu, int msg)
+{
+       struct cpu_messages *info = &per_cpu(ipi_message, cpu);
+
+       smp_muxed_ipi_set_message(cpu, msg);
        /*
         * cause_ipi functions are required to include a full barrier
         * before doing whatever causes the IPI.
@@ -236,20 +243,31 @@ void smp_muxed_ipi_message_pass(int cpu, int msg)
 }
 
 #ifdef __BIG_ENDIAN__
-#define IPI_MESSAGE(A) (1 << (24 - 8 * (A)))
+#define IPI_MESSAGE(A) (1uL << ((BITS_PER_LONG - 8) - 8 * (A)))
 #else
-#define IPI_MESSAGE(A) (1 << (8 * (A)))
+#define IPI_MESSAGE(A) (1uL << (8 * (A)))
 #endif
 
 irqreturn_t smp_ipi_demux(void)
 {
        struct cpu_messages *info = this_cpu_ptr(&ipi_message);
-       unsigned int all;
+       unsigned long all;
 
        mb();   /* order any irq clear */
 
        do {
                all = xchg(&info->messages, 0);
+#if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
+               /*
+                * Must check for PPC_MSG_RM_HOST_ACTION messages
+                * before PPC_MSG_CALL_FUNCTION messages because when
+                * a VM is destroyed, we call kick_all_cpus_sync()
+                * to ensure that any pending PPC_MSG_RM_HOST_ACTION
+                * messages have completed before we free any VCPUs.
+                */
+               if (all & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION))
+                       kvmppc_xics_ipi_action();
+#endif
                if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION))
                        generic_smp_call_function_interrupt();
                if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
index 0570eef..7f7b6d8 100644 (file)
@@ -8,7 +8,7 @@ ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
 KVM := ../../../virt/kvm
 
 common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
-               $(KVM)/eventfd.o
+               $(KVM)/eventfd.o $(KVM)/vfio.o
 
 CFLAGS_e500_mmu.o := -I.
 CFLAGS_e500_mmu_host.o := -I.
index 638c6d9..b34220d 100644 (file)
@@ -807,7 +807,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 {
 
 #ifdef CONFIG_PPC64
-       INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+       INIT_LIST_HEAD_RCU(&kvm->arch.spapr_tce_tables);
        INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
 #endif
 
index 54cf9bc..2c2d103 100644 (file)
@@ -14,6 +14,7 @@
  *
  * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
  */
 
 #include <linux/types.h>
 #include <asm/ppc-opcode.h>
 #include <asm/kvm_host.h>
 #include <asm/udbg.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
 
-#define TCES_PER_PAGE  (PAGE_SIZE / sizeof(u64))
+static unsigned long kvmppc_tce_pages(unsigned long iommu_pages)
+{
+       return ALIGN(iommu_pages * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+}
 
-static long kvmppc_stt_npages(unsigned long window_size)
+static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
 {
-       return ALIGN((window_size >> SPAPR_TCE_SHIFT)
-                    * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+       unsigned long stt_bytes = sizeof(struct kvmppc_spapr_tce_table) +
+                       (tce_pages * sizeof(struct page *));
+
+       return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
 }
 
-static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
 {
-       struct kvm *kvm = stt->kvm;
-       int i;
+       long ret = 0;
 
-       mutex_lock(&kvm->lock);
-       list_del(&stt->list);
-       for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+       if (!current || !current->mm)
+               return ret; /* process exited */
+
+       down_write(&current->mm->mmap_sem);
+
+       if (inc) {
+               unsigned long locked, lock_limit;
+
+               locked = current->mm->locked_vm + stt_pages;
+               lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+               if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+                       ret = -ENOMEM;
+               else
+                       current->mm->locked_vm += stt_pages;
+       } else {
+               if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
+                       stt_pages = current->mm->locked_vm;
+
+               current->mm->locked_vm -= stt_pages;
+       }
+
+       pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
+                       inc ? '+' : '-',
+                       stt_pages << PAGE_SHIFT,
+                       current->mm->locked_vm << PAGE_SHIFT,
+                       rlimit(RLIMIT_MEMLOCK),
+                       ret ? " - exceeded" : "");
+
+       up_write(&current->mm->mmap_sem);
+
+       return ret;
+}
+
+static void release_spapr_tce_table(struct rcu_head *head)
+{
+       struct kvmppc_spapr_tce_table *stt = container_of(head,
+                       struct kvmppc_spapr_tce_table, rcu);
+       unsigned long i, npages = kvmppc_tce_pages(stt->size);
+
+       for (i = 0; i < npages; i++)
                __free_page(stt->pages[i]);
-       kfree(stt);
-       mutex_unlock(&kvm->lock);
 
-       kvm_put_kvm(kvm);
+       kfree(stt);
 }
 
 static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -65,7 +107,7 @@ static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
        struct page *page;
 
-       if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
+       if (vmf->pgoff >= kvmppc_tce_pages(stt->size))
                return VM_FAULT_SIGBUS;
 
        page = stt->pages[vmf->pgoff];
@@ -88,7 +130,14 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
 {
        struct kvmppc_spapr_tce_table *stt = filp->private_data;
 
-       release_spapr_tce_table(stt);
+       list_del_rcu(&stt->list);
+
+       kvm_put_kvm(stt->kvm);
+
+       kvmppc_account_memlimit(
+               kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
+       call_rcu(&stt->rcu, release_spapr_tce_table);
+
        return 0;
 }
 
@@ -98,20 +147,29 @@ static const struct file_operations kvm_spapr_tce_fops = {
 };
 
 long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
-                                  struct kvm_create_spapr_tce *args)
+                                  struct kvm_create_spapr_tce_64 *args)
 {
        struct kvmppc_spapr_tce_table *stt = NULL;
-       long npages;
+       unsigned long npages, size;
        int ret = -ENOMEM;
        int i;
 
+       if (!args->size)
+               return -EINVAL;
+
        /* Check this LIOBN hasn't been previously allocated */
        list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
                if (stt->liobn == args->liobn)
                        return -EBUSY;
        }
 
-       npages = kvmppc_stt_npages(args->window_size);
+       size = args->size;
+       npages = kvmppc_tce_pages(size);
+       ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
+       if (ret) {
+               stt = NULL;
+               goto fail;
+       }
 
        stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
                      GFP_KERNEL);
@@ -119,7 +177,9 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
                goto fail;
 
        stt->liobn = args->liobn;
-       stt->window_size = args->window_size;
+       stt->page_shift = args->page_shift;
+       stt->offset = args->offset;
+       stt->size = size;
        stt->kvm = kvm;
 
        for (i = 0; i < npages; i++) {
@@ -131,7 +191,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
        kvm_get_kvm(kvm);
 
        mutex_lock(&kvm->lock);
-       list_add(&stt->list, &kvm->arch.spapr_tce_tables);
+       list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
 
        mutex_unlock(&kvm->lock);
 
@@ -148,3 +208,59 @@ fail:
        }
        return ret;
 }
+
+long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+               unsigned long liobn, unsigned long ioba,
+               unsigned long tce_list, unsigned long npages)
+{
+       struct kvmppc_spapr_tce_table *stt;
+       long i, ret = H_SUCCESS, idx;
+       unsigned long entry, ua = 0;
+       u64 __user *tces, tce;
+
+       stt = kvmppc_find_table(vcpu, liobn);
+       if (!stt)
+               return H_TOO_HARD;
+
+       entry = ioba >> stt->page_shift;
+       /*
+        * The SPAPR spec says that the maximum size of the list is 512 TCEs,
+        * so the whole table fits in a 4K page.
+        */
+       if (npages > 512)
+               return H_PARAMETER;
+
+       if (tce_list & (SZ_4K - 1))
+               return H_PARAMETER;
+
+       ret = kvmppc_ioba_validate(stt, ioba, npages);
+       if (ret != H_SUCCESS)
+               return ret;
+
+       idx = srcu_read_lock(&vcpu->kvm->srcu);
+       if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
+               ret = H_TOO_HARD;
+               goto unlock_exit;
+       }
+       tces = (u64 __user *) ua;
+
+       for (i = 0; i < npages; ++i) {
+               if (get_user(tce, tces + i)) {
+                       ret = H_TOO_HARD;
+                       goto unlock_exit;
+               }
+               tce = be64_to_cpu(tce);
+
+               ret = kvmppc_tce_validate(stt, tce);
+               if (ret != H_SUCCESS)
+                       goto unlock_exit;
+
+               kvmppc_tce_put(stt, entry + i, tce);
+       }
+
+unlock_exit:
+       srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_put_tce_indirect);
index 89e96b3..44be73e 100644 (file)
@@ -14,6 +14,7 @@
  *
  * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
  */
 
 #include <linux/types.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
 #include <asm/mmu-hash64.h>
+#include <asm/mmu_context.h>
 #include <asm/hvcall.h>
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
 #include <asm/kvm_host.h>
 #include <asm/udbg.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
 
 #define TCES_PER_PAGE  (PAGE_SIZE / sizeof(u64))
 
-/* WARNING: This will be called in real-mode on HV KVM and virtual
+/*
+ * Finds a TCE table descriptor by LIOBN.
+ *
+ * WARNING: This will be called in real or virtual mode on HV KVM and virtual
  *          mode on PR KVM
  */
-long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
-                     unsigned long ioba, unsigned long tce)
+struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu,
+               unsigned long liobn)
 {
        struct kvm *kvm = vcpu->kvm;
        struct kvmppc_spapr_tce_table *stt;
 
+       list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list)
+               if (stt->liobn == liobn)
+                       return stt;
+
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(kvmppc_find_table);
+
+/*
+ * Validates IO address.
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *          mode on PR KVM
+ */
+long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+               unsigned long ioba, unsigned long npages)
+{
+       unsigned long mask = (1ULL << stt->page_shift) - 1;
+       unsigned long idx = ioba >> stt->page_shift;
+
+       if ((ioba & mask) || (idx < stt->offset) ||
+                       (idx - stt->offset + npages > stt->size) ||
+                       (idx + npages < idx))
+               return H_PARAMETER;
+
+       return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_ioba_validate);
+
+/*
+ * Validates a TCE address.
+ * At the moment the flags and the page mask are validated.
+ * As the host kernel does not access those addresses (it just puts them
+ * into the table and user space is supposed to process them), we can skip
+ * checking other things (such as whether the TCE is a guest RAM address or
+ * whether the page was actually allocated).
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *          mode on PR KVM
+ */
+long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
+{
+       unsigned long page_mask = ~((1ULL << stt->page_shift) - 1);
+       unsigned long mask = ~(page_mask | TCE_PCI_WRITE | TCE_PCI_READ);
+
+       if (tce & mask)
+               return H_PARAMETER;
+
+       return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_tce_validate);
+
+/* Note on the use of page_address() in real mode:
+ *
+ * It is safe to use page_address() in real mode on ppc64 because
+ * page_address() is always defined as lowmem_page_address(),
+ * which returns __va(PFN_PHYS(page_to_pfn(page))); this is an arithmetic
+ * operation and does not access the page struct.
+ *
+ * Theoretically page_address() could be defined differently,
+ * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL
+ * would have to be enabled.
+ * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64,
+ * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only, and only
+ * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP
+ * is not expected to be enabled on ppc32, page_address()
+ * is safe for ppc32 as well.
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *          mode on PR KVM
+ */
+static u64 *kvmppc_page_address(struct page *page)
+{
+#if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL)
+#error TODO: fix to avoid page_address() here
+#endif
+       return (u64 *) page_address(page);
+}
+
+/*
+ * Handles TCE requests for emulated devices.
+ * Puts guest TCE values into the table and expects user space to convert them.
+ * Called in both real and virtual modes.
+ * Cannot fail, so kvmppc_tce_validate must be called before it.
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *          mode on PR KVM
+ */
+void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
+               unsigned long idx, unsigned long tce)
+{
+       struct page *page;
+       u64 *tbl;
+
+       idx -= stt->offset;
+       page = stt->pages[idx / TCES_PER_PAGE];
+       tbl = kvmppc_page_address(page);
+
+       tbl[idx % TCES_PER_PAGE] = tce;
+}
+EXPORT_SYMBOL_GPL(kvmppc_tce_put);
+
+long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+               unsigned long *ua, unsigned long **prmap)
+{
+       unsigned long gfn = gpa >> PAGE_SHIFT;
+       struct kvm_memory_slot *memslot;
+
+       memslot = search_memslots(kvm_memslots(kvm), gfn);
+       if (!memslot)
+               return -EINVAL;
+
+       *ua = __gfn_to_hva_memslot(memslot, gfn) |
+               (gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+       if (prmap)
+               *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+#endif
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+                     unsigned long ioba, unsigned long tce)
+{
+       struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
+       long ret;
+
        /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
        /*          liobn, ioba, tce); */
 
-       list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-               if (stt->liobn == liobn) {
-                       unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
-                       struct page *page;
-                       u64 *tbl;
-
-                       /* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p  window_size=0x%x\n", */
-                       /*          liobn, stt, stt->window_size); */
-                       if (ioba >= stt->window_size)
-                               return H_PARAMETER;
-
-                       page = stt->pages[idx / TCES_PER_PAGE];
-                       tbl = (u64 *)page_address(page);
-
-                       /* FIXME: Need to validate the TCE itself */
-                       /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
-                       tbl[idx % TCES_PER_PAGE] = tce;
-                       return H_SUCCESS;
-               }
-       }
+       if (!stt)
+               return H_TOO_HARD;
+
+       ret = kvmppc_ioba_validate(stt, ioba, 1);
+       if (ret != H_SUCCESS)
+               return ret;
 
-       /* Didn't find the liobn, punt it to userspace */
-       return H_TOO_HARD;
+       ret = kvmppc_tce_validate(stt, tce);
+       if (ret != H_SUCCESS)
+               return ret;
+
+       kvmppc_tce_put(stt, ioba >> stt->page_shift, tce);
+
+       return H_SUCCESS;
 }
 EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
 
-long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
-                     unsigned long ioba)
+static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu,
+               unsigned long ua, unsigned long *phpa)
+{
+       pte_t *ptep, pte;
+       unsigned shift = 0;
+
+       ptep = __find_linux_pte_or_hugepte(vcpu->arch.pgdir, ua, NULL, &shift);
+       if (!ptep || !pte_present(*ptep))
+               return -ENXIO;
+       pte = *ptep;
+
+       if (!shift)
+               shift = PAGE_SHIFT;
+
+       /* Avoid handling anything potentially complicated in realmode */
+       if (shift > PAGE_SHIFT)
+               return -EAGAIN;
+
+       if (!pte_young(pte))
+               return -EAGAIN;
+
+       *phpa = (pte_pfn(pte) << PAGE_SHIFT) | (ua & ((1ULL << shift) - 1)) |
+                       (ua & ~PAGE_MASK);
+
+       return 0;
+}
+
+long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+               unsigned long liobn, unsigned long ioba,
+               unsigned long tce_list, unsigned long npages)
 {
-       struct kvm *kvm = vcpu->kvm;
        struct kvmppc_spapr_tce_table *stt;
+       long i, ret = H_SUCCESS;
+       unsigned long tces, entry, ua = 0;
+       unsigned long *rmap = NULL;
+
+       stt = kvmppc_find_table(vcpu, liobn);
+       if (!stt)
+               return H_TOO_HARD;
+
+       entry = ioba >> stt->page_shift;
+       /*
+        * The spec says that the maximum size of the list is 512 TCEs,
+        * so the whole table addressed resides in a single 4K page.
+        */
+       if (npages > 512)
+               return H_PARAMETER;
+
+       if (tce_list & (SZ_4K - 1))
+               return H_PARAMETER;
+
+       ret = kvmppc_ioba_validate(stt, ioba, npages);
+       if (ret != H_SUCCESS)
+               return ret;
 
-       list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-               if (stt->liobn == liobn) {
-                       unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
-                       struct page *page;
-                       u64 *tbl;
+       if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
+               return H_TOO_HARD;
 
-                       if (ioba >= stt->window_size)
-                               return H_PARAMETER;
+       rmap = (void *) vmalloc_to_phys(rmap);
 
-                       page = stt->pages[idx / TCES_PER_PAGE];
-                       tbl = (u64 *)page_address(page);
+       /*
+        * Synchronize with the MMU notifier callbacks in
+        * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
+        * While we have the rmap lock, code running on other CPUs
+        * cannot finish unmapping the host real page that backs
+        * this guest real page, so we are OK to access the host
+        * real page.
+        */
+       lock_rmap(rmap);
+       if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
+               ret = H_TOO_HARD;
+               goto unlock_exit;
+       }
+
+       for (i = 0; i < npages; ++i) {
+               unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
+
+               ret = kvmppc_tce_validate(stt, tce);
+               if (ret != H_SUCCESS)
+                       goto unlock_exit;
 
-                       vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE];
-                       return H_SUCCESS;
-               }
+               kvmppc_tce_put(stt, entry + i, tce);
        }
 
-       /* Didn't find the liobn, punt it to userspace */
-       return H_TOO_HARD;
+unlock_exit:
+       unlock_rmap(rmap);
+
+       return ret;
+}
+
+long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
+               unsigned long liobn, unsigned long ioba,
+               unsigned long tce_value, unsigned long npages)
+{
+       struct kvmppc_spapr_tce_table *stt;
+       long i, ret;
+
+       stt = kvmppc_find_table(vcpu, liobn);
+       if (!stt)
+               return H_TOO_HARD;
+
+       ret = kvmppc_ioba_validate(stt, ioba, npages);
+       if (ret != H_SUCCESS)
+               return ret;
+
+       /* Check permission bits only, to allow user space to poison TCEs for debug */
+       if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
+               return H_PARAMETER;
+
+       for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
+               kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
+
+       return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);
+
+long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+                     unsigned long ioba)
+{
+       struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
+       long ret;
+       unsigned long idx;
+       struct page *page;
+       u64 *tbl;
+
+       if (!stt)
+               return H_TOO_HARD;
+
+       ret = kvmppc_ioba_validate(stt, ioba, 1);
+       if (ret != H_SUCCESS)
+               return ret;
+
+       idx = (ioba >> stt->page_shift) - stt->offset;
+       page = stt->pages[idx / TCES_PER_PAGE];
+       tbl = (u64 *)page_address(page);
+
+       vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE];
+
+       return H_SUCCESS;
 }
 EXPORT_SYMBOL_GPL(kvmppc_h_get_tce);
+
+#endif /* KVM_BOOK3S_HV_POSSIBLE */
index baeddb0..f47fffe 100644 (file)
@@ -81,6 +81,17 @@ static int target_smt_mode;
 module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
 
+#ifdef CONFIG_KVM_XICS
+static struct kernel_param_ops module_param_ops = {
+       .set = param_set_int,
+       .get = param_get_int,
+};
+
+module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
+                                                       S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
+#endif
+
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
@@ -768,7 +779,31 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
                if (kvmppc_xics_enabled(vcpu)) {
                        ret = kvmppc_xics_hcall(vcpu, req);
                        break;
-               } /* fallthrough */
+               }
+               return RESUME_HOST;
+       case H_PUT_TCE:
+               ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5),
+                                               kvmppc_get_gpr(vcpu, 6));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_PUT_TCE_INDIRECT:
+               ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5),
+                                               kvmppc_get_gpr(vcpu, 6),
+                                               kvmppc_get_gpr(vcpu, 7));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
+       case H_STUFF_TCE:
+               ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5),
+                                               kvmppc_get_gpr(vcpu, 6),
+                                               kvmppc_get_gpr(vcpu, 7));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
        default:
                return RESUME_HOST;
        }
@@ -2278,6 +2313,46 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
        spin_unlock(&vc->lock);
 }
 
+/*
+ * Clear this core from the list of active host cores as we are about to
+ * enter the guest. Only do this if it is the primary thread of the
+ * core (not a subcore thread) that is entering the guest.
+ */
+static inline void kvmppc_clear_host_core(int cpu)
+{
+       int core;
+
+       if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+               return;
+       /*
+        * The memory barrier can be omitted here as we will do a smp_wmb()
+        * later in kvmppc_start_thread, and we need to ensure that state is
+        * visible to other CPUs only after we enter the guest.
+        */
+       core = cpu >> threads_shift;
+       kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0;
+}
+
+/*
+ * Advertise this core as an active host core since we exited the guest.
+ * Only needed if it is the primary thread of the core that is
+ * exiting.
+ */
+static inline void kvmppc_set_host_core(int cpu)
+{
+       int core;
+
+       if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+               return;
+
+       /*
+        * Memory barrier can be omitted here because we do a spin_unlock
+        * immediately after this which provides the memory barrier.
+        */
+       core = cpu >> threads_shift;
+       kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1;
+}
+
 /*
  * Run a set of guest threads on a physical core.
  * Called with vc->lock held.
@@ -2390,6 +2465,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                }
        }
 
+       kvmppc_clear_host_core(pcpu);
+
        /* Start all the threads */
        active = 0;
        for (sub = 0; sub < core_info.n_subcores; ++sub) {
@@ -2486,6 +2563,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                        kvmppc_ipi_thread(pcpu + i);
        }
 
+       kvmppc_set_host_core(pcpu);
+
        spin_unlock(&vc->lock);
 
        /* make sure updates to secondary vcpu structs are visible now */
@@ -2984,6 +3063,114 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
        goto out_srcu;
 }
 
+#ifdef CONFIG_KVM_XICS
+static int kvmppc_cpu_notify(struct notifier_block *self, unsigned long action,
+                       void *hcpu)
+{
+       unsigned long cpu = (long)hcpu;
+
+       switch (action) {
+       case CPU_UP_PREPARE:
+       case CPU_UP_PREPARE_FROZEN:
+               kvmppc_set_host_core(cpu);
+               break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+       case CPU_UP_CANCELED:
+       case CPU_UP_CANCELED_FROZEN:
+               kvmppc_clear_host_core(cpu);
+               break;
+#endif
+       default:
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block kvmppc_cpu_notifier = {
+           .notifier_call = kvmppc_cpu_notify,
+};
+
+/*
+ * Allocate a per-core structure for managing state about which cores are
+ * running in the host versus the guest and for exchanging data between
+ * real mode KVM and a CPU running in the host.
+ * This is only done for the first VM.
+ * The allocated structure stays around even if all VMs have stopped.
+ * It is only freed when the kvm-hv module is unloaded.
+ * It is OK for this routine to fail; we just won't support host
+ * core operations like redirecting H_IPI wakeups.
+ */
+void kvmppc_alloc_host_rm_ops(void)
+{
+       struct kvmppc_host_rm_ops *ops;
+       unsigned long l_ops;
+       int cpu, core;
+       int size;
+
+       /* Not the first time here? */
+       if (kvmppc_host_rm_ops_hv != NULL)
+               return;
+
+       ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
+       if (!ops)
+               return;
+
+       size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
+       ops->rm_core = kzalloc(size, GFP_KERNEL);
+
+       if (!ops->rm_core) {
+               kfree(ops);
+               return;
+       }
+
+       get_online_cpus();
+
+       for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
+               if (!cpu_online(cpu))
+                       continue;
+
+               core = cpu >> threads_shift;
+               ops->rm_core[core].rm_state.in_host = 1;
+       }
+
+       ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
+
+       /*
+        * Make the contents of the kvmppc_host_rm_ops structure visible
+        * to other CPUs before we assign it to the global variable.
+        * Do an atomic assignment (no locks used here), but if someone
+        * beats us to it, just free our copy and return.
+        */
+       smp_wmb();
+       l_ops = (unsigned long) ops;
+
+       if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
+               put_online_cpus();
+               kfree(ops->rm_core);
+               kfree(ops);
+               return;
+       }
+
+       register_cpu_notifier(&kvmppc_cpu_notifier);
+
+       put_online_cpus();
+}
+
+void kvmppc_free_host_rm_ops(void)
+{
+       if (kvmppc_host_rm_ops_hv) {
+               unregister_cpu_notifier(&kvmppc_cpu_notifier);
+               kfree(kvmppc_host_rm_ops_hv->rm_core);
+               kfree(kvmppc_host_rm_ops_hv);
+               kvmppc_host_rm_ops_hv = NULL;
+       }
+}
+#endif
+
 static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 {
        unsigned long lpcr, lpid;
@@ -2996,6 +3183,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
                return -ENOMEM;
        kvm->arch.lpid = lpid;
 
+       kvmppc_alloc_host_rm_ops();
+
        /*
         * Since we don't flush the TLB when tearing down a VM,
         * and this lpid might have previously been used,
@@ -3229,6 +3418,7 @@ static int kvmppc_book3s_init_hv(void)
 
 static void kvmppc_book3s_exit_hv(void)
 {
+       kvmppc_free_host_rm_ops();
        kvmppc_hv_ops = NULL;
 }
 
index fd7006b..5f0380d 100644 (file)
@@ -283,3 +283,6 @@ void kvmhv_commence_exit(int trap)
                        kvmhv_interrupt_vcore(vc, ee);
        }
 }
+
+struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
+EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv);
index 24f5807..980d8a6 100644 (file)
 #include <asm/xics.h>
 #include <asm/debug.h>
 #include <asm/synch.h>
+#include <asm/cputhreads.h>
 #include <asm/ppc-opcode.h>
 
 #include "book3s_xics.h"
 
 #define DEBUG_PASSUP
 
+int h_ipi_redirect = 1;
+EXPORT_SYMBOL(h_ipi_redirect);
+
 static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
                            u32 new_irq);
 
@@ -50,11 +54,84 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics,
 
 /* -- ICP routines -- */
 
+#ifdef CONFIG_SMP
+static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu)
+{
+       int hcpu;
+
+       hcpu = hcore << threads_shift;
+       kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu;
+       smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION);
+       icp_native_cause_ipi_rm(hcpu);
+}
+#else
+static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { }
+#endif
+
+/*
+ * We start the search from our current CPU Id in the core map
+ * and go around in a circle until we get back to our Id, looking for a
+ * core that is running in host context and that hasn't already
+ * been targeted for another rm_host_ops action.
+ *
+ * In the future, we could consider using a fairer algorithm (one
+ * that distributes the IPIs better).
+ *
+ * Returns -1 if no suitable CPU could be found in the host.
+ * Otherwise, returns a CPU Id which has been reserved for use.
+ */
+static inline int grab_next_hostcore(int start,
+               struct kvmppc_host_rm_core *rm_core, int max, int action)
+{
+       bool success;
+       int core;
+       union kvmppc_rm_state old, new;
+
+       for (core = start + 1; core < max; core++)  {
+               old = new = READ_ONCE(rm_core[core].rm_state);
+
+               if (!old.in_host || old.rm_action)
+                       continue;
+
+               /* Try to grab this host core if not taken already. */
+               new.rm_action = action;
+
+               success = cmpxchg64(&rm_core[core].rm_state.raw,
+                                               old.raw, new.raw) == old.raw;
+               if (success) {
+                       /*
+                        * Make sure that the store to the rm_action is made
+                        * visible before we return to caller (and the
+                        * subsequent store to rm_data) to synchronize with
+                        * the IPI handler.
+                        */
+                       smp_wmb();
+                       return core;
+               }
+       }
+
+       return -1;
+}
+
+static inline int find_available_hostcore(int action)
+{
+       int core;
+       int my_core = smp_processor_id() >> threads_shift;
+       struct kvmppc_host_rm_core *rm_core = kvmppc_host_rm_ops_hv->rm_core;
+
+       core = grab_next_hostcore(my_core, rm_core, cpu_nr_cores(), action);
+       if (core == -1)
+               core = grab_next_hostcore(core, rm_core, my_core, action);
+
+       return core;
+}
+
 static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
                                struct kvm_vcpu *this_vcpu)
 {
        struct kvmppc_icp *this_icp = this_vcpu->arch.icp;
        int cpu;
+       int hcore;
 
        /* Mark the target VCPU as having an interrupt pending */
        vcpu->stat.queue_intr++;
@@ -66,11 +143,22 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
                return;
        }
 
-       /* Check if the core is loaded, if not, too hard */
+       /*
+        * Check if the core is loaded.
+        * If not, find an available host core to post to wake the VCPU;
+        * if we can't find one, set up state to eventually return too hard.
+        */
        cpu = vcpu->arch.thread_cpu;
        if (cpu < 0 || cpu >= nr_cpu_ids) {
-               this_icp->rm_action |= XICS_RM_KICK_VCPU;
-               this_icp->rm_kick_target = vcpu;
+               hcore = -1;
+               if (kvmppc_host_rm_ops_hv && h_ipi_redirect)
+                       hcore = find_available_hostcore(XICS_RM_KICK_VCPU);
+               if (hcore != -1) {
+                       icp_send_hcore_msg(hcore, vcpu);
+               } else {
+                       this_icp->rm_action |= XICS_RM_KICK_VCPU;
+                       this_icp->rm_kick_target = vcpu;
+               }
                return;
        }
 
@@ -623,3 +711,40 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
  bail:
        return check_too_hard(xics, icp);
 }
+
+/*  --- Non-real mode XICS-related built-in routines ---  */
+
+/*
+ * Host operations poked by real mode KVM
+ */
+static void rm_host_ipi_action(int action, void *data)
+{
+       switch (action) {
+       case XICS_RM_KICK_VCPU:
+               kvmppc_host_rm_ops_hv->vcpu_kick(data);
+               break;
+       default:
+               WARN(1, "Unexpected rm_action=%d data=%p\n", action, data);
+               break;
+       }
+
+}
+
+void kvmppc_xics_ipi_action(void)
+{
+       int core;
+       unsigned int cpu = smp_processor_id();
+       struct kvmppc_host_rm_core *rm_corep;
+
+       core = cpu >> threads_shift;
+       rm_corep = &kvmppc_host_rm_ops_hv->rm_core[core];
+
+       if (rm_corep->rm_data) {
+               rm_host_ipi_action(rm_corep->rm_state.rm_action,
+                                                       rm_corep->rm_data);
+               /* Order these stores against the real mode KVM */
+               rm_corep->rm_data = NULL;
+               smp_wmb();
+               rm_corep->rm_state.rm_action = 0;
+       }
+}
index 6ee26de..ed16182 100644 (file)
@@ -2006,8 +2006,8 @@ hcall_real_table:
        .long   0               /* 0x12c */
        .long   0               /* 0x130 */
        .long   DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
-       .long   0               /* 0x138 */
-       .long   0               /* 0x13c */
+       .long   DOTSYM(kvmppc_h_stuff_tce) - hcall_real_table
+       .long   DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table
        .long   0               /* 0x140 */
        .long   0               /* 0x144 */
        .long   0               /* 0x148 */
index f2c75a1..02176fd 100644 (file)
@@ -280,6 +280,37 @@ static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu)
        return EMULATE_DONE;
 }
 
+static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu)
+{
+       unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
+       unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
+       unsigned long tce = kvmppc_get_gpr(vcpu, 6);
+       unsigned long npages = kvmppc_get_gpr(vcpu, 7);
+       long rc;
+
+       rc = kvmppc_h_put_tce_indirect(vcpu, liobn, ioba,
+                       tce, npages);
+       if (rc == H_TOO_HARD)
+               return EMULATE_FAIL;
+       kvmppc_set_gpr(vcpu, 3, rc);
+       return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu)
+{
+       unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
+       unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
+       unsigned long tce_value = kvmppc_get_gpr(vcpu, 6);
+       unsigned long npages = kvmppc_get_gpr(vcpu, 7);
+       long rc;
+
+       rc = kvmppc_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages);
+       if (rc == H_TOO_HARD)
+               return EMULATE_FAIL;
+       kvmppc_set_gpr(vcpu, 3, rc);
+       return EMULATE_DONE;
+}
+
 static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
 {
        long rc = kvmppc_xics_hcall(vcpu, cmd);
@@ -306,6 +337,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
                return kvmppc_h_pr_bulk_remove(vcpu);
        case H_PUT_TCE:
                return kvmppc_h_pr_put_tce(vcpu);
+       case H_PUT_TCE_INDIRECT:
+               return kvmppc_h_pr_put_tce_indirect(vcpu);
+       case H_STUFF_TCE:
+               return kvmppc_h_pr_stuff_tce(vcpu);
        case H_CEDE:
                kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE);
                kvm_vcpu_block(vcpu);
index a3b182d..19aa59b 100644 (file)
@@ -33,6 +33,7 @@
 #include <asm/tlbflush.h>
 #include <asm/cputhreads.h>
 #include <asm/irqflags.h>
+#include <asm/iommu.h>
 #include "timing.h"
 #include "irq.h"
 #include "../mm/mmu_decl.h"
@@ -437,6 +438,16 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        unsigned int i;
        struct kvm_vcpu *vcpu;
 
+#ifdef CONFIG_KVM_XICS
+       /*
+        * We call kick_all_cpus_sync() to ensure that all
+        * CPUs have executed any pending IPIs before we
+        * continue and free VCPUs structures below.
+        */
+       if (is_kvmppc_hv_enabled(kvm))
+               kick_all_cpus_sync();
+#endif
+
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_arch_vcpu_free(vcpu);
 
@@ -509,6 +520,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 
 #ifdef CONFIG_PPC_BOOK3S_64
        case KVM_CAP_SPAPR_TCE:
+       case KVM_CAP_SPAPR_TCE_64:
        case KVM_CAP_PPC_ALLOC_HTAB:
        case KVM_CAP_PPC_RTAS:
        case KVM_CAP_PPC_FIXUP_HCALL:
@@ -569,6 +581,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_PPC_GET_SMMU_INFO:
                r = 1;
                break;
+       case KVM_CAP_SPAPR_MULTITCE:
+               r = 1;
+               break;
 #endif
        default:
                r = 0;
@@ -1331,13 +1346,34 @@ long kvm_arch_vm_ioctl(struct file *filp,
                break;
        }
 #ifdef CONFIG_PPC_BOOK3S_64
+       case KVM_CREATE_SPAPR_TCE_64: {
+               struct kvm_create_spapr_tce_64 create_tce_64;
+
+               r = -EFAULT;
+               if (copy_from_user(&create_tce_64, argp, sizeof(create_tce_64)))
+                       goto out;
+               if (create_tce_64.flags) {
+                       r = -EINVAL;
+                       goto out;
+               }
+               r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64);
+               goto out;
+       }
        case KVM_CREATE_SPAPR_TCE: {
                struct kvm_create_spapr_tce create_tce;
+               struct kvm_create_spapr_tce_64 create_tce_64;
 
                r = -EFAULT;
                if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
                        goto out;
-               r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
+
+               create_tce_64.liobn = create_tce.liobn;
+               create_tce_64.page_shift = IOMMU_PAGE_SHIFT_4K;
+               create_tce_64.offset = 0;
+               create_tce_64.size = create_tce.window_size >>
+                               IOMMU_PAGE_SHIFT_4K;
+               create_tce_64.flags = 0;
+               r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64);
                goto out;
        }
        case KVM_PPC_GET_SMMU_INFO: {
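
A minimal userspace sketch of creating a window with the new 64-bit ioctl,
mirroring the field conversion above; vmfd, liobn and window_size are
assumed to come from the surrounding VM setup:

	struct kvm_create_spapr_tce_64 create = {
		.liobn      = liobn,
		.page_shift = 12,			/* 4K IOMMU pages */
		.offset     = 0,			/* bus offset, in pages */
		.size       = window_size >> 12,	/* number of TCEs */
		.flags      = 0,			/* must be zero */
	};
	int tablefd = ioctl(vmfd, KVM_CREATE_SPAPR_TCE_64, &create);
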
index 83dfd79..de37ff4 100644 (file)
@@ -243,3 +243,11 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
 }
 #endif /* CONFIG_DEBUG_VM */
 
+unsigned long vmalloc_to_phys(void *va)
+{
+       unsigned long pfn = vmalloc_to_pfn(va);
+
+       BUG_ON(!pfn);
+       return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va);
+}
+EXPORT_SYMBOL_GPL(vmalloc_to_phys);
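
A hedged usage sketch of the now-exported helper, assuming buf was obtained
from vmalloc():

	void *buf = vmalloc(PAGE_SIZE);
	unsigned long pa = vmalloc_to_phys(buf);  /* physical address of buf */
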
index 9f9dfda..3b09ecf 100644 (file)
@@ -493,14 +493,6 @@ static size_t event_to_attr_ct(struct hv_24x7_event_data *event)
        }
 }
 
-static unsigned long vmalloc_to_phys(void *v)
-{
-       struct page *p = vmalloc_to_page(v);
-
-       BUG_ON(!p);
-       return page_to_phys(p) + offset_in_page(v);
-}
-
 /* */
 struct event_uniq {
        struct rb_node node;
index eae3265..afdf62f 100644 (file)
@@ -159,6 +159,27 @@ static void icp_native_cause_ipi(int cpu, unsigned long data)
        icp_native_set_qirr(cpu, IPI_PRIORITY);
 }
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+void icp_native_cause_ipi_rm(int cpu)
+{
+       /*
+        * Currently not used to send IPIs to another CPU
+        * on the same core; the only caller is KVM real mode.
+        * Requires the physical address of the XICS to have been
+        * saved previously in kvm_hstate in the paca.
+        */
+       unsigned long xics_phys;
+
+       /*
+        * Just like the cause_ipi functions, it is required to
+        * include a full barrier (out8 includes a sync) before
+        * causing the IPI.
+        */
+       xics_phys = paca[cpu].kvm_hstate.xics_phys;
+       out_rm8((u8 *)(xics_phys + XICS_MFRR), IPI_PRIORITY);
+}
+#endif
+
 /*
  * Called when an interrupt is received on an off-line CPU to
  * clear the interrupt, so that the CPU can go back to nap mode.
index 8959ebb..727e7f7 100644 (file)
@@ -229,17 +229,11 @@ struct kvm_s390_itdb {
        __u8    data[256];
 } __packed;
 
-struct kvm_s390_vregs {
-       __vector128 vrs[32];
-       __u8    reserved200[512];       /* for future vector expansion */
-} __packed;
-
 struct sie_page {
        struct kvm_s390_sie_block sie_block;
        __u8 reserved200[1024];         /* 0x0200 */
        struct kvm_s390_itdb itdb;      /* 0x0600 */
-       __u8 reserved700[1280];         /* 0x0700 */
-       struct kvm_s390_vregs vregs;    /* 0x0c00 */
+       __u8 reserved700[2304];         /* 0x0700 */
 } __packed;
 
 struct kvm_vcpu_stat {
index fe84bd5..347fe5a 100644 (file)
@@ -154,6 +154,7 @@ struct kvm_guest_debug_arch {
 #define KVM_SYNC_PFAULT (1UL << 5)
 #define KVM_SYNC_VRS    (1UL << 6)
 #define KVM_SYNC_RICCB  (1UL << 7)
+#define KVM_SYNC_FPRS   (1UL << 8)
 /* definition of registers in kvm_run */
 struct kvm_sync_regs {
        __u64 prefix;   /* prefix register */
@@ -168,9 +169,12 @@ struct kvm_sync_regs {
        __u64 pft;      /* pfault token [PFAULT] */
        __u64 pfs;      /* pfault select [PFAULT] */
        __u64 pfc;      /* pfault compare [PFAULT] */
-       __u64 vrs[32][2];       /* vector registers */
+       union {
+               __u64 vrs[32][2];       /* vector registers (KVM_SYNC_VRS) */
+               __u64 fprs[16];         /* fp registers (KVM_SYNC_FPRS) */
+       };
        __u8  reserved[512];    /* for future vector expansion */
-       __u32 fpc;      /* only valid with vector registers */
+       __u32 fpc;              /* valid on KVM_SYNC_VRS or KVM_SYNC_FPRS */
        __u8 padding[52];       /* riccb needs to be 64byte aligned */
        __u8 riccb[64];         /* runtime instrumentation controls block */
 };
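
A hedged sketch of how userspace might read a floating point register from
the new union, assuming run points at the mmapped kvm_run; on this
big-endian machine, fp register i overlays the first doubleword of vector
register i:

	__u64 fpr;

	if (run->kvm_valid_regs & KVM_SYNC_VRS)
		fpr = run->s.regs.vrs[i][0];	/* fpr i = high half of vr i */
	else if (run->kvm_valid_regs & KVM_SYNC_FPRS)
		fpr = run->s.regs.fprs[i];
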
index d30db40..66938d2 100644 (file)
@@ -373,7 +373,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu)
 }
 
 static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar,
-                         int write)
+                         enum gacc_mode mode)
 {
        union alet alet;
        struct ale ale;
@@ -454,7 +454,7 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar,
                }
        }
 
-       if (ale.fo == 1 && write)
+       if (ale.fo == 1 && mode == GACC_STORE)
                return PGM_PROTECTION;
 
        asce->val = aste.asce;
@@ -477,25 +477,28 @@ enum {
 };
 
 static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
-                        ar_t ar, int write)
+                        ar_t ar, enum gacc_mode mode)
 {
        int rc;
-       psw_t *psw = &vcpu->arch.sie_block->gpsw;
+       struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
        struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
        struct trans_exc_code_bits *tec_bits;
 
        memset(pgm, 0, sizeof(*pgm));
        tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
-       tec_bits->fsi = write ? FSI_STORE : FSI_FETCH;
-       tec_bits->as = psw_bits(*psw).as;
+       tec_bits->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
+       tec_bits->as = psw.as;
 
-       if (!psw_bits(*psw).t) {
+       if (!psw.t) {
                asce->val = 0;
                asce->r = 1;
                return 0;
        }
 
-       switch (psw_bits(vcpu->arch.sie_block->gpsw).as) {
+       if (mode == GACC_IFETCH)
+               psw.as = psw.as == PSW_AS_HOME ? PSW_AS_HOME : PSW_AS_PRIMARY;
+
+       switch (psw.as) {
        case PSW_AS_PRIMARY:
                asce->val = vcpu->arch.sie_block->gcr[1];
                return 0;
@@ -506,7 +509,7 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
                asce->val = vcpu->arch.sie_block->gcr[13];
                return 0;
        case PSW_AS_ACCREG:
-               rc = ar_translation(vcpu, asce, ar, write);
+               rc = ar_translation(vcpu, asce, ar, mode);
                switch (rc) {
                case PGM_ALEN_TRANSLATION:
                case PGM_ALE_SEQUENCE:
@@ -538,7 +541,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
  * @gva: guest virtual address
  * @gpa: points to where guest physical (absolute) address should be stored
  * @asce: effective asce
- * @write: indicates if access is a write access
+ * @mode: indicates the access mode to be used
  *
  * Translate a guest virtual address into a guest absolute address by means
  * of dynamic address translation as specified by the architecture.
@@ -554,7 +557,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
  */
 static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
                                     unsigned long *gpa, const union asce asce,
-                                    int write)
+                                    enum gacc_mode mode)
 {
        union vaddress vaddr = {.addr = gva};
        union raddress raddr = {.addr = gva};
@@ -699,7 +702,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
 real_address:
        raddr.addr = kvm_s390_real_to_abs(vcpu, raddr.addr);
 absolute_address:
-       if (write && dat_protection)
+       if (mode == GACC_STORE && dat_protection)
                return PGM_PROTECTION;
        if (kvm_is_error_gpa(vcpu->kvm, raddr.addr))
                return PGM_ADDRESSING;
@@ -728,7 +731,7 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
 
 static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
                            unsigned long *pages, unsigned long nr_pages,
-                           const union asce asce, int write)
+                           const union asce asce, enum gacc_mode mode)
 {
        struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
        psw_t *psw = &vcpu->arch.sie_block->gpsw;
@@ -740,13 +743,13 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
        while (nr_pages) {
                ga = kvm_s390_logical_to_effective(vcpu, ga);
                tec_bits->addr = ga >> PAGE_SHIFT;
-               if (write && lap_enabled && is_low_address(ga)) {
+               if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) {
                        pgm->code = PGM_PROTECTION;
                        return pgm->code;
                }
                ga &= PAGE_MASK;
                if (psw_bits(*psw).t) {
-                       rc = guest_translate(vcpu, ga, pages, asce, write);
+                       rc = guest_translate(vcpu, ga, pages, asce, mode);
                        if (rc < 0)
                                return rc;
                        if (rc == PGM_PROTECTION)
@@ -768,7 +771,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
 }
 
 int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
-                unsigned long len, int write)
+                unsigned long len, enum gacc_mode mode)
 {
        psw_t *psw = &vcpu->arch.sie_block->gpsw;
        unsigned long _len, nr_pages, gpa, idx;
@@ -780,7 +783,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
 
        if (!len)
                return 0;
-       rc = get_vcpu_asce(vcpu, &asce, ar, write);
+       rc = get_vcpu_asce(vcpu, &asce, ar, mode);
        if (rc)
                return rc;
        nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1;
@@ -792,11 +795,11 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
        need_ipte_lock = psw_bits(*psw).t && !asce.r;
        if (need_ipte_lock)
                ipte_lock(vcpu);
-       rc = guest_page_range(vcpu, ga, pages, nr_pages, asce, write);
+       rc = guest_page_range(vcpu, ga, pages, nr_pages, asce, mode);
        for (idx = 0; idx < nr_pages && !rc; idx++) {
                gpa = *(pages + idx) + (ga & ~PAGE_MASK);
                _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
-               if (write)
+               if (mode == GACC_STORE)
                        rc = kvm_write_guest(vcpu->kvm, gpa, data, _len);
                else
                        rc = kvm_read_guest(vcpu->kvm, gpa, data, _len);
@@ -812,7 +815,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
 }
 
 int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
-                     void *data, unsigned long len, int write)
+                     void *data, unsigned long len, enum gacc_mode mode)
 {
        unsigned long _len, gpa;
        int rc = 0;
@@ -820,7 +823,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
        while (len && !rc) {
                gpa = kvm_s390_real_to_abs(vcpu, gra);
                _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
-               if (write)
+               if (mode == GACC_STORE)
                        rc = write_guest_abs(vcpu, gpa, data, _len);
                else
                        rc = read_guest_abs(vcpu, gpa, data, _len);
@@ -841,7 +844,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
  * has to take care of this.
  */
 int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
-                           unsigned long *gpa, int write)
+                           unsigned long *gpa, enum gacc_mode mode)
 {
        struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
        psw_t *psw = &vcpu->arch.sie_block->gpsw;
@@ -851,19 +854,19 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
 
        gva = kvm_s390_logical_to_effective(vcpu, gva);
        tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
-       rc = get_vcpu_asce(vcpu, &asce, ar, write);
+       rc = get_vcpu_asce(vcpu, &asce, ar, mode);
        tec->addr = gva >> PAGE_SHIFT;
        if (rc)
                return rc;
        if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) {
-               if (write) {
+               if (mode == GACC_STORE) {
                        rc = pgm->code = PGM_PROTECTION;
                        return rc;
                }
        }
 
        if (psw_bits(*psw).t && !asce.r) {      /* Use DAT? */
-               rc = guest_translate(vcpu, gva, gpa, asce, write);
+               rc = guest_translate(vcpu, gva, gpa, asce, mode);
                if (rc > 0) {
                        if (rc == PGM_PROTECTION)
                                tec->b61 = 1;
@@ -883,7 +886,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
  * check_gva_range - test a range of guest virtual addresses for accessibility
  */
 int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
-                   unsigned long length, int is_write)
+                   unsigned long length, enum gacc_mode mode)
 {
        unsigned long gpa;
        unsigned long currlen;
@@ -892,7 +895,7 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
        ipte_lock(vcpu);
        while (length > 0 && !rc) {
                currlen = min(length, PAGE_SIZE - (gva % PAGE_SIZE));
-               rc = guest_translate_address(vcpu, gva, ar, &gpa, is_write);
+               rc = guest_translate_address(vcpu, gva, ar, &gpa, mode);
                gva += currlen;
                length -= currlen;
        }
index ef03726..df0a79d 100644 (file)
@@ -155,16 +155,22 @@ int read_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
        return kvm_read_guest(vcpu->kvm, gpa, data, len);
 }
 
+enum gacc_mode {
+       GACC_FETCH,
+       GACC_STORE,
+       GACC_IFETCH,
+};
+
 int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
-                           ar_t ar, unsigned long *gpa, int write);
+                           ar_t ar, unsigned long *gpa, enum gacc_mode mode);
 int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
-                   unsigned long length, int is_write);
+                   unsigned long length, enum gacc_mode mode);
 
 int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
-                unsigned long len, int write);
+                unsigned long len, enum gacc_mode mode);
 
 int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
-                     void *data, unsigned long len, int write);
+                     void *data, unsigned long len, enum gacc_mode mode);
 
 /**
  * write_guest - copy data from kernel space to guest space
@@ -215,7 +221,7 @@ static inline __must_check
 int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
                unsigned long len)
 {
-       return access_guest(vcpu, ga, ar, data, len, 1);
+       return access_guest(vcpu, ga, ar, data, len, GACC_STORE);
 }
 
 /**
@@ -235,7 +241,27 @@ static inline __must_check
 int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
               unsigned long len)
 {
-       return access_guest(vcpu, ga, ar, data, len, 0);
+       return access_guest(vcpu, ga, ar, data, len, GACC_FETCH);
+}
+
+/**
+ * read_guest_instr - copy instruction data from guest space to kernel space
+ * @vcpu: virtual cpu
+ * @data: destination address in kernel space
+ * @len: number of bytes to copy
+ *
+ * Copy @len bytes from the current psw address (guest space) to @data (kernel
+ * space).
+ *
+ * The behaviour of read_guest_instr is identical to read_guest, except that
+ * instruction data is always fetched from primary space, unless the PSW is
+ * in home-space mode (secondary-space and access-register mode only affect
+ * data accesses).
+ */
+static inline __must_check
+int read_guest_instr(struct kvm_vcpu *vcpu, void *data, unsigned long len)
+{
+       return access_guest(vcpu, vcpu->arch.sie_block->gpsw.addr, 0, data, len,
+                           GACC_IFETCH);
 }
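
A hedged sketch of the new mode in use, fetching the current instruction's
first byte the way the exit handlers elsewhere in this series do; error
handling goes through kvm_s390_inject_prog_cond() as usual:

	u8 opcode;
	int rc = read_guest_instr(vcpu, &opcode, 1);	/* GACC_IFETCH */

	if (rc)
		return kvm_s390_inject_prog_cond(vcpu, rc);
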
 
 /**
index d53c107..2e6b54e 100644 (file)
@@ -38,17 +38,32 @@ static const intercept_handler_t instruction_handlers[256] = {
        [0xeb] = kvm_s390_handle_eb,
 };
 
-void kvm_s390_rewind_psw(struct kvm_vcpu *vcpu, int ilc)
+u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_sie_block *sie_block = vcpu->arch.sie_block;
+       u8 ilen = 0;
 
-       /* Use the length of the EXECUTE instruction if necessary */
-       if (sie_block->icptstatus & 1) {
-               ilc = (sie_block->icptstatus >> 4) & 0x6;
-               if (!ilc)
-                       ilc = 4;
+       switch (vcpu->arch.sie_block->icptcode) {
+       case ICPT_INST:
+       case ICPT_INSTPROGI:
+       case ICPT_OPEREXC:
+       case ICPT_PARTEXEC:
+       case ICPT_IOINST:
+               /* instruction only stored for these icptcodes */
+               ilen = insn_length(vcpu->arch.sie_block->ipa >> 8);
+               /* Use the length of the EXECUTE instruction if necessary */
+               if (sie_block->icptstatus & 1) {
+                       ilen = (sie_block->icptstatus >> 4) & 0x6;
+                       if (!ilen)
+                               ilen = 4;
+               }
+               break;
+       case ICPT_PROGI:
+               /* bits 1+2 of pgmilc hold the ilc, so we directly get the ilen */
+               ilen = vcpu->arch.sie_block->pgmilc & 0x6;
+               break;
        }
-       sie_block->gpsw.addr = __rewind_psw(sie_block->gpsw, ilc);
+       return ilen;
 }
 
 static int handle_noop(struct kvm_vcpu *vcpu)
@@ -121,11 +136,13 @@ static int handle_instruction(struct kvm_vcpu *vcpu)
        return -EOPNOTSUPP;
 }
 
-static void __extract_prog_irq(struct kvm_vcpu *vcpu,
-                              struct kvm_s390_pgm_info *pgm_info)
+static int inject_prog_on_prog_intercept(struct kvm_vcpu *vcpu)
 {
-       memset(pgm_info, 0, sizeof(struct kvm_s390_pgm_info));
-       pgm_info->code = vcpu->arch.sie_block->iprcc;
+       struct kvm_s390_pgm_info pgm_info = {
+               .code = vcpu->arch.sie_block->iprcc,
+               /* the PSW has already been rewound */
+               .flags = KVM_S390_PGM_FLAGS_NO_REWIND,
+       };
 
        switch (vcpu->arch.sie_block->iprcc & ~PGM_PER) {
        case PGM_AFX_TRANSLATION:
@@ -138,7 +155,7 @@ static void __extract_prog_irq(struct kvm_vcpu *vcpu,
        case PGM_PRIMARY_AUTHORITY:
        case PGM_SECONDARY_AUTHORITY:
        case PGM_SPACE_SWITCH:
-               pgm_info->trans_exc_code = vcpu->arch.sie_block->tecmc;
+               pgm_info.trans_exc_code = vcpu->arch.sie_block->tecmc;
                break;
        case PGM_ALEN_TRANSLATION:
        case PGM_ALE_SEQUENCE:
@@ -146,7 +163,7 @@ static void __extract_prog_irq(struct kvm_vcpu *vcpu,
        case PGM_ASTE_SEQUENCE:
        case PGM_ASTE_VALIDITY:
        case PGM_EXTENDED_AUTHORITY:
-               pgm_info->exc_access_id = vcpu->arch.sie_block->eai;
+               pgm_info.exc_access_id = vcpu->arch.sie_block->eai;
                break;
        case PGM_ASCE_TYPE:
        case PGM_PAGE_TRANSLATION:
@@ -154,32 +171,33 @@ static void __extract_prog_irq(struct kvm_vcpu *vcpu,
        case PGM_REGION_SECOND_TRANS:
        case PGM_REGION_THIRD_TRANS:
        case PGM_SEGMENT_TRANSLATION:
-               pgm_info->trans_exc_code = vcpu->arch.sie_block->tecmc;
-               pgm_info->exc_access_id  = vcpu->arch.sie_block->eai;
-               pgm_info->op_access_id  = vcpu->arch.sie_block->oai;
+               pgm_info.trans_exc_code = vcpu->arch.sie_block->tecmc;
+               pgm_info.exc_access_id  = vcpu->arch.sie_block->eai;
+               pgm_info.op_access_id  = vcpu->arch.sie_block->oai;
                break;
        case PGM_MONITOR:
-               pgm_info->mon_class_nr = vcpu->arch.sie_block->mcn;
-               pgm_info->mon_code = vcpu->arch.sie_block->tecmc;
+               pgm_info.mon_class_nr = vcpu->arch.sie_block->mcn;
+               pgm_info.mon_code = vcpu->arch.sie_block->tecmc;
                break;
        case PGM_VECTOR_PROCESSING:
        case PGM_DATA:
-               pgm_info->data_exc_code = vcpu->arch.sie_block->dxc;
+               pgm_info.data_exc_code = vcpu->arch.sie_block->dxc;
                break;
        case PGM_PROTECTION:
-               pgm_info->trans_exc_code = vcpu->arch.sie_block->tecmc;
-               pgm_info->exc_access_id  = vcpu->arch.sie_block->eai;
+               pgm_info.trans_exc_code = vcpu->arch.sie_block->tecmc;
+               pgm_info.exc_access_id  = vcpu->arch.sie_block->eai;
                break;
        default:
                break;
        }
 
        if (vcpu->arch.sie_block->iprcc & PGM_PER) {
-               pgm_info->per_code = vcpu->arch.sie_block->perc;
-               pgm_info->per_atmid = vcpu->arch.sie_block->peratmid;
-               pgm_info->per_address = vcpu->arch.sie_block->peraddr;
-               pgm_info->per_access_id = vcpu->arch.sie_block->peraid;
+               pgm_info.per_code = vcpu->arch.sie_block->perc;
+               pgm_info.per_atmid = vcpu->arch.sie_block->peratmid;
+               pgm_info.per_address = vcpu->arch.sie_block->peraddr;
+               pgm_info.per_access_id = vcpu->arch.sie_block->peraid;
        }
+       return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
 }
 
 /*
@@ -208,7 +226,6 @@ static int handle_itdb(struct kvm_vcpu *vcpu)
 
 static int handle_prog(struct kvm_vcpu *vcpu)
 {
-       struct kvm_s390_pgm_info pgm_info;
        psw_t psw;
        int rc;
 
@@ -234,8 +251,7 @@ static int handle_prog(struct kvm_vcpu *vcpu)
        if (rc)
                return rc;
 
-       __extract_prog_irq(vcpu, &pgm_info);
-       return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
+       return inject_prog_on_prog_intercept(vcpu);
 }
 
 /**
@@ -302,7 +318,7 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
 
        /* Make sure that the source is paged-in */
        rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg2],
-                                    reg2, &srcaddr, 0);
+                                    reg2, &srcaddr, GACC_FETCH);
        if (rc)
                return kvm_s390_inject_prog_cond(vcpu, rc);
        rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0);
@@ -311,14 +327,14 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
 
        /* Make sure that the destination is paged-in */
        rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg1],
-                                    reg1, &dstaddr, 1);
+                                    reg1, &dstaddr, GACC_STORE);
        if (rc)
                return kvm_s390_inject_prog_cond(vcpu, rc);
        rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1);
        if (rc != 0)
                return rc;
 
-       kvm_s390_rewind_psw(vcpu, 4);
+       kvm_s390_retry_instr(vcpu);
 
        return 0;
 }
index f88ca72..87e2d1a 100644 (file)
@@ -335,23 +335,6 @@ static void set_intercept_indicators(struct kvm_vcpu *vcpu)
        set_intercept_indicators_stop(vcpu);
 }
 
-static u16 get_ilc(struct kvm_vcpu *vcpu)
-{
-       switch (vcpu->arch.sie_block->icptcode) {
-       case ICPT_INST:
-       case ICPT_INSTPROGI:
-       case ICPT_OPEREXC:
-       case ICPT_PARTEXEC:
-       case ICPT_IOINST:
-               /* last instruction only stored for these icptcodes */
-               return insn_length(vcpu->arch.sie_block->ipa >> 8);
-       case ICPT_PROGI:
-               return vcpu->arch.sie_block->pgmilc;
-       default:
-               return 0;
-       }
-}
-
 static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -588,7 +571,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
        struct kvm_s390_pgm_info pgm_info;
        int rc = 0, nullifying = false;
-       u16 ilc = get_ilc(vcpu);
+       u16 ilen;
 
        spin_lock(&li->lock);
        pgm_info = li->irq.pgm;
@@ -596,8 +579,9 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
        memset(&li->irq.pgm, 0, sizeof(pgm_info));
        spin_unlock(&li->lock);
 
-       VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilc:%d",
-                  pgm_info.code, ilc);
+       ilen = pgm_info.flags & KVM_S390_PGM_FLAGS_ILC_MASK;
+       VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilen:%d",
+                  pgm_info.code, ilen);
        vcpu->stat.deliver_program_int++;
        trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
                                         pgm_info.code, 0);
@@ -681,10 +665,11 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
                                   (u8 *) __LC_PER_ACCESS_ID);
        }
 
-       if (nullifying && vcpu->arch.sie_block->icptcode == ICPT_INST)
-               kvm_s390_rewind_psw(vcpu, ilc);
+       if (nullifying && !(pgm_info.flags & KVM_S390_PGM_FLAGS_NO_REWIND))
+               kvm_s390_rewind_psw(vcpu, ilen);
 
-       rc |= put_guest_lc(vcpu, ilc, (u16 *) __LC_PGM_ILC);
+       /* bits 1+2 of the target field hold the ilc, so we can store ilen directly */
+       rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC);
        rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea,
                                 (u64 *) __LC_LAST_BREAK);
        rc |= put_guest_lc(vcpu, pgm_info.code,
@@ -1059,8 +1044,16 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
        trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
                                   irq->u.pgm.code, 0);
 
+       if (!(irq->u.pgm.flags & KVM_S390_PGM_FLAGS_ILC_VALID)) {
+               /* auto detection if no valid ILC was given */
+               irq->u.pgm.flags &= ~KVM_S390_PGM_FLAGS_ILC_MASK;
+               irq->u.pgm.flags |= kvm_s390_get_ilen(vcpu);
+               irq->u.pgm.flags |= KVM_S390_PGM_FLAGS_ILC_VALID;
+       }
+
        if (irq->u.pgm.code == PGM_PER) {
                li->irq.pgm.code |= PGM_PER;
+               li->irq.pgm.flags = irq->u.pgm.flags;
                /* only modify PER related information */
                li->irq.pgm.per_address = irq->u.pgm.per_address;
                li->irq.pgm.per_code = irq->u.pgm.per_code;
@@ -1069,6 +1062,7 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
        } else if (!(irq->u.pgm.code & PGM_PER)) {
                li->irq.pgm.code = (li->irq.pgm.code & PGM_PER) |
                                   irq->u.pgm.code;
+               li->irq.pgm.flags = irq->u.pgm.flags;
                /* only modify non-PER information */
                li->irq.pgm.trans_exc_code = irq->u.pgm.trans_exc_code;
                li->irq.pgm.mon_code = irq->u.pgm.mon_code;
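
A hedged sketch of a caller forcing an explicit instruction length instead
of relying on the auto-detection above; PGM_OPERATION is only an example
program interruption code:

	struct kvm_s390_pgm_info pgm = {
		.code  = PGM_OPERATION,
		.flags = 4 | KVM_S390_PGM_FLAGS_ILC_VALID,	/* ilen = 4 */
	};

	kvm_s390_inject_prog_irq(vcpu, &pgm);
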
index 4af21c7..28bd5ea 100644 (file)
@@ -274,7 +274,6 @@ static void kvm_s390_sync_dirty_log(struct kvm *kvm,
        unsigned long address;
        struct gmap *gmap = kvm->arch.gmap;
 
-       down_read(&gmap->mm->mmap_sem);
        /* Loop over all guest pages */
        last_gfn = memslot->base_gfn + memslot->npages;
        for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) {
@@ -282,8 +281,10 @@ static void kvm_s390_sync_dirty_log(struct kvm *kvm,
 
                if (gmap_test_and_clear_dirty(address, gmap))
                        mark_page_dirty(kvm, cur_gfn);
+               if (fatal_signal_pending(current))
+                       return;
+               cond_resched();
        }
-       up_read(&gmap->mm->mmap_sem);
 }
 
 /* Section: vm related */
@@ -1414,8 +1415,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
                                    KVM_SYNC_PFAULT;
        if (test_kvm_facility(vcpu->kvm, 64))
                vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB;
-       if (test_kvm_facility(vcpu->kvm, 129))
+       /* fprs can be synchronized via vrs, even if the guest has no vx. With
+        * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format.
+        */
+       if (MACHINE_HAS_VX)
                vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS;
+       else
+               vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS;
 
        if (kvm_is_ucontrol(vcpu->kvm))
                return __kvm_ucontrol_vcpu_init(vcpu);
@@ -1430,10 +1436,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc;
        vcpu->arch.host_fpregs.regs = current->thread.fpu.regs;
 
-       /* Depending on MACHINE_HAS_VX, data stored to vrs either
-        * has vector register or floating point register format.
-        */
-       current->thread.fpu.regs = vcpu->run->s.regs.vrs;
+       if (MACHINE_HAS_VX)
+               current->thread.fpu.regs = vcpu->run->s.regs.vrs;
+       else
+               current->thread.fpu.regs = vcpu->run->s.regs.fprs;
        current->thread.fpu.fpc = vcpu->run->s.regs.fpc;
        if (test_fp_ctl(current->thread.fpu.fpc))
                /* User space provided an invalid FPC, let's clear it */
@@ -2158,8 +2164,10 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
 
 static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
 {
-       psw_t *psw = &vcpu->arch.sie_block->gpsw;
-       u8 opcode;
+       struct kvm_s390_pgm_info pgm_info = {
+               .code = PGM_ADDRESSING,
+       };
+       u8 opcode, ilen;
        int rc;
 
        VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
@@ -2173,12 +2181,21 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
         * to look up the current opcode to get the length of the instruction
         * to be able to forward the PSW.
         */
-       rc = read_guest(vcpu, psw->addr, 0, &opcode, 1);
-       if (rc)
-               return kvm_s390_inject_prog_cond(vcpu, rc);
-       psw->addr = __rewind_psw(*psw, -insn_length(opcode));
-
-       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       rc = read_guest_instr(vcpu, &opcode, 1);
+       ilen = insn_length(opcode);
+       if (rc < 0) {
+               return rc;
+       } else if (rc) {
+               /* Instruction-Fetching Exceptions - we can't detect the ilen.
+                * Forward the PSW by an arbitrary ilc; injection will take
+                * care of nullification if necessary.
+                */
+               pgm_info = vcpu->arch.pgm;
+               ilen = 4;
+       }
+       pgm_info.flags = ilen | KVM_S390_PGM_FLAGS_ILC_VALID;
+       kvm_s390_forward_psw(vcpu, ilen);
+       return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
 }
 
 static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
@@ -2386,7 +2403,7 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa)
                                     fprs, 128);
        } else {
                rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA,
-                                    vcpu->run->s.regs.vrs, 128);
+                                    vcpu->run->s.regs.fprs, 128);
        }
        rc |= write_guest_abs(vcpu, gpa + __LC_GPREGS_SAVE_AREA,
                              vcpu->run->s.regs.gprs, 128);
@@ -2605,7 +2622,8 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
        switch (mop->op) {
        case KVM_S390_MEMOP_LOGICAL_READ:
                if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
-                       r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, false);
+                       r = check_gva_range(vcpu, mop->gaddr, mop->ar,
+                                           mop->size, GACC_FETCH);
                        break;
                }
                r = read_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size);
@@ -2616,7 +2634,8 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
                break;
        case KVM_S390_MEMOP_LOGICAL_WRITE:
                if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
-                       r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, true);
+                       r = check_gva_range(vcpu, mop->gaddr, mop->ar,
+                                           mop->size, GACC_STORE);
                        break;
                }
                if (copy_from_user(tmpbuf, uaddr, mop->size)) {
index df1abad..1c756c7 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <asm/facility.h>
+#include <asm/processor.h>
 
 typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
 
@@ -212,8 +213,22 @@ int kvm_s390_reinject_io_int(struct kvm *kvm,
 int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked);
 
 /* implemented in intercept.c */
-void kvm_s390_rewind_psw(struct kvm_vcpu *vcpu, int ilc);
+u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu);
 int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu);
+static inline void kvm_s390_rewind_psw(struct kvm_vcpu *vcpu, int ilen)
+{
+       struct kvm_s390_sie_block *sie_block = vcpu->arch.sie_block;
+
+       sie_block->gpsw.addr = __rewind_psw(sie_block->gpsw, ilen);
+}
+static inline void kvm_s390_forward_psw(struct kvm_vcpu *vcpu, int ilen)
+{
+       kvm_s390_rewind_psw(vcpu, -ilen);
+}
+static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu)
+{
+       kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu));
+}
 
 /* implemented in priv.c */
 int is_valid_psw(psw_t *psw);
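
A hedged sketch of how the new helpers compose; retrying rewinds the PSW by
the intercepted instruction's own length, so the guest re-executes it:

	/* replaces the old, hardcoded kvm_s390_rewind_psw(vcpu, 4) */
	kvm_s390_retry_instr(vcpu);	/* rewind by kvm_s390_get_ilen(vcpu) */
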
index ed74e86..add9909 100644 (file)
@@ -173,7 +173,7 @@ static int handle_skey(struct kvm_vcpu *vcpu)
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       kvm_s390_rewind_psw(vcpu, 4);
+       kvm_s390_retry_instr(vcpu);
        VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
        return 0;
 }
@@ -184,7 +184,7 @@ static int handle_ipte_interlock(struct kvm_vcpu *vcpu)
        if (psw_bits(vcpu->arch.sie_block->gpsw).p)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
        wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu));
-       kvm_s390_rewind_psw(vcpu, 4);
+       kvm_s390_retry_instr(vcpu);
        VCPU_EVENT(vcpu, 4, "%s", "retrying ipte interlock operation");
        return 0;
 }
@@ -759,8 +759,8 @@ static int handle_essa(struct kvm_vcpu *vcpu)
        if (((vcpu->arch.sie_block->ipb & 0xf0000000) >> 28) > 6)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-       /* Rewind PSW to repeat the ESSA instruction */
-       kvm_s390_rewind_psw(vcpu, 4);
+       /* Retry the ESSA instruction */
+       kvm_s390_retry_instr(vcpu);
        vcpu->arch.sie_block->cbrlo &= PAGE_MASK;       /* reset nceo */
        cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo);
        down_read(&gmap->mm->mmap_sem);
@@ -981,11 +981,12 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
                return -EOPNOTSUPP;
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
                ipte_lock(vcpu);
-       ret = guest_translate_address(vcpu, address1, ar, &gpa, 1);
+       ret = guest_translate_address(vcpu, address1, ar, &gpa, GACC_STORE);
        if (ret == PGM_PROTECTION) {
                /* Write protected? Try again with read-only... */
                cc = 1;
-               ret = guest_translate_address(vcpu, address1, ar, &gpa, 0);
+               ret = guest_translate_address(vcpu, address1, ar, &gpa,
+                                             GACC_FETCH);
        }
        if (ret) {
                if (ret == PGM_ADDRESSING || ret == PGM_TRANSLATION_SPEC) {
index 44adbb8..d110dc4 100644 (file)
@@ -32,6 +32,7 @@
 #include <asm/mtrr.h>
 #include <asm/msr-index.h>
 #include <asm/asm.h>
+#include <asm/kvm_page_track.h>
 
 #define KVM_MAX_VCPUS 255
 #define KVM_SOFT_MAX_VCPUS 160
@@ -214,6 +215,14 @@ struct kvm_mmu_memory_cache {
        void *objects[KVM_NR_MEM_OBJS];
 };
 
+/*
+ * The pages used as guest page tables by the soft MMU are tracked via
+ * kvm_memory_slot.arch.gfn_track, which is 16 bits wide, so the role bits
+ * used by an indirect shadow page cannot exceed 15 bits.
+ *
+ * Currently, we use 14 bits: @level, @cr4_pae, @quadrant, @access,
+ * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
+ */
 union kvm_mmu_page_role {
        unsigned word;
        struct {
@@ -276,7 +285,7 @@ struct kvm_mmu_page {
 #endif
 
        /* Number of writes since the last time traversal visited this page.  */
-       int write_flooding_count;
+       atomic_t write_flooding_count;
 };
 
 struct kvm_pio_request {
@@ -338,12 +347,8 @@ struct kvm_mmu {
 
        struct rsvd_bits_validate guest_rsvd_check;
 
-       /*
-        * Bitmap: bit set = last pte in walk
-        * index[0:1]: level (zero-based)
-        * index[2]: pte.ps
-        */
-       u8 last_pte_bitmap;
+       /* Can have large pages at levels 2..last_nonleaf_level-1. */
+       u8 last_nonleaf_level;
 
        bool nx;
 
@@ -644,12 +649,13 @@ struct kvm_vcpu_arch {
 };
 
 struct kvm_lpage_info {
-       int write_count;
+       int disallow_lpage;
 };
 
 struct kvm_arch_memory_slot {
        struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
        struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
+       unsigned short *gfn_track[KVM_PAGE_TRACK_MAX];
 };
 
 /*
@@ -694,6 +700,8 @@ struct kvm_arch {
         */
        struct list_head active_mmu_pages;
        struct list_head zapped_obsolete_pages;
+       struct kvm_page_track_notifier_node mmu_sp_tracker;
+       struct kvm_page_track_notifier_head track_notifier_head;
 
        struct list_head assigned_dev_head;
        struct iommu_domain *iommu_domain;
@@ -754,6 +762,8 @@ struct kvm_arch {
 
        bool irqchip_split;
        u8 nr_reserved_ioapic_pins;
+
+       bool disabled_lapic_found;
 };
 
 struct kvm_vm_stat {
@@ -988,6 +998,8 @@ void kvm_mmu_module_exit(void);
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 void kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_mmu_init_vm(struct kvm *kvm);
+void kvm_mmu_uninit_vm(struct kvm *kvm);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
                u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
@@ -1127,8 +1139,6 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-                      const u8 *new, int bytes);
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
new file mode 100644 (file)
index 0000000..c2b8d24
--- /dev/null
@@ -0,0 +1,61 @@
+#ifndef _ASM_X86_KVM_PAGE_TRACK_H
+#define _ASM_X86_KVM_PAGE_TRACK_H
+
+enum kvm_page_track_mode {
+       KVM_PAGE_TRACK_WRITE,
+       KVM_PAGE_TRACK_MAX,
+};
+
+/*
+ * The notifier represented by @kvm_page_track_notifier_node is linked into
+ * the head which will be notified when guest is triggering the track event.
+ *
+ * Write access on the head is protected by kvm->mmu_lock, read access
+ * is protected by track_srcu.
+ */
+struct kvm_page_track_notifier_head {
+       struct srcu_struct track_srcu;
+       struct hlist_head track_notifier_list;
+};
+
+struct kvm_page_track_notifier_node {
+       struct hlist_node node;
+
+       /*
+        * Called after write emulation has finished, when the guest has
+        * written to a write-tracked page.
+        *
+        * @vcpu: the vcpu where the write access happened.
+        * @gpa: the physical address written by the guest.
+        * @new: the data written to the address.
+        * @bytes: the number of bytes written.
+        */
+       void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+                           int bytes);
+};
+
+void kvm_page_track_init(struct kvm *kvm);
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *free,
+                                struct kvm_memory_slot *dont);
+int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
+                                 unsigned long npages);
+
+void kvm_slot_page_track_add_page(struct kvm *kvm,
+                                 struct kvm_memory_slot *slot, gfn_t gfn,
+                                 enum kvm_page_track_mode mode);
+void kvm_slot_page_track_remove_page(struct kvm *kvm,
+                                    struct kvm_memory_slot *slot, gfn_t gfn,
+                                    enum kvm_page_track_mode mode);
+bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
+                             enum kvm_page_track_mode mode);
+
+void
+kvm_page_track_register_notifier(struct kvm *kvm,
+                                struct kvm_page_track_notifier_node *n);
+void
+kvm_page_track_unregister_notifier(struct kvm *kvm,
+                                  struct kvm_page_track_notifier_node *n);
+void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+                         int bytes);
+#endif
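
A hedged sketch of a consumer of this interface (the shadow MMU hookup
elsewhere in this merge has the same shape); my_track_write and my_node are
hypothetical names:

	static void my_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
				   const u8 *new, int bytes)
	{
		/* invalidate whatever was cached for this gpa */
	}

	static struct kvm_page_track_notifier_node my_node = {
		.track_write = my_track_write,
	};

	kvm_page_track_register_notifier(kvm, &my_node);
	/* then, under the appropriate locking, start write-tracking a gfn: */
	kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
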
index 7956412..9b1a918 100644 (file)
                (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
 
 /* Declare the various hypercall operations. */
-#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT                0x0008
+#define HVCALL_NOTIFY_LONG_SPIN_WAIT           0x0008
+#define HVCALL_POST_MESSAGE                    0x005c
+#define HVCALL_SIGNAL_EVENT                    0x005d
 
 #define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE             0x00000001
 #define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT      12
index a1ff508..464fa47 100644 (file)
@@ -13,9 +13,10 @@ kvm-$(CONFIG_KVM_ASYNC_PF)   += $(KVM)/async_pf.o
 
 kvm-y                  += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
                           i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
-                          hyperv.o
+                          hyperv.o page_track.o
 
 kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)    += assigned-dev.o iommu.o
+
 kvm-intel-y            += vmx.o pmu_intel.o
 kvm-amd-y              += svm.o pmu_amd.o
 
index 9dc091a..308b859 100644 (file)
@@ -51,11 +51,9 @@ struct kvm_assigned_dev_kernel {
 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
                                                      int assigned_dev_id)
 {
-       struct list_head *ptr;
        struct kvm_assigned_dev_kernel *match;
 
-       list_for_each(ptr, head) {
-               match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
+       list_for_each_entry(match, head, list) {
                if (match->assigned_dev_id == assigned_dev_id)
                        return match;
        }
@@ -373,14 +371,10 @@ static void kvm_free_assigned_device(struct kvm *kvm,
 
 void kvm_free_all_assigned_devices(struct kvm *kvm)
 {
-       struct list_head *ptr, *ptr2;
-       struct kvm_assigned_dev_kernel *assigned_dev;
-
-       list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
-               assigned_dev = list_entry(ptr,
-                                         struct kvm_assigned_dev_kernel,
-                                         list);
+       struct kvm_assigned_dev_kernel *assigned_dev, *tmp;
 
+       list_for_each_entry_safe(assigned_dev, tmp,
+                                &kvm->arch.assigned_dev_head, list) {
                kvm_free_assigned_device(kvm, assigned_dev);
        }
 }
index c58ba67..5ff3485 100644 (file)
@@ -1043,6 +1043,27 @@ bool kvm_hv_hypercall_enabled(struct kvm *kvm)
        return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
 }
 
+static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+       bool longmode;
+
+       longmode = is_64_bit_mode(vcpu);
+       if (longmode) {
+               kvm_register_write(vcpu, VCPU_REGS_RAX, result);
+       } else {
+               kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32);
+               kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff);
+       }
+}
+
+static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+       struct kvm_run *run = vcpu->run;
+
+       kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result);
+       return 1;
+}
+
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 {
        u64 param, ingpa, outgpa, ret;
@@ -1055,7 +1076,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
         */
        if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
                kvm_queue_exception(vcpu, UD_VECTOR);
-               return 0;
+               return 1;
        }
 
        longmode = is_64_bit_mode(vcpu);
@@ -1083,22 +1104,33 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 
        trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
 
+       /* Hypercall continuation is not supported yet */
+       if (rep_cnt || rep_idx) {
+               res = HV_STATUS_INVALID_HYPERCALL_CODE;
+               goto set_result;
+       }
+
        switch (code) {
-       case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
+       case HVCALL_NOTIFY_LONG_SPIN_WAIT:
                kvm_vcpu_on_spin(vcpu);
                break;
+       case HVCALL_POST_MESSAGE:
+       case HVCALL_SIGNAL_EVENT:
+               vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+               vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
+               vcpu->run->hyperv.u.hcall.input = param;
+               vcpu->run->hyperv.u.hcall.params[0] = ingpa;
+               vcpu->run->hyperv.u.hcall.params[1] = outgpa;
+               vcpu->arch.complete_userspace_io =
+                               kvm_hv_hypercall_complete_userspace;
+               return 0;
        default:
                res = HV_STATUS_INVALID_HYPERCALL_CODE;
                break;
        }
 
+set_result:
        ret = res | (((u64)rep_done & 0xfff) << 32);
-       if (longmode) {
-               kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
-       } else {
-               kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
-               kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
-       }
-
+       kvm_hv_hypercall_set_result(vcpu, ret);
        return 1;
 }
index b0ea42b..a4bf5b4 100644 (file)
 #define RW_STATE_WORD0 3
 #define RW_STATE_WORD1 4
 
-/* Compute with 96 bit intermediate result: (a*b)/c */
-static u64 muldiv64(u64 a, u32 b, u32 c)
+static void pit_set_gate(struct kvm_pit *pit, int channel, u32 val)
 {
-       union {
-               u64 ll;
-               struct {
-                       u32 low, high;
-               } l;
-       } u, res;
-       u64 rl, rh;
-
-       u.ll = a;
-       rl = (u64)u.l.low * (u64)b;
-       rh = (u64)u.l.high * (u64)b;
-       rh += (rl >> 32);
-       res.l.high = div64_u64(rh, c);
-       res.l.low = div64_u64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c);
-       return res.ll;
-}
-
-static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
-{
-       struct kvm_kpit_channel_state *c =
-               &kvm->arch.vpit->pit_state.channels[channel];
-
-       WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+       struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
 
        switch (c->mode) {
        default:
@@ -97,18 +74,16 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
        c->gate = val;
 }
 
-static int pit_get_gate(struct kvm *kvm, int channel)
+static int pit_get_gate(struct kvm_pit *pit, int channel)
 {
-       WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
-       return kvm->arch.vpit->pit_state.channels[channel].gate;
+       return pit->pit_state.channels[channel].gate;
 }
 
-static s64 __kpit_elapsed(struct kvm *kvm)
+static s64 __kpit_elapsed(struct kvm_pit *pit)
 {
        s64 elapsed;
        ktime_t remaining;
-       struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+       struct kvm_kpit_state *ps = &pit->pit_state;
 
        if (!ps->period)
                return 0;
@@ -128,26 +103,23 @@ static s64 __kpit_elapsed(struct kvm *kvm)
        return elapsed;
 }
 
-static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
+static s64 kpit_elapsed(struct kvm_pit *pit, struct kvm_kpit_channel_state *c,
                        int channel)
 {
        if (channel == 0)
-               return __kpit_elapsed(kvm);
+               return __kpit_elapsed(pit);
 
        return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
 }
 
-static int pit_get_count(struct kvm *kvm, int channel)
+static int pit_get_count(struct kvm_pit *pit, int channel)
 {
-       struct kvm_kpit_channel_state *c =
-               &kvm->arch.vpit->pit_state.channels[channel];
+       struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
        s64 d, t;
        int counter;
 
-       WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
-       t = kpit_elapsed(kvm, c, channel);
-       d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+       t = kpit_elapsed(pit, c, channel);
+       d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
 
        switch (c->mode) {
        case 0:
@@ -167,17 +139,14 @@ static int pit_get_count(struct kvm *kvm, int channel)
        return counter;
 }
 
-static int pit_get_out(struct kvm *kvm, int channel)
+static int pit_get_out(struct kvm_pit *pit, int channel)
 {
-       struct kvm_kpit_channel_state *c =
-               &kvm->arch.vpit->pit_state.channels[channel];
+       struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
        s64 d, t;
        int out;
 
-       WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
-       t = kpit_elapsed(kvm, c, channel);
-       d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+       t = kpit_elapsed(pit, c, channel);
+       d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
 
        switch (c->mode) {
        default:
@@ -202,29 +171,23 @@ static int pit_get_out(struct kvm *kvm, int channel)
        return out;
 }
 
-static void pit_latch_count(struct kvm *kvm, int channel)
+static void pit_latch_count(struct kvm_pit *pit, int channel)
 {
-       struct kvm_kpit_channel_state *c =
-               &kvm->arch.vpit->pit_state.channels[channel];
-
-       WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+       struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
 
        if (!c->count_latched) {
-               c->latched_count = pit_get_count(kvm, channel);
+               c->latched_count = pit_get_count(pit, channel);
                c->count_latched = c->rw_mode;
        }
 }
 
-static void pit_latch_status(struct kvm *kvm, int channel)
+static void pit_latch_status(struct kvm_pit *pit, int channel)
 {
-       struct kvm_kpit_channel_state *c =
-               &kvm->arch.vpit->pit_state.channels[channel];
-
-       WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+       struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
 
        if (!c->status_latched) {
                /* TODO: Return NULL COUNT (bit 6). */
-               c->status = ((pit_get_out(kvm, channel) << 7) |
+               c->status = ((pit_get_out(pit, channel) << 7) |
                                (c->rw_mode << 4) |
                                (c->mode << 1) |
                                c->bcd);
@@ -232,26 +195,24 @@ static void pit_latch_status(struct kvm *kvm, int channel)
        }
 }
 
+static inline struct kvm_pit *pit_state_to_pit(struct kvm_kpit_state *ps)
+{
+       return container_of(ps, struct kvm_pit, pit_state);
+}
+
 static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
        struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
                                                 irq_ack_notifier);
-       int value;
-
-       spin_lock(&ps->inject_lock);
-       value = atomic_dec_return(&ps->pending);
-       if (value < 0)
-               /* spurious acks can be generated if, for example, the
-                * PIC is being reset.  Handle it gracefully here
-                */
-               atomic_inc(&ps->pending);
-       else if (value > 0)
-               /* in this case, we had multiple outstanding pit interrupts
-                * that we needed to inject.  Reinject
-                */
-               queue_kthread_work(&ps->pit->worker, &ps->pit->expired);
-       ps->irq_ack = 1;
-       spin_unlock(&ps->inject_lock);
+       struct kvm_pit *pit = pit_state_to_pit(ps);
+
+       atomic_set(&ps->irq_ack, 1);
+       /* irq_ack should be set before pending is read.  Order accesses with
+        * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work.
+        */
+       smp_mb();
+       if (atomic_dec_if_positive(&ps->pending) > 0)
+               queue_kthread_work(&pit->worker, &pit->expired);
 }
 
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -282,45 +243,36 @@ static void pit_do_work(struct kthread_work *work)
        struct kvm_vcpu *vcpu;
        int i;
        struct kvm_kpit_state *ps = &pit->pit_state;
-       int inject = 0;
 
-       /* Try to inject pending interrupts when
-        * last one has been acked.
+       if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
+               return;
+
+       kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false);
+       kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false);
+
+       /*
+        * Provides NMI watchdog support via Virtual Wire mode.
+        * The route is: PIT -> LVT0 in NMI mode.
+        *
+        * Note: Our Virtual Wire implementation does not follow
+        * the MP specification.  We propagate a PIT interrupt to all
+        * VCPUs and only when LVT0 is in NMI mode.  The interrupt can
+        * also be simultaneously delivered through PIC and IOAPIC.
         */
-       spin_lock(&ps->inject_lock);
-       if (ps->irq_ack) {
-               ps->irq_ack = 0;
-               inject = 1;
-       }
-       spin_unlock(&ps->inject_lock);
-       if (inject) {
-               kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false);
-               kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false);
-
-               /*
-                * Provides NMI watchdog support via Virtual Wire mode.
-                * The route is: PIT -> PIC -> LVT0 in NMI mode.
-                *
-                * Note: Our Virtual Wire implementation is simplified, only
-                * propagating PIT interrupts to all VCPUs when they have set
-                * LVT0 to NMI delivery. Other PIC interrupts are just sent to
-                * VCPU0, and only if its LVT0 is in EXTINT mode.
-                */
-               if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
-                       kvm_for_each_vcpu(i, vcpu, kvm)
-                               kvm_apic_nmi_wd_deliver(vcpu);
-       }
+       if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
+               kvm_for_each_vcpu(i, vcpu, kvm)
+                       kvm_apic_nmi_wd_deliver(vcpu);
 }
 
 static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 {
        struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
-       struct kvm_pit *pt = ps->kvm->arch.vpit;
+       struct kvm_pit *pt = pit_state_to_pit(ps);
 
-       if (ps->reinject || !atomic_read(&ps->pending)) {
+       if (atomic_read(&ps->reinject))
                atomic_inc(&ps->pending);
-               queue_kthread_work(&pt->worker, &pt->expired);
-       }
+
+       queue_kthread_work(&pt->worker, &pt->expired);
 
        if (ps->is_periodic) {
                hrtimer_add_expires_ns(&ps->timer, ps->period);
@@ -329,30 +281,54 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
                return HRTIMER_NORESTART;
 }
 
-static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
+static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
 {
-       struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+       atomic_set(&pit->pit_state.pending, 0);
+       atomic_set(&pit->pit_state.irq_ack, 1);
+}
+
+void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
+{
+       struct kvm_kpit_state *ps = &pit->pit_state;
+       struct kvm *kvm = pit->kvm;
+
+       if (atomic_read(&ps->reinject) == reinject)
+               return;
+
+       if (reinject) {
+               /* The initial state is preserved while ps->reinject == 0. */
+               kvm_pit_reset_reinject(pit);
+               kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+               kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+       } else {
+               kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+               kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+       }
+
+       atomic_set(&ps->reinject, reinject);
+}
+
+static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period)
+{
+       struct kvm_kpit_state *ps = &pit->pit_state;
+       struct kvm *kvm = pit->kvm;
        s64 interval;
 
        if (!ioapic_in_kernel(kvm) ||
            ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
                return;
 
-       interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
+       interval = mul_u64_u32_div(val, NSEC_PER_SEC, KVM_PIT_FREQ);
 
        pr_debug("create pit timer, interval is %llu nsec\n", interval);
 
        /* TODO: the new value only takes effect after the timer is retriggered */
        hrtimer_cancel(&ps->timer);
-       flush_kthread_work(&ps->pit->expired);
+       flush_kthread_work(&pit->expired);
        ps->period = interval;
        ps->is_periodic = is_period;
 
-       ps->timer.function = pit_timer_fn;
-       ps->kvm = ps->pit->kvm;
-
-       atomic_set(&ps->pending, 0);
-       ps->irq_ack = 1;
+       kvm_pit_reset_reinject(pit);
 
        /*
         * Do not allow the guest to program periodic timers with small
@@ -375,11 +351,9 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
                      HRTIMER_MODE_ABS);
 }
 
-static void pit_load_count(struct kvm *kvm, int channel, u32 val)
+static void pit_load_count(struct kvm_pit *pit, int channel, u32 val)
 {
-       struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
-
-       WARN_ON(!mutex_is_locked(&ps->lock));
+       struct kvm_kpit_state *ps = &pit->pit_state;
 
        pr_debug("load_count val is %d, channel is %d\n", val, channel);
 
@@ -404,29 +378,33 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
        case 1:
         /* FIXME: enhance mode 4 precision */
        case 4:
-               create_pit_timer(kvm, val, 0);
+               create_pit_timer(pit, val, 0);
                break;
        case 2:
        case 3:
-               create_pit_timer(kvm, val, 1);
+               create_pit_timer(pit, val, 1);
                break;
        default:
-               destroy_pit_timer(kvm->arch.vpit);
+               destroy_pit_timer(pit);
        }
 }
 
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start)
+void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+               int hpet_legacy_start)
 {
        u8 saved_mode;
+
+       WARN_ON_ONCE(!mutex_is_locked(&pit->pit_state.lock));
+
        if (hpet_legacy_start) {
                /* save existing mode for later reenablement */
                WARN_ON(channel != 0);
-               saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
-               kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */
-               pit_load_count(kvm, channel, val);
-               kvm->arch.vpit->pit_state.channels[0].mode = saved_mode;
+               saved_mode = pit->pit_state.channels[0].mode;
+               pit->pit_state.channels[0].mode = 0xff; /* disable timer */
+               pit_load_count(pit, channel, val);
+               pit->pit_state.channels[0].mode = saved_mode;
        } else {
-               pit_load_count(kvm, channel, val);
+               pit_load_count(pit, channel, val);
        }
 }
 
@@ -452,7 +430,6 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu,
 {
        struct kvm_pit *pit = dev_to_pit(this);
        struct kvm_kpit_state *pit_state = &pit->pit_state;
-       struct kvm *kvm = pit->kvm;
        int channel, access;
        struct kvm_kpit_channel_state *s;
        u32 val = *(u32 *) data;
@@ -476,9 +453,9 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu,
                                s = &pit_state->channels[channel];
                                if (val & (2 << channel)) {
                                        if (!(val & 0x20))
-                                               pit_latch_count(kvm, channel);
+                                               pit_latch_count(pit, channel);
                                        if (!(val & 0x10))
-                                               pit_latch_status(kvm, channel);
+                                               pit_latch_status(pit, channel);
                                }
                        }
                } else {
@@ -486,7 +463,7 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu,
                        s = &pit_state->channels[channel];
                        access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
                        if (access == 0) {
-                               pit_latch_count(kvm, channel);
+                               pit_latch_count(pit, channel);
                        } else {
                                s->rw_mode = access;
                                s->read_state = access;
@@ -503,17 +480,17 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu,
                switch (s->write_state) {
                default:
                case RW_STATE_LSB:
-                       pit_load_count(kvm, addr, val);
+                       pit_load_count(pit, addr, val);
                        break;
                case RW_STATE_MSB:
-                       pit_load_count(kvm, addr, val << 8);
+                       pit_load_count(pit, addr, val << 8);
                        break;
                case RW_STATE_WORD0:
                        s->write_latch = val;
                        s->write_state = RW_STATE_WORD1;
                        break;
                case RW_STATE_WORD1:
-                       pit_load_count(kvm, addr, s->write_latch | (val << 8));
+                       pit_load_count(pit, addr, s->write_latch | (val << 8));
                        s->write_state = RW_STATE_WORD0;
                        break;
                }
@@ -529,7 +506,6 @@ static int pit_ioport_read(struct kvm_vcpu *vcpu,
 {
        struct kvm_pit *pit = dev_to_pit(this);
        struct kvm_kpit_state *pit_state = &pit->pit_state;
-       struct kvm *kvm = pit->kvm;
        int ret, count;
        struct kvm_kpit_channel_state *s;
        if (!pit_in_range(addr))
@@ -566,20 +542,20 @@ static int pit_ioport_read(struct kvm_vcpu *vcpu,
                switch (s->read_state) {
                default:
                case RW_STATE_LSB:
-                       count = pit_get_count(kvm, addr);
+                       count = pit_get_count(pit, addr);
                        ret = count & 0xff;
                        break;
                case RW_STATE_MSB:
-                       count = pit_get_count(kvm, addr);
+                       count = pit_get_count(pit, addr);
                        ret = (count >> 8) & 0xff;
                        break;
                case RW_STATE_WORD0:
-                       count = pit_get_count(kvm, addr);
+                       count = pit_get_count(pit, addr);
                        ret = count & 0xff;
                        s->read_state = RW_STATE_WORD1;
                        break;
                case RW_STATE_WORD1:
-                       count = pit_get_count(kvm, addr);
+                       count = pit_get_count(pit, addr);
                        ret = (count >> 8) & 0xff;
                        s->read_state = RW_STATE_WORD0;
                        break;
@@ -600,14 +576,13 @@ static int speaker_ioport_write(struct kvm_vcpu *vcpu,
 {
        struct kvm_pit *pit = speaker_to_pit(this);
        struct kvm_kpit_state *pit_state = &pit->pit_state;
-       struct kvm *kvm = pit->kvm;
        u32 val = *(u32 *) data;
        if (addr != KVM_SPEAKER_BASE_ADDRESS)
                return -EOPNOTSUPP;
 
        mutex_lock(&pit_state->lock);
        pit_state->speaker_data_on = (val >> 1) & 1;
-       pit_set_gate(kvm, 2, val & 1);
+       pit_set_gate(pit, 2, val & 1);
        mutex_unlock(&pit_state->lock);
        return 0;
 }
@@ -618,7 +593,6 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu,
 {
        struct kvm_pit *pit = speaker_to_pit(this);
        struct kvm_kpit_state *pit_state = &pit->pit_state;
-       struct kvm *kvm = pit->kvm;
        unsigned int refresh_clock;
        int ret;
        if (addr != KVM_SPEAKER_BASE_ADDRESS)
@@ -628,8 +602,8 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu,
        refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
 
        mutex_lock(&pit_state->lock);
-       ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) |
-               (pit_get_out(kvm, 2) << 5) | (refresh_clock << 4));
+       ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) |
+               (pit_get_out(pit, 2) << 5) | (refresh_clock << 4));
        if (len > sizeof(ret))
                len = sizeof(ret);
        memcpy(data, (char *)&ret, len);
@@ -637,33 +611,28 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-void kvm_pit_reset(struct kvm_pit *pit)
+static void kvm_pit_reset(struct kvm_pit *pit)
 {
        int i;
        struct kvm_kpit_channel_state *c;
 
-       mutex_lock(&pit->pit_state.lock);
        pit->pit_state.flags = 0;
        for (i = 0; i < 3; i++) {
                c = &pit->pit_state.channels[i];
                c->mode = 0xff;
                c->gate = (i != 2);
-               pit_load_count(pit->kvm, i, 0);
+               pit_load_count(pit, i, 0);
        }
-       mutex_unlock(&pit->pit_state.lock);
 
-       atomic_set(&pit->pit_state.pending, 0);
-       pit->pit_state.irq_ack = 1;
+       kvm_pit_reset_reinject(pit);
 }
 
 static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
 {
        struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
 
-       if (!mask) {
-               atomic_set(&pit->pit_state.pending, 0);
-               pit->pit_state.irq_ack = 1;
-       }
+       if (!mask)
+               kvm_pit_reset_reinject(pit);
 }
 
 static const struct kvm_io_device_ops pit_dev_ops = {
@@ -690,14 +659,10 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
                return NULL;
 
        pit->irq_source_id = kvm_request_irq_source_id(kvm);
-       if (pit->irq_source_id < 0) {
-               kfree(pit);
-               return NULL;
-       }
+       if (pit->irq_source_id < 0)
+               goto fail_request;
 
        mutex_init(&pit->pit_state.lock);
-       mutex_lock(&pit->pit_state.lock);
-       spin_lock_init(&pit->pit_state.inject_lock);
 
        pid = get_pid(task_tgid(current));
        pid_nr = pid_vnr(pid);
@@ -706,36 +671,30 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
        init_kthread_worker(&pit->worker);
        pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker,
                                       "kvm-pit/%d", pid_nr);
-       if (IS_ERR(pit->worker_task)) {
-               mutex_unlock(&pit->pit_state.lock);
-               kvm_free_irq_source_id(kvm, pit->irq_source_id);
-               kfree(pit);
-               return NULL;
-       }
+       if (IS_ERR(pit->worker_task))
+               goto fail_kthread;
+
        init_kthread_work(&pit->expired, pit_do_work);
 
-       kvm->arch.vpit = pit;
        pit->kvm = kvm;
 
        pit_state = &pit->pit_state;
-       pit_state->pit = pit;
        hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+       pit_state->timer.function = pit_timer_fn;
+
        pit_state->irq_ack_notifier.gsi = 0;
        pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
-       kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
-       pit_state->reinject = true;
-       mutex_unlock(&pit->pit_state.lock);
+       pit->mask_notifier.func = pit_mask_notifer;
 
        kvm_pit_reset(pit);
 
-       pit->mask_notifier.func = pit_mask_notifer;
-       kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+       kvm_pit_set_reinject(pit, true);
 
        kvm_iodevice_init(&pit->dev, &pit_dev_ops);
        ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
                                      KVM_PIT_MEM_LENGTH, &pit->dev);
        if (ret < 0)
-               goto fail;
+               goto fail_register_pit;
 
        if (flags & KVM_PIT_SPEAKER_DUMMY) {
                kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
@@ -743,42 +702,35 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
                                              KVM_SPEAKER_BASE_ADDRESS, 4,
                                              &pit->speaker_dev);
                if (ret < 0)
-                       goto fail_unregister;
+                       goto fail_register_speaker;
        }
 
        return pit;
 
-fail_unregister:
+fail_register_speaker:
        kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
-
-fail:
-       kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
-       kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
-       kvm_free_irq_source_id(kvm, pit->irq_source_id);
+fail_register_pit:
+       kvm_pit_set_reinject(pit, false);
        kthread_stop(pit->worker_task);
+fail_kthread:
+       kvm_free_irq_source_id(kvm, pit->irq_source_id);
+fail_request:
        kfree(pit);
        return NULL;
 }
 
 void kvm_free_pit(struct kvm *kvm)
 {
-       struct hrtimer *timer;
-
-       if (kvm->arch.vpit) {
-               kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
-               kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
-                                             &kvm->arch.vpit->speaker_dev);
-               kvm_unregister_irq_mask_notifier(kvm, 0,
-                                              &kvm->arch.vpit->mask_notifier);
-               kvm_unregister_irq_ack_notifier(kvm,
-                               &kvm->arch.vpit->pit_state.irq_ack_notifier);
-               mutex_lock(&kvm->arch.vpit->pit_state.lock);
-               timer = &kvm->arch.vpit->pit_state.timer;
-               hrtimer_cancel(timer);
-               flush_kthread_work(&kvm->arch.vpit->expired);
-               kthread_stop(kvm->arch.vpit->worker_task);
-               kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
-               mutex_unlock(&kvm->arch.vpit->pit_state.lock);
-               kfree(kvm->arch.vpit);
+       struct kvm_pit *pit = kvm->arch.vpit;
+
+       if (pit) {
+               kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
+               kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev);
+               kvm_pit_set_reinject(pit, false);
+               hrtimer_cancel(&pit->pit_state.timer);
+               flush_kthread_work(&pit->expired);
+               kthread_stop(pit->worker_task);
+               kvm_free_irq_source_id(kvm, pit->irq_source_id);
+               kfree(pit);
        }
 }
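
The i8254.c rework above replaces the inject_lock/irq_ack pair with atomics gated by a single reinject flag: with reinject on, every timer tick accumulates in pending and is re-delivered as the guest acks; with it off, ticks are injected without any accounting. A minimal userspace model of the two policies (struct and function names are invented, not KVM's):

#include <stdio.h>
#include <stdbool.h>

struct pit_model {
	bool reinject;
	int pending;    /* ticks waiting to be re-injected */
	int delivered;  /* interrupts actually sent to the guest */
};

static void tick(struct pit_model *p)
{
	if (p->reinject)
		p->pending++;   /* deliver later, one per guest ack */
	else
		p->delivered++; /* discard policy: inject immediately */
}

static void ack(struct pit_model *p)
{
	if (p->reinject && p->pending > 0) {
		p->pending--;
		p->delivered++;
	}
}

int main(void)
{
	struct pit_model p = { .reinject = true };

	tick(&p); tick(&p); tick(&p); /* guest stalled: 3 missed ticks */
	ack(&p); ack(&p);
	printf("pending=%d delivered=%d\n", p.pending, p.delivered); /* 1 2 */
	return 0;
}
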
index c84990b..2f5af07 100644 (file)
@@ -22,19 +22,18 @@ struct kvm_kpit_channel_state {
 };
 
 struct kvm_kpit_state {
+       /* All members before "struct mutex lock" are protected by the lock. */
        struct kvm_kpit_channel_state channels[3];
        u32 flags;
        bool is_periodic;
        s64 period;                             /* unit: ns */
        struct hrtimer timer;
-       atomic_t pending;                       /* accumulated triggered timers */
-       bool reinject;
-       struct kvm *kvm;
        u32    speaker_data_on;
+
        struct mutex lock;
-       struct kvm_pit *pit;
-       spinlock_t inject_lock;
-       unsigned long irq_ack;
+       atomic_t reinject;
+       atomic_t pending; /* accumulated triggered timers */
+       atomic_t irq_ack;
        struct kvm_irq_ack_notifier irq_ack_notifier;
 };
 
@@ -57,9 +56,11 @@ struct kvm_pit {
 #define KVM_MAX_PIT_INTR_INTERVAL   HZ / 100
 #define KVM_PIT_CHANNEL_MASK       0x3
 
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
 struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
 void kvm_free_pit(struct kvm *kvm);
-void kvm_pit_reset(struct kvm_pit *pit);
+
+void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+               int hpet_legacy_start);
+void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject);
 
 #endif
index 1facfd6..9db4709 100644 (file)
@@ -94,7 +94,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
 static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
 {
        ioapic->rtc_status.pending_eoi = 0;
-       bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS);
+       bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPUS);
 }
 
 static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
@@ -117,16 +117,16 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
                return;
 
        new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
-       old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+       old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map);
 
        if (new_val == old_val)
                return;
 
        if (new_val) {
-               __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+               __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map);
                ioapic->rtc_status.pending_eoi++;
        } else {
-               __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+               __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map);
                ioapic->rtc_status.pending_eoi--;
                rtc_status_pending_eoi_check_valid(ioapic);
        }
@@ -156,7 +156,8 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
 
 static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu)
 {
-       if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) {
+       if (test_and_clear_bit(vcpu->vcpu_id,
+                              ioapic->rtc_status.dest_map.map)) {
                --ioapic->rtc_status.pending_eoi;
                rtc_status_pending_eoi_check_valid(ioapic);
        }
@@ -236,10 +237,17 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
 void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
 {
        struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+       struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
        union kvm_ioapic_redirect_entry *e;
        int index;
 
        spin_lock(&ioapic->lock);
+
+       /* Make sure we see any missing RTC EOI */
+       if (test_bit(vcpu->vcpu_id, dest_map->map))
+               __set_bit(dest_map->vectors[vcpu->vcpu_id],
+                         ioapic_handled_vectors);
+
        for (index = 0; index < IOAPIC_NUM_PINS; index++) {
                e = &ioapic->redirtbl[index];
                if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
@@ -346,7 +354,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
                 */
                BUG_ON(ioapic->rtc_status.pending_eoi != 0);
                ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
-                               ioapic->rtc_status.dest_map);
+                                              &ioapic->rtc_status.dest_map);
                ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret);
        } else
                ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
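
With the RTC delivery above now handing the whole dest_map to kvm_irq_delivery_to_apic(), delivery records both the destination vCPU and the vector it received, and the EOI side (next hunk) only counts an EOI when the vector matches the recorded one. A self-contained model of that bookkeeping (types and names invented for illustration):

#include <stdio.h>
#include <stdbool.h>
#include <string.h>

#define MAX_VCPUS 64

struct dest_map_model {
	unsigned long map;              /* 1 bit per vcpu (<= 64 here) */
	unsigned char vectors[MAX_VCPUS];
};

/* delivery: remember which vcpu got which vector */
static void record_delivery(struct dest_map_model *dm, int vcpu, int vector)
{
	dm->map |= 1UL << vcpu;
	dm->vectors[vcpu] = (unsigned char)vector;
}

/* EOI: only count it if this vcpu was a destination and the
 * EOI'd vector matches what was delivered to it */
static bool rtc_eoi(struct dest_map_model *dm, int vcpu, int vector)
{
	if (!(dm->map & (1UL << vcpu)) || dm->vectors[vcpu] != vector)
		return false;
	dm->map &= ~(1UL << vcpu);
	return true;
}

int main(void)
{
	struct dest_map_model dm;

	memset(&dm, 0, sizeof(dm));
	record_delivery(&dm, 2, 0x28);
	printf("eoi(vcpu2, 0x30): %d\n", rtc_eoi(&dm, 2, 0x30)); /* 0 */
	printf("eoi(vcpu2, 0x28): %d\n", rtc_eoi(&dm, 2, 0x28)); /* 1 */
	return 0;
}
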
@@ -407,8 +415,14 @@ static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
 static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
                        struct kvm_ioapic *ioapic, int vector, int trigger_mode)
 {
-       int i;
+       struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
        struct kvm_lapic *apic = vcpu->arch.apic;
+       int i;
+
+       /* RTC special handling */
+       if (test_bit(vcpu->vcpu_id, dest_map->map) &&
+           vector == dest_map->vectors[vcpu->vcpu_id])
+               rtc_irq_eoi(ioapic, vcpu);
 
        for (i = 0; i < IOAPIC_NUM_PINS; i++) {
                union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
@@ -416,8 +430,6 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
                if (ent->fields.vector != vector)
                        continue;
 
-               if (i == RTC_GSI)
-                       rtc_irq_eoi(ioapic, vcpu);
                /*
                 * We are dropping lock while calling ack notifiers because ack
                 * notifier callbacks for assigned devices call into IOAPIC
index 2d16dc2..7d2692a 100644 (file)
@@ -40,9 +40,21 @@ struct kvm_vcpu;
 #define RTC_GSI -1U
 #endif
 
+struct dest_map {
+       /* vcpu bitmap where IRQ has been sent */
+       DECLARE_BITMAP(map, KVM_MAX_VCPUS);
+
+       /*
+        * Vector sent to a given vcpu, only valid when
+        * the vcpu's bit in map is set
+        */
+       u8 vectors[KVM_MAX_VCPUS];
+};
+
+
 struct rtc_status {
        int pending_eoi;
-       DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS);
+       struct dest_map dest_map;
 };
 
 union kvm_ioapic_redirect_entry {
@@ -118,7 +130,8 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
                       int level, bool line_status);
 void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
 int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
-               struct kvm_lapic_irq *irq, unsigned long *dest_map);
+                            struct kvm_lapic_irq *irq,
+                            struct dest_map *dest_map);
 int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
 int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
 void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
index 3982b47..95fcc7b 100644 (file)
  */
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
-       return apic_has_pending_timer(vcpu);
+       if (lapic_in_kernel(vcpu))
+               return apic_has_pending_timer(vcpu);
+
+       return 0;
 }
 EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
 
@@ -137,8 +140,8 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
-       kvm_inject_apic_timer_irqs(vcpu);
-       /* TODO: PIT, RTC etc. */
+       if (lapic_in_kernel(vcpu))
+               kvm_inject_apic_timer_irqs(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
 
index ae5c78f..61ebdc1 100644 (file)
@@ -109,14 +109,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
        return ret;
 }
 
-static inline int lapic_in_kernel(struct kvm_vcpu *vcpu)
-{
-       /* Same as irqchip_in_kernel(vcpu->kvm), but with less
-        * pointer chasing and no unnecessary memory barriers.
-        */
-       return vcpu->arch.apic != NULL;
-}
-
 void kvm_pic_reset(struct kvm_kpic_state *s);
 
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
index 8fc89ef..54ead79 100644 (file)
@@ -34,6 +34,7 @@
 #include "lapic.h"
 
 #include "hyperv.h"
+#include "x86.h"
 
 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
                           struct kvm *kvm, int irq_source_id, int level,
@@ -53,10 +54,12 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
 }
 
 int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
-               struct kvm_lapic_irq *irq, unsigned long *dest_map)
+               struct kvm_lapic_irq *irq, struct dest_map *dest_map)
 {
        int i, r = -1;
        struct kvm_vcpu *vcpu, *lowest = NULL;
+       unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+       unsigned int dest_vcpus = 0;
 
        if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
                        kvm_lowest_prio_delivery(irq)) {
@@ -67,6 +70,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
        if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
                return r;
 
+       memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
+
        kvm_for_each_vcpu(i, vcpu, kvm) {
                if (!kvm_apic_present(vcpu))
                        continue;
@@ -80,13 +85,25 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
                                r = 0;
                        r += kvm_apic_set_irq(vcpu, irq, dest_map);
                } else if (kvm_lapic_enabled(vcpu)) {
-                       if (!lowest)
-                               lowest = vcpu;
-                       else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
-                               lowest = vcpu;
+                       if (!kvm_vector_hashing_enabled()) {
+                               if (!lowest)
+                                       lowest = vcpu;
+                               else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+                                       lowest = vcpu;
+                       } else {
+                               __set_bit(i, dest_vcpu_bitmap);
+                               dest_vcpus++;
+                       }
                }
        }
 
+       if (dest_vcpus != 0) {
+               int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+                                       dest_vcpu_bitmap, KVM_MAX_VCPUS);
+
+               lowest = kvm_get_vcpu(kvm, idx);
+       }
+
        if (lowest)
                r = kvm_apic_set_irq(lowest, irq, dest_map);
 
index 36591fa..d9ae1ce 100644 (file)
@@ -281,7 +281,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
        struct kvm_cpuid_entry2 *feat;
        u32 v = APIC_VERSION;
 
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return;
 
        feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
@@ -475,26 +475,20 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
 
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 {
-       int highest_irr;
-
        /* This may race with setting of irr in __apic_accept_irq() and
         * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
         * will cause vmexit immediately and the value will be recalculated
         * on the next vmentry.
         */
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return 0;
-       highest_irr = apic_find_highest_irr(vcpu->arch.apic);
-
-       return highest_irr;
+       return apic_find_highest_irr(vcpu->arch.apic);
 }
 
 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                             int vector, int level, int trig_mode,
-                            unsigned long *dest_map);
+                            struct dest_map *dest_map);
 
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
-               unsigned long *dest_map)
+                    struct dest_map *dest_map)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
 
@@ -675,8 +669,33 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
        }
 }
 
+int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+                      const unsigned long *bitmap, u32 bitmap_size)
+{
+       u32 mod;
+       int i, idx = -1;
+
+       mod = vector % dest_vcpus;
+
+       for (i = 0; i <= mod; i++) {
+               idx = find_next_bit(bitmap, bitmap_size, idx + 1);
+               BUG_ON(idx == bitmap_size);
+       }
+
+       return idx;
+}
+
+static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
+{
+       if (!kvm->arch.disabled_lapic_found) {
+               kvm->arch.disabled_lapic_found = true;
+               printk(KERN_INFO
+                      "Disabled LAPIC found during irq injection\n");
+       }
+}
+
 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
-               struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map)
+               struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
 {
        struct kvm_apic_map *map;
        unsigned long bitmap = 1;
@@ -727,21 +746,42 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 
                dst = map->logical_map[cid];
 
-               if (kvm_lowest_prio_delivery(irq)) {
+               if (!kvm_lowest_prio_delivery(irq))
+                       goto set_irq;
+
+               if (!kvm_vector_hashing_enabled()) {
                        int l = -1;
                        for_each_set_bit(i, &bitmap, 16) {
                                if (!dst[i])
                                        continue;
                                if (l < 0)
                                        l = i;
-                               else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
+                               else if (kvm_apic_compare_prio(dst[i]->vcpu,
+                                                       dst[l]->vcpu) < 0)
                                        l = i;
                        }
-
                        bitmap = (l >= 0) ? 1 << l : 0;
+               } else {
+                       int idx;
+                       unsigned int dest_vcpus;
+
+                       dest_vcpus = hweight16(bitmap);
+                       if (dest_vcpus == 0)
+                               goto out;
+
+                       idx = kvm_vector_to_index(irq->vector,
+                               dest_vcpus, &bitmap, 16);
+
+                       if (!dst[idx]) {
+                               kvm_apic_disabled_lapic_found(kvm);
+                               goto out;
+                       }
+
+                       bitmap = (idx >= 0) ? 1 << idx : 0;
                }
        }
 
+set_irq:
        for_each_set_bit(i, &bitmap, 16) {
                if (!dst[i])
                        continue;
@@ -754,6 +794,20 @@ out:
        return ret;
 }
 
+/*
+ * This routine tries to handle interrupts in posted mode; here is how
+ * it deals with different cases:
+ * - For single-destination interrupts, handle them in posted mode
+ * - Else if vector hashing is enabled and it is a lowest-priority
+ *   interrupt, handle it in posted mode and use the following mechanism
+ *   to find the destination vCPU.
+ *     1. For lowest-priority interrupts, store all the possible
+ *        destination vCPUs in an array.
+ *     2. Use "guest vector % max number of destination vCPUs" to find
+ *        the right destination vCPU in the array for the lowest-priority
+ *        interrupt.
+ * - Otherwise, use remapped mode to inject the interrupt.
+ */
 bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
                        struct kvm_vcpu **dest_vcpu)
 {
@@ -795,16 +849,37 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
                if (cid >= ARRAY_SIZE(map->logical_map))
                        goto out;
 
-               for_each_set_bit(i, &bitmap, 16) {
-                       dst = map->logical_map[cid][i];
-                       if (++r == 2)
+               if (kvm_vector_hashing_enabled() &&
+                               kvm_lowest_prio_delivery(irq)) {
+                       int idx;
+                       unsigned int dest_vcpus;
+
+                       dest_vcpus = hweight16(bitmap);
+                       if (dest_vcpus == 0)
                                goto out;
-               }
 
-               if (dst && kvm_apic_present(dst->vcpu))
+                       idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+                                                 &bitmap, 16);
+
+                       dst = map->logical_map[cid][idx];
+                       if (!dst) {
+                               kvm_apic_disabled_lapic_found(kvm);
+                               goto out;
+                       }
+
                        *dest_vcpu = dst->vcpu;
-               else
-                       goto out;
+               } else {
+                       for_each_set_bit(i, &bitmap, 16) {
+                               dst = map->logical_map[cid][i];
+                               if (++r == 2)
+                                       goto out;
+                       }
+
+                       if (dst && kvm_apic_present(dst->vcpu))
+                               *dest_vcpu = dst->vcpu;
+                       else
+                               goto out;
+               }
        }
 
        ret = true;
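
The routing policy spelled out in the comment above kvm_intr_is_single_vcpu_fast() reduces to three cases. A condensed sketch of the decision (the enum and helper are invented; the real code expresses the choice implicitly by whether a single destination vCPU can be resolved up front):

#include <stdio.h>
#include <stdbool.h>

enum irq_route { ROUTE_POSTED, ROUTE_POSTED_HASHED, ROUTE_REMAPPED };

/* Invented helper: returns how the interrupt would be injected. */
static enum irq_route choose_route(unsigned int n_dest, bool lowest_prio,
				   bool vector_hashing)
{
	if (n_dest == 1)
		return ROUTE_POSTED;        /* single destination */
	if (lowest_prio && vector_hashing)
		return ROUTE_POSTED_HASHED; /* hash picks one vCPU */
	return ROUTE_REMAPPED;              /* fall back to remapped mode */
}

int main(void)
{
	printf("%d %d %d\n",
	       choose_route(1, false, false),  /* 0: posted */
	       choose_route(4, true, true),    /* 1: posted + hashed */
	       choose_route(4, true, false));  /* 2: remapped */
	return 0;
}
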
@@ -819,7 +894,7 @@ out:
  */
 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                             int vector, int level, int trig_mode,
-                            unsigned long *dest_map)
+                            struct dest_map *dest_map)
 {
        int result = 0;
        struct kvm_vcpu *vcpu = apic->vcpu;
@@ -839,8 +914,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 
                result = 1;
 
-               if (dest_map)
-                       __set_bit(vcpu->vcpu_id, dest_map);
+               if (dest_map) {
+                       __set_bit(vcpu->vcpu_id, dest_map->map);
+                       dest_map->vectors[vcpu->vcpu_id] = vector;
+               }
 
                if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
                        if (trig_mode)
@@ -1239,7 +1316,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
        struct kvm_lapic *apic = vcpu->arch.apic;
        u64 guest_tsc, tsc_deadline;
 
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return;
 
        if (apic->lapic_timer.expired_tscdeadline == 0)
@@ -1515,8 +1592,7 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
 
 void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
 {
-       if (kvm_vcpu_has_lapic(vcpu))
-               apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
+       apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
 
@@ -1566,7 +1642,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
 
-       if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+       if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
                        apic_lvtt_period(apic))
                return 0;
 
@@ -1577,7 +1653,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
 
-       if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+       if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
                        apic_lvtt_period(apic))
                return;
 
@@ -1590,9 +1666,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
 
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return;
-
        apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
                     | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4));
 }
@@ -1601,9 +1674,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
 {
        u64 tpr;
 
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return 0;
-
        tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
 
        return (tpr & 0xf0) >> 4;
@@ -1728,8 +1798,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
 
-       if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) &&
-                       apic_lvt_enabled(apic, APIC_LVTT))
+       if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT))
                return atomic_read(&apic->lapic_timer.pending);
 
        return 0;
@@ -1826,7 +1895,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
        struct kvm_lapic *apic = vcpu->arch.apic;
        int highest_irr;
 
-       if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic))
+       if (!apic_enabled(apic))
                return -1;
 
        apic_update_ppr(apic);
@@ -1854,9 +1923,6 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
 
-       if (!kvm_vcpu_has_lapic(vcpu))
-               return;
-
        if (atomic_read(&apic->lapic_timer.pending) > 0) {
                kvm_apic_local_deliver(apic, APIC_LVTT);
                if (apic_lvtt_tscdeadline(apic))
@@ -1932,7 +1998,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 {
        struct hrtimer *timer;
 
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return;
 
        timer = &vcpu->arch.apic->lapic_timer.timer;
@@ -2105,7 +2171,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
 
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return 1;
 
        /* if this is ICR write vector before command */
@@ -2119,7 +2185,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
        struct kvm_lapic *apic = vcpu->arch.apic;
        u32 low, high = 0;
 
-       if (!kvm_vcpu_has_lapic(vcpu))
+       if (!lapic_in_kernel(vcpu))
                return 1;
 
        if (apic_reg_read(apic, reg, 4, &low))
@@ -2151,7 +2217,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
        u8 sipi_vector;
        unsigned long pe;
 
-       if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events)
+       if (!lapic_in_kernel(vcpu) || !apic->pending_events)
                return;
 
        /*
index 41bdb35..f71183e 100644 (file)
@@ -42,6 +42,9 @@ struct kvm_lapic {
        unsigned long pending_events;
        unsigned int sipi_vector;
 };
+
+struct dest_map;
+
 int kvm_create_lapic(struct kvm_vcpu *vcpu);
 void kvm_free_lapic(struct kvm_vcpu *vcpu);
 
@@ -60,11 +63,11 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 void __kvm_apic_update_irr(u32 *pir, void *regs);
 void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
-               unsigned long *dest_map);
+                    struct dest_map *dest_map);
 int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
 
 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
-               struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map);
+               struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map);
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
@@ -103,7 +106,7 @@ static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off)
 
 extern struct static_key kvm_no_apic_vcpu;
 
-static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu)
+static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
 {
        if (static_key_false(&kvm_no_apic_vcpu))
                return vcpu->arch.apic;
@@ -130,7 +133,7 @@ static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic)
 
 static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
 {
-       return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
+       return lapic_in_kernel(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
 }
 
 static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
@@ -150,7 +153,7 @@ static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu)
 
 static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
 {
-       return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events;
+       return lapic_in_kernel(vcpu) && vcpu->arch.apic->pending_events;
 }
 
 static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
@@ -161,7 +164,7 @@ static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
 
 static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
 {
-       return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+       return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
 }
 
 static inline int kvm_apic_id(struct kvm_lapic *apic)
@@ -175,4 +178,6 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu);
 
 bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
                        struct kvm_vcpu **dest_vcpu);
+int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+                       const unsigned long *bitmap, u32 bitmap_size);
 #endif
index 95a955d..2463de0 100644 (file)
@@ -41,6 +41,7 @@
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
 #include <asm/vmx.h>
+#include <asm/kvm_page_track.h>
 
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
@@ -776,62 +777,85 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
        return &slot->arch.lpage_info[level - 2][idx];
 }
 
+static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
+                                           gfn_t gfn, int count)
+{
+       struct kvm_lpage_info *linfo;
+       int i;
+
+       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+               linfo = lpage_info_slot(gfn, slot, i);
+               linfo->disallow_lpage += count;
+               WARN_ON(linfo->disallow_lpage < 0);
+       }
+}
+
+void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+       update_gfn_disallow_lpage_count(slot, gfn, 1);
+}
+
+void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+       update_gfn_disallow_lpage_count(slot, gfn, -1);
+}
+
 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *slot;
-       struct kvm_lpage_info *linfo;
        gfn_t gfn;
-       int i;
 
+       kvm->arch.indirect_shadow_pages++;
        gfn = sp->gfn;
        slots = kvm_memslots_for_spte_role(kvm, sp->role);
        slot = __gfn_to_memslot(slots, gfn);
-       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
-               linfo = lpage_info_slot(gfn, slot, i);
-               linfo->write_count += 1;
-       }
-       kvm->arch.indirect_shadow_pages++;
+
+       /* the non-leaf shadow pages are kept read-only. */
+       if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+               return kvm_slot_page_track_add_page(kvm, slot, gfn,
+                                                   KVM_PAGE_TRACK_WRITE);
+
+       kvm_mmu_gfn_disallow_lpage(slot, gfn);
 }
 
 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *slot;
-       struct kvm_lpage_info *linfo;
        gfn_t gfn;
-       int i;
 
+       kvm->arch.indirect_shadow_pages--;
        gfn = sp->gfn;
        slots = kvm_memslots_for_spte_role(kvm, sp->role);
        slot = __gfn_to_memslot(slots, gfn);
-       for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
-               linfo = lpage_info_slot(gfn, slot, i);
-               linfo->write_count -= 1;
-               WARN_ON(linfo->write_count < 0);
-       }
-       kvm->arch.indirect_shadow_pages--;
+       if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+               return kvm_slot_page_track_remove_page(kvm, slot, gfn,
+                                                      KVM_PAGE_TRACK_WRITE);
+
+       kvm_mmu_gfn_allow_lpage(slot, gfn);
 }
 
-static int __has_wrprotected_page(gfn_t gfn, int level,
-                                 struct kvm_memory_slot *slot)
+static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
+                                         struct kvm_memory_slot *slot)
 {
        struct kvm_lpage_info *linfo;
 
        if (slot) {
                linfo = lpage_info_slot(gfn, slot, level);
-               return linfo->write_count;
+               return !!linfo->disallow_lpage;
        }
 
-       return 1;
+       return true;
 }
 
-static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
+                                       int level)
 {
        struct kvm_memory_slot *slot;
 
        slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
-       return __has_wrprotected_page(gfn, level, slot);
+       return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
 }
 
 static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
@@ -897,7 +921,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
        max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
 
        for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
-               if (__has_wrprotected_page(large_gfn, level, slot))
+               if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
                        break;
 
        return level - 1;
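
The write_count -> disallow_lpage rename turns the field into a generic count of reasons a large page must not be used at this gfn and level: shadowing and (via the new page-track API) write tracking each take a reference. A toy model of the +1/-1 accounting (sizes and names are illustrative):

#include <stdio.h>
#include <assert.h>

#define LEVELS 2   /* e.g. 2M and 1G, one counter per large-page level */

static int disallow_lpage[LEVELS];

static void update_disallow_lpage(int count)
{
	for (int i = 0; i < LEVELS; i++) {
		disallow_lpage[i] += count;
		assert(disallow_lpage[i] >= 0); /* mirrors the WARN_ON */
	}
}

int main(void)
{
	update_disallow_lpage(1);  /* a shadow page appears at this gfn */
	update_disallow_lpage(1);  /* the gfn also becomes write-tracked */
	update_disallow_lpage(-1); /* the shadow page goes away */
	printf("large page allowed: %s\n",
	       disallow_lpage[0] ? "no" : "yes"); /* one holder left: no */
	return 0;
}
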
@@ -1323,23 +1347,29 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
                kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
-static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+                                   struct kvm_memory_slot *slot, u64 gfn)
 {
-       struct kvm_memory_slot *slot;
        struct kvm_rmap_head *rmap_head;
        int i;
        bool write_protected = false;
 
-       slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
-
        for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
                rmap_head = __gfn_to_rmap(gfn, i, slot);
-               write_protected |= __rmap_write_protect(vcpu->kvm, rmap_head, true);
+               write_protected |= __rmap_write_protect(kvm, rmap_head, true);
        }
 
        return write_protected;
 }
 
+static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+{
+       struct kvm_memory_slot *slot;
+
+       slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+       return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
+}
+
 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
 {
        u64 *sptep;
@@ -1754,7 +1784,7 @@ static void mark_unsync(u64 *spte)
 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
                               struct kvm_mmu_page *sp)
 {
-       return 1;
+       return 0;
 }
 
 static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
@@ -1840,13 +1870,16 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
        return nr_unsync_leaf;
 }
 
+#define INVALID_INDEX (-1)
+
 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
                           struct kvm_mmu_pages *pvec)
 {
+       pvec->nr = 0;
        if (!sp->unsync_children)
                return 0;
 
-       mmu_pages_add(pvec, sp, 0);
+       mmu_pages_add(pvec, sp, INVALID_INDEX);
        return __mmu_unsync_walk(sp, pvec);
 }
 
@@ -1883,37 +1916,35 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                if ((_sp)->role.direct || (_sp)->role.invalid) {} else
 
 /* @sp->gfn should be write-protected at the call site */
-static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-                          struct list_head *invalid_list, bool clear_unsync)
+static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+                           struct list_head *invalid_list)
 {
        if (sp->role.cr4_pae != !!is_pae(vcpu)) {
                kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
-               return 1;
+               return false;
        }
 
-       if (clear_unsync)
-               kvm_unlink_unsync_page(vcpu->kvm, sp);
-
-       if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+       if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
                kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
-               return 1;
+               return false;
        }
 
-       kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-       return 0;
+       return true;
 }
 
-static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
-                                  struct kvm_mmu_page *sp)
+static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
+                                struct list_head *invalid_list,
+                                bool remote_flush, bool local_flush)
 {
-       LIST_HEAD(invalid_list);
-       int ret;
-
-       ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
-       if (ret)
-               kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+       if (!list_empty(invalid_list)) {
+               kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
+               return;
+       }
 
-       return ret;
+       if (remote_flush)
+               kvm_flush_remote_tlbs(vcpu->kvm);
+       else if (local_flush)
+               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 }
 
 #ifdef CONFIG_KVM_MMU_AUDIT
@@ -1923,46 +1954,38 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
 static void mmu_audit_disable(void) { }
 #endif
 
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                         struct list_head *invalid_list)
 {
-       return __kvm_sync_page(vcpu, sp, invalid_list, true);
+       kvm_unlink_unsync_page(vcpu->kvm, sp);
+       return __kvm_sync_page(vcpu, sp, invalid_list);
 }
 
 /* @gfn should be write-protected at the call site */
-static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
+static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
+                          struct list_head *invalid_list)
 {
        struct kvm_mmu_page *s;
-       LIST_HEAD(invalid_list);
-       bool flush = false;
+       bool ret = false;
 
        for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
                if (!s->unsync)
                        continue;
 
                WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
-               kvm_unlink_unsync_page(vcpu->kvm, s);
-               if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
-                       (vcpu->arch.mmu.sync_page(vcpu, s))) {
-                       kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
-                       continue;
-               }
-               flush = true;
+               ret |= kvm_sync_page(vcpu, s, invalid_list);
        }
 
-       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-       if (flush)
-               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+       return ret;
 }
 
 struct mmu_page_path {
-       struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
-       unsigned int idx[PT64_ROOT_LEVEL-1];
+       struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
+       unsigned int idx[PT64_ROOT_LEVEL];
 };
 
 #define for_each_sp(pvec, sp, parents, i)                      \
-               for (i = mmu_pages_next(&pvec, &parents, -1),   \
-                       sp = pvec.page[i].sp;                   \
+               for (i = mmu_pages_first(&pvec, &parents);      \
                        i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
                        i = mmu_pages_next(&pvec, &parents, i))
 
@@ -1974,19 +1997,43 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec,
 
        for (n = i+1; n < pvec->nr; n++) {
                struct kvm_mmu_page *sp = pvec->page[n].sp;
+               unsigned idx = pvec->page[n].idx;
+               int level = sp->role.level;
 
-               if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
-                       parents->idx[0] = pvec->page[n].idx;
-                       return n;
-               }
+               parents->idx[level-1] = idx;
+               if (level == PT_PAGE_TABLE_LEVEL)
+                       break;
 
-               parents->parent[sp->role.level-2] = sp;
-               parents->idx[sp->role.level-1] = pvec->page[n].idx;
+               parents->parent[level-2] = sp;
        }
 
        return n;
 }
 
+static int mmu_pages_first(struct kvm_mmu_pages *pvec,
+                          struct mmu_page_path *parents)
+{
+       struct kvm_mmu_page *sp;
+       int level;
+
+       if (pvec->nr == 0)
+               return 0;
+
+       WARN_ON(pvec->page[0].idx != INVALID_INDEX);
+
+       sp = pvec->page[0].sp;
+       level = sp->role.level;
+       WARN_ON(level == PT_PAGE_TABLE_LEVEL);
+
+       parents->parent[level-2] = sp;
+
+       /* Also set up a sentinel.  Further entries in pvec are all
+        * children of sp, so this element is never overwritten.
+        */
+       parents->parent[level-1] = NULL;
+       return mmu_pages_next(pvec, parents, 0);
+}
+
 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
 {
        struct kvm_mmu_page *sp;
@@ -1994,22 +2041,14 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
 
        do {
                unsigned int idx = parents->idx[level];
-
                sp = parents->parent[level];
                if (!sp)
                        return;
 
+               WARN_ON(idx == INVALID_INDEX);
                clear_unsync_child_bit(sp, idx);
                level++;
-       } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
-}
-
-static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
-                              struct mmu_page_path *parents,
-                              struct kvm_mmu_pages *pvec)
-{
-       parents->parent[parent->role.level-1] = NULL;
-       pvec->nr = 0;
+       } while (!sp->unsync_children);
 }
 
 static void mmu_sync_children(struct kvm_vcpu *vcpu,
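
mmu_pages_first() stores the walk's root at parents->parent[level-2] and a NULL sentinel at parents->parent[level-1], so mmu_pages_clear_parents() climbs until it hits the sentinel rather than testing a level bound. The sentinel idea in miniature (plain strings stand in for shadow pages):

#include <stdio.h>

int main(void)
{
	/* parent path from the lowest level upward, terminated by a
	 * NULL sentinel just as mmu_pages_first() plants one */
	const char *parent[] = { "level-1 parent", "level-2 parent",
				 "root", NULL };

	for (int level = 0; parent[level] != NULL; level++)
		printf("clear unsync child bit in %s\n", parent[level]);
	return 0;
}
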
@@ -2020,30 +2059,36 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
        struct mmu_page_path parents;
        struct kvm_mmu_pages pages;
        LIST_HEAD(invalid_list);
+       bool flush = false;
 
-       kvm_mmu_pages_init(parent, &parents, &pages);
        while (mmu_unsync_walk(parent, &pages)) {
                bool protected = false;
 
                for_each_sp(pages, sp, parents, i)
                        protected |= rmap_write_protect(vcpu, sp->gfn);
 
-               if (protected)
+               if (protected) {
                        kvm_flush_remote_tlbs(vcpu->kvm);
+                       flush = false;
+               }
 
                for_each_sp(pages, sp, parents, i) {
-                       kvm_sync_page(vcpu, sp, &invalid_list);
+                       flush |= kvm_sync_page(vcpu, sp, &invalid_list);
                        mmu_pages_clear_parents(&parents);
                }
-               kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-               cond_resched_lock(&vcpu->kvm->mmu_lock);
-               kvm_mmu_pages_init(parent, &parents, &pages);
+               if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
+                       kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+                       cond_resched_lock(&vcpu->kvm->mmu_lock);
+                       flush = false;
+               }
        }
+
+       kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
 }
 
 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
 {
-       sp->write_flooding_count = 0;
+       atomic_set(&sp->write_flooding_count, 0);
 }
 
 static void clear_sp_write_flooding_count(u64 *spte)
@@ -2069,6 +2114,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
        unsigned quadrant;
        struct kvm_mmu_page *sp;
        bool need_sync = false;
+       bool flush = false;
+       LIST_HEAD(invalid_list);
 
        role = vcpu->arch.mmu.base_role;
        role.level = level;
@@ -2092,8 +2139,16 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                if (sp->role.word != role.word)
                        continue;
 
-               if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
-                       break;
+               if (sp->unsync) {
+                       /* The page is good, but __kvm_sync_page might still end
+                        * up zapping it.  If so, break in order to rebuild it.
+                        */
+                       if (!__kvm_sync_page(vcpu, sp, &invalid_list))
+                               break;
+
+                       WARN_ON(!list_empty(&invalid_list));
+                       kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+               }
 
                if (sp->unsync_children)
                        kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
@@ -2112,16 +2167,24 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
        hlist_add_head(&sp->hash_link,
                &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
        if (!direct) {
-               if (rmap_write_protect(vcpu, gfn))
+               /*
+                * We should write-protect the gfn before syncing the
+                * pages; otherwise the content of the synced shadow page
+                * may be inconsistent with the guest page table.
+                */
+               account_shadowed(vcpu->kvm, sp);
+               if (level == PT_PAGE_TABLE_LEVEL &&
+                     rmap_write_protect(vcpu, gfn))
                        kvm_flush_remote_tlbs(vcpu->kvm);
-               if (level > PT_PAGE_TABLE_LEVEL && need_sync)
-                       kvm_sync_pages(vcpu, gfn);
 
-               account_shadowed(vcpu->kvm, sp);
+               if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+                       flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
        }
        sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
        clear_page(sp->spt);
        trace_kvm_mmu_get_page(sp, true);
+
+       kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
        return sp;
 }
 
@@ -2269,7 +2332,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
        if (parent->role.level == PT_PAGE_TABLE_LEVEL)
                return 0;
 
-       kvm_mmu_pages_init(parent, &parents, &pages);
        while (mmu_unsync_walk(parent, &pages)) {
                struct kvm_mmu_page *sp;
 
@@ -2278,7 +2340,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
                        mmu_pages_clear_parents(&parents);
                        zapped++;
                }
-               kvm_mmu_pages_init(parent, &parents, &pages);
        }
 
        return zapped;
@@ -2354,8 +2415,8 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
        if (list_empty(&kvm->arch.active_mmu_pages))
                return false;
 
-       sp = list_entry(kvm->arch.active_mmu_pages.prev,
-                       struct kvm_mmu_page, link);
+       sp = list_last_entry(&kvm->arch.active_mmu_pages,
+                            struct kvm_mmu_page, link);
        kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
 
        return true;
@@ -2408,7 +2469,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
 
-static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
        trace_kvm_mmu_unsync_page(sp);
        ++vcpu->kvm->stat.mmu_unsync;
@@ -2417,37 +2478,26 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
        kvm_mmu_mark_parents_unsync(sp);
 }
 
-static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
+static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+                                  bool can_unsync)
 {
-       struct kvm_mmu_page *s;
-
-       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
-               if (s->unsync)
-                       continue;
-               WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
-               __kvm_unsync_page(vcpu, s);
-       }
-}
+       struct kvm_mmu_page *sp;
 
-static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
-                                 bool can_unsync)
-{
-       struct kvm_mmu_page *s;
-       bool need_unsync = false;
+       if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+               return true;
 
-       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
+       for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
                if (!can_unsync)
-                       return 1;
+                       return true;
 
-               if (s->role.level != PT_PAGE_TABLE_LEVEL)
-                       return 1;
+               if (sp->unsync)
+                       continue;
 
-               if (!s->unsync)
-                       need_unsync = true;
+               WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+               kvm_unsync_page(vcpu, sp);
        }
-       if (need_unsync)
-               kvm_unsync_pages(vcpu, gfn);
-       return 0;
+
+       return false;
 }
 
 static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
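
The rewritten mmu_need_write_protect() above folds the old two-pass logic
(scan, then kvm_unsync_pages) into a single walk. A summary of the decision
it is assumed to implement, for reference only:

    /*
     * gfn is write-tracked ............ keep write protection (return true)
     * !can_unsync, any shadow page .... keep write protection (return true)
     * otherwise ....................... mark every synced last-level shadow
     *                                   page unsync and let the guest write
     *                                   proceed (return false)
     */
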
@@ -2503,7 +2553,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                 * be fixed if guest refault.
                 */
                if (level > PT_PAGE_TABLE_LEVEL &&
-                   has_wrprotected_page(vcpu, gfn, level))
+                   mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
                        goto done;
 
                spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
@@ -2768,7 +2818,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
        if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
            level == PT_PAGE_TABLE_LEVEL &&
            PageTransCompound(pfn_to_page(pfn)) &&
-           !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
+           !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
                unsigned long mask;
                /*
                 * mmu_notifier_retry was successful and we hold the
@@ -2796,20 +2846,16 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
                                kvm_pfn_t pfn, unsigned access, int *ret_val)
 {
-       bool ret = true;
-
        /* The pfn is invalid, report the error! */
        if (unlikely(is_error_pfn(pfn))) {
                *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
-               goto exit;
+               return true;
        }
 
        if (unlikely(is_noslot_pfn(pfn)))
                vcpu_cache_mmio_info(vcpu, gva, gfn, access);
 
-       ret = false;
-exit:
-       return ret;
+       return false;
 }
 
 static bool page_fault_can_be_fast(u32 error_code)
@@ -3273,7 +3319,7 @@ static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
        return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
 }
 
-static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 {
        if (direct)
                return vcpu_match_mmio_gpa(vcpu, addr);
@@ -3332,7 +3378,7 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
        u64 spte;
        bool reserved;
 
-       if (quickly_check_mmio_pf(vcpu, addr, direct))
+       if (mmio_info_in_cache(vcpu, addr, direct))
                return RET_MMIO_PF_EMULATE;
 
        reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
@@ -3362,20 +3408,53 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 }
 EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
 
+static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
+                                        u32 error_code, gfn_t gfn)
+{
+       if (unlikely(error_code & PFERR_RSVD_MASK))
+               return false;
+
+       if (!(error_code & PFERR_PRESENT_MASK) ||
+             !(error_code & PFERR_WRITE_MASK))
+               return false;
+
+       /*
+        * The guest is writing a page which is write-tracked, which cannot
+        * be fixed by the page fault handler.
+        */
+       if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+               return true;
+
+       return false;
+}
+
+static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
+{
+       struct kvm_shadow_walk_iterator iterator;
+       u64 spte;
+
+       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+               return;
+
+       walk_shadow_page_lockless_begin(vcpu);
+       for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+               clear_sp_write_flooding_count(iterator.sptep);
+               if (!is_shadow_present_pte(spte))
+                       break;
+       }
+       walk_shadow_page_lockless_end(vcpu);
+}
+
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
                                u32 error_code, bool prefault)
 {
-       gfn_t gfn;
+       gfn_t gfn = gva >> PAGE_SHIFT;
        int r;
 
        pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
 
-       if (unlikely(error_code & PFERR_RSVD_MASK)) {
-               r = handle_mmio_page_fault(vcpu, gva, true);
-
-               if (likely(r != RET_MMIO_PF_INVALID))
-                       return r;
-       }
+       if (page_fault_handle_page_track(vcpu, error_code, gfn))
+               return 1;
 
        r = mmu_topup_memory_caches(vcpu);
        if (r)
@@ -3383,7 +3462,6 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 
        MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
-       gfn = gva >> PAGE_SHIFT;
-
        return nonpaging_map(vcpu, gva & PAGE_MASK,
                             error_code, gfn, prefault);
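
page_fault_handle_page_track() keys off three PFERR bits before consulting
the tracking state. A standalone restatement of the predicate, assuming the
PFERR_* bit positions from arch/x86/include/asm/kvm_host.h:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PFERR_PRESENT_MASK (1u << 0)
    #define PFERR_WRITE_MASK   (1u << 1)
    #define PFERR_RSVD_MASK    (1u << 3)

    /* True only for a present, non-reserved write fault on a tracked gfn:
     * such a write cannot be fixed up and must be emulated instead.
     */
    static bool tracked_write_fault(uint32_t error_code, bool write_tracked)
    {
            if (error_code & PFERR_RSVD_MASK)       /* MMIO path handles it */
                    return false;
            if (!(error_code & PFERR_PRESENT_MASK) ||
                !(error_code & PFERR_WRITE_MASK))
                    return false;
            return write_tracked;
    }

    int main(void)
    {
            uint32_t w = PFERR_PRESENT_MASK | PFERR_WRITE_MASK;

            printf("%d %d\n", tracked_write_fault(w, true),    /* 1 */
                   tracked_write_fault(w, false));             /* 0 */
            return 0;
    }
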
@@ -3460,12 +3538,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 
        MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
-       if (unlikely(error_code & PFERR_RSVD_MASK)) {
-               r = handle_mmio_page_fault(vcpu, gpa, true);
-
-               if (likely(r != RET_MMIO_PF_INVALID))
-                       return r;
-       }
+       if (page_fault_handle_page_track(vcpu, error_code, gfn))
+               return 1;
 
        r = mmu_topup_memory_caches(vcpu);
        if (r)
@@ -3558,13 +3632,24 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
        return false;
 }
 
-static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
+static inline bool is_last_gpte(struct kvm_mmu *mmu,
+                               unsigned level, unsigned gpte)
 {
-       unsigned index;
+       /*
+        * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
+        * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+        * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+        */
+       gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
 
-       index = level - 1;
-       index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2);
-       return mmu->last_pte_bitmap & (1 << index);
+       /*
+        * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
+        * If it is clear, there are no large pages at this level, so clear
+        * PT_PAGE_SIZE_MASK in gpte if that is the case.
+        */
+       gpte &= level - mmu->last_nonleaf_level;
+
+       return gpte & PT_PAGE_SIZE_MASK;
 }
 
 #define PTTYPE_EPT 18 /* arbitrary */
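
The branchless is_last_gpte() above replaces the old last_pte_bitmap lookup.
A small userspace demo of the wraparound arithmetic, assuming
PT_PAGE_TABLE_LEVEL == 1 and PT_PAGE_SIZE_MASK == (1 << 7) as in the x86
paging headers:

    #include <stdio.h>

    #define PT_PAGE_TABLE_LEVEL 1
    #define PT_PAGE_SIZE_MASK   (1u << 7)

    static unsigned last_gpte(unsigned last_nonleaf_level, unsigned level,
                              unsigned gpte)
    {
            gpte |= level - PT_PAGE_TABLE_LEVEL - 1; /* wraps: forces PS at level 1 */
            gpte &= level - last_nonleaf_level;      /* clears PS at non-huge levels */
            return gpte & PT_PAGE_SIZE_MASK;
    }

    int main(void)
    {
            /* 64-bit paging: last_nonleaf_level == root_level == 4 */
            printf("%d\n", !!last_gpte(4, 1, 0));       /* 1: a PTE always terminates */
            printf("%d\n", !!last_gpte(4, 2, 1u << 7)); /* 1: PDE with PS set */
            printf("%d\n", !!last_gpte(4, 2, 0));       /* 0: PDE without PS */
            return 0;
    }
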
@@ -3836,22 +3921,13 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
        }
 }
 
-static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
 {
-       u8 map;
-       unsigned level, root_level = mmu->root_level;
-       const unsigned ps_set_index = 1 << 2;  /* bit 2 of index: ps */
-
-       if (root_level == PT32E_ROOT_LEVEL)
-               --root_level;
-       /* PT_PAGE_TABLE_LEVEL always terminates */
-       map = 1 | (1 << ps_set_index);
-       for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) {
-               if (level <= PT_PDPE_LEVEL
-                   && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu)))
-                       map |= 1 << (ps_set_index | (level - 1));
-       }
-       mmu->last_pte_bitmap = map;
+       unsigned root_level = mmu->root_level;
+
+       mmu->last_nonleaf_level = root_level;
+       if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
+               mmu->last_nonleaf_level++;
 }
 
 static void paging64_init_context_common(struct kvm_vcpu *vcpu,
@@ -3863,7 +3939,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
 
        reset_rsvds_bits_mask(vcpu, context);
        update_permission_bitmask(vcpu, context, false);
-       update_last_pte_bitmap(vcpu, context);
+       update_last_nonleaf_level(vcpu, context);
 
        MMU_WARN_ON(!is_pae(vcpu));
        context->page_fault = paging64_page_fault;
@@ -3890,7 +3966,7 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
 
        reset_rsvds_bits_mask(vcpu, context);
        update_permission_bitmask(vcpu, context, false);
-       update_last_pte_bitmap(vcpu, context);
+       update_last_nonleaf_level(vcpu, context);
 
        context->page_fault = paging32_page_fault;
        context->gva_to_gpa = paging32_gva_to_gpa;
@@ -3948,7 +4024,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
        }
 
        update_permission_bitmask(vcpu, context, false);
-       update_last_pte_bitmap(vcpu, context);
+       update_last_nonleaf_level(vcpu, context);
        reset_tdp_shadow_zero_bits_mask(vcpu, context);
 }
 
@@ -4054,7 +4130,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
        }
 
        update_permission_bitmask(vcpu, g_context, false);
-       update_last_pte_bitmap(vcpu, g_context);
+       update_last_nonleaf_level(vcpu, g_context);
 }
 
 static void init_kvm_mmu(struct kvm_vcpu *vcpu)
@@ -4125,18 +4201,6 @@ static bool need_remote_flush(u64 old, u64 new)
        return (old & ~new & PT64_PERM_MASK) != 0;
 }
 
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
-                                   bool remote_flush, bool local_flush)
-{
-       if (zap_page)
-               return;
-
-       if (remote_flush)
-               kvm_flush_remote_tlbs(vcpu->kvm);
-       else if (local_flush)
-               kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-}
-
 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
                                    const u8 *new, int *bytes)
 {
@@ -4186,7 +4250,8 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp)
        if (sp->role.level == PT_PAGE_TABLE_LEVEL)
                return false;
 
-       return ++sp->write_flooding_count >= 3;
+       atomic_inc(&sp->write_flooding_count);
+       return atomic_read(&sp->write_flooding_count) >= 3;
 }
 
 /*
@@ -4248,15 +4313,15 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
        return spte;
 }
 
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-                      const u8 *new, int bytes)
+static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+                             const u8 *new, int bytes)
 {
        gfn_t gfn = gpa >> PAGE_SHIFT;
        struct kvm_mmu_page *sp;
        LIST_HEAD(invalid_list);
        u64 entry, gentry, *spte;
        int npte;
-       bool remote_flush, local_flush, zap_page;
+       bool remote_flush, local_flush;
        union kvm_mmu_page_role mask = { };
 
        mask.cr0_wp = 1;
@@ -4273,7 +4338,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
                return;
 
-       zap_page = remote_flush = local_flush = false;
+       remote_flush = local_flush = false;
 
        pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
 
@@ -4293,8 +4358,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
                if (detect_write_misaligned(sp, gpa, bytes) ||
                      detect_write_flooding(sp)) {
-                       zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
-                                                    &invalid_list);
+                       kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
                        ++vcpu->kvm->stat.mmu_flooded;
                        continue;
                }
@@ -4316,8 +4380,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                        ++spte;
                }
        }
-       mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
-       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+       kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
        kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
        spin_unlock(&vcpu->kvm->mmu_lock);
 }
@@ -4354,32 +4417,34 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
        kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
-static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
-{
-       if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
-               return vcpu_match_mmio_gpa(vcpu, addr);
-
-       return vcpu_match_mmio_gva(vcpu, addr);
-}
-
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
                       void *insn, int insn_len)
 {
        int r, emulation_type = EMULTYPE_RETRY;
        enum emulation_result er;
+       bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu);
+
+       if (unlikely(error_code & PFERR_RSVD_MASK)) {
+               r = handle_mmio_page_fault(vcpu, cr2, direct);
+               if (r == RET_MMIO_PF_EMULATE) {
+                       emulation_type = 0;
+                       goto emulate;
+               }
+               if (r == RET_MMIO_PF_RETRY)
+                       return 1;
+               if (r < 0)
+                       return r;
+       }
 
        r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
        if (r < 0)
-               goto out;
-
-       if (!r) {
-               r = 1;
-               goto out;
-       }
+               return r;
+       if (!r)
+               return 1;
 
-       if (is_mmio_page_fault(vcpu, cr2))
+       if (mmio_info_in_cache(vcpu, cr2, direct))
                emulation_type = 0;
-
+emulate:
        er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
 
        switch (er) {
@@ -4393,8 +4458,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
        default:
                BUG();
        }
-out:
-       return r;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
@@ -4463,6 +4526,21 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
        init_kvm_mmu(vcpu);
 }
 
+void kvm_mmu_init_vm(struct kvm *kvm)
+{
+       struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+
+       node->track_write = kvm_mmu_pte_write;
+       kvm_page_track_register_notifier(kvm, node);
+}
+
+void kvm_mmu_uninit_vm(struct kvm *kvm)
+{
+       struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+
+       kvm_page_track_unregister_notifier(kvm, node);
+}
+
 /* The return value indicates if tlb flush on all vcpus is needed. */
 typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
 
index 55ffb7b..58fe98a 100644 (file)
@@ -174,4 +174,9 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 
 void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
+
+void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+                                   struct kvm_memory_slot *slot, u64 gfn);
 #endif
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
new file mode 100644 (file)
index 0000000..11f7643
--- /dev/null
@@ -0,0 +1,222 @@
+/*
+ * Support KVM guest page tracking
+ *
+ * This feature allows us to track page access in the guest. Currently,
+ * only write access is tracked.
+ *
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Author:
+ *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_page_track.h>
+
+#include "mmu.h"
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *free,
+                                struct kvm_memory_slot *dont)
+{
+       int i;
+
+       for (i = 0; i < KVM_PAGE_TRACK_MAX; i++)
+               if (!dont || free->arch.gfn_track[i] !=
+                     dont->arch.gfn_track[i]) {
+                       kvfree(free->arch.gfn_track[i]);
+                       free->arch.gfn_track[i] = NULL;
+               }
+}
+
+int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
+                                 unsigned long npages)
+{
+       int i;
+
+       for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
+               slot->arch.gfn_track[i] = kvm_kvzalloc(npages *
+                                           sizeof(*slot->arch.gfn_track[i]));
+               if (!slot->arch.gfn_track[i])
+                       goto track_free;
+       }
+
+       return 0;
+
+track_free:
+       kvm_page_track_free_memslot(slot, NULL);
+       return -ENOMEM;
+}
+
+static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode)
+{
+       if (mode < 0 || mode >= KVM_PAGE_TRACK_MAX)
+               return false;
+
+       return true;
+}
+
+static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn,
+                            enum kvm_page_track_mode mode, short count)
+{
+       int index, val;
+
+       index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+
+       val = slot->arch.gfn_track[mode][index];
+
+       if (WARN_ON(val + count < 0 || val + count > USHRT_MAX))
+               return;
+
+       slot->arch.gfn_track[mode][index] += count;
+}
+
+/*
+ * Add a guest page to the tracking pool so that the corresponding access on
+ * that page will be intercepted.
+ *
+ * It should be called under the protection of both the mmu lock and either
+ * kvm->srcu or kvm->slots_lock.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @slot: the memory slot the @gfn belongs to.
+ * @gfn: the guest page.
+ * @mode: tracking mode, currently only write tracking is supported.
+ */
+void kvm_slot_page_track_add_page(struct kvm *kvm,
+                                 struct kvm_memory_slot *slot, gfn_t gfn,
+                                 enum kvm_page_track_mode mode)
+{
+       if (WARN_ON(!page_track_mode_is_valid(mode)))
+               return;
+
+       update_gfn_track(slot, gfn, mode, 1);
+
+       /*
+        * A new tracker stops large page mappings for the
+        * tracked page.
+        */
+       kvm_mmu_gfn_disallow_lpage(slot, gfn);
+
+       if (mode == KVM_PAGE_TRACK_WRITE)
+               if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn))
+                       kvm_flush_remote_tlbs(kvm);
+}
+
+/*
+ * Remove the guest page from the tracking pool, which stops the interception
+ * of the corresponding access on that page. It is the opposite operation of
+ * kvm_slot_page_track_add_page().
+ *
+ * It should be called under the protection of both the mmu lock and either
+ * kvm->srcu or kvm->slots_lock.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @slot: the memory slot the @gfn belongs to.
+ * @gfn: the guest page.
+ * @mode: tracking mode, currently only write tracking is supported.
+ */
+void kvm_slot_page_track_remove_page(struct kvm *kvm,
+                                    struct kvm_memory_slot *slot, gfn_t gfn,
+                                    enum kvm_page_track_mode mode)
+{
+       if (WARN_ON(!page_track_mode_is_valid(mode)))
+               return;
+
+       update_gfn_track(slot, gfn, mode, -1);
+
+       /*
+        * allow large page mapping for the tracked page
+        * after the tracker is gone.
+        */
+       kvm_mmu_gfn_allow_lpage(slot, gfn);
+}
+
+/*
+ * Check whether the corresponding access on the specified guest page is tracked.
+ */
+bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
+                             enum kvm_page_track_mode mode)
+{
+       struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+       int index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+
+       if (WARN_ON(!page_track_mode_is_valid(mode)))
+               return false;
+
+       return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]);
+}
+
+void kvm_page_track_init(struct kvm *kvm)
+{
+       struct kvm_page_track_notifier_head *head;
+
+       head = &kvm->arch.track_notifier_head;
+       init_srcu_struct(&head->track_srcu);
+       INIT_HLIST_HEAD(&head->track_notifier_list);
+}
+
+/*
+ * Register the notifier so that event interceptions for the tracked guest
+ * pages can be received.
+ */
+void
+kvm_page_track_register_notifier(struct kvm *kvm,
+                                struct kvm_page_track_notifier_node *n)
+{
+       struct kvm_page_track_notifier_head *head;
+
+       head = &kvm->arch.track_notifier_head;
+
+       spin_lock(&kvm->mmu_lock);
+       hlist_add_head_rcu(&n->node, &head->track_notifier_list);
+       spin_unlock(&kvm->mmu_lock);
+}
+
+/*
+ * Stop receiving event interceptions. It is the opposite operation of
+ * kvm_page_track_register_notifier().
+ */
+void
+kvm_page_track_unregister_notifier(struct kvm *kvm,
+                                  struct kvm_page_track_notifier_node *n)
+{
+       struct kvm_page_track_notifier_head *head;
+
+       head = &kvm->arch.track_notifier_head;
+
+       spin_lock(&kvm->mmu_lock);
+       hlist_del_rcu(&n->node);
+       spin_unlock(&kvm->mmu_lock);
+       synchronize_srcu(&head->track_srcu);
+}
+
+/*
+ * Notify the nodes that a write access has been intercepted and that write
+ * emulation has finished at this time.
+ *
+ * Each node should figure out by itself whether the written page is one it
+ * is interested in.
+ */
+void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+                         int bytes)
+{
+       struct kvm_page_track_notifier_head *head;
+       struct kvm_page_track_notifier_node *n;
+       int idx;
+
+       head = &vcpu->kvm->arch.track_notifier_head;
+
+       if (hlist_empty(&head->track_notifier_list))
+               return;
+
+       idx = srcu_read_lock(&head->track_srcu);
+       hlist_for_each_entry_rcu(n, &head->track_notifier_list, node)
+               if (n->track_write)
+                       n->track_write(vcpu, gpa, new, bytes);
+       srcu_read_unlock(&head->track_srcu, idx);
+}
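
Taken together, the API in this new file is meant to be driven as in the
sketch below; the caller, the locking context and the gfn are assumed valid
as per the comments above:

    static void write_track_gfn(struct kvm *kvm,
                                struct kvm_memory_slot *slot, gfn_t gfn)
    {
            /* from now on, writes to gfn fault, get emulated, and reach
             * every registered track_write hook; large-page mappings of
             * gfn are disallowed while the tracker is active
             */
            kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);

            /* ... kvm_page_track_write() fires on each emulated write ... */

            kvm_slot_page_track_remove_page(kvm, slot, gfn,
                                            KVM_PAGE_TRACK_WRITE);
    }
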
index 2ce4f05..e159a81 100644 (file)
@@ -189,8 +189,11 @@ static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
                ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
                ACC_USER_MASK;
 #else
-       access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-       access &= ~(gpte >> PT64_NX_SHIFT);
+       BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
+       BUILD_BUG_ON(ACC_EXEC_MASK != 1);
+       access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
+       /* Combine NX with P (which is set here) to get ACC_EXEC_MASK.  */
+       access ^= (gpte >> PT64_NX_SHIFT);
 #endif
 
        return access;
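
The XOR trick above works because ACC_EXEC_MASK equals PT_PRESENT_MASK
(bit 0), which the two BUILD_BUG_ONs enforce. A userspace demo of the same
arithmetic, with the PT_* constants assumed from the x86 paging layout:

    #include <stdio.h>
    #include <stdint.h>

    #define PT_PRESENT_MASK  (1ull << 0)
    #define PT_WRITABLE_MASK (1ull << 1)
    #define PT_USER_MASK     (1ull << 2)
    #define PT64_NX_SHIFT    63
    #define ACC_EXEC_MASK    1u      /* == PT_PRESENT_MASK */

    static unsigned gpte_access(uint64_t gpte)
    {
            unsigned access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK |
                                      PT_PRESENT_MASK);
            /* P is set on any walked gpte; XOR with NX flips it into
             * the exec permission bit
             */
            access ^= (unsigned)(gpte >> PT64_NX_SHIFT);
            return access;
    }

    int main(void)
    {
            uint64_t nx = (1ull << PT64_NX_SHIFT) | PT_WRITABLE_MASK |
                          PT_PRESENT_MASK;
            uint64_t x  = PT_WRITABLE_MASK | PT_PRESENT_MASK;

            printf("%u %u\n", gpte_access(nx) & ACC_EXEC_MASK,   /* 0 */
                   gpte_access(x) & ACC_EXEC_MASK);              /* 1 */
            return 0;
    }
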
@@ -702,23 +705,16 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 
        pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
-       if (unlikely(error_code & PFERR_RSVD_MASK)) {
-               r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu));
-               if (likely(r != RET_MMIO_PF_INVALID))
-                       return r;
-
-               /*
-                * page fault with PFEC.RSVD  = 1 is caused by shadow
-                * page fault, should not be used to walk guest page
-                * table.
-                */
-               error_code &= ~PFERR_RSVD_MASK;
-       };
-
        r = mmu_topup_memory_caches(vcpu);
        if (r)
                return r;
 
+       /*
+        * If PFEC.RSVD is set, this is a shadow page fault.
+        * The bit needs to be cleared before walking guest page tables.
+        */
+       error_code &= ~PFERR_RSVD_MASK;
+
        /*
         * Look up the guest pte for the faulting address.
         */
@@ -735,6 +731,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
                return 0;
        }
 
+       if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
+               shadow_page_table_clear_flood(vcpu, addr);
+               return 1;
+       }
+
        vcpu->arch.write_fault_to_shadow_pgtable = false;
 
        is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
@@ -945,7 +946,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
                if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
                                               sizeof(pt_element_t)))
-                       return -EINVAL;
+                       return 0;
 
                if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
                        vcpu->kvm->tlbs_dirty++;
@@ -977,7 +978,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                         host_writable);
        }
 
-       return !nr_present;
+       return nr_present;
 }
 
 #undef pt_element_t
index 31aa2c8..06ce377 100644 (file)
@@ -257,7 +257,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
 
 void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
 {
-       if (vcpu->arch.apic)
+       if (lapic_in_kernel(vcpu))
                kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
 }
 
index c13a64b..9507038 100644 (file)
@@ -1858,8 +1858,7 @@ static int halt_interception(struct vcpu_svm *svm)
 static int vmmcall_interception(struct vcpu_svm *svm)
 {
        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
-       kvm_emulate_hypercall(&svm->vcpu);
-       return 1;
+       return kvm_emulate_hypercall(&svm->vcpu);
 }
 
 static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
index ad9f6a2..2f1ea2f 100644 (file)
@@ -996,11 +996,13 @@ TRACE_EVENT(kvm_enter_smm,
  * Tracepoint for VT-d posted-interrupts.
  */
 TRACE_EVENT(kvm_pi_irte_update,
-       TP_PROTO(unsigned int vcpu_id, unsigned int gsi,
-                unsigned int gvec, u64 pi_desc_addr, bool set),
-       TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set),
+       TP_PROTO(unsigned int host_irq, unsigned int vcpu_id,
+                unsigned int gsi, unsigned int gvec,
+                u64 pi_desc_addr, bool set),
+       TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set),
 
        TP_STRUCT__entry(
+               __field(        unsigned int,   host_irq        )
                __field(        unsigned int,   vcpu_id         )
                __field(        unsigned int,   gsi             )
                __field(        unsigned int,   gvec            )
@@ -1009,6 +1011,7 @@ TRACE_EVENT(kvm_pi_irte_update,
        ),
 
        TP_fast_assign(
+               __entry->host_irq       = host_irq;
                __entry->vcpu_id        = vcpu_id;
                __entry->gsi            = gsi;
                __entry->gvec           = gvec;
@@ -1016,9 +1019,10 @@ TRACE_EVENT(kvm_pi_irte_update,
                __entry->set            = set;
        ),
 
-       TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, "
+       TP_printk("VT-d PI is %s for irq %u, vcpu %u, gsi: 0x%x, "
                  "gvec: 0x%x, pi_desc_addr: 0x%llx",
                  __entry->set ? "enabled and being updated" : "disabled",
+                 __entry->host_irq,
                  __entry->vcpu_id,
                  __entry->gsi,
                  __entry->gvec,
index e2951b6..46154da 100644 (file)
@@ -961,25 +961,36 @@ static const u32 vmx_msr_index[] = {
        MSR_EFER, MSR_TSC_AUX, MSR_STAR,
 };
 
-static inline bool is_page_fault(u32 intr_info)
+static inline bool is_exception_n(u32 intr_info, u8 vector)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                             INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+               (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
+}
+
+static inline bool is_debug(u32 intr_info)
+{
+       return is_exception_n(intr_info, DB_VECTOR);
+}
+
+static inline bool is_breakpoint(u32 intr_info)
+{
+       return is_exception_n(intr_info, BP_VECTOR);
+}
+
+static inline bool is_page_fault(u32 intr_info)
+{
+       return is_exception_n(intr_info, PF_VECTOR);
 }
 
 static inline bool is_no_device(u32 intr_info)
 {
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-                            INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+       return is_exception_n(intr_info, NM_VECTOR);
 }
 
 static inline bool is_invalid_opcode(u32 intr_info)
 {
-       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-                            INTR_INFO_VALID_MASK)) ==
-               (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+       return is_exception_n(intr_info, UD_VECTOR);
 }
 
 static inline bool is_external_interrupt(u32 intr_info)
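
is_exception_n() folds four near-identical predicates into one. A standalone
illustration, assuming the VMX interruption-information layout (vector in
bits 7:0, type in bits 10:8, valid in bit 31):

    #include <stdio.h>
    #include <stdint.h>

    #define INTR_INFO_VECTOR_MASK    0xffu
    #define INTR_INFO_INTR_TYPE_MASK 0x700u
    #define INTR_TYPE_HARD_EXCEPTION (3u << 8)
    #define INTR_INFO_VALID_MASK     (1u << 31)

    static int is_exception_n(uint32_t intr_info, uint8_t vector)
    {
            return (intr_info & (INTR_INFO_INTR_TYPE_MASK |
                                 INTR_INFO_VECTOR_MASK |
                                 INTR_INFO_VALID_MASK)) ==
                   (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
    }

    int main(void)
    {
            /* a valid hard exception with vector 14 (#PF) */
            uint32_t pf = INTR_TYPE_HARD_EXCEPTION | 14 | INTR_INFO_VALID_MASK;

            printf("%d %d\n", is_exception_n(pf, 14),   /* 1 */
                   is_exception_n(pf, 1));              /* 0: not #DB */
            return 0;
    }
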
@@ -5608,11 +5619,8 @@ static int handle_dr(struct kvm_vcpu *vcpu)
        }
 
        if (vcpu->guest_debug == 0) {
-               u32 cpu_based_vm_exec_control;
-
-               cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-               cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
-               vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+               vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+                               CPU_BASED_MOV_DR_EXITING);
 
                /*
                 * No more DR vmexits; force a reload of the debug registers
@@ -5649,8 +5657,6 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
 
 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
 {
-       u32 cpu_based_vm_exec_control;
-
        get_debugreg(vcpu->arch.db[0], 0);
        get_debugreg(vcpu->arch.db[1], 1);
        get_debugreg(vcpu->arch.db[2], 2);
@@ -5659,10 +5665,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
        vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
 
        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
-
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+       vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
 }
 
 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
@@ -5747,8 +5750,7 @@ static int handle_halt(struct kvm_vcpu *vcpu)
 
 static int handle_vmcall(struct kvm_vcpu *vcpu)
 {
-       kvm_emulate_hypercall(vcpu);
-       return 1;
+       return kvm_emulate_hypercall(vcpu);
 }
 
 static int handle_invd(struct kvm_vcpu *vcpu)
@@ -6435,8 +6437,8 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
 
        if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
                /* Recycle the least recently used VMCS. */
-               item = list_entry(vmx->nested.vmcs02_pool.prev,
-                       struct vmcs02_list, list);
+               item = list_last_entry(&vmx->nested.vmcs02_pool,
+                                      struct vmcs02_list, list);
                item->vmptr = vmx->nested.current_vmptr;
                list_move(&item->list, &vmx->nested.vmcs02_pool);
                return &item->vmcs02;
@@ -7752,6 +7754,13 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                else if (is_no_device(intr_info) &&
                         !(vmcs12->guest_cr0 & X86_CR0_TS))
                        return false;
+               else if (is_debug(intr_info) &&
+                        vcpu->guest_debug &
+                        (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+                       return false;
+               else if (is_breakpoint(intr_info) &&
+                        vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+                       return false;
                return vmcs12->exception_bitmap &
                                (1u << (intr_info & INTR_INFO_VECTOR_MASK));
        case EXIT_REASON_EXTERNAL_INTERRUPT:
@@ -10764,13 +10773,26 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
                 */
 
                kvm_set_msi_irq(e, &irq);
-               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+                       /*
+                        * Make sure the IRTE is in remapped mode if
+                        * we don't handle it in posted mode.
+                        */
+                       ret = irq_set_vcpu_affinity(host_irq, NULL);
+                       if (ret < 0) {
+                               printk(KERN_INFO
+                                  "failed to back to remapped mode, irq: %u\n",
+                                  host_irq);
+                               goto out;
+                       }
+
                        continue;
+               }
 
                vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
                vcpu_info.vector = irq.vector;
 
-               trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+               trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
                                vcpu_info.vector, vcpu_info.pi_desc_addr, set);
 
                if (set)
index f4891f2..60d6c00 100644 (file)
@@ -123,6 +123,9 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 unsigned int __read_mostly lapic_timer_advance_ns = 0;
 module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
 
+static bool __read_mostly vector_hashing = true;
+module_param(vector_hashing, bool, S_IRUGO);
+
 static bool __read_mostly backwards_tsc_observed = false;
 
 #define KVM_NR_SHARED_MSRS 16
@@ -1196,17 +1199,11 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 
 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 {
-       uint32_t quotient, remainder;
-
-       /* Don't try to replace with do_div(), this one calculates
-        * "(dividend << 32) / divisor" */
-       __asm__ ( "divl %4"
-                 : "=a" (quotient), "=d" (remainder)
-                 : "0" (0), "1" (dividend), "r" (divisor) );
-       return quotient;
+       do_shl32_div32(dividend, divisor);
+       return dividend;
 }
 
-static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
+static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
                               s8 *pshift, u32 *pmultiplier)
 {
        uint64_t scaled64;
@@ -1214,8 +1211,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
        uint64_t tps64;
        uint32_t tps32;
 
-       tps64 = base_khz * 1000LL;
-       scaled64 = scaled_khz * 1000LL;
+       tps64 = base_hz;
+       scaled64 = scaled_hz;
        while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
                tps64 >>= 1;
                shift--;
@@ -1233,8 +1230,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
        *pshift = shift;
        *pmultiplier = div_frac(scaled64, tps32);
 
-       pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
-                __func__, base_khz, scaled_khz, shift, *pmultiplier);
+       pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
+                __func__, base_hz, scaled_hz, shift, *pmultiplier);
 }
 
 #ifdef CONFIG_X86_64
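
For a concrete feel of the new hz-based kvm_get_time_scale(): scaling a
2.5 GHz TSC to nanoseconds (scaled_hz = 10^9, base_hz = 2.5 * 10^9) runs the
first loop once, giving shift = -1 and multiplier = 0xCCCCCCCC, i.e.
0.8 * 2^32. A sketch of how a consumer applies such a pair, following the
pvclock convention (the kernel guards the multiply with wider math; plain
64-bit arithmetic only fits for this example):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t tsc  = 2500000000ull;  /* one second of a 2.5 GHz TSC */
            uint32_t mult = 0xCCCCCCCCu;    /* ~0.8 * 2^32 */
            int8_t shift  = -1;

            /* shift first, then take the top 32 bits of the product */
            uint64_t ns = ((tsc >> -shift) * (uint64_t)mult) >> 32;

            printf("%llu\n", (unsigned long long)ns);  /* ~999999999 */
            return 0;
    }
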
@@ -1293,23 +1290,23 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
        return 0;
 }
 
-static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
+static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
 {
        u32 thresh_lo, thresh_hi;
        int use_scaling = 0;
 
        /* tsc_khz can be zero if TSC calibration fails */
-       if (this_tsc_khz == 0) {
+       if (user_tsc_khz == 0) {
                /* set tsc_scaling_ratio to a safe value */
                vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
                return -1;
        }
 
        /* Compute a scale to convert nanoseconds in TSC cycles */
-       kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
+       kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
                           &vcpu->arch.virtual_tsc_shift,
                           &vcpu->arch.virtual_tsc_mult);
-       vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+       vcpu->arch.virtual_tsc_khz = user_tsc_khz;
 
        /*
         * Compute the variation in TSC rate which is acceptable
@@ -1319,11 +1316,11 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
         */
        thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
        thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
-       if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
-               pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+       if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
+               pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
                use_scaling = 1;
        }
-       return set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
+       return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
 }
 
 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
@@ -1716,7 +1713,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
-       unsigned long flags, this_tsc_khz, tgt_tsc_khz;
+       unsigned long flags, tgt_tsc_khz;
        struct kvm_vcpu_arch *vcpu = &v->arch;
        struct kvm_arch *ka = &v->kvm->arch;
        s64 kernel_ns;
@@ -1742,8 +1739,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
-       this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
-       if (unlikely(this_tsc_khz == 0)) {
+       tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
+       if (unlikely(tgt_tsc_khz == 0)) {
                local_irq_restore(flags);
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
                return 1;
@@ -1778,13 +1775,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        if (!vcpu->pv_time_enabled)
                return 0;
 
-       if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
-               tgt_tsc_khz = kvm_has_tsc_control ?
-                       vcpu->virtual_tsc_khz : this_tsc_khz;
-               kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
+       if (kvm_has_tsc_control)
+               tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+
+       if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
+               kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
                                   &vcpu->hv_clock.tsc_shift,
                                   &vcpu->hv_clock.tsc_to_system_mul);
-               vcpu->hw_tsc_khz = this_tsc_khz;
+               vcpu->hw_tsc_khz = tgt_tsc_khz;
        }
 
        /* With all the info we got, fill in the values */
@@ -2988,7 +2986,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
        kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
 
        if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
-           kvm_vcpu_has_lapic(vcpu))
+           lapic_in_kernel(vcpu))
                vcpu->arch.apic->sipi_vector = events->sipi_vector;
 
        if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
@@ -3001,7 +2999,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                        vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
                else
                        vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
-               if (kvm_vcpu_has_lapic(vcpu)) {
+               if (lapic_in_kernel(vcpu)) {
                        if (events->smi.latched_init)
                                set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
                        else
@@ -3241,7 +3239,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        switch (ioctl) {
        case KVM_GET_LAPIC: {
                r = -EINVAL;
-               if (!vcpu->arch.apic)
+               if (!lapic_in_kernel(vcpu))
                        goto out;
                u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
 
@@ -3259,7 +3257,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        }
        case KVM_SET_LAPIC: {
                r = -EINVAL;
-               if (!vcpu->arch.apic)
+               if (!lapic_in_kernel(vcpu))
                        goto out;
                u.lapic = memdup_user(argp, sizeof(*u.lapic));
                if (IS_ERR(u.lapic))
@@ -3606,20 +3604,26 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 
 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 {
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+
+       BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+
+       mutex_lock(&kps->lock);
+       memcpy(ps, &kps->channels, sizeof(*ps));
+       mutex_unlock(&kps->lock);
        return 0;
 }
 
 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 {
        int i;
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
+       struct kvm_pit *pit = kvm->arch.vpit;
+
+       mutex_lock(&pit->pit_state.lock);
+       memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
        for (i = 0; i < 3; i++)
-               kvm_pit_load_count(kvm, i, ps->channels[i].count, 0);
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+               kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+       mutex_unlock(&pit->pit_state.lock);
        return 0;
 }
 
@@ -3639,29 +3643,39 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
        int start = 0;
        int i;
        u32 prev_legacy, cur_legacy;
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+       struct kvm_pit *pit = kvm->arch.vpit;
+
+       mutex_lock(&pit->pit_state.lock);
+       prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
        cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
        if (!prev_legacy && cur_legacy)
                start = 1;
-       memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
-              sizeof(kvm->arch.vpit->pit_state.channels));
-       kvm->arch.vpit->pit_state.flags = ps->flags;
+       memcpy(&pit->pit_state.channels, &ps->channels,
+              sizeof(pit->pit_state.channels));
+       pit->pit_state.flags = ps->flags;
        for (i = 0; i < 3; i++)
-               kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count,
+               kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
                                   start && i == 0);
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       mutex_unlock(&pit->pit_state.lock);
        return 0;
 }
 
 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
                                 struct kvm_reinject_control *control)
 {
-       if (!kvm->arch.vpit)
+       struct kvm_pit *pit = kvm->arch.vpit;
+
+       if (!pit)
                return -ENXIO;
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+
+       /* pit->pit_state.lock was overloaded to prevent userspace from getting
+        * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+        * ioctls in parallel.  Use a separate lock if that ioctl isn't rare.
+        */
+       mutex_lock(&pit->pit_state.lock);
+       kvm_pit_set_reinject(pit, control->pit_reinject);
+       mutex_unlock(&pit->pit_state.lock);
+
        return 0;
 }
 
@@ -4094,7 +4108,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
 
        do {
                n = min(len, 8);
-               if (!(vcpu->arch.apic &&
+               if (!(lapic_in_kernel(vcpu) &&
                      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
                    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
                        break;
@@ -4114,7 +4128,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 
        do {
                n = min(len, 8);
-               if (!(vcpu->arch.apic &&
+               if (!(lapic_in_kernel(vcpu) &&
                      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
                                         addr, n, v))
                    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
@@ -4347,7 +4361,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
        ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
        if (ret < 0)
                return 0;
-       kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+       kvm_page_track_write(vcpu, gpa, val, bytes);
        return 1;
 }
 
@@ -4605,7 +4619,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
                return X86EMUL_CMPXCHG_FAILED;
 
        kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
-       kvm_mmu_pte_write(vcpu, gpa, new, bytes);
+       kvm_page_track_write(vcpu, gpa, new, bytes);
 
        return X86EMUL_CONTINUE;
 
@@ -6011,7 +6025,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
        if (!kvm_x86_ops->update_cr8_intercept)
                return;
 
-       if (!vcpu->arch.apic)
+       if (!lapic_in_kernel(vcpu))
                return;
 
        if (vcpu->arch.apicv_active)
@@ -7039,7 +7053,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
 {
-       if (!kvm_vcpu_has_lapic(vcpu) &&
+       if (!lapic_in_kernel(vcpu) &&
            mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
                return -EINVAL;
 
@@ -7594,6 +7608,7 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
 }
 
 struct static_key kvm_no_apic_vcpu __read_mostly;
+EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
 
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
@@ -7725,6 +7740,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
 
+       kvm_page_track_init(kvm);
+       kvm_mmu_init_vm(kvm);
+
        return 0;
 }
 
@@ -7851,6 +7869,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        kfree(kvm->arch.vioapic);
        kvm_free_vcpus(kvm);
        kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+       kvm_mmu_uninit_vm(kvm);
 }
 
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
@@ -7872,6 +7891,8 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
                        free->arch.lpage_info[i - 1] = NULL;
                }
        }
+
+       kvm_page_track_free_memslot(free, dont);
 }
 
 int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
@@ -7880,6 +7901,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
        int i;
 
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+               struct kvm_lpage_info *linfo;
                unsigned long ugfn;
                int lpages;
                int level = i + 1;
@@ -7894,15 +7916,16 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
                if (i == 0)
                        continue;
 
-               slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
-                                       sizeof(*slot->arch.lpage_info[i - 1]));
-               if (!slot->arch.lpage_info[i - 1])
+               linfo = kvm_kvzalloc(lpages * sizeof(*linfo));
+               if (!linfo)
                        goto out_free;
 
+               slot->arch.lpage_info[i - 1] = linfo;
+
                if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
-                       slot->arch.lpage_info[i - 1][0].write_count = 1;
+                       linfo[0].disallow_lpage = 1;
                if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
-                       slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
+                       linfo[lpages - 1].disallow_lpage = 1;
                ugfn = slot->userspace_addr >> PAGE_SHIFT;
                /*
                 * If the gfn and userspace address are not aligned wrt each
@@ -7914,10 +7937,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
                        unsigned long j;
 
                        for (j = 0; j < lpages; ++j)
-                               slot->arch.lpage_info[i - 1][j].write_count = 1;
+                               linfo[j].disallow_lpage = 1;
                }
        }
 
+       if (kvm_page_track_create_memslot(slot, npages))
+               goto out_free;
+
        return 0;
 
 out_free:
@@ -8371,6 +8397,12 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
        return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
 }
 
+bool kvm_vector_hashing_enabled(void)
+{
+       return vector_hashing;
+}
+EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
index f2afa5f..007940f 100644 (file)
@@ -179,6 +179,7 @@ int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
 int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
 bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
                                          int page_num);
+bool kvm_vector_hashing_enabled(void);
 
 #define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
                                | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
@@ -192,4 +193,19 @@ extern unsigned int min_timer_period_us;
 extern unsigned int lapic_timer_advance_ns;
 
 extern struct static_key kvm_no_apic_vcpu;
+
+/* Same "calling convention" as do_div:
+ * - divide (n << 32) by base
+ * - put result in n
+ * - return remainder
+ */
+#define do_shl32_div32(n, base)                                        \
+       ({                                                      \
+           u32 __quot, __rem;                                  \
+           asm("divl %2" : "=a" (__quot), "=d" (__rem)         \
+                       : "rm" (base), "0" (0), "1" ((u32) n)); \
+           n = __quot;                                         \
+           __rem;                                              \
+        })
+
 #endif
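
do_shl32_div32() is x86-only inline asm, and like divl it requires the
quotient to fit in 32 bits. A portable model of what it computes, for
illustration rather than as a drop-in replacement:

    #include <stdio.h>
    #include <stdint.h>

    /* divide (n << 32) by base, leave the quotient in n, return the
     * remainder -- the same contract as the divl-based macro
     */
    static uint32_t shl32_div32(uint32_t *n, uint32_t base)
    {
            uint64_t num = (uint64_t)*n << 32;

            *n = (uint32_t)(num / base);
            return (uint32_t)(num % base);
    }

    int main(void)
    {
            uint32_t n = 1;

            shl32_div32(&n, 3);
            printf("0x%08x\n", n);  /* 0x55555555, i.e. floor(2^32 / 3) */
            return 0;
    }
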
index 4ebc796..2f8c0f4 100644 (file)
@@ -256,12 +256,6 @@ struct hv_monitor_page {
        u8 rsvdz4[1984];
 };
 
-/* Declare the various hypercall operations. */
-enum hv_call_code {
-       HVCALL_POST_MESSAGE     = 0x005c,
-       HVCALL_SIGNAL_EVENT     = 0x005d,
-};
-
 /* Definition of the hv_post_message hypercall input structure. */
 struct hv_input_post_message {
        union hv_connection_id connectionid;
index d6f8322..aa69253 100644 (file)
@@ -359,14 +359,15 @@ TRACE_EVENT(
 #endif
 
 TRACE_EVENT(kvm_halt_poll_ns,
-       TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
+       TP_PROTO(bool grow, unsigned int vcpu_id, unsigned int new,
+                unsigned int old),
        TP_ARGS(grow, vcpu_id, new, old),
 
        TP_STRUCT__entry(
                __field(bool, grow)
                __field(unsigned int, vcpu_id)
-               __field(int, new)
-               __field(int, old)
+               __field(unsigned int, new)
+               __field(unsigned int, old)
        ),
 
        TP_fast_assign(
@@ -376,7 +377,7 @@ TRACE_EVENT(kvm_halt_poll_ns,
                __entry->old            = old;
        ),
 
-       TP_printk("vcpu %u: halt_poll_ns %d (%s %d)",
+       TP_printk("vcpu %u: halt_poll_ns %u (%s %u)",
                        __entry->vcpu_id,
                        __entry->new,
                        __entry->grow ? "grow" : "shrink",
index 50f44a2..a7f1f80 100644 (file)
@@ -157,6 +157,7 @@ struct kvm_s390_skeys {
 
 struct kvm_hyperv_exit {
 #define KVM_EXIT_HYPERV_SYNIC          1
+#define KVM_EXIT_HYPERV_HCALL          2
        __u32 type;
        union {
                struct {
@@ -165,6 +166,11 @@ struct kvm_hyperv_exit {
                        __u64 evt_page;
                        __u64 msg_page;
                } synic;
+               struct {
+                       __u64 input;
+                       __u64 result;
+                       __u64 params[2];
+               } hcall;
        } u;
 };
 
@@ -541,7 +547,13 @@ struct kvm_s390_pgm_info {
        __u8 exc_access_id;
        __u8 per_access_id;
        __u8 op_access_id;
-       __u8 pad[3];
+#define KVM_S390_PGM_FLAGS_ILC_VALID   0x01
+#define KVM_S390_PGM_FLAGS_ILC_0       0x02
+#define KVM_S390_PGM_FLAGS_ILC_1       0x04
+#define KVM_S390_PGM_FLAGS_ILC_MASK    0x06
+#define KVM_S390_PGM_FLAGS_NO_REWIND   0x08
+       __u8 flags;
+       __u8 pad[2];
 };
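
The flag bits repurpose one former pad byte. An illustrative decode, given a struct kvm_s390_pgm_info *pgm (the shift follows from the bit layout above; on s390 the instruction-length code counts halfwords):

    /* Sketch: recover the instruction-length code from flags. */
    if (pgm->flags & KVM_S390_PGM_FLAGS_ILC_VALID) {
            __u8 ilc = (pgm->flags & KVM_S390_PGM_FLAGS_ILC_MASK) >> 1;
            /* ilc is 0..3; the faulting instruction is ilc * 2 bytes */
    }
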
 
 struct kvm_s390_prefix_info {
@@ -850,8 +862,9 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_IOEVENTFD_ANY_LENGTH 122
 #define KVM_CAP_HYPERV_SYNIC 123
 #define KVM_CAP_S390_RI 124
-#define KVM_CAP_ARM_PMU_V3 125
-#define KVM_CAP_VCPU_ATTRIBUTES 126
+#define KVM_CAP_SPAPR_TCE_64 125
+#define KVM_CAP_ARM_PMU_V3 126
+#define KVM_CAP_VCPU_ATTRIBUTES 127
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1144,6 +1157,8 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_PPC_ALLOC_HTAB */
 #define KVM_PPC_ALLOCATE_HTAB    _IOWR(KVMIO, 0xa7, __u32)
 #define KVM_CREATE_SPAPR_TCE     _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
+#define KVM_CREATE_SPAPR_TCE_64          _IOW(KVMIO,  0xa8, \
+                                      struct kvm_create_spapr_tce_64)
 /* Available with KVM_CAP_RMA */
 #define KVM_ALLOCATE_RMA         _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
 /* Available with KVM_CAP_PPC_HTAB_FD */
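
Note the renumbering above: KVM_CAP_ARM_PMU_V3 and KVM_CAP_VCPU_ATTRIBUTES each move up by one to make room for KVM_CAP_SPAPR_TCE_64; both were introduced this cycle, so no released ABI is affected, but userspace must build against the updated header. A minimal probe for the new capability (sketch, standard KVM_CHECK_EXTENSION usage):

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int have_spapr_tce_64(void)
    {
            int kvm = open("/dev/kvm", O_RDWR), has = 0;

            if (kvm >= 0) {
                    has = ioctl(kvm, KVM_CHECK_EXTENSION,
                                KVM_CAP_SPAPR_TCE_64) > 0;
                    close(kvm);
            }
            return has;
    }
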
index db2dd33..b866374 100644 (file)
@@ -109,8 +109,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
        /* cancel outstanding work queue item */
        while (!list_empty(&vcpu->async_pf.queue)) {
                struct kvm_async_pf *work =
-                       list_entry(vcpu->async_pf.queue.next,
-                                  typeof(*work), queue);
+                       list_first_entry(&vcpu->async_pf.queue,
+                                        typeof(*work), queue);
                list_del(&work->queue);
 
 #ifdef CONFIG_KVM_ASYNC_PF_SYNC
@@ -127,8 +127,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
        spin_lock(&vcpu->async_pf.lock);
        while (!list_empty(&vcpu->async_pf.done)) {
                struct kvm_async_pf *work =
-                       list_entry(vcpu->async_pf.done.next,
-                                  typeof(*work), link);
+                       list_first_entry(&vcpu->async_pf.done,
+                                        typeof(*work), link);
                list_del(&work->link);
                kmem_cache_free(async_pf_cache, work);
        }
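
This is purely a readability conversion: list_first_entry is defined in include/linux/list.h as a thin wrapper, so behaviour is unchanged:

    #define list_first_entry(ptr, type, member) \
            list_entry((ptr)->next, type, member)
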
index a11cfd2..1eae052 100644 (file)
@@ -72,11 +72,11 @@ module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
 
 /* Default doubles per-vcpu halt_poll_ns. */
 static unsigned int halt_poll_ns_grow = 2;
-module_param(halt_poll_ns_grow, int, S_IRUGO);
+module_param(halt_poll_ns_grow, uint, S_IRUGO | S_IWUSR);
 
 /* Default resets per-vcpu halt_poll_ns. */
 static unsigned int halt_poll_ns_shrink;
-module_param(halt_poll_ns_shrink, int, S_IRUGO);
+module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR);
 
 /*
  * Ordering of locks:
@@ -620,13 +620,10 @@ void *kvm_kvzalloc(unsigned long size)
 
 static void kvm_destroy_devices(struct kvm *kvm)
 {
-       struct list_head *node, *tmp;
+       struct kvm_device *dev, *tmp;
 
-       list_for_each_safe(node, tmp, &kvm->devices) {
-               struct kvm_device *dev =
-                       list_entry(node, struct kvm_device, vm_node);
-
-               list_del(node);
+       list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
+               list_del(&dev->vm_node);
                dev->ops->destroy(dev);
        }
 }
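
Same idea as the async_pf cleanup: the open-coded walk becomes the idiomatic helper. The _safe variant (essentially as defined in include/linux/list.h) caches the next node up front, which is what makes list_del() on the current entry safe:

    #define list_for_each_entry_safe(pos, n, head, member)                  \
            for (pos = list_first_entry(head, typeof(*pos), member),        \
                    n = list_next_entry(pos, member);                       \
                 &pos->member != (head);                                    \
                 pos = n, n = list_next_entry(n, member))
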
@@ -1437,11 +1434,17 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
 {
        unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
 
-       if (addr == KVM_HVA_ERR_RO_BAD)
+       if (addr == KVM_HVA_ERR_RO_BAD) {
+               if (writable)
+                       *writable = false;
                return KVM_PFN_ERR_RO_FAULT;
+       }
 
-       if (kvm_is_error_hva(addr))
+       if (kvm_is_error_hva(addr)) {
+               if (writable)
+                       *writable = false;
                return KVM_PFN_NOSLOT;
+       }
 
        /* Do not map writable pfn in the readonly memslot. */
        if (writable && memslot_is_readonly(slot)) {
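
Without the two hunks above, callers passing a writable pointer were left with an uninitialized value on the error paths. A sketch of the calling pattern this hardens (signature as in this tree):

    bool writable;
    kvm_pfn_t pfn = __gfn_to_pfn_memslot(slot, gfn, false /* atomic */,
                                         NULL /* async */,
                                         true /* write_fault */, &writable);
    if (is_error_noslot_pfn(pfn))
            return -EFAULT;  /* writable is now reliably false here */
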
@@ -1943,14 +1946,15 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
 
 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
 {
-       int old, val;
+       unsigned int old, val, grow;
 
        old = val = vcpu->halt_poll_ns;
+       grow = READ_ONCE(halt_poll_ns_grow);
        /* 10us base */
-       if (val == 0 && halt_poll_ns_grow)
+       if (val == 0 && grow)
                val = 10000;
        else
-               val *= halt_poll_ns_grow;
+               val *= grow;
 
        vcpu->halt_poll_ns = val;
        trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
@@ -1958,13 +1962,14 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
 
 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
 {
-       int old, val;
+       unsigned int old, val, shrink;
 
        old = val = vcpu->halt_poll_ns;
-       if (halt_poll_ns_shrink == 0)
+       shrink = READ_ONCE(halt_poll_ns_shrink);
+       if (shrink == 0)
                val = 0;
        else
-               val /= halt_poll_ns_shrink;
+               val /= shrink;
 
        vcpu->halt_poll_ns = val;
        trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
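
Because both parameters are now runtime-writable (S_IWUSR above), each function snapshots the value once with READ_ONCE, so a single grow/shrink decision stays consistent even if root rewrites the parameter mid-flight. An illustrative walk-through with the defaults:

    /* grow = 2 (default), shrink = 0 (default):
     *
     *   grow:   0 -> 10000 -> 20000 -> 40000 -> ...   (ns)
     *   shrink: val -> 0                              (full reset)
     *
     * Writing e.g. 2 to /sys/module/kvm/parameters/halt_poll_ns_shrink
     * makes shrink halve the window instead of resetting it. */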