Merge branch 'kvm-ppc-infrastructure' into kvm-ppc-next
author Paul Mackerras <paulus@ozlabs.org>
Fri, 9 Sep 2016 06:24:23 +0000 (16:24 +1000)
committer Paul Mackerras <paulus@ozlabs.org>
Fri, 9 Sep 2016 06:24:23 +0000 (16:24 +1000)
This merges the topic branch 'kvm-ppc-infrastructure' into kvm-ppc-next
so that I can then apply further patches that need the changes in the
kvm-ppc-infrastructure branch.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
14 files changed:
arch/powerpc/include/asm/book3s/64/mmu-hash.h
arch/powerpc/include/asm/hmi.h
arch/powerpc/include/asm/io.h
arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/mmu.h
arch/powerpc/include/asm/paca.h
arch/powerpc/include/asm/pnv-pci.h
arch/powerpc/kernel/Makefile
arch/powerpc/kernel/hmi.c [deleted file]
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/book3s_hv_hmi.c [new file with mode: 0644]
arch/powerpc/mm/hash_native_64.c
arch/powerpc/mm/hash_utils_64.c
arch/powerpc/platforms/powernv/pci-ioda.c

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 287a656..e407af2 100644
@@ -244,6 +244,43 @@ static inline int segment_shift(int ssize)
        return SID_SHIFT_1T;
 }
 
+/*
+ * This array is indexed by the LP field of the HPTE second dword.
+ * Since this field may contain some RPN bits, some entries are
+ * replicated so that we get the same value irrespective of RPN.
+ * The top 4 bits are the page size index (MMU_PAGE_*) for the
+ * actual page size, the bottom 4 bits are the base page size.
+ */
+extern u8 hpte_page_sizes[1 << LP_BITS];
+
+static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l,
+                                            bool is_base_size)
+{
+       unsigned int i, lp;
+
+       if (!(h & HPTE_V_LARGE))
+               return 1ul << 12;
+
+       /* Look at the 8 bit LP value */
+       lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+       i = hpte_page_sizes[lp];
+       if (!i)
+               return 0;
+       if (!is_base_size)
+               i >>= 4;
+       return 1ul << mmu_psize_defs[i & 0xf].shift;
+}
+
+static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
+{
+       return __hpte_page_size(h, l, 0);
+}
+
+static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l)
+{
+       return __hpte_page_size(h, l, 1);
+}
+
 /*
  * The current system page and segment sizes
  */
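
The comment above describes the packing: each byte of hpte_page_sizes[] carries the actual page size index in its top nibble and the base page size index in its bottom nibble, so a single table lookup replaces the old nested penc search, and hpte_page_size()/hpte_base_page_size() are just the two halves of the decode. A minimal user-space sketch of that decode, using made-up stand-in indices and shifts (EX_PAGE_* and ex_shift[] are illustrative, not the kernel's mmu_psize_defs[]):

    #include <stdio.h>

    /* Illustrative page size indices and shifts, NOT the kernel's tables. */
    enum { EX_PAGE_4K = 0, EX_PAGE_64K = 2, EX_PAGE_16M = 7 };
    static const unsigned int ex_shift[] = {
            [EX_PAGE_4K] = 12, [EX_PAGE_64K] = 16, [EX_PAGE_16M] = 24,
    };

    int main(void)
    {
            /* Pack a 64K base / 16M actual combination as (ap << 4) | bp. */
            unsigned char entry = (EX_PAGE_16M << 4) | EX_PAGE_64K;

            /* Decode the same way __hpte_page_size() does. */
            unsigned int bp = entry & 0xf;  /* base page size index */
            unsigned int ap = entry >> 4;   /* actual page size index */

            printf("base %lu bytes, actual %lu bytes\n",
                   1ul << ex_shift[bp], 1ul << ex_shift[ap]);
            return 0;
    }
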
diff --git a/arch/powerpc/include/asm/hmi.h b/arch/powerpc/include/asm/hmi.h
index 88b4901..85b7a1a 100644
@@ -21,7 +21,7 @@
 #ifndef __ASM_PPC64_HMI_H__
 #define __ASM_PPC64_HMI_H__
 
-#ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 
 #define        CORE_TB_RESYNC_REQ_BIT          63
 #define MAX_SUBCORE_PER_CORE           4
diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 2fd1690..f6fda84 100644
@@ -241,6 +241,35 @@ static inline void out_be64(volatile u64 __iomem *addr, u64 val)
 #endif
 #endif /* __powerpc64__ */
 
+
+/*
+ * Simple Cache inhibited accessors
+ * Unlike the DEF_MMIO_* macros, these don't include any h/w memory
+ * barriers; callers need to manage memory barriers on their own.
+ * These can only be used in hypervisor real mode.
+ */
+
+static inline u32 _lwzcix(unsigned long addr)
+{
+       u32 ret;
+
+       __asm__ __volatile__("lwzcix %0,0, %1"
+                            : "=r" (ret) : "r" (addr) : "memory");
+       return ret;
+}
+
+static inline void _stbcix(u64 addr, u8 val)
+{
+       __asm__ __volatile__("stbcix %0,0,%1"
+               : : "r" (val), "r" (addr) : "memory");
+}
+
+static inline void _stwcix(u64 addr, u32 val)
+{
+       __asm__ __volatile__("stwcix %0,0,%1"
+               : : "r" (val), "r" (addr) : "memory");
+}
+
 /*
  * Low level IO stream instructions are defined out of line for now
  */
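
Because these accessors deliberately omit the barriers that the DEF_MMIO_* variants insert (those issue a sync before the store), a hypervisor-real-mode caller has to supply its own ordering. A hedged sketch of such a caller; example_realmode_mmio_write32 is invented for illustration and is not part of this patch:

    #include <linux/types.h>
    #include <asm/io.h>     /* _stwcix() added above */

    /*
     * Sketch only: the caller provides the ordering that DEF_MMIO_*
     * would normally provide around the cache-inhibited store.
     */
    static inline void example_realmode_mmio_write32(u64 addr, u32 val)
    {
            __asm__ __volatile__("sync" : : : "memory");    /* order prior accesses */
            _stwcix(addr, val);                             /* no implicit barrier */
    }
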
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 88d17b4..4ffd5a1 100644
@@ -20,6 +20,8 @@
 #ifndef __ASM_KVM_BOOK3S_64_H__
 #define __ASM_KVM_BOOK3S_64_H__
 
+#include <asm/book3s/64/mmu-hash.h>
+
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu)
 {
@@ -97,56 +99,20 @@ static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
        hpte[0] = cpu_to_be64(hpte_v);
 }
 
-static inline int __hpte_actual_psize(unsigned int lp, int psize)
-{
-       int i, shift;
-       unsigned int mask;
-
-       /* start from 1 ignoring MMU_PAGE_4K */
-       for (i = 1; i < MMU_PAGE_COUNT; i++) {
-
-               /* invalid penc */
-               if (mmu_psize_defs[psize].penc[i] == -1)
-                       continue;
-               /*
-                * encoding bits per actual page size
-                *        PTE LP     actual page size
-                *    rrrr rrrz         >=8KB
-                *    rrrr rrzz         >=16KB
-                *    rrrr rzzz         >=32KB
-                *    rrrr zzzz         >=64KB
-                * .......
-                */
-               shift = mmu_psize_defs[i].shift - LP_SHIFT;
-               if (shift > LP_BITS)
-                       shift = LP_BITS;
-               mask = (1 << shift) - 1;
-               if ((lp & mask) == mmu_psize_defs[psize].penc[i])
-                       return i;
-       }
-       return -1;
-}
-
 static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
                                             unsigned long pte_index)
 {
-       int b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K;
+       int i, b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K;
        unsigned int penc;
        unsigned long rb = 0, va_low, sllp;
        unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
 
        if (v & HPTE_V_LARGE) {
-               for (b_psize = 0; b_psize < MMU_PAGE_COUNT; b_psize++) {
-
-                       /* valid entries have a shift value */
-                       if (!mmu_psize_defs[b_psize].shift)
-                               continue;
-
-                       a_psize = __hpte_actual_psize(lp, b_psize);
-                       if (a_psize != -1)
-                               break;
-               }
+               i = hpte_page_sizes[lp];
+               b_psize = i & 0xf;
+               a_psize = i >> 4;
        }
+
        /*
         * Ignore the top 14 bits of va
         * v have top two bits covering segment size, hence move
@@ -215,45 +181,6 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
        return rb;
 }
 
-static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l,
-                                            bool is_base_size)
-{
-
-       int size, a_psize;
-       /* Look at the 8 bit LP value */
-       unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
-
-       /* only handle 4k, 64k and 16M pages for now */
-       if (!(h & HPTE_V_LARGE))
-               return 1ul << 12;
-       else {
-               for (size = 0; size < MMU_PAGE_COUNT; size++) {
-                       /* valid entries have a shift value */
-                       if (!mmu_psize_defs[size].shift)
-                               continue;
-
-                       a_psize = __hpte_actual_psize(lp, size);
-                       if (a_psize != -1) {
-                               if (is_base_size)
-                                       return 1ul << mmu_psize_defs[size].shift;
-                               return 1ul << mmu_psize_defs[a_psize].shift;
-                       }
-               }
-
-       }
-       return 0;
-}
-
-static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
-{
-       return __hpte_page_size(h, l, 0);
-}
-
-static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l)
-{
-       return __hpte_page_size(h, l, 1);
-}
-
 static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
 {
        return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index e2fb408..b78e8d3 100644
@@ -271,6 +271,7 @@ static inline bool early_radix_enabled(void)
 #define MMU_PAGE_16G   13
 #define MMU_PAGE_64G   14
 
+/* N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 */
 #define MMU_PAGE_COUNT 15
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 148303e..6a6792b 100644
@@ -183,11 +183,6 @@ struct paca_struct {
         */
        u16 in_mce;
        u8 hmi_event_available;          /* HMI event is available */
-       /*
-        * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for
-        * more details
-        */
-       struct sibling_subcore_state *sibling_subcore_state;
 #endif
 
        /* Stuff for accurate time accounting */
@@ -202,6 +197,13 @@ struct paca_struct {
        struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
 #endif
        struct kvmppc_host_state kvm_hstate;
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+       /*
+        * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for
+        * more details
+        */
+       struct sibling_subcore_state *sibling_subcore_state;
+#endif
 #endif
 };
 
diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h
index 0cbd813..1b46b52 100644
@@ -12,6 +12,7 @@
 
 #include <linux/pci.h>
 #include <linux/pci_hotplug.h>
+#include <linux/irq.h>
 #include <misc/cxl-base.h>
 #include <asm/opal-api.h>
 
@@ -33,6 +34,8 @@ int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num);
 void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num);
 int pnv_cxl_get_irq_count(struct pci_dev *dev);
 struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev);
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq);
+bool is_pnv_opal_msi(struct irq_chip *chip);
 
 #ifdef CONFIG_CXL_BASE
 int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs,
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index b2027a5..fe4c075 100644
@@ -41,7 +41,7 @@ obj-$(CONFIG_VDSO32)          += vdso32/
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)       += hw_breakpoint.o
 obj-$(CONFIG_PPC_BOOK3S_64)    += cpu_setup_ppc970.o cpu_setup_pa6t.o
 obj-$(CONFIG_PPC_BOOK3S_64)    += cpu_setup_power.o
-obj-$(CONFIG_PPC_BOOK3S_64)    += mce.o mce_power.o hmi.o
+obj-$(CONFIG_PPC_BOOK3S_64)    += mce.o mce_power.o
 obj-$(CONFIG_PPC_BOOK3E_64)    += exceptions-64e.o idle_book3e.o
 obj-$(CONFIG_PPC64)            += vdso64/
 obj-$(CONFIG_ALTIVEC)          += vecemu.o
diff --git a/arch/powerpc/kernel/hmi.c b/arch/powerpc/kernel/hmi.c
deleted file mode 100644
index e3f738e..0000000
--- a/arch/powerpc/kernel/hmi.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Hypervisor Maintenance Interrupt (HMI) handling.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.
- *
- * Copyright 2015 IBM Corporation
- * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
- */
-
-#undef DEBUG
-
-#include <linux/types.h>
-#include <linux/compiler.h>
-#include <asm/paca.h>
-#include <asm/hmi.h>
-
-void wait_for_subcore_guest_exit(void)
-{
-       int i;
-
-       /*
-        * NULL bitmap pointer indicates that KVM module hasn't
-        * been loaded yet and hence no guests are running.
-        * If no KVM is in use, no need to co-ordinate among threads
-        * as all of them will always be in host and no one is going
-        * to modify TB other than the opal hmi handler.
-        * Hence, just return from here.
-        */
-       if (!local_paca->sibling_subcore_state)
-               return;
-
-       for (i = 0; i < MAX_SUBCORE_PER_CORE; i++)
-               while (local_paca->sibling_subcore_state->in_guest[i])
-                       cpu_relax();
-}
-
-void wait_for_tb_resync(void)
-{
-       if (!local_paca->sibling_subcore_state)
-               return;
-
-       while (test_bit(CORE_TB_RESYNC_REQ_BIT,
-                               &local_paca->sibling_subcore_state->flags))
-               cpu_relax();
-}
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index bd9b82f..7dd89b7 100644
@@ -77,6 +77,7 @@ kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
 
 ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
+       book3s_hv_hmi.o \
        book3s_hv_rmhandlers.o \
        book3s_hv_rm_mmu.o \
        book3s_hv_ras.o \
diff --git a/arch/powerpc/kvm/book3s_hv_hmi.c b/arch/powerpc/kvm/book3s_hv_hmi.c
new file mode 100644
index 0000000..e3f738e
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_hmi.c
@@ -0,0 +1,56 @@
+/*
+ * Hypervisor Maintenance Interrupt (HMI) handling.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.
+ *
+ * Copyright 2015 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <asm/paca.h>
+#include <asm/hmi.h>
+
+void wait_for_subcore_guest_exit(void)
+{
+       int i;
+
+       /*
+        * NULL bitmap pointer indicates that KVM module hasn't
+        * been loaded yet and hence no guests are running.
+        * If no KVM is in use, no need to co-ordinate among threads
+        * as all of them will always be in host and no one is going
+        * to modify TB other than the opal hmi handler.
+        * Hence, just return from here.
+        */
+       if (!local_paca->sibling_subcore_state)
+               return;
+
+       for (i = 0; i < MAX_SUBCORE_PER_CORE; i++)
+               while (local_paca->sibling_subcore_state->in_guest[i])
+                       cpu_relax();
+}
+
+void wait_for_tb_resync(void)
+{
+       if (!local_paca->sibling_subcore_state)
+               return;
+
+       while (test_bit(CORE_TB_RESYNC_REQ_BIT,
+                               &local_paca->sibling_subcore_state->flags))
+               cpu_relax();
+}
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 0e4e965..83ddc0e 100644
@@ -493,36 +493,6 @@ static void native_hugepage_invalidate(unsigned long vsid,
 }
 #endif
 
-static inline int __hpte_actual_psize(unsigned int lp, int psize)
-{
-       int i, shift;
-       unsigned int mask;
-
-       /* start from 1 ignoring MMU_PAGE_4K */
-       for (i = 1; i < MMU_PAGE_COUNT; i++) {
-
-               /* invalid penc */
-               if (mmu_psize_defs[psize].penc[i] == -1)
-                       continue;
-               /*
-                * encoding bits per actual page size
-                *        PTE LP     actual page size
-                *    rrrr rrrz         >=8KB
-                *    rrrr rrzz         >=16KB
-                *    rrrr rzzz         >=32KB
-                *    rrrr zzzz         >=64KB
-                * .......
-                */
-               shift = mmu_psize_defs[i].shift - LP_SHIFT;
-               if (shift > LP_BITS)
-                       shift = LP_BITS;
-               mask = (1 << shift) - 1;
-               if ((lp & mask) == mmu_psize_defs[psize].penc[i])
-                       return i;
-       }
-       return -1;
-}
-
 static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
                        int *psize, int *apsize, int *ssize, unsigned long *vpn)
 {
@@ -538,16 +508,8 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
                size   = MMU_PAGE_4K;
                a_size = MMU_PAGE_4K;
        } else {
-               for (size = 0; size < MMU_PAGE_COUNT; size++) {
-
-                       /* valid entries have a shift value */
-                       if (!mmu_psize_defs[size].shift)
-                               continue;
-
-                       a_size = __hpte_actual_psize(lp, size);
-                       if (a_size != -1)
-                               break;
-               }
+               size = hpte_page_sizes[lp] & 0xf;
+               a_size = hpte_page_sizes[lp] >> 4;
        }
        /* This works for all page sizes, and for 256M and 1T segments */
        if (cpu_has_feature(CPU_FTR_ARCH_300))
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 0821556..ef3ae89 100644
@@ -93,6 +93,9 @@ static unsigned long _SDR1;
 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
 EXPORT_SYMBOL_GPL(mmu_psize_defs);
 
+u8 hpte_page_sizes[1 << LP_BITS];
+EXPORT_SYMBOL_GPL(hpte_page_sizes);
+
 struct hash_pte *htab_address;
 unsigned long htab_size_bytes;
 unsigned long htab_hash_mask;
@@ -564,8 +567,60 @@ static void __init htab_scan_page_sizes(void)
 #endif /* CONFIG_HUGETLB_PAGE */
 }
 
+/*
+ * Fill in the hpte_page_sizes[] array.
+ * We go through the mmu_psize_defs[] array looking for all the
+ * supported base/actual page size combinations.  Each combination
+ * has a unique pagesize encoding (penc) value in the low bits of
+ * the LP field of the HPTE.  For actual page sizes less than 1MB,
+ * some of the upper LP bits are used for RPN bits, meaning that
+ * we need to fill in several entries in hpte_page_sizes[].
+ *
+ * In diagrammatic form, with r = RPN bits and z = page size bits:
+ *        PTE LP     actual page size
+ *    rrrr rrrz                >=8KB
+ *    rrrr rrzz                >=16KB
+ *    rrrr rzzz                >=32KB
+ *    rrrr zzzz                >=64KB
+ *    ...
+ *
+ * The zzzz bits are implementation-specific but are chosen so that
+ * no encoding for a larger page size uses the same value in its
+ * low-order N bits as the encoding for the 2^(12+N) byte page size
+ * (if it exists).
+ */
+static void init_hpte_page_sizes(void)
+{
+       long int ap, bp;
+       long int shift, penc;
+
+       for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) {
+               if (!mmu_psize_defs[bp].shift)
+                       continue;       /* not a supported page size */
+               for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) {
+                       penc = mmu_psize_defs[bp].penc[ap];
+                       if (penc == -1)
+                               continue;
+                       shift = mmu_psize_defs[ap].shift - LP_SHIFT;
+                       if (shift <= 0)
+                               continue;       /* should never happen */
+                       /*
+                        * For page sizes less than 1MB, this loop
+                        * replicates the entry for all possible values
+                        * of the rrrr bits.
+                        */
+                       while (penc < (1 << LP_BITS)) {
+                               hpte_page_sizes[penc] = (ap << 4) | bp;
+                               penc += 1 << shift;
+                       }
+               }
+       }
+}
+
 static void __init htab_init_page_sizes(void)
 {
+       init_hpte_page_sizes();
+
        if (!debug_pagealloc_enabled()) {
                /*
                 * Pick a size for the linear mapping. Currently, we only
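
To make the replication loop in init_hpte_page_sizes() concrete: for an actual page size of 64K, shift is 16 - LP_SHIFT = 4, so the same (ap << 4) | bp byte is stored at every 16th slot of the 256-entry table, once for each value of the rrrr RPN bits above the four encoding bits. A standalone sketch under assumed values (the penc and page size indices below are illustrative, not taken from mmu_psize_defs[]):

    #include <stdio.h>

    #define LP_BITS  8
    #define LP_SHIFT 12

    int main(void)
    {
            unsigned char table[1 << LP_BITS] = { 0 };
            int ap = 2, bp = 2;             /* illustrative page size indices */
            int shift = 16 - LP_SHIFT;      /* 64K actual: 4 encoding bits in LP */
            int penc = 0x1;                 /* example encoding, not a real penc */

            while (penc < (1 << LP_BITS)) {
                    table[penc] = (ap << 4) | bp;
                    penc += 1 << shift;
            }

            /* 16 of the 256 entries now carry this base/actual combination. */
            for (int i = 0; i < (1 << LP_BITS); i++)
                    if (table[i])
                            printf("LP 0x%02x -> base idx %d, actual idx %d\n",
                                   i, table[i] & 0xf, table[i] >> 4);
            return 0;
    }
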
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index fd9444f..9ce48ae 100644
@@ -2710,15 +2710,21 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 }
 
 #ifdef CONFIG_PCI_MSI
-static void pnv_ioda2_msi_eoi(struct irq_data *d)
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
 {
-       unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
-       struct irq_chip *chip = irq_data_get_irq_chip(d);
        struct pnv_phb *phb = container_of(chip, struct pnv_phb,
                                           ioda.irq_chip);
+
+       return opal_pci_msi_eoi(phb->opal_id, hw_irq);
+}
+
+static void pnv_ioda2_msi_eoi(struct irq_data *d)
+{
        int64_t rc;
+       unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+       struct irq_chip *chip = irq_data_get_irq_chip(d);
 
-       rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
+       rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
        WARN_ON_ONCE(rc);
 
        icp_native_eoi(d);
@@ -2748,6 +2754,16 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
        irq_set_chip(virq, &phb->ioda.irq_chip);
 }
 
+/*
+ * Returns true iff chip is something that we could call
+ * pnv_opal_pci_msi_eoi for.
+ */
+bool is_pnv_opal_msi(struct irq_chip *chip)
+{
+       return chip->irq_eoi == pnv_ioda2_msi_eoi;
+}
+EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
+
 static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
                                  unsigned int hwirq, unsigned int virq,
                                  unsigned int is_64, struct msi_msg *msg)
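
The two new exports let code outside pci-ioda.c recognise an IODA2 MSI irq_chip and issue the OPAL EOI for it, presumably for the follow-up patches this merge prepares for. A hypothetical consumer sketch (example_opal_msi_eoi is invented for illustration and is not part of this merge):

    #include <linux/errno.h>
    #include <linux/irq.h>
    #include <asm/pnv-pci.h>

    /*
     * Given an irq_chip, check whether the new OPAL EOI path applies to it
     * and, if so, EOI the hardware interrupt directly via OPAL.
     */
    static int example_opal_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
    {
            if (!is_pnv_opal_msi(chip))
                    return -ENODEV;         /* some other interrupt chip */

            return pnv_opal_pci_msi_eoi(chip, hw_irq) ? -EIO : 0;
    }
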