Merge tag 'for-linus-4.3-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git...
authorLinus Torvalds <torvalds@linux-foundation.org>
Tue, 8 Sep 2015 18:46:48 +0000 (11:46 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Tue, 8 Sep 2015 18:46:48 +0000 (11:46 -0700)
Pull xen updates from David Vrabel:
 "Xen features and fixes for 4.3:

   - Convert xen-blkfront to the multiqueue API
   - [arm] Support binding event channels to different VCPUs.
   - [x86] Support > 512 GiB in PV guests (off by default as such a
     guest cannot be migrated with the current toolstack).
   - [x86] PMU support for PV dom0 (limited support for using perf with
     Xen and other guests)"

* tag 'for-linus-4.3-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (33 commits)
  xen: switch extra memory accounting to use pfns
  xen: limit memory to architectural maximum
  xen: avoid another early crash of memory limited dom0
  xen: avoid early crash of memory limited dom0
  arm/xen: Remove helpers which are PV specific
  xen/x86: Don't try to set PCE bit in CR4
  xen/PMU: PMU emulation code
  xen/PMU: Intercept PMU-related MSR and APIC accesses
  xen/PMU: Describe vendor-specific PMU registers
  xen/PMU: Initialization code for Xen PMU
  xen/PMU: Sysfs interface for setting Xen PMU mode
  xen: xensyms support
  xen: remove no longer needed p2m.h
  xen: allow more than 512 GB of RAM for 64 bit pv-domains
  xen: move p2m list if conflicting with e820 map
  xen: add explicit memblock_reserve() calls for special pages
  mm: provide early_memremap_ro to establish read-only mapping
  xen: check for initrd conflicting with e820 map
  xen: check pre-allocated page tables for conflict with memory map
  xen: check for kernel memory conflicting with memory layout
  ...

42 files changed:
Documentation/ABI/testing/sysfs-hypervisor-pmu [new file with mode: 0644]
Documentation/kernel-parameters.txt
arch/arm/include/asm/xen/events.h
arch/arm/include/asm/xen/page.h
arch/arm/xen/enlighten.c
arch/arm64/include/asm/xen/events.h
arch/x86/include/asm/xen/events.h
arch/x86/include/asm/xen/hypercall.h
arch/x86/include/asm/xen/interface.h
arch/x86/include/asm/xen/page.h
arch/x86/xen/Kconfig
arch/x86/xen/Makefile
arch/x86/xen/apic.c
arch/x86/xen/enlighten.c
arch/x86/xen/mmu.c
arch/x86/xen/p2m.c
arch/x86/xen/p2m.h [deleted file]
arch/x86/xen/platform-pci-unplug.c
arch/x86/xen/pmu.c [new file with mode: 0644]
arch/x86/xen/pmu.h [new file with mode: 0644]
arch/x86/xen/setup.c
arch/x86/xen/smp.c
arch/x86/xen/suspend.c
arch/x86/xen/xen-head.S
arch/x86/xen/xen-ops.h
drivers/block/xen-blkfront.c
drivers/xen/Kconfig
drivers/xen/balloon.c
drivers/xen/events/events_base.c
drivers/xen/sys-hypervisor.c
drivers/xen/xenfs/Makefile
drivers/xen/xenfs/super.c
drivers/xen/xenfs/xenfs.h
drivers/xen/xenfs/xensyms.c [new file with mode: 0644]
include/asm-generic/early_ioremap.h
include/asm-generic/fixmap.h
include/xen/events.h
include/xen/interface/platform.h
include/xen/interface/xen.h
include/xen/interface/xenpmu.h [new file with mode: 0644]
include/xen/page.h
mm/early_ioremap.c

diff --git a/Documentation/ABI/testing/sysfs-hypervisor-pmu b/Documentation/ABI/testing/sysfs-hypervisor-pmu
new file mode 100644 (file)
index 0000000..224faa1
--- /dev/null
@@ -0,0 +1,23 @@
+What:          /sys/hypervisor/pmu/pmu_mode
+Date:          August 2015
+KernelVersion: 4.3
+Contact:       Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Description:
+               Describes mode that Xen's performance-monitoring unit (PMU)
+               uses. Accepted values are
+                       "off"  -- PMU is disabled
+                       "self" -- The guest can profile itself
+                       "hv"   -- The guest can profile itself and, if it is
+                                 privileged (e.g. dom0), the hypervisor
+                       "all" --  The guest can profile itself, the hypervisor
+                                 and all other guests. Only available to
+                                 privileged guests.
+
+What:           /sys/hypervisor/pmu/pmu_features
+Date:           August 2015
+KernelVersion:  4.3
+Contact:        Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Description:
+               Describes Xen PMU features (as an integer). A set bit indicates
+               that the corresponding feature is enabled. See
+               include/xen/interface/xenpmu.h for available features
index f0c9505..22a4b68 100644 (file)
@@ -4106,6 +4106,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        plus one apbt timer for broadcast timer.
                        x86_intel_mid_timer=apbt_only | lapic_and_apbt
 
+       xen_512gb_limit         [KNL,X86-64,XEN]
+                       Restricts the kernel running paravirtualized under Xen
+                       to use only up to 512 GB of RAM. The reason to do so is
+                       crash analysis tools and Xen tools for doing domain
+                       save/restore/migration must be enabled to handle larger
+                       domains.
+
        xen_emul_unplug=                [HW,X86,XEN]
                        Unplug Xen emulated devices
                        Format: [unplug0,][unplug1]
index 8b1f37b..71e473d 100644 (file)
@@ -20,4 +20,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
                                                            atomic64_t, \
                                                            counter), (val))
 
+/* Rebind event channel is supported by default */
+static inline bool xen_support_evtchn_rebind(void)
+{
+       return true;
+}
+
 #endif /* _ASM_ARM_XEN_EVENTS_H */
index 1bee8ca..98b1084 100644 (file)
@@ -54,26 +54,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
 
 #define mfn_to_local_pfn(mfn) mfn_to_pfn(mfn)
 
-static inline xmaddr_t phys_to_machine(xpaddr_t phys)
-{
-       unsigned offset = phys.paddr & ~PAGE_MASK;
-       return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
-}
-
-static inline xpaddr_t machine_to_phys(xmaddr_t machine)
-{
-       unsigned offset = machine.maddr & ~PAGE_MASK;
-       return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
-}
 /* VIRT <-> MACHINE conversion */
-#define virt_to_machine(v)     (phys_to_machine(XPADDR(__pa(v))))
 #define virt_to_mfn(v)         (pfn_to_mfn(virt_to_pfn(v)))
 #define mfn_to_virt(m)         (__va(mfn_to_pfn(m) << PAGE_SHIFT))
 
+/* Only used in PV code. But ARM guests are always HVM. */
 static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 {
-       /* TODO: assuming it is mapped in the kernel 1:1 */
-       return virt_to_machine(vaddr);
+       BUG();
 }
 
 /* TODO: this shouldn't be here but it is because the frontend drivers
index 6c09cc4..c50c8d3 100644 (file)
@@ -45,13 +45,6 @@ static struct vcpu_info __percpu *xen_vcpu_info;
 unsigned long xen_released_pages;
 struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
 
-/* TODO: to be removed */
-__read_mostly int xen_have_vector_callback;
-EXPORT_SYMBOL_GPL(xen_have_vector_callback);
-
-int xen_platform_pci_unplug = XEN_UNPLUG_ALL;
-EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
-
 static __read_mostly unsigned int xen_events_irq;
 
 static __initdata struct device_node *xen_node;
index 8655321..4318866 100644 (file)
@@ -18,4 +18,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
 
 #define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
 
+/* Rebind event channel is supported by default */
+static inline bool xen_support_evtchn_rebind(void)
+{
+       return true;
+}
+
 #endif /* _ASM_ARM64_XEN_EVENTS_H */
index 608a79d..e6911ca 100644 (file)
@@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
 /* No need for a barrier -- XCHG is a barrier on x86. */
 #define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
 
+extern int xen_have_vector_callback;
+
+/*
+ * Events delivered via platform PCI interrupts are always
+ * routed to vcpu 0 and hence cannot be rebound.
+ */
+static inline bool xen_support_evtchn_rebind(void)
+{
+       return (!xen_hvm_domain() || xen_have_vector_callback);
+}
+
 #endif /* _ASM_X86_XEN_EVENTS_H */
index ca08a27..83aea80 100644 (file)
@@ -465,6 +465,12 @@ HYPERVISOR_tmem_op(
        return _hypercall1(int, tmem_op, op);
 }
 
+static inline int
+HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
+{
+       return _hypercall2(int, xenpmu_op, op, arg);
+}
+
 static inline void
 MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
 {
index 3400dba..62ca03e 100644 (file)
@@ -3,12 +3,38 @@
  *
  * Guest OS interface to x86 Xen.
  *
- * Copyright (c) 2004, K A Fraser
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2006, K A Fraser
  */
 
 #ifndef _ASM_X86_XEN_INTERFACE_H
 #define _ASM_X86_XEN_INTERFACE_H
 
+/*
+ * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field
+ * in a struct in memory.
+ * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an
+ * hypercall argument.
+ * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but
+ * they might not be on other architectures.
+ */
 #ifdef __XEN__
 #define __DEFINE_GUEST_HANDLE(name, type) \
     typedef struct { type *p; } __guest_handle_ ## name
@@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t);
  * start of the GDT because some stupid OSes export hard-coded selector values
  * in their ABI. These hard-coded values are always near the start of the GDT,
  * so Xen places itself out of the way, at the far end of the GDT.
+ *
+ * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op
  */
 #define FIRST_RESERVED_GDT_PAGE  14
 #define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
 #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
 
 /*
- * Send an array of these to HYPERVISOR_set_trap_table()
+ * Send an array of these to HYPERVISOR_set_trap_table().
+ * Terminate the array with a sentinel entry, with traps[].address==0.
  * The privilege level specifies which modes may enter a trap via a software
  * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
  * privilege levels as follows:
@@ -118,10 +147,41 @@ struct trap_info {
 DEFINE_GUEST_HANDLE_STRUCT(trap_info);
 
 struct arch_shared_info {
-    unsigned long max_pfn;                  /* max pfn that appears in table */
-    /* Frame containing list of mfns containing list of mfns containing p2m. */
-    unsigned long pfn_to_mfn_frame_list_list;
-    unsigned long nmi_reason;
+       /*
+        * Number of valid entries in the p2m table(s) anchored at
+        * pfn_to_mfn_frame_list_list and/or p2m_vaddr.
+        */
+       unsigned long max_pfn;
+       /*
+        * Frame containing list of mfns containing list of mfns containing p2m.
+        * A value of 0 indicates it has not yet been set up, ~0 indicates it
+        * has been set to invalid e.g. due to the p2m being too large for the
+        * 3-level p2m tree. In this case the linear mapper p2m list anchored
+        * at p2m_vaddr is to be used.
+        */
+       xen_pfn_t pfn_to_mfn_frame_list_list;
+       unsigned long nmi_reason;
+       /*
+        * Following three fields are valid if p2m_cr3 contains a value
+        * different from 0.
+        * p2m_cr3 is the root of the address space where p2m_vaddr is valid.
+        * p2m_cr3 is in the same format as a cr3 value in the vcpu register
+        * state and holds the folded machine frame number (via xen_pfn_to_cr3)
+        * of a L3 or L4 page table.
+        * p2m_vaddr holds the virtual address of the linear p2m list. All
+        * entries in the range [0...max_pfn[ are accessible via this pointer.
+        * p2m_generation will be incremented by the guest before and after each
+        * change of the mappings of the p2m list. p2m_generation starts at 0
+        * and a value with the least significant bit set indicates that a
+        * mapping update is in progress. This allows guest external software
+        * (e.g. in Dom0) to verify that read mappings are consistent and
+        * whether they have changed since the last check.
+        * Modifying a p2m element in the linear p2m list is allowed via an
+        * atomic write only.
+        */
+       unsigned long p2m_cr3;          /* cr3 value of the p2m address space */
+       unsigned long p2m_vaddr;        /* virtual address of the p2m list */
+       unsigned long p2m_generation;   /* generation count of p2m mapping */
 };
 #endif /* !__ASSEMBLY__ */
 
@@ -137,13 +197,31 @@ struct arch_shared_info {
 /*
  * The following is all CPU context. Note that the fpu_ctxt block is filled
  * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
+ *
+ * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise
+ * for HVM and PVH guests, not all information in this structure is updated:
+ *
+ * - For HVM guests, the structures read include: fpu_ctxt (if
+ * VGCT_I387_VALID is set), flags, user_regs, debugreg[*]
+ *
+ * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to
+ * set cr3. All other fields not used should be set to 0.
  */
 struct vcpu_guest_context {
     /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
     struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
-#define VGCF_I387_VALID (1<<0)
-#define VGCF_HVM_GUEST  (1<<1)
-#define VGCF_IN_KERNEL  (1<<2)
+#define VGCF_I387_VALID                (1<<0)
+#define VGCF_IN_KERNEL                 (1<<2)
+#define _VGCF_i387_valid               0
+#define VGCF_i387_valid                (1<<_VGCF_i387_valid)
+#define _VGCF_in_kernel                2
+#define VGCF_in_kernel                 (1<<_VGCF_in_kernel)
+#define _VGCF_failsafe_disables_events 3
+#define VGCF_failsafe_disables_events  (1<<_VGCF_failsafe_disables_events)
+#define _VGCF_syscall_disables_events  4
+#define VGCF_syscall_disables_events   (1<<_VGCF_syscall_disables_events)
+#define _VGCF_online                   5
+#define VGCF_online                    (1<<_VGCF_online)
     unsigned long flags;                    /* VGCF_* flags                 */
     struct cpu_user_regs user_regs;         /* User-level CPU registers     */
     struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
@@ -172,6 +250,129 @@ struct vcpu_guest_context {
 #endif
 };
 DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
+
+/* AMD PMU registers and structures */
+struct xen_pmu_amd_ctxt {
+       /*
+        * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd).
+        * For PV(H) guests these fields are RO.
+        */
+       uint32_t counters;
+       uint32_t ctrls;
+
+       /* Counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+       uint64_t regs[];
+#elif defined(__GNUC__)
+       uint64_t regs[0];
+#endif
+};
+
+/* Intel PMU registers and structures */
+struct xen_pmu_cntr_pair {
+       uint64_t counter;
+       uint64_t control;
+};
+
+struct xen_pmu_intel_ctxt {
+       /*
+        * Offsets to fixed and architectural counter MSRs (relative to
+        * xen_pmu_arch.c.intel).
+        * For PV(H) guests these fields are RO.
+        */
+       uint32_t fixed_counters;
+       uint32_t arch_counters;
+
+       /* PMU registers */
+       uint64_t global_ctrl;
+       uint64_t global_ovf_ctrl;
+       uint64_t global_status;
+       uint64_t fixed_ctrl;
+       uint64_t ds_area;
+       uint64_t pebs_enable;
+       uint64_t debugctl;
+
+       /* Fixed and architectural counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+       uint64_t regs[];
+#elif defined(__GNUC__)
+       uint64_t regs[0];
+#endif
+};
+
+/* Sampled domain's registers */
+struct xen_pmu_regs {
+       uint64_t ip;
+       uint64_t sp;
+       uint64_t flags;
+       uint16_t cs;
+       uint16_t ss;
+       uint8_t cpl;
+       uint8_t pad[3];
+};
+
+/* PMU flags */
+#define PMU_CACHED        (1<<0) /* PMU MSRs are cached in the context */
+#define PMU_SAMPLE_USER           (1<<1) /* Sample is from user or kernel mode */
+#define PMU_SAMPLE_REAL           (1<<2) /* Sample is from realmode */
+#define PMU_SAMPLE_PV     (1<<3) /* Sample from a PV guest */
+
+/*
+ * Architecture-specific information describing state of the processor at
+ * the time of PMU interrupt.
+ * Fields of this structure marked as RW for guest should only be written by
+ * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the
+ * hypervisor during PMU interrupt). Hypervisor will read updated data in
+ * XENPMU_flush hypercall and clear PMU_CACHED bit.
+ */
+struct xen_pmu_arch {
+       union {
+               /*
+                * Processor's registers at the time of interrupt.
+                * WO for hypervisor, RO for guests.
+                */
+               struct xen_pmu_regs regs;
+               /*
+                * Padding for adding new registers to xen_pmu_regs in
+                * the future
+                */
+#define XENPMU_REGS_PAD_SZ  64
+               uint8_t pad[XENPMU_REGS_PAD_SZ];
+       } r;
+
+       /* WO for hypervisor, RO for guest */
+       uint64_t pmu_flags;
+
+       /*
+        * APIC LVTPC register.
+        * RW for both hypervisor and guest.
+        * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware
+        * during XENPMU_flush or XENPMU_lvtpc_set.
+        */
+       union {
+               uint32_t lapic_lvtpc;
+               uint64_t pad;
+       } l;
+
+       /*
+        * Vendor-specific PMU registers.
+        * RW for both hypervisor and guest (see exceptions above).
+        * Guest's updates to this field are verified and then loaded by the
+        * hypervisor into hardware during XENPMU_flush
+        */
+       union {
+               struct xen_pmu_amd_ctxt amd;
+               struct xen_pmu_intel_ctxt intel;
+
+               /*
+                * Padding for contexts (fixed parts only, does not include
+                * MSR banks that are specified by offsets)
+                */
+#define XENPMU_CTXT_PAD_SZ  128
+               uint8_t pad[XENPMU_CTXT_PAD_SZ];
+       } c;
+};
+
 #endif /* !__ASSEMBLY__ */
 
 /*
index c44a5d5..a3804fb 100644 (file)
@@ -35,9 +35,7 @@ typedef struct xpaddr {
 #define FOREIGN_FRAME(m)       ((m) | FOREIGN_FRAME_BIT)
 #define IDENTITY_FRAME(m)      ((m) | IDENTITY_FRAME_BIT)
 
-/* Maximum amount of memory we can handle in a domain in pages */
-#define MAX_DOMAIN_PAGES                                               \
-    ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
+#define P2M_PER_PAGE           (PAGE_SIZE / sizeof(unsigned long))
 
 extern unsigned long *machine_to_phys_mapping;
 extern unsigned long  machine_to_phys_nr;
@@ -48,8 +46,8 @@ extern unsigned long  xen_max_p2m_pfn;
 extern unsigned long get_phys_to_machine(unsigned long pfn);
 extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
 extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
-extern unsigned long set_phys_range_identity(unsigned long pfn_s,
-                                            unsigned long pfn_e);
+extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
+                                                   unsigned long pfn_e);
 
 extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
                                   struct gnttab_map_grant_ref *kmap_ops,
index 4841453..c7b15f3 100644 (file)
@@ -7,6 +7,7 @@ config XEN
        depends on PARAVIRT
        select PARAVIRT_CLOCK
        select XEN_HAVE_PVMMU
+       select XEN_HAVE_VPMU
        depends on X86_64 || (X86_32 && X86_PAE)
        depends on X86_LOCAL_APIC && X86_TSC
        help
@@ -23,14 +24,18 @@ config XEN_PVHVM
        def_bool y
        depends on XEN && PCI && X86_LOCAL_APIC
 
-config XEN_MAX_DOMAIN_MEMORY
-       int
-       default 500 if X86_64
-       default 64 if X86_32
-       depends on XEN
-       help
-         This only affects the sizing of some bss arrays, the unused
-         portions of which are freed.
+config XEN_512GB
+       bool "Limit Xen pv-domain memory to 512GB"
+       depends on XEN && X86_64
+       default y
+       help
+         Limit paravirtualized user domains to 512GB of RAM.
+
+         The Xen tools and crash dump analysis tools might not support
+         pv-domains with more than 512 GB of RAM. This option controls the
+         default setting of the kernel to use only up to 512 GB or more.
+         It is always possible to change the default via specifying the
+         boot parameter "xen_512gb_limit".
 
 config XEN_SAVE_RESTORE
        bool
index 4b6e29a..e47e527 100644 (file)
@@ -13,7 +13,7 @@ CFLAGS_mmu.o                  := $(nostackp)
 obj-y          := enlighten.o setup.o multicalls.o mmu.o irq.o \
                        time.o xen-asm.o xen-asm_$(BITS).o \
                        grant-table.o suspend.o platform-pci-unplug.o \
-                       p2m.o apic.o
+                       p2m.o apic.o pmu.o
 
 obj-$(CONFIG_EVENT_TRACING) += trace.o
 
index 70e060a..acda713 100644 (file)
@@ -7,6 +7,7 @@
 #include <xen/xen.h>
 #include <xen/interface/physdev.h>
 #include "xen-ops.h"
+#include "pmu.h"
 #include "smp.h"
 
 static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
@@ -72,6 +73,11 @@ static u32 xen_apic_read(u32 reg)
 
 static void xen_apic_write(u32 reg, u32 val)
 {
+       if (reg == APIC_LVTPC) {
+               (void)pmu_apic_update(reg);
+               return;
+       }
+
        /* Warn to see if there's any stray references */
        WARN(1,"register: %x, value: %x\n", reg, val);
 }
index d9cfa45..30d12af 100644 (file)
@@ -84,6 +84,7 @@
 #include "mmu.h"
 #include "smp.h"
 #include "multicalls.h"
+#include "pmu.h"
 
 EXPORT_SYMBOL_GPL(hypercall_page);
 
@@ -1010,8 +1011,7 @@ static void xen_write_cr0(unsigned long cr0)
 
 static void xen_write_cr4(unsigned long cr4)
 {
-       cr4 &= ~X86_CR4_PGE;
-       cr4 &= ~X86_CR4_PSE;
+       cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);
 
        native_write_cr4(cr4);
 }
@@ -1030,6 +1030,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
 {
        u64 val;
 
+       if (pmu_msr_read(msr, &val, err))
+               return val;
+
        val = native_read_msr_safe(msr, err);
        switch (msr) {
        case MSR_IA32_APICBASE:
@@ -1076,7 +1079,8 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
                   Xen console noise. */
 
        default:
-               ret = native_write_msr_safe(msr, low, high);
+               if (!pmu_msr_write(msr, low, high, &ret))
+                       ret = native_write_msr_safe(msr, low, high);
        }
 
        return ret;
@@ -1215,7 +1219,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
        .read_msr = xen_read_msr_safe,
        .write_msr = xen_write_msr_safe,
 
-       .read_pmc = native_read_pmc,
+       .read_pmc = xen_read_pmc,
 
        .iret = xen_iret,
 #ifdef CONFIG_X86_64
@@ -1264,6 +1268,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = {
 static void xen_reboot(int reason)
 {
        struct sched_shutdown r = { .reason = reason };
+       int cpu;
+
+       for_each_online_cpu(cpu)
+               xen_pmu_finish(cpu);
 
        if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
                BUG();
@@ -1607,7 +1615,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
        early_boot_irqs_disabled = true;
 
        xen_raw_console_write("mapping kernel into physical memory\n");
-       xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages);
+       xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
+                                  xen_start_info->nr_pages);
+       xen_reserve_special_pages();
 
        /*
         * Modify the cache mode translation tables to match Xen's PAT
index dd151b2..2c50b44 100644 (file)
@@ -116,6 +116,7 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
 DEFINE_PER_CPU(unsigned long, xen_cr3);         /* cr3 stored as physaddr */
 DEFINE_PER_CPU(unsigned long, xen_current_cr3);         /* actual vcpu cr3 */
 
+static phys_addr_t xen_pt_base, xen_pt_size __initdata;
 
 /*
  * Just beyond the highest usermode address.  STACK_TOP_MAX has a
@@ -1093,6 +1094,16 @@ static void xen_exit_mmap(struct mm_struct *mm)
 
 static void xen_post_allocator_init(void);
 
+static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+       struct mmuext_op op;
+
+       op.cmd = cmd;
+       op.arg1.mfn = pfn_to_mfn(pfn);
+       if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+               BUG();
+}
+
 #ifdef CONFIG_X86_64
 static void __init xen_cleanhighmap(unsigned long vaddr,
                                    unsigned long vaddr_end)
@@ -1114,6 +1125,83 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
        xen_mc_flush();
 }
 
+/*
+ * Make a page range writeable and free it.
+ */
+static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
+{
+       void *vaddr = __va(paddr);
+       void *vaddr_end = vaddr + size;
+
+       for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
+               make_lowmem_page_readwrite(vaddr);
+
+       memblock_free(paddr, size);
+}
+
+static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
+{
+       unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
+
+       if (unpin)
+               pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
+       ClearPagePinned(virt_to_page(__va(pa)));
+       xen_free_ro_pages(pa, PAGE_SIZE);
+}
+
+/*
+ * Since it is well isolated we can (and since it is perhaps large we should)
+ * also free the page tables mapping the initial P->M table.
+ */
+static void __init xen_cleanmfnmap(unsigned long vaddr)
+{
+       unsigned long va = vaddr & PMD_MASK;
+       unsigned long pa;
+       pgd_t *pgd = pgd_offset_k(va);
+       pud_t *pud_page = pud_offset(pgd, 0);
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       unsigned int i;
+       bool unpin;
+
+       unpin = (vaddr == 2 * PGDIR_SIZE);
+       set_pgd(pgd, __pgd(0));
+       do {
+               pud = pud_page + pud_index(va);
+               if (pud_none(*pud)) {
+                       va += PUD_SIZE;
+               } else if (pud_large(*pud)) {
+                       pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+                       xen_free_ro_pages(pa, PUD_SIZE);
+                       va += PUD_SIZE;
+               } else {
+                       pmd = pmd_offset(pud, va);
+                       if (pmd_large(*pmd)) {
+                               pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+                               xen_free_ro_pages(pa, PMD_SIZE);
+                       } else if (!pmd_none(*pmd)) {
+                               pte = pte_offset_kernel(pmd, va);
+                               set_pmd(pmd, __pmd(0));
+                               for (i = 0; i < PTRS_PER_PTE; ++i) {
+                                       if (pte_none(pte[i]))
+                                               break;
+                                       pa = pte_pfn(pte[i]) << PAGE_SHIFT;
+                                       xen_free_ro_pages(pa, PAGE_SIZE);
+                               }
+                               xen_cleanmfnmap_free_pgtbl(pte, unpin);
+                       }
+                       va += PMD_SIZE;
+                       if (pmd_index(va))
+                               continue;
+                       set_pud(pud, __pud(0));
+                       xen_cleanmfnmap_free_pgtbl(pmd, unpin);
+               }
+
+       } while (pud_index(va) || pmd_index(va));
+       xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
+}
+
 static void __init xen_pagetable_p2m_free(void)
 {
        unsigned long size;
@@ -1128,18 +1216,31 @@ static void __init xen_pagetable_p2m_free(void)
        /* using __ka address and sticking INVALID_P2M_ENTRY! */
        memset((void *)xen_start_info->mfn_list, 0xff, size);
 
-       /* We should be in __ka space. */
-       BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
        addr = xen_start_info->mfn_list;
-       /* We roundup to the PMD, which means that if anybody at this stage is
-        * using the __ka address of xen_start_info or xen_start_info->shared_info
-        * they are in going to crash. Fortunatly we have already revectored
-        * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
+       /*
+        * We could be in __ka space.
+        * We roundup to the PMD, which means that if anybody at this stage is
+        * using the __ka address of xen_start_info or
+        * xen_start_info->shared_info they are in going to crash. Fortunatly
+        * we have already revectored in xen_setup_kernel_pagetable and in
+        * xen_setup_shared_info.
+        */
        size = roundup(size, PMD_SIZE);
-       xen_cleanhighmap(addr, addr + size);
 
-       size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
-       memblock_free(__pa(xen_start_info->mfn_list), size);
+       if (addr >= __START_KERNEL_map) {
+               xen_cleanhighmap(addr, addr + size);
+               size = PAGE_ALIGN(xen_start_info->nr_pages *
+                                 sizeof(unsigned long));
+               memblock_free(__pa(addr), size);
+       } else {
+               xen_cleanmfnmap(addr);
+       }
+}
+
+static void __init xen_pagetable_cleanhighmap(void)
+{
+       unsigned long size;
+       unsigned long addr;
 
        /* At this stage, cleanup_highmap has already cleaned __ka space
         * from _brk_limit way up to the max_pfn_mapped (which is the end of
@@ -1172,6 +1273,8 @@ static void __init xen_pagetable_p2m_setup(void)
 
 #ifdef CONFIG_X86_64
        xen_pagetable_p2m_free();
+
+       xen_pagetable_cleanhighmap();
 #endif
        /* And revector! Bye bye old array */
        xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
@@ -1461,6 +1564,24 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 #else /* CONFIG_X86_64 */
 static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 {
+       unsigned long pfn;
+
+       if (xen_feature(XENFEAT_writable_page_tables) ||
+           xen_feature(XENFEAT_auto_translated_physmap) ||
+           xen_start_info->mfn_list >= __START_KERNEL_map)
+               return pte;
+
+       /*
+        * Pages belonging to the initial p2m list mapped outside the default
+        * address range must be mapped read-only. This region contains the
+        * page tables for mapping the p2m list, too, and page tables MUST be
+        * mapped read-only.
+        */
+       pfn = pte_pfn(pte);
+       if (pfn >= xen_start_info->first_p2m_pfn &&
+           pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
+               pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
+
        return pte;
 }
 #endif /* CONFIG_X86_64 */
@@ -1489,15 +1610,6 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
        native_set_pte(ptep, pte);
 }
 
-static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
-{
-       struct mmuext_op op;
-       op.cmd = cmd;
-       op.arg1.mfn = pfn_to_mfn(pfn);
-       if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-               BUG();
-}
-
 /* Early in boot, while setting up the initial pagetable, assume
    everything is pinned. */
 static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
@@ -1815,7 +1927,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
         * mappings. Considering that on Xen after the kernel mappings we
         * have the mappings of some pages that don't exist in pfn space, we
         * set max_pfn_mapped to the last real pfn mapped. */
-       max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
+       if (xen_start_info->mfn_list < __START_KERNEL_map)
+               max_pfn_mapped = xen_start_info->first_p2m_pfn;
+       else
+               max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
 
        pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
        pt_end = pt_base + xen_start_info->nr_pt_frames;
@@ -1855,6 +1970,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
        /* Graft it onto L4[511][510] */
        copy_page(level2_kernel_pgt, l2);
 
+       /* Copy the initial P->M table mappings if necessary. */
+       i = pgd_index(xen_start_info->mfn_list);
+       if (i && i < pgd_index(__START_KERNEL_map))
+               init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
+
        if (!xen_feature(XENFEAT_auto_translated_physmap)) {
                /* Make pagetable pieces RO */
                set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
@@ -1894,10 +2014,192 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
                check_pt_base(&pt_base, &pt_end, addr[i]);
 
        /* Our (by three pages) smaller Xen pagetable that we are using */
-       memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE);
+       xen_pt_base = PFN_PHYS(pt_base);
+       xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
+       memblock_reserve(xen_pt_base, xen_pt_size);
+
        /* Revector the xen_start_info */
        xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
 }
+
+/*
+ * Read a value from a physical address.
+ *
+ * Uses a transient early_memremap_ro() mapping, so this is safe to call
+ * before the final kernel page tables are established.
+ */
+static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
+{
+       unsigned long *vaddr;
+       unsigned long val;
+
+       vaddr = early_memremap_ro(addr, sizeof(val));
+       val = *vaddr;
+       early_memunmap(vaddr, sizeof(val));
+       return val;
+}
+
+/*
+ * Translate a virtual address to a physical one without relying on mapped
+ * page tables.
+ *
+ * Walks the live page tables by reading each level through its physical
+ * address (via xen_read_phys_ulong()), starting from CR3.
+ * Returns 0 when @vaddr is not mapped at some level.
+ */
+static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
+{
+       phys_addr_t pa;
+       pgd_t pgd;
+       pud_t pud;
+       pmd_t pmd;
+       pte_t pte;
+
+       pa = read_cr3();
+       pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
+                                                      sizeof(pgd)));
+       if (!pgd_present(pgd))
+               return 0;
+
+       pa = pgd_val(pgd) & PTE_PFN_MASK;
+       pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
+                                                      sizeof(pud)));
+       if (!pud_present(pud))
+               return 0;
+       pa = pud_pfn(pud) << PAGE_SHIFT;
+       /* 1 GiB mapping: offset within the huge page. */
+       if (pud_large(pud))
+               return pa + (vaddr & ~PUD_MASK);
+
+       pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
+                                                      sizeof(pmd)));
+       if (!pmd_present(pmd))
+               return 0;
+       pa = pmd_pfn(pmd) << PAGE_SHIFT;
+       /* 2 MiB mapping: offset within the huge page. */
+       if (pmd_large(pmd))
+               return pa + (vaddr & ~PMD_MASK);
+
+       pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
+                                                      sizeof(pte)));
+       if (!pte_present(pte))
+               return 0;
+       pa = pte_pfn(pte) << PAGE_SHIFT;
+
+       return pa | (vaddr & ~PAGE_MASK);
+}
+
+/*
+ * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
+ * this area.
+ *
+ * Used when the original p2m location conflicts with the E820 map: a fresh
+ * set of (pinned, read-only) page tables is built to map the list at a new
+ * virtual address, the data is copied over, and the old frames are returned
+ * to memblock.
+ */
+void __init xen_relocate_p2m(void)
+{
+       phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
+       unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
+       int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
+       pte_t *pt;
+       pmd_t *pmd;
+       pud_t *pud;
+       pgd_t *pgd;
+       unsigned long *new_p2m;
+
+       /* Frames needed: the p2m data itself plus one extra frame per
+        * paging-structure level required to map it. */
+       size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+       n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
+       n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
+       n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
+       n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
+       n_frames = n_pte + n_pt + n_pmd + n_pud;
+
+       new_area = xen_find_free_area(PFN_PHYS(n_frames));
+       if (!new_area) {
+               xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
+               BUG();
+       }
+
+       /*
+        * Setup the page tables for addressing the new p2m list.
+        * We have asked the hypervisor to map the p2m list at the user address
+        * PUD_SIZE. It may have done so, or it may have used a kernel space
+        * address depending on the Xen version.
+        * To avoid any possible virtual address collision, just use
+        * 2 * PUD_SIZE for the new area.
+        */
+       pud_phys = new_area;
+       pmd_phys = pud_phys + PFN_PHYS(n_pud);
+       pt_phys = pmd_phys + PFN_PHYS(n_pmd);
+       p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
+
+       pgd = __va(read_cr3());
+       /* NOTE(review): the comment above says 2 * PUD_SIZE, but the code maps
+        * the new list at 2 * PGDIR_SIZE (pgd slot 2, see set_pgd below) --
+        * confirm which of the two is the intended invariant. */
+       new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
+       for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
+               pud = early_memremap(pud_phys, PAGE_SIZE);
+               clear_page(pud);
+               for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
+                    idx_pmd++) {
+                       pmd = early_memremap(pmd_phys, PAGE_SIZE);
+                       clear_page(pmd);
+                       for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
+                            idx_pt++) {
+                               pt = early_memremap(pt_phys, PAGE_SIZE);
+                               clear_page(pt);
+                               for (idx_pte = 0;
+                                    idx_pte < min(n_pte, PTRS_PER_PTE);
+                                    idx_pte++) {
+                                       set_pte(pt + idx_pte,
+                                               pfn_pte(p2m_pfn, PAGE_KERNEL));
+                                       p2m_pfn++;
+                               }
+                               n_pte -= PTRS_PER_PTE;
+                               early_memunmap(pt, PAGE_SIZE);
+                               /* Page tables must be RO before pinning. */
+                               make_lowmem_page_readonly(__va(pt_phys));
+                               pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
+                                                 PFN_DOWN(pt_phys));
+                               set_pmd(pmd + idx_pt,
+                                       __pmd(_PAGE_TABLE | pt_phys));
+                               pt_phys += PAGE_SIZE;
+                       }
+                       n_pt -= PTRS_PER_PMD;
+                       early_memunmap(pmd, PAGE_SIZE);
+                       make_lowmem_page_readonly(__va(pmd_phys));
+                       pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
+                                         PFN_DOWN(pmd_phys));
+                       set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
+                       pmd_phys += PAGE_SIZE;
+               }
+               n_pmd -= PTRS_PER_PUD;
+               early_memunmap(pud, PAGE_SIZE);
+               make_lowmem_page_readonly(__va(pud_phys));
+               pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
+               set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
+               pud_phys += PAGE_SIZE;
+       }
+
+       /* Now copy the old p2m info to the new area. */
+       memcpy(new_p2m, xen_p2m_addr, size);
+       xen_p2m_addr = new_p2m;
+
+       /* Release the old p2m list and set new list info. */
+       p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
+       BUG_ON(!p2m_pfn);
+       p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
+
+       if (xen_start_info->mfn_list < __START_KERNEL_map) {
+               pfn = xen_start_info->first_p2m_pfn;
+               pfn_end = xen_start_info->first_p2m_pfn +
+                         xen_start_info->nr_p2m_frames;
+               set_pgd(pgd + 1, __pgd(0));
+       } else {
+               pfn = p2m_pfn;
+               pfn_end = p2m_pfn_end;
+       }
+
+       /* Return the old area to memblock and restore RW mappings for its
+        * former page-table frames, skipping the pure p2m data pages
+        * (p2m_pfn .. p2m_pfn_end). */
+       memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
+       while (pfn < pfn_end) {
+               if (pfn == p2m_pfn) {
+                       pfn = p2m_pfn_end;
+                       continue;
+               }
+               make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+               pfn++;
+       }
+
+       /* Publish the relocated list via start_info for later boot code. */
+       xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
+       xen_start_info->first_p2m_pfn =  PFN_DOWN(new_area);
+       xen_start_info->nr_p2m_frames = n_frames;
+}
+
 #else  /* !CONFIG_X86_64 */
 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
 static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
@@ -1938,18 +2240,41 @@ static void __init xen_write_cr3_init(unsigned long cr3)
        pv_mmu_ops.write_cr3 = &xen_write_cr3;
 }
 
+/*
+ * For 32 bit domains xen_start_info->pt_base is the pgd address which might be
+ * not the first page table in the page table pool.
+ * Iterate through the initial page tables to find the real page table base.
+ *
+ * Returns the lowest physical address used by the initial page tables;
+ * large-page PMD entries carry no lower-level table and are ignored.
+ */
+static phys_addr_t xen_find_pt_base(pmd_t *pmd)
+{
+       phys_addr_t pt_base, paddr;
+       unsigned pmdidx;
+
+       pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
+
+       for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
+               if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
+                       /* m2p(): machine address in the entry -> physical. */
+                       paddr = m2p(pmd[pmdidx].pmd);
+                       pt_base = min(pt_base, paddr);
+               }
+
+       return pt_base;
+}
+
 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 {
        pmd_t *kernel_pmd;
 
+       kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+
+       xen_pt_base = xen_find_pt_base(kernel_pmd);
+       xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
+
        initial_kernel_pmd =
                extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
 
-       max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
-                                 xen_start_info->nr_pt_frames * PAGE_SIZE +
-                                 512*1024);
+       max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);
 
-       kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
        copy_page(initial_kernel_pmd, kernel_pmd);
 
        xen_map_identity_early(initial_kernel_pmd, max_pfn);
@@ -1968,11 +2293,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
                          PFN_DOWN(__pa(initial_page_table)));
        xen_write_cr3(__pa(initial_page_table));
 
-       memblock_reserve(__pa(xen_start_info->pt_base),
-                        xen_start_info->nr_pt_frames * PAGE_SIZE);
+       memblock_reserve(xen_pt_base, xen_pt_size);
 }
 #endif /* CONFIG_X86_64 */
 
+/*
+ * Reserve pages populated by the hypervisor/toolstack so they are never
+ * handed to the page allocator: the start_info page, the xenstore ring page
+ * and, for unprivileged domains, the console ring page.
+ */
+void __init xen_reserve_special_pages(void)
+{
+       phys_addr_t paddr;
+
+       memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
+       if (xen_start_info->store_mfn) {
+               paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
+               memblock_reserve(paddr, PAGE_SIZE);
+       }
+       /* NOTE(review): dom0 presumably has no domU-style console RAM page,
+        * hence the !xen_initial_domain() guard -- confirm. */
+       if (!xen_initial_domain()) {
+               paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
+               memblock_reserve(paddr, PAGE_SIZE);
+       }
+}
+
+/*
+ * Sanity check at boot: the page-table memory the hypervisor allocated for
+ * us (xen_pt_base/xen_pt_size) must not overlap an E820 reserved region;
+ * there is no way to recover if it does.
+ */
+void __init xen_pt_check_e820(void)
+{
+       if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
+               xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
+               BUG();
+       }
+}
+
 static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
 
 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
index 8b7f18e..bfc08b1 100644 (file)
 #include <xen/balloon.h>
 #include <xen/grant_table.h>
 
-#include "p2m.h"
 #include "multicalls.h"
 #include "xen-ops.h"
 
+/* Geometry of the 3-level mfn_list_list structure (moved here from the
+ * deleted p2m.h); MAX_P2M_PFN is the highest pfn it can represent. */
+#define P2M_MID_PER_PAGE       (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE       (PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN    (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
 #define PMDS_PER_MID_PAGE      (P2M_MID_PER_PAGE / PTRS_PER_PTE)
 
 unsigned long *xen_p2m_addr __read_mostly;
@@ -199,7 +203,8 @@ void __ref xen_build_mfn_list_list(void)
        unsigned int level, topidx, mididx;
        unsigned long *mid_mfn_p;
 
-       if (xen_feature(XENFEAT_auto_translated_physmap))
+       if (xen_feature(XENFEAT_auto_translated_physmap) ||
+           xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
                return;
 
        /* Pre-initialize p2m_top_mfn to be completely missing */
@@ -260,9 +265,16 @@ void xen_setup_mfn_list_list(void)
 
        BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 
-       HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
-               virt_to_mfn(p2m_top_mfn);
+       if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
+               HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ~0UL;
+       else
+               HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+                       virt_to_mfn(p2m_top_mfn);
        HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
+       HYPERVISOR_shared_info->arch.p2m_generation = 0;
+       HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr;
+       HYPERVISOR_shared_info->arch.p2m_cr3 =
+               xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
 }
 
 /* Set up p2m_top to point to the domain-builder provided p2m pages */
@@ -478,8 +490,12 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
 
                ptechk = lookup_address(vaddr, &level);
                if (ptechk == pte_pg) {
+                       HYPERVISOR_shared_info->arch.p2m_generation++;
+                       wmb(); /* Tools are synchronizing via p2m_generation. */
                        set_pmd(pmdp,
                                __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
+                       wmb(); /* Tools are synchronizing via p2m_generation. */
+                       HYPERVISOR_shared_info->arch.p2m_generation++;
                        pte_newpg[i] = NULL;
                }
 
@@ -505,7 +521,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
  */
 static bool alloc_p2m(unsigned long pfn)
 {
-       unsigned topidx, mididx;
+       unsigned topidx;
        unsigned long *top_mfn_p, *mid_mfn;
        pte_t *ptep, *pte_pg;
        unsigned int level;
@@ -513,9 +529,6 @@ static bool alloc_p2m(unsigned long pfn)
        unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
        unsigned long p2m_pfn;
 
-       topidx = p2m_top_index(pfn);
-       mididx = p2m_mid_index(pfn);
-
        ptep = lookup_address(addr, &level);
        BUG_ON(!ptep || level != PG_LEVEL_4K);
        pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
@@ -527,7 +540,8 @@ static bool alloc_p2m(unsigned long pfn)
                        return false;
        }
 
-       if (p2m_top_mfn) {
+       if (p2m_top_mfn && pfn < MAX_P2M_PFN) {
+               topidx = p2m_top_index(pfn);
                top_mfn_p = &p2m_top_mfn[topidx];
                mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
 
@@ -577,10 +591,14 @@ static bool alloc_p2m(unsigned long pfn)
                spin_lock_irqsave(&p2m_update_lock, flags);
 
                if (pte_pfn(*ptep) == p2m_pfn) {
+                       HYPERVISOR_shared_info->arch.p2m_generation++;
+                       wmb(); /* Tools are synchronizing via p2m_generation. */
                        set_pte(ptep,
                                pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL));
+                       wmb(); /* Tools are synchronizing via p2m_generation. */
+                       HYPERVISOR_shared_info->arch.p2m_generation++;
                        if (mid_mfn)
-                               mid_mfn[mididx] = virt_to_mfn(p2m);
+                               mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m);
                        p2m = NULL;
                }
 
@@ -630,6 +648,11 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
                return true;
        }
 
+       /*
+        * The interface requires atomic updates on p2m elements.
+        * xen_safe_write_ulong() is using __put_user which does an atomic
+        * store via asm().
+        */
        if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn)))
                return true;
 
diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h
deleted file mode 100644 (file)
index ad8aee2..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef _XEN_P2M_H
-#define _XEN_P2M_H
-
-#define P2M_PER_PAGE        (PAGE_SIZE / sizeof(unsigned long))
-#define P2M_MID_PER_PAGE    (PAGE_SIZE / sizeof(unsigned long *))
-#define P2M_TOP_PER_PAGE    (PAGE_SIZE / sizeof(unsigned long **))
-
-#define MAX_P2M_PFN         (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
-
-#define MAX_REMAP_RANGES    10
-
-extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
-                                      unsigned long pfn_e);
-
-#endif  /* _XEN_P2M_H */
index a826171..9586ff3 100644 (file)
@@ -68,7 +68,7 @@ static int check_platform_magic(void)
        return 0;
 }
 
-bool xen_has_pv_devices()
+bool xen_has_pv_devices(void)
 {
        if (!xen_domain())
                return false;
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
new file mode 100644 (file)
index 0000000..724a087
--- /dev/null
@@ -0,0 +1,570 @@
+#include <linux/types.h>
+#include <linux/interrupt.h>
+
+#include <asm/xen/hypercall.h>
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+#include <xen/interface/xenpmu.h>
+
+#include "xen-ops.h"
+#include "pmu.h"
+
+/* x86_pmu.handle_irq definition */
+#include "../kernel/cpu/perf_event.h"
+
+/* Flag bit in xenpmu.flags -- set while a PMU interrupt is being handled,
+ * presumably gating MSR emulation against the shared page; confirm against
+ * the (not shown here) irq handler. */
+#define XENPMU_IRQ_PROCESSING    1
+/* Per-cpu PMU state shared with the hypervisor. */
+struct xenpmu {
+       /* Shared page between hypervisor and domain */
+       struct xen_pmu_data *xenpmu_data;
+
+       uint8_t flags;
+};
+static DEFINE_PER_CPU(struct xenpmu, xenpmu_shared);
+#define get_xenpmu_data()    (this_cpu_ptr(&xenpmu_shared)->xenpmu_data)
+#define get_xenpmu_flags()   (this_cpu_ptr(&xenpmu_shared)->flags)
+
+/* Macro for computing address of a PMU MSR bank */
+#define field_offset(ctxt, field) ((void *)((uintptr_t)ctxt + \
+                                           (uintptr_t)ctxt->field))
+
+/* AMD PMU */
+#define F15H_NUM_COUNTERS   6
+#define F10H_NUM_COUNTERS   4
+
+/* Filled in by xen_pmu_arch_init() from the boot CPU's family. */
+static __read_mostly uint32_t amd_counters_base;
+static __read_mostly uint32_t amd_ctrls_base;
+static __read_mostly int amd_msr_step;
+static __read_mostly int k7_counters_mirrored;
+static __read_mostly int amd_num_counters;
+
+/* Intel PMU */
+#define MSR_TYPE_COUNTER            0
+#define MSR_TYPE_CTRL               1
+#define MSR_TYPE_GLOBAL             2
+#define MSR_TYPE_ARCH_COUNTER       3
+#define MSR_TYPE_ARCH_CTRL          4
+
+/* Number of general pmu registers (CPUID.EAX[0xa].EAX[8..15]) */
+#define PMU_GENERAL_NR_SHIFT        8
+#define PMU_GENERAL_NR_BITS         8
+#define PMU_GENERAL_NR_MASK         (((1 << PMU_GENERAL_NR_BITS) - 1) \
+                                    << PMU_GENERAL_NR_SHIFT)
+
+/* Number of fixed pmu registers (CPUID.EDX[0xa].EDX[0..4]) */
+#define PMU_FIXED_NR_SHIFT          0
+#define PMU_FIXED_NR_BITS           5
+#define PMU_FIXED_NR_MASK           (((1 << PMU_FIXED_NR_BITS) - 1) \
+                                    << PMU_FIXED_NR_SHIFT)
+
+/* Alias registers (0x4c1) for full-width writes to PMCs */
+#define MSR_PMC_ALIAS_MASK          (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0))
+
+/* Counter numbers with this bit set address fixed-function counters. */
+#define INTEL_PMC_TYPE_SHIFT        30
+
+/* Filled in by xen_pmu_arch_init() from CPUID leaf 0xa. */
+static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters;
+
+
+/*
+ * Probe the vendor PMU layout at init time: counter/control MSR bases,
+ * MSR stride and counter counts used by the emulation code below.
+ */
+static void xen_pmu_arch_init(void)
+{
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+
+               switch (boot_cpu_data.x86) {
+               case 0x15:
+                       amd_num_counters = F15H_NUM_COUNTERS;
+                       amd_counters_base = MSR_F15H_PERF_CTR;
+                       amd_ctrls_base = MSR_F15H_PERF_CTL;
+                       amd_msr_step = 2;
+                       k7_counters_mirrored = 1;
+                       break;
+               case 0x10:
+               case 0x12:
+               case 0x14:
+               case 0x16:
+               default:
+                       /* Unknown AMD families fall back to the K7 layout. */
+                       amd_num_counters = F10H_NUM_COUNTERS;
+                       amd_counters_base = MSR_K7_PERFCTR0;
+                       amd_ctrls_base = MSR_K7_EVNTSEL0;
+                       amd_msr_step = 1;
+                       k7_counters_mirrored = 0;
+                       break;
+               }
+       } else {
+               /* NOTE(review): any non-AMD vendor is probed via the Intel
+                * architectural CPUID leaf 0xa -- confirm that is intended. */
+               uint32_t eax, ebx, ecx, edx;
+
+               cpuid(0xa, &eax, &ebx, &ecx, &edx);
+
+               intel_num_arch_counters = (eax & PMU_GENERAL_NR_MASK) >>
+                       PMU_GENERAL_NR_SHIFT;
+               intel_num_fixed_counters = (edx & PMU_FIXED_NR_MASK) >>
+                       PMU_FIXED_NR_SHIFT;
+       }
+}
+
+/*
+ * Translate a legacy K7 perf MSR address to its Fam15h equivalent; any
+ * other address is returned unchanged.
+ */
+static inline uint32_t get_fam15h_addr(u32 addr)
+{
+       switch (addr) {
+       case MSR_K7_PERFCTR0:
+       case MSR_K7_PERFCTR1:
+       case MSR_K7_PERFCTR2:
+       case MSR_K7_PERFCTR3:
+               return MSR_F15H_PERF_CTR + (addr - MSR_K7_PERFCTR0);
+       case MSR_K7_EVNTSEL0:
+       case MSR_K7_EVNTSEL1:
+       case MSR_K7_EVNTSEL2:
+       case MSR_K7_EVNTSEL3:
+               return MSR_F15H_PERF_CTL + (addr - MSR_K7_EVNTSEL0);
+       default:
+               break;
+       }
+
+       return addr;
+}
+
+/*
+ * True if @msr falls in either the Fam15h or the legacy K7 perf MSR range.
+ * NOTE(review): the first range spans PERF_CTL up to PERF_CTR + 2*n, which
+ * relies on the Fam15h CTL/CTR MSRs being interleaved -- confirm.
+ */
+static inline bool is_amd_pmu_msr(unsigned int msr)
+{
+       if ((msr >= MSR_F15H_PERF_CTL &&
+            msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) ||
+           (msr >= MSR_K7_EVNTSEL0 &&
+            msr < MSR_K7_PERFCTR0 + amd_num_counters))
+               return true;
+
+       return false;
+}
+
+/*
+ * Classify an Intel perf MSR.  Returns true (as int) and sets *type; *index
+ * is additionally set for counter/arch-control MSRs, but is left untouched
+ * for CTRL/GLOBAL MSRs -- callers must not rely on it in those cases.
+ */
+static int is_intel_pmu_msr(u32 msr_index, int *type, int *index)
+{
+       u32 msr_index_pmc;
+
+       switch (msr_index) {
+       case MSR_CORE_PERF_FIXED_CTR_CTRL:
+       case MSR_IA32_DS_AREA:
+       case MSR_IA32_PEBS_ENABLE:
+               *type = MSR_TYPE_CTRL;
+               return true;
+
+       case MSR_CORE_PERF_GLOBAL_CTRL:
+       case MSR_CORE_PERF_GLOBAL_STATUS:
+       case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+               *type = MSR_TYPE_GLOBAL;
+               return true;
+
+       default:
+
+               if ((msr_index >= MSR_CORE_PERF_FIXED_CTR0) &&
+                   (msr_index < MSR_CORE_PERF_FIXED_CTR0 +
+                                intel_num_fixed_counters)) {
+                       *index = msr_index - MSR_CORE_PERF_FIXED_CTR0;
+                       *type = MSR_TYPE_COUNTER;
+                       return true;
+               }
+
+               if ((msr_index >= MSR_P6_EVNTSEL0) &&
+                   (msr_index < MSR_P6_EVNTSEL0 +  intel_num_arch_counters)) {
+                       *index = msr_index - MSR_P6_EVNTSEL0;
+                       *type = MSR_TYPE_ARCH_CTRL;
+                       return true;
+               }
+
+               /* Strip the full-width-alias bit so 0x4c1 aliases match. */
+               msr_index_pmc = msr_index & MSR_PMC_ALIAS_MASK;
+               if ((msr_index_pmc >= MSR_IA32_PERFCTR0) &&
+                   (msr_index_pmc < MSR_IA32_PERFCTR0 +
+                                    intel_num_arch_counters)) {
+                       *type = MSR_TYPE_ARCH_COUNTER;
+                       *index = msr_index_pmc - MSR_IA32_PERFCTR0;
+                       return true;
+               }
+               return false;
+       }
+}
+
+/*
+ * Emulate a read/write of an Intel PMU MSR against the per-cpu PMU context
+ * shared with the hypervisor.  Only active while a PMU interrupt is being
+ * processed; otherwise returns false and the caller falls back to the real
+ * MSR access.
+ */
+static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type,
+                                 int index, bool is_read)
+{
+       uint64_t *reg = NULL;
+       struct xen_pmu_intel_ctxt *ctxt;
+       uint64_t *fix_counters;
+       struct xen_pmu_cntr_pair *arch_cntr_pair;
+       struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+       uint8_t xenpmu_flags = get_xenpmu_flags();
+
+
+       if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
+               return false;
+
+       ctxt = &xenpmu_data->pmu.c.intel;
+
+       switch (msr) {
+       case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+               reg = &ctxt->global_ovf_ctrl;
+               break;
+       case MSR_CORE_PERF_GLOBAL_STATUS:
+               reg = &ctxt->global_status;
+               break;
+       case MSR_CORE_PERF_GLOBAL_CTRL:
+               reg = &ctxt->global_ctrl;
+               break;
+       case MSR_CORE_PERF_FIXED_CTR_CTRL:
+               reg = &ctxt->fixed_ctrl;
+               break;
+       default:
+               switch (type) {
+               case MSR_TYPE_COUNTER:
+                       fix_counters = field_offset(ctxt, fixed_counters);
+                       reg = &fix_counters[index];
+                       break;
+               case MSR_TYPE_ARCH_COUNTER:
+                       arch_cntr_pair = field_offset(ctxt, arch_counters);
+                       reg = &arch_cntr_pair[index].counter;
+                       break;
+               case MSR_TYPE_ARCH_CTRL:
+                       arch_cntr_pair = field_offset(ctxt, arch_counters);
+                       reg = &arch_cntr_pair[index].control;
+                       break;
+               default:
+                       return false;
+               }
+       }
+
+       if (reg) {
+               if (is_read)
+                       *val = *reg;
+               else {
+                       *reg = *val;
+
+                       /* Writing 1-bits to OVF_CTRL clears the matching
+                        * status bits, as on real hardware. */
+                       if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL)
+                               ctxt->global_status &= (~(*val));
+               }
+               return true;
+       }
+
+       return false;
+}
+
+/*
+ * Emulate a read/write of an AMD PMU MSR against the per-cpu PMU context
+ * shared with the hypervisor.  Only active while a PMU interrupt is being
+ * processed; otherwise returns false and the caller falls back to the real
+ * MSR access.
+ */
+static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
+{
+       uint64_t *reg = NULL;
+       int i, off = 0;
+       struct xen_pmu_amd_ctxt *ctxt;
+       uint64_t *counter_regs, *ctrl_regs;
+       struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+       uint8_t xenpmu_flags = get_xenpmu_flags();
+
+       if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
+               return false;
+
+       /* On Fam15h the legacy K7 MSRs mirror the new ones; canonicalize. */
+       if (k7_counters_mirrored &&
+           ((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3)))
+               msr = get_fam15h_addr(msr);
+
+       ctxt = &xenpmu_data->pmu.c.amd;
+       for (i = 0; i < amd_num_counters; i++) {
+               if (msr == amd_ctrls_base + off) {
+                       ctrl_regs = field_offset(ctxt, ctrls);
+                       reg = &ctrl_regs[i];
+                       break;
+               } else if (msr == amd_counters_base + off) {
+                       counter_regs = field_offset(ctxt, counters);
+                       reg = &counter_regs[i];
+                       break;
+               }
+               off += amd_msr_step;
+       }
+
+       if (reg) {
+               if (is_read)
+                       *val = *reg;
+               else
+                       *reg = *val;
+
+               return true;
+       }
+       return false;
+}
+
+/*
+ * Intercept an MSR read.  Returns true if @msr is a PMU MSR: *val is then
+ * taken from the shared PMU context, or from the real MSR when emulation is
+ * not active.  Returns false to let the caller use its default path.
+ */
+bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err)
+{
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+               if (is_amd_pmu_msr(msr)) {
+                       if (!xen_amd_pmu_emulate(msr, val, 1))
+                               *val = native_read_msr_safe(msr, err);
+                       return true;
+               }
+       } else {
+               int type, index;
+
+               if (is_intel_pmu_msr(msr, &type, &index)) {
+                       if (!xen_intel_pmu_emulate(msr, val, type, index, 1))
+                               *val = native_read_msr_safe(msr, err);
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+/*
+ * Intercept an MSR write.  Returns true if @msr is a PMU MSR: the value is
+ * then written to the shared PMU context, or to the real MSR when emulation
+ * is not active.  Returns false to let the caller use its default path.
+ */
+bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err)
+{
+       uint64_t val = ((uint64_t)high << 32) | low;
+
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+               if (is_amd_pmu_msr(msr)) {
+                       if (!xen_amd_pmu_emulate(msr, &val, 0))
+                               *err = native_write_msr_safe(msr, low, high);
+                       return true;
+               }
+       } else {
+               int type, index;
+
+               if (is_intel_pmu_msr(msr, &type, &index)) {
+                       if (!xen_intel_pmu_emulate(msr, &val, type, index, 0))
+                               *err = native_write_msr_safe(msr, low, high);
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+/*
+ * rdpmc replacement (AMD): read the counter from the shared PMU context
+ * while a PMU interrupt is being processed, otherwise via a safe native
+ * MSR read.
+ */
+static unsigned long long xen_amd_read_pmc(int counter)
+{
+       struct xen_pmu_amd_ctxt *ctxt;
+       uint64_t *counter_regs;
+       struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+       uint8_t xenpmu_flags = get_xenpmu_flags();
+
+       if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
+               uint32_t msr;
+               int err;
+
+               msr = amd_counters_base + (counter * amd_msr_step);
+               return native_read_msr_safe(msr, &err);
+       }
+
+       ctxt = &xenpmu_data->pmu.c.amd;
+       counter_regs = field_offset(ctxt, counters);
+       return counter_regs[counter];
+}
+
+/*
+ * rdpmc replacement (Intel): counter numbers with bit INTEL_PMC_TYPE_SHIFT
+ * set address fixed-function counters (low 16 bits = index); otherwise the
+ * general-purpose counters.  Reads the shared PMU context while a PMU
+ * interrupt is being processed, else the real MSR.
+ */
+static unsigned long long xen_intel_read_pmc(int counter)
+{
+       struct xen_pmu_intel_ctxt *ctxt;
+       uint64_t *fixed_counters;
+       struct xen_pmu_cntr_pair *arch_cntr_pair;
+       struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+       uint8_t xenpmu_flags = get_xenpmu_flags();
+
+       if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
+               uint32_t msr;
+               int err;
+
+               if (counter & (1 << INTEL_PMC_TYPE_SHIFT))
+                       msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff);
+               else
+                       msr = MSR_IA32_PERFCTR0 + counter;
+
+               return native_read_msr_safe(msr, &err);
+       }
+
+       ctxt = &xenpmu_data->pmu.c.intel;
+       if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) {
+               fixed_counters = field_offset(ctxt, fixed_counters);
+               return fixed_counters[counter & 0xffff];
+       }
+
+       arch_cntr_pair = field_offset(ctxt, arch_counters);
+       return arch_cntr_pair[counter].counter;
+}
+
+/* Vendor dispatch for the rdpmc replacement. */
+unsigned long long xen_read_pmc(int counter)
+{
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+               return xen_amd_read_pmc(counter);
+       else
+               return xen_intel_read_pmc(counter);
+}
+
+/*
+ * Mirror an LVTPC APIC write into the shared PMU data and forward it to the
+ * hypervisor -- except while a PMU interrupt is being processed, when the
+ * shared-page update alone suffices (NOTE(review): presumably the hypervisor
+ * picks it up on irq completion; confirm).
+ */
+int pmu_apic_update(uint32_t val)
+{
+       int ret;
+       struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+       if (!xenpmu_data) {
+               pr_warn_once("%s: pmudata not initialized\n", __func__);
+               return -EINVAL;
+       }
+
+       xenpmu_data->pmu.l.lapic_lvtpc = val;
+
+       if (get_xenpmu_flags() & XENPMU_IRQ_PROCESSING)
+               return 0;
+
+       ret = HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL);
+
+       return ret;
+}
+
+/* perf callbacks */
+/*
+ * perf guest callback: the sample belongs to a guest when we are dom0 and
+ * the sampled domain id is a real domain (below DOMID_SELF).
+ */
+static int xen_is_in_guest(void)
+{
+       const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+       if (!xenpmu_data) {
+               pr_warn_once("%s: pmudata not initialized\n", __func__);
+               return 0;
+       }
+
+       if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF))
+               return 0;
+
+       return 1;
+}
+
+static int xen_is_user_mode(void)
+{
+       const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+       if (!xenpmu_data) {
+               pr_warn_once("%s: pmudata not initialized\n", __func__);
+               return 0;
+       }
+
+       if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV)
+               return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER);
+       else
+               return !!(xenpmu_data->pmu.r.regs.cpl & 3);
+}
+
+static unsigned long xen_get_guest_ip(void)
+{
+       const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+       if (!xenpmu_data) {
+               pr_warn_once("%s: pmudata not initialized\n", __func__);
+               return 0;
+       }
+
+       return xenpmu_data->pmu.r.regs.ip;
+}
+
+static struct perf_guest_info_callbacks xen_guest_cbs = {
+       .is_in_guest            = xen_is_in_guest,
+       .is_user_mode           = xen_is_user_mode,
+       .get_guest_ip           = xen_get_guest_ip,
+};
+
+/* Convert registers from Xen's format to Linux' */
+static void xen_convert_regs(const struct xen_pmu_regs *xen_regs,
+                            struct pt_regs *regs, uint64_t pmu_flags)
+{
+       regs->ip = xen_regs->ip;
+       regs->cs = xen_regs->cs;
+       regs->sp = xen_regs->sp;
+
+       if (pmu_flags & PMU_SAMPLE_PV) {
+               if (pmu_flags & PMU_SAMPLE_USER)
+                       regs->cs |= 3;
+               else
+                       regs->cs &= ~3;
+       } else {
+               if (xen_regs->cpl)
+                       regs->cs |= 3;
+               else
+                       regs->cs &= ~3;
+       }
+}
+
+irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id)
+{
+       int err, ret = IRQ_NONE;
+       struct pt_regs regs;
+       const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+       uint8_t xenpmu_flags = get_xenpmu_flags();
+
+       if (!xenpmu_data) {
+               pr_warn_once("%s: pmudata not initialized\n", __func__);
+               return ret;
+       }
+
+       this_cpu_ptr(&xenpmu_shared)->flags =
+               xenpmu_flags | XENPMU_IRQ_PROCESSING;
+       xen_convert_regs(&xenpmu_data->pmu.r.regs, &regs,
+                        xenpmu_data->pmu.pmu_flags);
+       if (x86_pmu.handle_irq(&regs))
+               ret = IRQ_HANDLED;
+
+       /* Write out cached context to HW */
+       err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL);
+       this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags;
+       if (err) {
+               pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err);
+               return IRQ_NONE;
+       }
+
+       return ret;
+}
+
+bool is_xen_pmu(int cpu)
+{
+       return (get_xenpmu_data() != NULL);
+}
+
+void xen_pmu_init(int cpu)
+{
+       int err;
+       struct xen_pmu_params xp;
+       unsigned long pfn;
+       struct xen_pmu_data *xenpmu_data;
+
+       BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE);
+
+       if (xen_hvm_domain())
+               return;
+
+       xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL);
+       if (!xenpmu_data) {
+               pr_err("VPMU init: No memory\n");
+               return;
+       }
+       pfn = virt_to_pfn(xenpmu_data);
+
+       xp.val = pfn_to_mfn(pfn);
+       xp.vcpu = cpu;
+       xp.version.maj = XENPMU_VER_MAJ;
+       xp.version.min = XENPMU_VER_MIN;
+       err = HYPERVISOR_xenpmu_op(XENPMU_init, &xp);
+       if (err)
+               goto fail;
+
+       per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data;
+       per_cpu(xenpmu_shared, cpu).flags = 0;
+
+       if (cpu == 0) {
+               perf_register_guest_info_callbacks(&xen_guest_cbs);
+               xen_pmu_arch_init();
+       }
+
+       return;
+
+fail:
+       pr_warn_once("Could not initialize VPMU for cpu %d, error %d\n",
+               cpu, err);
+       free_pages((unsigned long)xenpmu_data, 0);
+}
+
+void xen_pmu_finish(int cpu)
+{
+       struct xen_pmu_params xp;
+
+       if (xen_hvm_domain())
+               return;
+
+       xp.vcpu = cpu;
+       xp.version.maj = XENPMU_VER_MAJ;
+       xp.version.min = XENPMU_VER_MIN;
+
+       (void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp);
+
+       free_pages((unsigned long)per_cpu(xenpmu_shared, cpu).xenpmu_data, 0);
+       per_cpu(xenpmu_shared, cpu).xenpmu_data = NULL;
+}
diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h
new file mode 100644 (file)
index 0000000..af5f0ad
--- /dev/null
@@ -0,0 +1,15 @@
+#ifndef __XEN_PMU_H
+#define __XEN_PMU_H
+
+#include <xen/interface/xenpmu.h>
+
+irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
+void xen_pmu_init(int cpu);
+void xen_pmu_finish(int cpu);
+bool is_xen_pmu(int cpu);
+bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
+bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
+int pmu_apic_update(uint32_t reg);
+unsigned long long xen_read_pmc(int counter);
+
+#endif /* __XEN_PMU_H */
index 55f388e..f5ef674 100644 (file)
 #include <xen/interface/memory.h>
 #include <xen/interface/physdev.h>
 #include <xen/features.h>
+#include <xen/hvc-console.h>
 #include "xen-ops.h"
 #include "vdso.h"
-#include "p2m.h"
 #include "mmu.h"
 
+#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
+
 /* Amount of extra memory space we add to the e820 ranges */
 struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
 
 /* Number of pages released from the initial allocation. */
 unsigned long xen_released_pages;
 
+/* E820 map used during setting up memory. */
+static struct e820entry xen_e820_map[E820MAX] __initdata;
+static u32 xen_e820_map_entries __initdata;
+
 /*
  * Buffer used to remap identity mapped pages. We only need the virtual space.
  * The physical page behind this address is remapped as needed to different
@@ -64,62 +70,89 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
  */
 #define EXTRA_MEM_RATIO                (10)
 
-static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size)
+static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
+
+static void __init xen_parse_512gb(void)
+{
+       bool val = false;
+       char *arg;
+
+       arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit");
+       if (!arg)
+               return;
+
+       arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
+       if (!arg)
+               val = true;
+       else if (strtobool(arg + strlen("xen_512gb_limit="), &val))
+               return;
+
+       xen_512gb_limit = val;
+}
+
+static void __init xen_add_extra_mem(unsigned long start_pfn,
+                                    unsigned long n_pfns)
 {
        int i;
 
+       /*
+        * No need to check for zero size, should happen rarely and will only
+        * write a new entry regarded to be unused due to zero size.
+        */
        for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
                /* Add new region. */
-               if (xen_extra_mem[i].size == 0) {
-                       xen_extra_mem[i].start = start;
-                       xen_extra_mem[i].size  = size;
+               if (xen_extra_mem[i].n_pfns == 0) {
+                       xen_extra_mem[i].start_pfn = start_pfn;
+                       xen_extra_mem[i].n_pfns = n_pfns;
                        break;
                }
                /* Append to existing region. */
-               if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
-                       xen_extra_mem[i].size += size;
+               if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
+                   start_pfn) {
+                       xen_extra_mem[i].n_pfns += n_pfns;
                        break;
                }
        }
        if (i == XEN_EXTRA_MEM_MAX_REGIONS)
                printk(KERN_WARNING "Warning: not enough extra memory regions\n");
 
-       memblock_reserve(start, size);
+       memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
 }
 
-static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
+static void __init xen_del_extra_mem(unsigned long start_pfn,
+                                    unsigned long n_pfns)
 {
        int i;
-       phys_addr_t start_r, size_r;
+       unsigned long start_r, size_r;
 
        for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
-               start_r = xen_extra_mem[i].start;
-               size_r = xen_extra_mem[i].size;
+               start_r = xen_extra_mem[i].start_pfn;
+               size_r = xen_extra_mem[i].n_pfns;
 
                /* Start of region. */
-               if (start_r == start) {
-                       BUG_ON(size > size_r);
-                       xen_extra_mem[i].start += size;
-                       xen_extra_mem[i].size -= size;
+               if (start_r == start_pfn) {
+                       BUG_ON(n_pfns > size_r);
+                       xen_extra_mem[i].start_pfn += n_pfns;
+                       xen_extra_mem[i].n_pfns -= n_pfns;
                        break;
                }
                /* End of region. */
-               if (start_r + size_r == start + size) {
-                       BUG_ON(size > size_r);
-                       xen_extra_mem[i].size -= size;
+               if (start_r + size_r == start_pfn + n_pfns) {
+                       BUG_ON(n_pfns > size_r);
+                       xen_extra_mem[i].n_pfns -= n_pfns;
                        break;
                }
                /* Mid of region. */
-               if (start > start_r && start < start_r + size_r) {
-                       BUG_ON(start + size > start_r + size_r);
-                       xen_extra_mem[i].size = start - start_r;
+               if (start_pfn > start_r && start_pfn < start_r + size_r) {
+                       BUG_ON(start_pfn + n_pfns > start_r + size_r);
+                       xen_extra_mem[i].n_pfns = start_pfn - start_r;
                        /* Calling memblock_reserve() again is okay. */
-                       xen_add_extra_mem(start + size, start_r + size_r -
-                                         (start + size));
+                       xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r -
+                                         (start_pfn + n_pfns));
                        break;
                }
        }
-       memblock_free(start, size);
+       memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
 }
 
 /*
@@ -130,11 +163,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
 unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
 {
        int i;
-       phys_addr_t addr = PFN_PHYS(pfn);
 
        for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
-               if (addr >= xen_extra_mem[i].start &&
-                   addr < xen_extra_mem[i].start + xen_extra_mem[i].size)
+               if (pfn >= xen_extra_mem[i].start_pfn &&
+                   pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns)
                        return INVALID_P2M_ENTRY;
        }
 
@@ -150,10 +182,10 @@ void __init xen_inv_extra_mem(void)
        int i;
 
        for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
-               if (!xen_extra_mem[i].size)
+               if (!xen_extra_mem[i].n_pfns)
                        continue;
-               pfn_s = PFN_DOWN(xen_extra_mem[i].start);
-               pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
+               pfn_s = xen_extra_mem[i].start_pfn;
+               pfn_e = pfn_s + xen_extra_mem[i].n_pfns;
                for (pfn = pfn_s; pfn < pfn_e; pfn++)
                        set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
        }
@@ -164,15 +196,13 @@ void __init xen_inv_extra_mem(void)
  * This function updates min_pfn with the pfn found and returns
  * the size of that range or zero if not found.
  */
-static unsigned long __init xen_find_pfn_range(
-       const struct e820entry *list, size_t map_size,
-       unsigned long *min_pfn)
+static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
 {
-       const struct e820entry *entry;
+       const struct e820entry *entry = xen_e820_map;
        unsigned int i;
        unsigned long done = 0;
 
-       for (i = 0, entry = list; i < map_size; i++, entry++) {
+       for (i = 0; i < xen_e820_map_entries; i++, entry++) {
                unsigned long s_pfn;
                unsigned long e_pfn;
 
@@ -221,7 +251,7 @@ static int __init xen_free_mfn(unsigned long mfn)
  * as a fallback if the remapping fails.
  */
 static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
-       unsigned long end_pfn, unsigned long nr_pages, unsigned long *released)
+                       unsigned long end_pfn, unsigned long nr_pages)
 {
        unsigned long pfn, end;
        int ret;
@@ -241,7 +271,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
                WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
 
                if (ret == 1) {
-                       (*released)++;
+                       xen_released_pages++;
                        if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
                                break;
                } else
@@ -356,9 +386,8 @@ static void __init xen_do_set_identity_and_remap_chunk(
  * to Xen and not remapped.
  */
 static unsigned long __init xen_set_identity_and_remap_chunk(
-        const struct e820entry *list, size_t map_size, unsigned long start_pfn,
-       unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
-       unsigned long *released, unsigned long *remapped)
+       unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
+       unsigned long remap_pfn)
 {
        unsigned long pfn;
        unsigned long i = 0;
@@ -379,12 +408,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
                if (cur_pfn + size > nr_pages)
                        size = nr_pages - cur_pfn;
 
-               remap_range_size = xen_find_pfn_range(list, map_size,
-                                                     &remap_pfn);
+               remap_range_size = xen_find_pfn_range(&remap_pfn);
                if (!remap_range_size) {
                        pr_warning("Unable to find available pfn range, not remapping identity pages\n");
                        xen_set_identity_and_release_chunk(cur_pfn,
-                               cur_pfn + left, nr_pages, released);
+                                               cur_pfn + left, nr_pages);
                        break;
                }
                /* Adjust size to fit in current e820 RAM region */
@@ -396,7 +424,6 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
                /* Update variables to reflect new mappings. */
                i += size;
                remap_pfn += size;
-               *remapped += size;
        }
 
        /*
@@ -411,15 +438,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
        return remap_pfn;
 }
 
-static void __init xen_set_identity_and_remap(
-       const struct e820entry *list, size_t map_size, unsigned long nr_pages,
-       unsigned long *released, unsigned long *remapped)
+static void __init xen_set_identity_and_remap(unsigned long nr_pages)
 {
        phys_addr_t start = 0;
        unsigned long last_pfn = nr_pages;
-       const struct e820entry *entry;
-       unsigned long num_released = 0;
-       unsigned long num_remapped = 0;
+       const struct e820entry *entry = xen_e820_map;
        int i;
 
        /*
@@ -433,9 +456,9 @@ static void __init xen_set_identity_and_remap(
         * example) the DMI tables in a reserved region that begins on
         * a non-page boundary.
         */
-       for (i = 0, entry = list; i < map_size; i++, entry++) {
+       for (i = 0; i < xen_e820_map_entries; i++, entry++) {
                phys_addr_t end = entry->addr + entry->size;
-               if (entry->type == E820_RAM || i == map_size - 1) {
+               if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) {
                        unsigned long start_pfn = PFN_DOWN(start);
                        unsigned long end_pfn = PFN_UP(end);
 
@@ -444,17 +467,13 @@ static void __init xen_set_identity_and_remap(
 
                        if (start_pfn < end_pfn)
                                last_pfn = xen_set_identity_and_remap_chunk(
-                                               list, map_size, start_pfn,
-                                               end_pfn, nr_pages, last_pfn,
-                                               &num_released, &num_remapped);
+                                               start_pfn, end_pfn, nr_pages,
+                                               last_pfn);
                        start = end;
                }
        }
 
-       *released = num_released;
-       *remapped = num_remapped;
-
-       pr_info("Released %ld page(s)\n", num_released);
+       pr_info("Released %ld page(s)\n", xen_released_pages);
 }
 
 /*
@@ -494,7 +513,7 @@ void __init xen_remap_memory(void)
                } else if (pfn_s + len == xen_remap_buf.target_pfn) {
                        len += xen_remap_buf.size;
                } else {
-                       xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+                       xen_del_extra_mem(pfn_s, len);
                        pfn_s = xen_remap_buf.target_pfn;
                        len = xen_remap_buf.size;
                }
@@ -504,19 +523,36 @@ void __init xen_remap_memory(void)
        }
 
        if (pfn_s != ~0UL && len)
-               xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+               xen_del_extra_mem(pfn_s, len);
 
        set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
 
        pr_info("Remapped %ld page(s)\n", remapped);
 }
 
+static unsigned long __init xen_get_pages_limit(void)
+{
+       unsigned long limit;
+
+#ifdef CONFIG_X86_32
+       limit = GB(64) / PAGE_SIZE;
+#else
+       limit = MAXMEM / PAGE_SIZE;
+       if (!xen_initial_domain() && xen_512gb_limit)
+               limit = GB(512) / PAGE_SIZE;
+#endif
+       return limit;
+}
+
 static unsigned long __init xen_get_max_pages(void)
 {
-       unsigned long max_pages = MAX_DOMAIN_PAGES;
+       unsigned long max_pages, limit;
        domid_t domid = DOMID_SELF;
        int ret;
 
+       limit = xen_get_pages_limit();
+       max_pages = limit;
+
        /*
         * For the initial domain we use the maximum reservation as
         * the maximum page.
@@ -532,7 +568,7 @@ static unsigned long __init xen_get_max_pages(void)
                        max_pages = ret;
        }
 
-       return min(max_pages, MAX_DOMAIN_PAGES);
+       return min(max_pages, limit);
 }
 
 static void __init xen_align_and_add_e820_region(phys_addr_t start,
@@ -549,39 +585,188 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start,
        e820_add_region(start, end - start, type);
 }
 
-static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size)
+static void __init xen_ignore_unusable(void)
 {
-       struct e820entry *entry;
+       struct e820entry *entry = xen_e820_map;
        unsigned int i;
 
-       for (i = 0, entry = list; i < map_size; i++, entry++) {
+       for (i = 0; i < xen_e820_map_entries; i++, entry++) {
                if (entry->type == E820_UNUSABLE)
                        entry->type = E820_RAM;
        }
 }
 
+static unsigned long __init xen_count_remap_pages(unsigned long max_pfn)
+{
+       unsigned long extra = 0;
+       unsigned long start_pfn, end_pfn;
+       const struct e820entry *entry = xen_e820_map;
+       int i;
+
+       end_pfn = 0;
+       for (i = 0; i < xen_e820_map_entries; i++, entry++) {
+               start_pfn = PFN_DOWN(entry->addr);
+               /* Adjacent regions on non-page boundaries handling! */
+               end_pfn = min(end_pfn, start_pfn);
+
+               if (start_pfn >= max_pfn)
+                       return extra + max_pfn - end_pfn;
+
+               /* Add any holes in map to result. */
+               extra += start_pfn - end_pfn;
+
+               end_pfn = PFN_UP(entry->addr + entry->size);
+               end_pfn = min(end_pfn, max_pfn);
+
+               if (entry->type != E820_RAM)
+                       extra += end_pfn - start_pfn;
+       }
+
+       return extra;
+}
+
+bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
+{
+       struct e820entry *entry;
+       unsigned mapcnt;
+       phys_addr_t end;
+
+       if (!size)
+               return false;
+
+       end = start + size;
+       entry = xen_e820_map;
+
+       for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) {
+               if (entry->type == E820_RAM && entry->addr <= start &&
+                   (entry->addr + entry->size) >= end)
+                       return false;
+
+               entry++;
+       }
+
+       return true;
+}
+
+/*
+ * Find a free area in physical memory not yet reserved and compliant with
+ * E820 map.
+ * Used to relocate pre-allocated areas like initrd or p2m list which are in
+ * conflict with the to be used E820 map.
+ * In case no area is found, return 0. Otherwise return the physical address
+ * of the area which is already reserved for convenience.
+ */
+phys_addr_t __init xen_find_free_area(phys_addr_t size)
+{
+       unsigned mapcnt;
+       phys_addr_t addr, start;
+       struct e820entry *entry = xen_e820_map;
+
+       for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) {
+               if (entry->type != E820_RAM || entry->size < size)
+                       continue;
+               start = entry->addr;
+               for (addr = start; addr < start + size; addr += PAGE_SIZE) {
+                       if (!memblock_is_reserved(addr))
+                               continue;
+                       start = addr + PAGE_SIZE;
+                       if (start + size > entry->addr + entry->size)
+                               break;
+               }
+               if (addr >= start + size) {
+                       memblock_reserve(start, size);
+                       return start;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Like memcpy, but with physical addresses for dest and src.
+ */
+static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
+                                  phys_addr_t n)
+{
+       phys_addr_t dest_off, src_off, dest_len, src_len, len;
+       void *from, *to;
+
+       while (n) {
+               dest_off = dest & ~PAGE_MASK;
+               src_off = src & ~PAGE_MASK;
+               dest_len = n;
+               if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off)
+                       dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off;
+               src_len = n;
+               if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off)
+                       src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off;
+               len = min(dest_len, src_len);
+               to = early_memremap(dest - dest_off, dest_len + dest_off);
+               from = early_memremap(src - src_off, src_len + src_off);
+               memcpy(to, from, len);
+               early_memunmap(to, dest_len + dest_off);
+               early_memunmap(from, src_len + src_off);
+               n -= len;
+               dest += len;
+               src += len;
+       }
+}
+
+/*
+ * Reserve Xen mfn_list.
+ */
+static void __init xen_reserve_xen_mfnlist(void)
+{
+       phys_addr_t start, size;
+
+       if (xen_start_info->mfn_list >= __START_KERNEL_map) {
+               start = __pa(xen_start_info->mfn_list);
+               size = PFN_ALIGN(xen_start_info->nr_pages *
+                                sizeof(unsigned long));
+       } else {
+               start = PFN_PHYS(xen_start_info->first_p2m_pfn);
+               size = PFN_PHYS(xen_start_info->nr_p2m_frames);
+       }
+
+       if (!xen_is_e820_reserved(start, size)) {
+               memblock_reserve(start, size);
+               return;
+       }
+
+#ifdef CONFIG_X86_32
+       /*
+        * Relocating the p2m on 32 bit system to an arbitrary virtual address
+        * is not supported, so just give up.
+        */
+       xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n");
+       BUG();
+#else
+       xen_relocate_p2m();
+#endif
+}
+
 /**
  * machine_specific_memory_setup - Hook for machine specific memory setup.
  **/
 char * __init xen_memory_setup(void)
 {
-       static struct e820entry map[E820MAX] __initdata;
-
-       unsigned long max_pfn = xen_start_info->nr_pages;
-       phys_addr_t mem_end;
+       unsigned long max_pfn, pfn_s, n_pfns;
+       phys_addr_t mem_end, addr, size, chunk_size;
+       u32 type;
        int rc;
        struct xen_memory_map memmap;
        unsigned long max_pages;
        unsigned long extra_pages = 0;
-       unsigned long remapped_pages;
        int i;
        int op;
 
-       max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
+       xen_parse_512gb();
+       max_pfn = xen_get_pages_limit();
+       max_pfn = min(max_pfn, xen_start_info->nr_pages);
        mem_end = PFN_PHYS(max_pfn);
 
        memmap.nr_entries = E820MAX;
-       set_xen_guest_handle(memmap.buffer, map);
+       set_xen_guest_handle(memmap.buffer, xen_e820_map);
 
        op = xen_initial_domain() ?
                XENMEM_machine_memory_map :
@@ -590,15 +775,16 @@ char * __init xen_memory_setup(void)
        if (rc == -ENOSYS) {
                BUG_ON(xen_initial_domain());
                memmap.nr_entries = 1;
-               map[0].addr = 0ULL;
-               map[0].size = mem_end;
+               xen_e820_map[0].addr = 0ULL;
+               xen_e820_map[0].size = mem_end;
                /* 8MB slack (to balance backend allocations). */
-               map[0].size += 8ULL << 20;
-               map[0].type = E820_RAM;
+               xen_e820_map[0].size += 8ULL << 20;
+               xen_e820_map[0].type = E820_RAM;
                rc = 0;
        }
        BUG_ON(rc);
        BUG_ON(memmap.nr_entries == 0);
+       xen_e820_map_entries = memmap.nr_entries;
 
        /*
         * Xen won't allow a 1:1 mapping to be created to UNUSABLE
@@ -609,24 +795,19 @@ char * __init xen_memory_setup(void)
         * a patch in the future.
         */
        if (xen_initial_domain())
-               xen_ignore_unusable(map, memmap.nr_entries);
+               xen_ignore_unusable();
 
        /* Make sure the Xen-supplied memory map is well-ordered. */
-       sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
+       sanitize_e820_map(xen_e820_map, xen_e820_map_entries,
+                         &xen_e820_map_entries);
 
        max_pages = xen_get_max_pages();
-       if (max_pages > max_pfn)
-               extra_pages += max_pages - max_pfn;
 
-       /*
-        * Set identity map on non-RAM pages and prepare remapping the
-        * underlying RAM.
-        */
-       xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
-                                  &xen_released_pages, &remapped_pages);
+       /* How many extra pages do we need due to remapping? */
+       max_pages += xen_count_remap_pages(max_pfn);
 
-       extra_pages += xen_released_pages;
-       extra_pages += remapped_pages;
+       if (max_pages > max_pfn)
+               extra_pages += max_pages - max_pfn;
 
        /*
         * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
@@ -635,46 +816,54 @@ char * __init xen_memory_setup(void)
         * is limited to the max size of lowmem, so that it doesn't
         * get completely filled.
         *
+        * Make sure we have no memory above max_pages, as this area
+        * isn't handled by the p2m management.
+        *
         * In principle there could be a problem in lowmem systems if
         * the initial memory is also very large with respect to
         * lowmem, but we won't try to deal with that here.
         */
-       extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
-                         extra_pages);
+       extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+                          extra_pages, max_pages - max_pfn);
        i = 0;
-       while (i < memmap.nr_entries) {
-               phys_addr_t addr = map[i].addr;
-               phys_addr_t size = map[i].size;
-               u32 type = map[i].type;
+       addr = xen_e820_map[0].addr;
+       size = xen_e820_map[0].size;
+       while (i < xen_e820_map_entries) {
+               chunk_size = size;
+               type = xen_e820_map[i].type;
 
                if (type == E820_RAM) {
                        if (addr < mem_end) {
-                               size = min(size, mem_end - addr);
+                               chunk_size = min(size, mem_end - addr);
                        } else if (extra_pages) {
-                               size = min(size, PFN_PHYS(extra_pages));
-                               extra_pages -= PFN_DOWN(size);
-                               xen_add_extra_mem(addr, size);
-                               xen_max_p2m_pfn = PFN_DOWN(addr + size);
+                               chunk_size = min(size, PFN_PHYS(extra_pages));
+                               pfn_s = PFN_UP(addr);
+                               n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s;
+                               extra_pages -= n_pfns;
+                               xen_add_extra_mem(pfn_s, n_pfns);
+                               xen_max_p2m_pfn = pfn_s + n_pfns;
                        } else
                                type = E820_UNUSABLE;
                }
 
-               xen_align_and_add_e820_region(addr, size, type);
+               xen_align_and_add_e820_region(addr, chunk_size, type);
 
-               map[i].addr += size;
-               map[i].size -= size;
-               if (map[i].size == 0)
+               addr += chunk_size;
+               size -= chunk_size;
+               if (size == 0) {
                        i++;
+                       if (i < xen_e820_map_entries) {
+                               addr = xen_e820_map[i].addr;
+                               size = xen_e820_map[i].size;
+                       }
+               }
        }
 
        /*
         * Set the rest as identity mapped, in case PCI BARs are
         * located here.
-        *
-        * PFNs above MAX_P2M_PFN are considered identity mapped as
-        * well.
         */
-       set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);
+       set_phys_range_identity(addr / PAGE_SIZE, ~0ul);
 
        /*
         * In domU, the ISA region is normal, usable memory, but we
@@ -684,34 +873,53 @@ char * __init xen_memory_setup(void)
        e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
                        E820_RESERVED);
 
+       sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+
        /*
-        * Reserve Xen bits:
-        *  - mfn_list
-        *  - xen_start_info
-        * See comment above "struct start_info" in <xen/interface/xen.h>
-        * We tried to make the the memblock_reserve more selective so
-        * that it would be clear what region is reserved. Sadly we ran
-        * in the problem wherein on a 64-bit hypervisor with a 32-bit
-        * initial domain, the pt_base has the cr3 value which is not
-        * neccessarily where the pagetable starts! As Jan put it: "
-        * Actually, the adjustment turns out to be correct: The page
-        * tables for a 32-on-64 dom0 get allocated in the order "first L1",
-        * "first L2", "first L3", so the offset to the page table base is
-        * indeed 2. When reading xen/include/public/xen.h's comment
-        * very strictly, this is not a violation (since there nothing is said
-        * that the first thing in the page table space is pointed to by
-        * pt_base; I admit that this seems to be implied though, namely
-        * do I think that it is implied that the page table space is the
-        * range [pt_base, pt_base + nt_pt_frames), whereas that
-        * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
-        * which - without a priori knowledge - the kernel would have
-        * difficulty to figure out)." - so lets just fall back to the
-        * easy way and reserve the whole region.
+        * Check whether the kernel itself conflicts with the target E820 map.
+        * Failing now is better than running into weird problems later due
+        * to relocating (and even reusing) pages with kernel text or data.
         */
-       memblock_reserve(__pa(xen_start_info->mfn_list),
-                        xen_start_info->pt_base - xen_start_info->mfn_list);
+       if (xen_is_e820_reserved(__pa_symbol(_text),
+                       __pa_symbol(__bss_stop) - __pa_symbol(_text))) {
+               xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n");
+               BUG();
+       }
 
-       sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+       /*
+        * Check for a conflict of the hypervisor supplied page tables with
+        * the target E820 map.
+        */
+       xen_pt_check_e820();
+
+       xen_reserve_xen_mfnlist();
+
+       /* Check for a conflict of the initrd with the target E820 map. */
+       if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image,
+                                boot_params.hdr.ramdisk_size)) {
+               phys_addr_t new_area, start, size;
+
+               new_area = xen_find_free_area(boot_params.hdr.ramdisk_size);
+               if (!new_area) {
+                       xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n");
+                       BUG();
+               }
+
+               start = boot_params.hdr.ramdisk_image;
+               size = boot_params.hdr.ramdisk_size;
+               xen_phys_memcpy(new_area, start, size);
+               pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
+                       start, start + size, new_area, new_area + size);
+               memblock_free(start, size);
+               boot_params.hdr.ramdisk_image = new_area;
+               boot_params.ext_ramdisk_image = new_area >> 32;
+       }
+
+       /*
+        * Set identity map on non-RAM pages and prepare remapping the
+        * underlying RAM.
+        */
+       xen_set_identity_and_remap(max_pfn);
 
        return "Xen";
 }
@@ -721,26 +929,30 @@ char * __init xen_memory_setup(void)
  */
 char * __init xen_auto_xlated_memory_setup(void)
 {
-       static struct e820entry map[E820MAX] __initdata;
-
        struct xen_memory_map memmap;
        int i;
        int rc;
 
        memmap.nr_entries = E820MAX;
-       set_xen_guest_handle(memmap.buffer, map);
+       set_xen_guest_handle(memmap.buffer, xen_e820_map);
 
        rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
        if (rc < 0)
                panic("No memory map (%d)\n", rc);
 
-       sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);
+       xen_e820_map_entries = memmap.nr_entries;
+
+       sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map),
+                         &xen_e820_map_entries);
 
-       for (i = 0; i < memmap.nr_entries; i++)
-               e820_add_region(map[i].addr, map[i].size, map[i].type);
+       for (i = 0; i < xen_e820_map_entries; i++)
+               e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size,
+                               xen_e820_map[i].type);
 
-       memblock_reserve(__pa(xen_start_info->mfn_list),
-                        xen_start_info->pt_base - xen_start_info->mfn_list);
+       /* Remove p2m info, it is not needed. */
+       xen_start_info->mfn_list = 0;
+       xen_start_info->first_p2m_pfn = 0;
+       xen_start_info->nr_p2m_frames = 0;
 
        return "Xen";
 }
index 8648438..2a9ff73 100644 (file)
@@ -26,6 +26,7 @@
 
 #include <xen/interface/xen.h>
 #include <xen/interface/vcpu.h>
+#include <xen/interface/xenpmu.h>
 
 #include <asm/xen/interface.h>
 #include <asm/xen/hypercall.h>
@@ -38,6 +39,7 @@
 #include "xen-ops.h"
 #include "mmu.h"
 #include "smp.h"
+#include "pmu.h"
 
 cpumask_var_t xen_cpu_initialized_map;
 
@@ -50,6 +52,7 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 };
 static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 };
 static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
 static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 };
 
 static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
 static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
@@ -148,11 +151,18 @@ static void xen_smp_intr_free(unsigned int cpu)
                kfree(per_cpu(xen_irq_work, cpu).name);
                per_cpu(xen_irq_work, cpu).name = NULL;
        }
+
+       if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
+               unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
+               per_cpu(xen_pmu_irq, cpu).irq = -1;
+               kfree(per_cpu(xen_pmu_irq, cpu).name);
+               per_cpu(xen_pmu_irq, cpu).name = NULL;
+       }
 };
 static int xen_smp_intr_init(unsigned int cpu)
 {
        int rc;
-       char *resched_name, *callfunc_name, *debug_name;
+       char *resched_name, *callfunc_name, *debug_name, *pmu_name;
 
        resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
        rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
@@ -218,6 +228,18 @@ static int xen_smp_intr_init(unsigned int cpu)
        per_cpu(xen_irq_work, cpu).irq = rc;
        per_cpu(xen_irq_work, cpu).name = callfunc_name;
 
+       if (is_xen_pmu(cpu)) {
+               pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
+               rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
+                                            xen_pmu_irq_handler,
+                                            IRQF_PERCPU|IRQF_NOBALANCING,
+                                            pmu_name, NULL);
+               if (rc < 0)
+                       goto fail;
+               per_cpu(xen_pmu_irq, cpu).irq = rc;
+               per_cpu(xen_pmu_irq, cpu).name = pmu_name;
+       }
+
        return 0;
 
  fail:
@@ -335,6 +357,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
        }
        set_cpu_sibling_map(0);
 
+       xen_pmu_init(0);
+
        if (xen_smp_intr_init(0))
                BUG();
 
@@ -462,6 +486,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
        if (rc)
                return rc;
 
+       xen_pmu_init(cpu);
+
        rc = xen_smp_intr_init(cpu);
        if (rc)
                return rc;
@@ -503,6 +529,7 @@ static void xen_cpu_die(unsigned int cpu)
                xen_smp_intr_free(cpu);
                xen_uninit_lock_cpu(cpu);
                xen_teardown_timer(cpu);
+               xen_pmu_finish(cpu);
        }
 }
 
index 53b4c08..feddabd 100644 (file)
@@ -11,6 +11,7 @@
 
 #include "xen-ops.h"
 #include "mmu.h"
+#include "pmu.h"
 
 static void xen_pv_pre_suspend(void)
 {
@@ -67,16 +68,26 @@ static void xen_pv_post_suspend(int suspend_cancelled)
 
 void xen_arch_pre_suspend(void)
 {
-    if (xen_pv_domain())
-        xen_pv_pre_suspend();
+       int cpu;
+
+       for_each_online_cpu(cpu)
+               xen_pmu_finish(cpu);
+
+       if (xen_pv_domain())
+               xen_pv_pre_suspend();
 }
 
 void xen_arch_post_suspend(int cancelled)
 {
-    if (xen_pv_domain())
-        xen_pv_post_suspend(cancelled);
-    else
-        xen_hvm_post_suspend(cancelled);
+       int cpu;
+
+       if (xen_pv_domain())
+               xen_pv_post_suspend(cancelled);
+       else
+               xen_hvm_post_suspend(cancelled);
+
+       for_each_online_cpu(cpu)
+               xen_pmu_init(cpu);
 }
 
 static void xen_vcpu_notify_restore(void *data)
index 8afdfcc..b65f59a 100644 (file)
@@ -104,6 +104,8 @@ ENTRY(hypercall_page)
        ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __PAGE_OFFSET)
 #else
        ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __START_KERNEL_map)
+       /* Map the p2m table to a 512GB-aligned user address. */
+       ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad PGDIR_SIZE)
 #endif
        ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          _ASM_PTR startup_xen)
        ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
index 2292721..1399423 100644 (file)
@@ -35,13 +35,20 @@ void xen_build_mfn_list_list(void);
 void xen_setup_machphys_mapping(void);
 void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
 void xen_reserve_top(void);
+void __init xen_reserve_special_pages(void);
+void __init xen_pt_check_e820(void);
 
 void xen_mm_pin_all(void);
 void xen_mm_unpin_all(void);
+#ifdef CONFIG_X86_64
+void __init xen_relocate_p2m(void);
+#endif
 
+bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size);
 unsigned long __ref xen_chk_extra_mem(unsigned long pfn);
 void __init xen_inv_extra_mem(void);
 void __init xen_remap_memory(void);
+phys_addr_t __init xen_find_free_area(phys_addr_t size);
 char * __init xen_memory_setup(void);
 char * xen_auto_xlated_memory_setup(void);
 void __init xen_arch_setup(void);
index 5f6b3be..1508353 100644 (file)
@@ -37,6 +37,7 @@
 
 #include <linux/interrupt.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/hdreg.h>
 #include <linux/cdrom.h>
 #include <linux/module.h>
@@ -147,6 +148,7 @@ struct blkfront_info
        unsigned int feature_persistent:1;
        unsigned int max_indirect_segments;
        int is_ready;
+       struct blk_mq_tag_set tag_set;
 };
 
 static unsigned int nr_minors;
@@ -616,54 +618,41 @@ static inline bool blkif_request_flush_invalid(struct request *req,
                 !(info->feature_flush & REQ_FUA)));
 }
 
-/*
- * do_blkif_request
- *  read a block; request is in a request queue
- */
-static void do_blkif_request(struct request_queue *rq)
+static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
+                          const struct blk_mq_queue_data *qd)
 {
-       struct blkfront_info *info = NULL;
-       struct request *req;
-       int queued;
-
-       pr_debug("Entered do_blkif_request\n");
-
-       queued = 0;
+       struct blkfront_info *info = qd->rq->rq_disk->private_data;
 
-       while ((req = blk_peek_request(rq)) != NULL) {
-               info = req->rq_disk->private_data;
-
-               if (RING_FULL(&info->ring))
-                       goto wait;
+       blk_mq_start_request(qd->rq);
+       spin_lock_irq(&info->io_lock);
+       if (RING_FULL(&info->ring))
+               goto out_busy;
 
-               blk_start_request(req);
+       if (blkif_request_flush_invalid(qd->rq, info))
+               goto out_err;
 
-               if (blkif_request_flush_invalid(req, info)) {
-                       __blk_end_request_all(req, -EOPNOTSUPP);
-                       continue;
-               }
+       if (blkif_queue_request(qd->rq))
+               goto out_busy;
 
-               pr_debug("do_blk_req %p: cmd %p, sec %lx, "
-                        "(%u/%u) [%s]\n",
-                        req, req->cmd, (unsigned long)blk_rq_pos(req),
-                        blk_rq_cur_sectors(req), blk_rq_sectors(req),
-                        rq_data_dir(req) ? "write" : "read");
-
-               if (blkif_queue_request(req)) {
-                       blk_requeue_request(rq, req);
-wait:
-                       /* Avoid pointless unplugs. */
-                       blk_stop_queue(rq);
-                       break;
-               }
+       flush_requests(info);
+       spin_unlock_irq(&info->io_lock);
+       return BLK_MQ_RQ_QUEUE_OK;
 
-               queued++;
-       }
+out_err:
+       spin_unlock_irq(&info->io_lock);
+       return BLK_MQ_RQ_QUEUE_ERROR;
 
-       if (queued != 0)
-               flush_requests(info);
+out_busy:
+       spin_unlock_irq(&info->io_lock);
+       blk_mq_stop_hw_queue(hctx);
+       return BLK_MQ_RQ_QUEUE_BUSY;
 }
 
+static struct blk_mq_ops blkfront_mq_ops = {
+       .queue_rq = blkif_queue_rq,
+       .map_queue = blk_mq_map_queue,
+};
+
 static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
                                unsigned int physical_sector_size,
                                unsigned int segments)
@@ -671,9 +660,22 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
        struct request_queue *rq;
        struct blkfront_info *info = gd->private_data;
 
-       rq = blk_init_queue(do_blkif_request, &info->io_lock);
-       if (rq == NULL)
+       memset(&info->tag_set, 0, sizeof(info->tag_set));
+       info->tag_set.ops = &blkfront_mq_ops;
+       info->tag_set.nr_hw_queues = 1;
+       info->tag_set.queue_depth =  BLK_RING_SIZE(info);
+       info->tag_set.numa_node = NUMA_NO_NODE;
+       info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+       info->tag_set.cmd_size = 0;
+       info->tag_set.driver_data = info;
+
+       if (blk_mq_alloc_tag_set(&info->tag_set))
                return -1;
+       rq = blk_mq_init_queue(&info->tag_set);
+       if (IS_ERR(rq)) {
+               blk_mq_free_tag_set(&info->tag_set);
+               return -1;
+       }
 
        queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
 
@@ -901,19 +903,15 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 static void xlvbd_release_gendisk(struct blkfront_info *info)
 {
        unsigned int minor, nr_minors;
-       unsigned long flags;
 
        if (info->rq == NULL)
                return;
 
-       spin_lock_irqsave(&info->io_lock, flags);
-
        /* No more blkif_request(). */
-       blk_stop_queue(info->rq);
+       blk_mq_stop_hw_queues(info->rq);
 
        /* No more gnttab callback work. */
        gnttab_cancel_free_callback(&info->callback);
-       spin_unlock_irqrestore(&info->io_lock, flags);
 
        /* Flush gnttab callback work. Must be done with no locks held. */
        flush_work(&info->work);
@@ -925,20 +923,18 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
        xlbd_release_minors(minor, nr_minors);
 
        blk_cleanup_queue(info->rq);
+       blk_mq_free_tag_set(&info->tag_set);
        info->rq = NULL;
 
        put_disk(info->gd);
        info->gd = NULL;
 }
 
+/* Must be called with io_lock held */
 static void kick_pending_request_queues(struct blkfront_info *info)
 {
-       if (!RING_FULL(&info->ring)) {
-               /* Re-enable calldowns. */
-               blk_start_queue(info->rq);
-               /* Kick things off immediately. */
-               do_blkif_request(info->rq);
-       }
+       if (!RING_FULL(&info->ring))
+               blk_mq_start_stopped_hw_queues(info->rq, true);
 }
 
 static void blkif_restart_queue(struct work_struct *work)
@@ -963,7 +959,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
                BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
        /* No more blkif_request(). */
        if (info->rq)
-               blk_stop_queue(info->rq);
+               blk_mq_stop_hw_queues(info->rq);
 
        /* Remove all persistent grants */
        if (!list_empty(&info->grants)) {
@@ -1146,7 +1142,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
        RING_IDX i, rp;
        unsigned long flags;
        struct blkfront_info *info = (struct blkfront_info *)dev_id;
-       int error;
 
        spin_lock_irqsave(&info->io_lock, flags);
 
@@ -1187,37 +1182,37 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
                        continue;
                }
 
-               error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
+               req->errors = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
                switch (bret->operation) {
                case BLKIF_OP_DISCARD:
                        if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
                                struct request_queue *rq = info->rq;
                                printk(KERN_WARNING "blkfront: %s: %s op failed\n",
                                           info->gd->disk_name, op_name(bret->operation));
-                               error = -EOPNOTSUPP;
+                               req->errors = -EOPNOTSUPP;
                                info->feature_discard = 0;
                                info->feature_secdiscard = 0;
                                queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
                                queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
                        }
-                       __blk_end_request_all(req, error);
+                       blk_mq_complete_request(req);
                        break;
                case BLKIF_OP_FLUSH_DISKCACHE:
                case BLKIF_OP_WRITE_BARRIER:
                        if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
                                printk(KERN_WARNING "blkfront: %s: %s op failed\n",
                                       info->gd->disk_name, op_name(bret->operation));
-                               error = -EOPNOTSUPP;
+                               req->errors = -EOPNOTSUPP;
                        }
                        if (unlikely(bret->status == BLKIF_RSP_ERROR &&
                                     info->shadow[id].req.u.rw.nr_segments == 0)) {
                                printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
                                       info->gd->disk_name, op_name(bret->operation));
-                               error = -EOPNOTSUPP;
+                               req->errors = -EOPNOTSUPP;
                        }
-                       if (unlikely(error)) {
-                               if (error == -EOPNOTSUPP)
-                                       error = 0;
+                       if (unlikely(req->errors)) {
+                               if (req->errors == -EOPNOTSUPP)
+                                       req->errors = 0;
                                info->feature_flush = 0;
                                xlvbd_flush(info);
                        }
@@ -1228,7 +1223,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
                                dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
                                        "request: %x\n", bret->status);
 
-                       __blk_end_request_all(req, error);
+                       blk_mq_complete_request(req);
                        break;
                default:
                        BUG();
@@ -1555,28 +1550,6 @@ static int blkif_recover(struct blkfront_info *info)
 
        kfree(copy);
 
-       /*
-        * Empty the queue, this is important because we might have
-        * requests in the queue with more segments than what we
-        * can handle now.
-        */
-       spin_lock_irq(&info->io_lock);
-       while ((req = blk_fetch_request(info->rq)) != NULL) {
-               if (req->cmd_flags &
-                   (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
-                       list_add(&req->queuelist, &requests);
-                       continue;
-               }
-               merge_bio.head = req->bio;
-               merge_bio.tail = req->biotail;
-               bio_list_merge(&bio_list, &merge_bio);
-               req->bio = NULL;
-               if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
-                       pr_alert("diskcache flush request found!\n");
-               __blk_end_request_all(req, 0);
-       }
-       spin_unlock_irq(&info->io_lock);
-
        xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
        spin_lock_irq(&info->io_lock);
@@ -1591,9 +1564,10 @@ static int blkif_recover(struct blkfront_info *info)
                /* Requeue pending requests (flush or discard) */
                list_del_init(&req->queuelist);
                BUG_ON(req->nr_phys_segments > segs);
-               blk_requeue_request(info->rq, req);
+               blk_mq_requeue_request(req);
        }
        spin_unlock_irq(&info->io_lock);
+       blk_mq_kick_requeue_list(info->rq);
 
        while ((bio = bio_list_pop(&bio_list)) != NULL) {
                /* Traverse the list of pending bios and re-queue them */
index 7cd226d..73708ac 100644 (file)
@@ -280,4 +280,15 @@ config XEN_ACPI
        def_bool y
        depends on X86 && ACPI
 
+config XEN_SYMS
+       bool "Xen symbols"
+       depends on X86 && XEN_DOM0 && XENFS
+       default y if KALLSYMS
+       help
+          Exports hypervisor symbols (along with their types and addresses) via
+          /proc/xen/xensyms file, similar to /proc/kallsyms
+
+config XEN_HAVE_VPMU
+       bool
+
 endmenu
index bf4a23c..1fa633b 100644 (file)
@@ -638,9 +638,9 @@ static int __init balloon_init(void)
         * regions (see arch/x86/xen/setup.c).
         */
        for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++)
-               if (xen_extra_mem[i].size)
-                       balloon_add_region(PFN_UP(xen_extra_mem[i].start),
-                                          PFN_DOWN(xen_extra_mem[i].size));
+               if (xen_extra_mem[i].n_pfns)
+                       balloon_add_region(xen_extra_mem[i].start_pfn,
+                                          xen_extra_mem[i].n_pfns);
 
        return 0;
 }
index ed8bf10..68d1290 100644 (file)
@@ -1301,11 +1301,7 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
        if (!VALID_EVTCHN(evtchn))
                return -1;
 
-       /*
-        * Events delivered via platform PCI interrupts are always
-        * routed to vcpu 0 and hence cannot be rebound.
-        */
-       if (xen_hvm_domain() && !xen_have_vector_callback)
+       if (!xen_support_evtchn_rebind())
                return -1;
 
        /* Send future instances of this interrupt to other vcpu. */
index 96453f8..b5a7342 100644 (file)
@@ -20,6 +20,9 @@
 #include <xen/xenbus.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/version.h>
+#ifdef CONFIG_XEN_HAVE_VPMU
+#include <xen/interface/xenpmu.h>
+#endif
 
 #define HYPERVISOR_ATTR_RO(_name) \
 static struct hyp_sysfs_attr  _name##_attr = __ATTR_RO(_name)
@@ -368,6 +371,126 @@ static void xen_properties_destroy(void)
        sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
 }
 
+#ifdef CONFIG_XEN_HAVE_VPMU
+struct pmu_mode {
+       const char *name;
+       uint32_t mode;
+};
+
+static struct pmu_mode pmu_modes[] = {
+       {"off", XENPMU_MODE_OFF},
+       {"self", XENPMU_MODE_SELF},
+       {"hv", XENPMU_MODE_HV},
+       {"all", XENPMU_MODE_ALL}
+};
+
+static ssize_t pmu_mode_store(struct hyp_sysfs_attr *attr,
+                             const char *buffer, size_t len)
+{
+       int ret;
+       struct xen_pmu_params xp;
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(pmu_modes); i++) {
+               if (strncmp(buffer, pmu_modes[i].name, len - 1) == 0) {
+                       xp.val = pmu_modes[i].mode;
+                       break;
+               }
+       }
+
+       if (i == ARRAY_SIZE(pmu_modes))
+               return -EINVAL;
+
+       xp.version.maj = XENPMU_VER_MAJ;
+       xp.version.min = XENPMU_VER_MIN;
+       ret = HYPERVISOR_xenpmu_op(XENPMU_mode_set, &xp);
+       if (ret)
+               return ret;
+
+       return len;
+}
+
+static ssize_t pmu_mode_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret;
+       struct xen_pmu_params xp;
+       int i;
+       uint32_t mode;
+
+       xp.version.maj = XENPMU_VER_MAJ;
+       xp.version.min = XENPMU_VER_MIN;
+       ret = HYPERVISOR_xenpmu_op(XENPMU_mode_get, &xp);
+       if (ret)
+               return ret;
+
+       mode = (uint32_t)xp.val;
+       for (i = 0; i < ARRAY_SIZE(pmu_modes); i++) {
+               if (mode == pmu_modes[i].mode)
+                       return sprintf(buffer, "%s\n", pmu_modes[i].name);
+       }
+
+       return -EINVAL;
+}
+HYPERVISOR_ATTR_RW(pmu_mode);
+
+static ssize_t pmu_features_store(struct hyp_sysfs_attr *attr,
+                                 const char *buffer, size_t len)
+{
+       int ret;
+       uint32_t features;
+       struct xen_pmu_params xp;
+
+       ret = kstrtou32(buffer, 0, &features);
+       if (ret)
+               return ret;
+
+       xp.val = features;
+       xp.version.maj = XENPMU_VER_MAJ;
+       xp.version.min = XENPMU_VER_MIN;
+       ret = HYPERVISOR_xenpmu_op(XENPMU_feature_set, &xp);
+       if (ret)
+               return ret;
+
+       return len;
+}
+
+static ssize_t pmu_features_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret;
+       struct xen_pmu_params xp;
+
+       xp.version.maj = XENPMU_VER_MAJ;
+       xp.version.min = XENPMU_VER_MIN;
+       ret = HYPERVISOR_xenpmu_op(XENPMU_feature_get, &xp);
+       if (ret)
+               return ret;
+
+       return sprintf(buffer, "0x%x\n", (uint32_t)xp.val);
+}
+HYPERVISOR_ATTR_RW(pmu_features);
+
+static struct attribute *xen_pmu_attrs[] = {
+       &pmu_mode_attr.attr,
+       &pmu_features_attr.attr,
+       NULL
+};
+
+static const struct attribute_group xen_pmu_group = {
+       .name = "pmu",
+       .attrs = xen_pmu_attrs,
+};
+
+static int __init xen_pmu_init(void)
+{
+       return sysfs_create_group(hypervisor_kobj, &xen_pmu_group);
+}
+
+static void xen_pmu_destroy(void)
+{
+       sysfs_remove_group(hypervisor_kobj, &xen_pmu_group);
+}
+#endif
+
 static int __init hyper_sysfs_init(void)
 {
        int ret;
@@ -390,7 +513,15 @@ static int __init hyper_sysfs_init(void)
        ret = xen_properties_init();
        if (ret)
                goto prop_out;
-
+#ifdef CONFIG_XEN_HAVE_VPMU
+       if (xen_initial_domain()) {
+               ret = xen_pmu_init();
+               if (ret) {
+                       xen_properties_destroy();
+                       goto prop_out;
+               }
+       }
+#endif
        goto out;
 
 prop_out:
@@ -407,6 +538,9 @@ out:
 
 static void __exit hyper_sysfs_exit(void)
 {
+#ifdef CONFIG_XEN_HAVE_VPMU
+       xen_pmu_destroy();
+#endif
        xen_properties_destroy();
        xen_compilation_destroy();
        xen_sysfs_uuid_destroy();
index b019865..1a83010 100644 (file)
@@ -2,3 +2,4 @@ obj-$(CONFIG_XENFS) += xenfs.o
 
 xenfs-y                          = super.o
 xenfs-$(CONFIG_XEN_DOM0) += xenstored.o
+xenfs-$(CONFIG_XEN_SYMS) += xensyms.o
index 06092e0..8559a71 100644 (file)
@@ -57,6 +57,9 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
                { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR },
                { "xsd_kva", &xsd_kva_file_ops, S_IRUSR|S_IWUSR},
                { "xsd_port", &xsd_port_file_ops, S_IRUSR|S_IWUSR},
+#ifdef CONFIG_XEN_SYMS
+               { "xensyms", &xensyms_ops, S_IRUSR},
+#endif
                {""},
        };
 
index 6b80c77..2c5934e 100644 (file)
@@ -3,5 +3,6 @@
 
 extern const struct file_operations xsd_kva_file_ops;
 extern const struct file_operations xsd_port_file_ops;
+extern const struct file_operations xensyms_ops;
 
 #endif /* _XENFS_XENBUS_H */
diff --git a/drivers/xen/xenfs/xensyms.c b/drivers/xen/xenfs/xensyms.c
new file mode 100644 (file)
index 0000000..f8b1285
--- /dev/null
@@ -0,0 +1,152 @@
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+#include <xen/xen-ops.h>
+#include "xenfs.h"
+
+
+#define XEN_KSYM_NAME_LEN 127 /* Hypervisor may have different name length */
+
+struct xensyms {
+       struct xen_platform_op op;
+       char *name;
+       uint32_t namelen;
+};
+
+/* Fetch the next symbol from the hypervisor */
+static int xensyms_next_sym(struct xensyms *xs)
+{
+       int ret;
+       struct xenpf_symdata *symdata = &xs->op.u.symdata;
+       uint64_t symnum;
+
+       memset(xs->name, 0, xs->namelen);
+       symdata->namelen = xs->namelen;
+
+       symnum = symdata->symnum;
+
+       ret = HYPERVISOR_dom0_op(&xs->op);
+       if (ret < 0)
+               return ret;
+
+       /*
+        * If hypervisor's symbol didn't fit into the buffer then allocate
+        * a larger buffer and try again.
+        */
+       if (unlikely(symdata->namelen > xs->namelen)) {
+               kfree(xs->name);
+
+               xs->namelen = symdata->namelen;
+               xs->name = kzalloc(xs->namelen, GFP_KERNEL);
+               if (!xs->name)
+                       return -ENOMEM;
+
+               set_xen_guest_handle(symdata->name, xs->name);
+               symdata->symnum--; /* Rewind */
+
+               ret = HYPERVISOR_dom0_op(&xs->op);
+               if (ret < 0)
+                       return ret;
+       }
+
+       if (symdata->symnum == symnum)
+               /* End of symbols */
+               return 1;
+
+       return 0;
+}
+
+static void *xensyms_start(struct seq_file *m, loff_t *pos)
+{
+       struct xensyms *xs = (struct xensyms *)m->private;
+
+       xs->op.u.symdata.symnum = *pos;
+
+       if (xensyms_next_sym(xs))
+               return NULL;
+
+       return m->private;
+}
+
+static void *xensyms_next(struct seq_file *m, void *p, loff_t *pos)
+{
+       struct xensyms *xs = (struct xensyms *)m->private;
+
+       xs->op.u.symdata.symnum = ++(*pos);
+
+       if (xensyms_next_sym(xs))
+               return NULL;
+
+       return p;
+}
+
+static int xensyms_show(struct seq_file *m, void *p)
+{
+       struct xensyms *xs = (struct xensyms *)m->private;
+       struct xenpf_symdata *symdata = &xs->op.u.symdata;
+
+       seq_printf(m, "%016llx %c %s\n", symdata->address,
+                  symdata->type, xs->name);
+
+       return 0;
+}
+
+static void xensyms_stop(struct seq_file *m, void *p)
+{
+}
+
+static const struct seq_operations xensyms_seq_ops = {
+       .start = xensyms_start,
+       .next = xensyms_next,
+       .show = xensyms_show,
+       .stop = xensyms_stop,
+};
+
+static int xensyms_open(struct inode *inode, struct file *file)
+{
+       struct seq_file *m;
+       struct xensyms *xs;
+       int ret;
+
+       ret = seq_open_private(file, &xensyms_seq_ops,
+                              sizeof(struct xensyms));
+       if (ret)
+               return ret;
+
+       m = file->private_data;
+       xs = (struct xensyms *)m->private;
+
+       xs->namelen = XEN_KSYM_NAME_LEN + 1;
+       xs->name = kzalloc(xs->namelen, GFP_KERNEL);
+       if (!xs->name) {
+               seq_release_private(inode, file);
+               return -ENOMEM;
+       }
+       set_xen_guest_handle(xs->op.u.symdata.name, xs->name);
+       xs->op.cmd = XENPF_get_symbol;
+       xs->op.u.symdata.namelen = xs->namelen;
+
+       return 0;
+}
+
+static int xensyms_release(struct inode *inode, struct file *file)
+{
+       struct seq_file *m = file->private_data;
+       struct xensyms *xs = (struct xensyms *)m->private;
+
+       kfree(xs->name);
+       return seq_release_private(inode, file);
+}
+
+const struct file_operations xensyms_ops = {
+       .open = xensyms_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = xensyms_release
+};
index a5de55c..316bd04 100644 (file)
@@ -11,6 +11,8 @@ extern void __iomem *early_ioremap(resource_size_t phys_addr,
                                   unsigned long size);
 extern void *early_memremap(resource_size_t phys_addr,
                            unsigned long size);
+extern void *early_memremap_ro(resource_size_t phys_addr,
+                              unsigned long size);
 extern void early_iounmap(void __iomem *addr, unsigned long size);
 extern void early_memunmap(void *addr, unsigned long size);
 
index f23174f..1cbb833 100644 (file)
@@ -46,6 +46,9 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
 #ifndef FIXMAP_PAGE_NORMAL
 #define FIXMAP_PAGE_NORMAL PAGE_KERNEL
 #endif
+#if !defined(FIXMAP_PAGE_RO) && defined(PAGE_KERNEL_RO)
+#define FIXMAP_PAGE_RO PAGE_KERNEL_RO
+#endif
 #ifndef FIXMAP_PAGE_NOCACHE
 #define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_NOCACHE
 #endif
index 7d95fdf..88da2ab 100644 (file)
@@ -92,7 +92,6 @@ void xen_hvm_callback_vector(void);
 #ifdef CONFIG_TRACING
 #define trace_xen_hvm_callback_vector xen_hvm_callback_vector
 #endif
-extern int xen_have_vector_callback;
 int xen_set_callback_via(uint64_t via);
 void xen_evtchn_do_upcall(struct pt_regs *regs);
 void xen_hvm_evtchn_do_upcall(void);
index 5cc49ea..8e03587 100644 (file)
@@ -474,6 +474,23 @@ struct xenpf_core_parking {
 };
 DEFINE_GUEST_HANDLE_STRUCT(xenpf_core_parking);
 
+/*
+ * XENPF_get_symbol: read one entry of the hypervisor's symbol table.
+ * The guest supplies a name buffer ('name'/'namelen') and a symbol
+ * index ('symnum'); Xen fills in address, type and name, and advances
+ * symnum so the caller can iterate until symnum stops changing.
+ */
+#define XENPF_get_symbol      63
+struct xenpf_symdata {
+       /* IN/OUT variables */
+       uint32_t        namelen; /* size of 'name' buffer */
+
+       /* IN/OUT variables */
+       uint32_t        symnum; /* IN:  Symbol to read                       */
+                               /* OUT: Next available symbol. If same as IN */
+                               /* then  we reached the end                  */
+
+       /* OUT variables */
+       GUEST_HANDLE(char) name;
+       uint64_t        address;
+       char            type;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_symdata);
+
 struct xen_platform_op {
        uint32_t cmd;
        uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
@@ -495,6 +512,7 @@ struct xen_platform_op {
                struct xenpf_cpu_hotadd        cpu_add;
                struct xenpf_mem_hotadd        mem_add;
                struct xenpf_core_parking      core_parking;
+               struct xenpf_symdata           symdata;
                uint8_t                        pad[128];
        } u;
 };
index a483789..167071c 100644 (file)
@@ -80,6 +80,7 @@
 #define __HYPERVISOR_kexec_op             37
 #define __HYPERVISOR_tmem_op              38
 #define __HYPERVISOR_xc_reserved_op       39 /* reserved for XenClient */
+#define __HYPERVISOR_xenpmu_op            40
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48
 #define VIRQ_MEM_EVENT  10 /* G. (DOM0) A memory event has occured           */
 #define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient                     */
 #define VIRQ_ENOMEM     12 /* G. (DOM0) Low on heap memory       */
+#define VIRQ_XENPMU     13  /* PMC interrupt                                 */
 
 /* Architecture-specific VIRQ definitions. */
 #define VIRQ_ARCH_0    16
@@ -585,26 +587,29 @@ struct shared_info {
 };
 
 /*
- * Start-of-day memory layout for the initial domain (DOM0):
+ * Start-of-day memory layout
+ *
  *  1. The domain is started within contiguous virtual-memory region.
  *  2. The contiguous region begins and ends on an aligned 4MB boundary.
- *  3. The region start corresponds to the load address of the OS image.
- *     If the load address is not 4MB aligned then the address is rounded down.
- *  4. This the order of bootstrap elements in the initial virtual region:
+ *  3. This is the order of bootstrap elements in the initial virtual region:
  *      a. relocated kernel image
  *      b. initial ram disk              [mod_start, mod_len]
+ *         (may be omitted)
  *      c. list of allocated page frames [mfn_list, nr_pages]
+ *         (unless relocated due to XEN_ELFNOTE_INIT_P2M)
  *      d. start_info_t structure        [register ESI (x86)]
- *      e. bootstrap page tables         [pt_base, CR3 (x86)]
- *      f. bootstrap stack               [register ESP (x86)]
- *  5. Bootstrap elements are packed together, but each is 4kB-aligned.
- *  6. The initial ram disk may be omitted.
- *  7. The list of page frames forms a contiguous 'pseudo-physical' memory
+ *         in case of dom0 this page contains the console info, too
+ *      e. unless dom0: xenstore ring page
+ *      f. unless dom0: console ring page
+ *      g. bootstrap page tables         [pt_base, CR3 (x86)]
+ *      h. bootstrap stack               [register ESP (x86)]
+ *  4. Bootstrap elements are packed together, but each is 4kB-aligned.
+ *  5. The list of page frames forms a contiguous 'pseudo-physical' memory
  *     layout for the domain. In particular, the bootstrap virtual-memory
  *     region is a 1:1 mapping to the first section of the pseudo-physical map.
- *  8. All bootstrap elements are mapped read-writable for the guest OS. The
+ *  6. All bootstrap elements are mapped read-writable for the guest OS. The
  *     only exception is the bootstrap page table, which is mapped read-only.
- *  9. There is guaranteed to be at least 512kB padding after the final
+ *  7. There is guaranteed to be at least 512kB padding after the final
  *     bootstrap element. If necessary, the bootstrap virtual region is
  *     extended by an extra 4MB to ensure this.
  */
@@ -641,10 +646,12 @@ struct start_info {
 };
 
 /* These flags are passed in the 'flags' field of start_info_t. */
-#define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
-#define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
-#define SIF_MULTIBOOT_MOD (1<<2)  /* Is mod_start a multiboot module? */
-#define SIF_MOD_START_PFN (1<<3)  /* Is mod_start a PFN? */
+#define SIF_PRIVILEGED      (1<<0)  /* Is the domain privileged? */
+#define SIF_INITDOMAIN      (1<<1)  /* Is this the initial control domain? */
+#define SIF_MULTIBOOT_MOD   (1<<2)  /* Is mod_start a multiboot module? */
+#define SIF_MOD_START_PFN   (1<<3)  /* Is mod_start a PFN? */
+#define SIF_VIRT_P2M_4TOOLS (1<<4)  /* Do Xen tools understand a virt. mapped */
+                                   /* P->M making the 3 level tree obsolete? */
 #define SIF_PM_MASK       (0xFF<<8) /* reserve 1 byte for xen-pm options */
 
 /*
diff --git a/include/xen/interface/xenpmu.h b/include/xen/interface/xenpmu.h
new file mode 100644 (file)
index 0000000..139efc9
--- /dev/null
@@ -0,0 +1,94 @@
+#ifndef __XEN_PUBLIC_XENPMU_H__
+#define __XEN_PUBLIC_XENPMU_H__
+
+#include "xen.h"
+
+#define XENPMU_VER_MAJ    0
+#define XENPMU_VER_MIN    1
+
+/*
+ * ` enum neg_errnoval
+ * ` HYPERVISOR_xenpmu_op(enum xenpmu_op cmd, struct xenpmu_params *args);
+ *
+ * @cmd  == XENPMU_* (PMU operation)
+ * @args == struct xenpmu_params
+ */
+/* ` enum xenpmu_op { */
+#define XENPMU_mode_get        0 /* Also used for getting PMU version */
+#define XENPMU_mode_set        1
+#define XENPMU_feature_get     2
+#define XENPMU_feature_set     3
+#define XENPMU_init            4
+#define XENPMU_finish          5
+#define XENPMU_lvtpc_set       6
+#define XENPMU_flush           7
+
+/* ` } */
+
+/* Parameters structure for HYPERVISOR_xenpmu_op call */
+struct xen_pmu_params {
+       /* IN/OUT parameters */
+       struct {
+               uint32_t maj;   /* interface version, cf. XENPMU_VER_MAJ */
+               uint32_t min;   /* interface version, cf. XENPMU_VER_MIN */
+       } version;
+       uint64_t val;           /* command-specific value (e.g. mode or
+                                * feature bits) -- TODO confirm per cmd */
+
+       /* IN parameters */
+       uint32_t vcpu;
+       uint32_t pad;           /* presumably explicit padding for 64-bit
+                                * alignment/sizing -- verify against Xen */
+};
+
+/* PMU modes:
+ * - XENPMU_MODE_OFF:   No PMU virtualization
+ * - XENPMU_MODE_SELF:  Guests can profile themselves
+ * - XENPMU_MODE_HV:    Guests can profile themselves, dom0 profiles
+ *                      itself and Xen
+ * - XENPMU_MODE_ALL:   Only dom0 has access to VPMU and it profiles
+ *                      everyone: itself, the hypervisor and the guests.
+ */
+#define XENPMU_MODE_OFF           0
+#define XENPMU_MODE_SELF          (1<<0)
+#define XENPMU_MODE_HV            (1<<1)
+#define XENPMU_MODE_ALL           (1<<2)
+
+/*
+ * PMU features:
+ * - XENPMU_FEATURE_INTEL_BTS: Intel BTS support (ignored on AMD)
+ */
+#define XENPMU_FEATURE_INTEL_BTS  1
+
+/*
+ * Shared PMU data between hypervisor and PV(H) domains.
+ *
+ * The hypervisor fills out this structure during PMU interrupt and sends an
+ * interrupt to appropriate VCPU.
+ * Architecture-independent fields of xen_pmu_data are WO for the hypervisor
+ * and RO for the guest but some fields in xen_pmu_arch can be writable
+ * by both the hypervisor and the guest (see arch-$arch/pmu.h).
+ */
+struct xen_pmu_data {
+       /* Interrupted VCPU */
+       uint32_t vcpu_id;
+
+       /*
+        * Physical processor on which the interrupt occurred. On non-privileged
+        * guests set to vcpu_id;
+        */
+       uint32_t pcpu_id;
+
+       /*
+        * Domain that was interrupted. On non-privileged guests set to
+        * DOMID_SELF.
+        * On privileged guests can be DOMID_SELF, DOMID_XEN, or, when in
+        * XENPMU_MODE_ALL mode, domain ID of another domain.
+        */
+       domid_t  domain_id;
+
+       /* NOTE(review): pad presumably aligns 'pmu' to an 8-byte boundary
+        * (domid_t appears to be 16-bit here) -- confirm against Xen ABI. */
+       uint8_t pad[6];
+
+       /* Architecture-specific information */
+       struct xen_pmu_arch pmu;
+};
+
+#endif /* __XEN_PUBLIC_XENPMU_H__ */
index c5ed20b..a5983da 100644 (file)
@@ -9,8 +9,8 @@ static inline unsigned long page_to_mfn(struct page *page)
 }
 
 struct xen_memory_region {
-       phys_addr_t start;
-       phys_addr_t size;
+       unsigned long start_pfn;
+       unsigned long n_pfns;
 };
 
 #define XEN_EXTRA_MEM_MAX_REGIONS 128 /* == E820MAX */
index e10ccd2..0cfadaf 100644 (file)
@@ -217,6 +217,13 @@ early_memremap(resource_size_t phys_addr, unsigned long size)
        return (__force void *)__early_ioremap(phys_addr, size,
                                               FIXMAP_PAGE_NORMAL);
 }
+/*
+ * Read-only counterpart of early_memremap(): same fixmap-based mapping,
+ * but established with FIXMAP_PAGE_RO protections.  Only provided when
+ * the architecture defines FIXMAP_PAGE_RO.
+ */
+#ifdef FIXMAP_PAGE_RO
+void __init *
+early_memremap_ro(resource_size_t phys_addr, unsigned long size)
+{
+       return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
+}
+#endif
 #else /* CONFIG_MMU */
 
 void __init __iomem *
@@ -231,6 +238,11 @@ early_memremap(resource_size_t phys_addr, unsigned long size)
 {
        return (void *)phys_addr;
 }
+/*
+ * !CONFIG_MMU stub: physical addresses are used directly, so no mapping
+ * is created and read-only protection cannot be enforced here.
+ */
+void __init *
+early_memremap_ro(resource_size_t phys_addr, unsigned long size)
+{
+       return (void *)phys_addr;
+}
 
 void __init early_iounmap(void __iomem *addr, unsigned long size)
 {