Merge branch 'perf/x86' into perf/core, because it's ready
author	Ingo Molnar <mingo@kernel.org>
Fri, 27 Mar 2015 08:46:19 +0000 (09:46 +0100)
committer	Ingo Molnar <mingo@kernel.org>
Fri, 27 Mar 2015 08:46:19 +0000 (09:46 +0100)
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/arm64/kernel/hw_breakpoint.c
arch/x86/kernel/cpu/common.c
include/linux/perf_event.h
kernel/events/core.c
kernel/trace/trace_uprobe.c

@@@ -527,7 -527,7 +527,7 @@@ int arch_validate_hwbkpt_settings(struc
         * Disallow per-task kernel breakpoints since these would
         * complicate the stepping code.
         */
-       if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.bp_target)
+       if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.target)
                return -EINVAL;
  
        return 0;
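
The hunk above follows the hw.bp_target -> hw.target rename; arm64 still refuses per-task breakpoints on kernel (EL1) addresses because they would complicate the stepping code. Below is a minimal user-space sketch of the case being rejected, assuming an arm64 system and a raw perf_event_open() wrapper; the kernel address is purely illustrative, and unprivileged callers may be stopped earlier by permission checks.

/*
 * Sketch only: request a per-task execution breakpoint on a kernel
 * address. With a specific target task this is expected to fail
 * (-EINVAL from the check above, or -EPERM/-EACCES earlier when
 * running unprivileged).
 */
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_BREAKPOINT;
	attr.size = sizeof(attr);
	attr.bp_type = HW_BREAKPOINT_X;
	attr.bp_addr = 0xffff000000000000ULL;	/* illustrative kernel address */
	attr.bp_len = HW_BREAKPOINT_LEN_4;

	/* pid = getpid(): a per-task event, so hw.target gets set */
	if (perf_event_open(&attr, getpid(), -1, -1, 0) < 0)
		perror("perf_event_open");	/* rejection expected */

	return 0;
}
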
@@@ -894,7 -894,7 +894,7 @@@ static struct notifier_block hw_breakpo
        .notifier_call = hw_breakpoint_reset_notify,
  };
  
 -#ifdef CONFIG_ARM64_CPU_SUSPEND
 +#ifdef CONFIG_CPU_PM
  extern void cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *));
  #else
  static inline void cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *))
@@@ -492,18 -492,17 +492,18 @@@ u16 __read_mostly tlb_lld_2m[NR_INFO]
  u16 __read_mostly tlb_lld_4m[NR_INFO];
  u16 __read_mostly tlb_lld_1g[NR_INFO];
  
 -void cpu_detect_tlb(struct cpuinfo_x86 *c)
 +static void cpu_detect_tlb(struct cpuinfo_x86 *c)
  {
        if (this_cpu->c_detect_tlb)
                this_cpu->c_detect_tlb(c);
  
 -      printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
 -              "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
 +      pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n",
                tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
 -              tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
 -              tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
 -              tlb_lld_1g[ENTRIES]);
 +              tlb_lli_4m[ENTRIES]);
 +
 +      pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
 +              tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES],
 +              tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
  }
  
  void detect_ht(struct cpuinfo_x86 *c)
@@@ -646,6 -645,30 +646,30 @@@ void get_cpu_cap(struct cpuinfo_x86 *c
                c->x86_capability[10] = eax;
        }
  
+       /* Additional Intel-defined flags: level 0x0000000F */
+       if (c->cpuid_level >= 0x0000000F) {
+               u32 eax, ebx, ecx, edx;
+               /* QoS sub-leaf, EAX=0Fh, ECX=0 */
+               cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
+               c->x86_capability[11] = edx;
+               if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
+                       /* will be overridden if occupancy monitoring exists */
+                       c->x86_cache_max_rmid = ebx;
+                       /* QoS sub-leaf, EAX=0Fh, ECX=1 */
+                       cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
+                       c->x86_capability[12] = edx;
+                       if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
+                               c->x86_cache_max_rmid = ecx;
+                               c->x86_cache_occ_scale = ebx;
+                       }
+               } else {
+                       c->x86_cache_max_rmid = -1;
+                       c->x86_cache_occ_scale = -1;
+               }
+       }
        /* AMD-defined flags: level 0x80000001 */
        xlvl = cpuid_eax(0x80000000);
        c->extended_cpuid_level = xlvl;
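
The new leaf-0xF probing mirrors the CPUID enumeration for Intel cache QoS monitoring (CQM): sub-leaf 0 reports which resources can be monitored and the widest RMID, sub-leaf 1 reports L3 occupancy monitoring and its scaling factor. Below is a rough user-space equivalent, assuming GCC/Clang's <cpuid.h>; the bit positions follow the feature bits used above (CQM_LLC in sub-leaf 0 EDX bit 1, CQM_OCCUP_LLC in sub-leaf 1 EDX bit 0).

/*
 * User-space sketch of the CPUID 0xF enumeration added to get_cpu_cap().
 */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (__get_cpuid_max(0, NULL) < 0x0f)
		return 0;			/* leaf 0xF not implemented */

	/* Sub-leaf 0: EDX enumerates monitored resources, EBX the max RMID */
	__cpuid_count(0x0f, 0, eax, ebx, ecx, edx);
	if (!(edx & (1u << 1))) {		/* bit 1: LLC monitoring */
		printf("no LLC QoS monitoring\n");
		return 0;
	}
	printf("max RMID (all resources): %u\n", ebx);

	/* Sub-leaf 1: ECX max RMID for L3, EBX occupancy scale factor */
	__cpuid_count(0x0f, 1, eax, ebx, ecx, edx);
	if (edx & 1)				/* bit 0: occupancy monitoring */
		printf("LLC max RMID %u, occupancy scale %u bytes\n", ecx, ebx);

	return 0;
}
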
@@@ -834,6 -857,20 +858,20 @@@ static void generic_identify(struct cpu
        detect_nopl(c);
  }
  
+ static void x86_init_cache_qos(struct cpuinfo_x86 *c)
+ {
+       /*
+        * The heavy lifting of max_rmid and cache_occ_scale are handled
+        * in get_cpu_cap().  Here we just set the max_rmid for the boot_cpu
+        * in case CQM bits really aren't there in this CPU.
+        */
+       if (c != &boot_cpu_data) {
+               boot_cpu_data.x86_cache_max_rmid =
+                       min(boot_cpu_data.x86_cache_max_rmid,
+                           c->x86_cache_max_rmid);
+       }
+ }
  /*
   * This does the hard work of actually picking apart the CPU stuff...
   */
@@@ -923,6 -960,7 +961,7 @@@ static void identify_cpu(struct cpuinfo
  
        init_hypervisor(c);
        x86_init_rdrand(c);
+       x86_init_cache_qos(c);
  
        /*
         * Clear/Set all flags overriden by options, need do it
@@@ -1340,7 -1378,7 +1379,7 @@@ void cpu_init(void
        barrier();
  
        x86_configure_nx();
 -      enable_x2apic();
 +      x2apic_setup();
  
        /*
         * set up and load the per-CPU TSS
@@@ -1396,12 -1434,6 +1435,12 @@@ void cpu_init(void
  
        wait_for_master_cpu(cpu);
  
 +      /*
 +       * Initialize the CR4 shadow before doing anything that could
 +       * try to read it.
 +       */
 +      cr4_init_shadow();
 +
        show_ucode_info_early();
  
        printk(KERN_INFO "Initializing CPU#%d\n", cpu);
@@@ -53,6 -53,7 +53,7 @@@ struct perf_guest_info_callbacks 
  #include <linux/sysfs.h>
  #include <linux/perf_regs.h>
  #include <linux/workqueue.h>
+ #include <linux/cgroup.h>
  #include <asm/local.h>
  
  struct perf_callchain_entry {
@@@ -118,10 -119,16 +119,16 @@@ struct hw_perf_event 
                        struct hrtimer  hrtimer;
                };
                struct { /* tracepoint */
-                       struct task_struct      *tp_target;
                        /* for tp_event->class */
                        struct list_head        tp_list;
                };
+               struct { /* intel_cqm */
+                       int                     cqm_state;
+                       int                     cqm_rmid;
+                       struct list_head        cqm_events_entry;
+                       struct list_head        cqm_groups_entry;
+                       struct list_head        cqm_group_entry;
+               };
  #ifdef CONFIG_HAVE_HW_BREAKPOINT
                struct { /* breakpoint */
                        /*
                         * problem hw_breakpoint has with context
                         * creation and event initalization.
                         */
-                       struct task_struct              *bp_target;
                        struct arch_hw_breakpoint       info;
                        struct list_head                bp_list;
                };
  #endif
        };
+       struct task_struct              *target;
        int                             state;
        local64_t                       prev_count;
        u64                             sample_period;
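
The hw_perf_event change pulls the task pointer out of the tracepoint and breakpoint members of the anonymous union into a single shared 'target' field, so generic code no longer cares which flavour of event it is looking at. A simplified, self-contained model of that layout follows (stand-in types, not kernel code).

/*
 * Simplified model of the consolidation above: the per-flavour
 * *_target pointers collapse into one shared member outside the
 * anonymous union.
 */
#include <stdio.h>

struct task;				/* stand-in for struct task_struct */

struct hw_event {
	union {
		struct {		/* tracepoint */
			void *tp_list;
		};
		struct {		/* breakpoint */
			void *bp_info;
			void *bp_list;
		};
	};
	struct task *target;		/* shared: who the event accounts to */
	int state;
};

int main(void)
{
	struct hw_event ev = { .target = NULL, .state = 0 };

	/* ev.target is valid regardless of which union flavour is active */
	printf("sizeof(struct hw_event) = %zu\n", sizeof(ev));
	return 0;
}
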
@@@ -271,6 -278,11 +278,11 @@@ struct pmu 
         */
        size_t                          task_ctx_size;
  
+       /*
+        * Return the count value for a counter.
+        */
+       u64 (*count)                    (struct perf_event *event); /*optional*/
  };
  
  /**
@@@ -547,6 -559,35 +559,35 @@@ struct perf_output_handle 
        int                             page;
  };
  
+ #ifdef CONFIG_CGROUP_PERF
+ /*
+  * perf_cgroup_info keeps track of time_enabled for a cgroup.
+  * This is a per-cpu dynamically allocated data structure.
+  */
+ struct perf_cgroup_info {
+       u64                             time;
+       u64                             timestamp;
+ };
+ struct perf_cgroup {
+       struct cgroup_subsys_state      css;
+       struct perf_cgroup_info __percpu *info;
+ };
+ /*
+  * Must ensure cgroup is pinned (css_get) before calling
+  * this function. In other words, we cannot call this function
+  * if there is no cgroup event for the current CPU context.
+  */
+ static inline struct perf_cgroup *
+ perf_cgroup_from_task(struct task_struct *task)
+ {
+       return container_of(task_css(task, perf_event_cgrp_id),
+                           struct perf_cgroup, css);
+ }
+ #endif /* CONFIG_CGROUP_PERF */
  #ifdef CONFIG_PERF_EVENTS
  
  extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
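
perf_cgroup_from_task(), now exposed in the header, is the usual container_of() trick: given the cgroup_subsys_state embedded in a perf_cgroup, step back to the enclosing structure. A standalone illustration of the pattern with simplified stand-in types:

/*
 * Illustration only: recover the enclosing object from a pointer to an
 * embedded member, as perf_cgroup_from_task() does via task_css().
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct css { int refcount; };		/* stand-in for cgroup_subsys_state */

struct perf_cgroup_demo {
	struct css css;
	void *info;
};

int main(void)
{
	struct perf_cgroup_demo cg = { .css = { .refcount = 1 } };
	struct css *p = &cg.css;	/* what task_css() would hand back */
	struct perf_cgroup_demo *back =
		container_of(p, struct perf_cgroup_demo, css);

	printf("recovered the right object: %d\n", back == &cg);
	return 0;
}
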
@@@ -740,6 -781,11 +781,11 @@@ static inline void perf_event_task_sche
                __perf_event_task_sched_out(prev, next);
  }
  
+ static inline u64 __perf_event_count(struct perf_event *event)
+ {
+       return local64_read(&event->count) + atomic64_read(&event->child_count);
+ }
  extern void perf_event_mmap(struct vm_area_struct *vma);
  extern struct perf_guest_info_callbacks *perf_guest_cbs;
  extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
@@@ -928,22 -974,12 +974,22 @@@ struct perf_pmu_events_attr 
        const char *event_str;
  };
  
 +ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
 +                            char *page);
 +
  #define PMU_EVENT_ATTR(_name, _var, _id, _show)                               \
  static struct perf_pmu_events_attr _var = {                           \
        .attr = __ATTR(_name, 0444, _show, NULL),                       \
        .id   =  _id,                                                   \
  };
  
 +#define PMU_EVENT_ATTR_STRING(_name, _var, _str)                          \
 +static struct perf_pmu_events_attr _var = {                               \
 +      .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
 +      .id             = 0,                                                \
 +      .event_str      = _str,                                             \
 +};
 +
  #define PMU_FORMAT_ATTR(_name, _format)                                       \
  static ssize_t                                                                \
  _name##_show(struct device *dev,                                      \
diff --combined kernel/events/core.c
  #include <linux/syscalls.h>
  #include <linux/anon_inodes.h>
  #include <linux/kernel_stat.h>
+ #include <linux/cgroup.h>
  #include <linux/perf_event.h>
  #include <linux/ftrace_event.h>
  #include <linux/hw_breakpoint.h>
  #include <linux/mm_types.h>
- #include <linux/cgroup.h>
  #include <linux/module.h>
  #include <linux/mman.h>
  #include <linux/compat.h>
@@@ -351,32 -351,6 +351,6 @@@ static void perf_ctx_unlock(struct perf
  
  #ifdef CONFIG_CGROUP_PERF
  
- /*
-  * perf_cgroup_info keeps track of time_enabled for a cgroup.
-  * This is a per-cpu dynamically allocated data structure.
-  */
- struct perf_cgroup_info {
-       u64                             time;
-       u64                             timestamp;
- };
- struct perf_cgroup {
-       struct cgroup_subsys_state      css;
-       struct perf_cgroup_info __percpu *info;
- };
- /*
-  * Must ensure cgroup is pinned (css_get) before calling
-  * this function. In other words, we cannot call this function
-  * if there is no cgroup event for the current CPU context.
-  */
- static inline struct perf_cgroup *
- perf_cgroup_from_task(struct task_struct *task)
- {
-       return container_of(task_css(task, perf_event_cgrp_id),
-                           struct perf_cgroup, css);
- }
  static inline bool
  perf_cgroup_match(struct perf_event *event)
  {
@@@ -3220,7 -3194,10 +3194,10 @@@ static void __perf_event_read(void *inf
  
  static inline u64 perf_event_count(struct perf_event *event)
  {
-       return local64_read(&event->count) + atomic64_read(&event->child_count);
+       if (event->pmu->count)
+               return event->pmu->count(event);
+       return __perf_event_count(event);
  }
  
  static u64 perf_event_read(struct perf_event *event)
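
perf_event_count() now defers to the optional pmu->count() hook introduced in the struct pmu hunk earlier (intel_cqm uses it to return an RMID-based occupancy value) and otherwise falls back to __perf_event_count(), the local plus inherited-child sum. A simplified user-space model of that dispatch, with stand-in types and a made-up hardware-backed counter:

/*
 * Not kernel code: models the "optional callback with generic fallback"
 * dispatch added above.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_event;

struct demo_pmu {
	uint64_t (*count)(struct demo_event *event);	/* optional */
};

struct demo_event {
	struct demo_pmu *pmu;
	uint64_t count;			/* local64_t in the kernel */
	uint64_t child_count;		/* atomic64_t in the kernel */
};

static uint64_t generic_count(struct demo_event *event)
{
	return event->count + event->child_count;
}

static uint64_t event_count(struct demo_event *event)
{
	if (event->pmu->count)
		return event->pmu->count(event);
	return generic_count(event);
}

static uint64_t fake_rmid_read(struct demo_event *event)
{
	(void)event;
	return 4096;			/* pretend hardware-backed value */
}

int main(void)
{
	struct demo_pmu hw = { .count = fake_rmid_read };
	struct demo_pmu sw = { .count = NULL };
	struct demo_event a = { .pmu = &hw, .count = 1, .child_count = 2 };
	struct demo_event b = { .pmu = &sw, .count = 1, .child_count = 2 };

	printf("hw-backed: %llu, generic: %llu\n",
	       (unsigned long long)event_count(&a),
	       (unsigned long long)event_count(&b));
	return 0;
}
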
@@@ -3610,7 -3587,7 +3587,7 @@@ static void put_event(struct perf_even
        ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
        WARN_ON_ONCE(ctx->parent_ctx);
        perf_remove_from_context(event, true);
 -      mutex_unlock(&ctx->mutex);
 +      perf_event_ctx_unlock(event, ctx);
  
        _free_event(event);
  }
@@@ -4446,7 -4423,7 +4423,7 @@@ static int perf_mmap(struct file *file
         * If we have rb pages ensure they're a power-of-two number, so we
         * can do bitmasks instead of modulo.
         */
 -      if (!is_power_of_2(nr_pages))
 +      if (nr_pages != 0 && !is_power_of_2(nr_pages))
                return -EINVAL;
  
        if (vma_size != PAGE_SIZE * (1 + nr_pages))
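
The extra nr_pages != 0 test matters because is_power_of_2(0) is false, while mapping only the control page (no data pages) is a legitimate request that the old check wrongly rejected. A small standalone check of that corner case, re-implementing the helper for illustration:

/*
 * Compares the old and new perf_mmap() page-count checks.
 */
#include <stdbool.h>
#include <stdio.h>

static bool is_power_of_2(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
	unsigned long nr_pages[] = { 0, 1, 3, 4 };

	for (unsigned int i = 0; i < 4; i++) {
		unsigned long n = nr_pages[i];
		bool old_reject = !is_power_of_2(n);
		bool new_reject = n != 0 && !is_power_of_2(n);

		printf("nr_pages=%lu  old:%s  new:%s\n", n,
		       old_reject ? "-EINVAL" : "ok",
		       new_reject ? "-EINVAL" : "ok");
	}
	return 0;
}
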
@@@ -4593,13 -4570,6 +4570,13 @@@ static void perf_pending_event(struct i
  {
        struct perf_event *event = container_of(entry,
                        struct perf_event, pending);
 +      int rctx;
 +
 +      rctx = perf_swevent_get_recursion_context();
 +      /*
 +       * If we 'fail' here, that's OK, it means recursion is already disabled
 +       * and we won't recurse 'further'.
 +       */
  
        if (event->pending_disable) {
                event->pending_disable = 0;
                event->pending_wakeup = 0;
                perf_event_wakeup(event);
        }
 +
 +      if (rctx >= 0)
 +              perf_swevent_put_recursion_context(rctx);
  }
  
  /*
@@@ -7149,7 -7116,7 +7126,7 @@@ perf_event_alloc(struct perf_event_att
                 struct perf_event *group_leader,
                 struct perf_event *parent_event,
                 perf_overflow_handler_t overflow_handler,
-                void *context)
+                void *context, int cgroup_fd)
  {
        struct pmu *pmu;
        struct perf_event *event;
  
        if (task) {
                event->attach_state = PERF_ATTACH_TASK;
-               if (attr->type == PERF_TYPE_TRACEPOINT)
-                       event->hw.tp_target = task;
- #ifdef CONFIG_HAVE_HW_BREAKPOINT
                /*
-                * hw_breakpoint is a bit difficult here..
+                * XXX pmu::event_init needs to know what task to account to
+                * and we cannot use the ctx information because we need the
+                * pmu before we get a ctx.
                 */
-               else if (attr->type == PERF_TYPE_BREAKPOINT)
-                       event->hw.bp_target = task;
- #endif
+               event->hw.target = task;
        }
  
        if (!overflow_handler && parent_event) {
        if (!has_branch_stack(event))
                event->attr.branch_sample_type = 0;
  
+       if (cgroup_fd != -1) {
+               err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+               if (err)
+                       goto err_ns;
+       }
        pmu = perf_init_event(event);
        if (!pmu)
                goto err_ns;
@@@ -7268,6 -7237,8 +7247,8 @@@ err_pmu
                event->destroy(event);
        module_put(pmu->module);
  err_ns:
+       if (is_cgroup_event(event))
+               perf_detach_cgroup(event);
        if (event->ns)
                put_pid_ns(event->ns);
        kfree(event);
@@@ -7486,6 -7457,7 +7467,7 @@@ SYSCALL_DEFINE5(perf_event_open
        int move_group = 0;
        int err;
        int f_flags = O_RDWR;
+       int cgroup_fd = -1;
  
        /* for future expandability... */
        if (flags & ~PERF_FLAG_ALL)
  
        get_online_cpus();
  
+       if (flags & PERF_FLAG_PID_CGROUP)
+               cgroup_fd = pid;
        event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
-                                NULL, NULL);
+                                NULL, NULL, cgroup_fd);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err_cpus;
        }
  
-       if (flags & PERF_FLAG_PID_CGROUP) {
-               err = perf_cgroup_connect(pid, event, &attr, group_leader);
-               if (err) {
-                       __free_event(event);
-                       goto err_cpus;
-               }
-       }
        if (is_sampling_event(event)) {
                if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
                        err = -ENOTSUPP;
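
With perf_cgroup_connect() now called from perf_event_alloc(), the syscall path merely translates its pid argument into a cgroup fd when PERF_FLAG_PID_CGROUP is set; the user-space contract is unchanged. A hedged sketch of that contract, with an illustrative, system-dependent cgroup path:

/*
 * User-space view of the PERF_FLAG_PID_CGROUP path: the "pid" argument
 * is an open fd of the perf_event cgroup directory.
 */
#include <fcntl.h>
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	int cgrp_fd, ev_fd;

	cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
	if (cgrp_fd < 0) {
		perror("open cgroup");
		return 1;
	}

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_SOFTWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_SW_CPU_CLOCK;

	/* cpu must be >= 0 for cgroup events; monitor CPU 0 here */
	ev_fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 0, -1,
			PERF_FLAG_PID_CGROUP);
	if (ev_fd < 0)
		perror("perf_event_open");

	close(cgrp_fd);
	return 0;
}
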
@@@ -7802,7 -7769,7 +7779,7 @@@ perf_event_create_kernel_counter(struc
         */
  
        event = perf_event_alloc(attr, cpu, task, NULL, NULL,
-                                overflow_handler, context);
+                                overflow_handler, context, -1);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err;
@@@ -8163,7 -8130,7 +8140,7 @@@ inherit_event(struct perf_event *parent
                                           parent_event->cpu,
                                           child,
                                           group_leader, parent_event,
-                                          NULL, NULL);
+                                          NULL, NULL, -1);
        if (IS_ERR(child_event))
                return child_event;
  
@@@ -8549,18 -8516,6 +8526,18 @@@ void __init perf_event_init(void
                     != 1024);
  }
  
 +ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
 +                            char *page)
 +{
 +      struct perf_pmu_events_attr *pmu_attr =
 +              container_of(attr, struct perf_pmu_events_attr, attr);
 +
 +      if (pmu_attr->event_str)
 +              return sprintf(page, "%s\n", pmu_attr->event_str);
 +
 +      return 0;
 +}
 +
  static int __init perf_event_sysfs_init(void)
  {
        struct pmu *pmu;
@@@ -1005,7 -1005,7 +1005,7 @@@ __uprobe_perf_filter(struct trace_uprob
                return true;
  
        list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
-               if (event->hw.tp_target->mm == mm)
+               if (event->hw.target->mm == mm)
                        return true;
        }
  
  static inline bool
  uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
  {
-       return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
+       return __uprobe_perf_filter(&tu->filter, event->hw.target->mm);
  }
  
  static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
        bool done;
  
        write_lock(&tu->filter.rwlock);
-       if (event->hw.tp_target) {
+       if (event->hw.target) {
                list_del(&event->hw.tp_list);
                done = tu->filter.nr_systemwide ||
-                       (event->hw.tp_target->flags & PF_EXITING) ||
+                       (event->hw.target->flags & PF_EXITING) ||
                        uprobe_filter_event(tu, event);
        } else {
                tu->filter.nr_systemwide--;
@@@ -1046,7 -1046,7 +1046,7 @@@ static int uprobe_perf_open(struct trac
        int err;
  
        write_lock(&tu->filter.rwlock);
-       if (event->hw.tp_target) {
+       if (event->hw.target) {
                /*
                 * event->parent != NULL means copy_process(), we can avoid
                 * uprobe_apply(). current->mm must be probed and we can rely
@@@ -1321,7 -1321,7 +1321,7 @@@ static __init int init_uprobe_trace(voi
        struct dentry *d_tracer;
  
        d_tracer = tracing_init_dentry();
 -      if (!d_tracer)
 +      if (IS_ERR(d_tracer))
                return 0;
  
        trace_create_file("uprobe_events", 0644, d_tracer,
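
The final hunk adapts to tracing_init_dentry() returning an ERR_PTR-encoded error instead of NULL, so failure is now detected with IS_ERR(). A standalone sketch of that convention, re-implemented here only for illustration (the real helpers live in <linux/err.h>):

/*
 * Errors are encoded in the pointer value itself, so callers test
 * IS_ERR() instead of comparing against NULL.
 */
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *fake_init_dentry(int fail)
{
	static int dentry;		/* stand-in object */

	return fail ? ERR_PTR(-ENOMEM) : (void *)&dentry;
}

int main(void)
{
	void *d = fake_init_dentry(1);

	if (IS_ERR(d))			/* the new-style check */
		printf("init failed: %ld\n", PTR_ERR(d));
	else
		printf("init ok\n");
	return 0;
}
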