Merge branch 'perf/x86' into perf/core, because it's ready
author	Ingo Molnar <mingo@kernel.org>
Fri, 27 Mar 2015 08:46:19 +0000 (09:46 +0100)
committer	Ingo Molnar <mingo@kernel.org>
Fri, 27 Mar 2015 08:46:19 +0000 (09:46 +0100)
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/arm64/kernel/hw_breakpoint.c
arch/x86/kernel/cpu/common.c
include/linux/perf_event.h
kernel/events/core.c
kernel/trace/trace_uprobe.c

@@@ -527,7 -527,7 +527,7 @@@ int arch_validate_hwbkpt_settings(struc
         * Disallow per-task kernel breakpoints since these would
         * complicate the stepping code.
         */
-       if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.bp_target)
+       if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.target)
                return -EINVAL;
  
        return 0;
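
The hunk above follows the hw.bp_target -> hw.target rename; arm64 still refuses per-task breakpoints on kernel (EL1) addresses because they would complicate the stepping code. Below is a minimal user-space sketch of the case being rejected, assuming an arm64 system and a raw perf_event_open() wrapper; the kernel address is purely illustrative, and unprivileged callers may be stopped earlier by permission checks.

/*
 * Sketch only: request a per-task execution breakpoint on a kernel
 * address. With a specific target task this is expected to fail
 * (-EINVAL from the check above, or -EPERM/-EACCES earlier when
 * running unprivileged).
 */
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_BREAKPOINT;
	attr.size = sizeof(attr);
	attr.bp_type = HW_BREAKPOINT_X;
	attr.bp_addr = 0xffff000000000000ULL;	/* illustrative kernel address */
	attr.bp_len = HW_BREAKPOINT_LEN_4;

	/* pid = getpid(): a per-task event, so hw.target gets set */
	if (perf_event_open(&attr, getpid(), -1, -1, 0) < 0)
		perror("perf_event_open");	/* rejection expected */

	return 0;
}
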
@@@ -894,7 -894,7 +894,7 @@@ static struct notifier_block hw_breakpo
        .notifier_call = hw_breakpoint_reset_notify,
  };
  
 -#ifdef CONFIG_ARM64_CPU_SUSPEND
 +#ifdef CONFIG_CPU_PM
  extern void cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *));
  #else
  static inline void cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *))
@@@ -492,18 -492,17 +492,18 @@@ u16 __read_mostly tlb_lld_2m[NR_INFO]
  u16 __read_mostly tlb_lld_4m[NR_INFO];
  u16 __read_mostly tlb_lld_1g[NR_INFO];
  
 -void cpu_detect_tlb(struct cpuinfo_x86 *c)
 +static void cpu_detect_tlb(struct cpuinfo_x86 *c)
  {
        if (this_cpu->c_detect_tlb)
                this_cpu->c_detect_tlb(c);
  
 -      printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
 -              "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
 +      pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n",
                tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
 -              tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
 -              tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
 -              tlb_lld_1g[ENTRIES]);
 +              tlb_lli_4m[ENTRIES]);
 +
 +      pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
 +              tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES],
 +              tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
  }
  
  void detect_ht(struct cpuinfo_x86 *c)
@@@ -646,6 -645,30 +646,30 @@@ void get_cpu_cap(struct cpuinfo_x86 *c
                c->x86_capability[10] = eax;
        }
  
+       /* Additional Intel-defined flags: level 0x0000000F */
+       if (c->cpuid_level >= 0x0000000F) {
+               u32 eax, ebx, ecx, edx;
+               /* QoS sub-leaf, EAX=0Fh, ECX=0 */
+               cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
+               c->x86_capability[11] = edx;
+               if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
+                       /* will be overridden if occupancy monitoring exists */
+                       c->x86_cache_max_rmid = ebx;
+                       /* QoS sub-leaf, EAX=0Fh, ECX=1 */
+                       cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
+                       c->x86_capability[12] = edx;
+                       if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
+                               c->x86_cache_max_rmid = ecx;
+                               c->x86_cache_occ_scale = ebx;
+                       }
+               } else {
+                       c->x86_cache_max_rmid = -1;
+                       c->x86_cache_occ_scale = -1;
+               }
+       }
        /* AMD-defined flags: level 0x80000001 */
        xlvl = cpuid_eax(0x80000000);
        c->extended_cpuid_level = xlvl;
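
The new leaf-0xF probing mirrors the CPUID enumeration for Intel cache QoS monitoring (CQM): sub-leaf 0 reports which resources can be monitored and the widest RMID, sub-leaf 1 reports L3 occupancy monitoring and its scaling factor. Below is a rough user-space equivalent, assuming GCC/Clang's <cpuid.h>; the bit positions follow the feature bits used above (CQM_LLC in sub-leaf 0 EDX bit 1, CQM_OCCUP_LLC in sub-leaf 1 EDX bit 0).

/*
 * User-space sketch of the CPUID 0xF enumeration added to get_cpu_cap().
 */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (__get_cpuid_max(0, NULL) < 0x0f)
		return 0;			/* leaf 0xF not implemented */

	/* Sub-leaf 0: EDX enumerates monitored resources, EBX the max RMID */
	__cpuid_count(0x0f, 0, eax, ebx, ecx, edx);
	if (!(edx & (1u << 1))) {		/* bit 1: LLC monitoring */
		printf("no LLC QoS monitoring\n");
		return 0;
	}
	printf("max RMID (all resources): %u\n", ebx);

	/* Sub-leaf 1: ECX max RMID for L3, EBX occupancy scale factor */
	__cpuid_count(0x0f, 1, eax, ebx, ecx, edx);
	if (edx & 1)				/* bit 0: occupancy monitoring */
		printf("LLC max RMID %u, occupancy scale %u bytes\n", ecx, ebx);

	return 0;
}
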
@@@ -834,6 -857,20 +858,20 @@@ static void generic_identify(struct cpu
        detect_nopl(c);
  }
  
+ static void x86_init_cache_qos(struct cpuinfo_x86 *c)
+ {
+       /*
+        * The heavy lifting of max_rmid and cache_occ_scale are handled
+        * in get_cpu_cap().  Here we just set the max_rmid for the boot_cpu
+        * in case CQM bits really aren't there in this CPU.
+        */
+       if (c != &boot_cpu_data) {
+               boot_cpu_data.x86_cache_max_rmid =
+                       min(boot_cpu_data.x86_cache_max_rmid,
+                           c->x86_cache_max_rmid);
+       }
+ }
  /*
   * This does the hard work of actually picking apart the CPU stuff...
   */
@@@ -923,6 -960,7 +961,7 @@@ static void identify_cpu(struct cpuinfo
  
        init_hypervisor(c);
        x86_init_rdrand(c);
+       x86_init_cache_qos(c);
  
        /*
         * Clear/Set all flags overriden by options, need do it
@@@ -1340,7 -1378,7 +1379,7 @@@ void cpu_init(void
        barrier();
  
        x86_configure_nx();
 -      enable_x2apic();
 +      x2apic_setup();
  
        /*
         * set up and load the per-CPU TSS
@@@ -1396,12 -1434,6 +1435,12 @@@ void cpu_init(void
  
        wait_for_master_cpu(cpu);
  
 +      /*
 +       * Initialize the CR4 shadow before doing anything that could
 +       * try to read it.
 +       */
 +      cr4_init_shadow();
 +
        show_ucode_info_early();
  
        printk(KERN_INFO "Initializing CPU#%d\n", cpu);
@@@ -53,6 -53,7 +53,7 @@@ struct perf_guest_info_callbacks 
  #include <linux/sysfs.h>
  #include <linux/perf_regs.h>
  #include <linux/workqueue.h>
+ #include <linux/cgroup.h>
  #include <asm/local.h>
  
  struct perf_callchain_entry {
@@@ -118,10 -119,16 +119,16 @@@ struct hw_perf_event 
                        struct hrtimer  hrtimer;
                };
                struct { /* tracepoint */
-                       struct task_struct      *tp_target;
                        /* for tp_event->class */
                        struct list_head        tp_list;
                };
+               struct { /* intel_cqm */
+                       int                     cqm_state;
+                       int                     cqm_rmid;
+                       struct list_head        cqm_events_entry;
+                       struct list_head        cqm_groups_entry;
+                       struct list_head        cqm_group_entry;
+               };
  #ifdef CONFIG_HAVE_HW_BREAKPOINT
                struct { /* breakpoint */
                        /*
                         * problem hw_breakpoint has with context
                         * creation and event initalization.
                         */
-                       struct task_struct              *bp_target;
                        struct arch_hw_breakpoint       info;
                        struct list_head                bp_list;
                };
  #endif
        };
+       struct task_struct              *target;
        int                             state;
        local64_t                       prev_count;
        u64                             sample_period;
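
The hw_perf_event change pulls the task pointer out of the tracepoint and breakpoint members of the anonymous union into a single shared 'target' field, so generic code no longer cares which flavour of event it is looking at. A simplified, self-contained model of that layout follows (stand-in types, not kernel code).

/*
 * Simplified model of the consolidation above: the per-flavour
 * *_target pointers collapse into one shared member outside the
 * anonymous union.
 */
#include <stdio.h>

struct task;				/* stand-in for struct task_struct */

struct hw_event {
	union {
		struct {		/* tracepoint */
			void *tp_list;
		};
		struct {		/* breakpoint */
			void *bp_info;
			void *bp_list;
		};
	};
	struct task *target;		/* shared: who the event accounts to */
	int state;
};

int main(void)
{
	struct hw_event ev = { .target = NULL, .state = 0 };

	/* ev.target is valid regardless of which union flavour is active */
	printf("sizeof(struct hw_event) = %zu\n", sizeof(ev));
	return 0;
}
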
@@@ -271,6 -278,11 +278,11 @@@ struct pmu 
         */
        size_t                          task_ctx_size;
  
+       /*
+        * Return the count value for a counter.
+        */
+       u64 (*count)                    (struct perf_event *event); /*optional*/
  };
  
  /**
@@@ -547,6 -559,35 +559,35 @@@ struct perf_output_handle 
        int                             page;
  };
  
+ #ifdef CONFIG_CGROUP_PERF
+ /*
+  * perf_cgroup_info keeps track of time_enabled for a cgroup.
+  * This is a per-cpu dynamically allocated data structure.
+  */
+ struct perf_cgroup_info {
+       u64                             time;
+       u64                             timestamp;
+ };
+ struct perf_cgroup {
+       struct cgroup_subsys_state      css;
+       struct perf_cgroup_info __percpu *info;
+ };
+ /*
+  * Must ensure cgroup is pinned (css_get) before calling
+  * this function. In other words, we cannot call this function
+  * if there is no cgroup event for the current CPU context.
+  */
+ static inline struct perf_cgroup *
+ perf_cgroup_from_task(struct task_struct *task)
+ {
+       return container_of(task_css(task, perf_event_cgrp_id),
+                           struct perf_cgroup, css);
+ }
+ #endif /* CONFIG_CGROUP_PERF */
  #ifdef CONFIG_PERF_EVENTS
  
  extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
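
perf_cgroup_from_task(), now exposed in the header, is the usual container_of() trick: given the cgroup_subsys_state embedded in a perf_cgroup, step back to the enclosing structure. A standalone illustration of the pattern with simplified stand-in types:

/*
 * Illustration only: recover the enclosing object from a pointer to an
 * embedded member, as perf_cgroup_from_task() does via task_css().
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct css { int refcount; };		/* stand-in for cgroup_subsys_state */

struct perf_cgroup_demo {
	struct css css;
	void *info;
};

int main(void)
{
	struct perf_cgroup_demo cg = { .css = { .refcount = 1 } };
	struct css *p = &cg.css;	/* what task_css() would hand back */
	struct perf_cgroup_demo *back =
		container_of(p, struct perf_cgroup_demo, css);

	printf("recovered the right object: %d\n", back == &cg);
	return 0;
}
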
@@@ -740,6 -781,11 +781,11 @@@ static inline void perf_event_task_sche
                __perf_event_task_sched_out(prev, next);
  }
  
+ static inline u64 __perf_event_count(struct perf_event *event)
+ {
+       return local64_read(&event->count) + atomic64_read(&event->child_count);
+ }
  extern void perf_event_mmap(struct vm_area_struct *vma);
  extern struct perf_guest_info_callbacks *perf_guest_cbs;
  extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
@@@ -928,22 -974,12 +974,22 @@@ struct perf_pmu_events_attr 
        const char *event_str;
  };
  
 +ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
 +                            char *page);
 +
  #define PMU_EVENT_ATTR(_name, _var, _id, _show)                               \
  static struct perf_pmu_events_attr _var = {                           \
        .attr = __ATTR(_name, 0444, _show, NULL),                       \
        .id   =  _id,                                                   \
  };
  
 +#define PMU_EVENT_ATTR_STRING(_name, _var, _str)                          \
 +static struct perf_pmu_events_attr _var = {                               \
 +      .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
 +      .id             = 0,                                                \
 +      .event_str      = _str,                                             \
 +};
 +
  #define PMU_FORMAT_ATTR(_name, _format)                                       \
  static ssize_t                                                                \
  _name##_show(struct device *dev,                                      \
diff --combined kernel/events/core.c
  #include <linux/syscalls.h>
  #include <linux/anon_inodes.h>
  #include <linux/kernel_stat.h>
+ #include <linux/cgroup.h>
  #include <linux/perf_event.h>
  #include <linux/ftrace_event.h>
  #include <linux/hw_breakpoint.h>
  #include <linux/mm_types.h>
- #include <linux/cgroup.h>
  #include <linux/module.h>
  #include <linux/mman.h>
  #include <linux/compat.h>
@@@ -351,32 -351,6 +351,6 @@@ static void perf_ctx_unlock(struct perf
  
  #ifdef CONFIG_CGROUP_PERF
  
- /*
-  * perf_cgroup_info keeps track of time_enabled for a cgroup.
-  * This is a per-cpu dynamically allocated data structure.
-  */
- struct perf_cgroup_info {
-       u64                             time;
-       u64                             timestamp;
- };
- struct perf_cgroup {
-       struct cgroup_subsys_state      css;
-       struct perf_cgroup_info __percpu *info;
- };
- /*
-  * Must ensure cgroup is pinned (css_get) before calling
-  * this function. In other words, we cannot call this function
-  * if there is no cgroup event for the current CPU context.
-  */
- static inline struct perf_cgroup *
- perf_cgroup_from_task(struct task_struct *task)
- {
-       return container_of(task_css(task, perf_event_cgrp_id),
-                           struct perf_cgroup, css);
- }
  static inline bool
  perf_cgroup_match(struct perf_event *event)
  {
@@@ -3220,7 -3194,10 +3194,10 @@@ static void __perf_event_read(void *inf
  
  static inline u64 perf_event_count(struct perf_event *event)
  {
-       return local64_read(&event->count) + atomic64_read(&event->child_count);
+       if (event->pmu->count)
+               return event->pmu->count(event);
+       return __perf_event_count(event);
  }
  
  static u64 perf_event_read(struct perf_event *event)
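
perf_event_count() now defers to the optional pmu->count() hook introduced in the struct pmu hunk earlier (intel_cqm uses it to return an RMID-based occupancy value) and otherwise falls back to __perf_event_count(), the local plus inherited-child sum. A simplified user-space model of that dispatch, with stand-in types and a made-up hardware-backed counter:

/*
 * Not kernel code: models the "optional callback with generic fallback"
 * dispatch added above.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_event;

struct demo_pmu {
	uint64_t (*count)(struct demo_event *event);	/* optional */
};

struct demo_event {
	struct demo_pmu *pmu;
	uint64_t count;			/* local64_t in the kernel */
	uint64_t child_count;		/* atomic64_t in the kernel */
};

static uint64_t generic_count(struct demo_event *event)
{
	return event->count + event->child_count;
}

static uint64_t event_count(struct demo_event *event)
{
	if (event->pmu->count)
		return event->pmu->count(event);
	return generic_count(event);
}

static uint64_t fake_rmid_read(struct demo_event *event)
{
	(void)event;
	return 4096;			/* pretend hardware-backed value */
}

int main(void)
{
	struct demo_pmu hw = { .count = fake_rmid_read };
	struct demo_pmu sw = { .count = NULL };
	struct demo_event a = { .pmu = &hw, .count = 1, .child_count = 2 };
	struct demo_event b = { .pmu = &sw, .count = 1, .child_count = 2 };

	printf("hw-backed: %llu, generic: %llu\n",
	       (unsigned long long)event_count(&a),
	       (unsigned long long)event_count(&b));
	return 0;
}
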
@@@ -3610,7 -3587,7 +3587,7 @@@ static void put_event(struct perf_even
        ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
        WARN_ON_ONCE(ctx->parent_ctx);
        perf_remove_from_context(event, true);
 -      mutex_unlock(&ctx->mutex);
 +      perf_event_ctx_unlock(event, ctx);
  
        _free_event(event);
  }
@@@ -4446,7 -4423,7 +4423,7 @@@ static int perf_mmap(struct file *file
         * If we have rb pages ensure they're a power-of-two number, so we
         * can do bitmasks instead of modulo.
         */
 -      if (!is_power_of_2(nr_pages))
 +      if (nr_pages != 0 && !is_power_of_2(nr_pages))
                return -EINVAL;
  
        if (vma_size != PAGE_SIZE * (1 + nr_pages))
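
The extra nr_pages != 0 test matters because is_power_of_2(0) is false, while mapping only the control page (no data pages) is a legitimate request that the old check wrongly rejected. A small standalone check of that corner case, re-implementing the helper for illustration:

/*
 * Compares the old and new perf_mmap() page-count checks.
 */
#include <stdbool.h>
#include <stdio.h>

static bool is_power_of_2(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
	unsigned long nr_pages[] = { 0, 1, 3, 4 };

	for (unsigned int i = 0; i < 4; i++) {
		unsigned long n = nr_pages[i];
		bool old_reject = !is_power_of_2(n);
		bool new_reject = n != 0 && !is_power_of_2(n);

		printf("nr_pages=%lu  old:%s  new:%s\n", n,
		       old_reject ? "-EINVAL" : "ok",
		       new_reject ? "-EINVAL" : "ok");
	}
	return 0;
}
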
@@@ -4593,13 -4570,6 +4570,13 @@@ static void perf_pending_event(struct i
  {
        struct perf_event *event = container_of(entry,
                        struct perf_event, pending);
 +      int rctx;
 +
 +      rctx = perf_swevent_get_recursion_context();
 +      /*
 +       * If we 'fail' here, that's OK, it means recursion is already disabled
 +       * and we won't recurse 'further'.
 +       */
  
        if (event->pending_disable) {
                event->pending_disable = 0;
                event->pending_wakeup = 0;
                perf_event_wakeup(event);
        }
 +
 +      if (rctx >= 0)
 +              perf_swevent_put_recursion_context(rctx);
  }
  
  /*
@@@ -7149,7 -7116,7 +7126,7 @@@ perf_event_alloc(struct perf_event_att
                 struct perf_event *group_leader,
                 struct perf_event *parent_event,
                 perf_overflow_handler_t overflow_handler,
-                void *context)
+                void *context, int cgroup_fd)
  {
        struct pmu *pmu;
        struct perf_event *event;
  
        if (task) {
                event->attach_state = PERF_ATTACH_TASK;
-               if (attr->type == PERF_TYPE_TRACEPOINT)
-                       event->hw.tp_target = task;
- #ifdef CONFIG_HAVE_HW_BREAKPOINT
                /*
-                * hw_breakpoint is a bit difficult here..
+                * XXX pmu::event_init needs to know what task to account to
+                * and we cannot use the ctx information because we need the
+                * pmu before we get a ctx.
                 */
-               else if (attr->type == PERF_TYPE_BREAKPOINT)
-                       event->hw.bp_target = task;
- #endif
+               event->hw.target = task;
        }
  
        if (!overflow_handler && parent_event) {
        if (!has_branch_stack(event))
                event->attr.branch_sample_type = 0;
  
+       if (cgroup_fd != -1) {
+               err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+               if (err)
+                       goto err_ns;
+       }
        pmu = perf_init_event(event);
        if (!pmu)
                goto err_ns;
@@@ -7268,6 -7237,8 +7247,8 @@@ err_pmu
                event->destroy(event);
        module_put(pmu->module);
  err_ns:
+       if (is_cgroup_event(event))
+               perf_detach_cgroup(event);
        if (event->ns)
                put_pid_ns(event->ns);
        kfree(event);
@@@ -7486,6 -7457,7 +7467,7 @@@ SYSCALL_DEFINE5(perf_event_open
        int move_group = 0;
        int err;
        int f_flags = O_RDWR;
+       int cgroup_fd = -1;
  
        /* for future expandability... */
        if (flags & ~PERF_FLAG_ALL)
  
        get_online_cpus();
  
+       if (flags & PERF_FLAG_PID_CGROUP)
+               cgroup_fd = pid;
        event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
-                                NULL, NULL);
+                                NULL, NULL, cgroup_fd);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err_cpus;
        }
  
-       if (flags & PERF_FLAG_PID_CGROUP) {
-               err = perf_cgroup_connect(pid, event, &attr, group_leader);
-               if (err) {
-                       __free_event(event);
-                       goto err_cpus;
-               }
-       }
        if (is_sampling_event(event)) {
                if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
                        err = -ENOTSUPP;
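
With perf_cgroup_connect() now called from perf_event_alloc(), the syscall path merely translates its pid argument into a cgroup fd when PERF_FLAG_PID_CGROUP is set; the user-space contract is unchanged. A hedged sketch of that contract, with an illustrative, system-dependent cgroup path:

/*
 * User-space view of the PERF_FLAG_PID_CGROUP path: the "pid" argument
 * is an open fd of the perf_event cgroup directory.
 */
#include <fcntl.h>
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	int cgrp_fd, ev_fd;

	cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
	if (cgrp_fd < 0) {
		perror("open cgroup");
		return 1;
	}

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_SOFTWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_SW_CPU_CLOCK;

	/* cpu must be >= 0 for cgroup events; monitor CPU 0 here */
	ev_fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 0, -1,
			PERF_FLAG_PID_CGROUP);
	if (ev_fd < 0)
		perror("perf_event_open");

	close(cgrp_fd);
	return 0;
}
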
@@@ -7802,7 -7769,7 +7779,7 @@@ perf_event_create_kernel_counter(struc
         */
  
        event = perf_event_alloc(attr, cpu, task, NULL, NULL,
-                                overflow_handler, context);
+                                overflow_handler, context, -1);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err;
@@@ -8163,7 -8130,7 +8140,7 @@@ inherit_event(struct perf_event *parent
                                           parent_event->cpu,
                                           child,
                                           group_leader, parent_event,
-                                          NULL, NULL);
+                                          NULL, NULL, -1);
        if (IS_ERR(child_event))
                return child_event;
  
@@@ -8549,18 -8516,6 +8526,18 @@@ void __init perf_event_init(void
                     != 1024);
  }
  
 +ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
 +                            char *page)
 +{
 +      struct perf_pmu_events_attr *pmu_attr =
 +              container_of(attr, struct perf_pmu_events_attr, attr);
 +
 +      if (pmu_attr->event_str)
 +              return sprintf(page, "%s\n", pmu_attr->event_str);
 +
 +      return 0;
 +}
 +
  static int __init perf_event_sysfs_init(void)
  {
        struct pmu *pmu;
@@@ -1005,7 -1005,7 +1005,7 @@@ __uprobe_perf_filter(struct trace_uprob
                return true;
  
        list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
-               if (event->hw.tp_target->mm == mm)
+               if (event->hw.target->mm == mm)
                        return true;
        }
  
  static inline bool
  uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
  {
-       return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
+       return __uprobe_perf_filter(&tu->filter, event->hw.target->mm);
  }
  
  static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
        bool done;
  
        write_lock(&tu->filter.rwlock);
-       if (event->hw.tp_target) {
+       if (event->hw.target) {
                list_del(&event->hw.tp_list);
                done = tu->filter.nr_systemwide ||
-                       (event->hw.tp_target->flags & PF_EXITING) ||
+                       (event->hw.target->flags & PF_EXITING) ||
                        uprobe_filter_event(tu, event);
        } else {
                tu->filter.nr_systemwide--;
@@@ -1046,7 -1046,7 +1046,7 @@@ static int uprobe_perf_open(struct trac
        int err;
  
        write_lock(&tu->filter.rwlock);
-       if (event->hw.tp_target) {
+       if (event->hw.target) {
                /*
                 * event->parent != NULL means copy_process(), we can avoid
                 * uprobe_apply(). current->mm must be probed and we can rely
@@@ -1321,7 -1321,7 +1321,7 @@@ static __init int init_uprobe_trace(voi
        struct dentry *d_tracer;
  
        d_tracer = tracing_init_dentry();
 -      if (!d_tracer)
 +      if (IS_ERR(d_tracer))
                return 0;
  
        trace_create_file("uprobe_events", 0644, d_tracer,
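
The final hunk adapts to tracing_init_dentry() returning an ERR_PTR-encoded error instead of NULL, so failure is now detected with IS_ERR(). A standalone sketch of that convention, re-implemented here only for illustration (the real helpers live in <linux/err.h>):

/*
 * Errors are encoded in the pointer value itself, so callers test
 * IS_ERR() instead of comparing against NULL.
 */
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *fake_init_dentry(int fail)
{
	static int dentry;		/* stand-in object */

	return fail ? ERR_PTR(-ENOMEM) : (void *)&dentry;
}

int main(void)
{
	void *d = fake_init_dentry(1);

	if (IS_ERR(d))			/* the new-style check */
		printf("init failed: %ld\n", PTR_ERR(d));
	else
		printf("init ok\n");
	return 0;
}
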