Merge branch 'perf/x86' into perf/core, because it's ready
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f04daab..b01dfb6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
 #include <linux/syscalls.h>
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
+#include <linux/cgroup.h>
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/mm_types.h>
-#include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/compat.h>
@@ -153,7 +153,7 @@ enum event_type_t {
  */
 struct static_key_deferred perf_sched_events __read_mostly;
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -351,32 +351,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 
 #ifdef CONFIG_CGROUP_PERF
 
-/*
- * perf_cgroup_info keeps track of time_enabled for a cgroup.
- * This is a per-cpu dynamically allocated data structure.
- */
-struct perf_cgroup_info {
-       u64                             time;
-       u64                             timestamp;
-};
-
-struct perf_cgroup {
-       struct cgroup_subsys_state      css;
-       struct perf_cgroup_info __percpu *info;
-};
-
-/*
- * Must ensure cgroup is pinned (css_get) before calling
- * this function. In other words, we cannot call this function
- * if there is no cgroup event for the current CPU context.
- */
-static inline struct perf_cgroup *
-perf_cgroup_from_task(struct task_struct *task)
-{
-       return container_of(task_css(task, perf_event_cgrp_id),
-                           struct perf_cgroup, css);
-}
-
 static inline bool
 perf_cgroup_match(struct perf_event *event)
 {
@@ -905,6 +879,15 @@ static void get_ctx(struct perf_event_context *ctx)
        WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 }
 
+static void free_ctx(struct rcu_head *head)
+{
+       struct perf_event_context *ctx;
+
+       ctx = container_of(head, struct perf_event_context, rcu_head);
+       kfree(ctx->task_ctx_data);
+       kfree(ctx);
+}
+
 static void put_ctx(struct perf_event_context *ctx)
 {
        if (atomic_dec_and_test(&ctx->refcount)) {
@@ -912,7 +895,7 @@ static void put_ctx(struct perf_event_context *ctx)
                        put_ctx(ctx->parent_ctx);
                if (ctx->task)
                        put_task_struct(ctx->task);
-               kfree_rcu(ctx, rcu_head);
+               call_rcu(&ctx->rcu_head, free_ctx);
        }
 }
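
A note on the kfree_rcu() -> call_rcu() switch above: kfree_rcu() can only
free the object that embeds the rcu_head, while a context may now own a
separately allocated task_ctx_data buffer (allocated in find_get_context()
below), so a full RCU callback is needed to free both once the grace period
has elapsed.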
 
@@ -1239,9 +1222,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
        if (is_cgroup_event(event))
                ctx->nr_cgroups++;
 
-       if (has_branch_stack(event))
-               ctx->nr_branch_stack++;
-
        list_add_rcu(&event->event_entry, &ctx->event_list);
        ctx->nr_events++;
        if (event->attr.inherit_stat)
@@ -1408,9 +1388,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
                        cpuctx->cgrp = NULL;
        }
 
-       if (has_branch_stack(event))
-               ctx->nr_branch_stack--;
-
        ctx->nr_events--;
        if (event->attr.inherit_stat)
                ctx->nr_stat--;
@@ -1881,6 +1858,10 @@ event_sched_in(struct perf_event *event,
 
        perf_pmu_disable(event->pmu);
 
+       event->tstamp_running += tstamp - event->tstamp_stopped;
+
+       perf_set_shadow_time(event, ctx, tstamp);
+
        if (event->pmu->add(event, PERF_EF_START)) {
                event->state = PERF_EVENT_STATE_INACTIVE;
                event->oncpu = -1;
@@ -1888,10 +1869,6 @@ event_sched_in(struct perf_event *event,
                goto out;
        }
 
-       event->tstamp_running += tstamp - event->tstamp_stopped;
-
-       perf_set_shadow_time(event, ctx, tstamp);
-
        if (!is_software_event(event))
                cpuctx->active_oncpu++;
        if (!ctx->nr_active++)
@@ -2559,6 +2536,9 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
                        next->perf_event_ctxp[ctxn] = ctx;
                        ctx->task = next;
                        next_ctx->task = task;
+
+                       swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+
                        do_switch = 0;
 
                        perf_event_sync_stat(ctx, next_ctx);
@@ -2577,6 +2557,56 @@ unlock:
        }
 }
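
The swap() added above matters because of the optimized context switch: when
the previous and next task have equivalent contexts, the kernel exchanges the
two tasks' context pointers instead of rescheduling every event. The
PMU-specific task_ctx_data must keep following its task, so the two contexts
exchange those buffers as well.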
 
+void perf_sched_cb_dec(struct pmu *pmu)
+{
+       this_cpu_dec(perf_sched_cb_usages);
+}
+
+void perf_sched_cb_inc(struct pmu *pmu)
+{
+       this_cpu_inc(perf_sched_cb_usages);
+}
+
+/*
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when the context switch callback is enabled.
+ */
+static void perf_pmu_sched_task(struct task_struct *prev,
+                               struct task_struct *next,
+                               bool sched_in)
+{
+       struct perf_cpu_context *cpuctx;
+       struct pmu *pmu;
+       unsigned long flags;
+
+       if (prev == next)
+               return;
+
+       local_irq_save(flags);
+
+       rcu_read_lock();
+
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               if (pmu->sched_task) {
+                       cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+                       perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+                       perf_pmu_disable(pmu);
+
+                       pmu->sched_task(cpuctx->task_ctx, sched_in);
+
+                       perf_pmu_enable(pmu);
+
+                       perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+               }
+       }
+
+       rcu_read_unlock();
+
+       local_irq_restore(flags);
+}
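
For orientation, a sketch of how a PMU driver might consume the new hook; the
my_pmu_* names are illustrative and not part of this patch. A driver that
keeps per-task hardware state (an LBR-style stack, say) implements
->sched_task() and bumps the per-cpu usage count while such an event is
scheduled in:

static void my_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
	/* ctx may be NULL when no task context is on this CPU */
	void *data = ctx ? ctx->task_ctx_data : NULL;

	if (sched_in)
		my_pmu_restore_task_state(data);	/* hypothetical helper */
	else
		my_pmu_save_task_state(data);		/* hypothetical helper */
}

static int my_pmu_add(struct perf_event *event, int flags)
{
	/* ... program the hardware ... */
	perf_sched_cb_inc(event->ctx->pmu);	/* start receiving callbacks */
	return 0;
}

static void my_pmu_del(struct perf_event *event, int flags)
{
	perf_sched_cb_dec(event->ctx->pmu);	/* stop receiving callbacks */
	/* ... stop the hardware ... */
}

Since perf_sched_cb_{inc,dec}() only touch a per-cpu counter, they must run
on the CPU the event is scheduled on, which pmu->add()/->del() guarantee.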
+
 #define for_each_task_context_nr(ctxn)                                 \
        for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
 
@@ -2596,6 +2626,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
 {
        int ctxn;
 
+       if (__this_cpu_read(perf_sched_cb_usages))
+               perf_pmu_sched_task(task, next, false);
+
        for_each_task_context_nr(ctxn)
                perf_event_context_sched_out(task, ctxn, next);
 
@@ -2754,64 +2787,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
        perf_ctx_unlock(cpuctx, ctx);
 }
 
-/*
- * When sampling the branch stack in system-wide mode, it may be necessary
- * to flush the stack on context switch. This happens when the branch
- * stack does not tag its entries with the pid of the current task.
- * Otherwise it becomes impossible to associate a branch entry with a
- * task. This ambiguity is more likely to appear when the branch stack
- * supports priv level filtering and the user sets it to monitor only
- * at the user level (which could be a useful measurement in system-wide
- * mode). In that case, the risk is high of having a branch stack with
- * branch from multiple tasks. Flushing may mean dropping the existing
- * entries or stashing them somewhere in the PMU specific code layer.
- *
- * This function provides the context switch callback to the lower code
- * layer. It is invoked ONLY when there is at least one system-wide context
- * with at least one active event using taken branch sampling.
- */
-static void perf_branch_stack_sched_in(struct task_struct *prev,
-                                      struct task_struct *task)
-{
-       struct perf_cpu_context *cpuctx;
-       struct pmu *pmu;
-       unsigned long flags;
-
-       /* no need to flush branch stack if not changing task */
-       if (prev == task)
-               return;
-
-       local_irq_save(flags);
-
-       rcu_read_lock();
-
-       list_for_each_entry_rcu(pmu, &pmus, entry) {
-               cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-               /*
-                * check if the context has at least one
-                * event using PERF_SAMPLE_BRANCH_STACK
-                */
-               if (cpuctx->ctx.nr_branch_stack > 0
-                   && pmu->flush_branch_stack) {
-
-                       perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-
-                       perf_pmu_disable(pmu);
-
-                       pmu->flush_branch_stack();
-
-                       perf_pmu_enable(pmu);
-
-                       perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-               }
-       }
-
-       rcu_read_unlock();
-
-       local_irq_restore(flags);
-}
-
 /*
  * Called from scheduler to add the events of the current task
  * with interrupts disabled.
@@ -2844,9 +2819,8 @@ void __perf_event_task_sched_in(struct task_struct *prev,
        if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
                perf_cgroup_sched_in(prev, task);
 
-       /* check for system-wide branch_stack events */
-       if (atomic_read(this_cpu_ptr(&perf_branch_stack_events)))
-               perf_branch_stack_sched_in(prev, task);
+       if (__this_cpu_read(perf_sched_cb_usages))
+               perf_pmu_sched_task(prev, task, true);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -3220,7 +3194,10 @@ static void __perf_event_read(void *info)
 
 static inline u64 perf_event_count(struct perf_event *event)
 {
-       return local64_read(&event->count) + atomic64_read(&event->child_count);
+       if (event->pmu->count)
+               return event->pmu->count(event);
+
+       return __perf_event_count(event);
 }
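
The new pmu->count() hook serves PMUs whose value is a point-in-time hardware
read rather than an accumulated local64 delta. A sketch, with hypothetical
my_pmu_* names:

static u64 my_pmu_count(struct perf_event *event)
{
	/* e.g. read an occupancy counter selected by the event's config */
	return my_pmu_read_hw(event->hw.config);
}

static struct pmu my_pmu = {
	/* ... */
	.count	= my_pmu_count,
};

Events without the hook fall back to __perf_event_count(), i.e. the previous
local64_read(&event->count) + atomic64_read(&event->child_count) sum.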
 
 static u64 perf_event_read(struct perf_event *event)
@@ -3321,12 +3298,15 @@ errout:
  * Returns a matching context with refcount and pincount.
  */
 static struct perf_event_context *
-find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
+find_get_context(struct pmu *pmu, struct task_struct *task,
+               struct perf_event *event)
 {
        struct perf_event_context *ctx, *clone_ctx = NULL;
        struct perf_cpu_context *cpuctx;
+       void *task_ctx_data = NULL;
        unsigned long flags;
        int ctxn, err;
+       int cpu = event->cpu;
 
        if (!task) {
                /* Must be root to operate on a CPU event: */
@@ -3354,11 +3334,24 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
        if (ctxn < 0)
                goto errout;
 
+       if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+               task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+               if (!task_ctx_data) {
+                       err = -ENOMEM;
+                       goto errout;
+               }
+       }
+
 retry:
        ctx = perf_lock_task_context(task, ctxn, &flags);
        if (ctx) {
                clone_ctx = unclone_ctx(ctx);
                ++ctx->pin_count;
+
+               if (task_ctx_data && !ctx->task_ctx_data) {
+                       ctx->task_ctx_data = task_ctx_data;
+                       task_ctx_data = NULL;
+               }
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
 
                if (clone_ctx)
@@ -3369,6 +3362,11 @@ retry:
                if (!ctx)
                        goto errout;
 
+               if (task_ctx_data) {
+                       ctx->task_ctx_data = task_ctx_data;
+                       task_ctx_data = NULL;
+               }
+
                err = 0;
                mutex_lock(&task->perf_event_mutex);
                /*
@@ -3395,9 +3393,11 @@ retry:
                }
        }
 
+       kfree(task_ctx_data);
        return ctx;
 
 errout:
+       kfree(task_ctx_data);
        return ERR_PTR(err);
 }
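
How a PMU opts in to the per-task buffer allocated above (a sketch; the
struct and my_pmu_* names are illustrative): set PERF_ATTACH_TASK_DATA from
->event_init() and declare the buffer size in the pmu, and find_get_context()
attaches a zeroed buffer of that size to the task's context:

struct my_pmu_task_ctx {
	u64	callstack[16];	/* per-task hardware state */
	int	depth;
};

static int my_pmu_event_init(struct perf_event *event)
{
	/* ... validate event->attr ... */
	if (my_pmu_needs_task_state(event))	/* hypothetical predicate */
		event->attach_state |= PERF_ATTACH_TASK_DATA;
	return 0;
}

static struct pmu my_pmu = {
	.event_init	= my_pmu_event_init,
	.task_ctx_size	= sizeof(struct my_pmu_task_ctx),
	/* ... */
};

The buffer is then what ->sched_task() sees as ctx->task_ctx_data and what
free_ctx() releases along with the context.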
 
@@ -3423,10 +3423,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
        if (event->parent)
                return;
 
-       if (has_branch_stack(event)) {
-               if (!(event->attach_state & PERF_ATTACH_TASK))
-                       atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
-       }
        if (is_cgroup_event(event))
                atomic_dec(&per_cpu(perf_cgroup_events, cpu));
 }
@@ -3591,7 +3587,7 @@ static void put_event(struct perf_event *event)
        ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
        WARN_ON_ONCE(ctx->parent_ctx);
        perf_remove_from_context(event, true);
-       mutex_unlock(&ctx->mutex);
+       perf_event_ctx_unlock(event, ctx);
 
        _free_event(event);
 }
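
The unlock fix above pairs perf_event_ctx_lock_nested() with its matching
perf_event_ctx_unlock(): the lock helper also takes a reference on the
context, which only the unlock helper drops, so a bare mutex_unlock() here
leaked a context reference.
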
@@ -4574,6 +4570,13 @@ static void perf_pending_event(struct irq_work *entry)
 {
        struct perf_event *event = container_of(entry,
                        struct perf_event, pending);
+       int rctx;
+
+       rctx = perf_swevent_get_recursion_context();
+       /*
+        * If we 'fail' here, that's OK, it means recursion is already disabled
+        * and we won't recurse 'further'.
+        */
 
        if (event->pending_disable) {
                event->pending_disable = 0;
@@ -4584,6 +4587,9 @@ static void perf_pending_event(struct irq_work *entry)
                event->pending_wakeup = 0;
                perf_event_wakeup(event);
        }
+
+       if (rctx >= 0)
+               perf_swevent_put_recursion_context(rctx);
 }
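
Note the deliberate difference from the usual recursion-context pattern: the
pending work must run even when a perf context is already active, so a failed
get is tolerated instead of aborting. The conventional pattern, as used by
the software-event code elsewhere in this file, looks like:

	int rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		return;		/* already inside a perf context on this CPU */
	/* ... emit the event ... */
	perf_swevent_put_recursion_context(rctx);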
 
 /*
@@ -6123,6 +6129,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
        }
 
        hlist_add_head_rcu(&event->hlist_entry, head);
+       perf_event_update_userpage(event);
 
        return 0;
 }
@@ -6592,6 +6599,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags)
 {
        if (flags & PERF_EF_START)
                cpu_clock_event_start(event, flags);
+       perf_event_update_userpage(event);
 
        return 0;
 }
@@ -6666,6 +6674,7 @@ static int task_clock_event_add(struct perf_event *event, int flags)
 {
        if (flags & PERF_EF_START)
                task_clock_event_start(event, flags);
+       perf_event_update_userpage(event);
 
        return 0;
 }
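
The three perf_event_update_userpage() calls added above (in
perf_swevent_add(), cpu_clock_event_add() and task_clock_event_add()) keep
the mmap'd self-monitoring page current for software events, as hardware
PMUs already do. For context, userspace reads that page with the documented
seqlock-style loop; a minimal sketch of the standard ABI, with barrier()
standing in for a compiler barrier:

	struct perf_event_mmap_page *pc = ring_base;	/* first mmap'd page */
	u32 seq;
	u64 enabled, running;

	do {
		seq = pc->lock;
		barrier();
		/* pc->index == 0 means no userspace read; use read(2) */
		enabled = pc->time_enabled;
		running = pc->time_running;
		barrier();
	} while (pc->lock != seq);
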
@@ -7079,10 +7088,6 @@ static void account_event_cpu(struct perf_event *event, int cpu)
        if (event->parent)
                return;
 
-       if (has_branch_stack(event)) {
-               if (!(event->attach_state & PERF_ATTACH_TASK))
-                       atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
-       }
        if (is_cgroup_event(event))
                atomic_inc(&per_cpu(perf_cgroup_events, cpu));
 }
@@ -7121,7 +7126,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
                 struct perf_event *group_leader,
                 struct perf_event *parent_event,
                 perf_overflow_handler_t overflow_handler,
-                void *context)
+                void *context, int cgroup_fd)
 {
        struct pmu *pmu;
        struct perf_event *event;
@@ -7176,16 +7181,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
        if (task) {
                event->attach_state = PERF_ATTACH_TASK;
-
-               if (attr->type == PERF_TYPE_TRACEPOINT)
-                       event->hw.tp_target = task;
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
                /*
-                * hw_breakpoint is a bit difficult here..
+                * XXX pmu::event_init needs to know what task to account to
+                * and we cannot use the ctx information because we need the
+                * pmu before we get a ctx.
                 */
-               else if (attr->type == PERF_TYPE_BREAKPOINT)
-                       event->hw.bp_target = task;
-#endif
+               event->hw.target = task;
        }
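
The removed tp_target/bp_target fields are folded into a single hw.target:
tracepoint and breakpoint code can read the target task from the same member
regardless of event type, and pmu->event_init() gets at the task before any
context exists, which is what the XXX comment above is pointing at.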
 
        if (!overflow_handler && parent_event) {
@@ -7214,6 +7215,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
                goto err_ns;
 
+       if (!has_branch_stack(event))
+               event->attr.branch_sample_type = 0;
+
+       if (cgroup_fd != -1) {
+               err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+               if (err)
+                       goto err_ns;
+       }
+
        pmu = perf_init_event(event);
        if (!pmu)
                goto err_ns;
@@ -7237,6 +7247,8 @@ err_pmu:
                event->destroy(event);
        module_put(pmu->module);
 err_ns:
+       if (is_cgroup_event(event))
+               perf_detach_cgroup(event);
        if (event->ns)
                put_pid_ns(event->ns);
        kfree(event);
@@ -7455,6 +7467,7 @@ SYSCALL_DEFINE5(perf_event_open,
        int move_group = 0;
        int err;
        int f_flags = O_RDWR;
+       int cgroup_fd = -1;
 
        /* for future expandability... */
        if (flags & ~PERF_FLAG_ALL)
@@ -7520,21 +7533,16 @@ SYSCALL_DEFINE5(perf_event_open,
 
        get_online_cpus();
 
+       if (flags & PERF_FLAG_PID_CGROUP)
+               cgroup_fd = pid;
+
        event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
-                                NULL, NULL);
+                                NULL, NULL, cgroup_fd);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err_cpus;
        }
 
-       if (flags & PERF_FLAG_PID_CGROUP) {
-               err = perf_cgroup_connect(pid, event, &attr, group_leader);
-               if (err) {
-                       __free_event(event);
-                       goto err_cpus;
-               }
-       }
-
        if (is_sampling_event(event)) {
                if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
                        err = -ENOTSUPP;
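
With perf_cgroup_connect() moved into perf_event_alloc(), the event is
attached to its cgroup before pmu->event_init() runs, and the err_ns path
shown earlier detaches it on failure. The userspace ABI is unchanged; with
PERF_FLAG_PID_CGROUP the pid argument is still a cgroup directory fd, e.g.
(path illustrative):

	int cfd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
	int fd  = syscall(__NR_perf_event_open, &attr, cfd, cpu,
			  -1 /* group_fd */, PERF_FLAG_PID_CGROUP);
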
@@ -7576,7 +7584,7 @@ SYSCALL_DEFINE5(perf_event_open,
        /*
         * Get the target context (task or percpu):
         */
-       ctx = find_get_context(pmu, task, event->cpu);
+       ctx = find_get_context(pmu, task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
                goto err_alloc;
@@ -7771,7 +7779,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
         */
 
        event = perf_event_alloc(attr, cpu, task, NULL, NULL,
-                                overflow_handler, context);
+                                overflow_handler, context, -1);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err;
@@ -7782,7 +7790,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
        account_event(event);
 
-       ctx = find_get_context(event->pmu, task, cpu);
+       ctx = find_get_context(event->pmu, task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
                goto err_free;
@@ -8132,7 +8140,7 @@ inherit_event(struct perf_event *parent_event,
                                           parent_event->cpu,
                                           child,
                                           group_leader, parent_event,
-                                          NULL, NULL);
+                                          NULL, NULL, -1);
        if (IS_ERR(child_event))
                return child_event;