Merge branch 'perf/urgent' into perf/core, to pick up dependencies
[cascardo/linux.git] / kernel / events / core.c
index 356a6c7..ca4fde5 100644 (file)
@@ -242,18 +242,6 @@ unlock:
        return ret;
 }
 
-static void event_function_local(struct perf_event *event, event_f func, void *data)
-{
-       struct event_function_struct efs = {
-               .event = event,
-               .func = func,
-               .data = data,
-       };
-
-       int ret = event_function(&efs);
-       WARN_ON_ONCE(ret);
-}
-
 static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
        struct perf_event_context *ctx = event->ctx;
@@ -303,6 +291,54 @@ again:
        raw_spin_unlock_irq(&ctx->lock);
 }
 
+/*
+ * Similar to event_function_call() + event_function(), but hard assumes IRQs
+ * are already disabled and we're on the right CPU.
+ */
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+       struct perf_event_context *ctx = event->ctx;
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct task_struct *task = READ_ONCE(ctx->task);
+       struct perf_event_context *task_ctx = NULL;
+
+       WARN_ON_ONCE(!irqs_disabled());
+
+       if (task) {
+               if (task == TASK_TOMBSTONE)
+                       return;
+
+               task_ctx = ctx;
+       }
+
+       perf_ctx_lock(cpuctx, task_ctx);
+
+       task = ctx->task;
+       if (task == TASK_TOMBSTONE)
+               goto unlock;
+
+       if (task) {
+               /*
+                * We must be either inactive or active and the right task,
+                * otherwise we're screwed, since we cannot IPI to somewhere
+                * else.
+                */
+               if (ctx->is_active) {
+                       if (WARN_ON_ONCE(task != current))
+                               goto unlock;
+
+                       if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
+                               goto unlock;
+               }
+       } else {
+               WARN_ON_ONCE(&cpuctx->ctx != ctx);
+       }
+
+       func(event, cpuctx, ctx, data);
+unlock:
+       perf_ctx_unlock(cpuctx, task_ctx);
+}
+
 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
                       PERF_FLAG_FD_OUTPUT  |\
                       PERF_FLAG_PID_CGROUP |\
@@ -448,7 +484,7 @@ static u64 __report_allowed;
 
 static void perf_duration_warn(struct irq_work *w)
 {
-       printk_ratelimited(KERN_WARNING
+       printk_ratelimited(KERN_INFO
                "perf: interrupt took too long (%lld > %lld), lowering "
                "kernel.perf_event_max_sample_rate to %d\n",
                __report_avg, __report_allowed,
@@ -843,6 +879,32 @@ perf_cgroup_mark_enabled(struct perf_event *event,
                }
        }
 }
+
+/*
+ * Update cpuctx->cgrp so that it is set when first cgroup event is added and
+ * cleared when last cgroup event is removed.
+ */
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+                        struct perf_event_context *ctx, bool add)
+{
+       struct perf_cpu_context *cpuctx;
+
+       if (!is_cgroup_event(event))
+               return;
+
+       if (add && ctx->nr_cgroups++)
+               return;
+       else if (!add && --ctx->nr_cgroups)
+               return;
+       /*
+        * Because cgroup events are always per-cpu events,
+        * this will always be called from the right CPU.
+        */
+       cpuctx = __get_cpu_context(ctx);
+       cpuctx->cgrp = add ? event->cgrp : NULL;
+}
+
 #else /* !CONFIG_CGROUP_PERF */
 
 static inline bool
@@ -920,6 +982,13 @@ perf_cgroup_mark_enabled(struct perf_event *event,
                         struct perf_event_context *ctx)
 {
 }
+
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+                        struct perf_event_context *ctx, bool add)
+{
+}
+
 #endif
 
 /*
@@ -1392,6 +1461,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 static void
 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 {
+
        lockdep_assert_held(&ctx->lock);
 
        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1412,8 +1482,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
                list_add_tail(&event->group_entry, list);
        }
 
-       if (is_cgroup_event(event))
-               ctx->nr_cgroups++;
+       list_update_cgroup_event(event, ctx, true);
 
        list_add_rcu(&event->event_entry, &ctx->event_list);
        ctx->nr_events++;
@@ -1581,8 +1650,6 @@ static void perf_group_attach(struct perf_event *event)
 static void
 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 {
-       struct perf_cpu_context *cpuctx;
-
        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);
 
@@ -1594,20 +1661,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
        event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
-       if (is_cgroup_event(event)) {
-               ctx->nr_cgroups--;
-               /*
-                * Because cgroup events are always per-cpu events, this will
-                * always be called from the right CPU.
-                */
-               cpuctx = __get_cpu_context(ctx);
-               /*
-                * If there are no more cgroup events then clear cgrp to avoid
-                * stale pointer in update_cgrp_time_from_cpuctx().
-                */
-               if (!ctx->nr_cgroups)
-                       cpuctx->cgrp = NULL;
-       }
+       list_update_cgroup_event(event, ctx, false);
 
        ctx->nr_events--;
        if (event->attr.inherit_stat)
@@ -1716,8 +1770,8 @@ static inline int pmu_filter_match(struct perf_event *event)
 static inline int
 event_filter_match(struct perf_event *event)
 {
-       return (event->cpu == -1 || event->cpu == smp_processor_id())
-           && perf_cgroup_match(event) && pmu_filter_match(event);
+       return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
+              perf_cgroup_match(event) && pmu_filter_match(event);
 }
 
 static void
@@ -1737,8 +1791,8 @@ event_sched_out(struct perf_event *event,
         * maintained, otherwise bogus information is return
         * via read() for time_enabled, time_running:
         */
-       if (event->state == PERF_EVENT_STATE_INACTIVE
-           && !event_filter_match(event)) {
+       if (event->state == PERF_EVENT_STATE_INACTIVE &&
+           !event_filter_match(event)) {
                delta = tstamp - event->tstamp_stopped;
                event->tstamp_running += delta;
                event->tstamp_stopped = tstamp;
@@ -1778,6 +1832,8 @@ group_sched_out(struct perf_event *group_event,
        struct perf_event *event;
        int state = group_event->state;
 
+       perf_pmu_disable(ctx->pmu);
+
        event_sched_out(group_event, cpuctx, ctx);
 
        /*
@@ -1786,6 +1842,8 @@ group_sched_out(struct perf_event *group_event,
        list_for_each_entry(event, &group_event->sibling_list, group_entry)
                event_sched_out(event, cpuctx, ctx);
 
+       perf_pmu_enable(ctx->pmu);
+
        if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
                cpuctx->exclusive = 0;
 }
@@ -2236,10 +2294,15 @@ perf_install_in_context(struct perf_event_context *ctx,
 
        lockdep_assert_held(&ctx->mutex);
 
-       event->ctx = ctx;
        if (event->cpu != -1)
                event->cpu = cpu;
 
+       /*
+        * Ensures that if we can observe event->ctx, both the event and ctx
+        * will be 'complete'. See perf_iterate_sb_cpu().
+        */
+       smp_store_release(&event->ctx, ctx);
+
        if (!task) {
                cpu_function_call(cpu, __perf_install_in_context, event);
                return;
@@ -2778,19 +2841,36 @@ unlock:
        }
 }
 
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
 void perf_sched_cb_dec(struct pmu *pmu)
 {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
        this_cpu_dec(perf_sched_cb_usages);
+
+       if (!--cpuctx->sched_cb_usage)
+               list_del(&cpuctx->sched_cb_entry);
 }
 
+
 void perf_sched_cb_inc(struct pmu *pmu)
 {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+       if (!cpuctx->sched_cb_usage++)
+               list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
        this_cpu_inc(perf_sched_cb_usages);
 }
 
 /*
  * This function provides the context switch callback to the lower code
  * layer. It is invoked ONLY when the context switch callback is enabled.
+ *
+ * This callback is relevant even to per-cpu events; for example multi event
+ * PEBS requires this to provide PID/TID information. This requires we flush
+ * all queued PEBS records before we context switch to a new task.
  */
 static void perf_pmu_sched_task(struct task_struct *prev,
                                struct task_struct *next,
@@ -2798,34 +2878,24 @@ static void perf_pmu_sched_task(struct task_struct *prev,
 {
        struct perf_cpu_context *cpuctx;
        struct pmu *pmu;
-       unsigned long flags;
 
        if (prev == next)
                return;
 
-       local_irq_save(flags);
-
-       rcu_read_lock();
-
-       list_for_each_entry_rcu(pmu, &pmus, entry) {
-               if (pmu->sched_task) {
-                       cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-                       perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+       list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+               pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
 
-                       perf_pmu_disable(pmu);
+               if (WARN_ON_ONCE(!pmu->sched_task))
+                       continue;
 
-                       pmu->sched_task(cpuctx->task_ctx, sched_in);
+               perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+               perf_pmu_disable(pmu);
 
-                       perf_pmu_enable(pmu);
+               pmu->sched_task(cpuctx->task_ctx, sched_in);
 
-                       perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-               }
+               perf_pmu_enable(pmu);
+               perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
        }
-
-       rcu_read_unlock();
-
-       local_irq_restore(flags);
 }
 
 static void perf_event_switch(struct task_struct *task,
@@ -3490,9 +3560,10 @@ static int perf_event_read(struct perf_event *event, bool group)
                        .group = group,
                        .ret = 0,
                };
-               smp_call_function_single(event->oncpu,
-                                        __perf_event_read, &data, 1);
-               ret = data.ret;
+               ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1);
+               /* The event must have been read from an online CPU: */
+               WARN_ON_ONCE(ret);
+               ret = ret ? : data.ret;
        } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
                struct perf_event_context *ctx = event->ctx;
                unsigned long flags;
@@ -5969,6 +6040,14 @@ static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
        struct perf_event *event;
 
        list_for_each_entry_rcu(event, &pel->list, sb_list) {
+               /*
+                * Skip events that are not fully formed yet; ensure that
+                * if we observe event->ctx, both event and ctx will be
+                * complete enough. See perf_install_in_context().
+                */
+               if (!smp_load_acquire(&event->ctx))
+                       continue;
+
                if (event->state < PERF_EVENT_STATE_INACTIVE)
                        continue;
                if (!event_filter_match(event))
@@ -6552,15 +6631,6 @@ got_name:
        kfree(buf);
 }
 
-/*
- * Whether this @filter depends on a dynamic object which is not loaded
- * yet or its load addresses are not known.
- */
-static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
-{
-       return filter->filter && filter->inode;
-}
-
 /*
  * Check whether inode and address range match filter criteria.
  */
@@ -6622,6 +6692,13 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
        struct perf_event_context *ctx;
        int ctxn;
 
+       /*
+        * Data tracing isn't supported yet and as such there is no need
+        * to keep track of anything that isn't related to executable code:
+        */
+       if (!(vma->vm_flags & VM_EXEC))
+               return;
+
        rcu_read_lock();
        for_each_task_context_nr(ctxn) {
                ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
@@ -7774,7 +7851,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
        list_for_each_entry(filter, &ifh->list, entry) {
                event->addr_filters_offs[count] = 0;
 
-               if (perf_addr_filter_needs_mmap(filter))
+               /*
+                * Adjust base offset if the filter is associated to a binary
+                * that needs to be mapped:
+                */
+               if (filter->inode)
                        event->addr_filters_offs[count] =
                                perf_addr_filter_apply(filter, mm);
 
@@ -7905,8 +7986,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
                                        goto fail;
                        }
 
-                       if (token == IF_SRC_FILE) {
-                               filename = match_strdup(&args[2]);
+                       if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
+                               int fpos = filter->range ? 2 : 1;
+
+                               filename = match_strdup(&args[fpos]);
                                if (!filename) {
                                        ret = -ENOMEM;
                                        goto fail;
@@ -10354,6 +10437,8 @@ static void __init perf_event_init_all_cpus(void)
 
                INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
                raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
+
+               INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
        }
 }