Merge branch 'perf/fast' into perf/core
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 58690af..de859fb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4,7 +4,7 @@
  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  *
  * For licensing details see kernel-base/COPYING
  */
@@ -128,7 +128,7 @@ enum event_type_t {
  * perf_sched_events : >0 events exist
  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
  */
-struct jump_label_key perf_sched_events __read_mostly;
+struct jump_label_key_deferred perf_sched_events __read_mostly;
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -815,7 +815,7 @@ static void update_event_times(struct perf_event *event)
         * here.
         */
        if (is_cgroup_event(event))
-               run_end = perf_event_time(event);
+               run_end = perf_cgroup_event_time(event);
        else if (ctx->is_active)
                run_end = ctx->time;
        else
@@ -1130,6 +1130,8 @@ event_sched_out(struct perf_event *event,
        if (!is_software_event(event))
                cpuctx->active_oncpu--;
        ctx->nr_active--;
+       if (event->attr.freq && event->attr.sample_freq)
+               ctx->nr_freq--;
        if (event->attr.exclusive || !cpuctx->active_oncpu)
                cpuctx->exclusive = 0;
 }
@@ -1325,6 +1327,7 @@ retry:
        }
        raw_spin_unlock_irq(&ctx->lock);
 }
+EXPORT_SYMBOL_GPL(perf_event_disable);
 
 static void perf_set_shadow_time(struct perf_event *event,
                                 struct perf_event_context *ctx,
@@ -1406,6 +1409,8 @@ event_sched_in(struct perf_event *event,
        if (!is_software_event(event))
                cpuctx->active_oncpu++;
        ctx->nr_active++;
+       if (event->attr.freq && event->attr.sample_freq)
+               ctx->nr_freq++;
 
        if (event->attr.exclusive)
                cpuctx->exclusive = 1;
@@ -1662,8 +1667,7 @@ retry:
  * Note: this works for group members as well as group leaders
  * since the non-leader members' sibling_lists will be empty.
  */
-static void __perf_event_mark_enabled(struct perf_event *event,
-                                       struct perf_event_context *ctx)
+static void __perf_event_mark_enabled(struct perf_event *event)
 {
        struct perf_event *sub;
        u64 tstamp = perf_event_time(event);
@@ -1701,7 +1705,7 @@ static int __perf_event_enable(void *info)
         */
        perf_cgroup_set_timestamp(current, ctx);
 
-       __perf_event_mark_enabled(event, ctx);
+       __perf_event_mark_enabled(event);
 
        if (!event_filter_match(event)) {
                if (is_cgroup_event(event))
@@ -1782,7 +1786,7 @@ void perf_event_enable(struct perf_event *event)
 
 retry:
        if (!ctx->is_active) {
-               __perf_event_mark_enabled(event, ctx);
+               __perf_event_mark_enabled(event);
                goto out;
        }
 
@@ -1809,6 +1813,7 @@ retry:
 out:
        raw_spin_unlock_irq(&ctx->lock);
 }
+EXPORT_SYMBOL_GPL(perf_event_enable);
 
 int perf_event_refresh(struct perf_event *event, int refresh)
 {
@@ -2327,6 +2332,9 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
        u64 interrupts, now;
        s64 delta;
 
+       if (!ctx->nr_freq)
+               return;
+
        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (event->state != PERF_EVENT_STATE_ACTIVE)
                        continue;
@@ -2382,12 +2390,14 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
        u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
        struct perf_event_context *ctx = NULL;
-       int rotate = 0, remove = 1;
+       int rotate = 0, remove = 1, freq = 0;
 
        if (cpuctx->ctx.nr_events) {
                remove = 0;
                if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
                        rotate = 1;
+               if (cpuctx->ctx.nr_freq)
+                       freq = 1;
        }
 
        ctx = cpuctx->task_ctx;
@@ -2395,33 +2405,40 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
                remove = 0;
                if (ctx->nr_events != ctx->nr_active)
                        rotate = 1;
+               if (ctx->nr_freq)
+                       freq = 1;
        }
 
+       if (!rotate && !freq)
+               goto done;
+
        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(cpuctx->ctx.pmu);
-       perf_ctx_adjust_freq(&cpuctx->ctx, interval);
-       if (ctx)
-               perf_ctx_adjust_freq(ctx, interval);
 
-       if (!rotate)
-               goto done;
+       if (freq) {
+               perf_ctx_adjust_freq(&cpuctx->ctx, interval);
+               if (ctx)
+                       perf_ctx_adjust_freq(ctx, interval);
+       }
 
-       cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
-       if (ctx)
-               ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+       if (rotate) {
+               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+               if (ctx)
+                       ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
 
-       rotate_ctx(&cpuctx->ctx);
-       if (ctx)
-               rotate_ctx(ctx);
+               rotate_ctx(&cpuctx->ctx);
+               if (ctx)
+                       rotate_ctx(ctx);
 
-       perf_event_sched_in(cpuctx, ctx, current);
+               perf_event_sched_in(cpuctx, ctx, current);
+       }
+
+       perf_pmu_enable(cpuctx->ctx.pmu);
+       perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 
 done:
        if (remove)
                list_del_init(&cpuctx->rotation_list);
-
-       perf_pmu_enable(cpuctx->ctx.pmu);
-       perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
 void perf_event_task_tick(void)
@@ -2448,7 +2465,7 @@ static int event_enable_on_exec(struct perf_event *event,
        if (event->state >= PERF_EVENT_STATE_INACTIVE)
                return 0;
 
-       __perf_event_mark_enabled(event, ctx);
+       __perf_event_mark_enabled(event);
 
        return 1;
 }
@@ -2480,13 +2497,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
        raw_spin_lock(&ctx->lock);
        task_ctx_sched_out(ctx);
 
-       list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
-               ret = event_enable_on_exec(event, ctx);
-               if (ret)
-                       enabled = 1;
-       }
-
-       list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+       list_for_each_entry(event, &ctx->event_list, event_entry) {
                ret = event_enable_on_exec(event, ctx);
                if (ret)
                        enabled = 1;
@@ -2573,215 +2584,6 @@ static u64 perf_event_read(struct perf_event *event)
        return perf_event_count(event);
 }
 
-/*
- * Callchain support
- */
-
-struct callchain_cpus_entries {
-       struct rcu_head                 rcu_head;
-       struct perf_callchain_entry     *cpu_entries[0];
-};
-
-static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
-static atomic_t nr_callchain_events;
-static DEFINE_MUTEX(callchain_mutex);
-struct callchain_cpus_entries *callchain_cpus_entries;
-
-
-__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
-                                 struct pt_regs *regs)
-{
-}
-
-__weak void perf_callchain_user(struct perf_callchain_entry *entry,
-                               struct pt_regs *regs)
-{
-}
-
-static void release_callchain_buffers_rcu(struct rcu_head *head)
-{
-       struct callchain_cpus_entries *entries;
-       int cpu;
-
-       entries = container_of(head, struct callchain_cpus_entries, rcu_head);
-
-       for_each_possible_cpu(cpu)
-               kfree(entries->cpu_entries[cpu]);
-
-       kfree(entries);
-}
-
-static void release_callchain_buffers(void)
-{
-       struct callchain_cpus_entries *entries;
-
-       entries = callchain_cpus_entries;
-       rcu_assign_pointer(callchain_cpus_entries, NULL);
-       call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
-}
-
-static int alloc_callchain_buffers(void)
-{
-       int cpu;
-       int size;
-       struct callchain_cpus_entries *entries;
-
-       /*
-        * We can't use the percpu allocation API for data that can be
-        * accessed from NMI. Use a temporary manual per cpu allocation
-        * until that gets sorted out.
-        */
-       size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
-
-       entries = kzalloc(size, GFP_KERNEL);
-       if (!entries)
-               return -ENOMEM;
-
-       size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
-
-       for_each_possible_cpu(cpu) {
-               entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
-                                                        cpu_to_node(cpu));
-               if (!entries->cpu_entries[cpu])
-                       goto fail;
-       }
-
-       rcu_assign_pointer(callchain_cpus_entries, entries);
-
-       return 0;
-
-fail:
-       for_each_possible_cpu(cpu)
-               kfree(entries->cpu_entries[cpu]);
-       kfree(entries);
-
-       return -ENOMEM;
-}
-
-static int get_callchain_buffers(void)
-{
-       int err = 0;
-       int count;
-
-       mutex_lock(&callchain_mutex);
-
-       count = atomic_inc_return(&nr_callchain_events);
-       if (WARN_ON_ONCE(count < 1)) {
-               err = -EINVAL;
-               goto exit;
-       }
-
-       if (count > 1) {
-               /* If the allocation failed, give up */
-               if (!callchain_cpus_entries)
-                       err = -ENOMEM;
-               goto exit;
-       }
-
-       err = alloc_callchain_buffers();
-       if (err)
-               release_callchain_buffers();
-exit:
-       mutex_unlock(&callchain_mutex);
-
-       return err;
-}
-
-static void put_callchain_buffers(void)
-{
-       if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
-               release_callchain_buffers();
-               mutex_unlock(&callchain_mutex);
-       }
-}
-
-static int get_recursion_context(int *recursion)
-{
-       int rctx;
-
-       if (in_nmi())
-               rctx = 3;
-       else if (in_irq())
-               rctx = 2;
-       else if (in_softirq())
-               rctx = 1;
-       else
-               rctx = 0;
-
-       if (recursion[rctx])
-               return -1;
-
-       recursion[rctx]++;
-       barrier();
-
-       return rctx;
-}
-
-static inline void put_recursion_context(int *recursion, int rctx)
-{
-       barrier();
-       recursion[rctx]--;
-}
-
-static struct perf_callchain_entry *get_callchain_entry(int *rctx)
-{
-       int cpu;
-       struct callchain_cpus_entries *entries;
-
-       *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
-       if (*rctx == -1)
-               return NULL;
-
-       entries = rcu_dereference(callchain_cpus_entries);
-       if (!entries)
-               return NULL;
-
-       cpu = smp_processor_id();
-
-       return &entries->cpu_entries[cpu][*rctx];
-}
-
-static void
-put_callchain_entry(int rctx)
-{
-       put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
-}
-
-static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-       int rctx;
-       struct perf_callchain_entry *entry;
-
-
-       entry = get_callchain_entry(&rctx);
-       if (rctx == -1)
-               return NULL;
-
-       if (!entry)
-               goto exit_put;
-
-       entry->nr = 0;
-
-       if (!user_mode(regs)) {
-               perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
-               perf_callchain_kernel(entry, regs);
-               if (current->mm)
-                       regs = task_pt_regs(current);
-               else
-                       regs = NULL;
-       }
-
-       if (regs) {
-               perf_callchain_store(entry, PERF_CONTEXT_USER);
-               perf_callchain_user(entry, regs);
-       }
-
-exit_put:
-       put_callchain_entry(rctx);
-
-       return entry;
-}
-
 /*
  * Initialize the perf_event context in a task_struct:
  */
@@ -2946,7 +2748,7 @@ static void free_event(struct perf_event *event)
 
        if (!event->parent) {
                if (event->attach_state & PERF_ATTACH_TASK)
-                       jump_label_dec(&perf_sched_events);
+                       jump_label_dec_deferred(&perf_sched_events);
                if (event->attr.mmap || event->attr.mmap_data)
                        atomic_dec(&nr_mmap_events);
                if (event->attr.comm)
@@ -2957,7 +2759,7 @@ static void free_event(struct perf_event *event)
                        put_callchain_buffers();
                if (is_cgroup_event(event)) {
                        atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
-                       jump_label_dec(&perf_sched_events);
+                       jump_label_dec_deferred(&perf_sched_events);
                }
        }
 
@@ -3406,10 +3208,6 @@ int perf_event_task_disable(void)
        return 0;
 }
 
-#ifndef PERF_EVENT_INDEX_OFFSET
-# define PERF_EVENT_INDEX_OFFSET 0
-#endif
-
 static int perf_event_index(struct perf_event *event)
 {
        if (event->hw.state & PERF_HES_STOPPED)
@@ -3418,21 +3216,26 @@ static int perf_event_index(struct perf_event *event)
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return 0;
 
-       return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
+       return event->pmu->event_idx(event);
 }
 
 static void calc_timer_values(struct perf_event *event,
+                               u64 *now,
                                u64 *enabled,
                                u64 *running)
 {
-       u64 now, ctx_time;
+       u64 ctx_time;
 
-       now = perf_clock();
-       ctx_time = event->shadow_ctx_time + now;
+       *now = perf_clock();
+       ctx_time = event->shadow_ctx_time + *now;
        *enabled = ctx_time - event->tstamp_enabled;
        *running = ctx_time - event->tstamp_running;
 }
 
+void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+{
+}
+
 /*
  * Callers need to ensure there can be no nesting of this function, otherwise
  * the seqlock logic goes bad. We can not serialize this because the arch
@@ -3442,7 +3245,7 @@ void perf_event_update_userpage(struct perf_event *event)
 {
        struct perf_event_mmap_page *userpg;
        struct ring_buffer *rb;
-       u64 enabled, running;
+       u64 enabled, running, now;
 
        rcu_read_lock();
        /*
@@ -3454,7 +3257,7 @@ void perf_event_update_userpage(struct perf_event *event)
         * because of locking issue as we can be called in
         * NMI context
         */
-       calc_timer_values(event, &enabled, &running);
+       calc_timer_values(event, &now, &enabled, &running);
        rb = rcu_dereference(event->rb);
        if (!rb)
                goto unlock;
@@ -3470,7 +3273,7 @@ void perf_event_update_userpage(struct perf_event *event)
        barrier();
        userpg->index = perf_event_index(event);
        userpg->offset = perf_event_count(event);
-       if (event->state == PERF_EVENT_STATE_ACTIVE)
+       if (userpg->index)
                userpg->offset -= local64_read(&event->hw.prev_count);
 
        userpg->time_enabled = enabled +
@@ -3479,6 +3282,8 @@ void perf_event_update_userpage(struct perf_event *event)
        userpg->time_running = running +
                        atomic64_read(&event->child_total_time_running);
 
+       perf_update_user_clock(userpg, now);
+
        barrier();
        ++userpg->lock;
        preempt_enable();
@@ -3736,6 +3541,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
        event->mmap_user = get_current_user();
        vma->vm_mm->pinned_vm += event->mmap_locked;
 
+       perf_event_update_userpage(event);
+
 unlock:
        if (!ret)
                atomic_inc(&event->mmap_count);
@@ -3967,7 +3774,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 static void perf_output_read(struct perf_output_handle *handle,
                             struct perf_event *event)
 {
-       u64 enabled = 0, running = 0;
+       u64 enabled = 0, running = 0, now;
        u64 read_format = event->attr.read_format;
 
        /*
@@ -3980,7 +3787,7 @@ static void perf_output_read(struct perf_output_handle *handle,
         * NMI context
         */
        if (read_format & PERF_FORMAT_TOTAL_TIMES)
-               calc_timer_values(event, &enabled, &running);
+               calc_timer_values(event, &now, &enabled, &running);
 
        if (event->attr.read_format & PERF_FORMAT_GROUP)
                perf_output_read_group(handle, event, enabled, running);
@@ -4820,7 +4627,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
        struct hw_perf_event *hwc = &event->hw;
        int throttle = 0;
 
-       data->period = event->hw.last_period;
        if (!overflow)
                overflow = perf_swevent_set_period(event);
 
@@ -4854,6 +4660,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
        if (!is_sampling_event(event))
                return;
 
+       if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
+               data->period = nr;
+               return perf_swevent_overflow(event, 1, data, regs);
+       } else
+               data->period = event->hw.last_period;
+
        if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
                return perf_swevent_overflow(event, 1, data, regs);
 
@@ -5187,6 +4999,11 @@ static int perf_swevent_init(struct perf_event *event)
        return 0;
 }
 
+static int perf_swevent_event_idx(struct perf_event *event)
+{
+       return 0;
+}
+
 static struct pmu perf_swevent = {
        .task_ctx_nr    = perf_sw_context,
 
@@ -5196,6 +5013,8 @@ static struct pmu perf_swevent = {
        .start          = perf_swevent_start,
        .stop           = perf_swevent_stop,
        .read           = perf_swevent_read,
+
+       .event_idx      = perf_swevent_event_idx,
 };
 
 #ifdef CONFIG_EVENT_TRACING
@@ -5282,6 +5101,8 @@ static struct pmu perf_tracepoint = {
        .start          = perf_swevent_start,
        .stop           = perf_swevent_stop,
        .read           = perf_swevent_read,
+
+       .event_idx      = perf_swevent_event_idx,
 };
 
 static inline void perf_tp_register(void)
@@ -5366,7 +5187,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
        regs = get_irq_regs();
 
        if (regs && !perf_exclude_event(event, regs)) {
-               if (!(event->attr.exclude_idle && current->pid == 0))
+               if (!(event->attr.exclude_idle && is_idle_task(current)))
                        if (perf_event_overflow(event, &data, regs))
                                ret = HRTIMER_NORESTART;
        }
@@ -5501,6 +5322,8 @@ static struct pmu perf_cpu_clock = {
        .start          = cpu_clock_event_start,
        .stop           = cpu_clock_event_stop,
        .read           = cpu_clock_event_read,
+
+       .event_idx      = perf_swevent_event_idx,
 };
 
 /*
@@ -5573,6 +5396,8 @@ static struct pmu perf_task_clock = {
        .start          = task_clock_event_start,
        .stop           = task_clock_event_stop,
        .read           = task_clock_event_read,
+
+       .event_idx      = perf_swevent_event_idx,
 };
 
 static void perf_pmu_nop_void(struct pmu *pmu)
@@ -5600,6 +5425,11 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
        perf_pmu_enable(pmu);
 }
 
+static int perf_event_idx_default(struct perf_event *event)
+{
+       return event->hw.idx + 1;
+}
+
 /*
  * Ensures all contexts with the same task_ctx_nr have the same
  * pmu_cpu_context too.
@@ -5686,6 +5516,7 @@ static int pmu_dev_alloc(struct pmu *pmu)
        if (!pmu->dev)
                goto out;
 
+       pmu->dev->groups = pmu->attr_groups;
        device_initialize(pmu->dev);
        ret = dev_set_name(pmu->dev, "%s", pmu->name);
        if (ret)
@@ -5789,6 +5620,9 @@ got_cpu_context:
                pmu->pmu_disable = perf_pmu_nop_void;
        }
 
+       if (!pmu->event_idx)
+               pmu->event_idx = perf_event_idx_default;
+
        list_add_rcu(&pmu->entry, &pmus);
        ret = 0;
 unlock:
@@ -5981,7 +5815,7 @@ done:
 
        if (!event->parent) {
                if (event->attach_state & PERF_ATTACH_TASK)
-                       jump_label_inc(&perf_sched_events);
+                       jump_label_inc(&perf_sched_events.key);
                if (event->attr.mmap || event->attr.mmap_data)
                        atomic_inc(&nr_mmap_events);
                if (event->attr.comm)
@@ -6219,7 +6053,7 @@ SYSCALL_DEFINE5(perf_event_open,
                 * - that may need work on context switch
                 */
                atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
-               jump_label_inc(&perf_sched_events);
+               jump_label_inc(&perf_sched_events.key);
        }
 
        /*
@@ -7065,6 +6899,9 @@ void __init perf_event_init(void)
 
        ret = init_hw_breakpoint();
        WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
+
+       /* do not patch jump label more than once per second */
+       jump_label_rate_limit(&perf_sched_events, HZ);
 }
 
 static int __init perf_event_sysfs_init(void)
@@ -7131,10 +6968,13 @@ static int __perf_cgroup_move(void *info)
        return 0;
 }
 
-static void
-perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task)
+static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+                              struct cgroup_taskset *tset)
 {
-       task_function_call(task, __perf_cgroup_move, task);
+       struct task_struct *task;
+
+       cgroup_taskset_for_each(task, cgrp, tset)
+               task_function_call(task, __perf_cgroup_move, task);
 }
 
 static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
@@ -7148,7 +6988,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
        if (!(task->flags & PF_EXITING))
                return;
 
-       perf_cgroup_attach_task(cgrp, task);
+       task_function_call(task, __perf_cgroup_move, task);
 }
 
 struct cgroup_subsys perf_subsys = {
@@ -7157,6 +6997,6 @@ struct cgroup_subsys perf_subsys = {
        .create         = perf_cgroup_create,
        .destroy        = perf_cgroup_destroy,
        .exit           = perf_cgroup_exit,
-       .attach_task    = perf_cgroup_attach_task,
+       .attach         = perf_cgroup_attach,
 };
 #endif /* CONFIG_CGROUP_PERF */