X-Git-Url: http://git.cascardo.info/?a=blobdiff_plain;f=kernel%2Fevents%2Fcore.c;h=5d48306879d55d3221a7789bdd7cc8bdc611969e;hb=a1396555abff9ff9b74c2e4da13e27e81fd094b2;hp=4e2ebf6f2f1f9fcec32b5d30f69267df12ba4654;hpb=33656a1f2ee5346c742d63ddd0e0970c95a56b70;p=cascardo%2Flinux.git diff --git a/kernel/events/core.c b/kernel/events/core.c index 4e2ebf6f2f1f..5d48306879d5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -44,6 +44,8 @@ #include #include #include +#include +#include #include "internal.h" @@ -333,6 +335,7 @@ static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); +static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -351,7 +354,7 @@ static struct srcu_struct pmus_srcu; * 1 - disallow cpu events for unpriv * 2 - disallow kernel profiling for unpriv */ -int sysctl_perf_event_paranoid __read_mostly = 1; +int sysctl_perf_event_paranoid __read_mostly = 2; /* Minimum for 512 kiB + 1 user control page */ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ @@ -394,6 +397,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write, if (ret || !write) return ret; + /* + * If throttling is disabled don't allow the write: + */ + if (sysctl_perf_cpu_time_max_percent == 100 || + sysctl_perf_cpu_time_max_percent == 0) + return -EINVAL; + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; update_perf_cpu_limits(); @@ -1927,8 +1937,13 @@ event_sched_in(struct perf_event *event, if (event->state <= PERF_EVENT_STATE_OFF) return 0; - event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = smp_processor_id(); + WRITE_ONCE(event->oncpu, smp_processor_id()); + /* + * Order event::oncpu write to happen before the ACTIVE state + * is visible. + */ + smp_wmb(); + WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several @@ -2360,6 +2375,112 @@ void perf_event_enable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_enable); +struct stop_event_data { + struct perf_event *event; + unsigned int restart; +}; + +static int __perf_event_stop(void *info) +{ + struct stop_event_data *sd = info; + struct perf_event *event = sd->event; + + /* if it's already INACTIVE, do nothing */ + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) + return 0; + + /* matches smp_wmb() in event_sched_in() */ + smp_rmb(); + + /* + * There is a window with interrupts enabled before we get here, + * so we need to check again lest we try to stop another CPU's event. + */ + if (READ_ONCE(event->oncpu) != smp_processor_id()) + return -EAGAIN; + + event->pmu->stop(event, PERF_EF_UPDATE); + + /* + * May race with the actual stop (through perf_pmu_output_stop()), + * but it is only used for events with AUX ring buffer, and such + * events will refuse to restart because of rb::aux_mmap_count==0, + * see comments in perf_aux_output_begin(). + * + * Since this is happening on a event-local CPU, no trace is lost + * while restarting. + */ + if (sd->restart) + event->pmu->start(event, PERF_EF_START); + + return 0; +} + +static int perf_event_restart(struct perf_event *event) +{ + struct stop_event_data sd = { + .event = event, + .restart = 1, + }; + int ret = 0; + + do { + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) + return 0; + + /* matches smp_wmb() in event_sched_in() */ + smp_rmb(); + + /* + * We only want to restart ACTIVE events, so if the event goes + * inactive here (event->oncpu==-1), there's nothing more to do; + * fall through with ret==-ENXIO. + */ + ret = cpu_function_call(READ_ONCE(event->oncpu), + __perf_event_stop, &sd); + } while (ret == -EAGAIN); + + return ret; +} + +/* + * In order to contain the amount of racy and tricky in the address filter + * configuration management, it is a two part process: + * + * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, + * we update the addresses of corresponding vmas in + * event::addr_filters_offs array and bump the event::addr_filters_gen; + * (p2) when an event is scheduled in (pmu::add), it calls + * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() + * if the generation has changed since the previous call. + * + * If (p1) happens while the event is active, we restart it to force (p2). + * + * (1) perf_addr_filters_apply(): adjusting filters' offsets based on + * pre-existing mappings, called once when new filters arrive via SET_FILTER + * ioctl; + * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly + * registered mapping, called for every new mmap(), with mm::mmap_sem down + * for reading; + * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process + * of exec. + */ +void perf_event_addr_filters_sync(struct perf_event *event) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + + if (!has_addr_filter(event)) + return; + + raw_spin_lock(&ifh->lock); + if (event->addr_filters_gen != event->hw.addr_filters_gen) { + event->pmu->addr_filters_sync(event); + event->hw.addr_filters_gen = event->addr_filters_gen; + } + raw_spin_unlock(&ifh->lock); +} +EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync); + static int _perf_event_refresh(struct perf_event *event, int refresh) { /* @@ -3209,16 +3330,6 @@ out: put_ctx(clone_ctx); } -void perf_event_exec(void) -{ - int ctxn; - - rcu_read_lock(); - for_each_task_context_nr(ctxn) - perf_event_enable_on_exec(ctxn); - rcu_read_unlock(); -} - struct perf_read_data { struct perf_event *event; bool group; @@ -3562,6 +3673,26 @@ static void free_event_rcu(struct rcu_head *head) static void ring_buffer_attach(struct perf_event *event, struct ring_buffer *rb); +static void detach_sb_event(struct perf_event *event) +{ + struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); + + raw_spin_lock(&pel->lock); + list_del_rcu(&event->sb_list); + raw_spin_unlock(&pel->lock); +} + +static void unaccount_pmu_sb_event(struct perf_event *event) +{ + if (event->parent) + return; + + if (event->attach_state & PERF_ATTACH_TASK) + return; + + detach_sb_event(event); +} + static void unaccount_event_cpu(struct perf_event *event, int cpu) { if (event->parent) @@ -3625,6 +3756,8 @@ static void unaccount_event(struct perf_event *event) } unaccount_event_cpu(event, event->cpu); + + unaccount_pmu_sb_event(event); } static void perf_sched_delayed(struct work_struct *work) @@ -3720,6 +3853,9 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } +static void perf_addr_filters_splice(struct perf_event *event, + struct list_head *head); + static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@ -3747,6 +3883,8 @@ static void _free_event(struct perf_event *event) } perf_event_free_bpf_prog(event); + perf_addr_filters_splice(event, NULL); + kfree(event->addr_filters_offs); if (event->destroy) event->destroy(event); @@ -4343,6 +4481,19 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_BPF: return perf_event_set_bpf_prog(event, arg); + case PERF_EVENT_IOC_PAUSE_OUTPUT: { + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + if (!rb || !rb->nr_pages) { + rcu_read_unlock(); + return -EINVAL; + } + rb_toggle_paused(rb, !!arg); + rcu_read_unlock(); + return 0; + } default: return -ENOTTY; } @@ -4659,6 +4810,8 @@ static void perf_mmap_open(struct vm_area_struct *vma) event->pmu->event_mapped(event); } +static void perf_pmu_output_stop(struct perf_event *event); + /* * A buffer can be mmap()ed multiple times; either directly through the same * event, or through other events by use of perf_event_set_output(). @@ -4686,10 +4839,22 @@ static void perf_mmap_close(struct vm_area_struct *vma) */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + /* + * Stop all AUX events that are writing to this buffer, + * so that we can free its AUX pages and corresponding PMU + * data. Note that after rb::aux_mmap_count dropped to zero, + * they won't start any more (see perf_aux_output_begin()). + */ + perf_pmu_output_stop(event); + + /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + /* this has to be the last one */ rb_free_aux(rb); + WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); + mutex_unlock(&event->mmap_mutex); } @@ -5630,9 +5795,13 @@ void perf_prepare_sample(struct perf_event_header *header, } } -void perf_event_output(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) +static void __always_inline +__perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs, + int (*output_begin)(struct perf_output_handle *, + struct perf_event *, + unsigned int)) { struct perf_output_handle handle; struct perf_event_header header; @@ -5642,7 +5811,7 @@ void perf_event_output(struct perf_event *event, perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, event, header.size)) + if (output_begin(&handle, event, header.size)) goto exit; perf_output_sample(&handle, &header, data, event); @@ -5653,6 +5822,30 @@ exit: rcu_read_unlock(); } +void +perf_event_output_forward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin_forward); +} + +void +perf_event_output_backward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin_backward); +} + +void +perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin); +} + /* * read event_id */ @@ -5693,16 +5886,33 @@ perf_event_read_event(struct perf_event *event, perf_output_end(&handle); } -typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); +typedef void (perf_iterate_f)(struct perf_event *event, void *data); static void -perf_event_aux_ctx(struct perf_event_context *ctx, - perf_event_aux_output_cb output, - void *data) +perf_iterate_ctx(struct perf_event_context *ctx, + perf_iterate_f output, + void *data, bool all) { struct perf_event *event; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (!all) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + continue; + if (!event_filter_match(event)) + continue; + } + + output(event, data); + } +} + +static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) +{ + struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); + struct perf_event *event; + + list_for_each_entry_rcu(event, &pel->list, sb_list) { if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) @@ -5711,51 +5921,168 @@ perf_event_aux_ctx(struct perf_event_context *ctx, } } +/* + * Iterate all events that need to receive side-band events. + * + * For new callers; ensure that account_pmu_sb_event() includes + * your event, otherwise it might not get delivered. + */ static void -perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, - struct perf_event_context *task_ctx) +perf_iterate_sb(perf_iterate_f output, void *data, + struct perf_event_context *task_ctx) { + struct perf_event_context *ctx; + int ctxn; + rcu_read_lock(); preempt_disable(); - perf_event_aux_ctx(task_ctx, output, data); + + /* + * If we have task_ctx != NULL we only notify the task context itself. + * The task_ctx is set only for EXIT events before releasing task + * context. + */ + if (task_ctx) { + perf_iterate_ctx(task_ctx, output, data, false); + goto done; + } + + perf_iterate_sb_cpu(output, data); + + for_each_task_context_nr(ctxn) { + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_iterate_ctx(ctx, output, data, false); + } +done: preempt_enable(); rcu_read_unlock(); } -static void -perf_event_aux(perf_event_aux_output_cb output, void *data, - struct perf_event_context *task_ctx) +/* + * Clear all file-based filters at exec, they'll have to be + * re-instated when/if these objects are mmapped again. + */ +static void perf_event_addr_filters_exec(struct perf_event *event, void *data) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct perf_addr_filter *filter; + unsigned int restart = 0, count = 0; + unsigned long flags; + + if (!has_addr_filter(event)) + return; + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + if (filter->inode) { + event->addr_filters_offs[count] = 0; + restart++; + } + + count++; + } + + if (restart) + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + if (restart) + perf_event_restart(event); +} + +void perf_event_exec(void) { - struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - struct pmu *pmu; int ctxn; + rcu_read_lock(); + for_each_task_context_nr(ctxn) { + ctx = current->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + perf_event_enable_on_exec(ctxn); + + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, + true); + } + rcu_read_unlock(); +} + +struct remote_output { + struct ring_buffer *rb; + int err; +}; + +static void __perf_event_output_stop(struct perf_event *event, void *data) +{ + struct perf_event *parent = event->parent; + struct remote_output *ro = data; + struct ring_buffer *rb = ro->rb; + struct stop_event_data sd = { + .event = event, + }; + + if (!has_aux(event)) + return; + + if (!parent) + parent = event; + /* - * If we have task_ctx != NULL we only notify - * the task context itself. The task_ctx is set - * only for EXIT events before releasing task - * context. + * In case of inheritance, it will be the parent that links to the + * ring-buffer, but it will be the child that's actually using it: */ - if (task_ctx) { - perf_event_aux_task_ctx(output, data, task_ctx); - return; - } + if (rcu_dereference(parent->rb) == rb) + ro->err = __perf_event_stop(&sd); +} + +static int __perf_pmu_output_stop(void *info) +{ + struct perf_event *event = info; + struct pmu *pmu = event->pmu; + struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + struct remote_output ro = { + .rb = event->rb, + }; rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_aux_ctx(&cpuctx->ctx, output, data); - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_event_aux_ctx(ctx, output, data); -next: - put_cpu_ptr(pmu->pmu_cpu_context); + perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); + if (cpuctx->task_ctx) + perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, + &ro, false); + rcu_read_unlock(); + + return ro.err; +} + +static void perf_pmu_output_stop(struct perf_event *event) +{ + struct perf_event *iter; + int err, cpu; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { + /* + * For per-CPU events, we need to make sure that neither they + * nor their children are running; for cpu==-1 events it's + * sufficient to stop the event itself if it's active, since + * it can't have children. + */ + cpu = iter->cpu; + if (cpu == -1) + cpu = READ_ONCE(iter->oncpu); + + if (cpu == -1) + continue; + + err = cpu_function_call(cpu, __perf_pmu_output_stop, event); + if (err == -EAGAIN) { + rcu_read_unlock(); + goto restart; + } } rcu_read_unlock(); } @@ -5852,7 +6179,7 @@ static void perf_event_task(struct task_struct *task, }, }; - perf_event_aux(perf_event_task_output, + perf_iterate_sb(perf_event_task_output, &task_event, task_ctx); } @@ -5931,7 +6258,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - perf_event_aux(perf_event_comm_output, + perf_iterate_sb(perf_event_comm_output, comm_event, NULL); } @@ -6162,13 +6489,94 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - perf_event_aux(perf_event_mmap_output, + perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); kfree(buf); } +/* + * Whether this @filter depends on a dynamic object which is not loaded + * yet or its load addresses are not known. + */ +static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter) +{ + return filter->filter && filter->inode; +} + +/* + * Check whether inode and address range match filter criteria. + */ +static bool perf_addr_filter_match(struct perf_addr_filter *filter, + struct file *file, unsigned long offset, + unsigned long size) +{ + if (filter->inode != file->f_inode) + return false; + + if (filter->offset > offset + size) + return false; + + if (filter->offset + filter->size < offset) + return false; + + return true; +} + +static void __perf_addr_filters_adjust(struct perf_event *event, void *data) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct vm_area_struct *vma = data; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags; + struct file *file = vma->vm_file; + struct perf_addr_filter *filter; + unsigned int restart = 0, count = 0; + + if (!has_addr_filter(event)) + return; + + if (!file) + return; + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + if (perf_addr_filter_match(filter, file, off, + vma->vm_end - vma->vm_start)) { + event->addr_filters_offs[count] = vma->vm_start; + restart++; + } + + count++; + } + + if (restart) + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + if (restart) + perf_event_restart(event); +} + +/* + * Adjust all task's events' filters to the new vma + */ +static void perf_addr_filters_adjust(struct vm_area_struct *vma) +{ + struct perf_event_context *ctx; + int ctxn; + + rcu_read_lock(); + for_each_task_context_nr(ctxn) { + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (!ctx) + continue; + + perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); + } + rcu_read_unlock(); +} + void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; @@ -6200,6 +6608,7 @@ void perf_event_mmap(struct vm_area_struct *vma) /* .flags (attr_mmap2 only) */ }; + perf_addr_filters_adjust(vma); perf_event_mmap_event(&mmap_event); } @@ -6350,7 +6759,7 @@ static void perf_event_switch(struct task_struct *task, }, }; - perf_event_aux(perf_event_switch_output, + perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); } @@ -6491,10 +6900,7 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(&event->pending); } - if (event->overflow_handler) - event->overflow_handler(event, data, regs); - else - perf_event_output(event, data, regs); + event->overflow_handler(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; @@ -7081,24 +7487,6 @@ static inline void perf_tp_register(void) perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); } -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - char *filter_str; - int ret; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -EINVAL; - - filter_str = strndup_user(arg, PAGE_SIZE); - if (IS_ERR(filter_str)) - return PTR_ERR(filter_str); - - ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); - - kfree(filter_str); - return ret; -} - static void perf_event_free_filter(struct perf_event *event) { ftrace_profile_free_filter(event); @@ -7153,11 +7541,6 @@ static inline void perf_tp_register(void) { } -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - return -ENOENT; -} - static void perf_event_free_filter(struct perf_event *event) { } @@ -7185,6 +7568,387 @@ void perf_bp_event(struct perf_event *bp, void *data) } #endif +/* + * Allocate a new address filter + */ +static struct perf_addr_filter * +perf_addr_filter_new(struct perf_event *event, struct list_head *filters) +{ + int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu); + struct perf_addr_filter *filter; + + filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node); + if (!filter) + return NULL; + + INIT_LIST_HEAD(&filter->entry); + list_add_tail(&filter->entry, filters); + + return filter; +} + +static void free_filters_list(struct list_head *filters) +{ + struct perf_addr_filter *filter, *iter; + + list_for_each_entry_safe(filter, iter, filters, entry) { + if (filter->inode) + iput(filter->inode); + list_del(&filter->entry); + kfree(filter); + } +} + +/* + * Free existing address filters and optionally install new ones + */ +static void perf_addr_filters_splice(struct perf_event *event, + struct list_head *head) +{ + unsigned long flags; + LIST_HEAD(list); + + if (!has_addr_filter(event)) + return; + + /* don't bother with children, they don't have their own filters */ + if (event->parent) + return; + + raw_spin_lock_irqsave(&event->addr_filters.lock, flags); + + list_splice_init(&event->addr_filters.list, &list); + if (head) + list_splice(head, &event->addr_filters.list); + + raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags); + + free_filters_list(&list); +} + +/* + * Scan through mm's vmas and see if one of them matches the + * @filter; if so, adjust filter's address range. + * Called with mm::mmap_sem down for reading. + */ +static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter, + struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + struct file *file = vma->vm_file; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + unsigned long vma_size = vma->vm_end - vma->vm_start; + + if (!file) + continue; + + if (!perf_addr_filter_match(filter, file, off, vma_size)) + continue; + + return vma->vm_start; + } + + return 0; +} + +/* + * Update event's address range filters based on the + * task's existing mappings, if any. + */ +static void perf_event_addr_filters_apply(struct perf_event *event) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct task_struct *task = READ_ONCE(event->ctx->task); + struct perf_addr_filter *filter; + struct mm_struct *mm = NULL; + unsigned int count = 0; + unsigned long flags; + + /* + * We may observe TASK_TOMBSTONE, which means that the event tear-down + * will stop on the parent's child_mutex that our caller is also holding + */ + if (task == TASK_TOMBSTONE) + return; + + mm = get_task_mm(event->ctx->task); + if (!mm) + goto restart; + + down_read(&mm->mmap_sem); + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + event->addr_filters_offs[count] = 0; + + if (perf_addr_filter_needs_mmap(filter)) + event->addr_filters_offs[count] = + perf_addr_filter_apply(filter, mm); + + count++; + } + + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + up_read(&mm->mmap_sem); + + mmput(mm); + +restart: + perf_event_restart(event); +} + +/* + * Address range filtering: limiting the data to certain + * instruction address ranges. Filters are ioctl()ed to us from + * userspace as ascii strings. + * + * Filter string format: + * + * ACTION RANGE_SPEC + * where ACTION is one of the + * * "filter": limit the trace to this region + * * "start": start tracing from this address + * * "stop": stop tracing at this address/region; + * RANGE_SPEC is + * * for kernel addresses: [/] + * * for object files: [/]@ + * + * if is not specified, the range is treated as a single address. + */ +enum { + IF_ACT_FILTER, + IF_ACT_START, + IF_ACT_STOP, + IF_SRC_FILE, + IF_SRC_KERNEL, + IF_SRC_FILEADDR, + IF_SRC_KERNELADDR, +}; + +enum { + IF_STATE_ACTION = 0, + IF_STATE_SOURCE, + IF_STATE_END, +}; + +static const match_table_t if_tokens = { + { IF_ACT_FILTER, "filter" }, + { IF_ACT_START, "start" }, + { IF_ACT_STOP, "stop" }, + { IF_SRC_FILE, "%u/%u@%s" }, + { IF_SRC_KERNEL, "%u/%u" }, + { IF_SRC_FILEADDR, "%u@%s" }, + { IF_SRC_KERNELADDR, "%u" }, +}; + +/* + * Address filter string parser + */ +static int +perf_event_parse_addr_filter(struct perf_event *event, char *fstr, + struct list_head *filters) +{ + struct perf_addr_filter *filter = NULL; + char *start, *orig, *filename = NULL; + struct path path; + substring_t args[MAX_OPT_ARGS]; + int state = IF_STATE_ACTION, token; + unsigned int kernel = 0; + int ret = -EINVAL; + + orig = fstr = kstrdup(fstr, GFP_KERNEL); + if (!fstr) + return -ENOMEM; + + while ((start = strsep(&fstr, " ,\n")) != NULL) { + ret = -EINVAL; + + if (!*start) + continue; + + /* filter definition begins */ + if (state == IF_STATE_ACTION) { + filter = perf_addr_filter_new(event, filters); + if (!filter) + goto fail; + } + + token = match_token(start, if_tokens, args); + switch (token) { + case IF_ACT_FILTER: + case IF_ACT_START: + filter->filter = 1; + + case IF_ACT_STOP: + if (state != IF_STATE_ACTION) + goto fail; + + state = IF_STATE_SOURCE; + break; + + case IF_SRC_KERNELADDR: + case IF_SRC_KERNEL: + kernel = 1; + + case IF_SRC_FILEADDR: + case IF_SRC_FILE: + if (state != IF_STATE_SOURCE) + goto fail; + + if (token == IF_SRC_FILE || token == IF_SRC_KERNEL) + filter->range = 1; + + *args[0].to = 0; + ret = kstrtoul(args[0].from, 0, &filter->offset); + if (ret) + goto fail; + + if (filter->range) { + *args[1].to = 0; + ret = kstrtoul(args[1].from, 0, &filter->size); + if (ret) + goto fail; + } + + if (token == IF_SRC_FILE) { + filename = match_strdup(&args[2]); + if (!filename) { + ret = -ENOMEM; + goto fail; + } + } + + state = IF_STATE_END; + break; + + default: + goto fail; + } + + /* + * Filter definition is fully parsed, validate and install it. + * Make sure that it doesn't contradict itself or the event's + * attribute. + */ + if (state == IF_STATE_END) { + if (kernel && event->attr.exclude_kernel) + goto fail; + + if (!kernel) { + if (!filename) + goto fail; + + /* look up the path and grab its inode */ + ret = kern_path(filename, LOOKUP_FOLLOW, &path); + if (ret) + goto fail_free_name; + + filter->inode = igrab(d_inode(path.dentry)); + path_put(&path); + kfree(filename); + filename = NULL; + + ret = -EINVAL; + if (!filter->inode || + !S_ISREG(filter->inode->i_mode)) + /* free_filters_list() will iput() */ + goto fail; + } + + /* ready to consume more filters */ + state = IF_STATE_ACTION; + filter = NULL; + } + } + + if (state != IF_STATE_ACTION) + goto fail; + + kfree(orig); + + return 0; + +fail_free_name: + kfree(filename); +fail: + free_filters_list(filters); + kfree(orig); + + return ret; +} + +static int +perf_event_set_addr_filter(struct perf_event *event, char *filter_str) +{ + LIST_HEAD(filters); + int ret; + + /* + * Since this is called in perf_ioctl() path, we're already holding + * ctx::mutex. + */ + lockdep_assert_held(&event->ctx->mutex); + + if (WARN_ON_ONCE(event->parent)) + return -EINVAL; + + /* + * For now, we only support filtering in per-task events; doing so + * for CPU-wide events requires additional context switching trickery, + * since same object code will be mapped at different virtual + * addresses in different processes. + */ + if (!event->ctx->task) + return -EOPNOTSUPP; + + ret = perf_event_parse_addr_filter(event, filter_str, &filters); + if (ret) + return ret; + + ret = event->pmu->addr_filters_validate(&filters); + if (ret) { + free_filters_list(&filters); + return ret; + } + + /* remove existing filters, if any */ + perf_addr_filters_splice(event, &filters); + + /* install new filters */ + perf_event_for_each_child(event, perf_event_addr_filters_apply); + + return ret; +} + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + char *filter_str; + int ret = -EINVAL; + + if ((event->attr.type != PERF_TYPE_TRACEPOINT || + !IS_ENABLED(CONFIG_EVENT_TRACING)) && + !has_addr_filter(event)) + return -EINVAL; + + filter_str = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(filter_str)) + return PTR_ERR(filter_str); + + if (IS_ENABLED(CONFIG_EVENT_TRACING) && + event->attr.type == PERF_TYPE_TRACEPOINT) + ret = ftrace_profile_set_filter(event, event->attr.config, + filter_str); + else if (has_addr_filter(event)) + ret = perf_event_set_addr_filter(event, filter_str); + + kfree(filter_str); + return ret; +} + /* * hrtimer based swevent callback */ @@ -7542,6 +8306,20 @@ static void free_pmu_context(struct pmu *pmu) out: mutex_unlock(&pmus_lock); } + +/* + * Let userspace know that this PMU supports address range filtering: + */ +static ssize_t nr_addr_filters_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); +} +DEVICE_ATTR_RO(nr_addr_filters); + static struct idr pmu_idr; static ssize_t @@ -7643,9 +8421,19 @@ static int pmu_dev_alloc(struct pmu *pmu) if (ret) goto free_dev; + /* For PMUs with address filters, throw in an extra attribute: */ + if (pmu->nr_addr_filters) + ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters); + + if (ret) + goto del_dev; + out: return ret; +del_dev: + device_del(pmu->dev); + free_dev: put_device(pmu->dev); goto out; @@ -7685,6 +8473,21 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) } skip_type: + if (pmu->task_ctx_nr == perf_hw_context) { + static int hw_context_taken = 0; + + /* + * Other than systems with heterogeneous CPUs, it never makes + * sense for two PMUs to share perf_hw_context. PMUs which are + * uncore must use perf_invalid_context. + */ + if (WARN_ON_ONCE(hw_context_taken && + !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS))) + pmu->task_ctx_nr = perf_invalid_context; + + hw_context_taken = 1; + } + pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); if (pmu->pmu_cpu_context) goto got_cpu_context; @@ -7772,6 +8575,8 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); free_pmu_context(pmu); @@ -7843,6 +8648,39 @@ unlock: return pmu; } +static void attach_sb_event(struct perf_event *event) +{ + struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); + + raw_spin_lock(&pel->lock); + list_add_rcu(&event->sb_list, &pel->list); + raw_spin_unlock(&pel->lock); +} + +/* + * We keep a list of all !task (and therefore per-cpu) events + * that need to receive side-band records. + * + * This avoids having to scan all the various PMU per-cpu contexts + * looking for them. + */ +static void account_pmu_sb_event(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + + if (event->parent) + return; + + if (event->attach_state & PERF_ATTACH_TASK) + return; + + if (attr->mmap || attr->mmap_data || attr->mmap2 || + attr->comm || attr->comm_exec || + attr->task || + attr->context_switch) + attach_sb_event(event); +} + static void account_event_cpu(struct perf_event *event, int cpu) { if (event->parent) @@ -7923,6 +8761,8 @@ static void account_event(struct perf_event *event) enabled: account_event_cpu(event, event->cpu); + + account_pmu_sb_event(event); } /* @@ -7965,6 +8805,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->sibling_list); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); + INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); @@ -7972,6 +8813,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_irq_work(&event->pending, perf_pending_event); mutex_init(&event->mmap_mutex); + raw_spin_lock_init(&event->addr_filters.lock); atomic_long_set(&event->refcount, 1); event->cpu = cpu; @@ -8006,8 +8848,16 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, context = parent_event->overflow_handler_context; } - event->overflow_handler = overflow_handler; - event->overflow_handler_context = context; + if (overflow_handler) { + event->overflow_handler = overflow_handler; + event->overflow_handler_context = context; + } else if (is_write_backward(event)){ + event->overflow_handler = perf_event_output_backward; + event->overflow_handler_context = NULL; + } else { + event->overflow_handler = perf_event_output_forward; + event->overflow_handler_context = NULL; + } perf_event__state_init(event); @@ -8048,11 +8898,22 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (err) goto err_pmu; + if (has_addr_filter(event)) { + event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, + sizeof(unsigned long), + GFP_KERNEL); + if (!event->addr_filters_offs) + goto err_per_task; + + /* force hw sync on the address filters */ + event->addr_filters_gen = 1; + } + if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { - err = get_callchain_buffers(); + err = get_callchain_buffers(attr->sample_max_stack); if (err) - goto err_per_task; + goto err_addr_filters; } } @@ -8061,6 +8922,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, return event; +err_addr_filters: + kfree(event->addr_filters_offs); + err_per_task: exclusive_event_destroy(event); @@ -8239,6 +9103,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) if (output_event->clock != event->clock) goto out; + /* + * Either writing ring buffer from beginning or from end. + * Mixing is not allowed. + */ + if (is_write_backward(output_event) != is_write_backward(event)) + goto out; + /* * If both events generate aux data, they must be on the same PMU */ @@ -8362,6 +9233,9 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + if (!attr.sample_max_stack) + attr.sample_max_stack = sysctl_perf_event_max_stack; + /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument @@ -8435,7 +9309,7 @@ SYSCALL_DEFINE5(perf_event_open, if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { - err = -ENOTSUPP; + err = -EOPNOTSUPP; goto err_alloc; } } @@ -9397,6 +10271,9 @@ static void __init perf_event_init_all_cpus(void) swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); + + INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); + raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); } }