#include <asm/irq_regs.h>
-static struct workqueue_struct *perf_wq;
-
typedef int (*remote_function_f)(void *);
struct remote_function_call {
static bool is_kernel_event(struct perf_event *event)
{
- return event->owner == TASK_TOMBSTONE;
+ return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}
/*
* perf_event_context::mutex nests and those are:
*
* - perf_event_exit_task_context() [ child , 0 ]
- * __perf_event_exit_task()
- * sync_child_event()
- * put_event() [ parent, 1 ]
+ * perf_event_exit_event()
+ * put_event() [ parent, 1 ]
*
* - perf_event_init_context() [ parent, 0 ]
* inherit_task_group()
* Lock order:
* task_struct::perf_event_mutex
* perf_event_context::mutex
- * perf_event_context::lock
* perf_event::child_mutex;
+ * perf_event_context::lock
* perf_event::mmap_mutex
* mmap_sem
*/
!atomic_inc_not_zero(&ctx->refcount)) {
raw_spin_unlock(&ctx->lock);
ctx = NULL;
+ } else {
+ WARN_ON_ONCE(ctx->task != task);
}
-
- WARN_ON_ONCE(ctx->task != task);
}
rcu_read_unlock();
if (!ctx)
perf_event__header_size(tmp);
}
-/*
- * User event without the task.
- */
static bool is_orphaned_event(struct perf_event *event)
{
- return event && !is_kernel_event(event) && !event->owner;
-}
-
-/*
- * Event has a parent but parent's task finished and it's
- * alive only because of children holding refference.
- */
-static bool is_orphaned_child(struct perf_event *event)
-{
- return is_orphaned_event(event->parent);
-}
-
-static void orphans_remove_work(struct work_struct *work);
-
-static void schedule_orphans_remove(struct perf_event_context *ctx)
-{
- if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
- return;
-
- if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
- get_ctx(ctx);
- ctx->orphans_remove_sched = true;
- }
-}
-
-static int __init perf_workqueue_init(void)
-{
- perf_wq = create_singlethread_workqueue("perf");
- WARN(!perf_wq, "failed to create perf workqueue\n");
- return perf_wq ? 0 : -1;
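+ /*
+ * An event is orphaned once it has been marked PERF_EVENT_STATE_EXIT,
+ * either via DETACH_STATE or perf_event_exit_event().
+ */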
+ return event->state == PERF_EVENT_STATE_EXIT;
}
-core_initcall(perf_workqueue_init);
-
static inline int pmu_filter_match(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
- if (is_orphaned_child(event))
- schedule_orphans_remove(ctx);
-
perf_pmu_enable(event->pmu);
}
cpuctx->exclusive = 0;
}
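+/*
+ * DETACH_* flags for perf_remove_from_context(); they reach the cross-CPU
+ * callback packed into its (void *)info argument, replacing the old
+ * 'bool detach_group'.
+ */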
+#define DETACH_GROUP 0x01UL
+#define DETACH_STATE 0x02UL
+
/*
* Cross CPU call to remove a performance event
*
struct perf_event_context *ctx,
void *info)
{
- bool detach_group = (unsigned long)info;
+ unsigned long flags = (unsigned long)info;
event_sched_out(event, cpuctx, ctx);
- if (detach_group)
+ if (flags & DETACH_GROUP)
perf_group_detach(event);
list_del_event(event, ctx);
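+ /*
+ * DETACH_STATE marks the event PERF_EVENT_STATE_EXIT, so that
+ * perf_event_release_kernel() can rely on observing a dead event;
+ * see the comment there.
+ */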
+ if (flags & DETACH_STATE)
+ event->state = PERF_EVENT_STATE_EXIT;
if (!ctx->nr_events && ctx->is_active) {
ctx->is_active = 0;
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_remove_from_context(struct perf_event *event, bool detach_group)
+static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
lockdep_assert_held(&event->ctx->mutex);
- event_function_call(event, __perf_remove_from_context,
- (void *)(unsigned long)detach_group);
+ event_function_call(event, __perf_remove_from_context, (void *)flags);
}
/*
 * remains valid. This condition is satisfied when called through
* perf_event_for_each_child or perf_event_for_each because they
* hold the top-level event's child_mutex, so any descendant that
- * goes to exit will block in sync_child_event.
+ * goes to exit will block in perf_event_exit_event().
+ *
* When called from perf_pending_event it's OK because event->ctx
* is the current context on this CPU and preemption is disabled,
* hence we can't get into perf_event_task_sched_out for this context.
if (event->attr.exclusive)
cpuctx->exclusive = 1;
- if (is_orphaned_child(event))
- schedule_orphans_remove(ctx);
-
out:
perf_pmu_enable(event->pmu);
struct perf_event *leader = event->group_leader;
struct perf_event_context *task_ctx;
- if (event->state >= PERF_EVENT_STATE_INACTIVE)
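+ /*
+ * Nothing to do for an already-enabled event, and an event in
+ * ERROR or EXIT state cannot be enabled from this callback.
+ */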
+ if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ event->state <= PERF_EVENT_STATE_ERROR)
return;
update_context_time(ctx);
struct perf_event_context *ctx = event->ctx;
raw_spin_lock_irq(&ctx->lock);
- if (event->state >= PERF_EVENT_STATE_INACTIVE) {
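+ /*
+ * Unlike __perf_event_enable(), an ERROR event may pass this check
+ * so it can be retried; only an EXIT event is refused.
+ */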
+ if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ event->state < PERF_EVENT_STATE_ERROR) {
raw_spin_unlock_irq(&ctx->lock);
return;
}
INIT_LIST_HEAD(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
atomic_set(&ctx->refcount, 1);
- INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
}
static struct perf_event_context *
/* Reuse ptrace permission checks for now. */
err = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
goto errout;
return task;
* 3) two matching events on the same context.
*
* The former two cases are handled in the allocation path (perf_event_alloc(),
- * __free_event()), the latter -- before the first perf_install_in_context().
+ * _free_event()), the latter -- before the first perf_install_in_context().
*/
static int exclusive_event_init(struct perf_event *event)
{
return true;
}
-static void __free_event(struct perf_event *event)
-{
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
-
- perf_event_free_bpf_prog(event);
-
- if (event->destroy)
- event->destroy(event);
-
- if (event->ctx)
- put_ctx(event->ctx);
-
- if (event->pmu) {
- exclusive_event_destroy(event);
- module_put(event->pmu->module);
- }
-
- call_rcu(&event->rcu_head, free_event_rcu);
-}
-
static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending);
if (is_cgroup_event(event))
perf_detach_cgroup(event);
- __free_event(event);
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
+
+ perf_event_free_bpf_prog(event);
+
+ if (event->destroy)
+ event->destroy(event);
+
+ if (event->ctx)
+ put_ctx(event->ctx);
+
+ if (event->pmu) {
+ exclusive_event_destroy(event);
+ module_put(event->pmu->module);
+ }
+
+ call_rcu(&event->rcu_head, free_event_rcu);
}
/*
struct task_struct *owner;
rcu_read_lock();
- owner = ACCESS_ONCE(event->owner);
/*
- * Matches the smp_wmb() in perf_event_exit_task(). If we observe
- * !owner it means the list deletion is complete and we can indeed
- * free this event, otherwise we need to serialize on
+ * Matches the smp_store_release() in perf_event_exit_task(). If we
+ * observe !owner it means the list deletion is complete and we can
+ * indeed free this event, otherwise we need to serialize on
* owner->perf_event_mutex.
*/
- smp_read_barrier_depends();
+ owner = lockless_dereference(event->owner);
if (owner) {
/*
* Since delayed_put_task_struct() also drops the last
* ensured they're done, and we can proceed with freeing the
* event.
*/
- if (event->owner)
+ if (event->owner) {
list_del_init(&event->owner_entry);
+ smp_store_release(&event->owner, NULL);
+ }
mutex_unlock(&owner->perf_event_mutex);
put_task_struct(owner);
}
static void put_event(struct perf_event *event)
{
- struct perf_event_context *ctx;
-
if (!atomic_long_dec_and_test(&event->refcount))
return;
+ _free_event(event);
+}
+
+/*
+ * Kill an event dead; while event::refcount will preserve the event
+ * object, it will not preserve its functionality. Once the last 'user'
+ * gives up the object, we'll destroy the thing.
+ */
+int perf_event_release_kernel(struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+ struct perf_event *child, *tmp;
+
if (!is_kernel_event(event))
perf_remove_from_owner(event);
+ ctx = perf_event_ctx_lock(event);
+ WARN_ON_ONCE(ctx->parent_ctx);
+ perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE);
+ perf_event_ctx_unlock(event, ctx);
+
/*
- * There are two ways this annotation is useful:
+ * At this point we must have event->state == PERF_EVENT_STATE_EXIT,
+ * either from the above perf_remove_from_context() or through
+ * perf_event_exit_event().
*
- * 1) there is a lock recursion from perf_event_exit_task
- * see the comment there.
+ * Therefore, anybody acquiring event->child_mutex after the below
+ * loop _must_ also see this, most importantly inherit_event() which
+ * will avoid placing more children on the list.
*
- * 2) there is a lock-inversion with mmap_sem through
- * perf_read_group(), which takes faults while
- * holding ctx->mutex, however this is called after
- * the last filedesc died, so there is no possibility
- * to trigger the AB-BA case.
+ * Thus this guarantees that we will in fact observe and kill _ALL_
+ * child events.
*/
- ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
- WARN_ON_ONCE(ctx->parent_ctx);
- perf_remove_from_context(event, true);
- perf_event_ctx_unlock(event, ctx);
+ WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT);
- _free_event(event);
-}
+again:
+ mutex_lock(&event->child_mutex);
+ list_for_each_entry(child, &event->child_list, child_list) {
-int perf_event_release_kernel(struct perf_event *event)
-{
+ /*
+ * Cannot change, child events are not migrated, see the
+ * comment with perf_event_ctx_lock_nested().
+ */
+ ctx = lockless_dereference(child->ctx);
+ /*
+ * Since child_mutex nests inside ctx::mutex, we must jump
+ * through hoops. We start by grabbing a reference on the ctx.
+ *
+ * Since the event cannot get freed while we hold the
+ * child_mutex, the context must also exist and have a !0
+ * reference count.
+ */
+ get_ctx(ctx);
+
+ /*
+ * Now that we have a ctx ref, we can drop child_mutex, and
+ * acquire ctx::mutex without fear of it going away. Then we
+ * can re-acquire child_mutex.
+ */
+ mutex_unlock(&event->child_mutex);
+ mutex_lock(&ctx->mutex);
+ mutex_lock(&event->child_mutex);
+
+ /*
+ * Now that we hold ctx::mutex and child_mutex, revalidate our
+ * state: if child is still the first entry, it did not get freed
+ * and we can safely remove and free it.
+ */
+ tmp = list_first_entry_or_null(&event->child_list,
+ struct perf_event, child_list);
+ if (tmp == child) {
+ perf_remove_from_context(child, DETACH_GROUP);
+ list_del(&child->child_list);
+ free_event(child);
+ /*
+ * This matches the refcount bump in inherit_event();
+ * this can't be the last reference.
+ */
+ put_event(event);
+ }
+
+ mutex_unlock(&event->child_mutex);
+ mutex_unlock(&ctx->mutex);
+ put_ctx(ctx);
+ goto again;
+ }
+ mutex_unlock(&event->child_mutex);
+
+ /* Must be the last reference */
put_event(event);
return 0;
}
*/
static int perf_release(struct inode *inode, struct file *file)
{
- put_event(file->private_data);
+ perf_event_release_kernel(file->private_data);
return 0;
}
-/*
- * Remove all orphanes events from the context.
- */
-static void orphans_remove_work(struct work_struct *work)
-{
- struct perf_event_context *ctx;
- struct perf_event *event, *tmp;
-
- ctx = container_of(work, struct perf_event_context,
- orphans_remove.work);
-
- mutex_lock(&ctx->mutex);
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
- struct perf_event *parent_event = event->parent;
-
- if (!is_orphaned_child(event))
- continue;
-
- perf_remove_from_context(event, true);
-
- mutex_lock(&parent_event->child_mutex);
- list_del_init(&event->child_list);
- mutex_unlock(&parent_event->child_mutex);
-
- free_event(event);
- put_event(parent_event);
- }
-
- raw_spin_lock_irq(&ctx->lock);
- ctx->orphans_remove_sched = false;
- raw_spin_unlock_irq(&ctx->lock);
- mutex_unlock(&ctx->mutex);
-
- put_ctx(ctx);
-}
-
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
/*
* Holding the top-level event's child_mutex means that any
* descendant process that has inherited this event will block
- * in sync_child_event if it goes to exit, thus satisfying the
+ * in perf_event_exit_event() if it goes to exit, thus satisfying the
* task existence requirements of perf_event_enable/disable.
*/
static void perf_event_for_each_child(struct perf_event *event,
struct perf_event *event = filp->private_data;
int retval;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
retval = fasync_helper(fd, filp, on, &event->fasync);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (retval < 0)
return retval;
* See perf_event_ctx_lock() for comments on the details
* of swizzling perf_event::ctx.
*/
- perf_remove_from_context(group_leader, false);
+ perf_remove_from_context(group_leader, 0);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_remove_from_context(sibling, false);
+ perf_remove_from_context(sibling, 0);
put_ctx(gctx);
}
mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
event_entry) {
- perf_remove_from_context(event, false);
+ perf_remove_from_context(event, 0);
unaccount_event_cpu(event, src_cpu);
put_ctx(src_ctx);
list_add(&event->migrate_entry, &events);
&parent_event->child_total_time_enabled);
atomic64_add(child_event->total_time_running,
&parent_event->child_total_time_running);
-
- /*
- * Remove this event from the parent's list
- */
- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
- mutex_lock(&parent_event->child_mutex);
- list_del_init(&child_event->child_list);
- mutex_unlock(&parent_event->child_mutex);
-
- /*
- * Make sure user/parent get notified, that we just
- * lost one event.
- */
- perf_event_wakeup(parent_event);
-
- /*
- * Release the parent event, if this was the last
- * reference to it.
- */
- put_event(parent_event);
}
static void
-__perf_event_exit_task(struct perf_event *child_event,
- struct perf_event_context *child_ctx,
- struct task_struct *child)
+perf_event_exit_event(struct perf_event *child_event,
+ struct perf_event_context *child_ctx,
+ struct task_struct *child)
{
+ struct perf_event *parent_event = child_event->parent;
+
/*
* Do not destroy the 'original' grouping; because of the context
* switch optimization the original events could've ended up in a
raw_spin_lock_irq(&child_ctx->lock);
WARN_ON_ONCE(child_ctx->is_active);
- if (!!child_event->parent)
+ if (parent_event)
perf_group_detach(child_event);
list_del_event(child_event, child_ctx);
+ child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */
raw_spin_unlock_irq(&child_ctx->lock);
/*
- * It can happen that the parent exits first, and has events
- * that are still around due to the child reference. These
- * events need to be zapped.
+ * Parent events are governed by their filedesc; retain them.
*/
- if (child_event->parent) {
- sync_child_event(child_event, child);
- free_event(child_event);
- } else {
- child_event->state = PERF_EVENT_STATE_EXIT;
+ if (!parent_event) {
perf_event_wakeup(child_event);
+ return;
}
+ /*
+ * Child events can be cleaned up.
+ */
+
+ sync_child_event(child_event, child);
+
+ /*
+ * Remove this event from the parent's list
+ */
+ WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+ mutex_lock(&parent_event->child_mutex);
+ list_del_init(&child_event->child_list);
+ mutex_unlock(&parent_event->child_mutex);
+
+ /*
+ * Kick perf_poll() for is_event_hup().
+ */
+ perf_event_wakeup(parent_event);
+ free_event(child_event);
+ put_event(parent_event);
}
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 * perf_event_create_kernel_counter() which does find_get_context()
* without ctx::mutex (it cannot because of the move_group double mutex
* lock thing). See the comments in perf_install_in_context().
- *
- * We can recurse on the same lock type through:
- *
- * __perf_event_exit_task()
- * sync_child_event()
- * put_event()
- * mutex_lock(&ctx->mutex)
- *
- * But since its the parent context it won't be the same instance.
*/
mutex_lock(&child_ctx->mutex);
perf_event_task(child, child_ctx, 0);
list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- __perf_event_exit_task(child_event, child_ctx, child);
+ perf_event_exit_event(child_event, child_ctx, child);
mutex_unlock(&child_ctx->mutex);
* the owner, closes a race against perf_release() where
* we need to serialize on the owner->perf_event_mutex.
*/
- smp_wmb();
- event->owner = NULL;
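+ /* Matches the lockless_dereference() of event->owner in perf_remove_from_owner(). */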
+ smp_store_release(&event->owner, NULL);
}
mutex_unlock(&child->perf_event_mutex);
WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}
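+/*
+ * Return the struct file for the perf event behind @fd; on success the
+ * caller owns a reference to the file and must drop it with fput().
+ */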
-struct perf_event *perf_event_get(unsigned int fd)
+struct file *perf_event_get(unsigned int fd)
{
- int err;
- struct fd f;
- struct perf_event *event;
+ struct file *file;
- err = perf_fget_light(fd, &f);
- if (err)
- return ERR_PTR(err);
+ file = fget_raw(fd);
+ if (!file)
+ return ERR_PTR(-EBADF);
- event = f.file->private_data;
- atomic_long_inc(&event->refcount);
- fdput(f);
+ if (file->f_op != &perf_fops) {
+ fput(file);
+ return ERR_PTR(-EBADF);
+ }
- return event;
+ return file;
}
const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
if (IS_ERR(child_event))
return child_event;
+ /*
+ * is_orphaned_event() and list_add_tail(&parent_event->child_list)
+ * must be under the same lock in order to serialize against
+ * perf_event_release_kernel(), such that either we must observe
+ * is_orphaned_event() or they will observe us on the child_list.
+ */
+ mutex_lock(&parent_event->child_mutex);
if (is_orphaned_event(parent_event) ||
!atomic_long_inc_not_zero(&parent_event->refcount)) {
+ mutex_unlock(&parent_event->child_mutex);
free_event(child_event);
return NULL;
}
/*
* Link this into the parent event's child list
*/
- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
- mutex_lock(&parent_event->child_mutex);
list_add_tail(&child_event->child_list, &parent_event->child_list);
mutex_unlock(&parent_event->child_mutex);
raw_spin_lock(&ctx->lock);
list_for_each_entry(event, &ctx->event_list, event_entry)
- __perf_remove_from_context(event, cpuctx, ctx, (void *)(unsigned long)true);
+ __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
}