perf/abi: Change the errno for sampling event not supported in hardware
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4e2ebf6..5d48306 100644
@@ -44,6 +44,8 @@
 #include <linux/compat.h>
 #include <linux/bpf.h>
 #include <linux/filter.h>
+#include <linux/namei.h>
+#include <linux/parser.h>
 
 #include "internal.h"
 
@@ -333,6 +335,7 @@ static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -351,7 +354,7 @@ static struct srcu_struct pmus_srcu;
  *   1 - disallow cpu events for unpriv
  *   2 - disallow kernel profiling for unpriv
  */
-int sysctl_perf_event_paranoid __read_mostly = 1;
+int sysctl_perf_event_paranoid __read_mostly = 2;
 
 /* Minimum for 512 kiB + 1 user control page */
 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
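With the default raised from 1 to 2, unprivileged users lose kernel profiling as well as CPU-wide events. A minimal user-space sketch of an attribute setup that still works under the stricter default; the counter choice and helper name are illustrative, not part of this patch:

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Open a user-space-only cycles counter for the calling thread. */
static int open_cycles_counter(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	/* paranoid level 2: unprivileged callers must exclude the kernel */
	attr.exclude_kernel = 1;
	attr.exclude_hv = 1;

	return syscall(__NR_perf_event_open, &attr, 0 /* self */,
		       -1 /* any cpu */, -1 /* no group */, 0 /* flags */);
}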
@@ -394,6 +397,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
        if (ret || !write)
                return ret;
 
+       /*
+        * If throttling is disabled, don't allow the write:
+        */
+       if (sysctl_perf_cpu_time_max_percent == 100 ||
+           sysctl_perf_cpu_time_max_percent == 0)
+               return -EINVAL;
+
        max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
        update_perf_cpu_limits();
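For reference, a worked example of the two derived values, assuming HZ == 1000 and the default sample rate of 100000 (neither assumption comes from this hunk):

/*
 * sysctl_perf_event_sample_rate == 100000, HZ == 1000:
 *
 *	max_samples_per_tick  = DIV_ROUND_UP(100000, 1000) = 100
 *	perf_sample_period_ns = NSEC_PER_SEC / 100000      = 10000 ns
 */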
@@ -1927,8 +1937,13 @@ event_sched_in(struct perf_event *event,
        if (event->state <= PERF_EVENT_STATE_OFF)
                return 0;
 
-       event->state = PERF_EVENT_STATE_ACTIVE;
-       event->oncpu = smp_processor_id();
+       WRITE_ONCE(event->oncpu, smp_processor_id());
+       /*
+        * Order event::oncpu write to happen before the ACTIVE state
+        * is visible.
+        */
+       smp_wmb();
+       WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
 
        /*
         * Unthrottle events, since we scheduled we might have missed several
@@ -2360,6 +2375,112 @@ void perf_event_enable(struct perf_event *event)
 }
 EXPORT_SYMBOL_GPL(perf_event_enable);
 
+struct stop_event_data {
+       struct perf_event       *event;
+       unsigned int            restart;
+};
+
+static int __perf_event_stop(void *info)
+{
+       struct stop_event_data *sd = info;
+       struct perf_event *event = sd->event;
+
+       /* if it's already INACTIVE, do nothing */
+       if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
+               return 0;
+
+       /* matches smp_wmb() in event_sched_in() */
+       smp_rmb();
+
+       /*
+        * There is a window with interrupts enabled before we get here,
+        * so we need to check again lest we try to stop another CPU's event.
+        */
+       if (READ_ONCE(event->oncpu) != smp_processor_id())
+               return -EAGAIN;
+
+       event->pmu->stop(event, PERF_EF_UPDATE);
+
+       /*
+        * May race with the actual stop (through perf_pmu_output_stop()),
+        * but it is only used for events with AUX ring buffer, and such
+        * events will refuse to restart because of rb::aux_mmap_count==0,
+        * see comments in perf_aux_output_begin().
+        *
+        * Since this is happening on an event-local CPU, no trace is lost
+        * while restarting.
+        */
+       if (sd->restart)
+               event->pmu->start(event, PERF_EF_START);
+
+       return 0;
+}
+
+static int perf_event_restart(struct perf_event *event)
+{
+       struct stop_event_data sd = {
+               .event          = event,
+               .restart        = 1,
+       };
+       int ret = 0;
+
+       do {
+               if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
+                       return 0;
+
+               /* matches smp_wmb() in event_sched_in() */
+               smp_rmb();
+
+               /*
+                * We only want to restart ACTIVE events, so if the event goes
+                * inactive here (event->oncpu==-1), there's nothing more to do;
+                * fall through with ret==-ENXIO.
+                */
+               ret = cpu_function_call(READ_ONCE(event->oncpu),
+                                       __perf_event_stop, &sd);
+       } while (ret == -EAGAIN);
+
+       return ret;
+}
+
+/*
+ * In order to contain the racy and tricky parts of the address filter
+ * configuration management, it is a two-part process:
+ *
+ * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
+ *      we update the addresses of the corresponding vmas in the
+ *      event::addr_filters_offs array and bump the event::addr_filters_gen;
+ * (p2) when an event is scheduled in (pmu::add), it calls
+ *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
+ *      if the generation has changed since the previous call.
+ *
+ * If (p1) happens while the event is active, we restart it to force (p2).
+ *
+ * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
+ *     pre-existing mappings, called once when new filters arrive via SET_FILTER
+ *     ioctl;
+ * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
+ *     registered mapping, called for every new mmap(), with mm::mmap_sem down
+ *     for reading;
+ * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
+ *     of exec.
+ */
+void perf_event_addr_filters_sync(struct perf_event *event)
+{
+       struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+
+       if (!has_addr_filter(event))
+               return;
+
+       raw_spin_lock(&ifh->lock);
+       if (event->addr_filters_gen != event->hw.addr_filters_gen) {
+               event->pmu->addr_filters_sync(event);
+               event->hw.addr_filters_gen = event->addr_filters_gen;
+       }
+       raw_spin_unlock(&ifh->lock);
+}
+EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
+
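A hypothetical driver-side sketch of step (p2) described above; my_pmu_add() and my_pmu_start() are invented names, and only the perf_event_addr_filters_sync() call reflects the interface added here:

/*
 * Hypothetical PMU driver: pick up pending address filter changes when
 * the event is scheduled in, before (re)starting the hardware.
 */
static int my_pmu_add(struct perf_event *event, int flags)
{
	perf_event_addr_filters_sync(event);

	if (flags & PERF_EF_START)
		my_pmu_start(event, PERF_EF_RELOAD);	/* driver specific */

	return 0;
}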
 static int _perf_event_refresh(struct perf_event *event, int refresh)
 {
        /*
@@ -3209,16 +3330,6 @@ out:
                put_ctx(clone_ctx);
 }
 
-void perf_event_exec(void)
-{
-       int ctxn;
-
-       rcu_read_lock();
-       for_each_task_context_nr(ctxn)
-               perf_event_enable_on_exec(ctxn);
-       rcu_read_unlock();
-}
-
 struct perf_read_data {
        struct perf_event *event;
        bool group;
@@ -3562,6 +3673,26 @@ static void free_event_rcu(struct rcu_head *head)
 static void ring_buffer_attach(struct perf_event *event,
                               struct ring_buffer *rb);
 
+static void detach_sb_event(struct perf_event *event)
+{
+       struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+       raw_spin_lock(&pel->lock);
+       list_del_rcu(&event->sb_list);
+       raw_spin_unlock(&pel->lock);
+}
+
+static void unaccount_pmu_sb_event(struct perf_event *event)
+{
+       if (event->parent)
+               return;
+
+       if (event->attach_state & PERF_ATTACH_TASK)
+               return;
+
+       detach_sb_event(event);
+}
+
 static void unaccount_event_cpu(struct perf_event *event, int cpu)
 {
        if (event->parent)
@@ -3625,6 +3756,8 @@ static void unaccount_event(struct perf_event *event)
        }
 
        unaccount_event_cpu(event, event->cpu);
+
+       unaccount_pmu_sb_event(event);
 }
 
 static void perf_sched_delayed(struct work_struct *work)
@@ -3720,6 +3853,9 @@ static bool exclusive_event_installable(struct perf_event *event,
        return true;
 }
 
+static void perf_addr_filters_splice(struct perf_event *event,
+                                      struct list_head *head);
+
 static void _free_event(struct perf_event *event)
 {
        irq_work_sync(&event->pending);
@@ -3747,6 +3883,8 @@ static void _free_event(struct perf_event *event)
        }
 
        perf_event_free_bpf_prog(event);
+       perf_addr_filters_splice(event, NULL);
+       kfree(event->addr_filters_offs);
 
        if (event->destroy)
                event->destroy(event);
@@ -4343,6 +4481,19 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
        case PERF_EVENT_IOC_SET_BPF:
                return perf_event_set_bpf_prog(event, arg);
 
+       case PERF_EVENT_IOC_PAUSE_OUTPUT: {
+               struct ring_buffer *rb;
+
+               rcu_read_lock();
+               rb = rcu_dereference(event->rb);
+               if (!rb || !rb->nr_pages) {
+                       rcu_read_unlock();
+                       return -EINVAL;
+               }
+               rb_toggle_paused(rb, !!arg);
+               rcu_read_unlock();
+               return 0;
+       }
        default:
                return -ENOTTY;
        }
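A user-space sketch of the new ioctl: pause the ring buffer (for instance to read a stable snapshot of an overwritable write_backward buffer) and resume afterwards. The perf_fd setup is assumed to happen elsewhere, and PERF_EVENT_IOC_PAUSE_OUTPUT must come from uapi headers that already carry this patch:

#include <linux/perf_event.h>
#include <sys/ioctl.h>

/* 'perf_fd' comes from perf_event_open() and has an mmap()ed buffer. */
static int snapshot_ring_buffer(int perf_fd)
{
	if (ioctl(perf_fd, PERF_EVENT_IOC_PAUSE_OUTPUT, 1))
		return -1;	/* EINVAL if no buffer is mmap()ed */

	/* ... read records from the mmap()ed data pages here ... */

	return ioctl(perf_fd, PERF_EVENT_IOC_PAUSE_OUTPUT, 0);
}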
@@ -4659,6 +4810,8 @@ static void perf_mmap_open(struct vm_area_struct *vma)
                event->pmu->event_mapped(event);
 }
 
+static void perf_pmu_output_stop(struct perf_event *event);
+
 /*
  * A buffer can be mmap()ed multiple times; either directly through the same
  * event, or through other events by use of perf_event_set_output().
@@ -4686,10 +4839,22 @@ static void perf_mmap_close(struct vm_area_struct *vma)
         */
        if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
            atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+               /*
+                * Stop all AUX events that are writing to this buffer,
+                * so that we can free its AUX pages and corresponding PMU
+                * data. Note that after rb::aux_mmap_count dropped to zero,
+                * they won't start any more (see perf_aux_output_begin()).
+                */
+               perf_pmu_output_stop(event);
+
+               /* now it's safe to free the pages */
                atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
                vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
 
+               /* this has to be the last one */
                rb_free_aux(rb);
+               WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
+
                mutex_unlock(&event->mmap_mutex);
        }
 
@@ -5630,9 +5795,13 @@ void perf_prepare_sample(struct perf_event_header *header,
        }
 }
 
-void perf_event_output(struct perf_event *event,
-                       struct perf_sample_data *data,
-                       struct pt_regs *regs)
+static void __always_inline
+__perf_event_output(struct perf_event *event,
+                   struct perf_sample_data *data,
+                   struct pt_regs *regs,
+                   int (*output_begin)(struct perf_output_handle *,
+                                       struct perf_event *,
+                                       unsigned int))
 {
        struct perf_output_handle handle;
        struct perf_event_header header;
@@ -5642,7 +5811,7 @@ void perf_event_output(struct perf_event *event,
 
        perf_prepare_sample(&header, data, event, regs);
 
-       if (perf_output_begin(&handle, event, header.size))
+       if (output_begin(&handle, event, header.size))
                goto exit;
 
        perf_output_sample(&handle, &header, data, event);
@@ -5653,6 +5822,30 @@ exit:
        rcu_read_unlock();
 }
 
+void
+perf_event_output_forward(struct perf_event *event,
+                        struct perf_sample_data *data,
+                        struct pt_regs *regs)
+{
+       __perf_event_output(event, data, regs, perf_output_begin_forward);
+}
+
+void
+perf_event_output_backward(struct perf_event *event,
+                          struct perf_sample_data *data,
+                          struct pt_regs *regs)
+{
+       __perf_event_output(event, data, regs, perf_output_begin_backward);
+}
+
+void
+perf_event_output(struct perf_event *event,
+                 struct perf_sample_data *data,
+                 struct pt_regs *regs)
+{
+       __perf_event_output(event, data, regs, perf_output_begin);
+}
+
 /*
  * read event_id
  */
@@ -5693,16 +5886,33 @@ perf_event_read_event(struct perf_event *event,
        perf_output_end(&handle);
 }
 
-typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
+typedef void (perf_iterate_f)(struct perf_event *event, void *data);
 
 static void
-perf_event_aux_ctx(struct perf_event_context *ctx,
-                  perf_event_aux_output_cb output,
-                  void *data)
+perf_iterate_ctx(struct perf_event_context *ctx,
+                  perf_iterate_f output,
+                  void *data, bool all)
 {
        struct perf_event *event;
 
        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+               if (!all) {
+                       if (event->state < PERF_EVENT_STATE_INACTIVE)
+                               continue;
+                       if (!event_filter_match(event))
+                               continue;
+               }
+
+               output(event, data);
+       }
+}
+
+static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
+{
+       struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
+       struct perf_event *event;
+
+       list_for_each_entry_rcu(event, &pel->list, sb_list) {
                if (event->state < PERF_EVENT_STATE_INACTIVE)
                        continue;
                if (!event_filter_match(event))
@@ -5711,51 +5921,168 @@ perf_event_aux_ctx(struct perf_event_context *ctx,
        }
 }
 
+/*
+ * Iterate all events that need to receive side-band events.
+ *
+ * For new callers: ensure that account_pmu_sb_event() includes
+ * your event, otherwise it might not get delivered.
+ */
 static void
-perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
-                       struct perf_event_context *task_ctx)
+perf_iterate_sb(perf_iterate_f output, void *data,
+              struct perf_event_context *task_ctx)
 {
+       struct perf_event_context *ctx;
+       int ctxn;
+
        rcu_read_lock();
        preempt_disable();
-       perf_event_aux_ctx(task_ctx, output, data);
+
+       /*
+        * If we have task_ctx != NULL we only notify the task context itself.
+        * The task_ctx is set only for EXIT events before releasing task
+        * context.
+        */
+       if (task_ctx) {
+               perf_iterate_ctx(task_ctx, output, data, false);
+               goto done;
+       }
+
+       perf_iterate_sb_cpu(output, data);
+
+       for_each_task_context_nr(ctxn) {
+               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+               if (ctx)
+                       perf_iterate_ctx(ctx, output, data, false);
+       }
+done:
        preempt_enable();
        rcu_read_unlock();
 }
 
-static void
-perf_event_aux(perf_event_aux_output_cb output, void *data,
-              struct perf_event_context *task_ctx)
+/*
+ * Clear all file-based filters at exec; they'll have to be
+ * reinstated when/if these objects are mmapped again.
+ */
+static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
+{
+       struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+       struct perf_addr_filter *filter;
+       unsigned int restart = 0, count = 0;
+       unsigned long flags;
+
+       if (!has_addr_filter(event))
+               return;
+
+       raw_spin_lock_irqsave(&ifh->lock, flags);
+       list_for_each_entry(filter, &ifh->list, entry) {
+               if (filter->inode) {
+                       event->addr_filters_offs[count] = 0;
+                       restart++;
+               }
+
+               count++;
+       }
+
+       if (restart)
+               event->addr_filters_gen++;
+       raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+       if (restart)
+               perf_event_restart(event);
+}
+
+void perf_event_exec(void)
 {
-       struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;
-       struct pmu *pmu;
        int ctxn;
 
+       rcu_read_lock();
+       for_each_task_context_nr(ctxn) {
+               ctx = current->perf_event_ctxp[ctxn];
+               if (!ctx)
+                       continue;
+
+               perf_event_enable_on_exec(ctxn);
+
+               perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
+                                  true);
+       }
+       rcu_read_unlock();
+}
+
+struct remote_output {
+       struct ring_buffer      *rb;
+       int                     err;
+};
+
+static void __perf_event_output_stop(struct perf_event *event, void *data)
+{
+       struct perf_event *parent = event->parent;
+       struct remote_output *ro = data;
+       struct ring_buffer *rb = ro->rb;
+       struct stop_event_data sd = {
+               .event  = event,
+       };
+
+       if (!has_aux(event))
+               return;
+
+       if (!parent)
+               parent = event;
+
        /*
-        * If we have task_ctx != NULL we only notify
-        * the task context itself. The task_ctx is set
-        * only for EXIT events before releasing task
-        * context.
+        * In case of inheritance, it will be the parent that links to the
+        * ring-buffer, but it will be the child that's actually using it:
         */
-       if (task_ctx) {
-               perf_event_aux_task_ctx(output, data, task_ctx);
-               return;
-       }
+       if (rcu_dereference(parent->rb) == rb)
+               ro->err = __perf_event_stop(&sd);
+}
+
+static int __perf_pmu_output_stop(void *info)
+{
+       struct perf_event *event = info;
+       struct pmu *pmu = event->pmu;
+       struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+       struct remote_output ro = {
+               .rb     = event->rb,
+       };
 
        rcu_read_lock();
-       list_for_each_entry_rcu(pmu, &pmus, entry) {
-               cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-               if (cpuctx->unique_pmu != pmu)
-                       goto next;
-               perf_event_aux_ctx(&cpuctx->ctx, output, data);
-               ctxn = pmu->task_ctx_nr;
-               if (ctxn < 0)
-                       goto next;
-               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-               if (ctx)
-                       perf_event_aux_ctx(ctx, output, data);
-next:
-               put_cpu_ptr(pmu->pmu_cpu_context);
+       perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
+       if (cpuctx->task_ctx)
+               perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
+                                  &ro, false);
+       rcu_read_unlock();
+
+       return ro.err;
+}
+
+static void perf_pmu_output_stop(struct perf_event *event)
+{
+       struct perf_event *iter;
+       int err, cpu;
+
+restart:
+       rcu_read_lock();
+       list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
+               /*
+                * For per-CPU events, we need to make sure that neither they
+                * nor their children are running; for cpu==-1 events it's
+                * sufficient to stop the event itself if it's active, since
+                * it can't have children.
+                */
+               cpu = iter->cpu;
+               if (cpu == -1)
+                       cpu = READ_ONCE(iter->oncpu);
+
+               if (cpu == -1)
+                       continue;
+
+               err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
+               if (err == -EAGAIN) {
+                       rcu_read_unlock();
+                       goto restart;
+               }
        }
        rcu_read_unlock();
 }
@@ -5852,7 +6179,7 @@ static void perf_event_task(struct task_struct *task,
                },
        };
 
-       perf_event_aux(perf_event_task_output,
+       perf_iterate_sb(perf_event_task_output,
                       &task_event,
                       task_ctx);
 }
@@ -5931,7 +6258,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 
        comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
 
-       perf_event_aux(perf_event_comm_output,
+       perf_iterate_sb(perf_event_comm_output,
                       comm_event,
                       NULL);
 }
@@ -6162,13 +6489,94 @@ got_name:
 
        mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
 
-       perf_event_aux(perf_event_mmap_output,
+       perf_iterate_sb(perf_event_mmap_output,
                       mmap_event,
                       NULL);
 
        kfree(buf);
 }
 
+/*
+ * Whether this @filter depends on a dynamic object which is not loaded
+ * yet or its load addresses are not known.
+ */
+static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
+{
+       return filter->filter && filter->inode;
+}
+
+/*
+ * Check whether inode and address range match filter criteria.
+ */
+static bool perf_addr_filter_match(struct perf_addr_filter *filter,
+                                    struct file *file, unsigned long offset,
+                                    unsigned long size)
+{
+       if (filter->inode != file->f_inode)
+               return false;
+
+       if (filter->offset > offset + size)
+               return false;
+
+       if (filter->offset + filter->size < offset)
+               return false;
+
+       return true;
+}
+
+static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
+{
+       struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+       struct vm_area_struct *vma = data;
+       unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
+       struct file *file = vma->vm_file;
+       struct perf_addr_filter *filter;
+       unsigned int restart = 0, count = 0;
+
+       if (!has_addr_filter(event))
+               return;
+
+       if (!file)
+               return;
+
+       raw_spin_lock_irqsave(&ifh->lock, flags);
+       list_for_each_entry(filter, &ifh->list, entry) {
+               if (perf_addr_filter_match(filter, file, off,
+                                            vma->vm_end - vma->vm_start)) {
+                       event->addr_filters_offs[count] = vma->vm_start;
+                       restart++;
+               }
+
+               count++;
+       }
+
+       if (restart)
+               event->addr_filters_gen++;
+       raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+       if (restart)
+               perf_event_restart(event);
+}
+
+/*
+ * Adjust all of the task's events' filters to the new vma
+ */
+static void perf_addr_filters_adjust(struct vm_area_struct *vma)
+{
+       struct perf_event_context *ctx;
+       int ctxn;
+
+       rcu_read_lock();
+       for_each_task_context_nr(ctxn) {
+               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+               if (!ctx)
+                       continue;
+
+               perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
+       }
+       rcu_read_unlock();
+}
+
 void perf_event_mmap(struct vm_area_struct *vma)
 {
        struct perf_mmap_event mmap_event;
@@ -6200,6 +6608,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
                /* .flags (attr_mmap2 only) */
        };
 
+       perf_addr_filters_adjust(vma);
        perf_event_mmap_event(&mmap_event);
 }
 
@@ -6350,7 +6759,7 @@ static void perf_event_switch(struct task_struct *task,
                },
        };
 
-       perf_event_aux(perf_event_switch_output,
+       perf_iterate_sb(perf_event_switch_output,
                       &switch_event,
                       NULL);
 }
@@ -6491,10 +6900,7 @@ static int __perf_event_overflow(struct perf_event *event,
                irq_work_queue(&event->pending);
        }
 
-       if (event->overflow_handler)
-               event->overflow_handler(event, data, regs);
-       else
-               perf_event_output(event, data, regs);
+       event->overflow_handler(event, data, regs);
 
        if (*perf_event_fasync(event) && event->pending_kill) {
                event->pending_wakeup = 1;
@@ -7081,24 +7487,6 @@ static inline void perf_tp_register(void)
        perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
 }
 
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
-{
-       char *filter_str;
-       int ret;
-
-       if (event->attr.type != PERF_TYPE_TRACEPOINT)
-               return -EINVAL;
-
-       filter_str = strndup_user(arg, PAGE_SIZE);
-       if (IS_ERR(filter_str))
-               return PTR_ERR(filter_str);
-
-       ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
-
-       kfree(filter_str);
-       return ret;
-}
-
 static void perf_event_free_filter(struct perf_event *event)
 {
        ftrace_profile_free_filter(event);
@@ -7153,11 +7541,6 @@ static inline void perf_tp_register(void)
 {
 }
 
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
-{
-       return -ENOENT;
-}
-
 static void perf_event_free_filter(struct perf_event *event)
 {
 }
@@ -7185,6 +7568,387 @@ void perf_bp_event(struct perf_event *bp, void *data)
 }
 #endif
 
+/*
+ * Allocate a new address filter
+ */
+static struct perf_addr_filter *
+perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
+{
+       int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
+       struct perf_addr_filter *filter;
+
+       filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
+       if (!filter)
+               return NULL;
+
+       INIT_LIST_HEAD(&filter->entry);
+       list_add_tail(&filter->entry, filters);
+
+       return filter;
+}
+
+static void free_filters_list(struct list_head *filters)
+{
+       struct perf_addr_filter *filter, *iter;
+
+       list_for_each_entry_safe(filter, iter, filters, entry) {
+               if (filter->inode)
+                       iput(filter->inode);
+               list_del(&filter->entry);
+               kfree(filter);
+       }
+}
+
+/*
+ * Free existing address filters and optionally install new ones
+ */
+static void perf_addr_filters_splice(struct perf_event *event,
+                                    struct list_head *head)
+{
+       unsigned long flags;
+       LIST_HEAD(list);
+
+       if (!has_addr_filter(event))
+               return;
+
+       /* don't bother with children, they don't have their own filters */
+       if (event->parent)
+               return;
+
+       raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
+
+       list_splice_init(&event->addr_filters.list, &list);
+       if (head)
+               list_splice(head, &event->addr_filters.list);
+
+       raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
+
+       free_filters_list(&list);
+}
+
+/*
+ * Scan through mm's vmas and see if one of them matches the
+ * @filter; if so, adjust filter's address range.
+ * Called with mm::mmap_sem down for reading.
+ */
+static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
+                                           struct mm_struct *mm)
+{
+       struct vm_area_struct *vma;
+
+       for (vma = mm->mmap; vma; vma = vma->vm_next) {
+               struct file *file = vma->vm_file;
+               unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
+               unsigned long vma_size = vma->vm_end - vma->vm_start;
+
+               if (!file)
+                       continue;
+
+               if (!perf_addr_filter_match(filter, file, off, vma_size))
+                       continue;
+
+               return vma->vm_start;
+       }
+
+       return 0;
+}
+
+/*
+ * Update event's address range filters based on the
+ * task's existing mappings, if any.
+ */
+static void perf_event_addr_filters_apply(struct perf_event *event)
+{
+       struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+       struct task_struct *task = READ_ONCE(event->ctx->task);
+       struct perf_addr_filter *filter;
+       struct mm_struct *mm = NULL;
+       unsigned int count = 0;
+       unsigned long flags;
+
+       /*
+        * We may observe TASK_TOMBSTONE, which means that the event tear-down
+        * will stop on the parent's child_mutex that our caller is also holding
+        */
+       if (task == TASK_TOMBSTONE)
+               return;
+
+       mm = get_task_mm(event->ctx->task);
+       if (!mm)
+               goto restart;
+
+       down_read(&mm->mmap_sem);
+
+       raw_spin_lock_irqsave(&ifh->lock, flags);
+       list_for_each_entry(filter, &ifh->list, entry) {
+               event->addr_filters_offs[count] = 0;
+
+               if (perf_addr_filter_needs_mmap(filter))
+                       event->addr_filters_offs[count] =
+                               perf_addr_filter_apply(filter, mm);
+
+               count++;
+       }
+
+       event->addr_filters_gen++;
+       raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+       up_read(&mm->mmap_sem);
+
+       mmput(mm);
+
+restart:
+       perf_event_restart(event);
+}
+
+/*
+ * Address range filtering: limiting the data to certain
+ * instruction address ranges. Filters are ioctl()ed to us from
+ * userspace as ASCII strings.
+ *
+ * Filter string format:
+ *
+ * ACTION RANGE_SPEC
+ * where ACTION is one of the
+ *  * "filter": limit the trace to this region
+ *  * "start": start tracing from this address
+ *  * "stop": stop tracing at this address/region;
+ * RANGE_SPEC is
+ *  * for kernel addresses: <start address>[/<size>]
+ *  * for object files:     <start address>[/<size>]@</path/to/object/file>
+ *
+ * if <size> is not specified, the range is treated as a single address.
+ */
+enum {
+       IF_ACT_FILTER,
+       IF_ACT_START,
+       IF_ACT_STOP,
+       IF_SRC_FILE,
+       IF_SRC_KERNEL,
+       IF_SRC_FILEADDR,
+       IF_SRC_KERNELADDR,
+};
+
+enum {
+       IF_STATE_ACTION = 0,
+       IF_STATE_SOURCE,
+       IF_STATE_END,
+};
+
+static const match_table_t if_tokens = {
+       { IF_ACT_FILTER,        "filter" },
+       { IF_ACT_START,         "start" },
+       { IF_ACT_STOP,          "stop" },
+       { IF_SRC_FILE,          "%u/%u@%s" },
+       { IF_SRC_KERNEL,        "%u/%u" },
+       { IF_SRC_FILEADDR,      "%u@%s" },
+       { IF_SRC_KERNELADDR,    "%u" },
+};
+
+/*
+ * Address filter string parser
+ */
+static int
+perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
+                            struct list_head *filters)
+{
+       struct perf_addr_filter *filter = NULL;
+       char *start, *orig, *filename = NULL;
+       struct path path;
+       substring_t args[MAX_OPT_ARGS];
+       int state = IF_STATE_ACTION, token;
+       unsigned int kernel = 0;
+       int ret = -EINVAL;
+
+       orig = fstr = kstrdup(fstr, GFP_KERNEL);
+       if (!fstr)
+               return -ENOMEM;
+
+       while ((start = strsep(&fstr, " ,\n")) != NULL) {
+               ret = -EINVAL;
+
+               if (!*start)
+                       continue;
+
+               /* filter definition begins */
+               if (state == IF_STATE_ACTION) {
+                       filter = perf_addr_filter_new(event, filters);
+                       if (!filter)
+                               goto fail;
+               }
+
+               token = match_token(start, if_tokens, args);
+               switch (token) {
+               case IF_ACT_FILTER:
+               case IF_ACT_START:
+                       filter->filter = 1;
+
+               case IF_ACT_STOP:
+                       if (state != IF_STATE_ACTION)
+                               goto fail;
+
+                       state = IF_STATE_SOURCE;
+                       break;
+
+               case IF_SRC_KERNELADDR:
+               case IF_SRC_KERNEL:
+                       kernel = 1;
+
+               case IF_SRC_FILEADDR:
+               case IF_SRC_FILE:
+                       if (state != IF_STATE_SOURCE)
+                               goto fail;
+
+                       if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
+                               filter->range = 1;
+
+                       *args[0].to = 0;
+                       ret = kstrtoul(args[0].from, 0, &filter->offset);
+                       if (ret)
+                               goto fail;
+
+                       if (filter->range) {
+                               *args[1].to = 0;
+                               ret = kstrtoul(args[1].from, 0, &filter->size);
+                               if (ret)
+                                       goto fail;
+                       }
+
+                       if (token == IF_SRC_FILE) {
+                               filename = match_strdup(&args[2]);
+                               if (!filename) {
+                                       ret = -ENOMEM;
+                                       goto fail;
+                               }
+                       }
+
+                       state = IF_STATE_END;
+                       break;
+
+               default:
+                       goto fail;
+               }
+
+               /*
+                * Filter definition is fully parsed, validate and install it.
+                * Make sure that it doesn't contradict itself or the event's
+                * attribute.
+                */
+               if (state == IF_STATE_END) {
+                       if (kernel && event->attr.exclude_kernel)
+                               goto fail;
+
+                       if (!kernel) {
+                               if (!filename)
+                                       goto fail;
+
+                               /* look up the path and grab its inode */
+                               ret = kern_path(filename, LOOKUP_FOLLOW, &path);
+                               if (ret)
+                                       goto fail_free_name;
+
+                               filter->inode = igrab(d_inode(path.dentry));
+                               path_put(&path);
+                               kfree(filename);
+                               filename = NULL;
+
+                               ret = -EINVAL;
+                               if (!filter->inode ||
+                                   !S_ISREG(filter->inode->i_mode))
+                                       /* free_filters_list() will iput() */
+                                       goto fail;
+                       }
+
+                       /* ready to consume more filters */
+                       state = IF_STATE_ACTION;
+                       filter = NULL;
+               }
+       }
+
+       if (state != IF_STATE_ACTION)
+               goto fail;
+
+       kfree(orig);
+
+       return 0;
+
+fail_free_name:
+       kfree(filename);
+fail:
+       free_filters_list(filters);
+       kfree(orig);
+
+       return ret;
+}
+
+static int
+perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
+{
+       LIST_HEAD(filters);
+       int ret;
+
+       /*
+        * Since this is called in perf_ioctl() path, we're already holding
+        * ctx::mutex.
+        */
+       lockdep_assert_held(&event->ctx->mutex);
+
+       if (WARN_ON_ONCE(event->parent))
+               return -EINVAL;
+
+       /*
+        * For now, we only support filtering in per-task events; doing so
+        * for CPU-wide events requires additional context switching trickery,
+        * since same object code will be mapped at different virtual
+        * since the same object code will be mapped at different virtual
+        */
+       if (!event->ctx->task)
+               return -EOPNOTSUPP;
+
+       ret = perf_event_parse_addr_filter(event, filter_str, &filters);
+       if (ret)
+               return ret;
+
+       ret = event->pmu->addr_filters_validate(&filters);
+       if (ret) {
+               free_filters_list(&filters);
+               return ret;
+       }
+
+       /* remove existing filters, if any */
+       perf_addr_filters_splice(event, &filters);
+
+       /* install new filters */
+       perf_event_for_each_child(event, perf_event_addr_filters_apply);
+
+       return ret;
+}
+
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+       char *filter_str;
+       int ret = -EINVAL;
+
+       if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
+           !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
+           !has_addr_filter(event))
+               return -EINVAL;
+
+       filter_str = strndup_user(arg, PAGE_SIZE);
+       if (IS_ERR(filter_str))
+               return PTR_ERR(filter_str);
+
+       if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
+           event->attr.type == PERF_TYPE_TRACEPOINT)
+               ret = ftrace_profile_set_filter(event, event->attr.config,
+                                               filter_str);
+       else if (has_addr_filter(event))
+               ret = perf_event_set_addr_filter(event, filter_str);
+
+       kfree(filter_str);
+       return ret;
+}
+
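A user-space sketch of feeding an address filter string, in the format documented above, through the existing SET_FILTER ioctl; the object path, offset and size are purely illustrative:

#include <linux/perf_event.h>
#include <sys/ioctl.h>

/*
 * Limit tracing to a 0x1000-byte range at offset 0x4000 into an object
 * file; 'perf_fd' must refer to an event on a PMU that advertises
 * address filters (see the nr_addr_filters attribute below).
 */
static int set_addr_filter(int perf_fd)
{
	return ioctl(perf_fd, PERF_EVENT_IOC_SET_FILTER,
		     "filter 0x4000/0x1000@/usr/lib/libfoo.so");
}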
 /*
  * hrtimer based swevent callback
  */
@@ -7542,6 +8306,20 @@ static void free_pmu_context(struct pmu *pmu)
 out:
        mutex_unlock(&pmus_lock);
 }
+
+/*
+ * Let userspace know that this PMU supports address range filtering:
+ */
+static ssize_t nr_addr_filters_show(struct device *dev,
+                                   struct device_attribute *attr,
+                                   char *page)
+{
+       struct pmu *pmu = dev_get_drvdata(dev);
+
+       return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
+}
+DEVICE_ATTR_RO(nr_addr_filters);
+
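A user-space sketch of reading the new attribute for a given PMU; the sysfs location follows from PMU devices living on the event_source bus:

#include <stdio.h>

/*
 * Return how many address filters the named PMU advertises, or 0 if
 * the attribute is absent (PMU without address filtering) or unreadable.
 */
static unsigned int pmu_nr_addr_filters(const char *pmu)
{
	char path[256];
	unsigned int n = 0;
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/bus/event_source/devices/%s/nr_addr_filters", pmu);
	f = fopen(path, "r");
	if (!f)
		return 0;
	if (fscanf(f, "%u", &n) != 1)
		n = 0;
	fclose(f);
	return n;
}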
 static struct idr pmu_idr;
 
 static ssize_t
@@ -7643,9 +8421,19 @@ static int pmu_dev_alloc(struct pmu *pmu)
        if (ret)
                goto free_dev;
 
+       /* For PMUs with address filters, throw in an extra attribute: */
+       if (pmu->nr_addr_filters)
+               ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
+
+       if (ret)
+               goto del_dev;
+
 out:
        return ret;
 
+del_dev:
+       device_del(pmu->dev);
+
 free_dev:
        put_device(pmu->dev);
        goto out;
@@ -7685,6 +8473,21 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
        }
 
 skip_type:
+       if (pmu->task_ctx_nr == perf_hw_context) {
+               static int hw_context_taken = 0;
+
+               /*
+                * Other than systems with heterogeneous CPUs, it never makes
+                * sense for two PMUs to share perf_hw_context. PMUs which are
+                * uncore must use perf_invalid_context.
+                */
+               if (WARN_ON_ONCE(hw_context_taken &&
+                   !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
+                       pmu->task_ctx_nr = perf_invalid_context;
+
+               hw_context_taken = 1;
+       }
+
        pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
        if (pmu->pmu_cpu_context)
                goto got_cpu_context;
@@ -7772,6 +8575,8 @@ void perf_pmu_unregister(struct pmu *pmu)
        free_percpu(pmu->pmu_disable_count);
        if (pmu->type >= PERF_TYPE_MAX)
                idr_remove(&pmu_idr, pmu->type);
+       if (pmu->nr_addr_filters)
+               device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
        device_del(pmu->dev);
        put_device(pmu->dev);
        free_pmu_context(pmu);
@@ -7843,6 +8648,39 @@ unlock:
        return pmu;
 }
 
+static void attach_sb_event(struct perf_event *event)
+{
+       struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+       raw_spin_lock(&pel->lock);
+       list_add_rcu(&event->sb_list, &pel->list);
+       raw_spin_unlock(&pel->lock);
+}
+
+/*
+ * We keep a list of all !task (and therefore per-cpu) events
+ * that need to receive side-band records.
+ *
+ * This avoids having to scan all the various PMU per-cpu contexts
+ * looking for them.
+ */
+static void account_pmu_sb_event(struct perf_event *event)
+{
+       struct perf_event_attr *attr = &event->attr;
+
+       if (event->parent)
+               return;
+
+       if (event->attach_state & PERF_ATTACH_TASK)
+               return;
+
+       if (attr->mmap || attr->mmap_data || attr->mmap2 ||
+           attr->comm || attr->comm_exec ||
+           attr->task ||
+           attr->context_switch)
+               attach_sb_event(event);
+}
+
 static void account_event_cpu(struct perf_event *event, int cpu)
 {
        if (event->parent)
@@ -7923,6 +8761,8 @@ static void account_event(struct perf_event *event)
 enabled:
 
        account_event_cpu(event, event->cpu);
+
+       account_pmu_sb_event(event);
 }
 
 /*
@@ -7965,6 +8805,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        INIT_LIST_HEAD(&event->sibling_list);
        INIT_LIST_HEAD(&event->rb_entry);
        INIT_LIST_HEAD(&event->active_entry);
+       INIT_LIST_HEAD(&event->addr_filters.list);
        INIT_HLIST_NODE(&event->hlist_entry);
 
 
@@ -7972,6 +8813,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        init_irq_work(&event->pending, perf_pending_event);
 
        mutex_init(&event->mmap_mutex);
+       raw_spin_lock_init(&event->addr_filters.lock);
 
        atomic_long_set(&event->refcount, 1);
        event->cpu              = cpu;
@@ -8006,8 +8848,16 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
                context = parent_event->overflow_handler_context;
        }
 
-       event->overflow_handler = overflow_handler;
-       event->overflow_handler_context = context;
+       if (overflow_handler) {
+               event->overflow_handler = overflow_handler;
+               event->overflow_handler_context = context;
+       } else if (is_write_backward(event)) {
+               event->overflow_handler = perf_event_output_backward;
+               event->overflow_handler_context = NULL;
+       } else {
+               event->overflow_handler = perf_event_output_forward;
+               event->overflow_handler_context = NULL;
+       }
 
        perf_event__state_init(event);
 
@@ -8048,11 +8898,22 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        if (err)
                goto err_pmu;
 
+       if (has_addr_filter(event)) {
+               event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
+                                                  sizeof(unsigned long),
+                                                  GFP_KERNEL);
+               if (!event->addr_filters_offs)
+                       goto err_per_task;
+
+               /* force hw sync on the address filters */
+               event->addr_filters_gen = 1;
+       }
+
        if (!event->parent) {
                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
-                       err = get_callchain_buffers();
+                       err = get_callchain_buffers(attr->sample_max_stack);
                        if (err)
-                               goto err_per_task;
+                               goto err_addr_filters;
                }
        }
 
@@ -8061,6 +8922,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
        return event;
 
+err_addr_filters:
+       kfree(event->addr_filters_offs);
+
 err_per_task:
        exclusive_event_destroy(event);
 
@@ -8239,6 +9103,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
        if (output_event->clock != event->clock)
                goto out;
 
+       /*
+        * Either writing the ring buffer from the beginning or from the end.
+        * Mixing is not allowed.
+        */
+       if (is_write_backward(output_event) != is_write_backward(event))
+               goto out;
+
        /*
         * If both events generate aux data, they must be on the same PMU
         */
@@ -8362,6 +9233,9 @@ SYSCALL_DEFINE5(perf_event_open,
                        return -EINVAL;
        }
 
+       if (!attr.sample_max_stack)
+               attr.sample_max_stack = sysctl_perf_event_max_stack;
+
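A user-space sketch of the attribute field that the fallback above completes: request callchain samples with an explicit depth cap. The value 64 is arbitrary, leaving the field at 0 picks up the perf_event_max_stack sysctl, and the field itself needs uapi headers carrying this change:

#include <linux/perf_event.h>

/* Cap sampled callchains at 64 frames; 0 would mean "use the sysctl". */
static void request_callchains(struct perf_event_attr *attr)
{
	attr->sample_type |= PERF_SAMPLE_CALLCHAIN;
	attr->sample_max_stack = 64;
}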
        /*
         * In cgroup mode, the pid argument is used to pass the fd
         * opened to the cgroup directory in cgroupfs. The cpu argument
@@ -8435,7 +9309,7 @@ SYSCALL_DEFINE5(perf_event_open,
 
        if (is_sampling_event(event)) {
                if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
-                       err = -ENOTSUPP;
+                       err = -EOPNOTSUPP;
                        goto err_alloc;
                }
        }
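This is the hunk the subject line refers to; a user-space sketch of what a caller now observes when asking a PMU without overflow interrupts for a sampling event (the attribute setup itself is assumed to be done elsewhere):

#include <errno.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/perf_event.h>

/* 'attr' is assumed to request sampling (attr->sample_period != 0). */
static int open_sampling_event(struct perf_event_attr *attr)
{
	int fd = syscall(__NR_perf_event_open, attr, 0, -1, -1, 0);

	/* previously -ENOTSUPP, a kernel-internal value unknown to libc */
	if (fd < 0 && errno == EOPNOTSUPP)
		fprintf(stderr, "PMU cannot generate sampling interrupts\n");

	return fd;
}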
@@ -9397,6 +10271,9 @@ static void __init perf_event_init_all_cpus(void)
                swhash = &per_cpu(swevent_htable, cpu);
                mutex_init(&swhash->hlist_mutex);
                INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+
+               INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
+               raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
        }
 }