Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/rzhang/linux
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 953c143..d724e77 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly     = DEFAULT_MAX_SAMPLE_RATE;
 static int max_samples_per_tick __read_mostly  = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
 
-static atomic_t perf_sample_allowed_ns __read_mostly =
-       ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+static int perf_sample_allowed_ns __read_mostly =
+       DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
 
 void update_perf_cpu_limits(void)
 {
@@ -184,7 +184,7 @@ void update_perf_cpu_limits(void)
 
        tmp *= sysctl_perf_cpu_time_max_percent;
        do_div(tmp, 100);
-       atomic_set(&perf_sample_allowed_ns, tmp);
+       ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
 }
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
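
Note: perf_sample_allowed_ns was only ever loaded and stored whole, so the atomic_t bought nothing here; a plain int with ACCESS_ONCE() suffices. A minimal sketch of the read-once pattern, using ACCESS_ONCE() as the kernel defined it in this era (the reader/writer names are illustrative):

	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

	static int limit_ns;	/* stand-in for perf_sample_allowed_ns */

	static void writer(int new_ns)
	{
		ACCESS_ONCE(limit_ns) = new_ns;	/* compiler emits exactly one store */
	}

	static int reader(void)
	{
		/* snapshot once; all later uses see one consistent value */
		int snap = ACCESS_ONCE(limit_ns);
		return snap;
	}

This mirrors perf_sample_event_took() below, which now snapshots allowed_ns once instead of calling atomic_read() twice.
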
@@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
-       int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+       int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
        if (ret || !write)
                return ret;
@@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
  * we detect that events are taking too long.
  */
 #define NR_ACCUMULATED_SAMPLES 128
-DEFINE_PER_CPU(u64, running_sample_length);
+static DEFINE_PER_CPU(u64, running_sample_length);
 
 void perf_sample_event_took(u64 sample_len_ns)
 {
        u64 avg_local_sample_len;
        u64 local_samples_len;
+       u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 
-       if (atomic_read(&perf_sample_allowed_ns) == 0)
+       if (allowed_ns == 0)
                return;
 
        /* decay the counter by 1 average sample */
@@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns)
         */
        avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
 
-       if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+       if (avg_local_sample_len <= allowed_ns)
                return;
 
        if (max_samples_per_tick <= 1)
@@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns)
        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 
        printk_ratelimited(KERN_WARNING
-                       "perf samples too long (%lld > %d), lowering "
+                       "perf samples too long (%lld > %lld), lowering "
                        "kernel.perf_event_max_sample_rate to %d\n",
-                       avg_local_sample_len,
-                       atomic_read(&perf_sample_allowed_ns),
+                       avg_local_sample_len, allowed_ns,
                        sysctl_perf_event_sample_rate);
 
        update_perf_cpu_limits();
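
The running_sample_length bookkeeping is an exponentially decaying accumulator: each call retires 1/NR_ACCUMULATED_SAMPLES of the running total before adding the new sample, so the average tracks recent sample cost. A standalone userspace sketch of the arithmetic (sample values illustrative):

	#include <stdio.h>

	#define NR_ACCUMULATED_SAMPLES 128

	int main(void)
	{
		unsigned long long running = 0;
		int i;

		/* feed 1000 samples of 5000ns each */
		for (i = 0; i < 1000; i++) {
			running -= running / NR_ACCUMULATED_SAMPLES; /* decay one avg sample */
			running += 5000;			     /* accumulate new sample */
		}
		/* converges toward 5000 */
		printf("avg: %llu ns\n", running / NR_ACCUMULATED_SAMPLES);
		return 0;
	}
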
@@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx)
                put_ctx(ctx->parent_ctx);
                ctx->parent_ctx = NULL;
        }
+       ctx->generation++;
 }
 
 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
        ctx->nr_events++;
        if (event->attr.inherit_stat)
                ctx->nr_stat++;
+
+       ctx->generation++;
 }
 
 /*
@@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event)
        if (sample_type & PERF_SAMPLE_DATA_SRC)
                size += sizeof(data->data_src.val);
 
+       if (sample_type & PERF_SAMPLE_TRANSACTION)
+               size += sizeof(data->txn);
+
        event->header_size = size;
 }
 
@@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
         */
        if (event->state > PERF_EVENT_STATE_OFF)
                event->state = PERF_EVENT_STATE_OFF;
+
+       ctx->generation++;
 }
 
 static void perf_group_detach(struct perf_event *event)
@@ -2146,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 }
 
 /*
- * Test whether two contexts are equivalent, i.e. whether they
- * have both been cloned from the same version of the same context
- * and they both have the same number of enabled events.
- * If the number of enabled events is the same, then the set
- * of enabled events should be the same, because these are both
- * inherited contexts, therefore we can't access individual events
- * in them directly with an fd; we can only enable/disable all
- * events via prctl, or enable/disable all events in a family
- * via ioctl, which will have the same effect on both contexts.
+ * Test whether two contexts are equivalent, i.e. whether they have both been
+ * cloned from the same version of the same context.
+ *
+ * Equivalence is measured using a generation number in the context that is
+ * incremented on each modification to it; see unclone_ctx(), list_add_event()
+ * and list_del_event().
  */
 static int context_equiv(struct perf_event_context *ctx1,
                         struct perf_event_context *ctx2)
 {
-       return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
-               && ctx1->parent_gen == ctx2->parent_gen
-               && !ctx1->pin_count && !ctx2->pin_count;
+       /* Pinning disables the swap optimization */
+       if (ctx1->pin_count || ctx2->pin_count)
+               return 0;
+
+       /* If ctx1 is the parent of ctx2 */
+       if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
+               return 1;
+
+       /* If ctx2 is the parent of ctx1 */
+       if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
+               return 1;
+
+       /*
+        * If ctx1 and ctx2 have the same parent, we flatten the parent
+        * hierarchy, see perf_event_init_context().
+        */
+       if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
+                       ctx1->parent_gen == ctx2->parent_gen)
+               return 1;
+
+       /* Unmatched */
+       return 0;
 }
 
 static void __perf_event_sync_stat(struct perf_event *event,
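
The generation scheme the new comment describes can be modeled compactly: cloning snapshots the parent's generation into the child's parent_gen, and every later modification (unclone_ctx(), list_add_event(), list_del_event()) bumps generation, breaking equivalence. A userspace sketch covering the three accepted shapes (pin_count omitted; types illustrative):

	#include <stdio.h>

	struct ctx {
		struct ctx *parent;
		unsigned long long gen, parent_gen;
	};

	static int equiv(struct ctx *a, struct ctx *b)
	{
		if (a == b->parent && a->gen == b->parent_gen)	/* a parents b */
			return 1;
		if (a->parent == b && a->parent_gen == b->gen)	/* b parents a */
			return 1;
		if (a->parent && a->parent == b->parent &&
		    a->parent_gen == b->parent_gen)		/* shared parent */
			return 1;
		return 0;
	}

	int main(void)
	{
		struct ctx parent = { 0, 7, 0 };
		struct ctx child  = { &parent, 0, 7 };	/* clone at gen 7 */

		printf("%d\n", equiv(&parent, &child));	/* 1: unmodified */
		parent.gen++;				/* e.g. list_add_event() */
		printf("%d\n", equiv(&parent, &child));	/* 0: no longer equivalent */
		return 0;
	}
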
@@ -2210,9 +2234,6 @@ static void __perf_event_sync_stat(struct perf_event *event,
        perf_event_update_userpage(next_event);
 }
 
-#define list_next_entry(pos, member) \
-       list_entry(pos->member.next, typeof(*pos), member)
-
 static void perf_event_sync_stat(struct perf_event_context *ctx,
                                   struct perf_event_context *next_ctx)
 {
@@ -2244,7 +2265,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 {
        struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
        struct perf_event_context *next_ctx;
-       struct perf_event_context *parent;
+       struct perf_event_context *parent, *next_parent;
        struct perf_cpu_context *cpuctx;
        int do_switch = 1;
 
@@ -2256,10 +2277,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
                return;
 
        rcu_read_lock();
-       parent = rcu_dereference(ctx->parent_ctx);
        next_ctx = next->perf_event_ctxp[ctxn];
-       if (parent && next_ctx &&
-           rcu_dereference(next_ctx->parent_ctx) == parent) {
+       if (!next_ctx)
+               goto unlock;
+
+       parent = rcu_dereference(ctx->parent_ctx);
+       next_parent = rcu_dereference(next_ctx->parent_ctx);
+
+       /* If neither context has a parent context, they cannot be clones. */
+       if (!parent && !next_parent)
+               goto unlock;
+
+       if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
                /*
                 * Looks like the two contexts are clones, so we might be
                 * able to optimize the context switch.  We lock both
@@ -2287,6 +2316,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
                raw_spin_unlock(&next_ctx->lock);
                raw_spin_unlock(&ctx->lock);
        }
+unlock:
        rcu_read_unlock();
 
        if (do_switch) {
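
The rewritten check above is only a pre-filter: under rcu_read_lock() it establishes "possibly clones" cheaply, and context_equiv() then re-validates the generations with both ctx->lock's held. The accepted shapes, restated as a standalone predicate (a sketch; names illustrative):

	/*
	 * May return 1 for contexts that are no longer equivalent; the
	 * locked context_equiv() check is the one that decides.
	 */
	static int possibly_clones(void *ctx, void *parent,
				   void *next_ctx, void *next_parent)
	{
		if (!parent && !next_parent)	/* no parents: cannot be clones */
			return 0;
		return next_parent == ctx ||	/* ctx parents next_ctx */
		       next_ctx == parent ||	/* next_ctx parents ctx */
		       next_parent == parent;	/* common parent */
	}
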
@@ -4572,6 +4602,9 @@ void perf_output_sample(struct perf_output_handle *handle,
        if (sample_type & PERF_SAMPLE_DATA_SRC)
                perf_output_put(handle, data->data_src.val);
 
+       if (sample_type & PERF_SAMPLE_TRANSACTION)
+               perf_output_put(handle, data->txn);
+
        if (!event->attr.watermark) {
                int wakeup_events = event->attr.wakeup_events;
 
@@ -5100,27 +5133,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
        unsigned int size;
        char tmp[16];
        char *buf = NULL;
-       const char *name;
-
-       memset(tmp, 0, sizeof(tmp));
+       char *name;
 
        if (file) {
                struct inode *inode;
                dev_t dev;
+
+               buf = kmalloc(PATH_MAX, GFP_KERNEL);
+               if (!buf) {
+                       name = "//enomem";
+                       goto cpy_name;
+               }
                /*
-                * d_path works from the end of the rb backwards, so we
+                * d_path() works from the end of the rb backwards, so we
                 * need to add enough zero bytes after the string to handle
                 * the 64bit alignment we do later.
                 */
-               buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
-               if (!buf) {
-                       name = strncpy(tmp, "//enomem", sizeof(tmp));
-                       goto got_name;
-               }
-               name = d_path(&file->f_path, buf, PATH_MAX);
+               name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
                if (IS_ERR(name)) {
-                       name = strncpy(tmp, "//toolong", sizeof(tmp));
-                       goto got_name;
+                       name = "//toolong";
+                       goto cpy_name;
                }
                inode = file_inode(vma->vm_file);
                dev = inode->i_sb->s_dev;
@@ -5128,34 +5160,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
                gen = inode->i_generation;
                maj = MAJOR(dev);
                min = MINOR(dev);
-
+               goto got_name;
        } else {
-               if (arch_vma_name(mmap_event->vma)) {
-                       name = strncpy(tmp, arch_vma_name(mmap_event->vma),
-                                      sizeof(tmp) - 1);
-                       tmp[sizeof(tmp) - 1] = '\0';
-                       goto got_name;
-               }
+               name = (char *)arch_vma_name(vma);
+               if (name)
+                       goto cpy_name;
 
-               if (!vma->vm_mm) {
-                       name = strncpy(tmp, "[vdso]", sizeof(tmp));
-                       goto got_name;
-               } else if (vma->vm_start <= vma->vm_mm->start_brk &&
+               if (vma->vm_start <= vma->vm_mm->start_brk &&
                                vma->vm_end >= vma->vm_mm->brk) {
-                       name = strncpy(tmp, "[heap]", sizeof(tmp));
-                       goto got_name;
-               } else if (vma->vm_start <= vma->vm_mm->start_stack &&
+                       name = "[heap]";
+                       goto cpy_name;
+               }
+               if (vma->vm_start <= vma->vm_mm->start_stack &&
                                vma->vm_end >= vma->vm_mm->start_stack) {
-                       name = strncpy(tmp, "[stack]", sizeof(tmp));
-                       goto got_name;
+                       name = "[stack]";
+                       goto cpy_name;
                }
 
-               name = strncpy(tmp, "//anon", sizeof(tmp));
-               goto got_name;
+               name = "//anon";
+               goto cpy_name;
        }
 
+cpy_name:
+       strlcpy(tmp, name, sizeof(tmp));
+       name = tmp;
 got_name:
-       size = ALIGN(strlen(name)+1, sizeof(u64));
+       /*
+        * Since our buffer works in 8 byte units we need to align our string
+        * size to a multiple of 8. However, we must guarantee the tail end is
+        * zero'd out to avoid leaking random bits to userspace.
+        */
+       size = strlen(name)+1;
+       while (!IS_ALIGNED(size, sizeof(u64)))
+               name[size++] = '\0';
 
        mmap_event->file_name = name;
        mmap_event->file_size = size;
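
The open-coded padding loop replaces the old ALIGN() so that the bytes between strlen(name)+1 and the next 8-byte boundary are guaranteed zero, rather than whatever the buffer happened to contain, since the full rounded size is copied out to userspace. A standalone sketch (userspace; IS_ALIGNED() follows the kernel's definition):

	#include <stdio.h>
	#include <string.h>

	#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

	int main(void)
	{
		char name[32] = "libfoo.so";	/* buffer leaves room to pad */
		size_t size = strlen(name) + 1;	/* 10 */

		while (!IS_ALIGNED(size, sizeof(unsigned long long)))
			name[size++] = '\0';	/* explicitly zero the tail */

		printf("padded size: %zu\n", size);	/* 16 */
		return 0;
	}
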
@@ -6292,6 +6329,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
 
        return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
 }
+static DEVICE_ATTR_RO(type);
 
 static ssize_t
 perf_event_mux_interval_ms_show(struct device *dev,
@@ -6336,17 +6374,19 @@ perf_event_mux_interval_ms_store(struct device *dev,
 
        return count;
 }
+static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
 
-static struct device_attribute pmu_dev_attrs[] = {
-       __ATTR_RO(type),
-       __ATTR_RW(perf_event_mux_interval_ms),
-       __ATTR_NULL,
+static struct attribute *pmu_dev_attrs[] = {
+       &dev_attr_type.attr,
+       &dev_attr_perf_event_mux_interval_ms.attr,
+       NULL,
 };
+ATTRIBUTE_GROUPS(pmu_dev);
 
 static int pmu_bus_running;
 static struct bus_type pmu_bus = {
        .name           = "event_source",
-       .dev_attrs      = pmu_dev_attrs,
+       .dev_groups     = pmu_dev_groups,
 };
 
 static void pmu_dev_release(struct device *dev)
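
DEVICE_ATTR_RO()/DEVICE_ATTR_RW() above generate dev_attr_type and dev_attr_perf_event_mux_interval_ms from the _show/_store function names, and ATTRIBUTE_GROUPS(pmu_dev) supplies the pmu_dev_groups pointer that pmu_bus now uses via .dev_groups. Roughly, the macro's effect (a sketch of the expansion, per the driver core of this era):

	static const struct attribute_group pmu_dev_group = {
		.attrs = pmu_dev_attrs,
	};

	static const struct attribute_group *pmu_dev_groups[] = {
		&pmu_dev_group,
		NULL,
	};
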
@@ -7126,7 +7166,6 @@ SYSCALL_DEFINE5(perf_event_open,
        }
 
        perf_install_in_context(ctx, event, event->cpu);
-       ++ctx->generation;
        perf_unpin_context(ctx);
        mutex_unlock(&ctx->mutex);
 
@@ -7209,7 +7248,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
        perf_install_in_context(ctx, event, cpu);
-       ++ctx->generation;
        perf_unpin_context(ctx);
        mutex_unlock(&ctx->mutex);