tracing: Use pid bitmap instead of a pid array for set_event_pid
[cascardo/linux.git] kernel/trace/trace_events.c
index 05ddc08..45f7cc7 100644
@@ -15,7 +15,7 @@
 #include <linux/kthread.h>
 #include <linux/tracefs.h>
 #include <linux/uaccess.h>
-#include <linux/bsearch.h>
+#include <linux/vmalloc.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/sort.h>
@@ -471,23 +471,13 @@ static void ftrace_clear_events(struct trace_array *tr)
        mutex_unlock(&event_mutex);
 }
 
-static int cmp_pid(const void *key, const void *elt)
-{
-       const pid_t *search_pid = key;
-       const pid_t *pid = elt;
-
-       if (*search_pid == *pid)
-               return 0;
-       if (*search_pid < *pid)
-               return -1;
-       return 1;
-}
+/* Shouldn't this be in a header? */
+extern int pid_max;
 
 static bool
-check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task)
+ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 {
-       pid_t search_pid;
-       pid_t *pid;
+       pid_t pid;
 
        /*
         * Return false, because if filtered_pids does not exist,
@@ -496,15 +486,16 @@ check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task)
        if (!filtered_pids)
                return false;
 
-       search_pid = task->pid;
+       pid = task->pid;
 
-       pid = bsearch(&search_pid, filtered_pids->pids,
-                     filtered_pids->nr_pids, sizeof(pid_t),
-                     cmp_pid);
-       if (!pid)
+       /*
+        * If pid_max changed after filtered_pids was created, we
+        * by default ignore all pids greater than the previous pid_max.
+        */
+       if (task->pid >= filtered_pids->pid_max)
                return true;
 
-       return false;
+       return !test_bit(task->pid, filtered_pids->pids);
 }
 
 static void
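
The lookup above replaces a bsearch() over a sorted pid array with a single
test_bit() on a bitmap indexed directly by the pid, so the cost no longer
depends on how many pids are being filtered. A rough user-space sketch of the
same idea (illustrative names only, not the kernel code):

	#include <limits.h>
	#include <stdbool.h>

	#define BITS_PER_LONG	(sizeof(unsigned long) * CHAR_BIT)

	struct pid_bitmap {
		unsigned long *bits;	/* one bit per possible pid */
		unsigned int pid_max;	/* number of bits covered */
	};

	/*
	 * ignore_this_task() is the negation of this test: a task is
	 * ignored when its bit is clear or its pid lies beyond the bitmap.
	 */
	static bool pid_bit_set(const struct pid_bitmap *map, unsigned int pid)
	{
		if (pid >= map->pid_max)
			return false;	/* out of range counts as not set */
		return map->bits[pid / BITS_PER_LONG] &
		       (1UL << (pid % BITS_PER_LONG));
	}
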
@@ -517,8 +508,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
        pid_list = rcu_dereference_sched(tr->filtered_pids);
 
        this_cpu_write(tr->trace_buffer.data->ignore_pid,
-                      check_ignore_pid(pid_list, prev) &&
-                      check_ignore_pid(pid_list, next));
+                      ignore_this_task(pid_list, prev) &&
+                      ignore_this_task(pid_list, next));
 }
 
 static void
@@ -531,7 +522,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
        pid_list = rcu_dereference_sched(tr->filtered_pids);
 
        this_cpu_write(tr->trace_buffer.data->ignore_pid,
-                      check_ignore_pid(pid_list, next));
+                      ignore_this_task(pid_list, next));
 }
 
 static void
@@ -547,7 +538,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
        pid_list = rcu_dereference_sched(tr->filtered_pids);
 
        this_cpu_write(tr->trace_buffer.data->ignore_pid,
-                      check_ignore_pid(pid_list, task));
+                      ignore_this_task(pid_list, task));
 }
 
 static void
@@ -564,7 +555,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
 
        /* Set tracing if current is enabled */
        this_cpu_write(tr->trace_buffer.data->ignore_pid,
-                      check_ignore_pid(pid_list, current));
+                      ignore_this_task(pid_list, current));
 }
 
 static void __ftrace_clear_event_pids(struct trace_array *tr)
@@ -602,7 +593,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
        /* Wait till all users are no longer using pid filtering */
        synchronize_sched();
 
-       free_pages((unsigned long)pid_list->pids, pid_list->order);
+       vfree(pid_list->pids);
        kfree(pid_list);
 }
 
@@ -946,11 +937,32 @@ static void t_stop(struct seq_file *m, void *p)
        mutex_unlock(&event_mutex);
 }
 
+static void *
+p_next(struct seq_file *m, void *v, loff_t *pos)
+{
+       struct trace_array *tr = m->private;
+       struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
+       unsigned long pid = (unsigned long)v;
+
+       (*pos)++;
+
+       /* pid already is +1 of the actual previous bit */
+       pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
+
+       /* Return pid + 1 to allow zero to be represented */
+       if (pid < pid_list->pid_max)
+               return (void *)(pid + 1);
+
+       return NULL;
+}
+
 static void *p_start(struct seq_file *m, loff_t *pos)
        __acquires(RCU)
 {
        struct trace_pid_list *pid_list;
        struct trace_array *tr = m->private;
+       unsigned long pid;
+       loff_t l = 0;
 
        /*
         * Grab the mutex, to keep calls to p_next() having the same
@@ -963,10 +975,18 @@ static void *p_start(struct seq_file *m, loff_t *pos)
 
        pid_list = rcu_dereference_sched(tr->filtered_pids);
 
-       if (!pid_list || *pos >= pid_list->nr_pids)
+       if (!pid_list)
+               return NULL;
+
+       pid = find_first_bit(pid_list->pids, pid_list->pid_max);
+       if (pid >= pid_list->pid_max)
                return NULL;
 
-       return (void *)&pid_list->pids[*pos];
+       /* Return pid + 1 so that zero can be the exit value */
+       for (pid++; pid && l < *pos;
+            pid = (unsigned long)p_next(m, (void *)pid, &l))
+               ;
+       return (void *)pid;
 }
 
 static void p_stop(struct seq_file *m, void *p)
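
seq_file treats a NULL iterator value as "done", so the iterator cannot return
pid 0 directly; p_start() and p_next() therefore hand back pid + 1 and p_show()
subtracts one before printing. A minimal user-space sketch of that encoding
over a plain bit array (my next_set_bit() stands in for the kernel's
find_next_bit()):

	#include <stddef.h>
	#include <limits.h>

	#define BITS_PER_LONG	(sizeof(unsigned long) * CHAR_BIT)

	/* Next set bit at or after 'from', or 'nbits' if none remain. */
	static unsigned long next_set_bit(const unsigned long *bits,
					  unsigned long nbits, unsigned long from)
	{
		while (from < nbits &&
		       !(bits[from / BITS_PER_LONG] & (1UL << (from % BITS_PER_LONG))))
			from++;
		return from;
	}

	/* 'v' carries the previous pid + 1, so the search resumes strictly
	 * after the pid that was last returned. */
	static void *pid_iter_next(const unsigned long *bits, unsigned long nbits,
				   void *v)
	{
		unsigned long pid = next_set_bit(bits, nbits, (unsigned long)v);

		return pid < nbits ? (void *)(pid + 1) : NULL;
	}
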
@@ -976,25 +996,11 @@ static void p_stop(struct seq_file *m, void *p)
        mutex_unlock(&event_mutex);
 }
 
-static void *
-p_next(struct seq_file *m, void *v, loff_t *pos)
-{
-       struct trace_array *tr = m->private;
-       struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
-
-       (*pos)++;
-
-       if (*pos >= pid_list->nr_pids)
-               return NULL;
-
-       return (void *)&pid_list->pids[*pos];
-}
-
 static int p_show(struct seq_file *m, void *v)
 {
-       pid_t *pid = v;
+       unsigned long pid = (unsigned long)v - 1;
 
-       seq_printf(m, "%d\n", *pid);
+       seq_printf(m, "%lu\n", pid);
        return 0;
 }
 
@@ -1543,11 +1549,6 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
        return r;
 }
 
-static int max_pids(struct trace_pid_list *pid_list)
-{
-       return (PAGE_SIZE << pid_list->order) / sizeof(pid_t);
-}
-
 static void ignore_task_cpu(void *data)
 {
        struct trace_array *tr = data;
@@ -1561,7 +1562,7 @@ static void ignore_task_cpu(void *data)
                                             mutex_is_locked(&event_mutex));
 
        this_cpu_write(tr->trace_buffer.data->ignore_pid,
-                      check_ignore_pid(pid_list, current));
+                      ignore_this_task(pid_list, current));
 }
 
 static ssize_t
@@ -1571,7 +1572,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
        struct seq_file *m = filp->private_data;
        struct trace_array *tr = m->private;
        struct trace_pid_list *filtered_pids = NULL;
-       struct trace_pid_list *pid_list = NULL;
+       struct trace_pid_list *pid_list;
        struct trace_event_file *file;
        struct trace_parser parser;
        unsigned long val;
@@ -1579,7 +1580,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
        ssize_t read = 0;
        ssize_t ret = 0;
        pid_t pid;
-       int i;
+       int nr_pids = 0;
 
        if (!cnt)
                return 0;
@@ -1592,10 +1593,43 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
                return -ENOMEM;
 
        mutex_lock(&event_mutex);
+       filtered_pids = rcu_dereference_protected(tr->filtered_pids,
+                                            lockdep_is_held(&event_mutex));
+
        /*
-        * Load as many pids into the array before doing a
-        * swap from the tr->filtered_pids to the new list.
+        * Always recreate a new array. The write is an all or nothing
+        * operation. Always create a new array when adding new pids by
+        * the user. If the operation fails, then the current list is
+        * not modified.
         */
+       pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
+       if (!pid_list) {
+               read = -ENOMEM;
+               goto out;
+       }
+       pid_list->pid_max = READ_ONCE(pid_max);
+       /* Only truncating will shrink pid_max */
+       if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
+               pid_list->pid_max = filtered_pids->pid_max;
+       pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
+       if (!pid_list->pids) {
+               kfree(pid_list);
+               read = -ENOMEM;
+               goto out;
+       }
+       if (filtered_pids) {
+               /* copy the current bits to the new max */
+               pid = find_first_bit(filtered_pids->pids,
+                                    filtered_pids->pid_max);
+               while (pid < filtered_pids->pid_max) {
+                       set_bit(pid, pid_list->pids);
+                       pid = find_next_bit(filtered_pids->pids,
+                                           filtered_pids->pid_max,
+                                           pid + 1);
+                       nr_pids++;
+               }
+       }
+
        while (cnt > 0) {
 
                this_pos = 0;
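
Note that the new list is rebuilt from scratch before any input is parsed: a
bitmap of (pid_max + 7) >> 3 bytes is vzalloc()'d and the bits of the existing
filter are copied across, so a write that fails later leaves tr->filtered_pids
untouched. With the default pid_max of 32768 the bitmap is a fixed 4 KiB;
raising pid_max toward its roughly 4-million ceiling would grow it to about
half a megabyte, which is why vzalloc() replaces the old page-order allocation.
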
@@ -1613,92 +1647,35 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
                ret = -EINVAL;
                if (kstrtoul(parser.buffer, 0, &val))
                        break;
-               if (val > INT_MAX)
+               if (val >= pid_list->pid_max)
                        break;
 
                pid = (pid_t)val;
 
-               ret = -ENOMEM;
-               if (!pid_list) {
-                       pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
-                       if (!pid_list)
-                               break;
-
-                       filtered_pids = rcu_dereference_protected(tr->filtered_pids,
-                                                       lockdep_is_held(&event_mutex));
-                       if (filtered_pids)
-                               pid_list->order = filtered_pids->order;
-                       else
-                               pid_list->order = 0;
-
-                       pid_list->pids = (void *)__get_free_pages(GFP_KERNEL,
-                                                                 pid_list->order);
-                       if (!pid_list->pids)
-                               break;
-
-                       if (filtered_pids) {
-                               pid_list->nr_pids = filtered_pids->nr_pids;
-                               memcpy(pid_list->pids, filtered_pids->pids,
-                                      pid_list->nr_pids * sizeof(pid_t));
-                       } else
-                               pid_list->nr_pids = 0;
-               }
-
-               if (pid_list->nr_pids >= max_pids(pid_list)) {
-                       pid_t *pid_page;
-
-                       pid_page = (void *)__get_free_pages(GFP_KERNEL,
-                                                           pid_list->order + 1);
-                       if (!pid_page)
-                               break;
-                       memcpy(pid_page, pid_list->pids,
-                              pid_list->nr_pids * sizeof(pid_t));
-                       free_pages((unsigned long)pid_list->pids, pid_list->order);
-
-                       pid_list->order++;
-                       pid_list->pids = pid_page;
-               }
+               set_bit(pid, pid_list->pids);
+               nr_pids++;
 
-               pid_list->pids[pid_list->nr_pids++] = pid;
                trace_parser_clear(&parser);
                ret = 0;
        }
        trace_parser_put(&parser);
 
        if (ret < 0) {
-               if (pid_list)
-                       free_pages((unsigned long)pid_list->pids, pid_list->order);
+               vfree(pid_list->pids);
                kfree(pid_list);
-               mutex_unlock(&event_mutex);
-               return ret;
-       }
-
-       if (!pid_list) {
-               mutex_unlock(&event_mutex);
-               return ret;
+               read = ret;
+               goto out;
        }
 
-       sort(pid_list->pids, pid_list->nr_pids, sizeof(pid_t), cmp_pid, NULL);
-
-       /* Remove duplicates */
-       for (i = 1; i < pid_list->nr_pids; i++) {
-               int start = i;
-
-               while (i < pid_list->nr_pids &&
-                      pid_list->pids[i - 1] == pid_list->pids[i])
-                       i++;
-
-               if (start != i) {
-                       if (i < pid_list->nr_pids) {
-                               memmove(&pid_list->pids[start], &pid_list->pids[i],
-                                       (pid_list->nr_pids - i) * sizeof(pid_t));
-                               pid_list->nr_pids -= i - start;
-                               i = start;
-                       } else
-                               pid_list->nr_pids = start;
-               }
+       if (!nr_pids) {
+               /* Cleared the list of pids */
+               vfree(pid_list->pids);
+               kfree(pid_list);
+               read = ret;
+               if (!filtered_pids)
+                       goto out;
+               pid_list = NULL;
        }
-
        rcu_assign_pointer(tr->filtered_pids, pid_list);
 
        list_for_each_entry(file, &tr->events, list) {
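
If nothing ends up set in the bitmap (nr_pids == 0), the freshly built bitmap
is discarded and a NULL pid_list is published, clearing the filter entirely.
The swap itself follows the usual RCU pattern: rcu_assign_pointer() makes the
new bitmap visible to the sched_switch/sched_wakeup probes, and the old one is
only vfree()'d after the synchronize_sched() in the next hunk guarantees that
no probe is still reading it.
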
@@ -1708,7 +1685,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
        if (filtered_pids) {
                synchronize_sched();
 
-               free_pages((unsigned long)filtered_pids->pids, filtered_pids->order);
+               vfree(filtered_pids->pids);
                kfree(filtered_pids);
        } else {
                /*
@@ -1745,10 +1722,12 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
         */
        on_each_cpu(ignore_task_cpu, tr, 1);
 
+ out:
        mutex_unlock(&event_mutex);
 
        ret = read;
-       *ppos += read;
+       if (read > 0)
+               *ppos += read;
 
        return ret;
 }