tools/perf/builtin-sched.c
1 #include "builtin.h"
2 #include "perf.h"
3
4 #include "util/util.h"
5 #include "util/evlist.h"
6 #include "util/cache.h"
7 #include "util/evsel.h"
8 #include "util/symbol.h"
9 #include "util/thread.h"
10 #include "util/header.h"
11 #include "util/session.h"
12 #include "util/tool.h"
13 #include "util/cloexec.h"
14 #include "util/thread_map.h"
15 #include "util/color.h"
16
17 #include <subcmd/parse-options.h>
18 #include "util/trace-event.h"
19
20 #include "util/debug.h"
21
22 #include <sys/prctl.h>
23 #include <sys/resource.h>
24
25 #include <semaphore.h>
26 #include <pthread.h>
27 #include <math.h>
28 #include <api/fs/fs.h>
29
30 #define PR_SET_NAME             15               /* Set process name */
31 #define MAX_CPUS                4096
32 #define COMM_LEN                20
33 #define SYM_LEN                 129
34 #define MAX_PID                 1024000
35
36 struct sched_atom;
37
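/*
 * Per-task replay state: the sched atoms recorded for one PID, plus the
 * worker thread and semaphores used to re-enact them during 'perf sched
 * replay'.
 */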
38 struct task_desc {
39         unsigned long           nr;
40         unsigned long           pid;
41         char                    comm[COMM_LEN];
42
43         unsigned long           nr_events;
44         unsigned long           curr_event;
45         struct sched_atom       **atoms;
46
47         pthread_t               thread;
48         sem_t                   sleep_sem;
49
50         sem_t                   ready_for_work;
51         sem_t                   work_done_sem;
52
53         u64                     cpu_usage;
54 };
55
56 enum sched_event_type {
57         SCHED_EVENT_RUN,
58         SCHED_EVENT_SLEEP,
59         SCHED_EVENT_WAKEUP,
60         SCHED_EVENT_MIGRATION,
61 };
62
63 struct sched_atom {
64         enum sched_event_type   type;
65         int                     specific_wait;
66         u64                     timestamp;
67         u64                     duration;
68         unsigned long           nr;
69         sem_t                   *wait_sem;
70         struct task_desc        *wakee;
71 };
72
73 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
74
75 enum thread_state {
76         THREAD_SLEEPING = 0,
77         THREAD_WAIT_CPU,
78         THREAD_SCHED_IN,
79         THREAD_IGNORE
80 };
81
82 struct work_atom {
83         struct list_head        list;
84         enum thread_state       state;
85         u64                     sched_out_time;
86         u64                     wake_up_time;
87         u64                     sched_in_time;
88         u64                     runtime;
89 };
90
91 struct work_atoms {
92         struct list_head        work_list;
93         struct thread           *thread;
94         struct rb_node          node;
95         u64                     max_lat;
96         u64                     max_lat_at;
97         u64                     total_lat;
98         u64                     nb_atoms;
99         u64                     total_runtime;
100         int                     num_merged;
101 };
102
103 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
104
105 struct perf_sched;
106
107 struct trace_sched_handler {
108         int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
109                             struct perf_sample *sample, struct machine *machine);
110
111         int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
112                              struct perf_sample *sample, struct machine *machine);
113
114         int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
115                             struct perf_sample *sample, struct machine *machine);
116
117         /* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
118         int (*fork_event)(struct perf_sched *sched, union perf_event *event,
119                           struct machine *machine);
120
121         int (*migrate_task_event)(struct perf_sched *sched,
122                                   struct perf_evsel *evsel,
123                                   struct perf_sample *sample,
124                                   struct machine *machine);
125 };
126
127 #define COLOR_PIDS PERF_COLOR_BLUE
128 #define COLOR_CPUS PERF_COLOR_BG_RED
129
130 struct perf_sched_map {
131         DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
132         int                     *comp_cpus;
133         bool                     comp;
134         struct thread_map       *color_pids;
135         const char              *color_pids_str;
136         struct cpu_map          *color_cpus;
137         const char              *color_cpus_str;
138         struct cpu_map          *cpus;
139         const char              *cpus_str;
140 };
141
142 struct perf_sched {
143         struct perf_tool tool;
144         const char       *sort_order;
145         unsigned long    nr_tasks;
146         struct task_desc **pid_to_task;
147         struct task_desc **tasks;
148         const struct trace_sched_handler *tp_handler;
149         pthread_mutex_t  start_work_mutex;
150         pthread_mutex_t  work_done_wait_mutex;
151         int              profile_cpu;
152 /*
153  * Track the current task - that way we can know whether there's any
154  * weird events, such as a task being switched away that is not current.
155  */
156         int              max_cpu;
157         u32              curr_pid[MAX_CPUS];
158         struct thread    *curr_thread[MAX_CPUS];
159         char             next_shortname1;
160         char             next_shortname2;
161         unsigned int     replay_repeat;
162         unsigned long    nr_run_events;
163         unsigned long    nr_sleep_events;
164         unsigned long    nr_wakeup_events;
165         unsigned long    nr_sleep_corrections;
166         unsigned long    nr_run_events_optimized;
167         unsigned long    targetless_wakeups;
168         unsigned long    multitarget_wakeups;
169         unsigned long    nr_runs;
170         unsigned long    nr_timestamps;
171         unsigned long    nr_unordered_timestamps;
172         unsigned long    nr_context_switch_bugs;
173         unsigned long    nr_events;
174         unsigned long    nr_lost_chunks;
175         unsigned long    nr_lost_events;
176         u64              run_measurement_overhead;
177         u64              sleep_measurement_overhead;
178         u64              start_time;
179         u64              cpu_usage;
180         u64              runavg_cpu_usage;
181         u64              parent_cpu_usage;
182         u64              runavg_parent_cpu_usage;
183         u64              sum_runtime;
184         u64              sum_fluct;
185         u64              run_avg;
186         u64              all_runtime;
187         u64              all_count;
188         u64              cpu_last_switched[MAX_CPUS];
189         struct rb_root   atom_root, sorted_atom_root, merged_atom_root;
190         struct list_head sort_list, cmp_pid;
191         bool force;
192         bool skip_merge;
193         struct perf_sched_map map;
194 };
195
196 static u64 get_nsecs(void)
197 {
198         struct timespec ts;
199
200         clock_gettime(CLOCK_MONOTONIC, &ts);
201
202         return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
203 }
204
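/*
 * Busy-loop for roughly 'nsecs' nanoseconds, compensating for the
 * measured overhead of the timing calls themselves.
 */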
205 static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
206 {
207         u64 T0 = get_nsecs(), T1;
208
209         do {
210                 T1 = get_nsecs();
211         } while (T1 + sched->run_measurement_overhead < T0 + nsecs);
212 }
213
214 static void sleep_nsecs(u64 nsecs)
215 {
216         struct timespec ts;
217
218         ts.tv_nsec = nsecs % 1000000000ULL;
219         ts.tv_sec = nsecs / 1000000000ULL;
220
221         nanosleep(&ts, NULL);
222 }
223
224 static void calibrate_run_measurement_overhead(struct perf_sched *sched)
225 {
226         u64 T0, T1, delta, min_delta = 1000000000ULL;
227         int i;
228
229         for (i = 0; i < 10; i++) {
230                 T0 = get_nsecs();
231                 burn_nsecs(sched, 0);
232                 T1 = get_nsecs();
233                 delta = T1-T0;
234                 min_delta = min(min_delta, delta);
235         }
236         sched->run_measurement_overhead = min_delta;
237
238         printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
239 }
240
241 static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
242 {
243         u64 T0, T1, delta, min_delta = 1000000000ULL;
244         int i;
245
246         for (i = 0; i < 10; i++) {
247                 T0 = get_nsecs();
248                 sleep_nsecs(10000);
249                 T1 = get_nsecs();
250                 delta = T1-T0;
251                 min_delta = min(min_delta, delta);
252         }
253         min_delta -= 10000;
254         sched->sleep_measurement_overhead = min_delta;
255
256         printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
257 }
258
259 static struct sched_atom *
260 get_new_event(struct task_desc *task, u64 timestamp)
261 {
262         struct sched_atom *event = zalloc(sizeof(*event));
263         unsigned long idx = task->nr_events;
264         size_t size;
265
266         event->timestamp = timestamp;
267         event->nr = idx;
268
269         task->nr_events++;
270         size = sizeof(struct sched_atom *) * task->nr_events;
271         task->atoms = realloc(task->atoms, size);
272         BUG_ON(!task->atoms);
273
274         task->atoms[idx] = event;
275
276         return event;
277 }
278
279 static struct sched_atom *last_event(struct task_desc *task)
280 {
281         if (!task->nr_events)
282                 return NULL;
283
284         return task->atoms[task->nr_events - 1];
285 }
286
287 static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
288                                 u64 timestamp, u64 duration)
289 {
290         struct sched_atom *event, *curr_event = last_event(task);
291
292         /*
293          * optimize an existing RUN event by merging this one
294          * to it:
295          */
296         if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
297                 sched->nr_run_events_optimized++;
298                 curr_event->duration += duration;
299                 return;
300         }
301
302         event = get_new_event(task, timestamp);
303
304         event->type = SCHED_EVENT_RUN;
305         event->duration = duration;
306
307         sched->nr_run_events++;
308 }
309
310 static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
311                                    u64 timestamp, struct task_desc *wakee)
312 {
313         struct sched_atom *event, *wakee_event;
314
315         event = get_new_event(task, timestamp);
316         event->type = SCHED_EVENT_WAKEUP;
317         event->wakee = wakee;
318
319         wakee_event = last_event(wakee);
320         if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
321                 sched->targetless_wakeups++;
322                 return;
323         }
324         if (wakee_event->wait_sem) {
325                 sched->multitarget_wakeups++;
326                 return;
327         }
328
329         wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
330         sem_init(wakee_event->wait_sem, 0, 0);
331         wakee_event->specific_wait = 1;
332         event->wait_sem = wakee_event->wait_sem;
333
334         sched->nr_wakeup_events++;
335 }
336
337 static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
338                                   u64 timestamp, u64 task_state __maybe_unused)
339 {
340         struct sched_atom *event = get_new_event(task, timestamp);
341
342         event->type = SCHED_EVENT_SLEEP;
343
344         sched->nr_sleep_events++;
345 }
346
347 static struct task_desc *register_pid(struct perf_sched *sched,
348                                       unsigned long pid, const char *comm)
349 {
350         struct task_desc *task;
351         static int pid_max;
352
353         if (sched->pid_to_task == NULL) {
354                 if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
355                         pid_max = MAX_PID;
356                 BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
357         }
358         if (pid >= (unsigned long)pid_max) {
359                 BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) *
360                         sizeof(struct task_desc *))) == NULL);
361                 while (pid >= (unsigned long)pid_max)
362                         sched->pid_to_task[pid_max++] = NULL;
363         }
364
365         task = sched->pid_to_task[pid];
366
367         if (task)
368                 return task;
369
370         task = zalloc(sizeof(*task));
371         task->pid = pid;
372         task->nr = sched->nr_tasks;
373         strcpy(task->comm, comm);
374         /*
375          * every task starts in sleeping state - this gets ignored
376          * if there's no wakeup pointing to this sleep state:
377          */
378         add_sched_event_sleep(sched, task, 0, 0);
379
380         sched->pid_to_task[pid] = task;
381         sched->nr_tasks++;
382         sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
383         BUG_ON(!sched->tasks);
384         sched->tasks[task->nr] = task;
385
386         if (verbose)
387                 printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
388
389         return task;
390 }
391
392
393 static void print_task_traces(struct perf_sched *sched)
394 {
395         struct task_desc *task;
396         unsigned long i;
397
398         for (i = 0; i < sched->nr_tasks; i++) {
399                 task = sched->tasks[i];
400                 printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
401                         task->nr, task->comm, task->pid, task->nr_events);
402         }
403 }
404
405 static void add_cross_task_wakeups(struct perf_sched *sched)
406 {
407         struct task_desc *task1, *task2;
408         unsigned long i, j;
409
410         for (i = 0; i < sched->nr_tasks; i++) {
411                 task1 = sched->tasks[i];
412                 j = i + 1;
413                 if (j == sched->nr_tasks)
414                         j = 0;
415                 task2 = sched->tasks[j];
416                 add_sched_event_wakeup(sched, task1, 0, task2);
417         }
418 }
419
420 static void perf_sched__process_event(struct perf_sched *sched,
421                                       struct sched_atom *atom)
422 {
423         int ret = 0;
424
425         switch (atom->type) {
426                 case SCHED_EVENT_RUN:
427                         burn_nsecs(sched, atom->duration);
428                         break;
429                 case SCHED_EVENT_SLEEP:
430                         if (atom->wait_sem)
431                                 ret = sem_wait(atom->wait_sem);
432                         BUG_ON(ret);
433                         break;
434                 case SCHED_EVENT_WAKEUP:
435                         if (atom->wait_sem)
436                                 ret = sem_post(atom->wait_sem);
437                         BUG_ON(ret);
438                         break;
439                 case SCHED_EVENT_MIGRATION:
440                         break;
441                 default:
442                         BUG_ON(1);
443         }
444 }
445
446 static u64 get_cpu_usage_nsec_parent(void)
447 {
448         struct rusage ru;
449         u64 sum;
450         int err;
451
452         err = getrusage(RUSAGE_SELF, &ru);
453         BUG_ON(err);
454
455         sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
456         sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;
457
458         return sum;
459 }
460
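/*
 * Open a per-thread PERF_COUNT_SW_TASK_CLOCK counter so each replay
 * thread can measure its own CPU time; with -f, try to raise
 * RLIMIT_NOFILE and retry when the file-descriptor limit is hit.
 */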
461 static int self_open_counters(struct perf_sched *sched, unsigned long cur_task)
462 {
463         struct perf_event_attr attr;
464         char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE];
465         int fd;
466         struct rlimit limit;
467         bool need_privilege = false;
468
469         memset(&attr, 0, sizeof(attr));
470
471         attr.type = PERF_TYPE_SOFTWARE;
472         attr.config = PERF_COUNT_SW_TASK_CLOCK;
473
474 force_again:
475         fd = sys_perf_event_open(&attr, 0, -1, -1,
476                                  perf_event_open_cloexec_flag());
477
478         if (fd < 0) {
479                 if (errno == EMFILE) {
480                         if (sched->force) {
481                                 BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1);
482                                 limit.rlim_cur += sched->nr_tasks - cur_task;
483                                 if (limit.rlim_cur > limit.rlim_max) {
484                                         limit.rlim_max = limit.rlim_cur;
485                                         need_privilege = true;
486                                 }
487                                 if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
488                                         if (need_privilege && errno == EPERM)
489                                                 strcpy(info, "Need privilege\n");
490                                 } else
491                                         goto force_again;
492                         } else
493                                 strcpy(info, "Have a try with -f option\n");
494                 }
495                 pr_err("Error: sys_perf_event_open() syscall returned "
496                        "with %d (%s)\n%s", fd,
497                        str_error_r(errno, sbuf, sizeof(sbuf)), info);
498                 exit(EXIT_FAILURE);
499         }
500         return fd;
501 }
502
503 static u64 get_cpu_usage_nsec_self(int fd)
504 {
505         u64 runtime;
506         int ret;
507
508         ret = read(fd, &runtime, sizeof(runtime));
509         BUG_ON(ret != sizeof(runtime));
510
511         return runtime;
512 }
513
514 struct sched_thread_parms {
515         struct task_desc  *task;
516         struct perf_sched *sched;
517         int fd;
518 };
519
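/*
 * Replay worker thread: signal readiness, wait on the start mutex,
 * replay this task's atoms, report CPU usage, then loop for the next
 * repetition.
 */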
520 static void *thread_func(void *ctx)
521 {
522         struct sched_thread_parms *parms = ctx;
523         struct task_desc *this_task = parms->task;
524         struct perf_sched *sched = parms->sched;
525         u64 cpu_usage_0, cpu_usage_1;
526         unsigned long i, ret;
527         char comm2[22];
528         int fd = parms->fd;
529
530         zfree(&parms);
531
532         sprintf(comm2, ":%s", this_task->comm);
533         prctl(PR_SET_NAME, comm2);
534         if (fd < 0)
535                 return NULL;
536 again:
537         ret = sem_post(&this_task->ready_for_work);
538         BUG_ON(ret);
539         ret = pthread_mutex_lock(&sched->start_work_mutex);
540         BUG_ON(ret);
541         ret = pthread_mutex_unlock(&sched->start_work_mutex);
542         BUG_ON(ret);
543
544         cpu_usage_0 = get_cpu_usage_nsec_self(fd);
545
546         for (i = 0; i < this_task->nr_events; i++) {
547                 this_task->curr_event = i;
548                 perf_sched__process_event(sched, this_task->atoms[i]);
549         }
550
551         cpu_usage_1 = get_cpu_usage_nsec_self(fd);
552         this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
553         ret = sem_post(&this_task->work_done_sem);
554         BUG_ON(ret);
555
556         ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
557         BUG_ON(ret);
558         ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
559         BUG_ON(ret);
560
561         goto again;
562 }
563
564 static void create_tasks(struct perf_sched *sched)
565 {
566         struct task_desc *task;
567         pthread_attr_t attr;
568         unsigned long i;
569         int err;
570
571         err = pthread_attr_init(&attr);
572         BUG_ON(err);
573         err = pthread_attr_setstacksize(&attr,
574                         (size_t) max(16 * 1024, PTHREAD_STACK_MIN));
575         BUG_ON(err);
576         err = pthread_mutex_lock(&sched->start_work_mutex);
577         BUG_ON(err);
578         err = pthread_mutex_lock(&sched->work_done_wait_mutex);
579         BUG_ON(err);
580         for (i = 0; i < sched->nr_tasks; i++) {
581                 struct sched_thread_parms *parms = malloc(sizeof(*parms));
582                 BUG_ON(parms == NULL);
583                 parms->task = task = sched->tasks[i];
584                 parms->sched = sched;
585                 parms->fd = self_open_counters(sched, i);
586                 sem_init(&task->sleep_sem, 0, 0);
587                 sem_init(&task->ready_for_work, 0, 0);
588                 sem_init(&task->work_done_sem, 0, 0);
589                 task->curr_event = 0;
590                 err = pthread_create(&task->thread, &attr, thread_func, parms);
591                 BUG_ON(err);
592         }
593 }
594
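/*
 * Release the worker threads for one replay pass, wait for them all to
 * finish, and fold their CPU usage into the running averages.
 */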
595 static void wait_for_tasks(struct perf_sched *sched)
596 {
597         u64 cpu_usage_0, cpu_usage_1;
598         struct task_desc *task;
599         unsigned long i, ret;
600
601         sched->start_time = get_nsecs();
602         sched->cpu_usage = 0;
603         pthread_mutex_unlock(&sched->work_done_wait_mutex);
604
605         for (i = 0; i < sched->nr_tasks; i++) {
606                 task = sched->tasks[i];
607                 ret = sem_wait(&task->ready_for_work);
608                 BUG_ON(ret);
609                 sem_init(&task->ready_for_work, 0, 0);
610         }
611         ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
612         BUG_ON(ret);
613
614         cpu_usage_0 = get_cpu_usage_nsec_parent();
615
616         pthread_mutex_unlock(&sched->start_work_mutex);
617
618         for (i = 0; i < sched->nr_tasks; i++) {
619                 task = sched->tasks[i];
620                 ret = sem_wait(&task->work_done_sem);
621                 BUG_ON(ret);
622                 sem_init(&task->work_done_sem, 0, 0);
623                 sched->cpu_usage += task->cpu_usage;
624                 task->cpu_usage = 0;
625         }
626
627         cpu_usage_1 = get_cpu_usage_nsec_parent();
628         if (!sched->runavg_cpu_usage)
629                 sched->runavg_cpu_usage = sched->cpu_usage;
630         sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat;
631
632         sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
633         if (!sched->runavg_parent_cpu_usage)
634                 sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
635         sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) +
636                                          sched->parent_cpu_usage)/sched->replay_repeat;
637
638         ret = pthread_mutex_lock(&sched->start_work_mutex);
639         BUG_ON(ret);
640
641         for (i = 0; i < sched->nr_tasks; i++) {
642                 task = sched->tasks[i];
643                 sem_init(&task->sleep_sem, 0, 0);
644                 task->curr_event = 0;
645         }
646 }
647
648 static void run_one_test(struct perf_sched *sched)
649 {
650         u64 T0, T1, delta, avg_delta, fluct;
651
652         T0 = get_nsecs();
653         wait_for_tasks(sched);
654         T1 = get_nsecs();
655
656         delta = T1 - T0;
657         sched->sum_runtime += delta;
658         sched->nr_runs++;
659
660         avg_delta = sched->sum_runtime / sched->nr_runs;
661         if (delta < avg_delta)
662                 fluct = avg_delta - delta;
663         else
664                 fluct = delta - avg_delta;
665         sched->sum_fluct += fluct;
666         if (!sched->run_avg)
667                 sched->run_avg = delta;
668         sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat;
669
670         printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);
671
672         printf("ravg: %0.2f, ", (double)sched->run_avg / 1e6);
673
674         printf("cpu: %0.2f / %0.2f",
675                 (double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);
676
677 #if 0
678         /*
679          * rusage statistics done by the parent; these are less
680          * accurate than the sched->sum_exec_runtime based statistics:
681          */
682         printf(" [%0.2f / %0.2f]",
683                 (double)sched->parent_cpu_usage/1e6,
684                 (double)sched->runavg_parent_cpu_usage/1e6);
685 #endif
686
687         printf("\n");
688
689         if (sched->nr_sleep_corrections)
690                 printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
691         sched->nr_sleep_corrections = 0;
692 }
693
694 static void test_calibrations(struct perf_sched *sched)
695 {
696         u64 T0, T1;
697
698         T0 = get_nsecs();
699         burn_nsecs(sched, 1e6);
700         T1 = get_nsecs();
701
702         printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
703
704         T0 = get_nsecs();
705         sleep_nsecs(1e6);
706         T1 = get_nsecs();
707
708         printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
709 }
710
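/*
 * Replay-mode tracepoint handlers: register the tasks seen in the trace
 * and turn wakeup/switch samples into sched atoms for later replay.
 */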
711 static int
712 replay_wakeup_event(struct perf_sched *sched,
713                     struct perf_evsel *evsel, struct perf_sample *sample,
714                     struct machine *machine __maybe_unused)
715 {
716         const char *comm = perf_evsel__strval(evsel, sample, "comm");
717         const u32 pid    = perf_evsel__intval(evsel, sample, "pid");
718         struct task_desc *waker, *wakee;
719
720         if (verbose) {
721                 printf("sched_wakeup event %p\n", evsel);
722
723                 printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
724         }
725
726         waker = register_pid(sched, sample->tid, "<unknown>");
727         wakee = register_pid(sched, pid, comm);
728
729         add_sched_event_wakeup(sched, waker, sample->time, wakee);
730         return 0;
731 }
732
733 static int replay_switch_event(struct perf_sched *sched,
734                                struct perf_evsel *evsel,
735                                struct perf_sample *sample,
736                                struct machine *machine __maybe_unused)
737 {
738         const char *prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
739                    *next_comm  = perf_evsel__strval(evsel, sample, "next_comm");
740         const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
741                   next_pid = perf_evsel__intval(evsel, sample, "next_pid");
742         const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
743         struct task_desc *prev, __maybe_unused *next;
744         u64 timestamp0, timestamp = sample->time;
745         int cpu = sample->cpu;
746         s64 delta;
747
748         if (verbose)
749                 printf("sched_switch event %p\n", evsel);
750
751         if (cpu >= MAX_CPUS || cpu < 0)
752                 return 0;
753
754         timestamp0 = sched->cpu_last_switched[cpu];
755         if (timestamp0)
756                 delta = timestamp - timestamp0;
757         else
758                 delta = 0;
759
760         if (delta < 0) {
761                 pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
762                 return -1;
763         }
764
765         pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
766                  prev_comm, prev_pid, next_comm, next_pid, delta);
767
768         prev = register_pid(sched, prev_pid, prev_comm);
769         next = register_pid(sched, next_pid, next_comm);
770
771         sched->cpu_last_switched[cpu] = timestamp;
772
773         add_sched_event_run(sched, prev, timestamp, delta);
774         add_sched_event_sleep(sched, prev, timestamp, prev_state);
775
776         return 0;
777 }
778
779 static int replay_fork_event(struct perf_sched *sched,
780                              union perf_event *event,
781                              struct machine *machine)
782 {
783         struct thread *child, *parent;
784
785         child = machine__findnew_thread(machine, event->fork.pid,
786                                         event->fork.tid);
787         parent = machine__findnew_thread(machine, event->fork.ppid,
788                                          event->fork.ptid);
789
790         if (child == NULL || parent == NULL) {
791                 pr_debug("thread does not exist on fork event: child %p, parent %p\n",
792                                  child, parent);
793                 goto out_put;
794         }
795
796         if (verbose) {
797                 printf("fork event\n");
798                 printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
799                 printf("...  child: %s/%d\n", thread__comm_str(child), child->tid);
800         }
801
802         register_pid(sched, parent->tid, thread__comm_str(parent));
803         register_pid(sched, child->tid, thread__comm_str(child));
804 out_put:
805         thread__put(child);
806         thread__put(parent);
807         return 0;
808 }
809
810 struct sort_dimension {
811         const char              *name;
812         sort_fn_t               cmp;
813         struct list_head        list;
814 };
815
816 static int
817 thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
818 {
819         struct sort_dimension *sort;
820         int ret = 0;
821
822         BUG_ON(list_empty(list));
823
824         list_for_each_entry(sort, list, list) {
825                 ret = sort->cmp(l, r);
826                 if (ret)
827                         return ret;
828         }
829
830         return ret;
831 }
832
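/*
 * Latency mode keeps one work_atoms entry per thread in an rbtree,
 * ordered by the sort keys in sched->cmp_pid; each entry holds that
 * thread's list of wait/run atoms.
 */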
833 static struct work_atoms *
834 thread_atoms_search(struct rb_root *root, struct thread *thread,
835                          struct list_head *sort_list)
836 {
837         struct rb_node *node = root->rb_node;
838         struct work_atoms key = { .thread = thread };
839
840         while (node) {
841                 struct work_atoms *atoms;
842                 int cmp;
843
844                 atoms = container_of(node, struct work_atoms, node);
845
846                 cmp = thread_lat_cmp(sort_list, &key, atoms);
847                 if (cmp > 0)
848                         node = node->rb_left;
849                 else if (cmp < 0)
850                         node = node->rb_right;
851                 else {
852                         BUG_ON(thread != atoms->thread);
853                         return atoms;
854                 }
855         }
856         return NULL;
857 }
858
859 static void
860 __thread_latency_insert(struct rb_root *root, struct work_atoms *data,
861                          struct list_head *sort_list)
862 {
863         struct rb_node **new = &(root->rb_node), *parent = NULL;
864
865         while (*new) {
866                 struct work_atoms *this;
867                 int cmp;
868
869                 this = container_of(*new, struct work_atoms, node);
870                 parent = *new;
871
872                 cmp = thread_lat_cmp(sort_list, data, this);
873
874                 if (cmp > 0)
875                         new = &((*new)->rb_left);
876                 else
877                         new = &((*new)->rb_right);
878         }
879
880         rb_link_node(&data->node, parent, new);
881         rb_insert_color(&data->node, root);
882 }
883
884 static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
885 {
886         struct work_atoms *atoms = zalloc(sizeof(*atoms));
887         if (!atoms) {
888                 pr_err("No memory at %s\n", __func__);
889                 return -1;
890         }
891
892         atoms->thread = thread__get(thread);
893         INIT_LIST_HEAD(&atoms->work_list);
894         __thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
895         return 0;
896 }
897
898 static char sched_out_state(u64 prev_state)
899 {
900         const char *str = TASK_STATE_TO_CHAR_STR;
901
902         return str[prev_state];
903 }
904
905 static int
906 add_sched_out_event(struct work_atoms *atoms,
907                     char run_state,
908                     u64 timestamp)
909 {
910         struct work_atom *atom = zalloc(sizeof(*atom));
911         if (!atom) {
912                 pr_err("No memory at %s\n", __func__);
913                 return -1;
914         }
915
916         atom->sched_out_time = timestamp;
917
918         if (run_state == 'R') {
919                 atom->state = THREAD_WAIT_CPU;
920                 atom->wake_up_time = atom->sched_out_time;
921         }
922
923         list_add_tail(&atom->list, &atoms->work_list);
924         return 0;
925 }
926
927 static void
928 add_runtime_event(struct work_atoms *atoms, u64 delta,
929                   u64 timestamp __maybe_unused)
930 {
931         struct work_atom *atom;
932
933         BUG_ON(list_empty(&atoms->work_list));
934
935         atom = list_entry(atoms->work_list.prev, struct work_atom, list);
936
937         atom->runtime += delta;
938         atoms->total_runtime += delta;
939 }
940
941 static void
942 add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
943 {
944         struct work_atom *atom;
945         u64 delta;
946
947         if (list_empty(&atoms->work_list))
948                 return;
949
950         atom = list_entry(atoms->work_list.prev, struct work_atom, list);
951
952         if (atom->state != THREAD_WAIT_CPU)
953                 return;
954
955         if (timestamp < atom->wake_up_time) {
956                 atom->state = THREAD_IGNORE;
957                 return;
958         }
959
960         atom->state = THREAD_SCHED_IN;
961         atom->sched_in_time = timestamp;
962
963         delta = atom->sched_in_time - atom->wake_up_time;
964         atoms->total_lat += delta;
965         if (delta > atoms->max_lat) {
966                 atoms->max_lat = delta;
967                 atoms->max_lat_at = timestamp;
968         }
969         atoms->nb_atoms++;
970 }
971
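/*
 * sched_switch in latency mode: record a sched-out atom for the task
 * leaving the CPU and complete the pending atom of the task coming in.
 */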
972 static int latency_switch_event(struct perf_sched *sched,
973                                 struct perf_evsel *evsel,
974                                 struct perf_sample *sample,
975                                 struct machine *machine)
976 {
977         const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
978                   next_pid = perf_evsel__intval(evsel, sample, "next_pid");
979         const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
980         struct work_atoms *out_events, *in_events;
981         struct thread *sched_out, *sched_in;
982         u64 timestamp0, timestamp = sample->time;
983         int cpu = sample->cpu, err = -1;
984         s64 delta;
985
986         BUG_ON(cpu >= MAX_CPUS || cpu < 0);
987
988         timestamp0 = sched->cpu_last_switched[cpu];
989         sched->cpu_last_switched[cpu] = timestamp;
990         if (timestamp0)
991                 delta = timestamp - timestamp0;
992         else
993                 delta = 0;
994
995         if (delta < 0) {
996                 pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
997                 return -1;
998         }
999
1000         sched_out = machine__findnew_thread(machine, -1, prev_pid);
1001         sched_in = machine__findnew_thread(machine, -1, next_pid);
1002         if (sched_out == NULL || sched_in == NULL)
1003                 goto out_put;
1004
1005         out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
1006         if (!out_events) {
1007                 if (thread_atoms_insert(sched, sched_out))
1008                         goto out_put;
1009                 out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
1010                 if (!out_events) {
1011                         pr_err("out-event: Internal tree error");
1012                         goto out_put;
1013                 }
1014         }
1015         if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
1016                 return -1;
1017
1018         in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
1019         if (!in_events) {
1020                 if (thread_atoms_insert(sched, sched_in))
1021                         goto out_put;
1022                 in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
1023                 if (!in_events) {
1024                         pr_err("in-event: Internal tree error");
1025                         goto out_put;
1026                 }
1027                 /*
1028                  * A task came in that we have not heard about yet,
1029                  * so add an initial atom in runnable state:
1030                  */
1031                 if (add_sched_out_event(in_events, 'R', timestamp))
1032                         goto out_put;
1033         }
1034         add_sched_in_event(in_events, timestamp);
1035         err = 0;
1036 out_put:
1037         thread__put(sched_out);
1038         thread__put(sched_in);
1039         return err;
1040 }
1041
1042 static int latency_runtime_event(struct perf_sched *sched,
1043                                  struct perf_evsel *evsel,
1044                                  struct perf_sample *sample,
1045                                  struct machine *machine)
1046 {
1047         const u32 pid      = perf_evsel__intval(evsel, sample, "pid");
1048         const u64 runtime  = perf_evsel__intval(evsel, sample, "runtime");
1049         struct thread *thread = machine__findnew_thread(machine, -1, pid);
1050         struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
1051         u64 timestamp = sample->time;
1052         int cpu = sample->cpu, err = -1;
1053
1054         if (thread == NULL)
1055                 return -1;
1056
1057         BUG_ON(cpu >= MAX_CPUS || cpu < 0);
1058         if (!atoms) {
1059                 if (thread_atoms_insert(sched, thread))
1060                         goto out_put;
1061                 atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
1062                 if (!atoms) {
1063                         pr_err("in-event: Internal tree error");
1064                         goto out_put;
1065                 }
1066                 if (add_sched_out_event(atoms, 'R', timestamp))
1067                         goto out_put;
1068         }
1069
1070         add_runtime_event(atoms, runtime, timestamp);
1071         err = 0;
1072 out_put:
1073         thread__put(thread);
1074         return err;
1075 }
1076
1077 static int latency_wakeup_event(struct perf_sched *sched,
1078                                 struct perf_evsel *evsel,
1079                                 struct perf_sample *sample,
1080                                 struct machine *machine)
1081 {
1082         const u32 pid     = perf_evsel__intval(evsel, sample, "pid");
1083         struct work_atoms *atoms;
1084         struct work_atom *atom;
1085         struct thread *wakee;
1086         u64 timestamp = sample->time;
1087         int err = -1;
1088
1089         wakee = machine__findnew_thread(machine, -1, pid);
1090         if (wakee == NULL)
1091                 return -1;
1092         atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1093         if (!atoms) {
1094                 if (thread_atoms_insert(sched, wakee))
1095                         goto out_put;
1096                 atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1097                 if (!atoms) {
1098                         pr_err("wakeup-event: Internal tree error");
1099                         goto out_put;
1100                 }
1101                 if (add_sched_out_event(atoms, 'S', timestamp))
1102                         goto out_put;
1103         }
1104
1105         BUG_ON(list_empty(&atoms->work_list));
1106
1107         atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1108
1109         /*
1110          * A wakeup event is not guaranteed to arrive only while the
1111          * task is off the run queue; it may also fire for a task that
1112          * is already on the run queue, where it merely changes ->state
1113          * to TASK_RUNNING. In that case we should not update the
1114          * ->wake_up_time of a task that is still on the run queue.
1115          *
1116          * You WILL be missing events if you've recorded only
1117          * one CPU, or are only looking at one, so don't
1118          * skip in this case.
1119          */
1120         if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
1121                 goto out_ok;
1122
1123         sched->nr_timestamps++;
1124         if (atom->sched_out_time > timestamp) {
1125                 sched->nr_unordered_timestamps++;
1126                 goto out_ok;
1127         }
1128
1129         atom->state = THREAD_WAIT_CPU;
1130         atom->wake_up_time = timestamp;
1131 out_ok:
1132         err = 0;
1133 out_put:
1134         thread__put(wakee);
1135         return err;
1136 }
1137
1138 static int latency_migrate_task_event(struct perf_sched *sched,
1139                                       struct perf_evsel *evsel,
1140                                       struct perf_sample *sample,
1141                                       struct machine *machine)
1142 {
1143         const u32 pid = perf_evsel__intval(evsel, sample, "pid");
1144         u64 timestamp = sample->time;
1145         struct work_atoms *atoms;
1146         struct work_atom *atom;
1147         struct thread *migrant;
1148         int err = -1;
1149
1150         /*
1151          * Only need to worry about migration when profiling one CPU.
1152          */
1153         if (sched->profile_cpu == -1)
1154                 return 0;
1155
1156         migrant = machine__findnew_thread(machine, -1, pid);
1157         if (migrant == NULL)
1158                 return -1;
1159         atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1160         if (!atoms) {
1161                 if (thread_atoms_insert(sched, migrant))
1162                         goto out_put;
1163                 register_pid(sched, migrant->tid, thread__comm_str(migrant));
1164                 atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1165                 if (!atoms) {
1166                         pr_err("migration-event: Internal tree error");
1167                         goto out_put;
1168                 }
1169                 if (add_sched_out_event(atoms, 'R', timestamp))
1170                         goto out_put;
1171         }
1172
1173         BUG_ON(list_empty(&atoms->work_list));
1174
1175         atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1176         atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
1177
1178         sched->nr_timestamps++;
1179
1180         if (atom->sched_out_time > timestamp)
1181                 sched->nr_unordered_timestamps++;
1182         err = 0;
1183 out_put:
1184         thread__put(migrant);
1185         return err;
1186 }
1187
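/*
 * Print one line of the latency table for a thread: total runtime,
 * number of atoms (context switches), average and maximum
 * wakeup-to-schedule-in latency, and when the maximum occurred.
 */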
1188 static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
1189 {
1190         int i;
1191         int ret;
1192         u64 avg;
1193
1194         if (!work_list->nb_atoms)
1195                 return;
1196         /*
1197          * Ignore idle threads:
1198          */
1199         if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
1200                 return;
1201
1202         sched->all_runtime += work_list->total_runtime;
1203         sched->all_count   += work_list->nb_atoms;
1204
1205         if (work_list->num_merged > 1)
1206                 ret = printf("  %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
1207         else
1208                 ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
1209
1210         for (i = 0; i < 24 - ret; i++)
1211                 printf(" ");
1212
1213         avg = work_list->total_lat / work_list->nb_atoms;
1214
1215         printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %13.6f s\n",
1216               (double)work_list->total_runtime / 1e6,
1217                  work_list->nb_atoms, (double)avg / 1e6,
1218                  (double)work_list->max_lat / 1e6,
1219                  (double)work_list->max_lat_at / 1e9);
1220 }
1221
1222 static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
1223 {
1224         if (l->thread == r->thread)
1225                 return 0;
1226         if (l->thread->tid < r->thread->tid)
1227                 return -1;
1228         if (l->thread->tid > r->thread->tid)
1229                 return 1;
1230         return (int)(l->thread - r->thread);
1231 }
1232
1233 static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
1234 {
1235         u64 avgl, avgr;
1236
1237         if (!l->nb_atoms)
1238                 return -1;
1239
1240         if (!r->nb_atoms)
1241                 return 1;
1242
1243         avgl = l->total_lat / l->nb_atoms;
1244         avgr = r->total_lat / r->nb_atoms;
1245
1246         if (avgl < avgr)
1247                 return -1;
1248         if (avgl > avgr)
1249                 return 1;
1250
1251         return 0;
1252 }
1253
1254 static int max_cmp(struct work_atoms *l, struct work_atoms *r)
1255 {
1256         if (l->max_lat < r->max_lat)
1257                 return -1;
1258         if (l->max_lat > r->max_lat)
1259                 return 1;
1260
1261         return 0;
1262 }
1263
1264 static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
1265 {
1266         if (l->nb_atoms < r->nb_atoms)
1267                 return -1;
1268         if (l->nb_atoms > r->nb_atoms)
1269                 return 1;
1270
1271         return 0;
1272 }
1273
1274 static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
1275 {
1276         if (l->total_runtime < r->total_runtime)
1277                 return -1;
1278         if (l->total_runtime > r->total_runtime)
1279                 return 1;
1280
1281         return 0;
1282 }
1283
1284 static int sort_dimension__add(const char *tok, struct list_head *list)
1285 {
1286         size_t i;
1287         static struct sort_dimension avg_sort_dimension = {
1288                 .name = "avg",
1289                 .cmp  = avg_cmp,
1290         };
1291         static struct sort_dimension max_sort_dimension = {
1292                 .name = "max",
1293                 .cmp  = max_cmp,
1294         };
1295         static struct sort_dimension pid_sort_dimension = {
1296                 .name = "pid",
1297                 .cmp  = pid_cmp,
1298         };
1299         static struct sort_dimension runtime_sort_dimension = {
1300                 .name = "runtime",
1301                 .cmp  = runtime_cmp,
1302         };
1303         static struct sort_dimension switch_sort_dimension = {
1304                 .name = "switch",
1305                 .cmp  = switch_cmp,
1306         };
1307         struct sort_dimension *available_sorts[] = {
1308                 &pid_sort_dimension,
1309                 &avg_sort_dimension,
1310                 &max_sort_dimension,
1311                 &switch_sort_dimension,
1312                 &runtime_sort_dimension,
1313         };
1314
1315         for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
1316                 if (!strcmp(available_sorts[i]->name, tok)) {
1317                         list_add_tail(&available_sorts[i]->list, list);
1318
1319                         return 0;
1320                 }
1321         }
1322
1323         return -1;
1324 }
1325
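/*
 * Drain atom_root (and merged_atom_root) into sorted_atom_root,
 * re-keyed by the user-selected sort keys in sched->sort_list.
 */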
1326 static void perf_sched__sort_lat(struct perf_sched *sched)
1327 {
1328         struct rb_node *node;
1329         struct rb_root *root = &sched->atom_root;
1330 again:
1331         for (;;) {
1332                 struct work_atoms *data;
1333                 node = rb_first(root);
1334                 if (!node)
1335                         break;
1336
1337                 rb_erase(node, root);
1338                 data = rb_entry(node, struct work_atoms, node);
1339                 __thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
1340         }
1341         if (root == &sched->atom_root) {
1342                 root = &sched->merged_atom_root;
1343                 goto again;
1344         }
1345 }
1346
1347 static int process_sched_wakeup_event(struct perf_tool *tool,
1348                                       struct perf_evsel *evsel,
1349                                       struct perf_sample *sample,
1350                                       struct machine *machine)
1351 {
1352         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1353
1354         if (sched->tp_handler->wakeup_event)
1355                 return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
1356
1357         return 0;
1358 }
1359
1360 union map_priv {
1361         void    *ptr;
1362         bool     color;
1363 };
1364
1365 static bool thread__has_color(struct thread *thread)
1366 {
1367         union map_priv priv = {
1368                 .ptr = thread__priv(thread),
1369         };
1370
1371         return priv.color;
1372 }
1373
1374 static struct thread*
1375 map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid)
1376 {
1377         struct thread *thread = machine__findnew_thread(machine, pid, tid);
1378         union map_priv priv = {
1379                 .color = false,
1380         };
1381
1382         if (!sched->map.color_pids || !thread || thread__priv(thread))
1383                 return thread;
1384
1385         if (thread_map__has(sched->map.color_pids, tid))
1386                 priv.color = true;
1387
1388         thread__set_priv(thread, priv.ptr);
1389         return thread;
1390 }
1391
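/*
 * 'perf sched map' output: one column per CPU, a two-character
 * shortname per task, with '*' marking the CPU where this switch
 * happened; optional filters colour selected PIDs and CPUs.
 */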
1392 static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
1393                             struct perf_sample *sample, struct machine *machine)
1394 {
1395         const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1396         struct thread *sched_in;
1397         int new_shortname;
1398         u64 timestamp0, timestamp = sample->time;
1399         s64 delta;
1400         int i, this_cpu = sample->cpu;
1401         int cpus_nr;
1402         bool new_cpu = false;
1403         const char *color = PERF_COLOR_NORMAL;
1404
1405         BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
1406
1407         if (this_cpu > sched->max_cpu)
1408                 sched->max_cpu = this_cpu;
1409
1410         if (sched->map.comp) {
1411                 cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS);
1412                 if (!test_and_set_bit(this_cpu, sched->map.comp_cpus_mask)) {
1413                         sched->map.comp_cpus[cpus_nr++] = this_cpu;
1414                         new_cpu = true;
1415                 }
1416         } else
1417                 cpus_nr = sched->max_cpu;
1418
1419         timestamp0 = sched->cpu_last_switched[this_cpu];
1420         sched->cpu_last_switched[this_cpu] = timestamp;
1421         if (timestamp0)
1422                 delta = timestamp - timestamp0;
1423         else
1424                 delta = 0;
1425
1426         if (delta < 0) {
1427                 pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
1428                 return -1;
1429         }
1430
1431         sched_in = map__findnew_thread(sched, machine, -1, next_pid);
1432         if (sched_in == NULL)
1433                 return -1;
1434
1435         sched->curr_thread[this_cpu] = thread__get(sched_in);
1436
1437         printf("  ");
1438
1439         new_shortname = 0;
1440         if (!sched_in->shortname[0]) {
1441                 if (!strcmp(thread__comm_str(sched_in), "swapper")) {
1442                         /*
1443                          * Don't allocate a letter-number for swapper:0
1444                          * as a shortname. Instead, we use '.' for it.
1445                          */
1446                         sched_in->shortname[0] = '.';
1447                         sched_in->shortname[1] = ' ';
1448                 } else {
1449                         sched_in->shortname[0] = sched->next_shortname1;
1450                         sched_in->shortname[1] = sched->next_shortname2;
1451
1452                         if (sched->next_shortname1 < 'Z') {
1453                                 sched->next_shortname1++;
1454                         } else {
1455                                 sched->next_shortname1 = 'A';
1456                                 if (sched->next_shortname2 < '9')
1457                                         sched->next_shortname2++;
1458                                 else
1459                                         sched->next_shortname2 = '0';
1460                         }
1461                 }
1462                 new_shortname = 1;
1463         }
1464
1465         for (i = 0; i < cpus_nr; i++) {
1466                 int cpu = sched->map.comp ? sched->map.comp_cpus[i] : i;
1467                 struct thread *curr_thread = sched->curr_thread[cpu];
1468                 const char *pid_color = color;
1469                 const char *cpu_color = color;
1470
1471                 if (curr_thread && thread__has_color(curr_thread))
1472                         pid_color = COLOR_PIDS;
1473
1474                 if (sched->map.cpus && !cpu_map__has(sched->map.cpus, cpu))
1475                         continue;
1476
1477                 if (sched->map.color_cpus && cpu_map__has(sched->map.color_cpus, cpu))
1478                         cpu_color = COLOR_CPUS;
1479
1480                 if (cpu != this_cpu)
1481                         color_fprintf(stdout, cpu_color, " ");
1482                 else
1483                         color_fprintf(stdout, cpu_color, "*");
1484
1485                 if (sched->curr_thread[cpu])
1486                         color_fprintf(stdout, pid_color, "%2s ", sched->curr_thread[cpu]->shortname);
1487                 else
1488                         color_fprintf(stdout, color, "   ");
1489         }
1490
1491         if (sched->map.cpus && !cpu_map__has(sched->map.cpus, this_cpu))
1492                 goto out;
1493
1494         color_fprintf(stdout, color, "  %12.6f secs ", (double)timestamp/1e9);
1495         if (new_shortname) {
1496                 const char *pid_color = color;
1497
1498                 if (thread__has_color(sched_in))
1499                         pid_color = COLOR_PIDS;
1500
1501                 color_fprintf(stdout, pid_color, "%s => %s:%d",
1502                        sched_in->shortname, thread__comm_str(sched_in), sched_in->tid);
1503         }
1504
1505         if (sched->map.comp && new_cpu)
1506                 color_fprintf(stdout, color, " (CPU %d)", this_cpu);
1507
1508 out:
1509         color_fprintf(stdout, color, "\n");
1510
1511         thread__put(sched_in);
1512
1513         return 0;
1514 }
1515
1516 static int process_sched_switch_event(struct perf_tool *tool,
1517                                       struct perf_evsel *evsel,
1518                                       struct perf_sample *sample,
1519                                       struct machine *machine)
1520 {
1521         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1522         int this_cpu = sample->cpu, err = 0;
1523         u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
1524             next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1525
1526         if (sched->curr_pid[this_cpu] != (u32)-1) {
1527                 /*
1528                  * Are we trying to switch away a PID that is
1529                  * not current?
1530                  */
1531                 if (sched->curr_pid[this_cpu] != prev_pid)
1532                         sched->nr_context_switch_bugs++;
1533         }
1534
1535         if (sched->tp_handler->switch_event)
1536                 err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
1537
1538         sched->curr_pid[this_cpu] = next_pid;
1539         return err;
1540 }
1541
1542 static int process_sched_runtime_event(struct perf_tool *tool,
1543                                        struct perf_evsel *evsel,
1544                                        struct perf_sample *sample,
1545                                        struct machine *machine)
1546 {
1547         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1548
1549         if (sched->tp_handler->runtime_event)
1550                 return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
1551
1552         return 0;
1553 }
1554
1555 static int perf_sched__process_fork_event(struct perf_tool *tool,
1556                                           union perf_event *event,
1557                                           struct perf_sample *sample,
1558                                           struct machine *machine)
1559 {
1560         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1561
1562         /* run the fork event through the perf machinery */
1563         perf_event__process_fork(tool, event, sample, machine);
1564
1565         /* and then run additional processing needed for this command */
1566         if (sched->tp_handler->fork_event)
1567                 return sched->tp_handler->fork_event(sched, event, machine);
1568
1569         return 0;
1570 }
1571
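/* Forward sched_migrate_task samples to the mode-specific handler, if any. */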
1572 static int process_sched_migrate_task_event(struct perf_tool *tool,
1573                                             struct perf_evsel *evsel,
1574                                             struct perf_sample *sample,
1575                                             struct machine *machine)
1576 {
1577         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1578
1579         if (sched->tp_handler->migrate_task_event)
1580                 return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
1581
1582         return 0;
1583 }
1584
1585 typedef int (*tracepoint_handler)(struct perf_tool *tool,
1586                                   struct perf_evsel *evsel,
1587                                   struct perf_sample *sample,
1588                                   struct machine *machine);
1589
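/*
 * Common sample callback: each tracepoint evsel carries its dispatcher
 * in evsel->handler (installed via perf_session__set_tracepoints_handlers()
 * below), so dispatching is a single indirect call.
 */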
1590 static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
1591                                                  union perf_event *event __maybe_unused,
1592                                                  struct perf_sample *sample,
1593                                                  struct perf_evsel *evsel,
1594                                                  struct machine *machine)
1595 {
1596         int err = 0;
1597
1598         if (evsel->handler != NULL) {
1599                 tracepoint_handler f = evsel->handler;
1600                 err = f(tool, evsel, sample, machine);
1601         }
1602
1603         return err;
1604 }
1605
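/*
 * Open the input file (perf.data or the file given with -i), wire up the
 * tracepoint handlers below, process all events and collect the
 * event/lost-event statistics later reported by print_bad_events().
 */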
1606 static int perf_sched__read_events(struct perf_sched *sched)
1607 {
1608         const struct perf_evsel_str_handler handlers[] = {
1609                 { "sched:sched_switch",       process_sched_switch_event, },
1610                 { "sched:sched_stat_runtime", process_sched_runtime_event, },
1611                 { "sched:sched_wakeup",       process_sched_wakeup_event, },
1612                 { "sched:sched_wakeup_new",   process_sched_wakeup_event, },
1613                 { "sched:sched_migrate_task", process_sched_migrate_task_event, },
1614         };
1615         struct perf_session *session;
1616         struct perf_data_file file = {
1617                 .path = input_name,
1618                 .mode = PERF_DATA_MODE_READ,
1619                 .force = sched->force,
1620         };
1621         int rc = -1;
1622
1623         session = perf_session__new(&file, false, &sched->tool);
1624         if (session == NULL) {
1625                 pr_debug("No Memory for session\n");
1626                 return -1;
1627         }
1628
1629         symbol__init(&session->header.env);
1630
1631         if (perf_session__set_tracepoints_handlers(session, handlers))
1632                 goto out_delete;
1633
1634         if (perf_session__has_traces(session, "record -R")) {
1635                 int err = perf_session__process_events(session);
1636                 if (err) {
1637                         pr_err("Failed to process events, error %d", err);
1638                         goto out_delete;
1639                 }
1640
1641                 sched->nr_events      = session->evlist->stats.nr_events[0];
1642                 sched->nr_lost_events = session->evlist->stats.total_lost;
1643                 sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
1644         }
1645
1646         rc = 0;
1647 out_delete:
1648         perf_session__delete(session);
1649         return rc;
1650 }
1651
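/*
 * Print data-quality warnings: unordered timestamps, lost events and
 * apparent context-switch bugs, each as a percentage of the total.
 */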
1652 static void print_bad_events(struct perf_sched *sched)
1653 {
1654         if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
1655                 printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
1656                         (double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
1657                         sched->nr_unordered_timestamps, sched->nr_timestamps);
1658         }
1659         if (sched->nr_lost_events && sched->nr_events) {
1660                 printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
1661                         (double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
1662                         sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
1663         }
1664         if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
1665                 printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
1666                         (double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
1667                         sched->nr_context_switch_bugs, sched->nr_timestamps);
1668                 if (sched->nr_lost_events)
1669                         printf(" (due to lost events?)");
1670                 printf("\n");
1671         }
1672 }
1673
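/*
 * Insert @data into the comm-sorted rbtree @root.  If an entry with the
 * same comm already exists, fold the new atoms into it (summing runtime,
 * latency and atom counts, keeping the larger max latency) and free
 * @data instead of linking it.
 */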
1674 static void __merge_work_atoms(struct rb_root *root, struct work_atoms *data)
1675 {
1676         struct rb_node **new = &(root->rb_node), *parent = NULL;
1677         struct work_atoms *this;
1678         const char *comm = thread__comm_str(data->thread), *this_comm;
1679
1680         while (*new) {
1681                 int cmp;
1682
1683                 this = container_of(*new, struct work_atoms, node);
1684                 parent = *new;
1685
1686                 this_comm = thread__comm_str(this->thread);
1687                 cmp = strcmp(comm, this_comm);
1688                 if (cmp > 0) {
1689                         new = &((*new)->rb_left);
1690                 } else if (cmp < 0) {
1691                         new = &((*new)->rb_right);
1692                 } else {
1693                         this->num_merged++;
1694                         this->total_runtime += data->total_runtime;
1695                         this->nb_atoms += data->nb_atoms;
1696                         this->total_lat += data->total_lat;
1697                         list_splice(&data->work_list, &this->work_list);
1698                         if (this->max_lat < data->max_lat) {
1699                                 this->max_lat = data->max_lat;
1700                                 this->max_lat_at = data->max_lat_at;
1701                         }
1702                         zfree(&data);
1703                         return;
1704                 }
1705         }
1706
1707         data->num_merged++;
1708         rb_link_node(&data->node, parent, new);
1709         rb_insert_color(&data->node, root);
1710 }
1711
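/*
 * Unless -p/--pids was given, collapse the per-thread work atoms into
 * per-comm entries so latency is reported per command name.
 */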
1712 static void perf_sched__merge_lat(struct perf_sched *sched)
1713 {
1714         struct work_atoms *data;
1715         struct rb_node *node;
1716
1717         if (sched->skip_merge)
1718                 return;
1719
1720         while ((node = rb_first(&sched->atom_root))) {
1721                 rb_erase(node, &sched->atom_root);
1722                 data = rb_entry(node, struct work_atoms, node);
1723                 __merge_work_atoms(&sched->merged_atom_root, data);
1724         }
1725 }
1726
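/*
 * 'perf sched latency': read the events, merge and sort the work atoms,
 * then print one line per task plus a totals line and any bad-event
 * warnings.
 */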
1727 static int perf_sched__lat(struct perf_sched *sched)
1728 {
1729         struct rb_node *next;
1730
1731         setup_pager();
1732
1733         if (perf_sched__read_events(sched))
1734                 return -1;
1735
1736         perf_sched__merge_lat(sched);
1737         perf_sched__sort_lat(sched);
1738
1739         printf("\n -----------------------------------------------------------------------------------------------------------------\n");
1740         printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at       |\n");
1741         printf(" -----------------------------------------------------------------------------------------------------------------\n");
1742
1743         next = rb_first(&sched->sorted_atom_root);
1744
1745         while (next) {
1746                 struct work_atoms *work_list;
1747
1748                 work_list = rb_entry(next, struct work_atoms, node);
1749                 output_lat_thread(sched, work_list);
1750                 next = rb_next(next);
1751                 thread__zput(work_list->thread);
1752         }
1753
1754         printf(" -----------------------------------------------------------------------------------------------------------------\n");
1755         printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
1756                 (double)sched->all_runtime / 1e6, sched->all_count);
1757
1758         printf(" ---------------------------------------------------\n");
1759
1760         print_bad_events(sched);
1761         printf("\n");
1762
1763         return 0;
1764 }
1765
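/*
 * 'perf sched map' setup: size the compact-mode CPU array to the number
 * of configured CPUs and, if --cpus was given, parse the CPU list that
 * restricts which columns are displayed.
 */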
1766 static int setup_map_cpus(struct perf_sched *sched)
1767 {
1768         struct cpu_map *map;
1769
1770         sched->max_cpu  = sysconf(_SC_NPROCESSORS_CONF);
1771
1772         if (sched->map.comp) {
1773                 sched->map.comp_cpus = zalloc(sched->max_cpu * sizeof(int));
1774                 if (!sched->map.comp_cpus)
1775                         return -1;
1776         }
1777
1778         if (!sched->map.cpus_str)
1779                 return 0;
1780
1781         map = cpu_map__new(sched->map.cpus_str);
1782         if (!map) {
1783                 pr_err("failed to get cpus map from %s\n", sched->map.cpus_str);
1784                 return -1;
1785         }
1786
1787         sched->map.cpus = map;
1788         return 0;
1789 }
1790
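/* Parse --color-pids into a thread map used to highlight those tasks in the map output. */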
1791 static int setup_color_pids(struct perf_sched *sched)
1792 {
1793         struct thread_map *map;
1794
1795         if (!sched->map.color_pids_str)
1796                 return 0;
1797
1798         map = thread_map__new_by_tid_str(sched->map.color_pids_str);
1799         if (!map) {
1800                 pr_err("failed to get thread map from %s\n", sched->map.color_pids_str);
1801                 return -1;
1802         }
1803
1804         sched->map.color_pids = map;
1805         return 0;
1806 }
1807
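/* Parse --color-cpus into a cpu map used to highlight those CPU columns in the map output. */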
1808 static int setup_color_cpus(struct perf_sched *sched)
1809 {
1810         struct cpu_map *map;
1811
1812         if (!sched->map.color_cpus_str)
1813                 return 0;
1814
1815         map = cpu_map__new(sched->map.color_cpus_str);
1816         if (!map) {
1817                 pr_err("failed to get thread map from %s\n", sched->map.color_cpus_str);
1818                 return -1;
1819         }
1820
1821         sched->map.color_cpus = map;
1822         return 0;
1823 }
1824
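/*
 * 'perf sched map': set up the --cpus/--color-pids/--color-cpus filters,
 * then feed the trace through map_switch_event() to print one column per
 * CPU over time.  For example (PIDs/CPUs are hypothetical):
 *
 *     perf sched map --compact --color-pids 1234,5678 --color-cpus 0-3
 */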
1825 static int perf_sched__map(struct perf_sched *sched)
1826 {
1827         if (setup_map_cpus(sched))
1828                 return -1;
1829
1830         if (setup_color_pids(sched))
1831                 return -1;
1832
1833         if (setup_color_cpus(sched))
1834                 return -1;
1835
1836         setup_pager();
1837         if (perf_sched__read_events(sched))
1838                 return -1;
1839         print_bad_events(sched);
1840         return 0;
1841 }
1842
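/*
 * 'perf sched replay': calibrate run/sleep measurement overhead, rebuild
 * the task and wakeup graph from the recorded events, then spawn real
 * threads and re-run the workload -r/--repeat times.
 */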
1843 static int perf_sched__replay(struct perf_sched *sched)
1844 {
1845         unsigned long i;
1846
1847         calibrate_run_measurement_overhead(sched);
1848         calibrate_sleep_measurement_overhead(sched);
1849
1850         test_calibrations(sched);
1851
1852         if (perf_sched__read_events(sched))
1853                 return -1;
1854
1855         printf("nr_run_events:        %ld\n", sched->nr_run_events);
1856         printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
1857         printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);
1858
1859         if (sched->targetless_wakeups)
1860                 printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
1861         if (sched->multitarget_wakeups)
1862                 printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
1863         if (sched->nr_run_events_optimized)
1864                 printf("run atoms optimized: %ld\n",
1865                         sched->nr_run_events_optimized);
1866
1867         print_task_traces(sched);
1868         add_cross_task_wakeups(sched);
1869
1870         create_tasks(sched);
1871         printf("------------------------------------------------------------\n");
1872         for (i = 0; i < sched->replay_repeat; i++)
1873                 run_one_test(sched);
1874
1875         return 0;
1876 }
1877
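/*
 * Parse the comma-separated --sort string into sched->sort_list,
 * rejecting unknown keys; an implicit "pid" key is always added to the
 * separate cmp_pid list.
 */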
1878 static void setup_sorting(struct perf_sched *sched, const struct option *options,
1879                           const char * const usage_msg[])
1880 {
1881         char *tmp, *tok, *str = strdup(sched->sort_order);
1882
1883         for (tok = strtok_r(str, ", ", &tmp);
1884                         tok; tok = strtok_r(NULL, ", ", &tmp)) {
1885                 if (sort_dimension__add(tok, &sched->sort_list) < 0) {
1886                         usage_with_options_msg(usage_msg, options,
1887                                         "Unknown --sort key: `%s'", tok);
1888                 }
1889         }
1890
1891         free(str);
1892
1893         sort_dimension__add("pid", &sched->cmp_pid);
1894 }
1895
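/*
 * 'perf sched record': build an argv for 'perf record' from the sched
 * tracepoints below plus any extra arguments the user passed, and hand
 * it to cmd_record().
 */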
1896 static int __cmd_record(int argc, const char **argv)
1897 {
1898         unsigned int rec_argc, i, j;
1899         const char **rec_argv;
1900         const char * const record_args[] = {
1901                 "record",
1902                 "-a",
1903                 "-R",
1904                 "-m", "1024",
1905                 "-c", "1",
1906                 "-e", "sched:sched_switch",
1907                 "-e", "sched:sched_stat_wait",
1908                 "-e", "sched:sched_stat_sleep",
1909                 "-e", "sched:sched_stat_iowait",
1910                 "-e", "sched:sched_stat_runtime",
1911                 "-e", "sched:sched_process_fork",
1912                 "-e", "sched:sched_wakeup",
1913                 "-e", "sched:sched_wakeup_new",
1914                 "-e", "sched:sched_migrate_task",
1915         };
1916
1917         rec_argc = ARRAY_SIZE(record_args) + argc - 1;
1918         rec_argv = calloc(rec_argc + 1, sizeof(char *));
1919
1920         if (rec_argv == NULL)
1921                 return -ENOMEM;
1922
1923         for (i = 0; i < ARRAY_SIZE(record_args); i++)
1924                 rec_argv[i] = strdup(record_args[i]);
1925
1926         for (j = 1; j < (unsigned int)argc; j++, i++)
1927                 rec_argv[i] = argv[j];
1928
1929         BUG_ON(i != rec_argc);
1930
1931         return cmd_record(i, rec_argv, NULL);
1932 }
1933
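/*
 * Entry point: parse the subcommand ('record', 'latency', 'map',
 * 'replay' or 'script'), install the matching trace_sched_handler ops
 * and dispatch.  'script' is simply aliased to 'perf script'.
 */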
1934 int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
1935 {
1936         const char default_sort_order[] = "avg, max, switch, runtime";
1937         struct perf_sched sched = {
1938                 .tool = {
1939                         .sample          = perf_sched__process_tracepoint_sample,
1940                         .comm            = perf_event__process_comm,
1941                         .lost            = perf_event__process_lost,
1942                         .fork            = perf_sched__process_fork_event,
1943                         .ordered_events = true,
1944                 },
1945                 .cmp_pid              = LIST_HEAD_INIT(sched.cmp_pid),
1946                 .sort_list            = LIST_HEAD_INIT(sched.sort_list),
1947                 .start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
1948                 .work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
1949                 .sort_order           = default_sort_order,
1950                 .replay_repeat        = 10,
1951                 .profile_cpu          = -1,
1952                 .next_shortname1      = 'A',
1953                 .next_shortname2      = '0',
1954                 .skip_merge           = 0,
1955         };
1956         const struct option latency_options[] = {
1957         OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
1958                    "sort by key(s): runtime, switch, avg, max"),
1959         OPT_INCR('v', "verbose", &verbose,
1960                     "be more verbose (show symbol address, etc)"),
1961         OPT_INTEGER('C', "CPU", &sched.profile_cpu,
1962                     "CPU to profile on"),
1963         OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1964                     "dump raw trace in ASCII"),
1965         OPT_BOOLEAN('p', "pids", &sched.skip_merge,
1966                     "latency stats per pid instead of per comm"),
1967         OPT_END()
1968         };
1969         const struct option replay_options[] = {
1970         OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
1971                      "repeat the workload replay N times (-1: infinite)"),
1972         OPT_INCR('v', "verbose", &verbose,
1973                     "be more verbose (show symbol address, etc)"),
1974         OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1975                     "dump raw trace in ASCII"),
1976         OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
1977         OPT_END()
1978         };
1979         const struct option sched_options[] = {
1980         OPT_STRING('i', "input", &input_name, "file",
1981                     "input file name"),
1982         OPT_INCR('v', "verbose", &verbose,
1983                     "be more verbose (show symbol address, etc)"),
1984         OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1985                     "dump raw trace in ASCII"),
1986         OPT_END()
1987         };
1988         const struct option map_options[] = {
1989         OPT_BOOLEAN(0, "compact", &sched.map.comp,
1990                     "map output in compact mode"),
1991         OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids",
1992                    "highlight given pids in map"),
1993         OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus",
1994                     "highlight given CPUs in map"),
1995         OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus",
1996                     "display given CPUs in map"),
1997         OPT_END()
1998         };
1999         const char * const latency_usage[] = {
2000                 "perf sched latency [<options>]",
2001                 NULL
2002         };
2003         const char * const replay_usage[] = {
2004                 "perf sched replay [<options>]",
2005                 NULL
2006         };
2007         const char * const map_usage[] = {
2008                 "perf sched map [<options>]",
2009                 NULL
2010         };
2011         const char *const sched_subcommands[] = { "record", "latency", "map",
2012                                                   "replay", "script", NULL };
2013         const char *sched_usage[] = {
2014                 NULL,
2015                 NULL
2016         };
2017         struct trace_sched_handler lat_ops  = {
2018                 .wakeup_event       = latency_wakeup_event,
2019                 .switch_event       = latency_switch_event,
2020                 .runtime_event      = latency_runtime_event,
2021                 .migrate_task_event = latency_migrate_task_event,
2022         };
2023         struct trace_sched_handler map_ops  = {
2024                 .switch_event       = map_switch_event,
2025         };
2026         struct trace_sched_handler replay_ops  = {
2027                 .wakeup_event       = replay_wakeup_event,
2028                 .switch_event       = replay_switch_event,
2029                 .fork_event         = replay_fork_event,
2030         };
2031         unsigned int i;
2032
2033         for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
2034                 sched.curr_pid[i] = -1;
2035
2036         argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
2037                                         sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2038         if (!argc)
2039                 usage_with_options(sched_usage, sched_options);
2040
2041         /*
2042          * Aliased to 'perf script' for now:
2043          */
2044         if (!strcmp(argv[0], "script"))
2045                 return cmd_script(argc, argv, prefix);
2046
2047         if (!strncmp(argv[0], "rec", 3)) {
2048                 return __cmd_record(argc, argv);
2049         } else if (!strncmp(argv[0], "lat", 3)) {
2050                 sched.tp_handler = &lat_ops;
2051                 if (argc > 1) {
2052                         argc = parse_options(argc, argv, latency_options, latency_usage, 0);
2053                         if (argc)
2054                                 usage_with_options(latency_usage, latency_options);
2055                 }
2056                 setup_sorting(&sched, latency_options, latency_usage);
2057                 return perf_sched__lat(&sched);
2058         } else if (!strcmp(argv[0], "map")) {
2059                 if (argc) {
2060                         argc = parse_options(argc, argv, map_options, map_usage, 0);
2061                         if (argc)
2062                                 usage_with_options(map_usage, map_options);
2063                 }
2064                 sched.tp_handler = &map_ops;
2065                 setup_sorting(&sched, latency_options, latency_usage);
2066                 return perf_sched__map(&sched);
2067         } else if (!strncmp(argv[0], "rep", 3)) {
2068                 sched.tp_handler = &replay_ops;
2069                 if (argc) {
2070                         argc = parse_options(argc, argv, replay_options, replay_usage, 0);
2071                         if (argc)
2072                                 usage_with_options(replay_usage, replay_options);
2073                 }
2074                 return perf_sched__replay(&sched);
2075         } else {
2076                 usage_with_options(sched_usage, sched_options);
2077         }
2078
2079         return 0;
2080 }