tools/perf/builtin-sched.c
1 #include "builtin.h"
2 #include "perf.h"
3
4 #include "util/util.h"
5 #include "util/evlist.h"
6 #include "util/cache.h"
7 #include "util/evsel.h"
8 #include "util/symbol.h"
9 #include "util/thread.h"
10 #include "util/header.h"
11 #include "util/session.h"
12 #include "util/tool.h"
13 #include "util/cloexec.h"
14 #include "util/thread_map.h"
15 #include "util/color.h"
16
17 #include <subcmd/parse-options.h>
18 #include "util/trace-event.h"
19
20 #include "util/debug.h"
21
22 #include <sys/prctl.h>
23 #include <sys/resource.h>
24
25 #include <semaphore.h>
26 #include <pthread.h>
27 #include <math.h>
28 #include <api/fs/fs.h>
29
30 #define PR_SET_NAME             15               /* Set process name */
31 #define MAX_CPUS                4096
32 #define COMM_LEN                20
33 #define SYM_LEN                 129
34 #define MAX_PID                 1024000
35
36 struct sched_atom;
37
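/*
 * Per-task replay state: the sched atoms recorded for one PID, plus the
 * worker thread and semaphores used to re-enact them during 'perf sched
 * replay'.
 */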
38 struct task_desc {
39         unsigned long           nr;
40         unsigned long           pid;
41         char                    comm[COMM_LEN];
42
43         unsigned long           nr_events;
44         unsigned long           curr_event;
45         struct sched_atom       **atoms;
46
47         pthread_t               thread;
48         sem_t                   sleep_sem;
49
50         sem_t                   ready_for_work;
51         sem_t                   work_done_sem;
52
53         u64                     cpu_usage;
54 };
55
56 enum sched_event_type {
57         SCHED_EVENT_RUN,
58         SCHED_EVENT_SLEEP,
59         SCHED_EVENT_WAKEUP,
60         SCHED_EVENT_MIGRATION,
61 };
62
63 struct sched_atom {
64         enum sched_event_type   type;
65         int                     specific_wait;
66         u64                     timestamp;
67         u64                     duration;
68         unsigned long           nr;
69         sem_t                   *wait_sem;
70         struct task_desc        *wakee;
71 };
72
73 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
74
75 enum thread_state {
76         THREAD_SLEEPING = 0,
77         THREAD_WAIT_CPU,
78         THREAD_SCHED_IN,
79         THREAD_IGNORE
80 };
81
82 struct work_atom {
83         struct list_head        list;
84         enum thread_state       state;
85         u64                     sched_out_time;
86         u64                     wake_up_time;
87         u64                     sched_in_time;
88         u64                     runtime;
89 };
90
91 struct work_atoms {
92         struct list_head        work_list;
93         struct thread           *thread;
94         struct rb_node          node;
95         u64                     max_lat;
96         u64                     max_lat_at;
97         u64                     total_lat;
98         u64                     nb_atoms;
99         u64                     total_runtime;
100         int                     num_merged;
101 };
102
103 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
104
105 struct perf_sched;
106
107 struct trace_sched_handler {
108         int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
109                             struct perf_sample *sample, struct machine *machine);
110
111         int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
112                              struct perf_sample *sample, struct machine *machine);
113
114         int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
115                             struct perf_sample *sample, struct machine *machine);
116
117         /* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
118         int (*fork_event)(struct perf_sched *sched, union perf_event *event,
119                           struct machine *machine);
120
121         int (*migrate_task_event)(struct perf_sched *sched,
122                                   struct perf_evsel *evsel,
123                                   struct perf_sample *sample,
124                                   struct machine *machine);
125 };
126
127 #define COLOR_PIDS PERF_COLOR_BLUE
128 #define COLOR_CPUS PERF_COLOR_BG_RED
129
130 struct perf_sched_map {
131         DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
132         int                     *comp_cpus;
133         bool                     comp;
134         struct thread_map       *color_pids;
135         const char              *color_pids_str;
136         struct cpu_map          *color_cpus;
137         const char              *color_cpus_str;
138         struct cpu_map          *cpus;
139         const char              *cpus_str;
140 };
141
142 struct perf_sched {
143         struct perf_tool tool;
144         const char       *sort_order;
145         unsigned long    nr_tasks;
146         struct task_desc **pid_to_task;
147         struct task_desc **tasks;
148         const struct trace_sched_handler *tp_handler;
149         pthread_mutex_t  start_work_mutex;
150         pthread_mutex_t  work_done_wait_mutex;
151         int              profile_cpu;
152 /*
153  * Track the current task - that way we can know whether there's any
154  * weird events, such as a task being switched away that is not current.
155  */
156         int              max_cpu;
157         u32              curr_pid[MAX_CPUS];
158         struct thread    *curr_thread[MAX_CPUS];
159         char             next_shortname1;
160         char             next_shortname2;
161         unsigned int     replay_repeat;
162         unsigned long    nr_run_events;
163         unsigned long    nr_sleep_events;
164         unsigned long    nr_wakeup_events;
165         unsigned long    nr_sleep_corrections;
166         unsigned long    nr_run_events_optimized;
167         unsigned long    targetless_wakeups;
168         unsigned long    multitarget_wakeups;
169         unsigned long    nr_runs;
170         unsigned long    nr_timestamps;
171         unsigned long    nr_unordered_timestamps;
172         unsigned long    nr_context_switch_bugs;
173         unsigned long    nr_events;
174         unsigned long    nr_lost_chunks;
175         unsigned long    nr_lost_events;
176         u64              run_measurement_overhead;
177         u64              sleep_measurement_overhead;
178         u64              start_time;
179         u64              cpu_usage;
180         u64              runavg_cpu_usage;
181         u64              parent_cpu_usage;
182         u64              runavg_parent_cpu_usage;
183         u64              sum_runtime;
184         u64              sum_fluct;
185         u64              run_avg;
186         u64              all_runtime;
187         u64              all_count;
188         u64              cpu_last_switched[MAX_CPUS];
189         struct rb_root   atom_root, sorted_atom_root, merged_atom_root;
190         struct list_head sort_list, cmp_pid;
191         bool force;
192         bool skip_merge;
193         struct perf_sched_map map;
194 };
195
196 static u64 get_nsecs(void)
197 {
198         struct timespec ts;
199
200         clock_gettime(CLOCK_MONOTONIC, &ts);
201
202         return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
203 }
204
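/*
 * Busy-loop for roughly 'nsecs' nanoseconds, compensating for the
 * measured overhead of the timing calls themselves.
 */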
205 static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
206 {
207         u64 T0 = get_nsecs(), T1;
208
209         do {
210                 T1 = get_nsecs();
211         } while (T1 + sched->run_measurement_overhead < T0 + nsecs);
212 }
213
214 static void sleep_nsecs(u64 nsecs)
215 {
216         struct timespec ts;
217
218         ts.tv_nsec = nsecs % 1000000000ULL;
219         ts.tv_sec = nsecs / 1000000000ULL;
220
221         nanosleep(&ts, NULL);
222 }
223
224 static void calibrate_run_measurement_overhead(struct perf_sched *sched)
225 {
226         u64 T0, T1, delta, min_delta = 1000000000ULL;
227         int i;
228
229         for (i = 0; i < 10; i++) {
230                 T0 = get_nsecs();
231                 burn_nsecs(sched, 0);
232                 T1 = get_nsecs();
233                 delta = T1-T0;
234                 min_delta = min(min_delta, delta);
235         }
236         sched->run_measurement_overhead = min_delta;
237
238         printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
239 }
240
241 static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
242 {
243         u64 T0, T1, delta, min_delta = 1000000000ULL;
244         int i;
245
246         for (i = 0; i < 10; i++) {
247                 T0 = get_nsecs();
248                 sleep_nsecs(10000);
249                 T1 = get_nsecs();
250                 delta = T1-T0;
251                 min_delta = min(min_delta, delta);
252         }
253         min_delta -= 10000;
254         sched->sleep_measurement_overhead = min_delta;
255
256         printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
257 }
258
259 static struct sched_atom *
260 get_new_event(struct task_desc *task, u64 timestamp)
261 {
262         struct sched_atom *event = zalloc(sizeof(*event));
263         unsigned long idx = task->nr_events;
264         size_t size;
265
266         event->timestamp = timestamp;
267         event->nr = idx;
268
269         task->nr_events++;
270         size = sizeof(struct sched_atom *) * task->nr_events;
271         task->atoms = realloc(task->atoms, size);
272         BUG_ON(!task->atoms);
273
274         task->atoms[idx] = event;
275
276         return event;
277 }
278
279 static struct sched_atom *last_event(struct task_desc *task)
280 {
281         if (!task->nr_events)
282                 return NULL;
283
284         return task->atoms[task->nr_events - 1];
285 }
286
287 static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
288                                 u64 timestamp, u64 duration)
289 {
290         struct sched_atom *event, *curr_event = last_event(task);
291
292         /*
293          * optimize an existing RUN event by merging this one
294          * to it:
295          */
296         if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
297                 sched->nr_run_events_optimized++;
298                 curr_event->duration += duration;
299                 return;
300         }
301
302         event = get_new_event(task, timestamp);
303
304         event->type = SCHED_EVENT_RUN;
305         event->duration = duration;
306
307         sched->nr_run_events++;
308 }
309
310 static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
311                                    u64 timestamp, struct task_desc *wakee)
312 {
313         struct sched_atom *event, *wakee_event;
314
315         event = get_new_event(task, timestamp);
316         event->type = SCHED_EVENT_WAKEUP;
317         event->wakee = wakee;
318
319         wakee_event = last_event(wakee);
320         if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
321                 sched->targetless_wakeups++;
322                 return;
323         }
324         if (wakee_event->wait_sem) {
325                 sched->multitarget_wakeups++;
326                 return;
327         }
328
329         wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
330         sem_init(wakee_event->wait_sem, 0, 0);
331         wakee_event->specific_wait = 1;
332         event->wait_sem = wakee_event->wait_sem;
333
334         sched->nr_wakeup_events++;
335 }
336
337 static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
338                                   u64 timestamp, u64 task_state __maybe_unused)
339 {
340         struct sched_atom *event = get_new_event(task, timestamp);
341
342         event->type = SCHED_EVENT_SLEEP;
343
344         sched->nr_sleep_events++;
345 }
346
347 static struct task_desc *register_pid(struct perf_sched *sched,
348                                       unsigned long pid, const char *comm)
349 {
350         struct task_desc *task;
351         static int pid_max;
352
353         if (sched->pid_to_task == NULL) {
354                 if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
355                         pid_max = MAX_PID;
356                 BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
357         }
358         if (pid >= (unsigned long)pid_max) {
359                 BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) *
360                         sizeof(struct task_desc *))) == NULL);
361                 while (pid >= (unsigned long)pid_max)
362                         sched->pid_to_task[pid_max++] = NULL;
363         }
364
365         task = sched->pid_to_task[pid];
366
367         if (task)
368                 return task;
369
370         task = zalloc(sizeof(*task));
371         task->pid = pid;
372         task->nr = sched->nr_tasks;
373         strcpy(task->comm, comm);
374         /*
375          * every task starts in sleeping state - this gets ignored
376          * if there's no wakeup pointing to this sleep state:
377          */
378         add_sched_event_sleep(sched, task, 0, 0);
379
380         sched->pid_to_task[pid] = task;
381         sched->nr_tasks++;
382         sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
383         BUG_ON(!sched->tasks);
384         sched->tasks[task->nr] = task;
385
386         if (verbose)
387                 printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
388
389         return task;
390 }
391
392
393 static void print_task_traces(struct perf_sched *sched)
394 {
395         struct task_desc *task;
396         unsigned long i;
397
398         for (i = 0; i < sched->nr_tasks; i++) {
399                 task = sched->tasks[i];
400                 printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
401                         task->nr, task->comm, task->pid, task->nr_events);
402         }
403 }
404
405 static void add_cross_task_wakeups(struct perf_sched *sched)
406 {
407         struct task_desc *task1, *task2;
408         unsigned long i, j;
409
410         for (i = 0; i < sched->nr_tasks; i++) {
411                 task1 = sched->tasks[i];
412                 j = i + 1;
413                 if (j == sched->nr_tasks)
414                         j = 0;
415                 task2 = sched->tasks[j];
416                 add_sched_event_wakeup(sched, task1, 0, task2);
417         }
418 }
419
420 static void perf_sched__process_event(struct perf_sched *sched,
421                                       struct sched_atom *atom)
422 {
423         int ret = 0;
424
425         switch (atom->type) {
426                 case SCHED_EVENT_RUN:
427                         burn_nsecs(sched, atom->duration);
428                         break;
429                 case SCHED_EVENT_SLEEP:
430                         if (atom->wait_sem)
431                                 ret = sem_wait(atom->wait_sem);
432                         BUG_ON(ret);
433                         break;
434                 case SCHED_EVENT_WAKEUP:
435                         if (atom->wait_sem)
436                                 ret = sem_post(atom->wait_sem);
437                         BUG_ON(ret);
438                         break;
439                 case SCHED_EVENT_MIGRATION:
440                         break;
441                 default:
442                         BUG_ON(1);
443         }
444 }
445
446 static u64 get_cpu_usage_nsec_parent(void)
447 {
448         struct rusage ru;
449         u64 sum;
450         int err;
451
452         err = getrusage(RUSAGE_SELF, &ru);
453         BUG_ON(err);
454
455         sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
456         sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;
457
458         return sum;
459 }
460
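/*
 * Open a per-thread PERF_COUNT_SW_TASK_CLOCK counter so each replay
 * thread can measure its own CPU time; with -f, try to raise
 * RLIMIT_NOFILE and retry when the file-descriptor limit is hit.
 */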
461 static int self_open_counters(struct perf_sched *sched, unsigned long cur_task)
462 {
463         struct perf_event_attr attr;
464         char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE];
465         int fd;
466         struct rlimit limit;
467         bool need_privilege = false;
468
469         memset(&attr, 0, sizeof(attr));
470
471         attr.type = PERF_TYPE_SOFTWARE;
472         attr.config = PERF_COUNT_SW_TASK_CLOCK;
473
474 force_again:
475         fd = sys_perf_event_open(&attr, 0, -1, -1,
476                                  perf_event_open_cloexec_flag());
477
478         if (fd < 0) {
479                 if (errno == EMFILE) {
480                         if (sched->force) {
481                                 BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1);
482                                 limit.rlim_cur += sched->nr_tasks - cur_task;
483                                 if (limit.rlim_cur > limit.rlim_max) {
484                                         limit.rlim_max = limit.rlim_cur;
485                                         need_privilege = true;
486                                 }
487                                 if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
488                                         if (need_privilege && errno == EPERM)
489                                                 strcpy(info, "Need privilege\n");
490                                 } else
491                                         goto force_again;
492                         } else
493                                 strcpy(info, "Have a try with -f option\n");
494                 }
495                 pr_err("Error: sys_perf_event_open() syscall returned "
496                        "with %d (%s)\n%s", fd,
497                        str_error_r(errno, sbuf, sizeof(sbuf)), info);
498                 exit(EXIT_FAILURE);
499         }
500         return fd;
501 }
502
503 static u64 get_cpu_usage_nsec_self(int fd)
504 {
505         u64 runtime;
506         int ret;
507
508         ret = read(fd, &runtime, sizeof(runtime));
509         BUG_ON(ret != sizeof(runtime));
510
511         return runtime;
512 }
513
514 struct sched_thread_parms {
515         struct task_desc  *task;
516         struct perf_sched *sched;
517         int fd;
518 };
519
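/*
 * Replay worker thread: signal readiness, wait on the start mutex,
 * replay this task's atoms, report CPU usage, then loop for the next
 * repetition.
 */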
520 static void *thread_func(void *ctx)
521 {
522         struct sched_thread_parms *parms = ctx;
523         struct task_desc *this_task = parms->task;
524         struct perf_sched *sched = parms->sched;
525         u64 cpu_usage_0, cpu_usage_1;
526         unsigned long i, ret;
527         char comm2[22];
528         int fd = parms->fd;
529
530         zfree(&parms);
531
532         sprintf(comm2, ":%s", this_task->comm);
533         prctl(PR_SET_NAME, comm2);
534         if (fd < 0)
535                 return NULL;
536 again:
537         ret = sem_post(&this_task->ready_for_work);
538         BUG_ON(ret);
539         ret = pthread_mutex_lock(&sched->start_work_mutex);
540         BUG_ON(ret);
541         ret = pthread_mutex_unlock(&sched->start_work_mutex);
542         BUG_ON(ret);
543
544         cpu_usage_0 = get_cpu_usage_nsec_self(fd);
545
546         for (i = 0; i < this_task->nr_events; i++) {
547                 this_task->curr_event = i;
548                 perf_sched__process_event(sched, this_task->atoms[i]);
549         }
550
551         cpu_usage_1 = get_cpu_usage_nsec_self(fd);
552         this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
553         ret = sem_post(&this_task->work_done_sem);
554         BUG_ON(ret);
555
556         ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
557         BUG_ON(ret);
558         ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
559         BUG_ON(ret);
560
561         goto again;
562 }
563
564 static void create_tasks(struct perf_sched *sched)
565 {
566         struct task_desc *task;
567         pthread_attr_t attr;
568         unsigned long i;
569         int err;
570
571         err = pthread_attr_init(&attr);
572         BUG_ON(err);
573         err = pthread_attr_setstacksize(&attr,
574                         (size_t) max(16 * 1024, PTHREAD_STACK_MIN));
575         BUG_ON(err);
576         err = pthread_mutex_lock(&sched->start_work_mutex);
577         BUG_ON(err);
578         err = pthread_mutex_lock(&sched->work_done_wait_mutex);
579         BUG_ON(err);
580         for (i = 0; i < sched->nr_tasks; i++) {
581                 struct sched_thread_parms *parms = malloc(sizeof(*parms));
582                 BUG_ON(parms == NULL);
583                 parms->task = task = sched->tasks[i];
584                 parms->sched = sched;
585                 parms->fd = self_open_counters(sched, i);
586                 sem_init(&task->sleep_sem, 0, 0);
587                 sem_init(&task->ready_for_work, 0, 0);
588                 sem_init(&task->work_done_sem, 0, 0);
589                 task->curr_event = 0;
590                 err = pthread_create(&task->thread, &attr, thread_func, parms);
591                 BUG_ON(err);
592         }
593 }
594
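/*
 * Release the worker threads for one replay pass, wait for them all to
 * finish, and fold their CPU usage into the running averages.
 */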
595 static void wait_for_tasks(struct perf_sched *sched)
596 {
597         u64 cpu_usage_0, cpu_usage_1;
598         struct task_desc *task;
599         unsigned long i, ret;
600
601         sched->start_time = get_nsecs();
602         sched->cpu_usage = 0;
603         pthread_mutex_unlock(&sched->work_done_wait_mutex);
604
605         for (i = 0; i < sched->nr_tasks; i++) {
606                 task = sched->tasks[i];
607                 ret = sem_wait(&task->ready_for_work);
608                 BUG_ON(ret);
609                 sem_init(&task->ready_for_work, 0, 0);
610         }
611         ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
612         BUG_ON(ret);
613
614         cpu_usage_0 = get_cpu_usage_nsec_parent();
615
616         pthread_mutex_unlock(&sched->start_work_mutex);
617
618         for (i = 0; i < sched->nr_tasks; i++) {
619                 task = sched->tasks[i];
620                 ret = sem_wait(&task->work_done_sem);
621                 BUG_ON(ret);
622                 sem_init(&task->work_done_sem, 0, 0);
623                 sched->cpu_usage += task->cpu_usage;
624                 task->cpu_usage = 0;
625         }
626
627         cpu_usage_1 = get_cpu_usage_nsec_parent();
628         if (!sched->runavg_cpu_usage)
629                 sched->runavg_cpu_usage = sched->cpu_usage;
630         sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat;
631
632         sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
633         if (!sched->runavg_parent_cpu_usage)
634                 sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
635         sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) +
636                                          sched->parent_cpu_usage)/sched->replay_repeat;
637
638         ret = pthread_mutex_lock(&sched->start_work_mutex);
639         BUG_ON(ret);
640
641         for (i = 0; i < sched->nr_tasks; i++) {
642                 task = sched->tasks[i];
643                 sem_init(&task->sleep_sem, 0, 0);
644                 task->curr_event = 0;
645         }
646 }
647
648 static void run_one_test(struct perf_sched *sched)
649 {
650         u64 T0, T1, delta, avg_delta, fluct;
651
652         T0 = get_nsecs();
653         wait_for_tasks(sched);
654         T1 = get_nsecs();
655
656         delta = T1 - T0;
657         sched->sum_runtime += delta;
658         sched->nr_runs++;
659
660         avg_delta = sched->sum_runtime / sched->nr_runs;
661         if (delta < avg_delta)
662                 fluct = avg_delta - delta;
663         else
664                 fluct = delta - avg_delta;
665         sched->sum_fluct += fluct;
666         if (!sched->run_avg)
667                 sched->run_avg = delta;
668         sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat;
669
670         printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);
671
672         printf("ravg: %0.2f, ", (double)sched->run_avg / 1e6);
673
674         printf("cpu: %0.2f / %0.2f",
675                 (double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);
676
677 #if 0
678         /*
679          * rusage statistics done by the parent; these are less
680          * accurate than the sched->sum_exec_runtime based statistics:
681          */
682         printf(" [%0.2f / %0.2f]",
683                 (double)sched->parent_cpu_usage/1e6,
684                 (double)sched->runavg_parent_cpu_usage/1e6);
685 #endif
686
687         printf("\n");
688
689         if (sched->nr_sleep_corrections)
690                 printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
691         sched->nr_sleep_corrections = 0;
692 }
693
694 static void test_calibrations(struct perf_sched *sched)
695 {
696         u64 T0, T1;
697
698         T0 = get_nsecs();
699         burn_nsecs(sched, 1e6);
700         T1 = get_nsecs();
701
702         printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
703
704         T0 = get_nsecs();
705         sleep_nsecs(1e6);
706         T1 = get_nsecs();
707
708         printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
709 }
710
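/*
 * Replay-mode tracepoint handlers: register the tasks seen in the trace
 * and turn wakeup/switch samples into sched atoms for later replay.
 */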
711 static int
712 replay_wakeup_event(struct perf_sched *sched,
713                     struct perf_evsel *evsel, struct perf_sample *sample,
714                     struct machine *machine __maybe_unused)
715 {
716         const char *comm = perf_evsel__strval(evsel, sample, "comm");
717         const u32 pid    = perf_evsel__intval(evsel, sample, "pid");
718         struct task_desc *waker, *wakee;
719
720         if (verbose) {
721                 printf("sched_wakeup event %p\n", evsel);
722
723                 printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
724         }
725
726         waker = register_pid(sched, sample->tid, "<unknown>");
727         wakee = register_pid(sched, pid, comm);
728
729         add_sched_event_wakeup(sched, waker, sample->time, wakee);
730         return 0;
731 }
732
733 static int replay_switch_event(struct perf_sched *sched,
734                                struct perf_evsel *evsel,
735                                struct perf_sample *sample,
736                                struct machine *machine __maybe_unused)
737 {
738         const char *prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
739                    *next_comm  = perf_evsel__strval(evsel, sample, "next_comm");
740         const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
741                   next_pid = perf_evsel__intval(evsel, sample, "next_pid");
742         const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
743         struct task_desc *prev, __maybe_unused *next;
744         u64 timestamp0, timestamp = sample->time;
745         int cpu = sample->cpu;
746         s64 delta;
747
748         if (verbose)
749                 printf("sched_switch event %p\n", evsel);
750
751         if (cpu >= MAX_CPUS || cpu < 0)
752                 return 0;
753
754         timestamp0 = sched->cpu_last_switched[cpu];
755         if (timestamp0)
756                 delta = timestamp - timestamp0;
757         else
758                 delta = 0;
759
760         if (delta < 0) {
761                 pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
762                 return -1;
763         }
764
765         pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
766                  prev_comm, prev_pid, next_comm, next_pid, delta);
767
768         prev = register_pid(sched, prev_pid, prev_comm);
769         next = register_pid(sched, next_pid, next_comm);
770
771         sched->cpu_last_switched[cpu] = timestamp;
772
773         add_sched_event_run(sched, prev, timestamp, delta);
774         add_sched_event_sleep(sched, prev, timestamp, prev_state);
775
776         return 0;
777 }
778
779 static int replay_fork_event(struct perf_sched *sched,
780                              union perf_event *event,
781                              struct machine *machine)
782 {
783         struct thread *child, *parent;
784
785         child = machine__findnew_thread(machine, event->fork.pid,
786                                         event->fork.tid);
787         parent = machine__findnew_thread(machine, event->fork.ppid,
788                                          event->fork.ptid);
789
790         if (child == NULL || parent == NULL) {
791                 pr_debug("thread does not exist on fork event: child %p, parent %p\n",
792                                  child, parent);
793                 goto out_put;
794         }
795
796         if (verbose) {
797                 printf("fork event\n");
798                 printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
799                 printf("...  child: %s/%d\n", thread__comm_str(child), child->tid);
800         }
801
802         register_pid(sched, parent->tid, thread__comm_str(parent));
803         register_pid(sched, child->tid, thread__comm_str(child));
804 out_put:
805         thread__put(child);
806         thread__put(parent);
807         return 0;
808 }
809
810 struct sort_dimension {
811         const char              *name;
812         sort_fn_t               cmp;
813         struct list_head        list;
814 };
815
816 static int
817 thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
818 {
819         struct sort_dimension *sort;
820         int ret = 0;
821
822         BUG_ON(list_empty(list));
823
824         list_for_each_entry(sort, list, list) {
825                 ret = sort->cmp(l, r);
826                 if (ret)
827                         return ret;
828         }
829
830         return ret;
831 }
832
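/*
 * Latency mode keeps one work_atoms entry per thread in an rbtree,
 * ordered by the sort keys in sched->cmp_pid; each entry holds that
 * thread's list of wait/run atoms.
 */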
833 static struct work_atoms *
834 thread_atoms_search(struct rb_root *root, struct thread *thread,
835                          struct list_head *sort_list)
836 {
837         struct rb_node *node = root->rb_node;
838         struct work_atoms key = { .thread = thread };
839
840         while (node) {
841                 struct work_atoms *atoms;
842                 int cmp;
843
844                 atoms = container_of(node, struct work_atoms, node);
845
846                 cmp = thread_lat_cmp(sort_list, &key, atoms);
847                 if (cmp > 0)
848                         node = node->rb_left;
849                 else if (cmp < 0)
850                         node = node->rb_right;
851                 else {
852                         BUG_ON(thread != atoms->thread);
853                         return atoms;
854                 }
855         }
856         return NULL;
857 }
858
859 static void
860 __thread_latency_insert(struct rb_root *root, struct work_atoms *data,
861                          struct list_head *sort_list)
862 {
863         struct rb_node **new = &(root->rb_node), *parent = NULL;
864
865         while (*new) {
866                 struct work_atoms *this;
867                 int cmp;
868
869                 this = container_of(*new, struct work_atoms, node);
870                 parent = *new;
871
872                 cmp = thread_lat_cmp(sort_list, data, this);
873
874                 if (cmp > 0)
875                         new = &((*new)->rb_left);
876                 else
877                         new = &((*new)->rb_right);
878         }
879
880         rb_link_node(&data->node, parent, new);
881         rb_insert_color(&data->node, root);
882 }
883
884 static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
885 {
886         struct work_atoms *atoms = zalloc(sizeof(*atoms));
887         if (!atoms) {
888                 pr_err("No memory at %s\n", __func__);
889                 return -1;
890         }
891
892         atoms->thread = thread__get(thread);
893         INIT_LIST_HEAD(&atoms->work_list);
894         __thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
895         return 0;
896 }
897
898 static char sched_out_state(u64 prev_state)
899 {
900         const char *str = TASK_STATE_TO_CHAR_STR;
901
902         return str[prev_state];
903 }
904
905 static int
906 add_sched_out_event(struct work_atoms *atoms,
907                     char run_state,
908                     u64 timestamp)
909 {
910         struct work_atom *atom = zalloc(sizeof(*atom));
911         if (!atom) {
912                 pr_err("No memory at %s\n", __func__);
913                 return -1;
914         }
915
916         atom->sched_out_time = timestamp;
917
918         if (run_state == 'R') {
919                 atom->state = THREAD_WAIT_CPU;
920                 atom->wake_up_time = atom->sched_out_time;
921         }
922
923         list_add_tail(&atom->list, &atoms->work_list);
924         return 0;
925 }
926
927 static void
928 add_runtime_event(struct work_atoms *atoms, u64 delta,
929                   u64 timestamp __maybe_unused)
930 {
931         struct work_atom *atom;
932
933         BUG_ON(list_empty(&atoms->work_list));
934
935         atom = list_entry(atoms->work_list.prev, struct work_atom, list);
936
937         atom->runtime += delta;
938         atoms->total_runtime += delta;
939 }
940
941 static void
942 add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
943 {
944         struct work_atom *atom;
945         u64 delta;
946
947         if (list_empty(&atoms->work_list))
948                 return;
949
950         atom = list_entry(atoms->work_list.prev, struct work_atom, list);
951
952         if (atom->state != THREAD_WAIT_CPU)
953                 return;
954
955         if (timestamp < atom->wake_up_time) {
956                 atom->state = THREAD_IGNORE;
957                 return;
958         }
959
960         atom->state = THREAD_SCHED_IN;
961         atom->sched_in_time = timestamp;
962
963         delta = atom->sched_in_time - atom->wake_up_time;
964         atoms->total_lat += delta;
965         if (delta > atoms->max_lat) {
966                 atoms->max_lat = delta;
967                 atoms->max_lat_at = timestamp;
968         }
969         atoms->nb_atoms++;
970 }
971
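/*
 * sched_switch in latency mode: record a sched-out atom for the task
 * leaving the CPU and complete the pending atom of the task coming in.
 */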
972 static int latency_switch_event(struct perf_sched *sched,
973                                 struct perf_evsel *evsel,
974                                 struct perf_sample *sample,
975                                 struct machine *machine)
976 {
977         const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
978                   next_pid = perf_evsel__intval(evsel, sample, "next_pid");
979         const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
980         struct work_atoms *out_events, *in_events;
981         struct thread *sched_out, *sched_in;
982         u64 timestamp0, timestamp = sample->time;
983         int cpu = sample->cpu, err = -1;
984         s64 delta;
985
986         BUG_ON(cpu >= MAX_CPUS || cpu < 0);
987
988         timestamp0 = sched->cpu_last_switched[cpu];
989         sched->cpu_last_switched[cpu] = timestamp;
990         if (timestamp0)
991                 delta = timestamp - timestamp0;
992         else
993                 delta = 0;
994
995         if (delta < 0) {
996                 pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
997                 return -1;
998         }
999
1000         sched_out = machine__findnew_thread(machine, -1, prev_pid);
1001         sched_in = machine__findnew_thread(machine, -1, next_pid);
1002         if (sched_out == NULL || sched_in == NULL)
1003                 goto out_put;
1004
1005         out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
1006         if (!out_events) {
1007                 if (thread_atoms_insert(sched, sched_out))
1008                         goto out_put;
1009                 out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
1010                 if (!out_events) {
1011                         pr_err("out-event: Internal tree error");
1012                         goto out_put;
1013                 }
1014         }
1015         if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
1016                 return -1;
1017
1018         in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
1019         if (!in_events) {
1020                 if (thread_atoms_insert(sched, sched_in))
1021                         goto out_put;
1022                 in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
1023                 if (!in_events) {
1024                         pr_err("in-event: Internal tree error");
1025                         goto out_put;
1026                 }
1027                 /*
1028                  * A task came in that we have not heard about yet,
1029                  * so add an initial atom in runnable state:
1030                  */
1031                 if (add_sched_out_event(in_events, 'R', timestamp))
1032                         goto out_put;
1033         }
1034         add_sched_in_event(in_events, timestamp);
1035         err = 0;
1036 out_put:
1037         thread__put(sched_out);
1038         thread__put(sched_in);
1039         return err;
1040 }
1041
1042 static int latency_runtime_event(struct perf_sched *sched,
1043                                  struct perf_evsel *evsel,
1044                                  struct perf_sample *sample,
1045                                  struct machine *machine)
1046 {
1047         const u32 pid      = perf_evsel__intval(evsel, sample, "pid");
1048         const u64 runtime  = perf_evsel__intval(evsel, sample, "runtime");
1049         struct thread *thread = machine__findnew_thread(machine, -1, pid);
1050         struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
1051         u64 timestamp = sample->time;
1052         int cpu = sample->cpu, err = -1;
1053
1054         if (thread == NULL)
1055                 return -1;
1056
1057         BUG_ON(cpu >= MAX_CPUS || cpu < 0);
1058         if (!atoms) {
1059                 if (thread_atoms_insert(sched, thread))
1060                         goto out_put;
1061                 atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
1062                 if (!atoms) {
1063                         pr_err("in-event: Internal tree error");
1064                         goto out_put;
1065                 }
1066                 if (add_sched_out_event(atoms, 'R', timestamp))
1067                         goto out_put;
1068         }
1069
1070         add_runtime_event(atoms, runtime, timestamp);
1071         err = 0;
1072 out_put:
1073         thread__put(thread);
1074         return err;
1075 }
1076
1077 static int latency_wakeup_event(struct perf_sched *sched,
1078                                 struct perf_evsel *evsel,
1079                                 struct perf_sample *sample,
1080                                 struct machine *machine)
1081 {
1082         const u32 pid     = perf_evsel__intval(evsel, sample, "pid");
1083         struct work_atoms *atoms;
1084         struct work_atom *atom;
1085         struct thread *wakee;
1086         u64 timestamp = sample->time;
1087         int err = -1;
1088
1089         wakee = machine__findnew_thread(machine, -1, pid);
1090         if (wakee == NULL)
1091                 return -1;
1092         atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1093         if (!atoms) {
1094                 if (thread_atoms_insert(sched, wakee))
1095                         goto out_put;
1096                 atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1097                 if (!atoms) {
1098                         pr_err("wakeup-event: Internal tree error");
1099                         goto out_put;
1100                 }
1101                 if (add_sched_out_event(atoms, 'S', timestamp))
1102                         goto out_put;
1103         }
1104
1105         BUG_ON(list_empty(&atoms->work_list));
1106
1107         atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1108
1109         /*
1110          * A wakeup event is not guaranteed to arrive only while the
1111          * task is off the run queue; it may also fire for a task that
1112          * is already on the run queue, where it merely changes ->state
1113          * to TASK_RUNNING. In that case we should not update the
1114          * ->wake_up_time of a task that is still on the run queue.
1115          *
1116          * You WILL be missing events if you've recorded only
1117          * one CPU, or are only looking at one, so don't
1118          * skip in this case.
1119          */
1120         if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
1121                 goto out_ok;
1122
1123         sched->nr_timestamps++;
1124         if (atom->sched_out_time > timestamp) {
1125                 sched->nr_unordered_timestamps++;
1126                 goto out_ok;
1127         }
1128
1129         atom->state = THREAD_WAIT_CPU;
1130         atom->wake_up_time = timestamp;
1131 out_ok:
1132         err = 0;
1133 out_put:
1134         thread__put(wakee);
1135         return err;
1136 }
1137
1138 static int latency_migrate_task_event(struct perf_sched *sched,
1139                                       struct perf_evsel *evsel,
1140                                       struct perf_sample *sample,
1141                                       struct machine *machine)
1142 {
1143         const u32 pid = perf_evsel__intval(evsel, sample, "pid");
1144         u64 timestamp = sample->time;
1145         struct work_atoms *atoms;
1146         struct work_atom *atom;
1147         struct thread *migrant;
1148         int err = -1;
1149
1150         /*
1151          * Only need to worry about migration when profiling one CPU.
1152          */
1153         if (sched->profile_cpu == -1)
1154                 return 0;
1155
1156         migrant = machine__findnew_thread(machine, -1, pid);
1157         if (migrant == NULL)
1158                 return -1;
1159         atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1160         if (!atoms) {
1161                 if (thread_atoms_insert(sched, migrant))
1162                         goto out_put;
1163                 register_pid(sched, migrant->tid, thread__comm_str(migrant));
1164                 atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1165                 if (!atoms) {
1166                         pr_err("migration-event: Internal tree error");
1167                         goto out_put;
1168                 }
1169                 if (add_sched_out_event(atoms, 'R', timestamp))
1170                         goto out_put;
1171         }
1172
1173         BUG_ON(list_empty(&atoms->work_list));
1174
1175         atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1176         atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
1177
1178         sched->nr_timestamps++;
1179
1180         if (atom->sched_out_time > timestamp)
1181                 sched->nr_unordered_timestamps++;
1182         err = 0;
1183 out_put:
1184         thread__put(migrant);
1185         return err;
1186 }
1187
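/*
 * Print one line of the latency table for a thread: total runtime,
 * number of atoms (context switches), average and maximum
 * wakeup-to-schedule-in latency, and when the maximum occurred.
 */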
1188 static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
1189 {
1190         int i;
1191         int ret;
1192         u64 avg;
1193
1194         if (!work_list->nb_atoms)
1195                 return;
1196         /*
1197          * Ignore idle threads:
1198          */
1199         if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
1200                 return;
1201
1202         sched->all_runtime += work_list->total_runtime;
1203         sched->all_count   += work_list->nb_atoms;
1204
1205         if (work_list->num_merged > 1)
1206                 ret = printf("  %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
1207         else
1208                 ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
1209
1210         for (i = 0; i < 24 - ret; i++)
1211                 printf(" ");
1212
1213         avg = work_list->total_lat / work_list->nb_atoms;
1214
1215         printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %13.6f s\n",
1216               (double)work_list->total_runtime / 1e6,
1217                  work_list->nb_atoms, (double)avg / 1e6,
1218                  (double)work_list->max_lat / 1e6,
1219                  (double)work_list->max_lat_at / 1e9);
1220 }
1221
1222 static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
1223 {
1224         if (l->thread == r->thread)
1225                 return 0;
1226         if (l->thread->tid < r->thread->tid)
1227                 return -1;
1228         if (l->thread->tid > r->thread->tid)
1229                 return 1;
1230         return (int)(l->thread - r->thread);
1231 }
1232
1233 static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
1234 {
1235         u64 avgl, avgr;
1236
1237         if (!l->nb_atoms)
1238                 return -1;
1239
1240         if (!r->nb_atoms)
1241                 return 1;
1242
1243         avgl = l->total_lat / l->nb_atoms;
1244         avgr = r->total_lat / r->nb_atoms;
1245
1246         if (avgl < avgr)
1247                 return -1;
1248         if (avgl > avgr)
1249                 return 1;
1250
1251         return 0;
1252 }
1253
1254 static int max_cmp(struct work_atoms *l, struct work_atoms *r)
1255 {
1256         if (l->max_lat < r->max_lat)
1257                 return -1;
1258         if (l->max_lat > r->max_lat)
1259                 return 1;
1260
1261         return 0;
1262 }
1263
1264 static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
1265 {
1266         if (l->nb_atoms < r->nb_atoms)
1267                 return -1;
1268         if (l->nb_atoms > r->nb_atoms)
1269                 return 1;
1270
1271         return 0;
1272 }
1273
1274 static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
1275 {
1276         if (l->total_runtime < r->total_runtime)
1277                 return -1;
1278         if (l->total_runtime > r->total_runtime)
1279                 return 1;
1280
1281         return 0;
1282 }
1283
1284 static int sort_dimension__add(const char *tok, struct list_head *list)
1285 {
1286         size_t i;
1287         static struct sort_dimension avg_sort_dimension = {
1288                 .name = "avg",
1289                 .cmp  = avg_cmp,
1290         };
1291         static struct sort_dimension max_sort_dimension = {
1292                 .name = "max",
1293                 .cmp  = max_cmp,
1294         };
1295         static struct sort_dimension pid_sort_dimension = {
1296                 .name = "pid",
1297                 .cmp  = pid_cmp,
1298         };
1299         static struct sort_dimension runtime_sort_dimension = {
1300                 .name = "runtime",
1301                 .cmp  = runtime_cmp,
1302         };
1303         static struct sort_dimension switch_sort_dimension = {
1304                 .name = "switch",
1305                 .cmp  = switch_cmp,
1306         };
1307         struct sort_dimension *available_sorts[] = {
1308                 &pid_sort_dimension,
1309                 &avg_sort_dimension,
1310                 &max_sort_dimension,
1311                 &switch_sort_dimension,
1312                 &runtime_sort_dimension,
1313         };
1314
1315         for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
1316                 if (!strcmp(available_sorts[i]->name, tok)) {
1317                         list_add_tail(&available_sorts[i]->list, list);
1318
1319                         return 0;
1320                 }
1321         }
1322
1323         return -1;
1324 }
1325
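/*
 * Drain atom_root (and merged_atom_root) into sorted_atom_root,
 * re-keyed by the user-selected sort keys in sched->sort_list.
 */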
1326 static void perf_sched__sort_lat(struct perf_sched *sched)
1327 {
1328         struct rb_node *node;
1329         struct rb_root *root = &sched->atom_root;
1330 again:
1331         for (;;) {
1332                 struct work_atoms *data;
1333                 node = rb_first(root);
1334                 if (!node)
1335                         break;
1336
1337                 rb_erase(node, root);
1338                 data = rb_entry(node, struct work_atoms, node);
1339                 __thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
1340         }
1341         if (root == &sched->atom_root) {
1342                 root = &sched->merged_atom_root;
1343                 goto again;
1344         }
1345 }
1346
1347 static int process_sched_wakeup_event(struct perf_tool *tool,
1348                                       struct perf_evsel *evsel,
1349                                       struct perf_sample *sample,
1350                                       struct machine *machine)
1351 {
1352         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1353
1354         if (sched->tp_handler->wakeup_event)
1355                 return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
1356
1357         return 0;
1358 }
1359
1360 union map_priv {
1361         void    *ptr;
1362         bool     color;
1363 };
1364
1365 static bool thread__has_color(struct thread *thread)
1366 {
1367         union map_priv priv = {
1368                 .ptr = thread__priv(thread),
1369         };
1370
1371         return priv.color;
1372 }
1373
1374 static struct thread*
1375 map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid)
1376 {
1377         struct thread *thread = machine__findnew_thread(machine, pid, tid);
1378         union map_priv priv = {
1379                 .color = false,
1380         };
1381
1382         if (!sched->map.color_pids || !thread || thread__priv(thread))
1383                 return thread;
1384
1385         if (thread_map__has(sched->map.color_pids, tid))
1386                 priv.color = true;
1387
1388         thread__set_priv(thread, priv.ptr);
1389         return thread;
1390 }
1391
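/*
 * 'perf sched map' output: one column per CPU, a two-character
 * shortname per task, with '*' marking the CPU where this switch
 * happened; optional filters colour selected PIDs and CPUs.
 */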
1392 static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
1393                             struct perf_sample *sample, struct machine *machine)
1394 {
1395         const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1396         struct thread *sched_in;
1397         int new_shortname;
1398         u64 timestamp0, timestamp = sample->time;
1399         s64 delta;
1400         int i, this_cpu = sample->cpu;
1401         int cpus_nr;
1402         bool new_cpu = false;
1403         const char *color = PERF_COLOR_NORMAL;
1404
1405         BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
1406
1407         if (this_cpu > sched->max_cpu)
1408                 sched->max_cpu = this_cpu;
1409
1410         if (sched->map.comp) {
1411                 cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS);
1412                 if (!test_and_set_bit(this_cpu, sched->map.comp_cpus_mask)) {
1413                         sched->map.comp_cpus[cpus_nr++] = this_cpu;
1414                         new_cpu = true;
1415                 }
1416         } else
1417                 cpus_nr = sched->max_cpu;
1418
1419         timestamp0 = sched->cpu_last_switched[this_cpu];
1420         sched->cpu_last_switched[this_cpu] = timestamp;
1421         if (timestamp0)
1422                 delta = timestamp - timestamp0;
1423         else
1424                 delta = 0;
1425
1426         if (delta < 0) {
1427                 pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
1428                 return -1;
1429         }
1430
1431         sched_in = map__findnew_thread(sched, machine, -1, next_pid);
1432         if (sched_in == NULL)
1433                 return -1;
1434
1435         sched->curr_thread[this_cpu] = thread__get(sched_in);
1436
1437         printf("  ");
1438
1439         new_shortname = 0;
1440         if (!sched_in->shortname[0]) {
1441                 if (!strcmp(thread__comm_str(sched_in), "swapper")) {
1442                         /*
1443                          * Don't allocate a letter-number for swapper:0
1444                          * as a shortname. Instead, we use '.' for it.
1445                          */
1446                         sched_in->shortname[0] = '.';
1447                         sched_in->shortname[1] = ' ';
1448                 } else {
1449                         sched_in->shortname[0] = sched->next_shortname1;
1450                         sched_in->shortname[1] = sched->next_shortname2;
1451
1452                         if (sched->next_shortname1 < 'Z') {
1453                                 sched->next_shortname1++;
1454                         } else {
1455                                 sched->next_shortname1 = 'A';
1456                                 if (sched->next_shortname2 < '9')
1457                                         sched->next_shortname2++;
1458                                 else
1459                                         sched->next_shortname2 = '0';
1460                         }
1461                 }
1462                 new_shortname = 1;
1463         }
1464
1465         for (i = 0; i < cpus_nr; i++) {
1466                 int cpu = sched->map.comp ? sched->map.comp_cpus[i] : i;
1467                 struct thread *curr_thread = sched->curr_thread[cpu];
1468                 const char *pid_color = color;
1469                 const char *cpu_color = color;
1470
1471                 if (curr_thread && thread__has_color(curr_thread))
1472                         pid_color = COLOR_PIDS;
1473
1474                 if (sched->map.cpus && !cpu_map__has(sched->map.cpus, cpu))
1475                         continue;
1476
1477                 if (sched->map.color_cpus && cpu_map__has(sched->map.color_cpus, cpu))
1478                         cpu_color = COLOR_CPUS;
1479
1480                 if (cpu != this_cpu)
1481                         color_fprintf(stdout, cpu_color, " ");
1482                 else
1483                         color_fprintf(stdout, cpu_color, "*");
1484
1485                 if (sched->curr_thread[cpu])
1486                         color_fprintf(stdout, pid_color, "%2s ", sched->curr_thread[cpu]->shortname);
1487                 else
1488                         color_fprintf(stdout, color, "   ");
1489         }
1490
1491         if (sched->map.cpus && !cpu_map__has(sched->map.cpus, this_cpu))
1492                 goto out;
1493
1494         color_fprintf(stdout, color, "  %12.6f secs ", (double)timestamp/1e9);
1495         if (new_shortname) {
1496                 const char *pid_color = color;
1497
1498                 if (thread__has_color(sched_in))
1499                         pid_color = COLOR_PIDS;
1500
1501                 color_fprintf(stdout, pid_color, "%s => %s:%d",
1502                        sched_in->shortname, thread__comm_str(sched_in), sched_in->tid);
1503         }
1504
1505         if (sched->map.comp && new_cpu)
1506                 color_fprintf(stdout, color, " (CPU %d)", this_cpu);
1507
1508 out:
1509         color_fprintf(stdout, color, "\n");
1510
1511         thread__put(sched_in);
1512
1513         return 0;
1514 }
1515
1516 static int process_sched_switch_event(struct perf_tool *tool,
1517                                       struct perf_evsel *evsel,
1518                                       struct perf_sample *sample,
1519                                       struct machine *machine)
1520 {
1521         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1522         int this_cpu = sample->cpu, err = 0;
1523         u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
1524             next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1525
1526         if (sched->curr_pid[this_cpu] != (u32)-1) {
1527                 /*
1528                  * Are we trying to switch away a PID that is
1529                  * not current?
1530                  */
1531                 if (sched->curr_pid[this_cpu] != prev_pid)
1532                         sched->nr_context_switch_bugs++;
1533         }
1534
1535         if (sched->tp_handler->switch_event)
1536                 err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
1537
1538         sched->curr_pid[this_cpu] = next_pid;
1539         return err;
1540 }
1541
1542 static int process_sched_runtime_event(struct perf_tool *tool,
1543                                        struct perf_evsel *evsel,
1544                                        struct perf_sample *sample,
1545                                        struct machine *machine)
1546 {
1547         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1548
1549         if (sched->tp_handler->runtime_event)
1550                 return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
1551
1552         return 0;
1553 }
1554
1555 static int perf_sched__process_fork_event(struct perf_tool *tool,
1556                                           union perf_event *event,
1557                                           struct perf_sample *sample,
1558                                           struct machine *machine)
1559 {
1560         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1561
1562         /* run the fork event through the perf machinery */
1563         perf_event__process_fork(tool, event, sample, machine);
1564
1565         /* and then run additional processing needed for this command */
1566         if (sched->tp_handler->fork_event)
1567                 return sched->tp_handler->fork_event(sched, event, machine);
1568
1569         return 0;
1570 }
1571
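/* Forward sched_migrate_task samples to the mode-specific handler, if any. */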
1572 static int process_sched_migrate_task_event(struct perf_tool *tool,
1573                                             struct perf_evsel *evsel,
1574                                             struct perf_sample *sample,
1575                                             struct machine *machine)
1576 {
1577         struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1578
1579         if (sched->tp_handler->migrate_task_event)
1580                 return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
1581
1582         return 0;
1583 }
1584
1585 typedef int (*tracepoint_handler)(struct perf_tool *tool,
1586                                   struct perf_evsel *evsel,
1587                                   struct perf_sample *sample,
1588                                   struct machine *machine);
1589
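/*
 * Common sample callback: each tracepoint evsel carries its dispatcher
 * in evsel->handler (installed via perf_session__set_tracepoints_handlers()
 * below), so dispatching is a single indirect call.
 */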
1590 static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
1591                                                  union perf_event *event __maybe_unused,
1592                                                  struct perf_sample *sample,
1593                                                  struct perf_evsel *evsel,
1594                                                  struct machine *machine)
1595 {
1596         int err = 0;
1597
1598         if (evsel->handler != NULL) {
1599                 tracepoint_handler f = evsel->handler;
1600                 err = f(tool, evsel, sample, machine);
1601         }
1602
1603         return err;
1604 }
1605
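/*
 * Open the input file (perf.data or the file given with -i), wire up the
 * tracepoint handlers below, process all events and collect the
 * event/lost-event statistics later reported by print_bad_events().
 */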
1606 static int perf_sched__read_events(struct perf_sched *sched)
1607 {
1608         const struct perf_evsel_str_handler handlers[] = {
1609                 { "sched:sched_switch",       process_sched_switch_event, },
1610                 { "sched:sched_stat_runtime", process_sched_runtime_event, },
1611                 { "sched:sched_wakeup",       process_sched_wakeup_event, },
1612                 { "sched:sched_wakeup_new",   process_sched_wakeup_event, },
1613                 { "sched:sched_migrate_task", process_sched_migrate_task_event, },
1614         };
1615         struct perf_session *session;
1616         struct perf_data_file file = {
1617                 .path = input_name,
1618                 .mode = PERF_DATA_MODE_READ,
1619                 .force = sched->force,
1620         };
1621         int rc = -1;
1622
1623         session = perf_session__new(&file, false, &sched->tool);
1624         if (session == NULL) {
1625                 pr_debug("No Memory for session\n");
1626                 return -1;
1627         }
1628
1629         symbol__init(&session->header.env);
1630
1631         if (perf_session__set_tracepoints_handlers(session, handlers))
1632                 goto out_delete;
1633
1634         if (perf_session__has_traces(session, "record -R")) {
1635                 int err = perf_session__process_events(session);
1636                 if (err) {
1637                         pr_err("Failed to process events, error %d", err);
1638                         goto out_delete;
1639                 }
1640
1641                 sched->nr_events      = session->evlist->stats.nr_events[0];
1642                 sched->nr_lost_events = session->evlist->stats.total_lost;
1643                 sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
1644         }
1645
1646         rc = 0;
1647 out_delete:
1648         perf_session__delete(session);
1649         return rc;
1650 }
1651
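/*
 * Print data-quality warnings: unordered timestamps, lost events and
 * apparent context-switch bugs, each as a percentage of the total.
 */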
1652 static void print_bad_events(struct perf_sched *sched)
1653 {
1654         if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
1655                 printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
1656                         (double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
1657                         sched->nr_unordered_timestamps, sched->nr_timestamps);
1658         }
1659         if (sched->nr_lost_events && sched->nr_events) {
1660                 printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
1661                         (double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
1662                         sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
1663         }
1664         if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
1665                 printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
1666                         (double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
1667                         sched->nr_context_switch_bugs, sched->nr_timestamps);
1668                 if (sched->nr_lost_events)
1669                         printf(" (due to lost events?)");
1670                 printf("\n");
1671         }
1672 }
1673
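/*
 * Insert @data into the comm-sorted rbtree @root.  If an entry with the
 * same comm already exists, fold the new atoms into it (summing runtime,
 * latency and atom counts, keeping the larger max latency) and free
 * @data instead of linking it.
 */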
1674 static void __merge_work_atoms(struct rb_root *root, struct work_atoms *data)
1675 {
1676         struct rb_node **new = &(root->rb_node), *parent = NULL;
1677         struct work_atoms *this;
1678         const char *comm = thread__comm_str(data->thread), *this_comm;
1679
1680         while (*new) {
1681                 int cmp;
1682
1683                 this = container_of(*new, struct work_atoms, node);
1684                 parent = *new;
1685
1686                 this_comm = thread__comm_str(this->thread);
1687                 cmp = strcmp(comm, this_comm);
1688                 if (cmp > 0) {
1689                         new = &((*new)->rb_left);
1690                 } else if (cmp < 0) {
1691                         new = &((*new)->rb_right);
1692                 } else {
1693                         this->num_merged++;
1694                         this->total_runtime += data->total_runtime;
1695                         this->nb_atoms += data->nb_atoms;
1696                         this->total_lat += data->total_lat;
1697                         list_splice(&data->work_list, &this->work_list);
1698                         if (this->max_lat < data->max_lat) {
1699                                 this->max_lat = data->max_lat;
1700                                 this->max_lat_at = data->max_lat_at;
1701                         }
1702                         zfree(&data);
1703                         return;
1704                 }
1705         }
1706
1707         data->num_merged++;
1708         rb_link_node(&data->node, parent, new);
1709         rb_insert_color(&data->node, root);
1710 }
1711
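/*
 * Unless -p/--pids was given, collapse the per-thread work atoms into
 * per-comm entries so latency is reported per command name.
 */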
1712 static void perf_sched__merge_lat(struct perf_sched *sched)
1713 {
1714         struct work_atoms *data;
1715         struct rb_node *node;
1716
1717         if (sched->skip_merge)
1718                 return;
1719
1720         while ((node = rb_first(&sched->atom_root))) {
1721                 rb_erase(node, &sched->atom_root);
1722                 data = rb_entry(node, struct work_atoms, node);
1723                 __merge_work_atoms(&sched->merged_atom_root, data);
1724         }
1725 }
1726
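/*
 * 'perf sched latency': read the events, merge and sort the work atoms,
 * then print one line per task plus a totals line and any bad-event
 * warnings.
 */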
1727 static int perf_sched__lat(struct perf_sched *sched)
1728 {
1729         struct rb_node *next;
1730
1731         setup_pager();
1732
1733         if (perf_sched__read_events(sched))
1734                 return -1;
1735
1736         perf_sched__merge_lat(sched);
1737         perf_sched__sort_lat(sched);
1738
1739         printf("\n -----------------------------------------------------------------------------------------------------------------\n");
1740         printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at       |\n");
1741         printf(" -----------------------------------------------------------------------------------------------------------------\n");
1742
1743         next = rb_first(&sched->sorted_atom_root);
1744
1745         while (next) {
1746                 struct work_atoms *work_list;
1747
1748                 work_list = rb_entry(next, struct work_atoms, node);
1749                 output_lat_thread(sched, work_list);
1750                 next = rb_next(next);
1751                 thread__zput(work_list->thread);
1752         }
1753
1754         printf(" -----------------------------------------------------------------------------------------------------------------\n");
1755         printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
1756                 (double)sched->all_runtime / 1e6, sched->all_count);
1757
1758         printf(" ---------------------------------------------------\n");
1759
1760         print_bad_events(sched);
1761         printf("\n");
1762
1763         return 0;
1764 }
1765
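/*
 * 'perf sched map' setup: size the compact-mode CPU array to the number
 * of configured CPUs and, if --cpus was given, parse the CPU list that
 * restricts which columns are displayed.
 */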
1766 static int setup_map_cpus(struct perf_sched *sched)
1767 {
1768         struct cpu_map *map;
1769
1770         sched->max_cpu  = sysconf(_SC_NPROCESSORS_CONF);
1771
1772         if (sched->map.comp) {
1773                 sched->map.comp_cpus = zalloc(sched->max_cpu * sizeof(int));
1774                 if (!sched->map.comp_cpus)
1775                         return -1;
1776         }
1777
1778         if (!sched->map.cpus_str)
1779                 return 0;
1780
1781         map = cpu_map__new(sched->map.cpus_str);
1782         if (!map) {
1783                 pr_err("failed to get cpus map from %s\n", sched->map.cpus_str);
1784                 return -1;
1785         }
1786
1787         sched->map.cpus = map;
1788         return 0;
1789 }
1790
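/* Parse --color-pids into a thread map used to highlight those tasks in the map output. */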
1791 static int setup_color_pids(struct perf_sched *sched)
1792 {
1793         struct thread_map *map;
1794
1795         if (!sched->map.color_pids_str)
1796                 return 0;
1797
1798         map = thread_map__new_by_tid_str(sched->map.color_pids_str);
1799         if (!map) {
1800                 pr_err("failed to get thread map from %s\n", sched->map.color_pids_str);
1801                 return -1;
1802         }
1803
1804         sched->map.color_pids = map;
1805         return 0;
1806 }
1807
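/* Parse --color-cpus into a cpu map used to highlight those CPU columns in the map output. */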
1808 static int setup_color_cpus(struct perf_sched *sched)
1809 {
1810         struct cpu_map *map;
1811
1812         if (!sched->map.color_cpus_str)
1813                 return 0;
1814
1815         map = cpu_map__new(sched->map.color_cpus_str);
1816         if (!map) {
1817                 pr_err("failed to get thread map from %s\n", sched->map.color_cpus_str);
1818                 return -1;
1819         }
1820
1821         sched->map.color_cpus = map;
1822         return 0;
1823 }
1824
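/*
 * 'perf sched map': set up the --cpus/--color-pids/--color-cpus filters,
 * then feed the trace through map_switch_event() to print one column per
 * CPU over time.  For example (PIDs/CPUs are hypothetical):
 *
 *     perf sched map --compact --color-pids 1234,5678 --color-cpus 0-3
 */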
1825 static int perf_sched__map(struct perf_sched *sched)
1826 {
1827         if (setup_map_cpus(sched))
1828                 return -1;
1829
1830         if (setup_color_pids(sched))
1831                 return -1;
1832
1833         if (setup_color_cpus(sched))
1834                 return -1;
1835
1836         setup_pager();
1837         if (perf_sched__read_events(sched))
1838                 return -1;
1839         print_bad_events(sched);
1840         return 0;
1841 }
1842
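/*
 * 'perf sched replay': calibrate run/sleep measurement overhead, rebuild
 * the task and wakeup graph from the recorded events, then spawn real
 * threads and re-run the workload -r/--repeat times.
 */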
1843 static int perf_sched__replay(struct perf_sched *sched)
1844 {
1845         unsigned long i;
1846
1847         calibrate_run_measurement_overhead(sched);
1848         calibrate_sleep_measurement_overhead(sched);
1849
1850         test_calibrations(sched);
1851
1852         if (perf_sched__read_events(sched))
1853                 return -1;
1854
1855         printf("nr_run_events:        %ld\n", sched->nr_run_events);
1856         printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
1857         printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);
1858
1859         if (sched->targetless_wakeups)
1860                 printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
1861         if (sched->multitarget_wakeups)
1862                 printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
1863         if (sched->nr_run_events_optimized)
1864                 printf("run atoms optimized: %ld\n",
1865                         sched->nr_run_events_optimized);
1866
1867         print_task_traces(sched);
1868         add_cross_task_wakeups(sched);
1869
1870         create_tasks(sched);
1871         printf("------------------------------------------------------------\n");
1872         for (i = 0; i < sched->replay_repeat; i++)
1873                 run_one_test(sched);
1874
1875         return 0;
1876 }
1877
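/*
 * Parse the comma-separated --sort string into sched->sort_list,
 * rejecting unknown keys; an implicit "pid" key is always added to the
 * separate cmp_pid list.
 */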
1878 static void setup_sorting(struct perf_sched *sched, const struct option *options,
1879                           const char * const usage_msg[])
1880 {
1881         char *tmp, *tok, *str = strdup(sched->sort_order);
1882
1883         for (tok = strtok_r(str, ", ", &tmp);
1884                         tok; tok = strtok_r(NULL, ", ", &tmp)) {
1885                 if (sort_dimension__add(tok, &sched->sort_list) < 0) {
1886                         usage_with_options_msg(usage_msg, options,
1887                                         "Unknown --sort key: `%s'", tok);
1888                 }
1889         }
1890
1891         free(str);
1892
1893         sort_dimension__add("pid", &sched->cmp_pid);
1894 }
1895
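/*
 * 'perf sched record': build an argv for 'perf record' from the sched
 * tracepoints below plus any extra arguments the user passed, and hand
 * it to cmd_record().
 */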
1896 static int __cmd_record(int argc, const char **argv)
1897 {
1898         unsigned int rec_argc, i, j;
1899         const char **rec_argv;
1900         const char * const record_args[] = {
1901                 "record",
1902                 "-a",
1903                 "-R",
1904                 "-m", "1024",
1905                 "-c", "1",
1906                 "-e", "sched:sched_switch",
1907                 "-e", "sched:sched_stat_wait",
1908                 "-e", "sched:sched_stat_sleep",
1909                 "-e", "sched:sched_stat_iowait",
1910                 "-e", "sched:sched_stat_runtime",
1911                 "-e", "sched:sched_process_fork",
1912                 "-e", "sched:sched_wakeup",
1913                 "-e", "sched:sched_wakeup_new",
1914                 "-e", "sched:sched_migrate_task",
1915         };
1916
1917         rec_argc = ARRAY_SIZE(record_args) + argc - 1;
1918         rec_argv = calloc(rec_argc + 1, sizeof(char *));
1919
1920         if (rec_argv == NULL)
1921                 return -ENOMEM;
1922
1923         for (i = 0; i < ARRAY_SIZE(record_args); i++)
1924                 rec_argv[i] = strdup(record_args[i]);
1925
1926         for (j = 1; j < (unsigned int)argc; j++, i++)
1927                 rec_argv[i] = argv[j];
1928
1929         BUG_ON(i != rec_argc);
1930
1931         return cmd_record(i, rec_argv, NULL);
1932 }
1933
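/*
 * Entry point: parse the subcommand ('record', 'latency', 'map',
 * 'replay' or 'script'), install the matching trace_sched_handler ops
 * and dispatch.  'script' is simply aliased to 'perf script'.
 */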
1934 int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
1935 {
1936         const char default_sort_order[] = "avg, max, switch, runtime";
1937         struct perf_sched sched = {
1938                 .tool = {
1939                         .sample          = perf_sched__process_tracepoint_sample,
1940                         .comm            = perf_event__process_comm,
1941                         .lost            = perf_event__process_lost,
1942                         .fork            = perf_sched__process_fork_event,
1943                         .ordered_events = true,
1944                 },
1945                 .cmp_pid              = LIST_HEAD_INIT(sched.cmp_pid),
1946                 .sort_list            = LIST_HEAD_INIT(sched.sort_list),
1947                 .start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
1948                 .work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
1949                 .sort_order           = default_sort_order,
1950                 .replay_repeat        = 10,
1951                 .profile_cpu          = -1,
1952                 .next_shortname1      = 'A',
1953                 .next_shortname2      = '0',
1954                 .skip_merge           = 0,
1955         };
1956         const struct option latency_options[] = {
1957         OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
1958                    "sort by key(s): runtime, switch, avg, max"),
1959         OPT_INCR('v', "verbose", &verbose,
1960                     "be more verbose (show symbol address, etc)"),
1961         OPT_INTEGER('C', "CPU", &sched.profile_cpu,
1962                     "CPU to profile on"),
1963         OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1964                     "dump raw trace in ASCII"),
1965         OPT_BOOLEAN('p', "pids", &sched.skip_merge,
1966                     "latency stats per pid instead of per comm"),
1967         OPT_END()
1968         };
1969         const struct option replay_options[] = {
1970         OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
1971                      "repeat the workload replay N times (-1: infinite)"),
1972         OPT_INCR('v', "verbose", &verbose,
1973                     "be more verbose (show symbol address, etc)"),
1974         OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1975                     "dump raw trace in ASCII"),
1976         OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
1977         OPT_END()
1978         };
1979         const struct option sched_options[] = {
1980         OPT_STRING('i', "input", &input_name, "file",
1981                     "input file name"),
1982         OPT_INCR('v', "verbose", &verbose,
1983                     "be more verbose (show symbol address, etc)"),
1984         OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1985                     "dump raw trace in ASCII"),
1986         OPT_END()
1987         };
1988         const struct option map_options[] = {
1989         OPT_BOOLEAN(0, "compact", &sched.map.comp,
1990                     "map output in compact mode"),
1991         OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids",
1992                    "highlight given pids in map"),
1993         OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus",
1994                     "highlight given CPUs in map"),
1995         OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus",
1996                     "display given CPUs in map"),
1997         OPT_END()
1998         };
1999         const char * const latency_usage[] = {
2000                 "perf sched latency [<options>]",
2001                 NULL
2002         };
2003         const char * const replay_usage[] = {
2004                 "perf sched replay [<options>]",
2005                 NULL
2006         };
2007         const char * const map_usage[] = {
2008                 "perf sched map [<options>]",
2009                 NULL
2010         };
2011         const char *const sched_subcommands[] = { "record", "latency", "map",
2012                                                   "replay", "script", NULL };
2013         const char *sched_usage[] = {
2014                 NULL,
2015                 NULL
2016         };
2017         struct trace_sched_handler lat_ops  = {
2018                 .wakeup_event       = latency_wakeup_event,
2019                 .switch_event       = latency_switch_event,
2020                 .runtime_event      = latency_runtime_event,
2021                 .migrate_task_event = latency_migrate_task_event,
2022         };
2023         struct trace_sched_handler map_ops  = {
2024                 .switch_event       = map_switch_event,
2025         };
2026         struct trace_sched_handler replay_ops  = {
2027                 .wakeup_event       = replay_wakeup_event,
2028                 .switch_event       = replay_switch_event,
2029                 .fork_event         = replay_fork_event,
2030         };
2031         unsigned int i;
2032
2033         for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
2034                 sched.curr_pid[i] = -1;
2035
2036         argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
2037                                         sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2038         if (!argc)
2039                 usage_with_options(sched_usage, sched_options);
2040
2041         /*
2042          * Aliased to 'perf script' for now:
2043          */
2044         if (!strcmp(argv[0], "script"))
2045                 return cmd_script(argc, argv, prefix);
2046
2047         if (!strncmp(argv[0], "rec", 3)) {
2048                 return __cmd_record(argc, argv);
2049         } else if (!strncmp(argv[0], "lat", 3)) {
2050                 sched.tp_handler = &lat_ops;
2051                 if (argc > 1) {
2052                         argc = parse_options(argc, argv, latency_options, latency_usage, 0);
2053                         if (argc)
2054                                 usage_with_options(latency_usage, latency_options);
2055                 }
2056                 setup_sorting(&sched, latency_options, latency_usage);
2057                 return perf_sched__lat(&sched);
2058         } else if (!strcmp(argv[0], "map")) {
2059                 if (argc) {
2060                         argc = parse_options(argc, argv, map_options, map_usage, 0);
2061                         if (argc)
2062                                 usage_with_options(map_usage, map_options);
2063                 }
2064                 sched.tp_handler = &map_ops;
2065                 setup_sorting(&sched, latency_options, latency_usage);
2066                 return perf_sched__map(&sched);
2067         } else if (!strncmp(argv[0], "rep", 3)) {
2068                 sched.tp_handler = &replay_ops;
2069                 if (argc) {
2070                         argc = parse_options(argc, argv, replay_options, replay_usage, 0);
2071                         if (argc)
2072                                 usage_with_options(replay_usage, replay_options);
2073                 }
2074                 return perf_sched__replay(&sched);
2075         } else {
2076                 usage_with_options(sched_usage, sched_options);
2077         }
2078
2079         return 0;
2080 }