3c43a3578f3115885daba420bce1f2bf04383e35
[cascardo/linux.git] / tools / perf / builtin-stat.c
1 /*
2  * builtin-stat.c
3  *
4  * Builtin stat command: Give a precise performance counters summary
5  * overview about any workload, CPU or specific PID.
6  *
7  * Sample output:
8
9    $ perf stat ./hackbench 10
10
11   Time: 0.118
12
13   Performance counter stats for './hackbench 10':
14
15        1708.761321 task-clock                #   11.037 CPUs utilized
16             41,190 context-switches          #    0.024 M/sec
17              6,735 CPU-migrations            #    0.004 M/sec
18             17,318 page-faults               #    0.010 M/sec
19      5,205,202,243 cycles                    #    3.046 GHz
20      3,856,436,920 stalled-cycles-frontend   #   74.09% frontend cycles idle
21      1,600,790,871 stalled-cycles-backend    #   30.75% backend  cycles idle
22      2,603,501,247 instructions              #    0.50  insns per cycle
23                                              #    1.48  stalled cycles per insn
24        484,357,498 branches                  #  283.455 M/sec
25          6,388,934 branch-misses             #    1.32% of all branches
26
27         0.154822978  seconds time elapsed
28
29  *
30  * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
31  *
32  * Improvements and fixes by:
33  *
34  *   Arjan van de Ven <arjan@linux.intel.com>
35  *   Yanmin Zhang <yanmin.zhang@intel.com>
36  *   Wu Fengguang <fengguang.wu@intel.com>
37  *   Mike Galbraith <efault@gmx.de>
38  *   Paul Mackerras <paulus@samba.org>
39  *   Jaswinder Singh Rajput <jaswinder@kernel.org>
40  *
41  * Released under the GPL v2. (and only v2, not any later version)
42  */
43
44 #include "perf.h"
45 #include "builtin.h"
46 #include "util/util.h"
47 #include "util/parse-options.h"
48 #include "util/parse-events.h"
49 #include "util/event.h"
50 #include "util/evlist.h"
51 #include "util/evsel.h"
52 #include "util/debug.h"
53 #include "util/color.h"
54 #include "util/stat.h"
55 #include "util/header.h"
56 #include "util/cpumap.h"
57 #include "util/thread.h"
58 #include "util/thread_map.h"
59
60 #include <sys/prctl.h>
61 #include <locale.h>
62
/* Separator between output columns when none is given explicitly. */
#define DEFAULT_SEPARATOR	" "
/* Placeholders printed in place of a value for unusable counters. */
#define CNTR_NOT_SUPPORTED	"<not supported>"
#define CNTR_NOT_COUNTED	"<not counted>"
66
/*
 * Events measured by default (no -e given): four software counters
 * followed by six hardware counters — matches the standard
 * "perf stat" summary shown in the header comment above.
 */
static struct perf_event_attr default_attrs[] = {

  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK		},
  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES	},
  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS		},
  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS		},

  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES		},
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND	},
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS		},
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS	},
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES		},

};
82
/*
 * Detailed stats (-d), covering the L1 and last level data caches:
 * each cache event is encoded as (cache_id | op << 8 | result << 16).
 */
static struct perf_event_attr detailed_attrs[] = {

  /* L1-dcache read accesses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_L1D		<<  0  |
	(PERF_COUNT_HW_CACHE_OP_READ		<<  8) |
	(PERF_COUNT_HW_CACHE_RESULT_ACCESS	<< 16)				},

  /* L1-dcache read misses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_L1D		<<  0  |
	(PERF_COUNT_HW_CACHE_OP_READ		<<  8) |
	(PERF_COUNT_HW_CACHE_RESULT_MISS	<< 16)				},

  /* last-level-cache read accesses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_LL			<<  0  |
	(PERF_COUNT_HW_CACHE_OP_READ		<<  8) |
	(PERF_COUNT_HW_CACHE_RESULT_ACCESS	<< 16)				},

  /* last-level-cache read misses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_LL			<<  0  |
	(PERF_COUNT_HW_CACHE_OP_READ		<<  8) |
	(PERF_COUNT_HW_CACHE_RESULT_MISS	<< 16)				},
};
112
/*
 * Very detailed stats (-d -d), covering the instruction cache and the TLB caches:
 * same (cache_id | op << 8 | result << 16) encoding as detailed_attrs.
 */
static struct perf_event_attr very_detailed_attrs[] = {

  /* L1-icache read accesses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_L1I		<<  0  |
	(PERF_COUNT_HW_CACHE_OP_READ		<<  8) |
	(PERF_COUNT_HW_CACHE_RESULT_ACCESS	<< 16)				},

  /* L1-icache read misses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_L1I		<<  0  |
	(PERF_COUNT_HW_CACHE_OP_READ		<<  8) |
	(PERF_COUNT_HW_CACHE_RESULT_MISS	<< 16)				},

  /* dTLB read accesses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_DTLB		<<  0  |
	(PERF_COUNT_HW_CACHE_OP_READ		<<  8) |
	(PERF_COUNT_HW_CACHE_RESULT_ACCESS	<< 16)				},

  /* dTLB read misses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_DTLB		<<  0  |
	(PERF_COUNT_HW_CACHE_OP_READ		<<  8) |
	(PERF_COUNT_HW_CACHE_RESULT_MISS	<< 16)				},

  /* iTLB read accesses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_ITLB		<<  0  |
	(PERF_COUNT_HW_CACHE_OP_READ		<<  8) |
	(PERF_COUNT_HW_CACHE_RESULT_ACCESS	<< 16)				},

  /* iTLB read misses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_ITLB		<<  0  |
	(PERF_COUNT_HW_CACHE_OP_READ		<<  8) |
	(PERF_COUNT_HW_CACHE_RESULT_MISS	<< 16)				},

};
155
/*
 * Very, very detailed stats (-d -d -d), adding prefetch events:
 */
static struct perf_event_attr very_very_detailed_attrs[] = {

  /* L1-dcache prefetch accesses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_L1D		<<  0  |
	(PERF_COUNT_HW_CACHE_OP_PREFETCH	<<  8) |
	(PERF_COUNT_HW_CACHE_RESULT_ACCESS	<< 16)				},

  /* L1-dcache prefetch misses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
	 PERF_COUNT_HW_CACHE_L1D		<<  0  |
	(PERF_COUNT_HW_CACHE_OP_PREFETCH	<<  8) |
	(PERF_COUNT_HW_CACHE_RESULT_MISS	<< 16)				},
};
173
174
175
/* List of all counters selected for this run. */
static struct perf_evlist	*evsel_list;

/* Measurement target (pid/tid/cpu/uid); UINT_MAX means no uid filter. */
static struct perf_target	target = {
	.uid	= UINT_MAX,
};

static int			run_idx				=  0;	/* NOTE(review): presumably the current -r repetition index — confirm against option handling */
static int			run_count			=  1;	/* number of runs; >1 enables noise printout */
static bool			no_inherit			= false;	/* when set, events are not inherited by child tasks */
static bool			scale				=  true;	/* scale counts by time enabled/running */
static bool			no_aggr				= false;	/* per-cpu output instead of aggregated counts */
static pid_t			child_pid			= -1;	/* forked workload pid, -1 if none */
static bool			null_run			=  false;	/* NOTE(review): likely the --null flag — confirm */
static int			detailed_run			=  0;	/* -d nesting level (selects the *_detailed_attrs tables) */
static bool			sync_run			=  false;	/* NOTE(review): likely syncs before each run — confirm */
static bool			big_num				=  true;	/* print counts with thousands separators (%' format) */
static int			big_num_opt			=  -1;	/* user's explicit -B choice; -1 == unset */
static const char		*csv_sep			= NULL;	/* field separator for CSV mode */
static bool			csv_output			= false;	/* CSV output mode */
static bool			group				= false;	/* put all counters into a single event group */
static const char		*output_name			= NULL;	/* file name for redirected output */
static FILE			*output				= NULL;	/* stream all printing goes to */
static int			output_fd;				/* NOTE(review): presumably --log-fd — confirm */

/* Polled in run_perf_stat()'s counting loop; set elsewhere (likely a signal handler) to stop. */
static volatile int done = 0;
201
/*
 * Per-evsel private state: noise statistics across repeated runs for
 * the three values read per counter (the raw count plus, when scaling
 * is on, time-enabled and time-running — see create_perf_stat_counter's
 * read_format and the i < 3 loop in read_counter_aggr).
 */
struct perf_stat {
	struct stats	  res_stats[3];
};
205
206 static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
207 {
208         evsel->priv = zalloc(sizeof(struct perf_stat));
209         return evsel->priv == NULL ? -ENOMEM : 0;
210 }
211
212 static void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
213 {
214         free(evsel->priv);
215         evsel->priv = NULL;
216 }
217
/*
 * "Shadow" statistics, indexed by cpu, remembering the counts of
 * reference events (cycles, branches, cache accesses, ...) so that
 * the printout code can derive ratios such as IPC and miss rates.
 * In aggregated mode only slot 0 is used (see update_shadow_stats).
 */
static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
static struct stats runtime_cycles_stats[MAX_NR_CPUS];
static struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS];
static struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS];
static struct stats runtime_branches_stats[MAX_NR_CPUS];
static struct stats runtime_cacherefs_stats[MAX_NR_CPUS];
static struct stats runtime_l1_dcache_stats[MAX_NR_CPUS];
static struct stats runtime_l1_icache_stats[MAX_NR_CPUS];
static struct stats runtime_ll_cache_stats[MAX_NR_CPUS];
static struct stats runtime_itlb_cache_stats[MAX_NR_CPUS];
static struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS];
static struct stats walltime_nsecs_stats;	/* wall-clock time per run */
230
/*
 * Open the counter described by @evsel, either per-cpu (when a cpu
 * target was requested) or per-thread.  @first is the first event in
 * the list: with --group only the leader is started disabled with
 * enable_on_exec, so the whole group kicks in when the workload execs.
 *
 * Returns 0 on success, otherwise the result of the failed open.
 * On EINVAL from kernels that predate the exclude_guest/exclude_host
 * attr bits, retries once with those bits cleared.
 */
static int create_perf_stat_counter(struct perf_evsel *evsel,
				    struct perf_evsel *first)
{
	struct perf_event_attr *attr = &evsel->attr;
	bool exclude_guest_missing = false;
	int ret;

	/* Ask the kernel for enabled/running times so counts can be scaled. */
	if (scale)
		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
				    PERF_FORMAT_TOTAL_TIME_RUNNING;

	attr->inherit = !no_inherit;

retry:
	/* Second pass after an old-kernel EINVAL: drop the guest/host bits. */
	if (exclude_guest_missing)
		evsel->attr.exclude_guest = evsel->attr.exclude_host = 0;

	if (perf_target__has_cpu(&target)) {
		ret = perf_evsel__open_per_cpu(evsel, evsel_list->cpus);
		if (ret)
			goto check_ret;
		return 0;
	}

	if (!perf_target__has_task(&target) && (!group || evsel == first)) {
		/* Start disabled; the workload's exec enables counting. */
		attr->disabled = 1;
		attr->enable_on_exec = 1;
	}

	ret = perf_evsel__open_per_thread(evsel, evsel_list->threads);
	if (!ret)
		return 0;
	/* fall through */
check_ret:
	if (ret && errno == EINVAL) {
		if (!exclude_guest_missing &&
		    (evsel->attr.exclude_guest || evsel->attr.exclude_host)) {
			pr_debug("Old kernel, cannot exclude "
				 "guest or host samples.\n");
			exclude_guest_missing = true;
			goto retry;
		}
	}
	return ret;
}
276
277 /*
278  * Does the counter have nsecs as a unit?
279  */
280 static inline int nsec_counter(struct perf_evsel *evsel)
281 {
282         if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) ||
283             perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
284                 return 1;
285
286         return 0;
287 }
288
/*
 * Update various tracking values we maintain to print
 * more semantic information such as miss/hit ratios,
 * instruction rates, etc:
 *
 * count[0] is the (possibly scaled) raw value of the counter.
 * NOTE(review): everything is recorded into slot [0] even though the
 * shadow arrays are per-cpu and read_counter() calls this once per
 * cpu — confirm whether per-cpu (-A) ratios are intended to work.
 */
static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
{
	if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
		update_stats(&runtime_nsecs_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
		update_stats(&runtime_cycles_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
		update_stats(&runtime_stalled_cycles_front_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
		update_stats(&runtime_stalled_cycles_back_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
		update_stats(&runtime_branches_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
		update_stats(&runtime_cacherefs_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
		update_stats(&runtime_l1_dcache_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
		update_stats(&runtime_l1_icache_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
		update_stats(&runtime_ll_cache_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
		update_stats(&runtime_dtlb_cache_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
		update_stats(&runtime_itlb_cache_stats[0], count[0]);
}
319
/*
 * Read out the results of a single counter:
 * aggregate counts across CPUs in system-wide mode
 *
 * Folds the three aggregated values (count plus the enabled/running
 * times used for scaling) into the evsel's noise statistics and into
 * the shadow stats used by the ratio printouts.
 * Returns 0 on success, -1 if the kernel read failed.
 */
static int read_counter_aggr(struct perf_evsel *counter)
{
	struct perf_stat *ps = counter->priv;
	u64 *count = counter->counts->aggr.values;
	int i;

	if (__perf_evsel__read(counter, evsel_list->cpus->nr,
			       evsel_list->threads->nr, scale) < 0)
		return -1;

	/* Track run-to-run noise of all three values for -r output. */
	for (i = 0; i < 3; i++)
		update_stats(&ps->res_stats[i], count[i]);

	if (verbose) {
		fprintf(output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
			perf_evsel__name(counter), count[0], count[1], count[2]);
	}

	/*
	 * Save the full runtime - to allow normalization during printout:
	 */
	update_shadow_stats(counter, count);

	return 0;
}
349
350 /*
351  * Read out the results of a single counter:
352  * do not aggregate counts across CPUs in system-wide mode
353  */
354 static int read_counter(struct perf_evsel *counter)
355 {
356         u64 *count;
357         int cpu;
358
359         for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) {
360                 if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0)
361                         return -1;
362
363                 count = counter->counts->cpu[cpu].values;
364
365                 update_shadow_stats(counter, count);
366         }
367
368         return 0;
369 }
370
371 static int run_perf_stat(int argc __maybe_unused, const char **argv)
372 {
373         unsigned long long t0, t1;
374         struct perf_evsel *counter, *first;
375         int status = 0;
376         int child_ready_pipe[2], go_pipe[2];
377         const bool forks = (argc > 0);
378         char buf;
379
380         if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
381                 perror("failed to create pipes");
382                 return -1;
383         }
384
385         if (forks) {
386                 if ((child_pid = fork()) < 0)
387                         perror("failed to fork");
388
389                 if (!child_pid) {
390                         close(child_ready_pipe[0]);
391                         close(go_pipe[1]);
392                         fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
393
394                         /*
395                          * Do a dummy execvp to get the PLT entry resolved,
396                          * so we avoid the resolver overhead on the real
397                          * execvp call.
398                          */
399                         execvp("", (char **)argv);
400
401                         /*
402                          * Tell the parent we're ready to go
403                          */
404                         close(child_ready_pipe[1]);
405
406                         /*
407                          * Wait until the parent tells us to go.
408                          */
409                         if (read(go_pipe[0], &buf, 1) == -1)
410                                 perror("unable to read pipe");
411
412                         execvp(argv[0], (char **)argv);
413
414                         perror(argv[0]);
415                         exit(-1);
416                 }
417
418                 if (perf_target__none(&target))
419                         evsel_list->threads->map[0] = child_pid;
420
421                 /*
422                  * Wait for the child to be ready to exec.
423                  */
424                 close(child_ready_pipe[1]);
425                 close(go_pipe[0]);
426                 if (read(child_ready_pipe[0], &buf, 1) == -1)
427                         perror("unable to read pipe");
428                 close(child_ready_pipe[0]);
429         }
430
431         if (group)
432                 perf_evlist__set_leader(evsel_list);
433
434         first = perf_evlist__first(evsel_list);
435
436         list_for_each_entry(counter, &evsel_list->entries, node) {
437                 if (create_perf_stat_counter(counter, first) < 0) {
438                         /*
439                          * PPC returns ENXIO for HW counters until 2.6.37
440                          * (behavior changed with commit b0a873e).
441                          */
442                         if (errno == EINVAL || errno == ENOSYS ||
443                             errno == ENOENT || errno == EOPNOTSUPP ||
444                             errno == ENXIO) {
445                                 if (verbose)
446                                         ui__warning("%s event is not supported by the kernel.\n",
447                                                     perf_evsel__name(counter));
448                                 counter->supported = false;
449                                 continue;
450                         }
451
452                         if (errno == EPERM || errno == EACCES) {
453                                 error("You may not have permission to collect %sstats.\n"
454                                       "\t Consider tweaking"
455                                       " /proc/sys/kernel/perf_event_paranoid or running as root.",
456                                       target.system_wide ? "system-wide " : "");
457                         } else {
458                                 error("open_counter returned with %d (%s). "
459                                       "/bin/dmesg may provide additional information.\n",
460                                        errno, strerror(errno));
461                         }
462                         if (child_pid != -1)
463                                 kill(child_pid, SIGTERM);
464
465                         pr_err("Not all events could be opened.\n");
466                         return -1;
467                 }
468                 counter->supported = true;
469         }
470
471         if (perf_evlist__set_filters(evsel_list)) {
472                 error("failed to set filter with %d (%s)\n", errno,
473                         strerror(errno));
474                 return -1;
475         }
476
477         /*
478          * Enable counters and exec the command:
479          */
480         t0 = rdclock();
481
482         if (forks) {
483                 close(go_pipe[1]);
484                 wait(&status);
485                 if (WIFSIGNALED(status))
486                         psignal(WTERMSIG(status), argv[0]);
487         } else {
488                 while(!done) sleep(1);
489         }
490
491         t1 = rdclock();
492
493         update_stats(&walltime_nsecs_stats, t1 - t0);
494
495         if (no_aggr) {
496                 list_for_each_entry(counter, &evsel_list->entries, node) {
497                         read_counter(counter);
498                         perf_evsel__close_fd(counter, evsel_list->cpus->nr, 1);
499                 }
500         } else {
501                 list_for_each_entry(counter, &evsel_list->entries, node) {
502                         read_counter_aggr(counter);
503                         perf_evsel__close_fd(counter, evsel_list->cpus->nr,
504                                              evsel_list->threads->nr);
505                 }
506         }
507
508         return WEXITSTATUS(status);
509 }
510
511 static void print_noise_pct(double total, double avg)
512 {
513         double pct = rel_stddev_stats(total, avg);
514
515         if (csv_output)
516                 fprintf(output, "%s%.2f%%", csv_sep, pct);
517         else if (pct)
518                 fprintf(output, "  ( +-%6.2f%% )", pct);
519 }
520
521 static void print_noise(struct perf_evsel *evsel, double avg)
522 {
523         struct perf_stat *ps;
524
525         if (run_count == 1)
526                 return;
527
528         ps = evsel->priv;
529         print_noise_pct(stddev_stats(&ps->res_stats[0]), avg);
530 }
531
/*
 * Print one nanosecond-unit counter (cpu-clock/task-clock) in
 * milliseconds, plus -- for task-clock in non-CSV mode -- the implied
 * CPU utilization relative to wall-clock time.
 */
static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg)
{
	double msecs = avg / 1e6;
	char cpustr[16] = { '\0', };
	const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-25s";

	/* In per-cpu (-A) mode each line is prefixed with the cpu id. */
	if (no_aggr)
		sprintf(cpustr, "CPU%*d%s",
			csv_output ? 0 : -4,
			evsel_list->cpus->map[cpu], csv_sep);

	fprintf(output, fmt, cpustr, msecs, csv_sep, perf_evsel__name(evsel));

	if (evsel->cgrp)
		fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);

	/* CSV mode gets no derived-ratio decoration. */
	if (csv_output)
		return;

	if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
		fprintf(output, " # %8.3f CPUs utilized          ",
			avg / avg_stats(&walltime_nsecs_stats));
	else
		fprintf(output, "                                   ");
}
557
/* used for get_ratio_color(): selects a row of severity thresholds */
enum grc_type {
	GRC_STALLED_CYCLES_FE,	/* frontend stalled-cycle percentage */
	GRC_STALLED_CYCLES_BE,	/* backend stalled-cycle percentage */
	GRC_CACHE_MISSES,	/* miss rates: branches, caches, TLBs */
	GRC_MAX_NR		/* number of threshold rows */
};
565
566 static const char *get_ratio_color(enum grc_type type, double ratio)
567 {
568         static const double grc_table[GRC_MAX_NR][3] = {
569                 [GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 },
570                 [GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 },
571                 [GRC_CACHE_MISSES]      = { 20.0, 10.0, 5.0 },
572         };
573         const char *color = PERF_COLOR_NORMAL;
574
575         if (ratio > grc_table[type][0])
576                 color = PERF_COLOR_RED;
577         else if (ratio > grc_table[type][1])
578                 color = PERF_COLOR_MAGENTA;
579         else if (ratio > grc_table[type][2])
580                 color = PERF_COLOR_YELLOW;
581
582         return color;
583 }
584
/*
 * Print frontend-stalled cycles as a percentage of all cycles,
 * color-coded by severity (see get_ratio_color()).
 */
static void print_stalled_cycles_frontend(int cpu,
					  struct perf_evsel *evsel
					  __maybe_unused, double avg)
{
	double total, ratio = 0.0;
	const char *color;

	/* Normalize against the cycle count recorded for this cpu slot. */
	total = avg_stats(&runtime_cycles_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " frontend cycles idle   ");
}
603
/*
 * Print backend-stalled cycles as a percentage of all cycles,
 * color-coded by severity (see get_ratio_color()).
 */
static void print_stalled_cycles_backend(int cpu,
					 struct perf_evsel *evsel
					 __maybe_unused, double avg)
{
	double total, ratio = 0.0;
	const char *color;

	/* Normalize against the cycle count recorded for this cpu slot. */
	total = avg_stats(&runtime_cycles_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " backend  cycles idle   ");
}
622
623 static void print_branch_misses(int cpu,
624                                 struct perf_evsel *evsel __maybe_unused,
625                                 double avg)
626 {
627         double total, ratio = 0.0;
628         const char *color;
629
630         total = avg_stats(&runtime_branches_stats[cpu]);
631
632         if (total)
633                 ratio = avg / total * 100.0;
634
635         color = get_ratio_color(GRC_CACHE_MISSES, ratio);
636
637         fprintf(output, " #  ");
638         color_fprintf(output, color, "%6.2f%%", ratio);
639         fprintf(output, " of all branches        ");
640 }
641
/*
 * Print L1-dcache read misses as a percentage of all L1-dcache read
 * accesses, color-coded by severity (see get_ratio_color()).
 */
static void print_l1_dcache_misses(int cpu,
				   struct perf_evsel *evsel __maybe_unused,
				   double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_l1_dcache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all L1-dcache hits  ");
}
660
/*
 * Print L1-icache read misses as a percentage of all L1-icache read
 * accesses, color-coded by severity (see get_ratio_color()).
 */
static void print_l1_icache_misses(int cpu,
				   struct perf_evsel *evsel __maybe_unused,
				   double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_l1_icache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all L1-icache hits  ");
}
679
/*
 * Print dTLB read misses as a percentage of all dTLB read accesses,
 * color-coded by severity (see get_ratio_color()).
 */
static void print_dtlb_cache_misses(int cpu,
				    struct perf_evsel *evsel __maybe_unused,
				    double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_dtlb_cache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all dTLB cache hits ");
}
698
/*
 * Print iTLB read misses as a percentage of all iTLB read accesses,
 * color-coded by severity (see get_ratio_color()).
 */
static void print_itlb_cache_misses(int cpu,
				    struct perf_evsel *evsel __maybe_unused,
				    double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_itlb_cache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all iTLB cache hits ");
}
717
718 static void print_ll_cache_misses(int cpu,
719                                   struct perf_evsel *evsel __maybe_unused,
720                                   double avg)
721 {
722         double total, ratio = 0.0;
723         const char *color;
724
725         total = avg_stats(&runtime_ll_cache_stats[cpu]);
726
727         if (total)
728                 ratio = avg / total * 100.0;
729
730         color = get_ratio_color(GRC_CACHE_MISSES, ratio);
731
732         fprintf(output, " #  ");
733         color_fprintf(output, color, "%6.2f%%", ratio);
734         fprintf(output, " of all LL-cache hits   ");
735 }
736
737 static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
738 {
739         double total, ratio = 0.0;
740         char cpustr[16] = { '\0', };
741         const char *fmt;
742
743         if (csv_output)
744                 fmt = "%s%.0f%s%s";
745         else if (big_num)
746                 fmt = "%s%'18.0f%s%-25s";
747         else
748                 fmt = "%s%18.0f%s%-25s";
749
750         if (no_aggr)
751                 sprintf(cpustr, "CPU%*d%s",
752                         csv_output ? 0 : -4,
753                         evsel_list->cpus->map[cpu], csv_sep);
754         else
755                 cpu = 0;
756
757         fprintf(output, fmt, cpustr, avg, csv_sep, perf_evsel__name(evsel));
758
759         if (evsel->cgrp)
760                 fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
761
762         if (csv_output)
763                 return;
764
765         if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
766                 total = avg_stats(&runtime_cycles_stats[cpu]);
767
768                 if (total)
769                         ratio = avg / total;
770
771                 fprintf(output, " #   %5.2f  insns per cycle        ", ratio);
772
773                 total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
774                 total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));
775
776                 if (total && avg) {
777                         ratio = total / avg;
778                         fprintf(output, "\n                                             #   %5.2f  stalled cycles per insn", ratio);
779                 }
780
781         } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
782                         runtime_branches_stats[cpu].n != 0) {
783                 print_branch_misses(cpu, evsel, avg);
784         } else if (
785                 evsel->attr.type == PERF_TYPE_HW_CACHE &&
786                 evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1D |
787                                         ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
788                                         ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
789                         runtime_l1_dcache_stats[cpu].n != 0) {
790                 print_l1_dcache_misses(cpu, evsel, avg);
791         } else if (
792                 evsel->attr.type == PERF_TYPE_HW_CACHE &&
793                 evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1I |
794                                         ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
795                                         ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
796                         runtime_l1_icache_stats[cpu].n != 0) {
797                 print_l1_icache_misses(cpu, evsel, avg);
798         } else if (
799                 evsel->attr.type == PERF_TYPE_HW_CACHE &&
800                 evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_DTLB |
801                                         ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
802                                         ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
803                         runtime_dtlb_cache_stats[cpu].n != 0) {
804                 print_dtlb_cache_misses(cpu, evsel, avg);
805         } else if (
806                 evsel->attr.type == PERF_TYPE_HW_CACHE &&
807                 evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_ITLB |
808                                         ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
809                                         ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
810                         runtime_itlb_cache_stats[cpu].n != 0) {
811                 print_itlb_cache_misses(cpu, evsel, avg);
812         } else if (
813                 evsel->attr.type == PERF_TYPE_HW_CACHE &&
814                 evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_LL |
815                                         ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
816                                         ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
817                         runtime_ll_cache_stats[cpu].n != 0) {
818                 print_ll_cache_misses(cpu, evsel, avg);
819         } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
820                         runtime_cacherefs_stats[cpu].n != 0) {
821                 total = avg_stats(&runtime_cacherefs_stats[cpu]);
822
823                 if (total)
824                         ratio = avg * 100 / total;
825
826                 fprintf(output, " # %8.3f %% of all cache refs    ", ratio);
827
828         } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
829                 print_stalled_cycles_frontend(cpu, evsel, avg);
830         } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
831                 print_stalled_cycles_backend(cpu, evsel, avg);
832         } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
833                 total = avg_stats(&runtime_nsecs_stats[cpu]);
834
835                 if (total)
836                         ratio = 1.0 * avg / total;
837
838                 fprintf(output, " # %8.3f GHz                    ", ratio);
839         } else if (runtime_nsecs_stats[cpu].n != 0) {
840                 char unit = 'M';
841
842                 total = avg_stats(&runtime_nsecs_stats[cpu]);
843
844                 if (total)
845                         ratio = 1000.0 * avg / total;
846                 if (ratio < 0.001) {
847                         ratio *= 1000;
848                         unit = 'K';
849                 }
850
851                 fprintf(output, " # %8.3f %c/sec                  ", ratio, unit);
852         } else {
853                 fprintf(output, "                                   ");
854         }
855 }
856
857 /*
858  * Print out the results of a single counter:
859  * aggregated counts in system-wide mode
860  */
861 static void print_counter_aggr(struct perf_evsel *counter)
862 {
863         struct perf_stat *ps = counter->priv;
864         double avg = avg_stats(&ps->res_stats[0]);
865         int scaled = counter->counts->scaled;
866
867         if (scaled == -1) {
868                 fprintf(output, "%*s%s%*s",
869                         csv_output ? 0 : 18,
870                         counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
871                         csv_sep,
872                         csv_output ? 0 : -24,
873                         perf_evsel__name(counter));
874
875                 if (counter->cgrp)
876                         fprintf(output, "%s%s", csv_sep, counter->cgrp->name);
877
878                 fputc('\n', output);
879                 return;
880         }
881
882         if (nsec_counter(counter))
883                 nsec_printout(-1, counter, avg);
884         else
885                 abs_printout(-1, counter, avg);
886
887         print_noise(counter, avg);
888
889         if (csv_output) {
890                 fputc('\n', output);
891                 return;
892         }
893
894         if (scaled) {
895                 double avg_enabled, avg_running;
896
897                 avg_enabled = avg_stats(&ps->res_stats[1]);
898                 avg_running = avg_stats(&ps->res_stats[2]);
899
900                 fprintf(output, " [%5.2f%%]", 100 * avg_running / avg_enabled);
901         }
902         fprintf(output, "\n");
903 }
904
905 /*
906  * Print out the results of a single counter:
907  * does not use aggregated count in system-wide
908  */
909 static void print_counter(struct perf_evsel *counter)
910 {
911         u64 ena, run, val;
912         int cpu;
913
914         for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) {
915                 val = counter->counts->cpu[cpu].val;
916                 ena = counter->counts->cpu[cpu].ena;
917                 run = counter->counts->cpu[cpu].run;
918                 if (run == 0 || ena == 0) {
919                         fprintf(output, "CPU%*d%s%*s%s%*s",
920                                 csv_output ? 0 : -4,
921                                 evsel_list->cpus->map[cpu], csv_sep,
922                                 csv_output ? 0 : 18,
923                                 counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
924                                 csv_sep,
925                                 csv_output ? 0 : -24,
926                                 perf_evsel__name(counter));
927
928                         if (counter->cgrp)
929                                 fprintf(output, "%s%s",
930                                         csv_sep, counter->cgrp->name);
931
932                         fputc('\n', output);
933                         continue;
934                 }
935
936                 if (nsec_counter(counter))
937                         nsec_printout(cpu, counter, val);
938                 else
939                         abs_printout(cpu, counter, val);
940
941                 if (!csv_output) {
942                         print_noise(counter, 1.0);
943
944                         if (run != ena)
945                                 fprintf(output, "  (%.2f%%)",
946                                         100.0 * run / ena);
947                 }
948                 fputc('\n', output);
949         }
950 }
951
952 static void print_stat(int argc, const char **argv)
953 {
954         struct perf_evsel *counter;
955         int i;
956
957         fflush(stdout);
958
959         if (!csv_output) {
960                 fprintf(output, "\n");
961                 fprintf(output, " Performance counter stats for ");
962                 if (!perf_target__has_task(&target)) {
963                         fprintf(output, "\'%s", argv[0]);
964                         for (i = 1; i < argc; i++)
965                                 fprintf(output, " %s", argv[i]);
966                 } else if (target.pid)
967                         fprintf(output, "process id \'%s", target.pid);
968                 else
969                         fprintf(output, "thread id \'%s", target.tid);
970
971                 fprintf(output, "\'");
972                 if (run_count > 1)
973                         fprintf(output, " (%d runs)", run_count);
974                 fprintf(output, ":\n\n");
975         }
976
977         if (no_aggr) {
978                 list_for_each_entry(counter, &evsel_list->entries, node)
979                         print_counter(counter);
980         } else {
981                 list_for_each_entry(counter, &evsel_list->entries, node)
982                         print_counter_aggr(counter);
983         }
984
985         if (!csv_output) {
986                 if (!null_run)
987                         fprintf(output, "\n");
988                 fprintf(output, " %17.9f seconds time elapsed",
989                                 avg_stats(&walltime_nsecs_stats)/1e9);
990                 if (run_count > 1) {
991                         fprintf(output, "                                        ");
992                         print_noise_pct(stddev_stats(&walltime_nsecs_stats),
993                                         avg_stats(&walltime_nsecs_stats));
994                 }
995                 fprintf(output, "\n\n");
996         }
997 }
998
/* Signal deferred by skip_signal() and re-raised in sig_atexit(); -1 = none. */
static volatile int signr = -1;
1000
1001 static void skip_signal(int signo)
1002 {
1003         if(child_pid == -1)
1004                 done = 1;
1005
1006         signr = signo;
1007 }
1008
1009 static void sig_atexit(void)
1010 {
1011         if (child_pid != -1)
1012                 kill(child_pid, SIGTERM);
1013
1014         if (signr == -1)
1015                 return;
1016
1017         signal(signr, SIG_DFL);
1018         kill(getpid(), signr);
1019 }
1020
/* Usage lines printed by usage_with_options() on bad invocation or -h. */
static const char * const stat_usage[] = {
	"perf stat [<options>] [<command>]",
	NULL
};
1025
1026 static int stat__set_big_num(const struct option *opt __maybe_unused,
1027                              const char *s __maybe_unused, int unset)
1028 {
1029         big_num_opt = unset ? 0 : 1;
1030         return 0;
1031 }
1032
/* Set by --append: open the --output file / --log-fd stream in append mode. */
static bool append_file;
1034
1035 static const struct option options[] = {
1036         OPT_CALLBACK('e', "event", &evsel_list, "event",
1037                      "event selector. use 'perf list' to list available events",
1038                      parse_events_option),
1039         OPT_CALLBACK(0, "filter", &evsel_list, "filter",
1040                      "event filter", parse_filter),
1041         OPT_BOOLEAN('i', "no-inherit", &no_inherit,
1042                     "child tasks do not inherit counters"),
1043         OPT_STRING('p', "pid", &target.pid, "pid",
1044                    "stat events on existing process id"),
1045         OPT_STRING('t', "tid", &target.tid, "tid",
1046                    "stat events on existing thread id"),
1047         OPT_BOOLEAN('a', "all-cpus", &target.system_wide,
1048                     "system-wide collection from all CPUs"),
1049         OPT_BOOLEAN('g', "group", &group,
1050                     "put the counters into a counter group"),
1051         OPT_BOOLEAN('c', "scale", &scale,
1052                     "scale/normalize counters"),
1053         OPT_INCR('v', "verbose", &verbose,
1054                     "be more verbose (show counter open errors, etc)"),
1055         OPT_INTEGER('r', "repeat", &run_count,
1056                     "repeat command and print average + stddev (max: 100)"),
1057         OPT_BOOLEAN('n', "null", &null_run,
1058                     "null run - dont start any counters"),
1059         OPT_INCR('d', "detailed", &detailed_run,
1060                     "detailed run - start a lot of events"),
1061         OPT_BOOLEAN('S', "sync", &sync_run,
1062                     "call sync() before starting a run"),
1063         OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL, 
1064                            "print large numbers with thousands\' separators",
1065                            stat__set_big_num),
1066         OPT_STRING('C', "cpu", &target.cpu_list, "cpu",
1067                     "list of cpus to monitor in system-wide"),
1068         OPT_BOOLEAN('A', "no-aggr", &no_aggr,
1069                     "disable CPU count aggregation"),
1070         OPT_STRING('x', "field-separator", &csv_sep, "separator",
1071                    "print counts with custom separator"),
1072         OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
1073                      "monitor event in cgroup name only",
1074                      parse_cgroups),
1075         OPT_STRING('o', "output", &output_name, "file",
1076                     "output file name"),
1077         OPT_BOOLEAN(0, "append", &append_file, "append to the output file"),
1078         OPT_INTEGER(0, "log-fd", &output_fd,
1079                     "log output to fd, instead of stderr"),
1080         OPT_END()
1081 };
1082
1083 /*
1084  * Add default attributes, if there were no attributes specified or
1085  * if -d/--detailed, -d -d or -d -d -d is used:
1086  */
1087 static int add_default_attributes(void)
1088 {
1089         /* Set attrs if no event is selected and !null_run: */
1090         if (null_run)
1091                 return 0;
1092
1093         if (!evsel_list->nr_entries) {
1094                 if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0)
1095                         return -1;
1096         }
1097
1098         /* Detailed events get appended to the event list: */
1099
1100         if (detailed_run <  1)
1101                 return 0;
1102
1103         /* Append detailed run extra attributes: */
1104         if (perf_evlist__add_default_attrs(evsel_list, detailed_attrs) < 0)
1105                 return -1;
1106
1107         if (detailed_run < 2)
1108                 return 0;
1109
1110         /* Append very detailed run extra attributes: */
1111         if (perf_evlist__add_default_attrs(evsel_list, very_detailed_attrs) < 0)
1112                 return -1;
1113
1114         if (detailed_run < 3)
1115                 return 0;
1116
1117         /* Append very, very detailed run extra attributes: */
1118         return perf_evlist__add_default_attrs(evsel_list, very_very_detailed_attrs);
1119 }
1120
/*
 * Entry point for 'perf stat': parse options, set up the output stream,
 * validate the target, allocate per-counter storage, run the workload
 * run_count times and print the aggregated results.
 */
int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
{
	struct perf_evsel *pos;
	int status = -ENOMEM;
	const char *mode;

	/* Honor the user's locale (affects big_num thousands grouping). */
	setlocale(LC_ALL, "");

	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
		return -ENOMEM;

	argc = parse_options(argc, argv, options, stat_usage,
		PARSE_OPT_STOP_AT_NON_OPTION);

	/* Default to stderr; NULL means "-o <file> given, opened below". */
	output = stderr;
	if (output_name && strcmp(output_name, "-"))
		output = NULL;

	/* --output and --log-fd are mutually exclusive. */
	if (output_name && output_fd) {
		fprintf(stderr, "cannot use both --output and --log-fd\n");
		usage_with_options(stat_usage, options);
	}

	if (output_fd < 0) {
		fprintf(stderr, "argument to --log-fd must be a > 0\n");
		usage_with_options(stat_usage, options);
	}

	if (!output) {
		struct timespec tm;
		mode = append_file ? "a" : "w";

		output = fopen(output_name, mode);
		if (!output) {
			perror("failed to create output file");
			return -1;
		}
		/* Record the start time at the top of the log file. */
		clock_gettime(CLOCK_REALTIME, &tm);
		fprintf(output, "# started on %s\n", ctime(&tm.tv_sec));
	} else if (output_fd > 0) {
		mode = append_file ? "a" : "w";
		output = fdopen(output_fd, mode);
		if (!output) {
			perror("Failed opening logfd");
			return -errno;
		}
	}

	if (csv_sep) {
		csv_output = true;
		/* Let the user spell a tab separator as -x \t on the shell. */
		if (!strcmp(csv_sep, "\\t"))
			csv_sep = "\t";
	} else
		csv_sep = DEFAULT_SEPARATOR;

	/*
	 * let the spreadsheet do the pretty-printing
	 */
	if (csv_output) {
		/* User explicitly passed -B? */
		if (big_num_opt == 1) {
			fprintf(stderr, "-B option not supported with -x\n");
			usage_with_options(stat_usage, options);
		} else /* Nope, so disable big number formatting */
			big_num = false;
	} else if (big_num_opt == 0) /* User passed --no-big-num */
		big_num = false;

	/* Need either a workload to fork or an existing pid/tid target. */
	if (!argc && !perf_target__has_task(&target))
		usage_with_options(stat_usage, options);
	if (run_count <= 0)
		usage_with_options(stat_usage, options);

	/* no_aggr, cgroup are for system-wide only */
	if ((no_aggr || nr_cgroups) && !perf_target__has_cpu(&target)) {
		fprintf(stderr, "both cgroup and no-aggregation "
			"modes only available in system-wide mode\n");

		usage_with_options(stat_usage, options);
	}

	if (add_default_attributes())
		goto out;

	perf_target__validate(&target);

	if (perf_evlist__create_maps(evsel_list, &target) < 0) {
		if (perf_target__has_task(&target))
			pr_err("Problems finding threads of monitor\n");
		if (perf_target__has_cpu(&target))
			perror("failed to parse CPUs map");

		usage_with_options(stat_usage, options);
		return -1;
	}

	/* Per-event stat/count storage, sized for every CPU in the map. */
	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_evsel__alloc_stat_priv(pos) < 0 ||
		    perf_evsel__alloc_counts(pos, evsel_list->cpus->nr) < 0)
			goto out_free_fd;
	}

	/*
	 * We dont want to block the signals - that would cause
	 * child tasks to inherit that and Ctrl-C would not work.
	 * What we want is for Ctrl-C to work in the exec()-ed
	 * task, but being ignored by perf stat itself:
	 */
	atexit(sig_atexit);
	signal(SIGINT,  skip_signal);
	signal(SIGALRM, skip_signal);
	signal(SIGABRT, skip_signal);

	status = 0;
	for (run_idx = 0; run_idx < run_count; run_idx++) {
		if (run_count != 1 && verbose)
			fprintf(output, "[ perf stat: executing run #%d ... ]\n",
				run_idx + 1);

		/* -S/--sync: flush filesystem buffers before each run. */
		if (sync_run)
			sync();

		status = run_perf_stat(argc, argv);
	}

	/* NOTE(review): a failed run mid-loop still falls through to
	 * printing unless status is exactly -1 — presumably intentional
	 * for partial results; verify against run_perf_stat()'s contract. */
	if (status != -1)
		print_stat(argc, argv);
out_free_fd:
	list_for_each_entry(pos, &evsel_list->entries, node)
		perf_evsel__free_stat_priv(pos);
	perf_evlist__delete_maps(evsel_list);
out:
	perf_evlist__delete(evsel_list);
	return status;
}