Merge remote-tracking branch 'spi/topic/core' into spi-next
[cascardo/linux.git] / tools / perf / builtin-trace.c
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 #include "callchain.h"
38 #include "syscalltbl.h"
39 #include "rb_resort.h"
40
41 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
42 #include <stdlib.h>
43 #include <linux/err.h>
44 #include <linux/filter.h>
45 #include <linux/audit.h>
46 #include <linux/random.h>
47 #include <linux/stringify.h>
48
49 #ifndef O_CLOEXEC
50 # define O_CLOEXEC              02000000
51 #endif
52
53 struct trace {
54         struct perf_tool        tool;
55         struct syscalltbl       *sctbl;
56         struct {
57                 int             max;
58                 struct syscall  *table;
59                 struct {
60                         struct perf_evsel *sys_enter,
61                                           *sys_exit;
62                 }               events;
63         } syscalls;
64         struct record_opts      opts;
65         struct perf_evlist      *evlist;
66         struct machine          *host;
67         struct thread           *current;
68         u64                     base_time;
69         FILE                    *output;
70         unsigned long           nr_events;
71         struct strlist          *ev_qualifier;
72         struct {
73                 size_t          nr;
74                 int             *entries;
75         }                       ev_qualifier_ids;
76         struct intlist          *tid_list;
77         struct intlist          *pid_list;
78         struct {
79                 size_t          nr;
80                 pid_t           *entries;
81         }                       filter_pids;
82         double                  duration_filter;
83         double                  runtime_ms;
84         struct {
85                 u64             vfs_getname,
86                                 proc_getname;
87         } stats;
88         unsigned int            max_stack;
89         unsigned int            min_stack;
90         bool                    not_ev_qualifier;
91         bool                    live;
92         bool                    full_time;
93         bool                    sched;
94         bool                    multiple_threads;
95         bool                    summary;
96         bool                    summary_only;
97         bool                    show_comm;
98         bool                    show_tool_stats;
99         bool                    trace_syscalls;
100         bool                    kernel_syscallchains;
101         bool                    force;
102         bool                    vfs_getname;
103         int                     trace_pgfaults;
104         int                     open_id;
105 };
106
107 struct tp_field {
108         int offset;
109         union {
110                 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
111                 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
112         };
113 };
114
115 #define TP_UINT_FIELD(bits) \
116 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
117 { \
118         u##bits value; \
119         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
120         return value;  \
121 }
122
123 TP_UINT_FIELD(8);
124 TP_UINT_FIELD(16);
125 TP_UINT_FIELD(32);
126 TP_UINT_FIELD(64);
127
128 #define TP_UINT_FIELD__SWAPPED(bits) \
129 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
130 { \
131         u##bits value; \
132         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
133         return bswap_##bits(value);\
134 }
135
136 TP_UINT_FIELD__SWAPPED(16);
137 TP_UINT_FIELD__SWAPPED(32);
138 TP_UINT_FIELD__SWAPPED(64);
139
140 static int tp_field__init_uint(struct tp_field *field,
141                                struct format_field *format_field,
142                                bool needs_swap)
143 {
144         field->offset = format_field->offset;
145
146         switch (format_field->size) {
147         case 1:
148                 field->integer = tp_field__u8;
149                 break;
150         case 2:
151                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
152                 break;
153         case 4:
154                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
155                 break;
156         case 8:
157                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
158                 break;
159         default:
160                 return -1;
161         }
162
163         return 0;
164 }
165
166 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
167 {
168         return sample->raw_data + field->offset;
169 }
170
171 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
172 {
173         field->offset = format_field->offset;
174         field->pointer = tp_field__ptr;
175         return 0;
176 }
177
178 struct syscall_tp {
179         struct tp_field id;
180         union {
181                 struct tp_field args, ret;
182         };
183 };
184
185 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
186                                           struct tp_field *field,
187                                           const char *name)
188 {
189         struct format_field *format_field = perf_evsel__field(evsel, name);
190
191         if (format_field == NULL)
192                 return -1;
193
194         return tp_field__init_uint(field, format_field, evsel->needs_swap);
195 }
196
/*
 * Initialize the integer accessor stored in member 'name' of the
 * struct syscall_tp hanging off evsel->priv, using the tracepoint field of
 * the same name.  Evaluates to the perf_evsel__init_tp_uint_field() result.
 * 'evsel' is parenthesized so that any pointer expression is accepted; it
 * is still evaluated twice.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
        ({ struct syscall_tp *sc = (evsel)->priv;\
           perf_evsel__init_tp_uint_field((evsel), &sc->name, #name); })
200
/*
 * Look up the tracepoint field @name of @evsel and initialize @field as a
 * pointer accessor for it.
 *
 * Returns -1 when the field does not exist, otherwise whatever
 * tp_field__init_ptr() returns.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
                                         struct tp_field *field,
                                         const char *name)
{
        struct format_field *fmt = perf_evsel__field(evsel, name);

        if (!fmt)
                return -1;

        return tp_field__init_ptr(field, fmt);
}
212
/*
 * Initialize the pointer accessor stored in member 'name' of the
 * struct syscall_tp hanging off evsel->priv, using the tracepoint field of
 * the same name.  Evaluates to the perf_evsel__init_tp_ptr_field() result.
 * 'evsel' is parenthesized so that any pointer expression is accepted; it
 * is still evaluated twice.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
        ({ struct syscall_tp *sc = (evsel)->priv;\
           perf_evsel__init_tp_ptr_field((evsel), &sc->name, #name); })
216
217 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
218 {
219         zfree(&evsel->priv);
220         perf_evsel__delete(evsel);
221 }
222
223 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
224 {
225         evsel->priv = malloc(sizeof(struct syscall_tp));
226         if (evsel->priv != NULL) {
227                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
228                         goto out_delete;
229
230                 evsel->handler = handler;
231                 return 0;
232         }
233
234         return -ENOMEM;
235
236 out_delete:
237         zfree(&evsel->priv);
238         return -ENOENT;
239 }
240
/*
 * Create the tracepoint evsel for syscall entry or exit (@direction names
 * the event) and initialize it with @handler.
 *
 * "raw_syscalls" is tried first, then "syscalls" as a fallback.
 * Returns the evsel, or NULL when neither tracepoint can be created or the
 * syscall_tp setup fails.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
        struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

        if (IS_ERR(evsel)) {
                /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
                evsel = perf_evsel__newtp("syscalls", direction);
                if (IS_ERR(evsel))
                        return NULL;
        }

        if (perf_evsel__init_syscall_tp(evsel, handler) == 0)
                return evsel;

        perf_evsel__delete_priv(evsel);
        return NULL;
}
261
/*
 * Read the integer syscall tracepoint field 'name' from 'sample' through
 * the accessor stored in the struct syscall_tp at evsel->priv.
 * 'evsel' is parenthesized so that any pointer expression is accepted.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
        ({ struct syscall_tp *fields = (evsel)->priv; \
           fields->name.integer(&fields->name, sample); })
265
/*
 * Obtain a pointer to the syscall tracepoint field 'name' inside 'sample'
 * through the accessor stored in the struct syscall_tp at evsel->priv.
 * 'evsel' is parenthesized so that any pointer expression is accepted.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
        ({ struct syscall_tp *fields = (evsel)->priv; \
           fields->name.pointer(&fields->name, sample); })
269
270 struct syscall_arg {
271         unsigned long val;
272         struct thread *thread;
273         struct trace  *trace;
274         void          *parm;
275         u8            idx;
276         u8            mask;
277 };
278
/*
 * A table of names for a numeric argument.  The name of value v is
 * entries[v - offset], valid while 0 <= v - offset < nr_entries.
 */
struct strarray {
        int         offset;     /* value that maps to entries[0] */
        int         nr_entries; /* number of slots in entries[] */
        const char **entries;
};
284
/*
 * Defines 'strarray__<array>' describing the string table 'array'; the
 * offset is left at zero, so value 0 maps to array[0].
 */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
        .entries    = array, \
        .nr_entries = ARRAY_SIZE(array), \
}
289
/*
 * Defines 'strarray__<array>' describing the string table 'array' whose
 * first entry names the value 'off'.
 */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
        .entries    = array, \
        .nr_entries = ARRAY_SIZE(array), \
        .offset     = off, \
}
295
296 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
297                                                 const char *intfmt,
298                                                 struct syscall_arg *arg)
299 {
300         struct strarray *sa = arg->parm;
301         int idx = arg->val - sa->offset;
302
303         if (idx < 0 || idx >= sa->nr_entries)
304                 return scnprintf(bf, size, intfmt, arg->val);
305
306         return scnprintf(bf, size, "%s", sa->entries[idx]);
307 }
308
/*
 * strarray formatter that prints values without a name as a signed decimal.
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
                                              struct syscall_arg *arg)
{
        const char *unnamed_fmt = "%d";

        return __syscall_arg__scnprintf_strarray(bf, size, unnamed_fmt, arg);
}
314
315 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
316
317 #if defined(__i386__) || defined(__x86_64__)
318 /*
319  * FIXME: Make this available to all arches as soon as the ioctl beautifier
320  *        gets rewritten to support all arches.
321  */
/*
 * strarray formatter that prints values without a name as 0x-prefixed hex.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
                                                 struct syscall_arg *arg)
{
        const char *unnamed_fmt = "%#x";

        return __syscall_arg__scnprintf_strarray(bf, size, unnamed_fmt, arg);
}
327
328 #define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
329 #endif /* defined(__i386__) || defined(__x86_64__) */
330
331 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
332                                         struct syscall_arg *arg);
333
334 #define SCA_FD syscall_arg__scnprintf_fd
335
336 #ifndef AT_FDCWD
337 #define AT_FDCWD        -100
338 #endif
339
340 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
341                                            struct syscall_arg *arg)
342 {
343         int fd = arg->val;
344
345         if (fd == AT_FDCWD)
346                 return scnprintf(bf, size, "CWD");
347
348         return syscall_arg__scnprintf_fd(bf, size, arg);
349 }
350
351 #define SCA_FDAT syscall_arg__scnprintf_fd_at
352
353 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
354                                               struct syscall_arg *arg);
355
356 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
357
358 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
359                                          struct syscall_arg *arg)
360 {
361         return scnprintf(bf, size, "%#lx", arg->val);
362 }
363
364 #define SCA_HEX syscall_arg__scnprintf_hex
365
366 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
367                                          struct syscall_arg *arg)
368 {
369         return scnprintf(bf, size, "%d", arg->val);
370 }
371
372 #define SCA_INT syscall_arg__scnprintf_int
373
/* Names for the 'cmd' argument of bpf(), indexed by command number. */
static const char *bpf_cmd[] = {
        [0] = "MAP_CREATE",
        [1] = "MAP_LOOKUP_ELEM",
        [2] = "MAP_UPDATE_ELEM",
        [3] = "MAP_DELETE_ELEM",
        [4] = "MAP_GET_NEXT_KEY",
        [5] = "PROG_LOAD",
};
378 static DEFINE_STRARRAY(bpf_cmd);
379
/* Names for the 'op' argument of epoll_ctl(); entry 0 names the first op. */
static const char *epoll_ctl_ops[] = {
        "ADD",
        "DEL",
        "MOD",
};
381 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
382
/* Names for the 'which' argument of getitimer()/setitimer(). */
static const char *itimers[] = {
        "REAL",
        "VIRTUAL",
        "PROF",
};
384 static DEFINE_STRARRAY(itimers);
385
/* Names for the 'option' argument of keyctl(), indexed by option number. */
static const char *keyctl_options[] = {
        /*  0 */ "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE",
        /*  4 */ "CHOWN", "SETPERM", "DESCRIBE", "CLEAR",
        /*  8 */ "LINK", "UNLINK", "SEARCH", "READ",
        /* 12 */ "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
        /* 16 */ "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
        /* 20 */ "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
393 static DEFINE_STRARRAY(keyctl_options);
394
/*
 * Names for the 'whence' argument of lseek().  "DATA" and "HOLE" are only
 * compiled in when the headers in use define SEEK_DATA / SEEK_HOLE.
 */
static const char *whences[] = {
        "SET",
        "CUR",
        "END",
#ifdef SEEK_DATA
        "DATA",
#endif
#ifdef SEEK_HOLE
        "HOLE",
#endif
};
403 static DEFINE_STRARRAY(whences);
404
/* Names for the 'cmd' argument of fcntl(), indexed by command number. */
static const char *fcntl_cmds[] = {
        /*  0 */ "DUPFD", "GETFD", "SETFD", "GETFL",
        /*  4 */ "SETFL", "GETLK", "SETLK", "SETLKW",
        /*  8 */ "SETOWN", "GETOWN", "SETSIG", "GETSIG",
        /* 12 */ "F_GETLK64", "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX",
        /* 16 */ "F_GETOWN_EX", "F_GETOWNER_UIDS",
};
411 static DEFINE_STRARRAY(fcntl_cmds);
412
/* Names for the 'resource' argument of the rlimit syscalls. */
static const char *rlimit_resources[] = {
        /*  0 */ "CPU", "FSIZE", "DATA", "STACK",
        /*  4 */ "CORE", "RSS", "NPROC", "NOFILE",
        /*  8 */ "MEMLOCK", "AS", "LOCKS", "SIGPENDING",
        /* 12 */ "MSGQUEUE", "NICE", "RTPRIO", "RTTIME",
};
418 static DEFINE_STRARRAY(rlimit_resources);
419
/* Names for the 'how' argument of rt_sigprocmask(). */
static const char *sighow[] = {
        "BLOCK",
        "UNBLOCK",
        "SETMASK",
};
421 static DEFINE_STRARRAY(sighow);
422
/* Names for the clock id argument of clock_gettime(). */
static const char *clockid[] = {
        /*  0 */ "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID",
        /*  3 */ "THREAD_CPUTIME_ID", "MONOTONIC_RAW", "REALTIME_COARSE",
        /*  6 */ "MONOTONIC_COARSE", "BOOTTIME", "REALTIME_ALARM",
        /*  9 */ "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
428 static DEFINE_STRARRAY(clockid);
429
/*
 * Names for the address family argument of socket()/socketpair(), indexed
 * by the AF_* number.  Family 28 (MPLS) must be present: without it "CAN"
 * and every later name would sit one slot below its AF_* value.
 */
static const char *socket_families[] = {
        /*  0 */ "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
        /*  7 */ "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
        /* 14 */ "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
        /* 21 */ "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB",
        /* 28 */ "MPLS", "CAN", "TIPC", "BLUETOOTH", "IUCV", "RXRPC", "ISDN",
        /* 35 */ "PHONET", "IEEE802154", "CAIF", "ALG", "NFC", "VSOCK",
};
438 static DEFINE_STRARRAY(socket_families);
439
440 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
441                                                  struct syscall_arg *arg)
442 {
443         size_t printed = 0;
444         int mode = arg->val;
445
446         if (mode == F_OK) /* 0 */
447                 return scnprintf(bf, size, "F");
448 #define P_MODE(n) \
449         if (mode & n##_OK) { \
450                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
451                 mode &= ~n##_OK; \
452         }
453
454         P_MODE(R);
455         P_MODE(W);
456         P_MODE(X);
457 #undef P_MODE
458
459         if (mode)
460                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
461
462         return printed;
463 }
464
465 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
466
467 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
468                                               struct syscall_arg *arg);
469
470 #define SCA_FILENAME syscall_arg__scnprintf_filename
471
472 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
473                                                 struct syscall_arg *arg)
474 {
475         int printed = 0, flags = arg->val;
476
477 #define P_FLAG(n) \
478         if (flags & O_##n) { \
479                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
480                 flags &= ~O_##n; \
481         }
482
483         P_FLAG(CLOEXEC);
484         P_FLAG(NONBLOCK);
485 #undef P_FLAG
486
487         if (flags)
488                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
489
490         return printed;
491 }
492
493 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
494
495 #if defined(__i386__) || defined(__x86_64__)
496 /*
497  * FIXME: Make this available to all arches.
498  */
#ifndef TCGETS
#define TCGETS          0x5401
#endif

/*
 * Names of the tty ioctl commands, indexed by (cmd - TCGETS), TCGETS being
 * the first command of the 0x54xx range.
 *
 * The numbering has holes (e.g. 0x5426), so each run of names that follows
 * a hole starts with a designator computed from the command's own number;
 * slots without a name stay NULL.
 */
static const char *tioctls[] = {
        "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
        "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
        "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
        "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
        "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
        "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
        "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP",
        [0x5427 - TCGETS] = "TIOCSBRK",
        "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
        "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
        "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
        "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK",
        [0x5440 - TCGETS] = "TIOCGEXCL",
        [0x5450 - TCGETS] = "FIONCLEX",
        "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
        "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
        "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
        "TIOCMIWAIT", "TIOCGICOUNT",
        [0x5460 - TCGETS] = "FIOQSIZE",
};
518
519 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
520 #endif /* defined(__i386__) || defined(__x86_64__) */
521
522 #ifndef GRND_NONBLOCK
523 #define GRND_NONBLOCK   0x0001
524 #endif
525 #ifndef GRND_RANDOM
526 #define GRND_RANDOM     0x0002
527 #endif
528
529 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
530                                                    struct syscall_arg *arg)
531 {
532         int printed = 0, flags = arg->val;
533
534 #define P_FLAG(n) \
535         if (flags & GRND_##n) { \
536                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
537                 flags &= ~GRND_##n; \
538         }
539
540         P_FLAG(RANDOM);
541         P_FLAG(NONBLOCK);
542 #undef P_FLAG
543
544         if (flags)
545                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
546
547         return printed;
548 }
549
550 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
551
/*
 * Initializer fragment for a syscall_fmts[] entry: argument number 'arg' is
 * printed through SCA_STRARRAY using strarray__<array>.  'name' is not
 * expanded anywhere; it only labels the argument at the point of use.
 */
#define STRARRAY(arg, name, array) \
          .arg_parm      = { [arg] = &strarray__##array, }, \
          .arg_scnprintf = { [arg] = SCA_STRARRAY, }
555
556 #include "trace/beauty/eventfd.c"
557 #include "trace/beauty/flock.c"
558 #include "trace/beauty/futex_op.c"
559 #include "trace/beauty/mmap.c"
560 #include "trace/beauty/mode_t.c"
561 #include "trace/beauty/msg_flags.c"
562 #include "trace/beauty/open_flags.c"
563 #include "trace/beauty/perf_event_open.c"
564 #include "trace/beauty/pid.c"
565 #include "trace/beauty/sched_policy.c"
566 #include "trace/beauty/seccomp.c"
567 #include "trace/beauty/signum.c"
568 #include "trace/beauty/socket_type.c"
569 #include "trace/beauty/waitid_options.c"
570
571 static struct syscall_fmt {
572         const char *name;
573         const char *alias;
574         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
575         void       *arg_parm[6];
576         bool       errmsg;
577         bool       errpid;
578         bool       timeout;
579         bool       hexret;
580 } syscall_fmts[] = {
581         { .name     = "access",     .errmsg = true,
582           .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
583         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
584         { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
585         { .name     = "brk",        .hexret = true,
586           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
587         { .name     = "chdir",      .errmsg = true, },
588         { .name     = "chmod",      .errmsg = true, },
589         { .name     = "chroot",     .errmsg = true, },
590         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
591         { .name     = "clone",      .errpid = true, },
592         { .name     = "close",      .errmsg = true,
593           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
594         { .name     = "connect",    .errmsg = true, },
595         { .name     = "creat",      .errmsg = true, },
596         { .name     = "dup",        .errmsg = true, },
597         { .name     = "dup2",       .errmsg = true, },
598         { .name     = "dup3",       .errmsg = true, },
599         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
600         { .name     = "eventfd2",   .errmsg = true,
601           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
602         { .name     = "faccessat",  .errmsg = true, },
603         { .name     = "fadvise64",  .errmsg = true, },
604         { .name     = "fallocate",  .errmsg = true, },
605         { .name     = "fchdir",     .errmsg = true, },
606         { .name     = "fchmod",     .errmsg = true, },
607         { .name     = "fchmodat",   .errmsg = true,
608           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
609         { .name     = "fchown",     .errmsg = true, },
610         { .name     = "fchownat",   .errmsg = true,
611           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
612         { .name     = "fcntl",      .errmsg = true,
613           .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
614           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
615         { .name     = "fdatasync",  .errmsg = true, },
616         { .name     = "flock",      .errmsg = true,
617           .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
618         { .name     = "fsetxattr",  .errmsg = true, },
619         { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
620         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
621         { .name     = "fstatfs",    .errmsg = true, },
622         { .name     = "fsync",    .errmsg = true, },
623         { .name     = "ftruncate", .errmsg = true, },
624         { .name     = "futex",      .errmsg = true,
625           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
626         { .name     = "futimesat", .errmsg = true,
627           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
628         { .name     = "getdents",   .errmsg = true, },
629         { .name     = "getdents64", .errmsg = true, },
630         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
631         { .name     = "getpid",     .errpid = true, },
632         { .name     = "getpgid",    .errpid = true, },
633         { .name     = "getppid",    .errpid = true, },
634         { .name     = "getrandom",  .errmsg = true,
635           .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
636         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
637         { .name     = "getxattr",   .errmsg = true, },
638         { .name     = "inotify_add_watch",          .errmsg = true, },
639         { .name     = "ioctl",      .errmsg = true,
640           .arg_scnprintf = {
641 #if defined(__i386__) || defined(__x86_64__)
642 /*
643  * FIXME: Make this available to all arches.
644  */
645                              [1] = SCA_STRHEXARRAY, /* cmd */
646                              [2] = SCA_HEX, /* arg */ },
647           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
648 #else
649                              [2] = SCA_HEX, /* arg */ }, },
650 #endif
651         { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
652         { .name     = "kill",       .errmsg = true,
653           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
654         { .name     = "lchown",    .errmsg = true, },
655         { .name     = "lgetxattr",  .errmsg = true, },
656         { .name     = "linkat",     .errmsg = true,
657           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
658         { .name     = "listxattr",  .errmsg = true, },
659         { .name     = "llistxattr", .errmsg = true, },
660         { .name     = "lremovexattr",  .errmsg = true, },
661         { .name     = "lseek",      .errmsg = true,
662           .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
663           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
664         { .name     = "lsetxattr",  .errmsg = true, },
665         { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
666         { .name     = "lsxattr",    .errmsg = true, },
667         { .name     = "madvise",    .errmsg = true,
668           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
669                              [2] = SCA_MADV_BHV, /* behavior */ }, },
670         { .name     = "mkdir",    .errmsg = true, },
671         { .name     = "mkdirat",    .errmsg = true,
672           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
673         { .name     = "mknod",      .errmsg = true, },
674         { .name     = "mknodat",    .errmsg = true,
675           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
676         { .name     = "mlock",      .errmsg = true,
677           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
678         { .name     = "mlockall",   .errmsg = true,
679           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
680         { .name     = "mmap",       .hexret = true,
681           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
682                              [2] = SCA_MMAP_PROT, /* prot */
683                              [3] = SCA_MMAP_FLAGS, /* flags */ }, },
684         { .name     = "mprotect",   .errmsg = true,
685           .arg_scnprintf = { [0] = SCA_HEX, /* start */
686                              [2] = SCA_MMAP_PROT, /* prot */ }, },
687         { .name     = "mq_unlink", .errmsg = true,
688           .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
689         { .name     = "mremap",     .hexret = true,
690           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
691                              [3] = SCA_MREMAP_FLAGS, /* flags */
692                              [4] = SCA_HEX, /* new_addr */ }, },
693         { .name     = "munlock",    .errmsg = true,
694           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
695         { .name     = "munmap",     .errmsg = true,
696           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
697         { .name     = "name_to_handle_at", .errmsg = true,
698           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
699         { .name     = "newfstatat", .errmsg = true,
700           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
701         { .name     = "open",       .errmsg = true,
702           .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
703         { .name     = "open_by_handle_at", .errmsg = true,
704           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
705                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
706         { .name     = "openat",     .errmsg = true,
707           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
708                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
709         { .name     = "perf_event_open", .errmsg = true,
710           .arg_scnprintf = { [2] = SCA_INT, /* cpu */
711                              [3] = SCA_FD,  /* group_fd */
712                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
713         { .name     = "pipe2",      .errmsg = true,
714           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
715         { .name     = "poll",       .errmsg = true, .timeout = true, },
716         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
717         { .name     = "pread",      .errmsg = true, .alias = "pread64", },
718         { .name     = "preadv",     .errmsg = true, .alias = "pread", },
719         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
720         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
721         { .name     = "pwritev",    .errmsg = true, },
722         { .name     = "read",       .errmsg = true, },
723         { .name     = "readlink",   .errmsg = true, },
724         { .name     = "readlinkat", .errmsg = true,
725           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
726         { .name     = "readv",      .errmsg = true, },
727         { .name     = "recvfrom",   .errmsg = true,
728           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
729         { .name     = "recvmmsg",   .errmsg = true,
730           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
731         { .name     = "recvmsg",    .errmsg = true,
732           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
733         { .name     = "removexattr", .errmsg = true, },
734         { .name     = "renameat",   .errmsg = true,
735           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
736         { .name     = "rmdir",    .errmsg = true, },
737         { .name     = "rt_sigaction", .errmsg = true,
738           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
739         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
740         { .name     = "rt_sigqueueinfo", .errmsg = true,
741           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
742         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
743           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
744         { .name     = "sched_setscheduler",   .errmsg = true,
745           .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
746         { .name     = "seccomp", .errmsg = true,
747           .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
748                              [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
749         { .name     = "select",     .errmsg = true, .timeout = true, },
750         { .name     = "sendmmsg",    .errmsg = true,
751           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
752         { .name     = "sendmsg",    .errmsg = true,
753           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
754         { .name     = "sendto",     .errmsg = true,
755           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
756         { .name     = "set_tid_address", .errpid = true, },
757         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
758         { .name     = "setpgid",    .errmsg = true, },
759         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
760         { .name     = "setxattr",   .errmsg = true, },
761         { .name     = "shutdown",   .errmsg = true, },
762         { .name     = "socket",     .errmsg = true,
763           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
764                              [1] = SCA_SK_TYPE, /* type */ },
765           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
766         { .name     = "socketpair", .errmsg = true,
767           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
768                              [1] = SCA_SK_TYPE, /* type */ },
769           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
770         { .name     = "stat",       .errmsg = true, .alias = "newstat", },
771         { .name     = "statfs",     .errmsg = true, },
772         { .name     = "swapoff",    .errmsg = true,
773           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
774         { .name     = "swapon",     .errmsg = true,
775           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
776         { .name     = "symlinkat",  .errmsg = true,
777           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
778         { .name     = "tgkill",     .errmsg = true,
779           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
780         { .name     = "tkill",      .errmsg = true,
781           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
782         { .name     = "truncate",   .errmsg = true, },
783         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
784         { .name     = "unlinkat",   .errmsg = true,
785           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
786         { .name     = "utime",  .errmsg = true, },
787         { .name     = "utimensat",  .errmsg = true,
788           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
789         { .name     = "utimes",  .errmsg = true, },
790         { .name     = "vmsplice",  .errmsg = true, },
791         { .name     = "wait4",      .errpid = true,
792           .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
793         { .name     = "waitid",     .errpid = true,
794           .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
795         { .name     = "write",      .errmsg = true, },
796         { .name     = "writev",     .errmsg = true, },
797 };
798
799 static int syscall_fmt__cmp(const void *name, const void *fmtp)
800 {
801         const struct syscall_fmt *fmt = fmtp;
802         return strcmp(name, fmt->name);
803 }
804
805 static struct syscall_fmt *syscall_fmt__find(const char *name)
806 {
807         const int nmemb = ARRAY_SIZE(syscall_fmts);
808         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
809 }
810
/*
 * Per syscall-id information, filled in by trace__read_syscall_info().
 */
struct syscall {
        /* sys_enter_<name> tracepoint format, may be an ERR_PTR() value */
        struct event_format *tp_format;
        /* number of arguments, not counting a leading syscall number field */
        int                 nr_args;
        /* first argument field of the tracepoint format */
        struct format_field *args;
        /* name as returned by the syscall table, NULL if not yet read */
        const char          *name;
        /* true for "exit" and "exit_group" */
        bool                is_exit;
        /* matching syscall_fmts entry, NULL when there is none */
        struct syscall_fmt  *fmt;
        /* per argument pretty printers, a NULL slot means "no special printer" */
        size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        /* per argument printer parameters, taken from ->fmt when it is set */
        void                **arg_parm;
};
821
822 static size_t fprintf_duration(unsigned long t, FILE *fp)
823 {
824         double duration = (double)t / NSEC_PER_MSEC;
825         size_t printed = fprintf(fp, "(");
826
827         if (duration >= 1.0)
828                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
829         else if (duration >= 0.01)
830                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
831         else
832                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
833         return printed + fprintf(fp, "): ");
834 }
835
836 /**
837  * filename.ptr: The filename char pointer that will be vfs_getname'd
838  * filename.entry_str_pos: Where to insert the string translated from
839  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
840  */
841 struct thread_trace {
842         u64               entry_time;
843         u64               exit_time;
844         bool              entry_pending;
845         unsigned long     nr_events;
846         unsigned long     pfmaj, pfmin;
847         char              *entry_str;
848         double            runtime_ms;
849         struct {
850                 unsigned long ptr;
851                 short int     entry_str_pos;
852                 bool          pending_open;
853                 unsigned int  namelen;
854                 char          *name;
855         } filename;
856         struct {
857                 int       max;
858                 char      **table;
859         } paths;
860
861         struct intlist *syscall_stats;
862 };
863
864 static struct thread_trace *thread_trace__new(void)
865 {
866         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
867
868         if (ttrace)
869                 ttrace->paths.max = -1;
870
871         ttrace->syscall_stats = intlist__new(NULL);
872
873         return ttrace;
874 }
875
876 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
877 {
878         struct thread_trace *ttrace;
879
880         if (thread == NULL)
881                 goto fail;
882
883         if (thread__priv(thread) == NULL)
884                 thread__set_priv(thread, thread_trace__new());
885
886         if (thread__priv(thread) == NULL)
887                 goto fail;
888
889         ttrace = thread__priv(thread);
890         ++ttrace->nr_events;
891
892         return ttrace;
893 fail:
894         color_fprintf(fp, PERF_COLOR_RED,
895                       "WARNING: not enough memory, dropping samples!\n");
896         return NULL;
897 }
898
/*
 * Distinct bits, so that both can be or'ed into one mask.
 * NOTE(review): presumably select major/minor page fault tracing, confirm
 * against the users of these flags.
 */
#define TRACE_PFMAJ             (1 << 0)
#define TRACE_PFMIN             (1 << 1)

/* Size in bytes of the per thread buffer used to format a sys_enter line */
static const size_t trace__entry_str_size = 2048;
903
904 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
905 {
906         struct thread_trace *ttrace = thread__priv(thread);
907
908         if (fd > ttrace->paths.max) {
909                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
910
911                 if (npath == NULL)
912                         return -1;
913
914                 if (ttrace->paths.max != -1) {
915                         memset(npath + ttrace->paths.max + 1, 0,
916                                (fd - ttrace->paths.max) * sizeof(char *));
917                 } else {
918                         memset(npath, 0, (fd + 1) * sizeof(char *));
919                 }
920
921                 ttrace->paths.table = npath;
922                 ttrace->paths.max   = fd;
923         }
924
925         ttrace->paths.table[fd] = strdup(pathname);
926
927         return ttrace->paths.table[fd] != NULL ? 0 : -1;
928 }
929
930 static int thread__read_fd_path(struct thread *thread, int fd)
931 {
932         char linkname[PATH_MAX], pathname[PATH_MAX];
933         struct stat st;
934         int ret;
935
936         if (thread->pid_ == thread->tid) {
937                 scnprintf(linkname, sizeof(linkname),
938                           "/proc/%d/fd/%d", thread->pid_, fd);
939         } else {
940                 scnprintf(linkname, sizeof(linkname),
941                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
942         }
943
944         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
945                 return -1;
946
947         ret = readlink(linkname, pathname, sizeof(pathname));
948
949         if (ret < 0 || ret > st.st_size)
950                 return -1;
951
952         pathname[ret] = '\0';
953         return trace__set_fd_pathname(thread, fd, pathname);
954 }
955
956 static const char *thread__fd_path(struct thread *thread, int fd,
957                                    struct trace *trace)
958 {
959         struct thread_trace *ttrace = thread__priv(thread);
960
961         if (ttrace == NULL)
962                 return NULL;
963
964         if (fd < 0)
965                 return NULL;
966
967         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
968                 if (!trace->live)
969                         return NULL;
970                 ++trace->stats.proc_getname;
971                 if (thread__read_fd_path(thread, fd))
972                         return NULL;
973         }
974
975         return ttrace->paths.table[fd];
976 }
977
978 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
979                                         struct syscall_arg *arg)
980 {
981         int fd = arg->val;
982         size_t printed = scnprintf(bf, size, "%d", fd);
983         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
984
985         if (path)
986                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
987
988         return printed;
989 }
990
991 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
992                                               struct syscall_arg *arg)
993 {
994         int fd = arg->val;
995         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
996         struct thread_trace *ttrace = thread__priv(arg->thread);
997
998         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
999                 zfree(&ttrace->paths.table[fd]);
1000
1001         return printed;
1002 }
1003
1004 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1005                                      unsigned long ptr)
1006 {
1007         struct thread_trace *ttrace = thread__priv(thread);
1008
1009         ttrace->filename.ptr = ptr;
1010         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1011 }
1012
1013 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1014                                               struct syscall_arg *arg)
1015 {
1016         unsigned long ptr = arg->val;
1017
1018         if (!arg->trace->vfs_getname)
1019                 return scnprintf(bf, size, "%#x", ptr);
1020
1021         thread__set_filename_pos(arg->thread, bf, ptr);
1022         return 0;
1023 }
1024
1025 static bool trace__filter_duration(struct trace *trace, double t)
1026 {
1027         return t < (trace->duration_filter * NSEC_PER_MSEC);
1028 }
1029
1030 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1031 {
1032         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1033
1034         return fprintf(fp, "%10.3f ", ts);
1035 }
1036
/*
 * Flags written by sig_handler() and read outside of it.  They are
 * 'volatile sig_atomic_t' because that is the object type the C standard
 * allows an asynchronous signal handler to store to.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: request termination and remember whether the signal
 * was SIGINT.
 */
static void sig_handler(int sig)
{
        done = true;
        interrupted = sig == SIGINT;
}
1045
1046 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1047                                         u64 duration, u64 tstamp, FILE *fp)
1048 {
1049         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1050         printed += fprintf_duration(duration, fp);
1051
1052         if (trace->multiple_threads) {
1053                 if (trace->show_comm)
1054                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1055                 printed += fprintf(fp, "%d ", thread->tid);
1056         }
1057
1058         return printed;
1059 }
1060
1061 static int trace__process_event(struct trace *trace, struct machine *machine,
1062                                 union perf_event *event, struct perf_sample *sample)
1063 {
1064         int ret = 0;
1065
1066         switch (event->header.type) {
1067         case PERF_RECORD_LOST:
1068                 color_fprintf(trace->output, PERF_COLOR_RED,
1069                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1070                 ret = machine__process_lost_event(machine, event, sample);
1071                 break;
1072         default:
1073                 ret = machine__process_event(machine, event, sample);
1074                 break;
1075         }
1076
1077         return ret;
1078 }
1079
1080 static int trace__tool_process(struct perf_tool *tool,
1081                                union perf_event *event,
1082                                struct perf_sample *sample,
1083                                struct machine *machine)
1084 {
1085         struct trace *trace = container_of(tool, struct trace, tool);
1086         return trace__process_event(trace, machine, event, sample);
1087 }
1088
1089 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1090 {
1091         struct machine *machine = vmachine;
1092
1093         if (machine->kptr_restrict_warned)
1094                 return NULL;
1095
1096         if (symbol_conf.kptr_restrict) {
1097                 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1098                            "Check /proc/sys/kernel/kptr_restrict.\n\n"
1099                            "Kernel samples will not be resolved.\n");
1100                 machine->kptr_restrict_warned = true;
1101                 return NULL;
1102         }
1103
1104         return machine__resolve_kernel_addr(vmachine, addrp, modp);
1105 }
1106
1107 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1108 {
1109         int err = symbol__init(NULL);
1110
1111         if (err)
1112                 return err;
1113
1114         trace->host = machine__new_host();
1115         if (trace->host == NULL)
1116                 return -ENOMEM;
1117
1118         if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1119                 return -errno;
1120
1121         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1122                                             evlist->threads, trace__tool_process, false,
1123                                             trace->opts.proc_map_timeout);
1124         if (err)
1125                 symbol__exit();
1126
1127         return err;
1128 }
1129
1130 static int syscall__set_arg_fmts(struct syscall *sc)
1131 {
1132         struct format_field *field;
1133         int idx = 0, len;
1134
1135         sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1136         if (sc->arg_scnprintf == NULL)
1137                 return -1;
1138
1139         if (sc->fmt)
1140                 sc->arg_parm = sc->fmt->arg_parm;
1141
1142         for (field = sc->args; field; field = field->next) {
1143                 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1144                         sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1145                 else if (strcmp(field->type, "const char *") == 0 &&
1146                          (strcmp(field->name, "filename") == 0 ||
1147                           strcmp(field->name, "path") == 0 ||
1148                           strcmp(field->name, "pathname") == 0))
1149                         sc->arg_scnprintf[idx] = SCA_FILENAME;
1150                 else if (field->flags & FIELD_IS_POINTER)
1151                         sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1152                 else if (strcmp(field->type, "pid_t") == 0)
1153                         sc->arg_scnprintf[idx] = SCA_PID;
1154                 else if (strcmp(field->type, "umode_t") == 0)
1155                         sc->arg_scnprintf[idx] = SCA_MODE_T;
1156                 else if ((strcmp(field->type, "int") == 0 ||
1157                           strcmp(field->type, "unsigned int") == 0 ||
1158                           strcmp(field->type, "long") == 0) &&
1159                          (len = strlen(field->name)) >= 2 &&
1160                          strcmp(field->name + len - 2, "fd") == 0) {
1161                         /*
1162                          * /sys/kernel/tracing/events/syscalls/sys_enter*
1163                          * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1164                          * 65 int
1165                          * 23 unsigned int
1166                          * 7 unsigned long
1167                          */
1168                         sc->arg_scnprintf[idx] = SCA_FD;
1169                 }
1170                 ++idx;
1171         }
1172
1173         return 0;
1174 }
1175
1176 static int trace__read_syscall_info(struct trace *trace, int id)
1177 {
1178         char tp_name[128];
1179         struct syscall *sc;
1180         const char *name = syscalltbl__name(trace->sctbl, id);
1181
1182         if (name == NULL)
1183                 return -1;
1184
1185         if (id > trace->syscalls.max) {
1186                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1187
1188                 if (nsyscalls == NULL)
1189                         return -1;
1190
1191                 if (trace->syscalls.max != -1) {
1192                         memset(nsyscalls + trace->syscalls.max + 1, 0,
1193                                (id - trace->syscalls.max) * sizeof(*sc));
1194                 } else {
1195                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1196                 }
1197
1198                 trace->syscalls.table = nsyscalls;
1199                 trace->syscalls.max   = id;
1200         }
1201
1202         sc = trace->syscalls.table + id;
1203         sc->name = name;
1204
1205         sc->fmt  = syscall_fmt__find(sc->name);
1206
1207         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1208         sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1209
1210         if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1211                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1212                 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1213         }
1214
1215         if (IS_ERR(sc->tp_format))
1216                 return -1;
1217
1218         sc->args = sc->tp_format->format.fields;
1219         sc->nr_args = sc->tp_format->format.nr_fields;
1220         /*
1221          * We need to check and discard the first variable '__syscall_nr'
1222          * or 'nr' that mean the syscall number. It is needless here.
1223          * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1224          */
1225         if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1226                 sc->args = sc->args->next;
1227                 --sc->nr_args;
1228         }
1229
1230         sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1231
1232         return syscall__set_arg_fmts(sc);
1233 }
1234
1235 static int trace__validate_ev_qualifier(struct trace *trace)
1236 {
1237         int err = 0, i;
1238         struct str_node *pos;
1239
1240         trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1241         trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1242                                                  sizeof(trace->ev_qualifier_ids.entries[0]));
1243
1244         if (trace->ev_qualifier_ids.entries == NULL) {
1245                 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1246                        trace->output);
1247                 err = -EINVAL;
1248                 goto out;
1249         }
1250
1251         i = 0;
1252
1253         strlist__for_each_entry(pos, trace->ev_qualifier) {
1254                 const char *sc = pos->s;
1255                 int id = syscalltbl__id(trace->sctbl, sc);
1256
1257                 if (id < 0) {
1258                         if (err == 0) {
1259                                 fputs("Error:\tInvalid syscall ", trace->output);
1260                                 err = -EINVAL;
1261                         } else {
1262                                 fputs(", ", trace->output);
1263                         }
1264
1265                         fputs(sc, trace->output);
1266                 }
1267
1268                 trace->ev_qualifier_ids.entries[i++] = id;
1269         }
1270
1271         if (err < 0) {
1272                 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1273                       "\nHint:\tand: 'man syscalls'\n", trace->output);
1274                 zfree(&trace->ev_qualifier_ids.entries);
1275                 trace->ev_qualifier_ids.nr = 0;
1276         }
1277 out:
1278         return err;
1279 }
1280
1281 /*
1282  * args is to be interpreted as a series of longs but we need to handle
1283  * 8-byte unaligned accesses. args points to raw_data within the event
1284  * and raw_data is guaranteed to be 8-byte unaligned because it is
1285  * preceded by raw_size which is a u32. So we need to copy args to a temp
1286  * variable to read it. Most notably this avoids extended load instructions
1287  * on unaligned addresses
1288  */
1289
1290 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1291                                       unsigned char *args, struct trace *trace,
1292                                       struct thread *thread)
1293 {
1294         size_t printed = 0;
1295         unsigned char *p;
1296         unsigned long val;
1297
1298         if (sc->args != NULL) {
1299                 struct format_field *field;
1300                 u8 bit = 1;
1301                 struct syscall_arg arg = {
1302                         .idx    = 0,
1303                         .mask   = 0,
1304                         .trace  = trace,
1305                         .thread = thread,
1306                 };
1307
1308                 for (field = sc->args; field;
1309                      field = field->next, ++arg.idx, bit <<= 1) {
1310                         if (arg.mask & bit)
1311                                 continue;
1312
1313                         /* special care for unaligned accesses */
1314                         p = args + sizeof(unsigned long) * arg.idx;
1315                         memcpy(&val, p, sizeof(val));
1316
1317                         /*
1318                          * Suppress this argument if its value is zero and
1319                          * and we don't have a string associated in an
1320                          * strarray for it.
1321                          */
1322                         if (val == 0 &&
1323                             !(sc->arg_scnprintf &&
1324                               sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1325                               sc->arg_parm[arg.idx]))
1326                                 continue;
1327
1328                         printed += scnprintf(bf + printed, size - printed,
1329                                              "%s%s: ", printed ? ", " : "", field->name);
1330                         if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1331                                 arg.val = val;
1332                                 if (sc->arg_parm)
1333                                         arg.parm = sc->arg_parm[arg.idx];
1334                                 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1335                                                                       size - printed, &arg);
1336                         } else {
1337                                 printed += scnprintf(bf + printed, size - printed,
1338                                                      "%ld", val);
1339                         }
1340                 }
1341         } else if (IS_ERR(sc->tp_format)) {
1342                 /*
1343                  * If we managed to read the tracepoint /format file, then we
1344                  * may end up not having any args, like with gettid(), so only
1345                  * print the raw args when we didn't manage to read it.
1346                  */
1347                 int i = 0;
1348
1349                 while (i < 6) {
1350                         /* special care for unaligned accesses */
1351                         p = args + sizeof(unsigned long) * i;
1352                         memcpy(&val, p, sizeof(val));
1353                         printed += scnprintf(bf + printed, size - printed,
1354                                              "%sarg%d: %ld",
1355                                              printed ? ", " : "", i, val);
1356                         ++i;
1357                 }
1358         }
1359
1360         return printed;
1361 }
1362
/*
 * Signature shared by the per tracepoint sample handlers in this file,
 * e.g. trace__sys_enter() and trace__sys_exit().
 */
typedef int (*tracepoint_handler)(struct trace *trace,
                                  struct perf_evsel *evsel,
                                  union perf_event *event,
                                  struct perf_sample *sample);
1366
1367 static struct syscall *trace__syscall_info(struct trace *trace,
1368                                            struct perf_evsel *evsel, int id)
1369 {
1370
1371         if (id < 0) {
1372
1373                 /*
1374                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1375                  * before that, leaving at a higher verbosity level till that is
1376                  * explained. Reproduced with plain ftrace with:
1377                  *
1378                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1379                  * grep "NR -1 " /t/trace_pipe
1380                  *
1381                  * After generating some load on the machine.
1382                  */
1383                 if (verbose > 1) {
1384                         static u64 n;
1385                         fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1386                                 id, perf_evsel__name(evsel), ++n);
1387                 }
1388                 return NULL;
1389         }
1390
1391         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1392             trace__read_syscall_info(trace, id))
1393                 goto out_cant_read;
1394
1395         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1396                 goto out_cant_read;
1397
1398         return &trace->syscalls.table[id];
1399
1400 out_cant_read:
1401         if (verbose) {
1402                 fprintf(trace->output, "Problems reading syscall %d", id);
1403                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1404                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1405                 fputs(" information\n", trace->output);
1406         }
1407         return NULL;
1408 }
1409
1410 static void thread__update_stats(struct thread_trace *ttrace,
1411                                  int id, struct perf_sample *sample)
1412 {
1413         struct int_node *inode;
1414         struct stats *stats;
1415         u64 duration = 0;
1416
1417         inode = intlist__findnew(ttrace->syscall_stats, id);
1418         if (inode == NULL)
1419                 return;
1420
1421         stats = inode->priv;
1422         if (stats == NULL) {
1423                 stats = malloc(sizeof(struct stats));
1424                 if (stats == NULL)
1425                         return;
1426                 init_stats(stats);
1427                 inode->priv = stats;
1428         }
1429
1430         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1431                 duration = sample->time - ttrace->entry_time;
1432
1433         update_stats(stats, duration);
1434 }
1435
1436 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1437 {
1438         struct thread_trace *ttrace;
1439         u64 duration;
1440         size_t printed;
1441
1442         if (trace->current == NULL)
1443                 return 0;
1444
1445         ttrace = thread__priv(trace->current);
1446
1447         if (!ttrace->entry_pending)
1448                 return 0;
1449
1450         duration = sample->time - ttrace->entry_time;
1451
1452         printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1453         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1454         ttrace->entry_pending = false;
1455
1456         return printed;
1457 }
1458
1459 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1460                             union perf_event *event __maybe_unused,
1461                             struct perf_sample *sample)
1462 {
1463         char *msg;
1464         void *args;
1465         size_t printed = 0;
1466         struct thread *thread;
1467         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1468         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1469         struct thread_trace *ttrace;
1470
1471         if (sc == NULL)
1472                 return -1;
1473
1474         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1475         ttrace = thread__trace(thread, trace->output);
1476         if (ttrace == NULL)
1477                 goto out_put;
1478
1479         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1480
1481         if (ttrace->entry_str == NULL) {
1482                 ttrace->entry_str = malloc(trace__entry_str_size);
1483                 if (!ttrace->entry_str)
1484                         goto out_put;
1485         }
1486
1487         if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1488                 trace__printf_interrupted_entry(trace, sample);
1489
1490         ttrace->entry_time = sample->time;
1491         msg = ttrace->entry_str;
1492         printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1493
1494         printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1495                                            args, trace, thread);
1496
1497         if (sc->is_exit) {
1498                 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1499                         trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1500                         fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1501                 }
1502         } else {
1503                 ttrace->entry_pending = true;
1504                 /* See trace__vfs_getname & trace__sys_exit */
1505                 ttrace->filename.pending_open = false;
1506         }
1507
1508         if (trace->current != thread) {
1509                 thread__put(trace->current);
1510                 trace->current = thread__get(thread);
1511         }
1512         err = 0;
1513 out_put:
1514         thread__put(thread);
1515         return err;
1516 }
1517
1518 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1519                                     struct perf_sample *sample,
1520                                     struct callchain_cursor *cursor)
1521 {
1522         struct addr_location al;
1523
1524         if (machine__resolve(trace->host, &al, sample) < 0 ||
1525             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1526                 return -1;
1527
1528         return 0;
1529 }
1530
1531 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1532 {
1533         /* TODO: user-configurable print_opts */
1534         const unsigned int print_opts = EVSEL__PRINT_SYM |
1535                                         EVSEL__PRINT_DSO |
1536                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1537
1538         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1539 }
1540
1541 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1542                            union perf_event *event __maybe_unused,
1543                            struct perf_sample *sample)
1544 {
1545         long ret;
1546         u64 duration = 0;
1547         struct thread *thread;
1548         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1549         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1550         struct thread_trace *ttrace;
1551
1552         if (sc == NULL)
1553                 return -1;
1554
1555         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1556         ttrace = thread__trace(thread, trace->output);
1557         if (ttrace == NULL)
1558                 goto out_put;
1559
1560         if (trace->summary)
1561                 thread__update_stats(ttrace, id, sample);
1562
1563         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1564
1565         if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1566                 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1567                 ttrace->filename.pending_open = false;
1568                 ++trace->stats.vfs_getname;
1569         }
1570
1571         ttrace->exit_time = sample->time;
1572
1573         if (ttrace->entry_time) {
1574                 duration = sample->time - ttrace->entry_time;
1575                 if (trace__filter_duration(trace, duration))
1576                         goto out;
1577         } else if (trace->duration_filter)
1578                 goto out;
1579
1580         if (sample->callchain) {
1581                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1582                 if (callchain_ret == 0) {
1583                         if (callchain_cursor.nr < trace->min_stack)
1584                                 goto out;
1585                         callchain_ret = 1;
1586                 }
1587         }
1588
1589         if (trace->summary_only)
1590                 goto out;
1591
1592         trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1593
1594         if (ttrace->entry_pending) {
1595                 fprintf(trace->output, "%-70s", ttrace->entry_str);
1596         } else {
1597                 fprintf(trace->output, " ... [");
1598                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1599                 fprintf(trace->output, "]: %s()", sc->name);
1600         }
1601
1602         if (sc->fmt == NULL) {
1603 signed_print:
1604                 fprintf(trace->output, ") = %ld", ret);
1605         } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1606                 char bf[STRERR_BUFSIZE];
1607                 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1608                            *e = audit_errno_to_name(-ret);
1609
1610                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1611         } else if (ret == 0 && sc->fmt->timeout)
1612                 fprintf(trace->output, ") = 0 Timeout");
1613         else if (sc->fmt->hexret)
1614                 fprintf(trace->output, ") = %#lx", ret);
1615         else if (sc->fmt->errpid) {
1616                 struct thread *child = machine__find_thread(trace->host, ret, ret);
1617
1618                 if (child != NULL) {
1619                         fprintf(trace->output, ") = %ld", ret);
1620                         if (child->comm_set)
1621                                 fprintf(trace->output, " (%s)", thread__comm_str(child));
1622                         thread__put(child);
1623                 }
1624         } else
1625                 goto signed_print;
1626
1627         fputc('\n', trace->output);
1628
1629         if (callchain_ret > 0)
1630                 trace__fprintf_callchain(trace, sample);
1631         else if (callchain_ret < 0)
1632                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1633 out:
1634         ttrace->entry_pending = false;
1635         err = 0;
1636 out_put:
1637         thread__put(thread);
1638         return err;
1639 }
1640
1641 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1642                               union perf_event *event __maybe_unused,
1643                               struct perf_sample *sample)
1644 {
1645         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1646         struct thread_trace *ttrace;
1647         size_t filename_len, entry_str_len, to_move;
1648         ssize_t remaining_space;
1649         char *pos;
1650         const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1651
1652         if (!thread)
1653                 goto out;
1654
1655         ttrace = thread__priv(thread);
1656         if (!ttrace)
1657                 goto out;
1658
1659         filename_len = strlen(filename);
1660
1661         if (ttrace->filename.namelen < filename_len) {
1662                 char *f = realloc(ttrace->filename.name, filename_len + 1);
1663
1664                 if (f == NULL)
1665                                 goto out;
1666
1667                 ttrace->filename.namelen = filename_len;
1668                 ttrace->filename.name = f;
1669         }
1670
1671         strcpy(ttrace->filename.name, filename);
1672         ttrace->filename.pending_open = true;
1673
1674         if (!ttrace->filename.ptr)
1675                 goto out;
1676
1677         entry_str_len = strlen(ttrace->entry_str);
1678         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1679         if (remaining_space <= 0)
1680                 goto out;
1681
1682         if (filename_len > (size_t)remaining_space) {
1683                 filename += filename_len - remaining_space;
1684                 filename_len = remaining_space;
1685         }
1686
1687         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1688         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1689         memmove(pos + filename_len, pos, to_move);
1690         memcpy(pos, filename, filename_len);
1691
1692         ttrace->filename.ptr = 0;
1693         ttrace->filename.entry_str_pos = 0;
1694 out:
1695         return 0;
1696 }
1697
1698 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1699                                      union perf_event *event __maybe_unused,
1700                                      struct perf_sample *sample)
1701 {
1702         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1703         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1704         struct thread *thread = machine__findnew_thread(trace->host,
1705                                                         sample->pid,
1706                                                         sample->tid);
1707         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1708
1709         if (ttrace == NULL)
1710                 goto out_dump;
1711
1712         ttrace->runtime_ms += runtime_ms;
1713         trace->runtime_ms += runtime_ms;
1714         thread__put(thread);
1715         return 0;
1716
1717 out_dump:
1718         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1719                evsel->name,
1720                perf_evsel__strval(evsel, sample, "comm"),
1721                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1722                runtime,
1723                perf_evsel__intval(evsel, sample, "vruntime"));
1724         thread__put(thread);
1725         return 0;
1726 }
1727
1728 static void bpf_output__printer(enum binary_printer_ops op,
1729                                 unsigned int val, void *extra)
1730 {
1731         FILE *output = extra;
1732         unsigned char ch = (unsigned char)val;
1733
1734         switch (op) {
1735         case BINARY_PRINT_CHAR_DATA:
1736                 fprintf(output, "%c", isprint(ch) ? ch : '.');
1737                 break;
1738         case BINARY_PRINT_DATA_BEGIN:
1739         case BINARY_PRINT_LINE_BEGIN:
1740         case BINARY_PRINT_ADDR:
1741         case BINARY_PRINT_NUM_DATA:
1742         case BINARY_PRINT_NUM_PAD:
1743         case BINARY_PRINT_SEP:
1744         case BINARY_PRINT_CHAR_PAD:
1745         case BINARY_PRINT_LINE_END:
1746         case BINARY_PRINT_DATA_END:
1747         default:
1748                 break;
1749         }
1750 }
1751
1752 static void bpf_output__fprintf(struct trace *trace,
1753                                 struct perf_sample *sample)
1754 {
1755         print_binary(sample->raw_data, sample->raw_size, 8,
1756                      bpf_output__printer, trace->output);
1757 }
1758
1759 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1760                                 union perf_event *event __maybe_unused,
1761                                 struct perf_sample *sample)
1762 {
1763         int callchain_ret = 0;
1764
1765         if (sample->callchain) {
1766                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1767                 if (callchain_ret == 0) {
1768                         if (callchain_cursor.nr < trace->min_stack)
1769                                 goto out;
1770                         callchain_ret = 1;
1771                 }
1772         }
1773
1774         trace__printf_interrupted_entry(trace, sample);
1775         trace__fprintf_tstamp(trace, sample->time, trace->output);
1776
1777         if (trace->trace_syscalls)
1778                 fprintf(trace->output, "(         ): ");
1779
1780         fprintf(trace->output, "%s:", evsel->name);
1781
1782         if (perf_evsel__is_bpf_output(evsel)) {
1783                 bpf_output__fprintf(trace, sample);
1784         } else if (evsel->tp_format) {
1785                 event_format__fprintf(evsel->tp_format, sample->cpu,
1786                                       sample->raw_data, sample->raw_size,
1787                                       trace->output);
1788         }
1789
1790         fprintf(trace->output, ")\n");
1791
1792         if (callchain_ret > 0)
1793                 trace__fprintf_callchain(trace, sample);
1794         else if (callchain_ret < 0)
1795                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1796 out:
1797         return 0;
1798 }
1799
1800 static void print_location(FILE *f, struct perf_sample *sample,
1801                            struct addr_location *al,
1802                            bool print_dso, bool print_sym)
1803 {
1804
1805         if ((verbose || print_dso) && al->map)
1806                 fprintf(f, "%s@", al->map->dso->long_name);
1807
1808         if ((verbose || print_sym) && al->sym)
1809                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1810                         al->addr - al->sym->start);
1811         else if (al->map)
1812                 fprintf(f, "0x%" PRIx64, al->addr);
1813         else
1814                 fprintf(f, "0x%" PRIx64, sample->addr);
1815 }
1816
1817 static int trace__pgfault(struct trace *trace,
1818                           struct perf_evsel *evsel,
1819                           union perf_event *event __maybe_unused,
1820                           struct perf_sample *sample)
1821 {
1822         struct thread *thread;
1823         struct addr_location al;
1824         char map_type = 'd';
1825         struct thread_trace *ttrace;
1826         int err = -1;
1827         int callchain_ret = 0;
1828
1829         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1830
1831         if (sample->callchain) {
1832                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1833                 if (callchain_ret == 0) {
1834                         if (callchain_cursor.nr < trace->min_stack)
1835                                 goto out_put;
1836                         callchain_ret = 1;
1837                 }
1838         }
1839
1840         ttrace = thread__trace(thread, trace->output);
1841         if (ttrace == NULL)
1842                 goto out_put;
1843
1844         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1845                 ttrace->pfmaj++;
1846         else
1847                 ttrace->pfmin++;
1848
1849         if (trace->summary_only)
1850                 goto out;
1851
1852         thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1853                               sample->ip, &al);
1854
1855         trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
1856
1857         fprintf(trace->output, "%sfault [",
1858                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1859                 "maj" : "min");
1860
1861         print_location(trace->output, sample, &al, false, true);
1862
1863         fprintf(trace->output, "] => ");
1864
1865         thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1866                                    sample->addr, &al);
1867
1868         if (!al.map) {
1869                 thread__find_addr_location(thread, sample->cpumode,
1870                                            MAP__FUNCTION, sample->addr, &al);
1871
1872                 if (al.map)
1873                         map_type = 'x';
1874                 else
1875                         map_type = '?';
1876         }
1877
1878         print_location(trace->output, sample, &al, true, false);
1879
1880         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1881
1882         if (callchain_ret > 0)
1883                 trace__fprintf_callchain(trace, sample);
1884         else if (callchain_ret < 0)
1885                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1886 out:
1887         err = 0;
1888 out_put:
1889         thread__put(thread);
1890         return err;
1891 }
1892
1893 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1894 {
1895         if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1896             (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1897                 return false;
1898
1899         if (trace->pid_list || trace->tid_list)
1900                 return true;
1901
1902         return false;
1903 }
1904
1905 static void trace__set_base_time(struct trace *trace,
1906                                  struct perf_evsel *evsel,
1907                                  struct perf_sample *sample)
1908 {
1909         /*
1910          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1911          * and don't use sample->time unconditionally, we may end up having
1912          * some other event in the future without PERF_SAMPLE_TIME for good
1913          * reason, i.e. we may not be interested in its timestamps, just in
1914          * it taking place, picking some piece of information when it
1915          * appears in our event stream (vfs_getname comes to mind).
1916          */
1917         if (trace->base_time == 0 && !trace->full_time &&
1918             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1919                 trace->base_time = sample->time;
1920 }
1921
1922 static int trace__process_sample(struct perf_tool *tool,
1923                                  union perf_event *event,
1924                                  struct perf_sample *sample,
1925                                  struct perf_evsel *evsel,
1926                                  struct machine *machine __maybe_unused)
1927 {
1928         struct trace *trace = container_of(tool, struct trace, tool);
1929         int err = 0;
1930
1931         tracepoint_handler handler = evsel->handler;
1932
1933         if (skip_sample(trace, sample))
1934                 return 0;
1935
1936         trace__set_base_time(trace, evsel, sample);
1937
1938         if (handler) {
1939                 ++trace->nr_events;
1940                 handler(trace, evsel, event, sample);
1941         }
1942
1943         return err;
1944 }
1945
1946 static int parse_target_str(struct trace *trace)
1947 {
1948         if (trace->opts.target.pid) {
1949                 trace->pid_list = intlist__new(trace->opts.target.pid);
1950                 if (trace->pid_list == NULL) {
1951                         pr_err("Error parsing process id string\n");
1952                         return -EINVAL;
1953                 }
1954         }
1955
1956         if (trace->opts.target.tid) {
1957                 trace->tid_list = intlist__new(trace->opts.target.tid);
1958                 if (trace->tid_list == NULL) {
1959                         pr_err("Error parsing thread id string\n");
1960                         return -EINVAL;
1961                 }
1962         }
1963
1964         return 0;
1965 }
1966
1967 static int trace__record(struct trace *trace, int argc, const char **argv)
1968 {
1969         unsigned int rec_argc, i, j;
1970         const char **rec_argv;
1971         const char * const record_args[] = {
1972                 "record",
1973                 "-R",
1974                 "-m", "1024",
1975                 "-c", "1",
1976         };
1977
1978         const char * const sc_args[] = { "-e", };
1979         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1980         const char * const majpf_args[] = { "-e", "major-faults" };
1981         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1982         const char * const minpf_args[] = { "-e", "minor-faults" };
1983         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1984
1985         /* +1 is for the event string below */
1986         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1987                 majpf_args_nr + minpf_args_nr + argc;
1988         rec_argv = calloc(rec_argc + 1, sizeof(char *));
1989
1990         if (rec_argv == NULL)
1991                 return -ENOMEM;
1992
1993         j = 0;
1994         for (i = 0; i < ARRAY_SIZE(record_args); i++)
1995                 rec_argv[j++] = record_args[i];
1996
1997         if (trace->trace_syscalls) {
1998                 for (i = 0; i < sc_args_nr; i++)
1999                         rec_argv[j++] = sc_args[i];
2000
2001                 /* event string may be different for older kernels - e.g., RHEL6 */
2002                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2003                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2004                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2005                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2006                 else {
2007                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2008                         return -1;
2009                 }
2010         }
2011
2012         if (trace->trace_pgfaults & TRACE_PFMAJ)
2013                 for (i = 0; i < majpf_args_nr; i++)
2014                         rec_argv[j++] = majpf_args[i];
2015
2016         if (trace->trace_pgfaults & TRACE_PFMIN)
2017                 for (i = 0; i < minpf_args_nr; i++)
2018                         rec_argv[j++] = minpf_args[i];
2019
2020         for (i = 0; i < (unsigned int)argc; i++)
2021                 rec_argv[j++] = argv[i];
2022
2023         return cmd_record(j, rec_argv, NULL);
2024 }
2025
2026 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2027
2028 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2029 {
2030         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2031
2032         if (IS_ERR(evsel))
2033                 return false;
2034
2035         if (perf_evsel__field(evsel, "pathname") == NULL) {
2036                 perf_evsel__delete(evsel);
2037                 return false;
2038         }
2039
2040         evsel->handler = trace__vfs_getname;
2041         perf_evlist__add(evlist, evsel);
2042         return true;
2043 }
2044
2045 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2046 {
2047         struct perf_evsel *evsel;
2048         struct perf_event_attr attr = {
2049                 .type = PERF_TYPE_SOFTWARE,
2050                 .mmap_data = 1,
2051         };
2052
2053         attr.config = config;
2054         attr.sample_period = 1;
2055
2056         event_attr_init(&attr);
2057
2058         evsel = perf_evsel__new(&attr);
2059         if (evsel)
2060                 evsel->handler = trace__pgfault;
2061
2062         return evsel;
2063 }
2064
2065 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2066 {
2067         const u32 type = event->header.type;
2068         struct perf_evsel *evsel;
2069
2070         if (type != PERF_RECORD_SAMPLE) {
2071                 trace__process_event(trace, trace->host, event, sample);
2072                 return;
2073         }
2074
2075         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2076         if (evsel == NULL) {
2077                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2078                 return;
2079         }
2080
2081         trace__set_base_time(trace, evsel, sample);
2082
2083         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2084             sample->raw_data == NULL) {
2085                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2086                        perf_evsel__name(evsel), sample->tid,
2087                        sample->cpu, sample->raw_size);
2088         } else {
2089                 tracepoint_handler handler = evsel->handler;
2090                 handler(trace, evsel, event, sample);
2091         }
2092 }
2093
2094 static int trace__add_syscall_newtp(struct trace *trace)
2095 {
2096         int ret = -1;
2097         struct perf_evlist *evlist = trace->evlist;
2098         struct perf_evsel *sys_enter, *sys_exit;
2099
2100         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2101         if (sys_enter == NULL)
2102                 goto out;
2103
2104         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2105                 goto out_delete_sys_enter;
2106
2107         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2108         if (sys_exit == NULL)
2109                 goto out_delete_sys_enter;
2110
2111         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2112                 goto out_delete_sys_exit;
2113
2114         perf_evlist__add(evlist, sys_enter);
2115         perf_evlist__add(evlist, sys_exit);
2116
2117         if (callchain_param.enabled && !trace->kernel_syscallchains) {
2118                 /*
2119                  * We're interested only in the user space callchain
2120                  * leading to the syscall, allow overriding that for
2121                  * debugging reasons using --kernel_syscall_callchains
2122                  */
2123                 sys_exit->attr.exclude_callchain_kernel = 1;
2124         }
2125
2126         trace->syscalls.events.sys_enter = sys_enter;
2127         trace->syscalls.events.sys_exit  = sys_exit;
2128
2129         ret = 0;
2130 out:
2131         return ret;
2132
2133 out_delete_sys_exit:
2134         perf_evsel__delete_priv(sys_exit);
2135 out_delete_sys_enter:
2136         perf_evsel__delete_priv(sys_enter);
2137         goto out;
2138 }
2139
2140 static int trace__set_ev_qualifier_filter(struct trace *trace)
2141 {
2142         int err = -1;
2143         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2144                                                 trace->ev_qualifier_ids.nr,
2145                                                 trace->ev_qualifier_ids.entries);
2146
2147         if (filter == NULL)
2148                 goto out_enomem;
2149
2150         if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2151                 err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2152
2153         free(filter);
2154 out:
2155         return err;
2156 out_enomem:
2157         errno = ENOMEM;
2158         goto out;
2159 }
2160
2161 static int trace__run(struct trace *trace, int argc, const char **argv)
2162 {
2163         struct perf_evlist *evlist = trace->evlist;
2164         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2165         int err = -1, i;
2166         unsigned long before;
2167         const bool forks = argc > 0;
2168         bool draining = false;
2169
2170         trace->live = true;
2171
2172         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2173                 goto out_error_raw_syscalls;
2174
2175         if (trace->trace_syscalls)
2176                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2177
2178         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2179                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2180                 if (pgfault_maj == NULL)
2181                         goto out_error_mem;
2182                 perf_evlist__add(evlist, pgfault_maj);
2183         }
2184
2185         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2186                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2187                 if (pgfault_min == NULL)
2188                         goto out_error_mem;
2189                 perf_evlist__add(evlist, pgfault_min);
2190         }
2191
2192         if (trace->sched &&
2193             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2194                                    trace__sched_stat_runtime))
2195                 goto out_error_sched_stat_runtime;
2196
2197         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2198         if (err < 0) {
2199                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2200                 goto out_delete_evlist;
2201         }
2202
2203         err = trace__symbols_init(trace, evlist);
2204         if (err < 0) {
2205                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2206                 goto out_delete_evlist;
2207         }
2208
2209         perf_evlist__config(evlist, &trace->opts, NULL);
2210
2211         if (callchain_param.enabled) {
2212                 bool use_identifier = false;
2213
2214                 if (trace->syscalls.events.sys_exit) {
2215                         perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2216                                                      &trace->opts, &callchain_param);
2217                         use_identifier = true;
2218                 }
2219
2220                 if (pgfault_maj) {
2221                         perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2222                         use_identifier = true;
2223                 }
2224
2225                 if (pgfault_min) {
2226                         perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2227                         use_identifier = true;
2228                 }
2229
2230                 if (use_identifier) {
2231                        /*
2232                         * Now we have evsels with different sample_ids, use
2233                         * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2234                         * from a fixed position in each ring buffer record.
2235                         *
2236                         * As of this the changeset introducing this comment, this
2237                         * isn't strictly needed, as the fields that can come before
2238                         * PERF_SAMPLE_ID are all used, but we'll probably disable
2239                         * some of those for things like copying the payload of
2240                         * pointer syscall arguments, and for vfs_getname we don't
2241                         * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2242                         * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2243                         */
2244                         perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2245                         perf_evlist__reset_sample_bit(evlist, ID);
2246                 }
2247         }
2248
2249         signal(SIGCHLD, sig_handler);
2250         signal(SIGINT, sig_handler);
2251
2252         if (forks) {
2253                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2254                                                     argv, false, NULL);
2255                 if (err < 0) {
2256                         fprintf(trace->output, "Couldn't run the workload!\n");
2257                         goto out_delete_evlist;
2258                 }
2259         }
2260
2261         err = perf_evlist__open(evlist);
2262         if (err < 0)
2263                 goto out_error_open;
2264
2265         err = bpf__apply_obj_config();
2266         if (err) {
2267                 char errbuf[BUFSIZ];
2268
2269                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2270                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2271                          errbuf);
2272                 goto out_error_open;
2273         }
2274
2275         /*
2276          * Better not use !target__has_task() here because we need to cover the
2277          * case where no threads were specified in the command line, but a
2278          * workload was, and in that case we will fill in the thread_map when
2279          * we fork the workload in perf_evlist__prepare_workload.
2280          */
2281         if (trace->filter_pids.nr > 0)
2282                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2283         else if (thread_map__pid(evlist->threads, 0) == -1)
2284                 err = perf_evlist__set_filter_pid(evlist, getpid());
2285
2286         if (err < 0)
2287                 goto out_error_mem;
2288
2289         if (trace->ev_qualifier_ids.nr > 0) {
2290                 err = trace__set_ev_qualifier_filter(trace);
2291                 if (err < 0)
2292                         goto out_errno;
2293
2294                 pr_debug("event qualifier tracepoint filter: %s\n",
2295                          trace->syscalls.events.sys_exit->filter);
2296         }
2297
2298         err = perf_evlist__apply_filters(evlist, &evsel);
2299         if (err < 0)
2300                 goto out_error_apply_filters;
2301
2302         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2303         if (err < 0)
2304                 goto out_error_mmap;
2305
2306         if (!target__none(&trace->opts.target))
2307                 perf_evlist__enable(evlist);
2308
2309         if (forks)
2310                 perf_evlist__start_workload(evlist);
2311
2312         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2313                                   evlist->threads->nr > 1 ||
2314                                   perf_evlist__first(evlist)->attr.inherit;
2315 again:
2316         before = trace->nr_events;
2317
2318         for (i = 0; i < evlist->nr_mmaps; i++) {
2319                 union perf_event *event;
2320
2321                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2322                         struct perf_sample sample;
2323
2324                         ++trace->nr_events;
2325
2326                         err = perf_evlist__parse_sample(evlist, event, &sample);
2327                         if (err) {
2328                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2329                                 goto next_event;
2330                         }
2331
2332                         trace__handle_event(trace, event, &sample);
2333 next_event:
2334                         perf_evlist__mmap_consume(evlist, i);
2335
2336                         if (interrupted)
2337                                 goto out_disable;
2338
2339                         if (done && !draining) {
2340                                 perf_evlist__disable(evlist);
2341                                 draining = true;
2342                         }
2343                 }
2344         }
2345
2346         if (trace->nr_events == before) {
2347                 int timeout = done ? 100 : -1;
2348
2349                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2350                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2351                                 draining = true;
2352
2353                         goto again;
2354                 }
2355         } else {
2356                 goto again;
2357         }
2358
2359 out_disable:
2360         thread__zput(trace->current);
2361
2362         perf_evlist__disable(evlist);
2363
2364         if (!err) {
2365                 if (trace->summary)
2366                         trace__fprintf_thread_summary(trace, trace->output);
2367
2368                 if (trace->show_tool_stats) {
2369                         fprintf(trace->output, "Stats:\n "
2370                                                " vfs_getname : %" PRIu64 "\n"
2371                                                " proc_getname: %" PRIu64 "\n",
2372                                 trace->stats.vfs_getname,
2373                                 trace->stats.proc_getname);
2374                 }
2375         }
2376
2377 out_delete_evlist:
2378         perf_evlist__delete(evlist);
2379         trace->evlist = NULL;
2380         trace->live = false;
2381         return err;
2382 {
2383         char errbuf[BUFSIZ];
2384
2385 out_error_sched_stat_runtime:
2386         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2387         goto out_error;
2388
2389 out_error_raw_syscalls:
2390         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2391         goto out_error;
2392
2393 out_error_mmap:
2394         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2395         goto out_error;
2396
2397 out_error_open:
2398         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2399
2400 out_error:
2401         fprintf(trace->output, "%s\n", errbuf);
2402         goto out_delete_evlist;
2403
2404 out_error_apply_filters:
2405         fprintf(trace->output,
2406                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2407                 evsel->filter, perf_evsel__name(evsel), errno,
2408                 str_error_r(errno, errbuf, sizeof(errbuf)));
2409         goto out_delete_evlist;
2410 }
2411 out_error_mem:
2412         fprintf(trace->output, "Not enough memory to run!\n");
2413         goto out_delete_evlist;
2414
2415 out_errno:
2416         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2417         goto out_delete_evlist;
2418 }
2419
2420 static int trace__replay(struct trace *trace)
2421 {
2422         const struct perf_evsel_str_handler handlers[] = {
2423                 { "probe:vfs_getname",       trace__vfs_getname, },
2424         };
2425         struct perf_data_file file = {
2426                 .path  = input_name,
2427                 .mode  = PERF_DATA_MODE_READ,
2428                 .force = trace->force,
2429         };
2430         struct perf_session *session;
2431         struct perf_evsel *evsel;
2432         int err = -1;
2433
2434         trace->tool.sample        = trace__process_sample;
2435         trace->tool.mmap          = perf_event__process_mmap;
2436         trace->tool.mmap2         = perf_event__process_mmap2;
2437         trace->tool.comm          = perf_event__process_comm;
2438         trace->tool.exit          = perf_event__process_exit;
2439         trace->tool.fork          = perf_event__process_fork;
2440         trace->tool.attr          = perf_event__process_attr;
2441         trace->tool.tracing_data = perf_event__process_tracing_data;
2442         trace->tool.build_id      = perf_event__process_build_id;
2443
2444         trace->tool.ordered_events = true;
2445         trace->tool.ordering_requires_timestamps = true;
2446
2447         /* add tid to output */
2448         trace->multiple_threads = true;
2449
2450         session = perf_session__new(&file, false, &trace->tool);
2451         if (session == NULL)
2452                 return -1;
2453
2454         if (symbol__init(&session->header.env) < 0)
2455                 goto out;
2456
2457         trace->host = &session->machines.host;
2458
2459         err = perf_session__set_tracepoints_handlers(session, handlers);
2460         if (err)
2461                 goto out;
2462
2463         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2464                                                      "raw_syscalls:sys_enter");
2465         /* older kernels have syscalls tp versus raw_syscalls */
2466         if (evsel == NULL)
2467                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2468                                                              "syscalls:sys_enter");
2469
2470         if (evsel &&
2471             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2472             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2473                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2474                 goto out;
2475         }
2476
2477         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2478                                                      "raw_syscalls:sys_exit");
2479         if (evsel == NULL)
2480                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2481                                                              "syscalls:sys_exit");
2482         if (evsel &&
2483             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2484             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2485                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2486                 goto out;
2487         }
2488
2489         evlist__for_each_entry(session->evlist, evsel) {
2490                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2491                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2492                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2493                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2494                         evsel->handler = trace__pgfault;
2495         }
2496
2497         err = parse_target_str(trace);
2498         if (err != 0)
2499                 goto out;
2500
2501         setup_pager();
2502
2503         err = perf_session__process_events(session);
2504         if (err)
2505                 pr_err("Failed to process events, error %d", err);
2506
2507         else if (trace->summary)
2508                 trace__fprintf_thread_summary(trace, trace->output);
2509
2510 out:
2511         perf_session__delete(session);
2512
2513         return err;
2514 }
2515
/*
 * Print the banner that precedes the per-thread summary.
 *
 * Returns the value reported by fprintf() for the banner, converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
        return fprintf(fp, "\n Summary of events:\n\n");
}
2524
2525 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2526         struct stats    *stats;
2527         double          msecs;
2528         int             syscall;
2529 )
2530 {
2531         struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2532         struct stats *stats = source->priv;
2533
2534         entry->syscall = source->i;
2535         entry->stats   = stats;
2536         entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2537 }
2538
2539 static size_t thread__dump_stats(struct thread_trace *ttrace,
2540                                  struct trace *trace, FILE *fp)
2541 {
2542         size_t printed = 0;
2543         struct syscall *sc;
2544         struct rb_node *nd;
2545         DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2546
2547         if (syscall_stats == NULL)
2548                 return 0;
2549
2550         printed += fprintf(fp, "\n");
2551
2552         printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2553         printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2554         printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2555
2556         resort_rb__for_each_entry(nd, syscall_stats) {
2557                 struct stats *stats = syscall_stats_entry->stats;
2558                 if (stats) {
2559                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2560                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2561                         double avg = avg_stats(stats);
2562                         double pct;
2563                         u64 n = (u64) stats->n;
2564
2565                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2566                         avg /= NSEC_PER_MSEC;
2567
2568                         sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2569                         printed += fprintf(fp, "   %-15s", sc->name);
2570                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2571                                            n, syscall_stats_entry->msecs, min, avg);
2572                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2573                 }
2574         }
2575
2576         resort_rb__delete(syscall_stats);
2577         printed += fprintf(fp, "\n\n");
2578
2579         return printed;
2580 }
2581
2582 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2583 {
2584         size_t printed = 0;
2585         struct thread_trace *ttrace = thread__priv(thread);
2586         double ratio;
2587
2588         if (ttrace == NULL)
2589                 return 0;
2590
2591         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2592
2593         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2594         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2595         printed += fprintf(fp, "%.1f%%", ratio);
2596         if (ttrace->pfmaj)
2597                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2598         if (ttrace->pfmin)
2599                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2600         if (trace->sched)
2601                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2602         else if (fputc('\n', fp) != EOF)
2603                 ++printed;
2604
2605         printed += thread__dump_stats(ttrace, trace, fp);
2606
2607         return printed;
2608 }
2609
2610 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2611 {
2612         return ttrace ? ttrace->nr_events : 0;
2613 }
2614
2615 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2616         struct thread *thread;
2617 )
2618 {
2619         entry->thread = rb_entry(nd, struct thread, rb_node);
2620 }
2621
2622 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2623 {
2624         DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2625         size_t printed = trace__fprintf_threads_header(fp);
2626         struct rb_node *nd;
2627
2628         if (threads == NULL) {
2629                 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2630                 return 0;
2631         }
2632
2633         resort_rb__for_each_entry(nd, threads)
2634                 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2635
2636         resort_rb__delete(threads);
2637
2638         return printed;
2639 }
2640
2641 static int trace__set_duration(const struct option *opt, const char *str,
2642                                int unset __maybe_unused)
2643 {
2644         struct trace *trace = opt->value;
2645
2646         trace->duration_filter = atof(str);
2647         return 0;
2648 }
2649
2650 static int trace__set_filter_pids(const struct option *opt, const char *str,
2651                                   int unset __maybe_unused)
2652 {
2653         int ret = -1;
2654         size_t i;
2655         struct trace *trace = opt->value;
2656         /*
2657          * FIXME: introduce a intarray class, plain parse csv and create a
2658          * { int nr, int entries[] } struct...
2659          */
2660         struct intlist *list = intlist__new(str);
2661
2662         if (list == NULL)
2663                 return -1;
2664
2665         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2666         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2667
2668         if (trace->filter_pids.entries == NULL)
2669                 goto out;
2670
2671         trace->filter_pids.entries[0] = getpid();
2672
2673         for (i = 1; i < trace->filter_pids.nr; ++i)
2674                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2675
2676         intlist__delete(list);
2677         ret = 0;
2678 out:
2679         return ret;
2680 }
2681
2682 static int trace__open_output(struct trace *trace, const char *filename)
2683 {
2684         struct stat st;
2685
2686         if (!stat(filename, &st) && st.st_size) {
2687                 char oldname[PATH_MAX];
2688
2689                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2690                 unlink(oldname);
2691                 rename(filename, oldname);
2692         }
2693
2694         trace->output = fopen(filename, "w");
2695
2696         return trace->output == NULL ? -errno : 0;
2697 }
2698
2699 static int parse_pagefaults(const struct option *opt, const char *str,
2700                             int unset __maybe_unused)
2701 {
2702         int *trace_pgfaults = opt->value;
2703
2704         if (strcmp(str, "all") == 0)
2705                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2706         else if (strcmp(str, "maj") == 0)
2707                 *trace_pgfaults |= TRACE_PFMAJ;
2708         else if (strcmp(str, "min") == 0)
2709                 *trace_pgfaults |= TRACE_PFMIN;
2710         else
2711                 return -1;
2712
2713         return 0;
2714 }
2715
2716 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2717 {
2718         struct perf_evsel *evsel;
2719
2720         evlist__for_each_entry(evlist, evsel)
2721                 evsel->handler = handler;
2722 }
2723
2724 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2725 {
2726         const char *trace_usage[] = {
2727                 "perf trace [<options>] [<command>]",
2728                 "perf trace [<options>] -- <command> [<options>]",
2729                 "perf trace record [<options>] [<command>]",
2730                 "perf trace record [<options>] -- <command> [<options>]",
2731                 NULL
2732         };
2733         struct trace trace = {
2734                 .syscalls = {
2735                         . max = -1,
2736                 },
2737                 .opts = {
2738                         .target = {
2739                                 .uid       = UINT_MAX,
2740                                 .uses_mmap = true,
2741                         },
2742                         .user_freq     = UINT_MAX,
2743                         .user_interval = ULLONG_MAX,
2744                         .no_buffering  = true,
2745                         .mmap_pages    = UINT_MAX,
2746                         .proc_map_timeout  = 500,
2747                 },
2748                 .output = stderr,
2749                 .show_comm = true,
2750                 .trace_syscalls = true,
2751                 .kernel_syscallchains = false,
2752                 .max_stack = UINT_MAX,
2753         };
2754         const char *output_name = NULL;
2755         const char *ev_qualifier_str = NULL;
2756         const struct option trace_options[] = {
2757         OPT_CALLBACK(0, "event", &trace.evlist, "event",
2758                      "event selector. use 'perf list' to list available events",
2759                      parse_events_option),
2760         OPT_BOOLEAN(0, "comm", &trace.show_comm,
2761                     "show the thread COMM next to its id"),
2762         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2763         OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2764         OPT_STRING('o', "output", &output_name, "file", "output file name"),
2765         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2766         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2767                     "trace events on existing process id"),
2768         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2769                     "trace events on existing thread id"),
2770         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2771                      "pids to filter (by the kernel)", trace__set_filter_pids),
2772         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2773                     "system-wide collection from all CPUs"),
2774         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2775                     "list of cpus to monitor"),
2776         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2777                     "child tasks do not inherit counters"),
2778         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2779                      "number of mmap data pages",
2780                      perf_evlist__parse_mmap_pages),
2781         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2782                    "user to profile"),
2783         OPT_CALLBACK(0, "duration", &trace, "float",
2784                      "show only events with duration > N.M ms",
2785                      trace__set_duration),
2786         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2787         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2788         OPT_BOOLEAN('T', "time", &trace.full_time,
2789                     "Show full timestamp, not time relative to first start"),
2790         OPT_BOOLEAN('s', "summary", &trace.summary_only,
2791                     "Show only syscall summary with statistics"),
2792         OPT_BOOLEAN('S', "with-summary", &trace.summary,
2793                     "Show all syscalls and summary with statistics"),
2794         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2795                      "Trace pagefaults", parse_pagefaults, "maj"),
2796         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2797         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2798         OPT_CALLBACK(0, "call-graph", &trace.opts,
2799                      "record_mode[,record_size]", record_callchain_help,
2800                      &record_parse_callchain_opt),
2801         OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2802                     "Show the kernel callchains on the syscall exit path"),
2803         OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2804                      "Set the minimum stack depth when parsing the callchain, "
2805                      "anything below the specified depth will be ignored."),
2806         OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2807                      "Set the maximum stack depth when parsing the callchain, "
2808                      "anything beyond the specified depth will be ignored. "
2809                      "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2810         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2811                         "per thread proc mmap processing timeout in ms"),
2812         OPT_END()
2813         };
2814         bool __maybe_unused max_stack_user_set = true;
2815         bool mmap_pages_user_set = true;
2816         const char * const trace_subcommands[] = { "record", NULL };
2817         int err;
2818         char bf[BUFSIZ];
2819
2820         signal(SIGSEGV, sighandler_dump_stack);
2821         signal(SIGFPE, sighandler_dump_stack);
2822
2823         trace.evlist = perf_evlist__new();
2824         trace.sctbl = syscalltbl__new();
2825
2826         if (trace.evlist == NULL || trace.sctbl == NULL) {
2827                 pr_err("Not enough memory to run!\n");
2828                 err = -ENOMEM;
2829                 goto out;
2830         }
2831
2832         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2833                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2834
2835         err = bpf__setup_stdout(trace.evlist);
2836         if (err) {
2837                 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2838                 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2839                 goto out;
2840         }
2841
2842         err = -1;
2843
2844         if (trace.trace_pgfaults) {
2845                 trace.opts.sample_address = true;
2846                 trace.opts.sample_time = true;
2847         }
2848
2849         if (trace.opts.mmap_pages == UINT_MAX)
2850                 mmap_pages_user_set = false;
2851
2852         if (trace.max_stack == UINT_MAX) {
2853                 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2854                 max_stack_user_set = false;
2855         }
2856
2857 #ifdef HAVE_DWARF_UNWIND_SUPPORT
2858         if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2859                 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2860 #endif
2861
2862         if (callchain_param.enabled) {
2863                 if (!mmap_pages_user_set && geteuid() == 0)
2864                         trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2865
2866                 symbol_conf.use_callchain = true;
2867         }
2868
2869         if (trace.evlist->nr_entries > 0)
2870                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2871
2872         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2873                 return trace__record(&trace, argc-1, &argv[1]);
2874
2875         /* summary_only implies summary option, but don't overwrite summary if set */
2876         if (trace.summary_only)
2877                 trace.summary = trace.summary_only;
2878
2879         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2880             trace.evlist->nr_entries == 0 /* Was --events used? */) {
2881                 pr_err("Please specify something to trace.\n");
2882                 return -1;
2883         }
2884
2885         if (!trace.trace_syscalls && ev_qualifier_str) {
2886                 pr_err("The -e option can't be used with --no-syscalls.\n");
2887                 goto out;
2888         }
2889
2890         if (output_name != NULL) {
2891                 err = trace__open_output(&trace, output_name);
2892                 if (err < 0) {
2893                         perror("failed to create output file");
2894                         goto out;
2895                 }
2896         }
2897
2898         trace.open_id = syscalltbl__id(trace.sctbl, "open");
2899
2900         if (ev_qualifier_str != NULL) {
2901                 const char *s = ev_qualifier_str;
2902                 struct strlist_config slist_config = {
2903                         .dirname = system_path(STRACE_GROUPS_DIR),
2904                 };
2905
2906                 trace.not_ev_qualifier = *s == '!';
2907                 if (trace.not_ev_qualifier)
2908                         ++s;
2909                 trace.ev_qualifier = strlist__new(s, &slist_config);
2910                 if (trace.ev_qualifier == NULL) {
2911                         fputs("Not enough memory to parse event qualifier",
2912                               trace.output);
2913                         err = -ENOMEM;
2914                         goto out_close;
2915                 }
2916
2917                 err = trace__validate_ev_qualifier(&trace);
2918                 if (err)
2919                         goto out_close;
2920         }
2921
2922         err = target__validate(&trace.opts.target);
2923         if (err) {
2924                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2925                 fprintf(trace.output, "%s", bf);
2926                 goto out_close;
2927         }
2928
2929         err = target__parse_uid(&trace.opts.target);
2930         if (err) {
2931                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2932                 fprintf(trace.output, "%s", bf);
2933                 goto out_close;
2934         }
2935
2936         if (!argc && target__none(&trace.opts.target))
2937                 trace.opts.target.system_wide = true;
2938
2939         if (input_name)
2940                 err = trace__replay(&trace);
2941         else
2942                 err = trace__run(&trace, argc, argv);
2943
2944 out_close:
2945         if (output_name != NULL)
2946                 fclose(trace.output);
2947 out:
2948         return err;
2949 }