perf data: Fix building in 32 bit platform with libbabeltrace
[cascardo/linux.git] / tools / perf / builtin-trace.c
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 #include "callchain.h"
38 #include "syscalltbl.h"
39 #include "rb_resort.h"
40
41 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
42 #include <stdlib.h>
43 #include <linux/err.h>
44 #include <linux/filter.h>
45 #include <linux/audit.h>
46 #include <linux/random.h>
47 #include <linux/stringify.h>
48 #include <linux/time64.h>
49
50 #ifndef O_CLOEXEC
51 # define O_CLOEXEC              02000000
52 #endif
53
/*
 * Global state for one 'perf trace' run: the perf_tool callbacks, the
 * syscall table, the evlist being monitored and every user-visible knob
 * set from the command line.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;	/* presumably the highest syscall id in table — confirm */
		struct syscall	*table;
		struct {
			struct perf_evsel *sys_enter,	/* raw_syscalls:sys_enter */
					  *sys_exit;	/* raw_syscalls:sys_exit */
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	u64			base_time;
	FILE			*output;	/* where formatted events are written */
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscall names selected by the user */
	struct {
		size_t		nr;
		int		*entries;	/* syscall ids resolved from ev_qualifier */
	}			ev_qualifier_ids;
	struct intlist		*tid_list;
	struct intlist		*pid_list;
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;	/* tool self-statistics, see show_tool_stats */
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier is a deny list */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;
};
107
/*
 * Accessor for one field of a tracepoint's raw payload: the byte offset
 * plus a reader that decodes it either as an integer or as a pointer
 * into the payload (the union reflects that exactly one applies).
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
115
/*
 * Generate tp_field__u{8,16,32,64}(): read a host-endian unsigned
 * integer of the given width from the raw payload at field->offset.
 * memcpy() is used because raw_data + offset may be unaligned.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
128
/*
 * As TP_UINT_FIELD(), but byte-swap the value — used when the recorded
 * data's endianness differs from the host's (evsel->needs_swap).
 * No 8-bit variant: a single byte has nothing to swap.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
140
141 static int tp_field__init_uint(struct tp_field *field,
142                                struct format_field *format_field,
143                                bool needs_swap)
144 {
145         field->offset = format_field->offset;
146
147         switch (format_field->size) {
148         case 1:
149                 field->integer = tp_field__u8;
150                 break;
151         case 2:
152                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
153                 break;
154         case 4:
155                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
156                 break;
157         case 8:
158                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
159                 break;
160         default:
161                 return -1;
162         }
163
164         return 0;
165 }
166
167 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
168 {
169         return sample->raw_data + field->offset;
170 }
171
172 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
173 {
174         field->offset = format_field->offset;
175         field->pointer = tp_field__ptr;
176         return 0;
177 }
178
/*
 * Per-evsel private data for the raw_syscalls tracepoints: the syscall
 * id field plus either the args array (sys_enter) or the return value
 * (sys_exit) — hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
185
186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
187                                           struct tp_field *field,
188                                           const char *name)
189 {
190         struct format_field *format_field = perf_evsel__field(evsel, name);
191
192         if (format_field == NULL)
193                 return -1;
194
195         return tp_field__init_uint(field, format_field, evsel->needs_swap);
196 }
197
/*
 * Initialize the syscall_tp member called #name from the evsel's
 * tracepoint format; evaluates to the init function's return value.
 * Assumes evsel->priv already points to a struct syscall_tp.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
201
/*
 * Look up the tracepoint field called @name on @evsel and initialize
 * @field as a pointer reader for it.  Returns 0 on success, -1 when
 * the field does not exist in the format.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
213
/*
 * Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field().
 * Assumes evsel->priv already points to a struct syscall_tp.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
217
/* Free the evsel's priv (its syscall_tp) and then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);	/* free + NULL the pointer before the evsel goes away */
	perf_evsel__delete(evsel);
}
223
224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
225 {
226         evsel->priv = malloc(sizeof(struct syscall_tp));
227         if (evsel->priv != NULL) {
228                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
229                         goto out_delete;
230
231                 evsel->handler = handler;
232                 return 0;
233         }
234
235         return -ENOMEM;
236
237 out_delete:
238         zfree(&evsel->priv);
239         return -ENOENT;
240 }
241
/*
 * Create an evsel for the "raw_syscalls" tracepoint named @direction
 * (sys_enter/sys_exit), falling back to the legacy "syscalls" subsystem
 * for older kernels, then attach @handler via a syscall_tp priv.
 * Returns NULL on any failure.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	/* frees both the priv and the evsel */
	perf_evsel__delete_priv(evsel);
	return NULL;
}
262
/* Decode the integer field @name from @sample via the evsel's syscall_tp. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Decode the pointer field @name from @sample via the evsel's syscall_tp. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
270
/*
 * Context handed to each argument beautifier: the raw argument value,
 * the owning thread/trace, an optional formatter parameter (e.g. a
 * strarray), the argument index and a bit mask — presumably of args the
 * formatter consumes/suppresses; confirm against its users.
 */
struct syscall_arg {
	unsigned long val;
	struct thread *thread;
	struct trace  *trace;
	void          *parm;
	u8            idx;
	u8            mask;
};
279
/*
 * Table mapping integer values (minus @offset) to constant names,
 * consumed by the SCA_STRARRAY beautifier.
 */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};

/* Define strarray__<array> over an existing string table starting at 0. */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/* As DEFINE_STRARRAY, but the table's first entry maps to value @off. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset     = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
296
297 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
298                                                 const char *intfmt,
299                                                 struct syscall_arg *arg)
300 {
301         struct strarray *sa = arg->parm;
302         int idx = arg->val - sa->offset;
303
304         if (idx < 0 || idx >= sa->nr_entries)
305                 return scnprintf(bf, size, intfmt, arg->val);
306
307         return scnprintf(bf, size, "%s", sa->entries[idx]);
308 }
309
/* Default strarray beautifier: decimal fallback for out-of-table values. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
317
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 *        gets rewritten to support all arches.
 */
/* Strarray beautifier with a hex fallback — used for ioctl cmd values. */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
331
/* Forward declaration: fd beautifier, defined later in this file. */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd

#ifndef AT_FDCWD
#define AT_FDCWD	-100	/* fcntl.h sentinel: *at() relative to cwd */
#endif
340
341 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
342                                            struct syscall_arg *arg)
343 {
344         int fd = arg->val;
345
346         if (fd == AT_FDCWD)
347                 return scnprintf(bf, size, "CWD");
348
349         return syscall_arg__scnprintf_fd(bf, size, arg);
350 }
351
352 #define SCA_FDAT syscall_arg__scnprintf_fd_at
353
/* Forward declaration: close(fd) beautifier, defined later in this file. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
358
/* Print the argument as a hex constant — used for addresses/opaque values. */
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex
366
367 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
368                                          struct syscall_arg *arg)
369 {
370         return scnprintf(bf, size, "%d", arg->val);
371 }
372
373 #define SCA_INT syscall_arg__scnprintf_int
374
/*
 * String tables for SCA_STRARRAY beautifiers: index -> constant name.
 * Each must follow the kernel ABI's numbering for the corresponding
 * enum/define set.
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* EPOLL_CTL_ADD is 1, hence the offset */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek(2) whence values; SEEK_DATA/SEEK_HOLE only where headers have them */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
440
441 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
442                                                  struct syscall_arg *arg)
443 {
444         size_t printed = 0;
445         int mode = arg->val;
446
447         if (mode == F_OK) /* 0 */
448                 return scnprintf(bf, size, "F");
449 #define P_MODE(n) \
450         if (mode & n##_OK) { \
451                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
452                 mode &= ~n##_OK; \
453         }
454
455         P_MODE(R);
456         P_MODE(W);
457         P_MODE(X);
458 #undef P_MODE
459
460         if (mode)
461                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
462
463         return printed;
464 }
465
466 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
467
/* Forward declaration: filename beautifier, defined later in this file. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
472
/*
 * Beautify pipe2(2)'s flags: known O_* bits by name, '|'-joined, with
 * any leftover bits dumped in hex.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

/* Append #n and clear the bit if O_##n is set; '|' only between items. */
#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
495
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
/* Base of the tty ioctl number range; tioctls[] is indexed from it. */
#define TCGETS		0x5401

/*
 * tty ioctl cmd -> name, sparse where the number space has gaps
 * (designated initializers resume at 0x27, 0x50, 0x60 offsets).
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */
522
/* getrandom(2) flag values, provided here for older system headers. */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

/*
 * Beautify getrandom(2)'s flags: known GRND_* bits by name, '|'-joined,
 * with any leftover bits dumped in hex.
 */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

/* Append #n and clear the bit if GRND_##n is set; '|' only between items. */
#define P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
552
/*
 * Initializer fragment for syscall_fmt entries: print argument #arg via
 * the named strarray table.  @name is not expanded — it only documents
 * the argument at the call site.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm      = { [arg] = &strarray__##array, }
556
557 #include "trace/beauty/eventfd.c"
558 #include "trace/beauty/flock.c"
559 #include "trace/beauty/futex_op.c"
560 #include "trace/beauty/mmap.c"
561 #include "trace/beauty/mode_t.c"
562 #include "trace/beauty/msg_flags.c"
563 #include "trace/beauty/open_flags.c"
564 #include "trace/beauty/perf_event_open.c"
565 #include "trace/beauty/pid.c"
566 #include "trace/beauty/sched_policy.c"
567 #include "trace/beauty/seccomp.c"
568 #include "trace/beauty/signum.c"
569 #include "trace/beauty/socket_type.c"
570 #include "trace/beauty/waitid_options.c"
571
572 static struct syscall_fmt {
573         const char *name;
574         const char *alias;
575         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
576         void       *arg_parm[6];
577         bool       errmsg;
578         bool       errpid;
579         bool       timeout;
580         bool       hexret;
581 } syscall_fmts[] = {
582         { .name     = "access",     .errmsg = true,
583           .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
584         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
585         { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
586         { .name     = "brk",        .hexret = true,
587           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
588         { .name     = "chdir",      .errmsg = true, },
589         { .name     = "chmod",      .errmsg = true, },
590         { .name     = "chroot",     .errmsg = true, },
591         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
592         { .name     = "clone",      .errpid = true, },
593         { .name     = "close",      .errmsg = true,
594           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
595         { .name     = "connect",    .errmsg = true, },
596         { .name     = "creat",      .errmsg = true, },
597         { .name     = "dup",        .errmsg = true, },
598         { .name     = "dup2",       .errmsg = true, },
599         { .name     = "dup3",       .errmsg = true, },
600         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
601         { .name     = "eventfd2",   .errmsg = true,
602           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
603         { .name     = "faccessat",  .errmsg = true, },
604         { .name     = "fadvise64",  .errmsg = true, },
605         { .name     = "fallocate",  .errmsg = true, },
606         { .name     = "fchdir",     .errmsg = true, },
607         { .name     = "fchmod",     .errmsg = true, },
608         { .name     = "fchmodat",   .errmsg = true,
609           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
610         { .name     = "fchown",     .errmsg = true, },
611         { .name     = "fchownat",   .errmsg = true,
612           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
613         { .name     = "fcntl",      .errmsg = true,
614           .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
615           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
616         { .name     = "fdatasync",  .errmsg = true, },
617         { .name     = "flock",      .errmsg = true,
618           .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
619         { .name     = "fsetxattr",  .errmsg = true, },
620         { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
621         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
622         { .name     = "fstatfs",    .errmsg = true, },
623         { .name     = "fsync",    .errmsg = true, },
624         { .name     = "ftruncate", .errmsg = true, },
625         { .name     = "futex",      .errmsg = true,
626           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
627         { .name     = "futimesat", .errmsg = true,
628           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
629         { .name     = "getdents",   .errmsg = true, },
630         { .name     = "getdents64", .errmsg = true, },
631         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
632         { .name     = "getpid",     .errpid = true, },
633         { .name     = "getpgid",    .errpid = true, },
634         { .name     = "getppid",    .errpid = true, },
635         { .name     = "getrandom",  .errmsg = true,
636           .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
637         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
638         { .name     = "getxattr",   .errmsg = true, },
639         { .name     = "inotify_add_watch",          .errmsg = true, },
640         { .name     = "ioctl",      .errmsg = true,
641           .arg_scnprintf = {
642 #if defined(__i386__) || defined(__x86_64__)
643 /*
644  * FIXME: Make this available to all arches.
645  */
646                              [1] = SCA_STRHEXARRAY, /* cmd */
647                              [2] = SCA_HEX, /* arg */ },
648           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
649 #else
650                              [2] = SCA_HEX, /* arg */ }, },
651 #endif
652         { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
653         { .name     = "kill",       .errmsg = true,
654           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
655         { .name     = "lchown",    .errmsg = true, },
656         { .name     = "lgetxattr",  .errmsg = true, },
657         { .name     = "linkat",     .errmsg = true,
658           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
659         { .name     = "listxattr",  .errmsg = true, },
660         { .name     = "llistxattr", .errmsg = true, },
661         { .name     = "lremovexattr",  .errmsg = true, },
662         { .name     = "lseek",      .errmsg = true,
663           .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
664           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
665         { .name     = "lsetxattr",  .errmsg = true, },
666         { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
667         { .name     = "lsxattr",    .errmsg = true, },
668         { .name     = "madvise",    .errmsg = true,
669           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
670                              [2] = SCA_MADV_BHV, /* behavior */ }, },
671         { .name     = "mkdir",    .errmsg = true, },
672         { .name     = "mkdirat",    .errmsg = true,
673           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
674         { .name     = "mknod",      .errmsg = true, },
675         { .name     = "mknodat",    .errmsg = true,
676           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
677         { .name     = "mlock",      .errmsg = true,
678           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
679         { .name     = "mlockall",   .errmsg = true,
680           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
681         { .name     = "mmap",       .hexret = true,
682           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
683                              [2] = SCA_MMAP_PROT, /* prot */
684                              [3] = SCA_MMAP_FLAGS, /* flags */ }, },
685         { .name     = "mprotect",   .errmsg = true,
686           .arg_scnprintf = { [0] = SCA_HEX, /* start */
687                              [2] = SCA_MMAP_PROT, /* prot */ }, },
688         { .name     = "mq_unlink", .errmsg = true,
689           .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
690         { .name     = "mremap",     .hexret = true,
691           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
692                              [3] = SCA_MREMAP_FLAGS, /* flags */
693                              [4] = SCA_HEX, /* new_addr */ }, },
694         { .name     = "munlock",    .errmsg = true,
695           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
696         { .name     = "munmap",     .errmsg = true,
697           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
698         { .name     = "name_to_handle_at", .errmsg = true,
699           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
700         { .name     = "newfstatat", .errmsg = true,
701           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
702         { .name     = "open",       .errmsg = true,
703           .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
704         { .name     = "open_by_handle_at", .errmsg = true,
705           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
706                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
707         { .name     = "openat",     .errmsg = true,
708           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
709                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
710         { .name     = "perf_event_open", .errmsg = true,
711           .arg_scnprintf = { [2] = SCA_INT, /* cpu */
712                              [3] = SCA_FD,  /* group_fd */
713                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
714         { .name     = "pipe2",      .errmsg = true,
715           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
716         { .name     = "poll",       .errmsg = true, .timeout = true, },
717         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
718         { .name     = "pread",      .errmsg = true, .alias = "pread64", },
719         { .name     = "preadv",     .errmsg = true, .alias = "pread", },
720         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
721         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
722         { .name     = "pwritev",    .errmsg = true, },
723         { .name     = "read",       .errmsg = true, },
724         { .name     = "readlink",   .errmsg = true, },
725         { .name     = "readlinkat", .errmsg = true,
726           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
727         { .name     = "readv",      .errmsg = true, },
728         { .name     = "recvfrom",   .errmsg = true,
729           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
730         { .name     = "recvmmsg",   .errmsg = true,
731           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
732         { .name     = "recvmsg",    .errmsg = true,
733           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
734         { .name     = "removexattr", .errmsg = true, },
735         { .name     = "renameat",   .errmsg = true,
736           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
737         { .name     = "rmdir",    .errmsg = true, },
738         { .name     = "rt_sigaction", .errmsg = true,
739           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
740         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
741         { .name     = "rt_sigqueueinfo", .errmsg = true,
742           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
743         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
744           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
745         { .name     = "sched_setscheduler",   .errmsg = true,
746           .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
747         { .name     = "seccomp", .errmsg = true,
748           .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
749                              [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
750         { .name     = "select",     .errmsg = true, .timeout = true, },
751         { .name     = "sendmmsg",    .errmsg = true,
752           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
753         { .name     = "sendmsg",    .errmsg = true,
754           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
755         { .name     = "sendto",     .errmsg = true,
756           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
757         { .name     = "set_tid_address", .errpid = true, },
758         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
759         { .name     = "setpgid",    .errmsg = true, },
760         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
761         { .name     = "setxattr",   .errmsg = true, },
762         { .name     = "shutdown",   .errmsg = true, },
763         { .name     = "socket",     .errmsg = true,
764           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
765                              [1] = SCA_SK_TYPE, /* type */ },
766           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
767         { .name     = "socketpair", .errmsg = true,
768           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
769                              [1] = SCA_SK_TYPE, /* type */ },
770           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
771         { .name     = "stat",       .errmsg = true, .alias = "newstat", },
772         { .name     = "statfs",     .errmsg = true, },
773         { .name     = "swapoff",    .errmsg = true,
774           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
775         { .name     = "swapon",     .errmsg = true,
776           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
777         { .name     = "symlinkat",  .errmsg = true,
778           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
779         { .name     = "tgkill",     .errmsg = true,
780           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
781         { .name     = "tkill",      .errmsg = true,
782           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
783         { .name     = "truncate",   .errmsg = true, },
784         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
785         { .name     = "unlinkat",   .errmsg = true,
786           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
787         { .name     = "utime",  .errmsg = true, },
788         { .name     = "utimensat",  .errmsg = true,
789           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
790         { .name     = "utimes",  .errmsg = true, },
791         { .name     = "vmsplice",  .errmsg = true, },
792         { .name     = "wait4",      .errpid = true,
793           .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
794         { .name     = "waitid",     .errpid = true,
795           .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
796         { .name     = "write",      .errmsg = true, },
797         { .name     = "writev",     .errmsg = true, },
798 };
799
800 static int syscall_fmt__cmp(const void *name, const void *fmtp)
801 {
802         const struct syscall_fmt *fmt = fmtp;
803         return strcmp(name, fmt->name);
804 }
805
806 static struct syscall_fmt *syscall_fmt__find(const char *name)
807 {
808         const int nmemb = ARRAY_SIZE(syscall_fmts);
809         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
810 }
811
/*
 * Per-syscall state: tracepoint format, argument description and the
 * per-argument pretty-printers resolved by syscall__set_arg_fmts().
 */
struct syscall {
	struct event_format *tp_format;	/* syscalls:sys_enter_NAME tracepoint format */
	int                 nr_args;	/* number of entries in 'args' (syscall nr field dropped) */
	struct format_field *args;	/* first real argument field, '__syscall_nr'/'nr' skipped */
	const char          *name;	/* canonical name from the syscall table */
	bool                is_exit;	/* exit/exit_group: no matching sys_exit will arrive */
	struct syscall_fmt  *fmt;	/* optional overrides from syscall_fmts[], may be NULL */
	/* per-argument scnprintf-like formatter; NULL entry means default "%ld" */
	size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void                **arg_parm;	/* opaque parm handed to the matching arg_scnprintf */
};
822
823 static size_t fprintf_duration(unsigned long t, FILE *fp)
824 {
825         double duration = (double)t / NSEC_PER_MSEC;
826         size_t printed = fprintf(fp, "(");
827
828         if (duration >= 1.0)
829                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
830         else if (duration >= 0.01)
831                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
832         else
833                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
834         return printed + fprintf(fp, "): ");
835 }
836
837 /**
838  * filename.ptr: The filename char pointer that will be vfs_getname'd
839  * filename.entry_str_pos: Where to insert the string translated from
840  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
841  */
struct thread_trace {
	u64               entry_time;	/* sample->time of the last sys_enter */
	u64               exit_time;	/* sample->time of the last sys_exit */
	bool              entry_pending;	/* entry_str formatted, waiting for sys_exit to print it */
	unsigned long     nr_events;	/* events seen for this thread (bumped in thread__trace) */
	unsigned long     pfmaj, pfmin;	/* page fault counters, cf. TRACE_PFMAJ/TRACE_PFMIN */
	char              *entry_str;	/* lazily allocated, trace__entry_str_size bytes */
	double            runtime_ms;	/* accumulated runtime — updated outside this chunk */
	struct {
		unsigned long ptr;		/* filename pointer waiting to be vfs_getname'd */
		short int     entry_str_pos;	/* where in entry_str to splice the resolved name */
		bool          pending_open;	/* set between vfs_getname and the open's sys_exit */
		unsigned int  namelen;
		char          *name;		/* last resolved filename */
	} filename;
	struct {
		int       max;		/* highest fd slot in table, -1 when empty */
		char      **table;	/* fd -> strdup'd pathname, NULL when unknown */
	} paths;

	struct intlist *syscall_stats;	/* syscall id -> struct stats (see thread__update_stats) */
};
864
865 static struct thread_trace *thread_trace__new(void)
866 {
867         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
868
869         if (ttrace)
870                 ttrace->paths.max = -1;
871
872         ttrace->syscall_stats = intlist__new(NULL);
873
874         return ttrace;
875 }
876
877 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
878 {
879         struct thread_trace *ttrace;
880
881         if (thread == NULL)
882                 goto fail;
883
884         if (thread__priv(thread) == NULL)
885                 thread__set_priv(thread, thread_trace__new());
886
887         if (thread__priv(thread) == NULL)
888                 goto fail;
889
890         ttrace = thread__priv(thread);
891         ++ttrace->nr_events;
892
893         return ttrace;
894 fail:
895         color_fprintf(fp, PERF_COLOR_RED,
896                       "WARNING: not enough memory, dropping samples!\n");
897         return NULL;
898 }
899
#define TRACE_PFMAJ		(1 << 0)	/* trace major page faults (cf. thread_trace->pfmaj) */
#define TRACE_PFMIN		(1 << 1)	/* trace minor page faults (cf. thread_trace->pfmin) */

/* Size of the per-thread buffer that syscall entry lines are formatted into */
static const size_t trace__entry_str_size = 2048;
904
905 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
906 {
907         struct thread_trace *ttrace = thread__priv(thread);
908
909         if (fd > ttrace->paths.max) {
910                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
911
912                 if (npath == NULL)
913                         return -1;
914
915                 if (ttrace->paths.max != -1) {
916                         memset(npath + ttrace->paths.max + 1, 0,
917                                (fd - ttrace->paths.max) * sizeof(char *));
918                 } else {
919                         memset(npath, 0, (fd + 1) * sizeof(char *));
920                 }
921
922                 ttrace->paths.table = npath;
923                 ttrace->paths.max   = fd;
924         }
925
926         ttrace->paths.table[fd] = strdup(pathname);
927
928         return ttrace->paths.table[fd] != NULL ? 0 : -1;
929 }
930
931 static int thread__read_fd_path(struct thread *thread, int fd)
932 {
933         char linkname[PATH_MAX], pathname[PATH_MAX];
934         struct stat st;
935         int ret;
936
937         if (thread->pid_ == thread->tid) {
938                 scnprintf(linkname, sizeof(linkname),
939                           "/proc/%d/fd/%d", thread->pid_, fd);
940         } else {
941                 scnprintf(linkname, sizeof(linkname),
942                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
943         }
944
945         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
946                 return -1;
947
948         ret = readlink(linkname, pathname, sizeof(pathname));
949
950         if (ret < 0 || ret > st.st_size)
951                 return -1;
952
953         pathname[ret] = '\0';
954         return trace__set_fd_pathname(thread, fd, pathname);
955 }
956
957 static const char *thread__fd_path(struct thread *thread, int fd,
958                                    struct trace *trace)
959 {
960         struct thread_trace *ttrace = thread__priv(thread);
961
962         if (ttrace == NULL)
963                 return NULL;
964
965         if (fd < 0)
966                 return NULL;
967
968         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
969                 if (!trace->live)
970                         return NULL;
971                 ++trace->stats.proc_getname;
972                 if (thread__read_fd_path(thread, fd))
973                         return NULL;
974         }
975
976         return ttrace->paths.table[fd];
977 }
978
979 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
980                                         struct syscall_arg *arg)
981 {
982         int fd = arg->val;
983         size_t printed = scnprintf(bf, size, "%d", fd);
984         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
985
986         if (path)
987                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
988
989         return printed;
990 }
991
992 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
993                                               struct syscall_arg *arg)
994 {
995         int fd = arg->val;
996         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
997         struct thread_trace *ttrace = thread__priv(arg->thread);
998
999         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1000                 zfree(&ttrace->paths.table[fd]);
1001
1002         return printed;
1003 }
1004
1005 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1006                                      unsigned long ptr)
1007 {
1008         struct thread_trace *ttrace = thread__priv(thread);
1009
1010         ttrace->filename.ptr = ptr;
1011         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1012 }
1013
1014 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1015                                               struct syscall_arg *arg)
1016 {
1017         unsigned long ptr = arg->val;
1018
1019         if (!arg->trace->vfs_getname)
1020                 return scnprintf(bf, size, "%#x", ptr);
1021
1022         thread__set_filename_pos(arg->thread, bf, ptr);
1023         return 0;
1024 }
1025
1026 static bool trace__filter_duration(struct trace *trace, double t)
1027 {
1028         return t < (trace->duration_filter * NSEC_PER_MSEC);
1029 }
1030
1031 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1032 {
1033         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1034
1035         return fprintf(fp, "%10.3f ", ts);
1036 }
1037
/*
 * Set from the signal handler, polled from the main loop.  C11 only
 * guarantees well-defined access from a signal handler for objects of
 * type 'volatile sig_atomic_t' (CERT SIG31-C); plain non-volatile bool
 * flags may be cached in a register and the update lost.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;	/* distinguish ^C from e.g. SIGCHLD/SIGTERM */
}
1046
1047 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1048                                         u64 duration, u64 tstamp, FILE *fp)
1049 {
1050         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1051         printed += fprintf_duration(duration, fp);
1052
1053         if (trace->multiple_threads) {
1054                 if (trace->show_comm)
1055                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1056                 printed += fprintf(fp, "%d ", thread->tid);
1057         }
1058
1059         return printed;
1060 }
1061
1062 static int trace__process_event(struct trace *trace, struct machine *machine,
1063                                 union perf_event *event, struct perf_sample *sample)
1064 {
1065         int ret = 0;
1066
1067         switch (event->header.type) {
1068         case PERF_RECORD_LOST:
1069                 color_fprintf(trace->output, PERF_COLOR_RED,
1070                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1071                 ret = machine__process_lost_event(machine, event, sample);
1072                 break;
1073         default:
1074                 ret = machine__process_event(machine, event, sample);
1075                 break;
1076         }
1077
1078         return ret;
1079 }
1080
1081 static int trace__tool_process(struct perf_tool *tool,
1082                                union perf_event *event,
1083                                struct perf_sample *sample,
1084                                struct machine *machine)
1085 {
1086         struct trace *trace = container_of(tool, struct trace, tool);
1087         return trace__process_event(trace, machine, event, sample);
1088 }
1089
1090 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1091 {
1092         struct machine *machine = vmachine;
1093
1094         if (machine->kptr_restrict_warned)
1095                 return NULL;
1096
1097         if (symbol_conf.kptr_restrict) {
1098                 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1099                            "Check /proc/sys/kernel/kptr_restrict.\n\n"
1100                            "Kernel samples will not be resolved.\n");
1101                 machine->kptr_restrict_warned = true;
1102                 return NULL;
1103         }
1104
1105         return machine__resolve_kernel_addr(vmachine, addrp, modp);
1106 }
1107
/*
 * Initialize symbol resolution and the host machine representation,
 * synthesizing records for pre-existing threads so their maps resolve.
 * Returns 0 on success, a negative errno-style value on failure.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	/* Route kernel addresses through the once-only kptr_restrict warning */
	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
		return -errno;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout);
	if (err)
		symbol__exit();	/* undo symbol__init() on failure */

	return err;
}
1130
/*
 * Allocate and fill sc->arg_scnprintf, choosing a pretty-printer for each
 * tracepoint argument: explicit overrides from syscall_fmts[] win, then
 * heuristics on the field's type and name kick in.  Unmatched entries
 * stay NULL (calloc), making the caller fall back to plain "%ld".
 * Returns 0 on success, -1 on allocation failure.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
	if (sc->arg_scnprintf == NULL)
		return -1;

	if (sc->fmt)
		sc->arg_parm = sc->fmt->arg_parm;

	for (field = sc->args; field; field = field->next) {
		/* 1st: explicit per-syscall override from the fmt table */
		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
		/* 2nd: well-known filename-carrying argument names */
		else if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_scnprintf[idx] = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_scnprintf[idx] = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_scnprintf[idx] = SCA_MODE_T;
		/* last: integer fields whose name ends in "fd" are file descriptors */
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_scnprintf[idx] = SCA_FD;
		}
		++idx;
	}

	return 0;
}
1176
/*
 * Lazily fill trace->syscalls.table[id]: resolve the syscall name, look
 * up formatting overrides and read the syscalls:sys_enter_NAME tracepoint
 * format (trying the alias, e.g. stat -> newstat, when the primary name
 * has no tracepoint).  Returns 0 on success, -1 when the id is unknown or
 * the format can't be read.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	/* Grow the table on demand, zeroing the slots between old max and id */
	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Retry with the alias when the canonical name has no tracepoint */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	sc->nr_args = sc->tp_format->format.nr_fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop the '__syscall_nr' or 'nr' field, which does not exist on
	 * older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1235
/*
 * Translate the event qualifier strlist (syscall names given with -e)
 * into syscall ids in trace->ev_qualifier_ids.  All invalid names are
 * collected and reported in one message; on error -EINVAL is returned
 * and the ids array is freed.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc);

		if (id < 0) {
			/* Keep iterating so every bad name lands in one report */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}

		trace->ev_qualifier_ids.entries[i++] = id;
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1281
1282 /*
1283  * args is to be interpreted as a series of longs but we need to handle
1284  * 8-byte unaligned accesses. args points to raw_data within the event
1285  * and raw_data is guaranteed to be 8-byte unaligned because it is
1286  * preceded by raw_size which is a u32. So we need to copy args to a temp
1287  * variable to read it. Most notably this avoids extended load instructions
1288  * on unaligned addresses
1289  */
1290
/*
 * Format the syscall's arguments into bf, using the per-argument
 * pretty-printers when available.  'args' points into the event's
 * raw_data, which is 8-byte unaligned — see the comment above about
 * copying each value out before use.  Returns chars printed.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned char *p;
	unsigned long val;

	if (sc->args != NULL) {
		struct format_field *field;
		u8 bit = 1;
		struct syscall_arg arg = {
			.idx	= 0,
			.mask	= 0,
			.trace  = trace,
			.thread = thread,
		};

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* Skip args already consumed by an earlier pretty-printer */
			if (arg.mask & bit)
				continue;

			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * arg.idx;
			memcpy(&val, p, sizeof(val));

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_scnprintf &&
			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
			      sc->arg_parm[arg.idx]))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
				arg.val = val;
				if (sc->arg_parm)
					arg.parm = sc->arg_parm[arg.idx];
				printed += sc->arg_scnprintf[arg.idx](bf + printed,
								      size - printed, &arg);
			} else {
				printed += scnprintf(bf + printed, size - printed,
						     "%ld", val);
			}
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		int i = 0;

		while (i < 6) {
			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * i;
			memcpy(&val, p, sizeof(val));
			printed += scnprintf(bf + printed, size - printed,
					     "%sarg%d: %ld",
					     printed ? ", " : "", i, val);
			++i;
		}
	}

	return printed;
}
1363
/* Signature of the per-tracepoint sample handlers (trace__sys_enter & co) */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1367
/*
 * Return the struct syscall for 'id', reading its info on first use.
 * Returns NULL for the bogus id -1 sometimes seen on sys_exit, or when
 * the syscall info can't be read (reported when -v is given).
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* First encounter: grow the table and read this syscall's info */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Re-check: trace__read_syscall_info() may have failed part-way */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1410
1411 static void thread__update_stats(struct thread_trace *ttrace,
1412                                  int id, struct perf_sample *sample)
1413 {
1414         struct int_node *inode;
1415         struct stats *stats;
1416         u64 duration = 0;
1417
1418         inode = intlist__findnew(ttrace->syscall_stats, id);
1419         if (inode == NULL)
1420                 return;
1421
1422         stats = inode->priv;
1423         if (stats == NULL) {
1424                 stats = malloc(sizeof(struct stats));
1425                 if (stats == NULL)
1426                         return;
1427                 init_stats(stats);
1428                 inode->priv = stats;
1429         }
1430
1431         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1432                 duration = sample->time - ttrace->entry_time;
1433
1434         update_stats(stats, duration);
1435 }
1436
1437 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1438 {
1439         struct thread_trace *ttrace;
1440         u64 duration;
1441         size_t printed;
1442
1443         if (trace->current == NULL)
1444                 return 0;
1445
1446         ttrace = thread__priv(trace->current);
1447
1448         if (!ttrace->entry_pending)
1449                 return 0;
1450
1451         duration = sample->time - ttrace->entry_time;
1452
1453         printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1454         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1455         ttrace->entry_pending = false;
1456
1457         return printed;
1458 }
1459
/*
 * raw_syscalls:sys_enter handler: format the syscall and its arguments
 * into the thread's entry_str.  For most syscalls printing is deferred
 * until sys_exit so entry and return value share one line; exit-like
 * syscalls (exit, exit_group) are printed immediately since no sys_exit
 * will follow.  Returns 0 on success, -1 on failure.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the buffer the entry line is formatted into */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/* Flush a still-pending entry from another syscall, unless filtering */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the last thread seen, keeping a reference on it */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1518
1519 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1520                                     struct perf_sample *sample,
1521                                     struct callchain_cursor *cursor)
1522 {
1523         struct addr_location al;
1524
1525         if (machine__resolve(trace->host, &al, sample) < 0 ||
1526             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1527                 return -1;
1528
1529         return 0;
1530 }
1531
1532 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1533 {
1534         /* TODO: user-configurable print_opts */
1535         const unsigned int print_opts = EVSEL__PRINT_SYM |
1536                                         EVSEL__PRINT_DSO |
1537                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1538
1539         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1540 }
1541
/*
 * raw_syscalls:sys_exit handler: pairs with the sys_enter handler to print
 * the completed syscall line, format its return value and, optionally, its
 * callchain.  Returns 0 on success, -1 when the syscall id or per-thread
 * state can't be obtained.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/*
	 * A successful open() whose pathname was captured by the
	 * probe:vfs_getname handler: remember fd -> pathname.
	 */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	ttrace->exit_time = sample->time;

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
	} else if (trace->duration_filter)
		goto out; /* no entry seen: can't compute a duration to filter on */

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Honour --min-stack: skip events with shallow stacks. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* Entry line was already emitted (presumably interrupted by
		 * another thread's output); mark this one as a continuation. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/* Format the return value according to the syscall's fmt hints. */
	if (sc->fmt == NULL) {
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid (e.g. wait-style syscalls): show its comm.
		 * NOTE(review): if the pid can't be found nothing is printed for
		 * the return value — confirm whether that is intended. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1641
/*
 * probe:vfs_getname handler: captures the pathname being resolved by the
 * kernel and (a) stashes it per-thread so sys_exit can associate it with the
 * returned fd, and (b) splices it into the pending sys_enter line at the
 * position recorded in ttrace->filename.entry_str_pos.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out;

	filename_len = strlen(filename);

	/* Grow the per-thread name buffer if this pathname doesn't fit. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
				goto out;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	/* Consumed at sys_exit time (see trace__sys_exit). */
	ttrace->filename.pending_open = true;

	/* No pending entry line waiting for a pathname: nothing to splice. */
	if (!ttrace->filename.ptr)
		goto out;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out;

	/* If the pathname is too long, keep only its trailing part. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at entry_str_pos and copy the pathname into it. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	/* Splice done: reset the rendezvous state. */
	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out:
	return 0;
}
1698
1699 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1700                                      union perf_event *event __maybe_unused,
1701                                      struct perf_sample *sample)
1702 {
1703         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1704         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1705         struct thread *thread = machine__findnew_thread(trace->host,
1706                                                         sample->pid,
1707                                                         sample->tid);
1708         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1709
1710         if (ttrace == NULL)
1711                 goto out_dump;
1712
1713         ttrace->runtime_ms += runtime_ms;
1714         trace->runtime_ms += runtime_ms;
1715         thread__put(thread);
1716         return 0;
1717
1718 out_dump:
1719         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1720                evsel->name,
1721                perf_evsel__strval(evsel, sample, "comm"),
1722                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1723                runtime,
1724                perf_evsel__intval(evsel, sample, "vruntime"));
1725         thread__put(thread);
1726         return 0;
1727 }
1728
1729 static void bpf_output__printer(enum binary_printer_ops op,
1730                                 unsigned int val, void *extra)
1731 {
1732         FILE *output = extra;
1733         unsigned char ch = (unsigned char)val;
1734
1735         switch (op) {
1736         case BINARY_PRINT_CHAR_DATA:
1737                 fprintf(output, "%c", isprint(ch) ? ch : '.');
1738                 break;
1739         case BINARY_PRINT_DATA_BEGIN:
1740         case BINARY_PRINT_LINE_BEGIN:
1741         case BINARY_PRINT_ADDR:
1742         case BINARY_PRINT_NUM_DATA:
1743         case BINARY_PRINT_NUM_PAD:
1744         case BINARY_PRINT_SEP:
1745         case BINARY_PRINT_CHAR_PAD:
1746         case BINARY_PRINT_LINE_END:
1747         case BINARY_PRINT_DATA_END:
1748         default:
1749                 break;
1750         }
1751 }
1752
1753 static void bpf_output__fprintf(struct trace *trace,
1754                                 struct perf_sample *sample)
1755 {
1756         print_binary(sample->raw_data, sample->raw_size, 8,
1757                      bpf_output__printer, trace->output);
1758 }
1759
1760 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1761                                 union perf_event *event __maybe_unused,
1762                                 struct perf_sample *sample)
1763 {
1764         int callchain_ret = 0;
1765
1766         if (sample->callchain) {
1767                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1768                 if (callchain_ret == 0) {
1769                         if (callchain_cursor.nr < trace->min_stack)
1770                                 goto out;
1771                         callchain_ret = 1;
1772                 }
1773         }
1774
1775         trace__printf_interrupted_entry(trace, sample);
1776         trace__fprintf_tstamp(trace, sample->time, trace->output);
1777
1778         if (trace->trace_syscalls)
1779                 fprintf(trace->output, "(         ): ");
1780
1781         fprintf(trace->output, "%s:", evsel->name);
1782
1783         if (perf_evsel__is_bpf_output(evsel)) {
1784                 bpf_output__fprintf(trace, sample);
1785         } else if (evsel->tp_format) {
1786                 event_format__fprintf(evsel->tp_format, sample->cpu,
1787                                       sample->raw_data, sample->raw_size,
1788                                       trace->output);
1789         }
1790
1791         fprintf(trace->output, ")\n");
1792
1793         if (callchain_ret > 0)
1794                 trace__fprintf_callchain(trace, sample);
1795         else if (callchain_ret < 0)
1796                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1797 out:
1798         return 0;
1799 }
1800
1801 static void print_location(FILE *f, struct perf_sample *sample,
1802                            struct addr_location *al,
1803                            bool print_dso, bool print_sym)
1804 {
1805
1806         if ((verbose || print_dso) && al->map)
1807                 fprintf(f, "%s@", al->map->dso->long_name);
1808
1809         if ((verbose || print_sym) && al->sym)
1810                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1811                         al->addr - al->sym->start);
1812         else if (al->map)
1813                 fprintf(f, "0x%" PRIx64, al->addr);
1814         else
1815                 fprintf(f, "0x%" PRIx64, sample->addr);
1816 }
1817
/*
 * Software page-fault event handler: counts major/minor faults per thread
 * and, unless in --summary-only mode, prints the faulting code location and
 * the faulted address (resolved first as data, then as code).
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd'; /* 'd' data map, 'x' executable map, '?' unresolved */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Honour --min-stack: skip events with shallow stacks. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Where the faulting instruction lives... */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* ...and the faulted address: try data maps first, then code. */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1893
1894 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1895 {
1896         if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1897             (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1898                 return false;
1899
1900         if (trace->pid_list || trace->tid_list)
1901                 return true;
1902
1903         return false;
1904 }
1905
1906 static void trace__set_base_time(struct trace *trace,
1907                                  struct perf_evsel *evsel,
1908                                  struct perf_sample *sample)
1909 {
1910         /*
1911          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1912          * and don't use sample->time unconditionally, we may end up having
1913          * some other event in the future without PERF_SAMPLE_TIME for good
1914          * reason, i.e. we may not be interested in its timestamps, just in
1915          * it taking place, picking some piece of information when it
1916          * appears in our event stream (vfs_getname comes to mind).
1917          */
1918         if (trace->base_time == 0 && !trace->full_time &&
1919             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1920                 trace->base_time = sample->time;
1921 }
1922
1923 static int trace__process_sample(struct perf_tool *tool,
1924                                  union perf_event *event,
1925                                  struct perf_sample *sample,
1926                                  struct perf_evsel *evsel,
1927                                  struct machine *machine __maybe_unused)
1928 {
1929         struct trace *trace = container_of(tool, struct trace, tool);
1930         int err = 0;
1931
1932         tracepoint_handler handler = evsel->handler;
1933
1934         if (skip_sample(trace, sample))
1935                 return 0;
1936
1937         trace__set_base_time(trace, evsel, sample);
1938
1939         if (handler) {
1940                 ++trace->nr_events;
1941                 handler(trace, evsel, event, sample);
1942         }
1943
1944         return err;
1945 }
1946
1947 static int parse_target_str(struct trace *trace)
1948 {
1949         if (trace->opts.target.pid) {
1950                 trace->pid_list = intlist__new(trace->opts.target.pid);
1951                 if (trace->pid_list == NULL) {
1952                         pr_err("Error parsing process id string\n");
1953                         return -EINVAL;
1954                 }
1955         }
1956
1957         if (trace->opts.target.tid) {
1958                 trace->tid_list = intlist__new(trace->opts.target.tid);
1959                 if (trace->tid_list == NULL) {
1960                         pr_err("Error parsing thread id string\n");
1961                         return -EINVAL;
1962                 }
1963         }
1964
1965         return 0;
1966 }
1967
1968 static int trace__record(struct trace *trace, int argc, const char **argv)
1969 {
1970         unsigned int rec_argc, i, j;
1971         const char **rec_argv;
1972         const char * const record_args[] = {
1973                 "record",
1974                 "-R",
1975                 "-m", "1024",
1976                 "-c", "1",
1977         };
1978
1979         const char * const sc_args[] = { "-e", };
1980         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1981         const char * const majpf_args[] = { "-e", "major-faults" };
1982         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1983         const char * const minpf_args[] = { "-e", "minor-faults" };
1984         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1985
1986         /* +1 is for the event string below */
1987         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1988                 majpf_args_nr + minpf_args_nr + argc;
1989         rec_argv = calloc(rec_argc + 1, sizeof(char *));
1990
1991         if (rec_argv == NULL)
1992                 return -ENOMEM;
1993
1994         j = 0;
1995         for (i = 0; i < ARRAY_SIZE(record_args); i++)
1996                 rec_argv[j++] = record_args[i];
1997
1998         if (trace->trace_syscalls) {
1999                 for (i = 0; i < sc_args_nr; i++)
2000                         rec_argv[j++] = sc_args[i];
2001
2002                 /* event string may be different for older kernels - e.g., RHEL6 */
2003                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2004                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2005                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2006                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2007                 else {
2008                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2009                         return -1;
2010                 }
2011         }
2012
2013         if (trace->trace_pgfaults & TRACE_PFMAJ)
2014                 for (i = 0; i < majpf_args_nr; i++)
2015                         rec_argv[j++] = majpf_args[i];
2016
2017         if (trace->trace_pgfaults & TRACE_PFMIN)
2018                 for (i = 0; i < minpf_args_nr; i++)
2019                         rec_argv[j++] = minpf_args[i];
2020
2021         for (i = 0; i < (unsigned int)argc; i++)
2022                 rec_argv[j++] = argv[i];
2023
2024         return cmd_record(j, rec_argv, NULL);
2025 }
2026
2027 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2028
2029 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2030 {
2031         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2032
2033         if (IS_ERR(evsel))
2034                 return false;
2035
2036         if (perf_evsel__field(evsel, "pathname") == NULL) {
2037                 perf_evsel__delete(evsel);
2038                 return false;
2039         }
2040
2041         evsel->handler = trace__vfs_getname;
2042         perf_evlist__add(evlist, evsel);
2043         return true;
2044 }
2045
2046 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2047 {
2048         struct perf_evsel *evsel;
2049         struct perf_event_attr attr = {
2050                 .type = PERF_TYPE_SOFTWARE,
2051                 .mmap_data = 1,
2052         };
2053
2054         attr.config = config;
2055         attr.sample_period = 1;
2056
2057         event_attr_init(&attr);
2058
2059         evsel = perf_evsel__new(&attr);
2060         if (evsel)
2061                 evsel->handler = trace__pgfault;
2062
2063         return evsel;
2064 }
2065
2066 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2067 {
2068         const u32 type = event->header.type;
2069         struct perf_evsel *evsel;
2070
2071         if (type != PERF_RECORD_SAMPLE) {
2072                 trace__process_event(trace, trace->host, event, sample);
2073                 return;
2074         }
2075
2076         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2077         if (evsel == NULL) {
2078                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2079                 return;
2080         }
2081
2082         trace__set_base_time(trace, evsel, sample);
2083
2084         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2085             sample->raw_data == NULL) {
2086                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2087                        perf_evsel__name(evsel), sample->tid,
2088                        sample->cpu, sample->raw_size);
2089         } else {
2090                 tracepoint_handler handler = evsel->handler;
2091                 handler(trace, evsel, event, sample);
2092         }
2093 }
2094
2095 static int trace__add_syscall_newtp(struct trace *trace)
2096 {
2097         int ret = -1;
2098         struct perf_evlist *evlist = trace->evlist;
2099         struct perf_evsel *sys_enter, *sys_exit;
2100
2101         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2102         if (sys_enter == NULL)
2103                 goto out;
2104
2105         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2106                 goto out_delete_sys_enter;
2107
2108         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2109         if (sys_exit == NULL)
2110                 goto out_delete_sys_enter;
2111
2112         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2113                 goto out_delete_sys_exit;
2114
2115         perf_evlist__add(evlist, sys_enter);
2116         perf_evlist__add(evlist, sys_exit);
2117
2118         if (callchain_param.enabled && !trace->kernel_syscallchains) {
2119                 /*
2120                  * We're interested only in the user space callchain
2121                  * leading to the syscall, allow overriding that for
2122                  * debugging reasons using --kernel_syscall_callchains
2123                  */
2124                 sys_exit->attr.exclude_callchain_kernel = 1;
2125         }
2126
2127         trace->syscalls.events.sys_enter = sys_enter;
2128         trace->syscalls.events.sys_exit  = sys_exit;
2129
2130         ret = 0;
2131 out:
2132         return ret;
2133
2134 out_delete_sys_exit:
2135         perf_evsel__delete_priv(sys_exit);
2136 out_delete_sys_enter:
2137         perf_evsel__delete_priv(sys_enter);
2138         goto out;
2139 }
2140
2141 static int trace__set_ev_qualifier_filter(struct trace *trace)
2142 {
2143         int err = -1;
2144         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2145                                                 trace->ev_qualifier_ids.nr,
2146                                                 trace->ev_qualifier_ids.entries);
2147
2148         if (filter == NULL)
2149                 goto out_enomem;
2150
2151         if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2152                 err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2153
2154         free(filter);
2155 out:
2156         return err;
2157 out_enomem:
2158         errno = ENOMEM;
2159         goto out;
2160 }
2161
2162 static int trace__run(struct trace *trace, int argc, const char **argv)
2163 {
2164         struct perf_evlist *evlist = trace->evlist;
2165         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2166         int err = -1, i;
2167         unsigned long before;
2168         const bool forks = argc > 0;
2169         bool draining = false;
2170
2171         trace->live = true;
2172
2173         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2174                 goto out_error_raw_syscalls;
2175
2176         if (trace->trace_syscalls)
2177                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2178
2179         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2180                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2181                 if (pgfault_maj == NULL)
2182                         goto out_error_mem;
2183                 perf_evlist__add(evlist, pgfault_maj);
2184         }
2185
2186         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2187                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2188                 if (pgfault_min == NULL)
2189                         goto out_error_mem;
2190                 perf_evlist__add(evlist, pgfault_min);
2191         }
2192
2193         if (trace->sched &&
2194             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2195                                    trace__sched_stat_runtime))
2196                 goto out_error_sched_stat_runtime;
2197
2198         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2199         if (err < 0) {
2200                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2201                 goto out_delete_evlist;
2202         }
2203
2204         err = trace__symbols_init(trace, evlist);
2205         if (err < 0) {
2206                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2207                 goto out_delete_evlist;
2208         }
2209
2210         perf_evlist__config(evlist, &trace->opts, NULL);
2211
2212         if (callchain_param.enabled) {
2213                 bool use_identifier = false;
2214
2215                 if (trace->syscalls.events.sys_exit) {
2216                         perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2217                                                      &trace->opts, &callchain_param);
2218                         use_identifier = true;
2219                 }
2220
2221                 if (pgfault_maj) {
2222                         perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2223                         use_identifier = true;
2224                 }
2225
2226                 if (pgfault_min) {
2227                         perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2228                         use_identifier = true;
2229                 }
2230
2231                 if (use_identifier) {
2232                        /*
2233                         * Now we have evsels with different sample_ids, use
2234                         * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2235                         * from a fixed position in each ring buffer record.
2236                         *
2237                         * As of this the changeset introducing this comment, this
2238                         * isn't strictly needed, as the fields that can come before
2239                         * PERF_SAMPLE_ID are all used, but we'll probably disable
2240                         * some of those for things like copying the payload of
2241                         * pointer syscall arguments, and for vfs_getname we don't
2242                         * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2243                         * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2244                         */
2245                         perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2246                         perf_evlist__reset_sample_bit(evlist, ID);
2247                 }
2248         }
2249
2250         signal(SIGCHLD, sig_handler);
2251         signal(SIGINT, sig_handler);
2252
2253         if (forks) {
2254                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2255                                                     argv, false, NULL);
2256                 if (err < 0) {
2257                         fprintf(trace->output, "Couldn't run the workload!\n");
2258                         goto out_delete_evlist;
2259                 }
2260         }
2261
2262         err = perf_evlist__open(evlist);
2263         if (err < 0)
2264                 goto out_error_open;
2265
2266         err = bpf__apply_obj_config();
2267         if (err) {
2268                 char errbuf[BUFSIZ];
2269
2270                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2271                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2272                          errbuf);
2273                 goto out_error_open;
2274         }
2275
2276         /*
2277          * Better not use !target__has_task() here because we need to cover the
2278          * case where no threads were specified in the command line, but a
2279          * workload was, and in that case we will fill in the thread_map when
2280          * we fork the workload in perf_evlist__prepare_workload.
2281          */
2282         if (trace->filter_pids.nr > 0)
2283                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2284         else if (thread_map__pid(evlist->threads, 0) == -1)
2285                 err = perf_evlist__set_filter_pid(evlist, getpid());
2286
2287         if (err < 0)
2288                 goto out_error_mem;
2289
2290         if (trace->ev_qualifier_ids.nr > 0) {
2291                 err = trace__set_ev_qualifier_filter(trace);
2292                 if (err < 0)
2293                         goto out_errno;
2294
2295                 pr_debug("event qualifier tracepoint filter: %s\n",
2296                          trace->syscalls.events.sys_exit->filter);
2297         }
2298
2299         err = perf_evlist__apply_filters(evlist, &evsel);
2300         if (err < 0)
2301                 goto out_error_apply_filters;
2302
2303         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2304         if (err < 0)
2305                 goto out_error_mmap;
2306
2307         if (!target__none(&trace->opts.target))
2308                 perf_evlist__enable(evlist);
2309
2310         if (forks)
2311                 perf_evlist__start_workload(evlist);
2312
2313         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2314                                   evlist->threads->nr > 1 ||
2315                                   perf_evlist__first(evlist)->attr.inherit;
2316 again:
2317         before = trace->nr_events;
2318
2319         for (i = 0; i < evlist->nr_mmaps; i++) {
2320                 union perf_event *event;
2321
2322                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2323                         struct perf_sample sample;
2324
2325                         ++trace->nr_events;
2326
2327                         err = perf_evlist__parse_sample(evlist, event, &sample);
2328                         if (err) {
2329                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2330                                 goto next_event;
2331                         }
2332
2333                         trace__handle_event(trace, event, &sample);
2334 next_event:
2335                         perf_evlist__mmap_consume(evlist, i);
2336
2337                         if (interrupted)
2338                                 goto out_disable;
2339
2340                         if (done && !draining) {
2341                                 perf_evlist__disable(evlist);
2342                                 draining = true;
2343                         }
2344                 }
2345         }
2346
2347         if (trace->nr_events == before) {
2348                 int timeout = done ? 100 : -1;
2349
2350                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2351                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2352                                 draining = true;
2353
2354                         goto again;
2355                 }
2356         } else {
2357                 goto again;
2358         }
2359
2360 out_disable:
2361         thread__zput(trace->current);
2362
2363         perf_evlist__disable(evlist);
2364
2365         if (!err) {
2366                 if (trace->summary)
2367                         trace__fprintf_thread_summary(trace, trace->output);
2368
2369                 if (trace->show_tool_stats) {
2370                         fprintf(trace->output, "Stats:\n "
2371                                                " vfs_getname : %" PRIu64 "\n"
2372                                                " proc_getname: %" PRIu64 "\n",
2373                                 trace->stats.vfs_getname,
2374                                 trace->stats.proc_getname);
2375                 }
2376         }
2377
2378 out_delete_evlist:
2379         perf_evlist__delete(evlist);
2380         trace->evlist = NULL;
2381         trace->live = false;
2382         return err;
2383 {
2384         char errbuf[BUFSIZ];
2385
2386 out_error_sched_stat_runtime:
2387         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2388         goto out_error;
2389
2390 out_error_raw_syscalls:
2391         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2392         goto out_error;
2393
2394 out_error_mmap:
2395         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2396         goto out_error;
2397
2398 out_error_open:
2399         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2400
2401 out_error:
2402         fprintf(trace->output, "%s\n", errbuf);
2403         goto out_delete_evlist;
2404
2405 out_error_apply_filters:
2406         fprintf(trace->output,
2407                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2408                 evsel->filter, perf_evsel__name(evsel), errno,
2409                 str_error_r(errno, errbuf, sizeof(errbuf)));
2410         goto out_delete_evlist;
2411 }
2412 out_error_mem:
2413         fprintf(trace->output, "Not enough memory to run!\n");
2414         goto out_delete_evlist;
2415
2416 out_errno:
2417         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2418         goto out_delete_evlist;
2419 }
2420
/*
 * Replay mode (-i): process a previously recorded perf.data file instead of
 * tracing live.  Wires up the tool callbacks and the raw_syscalls sys_enter/
 * sys_exit tracepoint handlers, then streams the file through the session
 * layer.  Returns 0 on success, negative on error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",       trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample        = trace__process_sample;
	trace->tool.mmap          = perf_event__process_mmap;
	trace->tool.mmap2         = perf_event__process_mmap2;
	trace->tool.comm          = perf_event__process_comm;
	trace->tool.exit          = perf_event__process_exit;
	trace->tool.fork          = perf_event__process_fork;
	trace->tool.attr          = perf_event__process_attr;
	trace->tool.tracing_data = perf_event__process_tracing_data;
	trace->tool.build_id      = perf_event__process_build_id;

	/* Deliver events in timestamp order so the output reads like strace */
	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	/* Events may have been recorded without syscall tracing: evsel == NULL is OK */
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route recorded software page-fault counters to the pagefault printer */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	err = parse_target_str(trace);
	if (err != 0)
		goto out;

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2516
/*
 * Emit the banner that precedes the per-thread summary table.
 * Returns the number of characters printed.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2525
/*
 * Re-sort helper (see rb_resort.h): orders a thread's per-syscall stats
 * intlist by total time spent, descending.  The block below fills one
 * sorted entry from an intlist node; node->priv carries the struct stats
 * accumulated for that syscall number (NULL if none were recorded).
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats    *stats;
	double          msecs;
	int             syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = sample count * mean, converted from ns to msecs */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2539
/*
 * Print one thread's per-syscall statistics table: calls, total time,
 * min/avg/max latency (msecs) and relative stddev (%), sorted by total
 * time via the syscall_stats resort rb-tree above.
 * Returns the number of characters printed, 0 if the thread has no stats.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats accumulate nanoseconds, the table shows msecs */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev as a percentage of the mean */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2582
/*
 * Print the summary line for one thread (comm, tid, event count, share of
 * all events, fault counts, optional accumulated runtime) followed by its
 * per-syscall statistics table.  Returns the number of characters printed,
 * 0 if the thread produced no trace events (no thread_trace attached).
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	/* This thread's share of every event seen in the session */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}
2610
2611 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2612 {
2613         return ttrace ? ttrace->nr_events : 0;
2614 }
2615
2616 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2617         struct thread *thread;
2618 )
2619 {
2620         entry->thread = rb_entry(nd, struct thread, rb_node);
2621 }
2622
/*
 * Print the end-of-run summary: a header followed by one section per
 * thread, ordered from most to fewest trace events.  Returns the number
 * of characters printed, 0 if the resort tree could not be allocated.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2641
2642 static int trace__set_duration(const struct option *opt, const char *str,
2643                                int unset __maybe_unused)
2644 {
2645         struct trace *trace = opt->value;
2646
2647         trace->duration_filter = atof(str);
2648         return 0;
2649 }
2650
2651 static int trace__set_filter_pids(const struct option *opt, const char *str,
2652                                   int unset __maybe_unused)
2653 {
2654         int ret = -1;
2655         size_t i;
2656         struct trace *trace = opt->value;
2657         /*
2658          * FIXME: introduce a intarray class, plain parse csv and create a
2659          * { int nr, int entries[] } struct...
2660          */
2661         struct intlist *list = intlist__new(str);
2662
2663         if (list == NULL)
2664                 return -1;
2665
2666         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2667         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2668
2669         if (trace->filter_pids.entries == NULL)
2670                 goto out;
2671
2672         trace->filter_pids.entries[0] = getpid();
2673
2674         for (i = 1; i < trace->filter_pids.nr; ++i)
2675                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2676
2677         intlist__delete(list);
2678         ret = 0;
2679 out:
2680         return ret;
2681 }
2682
2683 static int trace__open_output(struct trace *trace, const char *filename)
2684 {
2685         struct stat st;
2686
2687         if (!stat(filename, &st) && st.st_size) {
2688                 char oldname[PATH_MAX];
2689
2690                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2691                 unlink(oldname);
2692                 rename(filename, oldname);
2693         }
2694
2695         trace->output = fopen(filename, "w");
2696
2697         return trace->output == NULL ? -errno : 0;
2698 }
2699
2700 static int parse_pagefaults(const struct option *opt, const char *str,
2701                             int unset __maybe_unused)
2702 {
2703         int *trace_pgfaults = opt->value;
2704
2705         if (strcmp(str, "all") == 0)
2706                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2707         else if (strcmp(str, "maj") == 0)
2708                 *trace_pgfaults |= TRACE_PFMAJ;
2709         else if (strcmp(str, "min") == 0)
2710                 *trace_pgfaults |= TRACE_PFMIN;
2711         else
2712                 return -1;
2713
2714         return 0;
2715 }
2716
2717 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2718 {
2719         struct perf_evsel *evsel;
2720
2721         evlist__for_each_entry(evlist, evsel)
2722                 evsel->handler = handler;
2723 }
2724
/*
 * Entry point for 'perf trace': parse the command line, set up the evlist
 * and syscall table, then dispatch to record mode, replay of a perf.data
 * file (-i), or a live trace run.
 */
int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	/* UINT_MAX/ULLONG_MAX act as "not set by user" sentinels below */
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid       = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const char *ev_qualifier_str = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK(0, "event", &trace.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	if (trace.trace_pgfaults) {
		/* pagefault samples carry an address and need timestamps */
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* Detect which sentinels the user left untouched and pick defaults */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* Asking for stack limits implies callchains; default to DWARF unwinding */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		/* Callchains need bigger ring buffers; grow them when we can (root) */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && ev_qualifier_str) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	if (ev_qualifier_str != NULL) {
		const char *s = ev_qualifier_str;
		struct strlist_config slist_config = {
			.dirname = system_path(STRACE_GROUPS_DIR),
		};

		/* A leading '!' inverts the qualifier: trace everything BUT these */
		trace.not_ev_qualifier = *s == '!';
		if (trace.not_ev_qualifier)
			++s;
		trace.ev_qualifier = strlist__new(s, &slist_config);
		if (trace.ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier",
			      trace.output);
			err = -ENOMEM;
			goto out_close;
		}

		err = trace__validate_ev_qualifier(&trace);
		if (err)
			goto out_close;
	}

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no pid/tid/cpu target given: trace the whole system */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}