1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace-like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 #include "callchain.h"
38 #include "syscalltbl.h"
39 #include "rb_resort.h"
40
41 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
42 #include <stdlib.h>
43 #include <linux/err.h>
44 #include <linux/filter.h>
45 #include <linux/audit.h>
46 #include <linux/random.h>
47 #include <linux/stringify.h>
48 #include <linux/time64.h>
49
50 #ifndef O_CLOEXEC
51 # define O_CLOEXEC              02000000
52 #endif
53
54 struct trace {
55         struct perf_tool        tool;
56         struct syscalltbl       *sctbl;
57         struct {
58                 int             max;
59                 struct syscall  *table;
60                 struct {
61                         struct perf_evsel *sys_enter,
62                                           *sys_exit;
63                 }               events;
64         } syscalls;
65         struct record_opts      opts;
66         struct perf_evlist      *evlist;
67         struct machine          *host;
68         struct thread           *current;
69         u64                     base_time;
70         FILE                    *output;
71         unsigned long           nr_events;
72         struct strlist          *ev_qualifier;
73         struct {
74                 size_t          nr;
75                 int             *entries;
76         }                       ev_qualifier_ids;
77         struct intlist          *tid_list;
78         struct intlist          *pid_list;
79         struct {
80                 size_t          nr;
81                 pid_t           *entries;
82         }                       filter_pids;
83         double                  duration_filter;
84         double                  runtime_ms;
85         struct {
86                 u64             vfs_getname,
87                                 proc_getname;
88         } stats;
89         unsigned int            max_stack;
90         unsigned int            min_stack;
91         bool                    not_ev_qualifier;
92         bool                    live;
93         bool                    full_time;
94         bool                    sched;
95         bool                    multiple_threads;
96         bool                    summary;
97         bool                    summary_only;
98         bool                    show_comm;
99         bool                    show_tool_stats;
100         bool                    trace_syscalls;
101         bool                    kernel_syscallchains;
102         bool                    force;
103         bool                    vfs_getname;
104         int                     trace_pgfaults;
105         int                     open_id;
106 };
107
108 struct tp_field {
109         int offset;
110         union {
111                 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
112                 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
113         };
114 };
115
116 #define TP_UINT_FIELD(bits) \
117 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
118 { \
119         u##bits value; \
120         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
121         return value;  \
122 }
123
124 TP_UINT_FIELD(8);
125 TP_UINT_FIELD(16);
126 TP_UINT_FIELD(32);
127 TP_UINT_FIELD(64);
128
129 #define TP_UINT_FIELD__SWAPPED(bits) \
130 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
131 { \
132         u##bits value; \
133         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
134         return bswap_##bits(value);\
135 }
136
137 TP_UINT_FIELD__SWAPPED(16);
138 TP_UINT_FIELD__SWAPPED(32);
139 TP_UINT_FIELD__SWAPPED(64);
140
141 static int tp_field__init_uint(struct tp_field *field,
142                                struct format_field *format_field,
143                                bool needs_swap)
144 {
145         field->offset = format_field->offset;
146
147         switch (format_field->size) {
148         case 1:
149                 field->integer = tp_field__u8;
150                 break;
151         case 2:
152                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
153                 break;
154         case 4:
155                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
156                 break;
157         case 8:
158                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
159                 break;
160         default:
161                 return -1;
162         }
163
164         return 0;
165 }
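
/*
 * Illustrative sketch (not compiled here): TP_UINT_FIELD(32) above expands to
 * a reader that memcpy()s 4 bytes out of sample->raw_data at the field's
 * offset, which keeps the access safe even when that offset is not naturally
 * aligned. tp_field__init_uint() then picks the plain or byte-swapped reader
 * based on the field size and evsel->needs_swap, e.g. (the offset 8 and
 * 'sample' are made up for the example):
 *
 *	struct tp_field id = { .offset = 8, .integer = tp_field__u32 };
 *	u64 nr = id.integer(&id, sample);
 */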
166
167 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
168 {
169         return sample->raw_data + field->offset;
170 }
171
172 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
173 {
174         field->offset = format_field->offset;
175         field->pointer = tp_field__ptr;
176         return 0;
177 }
178
179 struct syscall_tp {
180         struct tp_field id;
181         union {
182                 struct tp_field args, ret;
183         };
184 };
185
186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
187                                           struct tp_field *field,
188                                           const char *name)
189 {
190         struct format_field *format_field = perf_evsel__field(evsel, name);
191
192         if (format_field == NULL)
193                 return -1;
194
195         return tp_field__init_uint(field, format_field, evsel->needs_swap);
196 }
197
198 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
199         ({ struct syscall_tp *sc = evsel->priv;\
200            perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
201
202 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
203                                          struct tp_field *field,
204                                          const char *name)
205 {
206         struct format_field *format_field = perf_evsel__field(evsel, name);
207
208         if (format_field == NULL)
209                 return -1;
210
211         return tp_field__init_ptr(field, format_field);
212 }
213
214 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
215         ({ struct syscall_tp *sc = evsel->priv;\
216            perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
217
218 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
219 {
220         zfree(&evsel->priv);
221         perf_evsel__delete(evsel);
222 }
223
224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
225 {
226         evsel->priv = malloc(sizeof(struct syscall_tp));
227         if (evsel->priv != NULL) {
228                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
229                         goto out_delete;
230
231                 evsel->handler = handler;
232                 return 0;
233         }
234
235         return -ENOMEM;
236
237 out_delete:
238         zfree(&evsel->priv);
239         return -ENOENT;
240 }
241
242 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
243 {
244         struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
245
246         /* older kernels (e.g., RHEL6) use syscalls:{sys_enter,sys_exit} instead of raw_syscalls */
247         if (IS_ERR(evsel))
248                 evsel = perf_evsel__newtp("syscalls", direction);
249
250         if (IS_ERR(evsel))
251                 return NULL;
252
253         if (perf_evsel__init_syscall_tp(evsel, handler))
254                 goto out_delete;
255
256         return evsel;
257
258 out_delete:
259         perf_evsel__delete_priv(evsel);
260         return NULL;
261 }
262
263 #define perf_evsel__sc_tp_uint(evsel, name, sample) \
264         ({ struct syscall_tp *fields = evsel->priv; \
265            fields->name.integer(&fields->name, sample); })
266
267 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \
268         ({ struct syscall_tp *fields = evsel->priv; \
269            fields->name.pointer(&fields->name, sample); })
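
/*
 * Usage sketch (illustrative; 'handler' and 'sample' are stand-ins for the
 * real callback/data used later in this file): create the sys_enter
 * tracepoint evsel and read the syscall id from a sample with the accessor
 * macros above. Note the 'args' tp_field still has to be initialized
 * separately with perf_evsel__init_sc_tp_ptr_field() before
 * perf_evsel__sc_tp_ptr() may be used.
 *
 *	struct perf_evsel *sys_enter = perf_evsel__syscall_newtp("sys_enter", handler);
 *
 *	if (sys_enter != NULL) {
 *		u64 id = perf_evsel__sc_tp_uint(sys_enter, id, sample);
 *		...
 *	}
 */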
270
271 struct syscall_arg {
272         unsigned long val;
273         struct thread *thread;
274         struct trace  *trace;
275         void          *parm;
276         u8            idx;
277         u8            mask;
278 };
279
280 struct strarray {
281         int         offset;
282         int         nr_entries;
283         const char **entries;
284 };
285
286 #define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
287         .nr_entries = ARRAY_SIZE(array), \
288         .entries = array, \
289 }
290
291 #define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
292         .offset     = off, \
293         .nr_entries = ARRAY_SIZE(array), \
294         .entries = array, \
295 }
296
297 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
298                                                 const char *intfmt,
299                                                 struct syscall_arg *arg)
300 {
301         struct strarray *sa = arg->parm;
302         int idx = arg->val - sa->offset;
303
304         if (idx < 0 || idx >= sa->nr_entries)
305                 return scnprintf(bf, size, intfmt, arg->val);
306
307         return scnprintf(bf, size, "%s", sa->entries[idx]);
308 }
309
310 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
311                                               struct syscall_arg *arg)
312 {
313         return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
314 }
315
316 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
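
/*
 * How a strarray resolves (illustrative): DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1)
 * below makes entries[0] correspond to the raw value 1, so an epoll_ctl() op
 * of 2 (EPOLL_CTL_DEL) prints as "DEL":
 *
 *	struct syscall_arg arg = { .val = 2, .parm = &strarray__epoll_ctl_ops };
 *	syscall_arg__scnprintf_strarray(buf, sizeof(buf), &arg);	// -> "DEL"
 *
 * Values outside the table fall back to the integer format passed to
 * __syscall_arg__scnprintf_strarray() ("%d" here, "%#x" in the hex variant).
 * 'buf' is just a local char array assumed for the example.
 */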
317
318 #if defined(__i386__) || defined(__x86_64__)
319 /*
320  * FIXME: Make this available to all arches as soon as the ioctl beautifier
321  *        gets rewritten to support all arches.
322  */
323 static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
324                                                  struct syscall_arg *arg)
325 {
326         return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
327 }
328
329 #define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
330 #endif /* defined(__i386__) || defined(__x86_64__) */
331
332 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
333                                         struct syscall_arg *arg);
334
335 #define SCA_FD syscall_arg__scnprintf_fd
336
337 #ifndef AT_FDCWD
338 #define AT_FDCWD        -100
339 #endif
340
341 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
342                                            struct syscall_arg *arg)
343 {
344         int fd = arg->val;
345
346         if (fd == AT_FDCWD)
347                 return scnprintf(bf, size, "CWD");
348
349         return syscall_arg__scnprintf_fd(bf, size, arg);
350 }
351
352 #define SCA_FDAT syscall_arg__scnprintf_fd_at
353
354 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
355                                               struct syscall_arg *arg);
356
357 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
358
359 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
360                                          struct syscall_arg *arg)
361 {
362         return scnprintf(bf, size, "%#lx", arg->val);
363 }
364
365 #define SCA_HEX syscall_arg__scnprintf_hex
366
367 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
368                                          struct syscall_arg *arg)
369 {
370         return scnprintf(bf, size, "%d", arg->val);
371 }
372
373 #define SCA_INT syscall_arg__scnprintf_int
374
375 static const char *bpf_cmd[] = {
376         "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
377         "MAP_GET_NEXT_KEY", "PROG_LOAD",
378 };
379 static DEFINE_STRARRAY(bpf_cmd);
380
381 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
382 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
383
384 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
385 static DEFINE_STRARRAY(itimers);
386
387 static const char *keyctl_options[] = {
388         "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
389         "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
390         "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
391         "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
392         "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
393 };
394 static DEFINE_STRARRAY(keyctl_options);
395
396 static const char *whences[] = { "SET", "CUR", "END",
397 #ifdef SEEK_DATA
398 "DATA",
399 #endif
400 #ifdef SEEK_HOLE
401 "HOLE",
402 #endif
403 };
404 static DEFINE_STRARRAY(whences);
405
406 static const char *fcntl_cmds[] = {
407         "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
408         "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
409         "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
410         "F_GETOWNER_UIDS",
411 };
412 static DEFINE_STRARRAY(fcntl_cmds);
413
414 static const char *rlimit_resources[] = {
415         "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
416         "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
417         "RTTIME",
418 };
419 static DEFINE_STRARRAY(rlimit_resources);
420
421 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
422 static DEFINE_STRARRAY(sighow);
423
424 static const char *clockid[] = {
425         "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
426         "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
427         "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
428 };
429 static DEFINE_STRARRAY(clockid);
430
431 static const char *socket_families[] = {
432         "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
433         "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
434         "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
435         "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
436         "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
437         "ALG", "NFC", "VSOCK",
438 };
439 static DEFINE_STRARRAY(socket_families);
440
441 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
442                                                  struct syscall_arg *arg)
443 {
444         size_t printed = 0;
445         int mode = arg->val;
446
447         if (mode == F_OK) /* 0 */
448                 return scnprintf(bf, size, "F");
449 #define P_MODE(n) \
450         if (mode & n##_OK) { \
451                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
452                 mode &= ~n##_OK; \
453         }
454
455         P_MODE(R);
456         P_MODE(W);
457         P_MODE(X);
458 #undef P_MODE
459
460         if (mode)
461                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
462
463         return printed;
464 }
465
466 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
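
/*
 * Example output (illustrative): an access() mode of R_OK | W_OK is rendered
 * by the P_MODE() expansions above as "RW", plain F_OK as "F", and any
 * leftover unknown bits are appended as "|0x...".
 */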
467
468 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
469                                               struct syscall_arg *arg);
470
471 #define SCA_FILENAME syscall_arg__scnprintf_filename
472
473 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
474                                                 struct syscall_arg *arg)
475 {
476         int printed = 0, flags = arg->val;
477
478 #define P_FLAG(n) \
479         if (flags & O_##n) { \
480                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
481                 flags &= ~O_##n; \
482         }
483
484         P_FLAG(CLOEXEC);
485         P_FLAG(NONBLOCK);
486 #undef P_FLAG
487
488         if (flags)
489                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
490
491         return printed;
492 }
493
494 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
495
496 #if defined(__i386__) || defined(__x86_64__)
497 /*
498  * FIXME: Make this available to all arches.
499  */
500 #define TCGETS          0x5401
501
502 static const char *tioctls[] = {
503         "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
504         "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
505         "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
506         "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
507         "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
508         "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
509         "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
510         "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
511         "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
512         "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
513         "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
514         [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
515         "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
516         "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
517         "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
518 };
519
520 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
521 #endif /* defined(__i386__) || defined(__x86_64__) */
522
523 #ifndef GRND_NONBLOCK
524 #define GRND_NONBLOCK   0x0001
525 #endif
526 #ifndef GRND_RANDOM
527 #define GRND_RANDOM     0x0002
528 #endif
529
530 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
531                                                    struct syscall_arg *arg)
532 {
533         int printed = 0, flags = arg->val;
534
535 #define P_FLAG(n) \
536         if (flags & GRND_##n) { \
537                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
538                 flags &= ~GRND_##n; \
539         }
540
541         P_FLAG(RANDOM);
542         P_FLAG(NONBLOCK);
543 #undef P_FLAG
544
545         if (flags)
546                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
547
548         return printed;
549 }
550
551 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
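
/*
 * The P_FLAG() pattern used here (and in the pipe2 beautifier above) renders
 * a flags word as a "|"-separated list of names, e.g. (illustrative):
 *
 *	GRND_RANDOM | GRND_NONBLOCK	-> "RANDOM|NONBLOCK"
 *	0x4 (unknown bit)		-> "0x4"
 */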
552
553 #define STRARRAY(arg, name, array) \
554           .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
555           .arg_parm      = { [arg] = &strarray__##array, }
556
557 #include "trace/beauty/eventfd.c"
558 #include "trace/beauty/flock.c"
559 #include "trace/beauty/futex_op.c"
560 #include "trace/beauty/mmap.c"
561 #include "trace/beauty/mode_t.c"
562 #include "trace/beauty/msg_flags.c"
563 #include "trace/beauty/open_flags.c"
564 #include "trace/beauty/perf_event_open.c"
565 #include "trace/beauty/pid.c"
566 #include "trace/beauty/sched_policy.c"
567 #include "trace/beauty/seccomp.c"
568 #include "trace/beauty/signum.c"
569 #include "trace/beauty/socket_type.c"
570 #include "trace/beauty/waitid_options.c"
571
572 static struct syscall_fmt {
573         const char *name;
574         const char *alias;
575         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
576         void       *arg_parm[6];
577         bool       errmsg;
578         bool       errpid;
579         bool       timeout;
580         bool       hexret;
581 } syscall_fmts[] = {
582         { .name     = "access",     .errmsg = true,
583           .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
584         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
585         { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
586         { .name     = "brk",        .hexret = true,
587           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
588         { .name     = "chdir",      .errmsg = true, },
589         { .name     = "chmod",      .errmsg = true, },
590         { .name     = "chroot",     .errmsg = true, },
591         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
592         { .name     = "clone",      .errpid = true, },
593         { .name     = "close",      .errmsg = true,
594           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
595         { .name     = "connect",    .errmsg = true, },
596         { .name     = "creat",      .errmsg = true, },
597         { .name     = "dup",        .errmsg = true, },
598         { .name     = "dup2",       .errmsg = true, },
599         { .name     = "dup3",       .errmsg = true, },
600         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
601         { .name     = "eventfd2",   .errmsg = true,
602           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
603         { .name     = "faccessat",  .errmsg = true, },
604         { .name     = "fadvise64",  .errmsg = true, },
605         { .name     = "fallocate",  .errmsg = true, },
606         { .name     = "fchdir",     .errmsg = true, },
607         { .name     = "fchmod",     .errmsg = true, },
608         { .name     = "fchmodat",   .errmsg = true,
609           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
610         { .name     = "fchown",     .errmsg = true, },
611         { .name     = "fchownat",   .errmsg = true,
612           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
613         { .name     = "fcntl",      .errmsg = true,
614           .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
615           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
616         { .name     = "fdatasync",  .errmsg = true, },
617         { .name     = "flock",      .errmsg = true,
618           .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
619         { .name     = "fsetxattr",  .errmsg = true, },
620         { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
621         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
622         { .name     = "fstatfs",    .errmsg = true, },
623         { .name     = "fsync",    .errmsg = true, },
624         { .name     = "ftruncate", .errmsg = true, },
625         { .name     = "futex",      .errmsg = true,
626           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
627         { .name     = "futimesat", .errmsg = true,
628           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
629         { .name     = "getdents",   .errmsg = true, },
630         { .name     = "getdents64", .errmsg = true, },
631         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
632         { .name     = "getpid",     .errpid = true, },
633         { .name     = "getpgid",    .errpid = true, },
634         { .name     = "getppid",    .errpid = true, },
635         { .name     = "getrandom",  .errmsg = true,
636           .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
637         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
638         { .name     = "getxattr",   .errmsg = true, },
639         { .name     = "inotify_add_watch",          .errmsg = true, },
640         { .name     = "ioctl",      .errmsg = true,
641           .arg_scnprintf = {
642 #if defined(__i386__) || defined(__x86_64__)
643 /*
644  * FIXME: Make this available to all arches.
645  */
646                              [1] = SCA_STRHEXARRAY, /* cmd */
647                              [2] = SCA_HEX, /* arg */ },
648           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
649 #else
650                              [2] = SCA_HEX, /* arg */ }, },
651 #endif
652         { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
653         { .name     = "kill",       .errmsg = true,
654           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
655         { .name     = "lchown",    .errmsg = true, },
656         { .name     = "lgetxattr",  .errmsg = true, },
657         { .name     = "linkat",     .errmsg = true,
658           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
659         { .name     = "listxattr",  .errmsg = true, },
660         { .name     = "llistxattr", .errmsg = true, },
661         { .name     = "lremovexattr",  .errmsg = true, },
662         { .name     = "lseek",      .errmsg = true,
663           .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
664           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
665         { .name     = "lsetxattr",  .errmsg = true, },
666         { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
667         { .name     = "lsxattr",    .errmsg = true, },
668         { .name     = "madvise",    .errmsg = true,
669           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
670                              [2] = SCA_MADV_BHV, /* behavior */ }, },
671         { .name     = "mkdir",    .errmsg = true, },
672         { .name     = "mkdirat",    .errmsg = true,
673           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
674         { .name     = "mknod",      .errmsg = true, },
675         { .name     = "mknodat",    .errmsg = true,
676           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
677         { .name     = "mlock",      .errmsg = true,
678           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
679         { .name     = "mlockall",   .errmsg = true,
680           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
681         { .name     = "mmap",       .hexret = true,
682           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
683                              [2] = SCA_MMAP_PROT, /* prot */
684                              [3] = SCA_MMAP_FLAGS, /* flags */ }, },
685         { .name     = "mprotect",   .errmsg = true,
686           .arg_scnprintf = { [0] = SCA_HEX, /* start */
687                              [2] = SCA_MMAP_PROT, /* prot */ }, },
688         { .name     = "mq_unlink", .errmsg = true,
689           .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
690         { .name     = "mremap",     .hexret = true,
691           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
692                              [3] = SCA_MREMAP_FLAGS, /* flags */
693                              [4] = SCA_HEX, /* new_addr */ }, },
694         { .name     = "munlock",    .errmsg = true,
695           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
696         { .name     = "munmap",     .errmsg = true,
697           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
698         { .name     = "name_to_handle_at", .errmsg = true,
699           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
700         { .name     = "newfstatat", .errmsg = true,
701           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
702         { .name     = "open",       .errmsg = true,
703           .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
704         { .name     = "open_by_handle_at", .errmsg = true,
705           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
706                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
707         { .name     = "openat",     .errmsg = true,
708           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
709                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
710         { .name     = "perf_event_open", .errmsg = true,
711           .arg_scnprintf = { [2] = SCA_INT, /* cpu */
712                              [3] = SCA_FD,  /* group_fd */
713                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
714         { .name     = "pipe2",      .errmsg = true,
715           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
716         { .name     = "poll",       .errmsg = true, .timeout = true, },
717         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
718         { .name     = "pread",      .errmsg = true, .alias = "pread64", },
719         { .name     = "preadv",     .errmsg = true, .alias = "pread", },
720         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
721         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
722         { .name     = "pwritev",    .errmsg = true, },
723         { .name     = "read",       .errmsg = true, },
724         { .name     = "readlink",   .errmsg = true, },
725         { .name     = "readlinkat", .errmsg = true,
726           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
727         { .name     = "readv",      .errmsg = true, },
728         { .name     = "recvfrom",   .errmsg = true,
729           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
730         { .name     = "recvmmsg",   .errmsg = true,
731           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
732         { .name     = "recvmsg",    .errmsg = true,
733           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
734         { .name     = "removexattr", .errmsg = true, },
735         { .name     = "renameat",   .errmsg = true,
736           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
737         { .name     = "rmdir",    .errmsg = true, },
738         { .name     = "rt_sigaction", .errmsg = true,
739           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
740         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
741         { .name     = "rt_sigqueueinfo", .errmsg = true,
742           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
743         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
744           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
745         { .name     = "sched_getattr",        .errmsg = true, },
746         { .name     = "sched_setattr",        .errmsg = true, },
747         { .name     = "sched_setscheduler",   .errmsg = true,
748           .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
749         { .name     = "seccomp", .errmsg = true,
750           .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
751                              [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
752         { .name     = "select",     .errmsg = true, .timeout = true, },
753         { .name     = "sendmmsg",    .errmsg = true,
754           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
755         { .name     = "sendmsg",    .errmsg = true,
756           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
757         { .name     = "sendto",     .errmsg = true,
758           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
759         { .name     = "set_tid_address", .errpid = true, },
760         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
761         { .name     = "setpgid",    .errmsg = true, },
762         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
763         { .name     = "setxattr",   .errmsg = true, },
764         { .name     = "shutdown",   .errmsg = true, },
765         { .name     = "socket",     .errmsg = true,
766           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
767                              [1] = SCA_SK_TYPE, /* type */ },
768           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
769         { .name     = "socketpair", .errmsg = true,
770           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
771                              [1] = SCA_SK_TYPE, /* type */ },
772           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
773         { .name     = "stat",       .errmsg = true, .alias = "newstat", },
774         { .name     = "statfs",     .errmsg = true, },
775         { .name     = "swapoff",    .errmsg = true,
776           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
777         { .name     = "swapon",     .errmsg = true,
778           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
779         { .name     = "symlinkat",  .errmsg = true,
780           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
781         { .name     = "tgkill",     .errmsg = true,
782           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
783         { .name     = "tkill",      .errmsg = true,
784           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
785         { .name     = "truncate",   .errmsg = true, },
786         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
787         { .name     = "unlinkat",   .errmsg = true,
788           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
789         { .name     = "utime",  .errmsg = true, },
790         { .name     = "utimensat",  .errmsg = true,
791           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
792         { .name     = "utimes",  .errmsg = true, },
793         { .name     = "vmsplice",  .errmsg = true, },
794         { .name     = "wait4",      .errpid = true,
795           .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
796         { .name     = "waitid",     .errpid = true,
797           .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
798         { .name     = "write",      .errmsg = true, },
799         { .name     = "writev",     .errmsg = true, },
800 };
801
802 static int syscall_fmt__cmp(const void *name, const void *fmtp)
803 {
804         const struct syscall_fmt *fmt = fmtp;
805         return strcmp(name, fmt->name);
806 }
807
808 static struct syscall_fmt *syscall_fmt__find(const char *name)
809 {
810         const int nmemb = ARRAY_SIZE(syscall_fmts);
811         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
812 }
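
/*
 * Note: syscall_fmt__find() uses bsearch(), so syscall_fmts[] above must stay
 * sorted by ->name. A new entry is a plain initializer inserted at the
 * alphabetically correct position, e.g. (hypothetical syscall name and
 * argument slot, for illustration only):
 *
 *	{ .name     = "frobnicate", .errmsg = true,
 *	  .arg_scnprintf = { [1] = SCA_HEX, }, },
 */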
813
814 struct syscall {
815         struct event_format *tp_format;
816         int                 nr_args;
817         struct format_field *args;
818         const char          *name;
819         bool                is_exit;
820         struct syscall_fmt  *fmt;
821         size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
822         void                **arg_parm;
823 };
824
825 static size_t fprintf_duration(unsigned long t, FILE *fp)
826 {
827         double duration = (double)t / NSEC_PER_MSEC;
828         size_t printed = fprintf(fp, "(");
829
830         if (duration >= 1.0)
831                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
832         else if (duration >= 0.01)
833                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
834         else
835                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
836         return printed + fprintf(fp, "): ");
837 }
838
839 /**
840  * filename.ptr: The filename char pointer that will be vfs_getname'd
841  * filename.entry_str_pos: Where to insert the string translated from
842  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
843  */
844 struct thread_trace {
845         u64               entry_time;
846         u64               exit_time;
847         bool              entry_pending;
848         unsigned long     nr_events;
849         unsigned long     pfmaj, pfmin;
850         char              *entry_str;
851         double            runtime_ms;
852         struct {
853                 unsigned long ptr;
854                 short int     entry_str_pos;
855                 bool          pending_open;
856                 unsigned int  namelen;
857                 char          *name;
858         } filename;
859         struct {
860                 int       max;
861                 char      **table;
862         } paths;
863
864         struct intlist *syscall_stats;
865 };
866
867 static struct thread_trace *thread_trace__new(void)
868 {
869         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
870
871         if (ttrace) {
872                 ttrace->paths.max = -1;
873                 ttrace->syscall_stats = intlist__new(NULL);
874         }
875
876         return ttrace;
877 }
878
879 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
880 {
881         struct thread_trace *ttrace;
882
883         if (thread == NULL)
884                 goto fail;
885
886         if (thread__priv(thread) == NULL)
887                 thread__set_priv(thread, thread_trace__new());
888
889         if (thread__priv(thread) == NULL)
890                 goto fail;
891
892         ttrace = thread__priv(thread);
893         ++ttrace->nr_events;
894
895         return ttrace;
896 fail:
897         color_fprintf(fp, PERF_COLOR_RED,
898                       "WARNING: not enough memory, dropping samples!\n");
899         return NULL;
900 }
901
902 #define TRACE_PFMAJ             (1 << 0)
903 #define TRACE_PFMIN             (1 << 1)
904
905 static const size_t trace__entry_str_size = 2048;
906
907 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
908 {
909         struct thread_trace *ttrace = thread__priv(thread);
910
911         if (fd > ttrace->paths.max) {
912                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
913
914                 if (npath == NULL)
915                         return -1;
916
917                 if (ttrace->paths.max != -1) {
918                         memset(npath + ttrace->paths.max + 1, 0,
919                                (fd - ttrace->paths.max) * sizeof(char *));
920                 } else {
921                         memset(npath, 0, (fd + 1) * sizeof(char *));
922                 }
923
924                 ttrace->paths.table = npath;
925                 ttrace->paths.max   = fd;
926         }
927
928         ttrace->paths.table[fd] = strdup(pathname);
929
930         return ttrace->paths.table[fd] != NULL ? 0 : -1;
931 }
932
933 static int thread__read_fd_path(struct thread *thread, int fd)
934 {
935         char linkname[PATH_MAX], pathname[PATH_MAX];
936         struct stat st;
937         int ret;
938
939         if (thread->pid_ == thread->tid) {
940                 scnprintf(linkname, sizeof(linkname),
941                           "/proc/%d/fd/%d", thread->pid_, fd);
942         } else {
943                 scnprintf(linkname, sizeof(linkname),
944                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
945         }
946
947         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
948                 return -1;
949
950         ret = readlink(linkname, pathname, sizeof(pathname));
951
952         if (ret < 0 || ret > st.st_size)
953                 return -1;
954
955         pathname[ret] = '\0';
956         return trace__set_fd_pathname(thread, fd, pathname);
957 }
958
959 static const char *thread__fd_path(struct thread *thread, int fd,
960                                    struct trace *trace)
961 {
962         struct thread_trace *ttrace = thread__priv(thread);
963
964         if (ttrace == NULL)
965                 return NULL;
966
967         if (fd < 0)
968                 return NULL;
969
970         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
971                 if (!trace->live)
972                         return NULL;
973                 ++trace->stats.proc_getname;
974                 if (thread__read_fd_path(thread, fd))
975                         return NULL;
976         }
977
978         return ttrace->paths.table[fd];
979 }
980
981 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
982                                         struct syscall_arg *arg)
983 {
984         int fd = arg->val;
985         size_t printed = scnprintf(bf, size, "%d", fd);
986         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
987
988         if (path)
989                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
990
991         return printed;
992 }
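
/*
 * Example (illustrative): with fd 3 open on /etc/passwd in the traced
 * process, the beautifier above prints "3</etc/passwd>"; when the path is
 * not cached and we are not in live mode to read /proc/<pid>/fd, it prints
 * just "3".
 */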
993
994 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
995                                               struct syscall_arg *arg)
996 {
997         int fd = arg->val;
998         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
999         struct thread_trace *ttrace = thread__priv(arg->thread);
1000
1001         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1002                 zfree(&ttrace->paths.table[fd]);
1003
1004         return printed;
1005 }
1006
1007 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1008                                      unsigned long ptr)
1009 {
1010         struct thread_trace *ttrace = thread__priv(thread);
1011
1012         ttrace->filename.ptr = ptr;
1013         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1014 }
1015
1016 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1017                                               struct syscall_arg *arg)
1018 {
1019         unsigned long ptr = arg->val;
1020
1021         if (!arg->trace->vfs_getname)
1022                 return scnprintf(bf, size, "%#x", ptr);
1023
1024         thread__set_filename_pos(arg->thread, bf, ptr);
1025         return 0;
1026 }
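
/*
 * Note: when the vfs_getname probe is in use (trace->vfs_getname), nothing is
 * printed here for the filename argument yet; thread__set_filename_pos() only
 * records the user pointer and the position inside ttrace->entry_str so that
 * the vfs_getname handler can later splice the resolved pathname into the
 * pending entry string (see the thread_trace filename fields above).
 */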
1027
1028 static bool trace__filter_duration(struct trace *trace, double t)
1029 {
1030         return t < (trace->duration_filter * NSEC_PER_MSEC);
1031 }
1032
1033 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1034 {
1035         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1036
1037         return fprintf(fp, "%10.3f ", ts);
1038 }
1039
1040 static bool done = false;
1041 static bool interrupted = false;
1042
1043 static void sig_handler(int sig)
1044 {
1045         done = true;
1046         interrupted = sig == SIGINT;
1047 }
1048
1049 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1050                                         u64 duration, u64 tstamp, FILE *fp)
1051 {
1052         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1053         printed += fprintf_duration(duration, fp);
1054
1055         if (trace->multiple_threads) {
1056                 if (trace->show_comm)
1057                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1058                 printed += fprintf(fp, "%d ", thread->tid);
1059         }
1060
1061         return printed;
1062 }
1063
1064 static int trace__process_event(struct trace *trace, struct machine *machine,
1065                                 union perf_event *event, struct perf_sample *sample)
1066 {
1067         int ret = 0;
1068
1069         switch (event->header.type) {
1070         case PERF_RECORD_LOST:
1071                 color_fprintf(trace->output, PERF_COLOR_RED,
1072                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1073                 ret = machine__process_lost_event(machine, event, sample);
1074                 break;
1075         default:
1076                 ret = machine__process_event(machine, event, sample);
1077                 break;
1078         }
1079
1080         return ret;
1081 }
1082
1083 static int trace__tool_process(struct perf_tool *tool,
1084                                union perf_event *event,
1085                                struct perf_sample *sample,
1086                                struct machine *machine)
1087 {
1088         struct trace *trace = container_of(tool, struct trace, tool);
1089         return trace__process_event(trace, machine, event, sample);
1090 }
1091
1092 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1093 {
1094         struct machine *machine = vmachine;
1095
1096         if (machine->kptr_restrict_warned)
1097                 return NULL;
1098
1099         if (symbol_conf.kptr_restrict) {
1100                 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1101                            "Check /proc/sys/kernel/kptr_restrict.\n\n"
1102                            "Kernel samples will not be resolved.\n");
1103                 machine->kptr_restrict_warned = true;
1104                 return NULL;
1105         }
1106
1107         return machine__resolve_kernel_addr(vmachine, addrp, modp);
1108 }
1109
1110 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1111 {
1112         int err = symbol__init(NULL);
1113
1114         if (err)
1115                 return err;
1116
1117         trace->host = machine__new_host();
1118         if (trace->host == NULL)
1119                 return -ENOMEM;
1120
1121         if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1122                 return -errno;
1123
1124         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1125                                             evlist->threads, trace__tool_process, false,
1126                                             trace->opts.proc_map_timeout);
1127         if (err)
1128                 symbol__exit();
1129
1130         return err;
1131 }
1132
1133 static int syscall__set_arg_fmts(struct syscall *sc)
1134 {
1135         struct format_field *field;
1136         int idx = 0, len;
1137
1138         sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1139         if (sc->arg_scnprintf == NULL)
1140                 return -1;
1141
1142         if (sc->fmt)
1143                 sc->arg_parm = sc->fmt->arg_parm;
1144
1145         for (field = sc->args; field; field = field->next) {
1146                 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1147                         sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1148                 else if (strcmp(field->type, "const char *") == 0 &&
1149                          (strcmp(field->name, "filename") == 0 ||
1150                           strcmp(field->name, "path") == 0 ||
1151                           strcmp(field->name, "pathname") == 0))
1152                         sc->arg_scnprintf[idx] = SCA_FILENAME;
1153                 else if (field->flags & FIELD_IS_POINTER)
1154                         sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1155                 else if (strcmp(field->type, "pid_t") == 0)
1156                         sc->arg_scnprintf[idx] = SCA_PID;
1157                 else if (strcmp(field->type, "umode_t") == 0)
1158                         sc->arg_scnprintf[idx] = SCA_MODE_T;
1159                 else if ((strcmp(field->type, "int") == 0 ||
1160                           strcmp(field->type, "unsigned int") == 0 ||
1161                           strcmp(field->type, "long") == 0) &&
1162                          (len = strlen(field->name)) >= 2 &&
1163                          strcmp(field->name + len - 2, "fd") == 0) {
1164                         /*
1165                          * /sys/kernel/tracing/events/syscalls/sys_enter*
1166                          * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1167                          * 65 int
1168                          * 23 unsigned int
1169                          * 7 unsigned long
1170                          */
1171                         sc->arg_scnprintf[idx] = SCA_FD;
1172                 }
1173                 ++idx;
1174         }
1175
1176         return 0;
1177 }
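
/*
 * Example of the heuristics above (illustrative): for read(unsigned int fd,
 * char *buf, size_t count), 'fd' is an integer type whose name ends in "fd",
 * so it gets SCA_FD; 'buf' is a pointer, so it is printed as hex; 'count'
 * matches no rule and falls back to the plain "%ld" in
 * syscall__scnprintf_args().
 */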
1178
1179 static int trace__read_syscall_info(struct trace *trace, int id)
1180 {
1181         char tp_name[128];
1182         struct syscall *sc;
1183         const char *name = syscalltbl__name(trace->sctbl, id);
1184
1185         if (name == NULL)
1186                 return -1;
1187
1188         if (id > trace->syscalls.max) {
1189                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1190
1191                 if (nsyscalls == NULL)
1192                         return -1;
1193
1194                 if (trace->syscalls.max != -1) {
1195                         memset(nsyscalls + trace->syscalls.max + 1, 0,
1196                                (id - trace->syscalls.max) * sizeof(*sc));
1197                 } else {
1198                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1199                 }
1200
1201                 trace->syscalls.table = nsyscalls;
1202                 trace->syscalls.max   = id;
1203         }
1204
1205         sc = trace->syscalls.table + id;
1206         sc->name = name;
1207
1208         sc->fmt  = syscall_fmt__find(sc->name);
1209
1210         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1211         sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1212
1213         if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1214                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1215                 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1216         }
1217
1218         if (IS_ERR(sc->tp_format))
1219                 return -1;
1220
1221         sc->args = sc->tp_format->format.fields;
1222         sc->nr_args = sc->tp_format->format.nr_fields;
1223         /*
1224          * The first field carries the syscall number: '__syscall_nr' on
1225          * recent kernels, 'nr' on older ones. We already know the number,
1226          * so check for either name and drop that field from the args list.
1227          */
1228         if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1229                 sc->args = sc->args->next;
1230                 --sc->nr_args;
1231         }
1232
1233         sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1234
1235         return syscall__set_arg_fmts(sc);
1236 }
1237
1238 static int trace__validate_ev_qualifier(struct trace *trace)
1239 {
1240         int err = 0, i;
1241         struct str_node *pos;
1242
1243         trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1244         trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1245                                                  sizeof(trace->ev_qualifier_ids.entries[0]));
1246
1247         if (trace->ev_qualifier_ids.entries == NULL) {
1248                 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1249                        trace->output);
1250                 err = -EINVAL;
1251                 goto out;
1252         }
1253
1254         i = 0;
1255
1256         strlist__for_each_entry(pos, trace->ev_qualifier) {
1257                 const char *sc = pos->s;
1258                 int id = syscalltbl__id(trace->sctbl, sc);
1259
1260                 if (id < 0) {
1261                         if (err == 0) {
1262                                 fputs("Error:\tInvalid syscall ", trace->output);
1263                                 err = -EINVAL;
1264                         } else {
1265                                 fputs(", ", trace->output);
1266                         }
1267
1268                         fputs(sc, trace->output);
1269                 }
1270
1271                 trace->ev_qualifier_ids.entries[i++] = id;
1272         }
1273
1274         if (err < 0) {
1275                 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1276                       "\nHint:\tand: 'man syscalls'\n", trace->output);
1277                 zfree(&trace->ev_qualifier_ids.entries);
1278                 trace->ev_qualifier_ids.nr = 0;
1279         }
1280 out:
1281         return err;
1282 }
1283
1284 /*
1285  * args is to be interpreted as a series of longs but we need to handle
1286  * 8-byte unaligned accesses. args points to raw_data within the event
1287  * and raw_data is guaranteed not to be 8-byte aligned because it is
1288  * preceded by raw_size, which is a u32. So we need to copy args to a temp
1289  * variable to read it. Most notably this avoids extended load instructions
1290  * on unaligned addresses.
1291  */
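
/*
 * Sketch of the difference (illustrative): instead of dereferencing directly,
 *
 *	val = *(unsigned long *)(args + sizeof(unsigned long) * idx);
 *
 * which may turn into an unaligned 8-byte load, the loops below do
 *
 *	memcpy(&val, args + sizeof(unsigned long) * idx, sizeof(val));
 *
 * and let the compiler pick an access pattern that is safe on every arch.
 */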
1292
1293 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1294                                       unsigned char *args, struct trace *trace,
1295                                       struct thread *thread)
1296 {
1297         size_t printed = 0;
1298         unsigned char *p;
1299         unsigned long val;
1300
1301         if (sc->args != NULL) {
1302                 struct format_field *field;
1303                 u8 bit = 1;
1304                 struct syscall_arg arg = {
1305                         .idx    = 0,
1306                         .mask   = 0,
1307                         .trace  = trace,
1308                         .thread = thread,
1309                 };
1310
1311                 for (field = sc->args; field;
1312                      field = field->next, ++arg.idx, bit <<= 1) {
1313                         if (arg.mask & bit)
1314                                 continue;
1315
1316                         /* special care for unaligned accesses */
1317                         p = args + sizeof(unsigned long) * arg.idx;
1318                         memcpy(&val, p, sizeof(val));
1319
1320                         /*
1321                          * Suppress this argument if its value is zero and
1322                          * we don't have a string associated with it in a
1323                          * strarray.
1324                          */
1325                         if (val == 0 &&
1326                             !(sc->arg_scnprintf &&
1327                               sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1328                               sc->arg_parm[arg.idx]))
1329                                 continue;
1330
1331                         printed += scnprintf(bf + printed, size - printed,
1332                                              "%s%s: ", printed ? ", " : "", field->name);
1333                         if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1334                                 arg.val = val;
1335                                 if (sc->arg_parm)
1336                                         arg.parm = sc->arg_parm[arg.idx];
1337                                 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1338                                                                       size - printed, &arg);
1339                         } else {
1340                                 printed += scnprintf(bf + printed, size - printed,
1341                                                      "%ld", val);
1342                         }
1343                 }
1344         } else if (IS_ERR(sc->tp_format)) {
1345                 /*
1346                  * If we managed to read the tracepoint /format file, then we
1347                  * may end up not having any args, like with gettid(), so only
1348                  * print the raw args when we didn't manage to read it.
1349                  */
1350                 int i = 0;
1351
1352                 while (i < 6) {
1353                         /* special care for unaligned accesses */
1354                         p = args + sizeof(unsigned long) * i;
1355                         memcpy(&val, p, sizeof(val));
1356                         printed += scnprintf(bf + printed, size - printed,
1357                                              "%sarg%d: %ld",
1358                                              printed ? ", " : "", i, val);
1359                         ++i;
1360                 }
1361         }
1362
1363         return printed;
1364 }
1365
1366 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1367                                   union perf_event *event,
1368                                   struct perf_sample *sample);
1369
1370 static struct syscall *trace__syscall_info(struct trace *trace,
1371                                            struct perf_evsel *evsel, int id)
1372 {
1373
1374         if (id < 0) {
1375
1376                 /*
1377                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1378                  * before that, leaving it at a higher verbosity level till that is
1379                  * explained. Reproduced with plain ftrace with:
1380                  *
1381                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1382                  * grep "NR -1 " /t/trace_pipe
1383                  *
1384                  * after generating some load on the machine.
1385                  */
1386                 if (verbose > 1) {
1387                         static u64 n;
1388                         fprintf(trace->output, "Invalid syscall id %d, skipping (%s, %" PRIu64 ") ...\n",
1389                                 id, perf_evsel__name(evsel), ++n);
1390                 }
1391                 return NULL;
1392         }
1393
1394         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1395             trace__read_syscall_info(trace, id))
1396                 goto out_cant_read;
1397
1398         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1399                 goto out_cant_read;
1400
1401         return &trace->syscalls.table[id];
1402
1403 out_cant_read:
1404         if (verbose) {
1405                 fprintf(trace->output, "Problems reading syscall %d", id);
1406                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1407                         fprintf(trace->output, " (%s)", trace->syscalls.table[id].name);
1408                 fputs(" information\n", trace->output);
1409         }
1410         return NULL;
1411 }
1412
1413 static void thread__update_stats(struct thread_trace *ttrace,
1414                                  int id, struct perf_sample *sample)
1415 {
1416         struct int_node *inode;
1417         struct stats *stats;
1418         u64 duration = 0;
1419
1420         inode = intlist__findnew(ttrace->syscall_stats, id);
1421         if (inode == NULL)
1422                 return;
1423
1424         stats = inode->priv;
1425         if (stats == NULL) {
1426                 stats = malloc(sizeof(struct stats));
1427                 if (stats == NULL)
1428                         return;
1429                 init_stats(stats);
1430                 inode->priv = stats;
1431         }
1432
1433         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1434                 duration = sample->time - ttrace->entry_time;
1435
1436         update_stats(stats, duration);
1437 }
1438
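     /*
      * Flush a still-pending sys_enter line when some other event interrupts
      * it before the matching sys_exit arrives, e.g. (illustrative):
      *
      *   1002.148 ( ): foo/1234 nanosleep(rqtp: 0x7ffd...                ) ...
      *
      * The matching sys_exit later prints a " ... [continued]: nanosleep()" line.
      */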
1439 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1440 {
1441         struct thread_trace *ttrace;
1442         u64 duration;
1443         size_t printed;
1444
1445         if (trace->current == NULL)
1446                 return 0;
1447
1448         ttrace = thread__priv(trace->current);
1449
1450         if (!ttrace->entry_pending)
1451                 return 0;
1452
1453         duration = sample->time - ttrace->entry_time;
1454
1455         printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1456         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1457         ttrace->entry_pending = false;
1458
1459         return printed;
1460 }
1461
1462 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1463                             union perf_event *event __maybe_unused,
1464                             struct perf_sample *sample)
1465 {
1466         char *msg;
1467         void *args;
1468         size_t printed = 0;
1469         struct thread *thread;
1470         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1471         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1472         struct thread_trace *ttrace;
1473
1474         if (sc == NULL)
1475                 return -1;
1476
1477         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1478         ttrace = thread__trace(thread, trace->output);
1479         if (ttrace == NULL)
1480                 goto out_put;
1481
1482         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1483
1484         if (ttrace->entry_str == NULL) {
1485                 ttrace->entry_str = malloc(trace__entry_str_size);
1486                 if (!ttrace->entry_str)
1487                         goto out_put;
1488         }
1489
1490         if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1491                 trace__printf_interrupted_entry(trace, sample);
1492
1493         ttrace->entry_time = sample->time;
1494         msg = ttrace->entry_str;
1495         printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1496
1497         printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1498                                            args, trace, thread);
1499
1500         if (sc->is_exit) {
1501                 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1502                         trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1503                         fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1504                 }
1505         } else {
1506                 ttrace->entry_pending = true;
1507                 /* See trace__vfs_getname & trace__sys_exit */
1508                 ttrace->filename.pending_open = false;
1509         }
1510
1511         if (trace->current != thread) {
1512                 thread__put(trace->current);
1513                 trace->current = thread__get(thread);
1514         }
1515         err = 0;
1516 out_put:
1517         thread__put(thread);
1518         return err;
1519 }
1520
1521 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1522                                     struct perf_sample *sample,
1523                                     struct callchain_cursor *cursor)
1524 {
1525         struct addr_location al;
1526
1527         if (machine__resolve(trace->host, &al, sample) < 0 ||
1528             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1529                 return -1;
1530
1531         return 0;
1532 }
1533
1534 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1535 {
1536         /* TODO: user-configurable print_opts */
1537         const unsigned int print_opts = EVSEL__PRINT_SYM |
1538                                         EVSEL__PRINT_DSO |
1539                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1540
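             /* 38 is the left alignment column used for each printed callchain entry */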
1541         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1542 }
1543
1544 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1545                            union perf_event *event __maybe_unused,
1546                            struct perf_sample *sample)
1547 {
1548         long ret;
1549         u64 duration = 0;
1550         struct thread *thread;
1551         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1552         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1553         struct thread_trace *ttrace;
1554
1555         if (sc == NULL)
1556                 return -1;
1557
1558         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1559         ttrace = thread__trace(thread, trace->output);
1560         if (ttrace == NULL)
1561                 goto out_put;
1562
1563         if (trace->summary)
1564                 thread__update_stats(ttrace, id, sample);
1565
1566         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1567
1568         if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1569                 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1570                 ttrace->filename.pending_open = false;
1571                 ++trace->stats.vfs_getname;
1572         }
1573
1574         ttrace->exit_time = sample->time;
1575
1576         if (ttrace->entry_time) {
1577                 duration = sample->time - ttrace->entry_time;
1578                 if (trace__filter_duration(trace, duration))
1579                         goto out;
1580         } else if (trace->duration_filter)
1581                 goto out;
1582
1583         if (sample->callchain) {
1584                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1585                 if (callchain_ret == 0) {
1586                         if (callchain_cursor.nr < trace->min_stack)
1587                                 goto out;
1588                         callchain_ret = 1;
1589                 }
1590         }
1591
1592         if (trace->summary_only)
1593                 goto out;
1594
1595         trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1596
1597         if (ttrace->entry_pending) {
1598                 fprintf(trace->output, "%-70s", ttrace->entry_str);
1599         } else {
1600                 fprintf(trace->output, " ... [");
1601                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1602                 fprintf(trace->output, "]: %s()", sc->name);
1603         }
1604
1605         if (sc->fmt == NULL) {
1606 signed_print:
1607                 fprintf(trace->output, ") = %ld", ret);
1608         } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1609                 char bf[STRERR_BUFSIZE];
1610                 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1611                            *e = audit_errno_to_name(-ret);
1612
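                     /* e.g. (illustrative): ") = -1 ENOENT No such file or directory" */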
1613                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1614         } else if (ret == 0 && sc->fmt->timeout)
1615                 fprintf(trace->output, ") = 0 Timeout");
1616         else if (sc->fmt->hexret)
1617                 fprintf(trace->output, ") = %#lx", ret);
1618         else if (sc->fmt->errpid) {
1619                 struct thread *child = machine__find_thread(trace->host, ret, ret);
1620
1621                 if (child != NULL) {
1622                         fprintf(trace->output, ") = %ld", ret);
1623                         if (child->comm_set)
1624                                 fprintf(trace->output, " (%s)", thread__comm_str(child));
1625                         thread__put(child);
1626                 }
1627         } else
1628                 goto signed_print;
1629
1630         fputc('\n', trace->output);
1631
1632         if (callchain_ret > 0)
1633                 trace__fprintf_callchain(trace, sample);
1634         else if (callchain_ret < 0)
1635                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1636 out:
1637         ttrace->entry_pending = false;
1638         err = 0;
1639 out_put:
1640         thread__put(thread);
1641         return err;
1642 }
1643
1644 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1645                               union perf_event *event __maybe_unused,
1646                               struct perf_sample *sample)
1647 {
1648         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1649         struct thread_trace *ttrace;
1650         size_t filename_len, entry_str_len, to_move;
1651         ssize_t remaining_space;
1652         char *pos;
1653         const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1654
1655         if (!thread)
1656                 goto out;
1657
1658         ttrace = thread__priv(thread);
1659         if (!ttrace)
1660                 goto out;
1661
1662         filename_len = strlen(filename);
1663
1664         if (ttrace->filename.namelen < filename_len) {
1665                 char *f = realloc(ttrace->filename.name, filename_len + 1);
1666
1667                 if (f == NULL)
1668                         goto out;
1669
1670                 ttrace->filename.namelen = filename_len;
1671                 ttrace->filename.name = f;
1672         }
1673
1674         strcpy(ttrace->filename.name, filename);
1675         ttrace->filename.pending_open = true;
1676
1677         if (!ttrace->filename.ptr)
1678                 goto out;
1679
1680         entry_str_len = strlen(ttrace->entry_str);
1681         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1682         if (remaining_space <= 0)
1683                 goto out;
1684
1685         if (filename_len > (size_t)remaining_space) {
1686                 filename += filename_len - remaining_space;
1687                 filename_len = remaining_space;
1688         }
1689
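             /*
              * Splice the name into the already-formatted entry string at the
              * position remembered in filename.entry_str_pos, e.g. (illustrative)
              * "open(filename: , flags: CLOEXEC" -> "open(filename: /etc/passwd, flags: CLOEXEC":
              * shift the tail right by filename_len, then copy the name into the gap.
              */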
1690         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1691         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1692         memmove(pos + filename_len, pos, to_move);
1693         memcpy(pos, filename, filename_len);
1694
1695         ttrace->filename.ptr = 0;
1696         ttrace->filename.entry_str_pos = 0;
1697 out:
1698         return 0;
1699 }
1700
1701 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1702                                      union perf_event *event __maybe_unused,
1703                                      struct perf_sample *sample)
1704 {
1705         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1706         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1707         struct thread *thread = machine__findnew_thread(trace->host,
1708                                                         sample->pid,
1709                                                         sample->tid);
1710         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1711
1712         if (ttrace == NULL)
1713                 goto out_dump;
1714
1715         ttrace->runtime_ms += runtime_ms;
1716         trace->runtime_ms += runtime_ms;
1717         thread__put(thread);
1718         return 0;
1719
1720 out_dump:
1721         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1722                evsel->name,
1723                perf_evsel__strval(evsel, sample, "comm"),
1724                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1725                runtime,
1726                perf_evsel__intval(evsel, sample, "vruntime"));
1727         thread__put(thread);
1728         return 0;
1729 }
1730
1731 static void bpf_output__printer(enum binary_printer_ops op,
1732                                 unsigned int val, void *extra)
1733 {
1734         FILE *output = extra;
1735         unsigned char ch = (unsigned char)val;
1736
1737         switch (op) {
1738         case BINARY_PRINT_CHAR_DATA:
1739                 fprintf(output, "%c", isprint(ch) ? ch : '.');
1740                 break;
1741         case BINARY_PRINT_DATA_BEGIN:
1742         case BINARY_PRINT_LINE_BEGIN:
1743         case BINARY_PRINT_ADDR:
1744         case BINARY_PRINT_NUM_DATA:
1745         case BINARY_PRINT_NUM_PAD:
1746         case BINARY_PRINT_SEP:
1747         case BINARY_PRINT_CHAR_PAD:
1748         case BINARY_PRINT_LINE_END:
1749         case BINARY_PRINT_DATA_END:
1750         default:
1751                 break;
1752         }
1753 }
1754
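     /*
      * Dump a bpf-output event's raw payload via print_binary(): only
      * BINARY_PRINT_CHAR_DATA is handled above, so the payload comes out as a
      * plain run of characters, with '.' standing in for non-printable bytes.
      */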
1755 static void bpf_output__fprintf(struct trace *trace,
1756                                 struct perf_sample *sample)
1757 {
1758         print_binary(sample->raw_data, sample->raw_size, 8,
1759                      bpf_output__printer, trace->output);
1760 }
1761
1762 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1763                                 union perf_event *event __maybe_unused,
1764                                 struct perf_sample *sample)
1765 {
1766         int callchain_ret = 0;
1767
1768         if (sample->callchain) {
1769                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1770                 if (callchain_ret == 0) {
1771                         if (callchain_cursor.nr < trace->min_stack)
1772                                 goto out;
1773                         callchain_ret = 1;
1774                 }
1775         }
1776
1777         trace__printf_interrupted_entry(trace, sample);
1778         trace__fprintf_tstamp(trace, sample->time, trace->output);
1779
1780         if (trace->trace_syscalls)
1781                 fprintf(trace->output, "(         ): ");
1782
1783         fprintf(trace->output, "%s:", evsel->name);
1784
1785         if (perf_evsel__is_bpf_output(evsel)) {
1786                 bpf_output__fprintf(trace, sample);
1787         } else if (evsel->tp_format) {
1788                 event_format__fprintf(evsel->tp_format, sample->cpu,
1789                                       sample->raw_data, sample->raw_size,
1790                                       trace->output);
1791         }
1792
1793         fprintf(trace->output, ")\n");
1794
1795         if (callchain_ret > 0)
1796                 trace__fprintf_callchain(trace, sample);
1797         else if (callchain_ret < 0)
1798                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1799 out:
1800         return 0;
1801 }
1802
1803 static void print_location(FILE *f, struct perf_sample *sample,
1804                            struct addr_location *al,
1805                            bool print_dso, bool print_sym)
1806 {
1807
1808         if ((verbose || print_dso) && al->map)
1809                 fprintf(f, "%s@", al->map->dso->long_name);
1810
1811         if ((verbose || print_sym) && al->sym)
1812                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1813                         al->addr - al->sym->start);
1814         else if (al->map)
1815                 fprintf(f, "0x%" PRIx64, al->addr);
1816         else
1817                 fprintf(f, "0x%" PRIx64, sample->addr);
1818 }
1819
1820 static int trace__pgfault(struct trace *trace,
1821                           struct perf_evsel *evsel,
1822                           union perf_event *event __maybe_unused,
1823                           struct perf_sample *sample)
1824 {
1825         struct thread *thread;
1826         struct addr_location al;
1827         char map_type = 'd';
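             /* 'd': fault address is in a data map; may become 'x' (executable map) or '?' (unresolved) below */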
1828         struct thread_trace *ttrace;
1829         int err = -1;
1830         int callchain_ret = 0;
1831
1832         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1833
1834         if (sample->callchain) {
1835                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1836                 if (callchain_ret == 0) {
1837                         if (callchain_cursor.nr < trace->min_stack)
1838                                 goto out_put;
1839                         callchain_ret = 1;
1840                 }
1841         }
1842
1843         ttrace = thread__trace(thread, trace->output);
1844         if (ttrace == NULL)
1845                 goto out_put;
1846
1847         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1848                 ttrace->pfmaj++;
1849         else
1850                 ttrace->pfmin++;
1851
1852         if (trace->summary_only)
1853                 goto out;
1854
1855         thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1856                               sample->ip, &al);
1857
1858         trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
1859
1860         fprintf(trace->output, "%sfault [",
1861                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1862                 "maj" : "min");
1863
1864         print_location(trace->output, sample, &al, false, true);
1865
1866         fprintf(trace->output, "] => ");
1867
1868         thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1869                                    sample->addr, &al);
1870
1871         if (!al.map) {
1872                 thread__find_addr_location(thread, sample->cpumode,
1873                                            MAP__FUNCTION, sample->addr, &al);
1874
1875                 if (al.map)
1876                         map_type = 'x';
1877                 else
1878                         map_type = '?';
1879         }
1880
1881         print_location(trace->output, sample, &al, true, false);
1882
1883         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1884
1885         if (callchain_ret > 0)
1886                 trace__fprintf_callchain(trace, sample);
1887         else if (callchain_ret < 0)
1888                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1889 out:
1890         err = 0;
1891 out_put:
1892         thread__put(thread);
1893         return err;
1894 }
1895
1896 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1897 {
1898         if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1899             (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1900                 return false;
1901
1902         if (trace->pid_list || trace->tid_list)
1903                 return true;
1904
1905         return false;
1906 }
1907
1908 static void trace__set_base_time(struct trace *trace,
1909                                  struct perf_evsel *evsel,
1910                                  struct perf_sample *sample)
1911 {
1912         /*
1913          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1914          * and don't use sample->time unconditionally, we may end up having
1915          * some other event in the future without PERF_SAMPLE_TIME for good
1916          * reason, i.e. we may not be interested in its timestamps, just in
1917          * it taking place, picking some piece of information when it
1918          * appears in our event stream (vfs_getname comes to mind).
1919          */
1920         if (trace->base_time == 0 && !trace->full_time &&
1921             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1922                 trace->base_time = sample->time;
1923 }
1924
1925 static int trace__process_sample(struct perf_tool *tool,
1926                                  union perf_event *event,
1927                                  struct perf_sample *sample,
1928                                  struct perf_evsel *evsel,
1929                                  struct machine *machine __maybe_unused)
1930 {
1931         struct trace *trace = container_of(tool, struct trace, tool);
1932         int err = 0;
1933
1934         tracepoint_handler handler = evsel->handler;
1935
1936         if (skip_sample(trace, sample))
1937                 return 0;
1938
1939         trace__set_base_time(trace, evsel, sample);
1940
1941         if (handler) {
1942                 ++trace->nr_events;
1943                 handler(trace, evsel, event, sample);
1944         }
1945
1946         return err;
1947 }
1948
1949 static int parse_target_str(struct trace *trace)
1950 {
1951         if (trace->opts.target.pid) {
1952                 trace->pid_list = intlist__new(trace->opts.target.pid);
1953                 if (trace->pid_list == NULL) {
1954                         pr_err("Error parsing process id string\n");
1955                         return -EINVAL;
1956                 }
1957         }
1958
1959         if (trace->opts.target.tid) {
1960                 trace->tid_list = intlist__new(trace->opts.target.tid);
1961                 if (trace->tid_list == NULL) {
1962                         pr_err("Error parsing thread id string\n");
1963                         return -EINVAL;
1964                 }
1965         }
1966
1967         return 0;
1968 }
1969
1970 static int trace__record(struct trace *trace, int argc, const char **argv)
1971 {
1972         unsigned int rec_argc, i, j;
1973         const char **rec_argv;
1974         const char * const record_args[] = {
1975                 "record",
1976                 "-R",
1977                 "-m", "1024",
1978                 "-c", "1",
1979         };
1980
1981         const char * const sc_args[] = { "-e", };
1982         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1983         const char * const majpf_args[] = { "-e", "major-faults" };
1984         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1985         const char * const minpf_args[] = { "-e", "minor-faults" };
1986         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1987
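             /*
              * Illustrative final command line: "perf record -R -m 1024 -c 1
              * -e raw_syscalls:sys_enter,raw_syscalls:sys_exit [-e major-faults]
              * [-e minor-faults] <workload>", depending on the options below.
              */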
1988         /* +1 is for the event string below */
1989         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1990                 majpf_args_nr + minpf_args_nr + argc;
1991         rec_argv = calloc(rec_argc + 1, sizeof(char *));
1992
1993         if (rec_argv == NULL)
1994                 return -ENOMEM;
1995
1996         j = 0;
1997         for (i = 0; i < ARRAY_SIZE(record_args); i++)
1998                 rec_argv[j++] = record_args[i];
1999
2000         if (trace->trace_syscalls) {
2001                 for (i = 0; i < sc_args_nr; i++)
2002                         rec_argv[j++] = sc_args[i];
2003
2004                 /* event string may be different for older kernels - e.g., RHEL6 */
2005                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2006                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2007                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2008                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2009                 else {
2010                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2011                         return -1;
2012                 }
2013         }
2014
2015         if (trace->trace_pgfaults & TRACE_PFMAJ)
2016                 for (i = 0; i < majpf_args_nr; i++)
2017                         rec_argv[j++] = majpf_args[i];
2018
2019         if (trace->trace_pgfaults & TRACE_PFMIN)
2020                 for (i = 0; i < minpf_args_nr; i++)
2021                         rec_argv[j++] = minpf_args[i];
2022
2023         for (i = 0; i < (unsigned int)argc; i++)
2024                 rec_argv[j++] = argv[i];
2025
2026         return cmd_record(j, rec_argv, NULL);
2027 }
2028
2029 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2030
2031 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2032 {
2033         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2034
2035         if (IS_ERR(evsel))
2036                 return false;
2037
2038         if (perf_evsel__field(evsel, "pathname") == NULL) {
2039                 perf_evsel__delete(evsel);
2040                 return false;
2041         }
2042
2043         evsel->handler = trace__vfs_getname;
2044         perf_evlist__add(evlist, evsel);
2045         return true;
2046 }
2047
2048 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2049 {
2050         struct perf_evsel *evsel;
2051         struct perf_event_attr attr = {
2052                 .type = PERF_TYPE_SOFTWARE,
2053                 .mmap_data = 1,
2054         };
2055
2056         attr.config = config;
2057         attr.sample_period = 1;
2058
2059         event_attr_init(&attr);
2060
2061         evsel = perf_evsel__new(&attr);
2062         if (evsel)
2063                 evsel->handler = trace__pgfault;
2064
2065         return evsel;
2066 }
2067
2068 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2069 {
2070         const u32 type = event->header.type;
2071         struct perf_evsel *evsel;
2072
2073         if (type != PERF_RECORD_SAMPLE) {
2074                 trace__process_event(trace, trace->host, event, sample);
2075                 return;
2076         }
2077
2078         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2079         if (evsel == NULL) {
2080                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2081                 return;
2082         }
2083
2084         trace__set_base_time(trace, evsel, sample);
2085
2086         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2087             sample->raw_data == NULL) {
2088                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2089                        perf_evsel__name(evsel), sample->tid,
2090                        sample->cpu, sample->raw_size);
2091         } else {
2092                 tracepoint_handler handler = evsel->handler;
2093                 handler(trace, evsel, event, sample);
2094         }
2095 }
2096
2097 static int trace__add_syscall_newtp(struct trace *trace)
2098 {
2099         int ret = -1;
2100         struct perf_evlist *evlist = trace->evlist;
2101         struct perf_evsel *sys_enter, *sys_exit;
2102
2103         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2104         if (sys_enter == NULL)
2105                 goto out;
2106
2107         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2108                 goto out_delete_sys_enter;
2109
2110         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2111         if (sys_exit == NULL)
2112                 goto out_delete_sys_enter;
2113
2114         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2115                 goto out_delete_sys_exit;
2116
2117         perf_evlist__add(evlist, sys_enter);
2118         perf_evlist__add(evlist, sys_exit);
2119
2120         if (callchain_param.enabled && !trace->kernel_syscallchains) {
2121                 /*
2122                  * We're interested only in the user space callchain
2123                  * leading to the syscall, allow overriding that for
2124                  * debugging reasons using --kernel-syscall-graph
2125                  */
2126                 sys_exit->attr.exclude_callchain_kernel = 1;
2127         }
2128
2129         trace->syscalls.events.sys_enter = sys_enter;
2130         trace->syscalls.events.sys_exit  = sys_exit;
2131
2132         ret = 0;
2133 out:
2134         return ret;
2135
2136 out_delete_sys_exit:
2137         perf_evsel__delete_priv(sys_exit);
2138 out_delete_sys_enter:
2139         perf_evsel__delete_priv(sys_enter);
2140         goto out;
2141 }
2142
2143 static int trace__set_ev_qualifier_filter(struct trace *trace)
2144 {
2145         int err = -1;
2146         struct perf_evsel *sys_exit;
2147         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2148                                                 trace->ev_qualifier_ids.nr,
2149                                                 trace->ev_qualifier_ids.entries);
2150
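             /*
              * 'filter' is a tracepoint filter expression over the syscall id,
              * e.g. (illustrative) something like "id == 2 || id == 3", or the
              * negated form when the qualifier list was prefixed with '!'; it is
              * appended to both the sys_enter and sys_exit events below.
              */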
2151         if (filter == NULL)
2152                 goto out_enomem;
2153
2154         if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2155                                           filter)) {
2156                 sys_exit = trace->syscalls.events.sys_exit;
2157                 err = perf_evsel__append_tp_filter(sys_exit, filter);
2158         }
2159
2160         free(filter);
2161 out:
2162         return err;
2163 out_enomem:
2164         errno = ENOMEM;
2165         goto out;
2166 }
2167
2168 static int trace__run(struct trace *trace, int argc, const char **argv)
2169 {
2170         struct perf_evlist *evlist = trace->evlist;
2171         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2172         int err = -1, i;
2173         unsigned long before;
2174         const bool forks = argc > 0;
2175         bool draining = false;
2176
2177         trace->live = true;
2178
2179         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2180                 goto out_error_raw_syscalls;
2181
2182         if (trace->trace_syscalls)
2183                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2184
2185         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2186                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2187                 if (pgfault_maj == NULL)
2188                         goto out_error_mem;
2189                 perf_evlist__add(evlist, pgfault_maj);
2190         }
2191
2192         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2193                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2194                 if (pgfault_min == NULL)
2195                         goto out_error_mem;
2196                 perf_evlist__add(evlist, pgfault_min);
2197         }
2198
2199         if (trace->sched &&
2200             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2201                                    trace__sched_stat_runtime))
2202                 goto out_error_sched_stat_runtime;
2203
2204         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2205         if (err < 0) {
2206                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2207                 goto out_delete_evlist;
2208         }
2209
2210         err = trace__symbols_init(trace, evlist);
2211         if (err < 0) {
2212                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2213                 goto out_delete_evlist;
2214         }
2215
2216         perf_evlist__config(evlist, &trace->opts, NULL);
2217
2218         if (callchain_param.enabled) {
2219                 bool use_identifier = false;
2220
2221                 if (trace->syscalls.events.sys_exit) {
2222                         perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2223                                                      &trace->opts, &callchain_param);
2224                         use_identifier = true;
2225                 }
2226
2227                 if (pgfault_maj) {
2228                         perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2229                         use_identifier = true;
2230                 }
2231
2232                 if (pgfault_min) {
2233                         perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2234                         use_identifier = true;
2235                 }
2236
2237                 if (use_identifier) {
2238                        /*
2239                         * Now we have evsels with different sample_ids, use
2240                         * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2241                         * from a fixed position in each ring buffer record.
2242                         *
2243                         * As of the changeset introducing this comment, this
2244                         * isn't strictly needed, as the fields that can come before
2245                         * PERF_SAMPLE_ID are all used, but we'll probably disable
2246                         * some of those for things like copying the payload of
2247                         * pointer syscall arguments, and for vfs_getname we don't
2248                         * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2249                         * here as a reminder that we need to use PERF_SAMPLE_IDENTIFIER.
2250                         */
2251                         perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2252                         perf_evlist__reset_sample_bit(evlist, ID);
2253                 }
2254         }
2255
2256         signal(SIGCHLD, sig_handler);
2257         signal(SIGINT, sig_handler);
2258
2259         if (forks) {
2260                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2261                                                     argv, false, NULL);
2262                 if (err < 0) {
2263                         fprintf(trace->output, "Couldn't run the workload!\n");
2264                         goto out_delete_evlist;
2265                 }
2266         }
2267
2268         err = perf_evlist__open(evlist);
2269         if (err < 0)
2270                 goto out_error_open;
2271
2272         err = bpf__apply_obj_config();
2273         if (err) {
2274                 char errbuf[BUFSIZ];
2275
2276                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2277                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2278                          errbuf);
2279                 goto out_error_open;
2280         }
2281
2282         /*
2283          * Better not use !target__has_task() here because we need to cover the
2284          * case where no threads were specified in the command line, but a
2285          * workload was, and in that case we will fill in the thread_map when
2286          * we fork the workload in perf_evlist__prepare_workload.
2287          */
2288         if (trace->filter_pids.nr > 0)
2289                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2290         else if (thread_map__pid(evlist->threads, 0) == -1)
2291                 err = perf_evlist__set_filter_pid(evlist, getpid());
2292
2293         if (err < 0)
2294                 goto out_error_mem;
2295
2296         if (trace->ev_qualifier_ids.nr > 0) {
2297                 err = trace__set_ev_qualifier_filter(trace);
2298                 if (err < 0)
2299                         goto out_errno;
2300
2301                 pr_debug("event qualifier tracepoint filter: %s\n",
2302                          trace->syscalls.events.sys_exit->filter);
2303         }
2304
2305         err = perf_evlist__apply_filters(evlist, &evsel);
2306         if (err < 0)
2307                 goto out_error_apply_filters;
2308
2309         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2310         if (err < 0)
2311                 goto out_error_mmap;
2312
2313         if (!target__none(&trace->opts.target))
2314                 perf_evlist__enable(evlist);
2315
2316         if (forks)
2317                 perf_evlist__start_workload(evlist);
2318
2319         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2320                                   evlist->threads->nr > 1 ||
2321                                   perf_evlist__first(evlist)->attr.inherit;
2322 again:
2323         before = trace->nr_events;
2324
2325         for (i = 0; i < evlist->nr_mmaps; i++) {
2326                 union perf_event *event;
2327
2328                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2329                         struct perf_sample sample;
2330
2331                         ++trace->nr_events;
2332
2333                         err = perf_evlist__parse_sample(evlist, event, &sample);
2334                         if (err) {
2335                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2336                                 goto next_event;
2337                         }
2338
2339                         trace__handle_event(trace, event, &sample);
2340 next_event:
2341                         perf_evlist__mmap_consume(evlist, i);
2342
2343                         if (interrupted)
2344                                 goto out_disable;
2345
2346                         if (done && !draining) {
2347                                 perf_evlist__disable(evlist);
2348                                 draining = true;
2349                         }
2350                 }
2351         }
2352
2353         if (trace->nr_events == before) {
2354                 int timeout = done ? 100 : -1;
2355
2356                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2357                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2358                                 draining = true;
2359
2360                         goto again;
2361                 }
2362         } else {
2363                 goto again;
2364         }
2365
2366 out_disable:
2367         thread__zput(trace->current);
2368
2369         perf_evlist__disable(evlist);
2370
2371         if (!err) {
2372                 if (trace->summary)
2373                         trace__fprintf_thread_summary(trace, trace->output);
2374
2375                 if (trace->show_tool_stats) {
2376                         fprintf(trace->output, "Stats:\n "
2377                                                " vfs_getname : %" PRIu64 "\n"
2378                                                " proc_getname: %" PRIu64 "\n",
2379                                 trace->stats.vfs_getname,
2380                                 trace->stats.proc_getname);
2381                 }
2382         }
2383
2384 out_delete_evlist:
2385         perf_evlist__delete(evlist);
2386         trace->evlist = NULL;
2387         trace->live = false;
2388         return err;
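     /*
      * Not reached by fall-through: the block below exists only to give the
      * error labels a shared 'errbuf'; it is entered exclusively via goto.
      */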
2389 {
2390         char errbuf[BUFSIZ];
2391
2392 out_error_sched_stat_runtime:
2393         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2394         goto out_error;
2395
2396 out_error_raw_syscalls:
2397         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2398         goto out_error;
2399
2400 out_error_mmap:
2401         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2402         goto out_error;
2403
2404 out_error_open:
2405         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2406
2407 out_error:
2408         fprintf(trace->output, "%s\n", errbuf);
2409         goto out_delete_evlist;
2410
2411 out_error_apply_filters:
2412         fprintf(trace->output,
2413                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2414                 evsel->filter, perf_evsel__name(evsel), errno,
2415                 str_error_r(errno, errbuf, sizeof(errbuf)));
2416         goto out_delete_evlist;
2417 }
2418 out_error_mem:
2419         fprintf(trace->output, "Not enough memory to run!\n");
2420         goto out_delete_evlist;
2421
2422 out_errno:
2423         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2424         goto out_delete_evlist;
2425 }
2426
2427 static int trace__replay(struct trace *trace)
2428 {
2429         const struct perf_evsel_str_handler handlers[] = {
2430                 { "probe:vfs_getname",       trace__vfs_getname, },
2431         };
2432         struct perf_data_file file = {
2433                 .path  = input_name,
2434                 .mode  = PERF_DATA_MODE_READ,
2435                 .force = trace->force,
2436         };
2437         struct perf_session *session;
2438         struct perf_evsel *evsel;
2439         int err = -1;
2440
2441         trace->tool.sample        = trace__process_sample;
2442         trace->tool.mmap          = perf_event__process_mmap;
2443         trace->tool.mmap2         = perf_event__process_mmap2;
2444         trace->tool.comm          = perf_event__process_comm;
2445         trace->tool.exit          = perf_event__process_exit;
2446         trace->tool.fork          = perf_event__process_fork;
2447         trace->tool.attr          = perf_event__process_attr;
2448         trace->tool.tracing_data = perf_event__process_tracing_data;
2449         trace->tool.build_id      = perf_event__process_build_id;
2450
2451         trace->tool.ordered_events = true;
2452         trace->tool.ordering_requires_timestamps = true;
2453
2454         /* add tid to output */
2455         trace->multiple_threads = true;
2456
2457         session = perf_session__new(&file, false, &trace->tool);
2458         if (session == NULL)
2459                 return -1;
2460
2461         if (symbol__init(&session->header.env) < 0)
2462                 goto out;
2463
2464         trace->host = &session->machines.host;
2465
2466         err = perf_session__set_tracepoints_handlers(session, handlers);
2467         if (err)
2468                 goto out;
2469
2470         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2471                                                      "raw_syscalls:sys_enter");
2472         /* older kernels have syscalls tp versus raw_syscalls */
2473         if (evsel == NULL)
2474                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2475                                                              "syscalls:sys_enter");
2476
2477         if (evsel &&
2478             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2479             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2480                 pr_err("Error initializing raw_syscalls:sys_enter event\n");
2481                 goto out;
2482         }
2483
2484         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2485                                                      "raw_syscalls:sys_exit");
2486         if (evsel == NULL)
2487                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2488                                                              "syscalls:sys_exit");
2489         if (evsel &&
2490             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2491             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2492                 pr_err("Error initializing raw_syscalls:sys_exit event\n");
2493                 goto out;
2494         }
2495
2496         evlist__for_each_entry(session->evlist, evsel) {
2497                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2498                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2499                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2500                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2501                         evsel->handler = trace__pgfault;
2502         }
2503
2504         err = parse_target_str(trace);
2505         if (err != 0)
2506                 goto out;
2507
2508         setup_pager();
2509
2510         err = perf_session__process_events(session);
2511         if (err)
2512                 pr_err("Failed to process events, error %d\n", err);
2513         else if (trace->summary)
2515                 trace__fprintf_thread_summary(trace, trace->output);
2516
2517 out:
2518         perf_session__delete(session);
2519
2520         return err;
2521 }
2522
2523 static size_t trace__fprintf_threads_header(FILE *fp)
2524 {
2525         size_t printed;
2526
2527         printed  = fprintf(fp, "\n Summary of events:\n\n");
2528
2529         return printed;
2530 }
2531
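     /*
      * rb_resort.h helper: build a second rb tree over a thread's
      * syscall_stats intlist, keyed by the total time (msecs) spent in each
      * syscall, so thread__dump_stats() can walk the syscalls in time order;
      * the block that follows fills in each re-sorted entry.
      */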
2532 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2533         struct stats    *stats;
2534         double          msecs;
2535         int             syscall;
2536 )
2537 {
2538         struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2539         struct stats *stats = source->priv;
2540
2541         entry->syscall = source->i;
2542         entry->stats   = stats;
2543         entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2544 }
2545
2546 static size_t thread__dump_stats(struct thread_trace *ttrace,
2547                                  struct trace *trace, FILE *fp)
2548 {
2549         size_t printed = 0;
2550         struct syscall *sc;
2551         struct rb_node *nd;
2552         DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2553
2554         if (syscall_stats == NULL)
2555                 return 0;
2556
2557         printed += fprintf(fp, "\n");
2558
2559         printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2560         printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2561         printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2562
2563         resort_rb__for_each_entry(nd, syscall_stats) {
2564                 struct stats *stats = syscall_stats_entry->stats;
2565                 if (stats) {
2566                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2567                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2568                         double avg = avg_stats(stats);
2569                         double pct;
2570                         u64 n = (u64) stats->n;
2571
2572                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2573                         avg /= NSEC_PER_MSEC;
2574
2575                         sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2576                         printed += fprintf(fp, "   %-15s", sc->name);
2577                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2578                                            n, syscall_stats_entry->msecs, min, avg);
2579                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2580                 }
2581         }
2582
2583         resort_rb__delete(syscall_stats);
2584         printed += fprintf(fp, "\n\n");
2585
2586         return printed;
2587 }
2588
2589 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2590 {
2591         size_t printed = 0;
2592         struct thread_trace *ttrace = thread__priv(thread);
2593         double ratio;
2594
2595         if (ttrace == NULL)
2596                 return 0;
2597
2598         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2599
2600         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2601         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2602         printed += fprintf(fp, "%.1f%%", ratio);
2603         if (ttrace->pfmaj)
2604                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2605         if (ttrace->pfmin)
2606                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2607         if (trace->sched)
2608                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2609         else if (fputc('\n', fp) != EOF)
2610                 ++printed;
2611
2612         printed += thread__dump_stats(ttrace, trace, fp);
2613
2614         return printed;
2615 }
2616
2617 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2618 {
2619         return ttrace ? ttrace->nr_events : 0;
2620 }
2621
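     /* Same resort trick for threads: order the summary by per-thread event count. */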
2622 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2623         struct thread *thread;
2624 )
2625 {
2626         entry->thread = rb_entry(nd, struct thread, rb_node);
2627 }
2628
2629 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2630 {
2631         DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2632         size_t printed = trace__fprintf_threads_header(fp);
2633         struct rb_node *nd;
2634
2635         if (threads == NULL) {
2636                 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2637                 return 0;
2638         }
2639
2640         resort_rb__for_each_entry(nd, threads)
2641                 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2642
2643         resort_rb__delete(threads);
2644
2645         return printed;
2646 }
2647
2648 static int trace__set_duration(const struct option *opt, const char *str,
2649                                int unset __maybe_unused)
2650 {
2651         struct trace *trace = opt->value;
2652
2653         trace->duration_filter = atof(str);
2654         return 0;
2655 }
2656
2657 static int trace__set_filter_pids(const struct option *opt, const char *str,
2658                                   int unset __maybe_unused)
2659 {
2660         int ret = -1;
2661         size_t i;
2662         struct trace *trace = opt->value;
2663         /*
2664          * FIXME: introduce a intarray class, plain parse csv and create a
2665          * { int nr, int entries[] } struct...
2666          */
2667         struct intlist *list = intlist__new(str);
2668
2669         if (list == NULL)
2670                 return -1;
2671
2672         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2673         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2674
2675         if (trace->filter_pids.entries == NULL)
2676                 goto out;
2677
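             /* slot 0 always holds perf's own pid, so the tracer never traces itself */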
2678         trace->filter_pids.entries[0] = getpid();
2679
2680         for (i = 1; i < trace->filter_pids.nr; ++i)
2681                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2682
2683         intlist__delete(list);
2684         ret = 0;
2685 out:
2686         return ret;
2687 }
2688
2689 static int trace__open_output(struct trace *trace, const char *filename)
2690 {
2691         struct stat st;
2692
2693         if (!stat(filename, &st) && st.st_size) {
2694                 char oldname[PATH_MAX];
2695
2696                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2697                 unlink(oldname);
2698                 rename(filename, oldname);
2699         }
2700
2701         trace->output = fopen(filename, "w");
2702
2703         return trace->output == NULL ? -errno : 0;
2704 }
2705
2706 static int parse_pagefaults(const struct option *opt, const char *str,
2707                             int unset __maybe_unused)
2708 {
2709         int *trace_pgfaults = opt->value;
2710
2711         if (strcmp(str, "all") == 0)
2712                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2713         else if (strcmp(str, "maj") == 0)
2714                 *trace_pgfaults |= TRACE_PFMAJ;
2715         else if (strcmp(str, "min") == 0)
2716                 *trace_pgfaults |= TRACE_PFMIN;
2717         else
2718                 return -1;
2719
2720         return 0;
2721 }
2722
2723 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2724 {
2725         struct perf_evsel *evsel;
2726
2727         evlist__for_each_entry(evlist, evsel)
2728                 evsel->handler = handler;
2729 }
2730
2731 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2732 {
2733         const char *trace_usage[] = {
2734                 "perf trace [<options>] [<command>]",
2735                 "perf trace [<options>] -- <command> [<options>]",
2736                 "perf trace record [<options>] [<command>]",
2737                 "perf trace record [<options>] -- <command> [<options>]",
2738                 NULL
2739         };
2740         struct trace trace = {
2741                 .syscalls = {
2742                         .max = -1,
2743                 },
2744                 .opts = {
2745                         .target = {
2746                                 .uid       = UINT_MAX,
2747                                 .uses_mmap = true,
2748                         },
2749                         .user_freq     = UINT_MAX,
2750                         .user_interval = ULLONG_MAX,
2751                         .no_buffering  = true,
2752                         .mmap_pages    = UINT_MAX,
2753                         .proc_map_timeout  = 500,
2754                 },
2755                 .output = stderr,
2756                 .show_comm = true,
2757                 .trace_syscalls = true,
2758                 .kernel_syscallchains = false,
2759                 .max_stack = UINT_MAX,
2760         };
        const char *output_name = NULL;
        const char *ev_qualifier_str = NULL;
        const struct option trace_options[] = {
        OPT_CALLBACK(0, "event", &trace.evlist, "event",
                     "event selector. use 'perf list' to list available events",
                     parse_events_option),
        OPT_BOOLEAN(0, "comm", &trace.show_comm,
                    "show the thread COMM next to its id"),
        OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
        OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
        OPT_STRING('o', "output", &output_name, "file", "output file name"),
        OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
        OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
                    "trace events on existing process id"),
        OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
                    "trace events on existing thread id"),
        OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
                     "pids to filter (by the kernel)", trace__set_filter_pids),
        OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
                    "system-wide collection from all CPUs"),
        OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
                    "list of cpus to monitor"),
        OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
                    "child tasks do not inherit counters"),
        OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
                     "number of mmap data pages",
                     perf_evlist__parse_mmap_pages),
        OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
                   "user to profile"),
        OPT_CALLBACK(0, "duration", &trace, "float",
                     "show only events with duration > N.M ms",
                     trace__set_duration),
        OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
        OPT_INCR('v', "verbose", &verbose, "be more verbose"),
        OPT_BOOLEAN('T', "time", &trace.full_time,
                    "Show full timestamp, not time relative to first start"),
        OPT_BOOLEAN('s', "summary", &trace.summary_only,
                    "Show only syscall summary with statistics"),
        OPT_BOOLEAN('S', "with-summary", &trace.summary,
                    "Show all syscalls and summary with statistics"),
        OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
                     "Trace pagefaults", parse_pagefaults, "maj"),
        OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
        OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
        OPT_CALLBACK(0, "call-graph", &trace.opts,
                     "record_mode[,record_size]", record_callchain_help,
                     &record_parse_callchain_opt),
        OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
                    "Show the kernel callchains on the syscall exit path"),
        OPT_UINTEGER(0, "min-stack", &trace.min_stack,
                     "Set the minimum stack depth when parsing the callchain, "
                     "anything below the specified depth will be ignored."),
        OPT_UINTEGER(0, "max-stack", &trace.max_stack,
                     "Set the maximum stack depth when parsing the callchain, "
                     "anything beyond the specified depth will be ignored. "
                     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
        OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
                        "per thread proc mmap processing timeout in ms"),
        OPT_END()
        };
        bool __maybe_unused max_stack_user_set = true;
        bool mmap_pages_user_set = true;
        const char * const trace_subcommands[] = { "record", NULL };
        int err;
        char bf[BUFSIZ];

        signal(SIGSEGV, sighandler_dump_stack);
        signal(SIGFPE, sighandler_dump_stack);

        trace.evlist = perf_evlist__new();
        trace.sctbl = syscalltbl__new();

        if (trace.evlist == NULL || trace.sctbl == NULL) {
                pr_err("Not enough memory to run!\n");
                err = -ENOMEM;
                goto out;
        }

        argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
                                        trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

        err = bpf__setup_stdout(trace.evlist);
        if (err) {
                bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
                pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
                goto out;
        }

        err = -1;

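        /*
         * Page fault samples carry the faulting address and need timestamps
         * for ordering the output, so sample both when -F/--pf is in use.
         */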
        if (trace.trace_pgfaults) {
                trace.opts.sample_address = true;
                trace.opts.sample_time = true;
        }

        if (trace.opts.mmap_pages == UINT_MAX)
                mmap_pages_user_set = false;

        if (trace.max_stack == UINT_MAX) {
                trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
                max_stack_user_set = false;
        }

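        /*
         * If the user asked for stack depths (--min-stack/--max-stack) but not
         * for a callchain collection method, and syscalls are being traced,
         * default to DWARF unwinding when the build supports it.
         */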
#ifdef HAVE_DWARF_UNWIND_SUPPORT
        if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
                record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

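        /*
         * Callchain samples are bigger, so if the user didn't set --mmap-pages
         * and we are running as root, use a larger mmap buffer: 4 times the
         * perf_event_mlock_kb limit, expressed in pages.
         */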
        if (callchain_param.enabled) {
                if (!mmap_pages_user_set && geteuid() == 0)
                        trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

                symbol_conf.use_callchain = true;
        }

        if (trace.evlist->nr_entries > 0)
                evlist__set_evsel_handler(trace.evlist, trace__event_handler);

        if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
                return trace__record(&trace, argc-1, &argv[1]);

        /* summary_only implies summary option, but don't overwrite summary if set */
        if (trace.summary_only)
                trace.summary = trace.summary_only;

        if (!trace.trace_syscalls && !trace.trace_pgfaults &&
            trace.evlist->nr_entries == 0 /* Was --event used? */) {
                pr_err("Please specify something to trace.\n");
                return -1;
        }

        if (!trace.trace_syscalls && ev_qualifier_str) {
                pr_err("The -e option can't be used with --no-syscalls.\n");
                goto out;
        }

        if (output_name != NULL) {
                err = trace__open_output(&trace, output_name);
                if (err < 0) {
                        perror("failed to create output file");
                        goto out;
                }
        }

        trace.open_id = syscalltbl__id(trace.sctbl, "open");

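        /*
         * A -e expression starting with '!' negates the qualifier, i.e. trace
         * every syscall except the ones listed; names may also be resolved
         * from files in the strace groups directory.
         */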
        if (ev_qualifier_str != NULL) {
                const char *s = ev_qualifier_str;
                struct strlist_config slist_config = {
                        .dirname = system_path(STRACE_GROUPS_DIR),
                };

                trace.not_ev_qualifier = *s == '!';
                if (trace.not_ev_qualifier)
                        ++s;
                trace.ev_qualifier = strlist__new(s, &slist_config);
                if (trace.ev_qualifier == NULL) {
                        fputs("Not enough memory to parse event qualifier\n",
                              trace.output);
                        err = -ENOMEM;
                        goto out_close;
                }

                err = trace__validate_ev_qualifier(&trace);
                if (err)
                        goto out_close;
        }

        err = target__validate(&trace.opts.target);
        if (err) {
                target__strerror(&trace.opts.target, err, bf, sizeof(bf));
                fprintf(trace.output, "%s", bf);
                goto out_close;
        }

        err = target__parse_uid(&trace.opts.target);
        if (err) {
                target__strerror(&trace.opts.target, err, bf, sizeof(bf));
                fprintf(trace.output, "%s", bf);
                goto out_close;
        }

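        /*
         * No workload was specified and no target (pid/tid/cpu/uid) was given:
         * default to system wide tracing.
         */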
        if (!argc && target__none(&trace.opts.target))
                trace.opts.target.system_wide = true;

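        /* With -i, replay events from a perf.data file; otherwise trace live. */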
        if (input_name)
                err = trace__replay(&trace);
        else
                err = trace__run(&trace, argc, argv);

out_close:
        if (output_name != NULL)
                fclose(trace.output);
out:
        return err;
}