Merge tag 'armsoc-defconfig' of git://git.kernel.org/pub/scm/linux/kernel/git/arm...
[cascardo/linux.git] / tools / perf / builtin-record.c
1 /*
2  * builtin-record.c
3  *
4  * Builtin record command: Record the profile of a workload
5  * (or a CPU, or a PID) into the perf.data output file - for
6  * later analysis via perf report.
7  */
8 #include "builtin.h"
9
10 #include "perf.h"
11
12 #include "util/build-id.h"
13 #include "util/util.h"
14 #include <subcmd/parse-options.h>
15 #include "util/parse-events.h"
16
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/cpumap.h"
28 #include "util/thread_map.h"
29 #include "util/data.h"
30 #include "util/perf_regs.h"
31 #include "util/auxtrace.h"
32 #include "util/parse-branch-options.h"
33 #include "util/parse-regs-options.h"
34 #include "util/llvm-utils.h"
35 #include "util/bpf-loader.h"
36 #include "asm/bug.h"
37
38 #include <unistd.h>
39 #include <sched.h>
40 #include <sys/mman.h>
41
42
43 struct record {
44         struct perf_tool        tool;
45         struct record_opts      opts;
46         u64                     bytes_written;
47         struct perf_data_file   file;
48         struct auxtrace_record  *itr;
49         struct perf_evlist      *evlist;
50         struct perf_session     *session;
51         const char              *progname;
52         int                     realtime_prio;
53         bool                    no_buildid;
54         bool                    no_buildid_set;
55         bool                    no_buildid_cache;
56         bool                    no_buildid_cache_set;
57         bool                    buildid_all;
58         unsigned long long      samples;
59 };
60
61 static int record__write(struct record *rec, void *bf, size_t size)
62 {
63         if (perf_data_file__write(rec->session->file, bf, size) < 0) {
64                 pr_err("failed to write perf data, error: %m\n");
65                 return -1;
66         }
67
68         rec->bytes_written += size;
69         return 0;
70 }
71
72 static int process_synthesized_event(struct perf_tool *tool,
73                                      union perf_event *event,
74                                      struct perf_sample *sample __maybe_unused,
75                                      struct machine *machine __maybe_unused)
76 {
77         struct record *rec = container_of(tool, struct record, tool);
78         return record__write(rec, event, event->header.size);
79 }
80
81 static int record__mmap_read(struct record *rec, int idx)
82 {
83         struct perf_mmap *md = &rec->evlist->mmap[idx];
84         u64 head = perf_mmap__read_head(md);
85         u64 old = md->prev;
86         unsigned char *data = md->base + page_size;
87         unsigned long size;
88         void *buf;
89         int rc = 0;
90
91         if (old == head)
92                 return 0;
93
94         rec->samples++;
95
96         size = head - old;
97
98         if ((old & md->mask) + size != (head & md->mask)) {
99                 buf = &data[old & md->mask];
100                 size = md->mask + 1 - (old & md->mask);
101                 old += size;
102
103                 if (record__write(rec, buf, size) < 0) {
104                         rc = -1;
105                         goto out;
106                 }
107         }
108
109         buf = &data[old & md->mask];
110         size = head - old;
111         old += size;
112
113         if (record__write(rec, buf, size) < 0) {
114                 rc = -1;
115                 goto out;
116         }
117
118         md->prev = old;
119         perf_evlist__mmap_consume(rec->evlist, idx);
120 out:
121         return rc;
122 }
123
/* Flags shared between the signal handlers and the recording loop. */
static volatile int done;			/* stop recording */
static volatile int signr = -1;			/* signal to re-raise at exit, -1 if none */
static volatile int child_finished;		/* SIGCHLD was received */
static volatile int auxtrace_snapshot_enabled;
static volatile int auxtrace_snapshot_err;
static volatile int auxtrace_record__snapshot_started;

/*
 * Common handler for SIGCHLD/SIGINT/SIGTERM: note what happened and ask
 * the recording loop to finish.  SIGCHLD only flags the child as done;
 * any other signal is remembered in signr.
 */
static void sig_handler(int sig)
{
	if (sig != SIGCHLD)
		signr = sig;
	else
		child_finished = 1;

	done = 1;
}
140
141 static void record__sig_exit(void)
142 {
143         if (signr == -1)
144                 return;
145
146         signal(signr, SIG_DFL);
147         raise(signr);
148 }
149
150 #ifdef HAVE_AUXTRACE_SUPPORT
151
152 static int record__process_auxtrace(struct perf_tool *tool,
153                                     union perf_event *event, void *data1,
154                                     size_t len1, void *data2, size_t len2)
155 {
156         struct record *rec = container_of(tool, struct record, tool);
157         struct perf_data_file *file = &rec->file;
158         size_t padding;
159         u8 pad[8] = {0};
160
161         if (!perf_data_file__is_pipe(file)) {
162                 off_t file_offset;
163                 int fd = perf_data_file__fd(file);
164                 int err;
165
166                 file_offset = lseek(fd, 0, SEEK_CUR);
167                 if (file_offset == -1)
168                         return -1;
169                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
170                                                      event, file_offset);
171                 if (err)
172                         return err;
173         }
174
175         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
176         padding = (len1 + len2) & 7;
177         if (padding)
178                 padding = 8 - padding;
179
180         record__write(rec, event, event->header.size);
181         record__write(rec, data1, len1);
182         if (len2)
183                 record__write(rec, data2, len2);
184         record__write(rec, &pad, padding);
185
186         return 0;
187 }
188
189 static int record__auxtrace_mmap_read(struct record *rec,
190                                       struct auxtrace_mmap *mm)
191 {
192         int ret;
193
194         ret = auxtrace_mmap__read(mm, rec->itr, &rec->tool,
195                                   record__process_auxtrace);
196         if (ret < 0)
197                 return ret;
198
199         if (ret)
200                 rec->samples++;
201
202         return 0;
203 }
204
205 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
206                                                struct auxtrace_mmap *mm)
207 {
208         int ret;
209
210         ret = auxtrace_mmap__read_snapshot(mm, rec->itr, &rec->tool,
211                                            record__process_auxtrace,
212                                            rec->opts.auxtrace_snapshot_size);
213         if (ret < 0)
214                 return ret;
215
216         if (ret)
217                 rec->samples++;
218
219         return 0;
220 }
221
222 static int record__auxtrace_read_snapshot_all(struct record *rec)
223 {
224         int i;
225         int rc = 0;
226
227         for (i = 0; i < rec->evlist->nr_mmaps; i++) {
228                 struct auxtrace_mmap *mm =
229                                 &rec->evlist->mmap[i].auxtrace_mmap;
230
231                 if (!mm->base)
232                         continue;
233
234                 if (record__auxtrace_mmap_read_snapshot(rec, mm) != 0) {
235                         rc = -1;
236                         goto out;
237                 }
238         }
239 out:
240         return rc;
241 }
242
243 static void record__read_auxtrace_snapshot(struct record *rec)
244 {
245         pr_debug("Recording AUX area tracing snapshot\n");
246         if (record__auxtrace_read_snapshot_all(rec) < 0) {
247                 auxtrace_snapshot_err = -1;
248         } else {
249                 auxtrace_snapshot_err = auxtrace_record__snapshot_finish(rec->itr);
250                 if (!auxtrace_snapshot_err)
251                         auxtrace_snapshot_enabled = 1;
252         }
253 }
254
255 #else
256
257 static inline
258 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
259                                struct auxtrace_mmap *mm __maybe_unused)
260 {
261         return 0;
262 }
263
264 static inline
265 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
266 {
267 }
268
269 static inline
270 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
271 {
272         return 0;
273 }
274
275 #endif
276
277 static int record__open(struct record *rec)
278 {
279         char msg[512];
280         struct perf_evsel *pos;
281         struct perf_evlist *evlist = rec->evlist;
282         struct perf_session *session = rec->session;
283         struct record_opts *opts = &rec->opts;
284         int rc = 0;
285
286         perf_evlist__config(evlist, opts);
287
288         evlist__for_each(evlist, pos) {
289 try_again:
290                 if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
291                         if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
292                                 if (verbose)
293                                         ui__warning("%s\n", msg);
294                                 goto try_again;
295                         }
296
297                         rc = -errno;
298                         perf_evsel__open_strerror(pos, &opts->target,
299                                                   errno, msg, sizeof(msg));
300                         ui__error("%s\n", msg);
301                         goto out;
302                 }
303         }
304
305         if (perf_evlist__apply_filters(evlist, &pos)) {
306                 error("failed to set filter \"%s\" on event %s with %d (%s)\n",
307                         pos->filter, perf_evsel__name(pos), errno,
308                         strerror_r(errno, msg, sizeof(msg)));
309                 rc = -1;
310                 goto out;
311         }
312
313         if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, false,
314                                  opts->auxtrace_mmap_pages,
315                                  opts->auxtrace_snapshot_mode) < 0) {
316                 if (errno == EPERM) {
317                         pr_err("Permission error mapping pages.\n"
318                                "Consider increasing "
319                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
320                                "or try again with a smaller value of -m/--mmap_pages.\n"
321                                "(current value: %u,%u)\n",
322                                opts->mmap_pages, opts->auxtrace_mmap_pages);
323                         rc = -errno;
324                 } else {
325                         pr_err("failed to mmap with %d (%s)\n", errno,
326                                 strerror_r(errno, msg, sizeof(msg)));
327                         if (errno)
328                                 rc = -errno;
329                         else
330                                 rc = -EINVAL;
331                 }
332                 goto out;
333         }
334
335         session->evlist = evlist;
336         perf_session__set_id_hdr_size(session);
337 out:
338         return rc;
339 }
340
341 static int process_sample_event(struct perf_tool *tool,
342                                 union perf_event *event,
343                                 struct perf_sample *sample,
344                                 struct perf_evsel *evsel,
345                                 struct machine *machine)
346 {
347         struct record *rec = container_of(tool, struct record, tool);
348
349         rec->samples++;
350
351         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
352 }
353
354 static int process_buildids(struct record *rec)
355 {
356         struct perf_data_file *file  = &rec->file;
357         struct perf_session *session = rec->session;
358
359         if (file->size == 0)
360                 return 0;
361
362         /*
363          * During this process, it'll load kernel map and replace the
364          * dso->long_name to a real pathname it found.  In this case
365          * we prefer the vmlinux path like
366          *   /lib/modules/3.16.4/build/vmlinux
367          *
368          * rather than build-id path (in debug directory).
369          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
370          */
371         symbol_conf.ignore_vmlinux_buildid = true;
372
373         /*
374          * If --buildid-all is given, it marks all DSO regardless of hits,
375          * so no need to process samples.
376          */
377         if (rec->buildid_all)
378                 rec->tool.sample = NULL;
379
380         return perf_session__process_events(session);
381 }
382
383 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
384 {
385         int err;
386         struct perf_tool *tool = data;
387         /*
388          *As for guest kernel when processing subcommand record&report,
389          *we arrange module mmap prior to guest kernel mmap and trigger
390          *a preload dso because default guest module symbols are loaded
391          *from guest kallsyms instead of /lib/modules/XXX/XXX. This
392          *method is used to avoid symbol missing when the first addr is
393          *in module instead of in guest kernel.
394          */
395         err = perf_event__synthesize_modules(tool, process_synthesized_event,
396                                              machine);
397         if (err < 0)
398                 pr_err("Couldn't record guest kernel [%d]'s reference"
399                        " relocation symbol.\n", machine->pid);
400
401         /*
402          * We use _stext for guest kernel because guest kernel's /proc/kallsyms
403          * have no _text sometimes.
404          */
405         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
406                                                  machine);
407         if (err < 0)
408                 pr_err("Couldn't record guest kernel [%d]'s reference"
409                        " relocation symbol.\n", machine->pid);
410 }
411
412 static struct perf_event_header finished_round_event = {
413         .size = sizeof(struct perf_event_header),
414         .type = PERF_RECORD_FINISHED_ROUND,
415 };
416
417 static int record__mmap_read_all(struct record *rec)
418 {
419         u64 bytes_written = rec->bytes_written;
420         int i;
421         int rc = 0;
422
423         for (i = 0; i < rec->evlist->nr_mmaps; i++) {
424                 struct auxtrace_mmap *mm = &rec->evlist->mmap[i].auxtrace_mmap;
425
426                 if (rec->evlist->mmap[i].base) {
427                         if (record__mmap_read(rec, i) != 0) {
428                                 rc = -1;
429                                 goto out;
430                         }
431                 }
432
433                 if (mm->base && !rec->opts.auxtrace_snapshot_mode &&
434                     record__auxtrace_mmap_read(rec, mm) != 0) {
435                         rc = -1;
436                         goto out;
437                 }
438         }
439
440         /*
441          * Mark the round finished in case we wrote
442          * at least one event.
443          */
444         if (bytes_written != rec->bytes_written)
445                 rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));
446
447 out:
448         return rc;
449 }
450
451 static void record__init_features(struct record *rec)
452 {
453         struct perf_session *session = rec->session;
454         int feat;
455
456         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
457                 perf_header__set_feat(&session->header, feat);
458
459         if (rec->no_buildid)
460                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
461
462         if (!have_tracepoints(&rec->evlist->entries))
463                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
464
465         if (!rec->opts.branch_stack)
466                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
467
468         if (!rec->opts.full_auxtrace)
469                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
470
471         perf_header__clear_feat(&session->header, HEADER_STAT);
472 }
473
474 static void
475 record__finish_output(struct record *rec)
476 {
477         struct perf_data_file *file = &rec->file;
478         int fd = perf_data_file__fd(file);
479
480         if (file->is_pipe)
481                 return;
482
483         rec->session->header.data_size += rec->bytes_written;
484         file->size = lseek(perf_data_file__fd(file), 0, SEEK_CUR);
485
486         if (!rec->no_buildid) {
487                 process_buildids(rec);
488
489                 if (rec->buildid_all)
490                         dsos__hit_all(rec->session);
491         }
492         perf_session__write_header(rec->session, rec->evlist, fd, true);
493
494         return;
495 }
496
497 static volatile int workload_exec_errno;
498
499 /*
500  * perf_evlist__prepare_workload will send a SIGUSR1
501  * if the fork fails, since we asked by setting its
502  * want_signal to true.
503  */
504 static void workload_exec_failed_signal(int signo __maybe_unused,
505                                         siginfo_t *info,
506                                         void *ucontext __maybe_unused)
507 {
508         workload_exec_errno = info->si_value.sival_int;
509         done = 1;
510         child_finished = 1;
511 }
512
513 static void snapshot_sig_handler(int sig);
514
515 static int record__synthesize(struct record *rec)
516 {
517         struct perf_session *session = rec->session;
518         struct machine *machine = &session->machines.host;
519         struct perf_data_file *file = &rec->file;
520         struct record_opts *opts = &rec->opts;
521         struct perf_tool *tool = &rec->tool;
522         int fd = perf_data_file__fd(file);
523         int err = 0;
524
525         if (file->is_pipe) {
526                 err = perf_event__synthesize_attrs(tool, session,
527                                                    process_synthesized_event);
528                 if (err < 0) {
529                         pr_err("Couldn't synthesize attrs.\n");
530                         goto out;
531                 }
532
533                 if (have_tracepoints(&rec->evlist->entries)) {
534                         /*
535                          * FIXME err <= 0 here actually means that
536                          * there were no tracepoints so its not really
537                          * an error, just that we don't need to
538                          * synthesize anything.  We really have to
539                          * return this more properly and also
540                          * propagate errors that now are calling die()
541                          */
542                         err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
543                                                                   process_synthesized_event);
544                         if (err <= 0) {
545                                 pr_err("Couldn't record tracing data.\n");
546                                 goto out;
547                         }
548                         rec->bytes_written += err;
549                 }
550         }
551
552         if (rec->opts.full_auxtrace) {
553                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
554                                         session, process_synthesized_event);
555                 if (err)
556                         goto out;
557         }
558
559         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
560                                                  machine);
561         WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
562                            "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
563                            "Check /proc/kallsyms permission or run as root.\n");
564
565         err = perf_event__synthesize_modules(tool, process_synthesized_event,
566                                              machine);
567         WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
568                            "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
569                            "Check /proc/modules permission or run as root.\n");
570
571         if (perf_guest) {
572                 machines__process_guests(&session->machines,
573                                          perf_event__synthesize_guest_os, tool);
574         }
575
576         err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
577                                             process_synthesized_event, opts->sample_address,
578                                             opts->proc_map_timeout);
579 out:
580         return err;
581 }
582
583 static int __cmd_record(struct record *rec, int argc, const char **argv)
584 {
585         int err;
586         int status = 0;
587         unsigned long waking = 0;
588         const bool forks = argc > 0;
589         struct machine *machine;
590         struct perf_tool *tool = &rec->tool;
591         struct record_opts *opts = &rec->opts;
592         struct perf_data_file *file = &rec->file;
593         struct perf_session *session;
594         bool disabled = false, draining = false;
595         int fd;
596
597         rec->progname = argv[0];
598
599         atexit(record__sig_exit);
600         signal(SIGCHLD, sig_handler);
601         signal(SIGINT, sig_handler);
602         signal(SIGTERM, sig_handler);
603         if (rec->opts.auxtrace_snapshot_mode)
604                 signal(SIGUSR2, snapshot_sig_handler);
605         else
606                 signal(SIGUSR2, SIG_IGN);
607
608         session = perf_session__new(file, false, tool);
609         if (session == NULL) {
610                 pr_err("Perf session creation failed.\n");
611                 return -1;
612         }
613
614         fd = perf_data_file__fd(file);
615         rec->session = session;
616
617         record__init_features(rec);
618
619         if (forks) {
620                 err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
621                                                     argv, file->is_pipe,
622                                                     workload_exec_failed_signal);
623                 if (err < 0) {
624                         pr_err("Couldn't run the workload!\n");
625                         status = err;
626                         goto out_delete_session;
627                 }
628         }
629
630         if (record__open(rec) != 0) {
631                 err = -1;
632                 goto out_child;
633         }
634
635         err = bpf__apply_obj_config();
636         if (err) {
637                 char errbuf[BUFSIZ];
638
639                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
640                 pr_err("ERROR: Apply config to BPF failed: %s\n",
641                          errbuf);
642                 goto out_child;
643         }
644
645         /*
646          * Normally perf_session__new would do this, but it doesn't have the
647          * evlist.
648          */
649         if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
650                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
651                 rec->tool.ordered_events = false;
652         }
653
654         if (!rec->evlist->nr_groups)
655                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
656
657         if (file->is_pipe) {
658                 err = perf_header__write_pipe(fd);
659                 if (err < 0)
660                         goto out_child;
661         } else {
662                 err = perf_session__write_header(session, rec->evlist, fd, false);
663                 if (err < 0)
664                         goto out_child;
665         }
666
667         if (!rec->no_buildid
668             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
669                 pr_err("Couldn't generate buildids. "
670                        "Use --no-buildid to profile anyway.\n");
671                 err = -1;
672                 goto out_child;
673         }
674
675         machine = &session->machines.host;
676
677         err = record__synthesize(rec);
678         if (err < 0)
679                 goto out_child;
680
681         if (rec->realtime_prio) {
682                 struct sched_param param;
683
684                 param.sched_priority = rec->realtime_prio;
685                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
686                         pr_err("Could not set realtime priority.\n");
687                         err = -1;
688                         goto out_child;
689                 }
690         }
691
692         /*
693          * When perf is starting the traced process, all the events
694          * (apart from group members) have enable_on_exec=1 set,
695          * so don't spoil it by prematurely enabling them.
696          */
697         if (!target__none(&opts->target) && !opts->initial_delay)
698                 perf_evlist__enable(rec->evlist);
699
700         /*
701          * Let the child rip
702          */
703         if (forks) {
704                 union perf_event *event;
705
706                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
707                 if (event == NULL) {
708                         err = -ENOMEM;
709                         goto out_child;
710                 }
711
712                 /*
713                  * Some H/W events are generated before COMM event
714                  * which is emitted during exec(), so perf script
715                  * cannot see a correct process name for those events.
716                  * Synthesize COMM event to prevent it.
717                  */
718                 perf_event__synthesize_comm(tool, event,
719                                             rec->evlist->workload.pid,
720                                             process_synthesized_event,
721                                             machine);
722                 free(event);
723
724                 perf_evlist__start_workload(rec->evlist);
725         }
726
727         if (opts->initial_delay) {
728                 usleep(opts->initial_delay * 1000);
729                 perf_evlist__enable(rec->evlist);
730         }
731
732         auxtrace_snapshot_enabled = 1;
733         for (;;) {
734                 unsigned long long hits = rec->samples;
735
736                 if (record__mmap_read_all(rec) < 0) {
737                         auxtrace_snapshot_enabled = 0;
738                         err = -1;
739                         goto out_child;
740                 }
741
742                 if (auxtrace_record__snapshot_started) {
743                         auxtrace_record__snapshot_started = 0;
744                         if (!auxtrace_snapshot_err)
745                                 record__read_auxtrace_snapshot(rec);
746                         if (auxtrace_snapshot_err) {
747                                 pr_err("AUX area tracing snapshot failed\n");
748                                 err = -1;
749                                 goto out_child;
750                         }
751                 }
752
753                 if (hits == rec->samples) {
754                         if (done || draining)
755                                 break;
756                         err = perf_evlist__poll(rec->evlist, -1);
757                         /*
758                          * Propagate error, only if there's any. Ignore positive
759                          * number of returned events and interrupt error.
760                          */
761                         if (err > 0 || (err < 0 && errno == EINTR))
762                                 err = 0;
763                         waking++;
764
765                         if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
766                                 draining = true;
767                 }
768
769                 /*
770                  * When perf is starting the traced process, at the end events
771                  * die with the process and we wait for that. Thus no need to
772                  * disable events in this case.
773                  */
774                 if (done && !disabled && !target__none(&opts->target)) {
775                         auxtrace_snapshot_enabled = 0;
776                         perf_evlist__disable(rec->evlist);
777                         disabled = true;
778                 }
779         }
780         auxtrace_snapshot_enabled = 0;
781
782         if (forks && workload_exec_errno) {
783                 char msg[STRERR_BUFSIZE];
784                 const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg));
785                 pr_err("Workload failed: %s\n", emsg);
786                 err = -1;
787                 goto out_child;
788         }
789
790         if (!quiet)
791                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
792
793 out_child:
794         if (forks) {
795                 int exit_status;
796
797                 if (!child_finished)
798                         kill(rec->evlist->workload.pid, SIGTERM);
799
800                 wait(&exit_status);
801
802                 if (err < 0)
803                         status = err;
804                 else if (WIFEXITED(exit_status))
805                         status = WEXITSTATUS(exit_status);
806                 else if (WIFSIGNALED(exit_status))
807                         signr = WTERMSIG(exit_status);
808         } else
809                 status = err;
810
811         /* this will be recalculated during process_buildids() */
812         rec->samples = 0;
813
814         if (!err)
815                 record__finish_output(rec);
816
817         if (!err && !quiet) {
818                 char samples[128];
819
820                 if (rec->samples && !rec->opts.full_auxtrace)
821                         scnprintf(samples, sizeof(samples),
822                                   " (%" PRIu64 " samples)", rec->samples);
823                 else
824                         samples[0] = '\0';
825
826                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s ]\n",
827                         perf_data_file__size(file) / 1024.0 / 1024.0,
828                         file->path, samples);
829         }
830
831 out_delete_session:
832         perf_session__delete(session);
833         return status;
834 }
835
836 static void callchain_debug(void)
837 {
838         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
839
840         pr_debug("callchain: type %s\n", str[callchain_param.record_mode]);
841
842         if (callchain_param.record_mode == CALLCHAIN_DWARF)
843                 pr_debug("callchain: stack dump size %d\n",
844                          callchain_param.dump_size);
845 }
846
847 int record_parse_callchain_opt(const struct option *opt,
848                                const char *arg,
849                                int unset)
850 {
851         int ret;
852         struct record_opts *record = (struct record_opts *)opt->value;
853
854         record->callgraph_set = true;
855         callchain_param.enabled = !unset;
856
857         /* --no-call-graph */
858         if (unset) {
859                 callchain_param.record_mode = CALLCHAIN_NONE;
860                 pr_debug("callchain: disabled\n");
861                 return 0;
862         }
863
864         ret = parse_callchain_record_opt(arg, &callchain_param);
865         if (!ret) {
866                 /* Enable data address sampling for DWARF unwind. */
867                 if (callchain_param.record_mode == CALLCHAIN_DWARF)
868                         record->sample_address = true;
869                 callchain_debug();
870         }
871
872         return ret;
873 }
874
875 int record_callchain_opt(const struct option *opt,
876                          const char *arg __maybe_unused,
877                          int unset __maybe_unused)
878 {
879         struct record_opts *record = (struct record_opts *)opt->value;
880
881         record->callgraph_set = true;
882         callchain_param.enabled = true;
883
884         if (callchain_param.record_mode == CALLCHAIN_NONE)
885                 callchain_param.record_mode = CALLCHAIN_FP;
886
887         callchain_debug();
888         return 0;
889 }
890
891 static int perf_record_config(const char *var, const char *value, void *cb)
892 {
893         struct record *rec = cb;
894
895         if (!strcmp(var, "record.build-id")) {
896                 if (!strcmp(value, "cache"))
897                         rec->no_buildid_cache = false;
898                 else if (!strcmp(value, "no-cache"))
899                         rec->no_buildid_cache = true;
900                 else if (!strcmp(value, "skip"))
901                         rec->no_buildid = true;
902                 else
903                         return -1;
904                 return 0;
905         }
906         if (!strcmp(var, "record.call-graph"))
907                 var = "call-graph.record-mode"; /* fall-through */
908
909         return perf_default_config(var, value, cb);
910 }
911
/* One row of the clock lookup table: a clock name and the id it stands for. */
struct clockid_map {
        const char      *name;          /* name compared against the option argument */
        int             clockid;        /* id copied into the record options on a match */
};
916
/* Initializer for one lookup-table row: a clock name and its id. */
#define CLOCKID_MAP(label, id)  \
        { .name = (label), .clockid = (id) }

/* Initializer for the terminating row, recognised by its NULL name. */
#define CLOCKID_END             { .name = NULL }
921
922
/*
 * Not every distro's headers define all of these clock ids, so supply the
 * ones that are missing.
 */
#if !defined(CLOCK_MONOTONIC_RAW)
# define CLOCK_MONOTONIC_RAW    4
#endif

#if !defined(CLOCK_BOOTTIME)
# define CLOCK_BOOTTIME         7
#endif

#if !defined(CLOCK_TAI)
# define CLOCK_TAI              11
#endif
935
936 static const struct clockid_map clockids[] = {
937         /* available for all events, NMI safe */
938         CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
939         CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
940
941         /* available for some events */
942         CLOCKID_MAP("realtime", CLOCK_REALTIME),
943         CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
944         CLOCKID_MAP("tai", CLOCK_TAI),
945
946         /* available for the lazy */
947         CLOCKID_MAP("mono", CLOCK_MONOTONIC),
948         CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
949         CLOCKID_MAP("real", CLOCK_REALTIME),
950         CLOCKID_MAP("boot", CLOCK_BOOTTIME),
951
952         CLOCKID_END,
953 };
954
955 static int parse_clockid(const struct option *opt, const char *str, int unset)
956 {
957         struct record_opts *opts = (struct record_opts *)opt->value;
958         const struct clockid_map *cm;
959         const char *ostr = str;
960
961         if (unset) {
962                 opts->use_clockid = 0;
963                 return 0;
964         }
965
966         /* no arg passed */
967         if (!str)
968                 return 0;
969
970         /* no setting it twice */
971         if (opts->use_clockid)
972                 return -1;
973
974         opts->use_clockid = true;
975
976         /* if its a number, we're done */
977         if (sscanf(str, "%d", &opts->clockid) == 1)
978                 return 0;
979
980         /* allow a "CLOCK_" prefix to the name */
981         if (!strncasecmp(str, "CLOCK_", 6))
982                 str += 6;
983
984         for (cm = clockids; cm->name; cm++) {
985                 if (!strcasecmp(str, cm->name)) {
986                         opts->clockid = cm->clockid;
987                         return 0;
988                 }
989         }
990
991         opts->use_clockid = false;
992         ui__warning("unknown clockid %s, check man page\n", ostr);
993         return -1;
994 }
995
996 static int record__parse_mmap_pages(const struct option *opt,
997                                     const char *str,
998                                     int unset __maybe_unused)
999 {
1000         struct record_opts *opts = opt->value;
1001         char *s, *p;
1002         unsigned int mmap_pages;
1003         int ret;
1004
1005         if (!str)
1006                 return -EINVAL;
1007
1008         s = strdup(str);
1009         if (!s)
1010                 return -ENOMEM;
1011
1012         p = strchr(s, ',');
1013         if (p)
1014                 *p = '\0';
1015
1016         if (*s) {
1017                 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1018                 if (ret)
1019                         goto out_free;
1020                 opts->mmap_pages = mmap_pages;
1021         }
1022
1023         if (!p) {
1024                 ret = 0;
1025                 goto out_free;
1026         }
1027
1028         ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1029         if (ret)
1030                 goto out_free;
1031
1032         opts->auxtrace_mmap_pages = mmap_pages;
1033
1034 out_free:
1035         free(s);
1036         return ret;
1037 }
1038
/* Usage synopsis lines handed to the option parser; NULL-terminated. */
static const char * const __record_usage[] = {
        "perf record [<options>] [<command>]",
        "perf record [<options>] -- <command> [<options>]",
        NULL,
};

/* Externally visible pointer to the usage lines above. */
const char * const *record_usage = &__record_usage[0];
1045
/*
 * XXX Ideally this would be local to cmd_record() and passed to a
 * record__new(), because we need to have access to it in record__exit(),
 * which is called after cmd_record() exits. But since record_options needs
 * to be accessible to builtin-script, leave it here.
 *
 * At least we don't touch it directly in all the other functions here.
 *
 * Just say no to tons of global variables, sigh.
 */
1056 static struct record record = {
1057         .opts = {
1058                 .sample_time         = true,
1059                 .mmap_pages          = UINT_MAX,
1060                 .user_freq           = UINT_MAX,
1061                 .user_interval       = ULLONG_MAX,
1062                 .freq                = 4000,
1063                 .target              = {
1064                         .uses_mmap   = true,
1065                         .default_per_cpu = true,
1066                 },
1067                 .proc_map_timeout     = 500,
1068         },
1069         .tool = {
1070                 .sample         = process_sample_event,
1071                 .fork           = perf_event__process_fork,
1072                 .exit           = perf_event__process_exit,
1073                 .comm           = perf_event__process_comm,
1074                 .mmap           = perf_event__process_mmap,
1075                 .mmap2          = perf_event__process_mmap2,
1076                 .ordered_events = true,
1077         },
1078 };
1079
1080 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
1081         "\n\t\t\t\tDefault: fp";
1082
/*
 * Command line options of 'perf record'.  Every entry stores into the
 * file-scope 'record' instance or into a global such as verbose, quiet,
 * llvm_param or symbol_conf.
 *
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use record_opts,
 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
 * using pipes, etc.
 */
struct option __record_options[] = {
        /* event selection; these callbacks operate on record.evlist */
        OPT_CALLBACK('e', "event", &record.evlist, "event",
                     "event selector. use 'perf list' to list available events",
                     parse_events_option),
        OPT_CALLBACK(0, "filter", &record.evlist, "filter",
                     "event filter", parse_filter),
        OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
                           NULL, "don't record events from perf itself",
                           exclude_perf),
        /* what to record from: process, thread, cpus */
        OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
                    "record events on existing process id"),
        OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
                    "record events on existing thread id"),
        OPT_INTEGER('r', "realtime", &record.realtime_prio,
                    "collect data with this RT SCHED_FIFO priority"),
        OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
                    "collect data without buffering"),
        OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
                    "collect raw sample records from all opened counters"),
        OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
                            "system-wide collection from all CPUs"),
        OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
                    "list of cpus to monitor"),
        OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
        OPT_STRING('o', "output", &record.file.path, "file",
                    "output file name"),
        /* the second pointer records that the option was given explicitly */
        OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
                        &record.opts.no_inherit_set,
                        "child tasks do not inherit counters"),
        OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
        OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
                     "number of mmap data pages and AUX area tracing mmap pages",
                     record__parse_mmap_pages),
        OPT_BOOLEAN(0, "group", &record.opts.group,
                    "put the counters into a counter group"),
        /* call-graph recording: -g takes no argument, --call-graph does */
        OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
                           NULL, "enables call-graph recording" ,
                           &record_callchain_opt),
        OPT_CALLBACK(0, "call-graph", &record.opts,
                     "record_mode[,record_size]", record_callchain_help,
                     &record_parse_callchain_opt),
        OPT_INCR('v', "verbose", &verbose,
                    "be more verbose (show counter open errors, etc)"),
        OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
        OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
                    "per thread counts"),
        OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
        OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
                        &record.opts.sample_time_set,
                        "Record the sample timestamps"),
        OPT_BOOLEAN('P', "period", &record.opts.period, "Record the sample period"),
        OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
                    "don't sample"),
        /* build-id handling */
        OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
                        &record.no_buildid_cache_set,
                        "do not update the buildid cache"),
        OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
                        &record.no_buildid_set,
                        "do not collect buildids in perf.data"),
        OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
                     "monitor event in cgroup name only",
                     parse_cgroups),
        OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
                  "ms to wait before starting measurement after program start"),
        OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
                   "user to profile"),

        /* branch stack sampling; both forms share parse_branch_stack() */
        OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
                     "branch any", "sample any taken branches",
                     parse_branch_stack),

        OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
                     "branch filter mask", "branch stack filter modes",
                     parse_branch_stack),
        OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
                    "sample by weight (on special events only)"),
        OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
                    "sample transaction flags (special events only)"),
        OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
                    "use per-thread mmaps"),
        OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
                    "sample selected machine registers on interrupt,"
                    " use -I ? to list register names", parse_regs),
        OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
                    "Record running/enabled time of read (:S) events"),
        OPT_CALLBACK('k', "clockid", &record.opts,
        "clockid", "clockid to use for events, see clock_gettime()",
        parse_clockid),
        OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
                          "opts", "AUX area tracing Snapshot Mode", ""),
        OPT_UINTEGER(0, "proc-map-timeout", &record.opts.proc_map_timeout,
                        "per thread proc mmap processing timeout in ms"),
        OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
                    "Record context switch events"),
        OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
                         "Configure all used events to run in kernel space.",
                         PARSE_OPT_EXCLUSIVE),
        OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
                         "Configure all used events to run in user space.",
                         PARSE_OPT_EXCLUSIVE),
        /* cmd_record() marks the next three as not built when support is missing */
        OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
                   "clang binary to use for compiling BPF scriptlets"),
        OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
                   "options passed to clang when compiling BPF scriptlets"),
        OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
                   "file", "vmlinux pathname"),
        OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
                    "Record build-id of all DSOs regardless of hits"),
        OPT_END()
};

/* Non-static pointer to the table, so that it can be reached from other files. */
struct option *record_options = __record_options;
1202
1203 int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
1204 {
1205         int err;
1206         struct record *rec = &record;
1207         char errbuf[BUFSIZ];
1208
1209 #ifndef HAVE_LIBBPF_SUPPORT
1210 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
1211         set_nobuild('\0', "clang-path", true);
1212         set_nobuild('\0', "clang-opt", true);
1213 # undef set_nobuild
1214 #endif
1215
1216 #ifndef HAVE_BPF_PROLOGUE
1217 # if !defined (HAVE_DWARF_SUPPORT)
1218 #  define REASON  "NO_DWARF=1"
1219 # elif !defined (HAVE_LIBBPF_SUPPORT)
1220 #  define REASON  "NO_LIBBPF=1"
1221 # else
1222 #  define REASON  "this architecture doesn't support BPF prologue"
1223 # endif
1224 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
1225         set_nobuild('\0', "vmlinux", true);
1226 # undef set_nobuild
1227 # undef REASON
1228 #endif
1229
1230         rec->evlist = perf_evlist__new();
1231         if (rec->evlist == NULL)
1232                 return -ENOMEM;
1233
1234         perf_config(perf_record_config, rec);
1235
1236         argc = parse_options(argc, argv, record_options, record_usage,
1237                             PARSE_OPT_STOP_AT_NON_OPTION);
1238         if (!argc && target__none(&rec->opts.target))
1239                 usage_with_options(record_usage, record_options);
1240
1241         if (nr_cgroups && !rec->opts.target.system_wide) {
1242                 usage_with_options_msg(record_usage, record_options,
1243                         "cgroup monitoring only available in system-wide mode");
1244
1245         }
1246         if (rec->opts.record_switch_events &&
1247             !perf_can_record_switch_events()) {
1248                 ui__error("kernel does not support recording context switch events\n");
1249                 parse_options_usage(record_usage, record_options, "switch-events", 0);
1250                 return -EINVAL;
1251         }
1252
1253         if (!rec->itr) {
1254                 rec->itr = auxtrace_record__init(rec->evlist, &err);
1255                 if (err)
1256                         return err;
1257         }
1258
1259         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
1260                                               rec->opts.auxtrace_snapshot_opts);
1261         if (err)
1262                 return err;
1263
1264         err = -ENOMEM;
1265
1266         symbol__init(NULL);
1267
1268         if (symbol_conf.kptr_restrict)
1269                 pr_warning(
1270 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1271 "check /proc/sys/kernel/kptr_restrict.\n\n"
1272 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1273 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1274 "Samples in kernel modules won't be resolved at all.\n\n"
1275 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1276 "even with a suitable vmlinux or kallsyms file.\n\n");
1277
1278         if (rec->no_buildid_cache || rec->no_buildid)
1279                 disable_buildid_cache();
1280
1281         if (rec->evlist->nr_entries == 0 &&
1282             perf_evlist__add_default(rec->evlist) < 0) {
1283                 pr_err("Not enough memory for event selector list\n");
1284                 goto out_symbol_exit;
1285         }
1286
1287         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
1288                 rec->opts.no_inherit = true;
1289
1290         err = target__validate(&rec->opts.target);
1291         if (err) {
1292                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1293                 ui__warning("%s", errbuf);
1294         }
1295
1296         err = target__parse_uid(&rec->opts.target);
1297         if (err) {
1298                 int saved_errno = errno;
1299
1300                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1301                 ui__error("%s", errbuf);
1302
1303                 err = -saved_errno;
1304                 goto out_symbol_exit;
1305         }
1306
1307         err = -ENOMEM;
1308         if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
1309                 usage_with_options(record_usage, record_options);
1310
1311         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
1312         if (err)
1313                 goto out_symbol_exit;
1314
1315         /*
1316          * We take all buildids when the file contains
1317          * AUX area tracing data because we do not decode the
1318          * trace because it would take too long.
1319          */
1320         if (rec->opts.full_auxtrace)
1321                 rec->buildid_all = true;
1322
1323         if (record_opts__config(&rec->opts)) {
1324                 err = -EINVAL;
1325                 goto out_symbol_exit;
1326         }
1327
1328         err = __cmd_record(&record, argc, argv);
1329 out_symbol_exit:
1330         perf_evlist__delete(rec->evlist);
1331         symbol__exit();
1332         auxtrace_record__free(rec->itr);
1333         return err;
1334 }
1335
1336 static void snapshot_sig_handler(int sig __maybe_unused)
1337 {
1338         if (!auxtrace_snapshot_enabled)
1339                 return;
1340         auxtrace_snapshot_enabled = 0;
1341         auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
1342         auxtrace_record__snapshot_started = 1;
1343 }