perf/x86/intel/bts: Make sure debug store is valid
arch/x86/events/intel/bts.c
/*
 * BTS PMU driver for perf
 * Copyright (c) 2013-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#undef DEBUG

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/coredump.h>

#include <asm-generic/sizes.h>
#include <asm/perf_event.h>

#include "../perf_event.h"

struct bts_ctx {
        struct perf_output_handle       handle;
        struct debug_store              ds_back;
        int                             state;
};

/* BTS context states: */
enum {
        /* no ongoing AUX transactions */
        BTS_STATE_STOPPED = 0,
        /* AUX transaction is on, BTS tracing is disabled */
        BTS_STATE_INACTIVE,
        /* AUX transaction is on, BTS tracing is running */
        BTS_STATE_ACTIVE,
};

static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);

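/*
 * One BTS record is three 8-byte fields (branch-from, branch-to, flags),
 * hence 24 bytes.  The safety margin keeps the interrupt threshold that far
 * below the end of the buffer, presumably so that records written between
 * the threshold being crossed and BTS being disabled still fit.
 */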
#define BTS_RECORD_SIZE         24
#define BTS_SAFETY_MARGIN       4080

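/*
 * One physically contiguous chunk of the AUX buffer:
 * @page:         first page of the chunk
 * @size:         usable bytes, trimmed to a multiple of BTS_RECORD_SIZE
 * @offset:       offset of the chunk within the AUX buffer
 * @displacement: bytes skipped at the start of the chunk so that records
 *                stay BTS_RECORD_SIZE-aligned across chunk boundaries
 */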
struct bts_phys {
        struct page     *page;
        unsigned long   size;
        unsigned long   offset;
        unsigned long   displacement;
};

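/*
 * Software state for one AUX buffer: the array of physical chunks built by
 * bts_buffer_setup_aux(), the current write position (head/cur_buf), the
 * bytes collected since the last perf_aux_output_end() (data_size) and an
 * overrun flag (lost).
 */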
struct bts_buffer {
        size_t          real_size;      /* multiple of BTS_RECORD_SIZE */
        unsigned int    nr_pages;
        unsigned int    nr_bufs;
        unsigned int    cur_buf;
        bool            snapshot;
        local_t         data_size;
        local_t         lost;
        local_t         head;
        unsigned long   end;
        void            **data_pages;
        struct bts_phys buf[0];
};

struct pmu bts_pmu;

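/*
 * High-order AUX pages carry their allocation order in page_private(), so
 * the byte size of the contiguous chunk starting at @page is
 * PAGE_SIZE << order.
 */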
static size_t buf_size(struct page *page)
{
        return 1 << (PAGE_SHIFT + page_private(page));
}

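/*
 * AUX buffer setup callback: walk the page array, group high-order
 * allocations into physically contiguous chunks and record each chunk's
 * offset and displacement.  In overwrite (snapshot) mode only a single
 * chunk is accepted, since there is no PMI to switch chunks.
 */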
static void *
bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
{
        struct bts_buffer *buf;
        struct page *page;
        int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
        unsigned long offset;
        size_t size = nr_pages << PAGE_SHIFT;
        int pg, nbuf, pad;

        /* count all the high order buffers */
        for (pg = 0, nbuf = 0; pg < nr_pages;) {
                page = virt_to_page(pages[pg]);
                if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1))
                        return NULL;
                pg += 1 << page_private(page);
                nbuf++;
        }

        /*
         * To avoid interrupts in overwrite mode, only allow one
         * physical buffer.
         */
        if (overwrite && nbuf > 1)
                return NULL;

        buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
        if (!buf)
                return NULL;

        buf->nr_pages = nr_pages;
        buf->nr_bufs = nbuf;
        buf->snapshot = overwrite;
        buf->data_pages = pages;
        buf->real_size = size - size % BTS_RECORD_SIZE;

        for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
                unsigned int __nr_pages;

                page = virt_to_page(pages[pg]);
                __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1;
                buf->buf[nbuf].page = page;
                buf->buf[nbuf].offset = offset;
                buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
                buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
                pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
                buf->buf[nbuf].size -= pad;

                pg += __nr_pages;
                offset += __nr_pages << PAGE_SHIFT;
        }

        return buf;
}

static void bts_buffer_free_aux(void *data)
{
        kfree(data);
}

static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
{
        return buf->buf[idx].offset + buf->buf[idx].displacement;
}

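/*
 * Program the DS area for the current chunk: buffer base, write index,
 * absolute maximum and interrupt threshold.  In snapshot mode the threshold
 * is placed past the absolute maximum so that no PMI is ever raised.
 */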
static void
bts_config_buffer(struct bts_buffer *buf)
{
        int cpu = raw_smp_processor_id();
        struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
        struct bts_phys *phys = &buf->buf[buf->cur_buf];
        unsigned long index, thresh = 0, end = phys->size;
        struct page *page = phys->page;

        index = local_read(&buf->head);

        if (!buf->snapshot) {
                if (buf->end < phys->offset + buf_size(page))
                        end = buf->end - phys->offset - phys->displacement;

                index -= phys->offset + phys->displacement;

                if (end - index > BTS_SAFETY_MARGIN)
                        thresh = end - BTS_SAFETY_MARGIN;
                else if (end - index > BTS_RECORD_SIZE)
                        thresh = end - BTS_RECORD_SIZE;
                else
                        thresh = end;
        }

        ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
        ds->bts_index = ds->bts_buffer_base + index;
        ds->bts_absolute_maximum = ds->bts_buffer_base + end;
        ds->bts_interrupt_threshold = !buf->snapshot
                ? ds->bts_buffer_base + thresh
                : ds->bts_absolute_maximum + BTS_RECORD_SIZE;
}

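/*
 * Zero-fill the chunk from @head to its end so that the unused tail reads
 * as padding rather than stale records.
 */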
static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
{
        unsigned long index = head - phys->offset;

        memset(page_address(phys->page) + index, 0, phys->size - index);
}

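/*
 * Fold the hardware write pointer (ds->bts_index) back into the software
 * head and account the newly written bytes; hitting the absolute maximum
 * counts as an overrun.  In snapshot mode data_size simply tracks the
 * current head.
 */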
static void bts_update(struct bts_ctx *bts)
{
        int cpu = raw_smp_processor_id();
        struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
        struct bts_buffer *buf = perf_get_aux(&bts->handle);
        unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;

        if (!buf)
                return;

        head = index + bts_buffer_offset(buf, buf->cur_buf);
        old = local_xchg(&buf->head, head);

        if (!buf->snapshot) {
                if (old == head)
                        return;

                if (ds->bts_index >= ds->bts_absolute_maximum)
                        local_inc(&buf->lost);

                /*
                 * old and head are always in the same physical buffer, so we
                 * can subtract them to get the data size.
                 */
                local_add(head - old, &buf->data_size);
        } else {
                local_set(&buf->data_size, head);
        }
}

static int
bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle);

/*
 * Ordering PMU callbacks wrt themselves and the PMI is done by means
 * of bts::state, which:
 *  - is set when bts::handle::event is valid, that is, between
 *    perf_aux_output_begin() and perf_aux_output_end();
 *  - is zero otherwise;
 *  - is ordered against bts::handle::event with a compiler barrier.
 */

static void __bts_event_start(struct perf_event *event)
{
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
        struct bts_buffer *buf = perf_get_aux(&bts->handle);
        u64 config = 0;

        if (!buf->snapshot)
                config |= ARCH_PERFMON_EVENTSEL_INT;
        if (!event->attr.exclude_kernel)
                config |= ARCH_PERFMON_EVENTSEL_OS;
        if (!event->attr.exclude_user)
                config |= ARCH_PERFMON_EVENTSEL_USR;

        bts_config_buffer(buf);

        /*
         * local barrier to make sure that ds configuration made it
         * before we enable BTS and bts::state goes ACTIVE
         */
        wmb();

        /* INACTIVE/STOPPED -> ACTIVE */
        WRITE_ONCE(bts->state, BTS_STATE_ACTIVE);

        intel_pmu_enable_bts(config);
}

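/*
 * Start an AUX transaction for @event: prime the buffer via
 * bts_buffer_reset(), save the original DS fields so bts_event_stop() can
 * restore them, then enable tracing through __bts_event_start().
 */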
static void bts_event_start(struct perf_event *event, int flags)
{
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
        struct bts_buffer *buf;

        buf = perf_aux_output_begin(&bts->handle, event);
        if (!buf)
                goto fail_stop;

        if (bts_buffer_reset(buf, &bts->handle))
                goto fail_end_stop;

        bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
        bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
        bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;

        event->hw.itrace_started = 1;
        event->hw.state = 0;

        __bts_event_start(event);

        return;

fail_end_stop:
        perf_aux_output_end(&bts->handle, 0, false);

fail_stop:
        event->hw.state = PERF_HES_STOPPED;
}

static void __bts_event_stop(struct perf_event *event, int state)
{
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);

        /* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */
        WRITE_ONCE(bts->state, state);

        /*
         * No extra synchronization is mandated by the documentation to have
         * BTS data stores globally visible.
         */
        intel_pmu_disable_bts();
}

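/*
 * Stop tracing; with PERF_EF_UPDATE also fold the hardware state into the
 * buffer, end the AUX transaction and restore the DS fields saved in
 * bts_event_start().
 */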
static void bts_event_stop(struct perf_event *event, int flags)
{
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
        struct bts_buffer *buf = NULL;
        int state = READ_ONCE(bts->state);

        if (state == BTS_STATE_ACTIVE)
                __bts_event_stop(event, BTS_STATE_STOPPED);

        if (state != BTS_STATE_STOPPED)
                buf = perf_get_aux(&bts->handle);

        event->hw.state |= PERF_HES_STOPPED;

        if (flags & PERF_EF_UPDATE) {
                bts_update(bts);

                if (buf) {
                        if (buf->snapshot)
                                bts->handle.head =
                                        local_xchg(&buf->data_size,
                                                   buf->nr_pages << PAGE_SHIFT);

                        perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
                                            !!local_xchg(&buf->lost, 0));
                }

                cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
                cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
                cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
                cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
        }
}

void intel_bts_enable_local(void)
{
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
        int state = READ_ONCE(bts->state);

        /*
         * Here we transition from INACTIVE to ACTIVE;
         * if we instead are STOPPED from the interrupt handler,
         * stay that way. Can't be ACTIVE here though.
         */
        if (WARN_ON_ONCE(state == BTS_STATE_ACTIVE))
                return;

        if (state == BTS_STATE_STOPPED)
                return;

        if (bts->handle.event)
                __bts_event_start(bts->handle.event);
}

void intel_bts_disable_local(void)
{
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);

        /*
         * Here we transition from ACTIVE to INACTIVE;
         * do nothing for STOPPED or INACTIVE.
         */
        if (READ_ONCE(bts->state) != BTS_STATE_ACTIVE)
                return;

        if (bts->handle.event)
                __bts_event_stop(bts->handle.event, BTS_STATE_INACTIVE);
}

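/*
 * Prepare the buffer for the next stretch of tracing: compute the space
 * left in the current chunk, switch to the next chunk if the remainder is
 * within the safety margin (padding out what is left), and clamp the end
 * to the wakeup watermark.  Returns -ENOSPC when no usable space remains.
 */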
static int
bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
{
        unsigned long head, space, next_space, pad, gap, skip, wakeup;
        unsigned int next_buf;
        struct bts_phys *phys, *next_phys;
        int ret;

        if (buf->snapshot)
                return 0;

        head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);

        phys = &buf->buf[buf->cur_buf];
        space = phys->offset + phys->displacement + phys->size - head;
        pad = space;
        if (space > handle->size) {
                space = handle->size;
                space -= space % BTS_RECORD_SIZE;
        }
        if (space <= BTS_SAFETY_MARGIN) {
                /* See if next phys buffer has more space */
                next_buf = buf->cur_buf + 1;
                if (next_buf >= buf->nr_bufs)
                        next_buf = 0;
                next_phys = &buf->buf[next_buf];
                gap = buf_size(phys->page) - phys->displacement - phys->size +
                      next_phys->displacement;
                skip = pad + gap;
                if (handle->size >= skip) {
                        next_space = next_phys->size;
                        if (next_space + skip > handle->size) {
                                next_space = handle->size - skip;
                                next_space -= next_space % BTS_RECORD_SIZE;
                        }
                        if (next_space > space || !space) {
                                if (pad)
                                        bts_buffer_pad_out(phys, head);
                                ret = perf_aux_output_skip(handle, skip);
                                if (ret)
                                        return ret;
                                /* Advance to next phys buffer */
                                phys = next_phys;
                                space = next_space;
                                head = phys->offset + phys->displacement;
                                /*
                                 * After this, cur_buf and head won't match ds
                                 * anymore, so we must not be racing with
                                 * bts_update().
                                 */
                                buf->cur_buf = next_buf;
                                local_set(&buf->head, head);
                        }
                }
        }

        /* Don't go far beyond wakeup watermark */
        wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
                 handle->head;
        if (space > wakeup) {
                space = wakeup;
                space -= space % BTS_RECORD_SIZE;
        }

        buf->end = head + space;

        /*
         * If we have no space, the lost notification would have been sent when
         * we hit absolute_maximum - see bts_update()
         */
        if (!space)
                return -ENOSPC;

        return 0;
}

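/*
 * PMI handler: claim the NMI if the DS write pointer has crossed the
 * interrupt threshold (the check tolerates a NULL DS, in which case the
 * interrupt is not claimed), publish the data collected so far and try to
 * start a new transaction; if that fails, mark the context STOPPED so that
 * a racing intel_bts_enable_local() leaves BTS off.
 */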
int intel_bts_interrupt(void)
{
        struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds;
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
        struct perf_event *event = bts->handle.event;
        struct bts_buffer *buf;
        s64 old_head;
        int err = -ENOSPC, handled = 0;

        /*
         * The only surefire way of knowing if this NMI is ours is by checking
         * the write ptr against the PMI threshold.
         */
        if (ds && (ds->bts_index >= ds->bts_interrupt_threshold))
                handled = 1;

        /*
         * this is wrapped in intel_bts_enable_local/intel_bts_disable_local,
         * so we can only be INACTIVE or STOPPED
         */
        if (READ_ONCE(bts->state) == BTS_STATE_STOPPED)
                return handled;

        buf = perf_get_aux(&bts->handle);
        if (!buf)
                return handled;

        /*
         * Skip snapshot counters: they don't use the interrupt, but
         * there's no other way of telling, because the pointer will
         * keep moving
         */
        if (buf->snapshot)
                return 0;

        old_head = local_read(&buf->head);
        bts_update(bts);

        /* no new data */
        if (old_head == local_read(&buf->head))
                return handled;

        perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
                            !!local_xchg(&buf->lost, 0));

        buf = perf_aux_output_begin(&bts->handle, event);
        if (buf)
                err = bts_buffer_reset(buf, &bts->handle);

        if (err) {
                WRITE_ONCE(bts->state, BTS_STATE_STOPPED);

                if (buf) {
                        /*
                         * BTS_STATE_STOPPED should be visible before
                         * cleared handle::event
                         */
                        barrier();
                        perf_aux_output_end(&bts->handle, 0, false);
                }
        }

        return 1;
}

static void bts_event_del(struct perf_event *event, int mode)
{
        bts_event_stop(event, PERF_EF_UPDATE);
}

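/*
 * Add callback: only one BTS event can be active per CPU, so refuse if the
 * fixed BTS counter is already taken or another event owns the handle.
 */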
static int bts_event_add(struct perf_event *event, int mode)
{
        struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
        struct hw_perf_event *hwc = &event->hw;

        event->hw.state = PERF_HES_STOPPED;

        if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
                return -EBUSY;

        if (bts->handle.event)
                return -EBUSY;

        if (mode & PERF_EF_START) {
                bts_event_start(event, 0);
                if (hwc->state & PERF_HES_STOPPED)
                        return -EINVAL;
        }

        return 0;
}

static void bts_event_destroy(struct perf_event *event)
{
        x86_release_hardware();
        x86_del_exclusive(x86_lbr_exclusive_bts);
}

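/*
 * bts_event_init() takes the x86_lbr_exclusive_bts reservation and the
 * generic hardware reservation; both are dropped again in
 * bts_event_destroy() (or immediately if x86_reserve_hardware() fails).
 */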
static int bts_event_init(struct perf_event *event)
{
        int ret;

        if (event->attr.type != bts_pmu.type)
                return -ENOENT;

        if (x86_add_exclusive(x86_lbr_exclusive_bts))
                return -EBUSY;

        /*
         * BTS leaks kernel addresses even when CPL0 tracing is
         * disabled, so disallow intel_bts driver for unprivileged
         * users on paranoid systems since it provides trace data
         * to the user in a zero-copy fashion.
         *
         * Note that the default paranoia setting permits unprivileged
         * users to profile the kernel.
         */
        if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
            !capable(CAP_SYS_ADMIN))
                return -EACCES;

        ret = x86_reserve_hardware();
        if (ret) {
                x86_del_exclusive(x86_lbr_exclusive_bts);
                return ret;
        }

        event->destroy = bts_event_destroy;

        return 0;
}

static void bts_event_read(struct perf_event *event)
{
}

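/*
 * BTS requires the 64-bit DS format (DTES64) and a PMU that supports BTS.
 * PERF_PMU_CAP_AUX_NO_SG makes the core allocate the AUX buffer from
 * high-order physically contiguous chunks, which bts_buffer_setup_aux()
 * above relies on.  The PMU is registered as "intel_bts" with a
 * dynamically assigned type.
 */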
static __init int bts_init(void)
{
        if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
                return -ENODEV;

        bts_pmu.capabilities    = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE;
        bts_pmu.task_ctx_nr     = perf_sw_context;
        bts_pmu.event_init      = bts_event_init;
        bts_pmu.add             = bts_event_add;
        bts_pmu.del             = bts_event_del;
        bts_pmu.start           = bts_event_start;
        bts_pmu.stop            = bts_event_stop;
        bts_pmu.read            = bts_event_read;
        bts_pmu.setup_aux       = bts_buffer_setup_aux;
        bts_pmu.free_aux        = bts_buffer_free_aux;

        return perf_pmu_register(&bts_pmu, "intel_bts", -1);
}
arch_initcall(bts_init);