perf/x86/intel: Support task events with Intel CQM
[cascardo/linux.git] arch/x86/kernel/cpu/perf_event_intel_cqm.c
1 /*
2  * Intel Cache Quality-of-Service Monitoring (CQM) support.
3  *
4  * Based very, very heavily on work by Peter Zijlstra.
5  */
6
7 #include <linux/perf_event.h>
8 #include <linux/slab.h>
9 #include <asm/cpu_device_id.h>
10 #include "perf_event.h"
11
12 #define MSR_IA32_PQR_ASSOC      0x0c8f
13 #define MSR_IA32_QM_CTR         0x0c8e
14 #define MSR_IA32_QM_EVTSEL      0x0c8d
15
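/*
 * Rough flow of how these three MSRs are used below (a sketch of what
 * __rmid_read() and intel_cqm_event_start() do, not SDM pseudo-code):
 *
 *	wrmsrl(MSR_IA32_PQR_ASSOC, rmid);		// tag this CPU's LLC traffic
 *	...						// monitored task runs
 *	wrmsr(MSR_IA32_QM_EVTSEL, event_id, rmid);	// select event + RMID
 *	rdmsrl(MSR_IA32_QM_CTR, val);			// occupancy count, with
 *							// ERROR/UNAVAIL in bits 63/62
 */
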
16 static unsigned int cqm_max_rmid = -1;
17 static unsigned int cqm_l3_scale; /* supposedly cacheline size */
18
19 struct intel_cqm_state {
20         raw_spinlock_t          lock;
21         int                     rmid;
22         int                     cnt;
23 };
24
25 static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
26
27 /*
28  * Protects cache_cgroups and cqm_rmid_lru.
29  */
30 static DEFINE_MUTEX(cache_mutex);
31
32 /*
33  * Groups of events that have the same target(s), one RMID per group.
34  */
35 static LIST_HEAD(cache_groups);
36
37 /*
38  * Mask of CPUs for reading CQM values. We only need one reader per socket.
39  */
40 static cpumask_t cqm_cpumask;
41
42 #define RMID_VAL_ERROR          (1ULL << 63)
43 #define RMID_VAL_UNAVAIL        (1ULL << 62)
44
45 #define QOS_L3_OCCUP_EVENT_ID   (1 << 0)
46
47 #define QOS_EVENT_MASK  QOS_L3_OCCUP_EVENT_ID
48
49 static u64 __rmid_read(unsigned long rmid)
50 {
51         u64 val;
52
53         /*
54          * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
55          * it just says that to increase confusion.
56          */
57         wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
58         rdmsrl(MSR_IA32_QM_CTR, val);
59
60         /*
61          * Aside from the ERROR and UNAVAIL bits, assume this thing returns
62          * the number of cachelines tagged with @rmid.
63          */
64         return val;
65 }
66
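/*
 * __rmid_read() returns a raw line count; conversion to bytes is left
 * to userspace via the "llc_occupancy.scale" attribute (cqm_l3_scale).
 * A sketch of what an in-kernel caller wanting bytes would do:
 *
 *	u64 val = __rmid_read(rmid);
 *
 *	if (!(val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)))
 *		bytes = val * cqm_l3_scale;
 */
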
67 struct cqm_rmid_entry {
68         u64 rmid;
69         struct list_head list;
70 };
71
72 /*
73  * A least recently used list of RMIDs.
74  *
75  * Oldest entry at the head, newest (most recently used) entry at the
76  * tail. This list is never traversed, it's only used to keep track of
77  * the lru order. That is, we only pick entries off the head or insert
78  * them on the tail.
79  *
80  * All entries on the list are 'free', and their RMIDs are not currently
81  * in use. To mark an RMID as in use, remove its entry from the lru
82  * list.
83  *
84  * This list is protected by cache_mutex.
85  */
86 static LIST_HEAD(cqm_rmid_lru);
87
88 /*
89  * We use a simple array of pointers so that we can lookup a struct
90  * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
91  * and __put_rmid() from having to worry about dealing with struct
92  * cqm_rmid_entry - they just deal with rmids, i.e. integers.
93  *
94  * Once this array is initialized it is read-only. No locks are required
95  * to access it.
96  *
97  * All entries for all RMIDs can be looked up in this array at all
98  * times.
99  */
100 static struct cqm_rmid_entry **cqm_rmid_ptrs;
101
102 static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
103 {
104         struct cqm_rmid_entry *entry;
105
106         entry = cqm_rmid_ptrs[rmid];
107         WARN_ON(entry->rmid != rmid);
108
109         return entry;
110 }
111
112 /*
113  * Returns < 0 on failure.
114  *
115  * We expect to be called with cache_mutex held.
116  */
117 static int __get_rmid(void)
118 {
119         struct cqm_rmid_entry *entry;
120
121         lockdep_assert_held(&cache_mutex);
122
123         if (list_empty(&cqm_rmid_lru))
124                 return -EAGAIN;
125
126         entry = list_first_entry(&cqm_rmid_lru, struct cqm_rmid_entry, list);
127         list_del(&entry->list);
128
129         return entry->rmid;
130 }
131
132 static void __put_rmid(int rmid)
133 {
134         struct cqm_rmid_entry *entry;
135
136         lockdep_assert_held(&cache_mutex);
137
138         entry = __rmid_entry(rmid);
139
140         list_add_tail(&entry->list, &cqm_rmid_lru);
141 }
142
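/*
 * Sketch of the expected allocation pattern (cache_mutex must be held,
 * as both helpers assert via lockdep):
 *
 *	rmid = __get_rmid();
 *	if (rmid < 0)
 *		return rmid;	// every RMID is currently in use
 *	...
 *	__put_rmid(rmid);	// back onto the tail of the LRU list
 */
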
143 static int intel_cqm_setup_rmid_cache(void)
144 {
145         struct cqm_rmid_entry *entry;
146         int r;
147
148         cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
149                                 (cqm_max_rmid + 1), GFP_KERNEL);
150         if (!cqm_rmid_ptrs)
151                 return -ENOMEM;
152
153         for (r = 0; r <= cqm_max_rmid; r++) {
154                 struct cqm_rmid_entry *entry;
155
156                 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
157                 if (!entry)
158                         goto fail;
159
160                 INIT_LIST_HEAD(&entry->list);
161                 entry->rmid = r;
162                 cqm_rmid_ptrs[r] = entry;
163
164                 list_add_tail(&entry->list, &cqm_rmid_lru);
165         }
166
167         /*
168          * RMID 0 is special and is always allocated. It's used for all
169          * tasks that are not monitored.
170          */
171         entry = __rmid_entry(0);
172         list_del(&entry->list);
173
174         return 0;
175 fail:
176         while (r--)
177                 kfree(cqm_rmid_ptrs[r]);
178
179         kfree(cqm_rmid_ptrs);
180         return -ENOMEM;
181 }
182
183 /*
184  * Determine if @a and @b measure the same set of tasks.
185  *
186  * If @a and @b measure the same set of tasks then we want to share a
187  * single RMID.
188  */
189 static bool __match_event(struct perf_event *a, struct perf_event *b)
190 {
191         /* Per-cpu and task events don't mix */
192         if ((a->attach_state & PERF_ATTACH_TASK) !=
193             (b->attach_state & PERF_ATTACH_TASK))
194                 return false;
195
196 #ifdef CONFIG_CGROUP_PERF
197         if (a->cgrp != b->cgrp)
198                 return false;
199 #endif
200
201         /* If not task event, we're machine wide */
202         if (!(b->attach_state & PERF_ATTACH_TASK))
203                 return true;
204
205         /*
206          * Events that target same task are placed into the same cache group.
207          */
208         if (a->hw.cqm_target == b->hw.cqm_target)
209                 return true;
210
211         /*
212          * Are we an inherited event?
213          */
214         if (b->parent == a)
215                 return true;
216
217         return false;
218 }
219
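/*
 * Informal example of the sharing rule above: two llc_occupancy events
 * opened against the same task T compare equal via ->hw.cqm_target, so
 * the second one reuses the first one's RMID instead of allocating a
 * new one (see intel_cqm_setup_event() below), e.g.:
 *
 *	event A on task T	-> __get_rmid(), say RMID 5
 *	event B on task T	-> __match_event(A, B) == true
 *				-> B->hw.cqm_rmid = 5, shared with A
 */
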
220 #ifdef CONFIG_CGROUP_PERF
221 static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
222 {
223         if (event->attach_state & PERF_ATTACH_TASK)
224                 return perf_cgroup_from_task(event->hw.cqm_target);
225
226         return event->cgrp;
227 }
228 #endif
229
230 /*
231  * Determine if @a's tasks intersect with @b's tasks
232  *
233  * There are combinations of events that we explicitly prohibit,
234  *
235  *                 PROHIBITS
236  *     system-wide    ->        cgroup and task
237  *     cgroup         ->        system-wide
238  *                    ->        task in cgroup
239  *     task           ->        system-wide
240  *                    ->        task in cgroup
241  *
242  * Call this function before allocating an RMID.
243  */
244 static bool __conflict_event(struct perf_event *a, struct perf_event *b)
245 {
246 #ifdef CONFIG_CGROUP_PERF
247         /*
248          * We can have any number of cgroups but only one system-wide
249          * event at a time.
250          */
251         if (a->cgrp && b->cgrp) {
252                 struct perf_cgroup *ac = a->cgrp;
253                 struct perf_cgroup *bc = b->cgrp;
254
255                 /*
256                  * This condition should have been caught in
257                  * __match_event() and we should be sharing an RMID.
258                  */
259                 WARN_ON_ONCE(ac == bc);
260
261                 if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
262                     cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
263                         return true;
264
265                 return false;
266         }
267
268         if (a->cgrp || b->cgrp) {
269                 struct perf_cgroup *ac, *bc;
270
271                 /*
272                  * cgroup and system-wide events are mutually exclusive
273                  */
274                 if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
275                     (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
276                         return true;
277
278                 /*
279                  * Ensure neither event is part of the other's cgroup
280                  */
281                 ac = event_to_cgroup(a);
282                 bc = event_to_cgroup(b);
283                 if (ac == bc)
284                         return true;
285
286                 /*
287                  * Must have cgroup and non-intersecting task events.
288                  */
289                 if (!ac || !bc)
290                         return false;
291
292                 /*
293                  * We have cgroup and task events, and the task belongs
294                  * to a cgroup. Check for overlap.
295                  */
296                 if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
297                     cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
298                         return true;
299
300                 return false;
301         }
302 #endif
303         /*
304          * If one of them is not a task, same story as above with cgroups.
305          */
306         if (!(a->attach_state & PERF_ATTACH_TASK) ||
307             !(b->attach_state & PERF_ATTACH_TASK))
308                 return true;
309
310         /*
311          * Must be non-overlapping.
312          */
313         return false;
314 }
315
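/*
 * Concrete cases of the table above, as __conflict_event() evaluates
 * them (informal examples, not literal driver code):
 *
 *	system-wide event  vs  task event on T     -> conflict (-EBUSY)
 *	cgroup C event     vs  task event, T in C  -> conflict
 *	task event on T1   vs  task event on T2    -> no conflict, each
 *						      gets its own RMID
 */
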
316 /*
317  * Find a group and setup RMID.
318  *
319  * If we're part of a group, we use the group's RMID.
320  */
321 static int intel_cqm_setup_event(struct perf_event *event,
322                                  struct perf_event **group)
323 {
324         struct perf_event *iter;
325         int rmid;
326
327         list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
328                 if (__match_event(iter, event)) {
329                         /* All tasks in a group share an RMID */
330                         event->hw.cqm_rmid = iter->hw.cqm_rmid;
331                         *group = iter;
332                         return 0;
333                 }
334
335                 if (__conflict_event(iter, event))
336                         return -EBUSY;
337         }
338
339         rmid = __get_rmid();
340         if (rmid < 0)
341                 return rmid;
342
343         event->hw.cqm_rmid = rmid;
344         return 0;
345 }
346
347 static void intel_cqm_event_read(struct perf_event *event)
348 {
349         unsigned long rmid;
350         u64 val;
351
352         /*
353          * Task events are handled by intel_cqm_event_count().
354          */
355         if (event->cpu == -1)
356                 return;
357
358         rmid = event->hw.cqm_rmid;
359         val = __rmid_read(rmid);
360
361         /*
362          * Ignore this reading on error states and do not update the value.
363          */
364         if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
365                 return;
366
367         local64_set(&event->count, val);
368 }
369
370 struct rmid_read {
371         unsigned int rmid;
372         atomic64_t value;
373 };
374
375 static void __intel_cqm_event_count(void *info)
376 {
377         struct rmid_read *rr = info;
378         u64 val;
379
380         val = __rmid_read(rr->rmid);
381
382         if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
383                 return;
384
385         atomic64_add(val, &rr->value);
386 }
387
388 static inline bool cqm_group_leader(struct perf_event *event)
389 {
390         return !list_empty(&event->hw.cqm_groups_entry);
391 }
392
393 static u64 intel_cqm_event_count(struct perf_event *event)
394 {
395         struct rmid_read rr = {
396                 .rmid = event->hw.cqm_rmid,
397                 .value = ATOMIC64_INIT(0),
398         };
399
400         /*
401          * We only need to worry about task events. System-wide events
402          * are handled like usual, i.e. entirely with
403          * intel_cqm_event_read().
404          */
405         if (event->cpu != -1)
406                 return __perf_event_count(event);
407
408         /*
409          * Only the group leader gets to report values. This stops us
410          * reporting duplicate values to userspace, and gives us a clear
411          * rule for which task gets to report the values.
412          *
413          * Note that it is impossible to attribute these values to
414          * specific packages - we forfeit that ability when we create
415          * task events.
416          */
417         if (!cqm_group_leader(event))
418                 return 0;
419
420         on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
421
422         local64_set(&event->count, atomic64_read(&rr.value));
423
424         return __perf_event_count(event);
425 }
426
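/*
 * For a task event the value returned above is therefore the sum of the
 * RMID's occupancy readings from one reader CPU on each package, e.g.
 * on a two-socket system:
 *
 *	count = __rmid_read(rmid) on socket 0
 *	      + __rmid_read(rmid) on socket 1
 *
 * which is why a per-package breakdown cannot be reported for task
 * events.
 */
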
427 static void intel_cqm_event_start(struct perf_event *event, int mode)
428 {
429         struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
430         unsigned long rmid = event->hw.cqm_rmid;
431         unsigned long flags;
432
433         if (!(event->hw.cqm_state & PERF_HES_STOPPED))
434                 return;
435
436         event->hw.cqm_state &= ~PERF_HES_STOPPED;
437
438         raw_spin_lock_irqsave(&state->lock, flags);
439
440         if (state->cnt++)
441                 WARN_ON_ONCE(state->rmid != rmid);
442         else
443                 WARN_ON_ONCE(state->rmid);
444
445         state->rmid = rmid;
446         wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);
447
448         raw_spin_unlock_irqrestore(&state->lock, flags);
449 }
450
451 static void intel_cqm_event_stop(struct perf_event *event, int mode)
452 {
453         struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
454         unsigned long flags;
455
456         if (event->hw.cqm_state & PERF_HES_STOPPED)
457                 return;
458
459         event->hw.cqm_state |= PERF_HES_STOPPED;
460
461         raw_spin_lock_irqsave(&state->lock, flags);
462         intel_cqm_event_read(event);
463
464         if (!--state->cnt) {
465                 state->rmid = 0;
466                 wrmsrl(MSR_IA32_PQR_ASSOC, 0);
467         } else {
468                 WARN_ON_ONCE(!state->rmid);
469         }
470
471         raw_spin_unlock_irqrestore(&state->lock, flags);
472 }
473
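/*
 * The start/stop pair above effectively refcounts PQR_ASSOC per CPU,
 * e.g. for two co-scheduled events sharing one RMID on this CPU:
 *
 *	start(ev1)	-> cnt = 1, PQR_ASSOC = rmid
 *	start(ev2)	-> cnt = 2 (same rmid, otherwise it WARNs)
 *	stop(ev1)	-> cnt = 1, PQR_ASSOC left as rmid
 *	stop(ev2)	-> cnt = 0, PQR_ASSOC = 0, the unmonitored RMID
 */
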
474 static int intel_cqm_event_add(struct perf_event *event, int mode)
475 {
476         int rmid;
477
478         event->hw.cqm_state = PERF_HES_STOPPED;
479         rmid = event->hw.cqm_rmid;
480         WARN_ON_ONCE(!rmid);
481
482         if (mode & PERF_EF_START)
483                 intel_cqm_event_start(event, mode);
484
485         return 0;
486 }
487
488 static void intel_cqm_event_del(struct perf_event *event, int mode)
489 {
490         intel_cqm_event_stop(event, mode);
491 }
492
493 static void intel_cqm_event_destroy(struct perf_event *event)
494 {
495         struct perf_event *group_other = NULL;
496
497         mutex_lock(&cache_mutex);
498
499         /*
500          * If there's another event in this group...
501          */
502         if (!list_empty(&event->hw.cqm_group_entry)) {
503                 group_other = list_first_entry(&event->hw.cqm_group_entry,
504                                                struct perf_event,
505                                                hw.cqm_group_entry);
506                 list_del(&event->hw.cqm_group_entry);
507         }
508
509         /*
510          * And we're the group leader...
511          */
512         if (cqm_group_leader(event)) {
513                 /*
514                  * If there was a group_other, make that leader, otherwise
515                  * destroy the group and return the RMID.
516                  */
517                 if (group_other) {
518                         list_replace(&event->hw.cqm_groups_entry,
519                                      &group_other->hw.cqm_groups_entry);
520                 } else {
521                         int rmid = event->hw.cqm_rmid;
522
523                         __put_rmid(rmid);
524                         list_del(&event->hw.cqm_groups_entry);
525                 }
526         }
527
528         mutex_unlock(&cache_mutex);
529 }
530
531 static struct pmu intel_cqm_pmu;
532
533 static int intel_cqm_event_init(struct perf_event *event)
534 {
535         struct perf_event *group = NULL;
536         int err;
537
538         if (event->attr.type != intel_cqm_pmu.type)
539                 return -ENOENT;
540
541         if (event->attr.config & ~QOS_EVENT_MASK)
542                 return -EINVAL;
543
544         /* unsupported modes and filters */
545         if (event->attr.exclude_user   ||
546             event->attr.exclude_kernel ||
547             event->attr.exclude_hv     ||
548             event->attr.exclude_idle   ||
549             event->attr.exclude_host   ||
550             event->attr.exclude_guest  ||
551             event->attr.sample_period) /* no sampling */
552                 return -EINVAL;
553
554         INIT_LIST_HEAD(&event->hw.cqm_group_entry);
555         INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
556
557         event->destroy = intel_cqm_event_destroy;
558
559         mutex_lock(&cache_mutex);
560
561         /* Will also set rmid */
562         err = intel_cqm_setup_event(event, &group);
563         if (err)
564                 goto out;
565
566         if (group) {
567                 list_add_tail(&event->hw.cqm_group_entry,
568                               &group->hw.cqm_group_entry);
569         } else {
570                 list_add_tail(&event->hw.cqm_groups_entry,
571                               &cache_groups);
572         }
573
574 out:
575         mutex_unlock(&cache_mutex);
576         return err;
577 }
578
579 EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
580 EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
581 EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
582 EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
583 EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
584
585 static struct attribute *intel_cqm_events_attr[] = {
586         EVENT_PTR(intel_cqm_llc),
587         EVENT_PTR(intel_cqm_llc_pkg),
588         EVENT_PTR(intel_cqm_llc_unit),
589         EVENT_PTR(intel_cqm_llc_scale),
590         EVENT_PTR(intel_cqm_llc_snapshot),
591         NULL,
592 };
593
594 static struct attribute_group intel_cqm_events_group = {
595         .name = "events",
596         .attrs = intel_cqm_events_attr,
597 };
598
599 PMU_FORMAT_ATTR(event, "config:0-7");
600 static struct attribute *intel_cqm_formats_attr[] = {
601         &format_attr_event.attr,
602         NULL,
603 };
604
605 static struct attribute_group intel_cqm_format_group = {
606         .name = "format",
607         .attrs = intel_cqm_formats_attr,
608 };
609
610 static const struct attribute_group *intel_cqm_attr_groups[] = {
611         &intel_cqm_events_group,
612         &intel_cqm_format_group,
613         NULL,
614 };
615
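/*
 * With the attribute groups above, the PMU registered below as
 * "intel_cqm" is visible under /sys/bus/event_source/devices/intel_cqm/
 * and the event can be requested from userspace with something like
 * (exact perf tool syntax may vary):
 *
 *	perf stat -e intel_cqm/llc_occupancy/ -p <pid>
 *	perf stat -e intel_cqm/llc_occupancy/ -a -- sleep 1
 */
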
616 static struct pmu intel_cqm_pmu = {
617         .attr_groups    = intel_cqm_attr_groups,
618         .task_ctx_nr    = perf_sw_context,
619         .event_init     = intel_cqm_event_init,
620         .add            = intel_cqm_event_add,
621         .del            = intel_cqm_event_del,
622         .start          = intel_cqm_event_start,
623         .stop           = intel_cqm_event_stop,
624         .read           = intel_cqm_event_read,
625         .count          = intel_cqm_event_count,
626 };
627
628 static inline void cqm_pick_event_reader(int cpu)
629 {
630         int phys_id = topology_physical_package_id(cpu);
631         int i;
632
633         for_each_cpu(i, &cqm_cpumask) {
634                 if (phys_id == topology_physical_package_id(i))
635                         return; /* already got reader for this socket */
636         }
637
638         cpumask_set_cpu(cpu, &cqm_cpumask);
639 }
640
641 static void intel_cqm_cpu_prepare(unsigned int cpu)
642 {
643         struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
644         struct cpuinfo_x86 *c = &cpu_data(cpu);
645
646         raw_spin_lock_init(&state->lock);
647         state->rmid = 0;
648         state->cnt  = 0;
649
650         WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
651         WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
652 }
653
654 static void intel_cqm_cpu_exit(unsigned int cpu)
655 {
656         int phys_id = topology_physical_package_id(cpu);
657         int i;
658
659         /*
660          * Is @cpu a designated cqm reader?
661          */
662         if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
663                 return;
664
665         for_each_online_cpu(i) {
666                 if (i == cpu)
667                         continue;
668
669                 if (phys_id == topology_physical_package_id(i)) {
670                         cpumask_set_cpu(i, &cqm_cpumask);
671                         break;
672                 }
673         }
674 }
675
676 static int intel_cqm_cpu_notifier(struct notifier_block *nb,
677                                   unsigned long action, void *hcpu)
678 {
679         unsigned int cpu  = (unsigned long)hcpu;
680
681         switch (action & ~CPU_TASKS_FROZEN) {
682         case CPU_UP_PREPARE:
683                 intel_cqm_cpu_prepare(cpu);
684                 break;
685         case CPU_DOWN_PREPARE:
686                 intel_cqm_cpu_exit(cpu);
687                 break;
688         case CPU_STARTING:
689                 cqm_pick_event_reader(cpu);
690                 break;
691         }
692
693         return NOTIFY_OK;
694 }
695
696 static const struct x86_cpu_id intel_cqm_match[] = {
697         { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
698         {}
699 };
700
701 static int __init intel_cqm_init(void)
702 {
703         char *str, scale[20];
704         int i, cpu, ret;
705
706         if (!x86_match_cpu(intel_cqm_match))
707                 return -ENODEV;
708
709         cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
710
711         /*
712          * It's possible that not all resources support the same number
713          * of RMIDs. Instead of making scheduling much more complicated
714          * (where we have to match a task's RMID to a cpu that supports
715  * that many RMIDs) just find the minimum number of RMIDs supported
716  * across all cpus.
717          *
718          * Also, check that the scales match on all cpus.
719          */
720         cpu_notifier_register_begin();
721
722         for_each_online_cpu(cpu) {
723                 struct cpuinfo_x86 *c = &cpu_data(cpu);
724
725                 if (c->x86_cache_max_rmid < cqm_max_rmid)
726                         cqm_max_rmid = c->x86_cache_max_rmid;
727
728                 if (c->x86_cache_occ_scale != cqm_l3_scale) {
729                         pr_err("Multiple LLC scale values, disabling\n");
730                         ret = -EINVAL;
731                         goto out;
732                 }
733         }
734
735         snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
736         str = kstrdup(scale, GFP_KERNEL);
737         if (!str) {
738                 ret = -ENOMEM;
739                 goto out;
740         }
741
742         event_attr_intel_cqm_llc_scale.event_str = str;
743
744         ret = intel_cqm_setup_rmid_cache();
745         if (ret)
746                 goto out;
747
748         for_each_online_cpu(i) {
749                 intel_cqm_cpu_prepare(i);
750                 cqm_pick_event_reader(i);
751         }
752
753         __perf_cpu_notifier(intel_cqm_cpu_notifier);
754
755         ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm",
756                                 PERF_TYPE_INTEL_CQM);
757         if (ret)
758                 pr_err("Intel CQM perf registration failed: %d\n", ret);
759         else
760                 pr_info("Intel CQM monitoring enabled\n");
761
762 out:
763         cpu_notifier_register_done();
764
765         return ret;
766 }
767 device_initcall(intel_cqm_init);