/*
 * Intel Cache Quality-of-Service Monitoring (CQM) support.
 *
 * Based very, very heavily on work by Peter Zijlstra.
 */
#include <linux/perf_event.h>
#include <linux/slab.h>
#include <asm/cpu_device_id.h>
#include "perf_event.h"

#define MSR_IA32_PQR_ASSOC	0x0c8f
#define MSR_IA32_QM_CTR		0x0c8e
#define MSR_IA32_QM_EVTSEL	0x0c8d

static unsigned int cqm_max_rmid = -1;
static unsigned int cqm_l3_scale; /* supposedly cacheline size */

struct intel_cqm_state {
	raw_spinlock_t		lock;
	int			rmid;
	int			cnt;
};

static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);

/*
 * Protects cache_groups and cqm_rmid_lru.
 */
static DEFINE_MUTEX(cache_mutex);

/*
 * Groups of events that have the same target(s), one RMID per group.
 */
static LIST_HEAD(cache_groups);

/*
 * Mask of CPUs for reading CQM values. We only need one per-socket.
 */
static cpumask_t cqm_cpumask;

#define RMID_VAL_ERROR		(1ULL << 63)
#define RMID_VAL_UNAVAIL	(1ULL << 62)

#define QOS_L3_OCCUP_EVENT_ID	(1 << 0)

#define QOS_EVENT_MASK	QOS_L3_OCCUP_EVENT_ID

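/*
 * Reading occupancy for an RMID is a two-MSR sequence, as done in
 * __rmid_read() below: write the event ID (low dword) and the RMID
 * (high dword) to IA32_QM_EVTSEL, then read the result back from
 * IA32_QM_CTR. Bits 63 and 62 of the counter are the error and
 * data-unavailable flags defined above, not count data.
 */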
static u64 __rmid_read(unsigned long rmid)
{
	u64 val;

	/*
	 * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
	 * it just says that to increase confusion.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
	rdmsrl(MSR_IA32_QM_CTR, val);

	/*
	 * Aside from the ERROR and UNAVAIL bits, assume this thing returns
	 * the number of cachelines tagged with @rmid.
	 */
	return val;
}

struct cqm_rmid_entry {
	u64 rmid;
	struct list_head list;
};

/*
 * A least recently used list of RMIDs.
 *
 * Oldest entry at the head, newest (most recently used) entry at the
 * tail. This list is never traversed, it's only used to keep track of
 * the lru order. That is, we only pick entries off the head or insert
 * them on the tail.
 *
 * All entries on the list are 'free', and their RMIDs are not currently
 * in use. To mark an RMID as in use, remove its entry from the lru
 * list.
 *
 * This list is protected by cache_mutex.
 */
static LIST_HEAD(cqm_rmid_lru);

/*
 * We use a simple array of pointers so that we can lookup a struct
 * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
 * and __put_rmid() from having to worry about dealing with struct
 * cqm_rmid_entry - they just deal with rmids, i.e. integers.
 *
 * Once this array is initialized it is read-only. No locks are required
 * to access it.
 *
 * All entries for all RMIDs can be looked up in this array at all
 * times.
 */
static struct cqm_rmid_entry **cqm_rmid_ptrs;

static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
{
	struct cqm_rmid_entry *entry;

	entry = cqm_rmid_ptrs[rmid];
	WARN_ON(entry->rmid != rmid);

	return entry;
}

/*
 * Returns < 0 on fail.
 *
 * We expect to be called with cache_mutex held.
 */
static int __get_rmid(void)
{
	struct cqm_rmid_entry *entry;

	lockdep_assert_held(&cache_mutex);

	if (list_empty(&cqm_rmid_lru))
		return -EAGAIN;

	entry = list_first_entry(&cqm_rmid_lru, struct cqm_rmid_entry, list);
	list_del(&entry->list);

	return entry->rmid;
}

static void __put_rmid(int rmid)
{
	struct cqm_rmid_entry *entry;

	lockdep_assert_held(&cache_mutex);

	entry = __rmid_entry(rmid);

	list_add_tail(&entry->list, &cqm_rmid_lru);
}

static int intel_cqm_setup_rmid_cache(void)
{
	struct cqm_rmid_entry *entry;
	int r;

	cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
				(cqm_max_rmid + 1), GFP_KERNEL);
	if (!cqm_rmid_ptrs)
		return -ENOMEM;

	for (r = 0; r <= cqm_max_rmid; r++) {
		struct cqm_rmid_entry *entry;

		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
		if (!entry)
			goto fail;

		INIT_LIST_HEAD(&entry->list);
		entry->rmid = r;
		cqm_rmid_ptrs[r] = entry;

		list_add_tail(&entry->list, &cqm_rmid_lru);
	}

	/*
	 * RMID 0 is special and is always allocated. It's used for all
	 * tasks that are not monitored.
	 */
	entry = __rmid_entry(0);
	list_del(&entry->list);

	return 0;
fail:
	while (r--)
		kfree(cqm_rmid_ptrs[r]);

	kfree(cqm_rmid_ptrs);
	return -ENOMEM;
}

/*
 * Determine if @a and @b measure the same set of tasks.
 *
 * If @a and @b measure the same set of tasks then we want to share a
 * single RMID.
 */
static bool __match_event(struct perf_event *a, struct perf_event *b)
{
	/* Per-cpu and task events don't mix */
	if ((a->attach_state & PERF_ATTACH_TASK) !=
	    (b->attach_state & PERF_ATTACH_TASK))
		return false;

#ifdef CONFIG_CGROUP_PERF
	if (a->cgrp != b->cgrp)
		return false;
#endif

	/* If not task event, we're machine wide */
	if (!(b->attach_state & PERF_ATTACH_TASK))
		return true;

	/*
	 * Events that target same task are placed into the same cache group.
	 */
	if (a->hw.cqm_target == b->hw.cqm_target)
		return true;

	/*
	 * Are we an inherited event?
	 */
	if (b->parent == a)
		return true;

	return false;
}

#ifdef CONFIG_CGROUP_PERF
static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
{
	if (event->attach_state & PERF_ATTACH_TASK)
		return perf_cgroup_from_task(event->hw.cqm_target);

	return event->cgrp;
}
#endif

/*
 * Determine if @a's tasks intersect with @b's tasks.
 *
 * There are combinations of events that we explicitly prohibit,
 *
 *		   PROHIBITS
 *	system-wide	->	cgroup and task
 *	cgroup		->	system-wide
 *			->	task in cgroup
 *	task		->	system-wide
 *			->	task in cgroup
 *
 * Call this function before allocating an RMID.
 */
static bool __conflict_event(struct perf_event *a, struct perf_event *b)
{
#ifdef CONFIG_CGROUP_PERF
	/*
	 * We can have any number of cgroups but only one system-wide
	 * event at a time.
	 */
	if (a->cgrp && b->cgrp) {
		struct perf_cgroup *ac = a->cgrp;
		struct perf_cgroup *bc = b->cgrp;

		/*
		 * This condition should have been caught in
		 * __match_event() and we should be sharing an RMID.
		 */
		WARN_ON_ONCE(ac == bc);

		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
			return true;

		return false;
	}

	if (a->cgrp || b->cgrp) {
		struct perf_cgroup *ac, *bc;

		/*
		 * cgroup and system-wide events are mutually exclusive
		 */
		if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
		    (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
			return true;

		/*
		 * Ensure neither event is part of the other's cgroup
		 */
		ac = event_to_cgroup(a);
		bc = event_to_cgroup(b);
		if (ac == bc)
			return true;

		/*
		 * Must have cgroup and non-intersecting task events.
		 */
		if (!ac || !bc)
			return false;

		/*
		 * We have cgroup and task events, and the task belongs
		 * to a cgroup. Check for overlap.
		 */
		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
			return true;

		return false;
	}
#endif
	/*
	 * If one of them is not a task, same story as above with cgroups.
	 */
	if (!(a->attach_state & PERF_ATTACH_TASK) ||
	    !(b->attach_state & PERF_ATTACH_TASK))
		return true;

	/*
	 * Must be non-overlapping.
	 */
	return false;
}

/*
 * Find a group and setup RMID.
 *
 * If we're part of a group, we use the group's RMID.
 */
static int intel_cqm_setup_event(struct perf_event *event,
				 struct perf_event **group)
{
	struct perf_event *iter;
	int rmid;

	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
		if (__match_event(iter, event)) {
			/* All tasks in a group share an RMID */
			event->hw.cqm_rmid = iter->hw.cqm_rmid;
			*group = iter;
			return 0;
		}

		if (__conflict_event(iter, event))
			return -EBUSY;
	}

	rmid = __get_rmid();
	if (rmid < 0)
		return rmid;

	event->hw.cqm_rmid = rmid;
	return 0;
}

static void intel_cqm_event_read(struct perf_event *event)
{
	unsigned long rmid;
	u64 val;

	/*
	 * Task events are handled by intel_cqm_event_count().
	 */
	if (event->cpu == -1)
		return;

	rmid = event->hw.cqm_rmid;
	val = __rmid_read(rmid);

	/*
	 * Ignore this reading on error states and do not update the value.
	 */
	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
		return;

	local64_set(&event->count, val);
}

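/*
 * Occupancy is tracked per L3 domain, so a task event cannot be read
 * entirely locally: intel_cqm_event_count() below sends
 * __intel_cqm_event_count() to one CPU per package (cqm_cpumask) and
 * accumulates the per-package readings in rr->value.
 */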
struct rmid_read {
	unsigned int rmid;
	atomic64_t value;
};

static void __intel_cqm_event_count(void *info)
{
	struct rmid_read *rr = info;
	u64 val;

	val = __rmid_read(rr->rmid);
	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
		return;

	atomic64_add(val, &rr->value);
}

static inline bool cqm_group_leader(struct perf_event *event)
{
	return !list_empty(&event->hw.cqm_groups_entry);
}

static u64 intel_cqm_event_count(struct perf_event *event)
{
	struct rmid_read rr = {
		.rmid = event->hw.cqm_rmid,
		.value = ATOMIC64_INIT(0),
	};

	/*
	 * We only need to worry about task events. System-wide events
	 * are handled like usual, i.e. entirely with
	 * intel_cqm_event_read().
	 */
	if (event->cpu != -1)
		return __perf_event_count(event);

	/*
	 * Only the group leader gets to report values. This stops us
	 * reporting duplicate values to userspace, and gives us a clear
	 * rule for which task gets to report the values.
	 *
	 * Note that it is impossible to attribute these values to
	 * specific packages - we forfeit that ability when we create
	 * task events.
	 */
	if (!cqm_group_leader(event))
		return 0;

	on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);

	local64_set(&event->count, atomic64_read(&rr.value));

	return __perf_event_count(event);
}

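/*
 * start/stop program IA32_PQR_ASSOC on the local CPU: start writes the
 * event's RMID so that subsequent cache allocations by this CPU are
 * tagged with it, and stop reverts to RMID 0 (unmonitored) once no
 * monitored events remain scheduled on the CPU.
 */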
static void intel_cqm_event_start(struct perf_event *event, int mode)
{
	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
	unsigned long rmid = event->hw.cqm_rmid;
	unsigned long flags;

	if (!(event->hw.cqm_state & PERF_HES_STOPPED))
		return;

	event->hw.cqm_state &= ~PERF_HES_STOPPED;

	raw_spin_lock_irqsave(&state->lock, flags);

	if (state->cnt++)
		WARN_ON_ONCE(state->rmid != rmid);
	else
		WARN_ON_ONCE(state->rmid);

	state->rmid = rmid;
	wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);

	raw_spin_unlock_irqrestore(&state->lock, flags);
}

static void intel_cqm_event_stop(struct perf_event *event, int mode)
{
	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
	unsigned long flags;

	if (event->hw.cqm_state & PERF_HES_STOPPED)
		return;

	event->hw.cqm_state |= PERF_HES_STOPPED;

	raw_spin_lock_irqsave(&state->lock, flags);
	intel_cqm_event_read(event);

	if (!--state->cnt) {
		state->rmid = 0;
		wrmsrl(MSR_IA32_PQR_ASSOC, 0);
	} else {
		WARN_ON_ONCE(!state->rmid);
	}

	raw_spin_unlock_irqrestore(&state->lock, flags);
}

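/*
 * pmu::add/del callbacks. Events arrive in the stopped state and only
 * start tagging cachelines (via intel_cqm_event_start()) when
 * PERF_EF_START is set.
 */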
static int intel_cqm_event_add(struct perf_event *event, int mode)
{
	int rmid;

	event->hw.cqm_state = PERF_HES_STOPPED;
	rmid = event->hw.cqm_rmid;
	WARN_ON_ONCE(!rmid);

	if (mode & PERF_EF_START)
		intel_cqm_event_start(event, mode);

	return 0;
}

static void intel_cqm_event_del(struct perf_event *event, int mode)
{
	intel_cqm_event_stop(event, mode);
}

static void intel_cqm_event_destroy(struct perf_event *event)
{
	struct perf_event *group_other = NULL;

	mutex_lock(&cache_mutex);

	/*
	 * If there's another event in this group...
	 */
	if (!list_empty(&event->hw.cqm_group_entry)) {
		group_other = list_first_entry(&event->hw.cqm_group_entry,
					       struct perf_event,
					       hw.cqm_group_entry);
		list_del(&event->hw.cqm_group_entry);
	}

	/*
	 * And we're the group leader..
	 */
	if (cqm_group_leader(event)) {
		/*
		 * If there was a group_other, make that leader, otherwise
		 * destroy the group and return the RMID.
		 */
		if (group_other) {
			list_replace(&event->hw.cqm_groups_entry,
				     &group_other->hw.cqm_groups_entry);
		} else {
			int rmid = event->hw.cqm_rmid;

			__put_rmid(rmid);
			list_del(&event->hw.cqm_groups_entry);
		}
	}

	mutex_unlock(&cache_mutex);
}

static struct pmu intel_cqm_pmu;

static int intel_cqm_event_init(struct perf_event *event)
{
	struct perf_event *group = NULL;
	int err;

	if (event->attr.type != intel_cqm_pmu.type)
		return -ENOENT;

	if (event->attr.config & ~QOS_EVENT_MASK)
		return -EINVAL;

	/* unsupported modes and filters */
	if (event->attr.exclude_user   ||
	    event->attr.exclude_kernel ||
	    event->attr.exclude_hv     ||
	    event->attr.exclude_idle   ||
	    event->attr.exclude_host   ||
	    event->attr.exclude_guest  ||
	    event->attr.sample_period) /* no sampling */
		return -EINVAL;

	INIT_LIST_HEAD(&event->hw.cqm_group_entry);
	INIT_LIST_HEAD(&event->hw.cqm_groups_entry);

	event->destroy = intel_cqm_event_destroy;

	mutex_lock(&cache_mutex);

	/* Will also set rmid */
	err = intel_cqm_setup_event(event, &group);
	if (err)
		goto out;

	if (group) {
		list_add_tail(&event->hw.cqm_group_entry,
			      &group->hw.cqm_group_entry);
	} else {
		list_add_tail(&event->hw.cqm_groups_entry,
			      &cache_groups);
	}

out:
	mutex_unlock(&cache_mutex);

	return err;
}

EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");

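/*
 * These attributes appear under /sys/bus/event_source/devices/intel_cqm/
 * once the PMU is registered, so the event can be requested by name.
 * An illustrative invocation (not part of this file) would be:
 *
 *	perf stat -a -e intel_cqm/llc_occupancy/ -- sleep 1
 *
 * where the "unit" and "scale" attributes let the perf tool convert the
 * raw cacheline count into bytes.
 */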
static struct attribute *intel_cqm_events_attr[] = {
	EVENT_PTR(intel_cqm_llc),
	EVENT_PTR(intel_cqm_llc_pkg),
	EVENT_PTR(intel_cqm_llc_unit),
	EVENT_PTR(intel_cqm_llc_scale),
	EVENT_PTR(intel_cqm_llc_snapshot),
	NULL,
};

static struct attribute_group intel_cqm_events_group = {
	.name = "events",
	.attrs = intel_cqm_events_attr,
};

PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *intel_cqm_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group intel_cqm_format_group = {
	.name = "format",
	.attrs = intel_cqm_formats_attr,
};

static const struct attribute_group *intel_cqm_attr_groups[] = {
	&intel_cqm_events_group,
	&intel_cqm_format_group,
	NULL,
};

static struct pmu intel_cqm_pmu = {
	.attr_groups	= intel_cqm_attr_groups,
	.task_ctx_nr	= perf_sw_context,
	.event_init	= intel_cqm_event_init,
	.add		= intel_cqm_event_add,
	.del		= intel_cqm_event_del,
	.start		= intel_cqm_event_start,
	.stop		= intel_cqm_event_stop,
	.read		= intel_cqm_event_read,
	.count		= intel_cqm_event_count,
};

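/*
 * CPU hotplug handling below keeps exactly one "reader" CPU per
 * physical package in cqm_cpumask: cqm_pick_event_reader() nominates a
 * CPU when a package comes up, and intel_cqm_cpu_exit() hands the role
 * to another online CPU in the same package when the reader goes
 * offline.
 */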
static inline void cqm_pick_event_reader(int cpu)
{
	int phys_id = topology_physical_package_id(cpu);
	int i;

	for_each_cpu(i, &cqm_cpumask) {
		if (phys_id == topology_physical_package_id(i))
			return;	/* already got reader for this socket */
	}

	cpumask_set_cpu(cpu, &cqm_cpumask);
}

static void intel_cqm_cpu_prepare(unsigned int cpu)
{
	struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
	struct cpuinfo_x86 *c = &cpu_data(cpu);

	raw_spin_lock_init(&state->lock);
	state->rmid = 0;
	state->cnt  = 0;

	WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
	WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
}

static void intel_cqm_cpu_exit(unsigned int cpu)
{
	int phys_id = topology_physical_package_id(cpu);
	int i;

	/*
	 * Is @cpu a designated cqm reader?
	 */
	if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
		return;

	for_each_online_cpu(i) {
		if (i == cpu)
			continue;

		if (phys_id == topology_physical_package_id(i)) {
			cpumask_set_cpu(i, &cqm_cpumask);
			break;
		}
	}
}

static int intel_cqm_cpu_notifier(struct notifier_block *nb,
				  unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		intel_cqm_cpu_prepare(cpu);
		break;
	case CPU_DOWN_PREPARE:
		intel_cqm_cpu_exit(cpu);
		break;
	case CPU_STARTING:
		cqm_pick_event_reader(cpu);
		break;
	}

	return NOTIFY_OK;
}

static const struct x86_cpu_id intel_cqm_match[] = {
	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
	{}
};

static int __init intel_cqm_init(void)
{
	char *str, scale[20];
	int i, cpu, ret;

	if (!x86_match_cpu(intel_cqm_match))
		return -ENODEV;

	cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;

	/*
	 * It's possible that not all resources support the same number
	 * of RMIDs. Instead of making scheduling much more complicated
	 * (where we have to match a task's RMID to a cpu that supports
	 * that many RMIDs) just find the minimum RMIDs supported across
	 * all cpus.
	 *
	 * Also, check that the scales match on all cpus.
	 */
	cpu_notifier_register_begin();

	for_each_online_cpu(cpu) {
		struct cpuinfo_x86 *c = &cpu_data(cpu);

		if (c->x86_cache_max_rmid < cqm_max_rmid)
			cqm_max_rmid = c->x86_cache_max_rmid;

		if (c->x86_cache_occ_scale != cqm_l3_scale) {
			pr_err("Multiple LLC scale values, disabling\n");
			ret = -EINVAL;
			goto out;
		}
	}

	snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
	str = kstrdup(scale, GFP_KERNEL);
	if (!str) {
		ret = -ENOMEM;
		goto out;
	}

	event_attr_intel_cqm_llc_scale.event_str = str;

	ret = intel_cqm_setup_rmid_cache();
	if (ret)
		goto out;

	for_each_online_cpu(i) {
		intel_cqm_cpu_prepare(i);
		cqm_pick_event_reader(i);
	}

	__perf_cpu_notifier(intel_cqm_cpu_notifier);

	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm",
				PERF_TYPE_INTEL_CQM);
	if (ret)
		pr_err("Intel CQM perf registration failed: %d\n", ret);
	else
		pr_info("Intel CQM monitoring enabled\n");

out:
	cpu_notifier_register_done();

	return ret;
}
device_initcall(intel_cqm_init);