perf/core: Fix dynamic interrupt throttle
author Peter Zijlstra <peterz@infradead.org>
Thu, 17 Mar 2016 14:17:35 +0000 (15:17 +0100)
committer Ingo Molnar <mingo@kernel.org>
Mon, 21 Mar 2016 08:08:17 +0000 (09:08 +0100)
There were two problems with the dynamic interrupt throttle mechanism,
both triggered by the same action.

When you (or perf_fuzzer) write a huge value into
/proc/sys/kernel/perf_event_max_sample_rate the computed
perf_sample_allowed_ns becomes 0. This effectively disables the whole
dynamic throttle.

This is fixed by ensuring update_perf_cpu_limits() never sets the
value to 0. However, we still allow the dynamic throttle to be disabled
explicitly by writing 100 to /proc/sys/kernel/perf_cpu_time_max_percent;
doing so will generate a warning in dmesg.

The second problem is that by setting the max_sample_rate to a huge
number, the adaptive process can take a few tries, since it halves the
limit each time. Change that to directly compute a new value based on
the observed duration.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
kernel/events/core.c

index d394773..43e35fa 100644 (file)
@@ -376,8 +376,11 @@ static void update_perf_cpu_limits(void)
        u64 tmp = perf_sample_period_ns;
 
        tmp *= sysctl_perf_cpu_time_max_percent;
-       do_div(tmp, 100);
-       ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
+       tmp = div_u64(tmp, 100);
+       if (!tmp)
+               tmp = 1;
+
+       WRITE_ONCE(perf_sample_allowed_ns, tmp);
 }
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -409,7 +412,13 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
        if (ret || !write)
                return ret;
 
-       update_perf_cpu_limits();
+       if (sysctl_perf_cpu_time_max_percent == 100) {
+               printk(KERN_WARNING
+                      "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
+               WRITE_ONCE(perf_sample_allowed_ns, 0);
+       } else {
+               update_perf_cpu_limits();
+       }
 
        return 0;
 }
@@ -423,62 +432,68 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 #define NR_ACCUMULATED_SAMPLES 128
 static DEFINE_PER_CPU(u64, running_sample_length);
 
+static u64 __report_avg;
+static u64 __report_allowed;
+
 static void perf_duration_warn(struct irq_work *w)
 {
-       u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
-       u64 avg_local_sample_len;
-       u64 local_samples_len;
-
-       local_samples_len = __this_cpu_read(running_sample_length);
-       avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
-
        printk_ratelimited(KERN_WARNING
-                       "perf interrupt took too long (%lld > %lld), lowering "
-                       "kernel.perf_event_max_sample_rate to %d\n",
-                       avg_local_sample_len, allowed_ns >> 1,
-                       sysctl_perf_event_sample_rate);
+               "perf: interrupt took too long (%lld > %lld), lowering "
+               "kernel.perf_event_max_sample_rate to %d\n",
+               __report_avg, __report_allowed,
+               sysctl_perf_event_sample_rate);
 }
 
 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
 
 void perf_sample_event_took(u64 sample_len_ns)
 {
-       u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
-       u64 avg_local_sample_len;
-       u64 local_samples_len;
+       u64 max_len = READ_ONCE(perf_sample_allowed_ns);
+       u64 running_len;
+       u64 avg_len;
+       u32 max;
 
-       if (allowed_ns == 0)
+       if (max_len == 0)
                return;
 
-       /* decay the counter by 1 average sample */
-       local_samples_len = __this_cpu_read(running_sample_length);
-       local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
-       local_samples_len += sample_len_ns;
-       __this_cpu_write(running_sample_length, local_samples_len);
+       /* Decay the counter by 1 average sample. */
+       running_len = __this_cpu_read(running_sample_length);
+       running_len -= running_len/NR_ACCUMULATED_SAMPLES;
+       running_len += sample_len_ns;
+       __this_cpu_write(running_sample_length, running_len);
 
        /*
-        * note: this will be biased artifically low until we have
-        * seen NR_ACCUMULATED_SAMPLES.  Doing it this way keeps us
+        * Note: this will be biased artifically low until we have
+        * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
         * from having to maintain a count.
         */
-       avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
-
-       if (avg_local_sample_len <= allowed_ns)
+       avg_len = running_len/NR_ACCUMULATED_SAMPLES;
+       if (avg_len <= max_len)
                return;
 
-       if (max_samples_per_tick <= 1)
-               return;
+       __report_avg = avg_len;
+       __report_allowed = max_len;
 
-       max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
-       sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
-       perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+       /*
+        * Compute a throttle threshold 25% below the current duration.
+        */
+       avg_len += avg_len / 4;
+       max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
+       if (avg_len < max)
+               max /= (u32)avg_len;
+       else
+               max = 1;
 
-       update_perf_cpu_limits();
+       WRITE_ONCE(perf_sample_allowed_ns, avg_len);
+       WRITE_ONCE(max_samples_per_tick, max);
+
+       sysctl_perf_event_sample_rate = max * HZ;
+       perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 
        if (!irq_work_queue(&perf_duration_work)) {
-               early_printk("perf interrupt took too long (%lld > %lld), lowering "
+               early_printk("perf: interrupt took too long (%lld > %lld), lowering "
                             "kernel.perf_event_max_sample_rate to %d\n",
-                            avg_local_sample_len, allowed_ns >> 1,
+                            __report_avg, __report_allowed,
                             sysctl_perf_event_sample_rate);
        }
 }