cpufreq: governor: Close dbs_data update race condition
/*
 * drivers/cpufreq/cpufreq_governor.c
 *
 * CPUFREQ governors common code
 *
 * Copyright    (C) 2001 Russell King
 *              (C) 2003 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>.
 *              (C) 2003 Jun Nakajima <jun.nakajima@intel.com>
 *              (C) 2009 Alexander Clouter <alex@digriz.org.uk>
 *              (c) 2012 Viresh Kumar <viresh.kumar@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/export.h>
#include <linux/kernel_stat.h>
#include <linux/slab.h>

#include "cpufreq_governor.h"

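/* Serializes governor event handling in cpufreq_governor_dbs(). */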
DEFINE_MUTEX(dbs_data_mutex);
EXPORT_SYMBOL_GPL(dbs_data_mutex);

/* Common sysfs tunables */
/**
 * store_sampling_rate - update sampling rate effective immediately if needed.
 *
 * If the new rate is smaller than the old one, simply updating
 * dbs_data->sampling_rate might not be appropriate.  For example, if the
 * original sampling_rate was 1 second and the requested new rate is 10 ms
 * because the user needs an immediate reaction from the ondemand governor,
 * the governor may pick up the change too late, up to 1 second later.  Thus,
 * when the sampling rate is reduced, the new value must be made effective
 * immediately.
 *
 * This must be called with dbs_data->mutex held, otherwise traversing
 * policy_dbs_list isn't safe.
 */
ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf,
                            size_t count)
{
        struct policy_dbs_info *policy_dbs;
        unsigned int rate;
        int ret;

        ret = sscanf(buf, "%u", &rate);
        if (ret != 1)
                return -EINVAL;

        dbs_data->sampling_rate = max(rate, dbs_data->min_sampling_rate);

        /*
         * We are operating under dbs_data->mutex and so the list and its
         * entries can't be freed concurrently.
         */
        list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list) {
                mutex_lock(&policy_dbs->timer_mutex);
                /*
                 * On 32-bit architectures this may race with the
                 * sample_delay_ns read in dbs_update_util_handler(), but that
                 * really doesn't matter.  If the read returns a value that's
                 * too big, the sample will be skipped, but the next invocation
                 * of dbs_update_util_handler() (when the update has been
                 * completed) will take a sample.
                 *
                 * If this runs in parallel with dbs_work_handler(), we may end
                 * up overwriting the sample_delay_ns value that it has just
                 * written, but it will be corrected next time a sample is
                 * taken, so it shouldn't be significant.
                 */
                gov_update_sample_delay(policy_dbs, 0);
                mutex_unlock(&policy_dbs->timer_mutex);
        }

        return count;
}
EXPORT_SYMBOL_GPL(store_sampling_rate);

static inline struct dbs_data *to_dbs_data(struct kobject *kobj)
{
        return container_of(kobj, struct dbs_data, kobj);
}

static inline struct governor_attr *to_gov_attr(struct attribute *attr)
{
        return container_of(attr, struct governor_attr, attr);
}

static ssize_t governor_show(struct kobject *kobj, struct attribute *attr,
                             char *buf)
{
        struct dbs_data *dbs_data = to_dbs_data(kobj);
        struct governor_attr *gattr = to_gov_attr(attr);
        int ret = -EIO;

        if (gattr->show)
                ret = gattr->show(dbs_data, buf);

        return ret;
}

static ssize_t governor_store(struct kobject *kobj, struct attribute *attr,
                              const char *buf, size_t count)
{
        struct dbs_data *dbs_data = to_dbs_data(kobj);
        struct governor_attr *gattr = to_gov_attr(attr);
        int ret = -EIO;

        mutex_lock(&dbs_data->mutex);

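        /*
         * The governor may be going away in parallel with this store.
         * Checking usage_count under dbs_data->mutex ensures the tunable is
         * only updated while dbs_data is still in use, which closes the race
         * with cpufreq_governor_exit().
         */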
        if (dbs_data->usage_count && gattr->store)
                ret = gattr->store(dbs_data, buf, count);

        mutex_unlock(&dbs_data->mutex);

        return ret;
}

/*
 * Sysfs Ops for accessing governor attributes.
 *
 * All show/store invocations for governor-specific sysfs attributes go
 * through the show/store callbacks below, which then invoke the
 * attribute-specific callback.
 */
static const struct sysfs_ops governor_sysfs_ops = {
        .show   = governor_show,
        .store  = governor_store,
};

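/*
 * dbs_update - compute the load used for frequency selection.
 *
 * Returns the maximum load (in percent) seen on any CPU of the policy over
 * the interval since the previous sample.  Called from the governors'
 * ->gov_dbs_timer() callbacks under policy_dbs->timer_mutex.
 */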
unsigned int dbs_update(struct cpufreq_policy *policy)
{
        struct dbs_governor *gov = dbs_governor_of(policy);
        struct policy_dbs_info *policy_dbs = policy->governor_data;
        struct dbs_data *dbs_data = policy_dbs->dbs_data;
        struct od_dbs_tuners *od_tuners = dbs_data->tuners;
        unsigned int ignore_nice = dbs_data->ignore_nice_load;
        unsigned int max_load = 0;
        unsigned int sampling_rate, j;

        /*
         * Sometimes governors may use an additional multiplier to increase
         * sample delays temporarily.  Apply that multiplier to sampling_rate
         * so as to keep the wake-up-from-idle detection logic a bit
         * conservative.
         */
        sampling_rate = dbs_data->sampling_rate * policy_dbs->rate_mult;

        /* Get Absolute Load */
        for_each_cpu(j, policy->cpus) {
                struct cpu_dbs_info *j_cdbs;
                u64 cur_wall_time, cur_idle_time;
                unsigned int idle_time, wall_time;
                unsigned int load;
                int io_busy = 0;

                j_cdbs = gov->get_cpu_cdbs(j);

                /*
                 * For the purpose of ondemand, waiting for disk IO is
                 * an indication that you're performance critical, and
                 * not that the system is actually idle. So do not add
                 * the iowait time to the cpu idle time.
                 */
                if (gov->governor == GOV_ONDEMAND)
                        io_busy = od_tuners->io_is_busy;
                cur_idle_time = get_cpu_idle_time(j, &cur_wall_time, io_busy);

                wall_time = cur_wall_time - j_cdbs->prev_cpu_wall;
                j_cdbs->prev_cpu_wall = cur_wall_time;

                if (cur_idle_time <= j_cdbs->prev_cpu_idle) {
                        idle_time = 0;
                } else {
                        idle_time = cur_idle_time - j_cdbs->prev_cpu_idle;
                        j_cdbs->prev_cpu_idle = cur_idle_time;
                }

                if (ignore_nice) {
                        u64 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];

                        idle_time += cputime_to_usecs(cur_nice - j_cdbs->prev_cpu_nice);
                        j_cdbs->prev_cpu_nice = cur_nice;
                }

                if (unlikely(!wall_time || wall_time < idle_time))
                        continue;

                /*
                 * If the CPU had gone completely idle, and a task just woke up
                 * on this CPU now, it would be unfair to calculate 'load' the
                 * usual way for this elapsed time-window, because it will show
                 * near-zero load, irrespective of how CPU intensive that task
                 * actually is. This is undesirable for latency-sensitive bursty
                 * workloads.
                 *
                 * To avoid this, we reuse the 'load' from the previous
                 * time-window and give this task a chance to start with a
                 * reasonably high CPU frequency. (However, we shouldn't over-do
                 * this copy, lest we get stuck at a high load (high frequency)
                 * for too long, even when the current system load has actually
                 * dropped down. So we perform the copy only once, upon the
                 * first wake-up from idle.)
                 *
                 * Detecting this situation is easy: the governor's utilization
                 * update handler would not have run during CPU-idle periods.
                 * Hence, an unusually large 'wall_time' (as compared to the
                 * sampling rate) indicates this scenario.
                 *
                 * prev_load can be zero in two cases and we must recalculate it
                 * for both cases:
                 * - during long idle intervals
                 * - explicitly set to zero
                 */
                if (unlikely(wall_time > (2 * sampling_rate) &&
                             j_cdbs->prev_load)) {
                        load = j_cdbs->prev_load;

                        /*
                         * Perform a destructive copy, to ensure that we copy
                         * the previous load only once, upon the first wake-up
                         * from idle.
                         */
                        j_cdbs->prev_load = 0;
                } else {
                        load = 100 * (wall_time - idle_time) / wall_time;
                        j_cdbs->prev_load = load;
                }

                if (load > max_load)
                        max_load = load;
        }
        return max_load;
}
EXPORT_SYMBOL_GPL(dbs_update);

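/*
 * Install the utilization update hook on every CPU of the policy and arm the
 * initial sample delay, which starts the governor's periodic sampling.
 */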
void gov_set_update_util(struct policy_dbs_info *policy_dbs,
                         unsigned int delay_us)
{
        struct cpufreq_policy *policy = policy_dbs->policy;
        struct dbs_governor *gov = dbs_governor_of(policy);
        int cpu;

        gov_update_sample_delay(policy_dbs, delay_us);
        policy_dbs->last_sample_time = 0;

        for_each_cpu(cpu, policy->cpus) {
                struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu);

                cpufreq_set_update_util_data(cpu, &cdbs->update_util);
        }
}
EXPORT_SYMBOL_GPL(gov_set_update_util);

static inline void gov_clear_update_util(struct cpufreq_policy *policy)
{
        int i;

        for_each_cpu(i, policy->cpus)
                cpufreq_set_update_util_data(i, NULL);

        synchronize_rcu();
}

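/*
 * Stop the governor's periodic activity: detach the utilization update hooks
 * (gov_clear_update_util() waits for in-flight callbacks via
 * synchronize_rcu()), then flush any pending irq_work and work items.
 */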
static void gov_cancel_work(struct cpufreq_policy *policy)
{
        struct policy_dbs_info *policy_dbs = policy->governor_data;

        gov_clear_update_util(policy_dbs->policy);
        irq_work_sync(&policy_dbs->irq_work);
        cancel_work_sync(&policy_dbs->work);
        atomic_set(&policy_dbs->work_count, 0);
        policy_dbs->work_in_progress = false;
}

static void dbs_work_handler(struct work_struct *work)
{
        struct policy_dbs_info *policy_dbs;
        struct cpufreq_policy *policy;
        struct dbs_governor *gov;

        policy_dbs = container_of(work, struct policy_dbs_info, work);
        policy = policy_dbs->policy;
        gov = dbs_governor_of(policy);

        /*
         * Make sure cpufreq_governor_limits() isn't evaluating load or the
         * ondemand governor isn't updating the sampling rate in parallel.
         */
        mutex_lock(&policy_dbs->timer_mutex);
        gov_update_sample_delay(policy_dbs, gov->gov_dbs_timer(policy));
        mutex_unlock(&policy_dbs->timer_mutex);

        /* Allow the utilization update handler to queue up more work. */
        atomic_set(&policy_dbs->work_count, 0);
        /*
         * If the update below is reordered with respect to the sample delay
         * modification, the utilization update handler may end up using a stale
         * sample delay value.
         */
        smp_wmb();
        policy_dbs->work_in_progress = false;
}

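/* The irq_work handler just hands the sample off to process context. */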
static void dbs_irq_work(struct irq_work *irq_work)
{
        struct policy_dbs_info *policy_dbs;

        policy_dbs = container_of(irq_work, struct policy_dbs_info, irq_work);
        schedule_work(&policy_dbs->work);
}

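/*
 * Utilization update callback, invoked by the scheduler on every CPU that
 * gov_set_update_util() registered.  It decides whether enough time has
 * passed since the last sample and, if so, queues the irq_work that leads to
 * dbs_work_handler() taking a new sample.
 */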
static void dbs_update_util_handler(struct update_util_data *data, u64 time,
                                    unsigned long util, unsigned long max)
{
        struct cpu_dbs_info *cdbs = container_of(data, struct cpu_dbs_info, update_util);
        struct policy_dbs_info *policy_dbs = cdbs->policy_dbs;
        u64 delta_ns;

        /*
         * The work may not be allowed to be queued up right now.
         * Possible reasons:
         * - Work has already been queued up or is in progress.
         * - It is too early (too little time from the previous sample).
         */
        if (policy_dbs->work_in_progress)
                return;

        /*
         * If the reads below are reordered before the check above, the value
         * of sample_delay_ns used in the computation may be stale.
         */
        smp_rmb();
        delta_ns = time - policy_dbs->last_sample_time;
        if ((s64)delta_ns < policy_dbs->sample_delay_ns)
                return;

        /*
         * If the policy is not shared, the irq_work may be queued up right away
         * at this point.  Otherwise, we need to ensure that only one of the
         * CPUs sharing the policy will do that.
         */
        if (policy_dbs->is_shared &&
            !atomic_add_unless(&policy_dbs->work_count, 1, 1))
                return;

        policy_dbs->last_sample_time = time;
        policy_dbs->work_in_progress = true;
        irq_work_queue(&policy_dbs->irq_work);
}

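/*
 * Allocate the per-policy governor data and point every CPU of the policy
 * (online and offline, i.e. policy->related_cpus) at it through its
 * cpu_dbs_info.
 */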
static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *policy,
                                                     struct dbs_governor *gov)
{
        struct policy_dbs_info *policy_dbs;
        int j;

        /* Allocate memory for the common information for policy->cpus */
        policy_dbs = kzalloc(sizeof(*policy_dbs), GFP_KERNEL);
        if (!policy_dbs)
                return NULL;

        policy_dbs->policy = policy;
        mutex_init(&policy_dbs->timer_mutex);
        atomic_set(&policy_dbs->work_count, 0);
        init_irq_work(&policy_dbs->irq_work, dbs_irq_work);
        INIT_WORK(&policy_dbs->work, dbs_work_handler);

        /* Set policy_dbs for all CPUs, online+offline */
        for_each_cpu(j, policy->related_cpus) {
                struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j);

                j_cdbs->policy_dbs = policy_dbs;
                j_cdbs->update_util.func = dbs_update_util_handler;
        }
        return policy_dbs;
}

static void free_policy_dbs_info(struct cpufreq_policy *policy,
                                 struct dbs_governor *gov)
{
        struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(policy->cpu);
        struct policy_dbs_info *policy_dbs = cdbs->policy_dbs;
        int j;

        mutex_destroy(&policy_dbs->timer_mutex);

        for_each_cpu(j, policy->related_cpus) {
                struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j);

                j_cdbs->policy_dbs = NULL;
                j_cdbs->update_util.func = NULL;
        }
        kfree(policy_dbs);
}

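/*
 * Governor initialization: dbs_data holds the tunables.  With
 * governor-per-policy each policy gets its own dbs_data; otherwise a single
 * dbs_data is shared by all policies using this governor and refcounted via
 * usage_count under dbs_data->mutex.
 */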
static int cpufreq_governor_init(struct cpufreq_policy *policy)
{
        struct dbs_governor *gov = dbs_governor_of(policy);
        struct dbs_data *dbs_data = gov->gdbs_data;
        struct policy_dbs_info *policy_dbs;
        unsigned int latency;
        int ret;

        /* State should be equivalent to EXIT */
        if (policy->governor_data)
                return -EBUSY;

        policy_dbs = alloc_policy_dbs_info(policy, gov);
        if (!policy_dbs)
                return -ENOMEM;

        if (dbs_data) {
                if (WARN_ON(have_governor_per_policy())) {
                        ret = -EINVAL;
                        goto free_policy_dbs_info;
                }
                policy_dbs->dbs_data = dbs_data;
                policy->governor_data = policy_dbs;

                mutex_lock(&dbs_data->mutex);
                dbs_data->usage_count++;
                list_add(&policy_dbs->list, &dbs_data->policy_dbs_list);
                mutex_unlock(&dbs_data->mutex);

                return 0;
        }

        dbs_data = kzalloc(sizeof(*dbs_data), GFP_KERNEL);
        if (!dbs_data) {
                ret = -ENOMEM;
                goto free_policy_dbs_info;
        }

        INIT_LIST_HEAD(&dbs_data->policy_dbs_list);
        mutex_init(&dbs_data->mutex);

        ret = gov->init(dbs_data, !policy->governor->initialized);
        if (ret)
                goto free_policy_dbs_info;

        /* policy latency is in ns. Convert it to us first */
        latency = policy->cpuinfo.transition_latency / 1000;
        if (latency == 0)
                latency = 1;

        /* Bring kernel and HW constraints together */
        dbs_data->min_sampling_rate = max(dbs_data->min_sampling_rate,
                                          MIN_LATENCY_MULTIPLIER * latency);
        dbs_data->sampling_rate = max(dbs_data->min_sampling_rate,
                                      LATENCY_MULTIPLIER * latency);

        if (!have_governor_per_policy())
                gov->gdbs_data = dbs_data;

        policy->governor_data = policy_dbs;

        policy_dbs->dbs_data = dbs_data;
        dbs_data->usage_count = 1;
        list_add(&policy_dbs->list, &dbs_data->policy_dbs_list);

        gov->kobj_type.sysfs_ops = &governor_sysfs_ops;
        ret = kobject_init_and_add(&dbs_data->kobj, &gov->kobj_type,
                                   get_governor_parent_kobj(policy),
                                   "%s", gov->gov.name);
        if (!ret)
                return 0;

        /* Failure, so roll back. */
        pr_err("cpufreq: Governor initialization failed (dbs_data kobject init error %d)\n", ret);

        policy->governor_data = NULL;

        if (!have_governor_per_policy())
                gov->gdbs_data = NULL;
        gov->exit(dbs_data, !policy->governor->initialized);
        kfree(dbs_data);

free_policy_dbs_info:
        free_policy_dbs_info(policy, gov);
        return ret;
}

static int cpufreq_governor_exit(struct cpufreq_policy *policy)
{
        struct dbs_governor *gov = dbs_governor_of(policy);
        struct policy_dbs_info *policy_dbs = policy->governor_data;
        struct dbs_data *dbs_data = policy_dbs->dbs_data;
        int count;

        mutex_lock(&dbs_data->mutex);
        list_del(&policy_dbs->list);
        count = --dbs_data->usage_count;
        mutex_unlock(&dbs_data->mutex);

        if (!count) {
                kobject_put(&dbs_data->kobj);

                policy->governor_data = NULL;

                if (!have_governor_per_policy())
                        gov->gdbs_data = NULL;

                gov->exit(dbs_data, policy->governor->initialized == 1);
                mutex_destroy(&dbs_data->mutex);
                kfree(dbs_data);
        } else {
                policy->governor_data = NULL;
        }

        free_policy_dbs_info(policy, gov);
        return 0;
}

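/*
 * Prime the per-CPU sample state (previous wall/idle/nice times and
 * prev_load), reset the governor-specific per-CPU data, and install the
 * utilization update hooks to start sampling.
 */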
static int cpufreq_governor_start(struct cpufreq_policy *policy)
{
        struct dbs_governor *gov = dbs_governor_of(policy);
        struct policy_dbs_info *policy_dbs = policy->governor_data;
        struct dbs_data *dbs_data = policy_dbs->dbs_data;
        unsigned int sampling_rate, ignore_nice, j, cpu = policy->cpu;
        int io_busy = 0;

        if (!policy->cur)
                return -EINVAL;

        policy_dbs->is_shared = policy_is_shared(policy);
        policy_dbs->rate_mult = 1;

        sampling_rate = dbs_data->sampling_rate;
        ignore_nice = dbs_data->ignore_nice_load;

        if (gov->governor == GOV_ONDEMAND) {
                struct od_dbs_tuners *od_tuners = dbs_data->tuners;

                io_busy = od_tuners->io_is_busy;
        }

        for_each_cpu(j, policy->cpus) {
                struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j);
                unsigned int prev_load;

                j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, &j_cdbs->prev_cpu_wall, io_busy);

                prev_load = j_cdbs->prev_cpu_wall - j_cdbs->prev_cpu_idle;
                j_cdbs->prev_load = 100 * prev_load / (unsigned int)j_cdbs->prev_cpu_wall;

                if (ignore_nice)
                        j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
        }

        if (gov->governor == GOV_CONSERVATIVE) {
                struct cs_cpu_dbs_info_s *cs_dbs_info =
                        gov->get_cpu_dbs_info_s(cpu);

                cs_dbs_info->down_skip = 0;
                cs_dbs_info->requested_freq = policy->cur;
        } else {
                struct od_ops *od_ops = gov->gov_ops;
                struct od_cpu_dbs_info_s *od_dbs_info = gov->get_cpu_dbs_info_s(cpu);

                od_dbs_info->sample_type = OD_NORMAL_SAMPLE;
                od_ops->powersave_bias_init_cpu(cpu);
        }

        gov_set_update_util(policy_dbs, sampling_rate);
        return 0;
}

static int cpufreq_governor_stop(struct cpufreq_policy *policy)
{
        gov_cancel_work(policy);

        return 0;
}

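/*
 * Called when policy->min/max change: clamp the current frequency into the
 * new limits and force an immediate re-evaluation by zeroing the sample
 * delay.
 */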
static int cpufreq_governor_limits(struct cpufreq_policy *policy)
{
        struct policy_dbs_info *policy_dbs = policy->governor_data;

        mutex_lock(&policy_dbs->timer_mutex);

        if (policy->max < policy->cur)
                __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
        else if (policy->min > policy->cur)
                __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);

        gov_update_sample_delay(policy_dbs, 0);

        mutex_unlock(&policy_dbs->timer_mutex);

        return 0;
}

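/*
 * Common governor entry point: dispatches the INIT/EXIT/START/STOP/LIMITS
 * events from the cpufreq core, serialized by dbs_data_mutex.
 */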
int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event)
{
        int ret = -EINVAL;

        /* Lock governor to block concurrent initialization of governor */
        mutex_lock(&dbs_data_mutex);

        if (event == CPUFREQ_GOV_POLICY_INIT) {
                ret = cpufreq_governor_init(policy);
        } else if (policy->governor_data) {
                switch (event) {
                case CPUFREQ_GOV_POLICY_EXIT:
                        ret = cpufreq_governor_exit(policy);
                        break;
                case CPUFREQ_GOV_START:
                        ret = cpufreq_governor_start(policy);
                        break;
                case CPUFREQ_GOV_STOP:
                        ret = cpufreq_governor_stop(policy);
                        break;
                case CPUFREQ_GOV_LIMITS:
                        ret = cpufreq_governor_limits(policy);
                        break;
                }
        }

        mutex_unlock(&dbs_data_mutex);
        return ret;
}
EXPORT_SYMBOL_GPL(cpufreq_governor_dbs);