2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
23 #include <linux/sched.h>
24 #include <linux/latencytop.h>
25 #include <linux/cpumask.h>
26 #include <linux/cpuidle.h>
27 #include <linux/slab.h>
28 #include <linux/profile.h>
29 #include <linux/interrupt.h>
30 #include <linux/mempolicy.h>
31 #include <linux/migrate.h>
32 #include <linux/task_work.h>
34 #include <trace/events/sched.h>
39 * Targeted preemption latency for CPU-bound tasks:
40 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
42 * NOTE: this latency value is not the same as the concept of
43 * 'timeslice length' - timeslices in CFS are of variable length
44 * and have no persistent notion like in traditional, time-slice
45 * based scheduling concepts.
47 * (to see the precise effective timeslice length of your workload,
48 * run vmstat and monitor the context-switches (cs) field)
50 unsigned int sysctl_sched_latency = 6000000ULL;
51 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
54 * The initial- and re-scaling of tunables is configurable
55 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
58 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
59 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
60 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
62 enum sched_tunable_scaling sysctl_sched_tunable_scaling
63 = SCHED_TUNABLESCALING_LOG;
66 * Minimal preemption granularity for CPU-bound tasks:
67 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
69 unsigned int sysctl_sched_min_granularity = 750000ULL;
70 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
73 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
75 static unsigned int sched_nr_latency = 8;
78 * After fork, child runs first. If set to 0 (default) then
79 * parent will (try to) run first.
81 unsigned int sysctl_sched_child_runs_first __read_mostly;
84 * SCHED_OTHER wake-up granularity.
85 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
87 * This option delays the preemption effects of decoupled workloads
88 * and reduces their over-scheduling. Synchronous workloads will still
89 * have immediate wakeup/sleep latencies.
91 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
92 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
94 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
97 * The exponential sliding window over which load is averaged for shares
101 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
103 #ifdef CONFIG_CFS_BANDWIDTH
105 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
106 * each time a cfs_rq requests quota.
108 * Note: in the case that the slice exceeds the runtime remaining (either due
109 * to consumption or the quota being specified to be smaller than the slice)
110 * we will always only issue the remaining available time.
112 * default: 5 msec, units: microseconds
114 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
118 * The margin used when comparing utilization with CPU capacity:
119 * util * 1024 < capacity * margin
121 unsigned int capacity_margin = 1280; /* ~20% */
123 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
129 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
135 static inline void update_load_set(struct load_weight *lw, unsigned long w)
142 * Increase the granularity value when there are more CPUs,
143 * because with more CPUs the 'effective latency' as visible
144 * to users decreases. But the relationship is not linear,
145 * so pick a second-best guess by going with the log2 of the
148 * This idea comes from the SD scheduler of Con Kolivas:
150 static unsigned int get_update_sysctl_factor(void)
152 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
155 switch (sysctl_sched_tunable_scaling) {
156 case SCHED_TUNABLESCALING_NONE:
159 case SCHED_TUNABLESCALING_LINEAR:
162 case SCHED_TUNABLESCALING_LOG:
164 factor = 1 + ilog2(cpus);
171 static void update_sysctl(void)
173 unsigned int factor = get_update_sysctl_factor();
175 #define SET_SYSCTL(name) \
176 (sysctl_##name = (factor) * normalized_sysctl_##name)
177 SET_SYSCTL(sched_min_granularity);
178 SET_SYSCTL(sched_latency);
179 SET_SYSCTL(sched_wakeup_granularity);
183 void sched_init_granularity(void)
188 #define WMULT_CONST (~0U)
189 #define WMULT_SHIFT 32
191 static void __update_inv_weight(struct load_weight *lw)
195 if (likely(lw->inv_weight))
198 w = scale_load_down(lw->weight);
200 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
202 else if (unlikely(!w))
203 lw->inv_weight = WMULT_CONST;
205 lw->inv_weight = WMULT_CONST / w;
209 * delta_exec * weight / lw.weight
211 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
213 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
214 * we're guaranteed shift stays positive because inv_weight is guaranteed to
215 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
217 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
218 * weight/lw.weight <= 1, and therefore our shift will also be positive.
220 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
222 u64 fact = scale_load_down(weight);
223 int shift = WMULT_SHIFT;
225 __update_inv_weight(lw);
227 if (unlikely(fact >> 32)) {
234 /* hint to use a 32x32->64 mul */
235 fact = (u64)(u32)fact * lw->inv_weight;
242 return mul_u64_u32_shr(delta_exec, fact, shift);
246 const struct sched_class fair_sched_class;
248 /**************************************************************
249 * CFS operations on generic schedulable entities:
252 #ifdef CONFIG_FAIR_GROUP_SCHED
254 /* cpu runqueue to which this cfs_rq is attached */
255 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
260 /* An entity is a task if it doesn't "own" a runqueue */
261 #define entity_is_task(se) (!se->my_q)
263 static inline struct task_struct *task_of(struct sched_entity *se)
265 SCHED_WARN_ON(!entity_is_task(se));
266 return container_of(se, struct task_struct, se);
269 /* Walk up scheduling entities hierarchy */
270 #define for_each_sched_entity(se) \
271 for (; se; se = se->parent)
273 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
278 /* runqueue on which this entity is (to be) queued */
279 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
284 /* runqueue "owned" by this group */
285 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
290 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
292 if (!cfs_rq->on_list) {
294 * Ensure we either appear before our parent (if already
295 * enqueued) or force our parent to appear after us when it is
296 * enqueued. The fact that we always enqueue bottom-up
297 * reduces this to two cases.
299 if (cfs_rq->tg->parent &&
300 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
301 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
302 &rq_of(cfs_rq)->leaf_cfs_rq_list);
304 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
305 &rq_of(cfs_rq)->leaf_cfs_rq_list);
312 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
314 if (cfs_rq->on_list) {
315 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
320 /* Iterate thr' all leaf cfs_rq's on a runqueue */
321 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
322 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
324 /* Do the two (enqueued) entities belong to the same group ? */
325 static inline struct cfs_rq *
326 is_same_group(struct sched_entity *se, struct sched_entity *pse)
328 if (se->cfs_rq == pse->cfs_rq)
334 static inline struct sched_entity *parent_entity(struct sched_entity *se)
340 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
342 int se_depth, pse_depth;
345 * preemption test can be made between sibling entities who are in the
346 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
347 * both tasks until we find their ancestors who are siblings of common
351 /* First walk up until both entities are at same depth */
352 se_depth = (*se)->depth;
353 pse_depth = (*pse)->depth;
355 while (se_depth > pse_depth) {
357 *se = parent_entity(*se);
360 while (pse_depth > se_depth) {
362 *pse = parent_entity(*pse);
365 while (!is_same_group(*se, *pse)) {
366 *se = parent_entity(*se);
367 *pse = parent_entity(*pse);
371 #else /* !CONFIG_FAIR_GROUP_SCHED */
373 static inline struct task_struct *task_of(struct sched_entity *se)
375 return container_of(se, struct task_struct, se);
378 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
380 return container_of(cfs_rq, struct rq, cfs);
383 #define entity_is_task(se) 1
385 #define for_each_sched_entity(se) \
386 for (; se; se = NULL)
388 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
390 return &task_rq(p)->cfs;
393 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
395 struct task_struct *p = task_of(se);
396 struct rq *rq = task_rq(p);
401 /* runqueue "owned" by this group */
402 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
407 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
411 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
415 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
416 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
418 static inline struct sched_entity *parent_entity(struct sched_entity *se)
424 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
428 #endif /* CONFIG_FAIR_GROUP_SCHED */
430 static __always_inline
431 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
433 /**************************************************************
434 * Scheduling class tree data structure manipulation methods:
437 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
439 s64 delta = (s64)(vruntime - max_vruntime);
441 max_vruntime = vruntime;
446 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
448 s64 delta = (s64)(vruntime - min_vruntime);
450 min_vruntime = vruntime;
455 static inline int entity_before(struct sched_entity *a,
456 struct sched_entity *b)
458 return (s64)(a->vruntime - b->vruntime) < 0;
461 static void update_min_vruntime(struct cfs_rq *cfs_rq)
463 u64 vruntime = cfs_rq->min_vruntime;
466 vruntime = cfs_rq->curr->vruntime;
468 if (cfs_rq->rb_leftmost) {
469 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
474 vruntime = se->vruntime;
476 vruntime = min_vruntime(vruntime, se->vruntime);
479 /* ensure we never gain time by being placed backwards. */
480 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
483 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
488 * Enqueue an entity into the rb-tree:
490 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
492 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
493 struct rb_node *parent = NULL;
494 struct sched_entity *entry;
498 * Find the right place in the rbtree:
502 entry = rb_entry(parent, struct sched_entity, run_node);
504 * We dont care about collisions. Nodes with
505 * the same key stay together.
507 if (entity_before(se, entry)) {
508 link = &parent->rb_left;
510 link = &parent->rb_right;
516 * Maintain a cache of leftmost tree entries (it is frequently
520 cfs_rq->rb_leftmost = &se->run_node;
522 rb_link_node(&se->run_node, parent, link);
523 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
526 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
528 if (cfs_rq->rb_leftmost == &se->run_node) {
529 struct rb_node *next_node;
531 next_node = rb_next(&se->run_node);
532 cfs_rq->rb_leftmost = next_node;
535 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
538 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
540 struct rb_node *left = cfs_rq->rb_leftmost;
545 return rb_entry(left, struct sched_entity, run_node);
548 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
550 struct rb_node *next = rb_next(&se->run_node);
555 return rb_entry(next, struct sched_entity, run_node);
558 #ifdef CONFIG_SCHED_DEBUG
559 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
561 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
566 return rb_entry(last, struct sched_entity, run_node);
569 /**************************************************************
570 * Scheduling class statistics methods:
573 int sched_proc_update_handler(struct ctl_table *table, int write,
574 void __user *buffer, size_t *lenp,
577 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
578 unsigned int factor = get_update_sysctl_factor();
583 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
584 sysctl_sched_min_granularity);
586 #define WRT_SYSCTL(name) \
587 (normalized_sysctl_##name = sysctl_##name / (factor))
588 WRT_SYSCTL(sched_min_granularity);
589 WRT_SYSCTL(sched_latency);
590 WRT_SYSCTL(sched_wakeup_granularity);
600 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
602 if (unlikely(se->load.weight != NICE_0_LOAD))
603 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
609 * The idea is to set a period in which each task runs once.
611 * When there are too many tasks (sched_nr_latency) we have to stretch
612 * this period because otherwise the slices get too small.
614 * p = (nr <= nl) ? l : l*nr/nl
616 static u64 __sched_period(unsigned long nr_running)
618 if (unlikely(nr_running > sched_nr_latency))
619 return nr_running * sysctl_sched_min_granularity;
621 return sysctl_sched_latency;
625 * We calculate the wall-time slice from the period by taking a part
626 * proportional to the weight.
630 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
632 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
634 for_each_sched_entity(se) {
635 struct load_weight *load;
636 struct load_weight lw;
638 cfs_rq = cfs_rq_of(se);
639 load = &cfs_rq->load;
641 if (unlikely(!se->on_rq)) {
644 update_load_add(&lw, se->load.weight);
647 slice = __calc_delta(slice, se->load.weight, load);
653 * We calculate the vruntime slice of a to-be-inserted task.
657 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
659 return calc_delta_fair(sched_slice(cfs_rq, se), se);
663 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
664 static unsigned long task_h_load(struct task_struct *p);
667 * We choose a half-life close to 1 scheduling period.
668 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
669 * dependent on this value.
671 #define LOAD_AVG_PERIOD 32
672 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
673 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
675 /* Give new sched_entity start runnable values to heavy its load in infant time */
676 void init_entity_runnable_average(struct sched_entity *se)
678 struct sched_avg *sa = &se->avg;
680 sa->last_update_time = 0;
682 * sched_avg's period_contrib should be strictly less then 1024, so
683 * we give it 1023 to make sure it is almost a period (1024us), and
684 * will definitely be update (after enqueue).
686 sa->period_contrib = 1023;
687 sa->load_avg = scale_load_down(se->load.weight);
688 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
690 * At this point, util_avg won't be used in select_task_rq_fair anyway
694 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
697 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
698 static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
699 static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
700 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
703 * With new tasks being created, their initial util_avgs are extrapolated
704 * based on the cfs_rq's current util_avg:
706 * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
708 * However, in many cases, the above util_avg does not give a desired
709 * value. Moreover, the sum of the util_avgs may be divergent, such
710 * as when the series is a harmonic series.
712 * To solve this problem, we also cap the util_avg of successive tasks to
713 * only 1/2 of the left utilization budget:
715 * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
717 * where n denotes the nth task.
719 * For example, a simplest series from the beginning would be like:
721 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
722 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
724 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
725 * if util_avg > util_avg_cap.
727 void post_init_entity_util_avg(struct sched_entity *se)
729 struct cfs_rq *cfs_rq = cfs_rq_of(se);
730 struct sched_avg *sa = &se->avg;
731 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
732 u64 now = cfs_rq_clock_task(cfs_rq);
735 if (cfs_rq->avg.util_avg != 0) {
736 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
737 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
739 if (sa->util_avg > cap)
744 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
747 if (entity_is_task(se)) {
748 struct task_struct *p = task_of(se);
749 if (p->sched_class != &fair_sched_class) {
751 * For !fair tasks do:
753 update_cfs_rq_load_avg(now, cfs_rq, false);
754 attach_entity_load_avg(cfs_rq, se);
755 switched_from_fair(rq, p);
757 * such that the next switched_to_fair() has the
760 se->avg.last_update_time = now;
765 update_cfs_rq_load_avg(now, cfs_rq, false);
766 attach_entity_load_avg(cfs_rq, se);
767 update_tg_load_avg(cfs_rq, false);
770 #else /* !CONFIG_SMP */
771 void init_entity_runnable_average(struct sched_entity *se)
774 void post_init_entity_util_avg(struct sched_entity *se)
777 static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
780 #endif /* CONFIG_SMP */
783 * Update the current task's runtime statistics.
785 static void update_curr(struct cfs_rq *cfs_rq)
787 struct sched_entity *curr = cfs_rq->curr;
788 u64 now = rq_clock_task(rq_of(cfs_rq));
794 delta_exec = now - curr->exec_start;
795 if (unlikely((s64)delta_exec <= 0))
798 curr->exec_start = now;
800 schedstat_set(curr->statistics.exec_max,
801 max(delta_exec, curr->statistics.exec_max));
803 curr->sum_exec_runtime += delta_exec;
804 schedstat_add(cfs_rq->exec_clock, delta_exec);
806 curr->vruntime += calc_delta_fair(delta_exec, curr);
807 update_min_vruntime(cfs_rq);
809 if (entity_is_task(curr)) {
810 struct task_struct *curtask = task_of(curr);
812 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
813 cpuacct_charge(curtask, delta_exec);
814 account_group_exec_runtime(curtask, delta_exec);
817 account_cfs_rq_runtime(cfs_rq, delta_exec);
820 static void update_curr_fair(struct rq *rq)
822 update_curr(cfs_rq_of(&rq->curr->se));
826 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
828 u64 wait_start, prev_wait_start;
830 if (!schedstat_enabled())
833 wait_start = rq_clock(rq_of(cfs_rq));
834 prev_wait_start = schedstat_val(se->statistics.wait_start);
836 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
837 likely(wait_start > prev_wait_start))
838 wait_start -= prev_wait_start;
840 schedstat_set(se->statistics.wait_start, wait_start);
844 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
846 struct task_struct *p;
849 if (!schedstat_enabled())
852 delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
854 if (entity_is_task(se)) {
856 if (task_on_rq_migrating(p)) {
858 * Preserve migrating task's wait time so wait_start
859 * time stamp can be adjusted to accumulate wait time
860 * prior to migration.
862 schedstat_set(se->statistics.wait_start, delta);
865 trace_sched_stat_wait(p, delta);
868 schedstat_set(se->statistics.wait_max,
869 max(schedstat_val(se->statistics.wait_max), delta));
870 schedstat_inc(se->statistics.wait_count);
871 schedstat_add(se->statistics.wait_sum, delta);
872 schedstat_set(se->statistics.wait_start, 0);
876 update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
878 struct task_struct *tsk = NULL;
879 u64 sleep_start, block_start;
881 if (!schedstat_enabled())
884 sleep_start = schedstat_val(se->statistics.sleep_start);
885 block_start = schedstat_val(se->statistics.block_start);
887 if (entity_is_task(se))
891 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
896 if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
897 schedstat_set(se->statistics.sleep_max, delta);
899 schedstat_set(se->statistics.sleep_start, 0);
900 schedstat_add(se->statistics.sum_sleep_runtime, delta);
903 account_scheduler_latency(tsk, delta >> 10, 1);
904 trace_sched_stat_sleep(tsk, delta);
908 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
913 if (unlikely(delta > schedstat_val(se->statistics.block_max)))
914 schedstat_set(se->statistics.block_max, delta);
916 schedstat_set(se->statistics.block_start, 0);
917 schedstat_add(se->statistics.sum_sleep_runtime, delta);
920 if (tsk->in_iowait) {
921 schedstat_add(se->statistics.iowait_sum, delta);
922 schedstat_inc(se->statistics.iowait_count);
923 trace_sched_stat_iowait(tsk, delta);
926 trace_sched_stat_blocked(tsk, delta);
929 * Blocking time is in units of nanosecs, so shift by
930 * 20 to get a milliseconds-range estimation of the
931 * amount of time that the task spent sleeping:
933 if (unlikely(prof_on == SLEEP_PROFILING)) {
934 profile_hits(SLEEP_PROFILING,
935 (void *)get_wchan(tsk),
938 account_scheduler_latency(tsk, delta >> 10, 0);
944 * Task is being enqueued - update stats:
947 update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
949 if (!schedstat_enabled())
953 * Are we enqueueing a waiting task? (for current tasks
954 * a dequeue/enqueue event is a NOP)
956 if (se != cfs_rq->curr)
957 update_stats_wait_start(cfs_rq, se);
959 if (flags & ENQUEUE_WAKEUP)
960 update_stats_enqueue_sleeper(cfs_rq, se);
964 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
967 if (!schedstat_enabled())
971 * Mark the end of the wait period if dequeueing a
974 if (se != cfs_rq->curr)
975 update_stats_wait_end(cfs_rq, se);
977 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
978 struct task_struct *tsk = task_of(se);
980 if (tsk->state & TASK_INTERRUPTIBLE)
981 schedstat_set(se->statistics.sleep_start,
982 rq_clock(rq_of(cfs_rq)));
983 if (tsk->state & TASK_UNINTERRUPTIBLE)
984 schedstat_set(se->statistics.block_start,
985 rq_clock(rq_of(cfs_rq)));
990 * We are picking a new current task - update its stats:
993 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
996 * We are starting a new run period:
998 se->exec_start = rq_clock_task(rq_of(cfs_rq));
1001 /**************************************************
1002 * Scheduling class queueing methods:
1005 #ifdef CONFIG_NUMA_BALANCING
1007 * Approximate time to scan a full NUMA task in ms. The task scan period is
1008 * calculated based on the tasks virtual memory size and
1009 * numa_balancing_scan_size.
1011 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1012 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
1014 /* Portion of address space to scan in MB */
1015 unsigned int sysctl_numa_balancing_scan_size = 256;
1017 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1018 unsigned int sysctl_numa_balancing_scan_delay = 1000;
1020 static unsigned int task_nr_scan_windows(struct task_struct *p)
1022 unsigned long rss = 0;
1023 unsigned long nr_scan_pages;
1026 * Calculations based on RSS as non-present and empty pages are skipped
1027 * by the PTE scanner and NUMA hinting faults should be trapped based
1030 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1031 rss = get_mm_rss(p->mm);
1033 rss = nr_scan_pages;
1035 rss = round_up(rss, nr_scan_pages);
1036 return rss / nr_scan_pages;
1039 /* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1040 #define MAX_SCAN_WINDOW 2560
1042 static unsigned int task_scan_min(struct task_struct *p)
1044 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1045 unsigned int scan, floor;
1046 unsigned int windows = 1;
1048 if (scan_size < MAX_SCAN_WINDOW)
1049 windows = MAX_SCAN_WINDOW / scan_size;
1050 floor = 1000 / windows;
1052 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1053 return max_t(unsigned int, floor, scan);
1056 static unsigned int task_scan_max(struct task_struct *p)
1058 unsigned int smin = task_scan_min(p);
1061 /* Watch for min being lower than max due to floor calculations */
1062 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1063 return max(smin, smax);
1066 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1068 rq->nr_numa_running += (p->numa_preferred_nid != -1);
1069 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1072 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1074 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1075 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1081 spinlock_t lock; /* nr_tasks, tasks */
1086 struct rcu_head rcu;
1087 unsigned long total_faults;
1088 unsigned long max_faults_cpu;
1090 * Faults_cpu is used to decide whether memory should move
1091 * towards the CPU. As a consequence, these stats are weighted
1092 * more by CPU use than by memory faults.
1094 unsigned long *faults_cpu;
1095 unsigned long faults[0];
1098 /* Shared or private faults. */
1099 #define NR_NUMA_HINT_FAULT_TYPES 2
1101 /* Memory and CPU locality */
1102 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1104 /* Averaged statistics, and temporary buffers. */
1105 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1107 pid_t task_numa_group_id(struct task_struct *p)
1109 return p->numa_group ? p->numa_group->gid : 0;
1113 * The averaged statistics, shared & private, memory & cpu,
1114 * occupy the first half of the array. The second half of the
1115 * array is for current counters, which are averaged into the
1116 * first set by task_numa_placement.
1118 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1120 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1123 static inline unsigned long task_faults(struct task_struct *p, int nid)
1125 if (!p->numa_faults)
1128 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1129 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1132 static inline unsigned long group_faults(struct task_struct *p, int nid)
1137 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1138 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1141 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1143 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1144 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1148 * A node triggering more than 1/3 as many NUMA faults as the maximum is
1149 * considered part of a numa group's pseudo-interleaving set. Migrations
1150 * between these nodes are slowed down, to allow things to settle down.
1152 #define ACTIVE_NODE_FRACTION 3
1154 static bool numa_is_active_node(int nid, struct numa_group *ng)
1156 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1159 /* Handle placement on systems where not all nodes are directly connected. */
1160 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1161 int maxdist, bool task)
1163 unsigned long score = 0;
1167 * All nodes are directly connected, and the same distance
1168 * from each other. No need for fancy placement algorithms.
1170 if (sched_numa_topology_type == NUMA_DIRECT)
1174 * This code is called for each node, introducing N^2 complexity,
1175 * which should be ok given the number of nodes rarely exceeds 8.
1177 for_each_online_node(node) {
1178 unsigned long faults;
1179 int dist = node_distance(nid, node);
1182 * The furthest away nodes in the system are not interesting
1183 * for placement; nid was already counted.
1185 if (dist == sched_max_numa_distance || node == nid)
1189 * On systems with a backplane NUMA topology, compare groups
1190 * of nodes, and move tasks towards the group with the most
1191 * memory accesses. When comparing two nodes at distance
1192 * "hoplimit", only nodes closer by than "hoplimit" are part
1193 * of each group. Skip other nodes.
1195 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1199 /* Add up the faults from nearby nodes. */
1201 faults = task_faults(p, node);
1203 faults = group_faults(p, node);
1206 * On systems with a glueless mesh NUMA topology, there are
1207 * no fixed "groups of nodes". Instead, nodes that are not
1208 * directly connected bounce traffic through intermediate
1209 * nodes; a numa_group can occupy any set of nodes.
1210 * The further away a node is, the less the faults count.
1211 * This seems to result in good task placement.
1213 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1214 faults *= (sched_max_numa_distance - dist);
1215 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1225 * These return the fraction of accesses done by a particular task, or
1226 * task group, on a particular numa node. The group weight is given a
1227 * larger multiplier, in order to group tasks together that are almost
1228 * evenly spread out between numa nodes.
1230 static inline unsigned long task_weight(struct task_struct *p, int nid,
1233 unsigned long faults, total_faults;
1235 if (!p->numa_faults)
1238 total_faults = p->total_numa_faults;
1243 faults = task_faults(p, nid);
1244 faults += score_nearby_nodes(p, nid, dist, true);
1246 return 1000 * faults / total_faults;
1249 static inline unsigned long group_weight(struct task_struct *p, int nid,
1252 unsigned long faults, total_faults;
1257 total_faults = p->numa_group->total_faults;
1262 faults = group_faults(p, nid);
1263 faults += score_nearby_nodes(p, nid, dist, false);
1265 return 1000 * faults / total_faults;
1268 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1269 int src_nid, int dst_cpu)
1271 struct numa_group *ng = p->numa_group;
1272 int dst_nid = cpu_to_node(dst_cpu);
1273 int last_cpupid, this_cpupid;
1275 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1278 * Multi-stage node selection is used in conjunction with a periodic
1279 * migration fault to build a temporal task<->page relation. By using
1280 * a two-stage filter we remove short/unlikely relations.
1282 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1283 * a task's usage of a particular page (n_p) per total usage of this
1284 * page (n_t) (in a given time-span) to a probability.
1286 * Our periodic faults will sample this probability and getting the
1287 * same result twice in a row, given these samples are fully
1288 * independent, is then given by P(n)^2, provided our sample period
1289 * is sufficiently short compared to the usage pattern.
1291 * This quadric squishes small probabilities, making it less likely we
1292 * act on an unlikely task<->page relation.
1294 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1295 if (!cpupid_pid_unset(last_cpupid) &&
1296 cpupid_to_nid(last_cpupid) != dst_nid)
1299 /* Always allow migrate on private faults */
1300 if (cpupid_match_pid(p, last_cpupid))
1303 /* A shared fault, but p->numa_group has not been set up yet. */
1308 * Destination node is much more heavily used than the source
1309 * node? Allow migration.
1311 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1312 ACTIVE_NODE_FRACTION)
1316 * Distribute memory according to CPU & memory use on each node,
1317 * with 3/4 hysteresis to avoid unnecessary memory migrations:
1319 * faults_cpu(dst) 3 faults_cpu(src)
1320 * --------------- * - > ---------------
1321 * faults_mem(dst) 4 faults_mem(src)
1323 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1324 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1327 static unsigned long weighted_cpuload(const int cpu);
1328 static unsigned long source_load(int cpu, int type);
1329 static unsigned long target_load(int cpu, int type);
1330 static unsigned long capacity_of(int cpu);
1331 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1333 /* Cached statistics for all CPUs within a node */
1335 unsigned long nr_running;
1338 /* Total compute capacity of CPUs on a node */
1339 unsigned long compute_capacity;
1341 /* Approximate capacity in terms of runnable tasks on a node */
1342 unsigned long task_capacity;
1343 int has_free_capacity;
1347 * XXX borrowed from update_sg_lb_stats
1349 static void update_numa_stats(struct numa_stats *ns, int nid)
1351 int smt, cpu, cpus = 0;
1352 unsigned long capacity;
1354 memset(ns, 0, sizeof(*ns));
1355 for_each_cpu(cpu, cpumask_of_node(nid)) {
1356 struct rq *rq = cpu_rq(cpu);
1358 ns->nr_running += rq->nr_running;
1359 ns->load += weighted_cpuload(cpu);
1360 ns->compute_capacity += capacity_of(cpu);
1366 * If we raced with hotplug and there are no CPUs left in our mask
1367 * the @ns structure is NULL'ed and task_numa_compare() will
1368 * not find this node attractive.
1370 * We'll either bail at !has_free_capacity, or we'll detect a huge
1371 * imbalance and bail there.
1376 /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1377 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1378 capacity = cpus / smt; /* cores */
1380 ns->task_capacity = min_t(unsigned, capacity,
1381 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1382 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1385 struct task_numa_env {
1386 struct task_struct *p;
1388 int src_cpu, src_nid;
1389 int dst_cpu, dst_nid;
1391 struct numa_stats src_stats, dst_stats;
1396 struct task_struct *best_task;
1401 static void task_numa_assign(struct task_numa_env *env,
1402 struct task_struct *p, long imp)
1405 put_task_struct(env->best_task);
1410 env->best_imp = imp;
1411 env->best_cpu = env->dst_cpu;
1414 static bool load_too_imbalanced(long src_load, long dst_load,
1415 struct task_numa_env *env)
1418 long orig_src_load, orig_dst_load;
1419 long src_capacity, dst_capacity;
1422 * The load is corrected for the CPU capacity available on each node.
1425 * ------------ vs ---------
1426 * src_capacity dst_capacity
1428 src_capacity = env->src_stats.compute_capacity;
1429 dst_capacity = env->dst_stats.compute_capacity;
1431 /* We care about the slope of the imbalance, not the direction. */
1432 if (dst_load < src_load)
1433 swap(dst_load, src_load);
1435 /* Is the difference below the threshold? */
1436 imb = dst_load * src_capacity * 100 -
1437 src_load * dst_capacity * env->imbalance_pct;
1442 * The imbalance is above the allowed threshold.
1443 * Compare it with the old imbalance.
1445 orig_src_load = env->src_stats.load;
1446 orig_dst_load = env->dst_stats.load;
1448 if (orig_dst_load < orig_src_load)
1449 swap(orig_dst_load, orig_src_load);
1451 old_imb = orig_dst_load * src_capacity * 100 -
1452 orig_src_load * dst_capacity * env->imbalance_pct;
1454 /* Would this change make things worse? */
1455 return (imb > old_imb);
1459 * This checks if the overall compute and NUMA accesses of the system would
1460 * be improved if the source tasks was migrated to the target dst_cpu taking
1461 * into account that it might be best if task running on the dst_cpu should
1462 * be exchanged with the source task
1464 static void task_numa_compare(struct task_numa_env *env,
1465 long taskimp, long groupimp)
1467 struct rq *src_rq = cpu_rq(env->src_cpu);
1468 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1469 struct task_struct *cur;
1470 long src_load, dst_load;
1472 long imp = env->p->numa_group ? groupimp : taskimp;
1474 int dist = env->dist;
1477 cur = task_rcu_dereference(&dst_rq->curr);
1478 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1482 * Because we have preemption enabled we can get migrated around and
1483 * end try selecting ourselves (current == env->p) as a swap candidate.
1489 * "imp" is the fault differential for the source task between the
1490 * source and destination node. Calculate the total differential for
1491 * the source task and potential destination task. The more negative
1492 * the value is, the more rmeote accesses that would be expected to
1493 * be incurred if the tasks were swapped.
1496 /* Skip this swap candidate if cannot move to the source cpu */
1497 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1501 * If dst and source tasks are in the same NUMA group, or not
1502 * in any group then look only at task weights.
1504 if (cur->numa_group == env->p->numa_group) {
1505 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1506 task_weight(cur, env->dst_nid, dist);
1508 * Add some hysteresis to prevent swapping the
1509 * tasks within a group over tiny differences.
1511 if (cur->numa_group)
1515 * Compare the group weights. If a task is all by
1516 * itself (not part of a group), use the task weight
1519 if (cur->numa_group)
1520 imp += group_weight(cur, env->src_nid, dist) -
1521 group_weight(cur, env->dst_nid, dist);
1523 imp += task_weight(cur, env->src_nid, dist) -
1524 task_weight(cur, env->dst_nid, dist);
1528 if (imp <= env->best_imp && moveimp <= env->best_imp)
1532 /* Is there capacity at our destination? */
1533 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1534 !env->dst_stats.has_free_capacity)
1540 /* Balance doesn't matter much if we're running a task per cpu */
1541 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1542 dst_rq->nr_running == 1)
1546 * In the overloaded case, try and keep the load balanced.
1549 load = task_h_load(env->p);
1550 dst_load = env->dst_stats.load + load;
1551 src_load = env->src_stats.load - load;
1553 if (moveimp > imp && moveimp > env->best_imp) {
1555 * If the improvement from just moving env->p direction is
1556 * better than swapping tasks around, check if a move is
1557 * possible. Store a slightly smaller score than moveimp,
1558 * so an actually idle CPU will win.
1560 if (!load_too_imbalanced(src_load, dst_load, env)) {
1567 if (imp <= env->best_imp)
1571 load = task_h_load(cur);
1576 if (load_too_imbalanced(src_load, dst_load, env))
1580 * One idle CPU per node is evaluated for a task numa move.
1581 * Call select_idle_sibling to maybe find a better one.
1585 * select_idle_siblings() uses an per-cpu cpumask that
1586 * can be used from IRQ context.
1588 local_irq_disable();
1589 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1595 task_numa_assign(env, cur, imp);
1600 static void task_numa_find_cpu(struct task_numa_env *env,
1601 long taskimp, long groupimp)
1605 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1606 /* Skip this CPU if the source task cannot migrate */
1607 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1611 task_numa_compare(env, taskimp, groupimp);
1615 /* Only move tasks to a NUMA node less busy than the current node. */
1616 static bool numa_has_capacity(struct task_numa_env *env)
1618 struct numa_stats *src = &env->src_stats;
1619 struct numa_stats *dst = &env->dst_stats;
1621 if (src->has_free_capacity && !dst->has_free_capacity)
1625 * Only consider a task move if the source has a higher load
1626 * than the destination, corrected for CPU capacity on each node.
1628 * src->load dst->load
1629 * --------------------- vs ---------------------
1630 * src->compute_capacity dst->compute_capacity
1632 if (src->load * dst->compute_capacity * env->imbalance_pct >
1634 dst->load * src->compute_capacity * 100)
1640 static int task_numa_migrate(struct task_struct *p)
1642 struct task_numa_env env = {
1645 .src_cpu = task_cpu(p),
1646 .src_nid = task_node(p),
1648 .imbalance_pct = 112,
1654 struct sched_domain *sd;
1655 unsigned long taskweight, groupweight;
1657 long taskimp, groupimp;
1660 * Pick the lowest SD_NUMA domain, as that would have the smallest
1661 * imbalance and would be the first to start moving tasks about.
1663 * And we want to avoid any moving of tasks about, as that would create
1664 * random movement of tasks -- counter the numa conditions we're trying
1668 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1670 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1674 * Cpusets can break the scheduler domain tree into smaller
1675 * balance domains, some of which do not cross NUMA boundaries.
1676 * Tasks that are "trapped" in such domains cannot be migrated
1677 * elsewhere, so there is no point in (re)trying.
1679 if (unlikely(!sd)) {
1680 p->numa_preferred_nid = task_node(p);
1684 env.dst_nid = p->numa_preferred_nid;
1685 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1686 taskweight = task_weight(p, env.src_nid, dist);
1687 groupweight = group_weight(p, env.src_nid, dist);
1688 update_numa_stats(&env.src_stats, env.src_nid);
1689 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1690 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1691 update_numa_stats(&env.dst_stats, env.dst_nid);
1693 /* Try to find a spot on the preferred nid. */
1694 if (numa_has_capacity(&env))
1695 task_numa_find_cpu(&env, taskimp, groupimp);
1698 * Look at other nodes in these cases:
1699 * - there is no space available on the preferred_nid
1700 * - the task is part of a numa_group that is interleaved across
1701 * multiple NUMA nodes; in order to better consolidate the group,
1702 * we need to check other locations.
1704 if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
1705 for_each_online_node(nid) {
1706 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1709 dist = node_distance(env.src_nid, env.dst_nid);
1710 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1712 taskweight = task_weight(p, env.src_nid, dist);
1713 groupweight = group_weight(p, env.src_nid, dist);
1716 /* Only consider nodes where both task and groups benefit */
1717 taskimp = task_weight(p, nid, dist) - taskweight;
1718 groupimp = group_weight(p, nid, dist) - groupweight;
1719 if (taskimp < 0 && groupimp < 0)
1724 update_numa_stats(&env.dst_stats, env.dst_nid);
1725 if (numa_has_capacity(&env))
1726 task_numa_find_cpu(&env, taskimp, groupimp);
1731 * If the task is part of a workload that spans multiple NUMA nodes,
1732 * and is migrating into one of the workload's active nodes, remember
1733 * this node as the task's preferred numa node, so the workload can
1735 * A task that migrated to a second choice node will be better off
1736 * trying for a better one later. Do not set the preferred node here.
1738 if (p->numa_group) {
1739 struct numa_group *ng = p->numa_group;
1741 if (env.best_cpu == -1)
1746 if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
1747 sched_setnuma(p, env.dst_nid);
1750 /* No better CPU than the current one was found. */
1751 if (env.best_cpu == -1)
1755 * Reset the scan period if the task is being rescheduled on an
1756 * alternative node to recheck if the tasks is now properly placed.
1758 p->numa_scan_period = task_scan_min(p);
1760 if (env.best_task == NULL) {
1761 ret = migrate_task_to(p, env.best_cpu);
1763 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1767 ret = migrate_swap(p, env.best_task);
1769 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1770 put_task_struct(env.best_task);
1774 /* Attempt to migrate a task to a CPU on the preferred node. */
1775 static void numa_migrate_preferred(struct task_struct *p)
1777 unsigned long interval = HZ;
1779 /* This task has no NUMA fault statistics yet */
1780 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1783 /* Periodically retry migrating the task to the preferred node */
1784 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1785 p->numa_migrate_retry = jiffies + interval;
1787 /* Success if task is already running on preferred CPU */
1788 if (task_node(p) == p->numa_preferred_nid)
1791 /* Otherwise, try migrate to a CPU on the preferred node */
1792 task_numa_migrate(p);
1796 * Find out how many nodes on the workload is actively running on. Do this by
1797 * tracking the nodes from which NUMA hinting faults are triggered. This can
1798 * be different from the set of nodes where the workload's memory is currently
1801 static void numa_group_count_active_nodes(struct numa_group *numa_group)
1803 unsigned long faults, max_faults = 0;
1804 int nid, active_nodes = 0;
1806 for_each_online_node(nid) {
1807 faults = group_faults_cpu(numa_group, nid);
1808 if (faults > max_faults)
1809 max_faults = faults;
1812 for_each_online_node(nid) {
1813 faults = group_faults_cpu(numa_group, nid);
1814 if (faults * ACTIVE_NODE_FRACTION > max_faults)
1818 numa_group->max_faults_cpu = max_faults;
1819 numa_group->active_nodes = active_nodes;
1823 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1824 * increments. The more local the fault statistics are, the higher the scan
1825 * period will be for the next scan window. If local/(local+remote) ratio is
1826 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1827 * the scan period will decrease. Aim for 70% local accesses.
1829 #define NUMA_PERIOD_SLOTS 10
1830 #define NUMA_PERIOD_THRESHOLD 7
1833 * Increase the scan period (slow down scanning) if the majority of
1834 * our memory is already on our local node, or if the majority of
1835 * the page accesses are shared with other processes.
1836 * Otherwise, decrease the scan period.
1838 static void update_task_scan_period(struct task_struct *p,
1839 unsigned long shared, unsigned long private)
1841 unsigned int period_slot;
1845 unsigned long remote = p->numa_faults_locality[0];
1846 unsigned long local = p->numa_faults_locality[1];
1849 * If there were no record hinting faults then either the task is
1850 * completely idle or all activity is areas that are not of interest
1851 * to automatic numa balancing. Related to that, if there were failed
1852 * migration then it implies we are migrating too quickly or the local
1853 * node is overloaded. In either case, scan slower
1855 if (local + shared == 0 || p->numa_faults_locality[2]) {
1856 p->numa_scan_period = min(p->numa_scan_period_max,
1857 p->numa_scan_period << 1);
1859 p->mm->numa_next_scan = jiffies +
1860 msecs_to_jiffies(p->numa_scan_period);
1866 * Prepare to scale scan period relative to the current period.
1867 * == NUMA_PERIOD_THRESHOLD scan period stays the same
1868 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1869 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1871 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1872 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1873 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1874 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1877 diff = slot * period_slot;
1879 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1882 * Scale scan rate increases based on sharing. There is an
1883 * inverse relationship between the degree of sharing and
1884 * the adjustment made to the scanning period. Broadly
1885 * speaking the intent is that there is little point
1886 * scanning faster if shared accesses dominate as it may
1887 * simply bounce migrations uselessly
1889 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1890 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1893 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1894 task_scan_min(p), task_scan_max(p));
1895 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1899 * Get the fraction of time the task has been running since the last
1900 * NUMA placement cycle. The scheduler keeps similar statistics, but
1901 * decays those on a 32ms period, which is orders of magnitude off
1902 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1903 * stats only if the task is so new there are no NUMA statistics yet.
1905 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1907 u64 runtime, delta, now;
1908 /* Use the start of this time slice to avoid calculations. */
1909 now = p->se.exec_start;
1910 runtime = p->se.sum_exec_runtime;
1912 if (p->last_task_numa_placement) {
1913 delta = runtime - p->last_sum_exec_runtime;
1914 *period = now - p->last_task_numa_placement;
1916 delta = p->se.avg.load_sum / p->se.load.weight;
1917 *period = LOAD_AVG_MAX;
1920 p->last_sum_exec_runtime = runtime;
1921 p->last_task_numa_placement = now;
1927 * Determine the preferred nid for a task in a numa_group. This needs to
1928 * be done in a way that produces consistent results with group_weight,
1929 * otherwise workloads might not converge.
1931 static int preferred_group_nid(struct task_struct *p, int nid)
1936 /* Direct connections between all NUMA nodes. */
1937 if (sched_numa_topology_type == NUMA_DIRECT)
1941 * On a system with glueless mesh NUMA topology, group_weight
1942 * scores nodes according to the number of NUMA hinting faults on
1943 * both the node itself, and on nearby nodes.
1945 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1946 unsigned long score, max_score = 0;
1947 int node, max_node = nid;
1949 dist = sched_max_numa_distance;
1951 for_each_online_node(node) {
1952 score = group_weight(p, node, dist);
1953 if (score > max_score) {
1962 * Finding the preferred nid in a system with NUMA backplane
1963 * interconnect topology is more involved. The goal is to locate
1964 * tasks from numa_groups near each other in the system, and
1965 * untangle workloads from different sides of the system. This requires
1966 * searching down the hierarchy of node groups, recursively searching
1967 * inside the highest scoring group of nodes. The nodemask tricks
1968 * keep the complexity of the search down.
1970 nodes = node_online_map;
1971 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1972 unsigned long max_faults = 0;
1973 nodemask_t max_group = NODE_MASK_NONE;
1976 /* Are there nodes at this distance from each other? */
1977 if (!find_numa_distance(dist))
1980 for_each_node_mask(a, nodes) {
1981 unsigned long faults = 0;
1982 nodemask_t this_group;
1983 nodes_clear(this_group);
1985 /* Sum group's NUMA faults; includes a==b case. */
1986 for_each_node_mask(b, nodes) {
1987 if (node_distance(a, b) < dist) {
1988 faults += group_faults(p, b);
1989 node_set(b, this_group);
1990 node_clear(b, nodes);
1994 /* Remember the top group. */
1995 if (faults > max_faults) {
1996 max_faults = faults;
1997 max_group = this_group;
1999 * subtle: at the smallest distance there is
2000 * just one node left in each "group", the
2001 * winner is the preferred nid.
2006 /* Next round, evaluate the nodes within max_group. */
2014 static void task_numa_placement(struct task_struct *p)
2016 int seq, nid, max_nid = -1, max_group_nid = -1;
2017 unsigned long max_faults = 0, max_group_faults = 0;
2018 unsigned long fault_types[2] = { 0, 0 };
2019 unsigned long total_faults;
2020 u64 runtime, period;
2021 spinlock_t *group_lock = NULL;
2024 * The p->mm->numa_scan_seq field gets updated without
2025 * exclusive access. Use READ_ONCE() here to ensure
2026 * that the field is read in a single access:
2028 seq = READ_ONCE(p->mm->numa_scan_seq);
2029 if (p->numa_scan_seq == seq)
2031 p->numa_scan_seq = seq;
2032 p->numa_scan_period_max = task_scan_max(p);
2034 total_faults = p->numa_faults_locality[0] +
2035 p->numa_faults_locality[1];
2036 runtime = numa_get_avg_runtime(p, &period);
2038 /* If the task is part of a group prevent parallel updates to group stats */
2039 if (p->numa_group) {
2040 group_lock = &p->numa_group->lock;
2041 spin_lock_irq(group_lock);
2044 /* Find the node with the highest number of faults */
2045 for_each_online_node(nid) {
2046 /* Keep track of the offsets in numa_faults array */
2047 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2048 unsigned long faults = 0, group_faults = 0;
2051 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2052 long diff, f_diff, f_weight;
2054 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2055 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2056 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2057 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2059 /* Decay existing window, copy faults since last scan */
2060 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2061 fault_types[priv] += p->numa_faults[membuf_idx];
2062 p->numa_faults[membuf_idx] = 0;
2065 * Normalize the faults_from, so all tasks in a group
2066 * count according to CPU use, instead of by the raw
2067 * number of faults. Tasks with little runtime have
2068 * little over-all impact on throughput, and thus their
2069 * faults are less important.
2071 f_weight = div64_u64(runtime << 16, period + 1);
2072 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2074 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2075 p->numa_faults[cpubuf_idx] = 0;
2077 p->numa_faults[mem_idx] += diff;
2078 p->numa_faults[cpu_idx] += f_diff;
2079 faults += p->numa_faults[mem_idx];
2080 p->total_numa_faults += diff;
2081 if (p->numa_group) {
2083 * safe because we can only change our own group
2085 * mem_idx represents the offset for a given
2086 * nid and priv in a specific region because it
2087 * is at the beginning of the numa_faults array.
2089 p->numa_group->faults[mem_idx] += diff;
2090 p->numa_group->faults_cpu[mem_idx] += f_diff;
2091 p->numa_group->total_faults += diff;
2092 group_faults += p->numa_group->faults[mem_idx];
2096 if (faults > max_faults) {
2097 max_faults = faults;
2101 if (group_faults > max_group_faults) {
2102 max_group_faults = group_faults;
2103 max_group_nid = nid;
2107 update_task_scan_period(p, fault_types[0], fault_types[1]);
2109 if (p->numa_group) {
2110 numa_group_count_active_nodes(p->numa_group);
2111 spin_unlock_irq(group_lock);
2112 max_nid = preferred_group_nid(p, max_group_nid);
2116 /* Set the new preferred node */
2117 if (max_nid != p->numa_preferred_nid)
2118 sched_setnuma(p, max_nid);
2120 if (task_node(p) != p->numa_preferred_nid)
2121 numa_migrate_preferred(p);
2125 static inline int get_numa_group(struct numa_group *grp)
2127 return atomic_inc_not_zero(&grp->refcount);
2130 static inline void put_numa_group(struct numa_group *grp)
2132 if (atomic_dec_and_test(&grp->refcount))
2133 kfree_rcu(grp, rcu);
2136 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2139 struct numa_group *grp, *my_grp;
2140 struct task_struct *tsk;
2142 int cpu = cpupid_to_cpu(cpupid);
2145 if (unlikely(!p->numa_group)) {
2146 unsigned int size = sizeof(struct numa_group) +
2147 4*nr_node_ids*sizeof(unsigned long);
2149 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2153 atomic_set(&grp->refcount, 1);
2154 grp->active_nodes = 1;
2155 grp->max_faults_cpu = 0;
2156 spin_lock_init(&grp->lock);
2158 /* Second half of the array tracks nids where faults happen */
2159 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2162 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2163 grp->faults[i] = p->numa_faults[i];
2165 grp->total_faults = p->total_numa_faults;
2168 rcu_assign_pointer(p->numa_group, grp);
2172 tsk = READ_ONCE(cpu_rq(cpu)->curr);
2174 if (!cpupid_match_pid(tsk, cpupid))
2177 grp = rcu_dereference(tsk->numa_group);
2181 my_grp = p->numa_group;
2186 * Only join the other group if its bigger; if we're the bigger group,
2187 * the other task will join us.
2189 if (my_grp->nr_tasks > grp->nr_tasks)
2193 * Tie-break on the grp address.
2195 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2198 /* Always join threads in the same process. */
2199 if (tsk->mm == current->mm)
2202 /* Simple filter to avoid false positives due to PID collisions */
2203 if (flags & TNF_SHARED)
2206 /* Update priv based on whether false sharing was detected */
2209 if (join && !get_numa_group(grp))
2217 BUG_ON(irqs_disabled());
2218 double_lock_irq(&my_grp->lock, &grp->lock);
2220 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2221 my_grp->faults[i] -= p->numa_faults[i];
2222 grp->faults[i] += p->numa_faults[i];
2224 my_grp->total_faults -= p->total_numa_faults;
2225 grp->total_faults += p->total_numa_faults;
2230 spin_unlock(&my_grp->lock);
2231 spin_unlock_irq(&grp->lock);
2233 rcu_assign_pointer(p->numa_group, grp);
2235 put_numa_group(my_grp);
2243 void task_numa_free(struct task_struct *p)
2245 struct numa_group *grp = p->numa_group;
2246 void *numa_faults = p->numa_faults;
2247 unsigned long flags;
2251 spin_lock_irqsave(&grp->lock, flags);
2252 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2253 grp->faults[i] -= p->numa_faults[i];
2254 grp->total_faults -= p->total_numa_faults;
2257 spin_unlock_irqrestore(&grp->lock, flags);
2258 RCU_INIT_POINTER(p->numa_group, NULL);
2259 put_numa_group(grp);
2262 p->numa_faults = NULL;
2267 * Got a PROT_NONE fault for a page on @node.
2269 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2271 struct task_struct *p = current;
2272 bool migrated = flags & TNF_MIGRATED;
2273 int cpu_node = task_node(current);
2274 int local = !!(flags & TNF_FAULT_LOCAL);
2275 struct numa_group *ng;
2278 if (!static_branch_likely(&sched_numa_balancing))
2281 /* for example, ksmd faulting in a user's mm */
2285 /* Allocate buffer to track faults on a per-node basis */
2286 if (unlikely(!p->numa_faults)) {
2287 int size = sizeof(*p->numa_faults) *
2288 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2290 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2291 if (!p->numa_faults)
2294 p->total_numa_faults = 0;
2295 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2299 * First accesses are treated as private, otherwise consider accesses
2300 * to be private if the accessing pid has not changed
2302 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2305 priv = cpupid_match_pid(p, last_cpupid);
2306 if (!priv && !(flags & TNF_NO_GROUP))
2307 task_numa_group(p, last_cpupid, flags, &priv);
2311 * If a workload spans multiple NUMA nodes, a shared fault that
2312 * occurs wholly within the set of nodes that the workload is
2313 * actively using should be counted as local. This allows the
2314 * scan rate to slow down when a workload has settled down.
2317 if (!priv && !local && ng && ng->active_nodes > 1 &&
2318 numa_is_active_node(cpu_node, ng) &&
2319 numa_is_active_node(mem_node, ng))
2322 task_numa_placement(p);
2325 * Retry task to preferred node migration periodically, in case it
2326 * case it previously failed, or the scheduler moved us.
2328 if (time_after(jiffies, p->numa_migrate_retry))
2329 numa_migrate_preferred(p);
2332 p->numa_pages_migrated += pages;
2333 if (flags & TNF_MIGRATE_FAIL)
2334 p->numa_faults_locality[2] += pages;
2336 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2337 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2338 p->numa_faults_locality[local] += pages;
2341 static void reset_ptenuma_scan(struct task_struct *p)
2344 * We only did a read acquisition of the mmap sem, so
2345 * p->mm->numa_scan_seq is written to without exclusive access
2346 * and the update is not guaranteed to be atomic. That's not
2347 * much of an issue though, since this is just used for
2348 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2349 * expensive, to avoid any form of compiler optimizations:
2351 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2352 p->mm->numa_scan_offset = 0;
2356 * The expensive part of numa migration is done from task_work context.
2357 * Triggered from task_tick_numa().
2359 void task_numa_work(struct callback_head *work)
2361 unsigned long migrate, next_scan, now = jiffies;
2362 struct task_struct *p = current;
2363 struct mm_struct *mm = p->mm;
2364 u64 runtime = p->se.sum_exec_runtime;
2365 struct vm_area_struct *vma;
2366 unsigned long start, end;
2367 unsigned long nr_pte_updates = 0;
2368 long pages, virtpages;
2370 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2372 work->next = work; /* protect against double add */
2374 * Who cares about NUMA placement when they're dying.
2376 * NOTE: make sure not to dereference p->mm before this check,
2377 * exit_task_work() happens _after_ exit_mm() so we could be called
2378 * without p->mm even though we still had it when we enqueued this
2381 if (p->flags & PF_EXITING)
2384 if (!mm->numa_next_scan) {
2385 mm->numa_next_scan = now +
2386 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2390 * Enforce maximal scan/migration frequency..
2392 migrate = mm->numa_next_scan;
2393 if (time_before(now, migrate))
2396 if (p->numa_scan_period == 0) {
2397 p->numa_scan_period_max = task_scan_max(p);
2398 p->numa_scan_period = task_scan_min(p);
2401 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2402 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2406 * Delay this task enough that another task of this mm will likely win
2407 * the next time around.
2409 p->node_stamp += 2 * TICK_NSEC;
2411 start = mm->numa_scan_offset;
2412 pages = sysctl_numa_balancing_scan_size;
2413 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2414 virtpages = pages * 8; /* Scan up to this much virtual space */
2419 down_read(&mm->mmap_sem);
2420 vma = find_vma(mm, start);
2422 reset_ptenuma_scan(p);
2426 for (; vma; vma = vma->vm_next) {
2427 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2428 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2433 * Shared library pages mapped by multiple processes are not
2434 * migrated as it is expected they are cache replicated. Avoid
2435 * hinting faults in read-only file-backed mappings or the vdso
2436 * as migrating the pages will be of marginal benefit.
2439 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2443 * Skip inaccessible VMAs to avoid any confusion between
2444 * PROT_NONE and NUMA hinting ptes
2446 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2450 start = max(start, vma->vm_start);
2451 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2452 end = min(end, vma->vm_end);
2453 nr_pte_updates = change_prot_numa(vma, start, end);
2456 * Try to scan sysctl_numa_balancing_size worth of
2457 * hpages that have at least one present PTE that
2458 * is not already pte-numa. If the VMA contains
2459 * areas that are unused or already full of prot_numa
2460 * PTEs, scan up to virtpages, to skip through those
2464 pages -= (end - start) >> PAGE_SHIFT;
2465 virtpages -= (end - start) >> PAGE_SHIFT;
2468 if (pages <= 0 || virtpages <= 0)
2472 } while (end != vma->vm_end);
2477 * It is possible to reach the end of the VMA list but the last few
2478 * VMAs are not guaranteed to the vma_migratable. If they are not, we
2479 * would find the !migratable VMA on the next scan but not reset the
2480 * scanner to the start so check it now.
2483 mm->numa_scan_offset = start;
2485 reset_ptenuma_scan(p);
2486 up_read(&mm->mmap_sem);
2489 * Make sure tasks use at least 32x as much time to run other code
2490 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2491 * Usually update_task_scan_period slows down scanning enough; on an
2492 * overloaded system we need to limit overhead on a per task basis.
2494 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2495 u64 diff = p->se.sum_exec_runtime - runtime;
2496 p->node_stamp += 32 * diff;
2501 * Drive the periodic memory faults..
2503 void task_tick_numa(struct rq *rq, struct task_struct *curr)
2505 struct callback_head *work = &curr->numa_work;
2509 * We don't care about NUMA placement if we don't have memory.
2511 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2515 * Using runtime rather than walltime has the dual advantage that
2516 * we (mostly) drive the selection from busy threads and that the
2517 * task needs to have done some actual work before we bother with
2520 now = curr->se.sum_exec_runtime;
2521 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2523 if (now > curr->node_stamp + period) {
2524 if (!curr->node_stamp)
2525 curr->numa_scan_period = task_scan_min(curr);
2526 curr->node_stamp += period;
2528 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2529 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2530 task_work_add(curr, work, true);
2535 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2539 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2543 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2546 #endif /* CONFIG_NUMA_BALANCING */
2549 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2551 update_load_add(&cfs_rq->load, se->load.weight);
2552 if (!parent_entity(se))
2553 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2555 if (entity_is_task(se)) {
2556 struct rq *rq = rq_of(cfs_rq);
2558 account_numa_enqueue(rq, task_of(se));
2559 list_add(&se->group_node, &rq->cfs_tasks);
2562 cfs_rq->nr_running++;
2566 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2568 update_load_sub(&cfs_rq->load, se->load.weight);
2569 if (!parent_entity(se))
2570 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2572 if (entity_is_task(se)) {
2573 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2574 list_del_init(&se->group_node);
2577 cfs_rq->nr_running--;
2580 #ifdef CONFIG_FAIR_GROUP_SCHED
2582 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2584 long tg_weight, load, shares;
2587 * This really should be: cfs_rq->avg.load_avg, but instead we use
2588 * cfs_rq->load.weight, which is its upper bound. This helps ramp up
2589 * the shares for small weight interactive tasks.
2591 load = scale_load_down(cfs_rq->load.weight);
2593 tg_weight = atomic_long_read(&tg->load_avg);
2595 /* Ensure tg_weight >= load */
2596 tg_weight -= cfs_rq->tg_load_avg_contrib;
2599 shares = (tg->shares * load);
2601 shares /= tg_weight;
2603 if (shares < MIN_SHARES)
2604 shares = MIN_SHARES;
2605 if (shares > tg->shares)
2606 shares = tg->shares;
2610 # else /* CONFIG_SMP */
2611 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2615 # endif /* CONFIG_SMP */
2617 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2618 unsigned long weight)
2621 /* commit outstanding execution time */
2622 if (cfs_rq->curr == se)
2623 update_curr(cfs_rq);
2624 account_entity_dequeue(cfs_rq, se);
2627 update_load_set(&se->load, weight);
2630 account_entity_enqueue(cfs_rq, se);
2633 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2635 static void update_cfs_shares(struct cfs_rq *cfs_rq)
2637 struct task_group *tg;
2638 struct sched_entity *se;
2642 se = tg->se[cpu_of(rq_of(cfs_rq))];
2643 if (!se || throttled_hierarchy(cfs_rq))
2646 if (likely(se->load.weight == tg->shares))
2649 shares = calc_cfs_shares(cfs_rq, tg);
2651 reweight_entity(cfs_rq_of(se), se, shares);
2653 #else /* CONFIG_FAIR_GROUP_SCHED */
2654 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
2657 #endif /* CONFIG_FAIR_GROUP_SCHED */
2660 /* Precomputed fixed inverse multiplies for multiplication by y^n */
2661 static const u32 runnable_avg_yN_inv[] = {
2662 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2663 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2664 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2665 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2666 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2667 0x85aac367, 0x82cd8698,
2671 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
2672 * over-estimates when re-combining.
2674 static const u32 runnable_avg_yN_sum[] = {
2675 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2676 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2677 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2681 * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
2682 * lower integers. See Documentation/scheduler/sched-avg.txt how these
2685 static const u32 __accumulated_sum_N32[] = {
2686 0, 23371, 35056, 40899, 43820, 45281,
2687 46011, 46376, 46559, 46650, 46696, 46719,
2692 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
2694 static __always_inline u64 decay_load(u64 val, u64 n)
2696 unsigned int local_n;
2700 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2703 /* after bounds checking we can collapse to 32-bit */
2707 * As y^PERIOD = 1/2, we can combine
2708 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2709 * With a look-up table which covers y^n (n<PERIOD)
2711 * To achieve constant time decay_load.
2713 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2714 val >>= local_n / LOAD_AVG_PERIOD;
2715 local_n %= LOAD_AVG_PERIOD;
2718 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2723 * For updates fully spanning n periods, the contribution to runnable
2724 * average will be: \Sum 1024*y^n
2726 * We can compute this reasonably efficiently by combining:
2727 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
2729 static u32 __compute_runnable_contrib(u64 n)
2733 if (likely(n <= LOAD_AVG_PERIOD))
2734 return runnable_avg_yN_sum[n];
2735 else if (unlikely(n >= LOAD_AVG_MAX_N))
2736 return LOAD_AVG_MAX;
2738 /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
2739 contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
2740 n %= LOAD_AVG_PERIOD;
2741 contrib = decay_load(contrib, n);
2742 return contrib + runnable_avg_yN_sum[n];
2745 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
2748 * We can represent the historical contribution to runnable average as the
2749 * coefficients of a geometric series. To do this we sub-divide our runnable
2750 * history into segments of approximately 1ms (1024us); label the segment that
2751 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
2753 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
2755 * (now) (~1ms ago) (~2ms ago)
2757 * Let u_i denote the fraction of p_i that the entity was runnable.
2759 * We then designate the fractions u_i as our co-efficients, yielding the
2760 * following representation of historical load:
2761 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
2763 * We choose y based on the with of a reasonably scheduling period, fixing:
2766 * This means that the contribution to load ~32ms ago (u_32) will be weighted
2767 * approximately half as much as the contribution to load within the last ms
2770 * When a period "rolls over" and we have new u_0`, multiplying the previous
2771 * sum again by y is sufficient to update:
2772 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2773 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2775 static __always_inline int
2776 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2777 unsigned long weight, int running, struct cfs_rq *cfs_rq)
2779 u64 delta, scaled_delta, periods;
2781 unsigned int delta_w, scaled_delta_w, decayed = 0;
2782 unsigned long scale_freq, scale_cpu;
2784 delta = now - sa->last_update_time;
2786 * This should only happen when time goes backwards, which it
2787 * unfortunately does during sched clock init when we swap over to TSC.
2789 if ((s64)delta < 0) {
2790 sa->last_update_time = now;
2795 * Use 1024ns as the unit of measurement since it's a reasonable
2796 * approximation of 1us and fast to compute.
2801 sa->last_update_time = now;
2803 scale_freq = arch_scale_freq_capacity(NULL, cpu);
2804 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2806 /* delta_w is the amount already accumulated against our next period */
2807 delta_w = sa->period_contrib;
2808 if (delta + delta_w >= 1024) {
2811 /* how much left for next period will start over, we don't know yet */
2812 sa->period_contrib = 0;
2815 * Now that we know we're crossing a period boundary, figure
2816 * out how much from delta we need to complete the current
2817 * period and accrue it.
2819 delta_w = 1024 - delta_w;
2820 scaled_delta_w = cap_scale(delta_w, scale_freq);
2822 sa->load_sum += weight * scaled_delta_w;
2824 cfs_rq->runnable_load_sum +=
2825 weight * scaled_delta_w;
2829 sa->util_sum += scaled_delta_w * scale_cpu;
2833 /* Figure out how many additional periods this update spans */
2834 periods = delta / 1024;
2837 sa->load_sum = decay_load(sa->load_sum, periods + 1);
2839 cfs_rq->runnable_load_sum =
2840 decay_load(cfs_rq->runnable_load_sum, periods + 1);
2842 sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
2844 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
2845 contrib = __compute_runnable_contrib(periods);
2846 contrib = cap_scale(contrib, scale_freq);
2848 sa->load_sum += weight * contrib;
2850 cfs_rq->runnable_load_sum += weight * contrib;
2853 sa->util_sum += contrib * scale_cpu;
2856 /* Remainder of delta accrued against u_0` */
2857 scaled_delta = cap_scale(delta, scale_freq);
2859 sa->load_sum += weight * scaled_delta;
2861 cfs_rq->runnable_load_sum += weight * scaled_delta;
2864 sa->util_sum += scaled_delta * scale_cpu;
2866 sa->period_contrib += delta;
2869 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
2871 cfs_rq->runnable_load_avg =
2872 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2874 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
2880 #ifdef CONFIG_FAIR_GROUP_SCHED
2882 * update_tg_load_avg - update the tg's load avg
2883 * @cfs_rq: the cfs_rq whose avg changed
2884 * @force: update regardless of how small the difference
2886 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
2887 * However, because tg->load_avg is a global value there are performance
2890 * In order to avoid having to look at the other cfs_rq's, we use a
2891 * differential update where we store the last value we propagated. This in
2892 * turn allows skipping updates if the differential is 'small'.
2894 * Updating tg's load_avg is necessary before update_cfs_share() (which is
2895 * done) and effective_load() (which is not done because it is too costly).
2897 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2899 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
2902 * No need to update load_avg for root_task_group as it is not used.
2904 if (cfs_rq->tg == &root_task_group)
2907 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
2908 atomic_long_add(delta, &cfs_rq->tg->load_avg);
2909 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
2914 * Called within set_task_rq() right before setting a task's cpu. The
2915 * caller only guarantees p->pi_lock is held; no other assumptions,
2916 * including the state of rq->lock, should be made.
2918 void set_task_rq_fair(struct sched_entity *se,
2919 struct cfs_rq *prev, struct cfs_rq *next)
2921 if (!sched_feat(ATTACH_AGE_LOAD))
2925 * We are supposed to update the task to "current" time, then its up to
2926 * date and ready to go to new CPU/cfs_rq. But we have difficulty in
2927 * getting what current time is, so simply throw away the out-of-date
2928 * time. This will result in the wakee task is less decayed, but giving
2929 * the wakee more load sounds not bad.
2931 if (se->avg.last_update_time && prev) {
2932 u64 p_last_update_time;
2933 u64 n_last_update_time;
2935 #ifndef CONFIG_64BIT
2936 u64 p_last_update_time_copy;
2937 u64 n_last_update_time_copy;
2940 p_last_update_time_copy = prev->load_last_update_time_copy;
2941 n_last_update_time_copy = next->load_last_update_time_copy;
2945 p_last_update_time = prev->avg.last_update_time;
2946 n_last_update_time = next->avg.last_update_time;
2948 } while (p_last_update_time != p_last_update_time_copy ||
2949 n_last_update_time != n_last_update_time_copy);
2951 p_last_update_time = prev->avg.last_update_time;
2952 n_last_update_time = next->avg.last_update_time;
2954 __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
2955 &se->avg, 0, 0, NULL);
2956 se->avg.last_update_time = n_last_update_time;
2959 #else /* CONFIG_FAIR_GROUP_SCHED */
2960 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2961 #endif /* CONFIG_FAIR_GROUP_SCHED */
2963 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2965 struct rq *rq = rq_of(cfs_rq);
2966 int cpu = cpu_of(rq);
2968 if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
2969 unsigned long max = rq->cpu_capacity_orig;
2972 * There are a few boundary cases this might miss but it should
2973 * get called often enough that that should (hopefully) not be
2974 * a real problem -- added to that it only calls on the local
2975 * CPU, so if we enqueue remotely we'll miss an update, but
2976 * the next tick/schedule should update.
2978 * It will not get called when we go idle, because the idle
2979 * thread is a different class (!fair), nor will the utilization
2980 * number include things like RT tasks.
2982 * As is, the util number is not freq-invariant (we'd have to
2983 * implement arch_scale_freq_capacity() for that).
2987 cpufreq_update_util(rq_clock(rq),
2988 min(cfs_rq->avg.util_avg, max), max);
2993 * Unsigned subtract and clamp on underflow.
2995 * Explicitly do a load-store to ensure the intermediate value never hits
2996 * memory. This allows lockless observations without ever seeing the negative
2999 #define sub_positive(_ptr, _val) do { \
3000 typeof(_ptr) ptr = (_ptr); \
3001 typeof(*ptr) val = (_val); \
3002 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3006 WRITE_ONCE(*ptr, res); \
3010 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
3011 * @now: current time, as per cfs_rq_clock_task()
3012 * @cfs_rq: cfs_rq to update
3013 * @update_freq: should we call cfs_rq_util_change() or will the call do so
3015 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
3016 * avg. The immediate corollary is that all (fair) tasks must be attached, see
3017 * post_init_entity_util_avg().
3019 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
3021 * Returns true if the load decayed or we removed load.
3023 * Since both these conditions indicate a changed cfs_rq->avg.load we should
3024 * call update_tg_load_avg() when this function returns true.
3027 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3029 struct sched_avg *sa = &cfs_rq->avg;
3030 int decayed, removed_load = 0, removed_util = 0;
3032 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
3033 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
3034 sub_positive(&sa->load_avg, r);
3035 sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
3039 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
3040 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
3041 sub_positive(&sa->util_avg, r);
3042 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
3046 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
3047 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
3049 #ifndef CONFIG_64BIT
3051 cfs_rq->load_last_update_time_copy = sa->last_update_time;
3054 if (update_freq && (decayed || removed_util))
3055 cfs_rq_util_change(cfs_rq);
3057 return decayed || removed_load;
3060 /* Update task and its cfs_rq load average */
3061 static inline void update_load_avg(struct sched_entity *se, int update_tg)
3063 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3064 u64 now = cfs_rq_clock_task(cfs_rq);
3065 struct rq *rq = rq_of(cfs_rq);
3066 int cpu = cpu_of(rq);
3069 * Track task load average for carrying it to new CPU after migrated, and
3070 * track group sched_entity load average for task_h_load calc in migration
3072 __update_load_avg(now, cpu, &se->avg,
3073 se->on_rq * scale_load_down(se->load.weight),
3074 cfs_rq->curr == se, NULL);
3076 if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
3077 update_tg_load_avg(cfs_rq, 0);
3081 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
3082 * @cfs_rq: cfs_rq to attach to
3083 * @se: sched_entity to attach
3085 * Must call update_cfs_rq_load_avg() before this, since we rely on
3086 * cfs_rq->avg.last_update_time being current.
3088 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3090 if (!sched_feat(ATTACH_AGE_LOAD))
3094 * If we got migrated (either between CPUs or between cgroups) we'll
3095 * have aged the average right before clearing @last_update_time.
3097 * Or we're fresh through post_init_entity_util_avg().
3099 if (se->avg.last_update_time) {
3100 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
3101 &se->avg, 0, 0, NULL);
3104 * XXX: we could have just aged the entire load away if we've been
3105 * absent from the fair class for too long.
3110 se->avg.last_update_time = cfs_rq->avg.last_update_time;
3111 cfs_rq->avg.load_avg += se->avg.load_avg;
3112 cfs_rq->avg.load_sum += se->avg.load_sum;
3113 cfs_rq->avg.util_avg += se->avg.util_avg;
3114 cfs_rq->avg.util_sum += se->avg.util_sum;
3116 cfs_rq_util_change(cfs_rq);
3120 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3121 * @cfs_rq: cfs_rq to detach from
3122 * @se: sched_entity to detach
3124 * Must call update_cfs_rq_load_avg() before this, since we rely on
3125 * cfs_rq->avg.last_update_time being current.
3127 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3129 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
3130 &se->avg, se->on_rq * scale_load_down(se->load.weight),
3131 cfs_rq->curr == se, NULL);
3133 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3134 sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
3135 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3136 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3138 cfs_rq_util_change(cfs_rq);
3141 /* Add the load generated by se into cfs_rq's load average */
3143 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3145 struct sched_avg *sa = &se->avg;
3146 u64 now = cfs_rq_clock_task(cfs_rq);
3147 int migrated, decayed;
3149 migrated = !sa->last_update_time;
3151 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
3152 se->on_rq * scale_load_down(se->load.weight),
3153 cfs_rq->curr == se, NULL);
3156 decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
3158 cfs_rq->runnable_load_avg += sa->load_avg;
3159 cfs_rq->runnable_load_sum += sa->load_sum;
3162 attach_entity_load_avg(cfs_rq, se);
3164 if (decayed || migrated)
3165 update_tg_load_avg(cfs_rq, 0);
3168 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
3170 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3172 update_load_avg(se, 1);
3174 cfs_rq->runnable_load_avg =
3175 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
3176 cfs_rq->runnable_load_sum =
3177 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
3180 #ifndef CONFIG_64BIT
3181 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3183 u64 last_update_time_copy;
3184 u64 last_update_time;
3187 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3189 last_update_time = cfs_rq->avg.last_update_time;
3190 } while (last_update_time != last_update_time_copy);
3192 return last_update_time;
3195 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3197 return cfs_rq->avg.last_update_time;
3202 * Task first catches up with cfs_rq, and then subtract
3203 * itself from the cfs_rq (task must be off the queue now).
3205 void remove_entity_load_avg(struct sched_entity *se)
3207 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3208 u64 last_update_time;
3211 * tasks cannot exit without having gone through wake_up_new_task() ->
3212 * post_init_entity_util_avg() which will have added things to the
3213 * cfs_rq, so we can remove unconditionally.
3215 * Similarly for groups, they will have passed through
3216 * post_init_entity_util_avg() before unregister_sched_fair_group()
3220 last_update_time = cfs_rq_last_update_time(cfs_rq);
3222 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
3223 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
3224 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
3227 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3229 return cfs_rq->runnable_load_avg;
3232 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3234 return cfs_rq->avg.load_avg;
3237 static int idle_balance(struct rq *this_rq);
3239 #else /* CONFIG_SMP */
3242 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3247 static inline void update_load_avg(struct sched_entity *se, int not_used)
3249 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3250 struct rq *rq = rq_of(cfs_rq);
3252 cpufreq_trigger_update(rq_clock(rq));
3256 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3258 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3259 static inline void remove_entity_load_avg(struct sched_entity *se) {}
3262 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3264 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3266 static inline int idle_balance(struct rq *rq)
3271 #endif /* CONFIG_SMP */
3273 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3275 #ifdef CONFIG_SCHED_DEBUG
3276 s64 d = se->vruntime - cfs_rq->min_vruntime;
3281 if (d > 3*sysctl_sched_latency)
3282 schedstat_inc(cfs_rq->nr_spread_over);
3287 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3289 u64 vruntime = cfs_rq->min_vruntime;
3292 * The 'current' period is already promised to the current tasks,
3293 * however the extra weight of the new task will slow them down a
3294 * little, place the new task so that it fits in the slot that
3295 * stays open at the end.
3297 if (initial && sched_feat(START_DEBIT))
3298 vruntime += sched_vslice(cfs_rq, se);
3300 /* sleeps up to a single latency don't count. */
3302 unsigned long thresh = sysctl_sched_latency;
3305 * Halve their sleep time's effect, to allow
3306 * for a gentler effect of sleepers:
3308 if (sched_feat(GENTLE_FAIR_SLEEPERS))
3314 /* ensure we never gain time by being placed backwards. */
3315 se->vruntime = max_vruntime(se->vruntime, vruntime);
3318 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3320 static inline void check_schedstat_required(void)
3322 #ifdef CONFIG_SCHEDSTATS
3323 if (schedstat_enabled())
3326 /* Force schedstat enabled if a dependent tracepoint is active */
3327 if (trace_sched_stat_wait_enabled() ||
3328 trace_sched_stat_sleep_enabled() ||
3329 trace_sched_stat_iowait_enabled() ||
3330 trace_sched_stat_blocked_enabled() ||
3331 trace_sched_stat_runtime_enabled()) {
3332 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3333 "stat_blocked and stat_runtime require the "
3334 "kernel parameter schedstats=enabled or "
3335 "kernel.sched_schedstats=1\n");
3346 * update_min_vruntime()
3347 * vruntime -= min_vruntime
3351 * update_min_vruntime()
3352 * vruntime += min_vruntime
3354 * this way the vruntime transition between RQs is done when both
3355 * min_vruntime are up-to-date.
3359 * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
3360 * vruntime -= min_vruntime
3364 * update_min_vruntime()
3365 * vruntime += min_vruntime
3367 * this way we don't have the most up-to-date min_vruntime on the originating
3368 * CPU and an up-to-date min_vruntime on the destination CPU.
3372 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3374 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
3375 bool curr = cfs_rq->curr == se;
3378 * If we're the current task, we must renormalise before calling
3382 se->vruntime += cfs_rq->min_vruntime;
3384 update_curr(cfs_rq);
3387 * Otherwise, renormalise after, such that we're placed at the current
3388 * moment in time, instead of some random moment in the past. Being
3389 * placed in the past could significantly boost this task to the
3390 * fairness detriment of existing tasks.
3392 if (renorm && !curr)
3393 se->vruntime += cfs_rq->min_vruntime;
3395 enqueue_entity_load_avg(cfs_rq, se);
3396 account_entity_enqueue(cfs_rq, se);
3397 update_cfs_shares(cfs_rq);
3399 if (flags & ENQUEUE_WAKEUP)
3400 place_entity(cfs_rq, se, 0);
3402 check_schedstat_required();
3403 update_stats_enqueue(cfs_rq, se, flags);
3404 check_spread(cfs_rq, se);
3406 __enqueue_entity(cfs_rq, se);
3409 if (cfs_rq->nr_running == 1) {
3410 list_add_leaf_cfs_rq(cfs_rq);
3411 check_enqueue_throttle(cfs_rq);
3415 static void __clear_buddies_last(struct sched_entity *se)
3417 for_each_sched_entity(se) {
3418 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3419 if (cfs_rq->last != se)
3422 cfs_rq->last = NULL;
3426 static void __clear_buddies_next(struct sched_entity *se)
3428 for_each_sched_entity(se) {
3429 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3430 if (cfs_rq->next != se)
3433 cfs_rq->next = NULL;
3437 static void __clear_buddies_skip(struct sched_entity *se)
3439 for_each_sched_entity(se) {
3440 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3441 if (cfs_rq->skip != se)
3444 cfs_rq->skip = NULL;
3448 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3450 if (cfs_rq->last == se)
3451 __clear_buddies_last(se);
3453 if (cfs_rq->next == se)
3454 __clear_buddies_next(se);
3456 if (cfs_rq->skip == se)
3457 __clear_buddies_skip(se);
3460 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3463 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3466 * Update run-time statistics of the 'current'.
3468 update_curr(cfs_rq);
3469 dequeue_entity_load_avg(cfs_rq, se);
3471 update_stats_dequeue(cfs_rq, se, flags);
3473 clear_buddies(cfs_rq, se);
3475 if (se != cfs_rq->curr)
3476 __dequeue_entity(cfs_rq, se);
3478 account_entity_dequeue(cfs_rq, se);
3481 * Normalize the entity after updating the min_vruntime because the
3482 * update can refer to the ->curr item and we need to reflect this
3483 * movement in our normalized position.
3485 if (!(flags & DEQUEUE_SLEEP))
3486 se->vruntime -= cfs_rq->min_vruntime;
3488 /* return excess runtime on last dequeue */
3489 return_cfs_rq_runtime(cfs_rq);
3491 update_min_vruntime(cfs_rq);
3492 update_cfs_shares(cfs_rq);
3496 * Preempt the current task with a newly woken task if needed:
3499 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3501 unsigned long ideal_runtime, delta_exec;
3502 struct sched_entity *se;
3505 ideal_runtime = sched_slice(cfs_rq, curr);
3506 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
3507 if (delta_exec > ideal_runtime) {
3508 resched_curr(rq_of(cfs_rq));
3510 * The current task ran long enough, ensure it doesn't get
3511 * re-elected due to buddy favours.
3513 clear_buddies(cfs_rq, curr);
3518 * Ensure that a task that missed wakeup preemption by a
3519 * narrow margin doesn't have to wait for a full slice.
3520 * This also mitigates buddy induced latencies under load.
3522 if (delta_exec < sysctl_sched_min_granularity)
3525 se = __pick_first_entity(cfs_rq);
3526 delta = curr->vruntime - se->vruntime;
3531 if (delta > ideal_runtime)
3532 resched_curr(rq_of(cfs_rq));
3536 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3538 /* 'current' is not kept within the tree. */
3541 * Any task has to be enqueued before it get to execute on
3542 * a CPU. So account for the time it spent waiting on the
3545 update_stats_wait_end(cfs_rq, se);
3546 __dequeue_entity(cfs_rq, se);
3547 update_load_avg(se, 1);
3550 update_stats_curr_start(cfs_rq, se);
3554 * Track our maximum slice length, if the CPU's load is at
3555 * least twice that of our own weight (i.e. dont track it
3556 * when there are only lesser-weight tasks around):
3558 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3559 schedstat_set(se->statistics.slice_max,
3560 max((u64)schedstat_val(se->statistics.slice_max),
3561 se->sum_exec_runtime - se->prev_sum_exec_runtime));
3564 se->prev_sum_exec_runtime = se->sum_exec_runtime;
3568 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
3571 * Pick the next process, keeping these things in mind, in this order:
3572 * 1) keep things fair between processes/task groups
3573 * 2) pick the "next" process, since someone really wants that to run
3574 * 3) pick the "last" process, for cache locality
3575 * 4) do not run the "skip" process, if something else is available
3577 static struct sched_entity *
3578 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3580 struct sched_entity *left = __pick_first_entity(cfs_rq);
3581 struct sched_entity *se;
3584 * If curr is set we have to see if its left of the leftmost entity
3585 * still in the tree, provided there was anything in the tree at all.
3587 if (!left || (curr && entity_before(curr, left)))
3590 se = left; /* ideally we run the leftmost entity */
3593 * Avoid running the skip buddy, if running something else can
3594 * be done without getting too unfair.
3596 if (cfs_rq->skip == se) {
3597 struct sched_entity *second;
3600 second = __pick_first_entity(cfs_rq);
3602 second = __pick_next_entity(se);
3603 if (!second || (curr && entity_before(curr, second)))
3607 if (second && wakeup_preempt_entity(second, left) < 1)
3612 * Prefer last buddy, try to return the CPU to a preempted task.
3614 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
3618 * Someone really wants this to run. If it's not unfair, run it.
3620 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
3623 clear_buddies(cfs_rq, se);
3628 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3630 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3633 * If still on the runqueue then deactivate_task()
3634 * was not called and update_curr() has to be done:
3637 update_curr(cfs_rq);
3639 /* throttle cfs_rqs exceeding runtime */
3640 check_cfs_rq_runtime(cfs_rq);
3642 check_spread(cfs_rq, prev);
3645 update_stats_wait_start(cfs_rq, prev);
3646 /* Put 'current' back into the tree. */
3647 __enqueue_entity(cfs_rq, prev);
3648 /* in !on_rq case, update occurred at dequeue */
3649 update_load_avg(prev, 0);
3651 cfs_rq->curr = NULL;
3655 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3658 * Update run-time statistics of the 'current'.
3660 update_curr(cfs_rq);
3663 * Ensure that runnable average is periodically updated.
3665 update_load_avg(curr, 1);
3666 update_cfs_shares(cfs_rq);
3668 #ifdef CONFIG_SCHED_HRTICK
3670 * queued ticks are scheduled to match the slice, so don't bother
3671 * validating it and just reschedule.
3674 resched_curr(rq_of(cfs_rq));
3678 * don't let the period tick interfere with the hrtick preemption
3680 if (!sched_feat(DOUBLE_TICK) &&
3681 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
3685 if (cfs_rq->nr_running > 1)
3686 check_preempt_tick(cfs_rq, curr);
3690 /**************************************************
3691 * CFS bandwidth control machinery
3694 #ifdef CONFIG_CFS_BANDWIDTH
3696 #ifdef HAVE_JUMP_LABEL
3697 static struct static_key __cfs_bandwidth_used;
3699 static inline bool cfs_bandwidth_used(void)
3701 return static_key_false(&__cfs_bandwidth_used);
3704 void cfs_bandwidth_usage_inc(void)
3706 static_key_slow_inc(&__cfs_bandwidth_used);
3709 void cfs_bandwidth_usage_dec(void)
3711 static_key_slow_dec(&__cfs_bandwidth_used);
3713 #else /* HAVE_JUMP_LABEL */
3714 static bool cfs_bandwidth_used(void)
3719 void cfs_bandwidth_usage_inc(void) {}
3720 void cfs_bandwidth_usage_dec(void) {}
3721 #endif /* HAVE_JUMP_LABEL */
3724 * default period for cfs group bandwidth.
3725 * default: 0.1s, units: nanoseconds
3727 static inline u64 default_cfs_period(void)
3729 return 100000000ULL;
3732 static inline u64 sched_cfs_bandwidth_slice(void)
3734 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
3738 * Replenish runtime according to assigned quota and update expiration time.
3739 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
3740 * additional synchronization around rq->lock.
3742 * requires cfs_b->lock
3744 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
3748 if (cfs_b->quota == RUNTIME_INF)
3751 now = sched_clock_cpu(smp_processor_id());
3752 cfs_b->runtime = cfs_b->quota;
3753 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
3756 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3758 return &tg->cfs_bandwidth;
3761 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
3762 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3764 if (unlikely(cfs_rq->throttle_count))
3765 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
3767 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3770 /* returns 0 on failure to allocate runtime */
3771 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3773 struct task_group *tg = cfs_rq->tg;
3774 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
3775 u64 amount = 0, min_amount, expires;
3777 /* note: this is a positive sum as runtime_remaining <= 0 */
3778 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
3780 raw_spin_lock(&cfs_b->lock);
3781 if (cfs_b->quota == RUNTIME_INF)
3782 amount = min_amount;
3784 start_cfs_bandwidth(cfs_b);
3786 if (cfs_b->runtime > 0) {
3787 amount = min(cfs_b->runtime, min_amount);
3788 cfs_b->runtime -= amount;
3792 expires = cfs_b->runtime_expires;
3793 raw_spin_unlock(&cfs_b->lock);
3795 cfs_rq->runtime_remaining += amount;
3797 * we may have advanced our local expiration to account for allowed
3798 * spread between our sched_clock and the one on which runtime was
3801 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
3802 cfs_rq->runtime_expires = expires;
3804 return cfs_rq->runtime_remaining > 0;
3808 * Note: This depends on the synchronization provided by sched_clock and the
3809 * fact that rq->clock snapshots this value.
3811 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3813 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3815 /* if the deadline is ahead of our clock, nothing to do */
3816 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
3819 if (cfs_rq->runtime_remaining < 0)
3823 * If the local deadline has passed we have to consider the
3824 * possibility that our sched_clock is 'fast' and the global deadline
3825 * has not truly expired.
3827 * Fortunately we can check determine whether this the case by checking
3828 * whether the global deadline has advanced. It is valid to compare
3829 * cfs_b->runtime_expires without any locks since we only care about
3830 * exact equality, so a partial write will still work.
3833 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
3834 /* extend local deadline, drift is bounded above by 2 ticks */
3835 cfs_rq->runtime_expires += TICK_NSEC;
3837 /* global deadline is ahead, expiration has passed */
3838 cfs_rq->runtime_remaining = 0;
3842 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3844 /* dock delta_exec before expiring quota (as it could span periods) */
3845 cfs_rq->runtime_remaining -= delta_exec;
3846 expire_cfs_rq_runtime(cfs_rq);
3848 if (likely(cfs_rq->runtime_remaining > 0))
3852 * if we're unable to extend our runtime we resched so that the active
3853 * hierarchy can be throttled
3855 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3856 resched_curr(rq_of(cfs_rq));
3859 static __always_inline
3860 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3862 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3865 __account_cfs_rq_runtime(cfs_rq, delta_exec);
3868 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3870 return cfs_bandwidth_used() && cfs_rq->throttled;
3873 /* check whether cfs_rq, or any parent, is throttled */
3874 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3876 return cfs_bandwidth_used() && cfs_rq->throttle_count;
3880 * Ensure that neither of the group entities corresponding to src_cpu or
3881 * dest_cpu are members of a throttled hierarchy when performing group
3882 * load-balance operations.
3884 static inline int throttled_lb_pair(struct task_group *tg,
3885 int src_cpu, int dest_cpu)
3887 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3889 src_cfs_rq = tg->cfs_rq[src_cpu];
3890 dest_cfs_rq = tg->cfs_rq[dest_cpu];
3892 return throttled_hierarchy(src_cfs_rq) ||
3893 throttled_hierarchy(dest_cfs_rq);
3896 /* updated child weight may affect parent so we have to do this bottom up */
3897 static int tg_unthrottle_up(struct task_group *tg, void *data)
3899 struct rq *rq = data;
3900 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3902 cfs_rq->throttle_count--;
3903 if (!cfs_rq->throttle_count) {
3904 /* adjust cfs_rq_clock_task() */
3905 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3906 cfs_rq->throttled_clock_task;
3912 static int tg_throttle_down(struct task_group *tg, void *data)
3914 struct rq *rq = data;
3915 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3917 /* group is entering throttled state, stop time */
3918 if (!cfs_rq->throttle_count)
3919 cfs_rq->throttled_clock_task = rq_clock_task(rq);
3920 cfs_rq->throttle_count++;
3925 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3927 struct rq *rq = rq_of(cfs_rq);
3928 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3929 struct sched_entity *se;
3930 long task_delta, dequeue = 1;
3933 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3935 /* freeze hierarchy runnable averages while throttled */
3937 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3940 task_delta = cfs_rq->h_nr_running;
3941 for_each_sched_entity(se) {
3942 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3943 /* throttled entity or throttle-on-deactivate */
3948 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3949 qcfs_rq->h_nr_running -= task_delta;
3951 if (qcfs_rq->load.weight)
3956 sub_nr_running(rq, task_delta);
3958 cfs_rq->throttled = 1;
3959 cfs_rq->throttled_clock = rq_clock(rq);
3960 raw_spin_lock(&cfs_b->lock);
3961 empty = list_empty(&cfs_b->throttled_cfs_rq);
3964 * Add to the _head_ of the list, so that an already-started
3965 * distribute_cfs_runtime will not see us
3967 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3970 * If we're the first throttled task, make sure the bandwidth
3974 start_cfs_bandwidth(cfs_b);
3976 raw_spin_unlock(&cfs_b->lock);
3979 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3981 struct rq *rq = rq_of(cfs_rq);
3982 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3983 struct sched_entity *se;
3987 se = cfs_rq->tg->se[cpu_of(rq)];
3989 cfs_rq->throttled = 0;
3991 update_rq_clock(rq);
3993 raw_spin_lock(&cfs_b->lock);
3994 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
3995 list_del_rcu(&cfs_rq->throttled_list);
3996 raw_spin_unlock(&cfs_b->lock);
3998 /* update hierarchical throttle state */
3999 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4001 if (!cfs_rq->load.weight)
4004 task_delta = cfs_rq->h_nr_running;
4005 for_each_sched_entity(se) {
4009 cfs_rq = cfs_rq_of(se);
4011 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4012 cfs_rq->h_nr_running += task_delta;
4014 if (cfs_rq_throttled(cfs_rq))
4019 add_nr_running(rq, task_delta);
4021 /* determine whether we need to wake up potentially idle cpu */
4022 if (rq->curr == rq->idle && rq->cfs.nr_running)
4026 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4027 u64 remaining, u64 expires)
4029 struct cfs_rq *cfs_rq;
4031 u64 starting_runtime = remaining;
4034 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4036 struct rq *rq = rq_of(cfs_rq);
4038 raw_spin_lock(&rq->lock);
4039 if (!cfs_rq_throttled(cfs_rq))
4042 runtime = -cfs_rq->runtime_remaining + 1;
4043 if (runtime > remaining)
4044 runtime = remaining;
4045 remaining -= runtime;
4047 cfs_rq->runtime_remaining += runtime;
4048 cfs_rq->runtime_expires = expires;
4050 /* we check whether we're throttled above */
4051 if (cfs_rq->runtime_remaining > 0)
4052 unthrottle_cfs_rq(cfs_rq);
4055 raw_spin_unlock(&rq->lock);
4062 return starting_runtime - remaining;
4066 * Responsible for refilling a task_group's bandwidth and unthrottling its
4067 * cfs_rqs as appropriate. If there has been no activity within the last
4068 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
4069 * used to track this state.
4071 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
4073 u64 runtime, runtime_expires;
4076 /* no need to continue the timer with no bandwidth constraint */
4077 if (cfs_b->quota == RUNTIME_INF)
4078 goto out_deactivate;
4080 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4081 cfs_b->nr_periods += overrun;
4084 * idle depends on !throttled (for the case of a large deficit), and if
4085 * we're going inactive then everything else can be deferred
4087 if (cfs_b->idle && !throttled)
4088 goto out_deactivate;
4090 __refill_cfs_bandwidth_runtime(cfs_b);
4093 /* mark as potentially idle for the upcoming period */
4098 /* account preceding periods in which throttling occurred */
4099 cfs_b->nr_throttled += overrun;
4101 runtime_expires = cfs_b->runtime_expires;
4104 * This check is repeated as we are holding onto the new bandwidth while
4105 * we unthrottle. This can potentially race with an unthrottled group
4106 * trying to acquire new bandwidth from the global pool. This can result
4107 * in us over-using our runtime if it is all used during this loop, but
4108 * only by limited amounts in that extreme case.
4110 while (throttled && cfs_b->runtime > 0) {
4111 runtime = cfs_b->runtime;
4112 raw_spin_unlock(&cfs_b->lock);
4113 /* we can't nest cfs_b->lock while distributing bandwidth */
4114 runtime = distribute_cfs_runtime(cfs_b, runtime,
4116 raw_spin_lock(&cfs_b->lock);
4118 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4120 cfs_b->runtime -= min(runtime, cfs_b->runtime);
4124 * While we are ensured activity in the period following an
4125 * unthrottle, this also covers the case in which the new bandwidth is
4126 * insufficient to cover the existing bandwidth deficit. (Forcing the
4127 * timer to remain active while there are any throttled entities.)
4137 /* a cfs_rq won't donate quota below this amount */
4138 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4139 /* minimum remaining period time to redistribute slack quota */
4140 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4141 /* how long we wait to gather additional slack before distributing */
4142 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4145 * Are we near the end of the current quota period?
4147 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
4148 * hrtimer base being cleared by hrtimer_start. In the case of
4149 * migrate_hrtimers, base is never cleared, so we are fine.
4151 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4153 struct hrtimer *refresh_timer = &cfs_b->period_timer;
4156 /* if the call-back is running a quota refresh is already occurring */
4157 if (hrtimer_callback_running(refresh_timer))
4160 /* is a quota refresh about to occur? */
4161 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4162 if (remaining < min_expire)
4168 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4170 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4172 /* if there's a quota refresh soon don't bother with slack */
4173 if (runtime_refresh_within(cfs_b, min_left))
4176 hrtimer_start(&cfs_b->slack_timer,
4177 ns_to_ktime(cfs_bandwidth_slack_period),
4181 /* we know any runtime found here is valid as update_curr() precedes return */
4182 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4184 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4185 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4187 if (slack_runtime <= 0)
4190 raw_spin_lock(&cfs_b->lock);
4191 if (cfs_b->quota != RUNTIME_INF &&
4192 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
4193 cfs_b->runtime += slack_runtime;
4195 /* we are under rq->lock, defer unthrottling using a timer */
4196 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4197 !list_empty(&cfs_b->throttled_cfs_rq))
4198 start_cfs_slack_bandwidth(cfs_b);
4200 raw_spin_unlock(&cfs_b->lock);
4202 /* even if it's not valid for return we don't want to try again */
4203 cfs_rq->runtime_remaining -= slack_runtime;
4206 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4208 if (!cfs_bandwidth_used())
4211 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
4214 __return_cfs_rq_runtime(cfs_rq);
4218 * This is done with a timer (instead of inline with bandwidth return) since
4219 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
4221 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4223 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4226 /* confirm we're still not at a refresh boundary */
4227 raw_spin_lock(&cfs_b->lock);
4228 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4229 raw_spin_unlock(&cfs_b->lock);
4233 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
4234 runtime = cfs_b->runtime;
4236 expires = cfs_b->runtime_expires;
4237 raw_spin_unlock(&cfs_b->lock);
4242 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
4244 raw_spin_lock(&cfs_b->lock);
4245 if (expires == cfs_b->runtime_expires)
4246 cfs_b->runtime -= min(runtime, cfs_b->runtime);
4247 raw_spin_unlock(&cfs_b->lock);
4251 * When a group wakes up we want to make sure that its quota is not already
4252 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4253 * runtime as update_curr() throttling can not not trigger until it's on-rq.
4255 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4257 if (!cfs_bandwidth_used())
4260 /* an active group must be handled by the update_curr()->put() path */
4261 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4264 /* ensure the group is not already throttled */
4265 if (cfs_rq_throttled(cfs_rq))
4268 /* update runtime allocation */
4269 account_cfs_rq_runtime(cfs_rq, 0);
4270 if (cfs_rq->runtime_remaining <= 0)
4271 throttle_cfs_rq(cfs_rq);
4274 static void sync_throttle(struct task_group *tg, int cpu)
4276 struct cfs_rq *pcfs_rq, *cfs_rq;
4278 if (!cfs_bandwidth_used())
4284 cfs_rq = tg->cfs_rq[cpu];
4285 pcfs_rq = tg->parent->cfs_rq[cpu];
4287 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4288 cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
4291 /* conditionally throttle active cfs_rq's from put_prev_entity() */
4292 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4294 if (!cfs_bandwidth_used())
4297 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4301 * it's possible for a throttled entity to be forced into a running
4302 * state (e.g. set_curr_task), in this case we're finished.
4304 if (cfs_rq_throttled(cfs_rq))
4307 throttle_cfs_rq(cfs_rq);
4311 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4313 struct cfs_bandwidth *cfs_b =
4314 container_of(timer, struct cfs_bandwidth, slack_timer);
4316 do_sched_cfs_slack_timer(cfs_b);
4318 return HRTIMER_NORESTART;
4321 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4323 struct cfs_bandwidth *cfs_b =
4324 container_of(timer, struct cfs_bandwidth, period_timer);
4328 raw_spin_lock(&cfs_b->lock);
4330 overrun = hrtimer_forward_now(timer, cfs_b->period);
4334 idle = do_sched_cfs_period_timer(cfs_b, overrun);
4337 cfs_b->period_active = 0;
4338 raw_spin_unlock(&cfs_b->lock);
4340 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4343 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4345 raw_spin_lock_init(&cfs_b->lock);
4347 cfs_b->quota = RUNTIME_INF;
4348 cfs_b->period = ns_to_ktime(default_cfs_period());
4350 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4351 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
4352 cfs_b->period_timer.function = sched_cfs_period_timer;
4353 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4354 cfs_b->slack_timer.function = sched_cfs_slack_timer;
4357 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4359 cfs_rq->runtime_enabled = 0;
4360 INIT_LIST_HEAD(&cfs_rq->throttled_list);
4363 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4365 lockdep_assert_held(&cfs_b->lock);
4367 if (!cfs_b->period_active) {
4368 cfs_b->period_active = 1;
4369 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
4370 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
4374 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4376 /* init_cfs_bandwidth() was not called */
4377 if (!cfs_b->throttled_cfs_rq.next)
4380 hrtimer_cancel(&cfs_b->period_timer);
4381 hrtimer_cancel(&cfs_b->slack_timer);
4384 static void __maybe_unused update_runtime_enabled(struct rq *rq)
4386 struct cfs_rq *cfs_rq;
4388 for_each_leaf_cfs_rq(rq, cfs_rq) {
4389 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
4391 raw_spin_lock(&cfs_b->lock);
4392 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4393 raw_spin_unlock(&cfs_b->lock);
4397 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4399 struct cfs_rq *cfs_rq;
4401 for_each_leaf_cfs_rq(rq, cfs_rq) {
4402 if (!cfs_rq->runtime_enabled)
4406 * clock_task is not advancing so we just need to make sure
4407 * there's some valid quota amount
4409 cfs_rq->runtime_remaining = 1;
4411 * Offline rq is schedulable till cpu is completely disabled
4412 * in take_cpu_down(), so we prevent new cfs throttling here.
4414 cfs_rq->runtime_enabled = 0;
4416 if (cfs_rq_throttled(cfs_rq))
4417 unthrottle_cfs_rq(cfs_rq);
4421 #else /* CONFIG_CFS_BANDWIDTH */
4422 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4424 return rq_clock_task(rq_of(cfs_rq));
4427 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
4428 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
4429 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
4430 static inline void sync_throttle(struct task_group *tg, int cpu) {}
4431 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4433 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4438 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4443 static inline int throttled_lb_pair(struct task_group *tg,
4444 int src_cpu, int dest_cpu)
4449 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4451 #ifdef CONFIG_FAIR_GROUP_SCHED
4452 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4455 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4459 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4460 static inline void update_runtime_enabled(struct rq *rq) {}
4461 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
4463 #endif /* CONFIG_CFS_BANDWIDTH */
4465 /**************************************************
4466 * CFS operations on tasks:
4469 #ifdef CONFIG_SCHED_HRTICK
4470 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4472 struct sched_entity *se = &p->se;
4473 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4475 SCHED_WARN_ON(task_rq(p) != rq);
4477 if (rq->cfs.h_nr_running > 1) {
4478 u64 slice = sched_slice(cfs_rq, se);
4479 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4480 s64 delta = slice - ran;
4487 hrtick_start(rq, delta);
4492 * called from enqueue/dequeue and updates the hrtick when the
4493 * current task is from our class and nr_running is low enough
4496 static void hrtick_update(struct rq *rq)
4498 struct task_struct *curr = rq->curr;
4500 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
4503 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
4504 hrtick_start_fair(rq, curr);
4506 #else /* !CONFIG_SCHED_HRTICK */
4508 hrtick_start_fair(struct rq *rq, struct task_struct *p)
4512 static inline void hrtick_update(struct rq *rq)
4518 * The enqueue_task method is called before nr_running is
4519 * increased. Here we update the fair scheduling stats and
4520 * then put the task into the rbtree:
4523 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4525 struct cfs_rq *cfs_rq;
4526 struct sched_entity *se = &p->se;
4528 for_each_sched_entity(se) {
4531 cfs_rq = cfs_rq_of(se);
4532 enqueue_entity(cfs_rq, se, flags);
4535 * end evaluation on encountering a throttled cfs_rq
4537 * note: in the case of encountering a throttled cfs_rq we will
4538 * post the final h_nr_running increment below.
4540 if (cfs_rq_throttled(cfs_rq))
4542 cfs_rq->h_nr_running++;
4544 flags = ENQUEUE_WAKEUP;
4547 for_each_sched_entity(se) {
4548 cfs_rq = cfs_rq_of(se);
4549 cfs_rq->h_nr_running++;
4551 if (cfs_rq_throttled(cfs_rq))
4554 update_load_avg(se, 1);
4555 update_cfs_shares(cfs_rq);
4559 add_nr_running(rq, 1);
4564 static void set_next_buddy(struct sched_entity *se);
4567 * The dequeue_task method is called before nr_running is
4568 * decreased. We remove the task from the rbtree and
4569 * update the fair scheduling stats:
4571 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4573 struct cfs_rq *cfs_rq;
4574 struct sched_entity *se = &p->se;
4575 int task_sleep = flags & DEQUEUE_SLEEP;
4577 for_each_sched_entity(se) {
4578 cfs_rq = cfs_rq_of(se);
4579 dequeue_entity(cfs_rq, se, flags);
4582 * end evaluation on encountering a throttled cfs_rq
4584 * note: in the case of encountering a throttled cfs_rq we will
4585 * post the final h_nr_running decrement below.
4587 if (cfs_rq_throttled(cfs_rq))
4589 cfs_rq->h_nr_running--;
4591 /* Don't dequeue parent if it has other entities besides us */
4592 if (cfs_rq->load.weight) {
4593 /* Avoid re-evaluating load for this entity: */
4594 se = parent_entity(se);
4596 * Bias pick_next to pick a task from this cfs_rq, as
4597 * p is sleeping when it is within its sched_slice.
4599 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
4603 flags |= DEQUEUE_SLEEP;
4606 for_each_sched_entity(se) {
4607 cfs_rq = cfs_rq_of(se);
4608 cfs_rq->h_nr_running--;
4610 if (cfs_rq_throttled(cfs_rq))
4613 update_load_avg(se, 1);
4614 update_cfs_shares(cfs_rq);
4618 sub_nr_running(rq, 1);
4625 /* Working cpumask for: load_balance, load_balance_newidle. */
4626 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
4627 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
4629 #ifdef CONFIG_NO_HZ_COMMON
4631 * per rq 'load' arrray crap; XXX kill this.
4635 * The exact cpuload calculated at every tick would be:
4637 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
4639 * If a cpu misses updates for n ticks (as it was idle) and update gets
4640 * called on the n+1-th tick when cpu may be busy, then we have:
4642 * load_n = (1 - 1/2^i)^n * load_0
4643 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
4645 * decay_load_missed() below does efficient calculation of
4647 * load' = (1 - 1/2^i)^n * load
4649 * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
4650 * This allows us to precompute the above in said factors, thereby allowing the
4651 * reduction of an arbitrary n in O(log_2 n) steps. (See also
4652 * fixed_power_int())
4654 * The calculation is approximated on a 128 point scale.
4656 #define DEGRADE_SHIFT 7
4658 static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
4659 static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
4660 { 0, 0, 0, 0, 0, 0, 0, 0 },
4661 { 64, 32, 8, 0, 0, 0, 0, 0 },
4662 { 96, 72, 40, 12, 1, 0, 0, 0 },
4663 { 112, 98, 75, 43, 15, 1, 0, 0 },
4664 { 120, 112, 98, 76, 45, 16, 2, 0 }
4668 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
4669 * would be when CPU is idle and so we just decay the old load without
4670 * adding any new load.
4672 static unsigned long
4673 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4677 if (!missed_updates)
4680 if (missed_updates >= degrade_zero_ticks[idx])
4684 return load >> missed_updates;
4686 while (missed_updates) {
4687 if (missed_updates % 2)
4688 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
4690 missed_updates >>= 1;
4695 #endif /* CONFIG_NO_HZ_COMMON */
4698 * __cpu_load_update - update the rq->cpu_load[] statistics
4699 * @this_rq: The rq to update statistics for
4700 * @this_load: The current load
4701 * @pending_updates: The number of missed updates
4703 * Update rq->cpu_load[] statistics. This function is usually called every
4704 * scheduler tick (TICK_NSEC).
4706 * This function computes a decaying average:
4708 * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
4710 * Because of NOHZ it might not get called on every tick which gives need for
4711 * the @pending_updates argument.
4713 * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
4714 * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
4715 * = A * (A * load[i]_n-2 + B) + B
4716 * = A * (A * (A * load[i]_n-3 + B) + B) + B
4717 * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
4718 * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
4719 * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
4720 * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
4722 * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
4723 * any change in load would have resulted in the tick being turned back on.
4725 * For regular NOHZ, this reduces to:
4727 * load[i]_n = (1 - 1/2^i)^n * load[i]_0
4729 * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
4732 static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
4733 unsigned long pending_updates)
4735 unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
4738 this_rq->nr_load_updates++;
4740 /* Update our load: */
4741 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
4742 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
4743 unsigned long old_load, new_load;
4745 /* scale is effectively 1 << i now, and >> i divides by scale */
4747 old_load = this_rq->cpu_load[i];
4748 #ifdef CONFIG_NO_HZ_COMMON
4749 old_load = decay_load_missed(old_load, pending_updates - 1, i);
4750 if (tickless_load) {
4751 old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
4753 * old_load can never be a negative value because a
4754 * decayed tickless_load cannot be greater than the
4755 * original tickless_load.
4757 old_load += tickless_load;
4760 new_load = this_load;
4762 * Round up the averaging division if load is increasing. This
4763 * prevents us from getting stuck on 9 if the load is 10, for
4766 if (new_load > old_load)
4767 new_load += scale - 1;
4769 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
4772 sched_avg_update(this_rq);
4775 /* Used instead of source_load when we know the type == 0 */
4776 static unsigned long weighted_cpuload(const int cpu)
4778 return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
4781 #ifdef CONFIG_NO_HZ_COMMON
4783 * There is no sane way to deal with nohz on smp when using jiffies because the
4784 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
4785 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
4787 * Therefore we need to avoid the delta approach from the regular tick when
4788 * possible since that would seriously skew the load calculation. This is why we
4789 * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
4790 * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
4791 * loop exit, nohz_idle_balance, nohz full exit...)
4793 * This means we might still be one tick off for nohz periods.
4796 static void cpu_load_update_nohz(struct rq *this_rq,
4797 unsigned long curr_jiffies,
4800 unsigned long pending_updates;
4802 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4803 if (pending_updates) {
4804 this_rq->last_load_update_tick = curr_jiffies;
4806 * In the regular NOHZ case, we were idle, this means load 0.
4807 * In the NOHZ_FULL case, we were non-idle, we should consider
4808 * its weighted load.
4810 cpu_load_update(this_rq, load, pending_updates);
4815 * Called from nohz_idle_balance() to update the load ratings before doing the
4818 static void cpu_load_update_idle(struct rq *this_rq)
4821 * bail if there's load or we're actually up-to-date.
4823 if (weighted_cpuload(cpu_of(this_rq)))
4826 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
4830 * Record CPU load on nohz entry so we know the tickless load to account
4831 * on nohz exit. cpu_load[0] happens then to be updated more frequently
4832 * than other cpu_load[idx] but it should be fine as cpu_load readers
4833 * shouldn't rely into synchronized cpu_load[*] updates.
4835 void cpu_load_update_nohz_start(void)
4837 struct rq *this_rq = this_rq();
4840 * This is all lockless but should be fine. If weighted_cpuload changes
4841 * concurrently we'll exit nohz. And cpu_load write can race with
4842 * cpu_load_update_idle() but both updater would be writing the same.
4844 this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
4848 * Account the tickless load in the end of a nohz frame.
4850 void cpu_load_update_nohz_stop(void)
4852 unsigned long curr_jiffies = READ_ONCE(jiffies);
4853 struct rq *this_rq = this_rq();
4856 if (curr_jiffies == this_rq->last_load_update_tick)
4859 load = weighted_cpuload(cpu_of(this_rq));
4860 raw_spin_lock(&this_rq->lock);
4861 update_rq_clock(this_rq);
4862 cpu_load_update_nohz(this_rq, curr_jiffies, load);
4863 raw_spin_unlock(&this_rq->lock);
4865 #else /* !CONFIG_NO_HZ_COMMON */
4866 static inline void cpu_load_update_nohz(struct rq *this_rq,
4867 unsigned long curr_jiffies,
4868 unsigned long load) { }
4869 #endif /* CONFIG_NO_HZ_COMMON */
4871 static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
4873 #ifdef CONFIG_NO_HZ_COMMON
4874 /* See the mess around cpu_load_update_nohz(). */
4875 this_rq->last_load_update_tick = READ_ONCE(jiffies);
4877 cpu_load_update(this_rq, load, 1);
4881 * Called from scheduler_tick()
4883 void cpu_load_update_active(struct rq *this_rq)
4885 unsigned long load = weighted_cpuload(cpu_of(this_rq));
4887 if (tick_nohz_tick_stopped())
4888 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
4890 cpu_load_update_periodic(this_rq, load);
4894 * Return a low guess at the load of a migration-source cpu weighted
4895 * according to the scheduling class and "nice" value.
4897 * We want to under-estimate the load of migration sources, to
4898 * balance conservatively.
4900 static unsigned long source_load(int cpu, int type)
4902 struct rq *rq = cpu_rq(cpu);
4903 unsigned long total = weighted_cpuload(cpu);
4905 if (type == 0 || !sched_feat(LB_BIAS))
4908 return min(rq->cpu_load[type-1], total);
4912 * Return a high guess at the load of a migration-target cpu weighted
4913 * according to the scheduling class and "nice" value.
4915 static unsigned long target_load(int cpu, int type)
4917 struct rq *rq = cpu_rq(cpu);
4918 unsigned long total = weighted_cpuload(cpu);
4920 if (type == 0 || !sched_feat(LB_BIAS))
4923 return max(rq->cpu_load[type-1], total);
4926 static unsigned long capacity_of(int cpu)
4928 return cpu_rq(cpu)->cpu_capacity;
4931 static unsigned long capacity_orig_of(int cpu)
4933 return cpu_rq(cpu)->cpu_capacity_orig;
4936 static unsigned long cpu_avg_load_per_task(int cpu)
4938 struct rq *rq = cpu_rq(cpu);
4939 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
4940 unsigned long load_avg = weighted_cpuload(cpu);
4943 return load_avg / nr_running;
4948 #ifdef CONFIG_FAIR_GROUP_SCHED
4950 * effective_load() calculates the load change as seen from the root_task_group
4952 * Adding load to a group doesn't make a group heavier, but can cause movement
4953 * of group shares between cpus. Assuming the shares were perfectly aligned one
4954 * can calculate the shift in shares.
4956 * Calculate the effective load difference if @wl is added (subtracted) to @tg
4957 * on this @cpu and results in a total addition (subtraction) of @wg to the
4958 * total group weight.
4960 * Given a runqueue weight distribution (rw_i) we can compute a shares
4961 * distribution (s_i) using:
4963 * s_i = rw_i / \Sum rw_j (1)
4965 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
4966 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
4967 * shares distribution (s_i):
4969 * rw_i = { 2, 4, 1, 0 }
4970 * s_i = { 2/7, 4/7, 1/7, 0 }
4972 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
4973 * task used to run on and the CPU the waker is running on), we need to
4974 * compute the effect of waking a task on either CPU and, in case of a sync
4975 * wakeup, compute the effect of the current task going to sleep.
4977 * So for a change of @wl to the local @cpu with an overall group weight change
4978 * of @wl we can compute the new shares distribution (s'_i) using:
4980 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
4982 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
4983 * differences in waking a task to CPU 0. The additional task changes the
4984 * weight and shares distributions like:
4986 * rw'_i = { 3, 4, 1, 0 }
4987 * s'_i = { 3/8, 4/8, 1/8, 0 }
4989 * We can then compute the difference in effective weight by using:
4991 * dw_i = S * (s'_i - s_i) (3)
4993 * Where 'S' is the group weight as seen by its parent.
4995 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
4996 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
4997 * 4/7) times the weight of the group.
4999 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
5001 struct sched_entity *se = tg->se[cpu];
5003 if (!tg->parent) /* the trivial, non-cgroup case */
5006 for_each_sched_entity(se) {
5007 struct cfs_rq *cfs_rq = se->my_q;
5008 long W, w = cfs_rq_load_avg(cfs_rq);
5013 * W = @wg + \Sum rw_j
5015 W = wg + atomic_long_read(&tg->load_avg);
5017 /* Ensure \Sum rw_j >= rw_i */
5018 W -= cfs_rq->tg_load_avg_contrib;
5027 * wl = S * s'_i; see (2)
5030 wl = (w * (long)scale_load_down(tg->shares)) / W;
5032 wl = scale_load_down(tg->shares);
5035 * Per the above, wl is the new se->load.weight value; since
5036 * those are clipped to [MIN_SHARES, ...) do so now. See
5037 * calc_cfs_shares().
5039 if (wl < MIN_SHARES)
5043 * wl = dw_i = S * (s'_i - s_i); see (3)
5045 wl -= se->avg.load_avg;
5048 * Recursively apply this logic to all parent groups to compute
5049 * the final effective load change on the root group. Since
5050 * only the @tg group gets extra weight, all parent groups can
5051 * only redistribute existing shares. @wl is the shift in shares
5052 * resulting from this level per the above.
5061 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
5068 static void record_wakee(struct task_struct *p)
5071 * Only decay a single time; tasks that have less then 1 wakeup per
5072 * jiffy will not have built up many flips.
5074 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5075 current->wakee_flips >>= 1;
5076 current->wakee_flip_decay_ts = jiffies;
5079 if (current->last_wakee != p) {
5080 current->last_wakee = p;
5081 current->wakee_flips++;
5086 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
5088 * A waker of many should wake a different task than the one last awakened
5089 * at a frequency roughly N times higher than one of its wakees.
5091 * In order to determine whether we should let the load spread vs consolidating
5092 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
5093 * partner, and a factor of lls_size higher frequency in the other.
5095 * With both conditions met, we can be relatively sure that the relationship is
5096 * non-monogamous, with partner count exceeding socket size.
5098 * Waker/wakee being client/server, worker/dispatcher, interrupt source or
5099 * whatever is irrelevant, spread criteria is apparent partner count exceeds
5102 static int wake_wide(struct task_struct *p)
5104 unsigned int master = current->wakee_flips;
5105 unsigned int slave = p->wakee_flips;
5106 int factor = this_cpu_read(sd_llc_size);
5109 swap(master, slave);
5110 if (slave < factor || master < slave * factor)
5115 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5116 int prev_cpu, int sync)
5118 s64 this_load, load;
5119 s64 this_eff_load, prev_eff_load;
5121 struct task_group *tg;
5122 unsigned long weight;
5126 this_cpu = smp_processor_id();
5127 load = source_load(prev_cpu, idx);
5128 this_load = target_load(this_cpu, idx);
5131 * If sync wakeup then subtract the (maximum possible)
5132 * effect of the currently running task from the load
5133 * of the current CPU:
5136 tg = task_group(current);
5137 weight = current->se.avg.load_avg;
5139 this_load += effective_load(tg, this_cpu, -weight, -weight);
5140 load += effective_load(tg, prev_cpu, 0, -weight);
5144 weight = p->se.avg.load_avg;
5147 * In low-load situations, where prev_cpu is idle and this_cpu is idle
5148 * due to the sync cause above having dropped this_load to 0, we'll
5149 * always have an imbalance, but there's really nothing you can do
5150 * about that, so that's good too.
5152 * Otherwise check if either cpus are near enough in load to allow this
5153 * task to be woken on this_cpu.
5155 this_eff_load = 100;
5156 this_eff_load *= capacity_of(prev_cpu);
5158 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
5159 prev_eff_load *= capacity_of(this_cpu);
5161 if (this_load > 0) {
5162 this_eff_load *= this_load +
5163 effective_load(tg, this_cpu, weight, weight);
5165 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
5168 balanced = this_eff_load <= prev_eff_load;
5170 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5175 schedstat_inc(sd->ttwu_move_affine);
5176 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5182 * find_idlest_group finds and returns the least busy CPU group within the
5185 static struct sched_group *
5186 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5187 int this_cpu, int sd_flag)
5189 struct sched_group *idlest = NULL, *group = sd->groups;
5190 unsigned long min_load = ULONG_MAX, this_load = 0;
5191 int load_idx = sd->forkexec_idx;
5192 int imbalance = 100 + (sd->imbalance_pct-100)/2;
5194 if (sd_flag & SD_BALANCE_WAKE)
5195 load_idx = sd->wake_idx;
5198 unsigned long load, avg_load;
5202 /* Skip over this group if it has no CPUs allowed */
5203 if (!cpumask_intersects(sched_group_cpus(group),
5204 tsk_cpus_allowed(p)))
5207 local_group = cpumask_test_cpu(this_cpu,
5208 sched_group_cpus(group));
5210 /* Tally up the load of all CPUs in the group */
5213 for_each_cpu(i, sched_group_cpus(group)) {
5214 /* Bias balancing toward cpus of our domain */
5216 load = source_load(i, load_idx);
5218 load = target_load(i, load_idx);
5223 /* Adjust by relative CPU capacity of the group */
5224 avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
5227 this_load = avg_load;
5228 } else if (avg_load < min_load) {
5229 min_load = avg_load;
5232 } while (group = group->next, group != sd->groups);
5234 if (!idlest || 100*this_load < imbalance*min_load)
5240 * find_idlest_cpu - find the idlest cpu among the cpus in group.
5243 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5245 unsigned long load, min_load = ULONG_MAX;
5246 unsigned int min_exit_latency = UINT_MAX;
5247 u64 latest_idle_timestamp = 0;
5248 int least_loaded_cpu = this_cpu;
5249 int shallowest_idle_cpu = -1;
5252 /* Check if we have any choice: */
5253 if (group->group_weight == 1)
5254 return cpumask_first(sched_group_cpus(group));
5256 /* Traverse only the allowed CPUs */
5257 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
5259 struct rq *rq = cpu_rq(i);
5260 struct cpuidle_state *idle = idle_get_state(rq);
5261 if (idle && idle->exit_latency < min_exit_latency) {
5263 * We give priority to a CPU whose idle state
5264 * has the smallest exit latency irrespective
5265 * of any idle timestamp.
5267 min_exit_latency = idle->exit_latency;
5268 latest_idle_timestamp = rq->idle_stamp;
5269 shallowest_idle_cpu = i;
5270 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
5271 rq->idle_stamp > latest_idle_timestamp) {
5273 * If equal or no active idle state, then
5274 * the most recently idled CPU might have
5277 latest_idle_timestamp = rq->idle_stamp;
5278 shallowest_idle_cpu = i;
5280 } else if (shallowest_idle_cpu == -1) {
5281 load = weighted_cpuload(i);
5282 if (load < min_load || (load == min_load && i == this_cpu)) {
5284 least_loaded_cpu = i;
5289 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
5293 * Implement a for_each_cpu() variant that starts the scan at a given cpu
5294 * (@start), and wraps around.
5296 * This is used to scan for idle CPUs; such that not all CPUs looking for an
5297 * idle CPU find the same CPU. The down-side is that tasks tend to cycle
5298 * through the LLC domain.
5300 * Especially tbench is found sensitive to this.
5303 static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
5308 next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
5312 return nr_cpumask_bits;
5314 if (next >= nr_cpumask_bits) {
5324 #define for_each_cpu_wrap(cpu, mask, start, wrap) \
5325 for ((wrap) = 0, (cpu) = (start)-1; \
5326 (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
5327 (cpu) < nr_cpumask_bits; )
5329 #ifdef CONFIG_SCHED_SMT
5331 static inline void set_idle_cores(int cpu, int val)
5333 struct sched_domain_shared *sds;
5335 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5337 WRITE_ONCE(sds->has_idle_cores, val);
5340 static inline bool test_idle_cores(int cpu, bool def)
5342 struct sched_domain_shared *sds;
5344 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5346 return READ_ONCE(sds->has_idle_cores);
5352 * Scans the local SMT mask to see if the entire core is idle, and records this
5353 * information in sd_llc_shared->has_idle_cores.
5355 * Since SMT siblings share all cache levels, inspecting this limited remote
5356 * state should be fairly cheap.
5358 void __update_idle_core(struct rq *rq)
5360 int core = cpu_of(rq);
5364 if (test_idle_cores(core, true))
5367 for_each_cpu(cpu, cpu_smt_mask(core)) {
5375 set_idle_cores(core, 1);
5381 * Scan the entire LLC domain for idle cores; this dynamically switches off if
5382 * there are no idle cores left in the system; tracked through
5383 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
5385 static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5387 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
5388 int core, cpu, wrap;
5390 if (!static_branch_likely(&sched_smt_present))
5393 if (!test_idle_cores(target, false))
5396 cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
5398 for_each_cpu_wrap(core, cpus, target, wrap) {
5401 for_each_cpu(cpu, cpu_smt_mask(core)) {
5402 cpumask_clear_cpu(cpu, cpus);
5412 * Failed to find an idle core; stop looking for one.
5414 set_idle_cores(target, 0);
5420 * Scan the local SMT mask for idle CPUs.
5422 static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
5426 if (!static_branch_likely(&sched_smt_present))
5429 for_each_cpu(cpu, cpu_smt_mask(target)) {
5430 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
5439 #else /* CONFIG_SCHED_SMT */
5441 static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5446 static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
5451 #endif /* CONFIG_SCHED_SMT */
5454 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
5455 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
5456 * average idle time for this rq (as found in rq->avg_idle).
5458 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
5460 struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
5461 u64 avg_idle = this_rq()->avg_idle;
5462 u64 avg_cost = this_sd->avg_scan_cost;
5468 * Due to large variance we need a large fuzz factor; hackbench in
5469 * particularly is sensitive here.
5471 if ((avg_idle / 512) < avg_cost)
5474 time = local_clock();
5476 for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
5477 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
5483 time = local_clock() - time;
5484 cost = this_sd->avg_scan_cost;
5485 delta = (s64)(time - cost) / 8;
5486 this_sd->avg_scan_cost += delta;
5492 * Try and locate an idle core/thread in the LLC cache domain.
5494 static int select_idle_sibling(struct task_struct *p, int prev, int target)
5496 struct sched_domain *sd;
5499 if (idle_cpu(target))
5503 * If the previous cpu is cache affine and idle, don't be stupid.
5505 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
5508 sd = rcu_dereference(per_cpu(sd_llc, target));
5512 i = select_idle_core(p, sd, target);
5513 if ((unsigned)i < nr_cpumask_bits)
5516 i = select_idle_cpu(p, sd, target);
5517 if ((unsigned)i < nr_cpumask_bits)
5520 i = select_idle_smt(p, sd, target);
5521 if ((unsigned)i < nr_cpumask_bits)
5528 * cpu_util returns the amount of capacity of a CPU that is used by CFS
5529 * tasks. The unit of the return value must be the one of capacity so we can
5530 * compare the utilization with the capacity of the CPU that is available for
5531 * CFS task (ie cpu_capacity).
5533 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
5534 * recent utilization of currently non-runnable tasks on a CPU. It represents
5535 * the amount of utilization of a CPU in the range [0..capacity_orig] where
5536 * capacity_orig is the cpu_capacity available at the highest frequency
5537 * (arch_scale_freq_capacity()).
5538 * The utilization of a CPU converges towards a sum equal to or less than the
5539 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
5540 * the running time on this CPU scaled by capacity_curr.
5542 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
5543 * higher than capacity_orig because of unfortunate rounding in
5544 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
5545 * the average stabilizes with the new running time. We need to check that the
5546 * utilization stays within the range of [0..capacity_orig] and cap it if
5547 * necessary. Without utilization capping, a group could be seen as overloaded
5548 * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
5549 * available capacity. We allow utilization to overshoot capacity_curr (but not
5550 * capacity_orig) as it useful for predicting the capacity required after task
5551 * migrations (scheduler-driven DVFS).
5553 static int cpu_util(int cpu)
5555 unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
5556 unsigned long capacity = capacity_orig_of(cpu);
5558 return (util >= capacity) ? capacity : util;
5561 static inline int task_util(struct task_struct *p)
5563 return p->se.avg.util_avg;
5567 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
5568 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
5570 * In that case WAKE_AFFINE doesn't make sense and we'll let
5571 * BALANCE_WAKE sort things out.
5573 static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
5575 long min_cap, max_cap;
5577 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
5578 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
5580 /* Minimum capacity is close to max, no need to abort wake_affine */
5581 if (max_cap - min_cap < max_cap >> 3)
5584 return min_cap * 1024 < task_util(p) * capacity_margin;
5588 * select_task_rq_fair: Select target runqueue for the waking task in domains
5589 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
5590 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
5592 * Balances load by selecting the idlest cpu in the idlest group, or under
5593 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
5595 * Returns the target cpu number.
5597 * preempt must be disabled.
5600 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
5602 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
5603 int cpu = smp_processor_id();
5604 int new_cpu = prev_cpu;
5605 int want_affine = 0;
5606 int sync = wake_flags & WF_SYNC;
5608 if (sd_flag & SD_BALANCE_WAKE) {
5610 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
5611 && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
5615 for_each_domain(cpu, tmp) {
5616 if (!(tmp->flags & SD_LOAD_BALANCE))
5620 * If both cpu and prev_cpu are part of this domain,
5621 * cpu is a valid SD_WAKE_AFFINE target.
5623 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
5624 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
5629 if (tmp->flags & sd_flag)
5631 else if (!want_affine)
5636 sd = NULL; /* Prefer wake_affine over balance flags */
5637 if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
5642 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
5643 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
5646 struct sched_group *group;
5649 if (!(sd->flags & sd_flag)) {
5654 group = find_idlest_group(sd, p, cpu, sd_flag);
5660 new_cpu = find_idlest_cpu(group, p, cpu);
5661 if (new_cpu == -1 || new_cpu == cpu) {
5662 /* Now try balancing at a lower domain level of cpu */
5667 /* Now try balancing at a lower domain level of new_cpu */
5669 weight = sd->span_weight;
5671 for_each_domain(cpu, tmp) {
5672 if (weight <= tmp->span_weight)
5674 if (tmp->flags & sd_flag)
5677 /* while loop will break here if sd == NULL */
5685 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
5686 * cfs_rq_of(p) references at time of call are still valid and identify the
5687 * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
5689 static void migrate_task_rq_fair(struct task_struct *p)
5692 * As blocked tasks retain absolute vruntime the migration needs to
5693 * deal with this by subtracting the old and adding the new
5694 * min_vruntime -- the latter is done by enqueue_entity() when placing
5695 * the task on the new runqueue.
5697 if (p->state == TASK_WAKING) {
5698 struct sched_entity *se = &p->se;
5699 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5702 #ifndef CONFIG_64BIT
5703 u64 min_vruntime_copy;
5706 min_vruntime_copy = cfs_rq->min_vruntime_copy;
5708 min_vruntime = cfs_rq->min_vruntime;
5709 } while (min_vruntime != min_vruntime_copy);
5711 min_vruntime = cfs_rq->min_vruntime;
5714 se->vruntime -= min_vruntime;
5718 * We are supposed to update the task to "current" time, then its up to date
5719 * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
5720 * what current time is, so simply throw away the out-of-date time. This
5721 * will result in the wakee task is less decayed, but giving the wakee more
5722 * load sounds not bad.
5724 remove_entity_load_avg(&p->se);
5726 /* Tell new CPU we are migrated */
5727 p->se.avg.last_update_time = 0;
5729 /* We have migrated, no longer consider this task hot */
5730 p->se.exec_start = 0;
5733 static void task_dead_fair(struct task_struct *p)
5735 remove_entity_load_avg(&p->se);
5737 #endif /* CONFIG_SMP */
5739 static unsigned long
5740 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
5742 unsigned long gran = sysctl_sched_wakeup_granularity;
5745 * Since its curr running now, convert the gran from real-time
5746 * to virtual-time in his units.
5748 * By using 'se' instead of 'curr' we penalize light tasks, so
5749 * they get preempted easier. That is, if 'se' < 'curr' then
5750 * the resulting gran will be larger, therefore penalizing the
5751 * lighter, if otoh 'se' > 'curr' then the resulting gran will
5752 * be smaller, again penalizing the lighter task.
5754 * This is especially important for buddies when the leftmost
5755 * task is higher priority than the buddy.
5757 return calc_delta_fair(gran, se);
5761 * Should 'se' preempt 'curr'.
5775 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
5777 s64 gran, vdiff = curr->vruntime - se->vruntime;
5782 gran = wakeup_gran(curr, se);
5789 static void set_last_buddy(struct sched_entity *se)
5791 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5794 for_each_sched_entity(se)
5795 cfs_rq_of(se)->last = se;
5798 static void set_next_buddy(struct sched_entity *se)
5800 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5803 for_each_sched_entity(se)
5804 cfs_rq_of(se)->next = se;
5807 static void set_skip_buddy(struct sched_entity *se)
5809 for_each_sched_entity(se)
5810 cfs_rq_of(se)->skip = se;
5814 * Preempt the current task with a newly woken task if needed:
5816 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
5818 struct task_struct *curr = rq->curr;
5819 struct sched_entity *se = &curr->se, *pse = &p->se;
5820 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5821 int scale = cfs_rq->nr_running >= sched_nr_latency;
5822 int next_buddy_marked = 0;
5824 if (unlikely(se == pse))
5828 * This is possible from callers such as attach_tasks(), in which we
5829 * unconditionally check_prempt_curr() after an enqueue (which may have
5830 * lead to a throttle). This both saves work and prevents false
5831 * next-buddy nomination below.
5833 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
5836 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
5837 set_next_buddy(pse);
5838 next_buddy_marked = 1;
5842 * We can come here with TIF_NEED_RESCHED already set from new task
5845 * Note: this also catches the edge-case of curr being in a throttled
5846 * group (e.g. via set_curr_task), since update_curr() (in the
5847 * enqueue of curr) will have resulted in resched being set. This
5848 * prevents us from potentially nominating it as a false LAST_BUDDY
5851 if (test_tsk_need_resched(curr))
5854 /* Idle tasks are by definition preempted by non-idle tasks. */
5855 if (unlikely(curr->policy == SCHED_IDLE) &&
5856 likely(p->policy != SCHED_IDLE))
5860 * Batch and idle tasks do not preempt non-idle tasks (their preemption
5861 * is driven by the tick):
5863 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
5866 find_matching_se(&se, &pse);
5867 update_curr(cfs_rq_of(se));
5869 if (wakeup_preempt_entity(se, pse) == 1) {
5871 * Bias pick_next to pick the sched entity that is
5872 * triggering this preemption.
5874 if (!next_buddy_marked)
5875 set_next_buddy(pse);
5884 * Only set the backward buddy when the current task is still
5885 * on the rq. This can happen when a wakeup gets interleaved
5886 * with schedule on the ->pre_schedule() or idle_balance()
5887 * point, either of which can * drop the rq lock.
5889 * Also, during early boot the idle thread is in the fair class,
5890 * for obvious reasons its a bad idea to schedule back to it.
5892 if (unlikely(!se->on_rq || curr == rq->idle))
5895 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
5899 static struct task_struct *
5900 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
5902 struct cfs_rq *cfs_rq = &rq->cfs;
5903 struct sched_entity *se;
5904 struct task_struct *p;
5908 #ifdef CONFIG_FAIR_GROUP_SCHED
5909 if (!cfs_rq->nr_running)
5912 if (prev->sched_class != &fair_sched_class)
5916 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
5917 * likely that a next task is from the same cgroup as the current.
5919 * Therefore attempt to avoid putting and setting the entire cgroup
5920 * hierarchy, only change the part that actually changes.
5924 struct sched_entity *curr = cfs_rq->curr;
5927 * Since we got here without doing put_prev_entity() we also
5928 * have to consider cfs_rq->curr. If it is still a runnable
5929 * entity, update_curr() will update its vruntime, otherwise
5930 * forget we've ever seen it.
5934 update_curr(cfs_rq);
5939 * This call to check_cfs_rq_runtime() will do the
5940 * throttle and dequeue its entity in the parent(s).
5941 * Therefore the 'simple' nr_running test will indeed
5944 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
5948 se = pick_next_entity(cfs_rq, curr);
5949 cfs_rq = group_cfs_rq(se);
5955 * Since we haven't yet done put_prev_entity and if the selected task
5956 * is a different task than we started out with, try and touch the
5957 * least amount of cfs_rqs.
5960 struct sched_entity *pse = &prev->se;
5962 while (!(cfs_rq = is_same_group(se, pse))) {
5963 int se_depth = se->depth;
5964 int pse_depth = pse->depth;
5966 if (se_depth <= pse_depth) {
5967 put_prev_entity(cfs_rq_of(pse), pse);
5968 pse = parent_entity(pse);
5970 if (se_depth >= pse_depth) {
5971 set_next_entity(cfs_rq_of(se), se);
5972 se = parent_entity(se);
5976 put_prev_entity(cfs_rq, pse);
5977 set_next_entity(cfs_rq, se);
5980 if (hrtick_enabled(rq))
5981 hrtick_start_fair(rq, p);
5988 if (!cfs_rq->nr_running)
5991 put_prev_task(rq, prev);
5994 se = pick_next_entity(cfs_rq, NULL);
5995 set_next_entity(cfs_rq, se);
5996 cfs_rq = group_cfs_rq(se);
6001 if (hrtick_enabled(rq))
6002 hrtick_start_fair(rq, p);
6008 * This is OK, because current is on_cpu, which avoids it being picked
6009 * for load-balance and preemption/IRQs are still disabled avoiding
6010 * further scheduler activity on it and we're being very careful to
6011 * re-start the picking loop.
6013 lockdep_unpin_lock(&rq->lock, cookie);
6014 new_tasks = idle_balance(rq);
6015 lockdep_repin_lock(&rq->lock, cookie);
6017 * Because idle_balance() releases (and re-acquires) rq->lock, it is
6018 * possible for any higher priority task to appear. In that case we
6019 * must re-start the pick_next_entity() loop.
6031 * Account for a descheduled task:
6033 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
6035 struct sched_entity *se = &prev->se;
6036 struct cfs_rq *cfs_rq;
6038 for_each_sched_entity(se) {
6039 cfs_rq = cfs_rq_of(se);
6040 put_prev_entity(cfs_rq, se);
6045 * sched_yield() is very simple
6047 * The magic of dealing with the ->skip buddy is in pick_next_entity.
6049 static void yield_task_fair(struct rq *rq)
6051 struct task_struct *curr = rq->curr;
6052 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6053 struct sched_entity *se = &curr->se;
6056 * Are we the only task in the tree?
6058 if (unlikely(rq->nr_running == 1))
6061 clear_buddies(cfs_rq, se);
6063 if (curr->policy != SCHED_BATCH) {
6064 update_rq_clock(rq);
6066 * Update run-time statistics of the 'current'.
6068 update_curr(cfs_rq);
6070 * Tell update_rq_clock() that we've just updated,
6071 * so we don't do microscopic update in schedule()
6072 * and double the fastpath cost.
6074 rq_clock_skip_update(rq, true);
6080 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
6082 struct sched_entity *se = &p->se;
6084 /* throttled hierarchies are not runnable */
6085 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
6088 /* Tell the scheduler that we'd really like pse to run next. */
6091 yield_task_fair(rq);
6097 /**************************************************
6098 * Fair scheduling class load-balancing methods.
6102 * The purpose of load-balancing is to achieve the same basic fairness the
6103 * per-cpu scheduler provides, namely provide a proportional amount of compute
6104 * time to each task. This is expressed in the following equation:
6106 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
6108 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
6109 * W_i,0 is defined as:
6111 * W_i,0 = \Sum_j w_i,j (2)
6113 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
6114 * is derived from the nice value as per sched_prio_to_weight[].
6116 * The weight average is an exponential decay average of the instantaneous
6119 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
6121 * C_i is the compute capacity of cpu i, typically it is the
6122 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
6123 * can also include other factors [XXX].
6125 * To achieve this balance we define a measure of imbalance which follows
6126 * directly from (1):
6128 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
6130 * We them move tasks around to minimize the imbalance. In the continuous
6131 * function space it is obvious this converges, in the discrete case we get
6132 * a few fun cases generally called infeasible weight scenarios.
6135 * - infeasible weights;
6136 * - local vs global optima in the discrete case. ]
6141 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
6142 * for all i,j solution, we create a tree of cpus that follows the hardware
6143 * topology where each level pairs two lower groups (or better). This results
6144 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
6145 * tree to only the first of the previous level and we decrease the frequency
6146 * of load-balance at each level inv. proportional to the number of cpus in
6152 * \Sum { --- * --- * 2^i } = O(n) (5)
6154 * `- size of each group
6155 * | | `- number of cpus doing load-balance
6157 * `- sum over all levels
6159 * Coupled with a limit on how many tasks we can migrate every balance pass,
6160 * this makes (5) the runtime complexity of the balancer.
6162 * An important property here is that each CPU is still (indirectly) connected
6163 * to every other cpu in at most O(log n) steps:
6165 * The adjacency matrix of the resulting graph is given by:
6168 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
6171 * And you'll find that:
6173 * A^(log_2 n)_i,j != 0 for all i,j (7)
6175 * Showing there's indeed a path between every cpu in at most O(log n) steps.
6176 * The task movement gives a factor of O(m), giving a convergence complexity
6179 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
6184 * In order to avoid CPUs going idle while there's still work to do, new idle
6185 * balancing is more aggressive and has the newly idle cpu iterate up the domain
6186 * tree itself instead of relying on other CPUs to bring it work.
6188 * This adds some complexity to both (5) and (8) but it reduces the total idle
6196 * Cgroups make a horror show out of (2), instead of a simple sum we get:
6199 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
6204 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
6206 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
6208 * The big problem is S_k, its a global sum needed to compute a local (W_i)
6211 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
6212 * rewrite all of this once again.]
6215 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
6217 enum fbq_type { regular, remote, all };
6219 #define LBF_ALL_PINNED 0x01
6220 #define LBF_NEED_BREAK 0x02
6221 #define LBF_DST_PINNED 0x04
6222 #define LBF_SOME_PINNED 0x08
6225 struct sched_domain *sd;
6233 struct cpumask *dst_grpmask;
6235 enum cpu_idle_type idle;
6237 /* The set of CPUs under consideration for load-balancing */
6238 struct cpumask *cpus;
6243 unsigned int loop_break;
6244 unsigned int loop_max;
6246 enum fbq_type fbq_type;
6247 struct list_head tasks;
6251 * Is this task likely cache-hot:
6253 static int task_hot(struct task_struct *p, struct lb_env *env)
6257 lockdep_assert_held(&env->src_rq->lock);
6259 if (p->sched_class != &fair_sched_class)
6262 if (unlikely(p->policy == SCHED_IDLE))
6266 * Buddy candidates are cache hot:
6268 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
6269 (&p->se == cfs_rq_of(&p->se)->next ||
6270 &p->se == cfs_rq_of(&p->se)->last))
6273 if (sysctl_sched_migration_cost == -1)
6275 if (sysctl_sched_migration_cost == 0)
6278 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
6280 return delta < (s64)sysctl_sched_migration_cost;
6283 #ifdef CONFIG_NUMA_BALANCING
6285 * Returns 1, if task migration degrades locality
6286 * Returns 0, if task migration improves locality i.e migration preferred.
6287 * Returns -1, if task migration is not affected by locality.
6289 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
6291 struct numa_group *numa_group = rcu_dereference(p->numa_group);
6292 unsigned long src_faults, dst_faults;
6293 int src_nid, dst_nid;
6295 if (!static_branch_likely(&sched_numa_balancing))
6298 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
6301 src_nid = cpu_to_node(env->src_cpu);
6302 dst_nid = cpu_to_node(env->dst_cpu);
6304 if (src_nid == dst_nid)
6307 /* Migrating away from the preferred node is always bad. */
6308 if (src_nid == p->numa_preferred_nid) {
6309 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
6315 /* Encourage migration to the preferred node. */
6316 if (dst_nid == p->numa_preferred_nid)
6320 src_faults = group_faults(p, src_nid);
6321 dst_faults = group_faults(p, dst_nid);
6323 src_faults = task_faults(p, src_nid);
6324 dst_faults = task_faults(p, dst_nid);
6327 return dst_faults < src_faults;
6331 static inline int migrate_degrades_locality(struct task_struct *p,
6339 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
6342 int can_migrate_task(struct task_struct *p, struct lb_env *env)
6346 lockdep_assert_held(&env->src_rq->lock);
6349 * We do not migrate tasks that are:
6350 * 1) throttled_lb_pair, or
6351 * 2) cannot be migrated to this CPU due to cpus_allowed, or
6352 * 3) running (obviously), or
6353 * 4) are cache-hot on their current CPU.
6355 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
6358 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
6361 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
6363 env->flags |= LBF_SOME_PINNED;
6366 * Remember if this task can be migrated to any other cpu in
6367 * our sched_group. We may want to revisit it if we couldn't
6368 * meet load balance goals by pulling other tasks on src_cpu.
6370 * Also avoid computing new_dst_cpu if we have already computed
6371 * one in current iteration.
6373 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
6376 /* Prevent to re-select dst_cpu via env's cpus */
6377 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
6378 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
6379 env->flags |= LBF_DST_PINNED;
6380 env->new_dst_cpu = cpu;
6388 /* Record that we found atleast one task that could run on dst_cpu */
6389 env->flags &= ~LBF_ALL_PINNED;
6391 if (task_running(env->src_rq, p)) {
6392 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
6397 * Aggressive migration if:
6398 * 1) destination numa is preferred
6399 * 2) task is cache cold, or
6400 * 3) too many balance attempts have failed.
6402 tsk_cache_hot = migrate_degrades_locality(p, env);
6403 if (tsk_cache_hot == -1)
6404 tsk_cache_hot = task_hot(p, env);
6406 if (tsk_cache_hot <= 0 ||
6407 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
6408 if (tsk_cache_hot == 1) {
6409 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
6410 schedstat_inc(p->se.statistics.nr_forced_migrations);
6415 schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
6420 * detach_task() -- detach the task for the migration specified in env
6422 static void detach_task(struct task_struct *p, struct lb_env *env)
6424 lockdep_assert_held(&env->src_rq->lock);
6426 p->on_rq = TASK_ON_RQ_MIGRATING;
6427 deactivate_task(env->src_rq, p, 0);
6428 set_task_cpu(p, env->dst_cpu);
6432 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
6433 * part of active balancing operations within "domain".
6435 * Returns a task if successful and NULL otherwise.
6437 static struct task_struct *detach_one_task(struct lb_env *env)
6439 struct task_struct *p, *n;
6441 lockdep_assert_held(&env->src_rq->lock);
6443 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
6444 if (!can_migrate_task(p, env))
6447 detach_task(p, env);
6450 * Right now, this is only the second place where
6451 * lb_gained[env->idle] is updated (other is detach_tasks)
6452 * so we can safely collect stats here rather than
6453 * inside detach_tasks().
6455 schedstat_inc(env->sd->lb_gained[env->idle]);
6461 static const unsigned int sched_nr_migrate_break = 32;
6464 * detach_tasks() -- tries to detach up to imbalance weighted load from
6465 * busiest_rq, as part of a balancing operation within domain "sd".
6467 * Returns number of detached tasks if successful and 0 otherwise.
6469 static int detach_tasks(struct lb_env *env)
6471 struct list_head *tasks = &env->src_rq->cfs_tasks;
6472 struct task_struct *p;
6476 lockdep_assert_held(&env->src_rq->lock);
6478 if (env->imbalance <= 0)
6481 while (!list_empty(tasks)) {
6483 * We don't want to steal all, otherwise we may be treated likewise,
6484 * which could at worst lead to a livelock crash.
6486 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
6489 p = list_first_entry(tasks, struct task_struct, se.group_node);
6492 /* We've more or less seen every task there is, call it quits */
6493 if (env->loop > env->loop_max)
6496 /* take a breather every nr_migrate tasks */
6497 if (env->loop > env->loop_break) {
6498 env->loop_break += sched_nr_migrate_break;
6499 env->flags |= LBF_NEED_BREAK;
6503 if (!can_migrate_task(p, env))
6506 load = task_h_load(p);
6508 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
6511 if ((load / 2) > env->imbalance)
6514 detach_task(p, env);
6515 list_add(&p->se.group_node, &env->tasks);
6518 env->imbalance -= load;
6520 #ifdef CONFIG_PREEMPT
6522 * NEWIDLE balancing is a source of latency, so preemptible
6523 * kernels will stop after the first task is detached to minimize
6524 * the critical section.
6526 if (env->idle == CPU_NEWLY_IDLE)
6531 * We only want to steal up to the prescribed amount of
6534 if (env->imbalance <= 0)
6539 list_move_tail(&p->se.group_node, tasks);
6543 * Right now, this is one of only two places we collect this stat
6544 * so we can safely collect detach_one_task() stats here rather
6545 * than inside detach_one_task().
6547 schedstat_add(env->sd->lb_gained[env->idle], detached);
6553 * attach_task() -- attach the task detached by detach_task() to its new rq.
6555 static void attach_task(struct rq *rq, struct task_struct *p)
6557 lockdep_assert_held(&rq->lock);
6559 BUG_ON(task_rq(p) != rq);
6560 activate_task(rq, p, 0);
6561 p->on_rq = TASK_ON_RQ_QUEUED;
6562 check_preempt_curr(rq, p, 0);
6566 * attach_one_task() -- attaches the task returned from detach_one_task() to
6569 static void attach_one_task(struct rq *rq, struct task_struct *p)
6571 raw_spin_lock(&rq->lock);
6573 raw_spin_unlock(&rq->lock);
6577 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
6580 static void attach_tasks(struct lb_env *env)
6582 struct list_head *tasks = &env->tasks;
6583 struct task_struct *p;
6585 raw_spin_lock(&env->dst_rq->lock);
6587 while (!list_empty(tasks)) {
6588 p = list_first_entry(tasks, struct task_struct, se.group_node);
6589 list_del_init(&p->se.group_node);
6591 attach_task(env->dst_rq, p);
6594 raw_spin_unlock(&env->dst_rq->lock);
6597 #ifdef CONFIG_FAIR_GROUP_SCHED
6598 static void update_blocked_averages(int cpu)
6600 struct rq *rq = cpu_rq(cpu);
6601 struct cfs_rq *cfs_rq;
6602 unsigned long flags;
6604 raw_spin_lock_irqsave(&rq->lock, flags);
6605 update_rq_clock(rq);
6608 * Iterates the task_group tree in a bottom up fashion, see
6609 * list_add_leaf_cfs_rq() for details.
6611 for_each_leaf_cfs_rq(rq, cfs_rq) {
6612 /* throttled entities do not contribute to load */
6613 if (throttled_hierarchy(cfs_rq))
6616 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
6617 update_tg_load_avg(cfs_rq, 0);
6619 raw_spin_unlock_irqrestore(&rq->lock, flags);
6623 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
6624 * This needs to be done in a top-down fashion because the load of a child
6625 * group is a fraction of its parents load.
6627 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
6629 struct rq *rq = rq_of(cfs_rq);
6630 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
6631 unsigned long now = jiffies;
6634 if (cfs_rq->last_h_load_update == now)
6637 cfs_rq->h_load_next = NULL;
6638 for_each_sched_entity(se) {
6639 cfs_rq = cfs_rq_of(se);
6640 cfs_rq->h_load_next = se;
6641 if (cfs_rq->last_h_load_update == now)
6646 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
6647 cfs_rq->last_h_load_update = now;
6650 while ((se = cfs_rq->h_load_next) != NULL) {
6651 load = cfs_rq->h_load;
6652 load = div64_ul(load * se->avg.load_avg,
6653 cfs_rq_load_avg(cfs_rq) + 1);
6654 cfs_rq = group_cfs_rq(se);
6655 cfs_rq->h_load = load;
6656 cfs_rq->last_h_load_update = now;
6660 static unsigned long task_h_load(struct task_struct *p)
6662 struct cfs_rq *cfs_rq = task_cfs_rq(p);
6664 update_cfs_rq_h_load(cfs_rq);
6665 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
6666 cfs_rq_load_avg(cfs_rq) + 1);
6669 static inline void update_blocked_averages(int cpu)
6671 struct rq *rq = cpu_rq(cpu);
6672 struct cfs_rq *cfs_rq = &rq->cfs;
6673 unsigned long flags;
6675 raw_spin_lock_irqsave(&rq->lock, flags);
6676 update_rq_clock(rq);
6677 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
6678 raw_spin_unlock_irqrestore(&rq->lock, flags);
6681 static unsigned long task_h_load(struct task_struct *p)
6683 return p->se.avg.load_avg;
6687 /********** Helpers for find_busiest_group ************************/
6696 * sg_lb_stats - stats of a sched_group required for load_balancing
6698 struct sg_lb_stats {
6699 unsigned long avg_load; /*Avg load across the CPUs of the group */
6700 unsigned long group_load; /* Total load over the CPUs of the group */
6701 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
6702 unsigned long load_per_task;
6703 unsigned long group_capacity;
6704 unsigned long group_util; /* Total utilization of the group */
6705 unsigned int sum_nr_running; /* Nr tasks running in the group */
6706 unsigned int idle_cpus;
6707 unsigned int group_weight;
6708 enum group_type group_type;
6709 int group_no_capacity;
6710 #ifdef CONFIG_NUMA_BALANCING
6711 unsigned int nr_numa_running;
6712 unsigned int nr_preferred_running;
6717 * sd_lb_stats - Structure to store the statistics of a sched_domain
6718 * during load balancing.
6720 struct sd_lb_stats {
6721 struct sched_group *busiest; /* Busiest group in this sd */
6722 struct sched_group *local; /* Local group in this sd */
6723 unsigned long total_load; /* Total load of all groups in sd */
6724 unsigned long total_capacity; /* Total capacity of all groups in sd */
6725 unsigned long avg_load; /* Average load across all groups in sd */
6727 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
6728 struct sg_lb_stats local_stat; /* Statistics of the local group */
6731 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
6734 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
6735 * local_stat because update_sg_lb_stats() does a full clear/assignment.
6736 * We must however clear busiest_stat::avg_load because
6737 * update_sd_pick_busiest() reads this before assignment.
6739 *sds = (struct sd_lb_stats){
6743 .total_capacity = 0UL,
6746 .sum_nr_running = 0,
6747 .group_type = group_other,
6753 * get_sd_load_idx - Obtain the load index for a given sched domain.
6754 * @sd: The sched_domain whose load_idx is to be obtained.
6755 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
6757 * Return: The load index.
6759 static inline int get_sd_load_idx(struct sched_domain *sd,
6760 enum cpu_idle_type idle)
6766 load_idx = sd->busy_idx;
6769 case CPU_NEWLY_IDLE:
6770 load_idx = sd->newidle_idx;
6773 load_idx = sd->idle_idx;
6780 static unsigned long scale_rt_capacity(int cpu)
6782 struct rq *rq = cpu_rq(cpu);
6783 u64 total, used, age_stamp, avg;
6787 * Since we're reading these variables without serialization make sure
6788 * we read them once before doing sanity checks on them.
6790 age_stamp = READ_ONCE(rq->age_stamp);
6791 avg = READ_ONCE(rq->rt_avg);
6792 delta = __rq_clock_broken(rq) - age_stamp;
6794 if (unlikely(delta < 0))
6797 total = sched_avg_period() + delta;
6799 used = div_u64(avg, total);
6801 if (likely(used < SCHED_CAPACITY_SCALE))
6802 return SCHED_CAPACITY_SCALE - used;
6807 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
6809 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
6810 struct sched_group *sdg = sd->groups;
6812 cpu_rq(cpu)->cpu_capacity_orig = capacity;
6814 capacity *= scale_rt_capacity(cpu);
6815 capacity >>= SCHED_CAPACITY_SHIFT;
6820 cpu_rq(cpu)->cpu_capacity = capacity;
6821 sdg->sgc->capacity = capacity;
6824 void update_group_capacity(struct sched_domain *sd, int cpu)
6826 struct sched_domain *child = sd->child;
6827 struct sched_group *group, *sdg = sd->groups;
6828 unsigned long capacity;
6829 unsigned long interval;
6831 interval = msecs_to_jiffies(sd->balance_interval);
6832 interval = clamp(interval, 1UL, max_load_balance_interval);
6833 sdg->sgc->next_update = jiffies + interval;
6836 update_cpu_capacity(sd, cpu);
6842 if (child->flags & SD_OVERLAP) {
6844 * SD_OVERLAP domains cannot assume that child groups
6845 * span the current group.
6848 for_each_cpu(cpu, sched_group_cpus(sdg)) {
6849 struct sched_group_capacity *sgc;
6850 struct rq *rq = cpu_rq(cpu);
6853 * build_sched_domains() -> init_sched_groups_capacity()
6854 * gets here before we've attached the domains to the
6857 * Use capacity_of(), which is set irrespective of domains
6858 * in update_cpu_capacity().
6860 * This avoids capacity from being 0 and
6861 * causing divide-by-zero issues on boot.
6863 if (unlikely(!rq->sd)) {
6864 capacity += capacity_of(cpu);
6868 sgc = rq->sd->groups->sgc;
6869 capacity += sgc->capacity;
6873 * !SD_OVERLAP domains can assume that child groups
6874 * span the current group.
6877 group = child->groups;
6879 capacity += group->sgc->capacity;
6880 group = group->next;
6881 } while (group != child->groups);
6884 sdg->sgc->capacity = capacity;
6888 * Check whether the capacity of the rq has been noticeably reduced by side
6889 * activity. The imbalance_pct is used for the threshold.
6890 * Return true is the capacity is reduced
6893 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
6895 return ((rq->cpu_capacity * sd->imbalance_pct) <
6896 (rq->cpu_capacity_orig * 100));
6900 * Group imbalance indicates (and tries to solve) the problem where balancing
6901 * groups is inadequate due to tsk_cpus_allowed() constraints.
6903 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
6904 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
6907 * { 0 1 2 3 } { 4 5 6 7 }
6910 * If we were to balance group-wise we'd place two tasks in the first group and
6911 * two tasks in the second group. Clearly this is undesired as it will overload
6912 * cpu 3 and leave one of the cpus in the second group unused.
6914 * The current solution to this issue is detecting the skew in the first group
6915 * by noticing the lower domain failed to reach balance and had difficulty
6916 * moving tasks due to affinity constraints.
6918 * When this is so detected; this group becomes a candidate for busiest; see
6919 * update_sd_pick_busiest(). And calculate_imbalance() and
6920 * find_busiest_group() avoid some of the usual balance conditions to allow it
6921 * to create an effective group imbalance.
6923 * This is a somewhat tricky proposition since the next run might not find the
6924 * group imbalance and decide the groups need to be balanced again. A most
6925 * subtle and fragile situation.
6928 static inline int sg_imbalanced(struct sched_group *group)
6930 return group->sgc->imbalance;
6934 * group_has_capacity returns true if the group has spare capacity that could
6935 * be used by some tasks.
6936 * We consider that a group has spare capacity if the * number of task is
6937 * smaller than the number of CPUs or if the utilization is lower than the
6938 * available capacity for CFS tasks.
6939 * For the latter, we use a threshold to stabilize the state, to take into
6940 * account the variance of the tasks' load and to return true if the available
6941 * capacity in meaningful for the load balancer.
6942 * As an example, an available capacity of 1% can appear but it doesn't make
6943 * any benefit for the load balance.
6946 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
6948 if (sgs->sum_nr_running < sgs->group_weight)
6951 if ((sgs->group_capacity * 100) >
6952 (sgs->group_util * env->sd->imbalance_pct))
6959 * group_is_overloaded returns true if the group has more tasks than it can
6961 * group_is_overloaded is not equals to !group_has_capacity because a group
6962 * with the exact right number of tasks, has no more spare capacity but is not
6963 * overloaded so both group_has_capacity and group_is_overloaded return
6967 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
6969 if (sgs->sum_nr_running <= sgs->group_weight)
6972 if ((sgs->group_capacity * 100) <
6973 (sgs->group_util * env->sd->imbalance_pct))
6980 group_type group_classify(struct sched_group *group,
6981 struct sg_lb_stats *sgs)
6983 if (sgs->group_no_capacity)
6984 return group_overloaded;
6986 if (sg_imbalanced(group))
6987 return group_imbalanced;
6993 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
6994 * @env: The load balancing environment.
6995 * @group: sched_group whose statistics are to be updated.
6996 * @load_idx: Load index of sched_domain of this_cpu for load calc.
6997 * @local_group: Does group contain this_cpu.
6998 * @sgs: variable to hold the statistics for this group.
6999 * @overload: Indicate more than one runnable task for any CPU.
7001 static inline void update_sg_lb_stats(struct lb_env *env,
7002 struct sched_group *group, int load_idx,
7003 int local_group, struct sg_lb_stats *sgs,
7009 memset(sgs, 0, sizeof(*sgs));
7011 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
7012 struct rq *rq = cpu_rq(i);
7014 /* Bias balancing toward cpus of our domain */
7016 load = target_load(i, load_idx);
7018 load = source_load(i, load_idx);
7020 sgs->group_load += load;
7021 sgs->group_util += cpu_util(i);
7022 sgs->sum_nr_running += rq->cfs.h_nr_running;
7024 nr_running = rq->nr_running;
7028 #ifdef CONFIG_NUMA_BALANCING
7029 sgs->nr_numa_running += rq->nr_numa_running;
7030 sgs->nr_preferred_running += rq->nr_preferred_running;
7032 sgs->sum_weighted_load += weighted_cpuload(i);
7034 * No need to call idle_cpu() if nr_running is not 0
7036 if (!nr_running && idle_cpu(i))
7040 /* Adjust by relative CPU capacity of the group */
7041 sgs->group_capacity = group->sgc->capacity;
7042 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
7044 if (sgs->sum_nr_running)
7045 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
7047 sgs->group_weight = group->group_weight;
7049 sgs->group_no_capacity = group_is_overloaded(env, sgs);
7050 sgs->group_type = group_classify(group, sgs);
7054 * update_sd_pick_busiest - return 1 on busiest group
7055 * @env: The load balancing environment.
7056 * @sds: sched_domain statistics
7057 * @sg: sched_group candidate to be checked for being the busiest
7058 * @sgs: sched_group statistics
7060 * Determine if @sg is a busier group than the previously selected
7063 * Return: %true if @sg is a busier group than the previously selected
7064 * busiest group. %false otherwise.
7066 static bool update_sd_pick_busiest(struct lb_env *env,
7067 struct sd_lb_stats *sds,
7068 struct sched_group *sg,
7069 struct sg_lb_stats *sgs)
7071 struct sg_lb_stats *busiest = &sds->busiest_stat;
7073 if (sgs->group_type > busiest->group_type)
7076 if (sgs->group_type < busiest->group_type)
7079 if (sgs->avg_load <= busiest->avg_load)
7082 /* This is the busiest node in its class. */
7083 if (!(env->sd->flags & SD_ASYM_PACKING))
7086 /* No ASYM_PACKING if target cpu is already busy */
7087 if (env->idle == CPU_NOT_IDLE)
7090 * ASYM_PACKING needs to move all the work to the lowest
7091 * numbered CPUs in the group, therefore mark all groups
7092 * higher than ourself as busy.
7094 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
7098 /* Prefer to move from highest possible cpu's work */
7099 if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
7106 #ifdef CONFIG_NUMA_BALANCING
7107 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
7109 if (sgs->sum_nr_running > sgs->nr_numa_running)
7111 if (sgs->sum_nr_running > sgs->nr_preferred_running)
7116 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
7118 if (rq->nr_running > rq->nr_numa_running)
7120 if (rq->nr_running > rq->nr_preferred_running)
7125 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
7130 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
7134 #endif /* CONFIG_NUMA_BALANCING */
7137 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
7138 * @env: The load balancing environment.
7139 * @sds: variable to hold the statistics for this sched_domain.
7141 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
7143 struct sched_domain *child = env->sd->child;
7144 struct sched_group *sg = env->sd->groups;
7145 struct sg_lb_stats tmp_sgs;
7146 int load_idx, prefer_sibling = 0;
7147 bool overload = false;
7149 if (child && child->flags & SD_PREFER_SIBLING)
7152 load_idx = get_sd_load_idx(env->sd, env->idle);
7155 struct sg_lb_stats *sgs = &tmp_sgs;
7158 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
7161 sgs = &sds->local_stat;
7163 if (env->idle != CPU_NEWLY_IDLE ||
7164 time_after_eq(jiffies, sg->sgc->next_update))
7165 update_group_capacity(env->sd, env->dst_cpu);
7168 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
7175 * In case the child domain prefers tasks go to siblings
7176 * first, lower the sg capacity so that we'll try
7177 * and move all the excess tasks away. We lower the capacity
7178 * of a group only if the local group has the capacity to fit
7179 * these excess tasks. The extra check prevents the case where
7180 * you always pull from the heaviest group when it is already
7181 * under-utilized (possible with a large weight task outweighs
7182 * the tasks on the system).
7184 if (prefer_sibling && sds->local &&
7185 group_has_capacity(env, &sds->local_stat) &&
7186 (sgs->sum_nr_running > 1)) {
7187 sgs->group_no_capacity = 1;
7188 sgs->group_type = group_classify(sg, sgs);
7191 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
7193 sds->busiest_stat = *sgs;
7197 /* Now, start updating sd_lb_stats */
7198 sds->total_load += sgs->group_load;
7199 sds->total_capacity += sgs->group_capacity;
7202 } while (sg != env->sd->groups);
7204 if (env->sd->flags & SD_NUMA)
7205 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
7207 if (!env->sd->parent) {
7208 /* update overload indicator if we are at root domain */
7209 if (env->dst_rq->rd->overload != overload)
7210 env->dst_rq->rd->overload = overload;
7216 * check_asym_packing - Check to see if the group is packed into the
7219 * This is primarily intended to used at the sibling level. Some
7220 * cores like POWER7 prefer to use lower numbered SMT threads. In the
7221 * case of POWER7, it can move to lower SMT modes only when higher
7222 * threads are idle. When in lower SMT modes, the threads will
7223 * perform better since they share less core resources. Hence when we
7224 * have idle threads, we want them to be the higher ones.
7226 * This packing function is run on idle threads. It checks to see if
7227 * the busiest CPU in this domain (core in the P7 case) has a higher
7228 * CPU number than the packing function is being run on. Here we are
7229 * assuming lower CPU number will be equivalent to lower a SMT thread
7232 * Return: 1 when packing is required and a task should be moved to
7233 * this CPU. The amount of the imbalance is returned in *imbalance.
7235 * @env: The load balancing environment.
7236 * @sds: Statistics of the sched_domain which is to be packed
7238 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
7242 if (!(env->sd->flags & SD_ASYM_PACKING))
7245 if (env->idle == CPU_NOT_IDLE)
7251 busiest_cpu = group_first_cpu(sds->busiest);
7252 if (env->dst_cpu > busiest_cpu)
7255 env->imbalance = DIV_ROUND_CLOSEST(
7256 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
7257 SCHED_CAPACITY_SCALE);
7263 * fix_small_imbalance - Calculate the minor imbalance that exists
7264 * amongst the groups of a sched_domain, during
7266 * @env: The load balancing environment.
7267 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
7270 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
7272 unsigned long tmp, capa_now = 0, capa_move = 0;
7273 unsigned int imbn = 2;
7274 unsigned long scaled_busy_load_per_task;
7275 struct sg_lb_stats *local, *busiest;
7277 local = &sds->local_stat;
7278 busiest = &sds->busiest_stat;
7280 if (!local->sum_nr_running)
7281 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
7282 else if (busiest->load_per_task > local->load_per_task)
7285 scaled_busy_load_per_task =
7286 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
7287 busiest->group_capacity;
7289 if (busiest->avg_load + scaled_busy_load_per_task >=
7290 local->avg_load + (scaled_busy_load_per_task * imbn)) {
7291 env->imbalance = busiest->load_per_task;
7296 * OK, we don't have enough imbalance to justify moving tasks,
7297 * however we may be able to increase total CPU capacity used by
7301 capa_now += busiest->group_capacity *
7302 min(busiest->load_per_task, busiest->avg_load);
7303 capa_now += local->group_capacity *
7304 min(local->load_per_task, local->avg_load);
7305 capa_now /= SCHED_CAPACITY_SCALE;
7307 /* Amount of load we'd subtract */
7308 if (busiest->avg_load > scaled_busy_load_per_task) {
7309 capa_move += busiest->group_capacity *
7310 min(busiest->load_per_task,
7311 busiest->avg_load - scaled_busy_load_per_task);
7314 /* Amount of load we'd add */
7315 if (busiest->avg_load * busiest->group_capacity <
7316 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
7317 tmp = (busiest->avg_load * busiest->group_capacity) /
7318 local->group_capacity;
7320 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
7321 local->group_capacity;
7323 capa_move += local->group_capacity *
7324 min(local->load_per_task, local->avg_load + tmp);
7325 capa_move /= SCHED_CAPACITY_SCALE;
7327 /* Move if we gain throughput */
7328 if (capa_move > capa_now)
7329 env->imbalance = busiest->load_per_task;
7333 * calculate_imbalance - Calculate the amount of imbalance present within the
7334 * groups of a given sched_domain during load balance.
7335 * @env: load balance environment
7336 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
7338 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
7340 unsigned long max_pull, load_above_capacity = ~0UL;
7341 struct sg_lb_stats *local, *busiest;
7343 local = &sds->local_stat;
7344 busiest = &sds->busiest_stat;
7346 if (busiest->group_type == group_imbalanced) {
7348 * In the group_imb case we cannot rely on group-wide averages
7349 * to ensure cpu-load equilibrium, look at wider averages. XXX
7351 busiest->load_per_task =
7352 min(busiest->load_per_task, sds->avg_load);
7356 * Avg load of busiest sg can be less and avg load of local sg can
7357 * be greater than avg load across all sgs of sd because avg load
7358 * factors in sg capacity and sgs with smaller group_type are
7359 * skipped when updating the busiest sg:
7361 if (busiest->avg_load <= sds->avg_load ||
7362 local->avg_load >= sds->avg_load) {
7364 return fix_small_imbalance(env, sds);
7368 * If there aren't any idle cpus, avoid creating some.
7370 if (busiest->group_type == group_overloaded &&
7371 local->group_type == group_overloaded) {
7372 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
7373 if (load_above_capacity > busiest->group_capacity) {
7374 load_above_capacity -= busiest->group_capacity;
7375 load_above_capacity *= scale_load_down(NICE_0_LOAD);
7376 load_above_capacity /= busiest->group_capacity;
7378 load_above_capacity = ~0UL;
7382 * We're trying to get all the cpus to the average_load, so we don't
7383 * want to push ourselves above the average load, nor do we wish to
7384 * reduce the max loaded cpu below the average load. At the same time,
7385 * we also don't want to reduce the group load below the group
7386 * capacity. Thus we look for the minimum possible imbalance.
7388 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
7390 /* How much load to actually move to equalise the imbalance */
7391 env->imbalance = min(
7392 max_pull * busiest->group_capacity,
7393 (sds->avg_load - local->avg_load) * local->group_capacity
7394 ) / SCHED_CAPACITY_SCALE;
7397 * if *imbalance is less than the average load per runnable task
7398 * there is no guarantee that any tasks will be moved so we'll have
7399 * a think about bumping its value to force at least one task to be
7402 if (env->imbalance < busiest->load_per_task)
7403 return fix_small_imbalance(env, sds);
7406 /******* find_busiest_group() helpers end here *********************/
7409 * find_busiest_group - Returns the busiest group within the sched_domain
7410 * if there is an imbalance.
7412 * Also calculates the amount of weighted load which should be moved
7413 * to restore balance.
7415 * @env: The load balancing environment.
7417 * Return: - The busiest group if imbalance exists.
7419 static struct sched_group *find_busiest_group(struct lb_env *env)
7421 struct sg_lb_stats *local, *busiest;
7422 struct sd_lb_stats sds;
7424 init_sd_lb_stats(&sds);
7427 * Compute the various statistics relavent for load balancing at
7430 update_sd_lb_stats(env, &sds);
7431 local = &sds.local_stat;
7432 busiest = &sds.busiest_stat;
7434 /* ASYM feature bypasses nice load balance check */
7435 if (check_asym_packing(env, &sds))
7438 /* There is no busy sibling group to pull tasks from */
7439 if (!sds.busiest || busiest->sum_nr_running == 0)
7442 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
7443 / sds.total_capacity;
7446 * If the busiest group is imbalanced the below checks don't
7447 * work because they assume all things are equal, which typically
7448 * isn't true due to cpus_allowed constraints and the like.
7450 if (busiest->group_type == group_imbalanced)
7453 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
7454 if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
7455 busiest->group_no_capacity)
7459 * If the local group is busier than the selected busiest group
7460 * don't try and pull any tasks.
7462 if (local->avg_load >= busiest->avg_load)
7466 * Don't pull any tasks if this group is already above the domain
7469 if (local->avg_load >= sds.avg_load)
7472 if (env->idle == CPU_IDLE) {
7474 * This cpu is idle. If the busiest group is not overloaded
7475 * and there is no imbalance between this and busiest group
7476 * wrt idle cpus, it is balanced. The imbalance becomes
7477 * significant if the diff is greater than 1 otherwise we
7478 * might end up to just move the imbalance on another group
7480 if ((busiest->group_type != group_overloaded) &&
7481 (local->idle_cpus <= (busiest->idle_cpus + 1)))
7485 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
7486 * imbalance_pct to be conservative.
7488 if (100 * busiest->avg_load <=
7489 env->sd->imbalance_pct * local->avg_load)
7494 /* Looks like there is an imbalance. Compute it */
7495 calculate_imbalance(env, &sds);
7504 * find_busiest_queue - find the busiest runqueue among the cpus in group.
7506 static struct rq *find_busiest_queue(struct lb_env *env,
7507 struct sched_group *group)
7509 struct rq *busiest = NULL, *rq;
7510 unsigned long busiest_load = 0, busiest_capacity = 1;
7513 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
7514 unsigned long capacity, wl;
7518 rt = fbq_classify_rq(rq);
7521 * We classify groups/runqueues into three groups:
7522 * - regular: there are !numa tasks
7523 * - remote: there are numa tasks that run on the 'wrong' node
7524 * - all: there is no distinction
7526 * In order to avoid migrating ideally placed numa tasks,
7527 * ignore those when there's better options.
7529 * If we ignore the actual busiest queue to migrate another
7530 * task, the next balance pass can still reduce the busiest
7531 * queue by moving tasks around inside the node.
7533 * If we cannot move enough load due to this classification
7534 * the next pass will adjust the group classification and
7535 * allow migration of more tasks.
7537 * Both cases only affect the total convergence complexity.
7539 if (rt > env->fbq_type)
7542 capacity = capacity_of(i);
7544 wl = weighted_cpuload(i);
7547 * When comparing with imbalance, use weighted_cpuload()
7548 * which is not scaled with the cpu capacity.
7551 if (rq->nr_running == 1 && wl > env->imbalance &&
7552 !check_cpu_capacity(rq, env->sd))
7556 * For the load comparisons with the other cpu's, consider
7557 * the weighted_cpuload() scaled with the cpu capacity, so
7558 * that the load can be moved away from the cpu that is
7559 * potentially running at a lower capacity.
7561 * Thus we're looking for max(wl_i / capacity_i), crosswise
7562 * multiplication to rid ourselves of the division works out
7563 * to: wl_i * capacity_j > wl_j * capacity_i; where j is
7564 * our previous maximum.
7566 if (wl * busiest_capacity > busiest_load * capacity) {
7568 busiest_capacity = capacity;
7577 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
7578 * so long as it is large enough.
7580 #define MAX_PINNED_INTERVAL 512
7582 static int need_active_balance(struct lb_env *env)
7584 struct sched_domain *sd = env->sd;
7586 if (env->idle == CPU_NEWLY_IDLE) {
7589 * ASYM_PACKING needs to force migrate tasks from busy but
7590 * higher numbered CPUs in order to pack all tasks in the
7591 * lowest numbered CPUs.
7593 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
7598 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
7599 * It's worth migrating the task if the src_cpu's capacity is reduced
7600 * because of other sched_class or IRQs if more capacity stays
7601 * available on dst_cpu.
7603 if ((env->idle != CPU_NOT_IDLE) &&
7604 (env->src_rq->cfs.h_nr_running == 1)) {
7605 if ((check_cpu_capacity(env->src_rq, sd)) &&
7606 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
7610 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
7613 static int active_load_balance_cpu_stop(void *data);
7615 static int should_we_balance(struct lb_env *env)
7617 struct sched_group *sg = env->sd->groups;
7618 struct cpumask *sg_cpus, *sg_mask;
7619 int cpu, balance_cpu = -1;
7622 * In the newly idle case, we will allow all the cpu's
7623 * to do the newly idle load balance.
7625 if (env->idle == CPU_NEWLY_IDLE)
7628 sg_cpus = sched_group_cpus(sg);
7629 sg_mask = sched_group_mask(sg);
7630 /* Try to find first idle cpu */
7631 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
7632 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
7639 if (balance_cpu == -1)
7640 balance_cpu = group_balance_cpu(sg);
7643 * First idle cpu or the first cpu(busiest) in this sched group
7644 * is eligible for doing load balancing at this and above domains.
7646 return balance_cpu == env->dst_cpu;
7650 * Check this_cpu to ensure it is balanced within domain. Attempt to move
7651 * tasks if there is an imbalance.
7653 static int load_balance(int this_cpu, struct rq *this_rq,
7654 struct sched_domain *sd, enum cpu_idle_type idle,
7655 int *continue_balancing)
7657 int ld_moved, cur_ld_moved, active_balance = 0;
7658 struct sched_domain *sd_parent = sd->parent;
7659 struct sched_group *group;
7661 unsigned long flags;
7662 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
7664 struct lb_env env = {
7666 .dst_cpu = this_cpu,
7668 .dst_grpmask = sched_group_cpus(sd->groups),
7670 .loop_break = sched_nr_migrate_break,
7673 .tasks = LIST_HEAD_INIT(env.tasks),
7677 * For NEWLY_IDLE load_balancing, we don't need to consider
7678 * other cpus in our group
7680 if (idle == CPU_NEWLY_IDLE)
7681 env.dst_grpmask = NULL;
7683 cpumask_copy(cpus, cpu_active_mask);
7685 schedstat_inc(sd->lb_count[idle]);
7688 if (!should_we_balance(&env)) {
7689 *continue_balancing = 0;
7693 group = find_busiest_group(&env);
7695 schedstat_inc(sd->lb_nobusyg[idle]);
7699 busiest = find_busiest_queue(&env, group);
7701 schedstat_inc(sd->lb_nobusyq[idle]);
7705 BUG_ON(busiest == env.dst_rq);
7707 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
7709 env.src_cpu = busiest->cpu;
7710 env.src_rq = busiest;
7713 if (busiest->nr_running > 1) {
7715 * Attempt to move tasks. If find_busiest_group has found
7716 * an imbalance but busiest->nr_running <= 1, the group is
7717 * still unbalanced. ld_moved simply stays zero, so it is
7718 * correctly treated as an imbalance.
7720 env.flags |= LBF_ALL_PINNED;
7721 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
7724 raw_spin_lock_irqsave(&busiest->lock, flags);
7727 * cur_ld_moved - load moved in current iteration
7728 * ld_moved - cumulative load moved across iterations
7730 cur_ld_moved = detach_tasks(&env);
7733 * We've detached some tasks from busiest_rq. Every
7734 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
7735 * unlock busiest->lock, and we are able to be sure
7736 * that nobody can manipulate the tasks in parallel.
7737 * See task_rq_lock() family for the details.
7740 raw_spin_unlock(&busiest->lock);
7744 ld_moved += cur_ld_moved;
7747 local_irq_restore(flags);
7749 if (env.flags & LBF_NEED_BREAK) {
7750 env.flags &= ~LBF_NEED_BREAK;
7755 * Revisit (affine) tasks on src_cpu that couldn't be moved to
7756 * us and move them to an alternate dst_cpu in our sched_group
7757 * where they can run. The upper limit on how many times we
7758 * iterate on same src_cpu is dependent on number of cpus in our
7761 * This changes load balance semantics a bit on who can move
7762 * load to a given_cpu. In addition to the given_cpu itself
7763 * (or a ilb_cpu acting on its behalf where given_cpu is
7764 * nohz-idle), we now have balance_cpu in a position to move
7765 * load to given_cpu. In rare situations, this may cause
7766 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
7767 * _independently_ and at _same_ time to move some load to
7768 * given_cpu) causing exceess load to be moved to given_cpu.
7769 * This however should not happen so much in practice and
7770 * moreover subsequent load balance cycles should correct the
7771 * excess load moved.
7773 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
7775 /* Prevent to re-select dst_cpu via env's cpus */
7776 cpumask_clear_cpu(env.dst_cpu, env.cpus);
7778 env.dst_rq = cpu_rq(env.new_dst_cpu);
7779 env.dst_cpu = env.new_dst_cpu;
7780 env.flags &= ~LBF_DST_PINNED;
7782 env.loop_break = sched_nr_migrate_break;
7785 * Go back to "more_balance" rather than "redo" since we
7786 * need to continue with same src_cpu.
7792 * We failed to reach balance because of affinity.
7795 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7797 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
7798 *group_imbalance = 1;
7801 /* All tasks on this runqueue were pinned by CPU affinity */
7802 if (unlikely(env.flags & LBF_ALL_PINNED)) {
7803 cpumask_clear_cpu(cpu_of(busiest), cpus);
7804 if (!cpumask_empty(cpus)) {
7806 env.loop_break = sched_nr_migrate_break;
7809 goto out_all_pinned;
7814 schedstat_inc(sd->lb_failed[idle]);
7816 * Increment the failure counter only on periodic balance.
7817 * We do not want newidle balance, which can be very
7818 * frequent, pollute the failure counter causing
7819 * excessive cache_hot migrations and active balances.
7821 if (idle != CPU_NEWLY_IDLE)
7822 sd->nr_balance_failed++;
7824 if (need_active_balance(&env)) {
7825 raw_spin_lock_irqsave(&busiest->lock, flags);
7827 /* don't kick the active_load_balance_cpu_stop,
7828 * if the curr task on busiest cpu can't be
7831 if (!cpumask_test_cpu(this_cpu,
7832 tsk_cpus_allowed(busiest->curr))) {
7833 raw_spin_unlock_irqrestore(&busiest->lock,
7835 env.flags |= LBF_ALL_PINNED;
7836 goto out_one_pinned;
7840 * ->active_balance synchronizes accesses to
7841 * ->active_balance_work. Once set, it's cleared
7842 * only after active load balance is finished.
7844 if (!busiest->active_balance) {
7845 busiest->active_balance = 1;
7846 busiest->push_cpu = this_cpu;
7849 raw_spin_unlock_irqrestore(&busiest->lock, flags);
7851 if (active_balance) {
7852 stop_one_cpu_nowait(cpu_of(busiest),
7853 active_load_balance_cpu_stop, busiest,
7854 &busiest->active_balance_work);
7857 /* We've kicked active balancing, force task migration. */
7858 sd->nr_balance_failed = sd->cache_nice_tries+1;
7861 sd->nr_balance_failed = 0;
7863 if (likely(!active_balance)) {
7864 /* We were unbalanced, so reset the balancing interval */
7865 sd->balance_interval = sd->min_interval;
7868 * If we've begun active balancing, start to back off. This
7869 * case may not be covered by the all_pinned logic if there
7870 * is only 1 task on the busy runqueue (because we don't call
7873 if (sd->balance_interval < sd->max_interval)
7874 sd->balance_interval *= 2;
7881 * We reach balance although we may have faced some affinity
7882 * constraints. Clear the imbalance flag if it was set.
7885 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7887 if (*group_imbalance)
7888 *group_imbalance = 0;
7893 * We reach balance because all tasks are pinned at this level so
7894 * we can't migrate them. Let the imbalance flag set so parent level
7895 * can try to migrate them.
7897 schedstat_inc(sd->lb_balanced[idle]);
7899 sd->nr_balance_failed = 0;
7902 /* tune up the balancing interval */
7903 if (((env.flags & LBF_ALL_PINNED) &&
7904 sd->balance_interval < MAX_PINNED_INTERVAL) ||
7905 (sd->balance_interval < sd->max_interval))
7906 sd->balance_interval *= 2;
7913 static inline unsigned long
7914 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
7916 unsigned long interval = sd->balance_interval;
7919 interval *= sd->busy_factor;
7921 /* scale ms to jiffies */
7922 interval = msecs_to_jiffies(interval);
7923 interval = clamp(interval, 1UL, max_load_balance_interval);
7929 update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
7931 unsigned long interval, next;
7933 /* used by idle balance, so cpu_busy = 0 */
7934 interval = get_sd_balance_interval(sd, 0);
7935 next = sd->last_balance + interval;
7937 if (time_after(*next_balance, next))
7938 *next_balance = next;
7942 * idle_balance is called by schedule() if this_cpu is about to become
7943 * idle. Attempts to pull tasks from other CPUs.
7945 static int idle_balance(struct rq *this_rq)
7947 unsigned long next_balance = jiffies + HZ;
7948 int this_cpu = this_rq->cpu;
7949 struct sched_domain *sd;
7950 int pulled_task = 0;
7954 * We must set idle_stamp _before_ calling idle_balance(), such that we
7955 * measure the duration of idle_balance() as idle time.
7957 this_rq->idle_stamp = rq_clock(this_rq);
7959 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
7960 !this_rq->rd->overload) {
7962 sd = rcu_dereference_check_sched_domain(this_rq->sd);
7964 update_next_balance(sd, &next_balance);
7970 raw_spin_unlock(&this_rq->lock);
7972 update_blocked_averages(this_cpu);
7974 for_each_domain(this_cpu, sd) {
7975 int continue_balancing = 1;
7976 u64 t0, domain_cost;
7978 if (!(sd->flags & SD_LOAD_BALANCE))
7981 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
7982 update_next_balance(sd, &next_balance);
7986 if (sd->flags & SD_BALANCE_NEWIDLE) {
7987 t0 = sched_clock_cpu(this_cpu);
7989 pulled_task = load_balance(this_cpu, this_rq,
7991 &continue_balancing);
7993 domain_cost = sched_clock_cpu(this_cpu) - t0;
7994 if (domain_cost > sd->max_newidle_lb_cost)
7995 sd->max_newidle_lb_cost = domain_cost;
7997 curr_cost += domain_cost;
8000 update_next_balance(sd, &next_balance);
8003 * Stop searching for tasks to pull if there are
8004 * now runnable tasks on this rq.
8006 if (pulled_task || this_rq->nr_running > 0)
8011 raw_spin_lock(&this_rq->lock);
8013 if (curr_cost > this_rq->max_idle_balance_cost)
8014 this_rq->max_idle_balance_cost = curr_cost;
8017 * While browsing the domains, we released the rq lock, a task could
8018 * have been enqueued in the meantime. Since we're not going idle,
8019 * pretend we pulled a task.
8021 if (this_rq->cfs.h_nr_running && !pulled_task)
8025 /* Move the next balance forward */
8026 if (time_after(this_rq->next_balance, next_balance))
8027 this_rq->next_balance = next_balance;
8029 /* Is there a task of a high priority class? */
8030 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
8034 this_rq->idle_stamp = 0;
8040 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
8041 * running tasks off the busiest CPU onto idle CPUs. It requires at
8042 * least 1 task to be running on each physical CPU where possible, and
8043 * avoids physical / logical imbalances.
8045 static int active_load_balance_cpu_stop(void *data)
8047 struct rq *busiest_rq = data;
8048 int busiest_cpu = cpu_of(busiest_rq);
8049 int target_cpu = busiest_rq->push_cpu;
8050 struct rq *target_rq = cpu_rq(target_cpu);
8051 struct sched_domain *sd;
8052 struct task_struct *p = NULL;
8054 raw_spin_lock_irq(&busiest_rq->lock);
8056 /* make sure the requested cpu hasn't gone down in the meantime */
8057 if (unlikely(busiest_cpu != smp_processor_id() ||
8058 !busiest_rq->active_balance))
8061 /* Is there any task to move? */
8062 if (busiest_rq->nr_running <= 1)
8066 * This condition is "impossible", if it occurs
8067 * we need to fix it. Originally reported by
8068 * Bjorn Helgaas on a 128-cpu setup.
8070 BUG_ON(busiest_rq == target_rq);
8072 /* Search for an sd spanning us and the target CPU. */
8074 for_each_domain(target_cpu, sd) {
8075 if ((sd->flags & SD_LOAD_BALANCE) &&
8076 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
8081 struct lb_env env = {
8083 .dst_cpu = target_cpu,
8084 .dst_rq = target_rq,
8085 .src_cpu = busiest_rq->cpu,
8086 .src_rq = busiest_rq,
8090 schedstat_inc(sd->alb_count);
8092 p = detach_one_task(&env);
8094 schedstat_inc(sd->alb_pushed);
8095 /* Active balancing done, reset the failure counter. */
8096 sd->nr_balance_failed = 0;
8098 schedstat_inc(sd->alb_failed);
8103 busiest_rq->active_balance = 0;
8104 raw_spin_unlock(&busiest_rq->lock);
8107 attach_one_task(target_rq, p);
8114 static inline int on_null_domain(struct rq *rq)
8116 return unlikely(!rcu_dereference_sched(rq->sd));
8119 #ifdef CONFIG_NO_HZ_COMMON
8121 * idle load balancing details
8122 * - When one of the busy CPUs notice that there may be an idle rebalancing
8123 * needed, they will kick the idle load balancer, which then does idle
8124 * load balancing for all the idle CPUs.
8127 cpumask_var_t idle_cpus_mask;
8129 unsigned long next_balance; /* in jiffy units */
8130 } nohz ____cacheline_aligned;
8132 static inline int find_new_ilb(void)
8134 int ilb = cpumask_first(nohz.idle_cpus_mask);
8136 if (ilb < nr_cpu_ids && idle_cpu(ilb))
8143 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
8144 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
8145 * CPU (if there is one).
8147 static void nohz_balancer_kick(void)
8151 nohz.next_balance++;
8153 ilb_cpu = find_new_ilb();
8155 if (ilb_cpu >= nr_cpu_ids)
8158 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
8161 * Use smp_send_reschedule() instead of resched_cpu().
8162 * This way we generate a sched IPI on the target cpu which
8163 * is idle. And the softirq performing nohz idle load balance
8164 * will be run before returning from the IPI.
8166 smp_send_reschedule(ilb_cpu);
8170 void nohz_balance_exit_idle(unsigned int cpu)
8172 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
8174 * Completely isolated CPUs don't ever set, so we must test.
8176 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
8177 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
8178 atomic_dec(&nohz.nr_cpus);
8180 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
8184 static inline void set_cpu_sd_state_busy(void)
8186 struct sched_domain *sd;
8187 int cpu = smp_processor_id();
8190 sd = rcu_dereference(per_cpu(sd_llc, cpu));
8192 if (!sd || !sd->nohz_idle)
8196 atomic_inc(&sd->shared->nr_busy_cpus);
8201 void set_cpu_sd_state_idle(void)
8203 struct sched_domain *sd;
8204 int cpu = smp_processor_id();
8207 sd = rcu_dereference(per_cpu(sd_llc, cpu));
8209 if (!sd || sd->nohz_idle)
8213 atomic_dec(&sd->shared->nr_busy_cpus);
8219 * This routine will record that the cpu is going idle with tick stopped.
8220 * This info will be used in performing idle load balancing in the future.
8222 void nohz_balance_enter_idle(int cpu)
8225 * If this cpu is going down, then nothing needs to be done.
8227 if (!cpu_active(cpu))
8230 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
8234 * If we're a completely isolated CPU, we don't play.
8236 if (on_null_domain(cpu_rq(cpu)))
8239 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
8240 atomic_inc(&nohz.nr_cpus);
8241 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
8245 static DEFINE_SPINLOCK(balancing);
8248 * Scale the max load_balance interval with the number of CPUs in the system.
8249 * This trades load-balance latency on larger machines for less cross talk.
8251 void update_max_interval(void)
8253 max_load_balance_interval = HZ*num_online_cpus()/10;
8257 * It checks each scheduling domain to see if it is due to be balanced,
8258 * and initiates a balancing operation if so.
8260 * Balancing parameters are set up in init_sched_domains.
8262 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
8264 int continue_balancing = 1;
8266 unsigned long interval;
8267 struct sched_domain *sd;
8268 /* Earliest time when we have to do rebalance again */
8269 unsigned long next_balance = jiffies + 60*HZ;
8270 int update_next_balance = 0;
8271 int need_serialize, need_decay = 0;
8274 update_blocked_averages(cpu);
8277 for_each_domain(cpu, sd) {
8279 * Decay the newidle max times here because this is a regular
8280 * visit to all the domains. Decay ~1% per second.
8282 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
8283 sd->max_newidle_lb_cost =
8284 (sd->max_newidle_lb_cost * 253) / 256;
8285 sd->next_decay_max_lb_cost = jiffies + HZ;
8288 max_cost += sd->max_newidle_lb_cost;
8290 if (!(sd->flags & SD_LOAD_BALANCE))
8294 * Stop the load balance at this level. There is another
8295 * CPU in our sched group which is doing load balancing more
8298 if (!continue_balancing) {
8304 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
8306 need_serialize = sd->flags & SD_SERIALIZE;
8307 if (need_serialize) {
8308 if (!spin_trylock(&balancing))
8312 if (time_after_eq(jiffies, sd->last_balance + interval)) {
8313 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
8315 * The LBF_DST_PINNED logic could have changed
8316 * env->dst_cpu, so we can't know our idle
8317 * state even if we migrated tasks. Update it.
8319 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
8321 sd->last_balance = jiffies;
8322 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
8325 spin_unlock(&balancing);
8327 if (time_after(next_balance, sd->last_balance + interval)) {
8328 next_balance = sd->last_balance + interval;
8329 update_next_balance = 1;
8334 * Ensure the rq-wide value also decays but keep it at a
8335 * reasonable floor to avoid funnies with rq->avg_idle.
8337 rq->max_idle_balance_cost =
8338 max((u64)sysctl_sched_migration_cost, max_cost);
8343 * next_balance will be updated only when there is a need.
8344 * When the cpu is attached to null domain for ex, it will not be
8347 if (likely(update_next_balance)) {
8348 rq->next_balance = next_balance;
8350 #ifdef CONFIG_NO_HZ_COMMON
8352 * If this CPU has been elected to perform the nohz idle
8353 * balance. Other idle CPUs have already rebalanced with
8354 * nohz_idle_balance() and nohz.next_balance has been
8355 * updated accordingly. This CPU is now running the idle load
8356 * balance for itself and we need to update the
8357 * nohz.next_balance accordingly.
8359 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
8360 nohz.next_balance = rq->next_balance;
8365 #ifdef CONFIG_NO_HZ_COMMON
8367 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
8368 * rebalancing for all the cpus for whom scheduler ticks are stopped.
8370 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
8372 int this_cpu = this_rq->cpu;
8375 /* Earliest time when we have to do rebalance again */
8376 unsigned long next_balance = jiffies + 60*HZ;
8377 int update_next_balance = 0;
8379 if (idle != CPU_IDLE ||
8380 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
8383 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
8384 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
8388 * If this cpu gets work to do, stop the load balancing
8389 * work being done for other cpus. Next load
8390 * balancing owner will pick it up.
8395 rq = cpu_rq(balance_cpu);
8398 * If time for next balance is due,
8401 if (time_after_eq(jiffies, rq->next_balance)) {
8402 raw_spin_lock_irq(&rq->lock);
8403 update_rq_clock(rq);
8404 cpu_load_update_idle(rq);
8405 raw_spin_unlock_irq(&rq->lock);
8406 rebalance_domains(rq, CPU_IDLE);
8409 if (time_after(next_balance, rq->next_balance)) {
8410 next_balance = rq->next_balance;
8411 update_next_balance = 1;
8416 * next_balance will be updated only when there is a need.
8417 * When the CPU is attached to null domain for ex, it will not be
8420 if (likely(update_next_balance))
8421 nohz.next_balance = next_balance;
8423 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
8427 * Current heuristic for kicking the idle load balancer in the presence
8428 * of an idle cpu in the system.
8429 * - This rq has more than one task.
8430 * - This rq has at least one CFS task and the capacity of the CPU is
8431 * significantly reduced because of RT tasks or IRQs.
8432 * - At parent of LLC scheduler domain level, this cpu's scheduler group has
8433 * multiple busy cpu.
8434 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
8435 * domain span are idle.
8437 static inline bool nohz_kick_needed(struct rq *rq)
8439 unsigned long now = jiffies;
8440 struct sched_domain_shared *sds;
8441 struct sched_domain *sd;
8442 int nr_busy, cpu = rq->cpu;
8445 if (unlikely(rq->idle_balance))
8449 * We may be recently in ticked or tickless idle mode. At the first
8450 * busy tick after returning from idle, we will update the busy stats.
8452 set_cpu_sd_state_busy();
8453 nohz_balance_exit_idle(cpu);
8456 * None are in tickless mode and hence no need for NOHZ idle load
8459 if (likely(!atomic_read(&nohz.nr_cpus)))
8462 if (time_before(now, nohz.next_balance))
8465 if (rq->nr_running >= 2)
8469 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
8472 * XXX: write a coherent comment on why we do this.
8473 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
8475 nr_busy = atomic_read(&sds->nr_busy_cpus);
8483 sd = rcu_dereference(rq->sd);
8485 if ((rq->cfs.h_nr_running >= 1) &&
8486 check_cpu_capacity(rq, sd)) {
8492 sd = rcu_dereference(per_cpu(sd_asym, cpu));
8493 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
8494 sched_domain_span(sd)) < cpu)) {
8504 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
8508 * run_rebalance_domains is triggered when needed from the scheduler tick.
8509 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
8511 static void run_rebalance_domains(struct softirq_action *h)
8513 struct rq *this_rq = this_rq();
8514 enum cpu_idle_type idle = this_rq->idle_balance ?
8515 CPU_IDLE : CPU_NOT_IDLE;
8518 * If this cpu has a pending nohz_balance_kick, then do the
8519 * balancing on behalf of the other idle cpus whose ticks are
8520 * stopped. Do nohz_idle_balance *before* rebalance_domains to
8521 * give the idle cpus a chance to load balance. Else we may
8522 * load balance only within the local sched_domain hierarchy
8523 * and abort nohz_idle_balance altogether if we pull some load.
8525 nohz_idle_balance(this_rq, idle);
8526 rebalance_domains(this_rq, idle);
8530 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
8532 void trigger_load_balance(struct rq *rq)
8534 /* Don't need to rebalance while attached to NULL domain */
8535 if (unlikely(on_null_domain(rq)))
8538 if (time_after_eq(jiffies, rq->next_balance))
8539 raise_softirq(SCHED_SOFTIRQ);
8540 #ifdef CONFIG_NO_HZ_COMMON
8541 if (nohz_kick_needed(rq))
8542 nohz_balancer_kick();
8546 static void rq_online_fair(struct rq *rq)
8550 update_runtime_enabled(rq);
8553 static void rq_offline_fair(struct rq *rq)
8557 /* Ensure any throttled groups are reachable by pick_next_task */
8558 unthrottle_offline_cfs_rqs(rq);
8561 #endif /* CONFIG_SMP */
8564 * scheduler tick hitting a task of our scheduling class:
8566 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
8568 struct cfs_rq *cfs_rq;
8569 struct sched_entity *se = &curr->se;
8571 for_each_sched_entity(se) {
8572 cfs_rq = cfs_rq_of(se);
8573 entity_tick(cfs_rq, se, queued);
8576 if (static_branch_unlikely(&sched_numa_balancing))
8577 task_tick_numa(rq, curr);
8581 * called on fork with the child task as argument from the parent's context
8582 * - child not yet on the tasklist
8583 * - preemption disabled
8585 static void task_fork_fair(struct task_struct *p)
8587 struct cfs_rq *cfs_rq;
8588 struct sched_entity *se = &p->se, *curr;
8589 struct rq *rq = this_rq();
8591 raw_spin_lock(&rq->lock);
8592 update_rq_clock(rq);
8594 cfs_rq = task_cfs_rq(current);
8595 curr = cfs_rq->curr;
8597 update_curr(cfs_rq);
8598 se->vruntime = curr->vruntime;
8600 place_entity(cfs_rq, se, 1);
8602 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
8604 * Upon rescheduling, sched_class::put_prev_task() will place
8605 * 'current' within the tree based on its new key value.
8607 swap(curr->vruntime, se->vruntime);
8611 se->vruntime -= cfs_rq->min_vruntime;
8612 raw_spin_unlock(&rq->lock);
8616 * Priority of the task has changed. Check to see if we preempt
8620 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
8622 if (!task_on_rq_queued(p))
8626 * Reschedule if we are currently running on this runqueue and
8627 * our priority decreased, or if we are not currently running on
8628 * this runqueue and our priority is higher than the current's
8630 if (rq->curr == p) {
8631 if (p->prio > oldprio)
8634 check_preempt_curr(rq, p, 0);
8637 static inline bool vruntime_normalized(struct task_struct *p)
8639 struct sched_entity *se = &p->se;
8642 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
8643 * the dequeue_entity(.flags=0) will already have normalized the
8650 * When !on_rq, vruntime of the task has usually NOT been normalized.
8651 * But there are some cases where it has already been normalized:
8653 * - A forked child which is waiting for being woken up by
8654 * wake_up_new_task().
8655 * - A task which has been woken up by try_to_wake_up() and
8656 * waiting for actually being woken up by sched_ttwu_pending().
8658 if (!se->sum_exec_runtime || p->state == TASK_WAKING)
8664 static void detach_task_cfs_rq(struct task_struct *p)
8666 struct sched_entity *se = &p->se;
8667 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8668 u64 now = cfs_rq_clock_task(cfs_rq);
8670 if (!vruntime_normalized(p)) {
8672 * Fix up our vruntime so that the current sleep doesn't
8673 * cause 'unlimited' sleep bonus.
8675 place_entity(cfs_rq, se, 0);
8676 se->vruntime -= cfs_rq->min_vruntime;
8679 /* Catch up with the cfs_rq and remove our load when we leave */
8680 update_cfs_rq_load_avg(now, cfs_rq, false);
8681 detach_entity_load_avg(cfs_rq, se);
8682 update_tg_load_avg(cfs_rq, false);
8685 static void attach_task_cfs_rq(struct task_struct *p)
8687 struct sched_entity *se = &p->se;
8688 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8689 u64 now = cfs_rq_clock_task(cfs_rq);
8691 #ifdef CONFIG_FAIR_GROUP_SCHED
8693 * Since the real-depth could have been changed (only FAIR
8694 * class maintain depth value), reset depth properly.
8696 se->depth = se->parent ? se->parent->depth + 1 : 0;
8699 /* Synchronize task with its cfs_rq */
8700 update_cfs_rq_load_avg(now, cfs_rq, false);
8701 attach_entity_load_avg(cfs_rq, se);
8702 update_tg_load_avg(cfs_rq, false);
8704 if (!vruntime_normalized(p))
8705 se->vruntime += cfs_rq->min_vruntime;
8708 static void switched_from_fair(struct rq *rq, struct task_struct *p)
8710 detach_task_cfs_rq(p);
8713 static void switched_to_fair(struct rq *rq, struct task_struct *p)
8715 attach_task_cfs_rq(p);
8717 if (task_on_rq_queued(p)) {
8719 * We were most likely switched from sched_rt, so
8720 * kick off the schedule if running, otherwise just see
8721 * if we can still preempt the current task.
8726 check_preempt_curr(rq, p, 0);
8730 /* Account for a task changing its policy or group.
8732 * This routine is mostly called to set cfs_rq->curr field when a task
8733 * migrates between groups/classes.
8735 static void set_curr_task_fair(struct rq *rq)
8737 struct sched_entity *se = &rq->curr->se;
8739 for_each_sched_entity(se) {
8740 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8742 set_next_entity(cfs_rq, se);
8743 /* ensure bandwidth has been allocated on our new cfs_rq */
8744 account_cfs_rq_runtime(cfs_rq, 0);
8748 void init_cfs_rq(struct cfs_rq *cfs_rq)
8750 cfs_rq->tasks_timeline = RB_ROOT;
8751 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8752 #ifndef CONFIG_64BIT
8753 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8756 atomic_long_set(&cfs_rq->removed_load_avg, 0);
8757 atomic_long_set(&cfs_rq->removed_util_avg, 0);
8761 #ifdef CONFIG_FAIR_GROUP_SCHED
8762 static void task_set_group_fair(struct task_struct *p)
8764 struct sched_entity *se = &p->se;
8766 set_task_rq(p, task_cpu(p));
8767 se->depth = se->parent ? se->parent->depth + 1 : 0;
8770 static void task_move_group_fair(struct task_struct *p)
8772 detach_task_cfs_rq(p);
8773 set_task_rq(p, task_cpu(p));
8776 /* Tell se's cfs_rq has been changed -- migrated */
8777 p->se.avg.last_update_time = 0;
8779 attach_task_cfs_rq(p);
8782 static void task_change_group_fair(struct task_struct *p, int type)
8785 case TASK_SET_GROUP:
8786 task_set_group_fair(p);
8789 case TASK_MOVE_GROUP:
8790 task_move_group_fair(p);
8795 void free_fair_sched_group(struct task_group *tg)
8799 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8801 for_each_possible_cpu(i) {
8803 kfree(tg->cfs_rq[i]);
8812 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8814 struct sched_entity *se;
8815 struct cfs_rq *cfs_rq;
8819 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8822 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8826 tg->shares = NICE_0_LOAD;
8828 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8830 for_each_possible_cpu(i) {
8833 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8834 GFP_KERNEL, cpu_to_node(i));
8838 se = kzalloc_node(sizeof(struct sched_entity),
8839 GFP_KERNEL, cpu_to_node(i));
8843 init_cfs_rq(cfs_rq);
8844 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8845 init_entity_runnable_average(se);
8856 void online_fair_sched_group(struct task_group *tg)
8858 struct sched_entity *se;
8862 for_each_possible_cpu(i) {
8866 raw_spin_lock_irq(&rq->lock);
8867 post_init_entity_util_avg(se);
8868 sync_throttle(tg, i);
8869 raw_spin_unlock_irq(&rq->lock);
8873 void unregister_fair_sched_group(struct task_group *tg)
8875 unsigned long flags;
8879 for_each_possible_cpu(cpu) {
8881 remove_entity_load_avg(tg->se[cpu]);
8884 * Only empty task groups can be destroyed; so we can speculatively
8885 * check on_list without danger of it being re-added.
8887 if (!tg->cfs_rq[cpu]->on_list)
8892 raw_spin_lock_irqsave(&rq->lock, flags);
8893 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8894 raw_spin_unlock_irqrestore(&rq->lock, flags);
8898 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8899 struct sched_entity *se, int cpu,
8900 struct sched_entity *parent)
8902 struct rq *rq = cpu_rq(cpu);
8906 init_cfs_rq_runtime(cfs_rq);
8908 tg->cfs_rq[cpu] = cfs_rq;
8911 /* se could be NULL for root_task_group */
8916 se->cfs_rq = &rq->cfs;
8919 se->cfs_rq = parent->my_q;
8920 se->depth = parent->depth + 1;
8924 /* guarantee group entities always have weight */
8925 update_load_set(&se->load, NICE_0_LOAD);
8926 se->parent = parent;
8929 static DEFINE_MUTEX(shares_mutex);
8931 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8934 unsigned long flags;
8937 * We can't change the weight of the root cgroup.
8942 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8944 mutex_lock(&shares_mutex);
8945 if (tg->shares == shares)
8948 tg->shares = shares;
8949 for_each_possible_cpu(i) {
8950 struct rq *rq = cpu_rq(i);
8951 struct sched_entity *se;
8954 /* Propagate contribution to hierarchy */
8955 raw_spin_lock_irqsave(&rq->lock, flags);
8957 /* Possible calls to update_curr() need rq clock */
8958 update_rq_clock(rq);
8959 for_each_sched_entity(se)
8960 update_cfs_shares(group_cfs_rq(se));
8961 raw_spin_unlock_irqrestore(&rq->lock, flags);
8965 mutex_unlock(&shares_mutex);
8968 #else /* CONFIG_FAIR_GROUP_SCHED */
8970 void free_fair_sched_group(struct task_group *tg) { }
8972 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8977 void online_fair_sched_group(struct task_group *tg) { }
8979 void unregister_fair_sched_group(struct task_group *tg) { }
8981 #endif /* CONFIG_FAIR_GROUP_SCHED */
8984 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
8986 struct sched_entity *se = &task->se;
8987 unsigned int rr_interval = 0;
8990 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
8993 if (rq->cfs.load.weight)
8994 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
9000 * All the scheduling class methods:
9002 const struct sched_class fair_sched_class = {
9003 .next = &idle_sched_class,
9004 .enqueue_task = enqueue_task_fair,
9005 .dequeue_task = dequeue_task_fair,
9006 .yield_task = yield_task_fair,
9007 .yield_to_task = yield_to_task_fair,
9009 .check_preempt_curr = check_preempt_wakeup,
9011 .pick_next_task = pick_next_task_fair,
9012 .put_prev_task = put_prev_task_fair,
9015 .select_task_rq = select_task_rq_fair,
9016 .migrate_task_rq = migrate_task_rq_fair,
9018 .rq_online = rq_online_fair,
9019 .rq_offline = rq_offline_fair,
9021 .task_dead = task_dead_fair,
9022 .set_cpus_allowed = set_cpus_allowed_common,
9025 .set_curr_task = set_curr_task_fair,
9026 .task_tick = task_tick_fair,
9027 .task_fork = task_fork_fair,
9029 .prio_changed = prio_changed_fair,
9030 .switched_from = switched_from_fair,
9031 .switched_to = switched_to_fair,
9033 .get_rr_interval = get_rr_interval_fair,
9035 .update_curr = update_curr_fair,
9037 #ifdef CONFIG_FAIR_GROUP_SCHED
9038 .task_change_group = task_change_group_fair,
9042 #ifdef CONFIG_SCHED_DEBUG
9043 void print_cfs_stats(struct seq_file *m, int cpu)
9045 struct cfs_rq *cfs_rq;
9048 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
9049 print_cfs_rq(m, cpu, cfs_rq);
9053 #ifdef CONFIG_NUMA_BALANCING
9054 void show_numa_stats(struct task_struct *p, struct seq_file *m)
9057 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
9059 for_each_online_node(node) {
9060 if (p->numa_faults) {
9061 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
9062 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
9064 if (p->numa_group) {
9065 gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
9066 gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
9068 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
9071 #endif /* CONFIG_NUMA_BALANCING */
9072 #endif /* CONFIG_SCHED_DEBUG */
9074 __init void init_sched_fair_class(void)
9077 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
9079 #ifdef CONFIG_NO_HZ_COMMON
9080 nohz.next_balance = jiffies;
9081 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);