Merge tag 'driver-core-4.9-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...

[cascardo/linux.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 15902cd..502e95a 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -262,9 +262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  
  static inline struct task_struct *task_of(struct sched_entity *se)
  {
-#ifdef CONFIG_SCHED_DEBUG
-       WARN_ON_ONCE(!entity_is_task(se));
-#endif
+       SCHED_WARN_ON(!entity_is_task(se));
         return container_of(se, struct task_struct, se);
  }
  
@@ -462,17 +460,23 @@ static inline int entity_before(struct sched_entity *a,
  
  static void update_min_vruntime(struct cfs_rq *cfs_rq)
  {
+       struct sched_entity *curr = cfs_rq->curr;
+
         u64 vruntime = cfs_rq->min_vruntime;
  
-       if (cfs_rq->curr)
-               vruntime = cfs_rq->curr->vruntime;
+       if (curr) {
+               if (curr->on_rq)
+                       vruntime = curr->vruntime;
+               else
+                       curr = NULL;
+       }
  
         if (cfs_rq->rb_leftmost) {
                 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
                                                    struct sched_entity,
                                                    run_node);
  
-               if (!cfs_rq->curr)
+               if (!curr)
                         vruntime = se->vruntime;
                 else
                         vruntime = min_vruntime(vruntime, se->vruntime);
@@ -1582,9 +1586,16 @@ balance:
          * One idle CPU per node is evaluated for a task numa move.
          * Call select_idle_sibling to maybe find a better one.
          */
-       if (!cur)
+       if (!cur) {
+               /*
+                * select_idle_siblings() uses an per-cpu cpumask that
+                * can be used from IRQ context.
+                */
+               local_irq_disable();
                 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
                                                    env->dst_cpu);
+               local_irq_enable();
+       }
  
  assign:
         task_numa_assign(env, cur, imp);
@@ -2362,7 +2373,7 @@ void task_numa_work(struct callback_head *work)
         unsigned long nr_pte_updates = 0;
         long pages, virtpages;
  
-       WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+       SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
  
         work->next = work; /* protect against double add */
         /*
@@ -2957,12 +2968,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
  
  static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
  {
-       struct rq *rq = rq_of(cfs_rq);
-       int cpu = cpu_of(rq);
-
-       if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
-               unsigned long max = rq->cpu_capacity_orig;
-
+       if (&this_rq()->cfs == cfs_rq) {
                 /*
                  * There are a few boundary cases this might miss but it should
                  * get called often enough that that should (hopefully) not be
@@ -2979,8 +2985,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
                  *
                  * See cpu_util().
                  */
-               cpufreq_update_util(rq_clock(rq),
-                                   min(cfs_rq->avg.util_avg, max), max);
+               cpufreq_update_util(rq_of(cfs_rq), 0);
         }
  }
  
@@ -3241,10 +3246,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
  
  static inline void update_load_avg(struct sched_entity *se, int not_used)
  {
-       struct cfs_rq *cfs_rq = cfs_rq_of(se);
-       struct rq *rq = rq_of(cfs_rq);
-
-       cpufreq_trigger_update(rq_clock(rq));
+       cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
  }
  
  static inline void
@@ -3473,9 +3475,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         account_entity_dequeue(cfs_rq, se);
  
         /*
-        * Normalize the entity after updating the min_vruntime because the
-        * update can refer to the ->curr item and we need to reflect this
-        * movement in our normalized position.
+        * Normalize after update_curr(); which will also have moved
+        * min_vruntime if @se is the one holding it back. But before doing
+        * update_min_vruntime() again, which will discount @se's position and
+        * can move min_vruntime forward still more.
          */
         if (!(flags & DEQUEUE_SLEEP))
                 se->vruntime -= cfs_rq->min_vruntime;
@@ -3483,8 +3486,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         /* return excess runtime on last dequeue */
         return_cfs_rq_runtime(cfs_rq);
  
-       update_min_vruntime(cfs_rq);
         update_cfs_shares(cfs_rq);
+
+       /*
+        * Now advance min_vruntime if @se was the entity holding it back,
+        * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
+        * put back on, and if we advance min_vruntime, we'll be placed back
+        * further than we started -- ie. we'll be penalized.
+        */
+       if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+               update_min_vruntime(cfs_rq);
  }
  
  /*
@@ -4467,7 +4478,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
         struct sched_entity *se = &p->se;
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
  
-       WARN_ON(task_rq(p) != rq);
+       SCHED_WARN_ON(task_rq(p) != rq);
  
         if (rq->cfs.h_nr_running > 1) {
                 u64 slice = sched_slice(cfs_rq, se);
@@ -4520,6 +4531,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         struct cfs_rq *cfs_rq;
         struct sched_entity *se = &p->se;
  
+       /*
+        * If in_iowait is set, the code below may not trigger any cpufreq
+        * utilization updates, so do it here explicitly with the IOWAIT flag
+        * passed.
+        */
+       if (p->in_iowait)
+               cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+
         for_each_sched_entity(se) {
                 if (se->on_rq)
                         break;
@@ -4616,6 +4635,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
  }
  
  #ifdef CONFIG_SMP
+
+/* Working cpumask for: load_balance, load_balance_newidle. */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+
  #ifdef CONFIG_NO_HZ_COMMON
  /*
   * per rq 'load' arrray crap; XXX kill this.
@@ -5280,65 +5304,237 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  }
  
  /*
- * Try and locate an idle CPU in the sched_domain.
+ * Implement a for_each_cpu() variant that starts the scan at a given cpu
+ * (@start), and wraps around.
+ *
+ * This is used to scan for idle CPUs; such that not all CPUs looking for an
+ * idle CPU find the same CPU. The down-side is that tasks tend to cycle
+ * through the LLC domain.
+ *
+ * Especially tbench is found sensitive to this.
+ */
+
+static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
+{
+       int next;
+
+again:
+       next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
+
+       if (*wrapped) {
+               if (next >= start)
+                       return nr_cpumask_bits;
+       } else {
+               if (next >= nr_cpumask_bits) {
+                       *wrapped = 1;
+                       n = -1;
+                       goto again;
+               }
+       }
+
+       return next;
+}
+
+#define for_each_cpu_wrap(cpu, mask, start, wrap)                              \
+       for ((wrap) = 0, (cpu) = (start)-1;                                     \
+               (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)),     \
+               (cpu) < nr_cpumask_bits; )
+
+#ifdef CONFIG_SCHED_SMT
+
+static inline void set_idle_cores(int cpu, int val)
+{
+       struct sched_domain_shared *sds;
+
+       sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+       if (sds)
+               WRITE_ONCE(sds->has_idle_cores, val);
+}
+
+static inline bool test_idle_cores(int cpu, bool def)
+{
+       struct sched_domain_shared *sds;
+
+       sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+       if (sds)
+               return READ_ONCE(sds->has_idle_cores);
+
+       return def;
+}
+
+/*
+ * Scans the local SMT mask to see if the entire core is idle, and records this
+ * information in sd_llc_shared->has_idle_cores.
+ *
+ * Since SMT siblings share all cache levels, inspecting this limited remote
+ * state should be fairly cheap.
+ */
+void __update_idle_core(struct rq *rq)
+{
+       int core = cpu_of(rq);
+       int cpu;
+
+       rcu_read_lock();
+       if (test_idle_cores(core, true))
+               goto unlock;
+
+       for_each_cpu(cpu, cpu_smt_mask(core)) {
+               if (cpu == core)
+                       continue;
+
+               if (!idle_cpu(cpu))
+                       goto unlock;
+       }
+
+       set_idle_cores(core, 1);
+unlock:
+       rcu_read_unlock();
+}
+
+/*
+ * Scan the entire LLC domain for idle cores; this dynamically switches off if
+ * there are no idle cores left in the system; tracked through
+ * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ */
+static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+       struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+       int core, cpu, wrap;
+
+       if (!static_branch_likely(&sched_smt_present))
+               return -1;
+
+       if (!test_idle_cores(target, false))
+               return -1;
+
+       cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+
+       for_each_cpu_wrap(core, cpus, target, wrap) {
+               bool idle = true;
+
+               for_each_cpu(cpu, cpu_smt_mask(core)) {
+                       cpumask_clear_cpu(cpu, cpus);
+                       if (!idle_cpu(cpu))
+                               idle = false;
+               }
+
+               if (idle)
+                       return core;
+       }
+
+       /*
+        * Failed to find an idle core; stop looking for one.
+        */
+       set_idle_cores(target, 0);
+
+       return -1;
+}
+
+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+       int cpu;
+
+       if (!static_branch_likely(&sched_smt_present))
+               return -1;
+
+       for_each_cpu(cpu, cpu_smt_mask(target)) {
+               if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+                       continue;
+               if (idle_cpu(cpu))
+                       return cpu;
+       }
+
+       return -1;
+}
+
+#else /* CONFIG_SCHED_SMT */
+
+static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+       return -1;
+}
+
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+       return -1;
+}
+
+#endif /* CONFIG_SCHED_SMT */
+
+/*
+ * Scan the LLC domain for idle CPUs; this is dynamically regulated by
+ * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
+ * average idle time for this rq (as found in rq->avg_idle).
+ */
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+{
+       struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+       u64 avg_idle = this_rq()->avg_idle;
+       u64 avg_cost = this_sd->avg_scan_cost;
+       u64 time, cost;
+       s64 delta;
+       int cpu, wrap;
+
+       /*
+        * Due to large variance we need a large fuzz factor; hackbench in
+        * particularly is sensitive here.
+        */
+       if ((avg_idle / 512) < avg_cost)
+               return -1;
+
+       time = local_clock();
+
+       for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+               if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+                       continue;
+               if (idle_cpu(cpu))
+                       break;
+       }
+
+       time = local_clock() - time;
+       cost = this_sd->avg_scan_cost;
+       delta = (s64)(time - cost) / 8;
+       this_sd->avg_scan_cost += delta;
+
+       return cpu;
+}
+
+/*
+ * Try and locate an idle core/thread in the LLC cache domain.
   */
  static int select_idle_sibling(struct task_struct *p, int prev, int target)
  {
         struct sched_domain *sd;
-       struct sched_group *sg;
+       int i;
  
         if (idle_cpu(target))
                 return target;
  
         /*
-        * If the prevous cpu is cache affine and idle, don't be stupid.
+        * If the previous cpu is cache affine and idle, don't be stupid.
          */
         if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
                 return prev;
  
-       /*
-        * Otherwise, iterate the domains and find an eligible idle cpu.
-        *
-        * A completely idle sched group at higher domains is more
-        * desirable than an idle group at a lower level, because lower
-        * domains have smaller groups and usually share hardware
-        * resources which causes tasks to contend on them, e.g. x86
-        * hyperthread siblings in the lowest domain (SMT) can contend
-        * on the shared cpu pipeline.
-        *
-        * However, while we prefer idle groups at higher domains
-        * finding an idle cpu at the lowest domain is still better than
-        * returning 'target', which we've already established, isn't
-        * idle.
-        */
         sd = rcu_dereference(per_cpu(sd_llc, target));
-       for_each_lower_domain(sd) {
-               sg = sd->groups;
-               do {
-                       int i;
+       if (!sd)
+               return target;
  
-                       if (!cpumask_intersects(sched_group_cpus(sg),
-                                               tsk_cpus_allowed(p)))
-                               goto next;
+       i = select_idle_core(p, sd, target);
+       if ((unsigned)i < nr_cpumask_bits)
+               return i;
  
-                       /* Ensure the entire group is idle */
-                       for_each_cpu(i, sched_group_cpus(sg)) {
-                               if (i == target || !idle_cpu(i))
-                                       goto next;
-                       }
+       i = select_idle_cpu(p, sd, target);
+       if ((unsigned)i < nr_cpumask_bits)
+               return i;
+
+       i = select_idle_smt(p, sd, target);
+       if ((unsigned)i < nr_cpumask_bits)
+               return i;
  
-                       /*
-                        * It doesn't matter which cpu we pick, the
-                        * whole group is idle.
-                        */
-                       target = cpumask_first_and(sched_group_cpus(sg),
-                                       tsk_cpus_allowed(p));
-                       goto done;
-next:
-                       sg = sg->next;
-               } while (sg != sd->groups);
-       }
-done:
         return target;
  }
  
@@ -7397,9 +7593,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
   */
  #define MAX_PINNED_INTERVAL    512
  
-/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-
  static int need_active_balance(struct lb_env *env)
  {
         struct sched_domain *sd = env->sd;