Merge tag 'trace-fixes-v4.6-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git...

[cascardo/linux.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 537d71e..218f8e8 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3099,7 +3099,14 @@ static int idle_balance(struct rq *this_rq);
  
  #else /* CONFIG_SMP */
  
-static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline void update_load_avg(struct sched_entity *se, int not_used)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       struct rq *rq = rq_of(cfs_rq);
+
+       cpufreq_trigger_update(rq_clock(rq));
+}
+
  static inline void
  enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
  static inline void
@@ -3247,10 +3254,41 @@ static inline void check_schedstat_required(void)
  #endif
  }
  
+
+/*
+ * MIGRATION
+ *
+ *     dequeue
+ *       update_curr()
+ *         update_min_vruntime()
+ *       vruntime -= min_vruntime
+ *
+ *     enqueue
+ *       update_curr()
+ *         update_min_vruntime()
+ *       vruntime += min_vruntime
+ *
+ * this way the vruntime transition between RQs is done when both
+ * min_vruntime are up-to-date.
+ *
+ * WAKEUP (remote)
+ *
+ *     ->migrate_task_rq_fair() (p->state == TASK_WAKING)
+ *       vruntime -= min_vruntime
+ *
+ *     enqueue
+ *       update_curr()
+ *         update_min_vruntime()
+ *       vruntime += min_vruntime
+ *
+ * this way we don't have the most up-to-date min_vruntime on the originating
+ * CPU, but we do have an up-to-date min_vruntime on the destination CPU.
+ */
+
  static void
  enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
-       bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING);
+       bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
         bool curr = cfs_rq->curr == se;
  
         /*
@@ -3264,7 +3302,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  
         /*
          * Otherwise, renormalise after, such that we're placed at the current
-        * moment in time, instead of some random moment in the past.
+        * moment in time, instead of some random moment in the past. Being
+        * placed in the past could significantly boost this task to the
+        * fairness detriment of existing tasks.
          */
         if (renorm && !curr)
                 se->vruntime += cfs_rq->min_vruntime;
@@ -4811,46 +4851,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
         return 0;
  }
  
-static void record_wakee(struct task_struct *p)
-{
-       /*
-        * Rough decay (wiping) for cost saving, don't worry
-        * about the boundary, really active task won't care
-        * about the loss.
-        */
-       if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
-               current->wakee_flips >>= 1;
-               current->wakee_flip_decay_ts = jiffies;
-       }
-
-       if (current->last_wakee != p) {
-               current->last_wakee = p;
-               current->wakee_flips++;
-       }
-}
-
-static void task_waking_fair(struct task_struct *p)
-{
-       struct sched_entity *se = &p->se;
-       struct cfs_rq *cfs_rq = cfs_rq_of(se);
-       u64 min_vruntime;
-
-#ifndef CONFIG_64BIT
-       u64 min_vruntime_copy;
-
-       do {
-               min_vruntime_copy = cfs_rq->min_vruntime_copy;
-               smp_rmb();
-               min_vruntime = cfs_rq->min_vruntime;
-       } while (min_vruntime != min_vruntime_copy);
-#else
-       min_vruntime = cfs_rq->min_vruntime;
-#endif
-
-       se->vruntime -= min_vruntime;
-       record_wakee(p);
-}
-
  #ifdef CONFIG_FAIR_GROUP_SCHED
  /*
   * effective_load() calculates the load change as seen from the root_task_group
@@ -4966,17 +4966,39 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
  
  #endif
  
+static void record_wakee(struct task_struct *p)
+{
+       /*
+        * Only decay a single time; tasks that have less than 1 wakeup per
+        * jiffy will not have built up many flips.
+        */
+       if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
+               current->wakee_flips >>= 1;
+               current->wakee_flip_decay_ts = jiffies;
+       }
+
+       if (current->last_wakee != p) {
+               current->last_wakee = p;
+               current->wakee_flips++;
+       }
+}
+
  /*
   * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+ *
   * A waker of many should wake a different task than the one last awakened
- * at a frequency roughly N times higher than one of its wakees.  In order
- * to determine whether we should let the load spread vs consolodating to
- * shared cache, we look for a minimum 'flip' frequency of llc_size in one
- * partner, and a factor of lls_size higher frequency in the other.  With
- * both conditions met, we can be relatively sure that the relationship is
- * non-monogamous, with partner count exceeding socket size.  Waker/wakee
- * being client/server, worker/dispatcher, interrupt source or whatever is
- * irrelevant, spread criteria is apparent partner count exceeds socket size.
+ * at a frequency roughly N times higher than one of its wakees.
+ *
+ * In order to determine whether we should let the load spread vs consolidating
+ * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of llc_size higher frequency in the other.
+ *
+ * With both conditions met, we can be relatively sure that the relationship is
+ * non-monogamous, with partner count exceeding socket size.
+ *
+ * Waker/wakee being client/server, worker/dispatcher, interrupt source or
+ * whatever is irrelevant; the spread criterion is that the apparent partner
+ * count exceeds socket size.
   */
  static int wake_wide(struct task_struct *p)
  {
@@ -5281,8 +5303,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
         int want_affine = 0;
         int sync = wake_flags & WF_SYNC;
  
-       if (sd_flag & SD_BALANCE_WAKE)
+       if (sd_flag & SD_BALANCE_WAKE) {
+               record_wakee(p);
                 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+       }
  
         rcu_read_lock();
         for_each_domain(cpu, tmp) {
@@ -5361,6 +5385,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
   */
  static void migrate_task_rq_fair(struct task_struct *p)
  {
+       /*
+        * As blocked tasks retain absolute vruntime the migration needs to
+        * deal with this by subtracting the old and adding the new
+        * min_vruntime -- the latter is done by enqueue_entity() when placing
+        * the task on the new runqueue.
+        */
+       if (p->state == TASK_WAKING) {
+               struct sched_entity *se = &p->se;
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+               u64 min_vruntime;
+
+#ifndef CONFIG_64BIT
+               u64 min_vruntime_copy;
+
+               do {
+                       min_vruntime_copy = cfs_rq->min_vruntime_copy;
+                       smp_rmb();
+                       min_vruntime = cfs_rq->min_vruntime;
+               } while (min_vruntime != min_vruntime_copy);
+#else
+               min_vruntime = cfs_rq->min_vruntime;
+#endif
+
+               se->vruntime -= min_vruntime;
+       }
+
         /*
          * We are supposed to update the task to "current" time, then its up to date
          * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
@@ -7000,9 +7050,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
         }
  
         /*
-        * In the presence of smp nice balancing, certain scenarios can have
-        * max load less than avg load(as we skip the groups at or below
-        * its cpu_capacity, while calculating max_load..)
+        * Avg load of busiest sg can be less and avg load of local sg can
+        * be greater than avg load across all sgs of sd because avg load
+        * factors in sg capacity and sgs with smaller group_type are
+        * skipped when updating the busiest sg:
          */
         if (busiest->avg_load <= sds->avg_load ||
             local->avg_load >= sds->avg_load) {
@@ -7015,11 +7066,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
          */
         if (busiest->group_type == group_overloaded &&
             local->group_type   == group_overloaded) {
-               load_above_capacity = busiest->sum_nr_running *
-                                     scale_load_down(NICE_0_LOAD);
-               if (load_above_capacity > busiest->group_capacity)
+               load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
+               if (load_above_capacity > busiest->group_capacity) {
                         load_above_capacity -= busiest->group_capacity;
-               else
+                       load_above_capacity *= NICE_0_LOAD;
+                       load_above_capacity /= busiest->group_capacity;
+               } else
                         load_above_capacity = ~0UL;
         }
  
@@ -7814,7 +7866,7 @@ static void nohz_balancer_kick(void)
         return;
  }
  
-static inline void nohz_balance_exit_idle(int cpu)
+void nohz_balance_exit_idle(unsigned int cpu)
  {
         if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
                 /*
@@ -7887,18 +7939,6 @@ void nohz_balance_enter_idle(int cpu)
         atomic_inc(&nohz.nr_cpus);
         set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
  }
-
-static int sched_ilb_notifier(struct notifier_block *nfb,
-                                       unsigned long action, void *hcpu)
-{
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_DYING:
-               nohz_balance_exit_idle(smp_processor_id());
-               return NOTIFY_OK;
-       default:
-               return NOTIFY_DONE;
-       }
-}
  #endif
  
  static DEFINE_SPINLOCK(balancing);
@@ -8642,7 +8682,6 @@ const struct sched_class fair_sched_class = {
         .rq_online              = rq_online_fair,
         .rq_offline             = rq_offline_fair,
  
-       .task_waking            = task_waking_fair,
         .task_dead              = task_dead_fair,
         .set_cpus_allowed       = set_cpus_allowed_common,
  #endif
@@ -8704,7 +8743,6 @@ __init void init_sched_fair_class(void)
  #ifdef CONFIG_NO_HZ_COMMON
         nohz.next_balance = jiffies;
         zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
-       cpu_notifier(sched_ilb_notifier, 0);
  #endif
  #endif /* SMP */