cpufreq / sched: Pass runqueue pointer to cpufreq_update_util()

[cascardo/linux.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index c8c5d2d..5d558cc 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -690,6 +690,11 @@ void init_entity_runnable_average(struct sched_entity *se)
         /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
  }
  
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
  /*
   * With new tasks being created, their initial util_avgs are extrapolated
   * based on the cfs_rq's current util_avg:
@@ -720,6 +725,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         struct sched_avg *sa = &se->avg;
         long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+       u64 now = cfs_rq_clock_task(cfs_rq);
+       int tg_update;
  
         if (cap > 0) {
                 if (cfs_rq->avg.util_avg != 0) {
@@ -733,16 +740,42 @@ void post_init_entity_util_avg(struct sched_entity *se)
                 }
                 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
         }
+
+       if (entity_is_task(se)) {
+               struct task_struct *p = task_of(se);
+               if (p->sched_class != &fair_sched_class) {
+                       /*
+                        * For !fair tasks do:
+                        *
+                       update_cfs_rq_load_avg(now, cfs_rq, false);
+                       attach_entity_load_avg(cfs_rq, se);
+                       switched_from_fair(rq, p);
+                        *
+                        * such that the next switched_to_fair() has the
+                        * expected state.
+                        */
+                       se->avg.last_update_time = now;
+                       return;
+               }
+       }
+
+       tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+       attach_entity_load_avg(cfs_rq, se);
+       if (tg_update)
+               update_tg_load_avg(cfs_rq, false);
  }
  
-#else
+#else /* !CONFIG_SMP */
  void init_entity_runnable_average(struct sched_entity *se)
  {
  }
  void post_init_entity_util_avg(struct sched_entity *se)
  {
  }
-#endif
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+{
+}
+#endif /* CONFIG_SMP */
  
  /*
   * Update the current task's runtime statistics.
@@ -1303,6 +1336,8 @@ static void task_numa_assign(struct task_numa_env *env,
  {
         if (env->best_task)
                 put_task_struct(env->best_task);
+       if (p)
+               get_task_struct(p);
  
         env->best_task = p;
         env->best_imp = imp;
@@ -1370,31 +1405,11 @@ static void task_numa_compare(struct task_numa_env *env,
         long imp = env->p->numa_group ? groupimp : taskimp;
         long moveimp = imp;
         int dist = env->dist;
-       bool assigned = false;
  
         rcu_read_lock();
-
-       raw_spin_lock_irq(&dst_rq->lock);
-       cur = dst_rq->curr;
-       /*
-        * No need to move the exiting task or idle task.
-        */
-       if ((cur->flags & PF_EXITING) || is_idle_task(cur))
+       cur = task_rcu_dereference(&dst_rq->curr);
+       if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
                 cur = NULL;
-       else {
-               /*
-                * The task_struct must be protected here to protect the
-                * p->numa_faults access in the task_weight since the
-                * numa_faults could already be freed in the following path:
-                * finish_task_switch()
-                *     --> put_task_struct()
-                *         --> __put_task_struct()
-                *             --> task_numa_free()
-                */
-               get_task_struct(cur);
-       }
-
-       raw_spin_unlock_irq(&dst_rq->lock);
  
         /*
          * Because we have preemption enabled we can get migrated around and
@@ -1477,7 +1492,6 @@ balance:
                  */
                 if (!load_too_imbalanced(src_load, dst_load, env)) {
                         imp = moveimp - 1;
-                       put_task_struct(cur);
                         cur = NULL;
                         goto assign;
                 }
@@ -1503,16 +1517,9 @@ balance:
                 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
  
  assign:
-       assigned = true;
         task_numa_assign(env, cur, imp);
  unlock:
         rcu_read_unlock();
-       /*
-        * The dst_rq->curr isn't assigned. The protection for task_struct is
-        * finished.
-        */
-       if (cur && !assigned)
-               put_task_struct(cur);
  }
  
  static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2866,16 +2873,9 @@ void set_task_rq_fair(struct sched_entity *se,
  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-
  static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
  {
-       struct rq *rq = rq_of(cfs_rq);
-       int cpu = cpu_of(rq);
-
-       if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
-               unsigned long max = rq->cpu_capacity_orig;
-
+       if (&this_rq()->cfs == cfs_rq) {
                 /*
                  * There are a few boundary cases this might miss but it should
                  * get called often enough that that should (hopefully) not be
@@ -2892,8 +2892,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
                  *
                  * See cpu_util().
                  */
-               cpufreq_update_util(rq_clock(rq),
-                                   min(cfs_rq->avg.util_avg, max), max);
+               cpufreq_update_util(rq_of(cfs_rq), 0);
         }
  }
  
@@ -2914,7 +2913,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
         WRITE_ONCE(*ptr, res);                                  \
  } while (0)
  
-/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
+/**
+ * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
+ * @now: current time, as per cfs_rq_clock_task()
+ * @cfs_rq: cfs_rq to update
+ * @update_freq: should we call cfs_rq_util_change() or will the call do so
+ *
+ * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
+ * avg. The immediate corollary is that all (fair) tasks must be attached, see
+ * post_init_entity_util_avg().
+ *
+ * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ *
+ * Returns true if the load decayed or we removed utilization. It is expected
+ * that one calls update_tg_load_avg() on this condition, but after you've
+ * modified the cfs_rq avg (attach/detach), such that we propagate the new
+ * avg up.
+ */
  static inline int
  update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
  {
@@ -2969,6 +2984,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
                 update_tg_load_avg(cfs_rq, 0);
  }
  
+/**
+ * attach_entity_load_avg - attach this entity to its cfs_rq load avg
+ * @cfs_rq: cfs_rq to attach to
+ * @se: sched_entity to attach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
  static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
         if (!sched_feat(ATTACH_AGE_LOAD))
@@ -2977,6 +3000,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
         /*
          * If we got migrated (either between CPUs or between cgroups) we'll
          * have aged the average right before clearing @last_update_time.
+        *
+        * Or we're fresh through post_init_entity_util_avg().
          */
         if (se->avg.last_update_time) {
                 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -2998,6 +3023,14 @@ skip_aging:
         cfs_rq_util_change(cfs_rq);
  }
  
+/**
+ * detach_entity_load_avg - detach this entity from its cfs_rq load avg
+ * @cfs_rq: cfs_rq to detach from
+ * @se: sched_entity to detach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
  static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
         __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -3082,11 +3115,14 @@ void remove_entity_load_avg(struct sched_entity *se)
         u64 last_update_time;
  
         /*
-        * Newly created task or never used group entity should not be removed
-        * from its (source) cfs_rq
+        * tasks cannot exit without having gone through wake_up_new_task() ->
+        * post_init_entity_util_avg() which will have added things to the
+        * cfs_rq, so we can remove unconditionally.
+        *
+        * Similarly for groups, they will have passed through
+        * post_init_entity_util_avg() before unregister_sched_fair_group()
+        * calls this.
          */
-       if (se->avg.last_update_time == 0)
-               return;
  
         last_update_time = cfs_rq_last_update_time(cfs_rq);
  
@@ -3109,12 +3145,15 @@ static int idle_balance(struct rq *this_rq);
  
  #else /* CONFIG_SMP */
  
-static inline void update_load_avg(struct sched_entity *se, int not_used)
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
  {
-       struct cfs_rq *cfs_rq = cfs_rq_of(se);
-       struct rq *rq = rq_of(cfs_rq);
+       return 0;
+}
  
-       cpufreq_trigger_update(rq_clock(rq));
+static inline void update_load_avg(struct sched_entity *se, int not_used)
+{
+       cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
  }
  
  static inline void
@@ -3698,7 +3737,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
  {
         if (unlikely(cfs_rq->throttle_count))
-               return cfs_rq->throttled_clock_task;
+               return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
  
         return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
  }
@@ -3836,13 +3875,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
  
         cfs_rq->throttle_count--;
-#ifdef CONFIG_SMP
         if (!cfs_rq->throttle_count) {
                 /* adjust cfs_rq_clock_task() */
                 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
                                              cfs_rq->throttled_clock_task;
         }
-#endif
  
         return 0;
  }
@@ -4195,26 +4232,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
         if (!cfs_bandwidth_used())
                 return;
  
-       /* Synchronize hierarchical throttle counter: */
-       if (unlikely(!cfs_rq->throttle_uptodate)) {
-               struct rq *rq = rq_of(cfs_rq);
-               struct cfs_rq *pcfs_rq;
-               struct task_group *tg;
-
-               cfs_rq->throttle_uptodate = 1;
-
-               /* Get closest up-to-date node, because leaves go first: */
-               for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
-                       pcfs_rq = tg->cfs_rq[cpu_of(rq)];
-                       if (pcfs_rq->throttle_uptodate)
-                               break;
-               }
-               if (tg) {
-                       cfs_rq->throttle_count = pcfs_rq->throttle_count;
-                       cfs_rq->throttled_clock_task = rq_clock_task(rq);
-               }
-       }
-
         /* an active group must be handled by the update_curr()->put() path */
         if (!cfs_rq->runtime_enabled || cfs_rq->curr)
                 return;
@@ -4229,6 +4246,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
                 throttle_cfs_rq(cfs_rq);
  }
  
+static void sync_throttle(struct task_group *tg, int cpu)
+{
+       struct cfs_rq *pcfs_rq, *cfs_rq;
+
+       if (!cfs_bandwidth_used())
+               return;
+
+       if (!tg->parent)
+               return;
+
+       cfs_rq = tg->cfs_rq[cpu];
+       pcfs_rq = tg->parent->cfs_rq[cpu];
+
+       cfs_rq->throttle_count = pcfs_rq->throttle_count;
+       cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+}
+
  /* conditionally throttle active cfs_rq's from put_prev_entity() */
  static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  {
@@ -4368,6 +4402,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
  static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
  static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
  static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static inline void sync_throttle(struct task_group *tg, int cpu) {}
  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
  
  static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -4476,7 +4511,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                  *
                  * note: in the case of encountering a throttled cfs_rq we will
                  * post the final h_nr_running increment below.
-               */
+                */
                 if (cfs_rq_throttled(cfs_rq))
                         break;
                 cfs_rq->h_nr_running++;
@@ -8317,31 +8352,17 @@ static void task_fork_fair(struct task_struct *p)
  {
         struct cfs_rq *cfs_rq;
         struct sched_entity *se = &p->se, *curr;
-       int this_cpu = smp_processor_id();
         struct rq *rq = this_rq();
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
  
+       raw_spin_lock(&rq->lock);
         update_rq_clock(rq);
  
         cfs_rq = task_cfs_rq(current);
         curr = cfs_rq->curr;
-
-       /*
-        * Not only the cpu but also the task_group of the parent might have
-        * been changed after parent->se.parent,cfs_rq were copied to
-        * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
-        * of child point to valid ones.
-        */
-       rcu_read_lock();
-       __set_task_cpu(p, this_cpu);
-       rcu_read_unlock();
-
-       update_curr(cfs_rq);
-
-       if (curr)
+       if (curr) {
+               update_curr(cfs_rq);
                 se->vruntime = curr->vruntime;
+       }
         place_entity(cfs_rq, se, 1);
  
         if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
@@ -8354,8 +8375,7 @@ static void task_fork_fair(struct task_struct *p)
         }
  
         se->vruntime -= cfs_rq->min_vruntime;
-
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       raw_spin_unlock(&rq->lock);
  }
  
  /*
@@ -8411,6 +8431,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
  {
         struct sched_entity *se = &p->se;
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       u64 now = cfs_rq_clock_task(cfs_rq);
+       int tg_update;
  
         if (!vruntime_normalized(p)) {
                 /*
@@ -8422,13 +8444,18 @@ static void detach_task_cfs_rq(struct task_struct *p)
         }
  
         /* Catch up with the cfs_rq and remove our load when we leave */
+       tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
         detach_entity_load_avg(cfs_rq, se);
+       if (tg_update)
+               update_tg_load_avg(cfs_rq, false);
  }
  
  static void attach_task_cfs_rq(struct task_struct *p)
  {
         struct sched_entity *se = &p->se;
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       u64 now = cfs_rq_clock_task(cfs_rq);
+       int tg_update;
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
         /*
@@ -8439,7 +8466,10 @@ static void attach_task_cfs_rq(struct task_struct *p)
  #endif
  
         /* Synchronize task with its cfs_rq */
+       tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
         attach_entity_load_avg(cfs_rq, se);
+       if (tg_update)
+               update_tg_load_avg(cfs_rq, false);
  
         if (!vruntime_normalized(p))
                 se->vruntime += cfs_rq->min_vruntime;
@@ -8499,6 +8529,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
+static void task_set_group_fair(struct task_struct *p)
+{
+       struct sched_entity *se = &p->se;
+
+       set_task_rq(p, task_cpu(p));
+       se->depth = se->parent ? se->parent->depth + 1 : 0;
+}
+
  static void task_move_group_fair(struct task_struct *p)
  {
         detach_task_cfs_rq(p);
@@ -8511,6 +8549,19 @@ static void task_move_group_fair(struct task_struct *p)
         attach_task_cfs_rq(p);
  }
  
+static void task_change_group_fair(struct task_struct *p, int type)
+{
+       switch (type) {
+       case TASK_SET_GROUP:
+               task_set_group_fair(p);
+               break;
+
+       case TASK_MOVE_GROUP:
+               task_move_group_fair(p);
+               break;
+       }
+}
+
  void free_fair_sched_group(struct task_group *tg)
  {
         int i;
@@ -8562,10 +8613,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                 init_cfs_rq(cfs_rq);
                 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
                 init_entity_runnable_average(se);
-
-               raw_spin_lock_irq(&rq->lock);
-               post_init_entity_util_avg(se);
-               raw_spin_unlock_irq(&rq->lock);
         }
  
         return 1;
@@ -8576,6 +8623,23 @@ err:
         return 0;
  }
  
+void online_fair_sched_group(struct task_group *tg)
+{
+       struct sched_entity *se;
+       struct rq *rq;
+       int i;
+
+       for_each_possible_cpu(i) {
+               rq = cpu_rq(i);
+               se = tg->se[i];
+
+               raw_spin_lock_irq(&rq->lock);
+               post_init_entity_util_avg(se);
+               sync_throttle(tg, i);
+               raw_spin_unlock_irq(&rq->lock);
+       }
+}
+
  void unregister_fair_sched_group(struct task_group *tg)
  {
         unsigned long flags;
@@ -8680,6 +8744,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
         return 1;
  }
  
+void online_fair_sched_group(struct task_group *tg) { }
+
  void unregister_fair_sched_group(struct task_group *tg) { }
  
  #endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -8739,7 +8805,7 @@ const struct sched_class fair_sched_class = {
         .update_curr            = update_curr_fair,
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-       .task_move_group        = task_move_group_fair,
+       .task_change_group      = task_change_group_fair,
  #endif
  };