sched/fair: Fix PELT integrity for new tasks

author Peter Zijlstra <peterz@infradead.org>

Thu, 16 Jun 2016 11:29:28 +0000 (13:29 +0200)

committer Ingo Molnar <mingo@kernel.org>

Mon, 27 Jun 2016 10:17:53 +0000 (12:17 +0200)
author Peter Zijlstra <peterz@infradead.org>
Thu, 16 Jun 2016 11:29:28 +0000 (13:29 +0200)
committer Ingo Molnar <mingo@kernel.org>
Mon, 27 Jun 2016 10:17:53 +0000 (12:17 +0200)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index b45acfd..d99218a 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -219,9 +219,10 @@ extern void proc_sched_set_task(struct task_struct *p);
  #define TASK_WAKING            256
  #define TASK_PARKED            512
  #define TASK_NOLOAD            1024
-#define TASK_STATE_MAX         2048
+#define TASK_NEW               2048
+#define TASK_STATE_MAX         4096
  
-#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
  
  extern char ___assert_task_state[1 - 2*!!(
                 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 3d856c4..14afa51 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
  
         __sched_fork(clone_flags, p);
         /*
-        * We mark the process as running here. This guarantees that
+        * We mark the process as NEW here. This guarantees that
          * nobody will actually run it, and a signal or other external
          * event cannot wake it up and insert it on the runqueue either.
          */
-       p->state = TASK_RUNNING;
+       p->state = TASK_NEW;
  
         /*
          * Make sure we do not leak PI boosting priority to the child.
@@ -2383,6 +2383,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
                 p->sched_class = &fair_sched_class;
         }
  
+       init_entity_runnable_average(&p->se);
+
         /*
          * The child is not yet in the pid-hash so no cgroup attach races,
          * and the cgroup is pinned to this child due to cgroup_fork()
@@ -2529,9 +2531,8 @@ void wake_up_new_task(struct task_struct *p)
         struct rq_flags rf;
         struct rq *rq;
  
-       /* Initialize new task's runnable average */
-       init_entity_runnable_average(&p->se);
         raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+       p->state = TASK_RUNNING;
  #ifdef CONFIG_SMP
         /*
          * Fork balancing, do it here and not earlier because:
@@ -8237,6 +8238,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
  {
         struct task_struct *task;
         struct cgroup_subsys_state *css;
+       int ret = 0;
  
         cgroup_taskset_for_each(task, css, tset) {
  #ifdef CONFIG_RT_GROUP_SCHED
@@ -8247,8 +8249,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
                 if (task->sched_class != &fair_sched_class)
                         return -EINVAL;
  #endif
+               /*
+                * Serialize against wake_up_new_task() such that if it's
+                * running, we're sure to observe its full state.
+                */
+               raw_spin_lock_irq(&task->pi_lock);
+               /*
+                * Avoid calling sched_move_task() before wake_up_new_task()
+                * has happened. This would lead to problems with PELT, due to
+                * move wanting to detach+attach while we're not attached yet.
+                */
+               if (task->state == TASK_NEW)
+                       ret = -EINVAL;
+               raw_spin_unlock_irq(&task->pi_lock);
+
+               if (ret)
+                       break;
         }
-       return 0;
+       return ret;
  }
  
  static void cpu_cgroup_attach(struct cgroup_taskset *tset)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 64f26bc..0c21a12 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -690,6 +690,10 @@ void init_entity_runnable_average(struct sched_entity *se)
         /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
  }
  
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
  /*
   * With new tasks being created, their initial util_avgs are extrapolated
   * based on the cfs_rq's current util_avg:
@@ -720,6 +724,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         struct sched_avg *sa = &se->avg;
         long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+       u64 now = cfs_rq_clock_task(cfs_rq);
  
         if (cap > 0) {
                 if (cfs_rq->avg.util_avg != 0) {
@@ -733,16 +738,37 @@ void post_init_entity_util_avg(struct sched_entity *se)
                 }
                 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
         }
+
+       if (entity_is_task(se)) {
+               struct task_struct *p = task_of(se);
+               if (p->sched_class != &fair_sched_class) {
+                       /*
+                        * For !fair tasks do:
+                        *
+                       update_cfs_rq_load_avg(now, cfs_rq, false);
+                       attach_entity_load_avg(cfs_rq, se);
+                       switched_from_fair(rq, p);
+                        *
+                        * such that the next switched_to_fair() has the
+                        * expected state.
+                        */
+                       se->avg.last_update_time = now;
+                       return;
+               }
+       }
+
+       update_cfs_rq_load_avg(now, cfs_rq, false);
+       attach_entity_load_avg(cfs_rq, se);
  }
  
-#else
+#else /* !CONFIG_SMP */
  void init_entity_runnable_average(struct sched_entity *se)
  {
  }
  void post_init_entity_util_avg(struct sched_entity *se)
  {
  }
-#endif
+#endif /* CONFIG_SMP */
  
  /*
   * Update the current task's runtime statistics.
@@ -2840,8 +2866,6 @@ void set_task_rq_fair(struct sched_entity *se,
  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-
  static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
  {
         struct rq *rq = rq_of(cfs_rq);
@@ -2951,6 +2975,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
         /*
          * If we got migrated (either between CPUs or between cgroups) we'll
          * have aged the average right before clearing @last_update_time.
+        *
+        * Or we're fresh through post_init_entity_util_avg().
          */
         if (se->avg.last_update_time) {
                 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -3056,11 +3082,14 @@ void remove_entity_load_avg(struct sched_entity *se)
         u64 last_update_time;
  
         /*
-        * Newly created task or never used group entity should not be removed
-        * from its (source) cfs_rq
+        * tasks cannot exit without having gone through wake_up_new_task() ->
+        * post_init_entity_util_avg() which will have added things to the
+        * cfs_rq, so we can remove unconditionally.
+        *
+        * Similarly for groups, they will have passed through
+        * post_init_entity_util_avg() before unregister_sched_fair_group()
+        * calls this.
          */
-       if (se->avg.last_update_time == 0)
-               return;
  
         last_update_time = cfs_rq_last_update_time(cfs_rq);
author	Peter Zijlstra <peterz@infradead.org>
	Thu, 16 Jun 2016 11:29:28 +0000 (13:29 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Mon, 27 Jun 2016 10:17:53 +0000 (12:17 +0200)
include/linux/sched.h		patch | blob | history
kernel/sched/core.c		patch | blob | history
kernel/sched/fair.c		patch | blob | history