Merge branch 'x86-cleanups-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

[cascardo/linux.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 97ee9ac..5c883fe 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1937,7 +1937,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
   * chain to provide order. Instead we do:
   *
   *   1) smp_store_release(X->on_cpu, 0)
- *   2) smp_cond_acquire(!X->on_cpu)
+ *   2) smp_cond_load_acquire(&X->on_cpu, !VAL)
   *
   * Example:
   *
@@ -1948,7 +1948,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
   *   sched-out X
   *   smp_store_release(X->on_cpu, 0);
   *
- *                    smp_cond_acquire(!X->on_cpu);
+ *                    smp_cond_load_acquire(&X->on_cpu, !VAL);
   *                    X->state = WAKING
   *                    set_task_cpu(X,2)
   *
@@ -1974,7 +1974,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
   * This means that any means of doing remote wakeups must order the CPU doing
   * the wakeup against the CPU the task is going to end up running on. This,
   * however, is already required for the regular Program-Order guarantee above,
- * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire).
+ * since the waking CPU is the one issuing the ACQUIRE (smp_cond_load_acquire).
   *
   */
  
@@ -2047,7 +2047,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
          * This ensures that tasks getting woken will be fully ordered against
          * their previous state and preserve Program Order.
          */
-       smp_cond_acquire(!p->on_cpu);
+       smp_cond_load_acquire(&p->on_cpu, !VAL);
  
         p->sched_contributes_to_load = !!task_contributes_to_load(p);
         p->state = TASK_WAKING;
@@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
  
         __sched_fork(clone_flags, p);
         /*
-        * We mark the process as running here. This guarantees that
+        * We mark the process as NEW here. This guarantees that
          * nobody will actually run it, and a signal or other external
          * event cannot wake it up and insert it on the runqueue either.
          */
-       p->state = TASK_RUNNING;
+       p->state = TASK_NEW;
  
         /*
          * Make sure we do not leak PI boosting priority to the child.
@@ -2383,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
                 p->sched_class = &fair_sched_class;
         }
  
-       if (p->sched_class->task_fork)
-               p->sched_class->task_fork(p);
+       init_entity_runnable_average(&p->se);
  
         /*
          * The child is not yet in the pid-hash so no cgroup attach races,
@@ -2394,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
          * Silence PROVE_RCU.
          */
         raw_spin_lock_irqsave(&p->pi_lock, flags);
-       set_task_cpu(p, cpu);
+       /*
+        * We're setting the cpu for the first time, we don't migrate,
+        * so use __set_task_cpu().
+        */
+       __set_task_cpu(p, cpu);
+       if (p->sched_class->task_fork)
+               p->sched_class->task_fork(p);
         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
  
  #ifdef CONFIG_SCHED_INFO
@@ -2526,16 +2531,18 @@ void wake_up_new_task(struct task_struct *p)
         struct rq_flags rf;
         struct rq *rq;
  
-       /* Initialize new task's runnable average */
-       init_entity_runnable_average(&p->se);
         raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+       p->state = TASK_RUNNING;
  #ifdef CONFIG_SMP
         /*
          * Fork balancing, do it here and not earlier because:
          *  - cpus_allowed can change in the fork path
          *  - any previously selected cpu might disappear through hotplug
+        *
+        * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+        * as we're not fully set-up yet.
          */
-       set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+       __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
  #endif
         rq = __task_rq_lock(p, &rf);
         post_init_entity_util_avg(&p->se);
@@ -3161,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
                 pr_cont("\n");
         }
  #endif
+       if (panic_on_warn)
+               panic("scheduling while atomic\n");
+
         dump_stack();
         add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
  }
@@ -4752,7 +4762,8 @@ out_unlock:
   * @len: length in bytes of the bitmask pointed to by user_mask_ptr
   * @user_mask_ptr: user-space pointer to hold the current cpu mask
   *
- * Return: 0 on success. An error code otherwise.
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
   */
  SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
                 unsigned long __user *, user_mask_ptr)
@@ -7233,7 +7244,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
         struct rq *rq = cpu_rq(cpu);
  
         rq->calc_load_update = calc_load_update;
-       account_reset_rq(rq);
         update_max_interval();
  }
  
@@ -7713,6 +7723,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
         INIT_LIST_HEAD(&tg->children);
         list_add_rcu(&tg->siblings, &parent->children);
         spin_unlock_irqrestore(&task_group_lock, flags);
+
+       online_fair_sched_group(tg);
  }
  
  /* rcu callback to free various structures associated with a task group */
@@ -7741,27 +7753,9 @@ void sched_offline_group(struct task_group *tg)
         spin_unlock_irqrestore(&task_group_lock, flags);
  }
  
-/* change task's runqueue when it moves between groups.
- *     The caller of this function should have put the task in its new group
- *     by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- *     reflect its new group.
- */
-void sched_move_task(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk, int type)
  {
         struct task_group *tg;
-       int queued, running;
-       struct rq_flags rf;
-       struct rq *rq;
-
-       rq = task_rq_lock(tsk, &rf);
-
-       running = task_current(rq, tsk);
-       queued = task_on_rq_queued(tsk);
-
-       if (queued)
-               dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
-       if (unlikely(running))
-               put_prev_task(rq, tsk);
  
         /*
          * All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -7774,11 +7768,37 @@ void sched_move_task(struct task_struct *tsk)
         tsk->sched_task_group = tg;
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-       if (tsk->sched_class->task_move_group)
-               tsk->sched_class->task_move_group(tsk);
+       if (tsk->sched_class->task_change_group)
+               tsk->sched_class->task_change_group(tsk, type);
         else
  #endif
                 set_task_rq(tsk, task_cpu(tsk));
+}
+
+/*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+       int queued, running;
+       struct rq_flags rf;
+       struct rq *rq;
+
+       rq = task_rq_lock(tsk, &rf);
+
+       running = task_current(rq, tsk);
+       queued = task_on_rq_queued(tsk);
+
+       if (queued)
+               dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+       if (unlikely(running))
+               put_prev_task(rq, tsk);
+
+       sched_change_group(tsk, TASK_MOVE_GROUP);
  
         if (unlikely(running))
                 tsk->sched_class->set_curr_task(rq);
@@ -8206,15 +8226,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
         sched_free_group(tg);
  }
  
+/*
+ * This is called before wake_up_new_task(), therefore we really only
+ * have to set its group bits, all the other stuff does not apply.
+ */
  static void cpu_cgroup_fork(struct task_struct *task)
  {
-       sched_move_task(task);
+       struct rq_flags rf;
+       struct rq *rq;
+
+       rq = task_rq_lock(task, &rf);
+
+       sched_change_group(task, TASK_SET_GROUP);
+
+       task_rq_unlock(rq, task, &rf);
  }
  
  static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
  {
         struct task_struct *task;
         struct cgroup_subsys_state *css;
+       int ret = 0;
  
         cgroup_taskset_for_each(task, css, tset) {
  #ifdef CONFIG_RT_GROUP_SCHED
@@ -8225,8 +8257,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
                 if (task->sched_class != &fair_sched_class)
                         return -EINVAL;
  #endif
+               /*
+                * Serialize against wake_up_new_task() such that if it's
+                * running, we're sure to observe its full state.
+                */
+               raw_spin_lock_irq(&task->pi_lock);
+               /*
+                * Avoid calling sched_move_task() before wake_up_new_task()
+                * has happened. This would lead to problems with PELT, due to
+                * move wanting to detach+attach while we're not attached yet.
+                */
+               if (task->state == TASK_NEW)
+                       ret = -EINVAL;
+               raw_spin_unlock_irq(&task->pi_lock);
+
+               if (ret)
+                       break;
         }
-       return 0;
+       return ret;
  }
  
  static void cpu_cgroup_attach(struct cgroup_taskset *tset)