Merge branch 'for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

[cascardo/linux.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index b7d2271..44253ad 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -731,7 +731,7 @@ bool sched_can_stop_tick(void)
         if (current->policy == SCHED_RR) {
                 struct sched_rt_entity *rt_se = &current->rt;
  
-               return rt_se->run_list.prev == rt_se->run_list.next;
+               return list_is_singular(&rt_se->run_list);
         }
  
         /*
@@ -823,8 +823,8 @@ static void set_load_weight(struct task_struct *p)
                 return;
         }
  
-       load->weight = scale_load(prio_to_weight[prio]);
-       load->inv_weight = prio_to_wmult[prio];
+       load->weight = scale_load(sched_prio_to_weight[prio]);
+       load->inv_weight = sched_prio_to_wmult[prio];
  }
  
  static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1071,8 +1071,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
  {
         lockdep_assert_held(&rq->lock);
  
-       dequeue_task(rq, p, 0);
         p->on_rq = TASK_ON_RQ_MIGRATING;
+       dequeue_task(rq, p, 0);
         set_task_cpu(p, new_cpu);
         raw_spin_unlock(&rq->lock);
  
@@ -1080,8 +1080,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
  
         raw_spin_lock(&rq->lock);
         BUG_ON(task_cpu(p) != new_cpu);
-       p->on_rq = TASK_ON_RQ_QUEUED;
         enqueue_task(rq, p, 0);
+       p->on_rq = TASK_ON_RQ_QUEUED;
         check_preempt_curr(rq, p, 0);
  
         return rq;
@@ -1274,6 +1274,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
         WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
                         !p->on_rq);
  
+       /*
+        * A migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
+        * because schedstat_wait_{start,end} rely on p->on_rq to rebase the
+        * migrating task's wait_start time.
+        */
+       WARN_ON_ONCE(p->state == TASK_RUNNING &&
+                    p->sched_class == &fair_sched_class &&
+                    (p->on_rq && !task_on_rq_migrating(p)));
+
  #ifdef CONFIG_LOCKDEP
         /*
          * The caller should hold either p->pi_lock or rq->lock, when changing
@@ -1310,9 +1319,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
                 src_rq = task_rq(p);
                 dst_rq = cpu_rq(cpu);
  
+               p->on_rq = TASK_ON_RQ_MIGRATING;
                 deactivate_task(src_rq, p, 0);
                 set_task_cpu(p, cpu);
                 activate_task(dst_rq, p, 0);
+               p->on_rq = TASK_ON_RQ_QUEUED;
                 check_preempt_curr(dst_rq, p, 0);
         } else {
                 /*
@@ -1905,6 +1916,97 @@ static void ttwu_queue(struct task_struct *p, int cpu)
         raw_spin_unlock(&rq->lock);
  }
  
+/*
+ * Notes on Program-Order guarantees on SMP systems.
+ *
+ *  MIGRATION
+ *
+ * The basic program-order guarantee on SMP systems is that when a task [t]
+ * migrates, all its activity on its old cpu [c0] happens-before any subsequent
+ * execution on its new cpu [c1].
+ *
+ * For migration (of runnable tasks) this is provided by the following means:
+ *
+ *  A) UNLOCK of the rq(c0)->lock scheduling out task t
+ *  B) migration for t is required to synchronize *both* rq(c0)->lock and
+ *     rq(c1)->lock (if not at the same time, then in that order).
+ *  C) LOCK of the rq(c1)->lock scheduling in task t
+ *
+ * Transitivity guarantees that B happens after A and C after B.
+ * Note: we only require RCpc transitivity.
+ * Note: the cpu doing B need not be c0 or c1
+ *
+ * Example:
+ *
+ *   CPU0            CPU1            CPU2
+ *
+ *   LOCK rq(0)->lock
+ *   sched-out X
+ *   sched-in Y
+ *   UNLOCK rq(0)->lock
+ *
+ *                                   LOCK rq(0)->lock // orders against CPU0
+ *                                   dequeue X
+ *                                   UNLOCK rq(0)->lock
+ *
+ *                                   LOCK rq(1)->lock
+ *                                   enqueue X
+ *                                   UNLOCK rq(1)->lock
+ *
+ *                   LOCK rq(1)->lock // orders against CPU2
+ *                   sched-out Z
+ *                   sched-in X
+ *                   UNLOCK rq(1)->lock
+ *
+ *
+ *  BLOCKING -- aka. SLEEP + WAKEUP
+ *
+ * For blocking we (obviously) need to provide the same guarantee as for
+ * migration. However the means are completely different as there is no lock
+ * chain to provide order. Instead we do:
+ *
+ *   1) smp_store_release(X->on_cpu, 0)
+ *   2) smp_cond_acquire(!X->on_cpu)
+ *
+ * Example:
+ *
+ *   CPU0 (schedule)  CPU1 (try_to_wake_up) CPU2 (schedule)
+ *
+ *   LOCK rq(0)->lock LOCK X->pi_lock
+ *   dequeue X
+ *   sched-out X
+ *   smp_store_release(X->on_cpu, 0);
+ *
+ *                    smp_cond_acquire(!X->on_cpu);
+ *                    X->state = WAKING
+ *                    set_task_cpu(X,2)
+ *
+ *                    LOCK rq(2)->lock
+ *                    enqueue X
+ *                    X->state = RUNNING
+ *                    UNLOCK rq(2)->lock
+ *
+ *                                          LOCK rq(2)->lock // orders against CPU1
+ *                                          sched-out Z
+ *                                          sched-in X
+ *                                          UNLOCK rq(2)->lock
+ *
+ *                    UNLOCK X->pi_lock
+ *   UNLOCK rq(0)->lock
+ *
+ *
+ * However; for wakeups there is a second guarantee we must provide, namely we
+ * must observe the state that led to our wakeup. That is, not only must our
+ * task observe its own prior state, it must also observe the stores prior to
+ * its wakeup.
+ *
+ * This means that any means of doing remote wakeups must order the CPU doing
+ * the wakeup against the CPU the task is going to end up running on. This,
+ * however, is already required for the regular Program-Order guarantee above,
+ * since the waking CPU is the one issuing the ACQUIRE (smp_cond_acquire).
+ *
+ */
+
  /**
   * try_to_wake_up - wake up a thread
   * @p: the thread to be awakened
@@ -1947,15 +2049,34 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
  
  #ifdef CONFIG_SMP
         /*
-        * If the owning (remote) cpu is still in the middle of schedule() with
-        * this task as prev, wait until its done referencing the task.
+        * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
+        * possible to, falsely, observe p->on_cpu == 0.
+        *
+        * One must be running (->on_cpu == 1) in order to remove oneself
+        * from the runqueue.
+        *
+        *  [S] ->on_cpu = 1;   [L] ->on_rq
+        *      UNLOCK rq->lock
+        *                      RMB
+        *      LOCK   rq->lock
+        *  [S] ->on_rq = 0;    [L] ->on_cpu
+        *
+        * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
+        * from the consecutive calls to schedule(); the first switching to our
+        * task, the second putting it to sleep.
          */
-       while (p->on_cpu)
-               cpu_relax();
+       smp_rmb();
+
         /*
-        * Pairs with the smp_wmb() in finish_lock_switch().
+        * If the owning (remote) cpu is still in the middle of schedule() with
+        * this task as prev, wait until it's done referencing the task.
+        *
+        * Pairs with the smp_store_release() in finish_lock_switch().
+        *
+        * This ensures that tasks getting woken will be fully ordered against
+        * their previous state and preserve Program Order.
          */
-       smp_rmb();
+       smp_cond_acquire(!p->on_cpu);
  
         p->sched_contributes_to_load = !!task_contributes_to_load(p);
         p->state = TASK_WAKING;
@@ -2039,7 +2160,6 @@ out:
   */
  int wake_up_process(struct task_struct *p)
  {
-       WARN_ON(task_is_stopped_or_traced(p));
         return try_to_wake_up(p, TASK_NORMAL, 0);
  }
  EXPORT_SYMBOL(wake_up_process);
@@ -2085,6 +2205,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
         p->se.vruntime                  = 0;
         INIT_LIST_HEAD(&p->se.group_node);
  
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       p->se.cfs_rq                    = NULL;
+#endif
+
  #ifdef CONFIG_SCHEDSTATS
         memset(&p->se.statistics, 0, sizeof(p->se.statistics));
  #endif
@@ -3085,7 +3209,6 @@ static void __sched notrace __schedule(bool preempt)
  
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
-       rcu_note_context_switch();
         prev = rq->curr;
  
         /*
@@ -3104,13 +3227,16 @@ static void __sched notrace __schedule(bool preempt)
         if (sched_feat(HRTICK))
                 hrtick_clear(rq);
  
+       local_irq_disable();
+       rcu_note_context_switch();
+
         /*
          * Make sure that signal_pending_state()->signal_pending() below
          * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
          * done by the caller to avoid the race with signal_wake_up().
          */
         smp_mb__before_spinlock();
-       raw_spin_lock_irq(&rq->lock);
+       raw_spin_lock(&rq->lock);
         lockdep_pin_lock(&rq->lock);
  
         rq->clock_skip_update <<= 1; /* promote REQ to ACT */
@@ -5847,13 +5973,13 @@ static int init_rootdomain(struct root_domain *rd)
  {
         memset(rd, 0, sizeof(*rd));
  
-       if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+       if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
                 goto out;
-       if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+       if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
                 goto free_span;
-       if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
+       if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
                 goto free_online;
-       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+       if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
                 goto free_dlo_mask;
  
         init_dl_bw(&rd->dl_bw);
@@ -7331,6 +7457,9 @@ int in_sched_functions(unsigned long addr)
   */
  struct task_group root_task_group;
  LIST_HEAD(task_groups);
+
+/* Cacheline aligned slab cache for task_group */
+static struct kmem_cache *task_group_cache __read_mostly;
  #endif
  
  DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
@@ -7388,11 +7517,12 @@ void __init sched_init(void)
  #endif /* CONFIG_RT_GROUP_SCHED */
  
  #ifdef CONFIG_CGROUP_SCHED
+       task_group_cache = KMEM_CACHE(task_group, 0);
+
         list_add(&root_task_group.list, &task_groups);
         INIT_LIST_HEAD(&root_task_group.children);
         INIT_LIST_HEAD(&root_task_group.siblings);
         autogroup_init(&init_task);
-
  #endif /* CONFIG_CGROUP_SCHED */
  
         for_each_possible_cpu(i) {
@@ -7673,7 +7803,7 @@ static void free_sched_group(struct task_group *tg)
         free_fair_sched_group(tg);
         free_rt_sched_group(tg);
         autogroup_free(tg);
-       kfree(tg);
+       kmem_cache_free(task_group_cache, tg);
  }
  
  /* allocate runqueue etc for a new task group */
@@ -7681,7 +7811,7 @@ struct task_group *sched_create_group(struct task_group *parent)
  {
         struct task_group *tg;
  
-       tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+       tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
         if (!tg)
                 return ERR_PTR(-ENOMEM);
  
@@ -8586,3 +8716,44 @@ void dump_cpu_task(int cpu)
         pr_info("Task dump for CPU %d:\n", cpu);
         sched_show_task(cpu_curr(cpu));
  }
+
+/*
+ * Nice levels are multiplicative, with a gentle 10% change for every
+ * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
+ * nice 1, it will get ~10% less CPU time than another CPU-bound task
+ * that remained on nice 0.
+ *
+ * The "10% effect" is relative and cumulative: from _any_ nice level,
+ * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
+ * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
+ * If a task goes up by ~10% and another task goes down by ~10% then
+ * the relative distance between them is ~25%.)
+ */
+const int sched_prio_to_weight[40] = {
+ /* -20 */     88761,     71755,     56483,     46273,     36291,
+ /* -15 */     29154,     23254,     18705,     14949,     11916,
+ /* -10 */      9548,      7620,      6100,      4904,      3906,
+ /*  -5 */      3121,      2501,      1991,      1586,      1277,
+ /*   0 */      1024,       820,       655,       526,       423,
+ /*   5 */       335,       272,       215,       172,       137,
+ /*  10 */       110,        87,        70,        56,        45,
+ /*  15 */        36,        29,        23,        18,        15,
+};
+
+/*
+ * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
+ *
+ * In cases where the weight does not change often, we can use the
+ * precalculated inverse to speed up arithmetic by turning divisions
+ * into multiplications:
+ */
+const u32 sched_prio_to_wmult[40] = {
+ /* -20 */     48388,     59856,     76040,     92818,    118348,
+ /* -15 */    147320,    184698,    229616,    287308,    360437,
+ /* -10 */    449829,    563644,    704093,    875809,   1099582,
+ /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
+ /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
+ /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
+ /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
+ /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+};