Merge branch 'sched/urgent' into sched/core, to pick up fixes
author	Ingo Molnar <mingo@kernel.org>
Mon, 27 Jun 2016 09:35:02 +0000 (11:35 +0200)
committer	Ingo Molnar <mingo@kernel.org>
Mon, 27 Jun 2016 09:35:02 +0000 (11:35 +0200)
Signed-off-by: Ingo Molnar <mingo@kernel.org>
include/linux/sched.h
kernel/sched/core.c
kernel/sched/fair.c
kernel/sched/sched.h

diff --combined include/linux/sched.h
@@@ -2139,9 -2139,6 +2139,9 @@@ static inline void put_task_struct(stru
                __put_task_struct(t);
  }
  
 +struct task_struct *task_rcu_dereference(struct task_struct **ptask);
 +struct task_struct *try_get_task_struct(struct task_struct **ptask);
 +
  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
  extern void task_cputime(struct task_struct *t,
                         cputime_t *utime, cputime_t *stime);
@@@ -3010,7 -3007,7 +3010,7 @@@ static inline int object_is_on_stack(vo
        return (obj >= stack) && (obj < (stack + THREAD_SIZE));
  }
  
- extern void thread_info_cache_init(void);
+ extern void thread_stack_cache_init(void);
  
  #ifdef CONFIG_DEBUG_STACK_USAGE
  static inline unsigned long stack_not_used(struct task_struct *p)
diff --combined kernel/sched/core.c
@@@ -1536,7 -1536,9 +1536,9 @@@ static int select_fallback_rq(int cpu, 
        for (;;) {
                /* Any allowed, online CPU? */
                for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
-                       if (!cpu_active(dest_cpu))
+                       if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
+                               continue;
+                       if (!cpu_online(dest_cpu))
                                continue;
                        goto out;
                }
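
For context: the two checks added above let a kernel thread fall back to a CPU that is online but not yet active during hotplug bring-up, while a user-space task still has to wait for an active CPU. A minimal user-space sketch of just that predicate, not part of the patch; struct cpu_state and fallback_cpu_allowed() are illustrative names, only the PF_KTHREAD flag value matches the kernel:

#include <stdbool.h>
#include <stdio.h>

#define PF_KTHREAD 0x00200000   /* kernel-thread flag, same value as the kernel's */

struct cpu_state {
        bool online;            /* CPU has been brought up */
        bool active;            /* CPU accepts regular scheduler placement */
};

/* Mirrors the two checks added to select_fallback_rq() above. */
static bool fallback_cpu_allowed(unsigned int task_flags,
                                 const struct cpu_state *cpu)
{
        if (!(task_flags & PF_KTHREAD) && !cpu->active)
                return false;   /* user tasks must wait for an active CPU */
        if (!cpu->online)
                return false;   /* nothing may run on an offline CPU */
        return true;
}

int main(void)
{
        struct cpu_state booting = { .online = true, .active = false };

        printf("kthread:   %d\n", fallback_cpu_allowed(PF_KTHREAD, &booting)); /* 1 */
        printf("user task: %d\n", fallback_cpu_allowed(0, &booting));          /* 0 */
        return 0;
}
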
@@@ -5147,14 -5149,16 +5149,16 @@@ void show_state_filter(unsigned long st
                /*
                 * reset the NMI-timeout, listing all files on a slow
                 * console might take a lot of time:
+                * Also, reset softlockup watchdogs on all CPUs, because
+                * another CPU might be blocked waiting for us to process
+                * an IPI.
                 */
                touch_nmi_watchdog();
+               touch_all_softlockup_watchdogs();
                if (!state_filter || (p->state & state_filter))
                        sched_show_task(p);
        }
  
-       touch_all_softlockup_watchdogs();
  #ifdef CONFIG_SCHED_DEBUG
        if (!state_filter)
                sysrq_sched_debug_show();
@@@ -7227,6 -7231,7 +7231,6 @@@ static void sched_rq_cpu_starting(unsig
        struct rq *rq = cpu_rq(cpu);
  
        rq->calc_load_update = calc_load_update;
 -      account_reset_rq(rq);
        update_max_interval();
  }
  
diff --combined kernel/sched/fair.c
@@@ -735,8 -735,6 +735,6 @@@ void post_init_entity_util_avg(struct s
        }
  }
  
- static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
- static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
  #else
  void init_entity_runnable_average(struct sched_entity *se)
  {
@@@ -1305,8 -1303,6 +1303,8 @@@ static void task_numa_assign(struct tas
  {
        if (env->best_task)
                put_task_struct(env->best_task);
 +      if (p)
 +              get_task_struct(p);
  
        env->best_task = p;
        env->best_imp = imp;
@@@ -1374,11 -1370,31 +1372,11 @@@ static void task_numa_compare(struct ta
        long imp = env->p->numa_group ? groupimp : taskimp;
        long moveimp = imp;
        int dist = env->dist;
 -      bool assigned = false;
  
        rcu_read_lock();
 -
 -      raw_spin_lock_irq(&dst_rq->lock);
 -      cur = dst_rq->curr;
 -      /*
 -       * No need to move the exiting task or idle task.
 -       */
 -      if ((cur->flags & PF_EXITING) || is_idle_task(cur))
 +      cur = task_rcu_dereference(&dst_rq->curr);
 +      if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
                cur = NULL;
 -      else {
 -              /*
 -               * The task_struct must be protected here to protect the
 -               * p->numa_faults access in the task_weight since the
 -               * numa_faults could already be freed in the following path:
 -               * finish_task_switch()
 -               *     --> put_task_struct()
 -               *         --> __put_task_struct()
 -               *             --> task_numa_free()
 -               */
 -              get_task_struct(cur);
 -      }
 -
 -      raw_spin_unlock_irq(&dst_rq->lock);
  
        /*
         * Because we have preemption enabled we can get migrated around and
@@@ -1461,6 -1477,7 +1459,6 @@@ balance
                 */
                if (!load_too_imbalanced(src_load, dst_load, env)) {
                        imp = moveimp - 1;
 -                      put_task_struct(cur);
                        cur = NULL;
                        goto assign;
                }
                env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
  
  assign:
 -      assigned = true;
        task_numa_assign(env, cur, imp);
  unlock:
        rcu_read_unlock();
 -      /*
 -       * The dst_rq->curr isn't assigned. The protection for task_struct is
 -       * finished.
 -       */
 -      if (cur && !assigned)
 -              put_task_struct(cur);
  }
  
  static void task_numa_find_cpu(struct task_numa_env *env,
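
The rewrite above drops the dst_rq->lock / get_task_struct() dance in favour of task_rcu_dereference() (declared in the include/linux/sched.h hunk), which yields an RCU-protected view of dst_rq->curr that is safe to inspect without pinning a reference. The RCU half does not fit in a few lines of user-space code, but the underlying "only take a reference if the object is still live" idea can be sketched with C11 atomics. struct obj and get_unless_zero() below are illustrative, in the spirit of the kernel's atomic_inc_not_zero(); they are not the implementation of task_rcu_dereference()/try_get_task_struct(), which additionally lean on RCU and the task exit path:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
        atomic_int refcount;    /* 0 means the object is being torn down */
};

/* Take a reference only if the count has not already dropped to zero. */
static bool get_unless_zero(struct obj *o)
{
        int c = atomic_load(&o->refcount);

        while (c != 0) {
                if (atomic_compare_exchange_weak(&o->refcount, &c, c + 1))
                        return true;    /* reference taken */
                /* c was reloaded by the failed CAS; retry */
        }
        return false;                   /* too late, object is dying */
}

int main(void)
{
        struct obj live = { 1 };
        struct obj dead = { 0 };

        printf("live: %d\n", get_unless_zero(&live));   /* 1 */
        printf("dead: %d\n", get_unless_zero(&dead));   /* 0 */
        return 0;
}
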
@@@ -2473,28 -2497,22 +2471,22 @@@ account_entity_dequeue(struct cfs_rq *c
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  # ifdef CONFIG_SMP
- static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
  {
-       long tg_weight;
+       long tg_weight, load, shares;
  
        /*
-        * Use this CPU's real-time load instead of the last load contribution
-        * as the updating of the contribution is delayed, and we will use the
-        * the real-time load to calc the share. See update_tg_load_avg().
+        * This really should be: cfs_rq->avg.load_avg, but instead we use
+        * cfs_rq->load.weight, which is its upper bound. This helps ramp up
+        * the shares for small weight interactive tasks.
         */
-       tg_weight = atomic_long_read(&tg->load_avg);
-       tg_weight -= cfs_rq->tg_load_avg_contrib;
-       tg_weight += cfs_rq->load.weight;
+       load = scale_load_down(cfs_rq->load.weight);
  
-       return tg_weight;
- }
- static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
- {
-       long tg_weight, load, shares;
+       tg_weight = atomic_long_read(&tg->load_avg);
  
-       tg_weight = calc_tg_weight(tg, cfs_rq);
-       load = cfs_rq->load.weight;
+       /* Ensure tg_weight >= load */
+       tg_weight -= cfs_rq->tg_load_avg_contrib;
+       tg_weight += load;
  
        shares = (tg->shares * load);
        if (tg_weight)
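
Put together, the reworked calc_cfs_shares() computes shares = tg->shares * load / tg_weight, where load is this cfs_rq's instantaneous weight and tg_weight is the group-wide load average with this cfs_rq's possibly stale contribution swapped for that same instantaneous weight (hence the "Ensure tg_weight >= load" comment). A user-space sketch with plain longs, not part of the patch: scale_load_down() is dropped, the MIN_SHARES/NICE_0_LOAD values are illustrative, and the final clamp to [MIN_SHARES, tg->shares] sits just past the end of the hunk shown:

#include <stdio.h>

#define NICE_0_LOAD 1024L       /* illustrative: weight of a nice-0 task */
#define MIN_SHARES  2L          /* illustrative lower bound */

static long calc_shares(long tg_shares, long tg_load_avg,
                        long cfs_rq_contrib, long cfs_rq_weight)
{
        long load = cfs_rq_weight;      /* upper bound of cfs_rq->avg.load_avg */
        long tg_weight = tg_load_avg;
        long shares;

        /* Ensure tg_weight >= load: replace our stale contribution with the
         * instantaneous weight we are about to divide by. */
        tg_weight -= cfs_rq_contrib;
        tg_weight += load;

        shares = tg_shares * load;
        if (tg_weight)
                shares /= tg_weight;

        if (shares < MIN_SHARES)
                shares = MIN_SHARES;
        if (shares > tg_shares)
                shares = tg_shares;
        return shares;
}

int main(void)
{
        /* A group with 1024 total shares whose weight on this CPU is half of
         * the group-wide load ends up with roughly half the shares. */
        printf("%ld\n", calc_shares(NICE_0_LOAD, 2048, 1024, 1024));    /* 512 */
        return 0;
}
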
@@@ -2513,6 -2531,7 +2505,7 @@@ static inline long calc_cfs_shares(stru
        return tg->shares;
  }
  # endif /* CONFIG_SMP */
  static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                            unsigned long weight)
  {
@@@ -2878,6 -2897,23 +2871,23 @@@ static inline void cfs_rq_util_change(s
        }
  }
  
+ /*
+  * Unsigned subtract and clamp on underflow.
+  *
+  * Explicitly do a load-store to ensure the intermediate value never hits
+  * memory. This allows lockless observations without ever seeing the negative
+  * values.
+  */
+ #define sub_positive(_ptr, _val) do {                         \
+       typeof(_ptr) ptr = (_ptr);                              \
+       typeof(*ptr) val = (_val);                              \
+       typeof(*ptr) res, var = READ_ONCE(*ptr);                \
+       res = var - val;                                        \
+       if (res > var)                                          \
+               res = 0;                                        \
+       WRITE_ONCE(*ptr, res);                                  \
+ } while (0)
+ 
  /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
  static inline int
  update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
  
        if (atomic_long_read(&cfs_rq->removed_load_avg)) {
                s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
-               sa->load_avg = max_t(long, sa->load_avg - r, 0);
-               sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+               sub_positive(&sa->load_avg, r);
+               sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
                removed_load = 1;
        }
  
        if (atomic_long_read(&cfs_rq->removed_util_avg)) {
                long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-               sa->util_avg = max_t(long, sa->util_avg - r, 0);
-               sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+               sub_positive(&sa->util_avg, r);
+               sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
                removed_util = 1;
        }
  
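
The sub_positive() macro introduced above replaces the max_t()-based clamping so that a lockless reader can never observe a wrapped-around (huge) intermediate value. A standalone GCC demo of just the clamping arithmetic, not part of the patch; READ_ONCE()/WRITE_ONCE() are replaced by plain loads and stores, so the store-tearing guarantee is not reproduced here, and typeof() is a GCC extension:

#include <stdio.h>

/* Same arithmetic as the kernel macro above. */
#define sub_positive(_ptr, _val) do {                           \
        typeof(_ptr) ptr = (_ptr);                              \
        typeof(*ptr) val = (_val);                              \
        typeof(*ptr) res, var = *ptr;   /* READ_ONCE() in the kernel  */ \
        res = var - val;                                        \
        if (res > var)                  /* unsigned wrap-around       */ \
                res = 0;                                        \
        *ptr = res;                     /* WRITE_ONCE() in the kernel */ \
} while (0)

int main(void)
{
        unsigned long load_avg = 100;

        sub_positive(&load_avg, 40);
        printf("%lu\n", load_avg);      /* 60 */
        sub_positive(&load_avg, 200);
        printf("%lu\n", load_avg);      /* 0, clamped instead of wrapping */
        return 0;
}

The key property is that the subtraction happens in a local variable and only the clamped result is ever stored back, so concurrent readers see either the old value or the clamped new one, never a wrapped intermediate.
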
@@@ -2968,10 -3004,10 +2978,10 @@@ static void detach_entity_load_avg(stru
                          &se->avg, se->on_rq * scale_load_down(se->load.weight),
                          cfs_rq->curr == se, NULL);
  
-       cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-       cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-       cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-       cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+       sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+       sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+       sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+       sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
  
        cfs_rq_util_change(cfs_rq);
  }
@@@ -3220,7 -3256,7 +3230,7 @@@ static inline void check_schedstat_requ
                        trace_sched_stat_iowait_enabled()  ||
                        trace_sched_stat_blocked_enabled() ||
                        trace_sched_stat_runtime_enabled())  {
-               pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+               printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
                             "stat_blocked and stat_runtime require the "
                             "kernel parameter schedstats=enabled or "
                             "kernel.sched_schedstats=1\n");
@@@ -3662,7 -3698,7 +3672,7 @@@ static inline struct cfs_bandwidth *tg_
  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
  {
        if (unlikely(cfs_rq->throttle_count))
 -              return cfs_rq->throttled_clock_task;
 +              return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
  
        return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
  }
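
The one-line fix above keeps the per-cfs_rq task clock continuous across a throttle: while throttled, the clock is frozen at the value it had at throttle time (the snapshot minus the throttled time already accumulated) instead of jumping forward by that accumulated time. A small numeric sketch, not part of the patch, with a mock struct and made-up values:

#include <stdio.h>

typedef unsigned long long u64;

struct cfs_rq_sketch {
        int throttle_count;
        u64 throttled_clock_task;       /* rq task clock when throttled */
        u64 throttled_clock_task_time;  /* total time spent throttled   */
};

static u64 cfs_rq_clock_task(u64 rq_task_clock, const struct cfs_rq_sketch *cfs_rq)
{
        if (cfs_rq->throttle_count)
                return cfs_rq->throttled_clock_task -
                       cfs_rq->throttled_clock_task_time;

        return rq_task_clock - cfs_rq->throttled_clock_task_time;
}

int main(void)
{
        /* Throttled at rq clock 1000 after already spending 300 throttled. */
        struct cfs_rq_sketch cfs_rq = {
                .throttle_count = 1,
                .throttled_clock_task = 1000,
                .throttled_clock_task_time = 300,
        };

        /* Just before the throttle the unthrottled formula gave 1000 - 300 =
         * 700.  The old code returned 1000 here (a forward jump of 300); the
         * fixed code returns 700, keeping the clock continuous. */
        printf("%llu\n", cfs_rq_clock_task(1500, &cfs_rq));
        return 0;
}
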
@@@ -3800,11 -3836,13 +3810,11 @@@ static int tg_unthrottle_up(struct task
        struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
  
        cfs_rq->throttle_count--;
 -#ifdef CONFIG_SMP
        if (!cfs_rq->throttle_count) {
                /* adjust cfs_rq_clock_task() */
                cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
                                             cfs_rq->throttled_clock_task;
        }
 -#endif
  
        return 0;
  }
@@@ -4157,6 -4195,26 +4167,26 @@@ static void check_enqueue_throttle(stru
        if (!cfs_bandwidth_used())
                return;
  
+       /* Synchronize hierarchical throttle counter: */
+       if (unlikely(!cfs_rq->throttle_uptodate)) {
+               struct rq *rq = rq_of(cfs_rq);
+               struct cfs_rq *pcfs_rq;
+               struct task_group *tg;
+
+               cfs_rq->throttle_uptodate = 1;
+
+               /* Get closest up-to-date node, because leaves go first: */
+               for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
+                       pcfs_rq = tg->cfs_rq[cpu_of(rq)];
+                       if (pcfs_rq->throttle_uptodate)
+                               break;
+               }
+               if (tg) {
+                       cfs_rq->throttle_count = pcfs_rq->throttle_count;
+                       cfs_rq->throttled_clock_task = rq_clock_task(rq);
+               }
+       }
+
        /* an active group must be handled by the update_curr()->put() path */
        if (!cfs_rq->runtime_enabled || cfs_rq->curr)
                return;
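
The block added above initializes a group's throttle state lazily on first enqueue: a cfs_rq that has never been enqueued inherits throttle_count from its closest ancestor that is already up to date (the new throttle_uptodate flag is added to struct cfs_rq in the kernel/sched/sched.h hunk below). A user-space sketch of the parent walk, not part of the patch; struct group and sync_throttle_count() are illustrative names, the view is single-CPU, and the throttled_clock_task reset is left out:

#include <stdio.h>

struct group {
        struct group *parent;
        int throttle_uptodate;
        int throttle_count;
};

static void sync_throttle_count(struct group *grp)
{
        struct group *anc;

        if (grp->throttle_uptodate)
                return;
        grp->throttle_uptodate = 1;

        /* Get the closest up-to-date ancestor, because leaves go first. */
        for (anc = grp->parent; anc; anc = anc->parent)
                if (anc->throttle_uptodate)
                        break;
        if (anc)
                grp->throttle_count = anc->throttle_count;
}

int main(void)
{
        struct group root   = { .throttle_uptodate = 1, .throttle_count = 1 };
        struct group parent = { .parent = &root };      /* not yet up to date */
        struct group leaf   = { .parent = &parent };

        sync_throttle_count(&leaf);
        printf("%d\n", leaf.throttle_count);    /* 1, inherited from root */
        return 0;
}
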
@@@ -4472,15 -4530,14 +4502,14 @@@ static void dequeue_task_fair(struct r
  
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight) {
+                       /* Avoid re-evaluating load for this entity: */
+                       se = parent_entity(se);
                        /*
                         * Bias pick_next to pick a task from this cfs_rq, as
                         * p is sleeping when it is within its sched_slice.
                         */
-                       if (task_sleep && parent_entity(se))
-                               set_next_buddy(parent_entity(se));
-                       /* avoid re-evaluating load for this entity */
-                       se = parent_entity(se);
+                       if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+                               set_next_buddy(se);
                        break;
                }
                flags |= DEQUEUE_SLEEP;
@@@ -4882,19 -4939,24 +4911,24 @@@ static long effective_load(struct task_
                return wl;
  
        for_each_sched_entity(se) {
-               long w, W;
+               struct cfs_rq *cfs_rq = se->my_q;
+               long W, w = cfs_rq_load_avg(cfs_rq);
  
-               tg = se->my_q->tg;
+               tg = cfs_rq->tg;
  
                /*
                 * W = @wg + \Sum rw_j
                 */
-               W = wg + calc_tg_weight(tg, se->my_q);
+               W = wg + atomic_long_read(&tg->load_avg);
+               /* Ensure \Sum rw_j >= rw_i */
+               W -= cfs_rq->tg_load_avg_contrib;
+               W += w;
  
                /*
                 * w = rw_i + @wl
                 */
-               w = cfs_rq_load_avg(se->my_q) + wl;
+               w += wl;
  
                /*
                 * wl = S * s'_i; see (2)
diff --combined kernel/sched/sched.h
@@@ -437,7 -437,7 +437,7 @@@ struct cfs_rq 
  
        u64 throttled_clock, throttled_clock_task;
        u64 throttled_clock_task_time;
-       int throttled, throttle_count;
+       int throttled, throttle_count, throttle_uptodate;
        struct list_head throttled_list;
  #endif /* CONFIG_CFS_BANDWIDTH */
  #endif /* CONFIG_FAIR_GROUP_SCHED */
@@@ -1809,3 -1809,16 +1809,3 @@@ static inline void cpufreq_trigger_upda
  #else /* arch_scale_freq_capacity */
  #define arch_scale_freq_invariant()   (false)
  #endif
 -
 -static inline void account_reset_rq(struct rq *rq)
 -{
 -#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 -      rq->prev_irq_time = 0;
 -#endif
 -#ifdef CONFIG_PARAVIRT
 -      rq->prev_steal_time = 0;
 -#endif
 -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 -      rq->prev_steal_time_rq = 0;
 -#endif
 -}