Merge branch 'sched/urgent' into sched/core, to pick up fixes

author Ingo Molnar <mingo@kernel.org>

Mon, 27 Jun 2016 09:35:02 +0000 (11:35 +0200)

committer Ingo Molnar <mingo@kernel.org>

Mon, 27 Jun 2016 09:35:02 +0000 (11:35 +0200)
author Ingo Molnar <mingo@kernel.org>
Mon, 27 Jun 2016 09:35:02 +0000 (11:35 +0200)
committer Ingo Molnar <mingo@kernel.org>
Mon, 27 Jun 2016 09:35:02 +0000 (11:35 +0200)
diff --combined include/linux/sched.h

index dee41bf,253538f..b45acfd
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -2139,9 -2139,6 +2139,9 @@@ static inline void put_task_struct(stru
                 __put_task_struct(t);
   }
   
+ +struct task_struct *task_rcu_dereference(struct task_struct **ptask);
+ +struct task_struct *try_get_task_struct(struct task_struct **ptask);
+ +
   #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
   extern void task_cputime(struct task_struct *t,
                          cputime_t *utime, cputime_t *stime);
@@@ -3010,7 -3007,7 +3010,7 @@@ static inline int object_is_on_stack(vo
         return (obj >= stack) && (obj < (stack + THREAD_SIZE));
   }
   
- extern void thread_info_cache_init(void);
+ extern void thread_stack_cache_init(void);
   
   #ifdef CONFIG_DEBUG_STACK_USAGE
   static inline unsigned long stack_not_used(struct task_struct *p)
diff --combined kernel/sched/core.c

index c1b537b,51d7105..e406ba0
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -1536,7 -1536,9 +1536,9 @@@ static int select_fallback_rq(int cpu, 
         for (;;) {
                 /* Any allowed, online CPU? */
                 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
-                       if (!cpu_active(dest_cpu))
+                       if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
+                               continue;
+                       if (!cpu_online(dest_cpu))
                                 continue;
                         goto out;
                 }
@@@ -5147,14 -5149,16 +5149,16 @@@ void show_state_filter(unsigned long st
                 /*
                  * reset the NMI-timeout, listing all files on a slow
                  * console might take a lot of time:
+                * Also, reset softlockup watchdogs on all CPUs, because
+                * another CPU might be blocked waiting for us to process
+                * an IPI.
                  */
                 touch_nmi_watchdog();
+               touch_all_softlockup_watchdogs();
                 if (!state_filter || (p->state & state_filter))
                         sched_show_task(p);
         }
   
-       touch_all_softlockup_watchdogs();
- 
   #ifdef CONFIG_SCHED_DEBUG
         if (!state_filter)
                 sysrq_sched_debug_show();
@@@ -7227,6 -7231,7 +7231,6 @@@ static void sched_rq_cpu_starting(unsig
         struct rq *rq = cpu_rq(cpu);
   
         rq->calc_load_update = calc_load_update;
- -      account_reset_rq(rq);
         update_max_interval();
   }
   
diff --combined kernel/sched/fair.c

index 40d5ace,c8c5d2d..7306356
--- 1/kernel/sched/fair.c
--- 2/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@@ -735,8 -735,6 +735,6 @@@ void post_init_entity_util_avg(struct s
         }
   }
   
- static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
- static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
   #else
   void init_entity_runnable_average(struct sched_entity *se)
   {
@@@ -1305,8 -1303,6 +1303,8 @@@ static void task_numa_assign(struct tas
   {
         if (env->best_task)
                 put_task_struct(env->best_task);
+ +      if (p)
+ +              get_task_struct(p);
   
         env->best_task = p;
         env->best_imp = imp;
@@@ -1374,11 -1370,31 +1372,11 @@@ static void task_numa_compare(struct ta
         long imp = env->p->numa_group ? groupimp : taskimp;
         long moveimp = imp;
         int dist = env->dist;
- -      bool assigned = false;
   
         rcu_read_lock();
- -
- -      raw_spin_lock_irq(&dst_rq->lock);
- -      cur = dst_rq->curr;
- -      /*
- -       * No need to move the exiting task or idle task.
- -       */
- -      if ((cur->flags & PF_EXITING) || is_idle_task(cur))
+ +      cur = task_rcu_dereference(&dst_rq->curr);
+ +      if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
                 cur = NULL;
- -      else {
- -              /*
- -               * The task_struct must be protected here to protect the
- -               * p->numa_faults access in the task_weight since the
- -               * numa_faults could already be freed in the following path:
- -               * finish_task_switch()
- -               *     --> put_task_struct()
- -               *         --> __put_task_struct()
- -               *             --> task_numa_free()
- -               */
- -              get_task_struct(cur);
- -      }
- -
- -      raw_spin_unlock_irq(&dst_rq->lock);
   
         /*
          * Because we have preemption enabled we can get migrated around and
@@@ -1461,6 -1477,7 +1459,6 @@@ balance
                  */
                 if (!load_too_imbalanced(src_load, dst_load, env)) {
                         imp = moveimp - 1;
- -                      put_task_struct(cur);
                         cur = NULL;
                         goto assign;
                 }
@@@ -1486,9 -1503,16 +1484,9 @@@
                 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
   
   assign:
- -      assigned = true;
         task_numa_assign(env, cur, imp);
   unlock:
         rcu_read_unlock();
- -      /*
- -       * The dst_rq->curr isn't assigned. The protection for task_struct is
- -       * finished.
- -       */
- -      if (cur && !assigned)
- -              put_task_struct(cur);
   }
   
   static void task_numa_find_cpu(struct task_numa_env *env,
@@@ -2473,28 -2497,22 +2471,22 @@@ account_entity_dequeue(struct cfs_rq *c
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
   # ifdef CONFIG_SMP
- static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
   {
-       long tg_weight;
+       long tg_weight, load, shares;
   
         /*
-        * Use this CPU's real-time load instead of the last load contribution
-        * as the updating of the contribution is delayed, and we will use the
-        * the real-time load to calc the share. See update_tg_load_avg().
+        * This really should be: cfs_rq->avg.load_avg, but instead we use
+        * cfs_rq->load.weight, which is its upper bound. This helps ramp up
+        * the shares for small weight interactive tasks.
          */
-       tg_weight = atomic_long_read(&tg->load_avg);
-       tg_weight -= cfs_rq->tg_load_avg_contrib;
-       tg_weight += cfs_rq->load.weight;
+       load = scale_load_down(cfs_rq->load.weight);
   
-       return tg_weight;
- }
- 
- static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
- {
-       long tg_weight, load, shares;
+       tg_weight = atomic_long_read(&tg->load_avg);
   
-       tg_weight = calc_tg_weight(tg, cfs_rq);
-       load = cfs_rq->load.weight;
+       /* Ensure tg_weight >= load */
+       tg_weight -= cfs_rq->tg_load_avg_contrib;
+       tg_weight += load;
   
         shares = (tg->shares * load);
         if (tg_weight)
@@@ -2513,6 -2531,7 +2505,7 @@@ static inline long calc_cfs_shares(stru
         return tg->shares;
   }
   # endif /* CONFIG_SMP */
+ 
   static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                             unsigned long weight)
   {
@@@ -2878,6 -2897,23 +2871,23 @@@ static inline void cfs_rq_util_change(s
         }
   }
   
+ /*
+  * Unsigned subtract and clamp on underflow.
+  *
+  * Explicitly do a load-store to ensure the intermediate value never hits
+  * memory. This allows lockless observations without ever seeing the negative
+  * values.
+  */
+ #define sub_positive(_ptr, _val) do {                         \
+       typeof(_ptr) ptr = (_ptr);                              \
+       typeof(*ptr) val = (_val);                              \
+       typeof(*ptr) res, var = READ_ONCE(*ptr);                \
+       res = var - val;                                        \
+       if (res > var)                                          \
+               res = 0;                                        \
+       WRITE_ONCE(*ptr, res);                                  \
+ } while (0)
+ 
   /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
   static inline int
   update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@@ -2887,15 -2923,15 +2897,15 @@@
   
         if (atomic_long_read(&cfs_rq->removed_load_avg)) {
                 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
-               sa->load_avg = max_t(long, sa->load_avg - r, 0);
-               sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+               sub_positive(&sa->load_avg, r);
+               sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
                 removed_load = 1;
         }
   
         if (atomic_long_read(&cfs_rq->removed_util_avg)) {
                 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-               sa->util_avg = max_t(long, sa->util_avg - r, 0);
-               sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+               sub_positive(&sa->util_avg, r);
+               sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
                 removed_util = 1;
         }
   
@@@ -2968,10 -3004,10 +2978,10 @@@ static void detach_entity_load_avg(stru
                           &se->avg, se->on_rq * scale_load_down(se->load.weight),
                           cfs_rq->curr == se, NULL);
   
-       cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-       cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-       cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-       cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+       sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+       sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+       sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+       sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
   
         cfs_rq_util_change(cfs_rq);
   }
@@@ -3220,7 -3256,7 +3230,7 @@@ static inline void check_schedstat_requ
                         trace_sched_stat_iowait_enabled()  ||
                         trace_sched_stat_blocked_enabled() ||
                         trace_sched_stat_runtime_enabled())  {
-               pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+               printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
                              "stat_blocked and stat_runtime require the "
                              "kernel parameter schedstats=enabled or "
                              "kernel.sched_schedstats=1\n");
@@@ -3662,7 -3698,7 +3672,7 @@@ static inline struct cfs_bandwidth *tg_
   static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
   {
         if (unlikely(cfs_rq->throttle_count))
- -              return cfs_rq->throttled_clock_task;
+ +              return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
   
         return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
   }
@@@ -3800,11 -3836,13 +3810,11 @@@ static int tg_unthrottle_up(struct task
         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
   
         cfs_rq->throttle_count--;
- -#ifdef CONFIG_SMP
         if (!cfs_rq->throttle_count) {
                 /* adjust cfs_rq_clock_task() */
                 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
                                              cfs_rq->throttled_clock_task;
         }
- -#endif
   
         return 0;
   }
@@@ -4157,6 -4195,26 +4167,26 @@@ static void check_enqueue_throttle(stru
         if (!cfs_bandwidth_used())
                 return;
   
+       /* Synchronize hierarchical throttle counter: */
+       if (unlikely(!cfs_rq->throttle_uptodate)) {
+               struct rq *rq = rq_of(cfs_rq);
+               struct cfs_rq *pcfs_rq;
+               struct task_group *tg;
+ 
+               cfs_rq->throttle_uptodate = 1;
+ 
+               /* Get closest up-to-date node, because leaves go first: */
+               for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
+                       pcfs_rq = tg->cfs_rq[cpu_of(rq)];
+                       if (pcfs_rq->throttle_uptodate)
+                               break;
+               }
+               if (tg) {
+                       cfs_rq->throttle_count = pcfs_rq->throttle_count;
+                       cfs_rq->throttled_clock_task = rq_clock_task(rq);
+               }
+       }
+ 
         /* an active group must be handled by the update_curr()->put() path */
         if (!cfs_rq->runtime_enabled || cfs_rq->curr)
                 return;
@@@ -4472,15 -4530,14 +4502,14 @@@ static void dequeue_task_fair(struct r
   
                 /* Don't dequeue parent if it has other entities besides us */
                 if (cfs_rq->load.weight) {
+                       /* Avoid re-evaluating load for this entity: */
+                       se = parent_entity(se);
                         /*
                          * Bias pick_next to pick a task from this cfs_rq, as
                          * p is sleeping when it is within its sched_slice.
                          */
-                       if (task_sleep && parent_entity(se))
-                               set_next_buddy(parent_entity(se));
- 
-                       /* avoid re-evaluating load for this entity */
-                       se = parent_entity(se);
+                       if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+                               set_next_buddy(se);
                         break;
                 }
                 flags |= DEQUEUE_SLEEP;
@@@ -4882,19 -4939,24 +4911,24 @@@ static long effective_load(struct task_
                 return wl;
   
         for_each_sched_entity(se) {
-               long w, W;
+               struct cfs_rq *cfs_rq = se->my_q;
+               long W, w = cfs_rq_load_avg(cfs_rq);
   
-               tg = se->my_q->tg;
+               tg = cfs_rq->tg;
   
                 /*
                  * W = @wg + \Sum rw_j
                  */
-               W = wg + calc_tg_weight(tg, se->my_q);
+               W = wg + atomic_long_read(&tg->load_avg);
+ 
+               /* Ensure \Sum rw_j >= rw_i */
+               W -= cfs_rq->tg_load_avg_contrib;
+               W += w;
   
                 /*
                  * w = rw_i + @wl
                  */
-               w = cfs_rq_load_avg(se->my_q) + wl;
+               w += wl;
   
                 /*
                  * wl = S * s'_i; see (2)
diff --combined kernel/sched/sched.h

index de607e4,7cbeb92..71ce986
--- 1/kernel/sched/sched.h
--- 2/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@@ -437,7 -437,7 +437,7 @@@ struct cfs_rq 
   
         u64 throttled_clock, throttled_clock_task;
         u64 throttled_clock_task_time;
-       int throttled, throttle_count;
+       int throttled, throttle_count, throttle_uptodate;
         struct list_head throttled_list;
   #endif /* CONFIG_CFS_BANDWIDTH */
   #endif /* CONFIG_FAIR_GROUP_SCHED */
@@@ -1809,3 -1809,16 +1809,3 @@@ static inline void cpufreq_trigger_upda
   #else /* arch_scale_freq_capacity */
   #define arch_scale_freq_invariant()   (false)
   #endif
- -
- -static inline void account_reset_rq(struct rq *rq)
- -{
- -#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- -      rq->prev_irq_time = 0;
- -#endif
- -#ifdef CONFIG_PARAVIRT
- -      rq->prev_steal_time = 0;
- -#endif
- -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- -      rq->prev_steal_time_rq = 0;
- -#endif
- -}
author	Ingo Molnar <mingo@kernel.org>
	Mon, 27 Jun 2016 09:35:02 +0000 (11:35 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Mon, 27 Jun 2016 09:35:02 +0000 (11:35 +0200)
		1	2
include/linux/sched.h	patch |	diff1 |	diff2 |	blob | history
kernel/sched/core.c	patch |	diff1 |	diff2 |	blob | history
kernel/sched/fair.c	patch |	diff1 |	diff2 |	blob | history
kernel/sched/sched.h	patch |	diff1 |	diff2 |	blob | history