Merge branch 'sched/urgent' into sched/core, to pick up fixes

author Ingo Molnar <mingo@kernel.org>

Mon, 5 Sep 2016 11:24:11 +0000 (13:24 +0200)

committer Ingo Molnar <mingo@kernel.org>

Mon, 5 Sep 2016 11:24:11 +0000 (13:24 +0200)
author Ingo Molnar <mingo@kernel.org>
Mon, 5 Sep 2016 11:24:11 +0000 (13:24 +0200)
committer Ingo Molnar <mingo@kernel.org>
Mon, 5 Sep 2016 11:24:11 +0000 (13:24 +0200)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 62c68e5..d750240 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1022,7 +1022,8 @@ extern void wake_up_q(struct wake_q_head *head);
  #define SD_BALANCE_FORK                0x0008  /* Balance on fork, clone */
  #define SD_BALANCE_WAKE                0x0010  /* Balance on wakeup */
  #define SD_WAKE_AFFINE         0x0020  /* Wake task to waking CPU */
-#define SD_SHARE_CPUCAPACITY   0x0080  /* Domain members share cpu power */
+#define SD_ASYM_CPUCAPACITY    0x0040  /* Groups have different max cpu capacities */
+#define SD_SHARE_CPUCAPACITY   0x0080  /* Domain members share cpu capacity */
  #define SD_SHARE_POWERDOMAIN   0x0100  /* Domain members share power domain */
  #define SD_SHARE_PKG_RESOURCES 0x0200  /* Domain members share cpu pkg resources */
  #define SD_SERIALIZE           0x0400  /* Only a single load balancing instance */
@@ -3236,6 +3237,15 @@ static inline void cond_resched_rcu(void)
  #endif
  }
  
+static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+       return p->preempt_disable_ip;
+#else
+       return 0;
+#endif
+}
+
  /*
   * Does a critical section need to be broken due to another
   * task waiting?: (technically does not depend on CONFIG_PREEMPT,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 44817c6..7d602f5 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1265,7 +1265,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
                 /*
                  * Task isn't running anymore; make it appear like we migrated
                  * it before it went to sleep. This means on wakeup we make the
-                * previous cpu our targer instead of where it really is.
+                * previous cpu our target instead of where it really is.
                  */
                 p->wake_cpu = cpu;
         }
@@ -2095,6 +2095,7 @@ out:
  /**
   * try_to_wake_up_local - try to wake up a local task with rq lock held
   * @p: the thread to be awakened
+ * @cookie: context's cookie for pinning
   *
   * Put @p on the run-queue if it's not already there. The caller must
   * ensure that this_rq() is locked, @p is bound to this_rq() and not
@@ -3192,6 +3193,9 @@ static inline void preempt_latency_stop(int val) { }
   */
  static noinline void __schedule_bug(struct task_struct *prev)
  {
+       /* Save this before calling printk(), since that will clobber it */
+       unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
+
         if (oops_in_progress)
                 return;
  
@@ -3202,13 +3206,12 @@ static noinline void __schedule_bug(struct task_struct *prev)
         print_modules();
         if (irqs_disabled())
                 print_irqtrace_events(prev);
-#ifdef CONFIG_DEBUG_PREEMPT
-       if (in_atomic_preempt_off()) {
+       if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+           && in_atomic_preempt_off()) {
                 pr_err("Preemption disabled at:");
-               print_ip_sym(current->preempt_disable_ip);
+               print_ip_sym(preempt_disable_ip);
                 pr_cont("\n");
         }
-#endif
         if (panic_on_warn)
                 panic("scheduling while atomic\n");
  
@@ -5735,6 +5738,7 @@ static int sd_degenerate(struct sched_domain *sd)
                          SD_BALANCE_FORK |
                          SD_BALANCE_EXEC |
                          SD_SHARE_CPUCAPACITY |
+                        SD_ASYM_CPUCAPACITY |
                          SD_SHARE_PKG_RESOURCES |
                          SD_SHARE_POWERDOMAIN)) {
                 if (sd->groups != sd->groups->next)
@@ -5765,6 +5769,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                                 SD_BALANCE_NEWIDLE |
                                 SD_BALANCE_FORK |
                                 SD_BALANCE_EXEC |
+                               SD_ASYM_CPUCAPACITY |
                                 SD_SHARE_CPUCAPACITY |
                                 SD_SHARE_PKG_RESOURCES |
                                 SD_PREFER_SIBLING |
@@ -6374,23 +6379,32 @@ static int sched_domains_curr_level;
  /*
   * SD_flags allowed in topology descriptions.
   *
- * SD_SHARE_CPUCAPACITY      - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA                - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ * These flags are purely descriptive of the topology and do not prescribe
+ * behaviour. Behaviour is artificial and mapped in the sd_init() function
+ * below:
+ *
+ *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
+ *   SD_SHARE_PKG_RESOURCES - describes shared caches
+ *   SD_NUMA                - describes NUMA topologies
+ *   SD_SHARE_POWERDOMAIN   - describes shared power domain
+ *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
+ *
+ * Odd one out, which besides describing the topology has a quirk: it also
+ * prescribes the desired behaviour that goes along with it:
   *
- * Odd one out:
- * SD_ASYM_PACKING        - describes SMT quirks
+ *   SD_ASYM_PACKING        - describes SMT quirks
   */
  #define TOPOLOGY_SD_FLAGS              \
         (SD_SHARE_CPUCAPACITY |         \
          SD_SHARE_PKG_RESOURCES |       \
          SD_NUMA |                      \
          SD_ASYM_PACKING |              \
+        SD_ASYM_CPUCAPACITY |          \
          SD_SHARE_POWERDOMAIN)
  
  static struct sched_domain *
-sd_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl,
+       struct sched_domain *child, int cpu)
  {
         struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
         int sd_weight, sd_flags = 0;
@@ -6442,6 +6456,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
                 .smt_gain               = 0,
                 .max_newidle_lb_cost    = 0,
                 .next_decay_max_lb_cost = jiffies,
+               .child                  = child,
  #ifdef CONFIG_SCHED_DEBUG
                 .name                   = tl->name,
  #endif
@@ -6451,6 +6466,13 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
          * Convert topological properties into behaviour.
          */
  
+       if (sd->flags & SD_ASYM_CPUCAPACITY) {
+               struct sched_domain *t = sd;
+
+               for_each_lower_domain(t)
+                       t->flags |= SD_BALANCE_WAKE;
+       }
+
         if (sd->flags & SD_SHARE_CPUCAPACITY) {
                 sd->flags |= SD_PREFER_SIBLING;
                 sd->imbalance_pct = 110;
@@ -6866,16 +6888,13 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
                 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
                 struct sched_domain *child, int cpu)
  {
-       struct sched_domain *sd = sd_init(tl, cpu);
-       if (!sd)
-               return child;
+       struct sched_domain *sd = sd_init(tl, child, cpu);
  
         cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
         if (child) {
                 sd->level = child->level + 1;
                 sched_domain_level_max = max(sched_domain_level_max, sd->level);
                 child->parent = sd;
-               sd->child = child;
  
                 if (!cpumask_subset(sched_domain_span(child),
                                     sched_domain_span(sd))) {
@@ -6906,6 +6925,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
         enum s_alloc alloc_state;
         struct sched_domain *sd;
         struct s_data d;
+       struct rq *rq = NULL;
         int i, ret = -ENOMEM;
  
         alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -6956,11 +6976,22 @@ static int build_sched_domains(const struct cpumask *cpu_map,
         /* Attach the domains */
         rcu_read_lock();
         for_each_cpu(i, cpu_map) {
+               rq = cpu_rq(i);
                 sd = *per_cpu_ptr(d.sd, i);
+
+               /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
+               if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
+                       WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+
                 cpu_attach_domain(sd, d.rd, i);
         }
         rcu_read_unlock();
  
+       if (rq) {
+               pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
+                       cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
+       }
+
         ret = 0;
  error:
         __free_domain_allocs(&d, alloc_state, cpu_map);
@@ -7592,6 +7623,7 @@ EXPORT_SYMBOL(__might_sleep);
  void ___might_sleep(const char *file, int line, int preempt_offset)
  {
         static unsigned long prev_jiffy;        /* ratelimiting */
+       unsigned long preempt_disable_ip;
  
         rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
         if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
@@ -7602,6 +7634,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
                 return;
         prev_jiffy = jiffies;
  
+       /* Save this before calling printk(), since that will clobber it */
+       preempt_disable_ip = get_preempt_disable_ip(current);
+
         printk(KERN_ERR
                 "BUG: sleeping function called from invalid context at %s:%d\n",
                         file, line);
@@ -7616,14 +7651,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
         debug_show_held_locks(current);
         if (irqs_disabled())
                 print_irqtrace_events(current);
-#ifdef CONFIG_DEBUG_PREEMPT
-       if (!preempt_count_equals(preempt_offset)) {
+       if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+           && !preempt_count_equals(preempt_offset)) {
                 pr_err("Preemption disabled at:");
-               print_ip_sym(current->preempt_disable_ip);
+               print_ip_sym(preempt_disable_ip);
                 pr_cont("\n");
         }
-#endif
         dump_stack();
+       add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
  }
  EXPORT_SYMBOL(___might_sleep);
  #endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c

index a846cf8..b93c72d 100644 (file)
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -306,6 +306,26 @@ static inline cputime_t account_other_time(cputime_t max)
         return accounted;
  }
  
+#ifdef CONFIG_64BIT
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+       return t->se.sum_exec_runtime;
+}
+#else
+static u64 read_sum_exec_runtime(struct task_struct *t)
+{
+       u64 ns;
+       struct rq_flags rf;
+       struct rq *rq;
+
+       rq = task_rq_lock(t, &rf);
+       ns = t->se.sum_exec_runtime;
+       task_rq_unlock(rq, t, &rf);
+
+       return ns;
+}
+#endif
+
  /*
   * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
   * tasks (sum on group iteration) belonging to @tsk's group.
@@ -318,6 +338,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
         unsigned int seq, nextseq;
         unsigned long flags;
  
+       /*
+        * Update current task runtime to account for pending time since the
+        * last scheduler action or thread_group_cputime() call. This thread
+        * group might have other running tasks on different CPUs, but updating
+        * their runtime can affect syscall performance, so we skip accounting
+        * those pending times and rely only on values updated on tick or
+        * other scheduler action.
+        */
+       if (same_thread_group(current, tsk))
+               (void) task_sched_runtime(current);
+
         rcu_read_lock();
         /* Attempt a lockless read on the first round. */
         nextseq = 0;
@@ -332,7 +363,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
                         task_cputime(t, &utime, &stime);
                         times->utime += utime;
                         times->stime += stime;
-                       times->sum_exec_runtime += task_sched_runtime(t);
+                       times->sum_exec_runtime += read_sum_exec_runtime(t);
                 }
                 /* If lockless access failed, take the lock. */
                 nextseq = 1;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c

index 1ce8867..d091f4a 100644 (file)
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -346,12 +346,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
   * one, and to (try to!) reconcile itself with its own scheduling
   * parameters.
   */
-static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
-                                      struct sched_dl_entity *pi_se)
+static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
  {
         struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
         struct rq *rq = rq_of_dl_rq(dl_rq);
  
+       WARN_ON(dl_se->dl_boosted);
         WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
  
         /*
@@ -367,8 +367,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
          * future; in fact, we must consider execution overheads (time
          * spent on hardirq context, etc.).
          */
-       dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
-       dl_se->runtime = pi_se->dl_runtime;
+       dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
+       dl_se->runtime = dl_se->dl_runtime;
  }
  
  /*
@@ -1723,10 +1723,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
   */
  static void switched_to_dl(struct rq *rq, struct task_struct *p)
  {
+
+       /* If p is not queued we will update its parameters at next wakeup. */
+       if (!task_on_rq_queued(p))
+               return;
+
+       /*
+        * If p is boosted we already updated its params in
+        * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
+        * p's deadline being now already after rq_clock(rq).
+        */
         if (dl_time_before(p->dl.deadline, rq_clock(rq)))
-               setup_new_dl_entity(&p->dl, &p->dl);
+               setup_new_dl_entity(&p->dl);
  
-       if (task_on_rq_queued(p) && rq->curr != p) {
+       if (rq->curr != p) {
  #ifdef CONFIG_SMP
                 if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
                         queue_push_tasks(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 039de34..61d4854 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
  unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
  #endif
  
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * margin < capacity * 1024
+ */
+unsigned int capacity_margin = 1280; /* ~20% */
+
  static inline void update_load_add(struct load_weight *lw, unsigned long inc)
  {
         lw->weight += inc;
@@ -656,7 +662,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  }
  
  #ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
  static unsigned long task_h_load(struct task_struct *p);
  
  /*
@@ -726,7 +732,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
         struct sched_avg *sa = &se->avg;
         long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
         u64 now = cfs_rq_clock_task(cfs_rq);
-       int tg_update;
  
         if (cap > 0) {
                 if (cfs_rq->avg.util_avg != 0) {
@@ -759,10 +764,9 @@ void post_init_entity_util_avg(struct sched_entity *se)
                 }
         }
  
-       tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+       update_cfs_rq_load_avg(now, cfs_rq, false);
         attach_entity_load_avg(cfs_rq, se);
-       if (tg_update)
-               update_tg_load_avg(cfs_rq, false);
+       update_tg_load_avg(cfs_rq, false);
  }
  
  #else /* !CONFIG_SMP */
@@ -1514,7 +1518,8 @@ balance:
          * Call select_idle_sibling to maybe find a better one.
          */
         if (!cur)
-               env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+               env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
+                                                  env->dst_cpu);
  
  assign:
         task_numa_assign(env, cur, imp);
@@ -2803,9 +2808,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * Updating tg's load_avg is necessary before update_cfs_share (which is done)
- * and effective_load (which is not done because it is too costly).
+/**
+ * update_tg_load_avg - update the tg's load avg
+ * @cfs_rq: the cfs_rq whose avg changed
+ * @force: update regardless of how small the difference
+ *
+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
+ * However, because tg->load_avg is a global value there are performance
+ * considerations.
+ *
+ * In order to avoid having to look at the other cfs_rq's, we use a
+ * differential update where we store the last value we propagated. This in
+ * turn allows skipping updates if the differential is 'small'.
+ *
+ * Updating tg's load_avg is necessary before update_cfs_share() (which is
+ * done) and effective_load() (which is not done because it is too costly).
   */
  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
  {
@@ -2931,10 +2948,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
   *
   * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
   *
- * Returns true if the load decayed or we removed utilization. It is expected
- * that one calls update_tg_load_avg() on this condition, but after you've
- * modified the cfs_rq avg (attach/detach), such that we propagate the new
- * avg up.
+ * Returns true if the load decayed or we removed load.
+ *
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ * call update_tg_load_avg() when this function returns true.
   */
  static inline int
  update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@ -5091,18 +5108,18 @@ static int wake_wide(struct task_struct *p)
         return 1;
  }
  
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+                      int prev_cpu, int sync)
  {
         s64 this_load, load;
         s64 this_eff_load, prev_eff_load;
-       int idx, this_cpu, prev_cpu;
+       int idx, this_cpu;
         struct task_group *tg;
         unsigned long weight;
         int balanced;
  
         idx       = sd->wake_idx;
         this_cpu  = smp_processor_id();
-       prev_cpu  = task_cpu(p);
         load      = source_load(prev_cpu, idx);
         this_load = target_load(this_cpu, idx);
  
@@ -5228,6 +5245,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
         int shallowest_idle_cpu = -1;
         int i;
  
+       /* Check if we have any choice: */
+       if (group->group_weight == 1)
+               return cpumask_first(sched_group_cpus(group));
+
         /* Traverse only the allowed CPUs */
         for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
                 if (idle_cpu(i)) {
@@ -5267,11 +5288,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  /*
   * Try and locate an idle CPU in the sched_domain.
   */
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
  {
         struct sched_domain *sd;
         struct sched_group *sg;
-       int i = task_cpu(p);
  
         if (idle_cpu(target))
                 return target;
@@ -5279,8 +5299,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
         /*
          * If the prevous cpu is cache affine and idle, don't be stupid.
          */
-       if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
-               return i;
+       if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+               return prev;
  
         /*
          * Otherwise, iterate the domains and find an eligible idle cpu.
@@ -5301,6 +5321,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
         for_each_lower_domain(sd) {
                 sg = sd->groups;
                 do {
+                       int i;
+
                         if (!cpumask_intersects(sched_group_cpus(sg),
                                                 tsk_cpus_allowed(p)))
                                 goto next;
@@ -5360,6 +5382,32 @@ static int cpu_util(int cpu)
         return (util >= capacity) ? capacity : util;
  }
  
+static inline int task_util(struct task_struct *p)
+{
+       return p->se.avg.util_avg;
+}
+
+/*
+ * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
+ * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
+ *
+ * In that case WAKE_AFFINE doesn't make sense and we'll let
+ * BALANCE_WAKE sort things out.
+ */
+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+{
+       long min_cap, max_cap;
+
+       min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
+       max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
+
+       /* Minimum capacity is close to max, no need to abort wake_affine */
+       if (max_cap - min_cap < max_cap >> 3)
+               return 0;
+
+       return min_cap * 1024 < task_util(p) * capacity_margin;
+}
+
  /*
   * select_task_rq_fair: Select target runqueue for the waking task in domains
   * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -5383,7 +5431,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
  
         if (sd_flag & SD_BALANCE_WAKE) {
                 record_wakee(p);
-               want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+               want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
+                             && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
         }
  
         rcu_read_lock();
@@ -5409,13 +5458,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
  
         if (affine_sd) {
                 sd = NULL; /* Prefer wake_affine over balance flags */
-               if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+               if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
                         new_cpu = cpu;
         }
  
         if (!sd) {
                 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
-                       new_cpu = select_idle_sibling(p, new_cpu);
+                       new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
  
         } else while (sd) {
                 struct sched_group *group;
@@ -7704,11 +7753,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
  }
  
  static inline void
-update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
  {
         unsigned long interval, next;
  
-       interval = get_sd_balance_interval(sd, cpu_busy);
+       /* used by idle balance, so cpu_busy = 0 */
+       interval = get_sd_balance_interval(sd, 0);
         next = sd->last_balance + interval;
  
         if (time_after(*next_balance, next))
@@ -7738,7 +7788,7 @@ static int idle_balance(struct rq *this_rq)
                 rcu_read_lock();
                 sd = rcu_dereference_check_sched_domain(this_rq->sd);
                 if (sd)
-                       update_next_balance(sd, 0, &next_balance);
+                       update_next_balance(sd, &next_balance);
                 rcu_read_unlock();
  
                 goto out;
@@ -7756,7 +7806,7 @@ static int idle_balance(struct rq *this_rq)
                         continue;
  
                 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
-                       update_next_balance(sd, 0, &next_balance);
+                       update_next_balance(sd, &next_balance);
                         break;
                 }
  
@@ -7774,7 +7824,7 @@ static int idle_balance(struct rq *this_rq)
                         curr_cost += domain_cost;
                 }
  
-               update_next_balance(sd, 0, &next_balance);
+               update_next_balance(sd, &next_balance);
  
                 /*
                  * Stop searching for tasks to pull if there are
@@ -8441,7 +8491,6 @@ static void detach_task_cfs_rq(struct task_struct *p)
         struct sched_entity *se = &p->se;
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         u64 now = cfs_rq_clock_task(cfs_rq);
-       int tg_update;
  
         if (!vruntime_normalized(p)) {
                 /*
@@ -8453,10 +8502,9 @@ static void detach_task_cfs_rq(struct task_struct *p)
         }
  
         /* Catch up with the cfs_rq and remove our load when we leave */
-       tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+       update_cfs_rq_load_avg(now, cfs_rq, false);
         detach_entity_load_avg(cfs_rq, se);
-       if (tg_update)
-               update_tg_load_avg(cfs_rq, false);
+       update_tg_load_avg(cfs_rq, false);
  }
  
  static void attach_task_cfs_rq(struct task_struct *p)
@@ -8464,7 +8512,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
         struct sched_entity *se = &p->se;
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         u64 now = cfs_rq_clock_task(cfs_rq);
-       int tg_update;
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
         /*
@@ -8475,10 +8522,9 @@ static void attach_task_cfs_rq(struct task_struct *p)
  #endif
  
         /* Synchronize task with its cfs_rq */
-       tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+       update_cfs_rq_load_avg(now, cfs_rq, false);
         attach_entity_load_avg(cfs_rq, se);
-       if (tg_update)
-               update_tg_load_avg(cfs_rq, false);
+       update_tg_load_avg(cfs_rq, false);
  
         if (!vruntime_normalized(p))
                 se->vruntime += cfs_rq->min_vruntime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index c64fc51..420c05d 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -565,6 +565,8 @@ struct root_domain {
          */
         cpumask_var_t rto_mask;
         struct cpupri cpupri;
+
+       unsigned long max_cpu_capacity;
  };
  
  extern struct root_domain def_root_domain;
@@ -597,7 +599,6 @@ struct rq {
  #ifdef CONFIG_SMP
         unsigned long last_load_update_tick;
  #endif /* CONFIG_SMP */
-       u64 nohz_stamp;
         unsigned long nohz_flags;
  #endif /* CONFIG_NO_HZ_COMMON */
  #ifdef CONFIG_NO_HZ_FULL
author	Ingo Molnar <mingo@kernel.org>
	Mon, 5 Sep 2016 11:24:11 +0000 (13:24 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Mon, 5 Sep 2016 11:24:11 +0000 (13:24 +0200)
include/linux/sched.h		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
kernel/sched/cputime.c		patch \| blob \| history
kernel/sched/deadline.c		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history
kernel/sched/sched.h		patch \| blob \| history