Merge branch 'linus' into sched/core, to pick up fixes

author Ingo Molnar <mingo@kernel.org>

Fri, 30 Sep 2016 08:44:27 +0000 (10:44 +0200)

committer Ingo Molnar <mingo@kernel.org>

Fri, 30 Sep 2016 08:44:27 +0000 (10:44 +0200)
author Ingo Molnar <mingo@kernel.org>
Fri, 30 Sep 2016 08:44:27 +0000 (10:44 +0200)
committer Ingo Molnar <mingo@kernel.org>
Fri, 30 Sep 2016 08:44:27 +0000 (10:44 +0200)
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt

index 53a2fe1..8e37b0b 100644 (file)
--- a/Documentation/scheduler/sched-deadline.txt
+++ b/Documentation/scheduler/sched-deadline.txt
@@ -16,6 +16,7 @@ CONTENTS
     4.1 System-wide settings
     4.2 Task interface
     4.3 Default behavior
+   4.4 Behavior of sched_yield()
   5. Tasks CPU affinity
     5.1 SCHED_DEADLINE and cpusets HOWTO
   6. Future plans
@@ -426,6 +427,23 @@ CONTENTS
   Finally, notice that in order not to jeopardize the admission control a
   -deadline task cannot fork.
  
+
+4.4 Behavior of sched_yield()
+-----------------------------
+
+ When a SCHED_DEADLINE task calls sched_yield(), it gives up its
+ remaining runtime and is immediately throttled, until the next
+ period, when its runtime will be replenished (a special flag
+ dl_yielded is set and used to correctly handle throttling and runtime
+ replenishment after a call to sched_yield()).
+
+ This behavior of sched_yield() allows the task to wake up exactly at
+ the beginning of the next period. Also, this may be useful in the
+ future with bandwidth reclaiming mechanisms, where sched_yield() will
+ make the leftover runtime available for reclamation by other
+ SCHED_DEADLINE tasks.
+
+
  5. Tasks CPU affinity
  =====================
  
diff --git a/include/linux/kernel.h b/include/linux/kernel.h

index d96a611..74fd6f0 100644 (file)
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -259,17 +259,14 @@ static inline void might_fault(void) { }
  extern struct atomic_notifier_head panic_notifier_list;
  extern long (*panic_blink)(int state);
  __printf(1, 2)
-void panic(const char *fmt, ...)
-       __noreturn __cold;
+void panic(const char *fmt, ...) __noreturn __cold;
  void nmi_panic(struct pt_regs *regs, const char *msg);
  extern void oops_enter(void);
  extern void oops_exit(void);
  void print_oops_end_marker(void);
  extern int oops_may_print(void);
-void do_exit(long error_code)
-       __noreturn;
-void complete_and_exit(struct completion *, long)
-       __noreturn;
+void do_exit(long error_code) __noreturn;
+void complete_and_exit(struct completion *, long) __noreturn;
  
  /* Internal, do not use. */
  int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 62c68e5..b99fcd1 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -448,6 +448,8 @@ static inline void io_schedule(void)
         io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
  }
  
+void __noreturn do_task_dead(void);
+
  struct nsproxy;
  struct user_namespace;
  
@@ -1022,7 +1024,8 @@ extern void wake_up_q(struct wake_q_head *head);
  #define SD_BALANCE_FORK                0x0008  /* Balance on fork, clone */
  #define SD_BALANCE_WAKE                0x0010  /* Balance on wakeup */
  #define SD_WAKE_AFFINE         0x0020  /* Wake task to waking CPU */
-#define SD_SHARE_CPUCAPACITY   0x0080  /* Domain members share cpu power */
+#define SD_ASYM_CPUCAPACITY    0x0040  /* Groups have different max cpu capacities */
+#define SD_SHARE_CPUCAPACITY   0x0080  /* Domain members share cpu capacity */
  #define SD_SHARE_POWERDOMAIN   0x0100  /* Domain members share power domain */
  #define SD_SHARE_PKG_RESOURCES 0x0200  /* Domain members share cpu pkg resources */
  #define SD_SERIALIZE           0x0400  /* Only a single load balancing instance */
@@ -3206,7 +3209,11 @@ static inline int signal_pending_state(long state, struct task_struct *p)
   * cond_resched_lock() will drop the spinlock before scheduling,
   * cond_resched_softirq() will enable bhs before scheduling.
   */
+#ifndef CONFIG_PREEMPT
  extern int _cond_resched(void);
+#else
+static inline int _cond_resched(void) { return 0; }
+#endif
  
  #define cond_resched() ({                      \
         ___might_sleep(__FILE__, __LINE__, 0);  \
@@ -3236,6 +3243,15 @@ static inline void cond_resched_rcu(void)
  #endif
  }
  
+static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+       return p->preempt_disable_ip;
+#else
+       return 0;
+#endif
+}
+
  /*
   * Does a critical section need to be broken due to another
   * task waiting?: (technically does not depend on CONFIG_PREEMPT,
diff --git a/kernel/exit.c b/kernel/exit.c

index 091a78b..1e1d913 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -725,7 +725,7 @@ static void check_stack_usage(void)
  static inline void check_stack_usage(void) {}
  #endif
  
-void do_exit(long code)
+void __noreturn do_exit(long code)
  {
         struct task_struct *tsk = current;
         int group_dead;
@@ -882,29 +882,7 @@ void do_exit(long code)
         exit_rcu();
         TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
  
-       /*
-        * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
-        * when the following two conditions become true.
-        *   - There is race condition of mmap_sem (It is acquired by
-        *     exit_mm()), and
-        *   - SMI occurs before setting TASK_RUNINNG.
-        *     (or hypervisor of virtual machine switches to other guest)
-        *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
-        *
-        * To avoid it, we have to wait for releasing tsk->pi_lock which
-        * is held by try_to_wake_up()
-        */
-       smp_mb();
-       raw_spin_unlock_wait(&tsk->pi_lock);
-
-       /* causes final put_task_struct in finish_task_switch(). */
-       tsk->state = TASK_DEAD;
-       tsk->flags |= PF_NOFREEZE;      /* tell freezer to ignore us */
-       schedule();
-       BUG();
-       /* Avoid "noreturn function does return".  */
-       for (;;)
-               cpu_relax();    /* For when BUG is null */
+       do_task_dead();
  }
  EXPORT_SYMBOL_GPL(do_exit);
  
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 44817c6..8bae0cd 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1063,8 +1063,12 @@ static int migration_cpu_stop(void *data)
          * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
          * we're holding p->pi_lock.
          */
-       if (task_rq(p) == rq && task_on_rq_queued(p))
-               rq = __migrate_task(rq, p, arg->dest_cpu);
+       if (task_rq(p) == rq) {
+               if (task_on_rq_queued(p))
+                       rq = __migrate_task(rq, p, arg->dest_cpu);
+               else
+                       p->wake_cpu = arg->dest_cpu;
+       }
         raw_spin_unlock(&rq->lock);
         raw_spin_unlock(&p->pi_lock);
  
@@ -1265,7 +1269,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
                 /*
                  * Task isn't running anymore; make it appear like we migrated
                  * it before it went to sleep. This means on wakeup we make the
-                * previous cpu our targer instead of where it really is.
+                * previous cpu our target instead of where it really is.
                  */
                 p->wake_cpu = cpu;
         }
@@ -1629,23 +1633,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
  static void
  ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
  {
-#ifdef CONFIG_SCHEDSTATS
-       struct rq *rq = this_rq();
+       struct rq *rq;
  
-#ifdef CONFIG_SMP
-       int this_cpu = smp_processor_id();
+       if (!schedstat_enabled())
+               return;
  
-       if (cpu == this_cpu) {
-               schedstat_inc(rq, ttwu_local);
-               schedstat_inc(p, se.statistics.nr_wakeups_local);
+       rq = this_rq();
+
+#ifdef CONFIG_SMP
+       if (cpu == rq->cpu) {
+               schedstat_inc(rq->ttwu_local);
+               schedstat_inc(p->se.statistics.nr_wakeups_local);
         } else {
                 struct sched_domain *sd;
  
-               schedstat_inc(p, se.statistics.nr_wakeups_remote);
+               schedstat_inc(p->se.statistics.nr_wakeups_remote);
                 rcu_read_lock();
-               for_each_domain(this_cpu, sd) {
+               for_each_domain(rq->cpu, sd) {
                         if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-                               schedstat_inc(sd, ttwu_wake_remote);
+                               schedstat_inc(sd->ttwu_wake_remote);
                                 break;
                         }
                 }
@@ -1653,17 +1659,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
         }
  
         if (wake_flags & WF_MIGRATED)
-               schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-
+               schedstat_inc(p->se.statistics.nr_wakeups_migrate);
  #endif /* CONFIG_SMP */
  
-       schedstat_inc(rq, ttwu_count);
-       schedstat_inc(p, se.statistics.nr_wakeups);
+       schedstat_inc(rq->ttwu_count);
+       schedstat_inc(p->se.statistics.nr_wakeups);
  
         if (wake_flags & WF_SYNC)
-               schedstat_inc(p, se.statistics.nr_wakeups_sync);
-
-#endif /* CONFIG_SCHEDSTATS */
+               schedstat_inc(p->se.statistics.nr_wakeups_sync);
  }
  
  static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
@@ -2084,8 +2087,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
  
         ttwu_queue(p, cpu, wake_flags);
  stat:
-       if (schedstat_enabled())
-               ttwu_stat(p, cpu, wake_flags);
+       ttwu_stat(p, cpu, wake_flags);
  out:
         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
  
@@ -2095,6 +2097,7 @@ out:
  /**
   * try_to_wake_up_local - try to wake up a local task with rq lock held
   * @p: the thread to be awakened
+ * @cookie: context's cookie for pinning
   *
   * Put @p on the run-queue if it's not already there. The caller must
   * ensure that this_rq() is locked, @p is bound to this_rq() and not
@@ -2133,8 +2136,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
                 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
  
         ttwu_do_wakeup(rq, p, 0, cookie);
-       if (schedstat_enabled())
-               ttwu_stat(p, smp_processor_id(), 0);
+       ttwu_stat(p, smp_processor_id(), 0);
  out:
         raw_spin_unlock(&p->pi_lock);
  }
@@ -3192,6 +3194,9 @@ static inline void preempt_latency_stop(int val) { }
   */
  static noinline void __schedule_bug(struct task_struct *prev)
  {
+       /* Save this before calling printk(), since that will clobber it */
+       unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
+
         if (oops_in_progress)
                 return;
  
@@ -3202,13 +3207,12 @@ static noinline void __schedule_bug(struct task_struct *prev)
         print_modules();
         if (irqs_disabled())
                 print_irqtrace_events(prev);
-#ifdef CONFIG_DEBUG_PREEMPT
-       if (in_atomic_preempt_off()) {
+       if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+           && in_atomic_preempt_off()) {
                 pr_err("Preemption disabled at:");
-               print_ip_sym(current->preempt_disable_ip);
+               print_ip_sym(preempt_disable_ip);
                 pr_cont("\n");
         }
-#endif
         if (panic_on_warn)
                 panic("scheduling while atomic\n");
  
@@ -3234,7 +3238,7 @@ static inline void schedule_debug(struct task_struct *prev)
  
         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
  
-       schedstat_inc(this_rq(), sched_count);
+       schedstat_inc(this_rq()->sched_count);
  }
  
  /*
@@ -3327,17 +3331,6 @@ static void __sched notrace __schedule(bool preempt)
         rq = cpu_rq(cpu);
         prev = rq->curr;
  
-       /*
-        * do_exit() calls schedule() with preemption disabled as an exception;
-        * however we must fix that up, otherwise the next task will see an
-        * inconsistent (higher) preempt count.
-        *
-        * It also avoids the below schedule_debug() test from complaining
-        * about this.
-        */
-       if (unlikely(prev->state == TASK_DEAD))
-               preempt_enable_no_resched_notrace();
-
         schedule_debug(prev);
  
         if (sched_feat(HRTICK))
@@ -3405,6 +3398,33 @@ static void __sched notrace __schedule(bool preempt)
  }
  STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
  
+void __noreturn do_task_dead(void)
+{
+       /*
+        * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
+        * when the following two conditions become true.
+        *   - There is race condition of mmap_sem (It is acquired by
+        *     exit_mm()), and
+        *   - SMI occurs before setting TASK_RUNNING.
+        *     (or hypervisor of virtual machine switches to other guest)
+        *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
+        *
+        * To avoid it, we have to wait for releasing tsk->pi_lock which
+        * is held by try_to_wake_up()
+        */
+       smp_mb();
+       raw_spin_unlock_wait(&current->pi_lock);
+
+       /* causes final put_task_struct in finish_task_switch(). */
+       __set_current_state(TASK_DEAD);
+       current->flags |= PF_NOFREEZE;  /* tell freezer to ignore us */
+       __schedule(false);
+       BUG();
+       /* Avoid "noreturn function does return".  */
+       for (;;)
+               cpu_relax();    /* For when BUG is null */
+}
+
  static inline void sched_submit_work(struct task_struct *tsk)
  {
         if (!tsk->state || tsk_is_pi_blocked(tsk))
@@ -4846,7 +4866,7 @@ SYSCALL_DEFINE0(sched_yield)
  {
         struct rq *rq = this_rq_lock();
  
-       schedstat_inc(rq, yld_count);
+       schedstat_inc(rq->yld_count);
         current->sched_class->yield_task(rq);
  
         /*
@@ -4863,6 +4883,7 @@ SYSCALL_DEFINE0(sched_yield)
         return 0;
  }
  
+#ifndef CONFIG_PREEMPT
  int __sched _cond_resched(void)
  {
         if (should_resched(0)) {
@@ -4872,6 +4893,7 @@ int __sched _cond_resched(void)
         return 0;
  }
  EXPORT_SYMBOL(_cond_resched);
+#endif
  
  /*
   * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -4997,7 +5019,7 @@ again:
  
         yielded = curr->sched_class->yield_to_task(rq, p, preempt);
         if (yielded) {
-               schedstat_inc(rq, yld_count);
+               schedstat_inc(rq->yld_count);
                 /*
                  * Make p's CPU reschedule; pick_next_entity takes care of
                  * fairness.
@@ -5717,6 +5739,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
         }
  }
  #else /* !CONFIG_SCHED_DEBUG */
+
+# define sched_debug_enabled 0
  # define sched_domain_debug(sd, cpu) do { } while (0)
  static inline bool sched_debug(void)
  {
@@ -5735,6 +5759,7 @@ static int sd_degenerate(struct sched_domain *sd)
                          SD_BALANCE_FORK |
                          SD_BALANCE_EXEC |
                          SD_SHARE_CPUCAPACITY |
+                        SD_ASYM_CPUCAPACITY |
                          SD_SHARE_PKG_RESOURCES |
                          SD_SHARE_POWERDOMAIN)) {
                 if (sd->groups != sd->groups->next)
@@ -5765,6 +5790,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                                 SD_BALANCE_NEWIDLE |
                                 SD_BALANCE_FORK |
                                 SD_BALANCE_EXEC |
+                               SD_ASYM_CPUCAPACITY |
                                 SD_SHARE_CPUCAPACITY |
                                 SD_SHARE_PKG_RESOURCES |
                                 SD_PREFER_SIBLING |
@@ -6374,23 +6400,32 @@ static int sched_domains_curr_level;
  /*
   * SD_flags allowed in topology descriptions.
   *
- * SD_SHARE_CPUCAPACITY      - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA                - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ * These flags are purely descriptive of the topology and do not prescribe
+ * behaviour. Behaviour is artificial and mapped in the below sd_init()
+ * function:
   *
- * Odd one out:
- * SD_ASYM_PACKING        - describes SMT quirks
+ *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
+ *   SD_SHARE_PKG_RESOURCES - describes shared caches
+ *   SD_NUMA                - describes NUMA topologies
+ *   SD_SHARE_POWERDOMAIN   - describes shared power domain
+ *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
+ *
+ * Odd one out, which besides describing a quirk of the topology also
+ * prescribes the desired behaviour that goes along with it:
+ *
+ *   SD_ASYM_PACKING        - describes SMT quirks
   */
  #define TOPOLOGY_SD_FLAGS              \
         (SD_SHARE_CPUCAPACITY |         \
          SD_SHARE_PKG_RESOURCES |       \
          SD_NUMA |                      \
          SD_ASYM_PACKING |              \
+        SD_ASYM_CPUCAPACITY |          \
          SD_SHARE_POWERDOMAIN)
  
  static struct sched_domain *
-sd_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl,
+       struct sched_domain *child, int cpu)
  {
         struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
         int sd_weight, sd_flags = 0;
@@ -6442,6 +6477,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
                 .smt_gain               = 0,
                 .max_newidle_lb_cost    = 0,
                 .next_decay_max_lb_cost = jiffies,
+               .child                  = child,
  #ifdef CONFIG_SCHED_DEBUG
                 .name                   = tl->name,
  #endif
@@ -6451,6 +6487,13 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
          * Convert topological properties into behaviour.
          */
  
+       if (sd->flags & SD_ASYM_CPUCAPACITY) {
+               struct sched_domain *t = sd;
+
+               for_each_lower_domain(t)
+                       t->flags |= SD_BALANCE_WAKE;
+       }
+
         if (sd->flags & SD_SHARE_CPUCAPACITY) {
                 sd->flags |= SD_PREFER_SIBLING;
                 sd->imbalance_pct = 110;
@@ -6866,16 +6909,13 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
                 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
                 struct sched_domain *child, int cpu)
  {
-       struct sched_domain *sd = sd_init(tl, cpu);
-       if (!sd)
-               return child;
+       struct sched_domain *sd = sd_init(tl, child, cpu);
  
         cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
         if (child) {
                 sd->level = child->level + 1;
                 sched_domain_level_max = max(sched_domain_level_max, sd->level);
                 child->parent = sd;
-               sd->child = child;
  
                 if (!cpumask_subset(sched_domain_span(child),
                                     sched_domain_span(sd))) {
@@ -6906,6 +6946,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
         enum s_alloc alloc_state;
         struct sched_domain *sd;
         struct s_data d;
+       struct rq *rq = NULL;
         int i, ret = -ENOMEM;
  
         alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -6956,11 +6997,22 @@ static int build_sched_domains(const struct cpumask *cpu_map,
         /* Attach the domains */
         rcu_read_lock();
         for_each_cpu(i, cpu_map) {
+               rq = cpu_rq(i);
                 sd = *per_cpu_ptr(d.sd, i);
+
+               /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
+               if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
+                       WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+
                 cpu_attach_domain(sd, d.rd, i);
         }
         rcu_read_unlock();
  
+       if (rq && sched_debug_enabled) {
+               pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
+                       cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
+       }
+
         ret = 0;
  error:
         __free_domain_allocs(&d, alloc_state, cpu_map);
@@ -7523,21 +7575,12 @@ void __init sched_init(void)
  
         set_load_weight(&init_task);
  
-#ifdef CONFIG_PREEMPT_NOTIFIERS
-       INIT_HLIST_HEAD(&init_task.preempt_notifiers);
-#endif
-
         /*
          * The boot idle thread does lazy MMU switching as well:
          */
         atomic_inc(&init_mm.mm_count);
         enter_lazy_tlb(&init_mm, current);
  
-       /*
-        * During early bootup we pretend to be a normal task:
-        */
-       current->sched_class = &fair_sched_class;
-
         /*
          * Make us the idle thread. Technically, schedule() should not be
          * called from this thread, however somewhere below it might be,
@@ -7592,6 +7635,7 @@ EXPORT_SYMBOL(__might_sleep);
  void ___might_sleep(const char *file, int line, int preempt_offset)
  {
         static unsigned long prev_jiffy;        /* ratelimiting */
+       unsigned long preempt_disable_ip;
  
         rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
         if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
@@ -7602,6 +7646,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
                 return;
         prev_jiffy = jiffies;
  
+       /* Save this before calling printk(), since that will clobber it */
+       preempt_disable_ip = get_preempt_disable_ip(current);
+
         printk(KERN_ERR
                 "BUG: sleeping function called from invalid context at %s:%d\n",
                         file, line);
@@ -7616,14 +7663,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
         debug_show_held_locks(current);
         if (irqs_disabled())
                 print_irqtrace_events(current);
-#ifdef CONFIG_DEBUG_PREEMPT
-       if (!preempt_count_equals(preempt_offset)) {
+       if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+           && !preempt_count_equals(preempt_offset)) {
                 pr_err("Preemption disabled at:");
-               print_ip_sym(current->preempt_disable_ip);
+               print_ip_sym(preempt_disable_ip);
                 pr_cont("\n");
         }
-#endif
         dump_stack();
+       add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
  }
  EXPORT_SYMBOL(___might_sleep);
  #endif
@@ -7644,12 +7691,10 @@ void normalize_rt_tasks(void)
                 if (p->flags & PF_KTHREAD)
                         continue;
  
-               p->se.exec_start                = 0;
-#ifdef CONFIG_SCHEDSTATS
-               p->se.statistics.wait_start     = 0;
-               p->se.statistics.sleep_start    = 0;
-               p->se.statistics.block_start    = 0;
-#endif
+               p->se.exec_start = 0;
+               schedstat_set(p->se.statistics.wait_start,  0);
+               schedstat_set(p->se.statistics.sleep_start, 0);
+               schedstat_set(p->se.statistics.block_start, 0);
  
                 if (!dl_task(p) && !rt_task(p)) {
                         /*
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c

index d418449..e731190 100644 (file)
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -31,56 +31,81 @@ static inline int right_child(int i)
         return (i << 1) + 2;
  }
  
-static void cpudl_exchange(struct cpudl *cp, int a, int b)
+static void cpudl_heapify_down(struct cpudl *cp, int idx)
  {
-       int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
+       int l, r, largest;
  
-       swap(cp->elements[a].cpu, cp->elements[b].cpu);
-       swap(cp->elements[a].dl , cp->elements[b].dl );
+       int orig_cpu = cp->elements[idx].cpu;
+       u64 orig_dl = cp->elements[idx].dl;
  
-       swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
-}
-
-static void cpudl_heapify(struct cpudl *cp, int idx)
-{
-       int l, r, largest;
+       if (left_child(idx) >= cp->size)
+               return;
  
         /* adapted from lib/prio_heap.c */
         while(1) {
+               u64 largest_dl;
                 l = left_child(idx);
                 r = right_child(idx);
                 largest = idx;
+               largest_dl = orig_dl;
  
-               if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
-                                                       cp->elements[l].dl))
+               if ((l < cp->size) && dl_time_before(orig_dl,
+                                               cp->elements[l].dl)) {
                         largest = l;
-               if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
-                                                       cp->elements[r].dl))
+                       largest_dl = cp->elements[l].dl;
+               }
+               if ((r < cp->size) && dl_time_before(largest_dl,
+                                               cp->elements[r].dl))
                         largest = r;
+
                 if (largest == idx)
                         break;
  
-               /* Push idx down the heap one level and bump one up */
-               cpudl_exchange(cp, largest, idx);
+               /* pull largest child onto idx */
+               cp->elements[idx].cpu = cp->elements[largest].cpu;
+               cp->elements[idx].dl = cp->elements[largest].dl;
+               cp->elements[cp->elements[idx].cpu].idx = idx;
                 idx = largest;
         }
+       /* actual push down of saved original values orig_* */
+       cp->elements[idx].cpu = orig_cpu;
+       cp->elements[idx].dl = orig_dl;
+       cp->elements[cp->elements[idx].cpu].idx = idx;
  }
  
-static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
+static void cpudl_heapify_up(struct cpudl *cp, int idx)
  {
-       WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
+       int p;
  
-       if (dl_time_before(new_dl, cp->elements[idx].dl)) {
-               cp->elements[idx].dl = new_dl;
-               cpudl_heapify(cp, idx);
-       } else {
-               cp->elements[idx].dl = new_dl;
-               while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
-                                       cp->elements[idx].dl)) {
-                       cpudl_exchange(cp, idx, parent(idx));
-                       idx = parent(idx);
-               }
-       }
+       int orig_cpu = cp->elements[idx].cpu;
+       u64 orig_dl = cp->elements[idx].dl;
+
+       if (idx == 0)
+               return;
+
+       do {
+               p = parent(idx);
+               if (dl_time_before(orig_dl, cp->elements[p].dl))
+                       break;
+               /* pull parent onto idx */
+               cp->elements[idx].cpu = cp->elements[p].cpu;
+               cp->elements[idx].dl = cp->elements[p].dl;
+               cp->elements[cp->elements[idx].cpu].idx = idx;
+               idx = p;
+       } while (idx != 0);
+       /* actual push up of saved original values orig_* */
+       cp->elements[idx].cpu = orig_cpu;
+       cp->elements[idx].dl = orig_dl;
+       cp->elements[cp->elements[idx].cpu].idx = idx;
+}
+
+static void cpudl_heapify(struct cpudl *cp, int idx)
+{
+       if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
+                               cp->elements[idx].dl))
+               cpudl_heapify_up(cp, idx);
+       else
+               cpudl_heapify_down(cp, idx);
  }
  
  static inline int cpudl_maximum(struct cpudl *cp)
@@ -120,16 +145,15 @@ out:
  }
  
  /*
- * cpudl_set - update the cpudl max-heap
+ * cpudl_clear - remove a cpu from the cpudl max-heap
   * @cp: the cpudl max-heap context
   * @cpu: the target cpu
- * @dl: the new earliest deadline for this cpu
   *
   * Notes: assumes cpu_rq(cpu)->lock is locked
   *
   * Returns: (void)
   */
-void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
+void cpudl_clear(struct cpudl *cp, int cpu)
  {
         int old_idx, new_cpu;
         unsigned long flags;
@@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
         WARN_ON(!cpu_present(cpu));
  
         raw_spin_lock_irqsave(&cp->lock, flags);
+
         old_idx = cp->elements[cpu].idx;
-       if (!is_valid) {
-               /* remove item */
-               if (old_idx == IDX_INVALID) {
-                       /*
-                        * Nothing to remove if old_idx was invalid.
-                        * This could happen if a rq_offline_dl is
-                        * called for a CPU without -dl tasks running.
-                        */
-                       goto out;
-               }
+       if (old_idx == IDX_INVALID) {
+               /*
+                * Nothing to remove if old_idx was invalid.
+                * This could happen if a rq_offline_dl is
+                * called for a CPU without -dl tasks running.
+                */
+       } else {
                 new_cpu = cp->elements[cp->size - 1].cpu;
                 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
                 cp->elements[old_idx].cpu = new_cpu;
                 cp->size--;
                 cp->elements[new_cpu].idx = old_idx;
                 cp->elements[cpu].idx = IDX_INVALID;
-               while (old_idx > 0 && dl_time_before(
-                               cp->elements[parent(old_idx)].dl,
-                               cp->elements[old_idx].dl)) {
-                       cpudl_exchange(cp, old_idx, parent(old_idx));
-                       old_idx = parent(old_idx);
-               }
-               cpumask_set_cpu(cpu, cp->free_cpus);
-                cpudl_heapify(cp, old_idx);
+               cpudl_heapify(cp, old_idx);
  
-               goto out;
+               cpumask_set_cpu(cpu, cp->free_cpus);
         }
+       raw_spin_unlock_irqrestore(&cp->lock, flags);
+}
+
+/*
+ * cpudl_set - update the cpudl max-heap
+ * @cp: the cpudl max-heap context
+ * @cpu: the target cpu
+ * @dl: the new earliest deadline for this cpu
+ *
+ * Notes: assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
+{
+       int old_idx;
+       unsigned long flags;
  
+       WARN_ON(!cpu_present(cpu));
+
+       raw_spin_lock_irqsave(&cp->lock, flags);
+
+       old_idx = cp->elements[cpu].idx;
         if (old_idx == IDX_INVALID) {
-               cp->size++;
-               cp->elements[cp->size - 1].dl = dl;
-               cp->elements[cp->size - 1].cpu = cpu;
-               cp->elements[cpu].idx = cp->size - 1;
-               cpudl_change_key(cp, cp->size - 1, dl);
+               int new_idx = cp->size++;
+               cp->elements[new_idx].dl = dl;
+               cp->elements[new_idx].cpu = cpu;
+               cp->elements[cpu].idx = new_idx;
+               cpudl_heapify_up(cp, new_idx);
                 cpumask_clear_cpu(cpu, cp->free_cpus);
         } else {
-               cpudl_change_key(cp, old_idx, dl);
+               cp->elements[old_idx].dl = dl;
+               cpudl_heapify(cp, old_idx);
         }
  
-out:
         raw_spin_unlock_irqrestore(&cp->lock, flags);
  }
  
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h

index fcbdf83..f7da8c5 100644 (file)
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -23,7 +23,8 @@ struct cpudl {
  #ifdef CONFIG_SMP
  int cpudl_find(struct cpudl *cp, struct task_struct *p,
                struct cpumask *later_mask);
-void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
+void cpudl_clear(struct cpudl *cp, int cpu);
  int cpudl_init(struct cpudl *cp);
  void cpudl_set_freecpu(struct cpudl *cp, int cpu);
  void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c

index a846cf8..b93c72d 100644 (file)
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -306,6 +306,26 @@ static inline cputime_t account_other_time(cputime_t max)
         return accounted;
  }
  
+#ifdef CONFIG_64BIT
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+       return t->se.sum_exec_runtime;
+}
+#else
+static u64 read_sum_exec_runtime(struct task_struct *t)
+{
+       u64 ns;
+       struct rq_flags rf;
+       struct rq *rq;
+
+       rq = task_rq_lock(t, &rf);
+       ns = t->se.sum_exec_runtime;
+       task_rq_unlock(rq, t, &rf);
+
+       return ns;
+}
+#endif
+
  /*
   * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
   * tasks (sum on group iteration) belonging to @tsk's group.
@@ -318,6 +338,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
         unsigned int seq, nextseq;
         unsigned long flags;
  
+       /*
+        * Update current task runtime to account pending time since last
+        * scheduler action or thread_group_cputime() call. This thread group
+        * might have other running tasks on different CPUs, but updating
+        * their runtime can affect syscall performance, so we skip accounting
+        * those pending times and rely only on values updated on tick or
+        * other scheduler action.
+        */
+       if (same_thread_group(current, tsk))
+               (void) task_sched_runtime(current);
+
         rcu_read_lock();
         /* Attempt a lockless read on the first round. */
         nextseq = 0;
@@ -332,7 +363,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
                         task_cputime(t, &utime, &stime);
                         times->utime += utime;
                         times->stime += stime;
-                       times->sum_exec_runtime += task_sched_runtime(t);
+                       times->sum_exec_runtime += read_sum_exec_runtime(t);
                 }
                 /* If lockless access failed, take the lock. */
                 nextseq = 1;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c

index 1ce8867..0c75bc6 100644 (file)
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
  static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
  {
         struct rq *later_rq = NULL;
-       bool fallback = false;
  
         later_rq = find_lock_later_rq(p, rq);
-
         if (!later_rq) {
                 int cpu;
  
@@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
                  * If we cannot preempt any rq, fall back to pick any
                  * online cpu.
                  */
-               fallback = true;
                 cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
                 if (cpu >= nr_cpu_ids) {
                         /*
@@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
                 double_lock_balance(rq, later_rq);
         }
  
-       /*
-        * By now the task is replenished and enqueued; migrate it.
-        */
-       deactivate_task(rq, p, 0);
         set_task_cpu(p, later_rq->cpu);
-       activate_task(later_rq, p, 0);
-
-       if (!fallback)
-               resched_curr(later_rq);
-
         double_unlock_balance(later_rq, rq);
  
         return later_rq;
@@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
   * one, and to (try to!) reconcile itself with its own scheduling
   * parameters.
   */
-static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
-                                      struct sched_dl_entity *pi_se)
+static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
  {
         struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
         struct rq *rq = rq_of_dl_rq(dl_rq);
  
+       WARN_ON(dl_se->dl_boosted);
         WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
  
         /*
@@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
          * future; in fact, we must consider execution overheads (time
          * spent on hardirq context, etc.).
          */
-       dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
-       dl_se->runtime = pi_se->dl_runtime;
+       dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
+       dl_se->runtime = dl_se->dl_runtime;
  }
  
  /*
@@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
                 goto unlock;
         }
  
-       enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
-       if (dl_task(rq->curr))
-               check_preempt_curr_dl(rq, p, 0);
-       else
-               resched_curr(rq);
-
  #ifdef CONFIG_SMP
-       /*
-        * Perform balancing operations here; after the replenishments.  We
-        * cannot drop rq->lock before this, otherwise the assertion in
-        * start_dl_timer() about not missing updates is not true.
-        *
-        * If we find that the rq the task was on is no longer available, we
-        * need to select a new rq.
-        *
-        * XXX figure out if select_task_rq_dl() deals with offline cpus.
-        */
         if (unlikely(!rq->online)) {
+               /*
+                * If the runqueue is no longer available, migrate the
+                * task elsewhere. This necessarily changes rq.
+                */
                 lockdep_unpin_lock(&rq->lock, rf.cookie);
                 rq = dl_task_offline_migration(rq, p);
                 rf.cookie = lockdep_pin_lock(&rq->lock);
+
+               /*
+                * Now that the task has been migrated to the new RQ and we
+                * have that locked, proceed as normal and enqueue the task
+                * there.
+                */
         }
+#endif
+
+       enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
+       if (dl_task(rq->curr))
+               check_preempt_curr_dl(rq, p, 0);
+       else
+               resched_curr(rq);
  
+#ifdef CONFIG_SMP
         /*
          * Queueing this task back might have overloaded rq, check if we need
          * to kick someone away.
@@ -798,7 +788,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
         if (dl_rq->earliest_dl.curr == 0 ||
             dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
                 dl_rq->earliest_dl.curr = deadline;
-               cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
+               cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
         }
  }
  
@@ -813,14 +803,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
         if (!dl_rq->dl_nr_running) {
                 dl_rq->earliest_dl.curr = 0;
                 dl_rq->earliest_dl.next = 0;
-               cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+               cpudl_clear(&rq->rd->cpudl, rq->cpu);
         } else {
                 struct rb_node *leftmost = dl_rq->rb_leftmost;
                 struct sched_dl_entity *entry;
  
                 entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
                 dl_rq->earliest_dl.curr = entry->deadline;
-               cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
+               cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
         }
  }
  
@@ -1671,7 +1661,7 @@ static void rq_online_dl(struct rq *rq)
  
         cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
         if (rq->dl.dl_nr_running > 0)
-               cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
+               cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
  }
  
  /* Assumes rq->lock is held */
@@ -1680,7 +1670,7 @@ static void rq_offline_dl(struct rq *rq)
         if (rq->dl.overloaded)
                 dl_clear_overload(rq);
  
-       cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+       cpudl_clear(&rq->rd->cpudl, rq->cpu);
         cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
  }
  
@@ -1723,10 +1713,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
   */
  static void switched_to_dl(struct rq *rq, struct task_struct *p)
  {
+
+       /* If p is not queued we will update its parameters at next wakeup. */
+       if (!task_on_rq_queued(p))
+               return;
+
+       /*
+        * If p is boosted we already updated its params in
+        * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
+        * p's deadline being now already after rq_clock(rq).
+        */
         if (dl_time_before(p->dl.deadline, rq_clock(rq)))
-               setup_new_dl_entity(&p->dl, &p->dl);
+               setup_new_dl_entity(&p->dl);
  
-       if (task_on_rq_queued(p) && rq->curr != p) {
+       if (rq->curr != p) {
  #ifdef CONFIG_SMP
                 if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
                         queue_push_tasks(rq);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c

index 2a0a999..1393588 100644 (file)
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
  
  #define P(F) \
         SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
+#define P_SCHEDSTAT(F) \
+       SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)schedstat_val(F))
  #define PN(F) \
         SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN_SCHEDSTAT(F) \
+       SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
  
         if (!se)
                 return;
@@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
         PN(se->exec_start);
         PN(se->vruntime);
         PN(se->sum_exec_runtime);
-#ifdef CONFIG_SCHEDSTATS
         if (schedstat_enabled()) {
-               PN(se->statistics.wait_start);
-               PN(se->statistics.sleep_start);
-               PN(se->statistics.block_start);
-               PN(se->statistics.sleep_max);
-               PN(se->statistics.block_max);
-               PN(se->statistics.exec_max);
-               PN(se->statistics.slice_max);
-               PN(se->statistics.wait_max);
-               PN(se->statistics.wait_sum);
-               P(se->statistics.wait_count);
+               PN_SCHEDSTAT(se->statistics.wait_start);
+               PN_SCHEDSTAT(se->statistics.sleep_start);
+               PN_SCHEDSTAT(se->statistics.block_start);
+               PN_SCHEDSTAT(se->statistics.sleep_max);
+               PN_SCHEDSTAT(se->statistics.block_max);
+               PN_SCHEDSTAT(se->statistics.exec_max);
+               PN_SCHEDSTAT(se->statistics.slice_max);
+               PN_SCHEDSTAT(se->statistics.wait_max);
+               PN_SCHEDSTAT(se->statistics.wait_sum);
+               P_SCHEDSTAT(se->statistics.wait_count);
         }
-#endif
         P(se->load.weight);
  #ifdef CONFIG_SMP
         P(se->avg.load_avg);
         P(se->avg.util_avg);
  #endif
+
+#undef PN_SCHEDSTAT
  #undef PN
+#undef P_SCHEDSTAT
  #undef P
  }
  #endif
@@ -429,9 +434,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
                 p->prio);
  
         SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
-               SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)),
+               SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
                 SPLIT_NS(p->se.sum_exec_runtime),
-               SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime)));
+               SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));
  
  #ifdef CONFIG_NUMA_BALANCING
         SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
@@ -626,9 +631,7 @@ do {                                                                        \
  #undef P64
  #endif
  
-#ifdef CONFIG_SCHEDSTATS
-#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
-
+#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, schedstat_val(rq->n));
         if (schedstat_enabled()) {
                 P(yld_count);
                 P(sched_count);
@@ -636,9 +639,8 @@ do {                                                                        \
                 P(ttwu_count);
                 P(ttwu_local);
         }
-
  #undef P
-#endif
+
         spin_lock_irqsave(&sched_debug_lock, flags);
         print_cfs_stats(m, cpu);
         print_rt_stats(m, cpu);
@@ -868,10 +870,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
         SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
  #define P(F) \
         SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
+#define P_SCHEDSTAT(F) \
+       SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
  #define __PN(F) \
         SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
  #define PN(F) \
         SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+#define PN_SCHEDSTAT(F) \
+       SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
  
         PN(se.exec_start);
         PN(se.vruntime);
@@ -881,37 +887,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
  
         P(se.nr_migrations);
  
-#ifdef CONFIG_SCHEDSTATS
         if (schedstat_enabled()) {
                 u64 avg_atom, avg_per_cpu;
  
-               PN(se.statistics.sum_sleep_runtime);
-               PN(se.statistics.wait_start);
-               PN(se.statistics.sleep_start);
-               PN(se.statistics.block_start);
-               PN(se.statistics.sleep_max);
-               PN(se.statistics.block_max);
-               PN(se.statistics.exec_max);
-               PN(se.statistics.slice_max);
-               PN(se.statistics.wait_max);
-               PN(se.statistics.wait_sum);
-               P(se.statistics.wait_count);
-               PN(se.statistics.iowait_sum);
-               P(se.statistics.iowait_count);
-               P(se.statistics.nr_migrations_cold);
-               P(se.statistics.nr_failed_migrations_affine);
-               P(se.statistics.nr_failed_migrations_running);
-               P(se.statistics.nr_failed_migrations_hot);
-               P(se.statistics.nr_forced_migrations);
-               P(se.statistics.nr_wakeups);
-               P(se.statistics.nr_wakeups_sync);
-               P(se.statistics.nr_wakeups_migrate);
-               P(se.statistics.nr_wakeups_local);
-               P(se.statistics.nr_wakeups_remote);
-               P(se.statistics.nr_wakeups_affine);
-               P(se.statistics.nr_wakeups_affine_attempts);
-               P(se.statistics.nr_wakeups_passive);
-               P(se.statistics.nr_wakeups_idle);
+               PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
+               PN_SCHEDSTAT(se.statistics.wait_start);
+               PN_SCHEDSTAT(se.statistics.sleep_start);
+               PN_SCHEDSTAT(se.statistics.block_start);
+               PN_SCHEDSTAT(se.statistics.sleep_max);
+               PN_SCHEDSTAT(se.statistics.block_max);
+               PN_SCHEDSTAT(se.statistics.exec_max);
+               PN_SCHEDSTAT(se.statistics.slice_max);
+               PN_SCHEDSTAT(se.statistics.wait_max);
+               PN_SCHEDSTAT(se.statistics.wait_sum);
+               P_SCHEDSTAT(se.statistics.wait_count);
+               PN_SCHEDSTAT(se.statistics.iowait_sum);
+               P_SCHEDSTAT(se.statistics.iowait_count);
+               P_SCHEDSTAT(se.statistics.nr_migrations_cold);
+               P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
+               P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
+               P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
+               P_SCHEDSTAT(se.statistics.nr_forced_migrations);
+               P_SCHEDSTAT(se.statistics.nr_wakeups);
+               P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
+               P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
+               P_SCHEDSTAT(se.statistics.nr_wakeups_local);
+               P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
+               P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
+               P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
+               P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
+               P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
  
                 avg_atom = p->se.sum_exec_runtime;
                 if (nr_switches)
@@ -930,7 +935,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
                 __PN(avg_atom);
                 __PN(avg_per_cpu);
         }
-#endif
+
         __P(nr_switches);
         SEQ_printf(m, "%-45s:%21Ld\n",
                    "nr_voluntary_switches", (long long)p->nvcsw);
@@ -947,8 +952,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
  #endif
         P(policy);
         P(prio);
+#undef PN_SCHEDSTAT
  #undef PN
  #undef __PN
+#undef P_SCHEDSTAT
  #undef P
  #undef __P
  
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 039de34..8fb4d19 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
  unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
  #endif
  
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * margin < capacity * 1024
+ */
+unsigned int capacity_margin = 1280; /* ~20% */
+
  static inline void update_load_add(struct load_weight *lw, unsigned long inc)
  {
         lw->weight += inc;
@@ -656,7 +662,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  }
  
  #ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
  static unsigned long task_h_load(struct task_struct *p);
  
  /*
@@ -726,7 +732,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
         struct sched_avg *sa = &se->avg;
         long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
         u64 now = cfs_rq_clock_task(cfs_rq);
-       int tg_update;
  
         if (cap > 0) {
                 if (cfs_rq->avg.util_avg != 0) {
@@ -759,10 +764,9 @@ void post_init_entity_util_avg(struct sched_entity *se)
                 }
         }
  
-       tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+       update_cfs_rq_load_avg(now, cfs_rq, false);
         attach_entity_load_avg(cfs_rq, se);
-       if (tg_update)
-               update_tg_load_avg(cfs_rq, false);
+       update_tg_load_avg(cfs_rq, false);
  }
  
  #else /* !CONFIG_SMP */
@@ -799,7 +803,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
                       max(delta_exec, curr->statistics.exec_max));
  
         curr->sum_exec_runtime += delta_exec;
-       schedstat_add(cfs_rq, exec_clock, delta_exec);
+       schedstat_add(cfs_rq->exec_clock, delta_exec);
  
         curr->vruntime += calc_delta_fair(delta_exec, curr);
         update_min_vruntime(cfs_rq);
@@ -820,26 +824,34 @@ static void update_curr_fair(struct rq *rq)
         update_curr(cfs_rq_of(&rq->curr->se));
  }
  
-#ifdef CONFIG_SCHEDSTATS
  static inline void
  update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       u64 wait_start = rq_clock(rq_of(cfs_rq));
+       u64 wait_start, prev_wait_start;
+
+       if (!schedstat_enabled())
+               return;
+
+       wait_start = rq_clock(rq_of(cfs_rq));
+       prev_wait_start = schedstat_val(se->statistics.wait_start);
  
         if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
-           likely(wait_start > se->statistics.wait_start))
-               wait_start -= se->statistics.wait_start;
+           likely(wait_start > prev_wait_start))
+               wait_start -= prev_wait_start;
  
-       se->statistics.wait_start = wait_start;
+       schedstat_set(se->statistics.wait_start, wait_start);
  }
  
-static void
+static inline void
  update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
         struct task_struct *p;
         u64 delta;
  
-       delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+       if (!schedstat_enabled())
+               return;
+
+       delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
  
         if (entity_is_task(se)) {
                 p = task_of(se);
@@ -849,35 +861,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
                          * time stamp can be adjusted to accumulate wait time
                          * prior to migration.
                          */
-                       se->statistics.wait_start = delta;
+                       schedstat_set(se->statistics.wait_start, delta);
                         return;
                 }
                 trace_sched_stat_wait(p, delta);
         }
  
-       se->statistics.wait_max = max(se->statistics.wait_max, delta);
-       se->statistics.wait_count++;
-       se->statistics.wait_sum += delta;
-       se->statistics.wait_start = 0;
+       schedstat_set(se->statistics.wait_max,
+                     max(schedstat_val(se->statistics.wait_max), delta));
+       schedstat_inc(se->statistics.wait_count);
+       schedstat_add(se->statistics.wait_sum, delta);
+       schedstat_set(se->statistics.wait_start, 0);
+}
+
+static inline void
+update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       struct task_struct *tsk = NULL;
+       u64 sleep_start, block_start;
+
+       if (!schedstat_enabled())
+               return;
+
+       sleep_start = schedstat_val(se->statistics.sleep_start);
+       block_start = schedstat_val(se->statistics.block_start);
+
+       if (entity_is_task(se))
+               tsk = task_of(se);
+
+       if (sleep_start) {
+               u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
+
+               if ((s64)delta < 0)
+                       delta = 0;
+
+               if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
+                       schedstat_set(se->statistics.sleep_max, delta);
+
+               schedstat_set(se->statistics.sleep_start, 0);
+               schedstat_add(se->statistics.sum_sleep_runtime, delta);
+
+               if (tsk) {
+                       account_scheduler_latency(tsk, delta >> 10, 1);
+                       trace_sched_stat_sleep(tsk, delta);
+               }
+       }
+       if (block_start) {
+               u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
+
+               if ((s64)delta < 0)
+                       delta = 0;
+
+               if (unlikely(delta > schedstat_val(se->statistics.block_max)))
+                       schedstat_set(se->statistics.block_max, delta);
+
+               schedstat_set(se->statistics.block_start, 0);
+               schedstat_add(se->statistics.sum_sleep_runtime, delta);
+
+               if (tsk) {
+                       if (tsk->in_iowait) {
+                               schedstat_add(se->statistics.iowait_sum, delta);
+                               schedstat_inc(se->statistics.iowait_count);
+                               trace_sched_stat_iowait(tsk, delta);
+                       }
+
+                       trace_sched_stat_blocked(tsk, delta);
+
+                       /*
+                        * Blocking time is in units of nanosecs, so shift by
+                        * 20 to get a milliseconds-range estimation of the
+                        * amount of time that the task spent sleeping:
+                        */
+                       if (unlikely(prof_on == SLEEP_PROFILING)) {
+                               profile_hits(SLEEP_PROFILING,
+                                               (void *)get_wchan(tsk),
+                                               delta >> 20);
+                       }
+                       account_scheduler_latency(tsk, delta >> 10, 0);
+               }
+       }
  }
  
  /*
   * Task is being enqueued - update stats:
   */
  static inline void
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
+       if (!schedstat_enabled())
+               return;
+
         /*
          * Are we enqueueing a waiting task? (for current tasks
          * a dequeue/enqueue event is a NOP)
          */
         if (se != cfs_rq->curr)
                 update_stats_wait_start(cfs_rq, se);
+
+       if (flags & ENQUEUE_WAKEUP)
+               update_stats_enqueue_sleeper(cfs_rq, se);
  }
  
  static inline void
  update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
+
+       if (!schedstat_enabled())
+               return;
+
         /*
          * Mark the end of the wait period if dequeueing a
          * waiting task:
@@ -885,40 +976,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         if (se != cfs_rq->curr)
                 update_stats_wait_end(cfs_rq, se);
  
-       if (flags & DEQUEUE_SLEEP) {
-               if (entity_is_task(se)) {
-                       struct task_struct *tsk = task_of(se);
+       if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
+               struct task_struct *tsk = task_of(se);
  
-                       if (tsk->state & TASK_INTERRUPTIBLE)
-                               se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
-                       if (tsk->state & TASK_UNINTERRUPTIBLE)
-                               se->statistics.block_start = rq_clock(rq_of(cfs_rq));
-               }
+               if (tsk->state & TASK_INTERRUPTIBLE)
+                       schedstat_set(se->statistics.sleep_start,
+                                     rq_clock(rq_of(cfs_rq)));
+               if (tsk->state & TASK_UNINTERRUPTIBLE)
+                       schedstat_set(se->statistics.block_start,
+                                     rq_clock(rq_of(cfs_rq)));
         }
-
-}
-#else
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
  }
  
-static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-{
-}
-#endif
-
  /*
   * We are picking a new current task - update its stats:
   */
@@ -1514,7 +1583,8 @@ balance:
          * Call select_idle_sibling to maybe find a better one.
          */
         if (!cur)
-               env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+               env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
+                                                  env->dst_cpu);
  
  assign:
         task_numa_assign(env, cur, imp);
@@ -2803,9 +2873,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * Updating tg's load_avg is necessary before update_cfs_share (which is done)
- * and effective_load (which is not done because it is too costly).
+/**
+ * update_tg_load_avg - update the tg's load avg
+ * @cfs_rq: the cfs_rq whose avg changed
+ * @force: update regardless of how small the difference
+ *
+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
+ * However, because tg->load_avg is a global value there are performance
+ * considerations.
+ *
+ * In order to avoid having to look at the other cfs_rq's, we use a
+ * differential update where we store the last value we propagated. This in
+ * turn allows skipping updates if the differential is 'small'.
+ *
+ * Updating tg's load_avg is necessary before update_cfs_share() (which is
+ * done) and effective_load() (which is not done because it is too costly).
   */
  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
  {
@@ -2931,10 +3013,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
   *
   * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
   *
- * Returns true if the load decayed or we removed utilization. It is expected
- * that one calls update_tg_load_avg() on this condition, but after you've
- * modified the cfs_rq avg (attach/detach), such that we propagate the new
- * avg up.
+ * Returns true if the load decayed or we removed load.
+ *
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ * call update_tg_load_avg() when this function returns true.
   */
  static inline int
  update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@ -3183,68 +3265,6 @@ static inline int idle_balance(struct rq *rq)
  
  #endif /* CONFIG_SMP */
  
-static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-#ifdef CONFIG_SCHEDSTATS
-       struct task_struct *tsk = NULL;
-
-       if (entity_is_task(se))
-               tsk = task_of(se);
-
-       if (se->statistics.sleep_start) {
-               u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
-
-               if ((s64)delta < 0)
-                       delta = 0;
-
-               if (unlikely(delta > se->statistics.sleep_max))
-                       se->statistics.sleep_max = delta;
-
-               se->statistics.sleep_start = 0;
-               se->statistics.sum_sleep_runtime += delta;
-
-               if (tsk) {
-                       account_scheduler_latency(tsk, delta >> 10, 1);
-                       trace_sched_stat_sleep(tsk, delta);
-               }
-       }
-       if (se->statistics.block_start) {
-               u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
-
-               if ((s64)delta < 0)
-                       delta = 0;
-
-               if (unlikely(delta > se->statistics.block_max))
-                       se->statistics.block_max = delta;
-
-               se->statistics.block_start = 0;
-               se->statistics.sum_sleep_runtime += delta;
-
-               if (tsk) {
-                       if (tsk->in_iowait) {
-                               se->statistics.iowait_sum += delta;
-                               se->statistics.iowait_count++;
-                               trace_sched_stat_iowait(tsk, delta);
-                       }
-
-                       trace_sched_stat_blocked(tsk, delta);
-
-                       /*
-                        * Blocking time is in units of nanosecs, so shift by
-                        * 20 to get a milliseconds-range estimation of the
-                        * amount of time that the task spent sleeping:
-                        */
-                       if (unlikely(prof_on == SLEEP_PROFILING)) {
-                               profile_hits(SLEEP_PROFILING,
-                                               (void *)get_wchan(tsk),
-                                               delta >> 20);
-                       }
-                       account_scheduler_latency(tsk, delta >> 10, 0);
-               }
-       }
-#endif
-}
-
  static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  #ifdef CONFIG_SCHED_DEBUG
@@ -3254,7 +3274,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 d = -d;
  
         if (d > 3*sysctl_sched_latency)
-               schedstat_inc(cfs_rq, nr_spread_over);
+               schedstat_inc(cfs_rq->nr_spread_over);
  #endif
  }
  
@@ -3371,17 +3391,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         account_entity_enqueue(cfs_rq, se);
         update_cfs_shares(cfs_rq);
  
-       if (flags & ENQUEUE_WAKEUP) {
+       if (flags & ENQUEUE_WAKEUP)
                 place_entity(cfs_rq, se, 0);
-               if (schedstat_enabled())
-                       enqueue_sleeper(cfs_rq, se);
-       }
  
         check_schedstat_required();
-       if (schedstat_enabled()) {
-               update_stats_enqueue(cfs_rq, se);
-               check_spread(cfs_rq, se);
-       }
+       update_stats_enqueue(cfs_rq, se, flags);
+       check_spread(cfs_rq, se);
         if (!curr)
                 __enqueue_entity(cfs_rq, se);
         se->on_rq = 1;
@@ -3448,8 +3463,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         update_curr(cfs_rq);
         dequeue_entity_load_avg(cfs_rq, se);
  
-       if (schedstat_enabled())
-               update_stats_dequeue(cfs_rq, se, flags);
+       update_stats_dequeue(cfs_rq, se, flags);
  
         clear_buddies(cfs_rq, se);
  
@@ -3523,25 +3537,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                  * a CPU. So account for the time it spent waiting on the
                  * runqueue.
                  */
-               if (schedstat_enabled())
-                       update_stats_wait_end(cfs_rq, se);
+               update_stats_wait_end(cfs_rq, se);
                 __dequeue_entity(cfs_rq, se);
                 update_load_avg(se, 1);
         }
  
         update_stats_curr_start(cfs_rq, se);
         cfs_rq->curr = se;
-#ifdef CONFIG_SCHEDSTATS
+
         /*
          * Track our maximum slice length, if the CPU's load is at
          * least twice that of our own weight (i.e. dont track it
          * when there are only lesser-weight tasks around):
          */
         if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
-               se->statistics.slice_max = max(se->statistics.slice_max,
-                       se->sum_exec_runtime - se->prev_sum_exec_runtime);
+               schedstat_set(se->statistics.slice_max,
+                       max((u64)schedstat_val(se->statistics.slice_max),
+                           se->sum_exec_runtime - se->prev_sum_exec_runtime));
         }
-#endif
+
         se->prev_sum_exec_runtime = se->sum_exec_runtime;
  }
  
@@ -3620,13 +3634,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
         /* throttle cfs_rqs exceeding runtime */
         check_cfs_rq_runtime(cfs_rq);
  
-       if (schedstat_enabled()) {
-               check_spread(cfs_rq, prev);
-               if (prev->on_rq)
-                       update_stats_wait_start(cfs_rq, prev);
-       }
+       check_spread(cfs_rq, prev);
  
         if (prev->on_rq) {
+               update_stats_wait_start(cfs_rq, prev);
                 /* Put 'current' back into the tree. */
                 __enqueue_entity(cfs_rq, prev);
                 /* in !on_rq case, update occurred at dequeue */
@@ -4458,7 +4469,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
  
         WARN_ON(task_rq(p) != rq);
  
-       if (cfs_rq->nr_running > 1) {
+       if (rq->cfs.h_nr_running > 1) {
                 u64 slice = sched_slice(cfs_rq, se);
                 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
                 s64 delta = slice - ran;
@@ -5091,18 +5102,18 @@ static int wake_wide(struct task_struct *p)
         return 1;
  }
  
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+                      int prev_cpu, int sync)
  {
         s64 this_load, load;
         s64 this_eff_load, prev_eff_load;
-       int idx, this_cpu, prev_cpu;
+       int idx, this_cpu;
         struct task_group *tg;
         unsigned long weight;
         int balanced;
  
         idx       = sd->wake_idx;
         this_cpu  = smp_processor_id();
-       prev_cpu  = task_cpu(p);
         load      = source_load(prev_cpu, idx);
         this_load = target_load(this_cpu, idx);
  
@@ -5146,13 +5157,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
  
         balanced = this_eff_load <= prev_eff_load;
  
-       schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
+       schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
  
         if (!balanced)
                 return 0;
  
-       schedstat_inc(sd, ttwu_move_affine);
-       schedstat_inc(p, se.statistics.nr_wakeups_affine);
+       schedstat_inc(sd->ttwu_move_affine);
+       schedstat_inc(p->se.statistics.nr_wakeups_affine);
  
         return 1;
  }
@@ -5228,6 +5239,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
         int shallowest_idle_cpu = -1;
         int i;
  
+       /* Check if we have any choice: */
+       if (group->group_weight == 1)
+               return cpumask_first(sched_group_cpus(group));
+
         /* Traverse only the allowed CPUs */
         for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
                 if (idle_cpu(i)) {
@@ -5267,11 +5282,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  /*
   * Try and locate an idle CPU in the sched_domain.
   */
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
  {
         struct sched_domain *sd;
         struct sched_group *sg;
-       int i = task_cpu(p);
  
         if (idle_cpu(target))
                 return target;
@@ -5279,8 +5293,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
         /*
          * If the prevous cpu is cache affine and idle, don't be stupid.
          */
-       if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
-               return i;
+       if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+               return prev;
  
         /*
          * Otherwise, iterate the domains and find an eligible idle cpu.
@@ -5301,6 +5315,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
         for_each_lower_domain(sd) {
                 sg = sd->groups;
                 do {
+                       int i;
+
                         if (!cpumask_intersects(sched_group_cpus(sg),
                                                 tsk_cpus_allowed(p)))
                                 goto next;
@@ -5360,6 +5376,32 @@ static int cpu_util(int cpu)
         return (util >= capacity) ? capacity : util;
  }
  
+static inline int task_util(struct task_struct *p)
+{
+       return p->se.avg.util_avg;
+}
+
+/*
+ * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
+ * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
+ *
+ * In that case WAKE_AFFINE doesn't make sense and we'll let
+ * BALANCE_WAKE sort things out.
+ */
+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+{
+       long min_cap, max_cap;
+
+       min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
+       max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
+
+       /* Minimum capacity is close to max, no need to abort wake_affine */
+       if (max_cap - min_cap < max_cap >> 3)
+               return 0;
+
+       return min_cap * 1024 < task_util(p) * capacity_margin;
+}
+
  /*
   * select_task_rq_fair: Select target runqueue for the waking task in domains
   * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -5383,7 +5425,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
  
         if (sd_flag & SD_BALANCE_WAKE) {
                 record_wakee(p);
-               want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+               want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
+                             && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
         }
  
         rcu_read_lock();
@@ -5409,13 +5452,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
  
         if (affine_sd) {
                 sd = NULL; /* Prefer wake_affine over balance flags */
-               if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+               if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
                         new_cpu = cpu;
         }
  
         if (!sd) {
                 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
-                       new_cpu = select_idle_sibling(p, new_cpu);
+                       new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
  
         } else while (sd) {
                 struct sched_group *group;
@@ -5939,7 +5982,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
   *
   * The adjacency matrix of the resulting graph is given by:
   *
- *             log_2 n     
+ *             log_2 n
   *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
   *             k = 0
   *
@@ -5985,7 +6028,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
   *
   * [XXX write more on how we solve this.. _after_ merging pjt's patches that
   *      rewrite all of this once again.]
- */ 
+ */
  
  static unsigned long __read_mostly max_load_balance_interval = HZ/10;
  
@@ -6133,7 +6176,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
         if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
                 int cpu;
  
-               schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+               schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
  
                 env->flags |= LBF_SOME_PINNED;
  
@@ -6164,7 +6207,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
         env->flags &= ~LBF_ALL_PINNED;
  
         if (task_running(env->src_rq, p)) {
-               schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+               schedstat_inc(p->se.statistics.nr_failed_migrations_running);
                 return 0;
         }
  
@@ -6181,13 +6224,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
         if (tsk_cache_hot <= 0 ||
             env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
                 if (tsk_cache_hot == 1) {
-                       schedstat_inc(env->sd, lb_hot_gained[env->idle]);
-                       schedstat_inc(p, se.statistics.nr_forced_migrations);
+                       schedstat_inc(env->sd->lb_hot_gained[env->idle]);
+                       schedstat_inc(p->se.statistics.nr_forced_migrations);
                 }
                 return 1;
         }
  
-       schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+       schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
         return 0;
  }
  
@@ -6227,7 +6270,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
                  * so we can safely collect stats here rather than
                  * inside detach_tasks().
                  */
-               schedstat_inc(env->sd, lb_gained[env->idle]);
+               schedstat_inc(env->sd->lb_gained[env->idle]);
                 return p;
         }
         return NULL;
@@ -6319,7 +6362,7 @@ next:
          * so we can safely collect detach_one_task() stats here rather
          * than inside detach_one_task().
          */
-       schedstat_add(env->sd, lb_gained[env->idle], detached);
+       schedstat_add(env->sd->lb_gained[env->idle], detached);
  
         return detached;
  }
@@ -6647,7 +6690,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
                 /*
                  * !SD_OVERLAP domains can assume that child groups
                  * span the current group.
-                */ 
+                */
  
                 group = child->groups;
                 do {
@@ -7147,7 +7190,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
                 if (load_above_capacity > busiest->group_capacity) {
                         load_above_capacity -= busiest->group_capacity;
-                       load_above_capacity *= NICE_0_LOAD;
+                       load_above_capacity *= scale_load_down(NICE_0_LOAD);
                         load_above_capacity /= busiest->group_capacity;
                 } else
                         load_above_capacity = ~0UL;
@@ -7460,7 +7503,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
  
         cpumask_copy(cpus, cpu_active_mask);
  
-       schedstat_inc(sd, lb_count[idle]);
+       schedstat_inc(sd->lb_count[idle]);
  
  redo:
         if (!should_we_balance(&env)) {
@@ -7470,19 +7513,19 @@ redo:
  
         group = find_busiest_group(&env);
         if (!group) {
-               schedstat_inc(sd, lb_nobusyg[idle]);
+               schedstat_inc(sd->lb_nobusyg[idle]);
                 goto out_balanced;
         }
  
         busiest = find_busiest_queue(&env, group);
         if (!busiest) {
-               schedstat_inc(sd, lb_nobusyq[idle]);
+               schedstat_inc(sd->lb_nobusyq[idle]);
                 goto out_balanced;
         }
  
         BUG_ON(busiest == env.dst_rq);
  
-       schedstat_add(sd, lb_imbalance[idle], env.imbalance);
+       schedstat_add(sd->lb_imbalance[idle], env.imbalance);
  
         env.src_cpu = busiest->cpu;
         env.src_rq = busiest;
@@ -7589,7 +7632,7 @@ more_balance:
         }
  
         if (!ld_moved) {
-               schedstat_inc(sd, lb_failed[idle]);
+               schedstat_inc(sd->lb_failed[idle]);
                 /*
                  * Increment the failure counter only on periodic balance.
                  * We do not want newidle balance, which can be very
@@ -7672,7 +7715,7 @@ out_all_pinned:
          * we can't migrate them. Let the imbalance flag set so parent level
          * can try to migrate them.
          */
-       schedstat_inc(sd, lb_balanced[idle]);
+       schedstat_inc(sd->lb_balanced[idle]);
  
         sd->nr_balance_failed = 0;
  
@@ -7704,11 +7747,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
  }
  
  static inline void
-update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
  {
         unsigned long interval, next;
  
-       interval = get_sd_balance_interval(sd, cpu_busy);
+       /* used by idle balance, so cpu_busy = 0 */
+       interval = get_sd_balance_interval(sd, 0);
         next = sd->last_balance + interval;
  
         if (time_after(*next_balance, next))
@@ -7738,7 +7782,7 @@ static int idle_balance(struct rq *this_rq)
                 rcu_read_lock();
                 sd = rcu_dereference_check_sched_domain(this_rq->sd);
                 if (sd)
-                       update_next_balance(sd, 0, &next_balance);
+                       update_next_balance(sd, &next_balance);
                 rcu_read_unlock();
  
                 goto out;
@@ -7756,7 +7800,7 @@ static int idle_balance(struct rq *this_rq)
                         continue;
  
                 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
-                       update_next_balance(sd, 0, &next_balance);
+                       update_next_balance(sd, &next_balance);
                         break;
                 }
  
@@ -7774,7 +7818,7 @@ static int idle_balance(struct rq *this_rq)
                         curr_cost += domain_cost;
                 }
  
-               update_next_balance(sd, 0, &next_balance);
+               update_next_balance(sd, &next_balance);
  
                 /*
                  * Stop searching for tasks to pull if there are
@@ -7864,15 +7908,15 @@ static int active_load_balance_cpu_stop(void *data)
                         .idle           = CPU_IDLE,
                 };
  
-               schedstat_inc(sd, alb_count);
+               schedstat_inc(sd->alb_count);
  
                 p = detach_one_task(&env);
                 if (p) {
-                       schedstat_inc(sd, alb_pushed);
+                       schedstat_inc(sd->alb_pushed);
                         /* Active balancing done, reset the failure counter. */
                         sd->nr_balance_failed = 0;
                 } else {
-                       schedstat_inc(sd, alb_failed);
+                       schedstat_inc(sd->alb_failed);
                 }
         }
         rcu_read_unlock();
@@ -8441,7 +8485,6 @@ static void detach_task_cfs_rq(struct task_struct *p)
         struct sched_entity *se = &p->se;
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         u64 now = cfs_rq_clock_task(cfs_rq);
-       int tg_update;
  
         if (!vruntime_normalized(p)) {
                 /*
@@ -8453,10 +8496,9 @@ static void detach_task_cfs_rq(struct task_struct *p)
         }
  
         /* Catch up with the cfs_rq and remove our load when we leave */
-       tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+       update_cfs_rq_load_avg(now, cfs_rq, false);
         detach_entity_load_avg(cfs_rq, se);
-       if (tg_update)
-               update_tg_load_avg(cfs_rq, false);
+       update_tg_load_avg(cfs_rq, false);
  }
  
  static void attach_task_cfs_rq(struct task_struct *p)
@@ -8464,7 +8506,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
         struct sched_entity *se = &p->se;
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         u64 now = cfs_rq_clock_task(cfs_rq);
-       int tg_update;
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
         /*
@@ -8475,10 +8516,9 @@ static void attach_task_cfs_rq(struct task_struct *p)
  #endif
  
         /* Synchronize task with its cfs_rq */
-       tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+       update_cfs_rq_load_avg(now, cfs_rq, false);
         attach_entity_load_avg(cfs_rq, se);
-       if (tg_update)
-               update_tg_load_avg(cfs_rq, false);
+       update_tg_load_avg(cfs_rq, false);
  
         if (!vruntime_normalized(p))
                 se->vruntime += cfs_rq->min_vruntime;
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c

index 2ce5458..dedc81e 100644 (file)
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -28,7 +28,7 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie c
  {
         put_prev_task(rq, prev);
  
-       schedstat_inc(rq, sched_goidle);
+       schedstat_inc(rq->sched_goidle);
         return rq->idle;
  }
  
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index c64fc51..420c05d 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -565,6 +565,8 @@ struct root_domain {
          */
         cpumask_var_t rto_mask;
         struct cpupri cpupri;
+
+       unsigned long max_cpu_capacity;
  };
  
  extern struct root_domain def_root_domain;
@@ -597,7 +599,6 @@ struct rq {
  #ifdef CONFIG_SMP
         unsigned long last_load_update_tick;
  #endif /* CONFIG_SMP */
-       u64 nohz_stamp;
         unsigned long nohz_flags;
  #endif /* CONFIG_NO_HZ_COMMON */
  #ifdef CONFIG_NO_HZ_FULL
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h

index 78955cb..34659a8 100644 (file)
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
         if (rq)
                 rq->rq_sched_info.run_delay += delta;
  }
-# define schedstat_enabled()           static_branch_unlikely(&sched_schedstats)
-# define schedstat_inc(rq, field)      do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
-# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
-# define schedstat_set(var, val)       do { if (schedstat_enabled()) { var = (val); } } while (0)
-# define schedstat_val(rq, field)      ((schedstat_enabled()) ? (rq)->field : 0)
+#define schedstat_enabled()            static_branch_unlikely(&sched_schedstats)
+#define schedstat_inc(var)             do { if (schedstat_enabled()) { var++; } } while (0)
+#define schedstat_add(var, amt)                do { if (schedstat_enabled()) { var += (amt); } } while (0)
+#define schedstat_set(var, val)                do { if (schedstat_enabled()) { var = (val); } } while (0)
+#define schedstat_val(var)             (var)
+#define schedstat_val_or_zero(var)     ((schedstat_enabled()) ? (var) : 0)
  
  #else /* !CONFIG_SCHEDSTATS */
  static inline void
@@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
  static inline void
  rq_sched_info_depart(struct rq *rq, unsigned long long delta)
  {}
-# define schedstat_enabled()           0
-# define schedstat_inc(rq, field)      do { } while (0)
-# define schedstat_add(rq, field, amt) do { } while (0)
-# define schedstat_set(var, val)       do { } while (0)
-# define schedstat_val(rq, field)      0
-#endif
+#define schedstat_enabled()            0
+#define schedstat_inc(var)             do { } while (0)
+#define schedstat_add(var, amt)                do { } while (0)
+#define schedstat_set(var, val)                do { } while (0)
+#define schedstat_val(var)             0
+#define schedstat_val_or_zero(var)     0
+#endif /* CONFIG_SCHEDSTATS */
  
  #ifdef CONFIG_SCHED_INFO
  static inline void sched_info_reset_dequeued(struct task_struct *t)
diff --git a/kernel/smpboot.c b/kernel/smpboot.c

index 13bc43d..fc0d827 100644 (file)
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -122,12 +122,12 @@ static int smpboot_thread_fn(void *data)
  
                 if (kthread_should_park()) {
                         __set_current_state(TASK_RUNNING);
-                       preempt_enable();
                         if (ht->park && td->status == HP_THREAD_ACTIVE) {
                                 BUG_ON(td->cpu != smp_processor_id());
                                 ht->park(td->cpu);
                                 td->status = HP_THREAD_PARKED;
                         }
+                       preempt_enable();
                         kthread_parkme();
                         /* We might have been woken for stop */
                         continue;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c

index 4a1ca5f..082e71f 100644 (file)
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -126,6 +126,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
         cpu_stop_init_done(&done, 1);
         if (!cpu_stop_queue_work(cpu, &work))
                 return -ENOENT;
+       /*
+        * In case @cpu == smp_processor_id() we can avoid a sleep+wakeup
+        * cycle by doing a preemption:
+        */
+       cond_resched();
         wait_for_completion(&done.completion);
         return done.ret;
  }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index 53ae6d0..283583f 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1165,7 +1165,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
         }
  
         /* See similar comment in do_numa_page for explanation */
-       if (!(vma->vm_flags & VM_WRITE))
+       if (!pmd_write(pmd))
                 flags |= TNF_NO_GROUP;
  
         /*
diff --git a/mm/memory.c b/mm/memory.c

index 793fe0f..f1a6804 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3395,7 +3395,7 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
          * pte_dirty has unpredictable behaviour between PTE scan updates,
          * background writeback, dirty balancing and application behaviour.
          */
-       if (!(vma->vm_flags & VM_WRITE))
+       if (!pte_write(pte))
                 flags |= TNF_NO_GROUP;
  
         /*
diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c

index bd09d0e..143b6cd 100644 (file)
--- a/tools/objtool/builtin-check.c
+++ b/tools/objtool/builtin-check.c
@@ -175,6 +175,7 @@ static int __dead_end_function(struct objtool_file *file, struct symbol *func,
                 "__stack_chk_fail",
                 "panic",
                 "do_exit",
+               "do_task_dead",
                 "__module_put_and_exit",
                 "complete_and_exit",
                 "kvm_spurious_fault",
author	Ingo Molnar <mingo@kernel.org>
	Fri, 30 Sep 2016 08:44:27 +0000 (10:44 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Fri, 30 Sep 2016 08:44:27 +0000 (10:44 +0200)
Documentation/scheduler/sched-deadline.txt		patch \| blob \| history
include/linux/kernel.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
kernel/exit.c		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
kernel/sched/cpudeadline.c		patch \| blob \| history
kernel/sched/cpudeadline.h		patch \| blob \| history
kernel/sched/cputime.c		patch \| blob \| history
kernel/sched/deadline.c		patch \| blob \| history
kernel/sched/debug.c		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history
kernel/sched/idle_task.c		patch \| blob \| history
kernel/sched/sched.h		patch \| blob \| history
kernel/sched/stats.h		patch \| blob \| history
kernel/smpboot.c		patch \| blob \| history
kernel/stop_machine.c		patch \| blob \| history
mm/huge_memory.c		patch \| blob \| history
mm/memory.c		patch \| blob \| history
tools/objtool/builtin-check.c		patch \| blob \| history