sched, trace: Add a tracepoint for IPI-less remote wakeups

[cascardo/linux.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index cdefcf7..e4c0ddd 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -505,6 +505,39 @@ static inline void init_hrtick(void)
  }
  #endif /* CONFIG_SCHED_HRTICK */
  
+/*
+ * cmpxchg based fetch_or, macro so it works for different integer types.
+ *
+ * ORs @val into *@ptr and evaluates to the value *@ptr held right before
+ * the OR took effect. The cmpxchg() is retried, starting from the value
+ * it just observed, until it finds the value it expected. @ptr and @val
+ * are expanded more than once, so do not pass expressions with side
+ * effects.
+ */
+#define fetch_or(ptr, val)                                             \
+({     typeof(*(ptr)) __old, __val = *(ptr);                           \
+       for (;;) {                                                      \
+               __old = cmpxchg((ptr), __val, __val | (val));           \
+               if (__old == __val)                                     \
+                       break;                                          \
+               __val = __old;                                          \
+       }                                                               \
+       __old;                                                          \
+})
+
+#ifdef TIF_POLLING_NRFLAG
+/*
+ * Set TIF_NEED_RESCHED in @p's thread flags and sample TIF_POLLING_NRFLAG
+ * with the very same fetch_or(), so the polling state cannot change
+ * between the two steps and no spurious IPI is sent.
+ *
+ * Returns true when the polling flag was clear.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+       struct thread_info *info = task_thread_info(p);
+       bool was_polling;
+
+       was_polling = fetch_or(&info->flags, _TIF_NEED_RESCHED) &
+                     _TIF_POLLING_NRFLAG;
+
+       return !was_polling;
+}
+#else
+/*
+ * No TIF_POLLING_NRFLAG to test: mark @p through set_tsk_need_resched()
+ * and always report "not polling".
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+       set_tsk_need_resched(p);
+
+       return true;
+}
+#endif
+
  /*
   * resched_task - mark a task 'to be rescheduled now'.
   *
@@ -521,18 +554,18 @@ void resched_task(struct task_struct *p)
         if (test_tsk_need_resched(p))
                 return;
  
-       set_tsk_need_resched(p);
-
         cpu = task_cpu(p);
+
         if (cpu == smp_processor_id()) {
+               set_tsk_need_resched(p);
                 set_preempt_need_resched();
                 return;
         }
  
-       /* NEED_RESCHED must be visible before we test polling */
-       smp_mb();
-       if (!tsk_is_polling(p))
+       if (set_nr_and_not_polling(p))
                 smp_send_reschedule(cpu);
+       else
+               trace_sched_wake_idle_without_ipi(cpu);
  }
  
  void resched_cpu(int cpu)
@@ -616,6 +649,8 @@ static void wake_up_idle_cpu(int cpu)
         smp_mb();
         if (!tsk_is_polling(rq->idle))
                 smp_send_reschedule(cpu);
+       else
+               trace_sched_wake_idle_without_ipi(cpu);
  }
  
  static bool wake_up_full_nohz_cpu(int cpu)
@@ -841,7 +876,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
         rq->clock_task += delta;
  
  #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
-       if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+       if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
                 sched_rt_avg_update(rq, irq_delta + steal);
  #endif
  }
@@ -2192,7 +2227,7 @@ static inline void post_schedule(struct rq *rq)
   * schedule_tail - first thing a freshly forked thread must call.
   * @prev: the thread we just switched away from.
   */
-asmlinkage void schedule_tail(struct task_struct *prev)
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
         __releases(rq->lock)
  {
         struct rq *rq = this_rq();
@@ -2747,7 +2782,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
                 blk_schedule_flush_plug(tsk);
  }
  
-asmlinkage void __sched schedule(void)
+asmlinkage __visible void __sched schedule(void)
  {
         struct task_struct *tsk = current;
  
@@ -2757,7 +2792,7 @@ asmlinkage void __sched schedule(void)
  EXPORT_SYMBOL(schedule);
  
  #ifdef CONFIG_CONTEXT_TRACKING
-asmlinkage void __sched schedule_user(void)
+asmlinkage __visible void __sched schedule_user(void)
  {
         /*
          * If we come here after a random call to set_need_resched(),
@@ -2789,7 +2824,7 @@ void __sched schedule_preempt_disabled(void)
   * off of preempt_enable. Kernel preemptions off return from interrupt
   * occur there and call schedule directly.
   */
-asmlinkage void __sched notrace preempt_schedule(void)
+asmlinkage __visible void __sched notrace preempt_schedule(void)
  {
         /*
          * If there is a non-zero preempt_count or interrupts are disabled,
@@ -2819,7 +2854,7 @@ EXPORT_SYMBOL(preempt_schedule);
   * Note, that this is called and return with irqs disabled. This will
   * protect us against recursive calling from irq.
   */
-asmlinkage void __sched preempt_schedule_irq(void)
+asmlinkage __visible void __sched preempt_schedule_irq(void)
  {
         enum ctx_state prev_state;
  
@@ -3002,7 +3037,7 @@ EXPORT_SYMBOL(set_user_nice);
  int can_nice(const struct task_struct *p, const int nice)
  {
         /* convert nice value [19,-20] to rlimit style value [1,40] */
-       int nice_rlim = 20 - nice;
+       int nice_rlim = nice_to_rlimit(nice);
  
         return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
                 capable(CAP_SYS_NICE));
@@ -3026,17 +3061,10 @@ SYSCALL_DEFINE1(nice, int, increment)
          * We don't have to worry. Conceptually one call occurs first
          * and we have a single winner.
          */
-       if (increment < -40)
-               increment = -40;
-       if (increment > 40)
-               increment = 40;
-
+       increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
         nice = task_nice(current) + increment;
-       if (nice < MIN_NICE)
-               nice = MIN_NICE;
-       if (nice > MAX_NICE)
-               nice = MAX_NICE;
  
+       nice = clamp_val(nice, MIN_NICE, MAX_NICE);
         if (increment < 0 && !can_nice(current, nice))
                 return -EPERM;
  
@@ -3195,17 +3223,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
   * We ask for the deadline not being zero, and greater or equal
   * than the runtime, as well as the period of being zero or
   * greater than deadline. Furthermore, we have to be sure that
- * user parameters are above the internal resolution (1us); we
- * check sched_runtime only since it is always the smaller one.
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
   */
  static bool
  __checkparam_dl(const struct sched_attr *attr)
  {
-       return attr && attr->sched_deadline != 0 &&
-               (attr->sched_period == 0 ||
-               (s64)(attr->sched_period   - attr->sched_deadline) >= 0) &&
-               (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0  &&
-               attr->sched_runtime >= (2 << (DL_SCALE - 1));
+       /*
+        * @attr must be non-NULL: it is dereferenced unconditionally below.
+        * Returns true only when every check passes.
+        */
+
+       /* deadline != 0 */
+       if (attr->sched_deadline == 0)
+               return false;
+
+       /*
+        * Since we truncate DL_SCALE bits, make sure we're at least
+        * that big.
+        */
+       if (attr->sched_runtime < (1ULL << DL_SCALE))
+               return false;
+
+       /*
+        * Since we use the MSB for wrap-around and sign issues, make
+        * sure it's not set (mind that period can be equal to zero).
+        */
+       if (attr->sched_deadline & (1ULL << 63) ||
+           attr->sched_period & (1ULL << 63))
+               return false;
+
+       /* runtime <= deadline <= period (if period != 0) */
+       if ((attr->sched_period != 0 &&
+            attr->sched_period < attr->sched_deadline) ||
+           attr->sched_deadline < attr->sched_runtime)
+               return false;
+
+       return true;
  }
  
  /*
@@ -3603,13 +3654,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
          */
         attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
  
-out:
-       return ret;
+       return 0;
  
  err_size:
         put_user(sizeof(*attr), &uattr->size);
-       ret = -E2BIG;
-       goto out;
+       return -E2BIG;
  }
  
  /**
@@ -3713,7 +3762,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
   */
  SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
  {
-       struct sched_param lp;
+       struct sched_param lp = { .sched_priority = 0 };
         struct task_struct *p;
         int retval;
  
@@ -3730,11 +3779,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
         if (retval)
                 goto out_unlock;
  
-       if (task_has_dl_policy(p)) {
-               retval = -EINVAL;
-               goto out_unlock;
-       }
-       lp.sched_priority = p->rt_priority;
+       if (task_has_rt_policy(p))
+               lp.sched_priority = p->rt_priority;
         rcu_read_unlock();
  
         /*
@@ -3772,7 +3818,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
  
                 for (; addr < end; addr++) {
                         if (*addr)
-                               goto err_size;
+                               return -EFBIG;
                 }
  
                 attr->size = usize;
@@ -3782,12 +3828,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
         if (ret)
                 return -EFAULT;
  
-out:
-       return ret;
-
-err_size:
-       ret = -E2BIG;
-       goto out;
+       return 0;
  }
  
  /**
@@ -4158,7 +4199,7 @@ EXPORT_SYMBOL(yield);
   *     false (0) if we failed to boost the target.
   *     -ESRCH if there's no task to yield to.
   */
-bool __sched yield_to(struct task_struct *p, bool preempt)
+int __sched yield_to(struct task_struct *p, bool preempt)
  {
         struct task_struct *curr = current;
         struct rq *rq, *p_rq;
@@ -5052,11 +5093,20 @@ static struct notifier_block migration_notifier = {
         .priority = CPU_PRI_MIGRATION,
  };
  
+/*
+ * Store the current CPU's sched_clock_cpu() reading in the ->age_stamp
+ * of that CPU's runqueue.
+ */
+static void __cpuinit set_cpu_rq_start_time(void)
+{
+       const int this_cpu = smp_processor_id();
+
+       cpu_rq(this_cpu)->age_stamp = sched_clock_cpu(this_cpu);
+}
+
  static int sched_cpu_active(struct notifier_block *nfb,
                                       unsigned long action, void *hcpu)
  {
         switch (action & ~CPU_TASKS_FROZEN) {
         case CPU_STARTING:
+               set_cpu_rq_start_time();
+               return NOTIFY_OK;
         case CPU_DOWN_FAILED:
                 set_cpu_active((long)hcpu, true);
                 return NOTIFY_OK;
@@ -5175,14 +5225,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                 }
  
                 /*
-                * Even though we initialize ->power to something semi-sane,
-                * we leave power_orig unset. This allows us to detect if
+                * Even though we initialize ->capacity to something semi-sane,
+                * we leave capacity_orig unset. This allows us to detect if
                  * domain iteration is still funny without causing /0 traps.
                  */
-               if (!group->sgp->power_orig) {
+               if (!group->sgc->capacity_orig) {
                         printk(KERN_CONT "\n");
-                       printk(KERN_ERR "ERROR: domain->cpu_power not "
-                                       "set\n");
+                       printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
                         break;
                 }
  
@@ -5204,9 +5253,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
  
                 printk(KERN_CONT " %s", str);
-               if (group->sgp->power != SCHED_POWER_SCALE) {
-                       printk(KERN_CONT " (cpu_power = %d)",
-                               group->sgp->power);
+               if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
+                       printk(KERN_CONT " (cpu_capacity = %d)",
+                               group->sgc->capacity);
                 }
  
                 group = group->next;
@@ -5264,8 +5313,9 @@ static int sd_degenerate(struct sched_domain *sd)
                          SD_BALANCE_NEWIDLE |
                          SD_BALANCE_FORK |
                          SD_BALANCE_EXEC |
-                        SD_SHARE_CPUPOWER |
-                        SD_SHARE_PKG_RESOURCES)) {
+                        SD_SHARE_CPUCAPACITY |
+                        SD_SHARE_PKG_RESOURCES |
+                        SD_SHARE_POWERDOMAIN)) {
                 if (sd->groups != sd->groups->next)
                         return 0;
         }
@@ -5294,9 +5344,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                                 SD_BALANCE_NEWIDLE |
                                 SD_BALANCE_FORK |
                                 SD_BALANCE_EXEC |
-                               SD_SHARE_CPUPOWER |
+                               SD_SHARE_CPUCAPACITY |
                                 SD_SHARE_PKG_RESOURCES |
-                               SD_PREFER_SIBLING);
+                               SD_PREFER_SIBLING |
+                               SD_SHARE_POWERDOMAIN);
                 if (nr_node_ids == 1)
                         pflags &= ~SD_SERIALIZE;
         }
@@ -5418,7 +5469,7 @@ static struct root_domain *alloc_rootdomain(void)
         return rd;
  }
  
-static void free_sched_groups(struct sched_group *sg, int free_sgp)
+static void free_sched_groups(struct sched_group *sg, int free_sgc)
  {
         struct sched_group *tmp, *first;
  
@@ -5429,8 +5480,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp)
         do {
                 tmp = sg->next;
  
-               if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
-                       kfree(sg->sgp);
+               if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
+                       kfree(sg->sgc);
  
                 kfree(sg);
                 sg = tmp;
@@ -5448,7 +5499,7 @@ static void free_sched_domain(struct rcu_head *rcu)
         if (sd->flags & SD_OVERLAP) {
                 free_sched_groups(sd->groups, 1);
         } else if (atomic_dec_and_test(&sd->groups->ref)) {
-               kfree(sd->groups->sgp);
+               kfree(sd->groups->sgc);
                 kfree(sd->groups);
         }
         kfree(sd);
@@ -5570,17 +5621,6 @@ static int __init isolated_cpu_setup(char *str)
  
  __setup("isolcpus=", isolated_cpu_setup);
  
-static const struct cpumask *cpu_cpu_mask(int cpu)
-{
-       return cpumask_of_node(cpu_to_node(cpu));
-}
-
-struct sd_data {
-       struct sched_domain **__percpu sd;
-       struct sched_group **__percpu sg;
-       struct sched_group_power **__percpu sgp;
-};
-
  struct s_data {
         struct sched_domain ** __percpu sd;
         struct root_domain      *rd;
@@ -5593,21 +5633,6 @@ enum s_alloc {
         sa_none,
  };
  
-struct sched_domain_topology_level;
-
-typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-
-#define SDTL_OVERLAP   0x01
-
-struct sched_domain_topology_level {
-       sched_domain_init_f init;
-       sched_domain_mask_f mask;
-       int                 flags;
-       int                 numa_level;
-       struct sd_data      data;
-};
-
  /*
   * Build an iteration mask that can exclude certain CPUs from the upwards
   * domain traversal.
@@ -5685,17 +5710,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
  
                 cpumask_or(covered, covered, sg_span);
  
-               sg->sgp = *per_cpu_ptr(sdd->sgp, i);
-               if (atomic_inc_return(&sg->sgp->ref) == 1)
+               sg->sgc = *per_cpu_ptr(sdd->sgc, i);
+               if (atomic_inc_return(&sg->sgc->ref) == 1)
                         build_group_mask(sd, sg);
  
                 /*
-                * Initialize sgp->power such that even if we mess up the
+                * Initialize sgc->capacity such that even if we mess up the
                  * domains and no possible iteration will get us here, we won't
                  * die on a /0 trap.
                  */
-               sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
-               sg->sgp->power_orig = sg->sgp->power;
+               sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+               sg->sgc->capacity_orig = sg->sgc->capacity;
  
                 /*
                  * Make sure the first group of this domain contains the
@@ -5733,8 +5758,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
  
         if (sg) {
                 *sg = *per_cpu_ptr(sdd->sg, cpu);
-               (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
-               atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+               (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+               atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
         }
  
         return cpu;
@@ -5743,7 +5768,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
  /*
   * build_sched_groups will build a circular linked list of the groups
   * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_power to 0.
+ * and ->cpu_capacity to 0.
   *
   * Assumes the sched_domain tree is fully constructed
   */
@@ -5775,8 +5800,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
                         continue;
  
                 group = get_group(i, sdd, &sg);
-               cpumask_clear(sched_group_cpus(sg));
-               sg->sgp->power = 0;
                 cpumask_setall(sched_group_mask(sg));
  
                 for_each_cpu(j, span) {
@@ -5799,16 +5822,16 @@ build_sched_groups(struct sched_domain *sd, int cpu)
  }
  
  /*
- * Initialize sched groups cpu_power.
+ * Initialize sched groups cpu_capacity.
   *
- * cpu_power indicates the capacity of sched group, which is used while
+ * cpu_capacity indicates the capacity of sched group, which is used while
   * distributing the load between different sched groups in a sched domain.
- * Typically cpu_power for all the groups in a sched domain will be same unless
- * there are asymmetries in the topology. If there are asymmetries, group
- * having more cpu_power will pickup more load compared to the group having
- * less cpu_power.
+ * Typically cpu_capacity for all the groups in a sched domain will be same
+ * unless there are asymmetries in the topology. If there are asymmetries,
+ * group having more cpu_capacity will pickup more load compared to the
+ * group having less cpu_capacity.
   */
-static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
  {
         struct sched_group *sg = sd->groups;
  
@@ -5822,13 +5845,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
         if (cpu != group_balance_cpu(sg))
                 return;
  
-       update_group_power(sd, cpu);
-       atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
-}
-
-int __weak arch_sd_sibling_asym_packing(void)
-{
-       return 0*SD_ASYM_PACKING;
+       update_group_capacity(sd, cpu);
+       atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
  }
  
  /*
@@ -5836,34 +5854,6 @@ int __weak arch_sd_sibling_asym_packing(void)
   * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
   */
  
-#ifdef CONFIG_SCHED_DEBUG
-# define SD_INIT_NAME(sd, type)                sd->name = #type
-#else
-# define SD_INIT_NAME(sd, type)                do { } while (0)
-#endif
-
-#define SD_INIT_FUNC(type)                                             \
-static noinline struct sched_domain *                                  \
-sd_init_##type(struct sched_domain_topology_level *tl, int cpu)        \
-{                                                                      \
-       struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);       \
-       *sd = SD_##type##_INIT;                                         \
-       SD_INIT_NAME(sd, type);                                         \
-       sd->private = &tl->data;                                        \
-       return sd;                                                      \
-}
-
-SD_INIT_FUNC(CPU)
-#ifdef CONFIG_SCHED_SMT
- SD_INIT_FUNC(SIBLING)
-#endif
-#ifdef CONFIG_SCHED_MC
- SD_INIT_FUNC(MC)
-#endif
-#ifdef CONFIG_SCHED_BOOK
- SD_INIT_FUNC(BOOK)
-#endif
-
  static int default_relax_domain_level = -1;
  int sched_domain_level_max;
  
@@ -5947,103 +5937,158 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
         if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
                 *per_cpu_ptr(sdd->sg, cpu) = NULL;
  
-       if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
-               *per_cpu_ptr(sdd->sgp, cpu) = NULL;
-}
-
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *cpu_smt_mask(int cpu)
-{
-       return topology_thread_cpumask(cpu);
+       if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
+               *per_cpu_ptr(sdd->sgc, cpu) = NULL;
  }
-#endif
-
-/*
- * Topology list, bottom-up.
- */
-static struct sched_domain_topology_level default_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-       { sd_init_SIBLING, cpu_smt_mask, },
-#endif
-#ifdef CONFIG_SCHED_MC
-       { sd_init_MC, cpu_coregroup_mask, },
-#endif
-#ifdef CONFIG_SCHED_BOOK
-       { sd_init_BOOK, cpu_book_mask, },
-#endif
-       { sd_init_CPU, cpu_cpu_mask, },
-       { NULL, },
-};
-
-static struct sched_domain_topology_level *sched_domain_topology = default_topology;
-
-#define for_each_sd_topology(tl)                       \
-       for (tl = sched_domain_topology; tl->init; tl++)
  
  #ifdef CONFIG_NUMA
-
  static int sched_domains_numa_levels;
  static int *sched_domains_numa_distance;
  static struct cpumask ***sched_domains_numa_masks;
  static int sched_domains_curr_level;
+#endif
  
-static inline int sd_local_flags(int level)
-{
-       if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
-               return 0;
-
-       return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
-}
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * SD_SHARE_CPUCAPACITY   - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA                - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING        - describes SMT quirks
+ *
+ * sd_init() warns once when a topology level's sd_flags() callback
+ * returns a flag outside this set.
+ */
+#define TOPOLOGY_SD_FLAGS              \
+       (SD_SHARE_CPUCAPACITY |         \
+        SD_SHARE_PKG_RESOURCES |       \
+        SD_NUMA |                      \
+        SD_ASYM_PACKING |              \
+        SD_SHARE_POWERDOMAIN)
  
  static struct sched_domain *
-sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl, int cpu)
  {
         struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
-       int level = tl->numa_level;
-       int sd_weight = cpumask_weight(
-                       sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+       int sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+       /*
+        * Ugly hack to pass state to sd_numa_mask()...
+        */
+       sched_domains_curr_level = tl->numa_level;
+#endif
+
+       sd_weight = cpumask_weight(tl->mask(cpu));
+
+       if (tl->sd_flags)
+               sd_flags = (*tl->sd_flags)();
+       if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+                       "wrong sd_flags in topology description\n"))
+               sd_flags &= ~TOPOLOGY_SD_FLAGS;
  
         *sd = (struct sched_domain){
                 .min_interval           = sd_weight,
                 .max_interval           = 2*sd_weight,
                 .busy_factor            = 32,
                 .imbalance_pct          = 125,
-               .cache_nice_tries       = 2,
-               .busy_idx               = 3,
-               .idle_idx               = 2,
+
+               .cache_nice_tries       = 0,
+               .busy_idx               = 0,
+               .idle_idx               = 0,
                 .newidle_idx            = 0,
                 .wake_idx               = 0,
                 .forkexec_idx           = 0,
  
                 .flags                  = 1*SD_LOAD_BALANCE
                                         | 1*SD_BALANCE_NEWIDLE
-                                       | 0*SD_BALANCE_EXEC
-                                       | 0*SD_BALANCE_FORK
+                                       | 1*SD_BALANCE_EXEC
+                                       | 1*SD_BALANCE_FORK
                                         | 0*SD_BALANCE_WAKE
-                                       | 0*SD_WAKE_AFFINE
-                                       | 0*SD_SHARE_CPUPOWER
+                                       | 1*SD_WAKE_AFFINE
+                                       | 0*SD_SHARE_CPUCAPACITY
                                         | 0*SD_SHARE_PKG_RESOURCES
-                                       | 1*SD_SERIALIZE
+                                       | 0*SD_SERIALIZE
                                         | 0*SD_PREFER_SIBLING
-                                       | 1*SD_NUMA
-                                       | sd_local_flags(level)
+                                       | 0*SD_NUMA
+                                       | sd_flags
                                         ,
+
                 .last_balance           = jiffies,
                 .balance_interval       = sd_weight,
+               .smt_gain               = 0,
                 .max_newidle_lb_cost    = 0,
                 .next_decay_max_lb_cost = jiffies,
+#ifdef CONFIG_SCHED_DEBUG
+               .name                   = tl->name,
+#endif
         };
-       SD_INIT_NAME(sd, NUMA);
-       sd->private = &tl->data;
  
         /*
-        * Ugly hack to pass state to sd_numa_mask()...
+        * Convert topological properties into behaviour.
          */
-       sched_domains_curr_level = tl->numa_level;
+
+       if (sd->flags & SD_SHARE_CPUCAPACITY) {
+               sd->imbalance_pct = 110;
+               sd->smt_gain = 1178; /* ~15% */
+
+       } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+               sd->imbalance_pct = 117;
+               sd->cache_nice_tries = 1;
+               sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+       } else if (sd->flags & SD_NUMA) {
+               sd->cache_nice_tries = 2;
+               sd->busy_idx = 3;
+               sd->idle_idx = 2;
+
+               sd->flags |= SD_SERIALIZE;
+               if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+                       sd->flags &= ~(SD_BALANCE_EXEC |
+                                      SD_BALANCE_FORK |
+                                      SD_WAKE_AFFINE);
+               }
+
+#endif
+       } else {
+               sd->flags |= SD_PREFER_SIBLING;
+               sd->cache_nice_tries = 1;
+               sd->busy_idx = 2;
+               sd->idle_idx = 1;
+       }
+
+       sd->private = &tl->data;
  
         return sd;
  }
  
+/*
+ * Topology list, bottom-up.
+ *
+ * The SMT and MC levels are only present when CONFIG_SCHED_SMT and
+ * CONFIG_SCHED_MC are set; the DIE level is always present. The final
+ * { NULL, } entry terminates the list: for_each_sd_topology() stops at
+ * the first entry whose ->mask is NULL (this assumes ->mask is the first
+ * member of the struct, which is declared outside this file - TODO
+ * confirm).
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+       { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+       { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+       { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+       { NULL, },
+};
+
+/*
+ * Topology table currently in use; starts out as default_topology and
+ * can be replaced through set_sched_topology().
+ */
+struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+/* Walk the table in use, stopping at the first entry whose ->mask is NULL. */
+#define for_each_sd_topology(tl)                       \
+       for (tl = sched_domain_topology; tl->mask; tl++)
+
+/*
+ * Make @tl the topology table in use.
+ *
+ * This is a plain pointer store: @tl is neither copied nor validated, and
+ * no locking is done here. @tl has to end with an entry whose ->mask is
+ * NULL, since for_each_sd_topology() relies on that to terminate.
+ * NOTE(review): presumably meant to be called before the sched domains
+ * are built - confirm against callers.
+ */
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+       sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
  static const struct cpumask *sd_numa_mask(int cpu)
  {
         return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
@@ -6187,7 +6232,10 @@ static void sched_init_numa(void)
                 }
         }
  
-       tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+       /* Compute default topology size */
+       for (i = 0; sched_domain_topology[i].mask; i++);
+
+       tl = kzalloc((i + level + 1) *
                         sizeof(struct sched_domain_topology_level), GFP_KERNEL);
         if (!tl)
                 return;
@@ -6195,18 +6243,19 @@ static void sched_init_numa(void)
         /*
          * Copy the default topology bits..
          */
-       for (i = 0; default_topology[i].init; i++)
-               tl[i] = default_topology[i];
+       for (i = 0; sched_domain_topology[i].mask; i++)
+               tl[i] = sched_domain_topology[i];
  
         /*
          * .. and append 'j' levels of NUMA goodness.
          */
         for (j = 0; j < level; i++, j++) {
                 tl[i] = (struct sched_domain_topology_level){
-                       .init = sd_numa_init,
                         .mask = sd_numa_mask,
+                       .sd_flags = cpu_numa_flags,
                         .flags = SDTL_OVERLAP,
                         .numa_level = j,
+                       SD_INIT_NAME(NUMA)
                 };
         }
  
@@ -6291,14 +6340,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
                 if (!sdd->sg)
                         return -ENOMEM;
  
-               sdd->sgp = alloc_percpu(struct sched_group_power *);
-               if (!sdd->sgp)
+               sdd->sgc = alloc_percpu(struct sched_group_capacity *);
+               if (!sdd->sgc)
                         return -ENOMEM;
  
                 for_each_cpu(j, cpu_map) {
                         struct sched_domain *sd;
                         struct sched_group *sg;
-                       struct sched_group_power *sgp;
+                       struct sched_group_capacity *sgc;
  
                         sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
                                         GFP_KERNEL, cpu_to_node(j));
@@ -6316,12 +6365,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
  
                         *per_cpu_ptr(sdd->sg, j) = sg;
  
-                       sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
+                       sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
                                         GFP_KERNEL, cpu_to_node(j));
-                       if (!sgp)
+                       if (!sgc)
                                 return -ENOMEM;
  
-                       *per_cpu_ptr(sdd->sgp, j) = sgp;
+                       *per_cpu_ptr(sdd->sgc, j) = sgc;
                 }
         }
  
@@ -6348,15 +6397,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
  
                         if (sdd->sg)
                                 kfree(*per_cpu_ptr(sdd->sg, j));
-                       if (sdd->sgp)
-                               kfree(*per_cpu_ptr(sdd->sgp, j));
+                       if (sdd->sgc)
+                               kfree(*per_cpu_ptr(sdd->sgc, j));
                 }
                 free_percpu(sdd->sd);
                 sdd->sd = NULL;
                 free_percpu(sdd->sg);
                 sdd->sg = NULL;
-               free_percpu(sdd->sgp);
-               sdd->sgp = NULL;
+               free_percpu(sdd->sgc);
+               sdd->sgc = NULL;
         }
  }
  
@@ -6364,7 +6413,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
                 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
                 struct sched_domain *child, int cpu)
  {
-       struct sched_domain *sd = tl->init(tl, cpu);
+       struct sched_domain *sd = sd_init(tl, cpu);
         if (!sd)
                 return child;
  
@@ -6426,14 +6475,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,
                 }
         }
  
-       /* Calculate CPU power for physical packages and nodes */
+       /* Calculate CPU capacity for physical packages and nodes */
         for (i = nr_cpumask_bits-1; i >= 0; i--) {
                 if (!cpumask_test_cpu(i, cpu_map))
                         continue;
  
                 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
                         claim_allocations(i, sd);
-                       init_sched_groups_power(i, sd);
+                       init_sched_groups_capacity(i, sd);
                 }
         }
  
@@ -6876,7 +6925,7 @@ void __init sched_init(void)
  #ifdef CONFIG_SMP
                 rq->sd = NULL;
                 rq->rd = NULL;
-               rq->cpu_power = SCHED_POWER_SCALE;
+               rq->cpu_capacity = SCHED_CAPACITY_SCALE;
                 rq->post_schedule = 0;
                 rq->active_balance = 0;
                 rq->next_balance = jiffies;
@@ -6934,6 +6983,7 @@ void __init sched_init(void)
         if (cpu_isolated_map == NULL)
                 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
         idle_thread_set_boot_cpu();
+       set_cpu_rq_start_time();
  #endif
         init_sched_fair_class();