sched/core: Optimize SCHED_SMT
[cascardo/linux.git] / kernel/sched/core.c
index 44817c6..9411545 100644
@@ -1063,8 +1063,12 @@ static int migration_cpu_stop(void *data)
         * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
         * we're holding p->pi_lock.
         */
-       if (task_rq(p) == rq && task_on_rq_queued(p))
-               rq = __migrate_task(rq, p, arg->dest_cpu);
+       if (task_rq(p) == rq) {
+               if (task_on_rq_queued(p))
+                       rq = __migrate_task(rq, p, arg->dest_cpu);
+               else
+                       p->wake_cpu = arg->dest_cpu;
+       }
        raw_spin_unlock(&rq->lock);
        raw_spin_unlock(&p->pi_lock);
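
When the task is not on a runqueue (p->on_rq == 0), recording the destination in p->wake_cpu is enough because try_to_wake_up() starts CPU selection from that field, so the eventual wakeup still lands on the requested CPU. Roughly, on the consuming side in try_to_wake_up() of this kernel generation (sketch, for context only):

        cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
        if (task_cpu(p) != cpu) {
                wake_flags |= WF_MIGRATED;
                set_task_cpu(p, cpu);
        }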
 
@@ -1265,7 +1269,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
                /*
                 * Task isn't running anymore; make it appear like we migrated
                 * it before it went to sleep. This means on wakeup we make the
-                * previous cpu our targer instead of where it really is.
+                * previous cpu our target instead of where it really is.
                 */
                p->wake_cpu = cpu;
        }
@@ -1629,23 +1633,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 {
-#ifdef CONFIG_SCHEDSTATS
-       struct rq *rq = this_rq();
+       struct rq *rq;
 
-#ifdef CONFIG_SMP
-       int this_cpu = smp_processor_id();
+       if (!schedstat_enabled())
+               return;
+
+       rq = this_rq();
 
-       if (cpu == this_cpu) {
-               schedstat_inc(rq, ttwu_local);
-               schedstat_inc(p, se.statistics.nr_wakeups_local);
+#ifdef CONFIG_SMP
+       if (cpu == rq->cpu) {
+               schedstat_inc(rq->ttwu_local);
+               schedstat_inc(p->se.statistics.nr_wakeups_local);
        } else {
                struct sched_domain *sd;
 
-               schedstat_inc(p, se.statistics.nr_wakeups_remote);
+               schedstat_inc(p->se.statistics.nr_wakeups_remote);
                rcu_read_lock();
-               for_each_domain(this_cpu, sd) {
+               for_each_domain(rq->cpu, sd) {
                        if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-                               schedstat_inc(sd, ttwu_wake_remote);
+                               schedstat_inc(sd->ttwu_wake_remote);
                                break;
                        }
                }
@@ -1653,17 +1659,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
        }
 
        if (wake_flags & WF_MIGRATED)
-               schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-
+               schedstat_inc(p->se.statistics.nr_wakeups_migrate);
 #endif /* CONFIG_SMP */
 
-       schedstat_inc(rq, ttwu_count);
-       schedstat_inc(p, se.statistics.nr_wakeups);
+       schedstat_inc(rq->ttwu_count);
+       schedstat_inc(p->se.statistics.nr_wakeups);
 
        if (wake_flags & WF_SYNC)
-               schedstat_inc(p, se.statistics.nr_wakeups_sync);
-
-#endif /* CONFIG_SCHEDSTATS */
+               schedstat_inc(p->se.statistics.nr_wakeups_sync);
 }
 
 static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
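
The #ifdef CONFIG_SCHEDSTATS scaffolding can go because the schedstat helpers are reworked (elsewhere in this series, in kernel/sched/stats.h) to take a plain expression and to test the schedstats static key themselves; ttwu_stat() then bails out early on its own, which is also why the schedstat_enabled() checks at its call sites below are dropped. A sketch of the macro shape this relies on (the exact spelling in stats.h may differ):

#ifdef CONFIG_SCHEDSTATS
#define schedstat_enabled()        static_branch_unlikely(&sched_schedstats)
#define schedstat_inc(var)         do { if (schedstat_enabled()) { var++; } } while (0)
#define schedstat_add(var, amt)    do { if (schedstat_enabled()) { var += (amt); } } while (0)
#define schedstat_set(var, val)    do { if (schedstat_enabled()) { var = (val); } } while (0)
#else /* !CONFIG_SCHEDSTATS */
#define schedstat_enabled()        0
#define schedstat_inc(var)         do { } while (0)
#define schedstat_add(var, amt)    do { } while (0)
#define schedstat_set(var, val)    do { } while (0)
#endif
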
@@ -2084,8 +2087,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 
        ttwu_queue(p, cpu, wake_flags);
 stat:
-       if (schedstat_enabled())
-               ttwu_stat(p, cpu, wake_flags);
+       ttwu_stat(p, cpu, wake_flags);
 out:
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
@@ -2095,6 +2097,7 @@ out:
 /**
  * try_to_wake_up_local - try to wake up a local task with rq lock held
  * @p: the thread to be awakened
+ * @cookie: context's cookie for pinning
  *
  * Put @p on the run-queue if it's not already there. The caller must
  * ensure that this_rq() is locked, @p is bound to this_rq() and not
@@ -2133,8 +2136,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
                ttwu_activate(rq, p, ENQUEUE_WAKEUP);
 
        ttwu_do_wakeup(rq, p, 0, cookie);
-       if (schedstat_enabled())
-               ttwu_stat(p, smp_processor_id(), 0);
+       ttwu_stat(p, smp_processor_id(), 0);
 out:
        raw_spin_unlock(&p->pi_lock);
 }
@@ -3192,6 +3194,9 @@ static inline void preempt_latency_stop(int val) { }
  */
 static noinline void __schedule_bug(struct task_struct *prev)
 {
+       /* Save this before calling printk(), since that will clobber it */
+       unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
+
        if (oops_in_progress)
                return;
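get_preempt_disable_ip() is a small accessor introduced elsewhere in this series so the preempt-disable address can be captured before printk() clobbers it; a sketch, assuming it merely wraps the CONFIG_DEBUG_PREEMPT field:

static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
{
#ifdef CONFIG_DEBUG_PREEMPT
        return p->preempt_disable_ip;
#else
        return 0;
#endif
}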
 
@@ -3202,13 +3207,12 @@ static noinline void __schedule_bug(struct task_struct *prev)
        print_modules();
        if (irqs_disabled())
                print_irqtrace_events(prev);
-#ifdef CONFIG_DEBUG_PREEMPT
-       if (in_atomic_preempt_off()) {
+       if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+           && in_atomic_preempt_off()) {
                pr_err("Preemption disabled at:");
-               print_ip_sym(current->preempt_disable_ip);
+               print_ip_sym(preempt_disable_ip);
                pr_cont("\n");
        }
-#endif
        if (panic_on_warn)
                panic("scheduling while atomic\n");
 
@@ -3234,7 +3238,7 @@ static inline void schedule_debug(struct task_struct *prev)
 
        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
 
-       schedstat_inc(this_rq(), sched_count);
+       schedstat_inc(this_rq()->sched_count);
 }
 
 /*
@@ -3327,17 +3331,6 @@ static void __sched notrace __schedule(bool preempt)
        rq = cpu_rq(cpu);
        prev = rq->curr;
 
-       /*
-        * do_exit() calls schedule() with preemption disabled as an exception;
-        * however we must fix that up, otherwise the next task will see an
-        * inconsistent (higher) preempt count.
-        *
-        * It also avoids the below schedule_debug() test from complaining
-        * about this.
-        */
-       if (unlikely(prev->state == TASK_DEAD))
-               preempt_enable_no_resched_notrace();
-
        schedule_debug(prev);
 
        if (sched_feat(HRTICK))
@@ -3405,6 +3398,33 @@ static void __sched notrace __schedule(bool preempt)
 }
 STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
 
+void __noreturn do_task_dead(void)
+{
+       /*
+        * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
+        * when the following two conditions become true.
+        *   - There is a race on mmap_sem (it is acquired by
+        *     exit_mm()), and
+        *   - an SMI occurs before TASK_RUNNING is set
+        *     (or the hypervisor of a virtual machine switches to another guest).
+        *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD.
+        *
+        * To avoid this, we have to wait until tsk->pi_lock, which is held
+        * by try_to_wake_up(), has been released.
+        */
+       smp_mb();
+       raw_spin_unlock_wait(&current->pi_lock);
+
+       /* causes final put_task_struct in finish_task_switch(). */
+       __set_current_state(TASK_DEAD);
+       current->flags |= PF_NOFREEZE;  /* tell freezer to ignore us */
+       __schedule(false);
+       BUG();
+       /* Avoid "noreturn function does return".  */
+       for (;;)
+               cpu_relax();    /* For when BUG is null */
+}
+
 static inline void sched_submit_work(struct task_struct *tsk)
 {
        if (!tsk->state || tsk_is_pi_blocked(tsk))
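do_task_dead() takes over the tail of do_exit(), which is also why the TASK_DEAD special case removed from __schedule() above can go away: the dying task now goes through __schedule(false) directly instead of schedule() with a preempt count that needed fixing up. A sketch of the matching caller change (kernel/exit.c, assumption based on this series):

        /* At the very end of do_exit(), the old inline sequence
         *
         *      tsk->state = TASK_DEAD;
         *      tsk->flags |= PF_NOFREEZE;
         *      schedule();
         *      BUG();
         *
         * collapses into a single call:
         */
        do_task_dead();
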
@@ -4846,7 +4866,7 @@ SYSCALL_DEFINE0(sched_yield)
 {
        struct rq *rq = this_rq_lock();
 
-       schedstat_inc(rq, yld_count);
+       schedstat_inc(rq->yld_count);
        current->sched_class->yield_task(rq);
 
        /*
@@ -4863,6 +4883,7 @@ SYSCALL_DEFINE0(sched_yield)
        return 0;
 }
 
+#ifndef CONFIG_PREEMPT
 int __sched _cond_resched(void)
 {
        if (should_resched(0)) {
@@ -4872,6 +4893,7 @@ int __sched _cond_resched(void)
        return 0;
 }
 EXPORT_SYMBOL(_cond_resched);
+#endif
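
A fully preemptible kernel (CONFIG_PREEMPT) already reschedules at preempt_enable() time, so _cond_resched() has no useful work left to do; compiling it out here matches the header side, which is assumed to look roughly like:

#ifndef CONFIG_PREEMPT
extern int _cond_resched(void);
#else
static inline int _cond_resched(void) { return 0; }
#endif

#define cond_resched() ({                       \
        ___might_sleep(__FILE__, __LINE__, 0);  \
        _cond_resched();                        \
})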
 
 /*
  * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -4997,7 +5019,7 @@ again:
 
        yielded = curr->sched_class->yield_to_task(rq, p, preempt);
        if (yielded) {
-               schedstat_inc(rq, yld_count);
+               schedstat_inc(rq->yld_count);
                /*
                 * Make p's CPU reschedule; pick_next_entity takes care of
                 * fairness.
@@ -5717,6 +5739,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
        }
 }
 #else /* !CONFIG_SCHED_DEBUG */
+
+# define sched_debug_enabled 0
 # define sched_domain_debug(sd, cpu) do { } while (0)
 static inline bool sched_debug(void)
 {
@@ -5735,6 +5759,7 @@ static int sd_degenerate(struct sched_domain *sd)
                         SD_BALANCE_FORK |
                         SD_BALANCE_EXEC |
                         SD_SHARE_CPUCAPACITY |
+                        SD_ASYM_CPUCAPACITY |
                         SD_SHARE_PKG_RESOURCES |
                         SD_SHARE_POWERDOMAIN)) {
                if (sd->groups != sd->groups->next)
@@ -5765,6 +5790,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                                SD_BALANCE_NEWIDLE |
                                SD_BALANCE_FORK |
                                SD_BALANCE_EXEC |
+                               SD_ASYM_CPUCAPACITY |
                                SD_SHARE_CPUCAPACITY |
                                SD_SHARE_PKG_RESOURCES |
                                SD_PREFER_SIBLING |
@@ -5909,10 +5935,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
        } while (sg != first);
 }
 
-static void free_sched_domain(struct rcu_head *rcu)
+static void destroy_sched_domain(struct sched_domain *sd)
 {
-       struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-
        /*
         * If its an overlapping domain it has private groups, iterate and
         * nuke them all.
@@ -5923,18 +5947,26 @@ static void free_sched_domain(struct rcu_head *rcu)
                kfree(sd->groups->sgc);
                kfree(sd->groups);
        }
+       if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
+               kfree(sd->shared);
        kfree(sd);
 }
 
-static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+static void destroy_sched_domains_rcu(struct rcu_head *rcu)
 {
-       call_rcu(&sd->rcu, free_sched_domain);
+       struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+
+       while (sd) {
+               struct sched_domain *parent = sd->parent;
+               destroy_sched_domain(sd);
+               sd = parent;
+       }
 }
 
-static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+static void destroy_sched_domains(struct sched_domain *sd)
 {
-       for (; sd; sd = sd->parent)
-               destroy_sched_domain(sd, cpu);
+       if (sd)
+               call_rcu(&sd->rcu, destroy_sched_domains_rcu);
 }
 
 /*
@@ -5949,14 +5981,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_busy);
 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
 
 static void update_top_cache_domain(int cpu)
 {
+       struct sched_domain_shared *sds = NULL;
        struct sched_domain *sd;
-       struct sched_domain *busy_sd = NULL;
        int id = cpu;
        int size = 1;
 
@@ -5964,13 +5996,13 @@ static void update_top_cache_domain(int cpu)
        if (sd) {
                id = cpumask_first(sched_domain_span(sd));
                size = cpumask_weight(sched_domain_span(sd));
-               busy_sd = sd->parent; /* sd_busy */
+               sds = sd->shared;
        }
-       rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
 
        rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
        per_cpu(sd_llc_size, cpu) = size;
        per_cpu(sd_llc_id, cpu) = id;
+       rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
 
        sd = lowest_flag_domain(cpu, SD_NUMA);
        rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
@@ -6006,7 +6038,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
                         */
                        if (parent->flags & SD_PREFER_SIBLING)
                                tmp->flags |= SD_PREFER_SIBLING;
-                       destroy_sched_domain(parent, cpu);
+                       destroy_sched_domain(parent);
                } else
                        tmp = tmp->parent;
        }
@@ -6014,7 +6046,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
        if (sd && sd_degenerate(sd)) {
                tmp = sd;
                sd = sd->parent;
-               destroy_sched_domain(tmp, cpu);
+               destroy_sched_domain(tmp);
                if (sd)
                        sd->child = NULL;
        }
@@ -6024,7 +6056,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
        rq_attach_root(rq, rd);
        tmp = rq->sd;
        rcu_assign_pointer(rq->sd, sd);
-       destroy_sched_domains(tmp, cpu);
+       destroy_sched_domains(tmp);
 
        update_top_cache_domain(cpu);
 }
@@ -6267,7 +6299,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
                return;
 
        update_group_capacity(sd, cpu);
-       atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
 }
 
 /*
@@ -6355,6 +6386,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
        WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
        *per_cpu_ptr(sdd->sd, cpu) = NULL;
 
+       if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
+               *per_cpu_ptr(sdd->sds, cpu) = NULL;
+
        if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
                *per_cpu_ptr(sdd->sg, cpu) = NULL;
 
@@ -6374,26 +6408,37 @@ static int sched_domains_curr_level;
 /*
  * SD_flags allowed in topology descriptions.
  *
- * SD_SHARE_CPUCAPACITY      - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA                - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ * These flags are purely descriptive of the topology and do not prescribe
+ * behaviour. Behaviour is artificial and mapped in the below sd_init()
+ * function:
+ *
+ *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
+ *   SD_SHARE_PKG_RESOURCES - describes shared caches
+ *   SD_NUMA                - describes NUMA topologies
+ *   SD_SHARE_POWERDOMAIN   - describes shared power domain
+ *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
+ *
+ * Odd one out, which beside describing the topology has a quirk also
+ * prescribes the desired behaviour that goes along with it:
  *
- * Odd one out:
- * SD_ASYM_PACKING        - describes SMT quirks
+ *   SD_ASYM_PACKING        - describes SMT quirks
  */
 #define TOPOLOGY_SD_FLAGS              \
        (SD_SHARE_CPUCAPACITY |         \
         SD_SHARE_PKG_RESOURCES |       \
         SD_NUMA |                      \
         SD_ASYM_PACKING |              \
+        SD_ASYM_CPUCAPACITY |          \
         SD_SHARE_POWERDOMAIN)
 
 static struct sched_domain *
-sd_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl,
+       const struct cpumask *cpu_map,
+       struct sched_domain *child, int cpu)
 {
-       struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
-       int sd_weight, sd_flags = 0;
+       struct sd_data *sdd = &tl->data;
+       struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+       int sd_id, sd_weight, sd_flags = 0;
 
 #ifdef CONFIG_NUMA
        /*
@@ -6442,15 +6487,26 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
                .smt_gain               = 0,
                .max_newidle_lb_cost    = 0,
                .next_decay_max_lb_cost = jiffies,
+               .child                  = child,
 #ifdef CONFIG_SCHED_DEBUG
                .name                   = tl->name,
 #endif
        };
 
+       cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+       sd_id = cpumask_first(sched_domain_span(sd));
+
        /*
         * Convert topological properties into behaviour.
         */
 
+       if (sd->flags & SD_ASYM_CPUCAPACITY) {
+               struct sched_domain *t = sd;
+
+               for_each_lower_domain(t)
+                       t->flags |= SD_BALANCE_WAKE;
+       }
+
        if (sd->flags & SD_SHARE_CPUCAPACITY) {
                sd->flags |= SD_PREFER_SIBLING;
                sd->imbalance_pct = 110;
@@ -6482,7 +6538,17 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
                sd->idle_idx = 1;
        }
 
-       sd->private = &tl->data;
+       /*
+        * For all levels sharing cache; connect a sched_domain_shared
+        * instance.
+        */
+       if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+               sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+               atomic_inc(&sd->shared->ref);
+               atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
+       }
+
+       sd->private = sdd;
 
        return sd;
 }
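
The sched_domain_shared instance attached here replaces both the old sd_busy pointer and the nr_busy_cpus counter that used to live in sched_group_capacity (its initialisation is dropped from init_sched_groups_capacity() above). One instance is shared, by reference count, between all CPUs of the LLC span; sd_id is the first CPU of that span, so every CPU of the LLC picks up the same object. Its layout in this series is roughly:

struct sched_domain_shared {
        atomic_t        ref;            /* one reference per attached sched_domain */
        atomic_t        nr_busy_cpus;   /* moved from sched_group_capacity */
        int             has_idle_cores; /* used by the fair-class SMT scan (assumption) */
};
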
@@ -6509,6 +6575,9 @@ static struct sched_domain_topology_level *sched_domain_topology =
 
 void set_sched_topology(struct sched_domain_topology_level *tl)
 {
+       if (WARN_ON_ONCE(sched_smp_initialized))
+               return;
+
        sched_domain_topology = tl;
 }
 
@@ -6789,6 +6858,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
                if (!sdd->sd)
                        return -ENOMEM;
 
+               sdd->sds = alloc_percpu(struct sched_domain_shared *);
+               if (!sdd->sds)
+                       return -ENOMEM;
+
                sdd->sg = alloc_percpu(struct sched_group *);
                if (!sdd->sg)
                        return -ENOMEM;
@@ -6799,6 +6872,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
                for_each_cpu(j, cpu_map) {
                        struct sched_domain *sd;
+                       struct sched_domain_shared *sds;
                        struct sched_group *sg;
                        struct sched_group_capacity *sgc;
 
@@ -6809,6 +6883,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
                        *per_cpu_ptr(sdd->sd, j) = sd;
 
+                       sds = kzalloc_node(sizeof(struct sched_domain_shared),
+                                       GFP_KERNEL, cpu_to_node(j));
+                       if (!sds)
+                               return -ENOMEM;
+
+                       *per_cpu_ptr(sdd->sds, j) = sds;
+
                        sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
                                        GFP_KERNEL, cpu_to_node(j));
                        if (!sg)
@@ -6848,6 +6929,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
                                kfree(*per_cpu_ptr(sdd->sd, j));
                        }
 
+                       if (sdd->sds)
+                               kfree(*per_cpu_ptr(sdd->sds, j));
                        if (sdd->sg)
                                kfree(*per_cpu_ptr(sdd->sg, j));
                        if (sdd->sgc)
@@ -6855,6 +6938,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
                }
                free_percpu(sdd->sd);
                sdd->sd = NULL;
+               free_percpu(sdd->sds);
+               sdd->sds = NULL;
                free_percpu(sdd->sg);
                sdd->sg = NULL;
                free_percpu(sdd->sgc);
@@ -6866,16 +6951,12 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
                const struct cpumask *cpu_map, struct sched_domain_attr *attr,
                struct sched_domain *child, int cpu)
 {
-       struct sched_domain *sd = sd_init(tl, cpu);
-       if (!sd)
-               return child;
+       struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
 
-       cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
        if (child) {
                sd->level = child->level + 1;
                sched_domain_level_max = max(sched_domain_level_max, sd->level);
                child->parent = sd;
-               sd->child = child;
 
                if (!cpumask_subset(sched_domain_span(child),
                                    sched_domain_span(sd))) {
@@ -6906,6 +6987,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
        enum s_alloc alloc_state;
        struct sched_domain *sd;
        struct s_data d;
+       struct rq *rq = NULL;
        int i, ret = -ENOMEM;
 
        alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -6956,11 +7038,22 @@ static int build_sched_domains(const struct cpumask *cpu_map,
        /* Attach the domains */
        rcu_read_lock();
        for_each_cpu(i, cpu_map) {
+               rq = cpu_rq(i);
                sd = *per_cpu_ptr(d.sd, i);
+
+               /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
+               if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
+                       WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+
                cpu_attach_domain(sd, d.rd, i);
        }
        rcu_read_unlock();
 
+       if (rq && sched_debug_enabled) {
+               pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
+                       cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
+       }
+
        ret = 0;
 error:
        __free_domain_allocs(&d, alloc_state, cpu_map);
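
d.rd->max_cpu_capacity records the largest cpu_capacity_orig seen while attaching domains, so asymmetric (e.g. big.LITTLE) topologies can later compare a CPU against the system maximum. A sketch of the assumed struct root_domain addition in kernel/sched/sched.h:

struct root_domain {
        /* ... existing fields ... */

        /* Highest cpu_capacity_orig of any CPU in this root domain. */
        unsigned long max_cpu_capacity;
};
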
@@ -7319,6 +7412,22 @@ int sched_cpu_dying(unsigned int cpu)
 }
 #endif
 
+#ifdef CONFIG_SCHED_SMT
+DEFINE_STATIC_KEY_FALSE(sched_smt_present);
+
+static void sched_init_smt(void)
+{
+       /*
+        * We've enumerated all CPUs and will assume that if any CPU
+        * has SMT siblings, CPU0 will too.
+        */
+       if (cpumask_weight(cpu_smt_mask(0)) > 1)
+               static_branch_enable(&sched_smt_present);
+}
+#else
+static inline void sched_init_smt(void) { }
+#endif
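
sched_smt_present is the heart of the commit subject: it lets the fair class's SMT-aware wakeup paths be patched out entirely on machines without hyperthreading, instead of probing cpu_smt_mask() on every wakeup. A sketch of the guard on the consuming side (kernel/sched/fair.c; names assumed from the same patch series):

static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
        int cpu;

        if (!static_branch_likely(&sched_smt_present))
                return -1;      /* branch patched out on non-SMT systems */

        for_each_cpu(cpu, cpu_smt_mask(target)) {
                if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
                        continue;
                if (idle_cpu(cpu))
                        return cpu;
        }

        return -1;
}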
+
 void __init sched_init_smp(void)
 {
        cpumask_var_t non_isolated_cpus;
@@ -7348,6 +7457,9 @@ void __init sched_init_smp(void)
 
        init_sched_rt_class();
        init_sched_dl_class();
+
+       sched_init_smt();
+
        sched_smp_initialized = true;
 }
 
@@ -7385,6 +7497,7 @@ static struct kmem_cache *task_group_cache __read_mostly;
 #endif
 
 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
+DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
 
 void __init sched_init(void)
 {
@@ -7421,6 +7534,8 @@ void __init sched_init(void)
        for_each_possible_cpu(i) {
                per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
                        cpumask_size(), GFP_KERNEL, cpu_to_node(i));
+               per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
+                       cpumask_size(), GFP_KERNEL, cpu_to_node(i));
        }
 #endif /* CONFIG_CPUMASK_OFFSTACK */
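
select_idle_mask mirrors load_balance_mask: a per-CPU scratch cpumask, allocated here only when CONFIG_CPUMASK_OFFSTACK pushes cpumask_var_t onto the heap, and consumed by the reworked LLC idle scan in the fair class. A sketch of the assumed consumer (the real select_idle_core() differs in detail, e.g. it bounds the scan and wraps around the target CPU):

static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
{
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
        int core, cpu;

        cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));

        for_each_cpu(core, cpus) {
                bool idle = true;

                for_each_cpu(cpu, cpu_smt_mask(core)) {
                        cpumask_clear_cpu(cpu, cpus);   /* visit each sibling only once */
                        if (!idle_cpu(cpu))
                                idle = false;
                }
                if (idle)
                        return core;
        }

        return -1;      /* no fully idle core found */
}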
 
@@ -7523,21 +7638,12 @@ void __init sched_init(void)
 
        set_load_weight(&init_task);
 
-#ifdef CONFIG_PREEMPT_NOTIFIERS
-       INIT_HLIST_HEAD(&init_task.preempt_notifiers);
-#endif
-
        /*
         * The boot idle thread does lazy MMU switching as well:
         */
        atomic_inc(&init_mm.mm_count);
        enter_lazy_tlb(&init_mm, current);
 
-       /*
-        * During early bootup we pretend to be a normal task:
-        */
-       current->sched_class = &fair_sched_class;
-
        /*
         * Make us the idle thread. Technically, schedule() should not be
         * called from this thread, however somewhere below it might be,
@@ -7592,6 +7698,7 @@ EXPORT_SYMBOL(__might_sleep);
 void ___might_sleep(const char *file, int line, int preempt_offset)
 {
        static unsigned long prev_jiffy;        /* ratelimiting */
+       unsigned long preempt_disable_ip;
 
        rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
        if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
@@ -7602,6 +7709,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
                return;
        prev_jiffy = jiffies;
 
+       /* Save this before calling printk(), since that will clobber it */
+       preempt_disable_ip = get_preempt_disable_ip(current);
+
        printk(KERN_ERR
                "BUG: sleeping function called from invalid context at %s:%d\n",
                        file, line);
@@ -7616,14 +7726,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
        debug_show_held_locks(current);
        if (irqs_disabled())
                print_irqtrace_events(current);
-#ifdef CONFIG_DEBUG_PREEMPT
-       if (!preempt_count_equals(preempt_offset)) {
+       if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+           && !preempt_count_equals(preempt_offset)) {
                pr_err("Preemption disabled at:");
-               print_ip_sym(current->preempt_disable_ip);
+               print_ip_sym(preempt_disable_ip);
                pr_cont("\n");
        }
-#endif
        dump_stack();
+       add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 }
 EXPORT_SYMBOL(___might_sleep);
 #endif
@@ -7644,12 +7754,10 @@ void normalize_rt_tasks(void)
                if (p->flags & PF_KTHREAD)
                        continue;
 
-               p->se.exec_start                = 0;
-#ifdef CONFIG_SCHEDSTATS
-               p->se.statistics.wait_start     = 0;
-               p->se.statistics.sleep_start    = 0;
-               p->se.statistics.block_start    = 0;
-#endif
+               p->se.exec_start = 0;
+               schedstat_set(p->se.statistics.wait_start,  0);
+               schedstat_set(p->se.statistics.sleep_start, 0);
+               schedstat_set(p->se.statistics.block_start, 0);
 
                if (!dl_task(p) && !rt_task(p)) {
                        /*