sched/fair: Make ilb_notifier an explicit call

[cascardo/linux.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 8b489fc..28ffd68 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1082,13 +1082,21 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  static int __set_cpus_allowed_ptr(struct task_struct *p,
                                   const struct cpumask *new_mask, bool check)
  {
+       const struct cpumask *cpu_valid_mask = cpu_active_mask;
+       unsigned int dest_cpu;
         unsigned long flags;
         struct rq *rq;
-       unsigned int dest_cpu;
         int ret = 0;
  
         rq = task_rq_lock(p, &flags);
  
+       if (p->flags & PF_KTHREAD) {
+               /*
+                * Kernel threads are allowed on online && !active CPUs
+                */
+               cpu_valid_mask = cpu_online_mask;
+       }
+
         /*
          * Must re-check here, to close a race against __kthread_bind(),
          * sched_setaffinity() is not guaranteed to observe the flag.
@@ -1101,18 +1109,28 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
         if (cpumask_equal(&p->cpus_allowed, new_mask))
                 goto out;
  
-       if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+       if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
                 ret = -EINVAL;
                 goto out;
         }
  
         do_set_cpus_allowed(p, new_mask);
  
+       if (p->flags & PF_KTHREAD) {
+               /*
+                * For kernel threads that do indeed end up on online && !active
+                * CPUs, we want to ensure they are strict per-cpu threads.
+                */
+               WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
+                       !cpumask_intersects(new_mask, cpu_active_mask) &&
+                       p->nr_cpus_allowed != 1);
+       }
+
         /* Can the task run on the task's current CPU? If so, we're done */
         if (cpumask_test_cpu(task_cpu(p), new_mask))
                 goto out;
  
-       dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+       dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
         if (task_running(rq, p) || p->state == TASK_WAKING) {
                 struct migration_arg arg = { p, dest_cpu };
                 /* Need help from migration thread: drop lock and wait. */
@@ -1431,6 +1449,25 @@ EXPORT_SYMBOL_GPL(kick_process);
  
  /*
   * ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ *
+ * A few notes on cpu_active vs cpu_online:
+ *
+ *  - cpu_active must be a subset of cpu_online
+ *
+ *  - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
+ *    see __set_cpus_allowed_ptr(). At this point the newly online
+ *    cpu isn't yet part of the sched domains, and balancing will not
+ *    see it.
+ *
+ *  - on cpu-down we clear cpu_active() to mask the sched domains and
+ *    prevent the load balancer from placing new tasks on the
+ *    to-be-removed cpu. Existing tasks will remain running there and
+ *    will be taken off.
+ *
+ * This means that fallback selection must not select !active CPUs,
+ * and can assume that any active CPU must be online. Conversely,
+ * select_task_rq() below may allow selection of !active CPUs in order
+ * to satisfy the above rules.
   */
  static int select_fallback_rq(int cpu, struct task_struct *p)
  {
@@ -1449,8 +1486,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
  
                 /* Look for allowed, online CPU in same node. */
                 for_each_cpu(dest_cpu, nodemask) {
-                       if (!cpu_online(dest_cpu))
-                               continue;
                         if (!cpu_active(dest_cpu))
                                 continue;
                         if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
@@ -1461,8 +1496,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
         for (;;) {
                 /* Any allowed, online CPU? */
                 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
-                       if (!cpu_online(dest_cpu))
-                               continue;
                         if (!cpu_active(dest_cpu))
                                 continue;
                         goto out;
@@ -1514,6 +1547,8 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
  
         if (p->nr_cpus_allowed > 1)
                 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+       else
+               cpu = cpumask_any(tsk_cpus_allowed(p));
  
         /*
          * In order not to call set_task_cpu() on a blocking task we need
@@ -5160,6 +5195,8 @@ out:
  
  #ifdef CONFIG_SMP
  
+static bool sched_smp_initialized __read_mostly;
+
  #ifdef CONFIG_NUMA_BALANCING
  /* Migrate current task p to target_cpu */
  int migrate_task_to(struct task_struct *p, int target_cpu)
@@ -5374,127 +5411,13 @@ static void set_rq_offline(struct rq *rq)
         }
  }
  
-/*
- * migration_call - callback that gets triggered when a CPU is added.
- * Here we can start up the necessary migration thread for the new CPU.
- */
-static int
-migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
+static void set_cpu_rq_start_time(unsigned int cpu)
  {
-       int cpu = (long)hcpu;
-       unsigned long flags;
         struct rq *rq = cpu_rq(cpu);
  
-       switch (action & ~CPU_TASKS_FROZEN) {
-
-       case CPU_UP_PREPARE:
-               rq->calc_load_update = calc_load_update;
-               account_reset_rq(rq);
-               break;
-
-       case CPU_ONLINE:
-               /* Update our root-domain */
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               if (rq->rd) {
-                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-
-                       set_rq_online(rq);
-               }
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
-               break;
-
-#ifdef CONFIG_HOTPLUG_CPU
-       case CPU_DYING:
-               sched_ttwu_pending();
-               /* Update our root-domain */
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               if (rq->rd) {
-                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-                       set_rq_offline(rq);
-               }
-               migrate_tasks(rq);
-               BUG_ON(rq->nr_running != 1); /* the migration thread */
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
-               break;
-
-       case CPU_DEAD:
-               calc_load_migrate(rq);
-               break;
-#endif
-       }
-
-       update_max_interval();
-
-       return NOTIFY_OK;
-}
-
-/*
- * Register at high priority so that task migration (migrate_all_tasks)
- * happens before everything else.  This has to be lower priority than
- * the notifier in the perf_event subsystem, though.
- */
-static struct notifier_block migration_notifier = {
-       .notifier_call = migration_call,
-       .priority = CPU_PRI_MIGRATION,
-};
-
-static void set_cpu_rq_start_time(void)
-{
-       int cpu = smp_processor_id();
-       struct rq *rq = cpu_rq(cpu);
         rq->age_stamp = sched_clock_cpu(cpu);
  }
  
-static int sched_cpu_active(struct notifier_block *nfb,
-                                     unsigned long action, void *hcpu)
-{
-       int cpu = (long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_STARTING:
-               set_cpu_rq_start_time();
-               return NOTIFY_OK;
-
-       case CPU_DOWN_FAILED:
-               set_cpu_active(cpu, true);
-               return NOTIFY_OK;
-
-       default:
-               return NOTIFY_DONE;
-       }
-}
-
-static int sched_cpu_inactive(struct notifier_block *nfb,
-                                       unsigned long action, void *hcpu)
-{
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_DOWN_PREPARE:
-               set_cpu_active((long)hcpu, false);
-               return NOTIFY_OK;
-       default:
-               return NOTIFY_DONE;
-       }
-}
-
-static int __init migration_init(void)
-{
-       void *cpu = (void *)(long)smp_processor_id();
-       int err;
-
-       /* Initialize migration for the boot CPU */
-       err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
-       BUG_ON(err == NOTIFY_BAD);
-       migration_call(&migration_notifier, CPU_ONLINE, cpu);
-       register_cpu_notifier(&migration_notifier);
-
-       /* Register cpu active notifiers */
-       cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
-       cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
-
-       return 0;
-}
-early_initcall(migration_init);
-
  static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
  
  #ifdef CONFIG_SCHED_DEBUG
@@ -6642,10 +6565,10 @@ static void sched_init_numa(void)
         init_numa_topology_type();
  }
  
-static void sched_domains_numa_masks_set(int cpu)
+static void sched_domains_numa_masks_set(unsigned int cpu)
  {
-       int i, j;
         int node = cpu_to_node(cpu);
+       int i, j;
  
         for (i = 0; i < sched_domains_numa_levels; i++) {
                 for (j = 0; j < nr_node_ids; j++) {
@@ -6655,51 +6578,20 @@ static void sched_domains_numa_masks_set(int cpu)
         }
  }
  
-static void sched_domains_numa_masks_clear(int cpu)
+static void sched_domains_numa_masks_clear(unsigned int cpu)
  {
         int i, j;
+
         for (i = 0; i < sched_domains_numa_levels; i++) {
                 for (j = 0; j < nr_node_ids; j++)
                         cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
         }
  }
  
-/*
- * Update sched_domains_numa_masks[level][node] array when new cpus
- * are onlined.
- */
-static int sched_domains_numa_masks_update(struct notifier_block *nfb,
-                                          unsigned long action,
-                                          void *hcpu)
-{
-       int cpu = (long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_ONLINE:
-               sched_domains_numa_masks_set(cpu);
-               break;
-
-       case CPU_DEAD:
-               sched_domains_numa_masks_clear(cpu);
-               break;
-
-       default:
-               return NOTIFY_DONE;
-       }
-
-       return NOTIFY_OK;
-}
  #else
-static inline void sched_init_numa(void)
-{
-}
-
-static int sched_domains_numa_masks_update(struct notifier_block *nfb,
-                                          unsigned long action,
-                                          void *hcpu)
-{
-       return 0;
-}
+static inline void sched_init_numa(void) { }
+static void sched_domains_numa_masks_set(unsigned int cpu) { }
+static void sched_domains_numa_masks_clear(unsigned int cpu) { }
  #endif /* CONFIG_NUMA */
  
  static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -7089,13 +6981,9 @@ static int num_cpus_frozen;      /* used to mark begin/end of suspend/resume */
   * If we come here as part of a suspend/resume, don't touch cpusets because we
   * want to restore it back to its original state upon resume anyway.
   */
-static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
-                            void *hcpu)
+static void cpuset_cpu_active(void)
  {
-       switch (action) {
-       case CPU_ONLINE_FROZEN:
-       case CPU_DOWN_FAILED_FROZEN:
-
+       if (cpuhp_tasks_frozen) {
                 /*
                  * num_cpus_frozen tracks how many CPUs are involved in suspend
                  * resume sequence. As long as this is not the last online
@@ -7105,35 +6993,25 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
                 num_cpus_frozen--;
                 if (likely(num_cpus_frozen)) {
                         partition_sched_domains(1, NULL, NULL);
-                       break;
+                       return;
                 }
-
                 /*
                  * This is the last CPU online operation. So fall through and
                  * restore the original sched domains by considering the
                  * cpuset configurations.
                  */
-
-       case CPU_ONLINE:
-               cpuset_update_active_cpus(true);
-               break;
-       default:
-               return NOTIFY_DONE;
         }
-       return NOTIFY_OK;
+       cpuset_update_active_cpus(true);
  }
  
-static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
-                              void *hcpu)
+static int cpuset_cpu_inactive(unsigned int cpu)
  {
         unsigned long flags;
-       long cpu = (long)hcpu;
         struct dl_bw *dl_b;
         bool overflow;
         int cpus;
  
-       switch (action) {
-       case CPU_DOWN_PREPARE:
+       if (!cpuhp_tasks_frozen) {
                 rcu_read_lock_sched();
                 dl_b = dl_bw_of(cpu);
  
@@ -7145,19 +7023,119 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
                 rcu_read_unlock_sched();
  
                 if (overflow)
-                       return notifier_from_errno(-EBUSY);
+                       return -EBUSY;
                 cpuset_update_active_cpus(false);
-               break;
-       case CPU_DOWN_PREPARE_FROZEN:
+       } else {
                 num_cpus_frozen++;
                 partition_sched_domains(1, NULL, NULL);
-               break;
-       default:
-               return NOTIFY_DONE;
         }
-       return NOTIFY_OK;
+       return 0;
+}
+
+int sched_cpu_activate(unsigned int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+
+       set_cpu_active(cpu, true);
+
+       if (sched_smp_initialized) {
+               sched_domains_numa_masks_set(cpu);
+               cpuset_cpu_active();
+       }
+
+       /*
+        * Put the rq online, if not already. This happens:
+        *
+        * 1) In the early boot process, because we build the real domains
+        *    after all cpus have been brought up.
+        *
+        * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
+        *    domains.
+        */
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       if (rq->rd) {
+               BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+               set_rq_online(rq);
+       }
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+       update_max_interval();
+
+       return 0;
  }
  
+int sched_cpu_deactivate(unsigned int cpu)
+{
+       int ret;
+
+       set_cpu_active(cpu, false);
+       /*
+        * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
+        * users of this state to go away such that all new such users will
+        * observe it.
+        *
+        * For CONFIG_PREEMPT we have preemptible RCU and its synchronize_rcu()
+        * might not imply synchronize_sched(), so wait for both.
+        *
+        * Sync before parking the smpboot threads to handle the RCU boost case.
+        */
+       if (IS_ENABLED(CONFIG_PREEMPT))
+               synchronize_rcu_mult(call_rcu, call_rcu_sched);
+       else
+               synchronize_rcu();
+
+       if (!sched_smp_initialized)
+               return 0;
+
+       ret = cpuset_cpu_inactive(cpu);
+       if (ret) {
+               set_cpu_active(cpu, true);
+               return ret;
+       }
+       sched_domains_numa_masks_clear(cpu);
+       return 0;
+}
+
+static void sched_rq_cpu_starting(unsigned int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+
+       rq->calc_load_update = calc_load_update;
+       account_reset_rq(rq);
+       update_max_interval();
+}
+
+int sched_cpu_starting(unsigned int cpu)
+{
+       set_cpu_rq_start_time(cpu);
+       sched_rq_cpu_starting(cpu);
+       return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+int sched_cpu_dying(unsigned int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+
+       /* Handle pending wakeups and then migrate everything off */
+       sched_ttwu_pending();
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       if (rq->rd) {
+               BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+               set_rq_offline(rq);
+       }
+       migrate_tasks(rq);
+       BUG_ON(rq->nr_running != 1);
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       calc_load_migrate(rq);
+       update_max_interval();
+       nohz_balance_exit_idle(cpu);
+       return 0;
+}
+#endif
+
  void __init sched_init_smp(void)
  {
         cpumask_var_t non_isolated_cpus;
@@ -7179,10 +7157,6 @@ void __init sched_init_smp(void)
                 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
         mutex_unlock(&sched_domains_mutex);
  
-       hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
-       hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
-       hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
-
         init_hrtick();
  
         /* Move init over to a non-isolated CPU */
@@ -7193,7 +7167,16 @@ void __init sched_init_smp(void)
  
         init_sched_rt_class();
         init_sched_dl_class();
+       sched_smp_initialized = true;
  }
+
+static int __init migration_init(void)
+{
+       sched_rq_cpu_starting(smp_processor_id());
+       return 0;
+}
+early_initcall(migration_init);
+
  #else
  void __init sched_init_smp(void)
  {
@@ -7391,7 +7374,7 @@ void __init sched_init(void)
         if (cpu_isolated_map == NULL)
                 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
         idle_thread_set_boot_cpu();
-       set_cpu_rq_start_time();
+       set_cpu_rq_start_time(smp_processor_id());
  #endif
         init_sched_fair_class();