Merge branch 'timers-nohz-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b8f4876..c86935a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
-void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
-{
-       unsigned long delta;
-       ktime_t soft, hard, now;
-
-       for (;;) {
-               if (hrtimer_active(period_timer))
-                       break;
-
-               now = hrtimer_cb_get_time(period_timer);
-               hrtimer_forward(period_timer, now, period);
-
-               soft = hrtimer_get_softexpires(period_timer);
-               hard = hrtimer_get_expires(period_timer);
-               delta = ktime_to_ns(ktime_sub(hard, soft));
-               __hrtimer_start_range_ns(period_timer, soft, delta,
-                                        HRTIMER_MODE_ABS_PINNED, 0);
-       }
-}
-
 DEFINE_MUTEX(sched_domains_mutex);
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
@@ -355,12 +335,11 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 
 #ifdef CONFIG_SMP
 
-static int __hrtick_restart(struct rq *rq)
+static void __hrtick_restart(struct rq *rq)
 {
        struct hrtimer *timer = &rq->hrtick_timer;
-       ktime_t time = hrtimer_get_softexpires(timer);
 
-       return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+       hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
 }
 
 /*
@@ -440,8 +419,8 @@ void hrtick_start(struct rq *rq, u64 delay)
         * doesn't make sense. Rely on vruntime for fairness.
         */
        delay = max_t(u64, delay, 10000LL);
-       __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
-                       HRTIMER_MODE_REL_PINNED, 0);
+       hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
+                     HRTIMER_MODE_REL_PINNED);
 }
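
A quick worked example of the clamp above, with a hypothetical requested delay:

        /*
         * e.g. a caller asks for delay = 3000 ns:
         *      max_t(u64, 3000, 10000LL) = 10000 ns
         * so the pinned, relative hrtimer never fires sooner than 10 us out.
         */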
 
 static inline void init_hrtick(void)
@@ -511,7 +490,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
 static bool set_nr_if_polling(struct task_struct *p)
 {
        struct thread_info *ti = task_thread_info(p);
-       typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+       typeof(ti->flags) old, val = READ_ONCE(ti->flags);
 
        for (;;) {
                if (!(val & _TIF_POLLING_NRFLAG))
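
The hunk is cut off inside the loop; the remainder of set_nr_if_polling() presumably follows the usual cmpxchg() retry idiom, roughly as sketched here (the exact bail-out order is an assumption):

                        return false;   /* not polling: caller must send an IPI */
                if (val & _TIF_NEED_RESCHED)
                        return true;    /* already marked, nothing more to do */
                old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
                if (old == val)
                        break;          /* we set TIF_NEED_RESCHED ourselves */
                val = old;              /* flags changed under us: retry */
        }
        return true;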
@@ -541,6 +520,52 @@ static bool set_nr_if_polling(struct task_struct *p)
 #endif
 #endif
 
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+       struct wake_q_node *node = &task->wake_q;
+
+       /*
+        * Atomically grab the task, if ->wake_q is !nil already it means
+        * it's already queued (either by us or someone else) and will get the
+        * wakeup due to that.
+        *
+        * This cmpxchg() implies a full barrier, which pairs with the write
+        * barrier implied by the wakeup in wake_up_q().
+        */
+       if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+               return;
+
+       get_task_struct(task);
+
+       /*
+        * The head is context local, there can be no concurrency.
+        */
+       *head->lastp = node;
+       head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+       struct wake_q_node *node = head->first;
+
+       while (node != WAKE_Q_TAIL) {
+               struct task_struct *task;
+
+               task = container_of(node, struct task_struct, wake_q);
+               BUG_ON(!task);
+               /* task can safely be re-inserted now */
+               node = node->next;
+               task->wake_q.next = NULL;
+
+               /*
+                * wake_up_process() implies a wmb() to pair with the queueing
+                * in wake_q_add() so as not to miss wakeups.
+                */
+               wake_up_process(task);
+               put_task_struct(task);
+       }
+}
+
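
A minimal usage sketch for the new wake-queue API: queue tasks while holding a spinlock, then issue the wakeups after dropping it. The WAKE_Q() on-stack initializer is assumed to exist alongside these helpers; the waiter structure, list and lock below are hypothetical.

/* Hypothetical waiter bookkeeping, for illustration only. */
struct my_waiter {
        struct list_head        list;
        struct task_struct      *task;
};

static void my_wake_all(struct list_head *waiters, spinlock_t *lock)
{
        WAKE_Q(wake_q);                 /* assumed WAKE_Q() initializer */
        struct my_waiter *w;

        spin_lock(lock);
        list_for_each_entry(w, waiters, list)
                wake_q_add(&wake_q, w->task);   /* takes a task reference */
        spin_unlock(lock);

        wake_up_q(&wake_q);             /* wakeups issued without the lock held */
}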
 /*
  * resched_curr - mark rq's current task 'to be rescheduled now'.
  *
@@ -593,13 +618,12 @@ void resched_cpu(int cpu)
  * selecting an idle cpu will add more delays to the timers than intended
  * (as that cpu's timer base may not be uptodate wrt jiffies etc).
  */
-int get_nohz_timer_target(int pinned)
+int get_nohz_timer_target(void)
 {
-       int cpu = smp_processor_id();
-       int i;
+       int i, cpu = smp_processor_id();
        struct sched_domain *sd;
 
-       if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+       if (!idle_cpu(cpu))
                return cpu;
 
        rcu_read_lock();
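
The rest of get_nohz_timer_target() falls outside the hunk; presumably it walks the scheduling domains looking for a non-idle CPU to host the timer, roughly:

        for_each_domain(cpu, sd) {
                for_each_cpu(i, sched_domain_span(sd)) {
                        if (!idle_cpu(i)) {
                                cpu = i;
                                goto unlock;
                        }
                }
        }
unlock:
        rcu_read_unlock();
        return cpu;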
@@ -1049,7 +1073,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                if (p->sched_class->migrate_task_rq)
                        p->sched_class->migrate_task_rq(p, new_cpu);
                p->se.nr_migrations++;
-               perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+               perf_event_task_migrate(p);
        }
 
        __set_task_cpu(p, new_cpu);
@@ -2105,12 +2129,15 @@ void wake_up_new_task(struct task_struct *p)
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 
+static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
+
 /**
  * preempt_notifier_register - tell me when current is being preempted & rescheduled
  * @notifier: notifier struct to register
  */
 void preempt_notifier_register(struct preempt_notifier *notifier)
 {
+       static_key_slow_inc(&preempt_notifier_key);
        hlist_add_head(&notifier->link, &current->preempt_notifiers);
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_register);
@@ -2119,15 +2146,16 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register);
  * preempt_notifier_unregister - no longer interested in preemption notifications
  * @notifier: notifier struct to unregister
  *
- * This is safe to call from within a preemption notifier.
+ * This is *not* safe to call from within a preemption notifier.
  */
 void preempt_notifier_unregister(struct preempt_notifier *notifier)
 {
        hlist_del(&notifier->link);
+       static_key_slow_dec(&preempt_notifier_key);
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
 
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
        struct preempt_notifier *notifier;
 
@@ -2135,9 +2163,15 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
                notifier->ops->sched_in(notifier, raw_smp_processor_id());
 }
 
+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+       if (static_key_false(&preempt_notifier_key))
+               __fire_sched_in_preempt_notifiers(curr);
+}
+
 static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
-                                struct task_struct *next)
+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
+                                  struct task_struct *next)
 {
        struct preempt_notifier *notifier;
 
@@ -2145,13 +2179,21 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
                notifier->ops->sched_out(notifier, next);
 }
 
+static __always_inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+                                struct task_struct *next)
+{
+       if (static_key_false(&preempt_notifier_key))
+               __fire_sched_out_preempt_notifiers(curr, next);
+}
+
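
For context, a sketch of how a client of the preempt-notifier API might hook in (KVM is the main in-tree user); the callback names and the my_ctx container below are hypothetical:

static void my_sched_in(struct preempt_notifier *pn, int cpu)
{
        /* current is being scheduled back in on @cpu */
}

static void my_sched_out(struct preempt_notifier *pn, struct task_struct *next)
{
        /* current is being preempted in favour of @next */
}

static struct preempt_ops my_preempt_ops = {
        .sched_in       = my_sched_in,
        .sched_out      = my_sched_out,
};

/* Hypothetical per-task context embedding the notifier. */
struct my_ctx {
        struct preempt_notifier notifier;
};

static void my_ctx_attach(struct my_ctx *ctx)
{
        preempt_notifier_init(&ctx->notifier, &my_preempt_ops);
        preempt_notifier_register(&ctx->notifier);      /* bumps the static key */
}

static void my_ctx_detach(struct my_ctx *ctx)
{
        /* per the kerneldoc above: never call this from inside a callback */
        preempt_notifier_unregister(&ctx->notifier);
}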
 #else /* !CONFIG_PREEMPT_NOTIFIERS */
 
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
 }
 
-static void
+static inline void
 fire_sched_out_preempt_notifiers(struct task_struct *curr,
                                 struct task_struct *next)
 {
@@ -2396,9 +2438,9 @@ unsigned long nr_iowait_cpu(int cpu)
 
 void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
 {
-       struct rq *this = this_rq();
-       *nr_waiters = atomic_read(&this->nr_iowait);
-       *load = this->cpu_load[0];
+       struct rq *rq = this_rq();
+       *nr_waiters = atomic_read(&rq->nr_iowait);
+       *load = rq->load.weight;
 }
 
 #ifdef CONFIG_SMP
@@ -2496,6 +2538,7 @@ void scheduler_tick(void)
        update_rq_clock(rq);
        curr->sched_class->task_tick(rq, curr, 0);
        update_cpu_load_active(rq);
+       calc_global_load_tick(rq);
        raw_spin_unlock(&rq->lock);
 
        perf_event_task_tick();
@@ -2524,7 +2567,7 @@ void scheduler_tick(void)
 u64 scheduler_tick_max_deferment(void)
 {
        struct rq *rq = this_rq();
-       unsigned long next, now = ACCESS_ONCE(jiffies);
+       unsigned long next, now = READ_ONCE(jiffies);
 
        next = rq->last_sched_tick + HZ;
 
@@ -2725,9 +2768,7 @@ again:
  *          - return from syscall or exception to user-space
  *          - return from interrupt-handler to user-space
  *
- * WARNING: all callers must re-check need_resched() afterward and reschedule
- * accordingly in case an event triggered the need for rescheduling (such as
- * an interrupt waking up a task) while preemption was disabled in __schedule().
+ * WARNING: must be called with preemption disabled!
  */
 static void __sched __schedule(void)
 {
@@ -2736,7 +2777,6 @@ static void __sched __schedule(void)
        struct rq *rq;
        int cpu;
 
-       preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_note_context_switch();
@@ -2800,8 +2840,6 @@ static void __sched __schedule(void)
                raw_spin_unlock_irq(&rq->lock);
 
        post_schedule(rq);
-
-       sched_preempt_enable_no_resched();
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -2822,7 +2860,9 @@ asmlinkage __visible void __sched schedule(void)
 
        sched_submit_work(tsk);
        do {
+               preempt_disable();
                __schedule();
+               sched_preempt_enable_no_resched();
        } while (need_resched());
 }
 EXPORT_SYMBOL(schedule);
@@ -2861,15 +2901,14 @@ void __sched schedule_preempt_disabled(void)
 static void __sched notrace preempt_schedule_common(void)
 {
        do {
-               __preempt_count_add(PREEMPT_ACTIVE);
+               preempt_active_enter();
                __schedule();
-               __preempt_count_sub(PREEMPT_ACTIVE);
+               preempt_active_exit();
 
                /*
                 * Check again in case we missed a preemption opportunity
                 * between schedule and now.
                 */
-               barrier();
        } while (need_resched());
 }
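
preempt_active_enter()/preempt_active_exit() are defined elsewhere (include/linux/preempt.h); judging from the open-coded raw variant in preempt_schedule_notrace() further down, they presumably expand to something like this sketch (the exact definitions are an assumption):

#define preempt_active_enter() \
do { \
        preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
        barrier(); \
} while (0)

#define preempt_active_exit() \
do { \
        barrier(); \
        preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
} while (0)

The notrace path below sticks to the raw __preempt_count ops so that nothing traceable runs before preemption is actually disabled.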
 
@@ -2893,9 +2932,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
 NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
 
-#ifdef CONFIG_CONTEXT_TRACKING
 /**
- * preempt_schedule_context - preempt_schedule called by tracing
+ * preempt_schedule_notrace - preempt_schedule called by tracing
  *
  * The tracing infrastructure uses preempt_enable_notrace to prevent
  * recursion and tracing preempt enabling caused by the tracing
@@ -2908,7 +2946,7 @@ EXPORT_SYMBOL(preempt_schedule);
  * instead of preempt_schedule() to exit user context if needed before
  * calling the scheduler.
  */
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 {
        enum ctx_state prev_ctx;
 
@@ -2916,7 +2954,13 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
                return;
 
        do {
-               __preempt_count_add(PREEMPT_ACTIVE);
+               /*
+                * Use raw __preempt_count() ops that don't call any functions.
+                * We can't call functions before preemption is disabled, since
+                * disabling preemption is what disarms preempt tracing recursion.
+                */
+               __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+               barrier();
                /*
                 * Needs preempt disabled in case user_exit() is traced
                 * and the tracer calls preempt_enable_notrace() causing
@@ -2926,12 +2970,11 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
                __schedule();
                exception_exit(prev_ctx);
 
-               __preempt_count_sub(PREEMPT_ACTIVE);
                barrier();
+               __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
        } while (need_resched());
 }
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_CONTEXT_TRACKING */
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
 
 #endif /* CONFIG_PREEMPT */
 
@@ -2951,17 +2994,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
        prev_state = exception_enter();
 
        do {
-               __preempt_count_add(PREEMPT_ACTIVE);
+               preempt_active_enter();
                local_irq_enable();
                __schedule();
                local_irq_disable();
-               __preempt_count_sub(PREEMPT_ACTIVE);
-
-               /*
-                * Check again in case we missed a preemption opportunity
-                * between schedule and now.
-                */
-               barrier();
+               preempt_active_exit();
        } while (need_resched());
 
        exception_exit(prev_state);
@@ -3039,7 +3076,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                if (!dl_prio(p->normal_prio) ||
                    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                        p->dl.dl_boosted = 1;
-                       p->dl.dl_throttled = 0;
                        enqueue_flag = ENQUEUE_REPLENISH;
                } else
                        p->dl.dl_boosted = 0;
@@ -3299,15 +3335,18 @@ static void __setscheduler_params(struct task_struct *p,
 
 /* Actually do priority change: must hold pi & rq lock. */
 static void __setscheduler(struct rq *rq, struct task_struct *p,
-                          const struct sched_attr *attr)
+                          const struct sched_attr *attr, bool keep_boost)
 {
        __setscheduler_params(p, attr);
 
        /*
-        * If we get here, there was no pi waiters boosting the
-        * task. It is safe to use the normal prio.
+        * Keep a potential priority boost if called from
+        * sched_setscheduler().
         */
-       p->prio = normal_prio(p);
+       if (keep_boost)
+               p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
+       else
+               p->prio = normal_prio(p);
 
        if (dl_prio(p->prio))
                p->sched_class = &dl_sched_class;
@@ -3407,7 +3446,7 @@ static int __sched_setscheduler(struct task_struct *p,
        int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
                      MAX_RT_PRIO - 1 - attr->sched_priority;
        int retval, oldprio, oldpolicy = -1, queued, running;
-       int policy = attr->sched_policy;
+       int new_effective_prio, policy = attr->sched_policy;
        unsigned long flags;
        const struct sched_class *prev_class;
        struct rq *rq;
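
A worked example of the newprio mapping above, assuming the usual MAX_RT_PRIO == 100 and MAX_DL_PRIO == 0 constants:

        /*
         * Illustration (assuming MAX_RT_PRIO == 100, MAX_DL_PRIO == 0):
         *   SCHED_FIFO, sched_priority = 10  ->  newprio = 100 - 1 - 10 = 89
         *   SCHED_FIFO, sched_priority = 99  ->  newprio = 100 - 1 - 99 = 0
         *   SCHED_DEADLINE                   ->  newprio = 0 - 1        = -1
         * Lower kernel prio means higher effective priority.
         */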
@@ -3589,15 +3628,14 @@ change:
        oldprio = p->prio;
 
        /*
-        * Special case for priority boosted tasks.
-        *
-        * If the new priority is lower or equal (user space view)
-        * than the current (boosted) priority, we just store the new
+        * Take priority boosted tasks into account. If the new
+        * effective priority is unchanged, we just store the new
         * normal parameters and do not touch the scheduler class and
         * the runqueue. This will be done when the task deboosts
         * itself.
         */
-       if (rt_mutex_check_prio(p, newprio)) {
+       new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+       if (new_effective_prio == oldprio) {
                __setscheduler_params(p, attr);
                task_rq_unlock(rq, p, &flags);
                return 0;
@@ -3611,7 +3649,7 @@ change:
                put_prev_task(rq, p);
 
        prev_class = p->sched_class;
-       __setscheduler(rq, p, attr);
+       __setscheduler(rq, p, attr, true);
 
        if (running)
                p->sched_class->set_curr_task(rq);
@@ -4386,10 +4424,7 @@ long __sched io_schedule_timeout(long timeout)
        long ret;
 
        current->in_iowait = 1;
-       if (old_iowait)
-               blk_schedule_flush_plug(current);
-       else
-               blk_flush_plug(current);
+       blk_schedule_flush_plug(current);
 
        delayacct_blkio_start();
        rq = raw_rq();
@@ -5314,7 +5349,7 @@ static struct notifier_block migration_notifier = {
        .priority = CPU_PRI_MIGRATION,
 };
 
-static void __cpuinit set_cpu_rq_start_time(void)
+static void set_cpu_rq_start_time(void)
 {
        int cpu = smp_processor_id();
        struct rq *rq = cpu_rq(cpu);
@@ -6996,27 +7031,23 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
        unsigned long flags;
        long cpu = (long)hcpu;
        struct dl_bw *dl_b;
+       bool overflow;
+       int cpus;
 
-       switch (action & ~CPU_TASKS_FROZEN) {
+       switch (action) {
        case CPU_DOWN_PREPARE:
-               /* explicitly allow suspend */
-               if (!(action & CPU_TASKS_FROZEN)) {
-                       bool overflow;
-                       int cpus;
-
-                       rcu_read_lock_sched();
-                       dl_b = dl_bw_of(cpu);
+               rcu_read_lock_sched();
+               dl_b = dl_bw_of(cpu);
 
-                       raw_spin_lock_irqsave(&dl_b->lock, flags);
-                       cpus = dl_bw_cpus(cpu);
-                       overflow = __dl_overflow(dl_b, cpus, 0, 0);
-                       raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+               raw_spin_lock_irqsave(&dl_b->lock, flags);
+               cpus = dl_bw_cpus(cpu);
+               overflow = __dl_overflow(dl_b, cpus, 0, 0);
+               raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 
-                       rcu_read_unlock_sched();
+               rcu_read_unlock_sched();
 
-                       if (overflow)
-                               return notifier_from_errno(-EBUSY);
-               }
+               if (overflow)
+                       return notifier_from_errno(-EBUSY);
                cpuset_update_active_cpus(false);
                break;
        case CPU_DOWN_PREPARE_FROZEN:
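
The admission test above leans on __dl_overflow() from kernel/sched/sched.h; a hedged sketch of its presumed shape, with no bandwidth being added or removed in this call:

/* Presumed shape of the helper (sketch, not copied from sched.h): */
static inline bool __dl_overflow(struct dl_bw *dl_b, int cpus,
                                 u64 old_bw, u64 new_bw)
{
        /* dl_b->bw == -1 means deadline admission control is disabled */
        return dl_b->bw != -1 &&
               dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}

With old_bw == new_bw == 0, the CPU_DOWN_PREPARE path refuses the hot-unplug with -EBUSY whenever the already-admitted deadline bandwidth exceeds dl_b->bw * cpus for this root domain.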
@@ -7075,8 +7106,6 @@ void __init sched_init_smp(void)
 }
 #endif /* CONFIG_SMP */
 
-const_debug unsigned int sysctl_timer_migration = 1;
-
 int in_sched_functions(unsigned long addr)
 {
        return in_lock_functions(addr) ||
@@ -7348,7 +7377,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
        queued = task_on_rq_queued(p);
        if (queued)
                dequeue_task(rq, p, 0);
-       __setscheduler(rq, p, &attr);
+       __setscheduler(rq, p, &attr, false);
        if (queued) {
                enqueue_task(rq, p, 0);
                resched_curr(rq);
@@ -7741,11 +7770,11 @@ static long sched_group_rt_runtime(struct task_group *tg)
        return rt_runtime_us;
 }
 
-static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
 {
        u64 rt_runtime, rt_period;
 
-       rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+       rt_period = rt_period_us * NSEC_PER_USEC;
        rt_runtime = tg->rt_bandwidth.rt_runtime;
 
        return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
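
A quick worked example of the unit conversion above, using the common 1 s period / 0.95 s runtime defaults (the default values are an assumption, not taken from this diff):

        /*
         * e.g. rt_period_us = 1000000:
         *      rt_period = 1000000 * NSEC_PER_USEC = 1000000000 ns (1 s)
         * with rt_runtime = 950000 us, RT tasks in the group may use at
         * most 0.95 s of CPU time in every 1 s period.
         */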
@@ -8112,10 +8141,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 
        __refill_cfs_bandwidth_runtime(cfs_b);
        /* restart the period timer (if active) to handle new period expiry */
-       if (runtime_enabled && cfs_b->timer_active) {
-               /* force a reprogram */
-               __start_cfs_bandwidth(cfs_b, true);
-       }
+       if (runtime_enabled)
+               start_cfs_bandwidth(cfs_b);
        raw_spin_unlock_irq(&cfs_b->lock);
 
        for_each_online_cpu(i) {