Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df77c60..c7395d9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
        update_sysctl();
 }
 
-#if BITS_PER_LONG == 32
-# define WMULT_CONST   (~0UL)
-#else
-# define WMULT_CONST   (1UL << 32)
-#endif
-
+#define WMULT_CONST    (~0U)
 #define WMULT_SHIFT    32
 
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+static void __update_inv_weight(struct load_weight *lw)
+{
+       unsigned long w;
+
+       if (likely(lw->inv_weight))
+               return;
+
+       w = scale_load_down(lw->weight);
+
+       if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+               lw->inv_weight = 1;
+       else if (unlikely(!w))
+               lw->inv_weight = WMULT_CONST;
+       else
+               lw->inv_weight = WMULT_CONST / w;
+}
 
 /*
- * delta *= weight / lw
+ * delta_exec * weight / lw.weight
+ *   OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw ∈ prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
  */
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
-               struct load_weight *lw)
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 {
-       u64 tmp;
-
-       /*
-        * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
-        * entities since MIN_SHARES = 2. Treat weight as 1 if less than
-        * 2^SCHED_LOAD_RESOLUTION.
-        */
-       if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
-               tmp = (u64)delta_exec * scale_load_down(weight);
-       else
-               tmp = (u64)delta_exec;
+       u64 fact = scale_load_down(weight);
+       int shift = WMULT_SHIFT;
 
-       if (!lw->inv_weight) {
-               unsigned long w = scale_load_down(lw->weight);
+       __update_inv_weight(lw);
 
-               if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
-                       lw->inv_weight = 1;
-               else if (unlikely(!w))
-                       lw->inv_weight = WMULT_CONST;
-               else
-                       lw->inv_weight = WMULT_CONST / w;
+       if (unlikely(fact >> 32)) {
+               while (fact >> 32) {
+                       fact >>= 1;
+                       shift--;
+               }
        }
 
-       /*
-        * Check whether we'd overflow the 64-bit multiplication:
-        */
-       if (unlikely(tmp > WMULT_CONST))
-               tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
-                       WMULT_SHIFT/2);
-       else
-               tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+       /* hint to use a 32x32->64 mul */
+       fact = (u64)(u32)fact * lw->inv_weight;
+
+       while (fact >> 32) {
+               fact >>= 1;
+               shift--;
+       }
 
-       return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+       return mul_u64_u32_shr(delta_exec, fact, shift);
 }
 
 
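The rework above replaces the old SRR() double-rounding with a single multiply by a precomputed reciprocal: since inv_weight = 2^32 / lw->weight, (x * inv_weight) >> 32 approximates x / lw->weight, and the shift-reduction loops only kick in when an intermediate product would spill past 32 bits. A minimal userspace sketch of the same trick, with a 128-bit stand-in for the kernel's mul_u64_u32_shr() (the values are illustrative, and __int128 assumes a 64-bit GCC/Clang):

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's mul_u64_u32_shr():
 * (a * mul) >> shift with a 128-bit intermediate, so the
 * 64x32 product cannot overflow. */
static uint64_t mul_u64_u32_shr(uint64_t a, uint32_t mul, unsigned int shift)
{
	return (uint64_t)(((unsigned __int128)a * mul) >> shift);
}

int main(void)
{
	uint64_t delta_exec = 6000000;	/* ns run by the current task */
	uint32_t weight = 1024;		/* NICE_0_LOAD at unit resolution */
	uint32_t rq_weight = 3072;	/* e.g. three nice-0 tasks queued */

	/* precomputed reciprocal, as in __update_inv_weight() */
	uint32_t inv_weight = ~0U / rq_weight;

	uint64_t fact = (uint64_t)weight * inv_weight;
	int shift = 32;
	while (fact >> 32) {		/* keep fact within 32 bits */
		fact >>= 1;
		shift--;
	}

	/* delta_exec * weight / rq_weight, without a 64-bit division */
	printf("approx=%llu exact=%llu\n",
	       (unsigned long long)mul_u64_u32_shr(delta_exec, (uint32_t)fact, shift),
	       (unsigned long long)(delta_exec * weight / rq_weight));
	return 0;
}

This prints approx=1999999 exact=2000000: the truncated reciprocal undershoots by at most a few nanoseconds per tick, which vruntime accounting tolerates.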
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 /*
  * delta /= w
  */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 {
        if (unlikely(se->load.weight != NICE_0_LOAD))
-               delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+               delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 
        return delta;
 }
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
                        update_load_add(&lw, se->load.weight);
                        load = &lw;
                }
-               slice = calc_delta_mine(slice, se->load.weight, load);
+               slice = __calc_delta(slice, se->load.weight, load);
        }
        return slice;
 }
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
 #endif
 
 /*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
+ * Update the current task's runtime statistics.
  */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
-             unsigned long delta_exec)
-{
-       unsigned long delta_exec_weighted;
-
-       schedstat_set(curr->statistics.exec_max,
-                     max((u64)delta_exec, curr->statistics.exec_max));
-
-       curr->sum_exec_runtime += delta_exec;
-       schedstat_add(cfs_rq, exec_clock, delta_exec);
-       delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-
-       curr->vruntime += delta_exec_weighted;
-       update_min_vruntime(cfs_rq);
-}
-
 static void update_curr(struct cfs_rq *cfs_rq)
 {
        struct sched_entity *curr = cfs_rq->curr;
        u64 now = rq_clock_task(rq_of(cfs_rq));
-       unsigned long delta_exec;
+       u64 delta_exec;
 
        if (unlikely(!curr))
                return;
 
-       /*
-        * Get the amount of time the current task was running
-        * since the last time we changed load (this cannot
-        * overflow on 32 bits):
-        */
-       delta_exec = (unsigned long)(now - curr->exec_start);
-       if (!delta_exec)
+       delta_exec = now - curr->exec_start;
+       if (unlikely((s64)delta_exec <= 0))
                return;
 
-       __update_curr(cfs_rq, curr, delta_exec);
        curr->exec_start = now;
 
+       schedstat_set(curr->statistics.exec_max,
+                     max(delta_exec, curr->statistics.exec_max));
+
+       curr->sum_exec_runtime += delta_exec;
+       schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+       curr->vruntime += calc_delta_fair(delta_exec, curr);
+       update_min_vruntime(cfs_rq);
+
        if (entity_is_task(curr)) {
                struct task_struct *curtask = task_of(curr);
 
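One subtlety in the reworked update_curr(): now and exec_start are both u64, so if the task clock is ever observed going backwards the subtraction wraps to an enormous unsigned value. The (s64) cast reinterprets that wrap as a negative number and the early return drops the sample instead of crediting the task with eons of runtime; the removed code only bailed on a delta of exactly zero. A tiny sketch of the pattern, with made-up timestamps:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* hypothetical clock reads: the second one is 50ns earlier */
	uint64_t exec_start = 1000050;
	uint64_t now = 1000000;

	uint64_t delta_exec = now - exec_start;	/* wraps to 2^64 - 50 */

	if ((int64_t)delta_exec <= 0)
		printf("clock warp detected, sample dropped\n");
	else
		printf("delta_exec=%llu\n", (unsigned long long)delta_exec);
	return 0;
}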
@@ -1000,7 +986,7 @@ struct numa_stats {
  */
 static void update_numa_stats(struct numa_stats *ns, int nid)
 {
-       int cpu;
+       int cpu, cpus = 0;
 
        memset(ns, 0, sizeof(*ns));
        for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1009,8 +995,21 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
                ns->nr_running += rq->nr_running;
                ns->load += weighted_cpuload(cpu);
                ns->power += power_of(cpu);
+
+               cpus++;
        }
 
+       /*
+        * If we raced with hotplug and there are no CPUs left in our mask,
+        * the @ns structure is left zeroed and task_numa_compare() will
+        * not find this node attractive.
+        *
+        * We'll either bail at !has_capacity, or we'll detect a huge imbalance
+        * and bail there.
+        */
+       if (!cpus)
+               return;
+
        ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
        ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
        ns->has_capacity = (ns->nr_running < ns->capacity);
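For a sense of the units here: with the default SCHED_POWER_SCALE of 1024 and, say, four full-power CPUs in the node, ns->power sums to 4096, ns->capacity rounds to 4, and has_capacity holds while fewer than four tasks are running; the load line renormalizes the summed weighted load back to the same 1024-per-CPU scale.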
@@ -1201,9 +1200,21 @@ static int task_numa_migrate(struct task_struct *p)
         */
        rcu_read_lock();
        sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
-       env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+       if (sd)
+               env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
        rcu_read_unlock();
 
+       /*
+        * Cpusets can break the scheduler domain tree into smaller
+        * balance domains, some of which do not cross NUMA boundaries.
+        * Tasks that are "trapped" in such domains cannot be migrated
+        * elsewhere, so there is no point in (re)trying.
+        */
+       if (unlikely(!sd)) {
+               p->numa_preferred_nid = cpu_to_node(task_cpu(p));
+               return -EINVAL;
+       }
+
        taskweight = task_weight(p, env.src_nid);
        groupweight = group_weight(p, env.src_nid);
        update_numa_stats(&env.src_stats, env.src_nid);
@@ -1727,6 +1738,13 @@ void task_numa_work(struct callback_head *work)
                    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
                        continue;
 
+               /*
+                * Skip inaccessible VMAs to avoid any confusion between
+                * PROT_NONE and NUMA hinting ptes
+                */
+               if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+                       continue;
+
                do {
                        start = max(start, vma->vm_start);
                        end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
@@ -2153,7 +2171,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
        long contrib;
 
        /* The fraction of a cpu used by this cfs_rq */
-       contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
+       contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
                          sa->runnable_avg_period + 1);
        contrib -= cfs_rq->tg_runnable_contrib;
 
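The one-character fix above is a classic C truncation bug: runnable_avg_sum is a 32-bit field, so runnable_avg_sum << NICE_0_SHIFT is evaluated in 32-bit arithmetic and the high bits are lost before div_u64() ever sees the value. A standalone illustration, assuming NICE_0_SHIFT is 20 (i.e. with increased load resolution) and a runnable_avg_sum near its ~47742 ceiling:

#include <stdint.h>
#include <stdio.h>

#define NICE_0_SHIFT 20	/* assumed: 10 + a load resolution of 10 */

int main(void)
{
	uint32_t runnable_avg_sum = 47742;	/* near the series' upper bound */

	/* shift performed in 32-bit arithmetic: wraps modulo 2^32 */
	uint64_t truncated = runnable_avg_sum << NICE_0_SHIFT;

	/* widen first, then shift: all 36 bits survive */
	uint64_t widened = (uint64_t)runnable_avg_sum << NICE_0_SHIFT;

	printf("truncated=%llu widened=%llu\n",
	       (unsigned long long)truncated,
	       (unsigned long long)widened);
	return 0;
}

Here truncated comes out as 2816475136, while widened is the intended 50061115392.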
@@ -2990,8 +3008,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
        }
 }
 
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                    unsigned long delta_exec)
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
        /* dock delta_exec before expiring quota (as it could span periods) */
        cfs_rq->runtime_remaining -= delta_exec;
@@ -3009,7 +3026,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 }
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
        if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
                return;
@@ -3549,8 +3566,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
        return rq_clock_task(rq_of(cfs_rq));
 }
 
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                    unsigned long delta_exec) {}
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -5354,10 +5370,31 @@ void update_group_power(struct sched_domain *sd, int cpu)
                 */
 
                for_each_cpu(cpu, sched_group_cpus(sdg)) {
-                       struct sched_group *sg = cpu_rq(cpu)->sd->groups;
+                       struct sched_group_power *sgp;
+                       struct rq *rq = cpu_rq(cpu);
+
+                       /*
+                        * build_sched_domains() -> init_sched_groups_power()
+                        * gets here before we've attached the domains to the
+                        * runqueues.
+                        *
+                        * Use power_of(), which is set irrespective of domains
+                        * in update_cpu_power().
+                        *
+                        * This prevents power/power_orig from being 0 and
+                        * causing divide-by-zero issues on boot.
+                        *
+                        * Runtime updates will correct power_orig.
+                        */
+                       if (unlikely(!rq->sd)) {
+                               power_orig += power_of(cpu);
+                               power += power_of(cpu);
+                               continue;
+                       }
 
-                       power_orig += sg->sgp->power_orig;
-                       power += sg->sgp->power;
+                       sgp = rq->sd->groups->sgp;
+                       power_orig += sgp->power_orig;
+                       power += sgp->power;
                }
        } else {
                /*