timers: Switch to a non-cascading wheel

[cascardo/linux.git] / kernel / time / timer.c
diff --git a/kernel/time/timer.c b/kernel/time/timer.c

index f259a3e..86e95b7 100644 (file)
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -59,43 +59,151 @@ __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
  EXPORT_SYMBOL(jiffies_64);
  
  /*
- * per-CPU timer vector definitions:
+ * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
+ * LVL_SIZE buckets. Each level is driven by its own clock and therefor each
+ * level has a different granularity.
+ *
+ * The level granularity is:           LVL_CLK_DIV ^ lvl
+ * The level clock frequency is:       HZ / (LVL_CLK_DIV ^ level)
+ *
+ * The array level of a newly armed timer depends on the relative expiry
+ * time. The farther the expiry time is away the higher the array level and
+ * therefor the granularity becomes.
+ *
+ * Contrary to the original timer wheel implementation, which aims for 'exact'
+ * expiry of the timers, this implementation removes the need for recascading
+ * the timers into the lower array levels. The previous 'classic' timer wheel
+ * implementation of the kernel already violated the 'exact' expiry by adding
+ * slack to the expiry time to provide batched expiration. The granularity
+ * levels provide implicit batching.
+ *
+ * This is an optimization of the original timer wheel implementation for the
+ * majority of the timer wheel use cases: timeouts. The vast majority of
+ * timeout timers (networking, disk I/O ...) are canceled before expiry. If
+ * the timeout expires it indicates that normal operation is disturbed, so it
+ * does not matter much whether the timeout comes with a slight delay.
+ *
+ * The only exception to this are networking timers with a small expiry
+ * time. They rely on the granularity. Those fit into the first wheel level,
+ * which has HZ granularity.
+ *
+ * We don't have cascading anymore. timers with a expiry time above the
+ * capacity of the last wheel level are force expired at the maximum timeout
+ * value of the last wheel level. From data sampling we know that the maximum
+ * value observed is 5 days (network connection tracking), so this should not
+ * be an issue.
+ *
+ * The currently chosen array constants values are a good compromise between
+ * array size and granularity.
+ *
+ * This results in the following granularity and range levels:
+ *
+ * HZ 1000 steps
+ * Level Offset  Granularity            Range
+ *  0      0         1 ms                0 ms -         63 ms
+ *  1     64         8 ms               64 ms -        511 ms
+ *  2    128        64 ms              512 ms -       4095 ms (512ms - ~4s)
+ *  3    192       512 ms             4096 ms -      32767 ms (~4s - ~32s)
+ *  4    256      4096 ms (~4s)      32768 ms -     262143 ms (~32s - ~4m)
+ *  5    320     32768 ms (~32s)    262144 ms -    2097151 ms (~4m - ~34m)
+ *  6    384    262144 ms (~4m)    2097152 ms -   16777215 ms (~34m - ~4h)
+ *  7    448   2097152 ms (~34m)  16777216 ms -  134217727 ms (~4h - ~1d)
+ *  8    512  16777216 ms (~4h)  134217728 ms - 1073741822 ms (~1d - ~12d)
+ *
+ * HZ  300
+ * Level Offset  Granularity            Range
+ *  0     0         3 ms                0 ms -        210 ms
+ *  1    64        26 ms              213 ms -       1703 ms (213ms - ~1s)
+ *  2   128       213 ms             1706 ms -      13650 ms (~1s - ~13s)
+ *  3   192      1706 ms (~1s)      13653 ms -     109223 ms (~13s - ~1m)
+ *  4   256     13653 ms (~13s)    109226 ms -     873810 ms (~1m - ~14m)
+ *  5   320    109226 ms (~1m)     873813 ms -    6990503 ms (~14m - ~1h)
+ *  6   384    873813 ms (~14m)   6990506 ms -   55924050 ms (~1h - ~15h)
+ *  7   448   6990506 ms (~1h)   55924053 ms -  447392423 ms (~15h - ~5d)
+ *  8    512  55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
+ *
+ * HZ  250
+ * Level Offset  Granularity            Range
+ *  0     0         4 ms                0 ms -        255 ms
+ *  1    64        32 ms              256 ms -       2047 ms (256ms - ~2s)
+ *  2   128       256 ms             2048 ms -      16383 ms (~2s - ~16s)
+ *  3   192      2048 ms (~2s)      16384 ms -     131071 ms (~16s - ~2m)
+ *  4   256     16384 ms (~16s)    131072 ms -    1048575 ms (~2m - ~17m)
+ *  5   320    131072 ms (~2m)    1048576 ms -    8388607 ms (~17m - ~2h)
+ *  6   384   1048576 ms (~17m)   8388608 ms -   67108863 ms (~2h - ~18h)
+ *  7   448   8388608 ms (~2h)   67108864 ms -  536870911 ms (~18h - ~6d)
+ *  8    512  67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
+ *
+ * HZ  100
+ * Level Offset  Granularity            Range
+ *  0     0         10 ms               0 ms -        630 ms
+ *  1    64         80 ms             640 ms -       5110 ms (640ms - ~5s)
+ *  2   128        640 ms            5120 ms -      40950 ms (~5s - ~40s)
+ *  3   192       5120 ms (~5s)     40960 ms -     327670 ms (~40s - ~5m)
+ *  4   256      40960 ms (~40s)   327680 ms -    2621430 ms (~5m - ~43m)
+ *  5   320     327680 ms (~5m)   2621440 ms -   20971510 ms (~43m - ~5h)
+ *  6   384    2621440 ms (~43m) 20971520 ms -  167772150 ms (~5h - ~1d)
+ *  7   448   20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
   */
-#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
-#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
-#define TVN_SIZE (1 << TVN_BITS)
-#define TVR_SIZE (1 << TVR_BITS)
-#define TVN_MASK (TVN_SIZE - 1)
-#define TVR_MASK (TVR_SIZE - 1)
-#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
-
-struct tvec {
-       struct hlist_head vec[TVN_SIZE];
-};
  
-struct tvec_root {
-       struct hlist_head vec[TVR_SIZE];
-};
+/* Clock divisor for the next level */
+#define LVL_CLK_SHIFT  3
+#define LVL_CLK_DIV    (1UL << LVL_CLK_SHIFT)
+#define LVL_CLK_MASK   (LVL_CLK_DIV - 1)
+#define LVL_SHIFT(n)   ((n) * LVL_CLK_SHIFT)
+#define LVL_GRAN(n)    (1UL << LVL_SHIFT(n))
+
+/*
+ * The time start value for each level to select the bucket at enqueue
+ * time.
+ */
+#define LVL_START(n)   ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
+
+/* Size of each clock level */
+#define LVL_BITS       6
+#define LVL_SIZE       (1UL << LVL_BITS)
+#define LVL_MASK       (LVL_SIZE - 1)
+#define LVL_OFFS(n)    ((n) * LVL_SIZE)
+
+/* Level depth */
+#if HZ > 100
+# define LVL_DEPTH     9
+# else
+# define LVL_DEPTH     8
+#endif
+
+/* The cutoff (max. capacity of the wheel) */
+#define WHEEL_TIMEOUT_CUTOFF   (LVL_START(LVL_DEPTH))
+#define WHEEL_TIMEOUT_MAX      (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))
+
+/*
+ * The resulting wheel size. If NOHZ is configured we allocate two
+ * wheels so we have a separate storage for the deferrable timers.
+ */
+#define WHEEL_SIZE     (LVL_SIZE * LVL_DEPTH)
+
+#ifdef CONFIG_NO_HZ_COMMON
+# define NR_BASES      2
+# define BASE_STD      0
+# define BASE_DEF      1
+#else
+# define NR_BASES      1
+# define BASE_STD      0
+# define BASE_DEF      0
+#endif
  
  struct timer_base {
-       spinlock_t lock;
-       struct timer_list *running_timer;
-       unsigned long clk;
-       unsigned long next_timer;
-       unsigned long active_timers;
-       unsigned long all_timers;
-       int cpu;
-       bool migration_enabled;
-       bool nohz_active;
-       struct tvec_root tv1;
-       struct tvec tv2;
-       struct tvec tv3;
-       struct tvec tv4;
-       struct tvec tv5;
+       spinlock_t              lock;
+       struct timer_list       *running_timer;
+       unsigned long           clk;
+       unsigned int            cpu;
+       bool                    migration_enabled;
+       bool                    nohz_active;
+       DECLARE_BITMAP(pending_map, WHEEL_SIZE);
+       struct hlist_head       vectors[WHEEL_SIZE];
  } ____cacheline_aligned;
  
-
-static DEFINE_PER_CPU(struct timer_base, timer_bases);
+static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
  
  #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
  unsigned int sysctl_timer_migration = 1;
@@ -106,15 +214,17 @@ void timers_update_migration(bool update_nohz)
         unsigned int cpu;
  
         /* Avoid the loop, if nothing to update */
-       if (this_cpu_read(timer_bases.migration_enabled) == on)
+       if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
                 return;
  
         for_each_possible_cpu(cpu) {
-               per_cpu(timer_bases.migration_enabled, cpu) = on;
+               per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
+               per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
                 per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
                 if (!update_nohz)
                         continue;
-               per_cpu(timer_bases.nohz_active, cpu) = true;
+               per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
+               per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
                 per_cpu(hrtimer_bases.nohz_active, cpu) = true;
         }
  }
@@ -133,20 +243,6 @@ int timer_migration_handler(struct ctl_table *table, int write,
         mutex_unlock(&mutex);
         return ret;
  }
-
-static inline struct timer_base *get_target_base(struct timer_base *base,
-                                               int pinned)
-{
-       if (pinned || !base->migration_enabled)
-               return this_cpu_ptr(&timer_bases);
-       return per_cpu_ptr(&timer_bases, get_nohz_timer_target());
-}
-#else
-static inline struct timer_base *get_target_base(struct timer_base *base,
-                                               int pinned)
-{
-       return this_cpu_ptr(&timer_bases);
-}
  #endif
  
  static unsigned long round_jiffies_common(unsigned long j, int cpu,
@@ -370,78 +466,91 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
  }
  EXPORT_SYMBOL_GPL(set_timer_slack);
  
+static inline unsigned int timer_get_idx(struct timer_list *timer)
+{
+       return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
+}
+
+static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
+{
+       timer->flags = (timer->flags & ~TIMER_ARRAYMASK) |
+                       idx << TIMER_ARRAYSHIFT;
+}
+
+/*
+ * Helper function to calculate the array index for a given expiry
+ * time.
+ */
+static inline unsigned calc_index(unsigned expires, unsigned lvl)
+{
+       expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
+       return LVL_OFFS(lvl) + (expires & LVL_MASK);
+}
+
  static void
  __internal_add_timer(struct timer_base *base, struct timer_list *timer)
  {
         unsigned long expires = timer->expires;
-       unsigned long idx = expires - base->clk;
+       unsigned long delta = expires - base->clk;
         struct hlist_head *vec;
-
-       if (idx < TVR_SIZE) {
-               int i = expires & TVR_MASK;
-               vec = base->tv1.vec + i;
-       } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
-               int i = (expires >> TVR_BITS) & TVN_MASK;
-               vec = base->tv2.vec + i;
-       } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
-               int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
-               vec = base->tv3.vec + i;
-       } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
-               int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
-               vec = base->tv4.vec + i;
-       } else if ((signed long) idx < 0) {
-               /*
-                * Can happen if you add a timer with expires == jiffies,
-                * or you set a timer to go off in the past
-                */
-               vec = base->tv1.vec + (base->clk & TVR_MASK);
+       unsigned int idx;
+
+       if (delta < LVL_START(1)) {
+               idx = calc_index(expires, 0);
+       } else if (delta < LVL_START(2)) {
+               idx = calc_index(expires, 1);
+       } else if (delta < LVL_START(3)) {
+               idx = calc_index(expires, 2);
+       } else if (delta < LVL_START(4)) {
+               idx = calc_index(expires, 3);
+       } else if (delta < LVL_START(5)) {
+               idx = calc_index(expires, 4);
+       } else if (delta < LVL_START(6)) {
+               idx = calc_index(expires, 5);
+       } else if (delta < LVL_START(7)) {
+               idx = calc_index(expires, 6);
+       } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
+               idx = calc_index(expires, 7);
+       } else if ((long) delta < 0) {
+               idx = base->clk & LVL_MASK;
         } else {
-               int i;
-               /* If the timeout is larger than MAX_TVAL (on 64-bit
-                * architectures or with CONFIG_BASE_SMALL=1) then we
-                * use the maximum timeout.
+               /*
+                * Force expire obscene large timeouts to expire at the
+                * capacity limit of the wheel.
                  */
-               if (idx > MAX_TVAL) {
-                       idx = MAX_TVAL;
-                       expires = idx + base->clk;
-               }
-               i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
-               vec = base->tv5.vec + i;
-       }
+               if (expires >= WHEEL_TIMEOUT_CUTOFF)
+                       expires = WHEEL_TIMEOUT_MAX;
  
+               idx = calc_index(expires, LVL_DEPTH - 1);
+       }
+       /*
+        * Enqueue the timer into the array bucket, mark it pending in
+        * the bitmap and store the index in the timer flags.
+        */
+       vec = base->vectors + idx;
         hlist_add_head(&timer->entry, vec);
+       __set_bit(idx, base->pending_map);
+       timer_set_idx(timer, idx);
  }
  
  static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
  {
-       /* Advance base->jiffies, if the base is empty */
-       if (!base->all_timers++)
-               base->clk = jiffies;
-
         __internal_add_timer(base, timer);
-       /*
-        * Update base->active_timers and base->next_timer
-        */
-       if (!(timer->flags & TIMER_DEFERRABLE)) {
-               if (!base->active_timers++ ||
-                   time_before(timer->expires, base->next_timer))
-                       base->next_timer = timer->expires;
-       }
  
         /*
          * Check whether the other CPU is in dynticks mode and needs
-        * to be triggered to reevaluate the timer wheel.
-        * We are protected against the other CPU fiddling
-        * with the timer by holding the timer base lock. This also
-        * makes sure that a CPU on the way to stop its tick can not
-        * evaluate the timer wheel.
+        * to be triggered to reevaluate the timer wheel.  We are
+        * protected against the other CPU fiddling with the timer by
+        * holding the timer base lock. This also makes sure that a
+        * CPU on the way to stop its tick can not evaluate the timer
+        * wheel.
          *
          * Spare the IPI for deferrable timers on idle targets though.
          * The next busy ticks will take care of it. Except full dynticks
          * require special care against races with idle_cpu(), lets deal
          * with that later.
          */
-       if (base->nohz_active) {
+       if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) {
                 if (!(timer->flags & TIMER_DEFERRABLE) ||
                     tick_nohz_full_cpu(base->cpu))
                         wake_up_nohz_cpu(base->cpu);
@@ -706,54 +815,87 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending)
         entry->next = LIST_POISON2;
  }
  
-static inline void
-detach_expired_timer(struct timer_list *timer, struct timer_base *base)
-{
-       detach_timer(timer, true);
-       if (!(timer->flags & TIMER_DEFERRABLE))
-               base->active_timers--;
-       base->all_timers--;
-}
-
  static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
                              bool clear_pending)
  {
+       unsigned idx = timer_get_idx(timer);
+
         if (!timer_pending(timer))
                 return 0;
  
+       if (hlist_is_singular_node(&timer->entry, base->vectors + idx))
+               __clear_bit(idx, base->pending_map);
+
         detach_timer(timer, clear_pending);
-       if (!(timer->flags & TIMER_DEFERRABLE)) {
-               base->active_timers--;
-               if (timer->expires == base->next_timer)
-                       base->next_timer = base->clk;
-       }
-       /* If this was the last timer, advance base->jiffies */
-       if (!--base->all_timers)
-               base->clk = jiffies;
         return 1;
  }
  
+static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
+{
+       struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
+
+       /*
+        * If the timer is deferrable and nohz is active then we need to use
+        * the deferrable base.
+        */
+       if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
+           (tflags & TIMER_DEFERRABLE))
+               base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
+       return base;
+}
+
+static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
+{
+       struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+
+       /*
+        * If the timer is deferrable and nohz is active then we need to use
+        * the deferrable base.
+        */
+       if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
+           (tflags & TIMER_DEFERRABLE))
+               base = this_cpu_ptr(&timer_bases[BASE_DEF]);
+       return base;
+}
+
+static inline struct timer_base *get_timer_base(u32 tflags)
+{
+       return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
+}
+
+static inline struct timer_base *get_target_base(struct timer_base *base,
+                                                unsigned tflags)
+{
+#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
+       if ((tflags & TIMER_PINNED) || !base->migration_enabled)
+               return get_timer_this_cpu_base(tflags);
+       return get_timer_cpu_base(tflags, get_nohz_timer_target());
+#else
+       return get_timer_this_cpu_base(tflags);
+#endif
+}
+
  /*
- * We are using hashed locking: holding per_cpu(timer_bases).lock
- * means that all timers which are tied to this base via timer->base are
- * locked, and the base itself is locked too.
+ * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
+ * that all timers which are tied to this base are locked, and the base itself
+ * is locked too.
   *
   * So __run_timers/migrate_timers can safely modify all timers which could
- * be found on ->tvX lists.
+ * be found in the base->vectors array.
   *
- * When the timer's base is locked and removed from the list, the
- * TIMER_MIGRATING flag is set, FIXME
+ * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
+ * to wait until the migration is done.
   */
  static struct timer_base *lock_timer_base(struct timer_list *timer,
-                                       unsigned long *flags)
+                                         unsigned long *flags)
         __acquires(timer->base->lock)
  {
         for (;;) {
-               u32 tf = timer->flags;
                 struct timer_base *base;
+               u32 tf = timer->flags;
  
                 if (!(tf & TIMER_MIGRATING)) {
-                       base = per_cpu_ptr(&timer_bases, tf & TIMER_CPUMASK);
+                       base = get_timer_base(tf);
                         spin_lock_irqsave(&base->lock, *flags);
                         if (timer->flags == tf)
                                 return base;
@@ -770,6 +912,27 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
         unsigned long flags;
         int ret = 0;
  
+       /*
+        * TODO: Calculate the array bucket of the timer right here w/o
+        * holding the base lock. This allows to check not only
+        * timer->expires == expires below, but also whether the timer
+        * ends up in the same bucket. If we really need to requeue
+        * the timer then we check whether base->clk have
+        * advanced between here and locking the timer base. If
+        * jiffies advanced we have to recalc the array bucket with the
+        * lock held.
+        */
+
+       /*
+        * This is a common optimization triggered by the
+        * networking code - if the timer is re-modified
+        * to be the same thing then just return:
+        */
+       if (timer_pending(timer)) {
+               if (timer->expires == expires)
+                       return 1;
+       }
+
         timer_stats_timer_set_start_info(timer);
         BUG_ON(!timer->function);
  
@@ -781,15 +944,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
  
         debug_activate(timer, expires);
  
-       new_base = get_target_base(base, timer->flags & TIMER_PINNED);
+       new_base = get_target_base(base, timer->flags);
  
         if (base != new_base) {
                 /*
-                * We are trying to schedule the timer on the local CPU.
+                * We are trying to schedule the timer on the new base.
                  * However we can't change timer's base while it is running,
                  * otherwise del_timer_sync() can't detect that the timer's
-                * handler yet has not finished. This also guarantees that
-                * the timer is serialized wrt itself.
+                * handler yet has not finished. This also guarantees that the
+                * timer is serialized wrt itself.
                  */
                 if (likely(base->running_timer != timer)) {
                         /* See the comment in lock_timer_base() */
@@ -828,45 +991,6 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires)
  }
  EXPORT_SYMBOL(mod_timer_pending);
  
-/*
- * Decide where to put the timer while taking the slack into account
- *
- * Algorithm:
- *   1) calculate the maximum (absolute) time
- *   2) calculate the highest bit where the expires and new max are different
- *   3) use this bit to make a mask
- *   4) use the bitmask to round down the maximum time, so that all last
- *      bits are zeros
- */
-static inline
-unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
-{
-       unsigned long expires_limit, mask;
-       int bit;
-
-       if (timer->slack >= 0) {
-               expires_limit = expires + timer->slack;
-       } else {
-               long delta = expires - jiffies;
-
-               if (delta < 256)
-                       return expires;
-
-               expires_limit = expires + delta / 256;
-       }
-       mask = expires ^ expires_limit;
-       if (mask == 0)
-               return expires;
-
-       bit = __fls(mask);
-
-       mask = (1UL << bit) - 1;
-
-       expires_limit = expires_limit & ~(mask);
-
-       return expires_limit;
-}
-
  /**
   * mod_timer - modify a timer's timeout
   * @timer: the timer to be modified
@@ -889,16 +1013,6 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
   */
  int mod_timer(struct timer_list *timer, unsigned long expires)
  {
-       expires = apply_slack(timer, expires);
-
-       /*
-        * This is a common optimization triggered by the
-        * networking code - if the timer is re-modified
-        * to be the same thing then just return:
-        */
-       if (timer_pending(timer) && timer->expires == expires)
-               return 1;
-
         return __mod_timer(timer, expires, false);
  }
  EXPORT_SYMBOL(mod_timer);
@@ -933,13 +1047,14 @@ EXPORT_SYMBOL(add_timer);
   */
  void add_timer_on(struct timer_list *timer, int cpu)
  {
-       struct timer_base *new_base = per_cpu_ptr(&timer_bases, cpu);
-       struct timer_base *base;
+       struct timer_base *new_base, *base;
         unsigned long flags;
  
         timer_stats_timer_set_start_info(timer);
         BUG_ON(timer_pending(timer) || !timer->function);
  
+       new_base = get_timer_cpu_base(timer->flags, cpu);
+
         /*
          * If @timer was on a different CPU, it should be migrated with the
          * old base locked to prevent other operations proceeding with the
@@ -1085,27 +1200,6 @@ int del_timer_sync(struct timer_list *timer)
  EXPORT_SYMBOL(del_timer_sync);
  #endif
  
-static int cascade(struct timer_base *base, struct tvec *tv, int index)
-{
-       /* cascade all the timers from tv up one level */
-       struct timer_list *timer;
-       struct hlist_node *tmp;
-       struct hlist_head tv_list;
-
-       hlist_move_list(tv->vec + index, &tv_list);
-
-       /*
-        * We are removing _all_ timers from the list, so we
-        * don't have to detach them individually.
-        */
-       hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
-               /* No accounting, while moving them */
-               __internal_add_timer(base, timer);
-       }
-
-       return index;
-}
-
  static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
                           unsigned long data)
  {
@@ -1149,68 +1243,80 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
         }
  }
  
-#define INDEX(N) ((base->clk >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
+static void expire_timers(struct timer_base *base, struct hlist_head *head)
+{
+       while (!hlist_empty(head)) {
+               struct timer_list *timer;
+               void (*fn)(unsigned long);
+               unsigned long data;
+
+               timer = hlist_entry(head->first, struct timer_list, entry);
+               timer_stats_account_timer(timer);
+
+               base->running_timer = timer;
+               detach_timer(timer, true);
+
+               fn = timer->function;
+               data = timer->data;
+
+               if (timer->flags & TIMER_IRQSAFE) {
+                       spin_unlock(&base->lock);
+                       call_timer_fn(timer, fn, data);
+                       spin_lock(&base->lock);
+               } else {
+                       spin_unlock_irq(&base->lock);
+                       call_timer_fn(timer, fn, data);
+                       spin_lock_irq(&base->lock);
+               }
+       }
+}
+
+static int collect_expired_timers(struct timer_base *base,
+                                 struct hlist_head *heads)
+{
+       unsigned long clk = base->clk;
+       struct hlist_head *vec;
+       int i, levels = 0;
+       unsigned int idx;
+
+       for (i = 0; i < LVL_DEPTH; i++) {
+               idx = (clk & LVL_MASK) + i * LVL_SIZE;
+
+               if (__test_and_clear_bit(idx, base->pending_map)) {
+                       vec = base->vectors + idx;
+                       hlist_move_list(vec, heads++);
+                       levels++;
+               }
+               /* Is it time to look at the next level? */
+               if (clk & LVL_CLK_MASK)
+                       break;
+               /* Shift clock for the next level granularity */
+               clk >>= LVL_CLK_SHIFT;
+       }
+       return levels;
+}
  
  /**
   * __run_timers - run all expired timers (if any) on this CPU.
   * @base: the timer vector to be processed.
- *
- * This function cascades all vectors and executes all expired timer
- * vectors.
   */
  static inline void __run_timers(struct timer_base *base)
  {
-       struct timer_list *timer;
+       struct hlist_head heads[LVL_DEPTH];
+       int levels;
+
+       if (!time_after_eq(jiffies, base->clk))
+               return;
  
         spin_lock_irq(&base->lock);
  
         while (time_after_eq(jiffies, base->clk)) {
-               struct hlist_head work_list;
-               struct hlist_head *head = &work_list;
-               int index;
  
-               if (!base->all_timers) {
-                       base->clk = jiffies;
-                       break;
-               }
-
-               index = base->clk & TVR_MASK;
+               levels = collect_expired_timers(base, heads);
+               base->clk++;
  
-               /*
-                * Cascade timers:
-                */
-               if (!index &&
-                       (!cascade(base, &base->tv2, INDEX(0))) &&
-                               (!cascade(base, &base->tv3, INDEX(1))) &&
-                                       !cascade(base, &base->tv4, INDEX(2)))
-                       cascade(base, &base->tv5, INDEX(3));
-               ++base->clk;
-               hlist_move_list(base->tv1.vec + index, head);
-               while (!hlist_empty(head)) {
-                       void (*fn)(unsigned long);
-                       unsigned long data;
-                       bool irqsafe;
-
-                       timer = hlist_entry(head->first, struct timer_list, entry);
-                       fn = timer->function;
-                       data = timer->data;
-                       irqsafe = timer->flags & TIMER_IRQSAFE;
-
-                       timer_stats_account_timer(timer);
-
-                       base->running_timer = timer;
-                       detach_expired_timer(timer, base);
-
-                       if (irqsafe) {
-                               spin_unlock(&base->lock);
-                               call_timer_fn(timer, fn, data);
-                               spin_lock(&base->lock);
-                       } else {
-                               spin_unlock_irq(&base->lock);
-                               call_timer_fn(timer, fn, data);
-                               spin_lock_irq(&base->lock);
-                       }
-               }
+               while (levels--)
+                       expire_timers(base, heads + levels);
         }
         base->running_timer = NULL;
         spin_unlock_irq(&base->lock);
@@ -1218,78 +1324,87 @@ static inline void __run_timers(struct timer_base *base)
  
  #ifdef CONFIG_NO_HZ_COMMON
  /*
- * Find out when the next timer event is due to happen. This
- * is used on S/390 to stop all activity when a CPU is idle.
- * This function needs to be called with interrupts disabled.
+ * Find the next pending bucket of a level. Search from @offset + @clk upwards
+ * and if nothing there, search from start of the level (@offset) up to
+ * @offset + clk.
+ */
+static int next_pending_bucket(struct timer_base *base, unsigned offset,
+                              unsigned clk)
+{
+       unsigned pos, start = offset + clk;
+       unsigned end = offset + LVL_SIZE;
+
+       pos = find_next_bit(base->pending_map, end, start);
+       if (pos < end)
+               return pos - start;
+
+       pos = find_next_bit(base->pending_map, start, offset);
+       return pos < start ? pos + LVL_SIZE - start : -1;
+}
+
+/*
+ * Search the first expiring timer in the various clock levels.
   */
  static unsigned long __next_timer_interrupt(struct timer_base *base)
  {
-       unsigned long clk = base->clk;
-       unsigned long expires = clk + NEXT_TIMER_MAX_DELTA;
-       int index, slot, array, found = 0;
-       struct timer_list *nte;
-       struct tvec *varray[4];
-
-       /* Look for timer events in tv1. */
-       index = slot = clk & TVR_MASK;
-       do {
-               hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
-                       if (nte->flags & TIMER_DEFERRABLE)
-                               continue;
-
-                       found = 1;
-                       expires = nte->expires;
-                       /* Look at the cascade bucket(s)? */
-                       if (!index || slot < index)
-                               goto cascade;
-                       return expires;
+       unsigned long clk, next, adj;
+       unsigned lvl, offset = 0;
+
+       spin_lock(&base->lock);
+       next = base->clk + NEXT_TIMER_MAX_DELTA;
+       clk = base->clk;
+       for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
+               int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
+
+               if (pos >= 0) {
+                       unsigned long tmp = clk + (unsigned long) pos;
+
+                       tmp <<= LVL_SHIFT(lvl);
+                       if (time_before(tmp, next))
+                               next = tmp;
                 }
-               slot = (slot + 1) & TVR_MASK;
-       } while (slot != index);
-
-cascade:
-       /* Calculate the next cascade event */
-       if (index)
-               clk += TVR_SIZE - index;
-       clk >>= TVR_BITS;
-
-       /* Check tv2-tv5. */
-       varray[0] = &base->tv2;
-       varray[1] = &base->tv3;
-       varray[2] = &base->tv4;
-       varray[3] = &base->tv5;
-
-       for (array = 0; array < 4; array++) {
-               struct tvec *varp = varray[array];
-
-               index = slot = clk & TVN_MASK;
-               do {
-                       hlist_for_each_entry(nte, varp->vec + slot, entry) {
-                               if (nte->flags & TIMER_DEFERRABLE)
-                                       continue;
-
-                               found = 1;
-                               if (time_before(nte->expires, expires))
-                                       expires = nte->expires;
-                       }
-                       /*
-                        * Do we still search for the first timer or are
-                        * we looking up the cascade buckets ?
-                        */
-                       if (found) {
-                               /* Look at the cascade bucket(s)? */
-                               if (!index || slot < index)
-                                       break;
-                               return expires;
-                       }
-                       slot = (slot + 1) & TVN_MASK;
-               } while (slot != index);
-
-               if (index)
-                       clk += TVN_SIZE - index;
-               clk >>= TVN_BITS;
+               /*
+                * Clock for the next level. If the current level clock lower
+                * bits are zero, we look at the next level as is. If not we
+                * need to advance it by one because that's going to be the
+                * next expiring bucket in that level. base->clk is the next
+                * expiring jiffie. So in case of:
+                *
+                * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+                *  0    0    0    0    0    0
+                *
+                * we have to look at all levels @index 0. With
+                *
+                * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+                *  0    0    0    0    0    2
+                *
+                * LVL0 has the next expiring bucket @index 2. The upper
+                * levels have the next expiring bucket @index 1.
+                *
+                * In case that the propagation wraps the next level the same
+                * rules apply:
+                *
+                * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+                *  0    0    0    0    F    2
+                *
+                * So after looking at LVL0 we get:
+                *
+                * LVL5 LVL4 LVL3 LVL2 LVL1
+                *  0    0    0    1    0
+                *
+                * So no propagation from LVL1 to LVL2 because that happened
+                * with the add already, but then we need to propagate further
+                * from LVL2 to LVL3.
+                *
+                * So the simple check whether the lower bits of the current
+                * level are 0 or not is sufficient for all cases.
+                */
+               adj = clk & LVL_CLK_MASK ? 1 : 0;
+               clk >>= LVL_CLK_SHIFT;
+               clk += adj;
         }
-       return expires;
+       spin_unlock(&base->lock);
+       return next;
  }
  
  /*
@@ -1335,7 +1450,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
   */
  u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
  {
-       struct timer_base *base = this_cpu_ptr(&timer_bases);
+       struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
         u64 expires = KTIME_MAX;
         unsigned long nextevt;
  
@@ -1346,17 +1461,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
         if (cpu_is_offline(smp_processor_id()))
                 return expires;
  
-       spin_lock(&base->lock);
-       if (base->active_timers) {
-               if (time_before_eq(base->next_timer, base->clk))
-                       base->next_timer = __next_timer_interrupt(base);
-               nextevt = base->next_timer;
-               if (time_before_eq(nextevt, basej))
-                       expires = basem;
-               else
-                       expires = basem + (nextevt - basej) * TICK_NSEC;
-       }
-       spin_unlock(&base->lock);
+       nextevt = __next_timer_interrupt(base);
+       if (time_before_eq(nextevt, basej))
+               expires = basem;
+       else
+               expires = basem + (nextevt - basej) * TICK_NSEC;
  
         return cmp_next_hrtimer_event(basem, expires);
  }
@@ -1387,10 +1496,11 @@ void update_process_times(int user_tick)
   */
  static void run_timer_softirq(struct softirq_action *h)
  {
-       struct timer_base *base = this_cpu_ptr(&timer_bases);
+       struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
  
-       if (time_after_eq(jiffies, base->clk))
-               __run_timers(base);
+       __run_timers(base);
+       if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
+               __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
  }
  
  /*
@@ -1541,7 +1651,6 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h
  
         while (!hlist_empty(head)) {
                 timer = hlist_entry(head->first, struct timer_list, entry);
-               /* We ignore the accounting on the dying cpu */
                 detach_timer(timer, false);
                 timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
                 internal_add_timer(new_base, timer);
@@ -1552,35 +1661,29 @@ static void migrate_timers(int cpu)
  {
         struct timer_base *old_base;
         struct timer_base *new_base;
-       int i;
+       int b, i;
  
         BUG_ON(cpu_online(cpu));
-       old_base = per_cpu_ptr(&timer_bases, cpu);
-       new_base = get_cpu_ptr(&timer_bases);
-       /*
-        * The caller is globally serialized and nobody else
-        * takes two locks at once, deadlock is not possible.
-        */
-       spin_lock_irq(&new_base->lock);
-       spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
-
-       BUG_ON(old_base->running_timer);
-
-       for (i = 0; i < TVR_SIZE; i++)
-               migrate_timer_list(new_base, old_base->tv1.vec + i);
-       for (i = 0; i < TVN_SIZE; i++) {
-               migrate_timer_list(new_base, old_base->tv2.vec + i);
-               migrate_timer_list(new_base, old_base->tv3.vec + i);
-               migrate_timer_list(new_base, old_base->tv4.vec + i);
-               migrate_timer_list(new_base, old_base->tv5.vec + i);
-       }
  
-       old_base->active_timers = 0;
-       old_base->all_timers = 0;
+       for (b = 0; b < NR_BASES; b++) {
+               old_base = per_cpu_ptr(&timer_bases[b], cpu);
+               new_base = get_cpu_ptr(&timer_bases[b]);
+               /*
+                * The caller is globally serialized and nobody else
+                * takes two locks at once, deadlock is not possible.
+                */
+               spin_lock_irq(&new_base->lock);
+               spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+
+               BUG_ON(old_base->running_timer);
+
+               for (i = 0; i < WHEEL_SIZE; i++)
+                       migrate_timer_list(new_base, old_base->vectors + i);
  
-       spin_unlock(&old_base->lock);
-       spin_unlock_irq(&new_base->lock);
-       put_cpu_ptr(&timer_bases);
+               spin_unlock(&old_base->lock);
+               spin_unlock_irq(&new_base->lock);
+               put_cpu_ptr(&timer_bases);
+       }
  }
  
  static int timer_cpu_notify(struct notifier_block *self,
@@ -1608,13 +1711,15 @@ static inline void timer_register_cpu_notifier(void) { }
  
  static void __init init_timer_cpu(int cpu)
  {
-       struct timer_base *base = per_cpu_ptr(&timer_bases, cpu);
-
-       base->cpu = cpu;
-       spin_lock_init(&base->lock);
+       struct timer_base *base;
+       int i;
  
-       base->clk = jiffies;
-       base->next_timer = base->clk;
+       for (i = 0; i < NR_BASES; i++) {
+               base = per_cpu_ptr(&timer_bases[i], cpu);
+               base->cpu = cpu;
+               spin_lock_init(&base->lock);
+               base->clk = jiffies;
+       }
  }
  
  static void __init init_timer_cpus(void)