Merge tag 'gcc-plugins-v4.9-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...

author Linus Torvalds <torvalds@linux-foundation.org>

Sat, 15 Oct 2016 17:03:15 +0000 (10:03 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 15 Oct 2016 17:03:15 +0000 (10:03 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 15 Oct 2016 17:03:15 +0000 (10:03 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 15 Oct 2016 17:03:15 +0000 (10:03 -0700)
diff --combined arch/Kconfig

index 11d3495,ae80cd2..659bdd0
--- 1/arch/Kconfig
--- 2/arch/Kconfig
+++ b/arch/Kconfig
@@@ -383,6 -383,24 +383,24 @@@ config GCC_PLUGIN_SANCO
           gcc-4.5 on). It is based on the commit "Add fuzzing coverage support"
           by Dmitry Vyukov <dvyukov@google.com>.
   
+ config GCC_PLUGIN_LATENT_ENTROPY
+       bool "Generate some entropy during boot and runtime"
+       depends on GCC_PLUGINS
+       help
+         By saying Y here the kernel will instrument some kernel code to
+         extract some entropy from both original and artificially created
+         program state.  This will help especially embedded systems where
+         there is little 'natural' source of entropy normally.  The cost
+         is some slowdown of the boot process (about 0.5%) and fork and
+         irq processing.
+ 
+         Note that entropy extracted this way is not cryptographically
+         secure!
+ 
+         This plugin was ported from grsecurity/PaX. More information at:
+          * https://grsecurity.net/
+          * https://pax.grsecurity.net/
+ 
   config HAVE_CC_STACKPROTECTOR
         bool
         help
@@@ -450,27 -468,6 +468,27 @@@ config CC_STACKPROTECTOR_STRON
   
   endchoice
   
+ +config THIN_ARCHIVES
+ +      bool
+ +      help
+ +        Select this if the architecture wants to use thin archives
+ +        instead of ld -r to create the built-in.o files.
+ +
+ +config LD_DEAD_CODE_DATA_ELIMINATION
+ +      bool
+ +      help
+ +        Select this if the architecture wants to do dead code and
+ +        data elimination with the linker by compiling with
+ +        -ffunction-sections -fdata-sections and linking with
+ +        --gc-sections.
+ +
+ +        This requires that the arch annotates or otherwise protects
+ +        its external entry points from being discarded. Linker scripts
+ +        must also merge .text.*, .data.*, and .bss.* correctly into
+ +        output sections. Care must be taken not to pull in unrelated
+ +        sections (e.g., '.text.init'). Typically '.' in section names
+ +        is used to distinguish them from label names / C identifiers.
+ +
   config HAVE_ARCH_WITHIN_STACK_FRAMES
         bool
         help
@@@ -717,38 -714,4 +735,38 @@@ config ARCH_NO_COHERENT_DMA_MMA
   config CPU_NO_EFFICIENT_FFS
         def_bool n
   
+ +config HAVE_ARCH_VMAP_STACK
+ +      def_bool n
+ +      help
+ +        An arch should select this symbol if it can support kernel stacks
+ +        in vmalloc space.  This means:
+ +
+ +        - vmalloc space must be large enough to hold many kernel stacks.
+ +          This may rule out many 32-bit architectures.
+ +
+ +        - Stacks in vmalloc space need to work reliably.  For example, if
+ +          vmap page tables are created on demand, either this mechanism
+ +          needs to work while the stack points to a virtual address with
+ +          unpopulated page tables or arch code (switch_to() and switch_mm(),
+ +          most likely) needs to ensure that the stack's page table entries
+ +          are populated before running on a possibly unpopulated stack.
+ +
+ +        - If the stack overflows into a guard page, something reasonable
+ +          should happen.  The definition of "reasonable" is flexible, but
+ +          instantly rebooting without logging anything would be unfriendly.
+ +
+ +config VMAP_STACK
+ +      default y
+ +      bool "Use a virtually-mapped stack"
+ +      depends on HAVE_ARCH_VMAP_STACK && !KASAN
+ +      ---help---
+ +        Enable this if you want to use virtually-mapped kernel stacks
+ +        with guard pages.  This causes kernel stack overflows to be
+ +        caught immediately rather than causing difficult-to-diagnose
+ +        corruption.
+ +
+ +        This is presently incompatible with KASAN because KASAN expects
+ +        the stack to map directly to the KASAN shadow map using a formula
+ +        that is incorrect if the stack is in vmalloc space.
+ +
   source "kernel/gcov/Kconfig"
diff --combined arch/powerpc/kernel/Makefile

index 6913f67,62df36c..1925341
--- 1/arch/powerpc/kernel/Makefile
--- 2/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@@ -14,6 -14,11 +14,11 @@@ CFLAGS_prom_init.o      += -fPI
   CFLAGS_btext.o                += -fPIC
   endif
   
+ CFLAGS_cputable.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
+ CFLAGS_init.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
+ CFLAGS_btext.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
+ CFLAGS_prom.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
+ 
   ifdef CONFIG_FUNCTION_TRACER
   # Do not trace early boot code
   CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
@@@ -31,7 -36,8 +36,7 @@@ obj-y                         := cputable.o ptrace.o syscall
                                    process.o systbl.o idle.o \
                                    signal.o sysfs.o cacheinfo.o time.o \
                                    prom.o traps.o setup-common.o \
- -                                 udbg.o misc.o io.o dma.o \
- -                                 misc_$(CONFIG_WORD_SIZE).o \
+ +                                 udbg.o misc.o io.o dma.o misc_$(BITS).o \
                                    of_platform.o prom_parse.o
   obj-$(CONFIG_PPC64)           += setup_64.o sys_ppc32.o \
                                    signal_64.o ptrace32.o \
@@@ -69,27 -75,31 +74,27 @@@ obj-$(CONFIG_HIBERNATION)  += swsusp.o s
   ifeq ($(CONFIG_FSL_BOOKE),y)
   obj-$(CONFIG_HIBERNATION)     += swsusp_booke.o
   else
- -obj-$(CONFIG_HIBERNATION)     += swsusp_$(CONFIG_WORD_SIZE).o
+ +obj-$(CONFIG_HIBERNATION)     += swsusp_$(BITS).o
   endif
   obj64-$(CONFIG_HIBERNATION)   += swsusp_asm64.o
- -obj-$(CONFIG_MODULES)         += module.o module_$(CONFIG_WORD_SIZE).o
+ +obj-$(CONFIG_MODULES)         += module.o module_$(BITS).o
   obj-$(CONFIG_44x)             += cpu_setup_44x.o
   obj-$(CONFIG_PPC_FSL_BOOK3E)  += cpu_setup_fsl_booke.o
   obj-$(CONFIG_PPC_DOORBELL)    += dbell.o
   obj-$(CONFIG_JUMP_LABEL)      += jump_label.o
   
- -extra-y                               := head_$(CONFIG_WORD_SIZE).o
+ +extra-y                               := head_$(BITS).o
   extra-$(CONFIG_40x)           := head_40x.o
   extra-$(CONFIG_44x)           := head_44x.o
   extra-$(CONFIG_FSL_BOOKE)     := head_fsl_booke.o
   extra-$(CONFIG_8xx)           := head_8xx.o
   extra-y                               += vmlinux.lds
   
- -obj-$(CONFIG_RELOCATABLE)     += reloc_$(CONFIG_WORD_SIZE).o
+ +obj-$(CONFIG_RELOCATABLE)     += reloc_$(BITS).o
   
   obj-$(CONFIG_PPC32)           += entry_32.o setup_32.o
   obj-$(CONFIG_PPC64)           += dma-iommu.o iommu.o
   obj-$(CONFIG_KGDB)            += kgdb.o
- -obj-$(CONFIG_MODULES)         += ppc_ksyms.o
- -ifeq ($(CONFIG_PPC32),y)
- -obj-$(CONFIG_MODULES)         += ppc_ksyms_32.o
- -endif
   obj-$(CONFIG_BOOTX_TEXT)      += btext.o
   obj-$(CONFIG_SMP)             += smp.o
   obj-$(CONFIG_KPROBES)         += kprobes.o
@@@ -99,11 -109,11 +104,11 @@@ obj-$(CONFIG_STACKTRACE) += stacktrace.
   obj-$(CONFIG_SWIOTLB)         += dma-swiotlb.o
   
   pci64-$(CONFIG_PPC64)         += pci_dn.o pci-hotplug.o isa-bridge.o
- -obj-$(CONFIG_PCI)             += pci_$(CONFIG_WORD_SIZE).o $(pci64-y) \
+ +obj-$(CONFIG_PCI)             += pci_$(BITS).o $(pci64-y) \
                                    pci-common.o pci_of_scan.o
   obj-$(CONFIG_PCI_MSI)         += msi.o
   obj-$(CONFIG_KEXEC)           += machine_kexec.o crash.o \
- -                                 machine_kexec_$(CONFIG_WORD_SIZE).o
+ +                                 machine_kexec_$(BITS).o
   obj-$(CONFIG_AUDIT)           += audit.o
   obj64-$(CONFIG_AUDIT)         += compat_audit.o
   
diff --combined block/blk-softirq.c

index 96631e6,489eab8..06cf980
--- 1/block/blk-softirq.c
--- 2/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@@ -18,7 -18,7 +18,7 @@@ static DEFINE_PER_CPU(struct list_head
    * Softirq action handler - move entries to local list and loop over them
    * while passing them to the queue registered handler.
    */
- static void blk_done_softirq(struct softirq_action *h)
+ static __latent_entropy void blk_done_softirq(struct softirq_action *h)
   {
         struct list_head *cpu_list, local_list;
   
@@@ -78,21 -78,30 +78,21 @@@ static int raise_blk_irq(int cpu, struc
   }
   #endif
   
- -static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
- -                        void *hcpu)
+ +static int blk_softirq_cpu_dead(unsigned int cpu)
   {
         /*
          * If a CPU goes away, splice its entries to the current CPU
          * and trigger a run of the softirq
          */
- -      if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- -              int cpu = (unsigned long) hcpu;
- -
- -              local_irq_disable();
- -              list_splice_init(&per_cpu(blk_cpu_done, cpu),
- -                               this_cpu_ptr(&blk_cpu_done));
- -              raise_softirq_irqoff(BLOCK_SOFTIRQ);
- -              local_irq_enable();
- -      }
+ +      local_irq_disable();
+ +      list_splice_init(&per_cpu(blk_cpu_done, cpu),
+ +                       this_cpu_ptr(&blk_cpu_done));
+ +      raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ +      local_irq_enable();
   
- -      return NOTIFY_OK;
+ +      return 0;
   }
   
- -static struct notifier_block blk_cpu_notifier = {
- -      .notifier_call  = blk_cpu_notify,
- -};
- -
   void __blk_complete_request(struct request *req)
   {
         int ccpu, cpu;
@@@ -171,9 -180,7 +171,9 @@@ static __init int blk_softirq_init(void
                 INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
   
         open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
- -      register_hotcpu_notifier(&blk_cpu_notifier);
+ +      cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
+ +                                "block/softirq:dead", NULL,
+ +                                blk_softirq_cpu_dead);
         return 0;
   }
   subsys_initcall(blk_softirq_init);
diff --combined drivers/char/random.c

index d131e15,7274ae8..d6876d5
--- 1/drivers/char/random.c
--- 2/drivers/char/random.c
+++ b/drivers/char/random.c
@@@ -479,8 -479,8 +479,8 @@@ static ssize_t _extract_entropy(struct 
   
   static void crng_reseed(struct crng_state *crng, struct entropy_store *r);
   static void push_to_pool(struct work_struct *work);
- static __u32 input_pool_data[INPUT_POOL_WORDS];
- static __u32 blocking_pool_data[OUTPUT_POOL_WORDS];
+ static __u32 input_pool_data[INPUT_POOL_WORDS] __latent_entropy;
+ static __u32 blocking_pool_data[OUTPUT_POOL_WORDS] __latent_entropy;
   
   static struct entropy_store input_pool = {
         .poolinfo = &poolinfo_table[0],
@@@ -2100,37 -2100,23 +2100,37 @@@ unsigned long get_random_long(void
   }
   EXPORT_SYMBOL(get_random_long);
   
- -/*
- - * randomize_range() returns a start address such that
+ +/**
+ + * randomize_page - Generate a random, page aligned address
+ + * @start:    The smallest acceptable address the caller will take.
+ + * @range:    The size of the area, starting at @start, within which the
+ + *            random address must fall.
+ + *
+ + * If @start + @range would overflow, @range is capped.
    *
- - *    [...... <range> .....]
- - *  start                  end
+ + * NOTE: Historical use of randomize_range, which this replaces, presumed that
+ + * @start was already page aligned.  We now align it regardless.
    *
- - * a <range> with size "len" starting at the return value is inside in the
- - * area defined by [start, end], but is otherwise randomized.
+ + * Return: A page aligned address within [start, start + range).  On error,
+ + * @start is returned.
    */
   unsigned long
- -randomize_range(unsigned long start, unsigned long end, unsigned long len)
+ +randomize_page(unsigned long start, unsigned long range)
   {
- -      unsigned long range = end - len - start;
+ +      if (!PAGE_ALIGNED(start)) {
+ +              range -= PAGE_ALIGN(start) - start;
+ +              start = PAGE_ALIGN(start);
+ +      }
   
- -      if (end <= start + len)
- -              return 0;
- -      return PAGE_ALIGN(get_random_int() % range + start);
+ +      if (start > ULONG_MAX - range)
+ +              range = ULONG_MAX - start;
+ +
+ +      range >>= PAGE_SHIFT;
+ +
+ +      if (range == 0)
+ +              return start;
+ +
+ +      return start + (get_random_long() % range << PAGE_SHIFT);
   }
   
   /* Interface for in-kernel drivers of true hardware RNGs.
diff --combined fs/namespace.c

index 58aca9c,4a9568b..e6c234b
--- 1/fs/namespace.c
--- 2/fs/namespace.c
+++ b/fs/namespace.c
@@@ -27,9 -27,6 +27,9 @@@
   #include "pnode.h"
   #include "internal.h"
   
+ +/* Maximum number of mounts in a mount namespace */
+ +unsigned int sysctl_mount_max __read_mostly = 100000;
+ +
   static unsigned int m_hash_mask __read_mostly;
   static unsigned int m_hash_shift __read_mostly;
   static unsigned int mp_hash_mask __read_mostly;
@@@ -902,9 -899,6 +902,9 @@@ static void commit_tree(struct mount *m
   
         list_splice(&head, n->list.prev);
   
+ +      n->mounts += n->pending_mounts;
+ +      n->pending_mounts = 0;
+ +
         attach_shadowed(mnt, parent, shadows);
         touch_mnt_namespace(n);
   }
@@@ -1425,16 -1419,11 +1425,16 @@@ static void umount_tree(struct mount *m
                 propagate_umount(&tmp_list);
   
         while (!list_empty(&tmp_list)) {
+ +              struct mnt_namespace *ns;
                 bool disconnect;
                 p = list_first_entry(&tmp_list, struct mount, mnt_list);
                 list_del_init(&p->mnt_expire);
                 list_del_init(&p->mnt_list);
- -              __touch_mnt_namespace(p->mnt_ns);
+ +              ns = p->mnt_ns;
+ +              if (ns) {
+ +                      ns->mounts--;
+ +                      __touch_mnt_namespace(ns);
+ +              }
                 p->mnt_ns = NULL;
                 if (how & UMOUNT_SYNC)
                         p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
@@@ -1851,28 -1840,6 +1851,28 @@@ static int invent_group_ids(struct moun
         return 0;
   }
   
+ +int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
+ +{
+ +      unsigned int max = READ_ONCE(sysctl_mount_max);
+ +      unsigned int mounts = 0, old, pending, sum;
+ +      struct mount *p;
+ +
+ +      for (p = mnt; p; p = next_mnt(p, mnt))
+ +              mounts++;
+ +
+ +      old = ns->mounts;
+ +      pending = ns->pending_mounts;
+ +      sum = old + pending;
+ +      if ((old > sum) ||
+ +          (pending > sum) ||
+ +          (max < sum) ||
+ +          (mounts > (max - sum)))
+ +              return -ENOSPC;
+ +
+ +      ns->pending_mounts = pending + mounts;
+ +      return 0;
+ +}
+ +
   /*
    *  @source_mnt : mount tree to be attached
    *  @nd         : place the mount tree @source_mnt is attached
@@@ -1942,18 -1909,10 +1942,18 @@@ static int attach_recursive_mnt(struct 
                         struct path *parent_path)
   {
         HLIST_HEAD(tree_list);
+ +      struct mnt_namespace *ns = dest_mnt->mnt_ns;
         struct mount *child, *p;
         struct hlist_node *n;
         int err;
   
+ +      /* Is there space to add these mounts to the mount namespace? */
+ +      if (!parent_path) {
+ +              err = count_mounts(ns, source_mnt);
+ +              if (err)
+ +                      goto out;
+ +      }
+ +
         if (IS_MNT_SHARED(dest_mnt)) {
                 err = invent_group_ids(source_mnt, true);
                 if (err)
@@@ -1990,13 -1949,11 +1990,13 @@@
    out_cleanup_ids:
         while (!hlist_empty(&tree_list)) {
                 child = hlist_entry(tree_list.first, struct mount, mnt_hash);
+ +              child->mnt_parent->mnt_ns->pending_mounts = 0;
                 umount_tree(child, UMOUNT_SYNC);
         }
         unlock_mount_hash();
         cleanup_group_ids(source_mnt, NULL);
    out:
+ +      ns->pending_mounts = 0;
         return err;
   }
   
@@@ -2743,7 -2700,7 +2743,7 @@@ long do_mount(const char *dev_name, con
   
         flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
                    MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
- -                 MS_STRICTATIME);
+ +                 MS_STRICTATIME | MS_NOREMOTELOCK);
   
         if (flags & MS_REMOUNT)
                 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
@@@ -2762,20 -2719,9 +2762,20 @@@ dput_out
         return retval;
   }
   
+ +static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
+ +{
+ +      return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
+ +}
+ +
+ +static void dec_mnt_namespaces(struct ucounts *ucounts)
+ +{
+ +      dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
+ +}
+ +
   static void free_mnt_ns(struct mnt_namespace *ns)
   {
         ns_free_inum(&ns->ns);
+ +      dec_mnt_namespaces(ns->ucounts);
         put_user_ns(ns->user_ns);
         kfree(ns);
   }
@@@ -2792,22 -2738,14 +2792,22 @@@ static atomic64_t mnt_ns_seq = ATOMIC64
   static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
   {
         struct mnt_namespace *new_ns;
+ +      struct ucounts *ucounts;
         int ret;
   
+ +      ucounts = inc_mnt_namespaces(user_ns);
+ +      if (!ucounts)
+ +              return ERR_PTR(-ENOSPC);
+ +
         new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
- -      if (!new_ns)
+ +      if (!new_ns) {
+ +              dec_mnt_namespaces(ucounts);
                 return ERR_PTR(-ENOMEM);
+ +      }
         ret = ns_alloc_inum(&new_ns->ns);
         if (ret) {
                 kfree(new_ns);
+ +              dec_mnt_namespaces(ucounts);
                 return ERR_PTR(ret);
         }
         new_ns->ns.ops = &mntns_operations;
@@@ -2818,12 -2756,10 +2818,13 @@@
         init_waitqueue_head(&new_ns->poll);
         new_ns->event = 0;
         new_ns->user_ns = get_user_ns(user_ns);
+ +      new_ns->ucounts = ucounts;
+ +      new_ns->mounts = 0;
+ +      new_ns->pending_mounts = 0;
         return new_ns;
   }
   
+ __latent_entropy
   struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
                 struct user_namespace *user_ns, struct fs_struct *new_fs)
   {
@@@ -2870,7 -2806,6 +2871,7 @@@
         q = new;
         while (p) {
                 q->mnt_ns = new_ns;
+ +              new_ns->mounts++;
                 if (new_fs) {
                         if (&p->mnt == new_fs->root.mnt) {
                                 new_fs->root.mnt = mntget(&q->mnt);
@@@ -2909,7 -2844,6 +2910,7 @@@ static struct mnt_namespace *create_mnt
                 struct mount *mnt = real_mount(m);
                 mnt->mnt_ns = new_ns;
                 new_ns->root = mnt;
+ +              new_ns->mounts++;
                 list_add(&mnt->mnt_list, &new_ns->list);
         } else {
                 mntput(m);
@@@ -3415,16 -3349,10 +3416,16 @@@ static int mntns_install(struct nsprox
         return 0;
   }
   
+ +static struct user_namespace *mntns_owner(struct ns_common *ns)
+ +{
+ +      return to_mnt_ns(ns)->user_ns;
+ +}
+ +
   const struct proc_ns_operations mntns_operations = {
         .name           = "mnt",
         .type           = CLONE_NEWNS,
         .get            = mntns_get,
         .put            = mntns_put,
         .install        = mntns_install,
+ +      .owner          = mntns_owner,
   };
diff --combined include/linux/compiler.h

index f1bfa15,ceaddaf..cf0fa5d
--- 1/include/linux/compiler.h
--- 2/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@@ -182,29 -182,6 +182,29 @@@ void ftrace_likely_update(struct ftrace
   # define unreachable() do { } while (1)
   #endif
   
+ +/*
+ + * KENTRY - kernel entry point
+ + * This can be used to annotate symbols (functions or data) that are used
+ + * without their linker symbol being referenced explicitly. For example,
+ + * interrupt vector handlers, or functions in the kernel image that are found
+ + * programmatically.
+ + *
+ + * Not required for symbols exported with EXPORT_SYMBOL, or initcalls. Those
+ + * are handled in their own way (with KEEP() in linker scripts).
+ + *
+ + * KENTRY can be avoided if the symbols in question are marked as KEEP() in the
+ + * linker script. For example an architecture could KEEP() its entire
+ + * boot/exception vector code rather than annotate each function and data.
+ + */
+ +#ifndef KENTRY
+ +# define KENTRY(sym)                                          \
+ +      extern typeof(sym) sym;                                 \
+ +      static const unsigned long __kentry_##sym               \
+ +      __used                                                  \
+ +      __attribute__((section("___kentry" "+" #sym ), used))   \
+ +      = (unsigned long)&sym;
+ +#endif
+ +
   #ifndef RELOC_HIDE
   # define RELOC_HIDE(ptr, off)                                 \
     ({ unsigned long __ptr;                                     \
@@@ -429,6 -406,10 +429,10 @@@ static __always_inline void __write_onc
   # define __attribute_const__  /* unimplemented */
   #endif
   
+ #ifndef __latent_entropy
+ # define __latent_entropy
+ #endif
+ 
   /*
    * Tell gcc if a function is cold. The compiler will assume any path
    * directly leading to the call is unlikely.
diff --combined include/linux/fdtable.h

index aca2a6a,9852c7e..6e84b2c
--- 1/include/linux/fdtable.h
--- 2/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@@ -30,12 -30,12 +30,12 @@@ struct fdtable 
         struct rcu_head rcu;
   };
   
- -static inline bool close_on_exec(int fd, const struct fdtable *fdt)
+ +static inline bool close_on_exec(unsigned int fd, const struct fdtable *fdt)
   {
         return test_bit(fd, fdt->close_on_exec);
   }
   
- -static inline bool fd_is_open(int fd, const struct fdtable *fdt)
+ +static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
   {
         return test_bit(fd, fdt->open_fds);
   }
@@@ -57,7 -57,7 +57,7 @@@ struct files_struct 
      * written part on a separate cache line in SMP
      */
         spinlock_t file_lock ____cacheline_aligned_in_smp;
- -      int next_fd;
+ +      unsigned int next_fd;
         unsigned long close_on_exec_init[1];
         unsigned long open_fds_init[1];
         unsigned long full_fds_bits_init[1];
@@@ -105,7 -105,7 +105,7 @@@ struct files_struct *get_files_struct(s
   void put_files_struct(struct files_struct *fs);
   void reset_files_struct(struct files_struct *);
   int unshare_files(struct files_struct **);
- struct files_struct *dup_fd(struct files_struct *, int *);
+ struct files_struct *dup_fd(struct files_struct *, int *) __latent_entropy;
   void do_close_on_exec(struct files_struct *);
   int iterate_fd(struct files_struct *, unsigned,
                 int (*)(const void *, struct file *, unsigned),
diff --combined include/linux/init.h

index 024a0b5,1e5c131..e30104c
--- 1/include/linux/init.h
--- 2/include/linux/init.h
+++ b/include/linux/init.h
@@@ -39,12 -39,23 +39,12 @@@
   
   /* These are for everybody (although not all archs will actually
      discard it in modules) */
- #define __init                __section(.init.text) __cold notrace
+ #define __init                __section(.init.text) __cold notrace __latent_entropy
   #define __initdata    __section(.init.data)
- -#define __initconst   __constsection(.init.rodata)
+ +#define __initconst   __section(.init.rodata)
   #define __exitdata    __section(.exit.data)
   #define __exit_call   __used __section(.exitcall.exit)
   
- -/*
- - * Some architecture have tool chains which do not handle rodata attributes
- - * correctly. For those disable special sections for const, so that other
- - * architectures can annotate correctly.
- - */
- -#ifdef CONFIG_BROKEN_RODATA
- -#define __constsection(x)
- -#else
- -#define __constsection(x) __section(x)
- -#endif
- -
   /*
    * modpost check for section mismatches during the kernel build.
    * A section mismatch happens when there are references from a
@@@ -64,7 -75,7 +64,7 @@@
    */
   #define __ref            __section(.ref.text) noinline
   #define __refdata        __section(.ref.data)
- -#define __refconst       __constsection(.ref.rodata)
+ +#define __refconst       __section(.ref.rodata)
   
   #ifdef MODULE
   #define __exitused
@@@ -75,12 -86,13 +75,13 @@@
   #define __exit          __section(.exit.text) __exitused __cold notrace
   
   /* Used for MEMORY_HOTPLUG */
- #define __meminit        __section(.meminit.text) __cold notrace
+ #define __meminit        __section(.meminit.text) __cold notrace \
+                                                 __latent_entropy
   #define __meminitdata    __section(.meminit.data)
- -#define __meminitconst   __constsection(.meminit.rodata)
+ +#define __meminitconst   __section(.meminit.rodata)
   #define __memexit        __section(.memexit.text) __exitused __cold notrace
   #define __memexitdata    __section(.memexit.data)
- -#define __memexitconst   __constsection(.memexit.rodata)
+ +#define __memexitconst   __section(.memexit.rodata)
   
   /* For assembly routines */
   #define __HEAD                .section        ".head.text","ax"
@@@ -139,8 -151,24 +140,8 @@@ extern bool initcall_debug
   
   #ifndef __ASSEMBLY__
   
- -#ifdef CONFIG_LTO
- -/* Work around a LTO gcc problem: when there is no reference to a variable
- - * in a module it will be moved to the end of the program. This causes
- - * reordering of initcalls which the kernel does not like.
- - * Add a dummy reference function to avoid this. The function is
- - * deleted by the linker.
- - */
- -#define LTO_REFERENCE_INITCALL(x) \
- -      ; /* yes this is needed */                      \
- -      static __used __exit void *reference_##x(void)  \
- -      {                                               \
- -              return &x;                              \
- -      }
- -#else
- -#define LTO_REFERENCE_INITCALL(x)
- -#endif
- -
- -/* initcalls are now grouped by functionality into separate 
+ +/*
+ + * initcalls are now grouped by functionality into separate
    * subsections. Ordering inside the subsections is determined
    * by link order. 
    * For backwards compatibility, initcall() puts the call in 
@@@ -148,16 -176,12 +149,16 @@@
    *
    * The `id' arg to __define_initcall() is needed so that multiple initcalls
    * can point at the same handler without causing duplicate-symbol build errors.
+ + *
+ + * Initcalls are run by placing pointers in initcall sections that the
+ + * kernel iterates at runtime. The linker can do dead code / data elimination
+ + * and remove that completely, so the initcall sections have to be marked
+ + * as KEEP() in the linker script.
    */
   
   #define __define_initcall(fn, id) \
         static initcall_t __initcall_##fn##id __used \
- -      __attribute__((__section__(".initcall" #id ".init"))) = fn; \
- -      LTO_REFERENCE_INITCALL(__initcall_##fn##id)
+ +      __attribute__((__section__(".initcall" #id ".init"))) = fn;
   
   /*
    * Early initcalls run before initializing SMP.
@@@ -193,15 -217,15 +194,15 @@@
   
   #define __initcall(fn) device_initcall(fn)
   
- -#define __exitcall(fn) \
+ +#define __exitcall(fn)                                                \
         static exitcall_t __exitcall_##fn __exit_call = fn
   
- -#define console_initcall(fn) \
- -      static initcall_t __initcall_##fn \
+ +#define console_initcall(fn)                                  \
+ +      static initcall_t __initcall_##fn                       \
         __used __section(.con_initcall.init) = fn
   
- -#define security_initcall(fn) \
- -      static initcall_t __initcall_##fn \
+ +#define security_initcall(fn)                                 \
+ +      static initcall_t __initcall_##fn                       \
         __used __section(.security_initcall.init) = fn
   
   struct obs_kernel_param {
diff --combined include/linux/random.h

index f7bb7a3,d80a438..7bd2403
--- 1/include/linux/random.h
--- 2/include/linux/random.h
+++ b/include/linux/random.h
@@@ -18,9 -18,20 +18,20 @@@ struct random_ready_callback 
   };
   
   extern void add_device_randomness(const void *, unsigned int);
+ 
+ #if defined(CONFIG_GCC_PLUGIN_LATENT_ENTROPY) && !defined(__CHECKER__)
+ static inline void add_latent_entropy(void)
+ {
+       add_device_randomness((const void *)&latent_entropy,
+                             sizeof(latent_entropy));
+ }
+ #else
+ static inline void add_latent_entropy(void) {}
+ #endif
+ 
   extern void add_input_randomness(unsigned int type, unsigned int code,
-                                unsigned int value);
- extern void add_interrupt_randomness(int irq, int irq_flags);
+                                unsigned int value) __latent_entropy;
+ extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
   
   extern void get_random_bytes(void *buf, int nbytes);
   extern int add_random_ready_callback(struct random_ready_callback *rdy);
@@@ -34,7 -45,7 +45,7 @@@ extern const struct file_operations ran
   
   unsigned int get_random_int(void);
   unsigned long get_random_long(void);
- -unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len);
+ +unsigned long randomize_page(unsigned long start, unsigned long range);
   
   u32 prandom_u32(void);
   void prandom_bytes(void *buf, size_t nbytes);
diff --combined kernel/fork.c

index 6d42242,0539388..623259f
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -158,83 -158,19 +158,83 @@@ void __weak arch_release_thread_stack(u
    * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
    * kmemcache based allocator.
    */
- -# if THREAD_SIZE >= PAGE_SIZE
- -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
- -                                                int node)
+ +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
+ +
+ +#ifdef CONFIG_VMAP_STACK
+ +/*
+ + * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
+ + * flush.  Try to minimize the number of calls by caching stacks.
+ + */
+ +#define NR_CACHED_STACKS 2
+ +static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+ +#endif
+ +
+ +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
   {
+ +#ifdef CONFIG_VMAP_STACK
+ +      void *stack;
+ +      int i;
+ +
+ +      local_irq_disable();
+ +      for (i = 0; i < NR_CACHED_STACKS; i++) {
+ +              struct vm_struct *s = this_cpu_read(cached_stacks[i]);
+ +
+ +              if (!s)
+ +                      continue;
+ +              this_cpu_write(cached_stacks[i], NULL);
+ +
+ +              tsk->stack_vm_area = s;
+ +              local_irq_enable();
+ +              return s->addr;
+ +      }
+ +      local_irq_enable();
+ +
+ +      stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
+ +                                   VMALLOC_START, VMALLOC_END,
+ +                                   THREADINFO_GFP | __GFP_HIGHMEM,
+ +                                   PAGE_KERNEL,
+ +                                   0, node, __builtin_return_address(0));
+ +
+ +      /*
+ +       * We can't call find_vm_area() in interrupt context, and
+ +       * free_thread_stack() can be called in interrupt context,
+ +       * so cache the vm_struct.
+ +       */
+ +      if (stack)
+ +              tsk->stack_vm_area = find_vm_area(stack);
+ +      return stack;
+ +#else
         struct page *page = alloc_pages_node(node, THREADINFO_GFP,
                                              THREAD_SIZE_ORDER);
   
         return page ? page_address(page) : NULL;
+ +#endif
   }
   
- -static inline void free_thread_stack(unsigned long *stack)
+ +static inline void free_thread_stack(struct task_struct *tsk)
   {
- -      __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
+ +#ifdef CONFIG_VMAP_STACK
+ +      if (task_stack_vm_area(tsk)) {
+ +              unsigned long flags;
+ +              int i;
+ +
+ +              local_irq_save(flags);
+ +              for (i = 0; i < NR_CACHED_STACKS; i++) {
+ +                      if (this_cpu_read(cached_stacks[i]))
+ +                              continue;
+ +
+ +                      this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
+ +                      local_irq_restore(flags);
+ +                      return;
+ +              }
+ +              local_irq_restore(flags);
+ +
+ +              vfree(tsk->stack);
+ +              return;
+ +      }
+ +#endif
+ +
+ +      __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
   }
   # else
   static struct kmem_cache *thread_stack_cache;
@@@ -245,9 -181,9 +245,9 @@@ static unsigned long *alloc_thread_stac
         return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
   }
   
- -static void free_thread_stack(unsigned long *stack)
+ +static void free_thread_stack(struct task_struct *tsk)
   {
- -      kmem_cache_free(thread_stack_cache, stack);
+ +      kmem_cache_free(thread_stack_cache, tsk->stack);
   }
   
   void thread_stack_cache_init(void)
@@@ -277,76 -213,24 +277,76 @@@ struct kmem_cache *vm_area_cachep
   /* SLAB cache for mm_struct structures (tsk->mm) */
   static struct kmem_cache *mm_cachep;
   
- -static void account_kernel_stack(unsigned long *stack, int account)
+ +static void account_kernel_stack(struct task_struct *tsk, int account)
   {
- -      /* All stack pages are in the same zone and belong to the same memcg. */
- -      struct page *first_page = virt_to_page(stack);
+ +      void *stack = task_stack_page(tsk);
+ +      struct vm_struct *vm = task_stack_vm_area(tsk);
+ +
+ +      BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+ +
+ +      if (vm) {
+ +              int i;
+ +
+ +              BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+ +
+ +              for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ +                      mod_zone_page_state(page_zone(vm->pages[i]),
+ +                                          NR_KERNEL_STACK_KB,
+ +                                          PAGE_SIZE / 1024 * account);
+ +              }
+ +
+ +              /* All stack pages belong to the same memcg. */
+ +              memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+ +                                          account * (THREAD_SIZE / 1024));
+ +      } else {
+ +              /*
+ +               * All stack pages are in the same zone and belong to the
+ +               * same memcg.
+ +               */
+ +              struct page *first_page = virt_to_page(stack);
   
- -      mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
- -                          THREAD_SIZE / 1024 * account);
+ +              mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+ +                                  THREAD_SIZE / 1024 * account);
   
- -      memcg_kmem_update_page_stat(
- -              first_page, MEMCG_KERNEL_STACK_KB,
- -              account * (THREAD_SIZE / 1024));
+ +              memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
+ +                                          account * (THREAD_SIZE / 1024));
+ +      }
   }
   
- -void free_task(struct task_struct *tsk)
+ +static void release_task_stack(struct task_struct *tsk)
   {
- -      account_kernel_stack(tsk->stack, -1);
+ +      account_kernel_stack(tsk, -1);
         arch_release_thread_stack(tsk->stack);
- -      free_thread_stack(tsk->stack);
+ +      free_thread_stack(tsk);
+ +      tsk->stack = NULL;
+ +#ifdef CONFIG_VMAP_STACK
+ +      tsk->stack_vm_area = NULL;
+ +#endif
+ +}
+ +
+ +#ifdef CONFIG_THREAD_INFO_IN_TASK
+ +void put_task_stack(struct task_struct *tsk)
+ +{
+ +      if (atomic_dec_and_test(&tsk->stack_refcount))
+ +              release_task_stack(tsk);
+ +}
+ +#endif
+ +
+ +void free_task(struct task_struct *tsk)
+ +{
+ +#ifndef CONFIG_THREAD_INFO_IN_TASK
+ +      /*
+ +       * The task is finally done with both the stack and thread_info,
+ +       * so free both.
+ +       */
+ +      release_task_stack(tsk);
+ +#else
+ +      /*
+ +       * If the task had a separate stack allocation, it should be gone
+ +       * by now.
+ +       */
+ +      WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
+ +#endif
         rt_mutex_debug_task_free(tsk);
         ftrace_graph_exit_task(tsk);
         put_seccomp_filter(tsk);
@@@ -359,12 -243,6 +359,12 @@@ static inline void free_signal_struct(s
   {
         taskstats_tgid_free(sig);
         sched_autogroup_exit(sig);
+ +      /*
+ +       * __mmdrop is not safe to call from softirq context on x86 due to
+ +       * pgd_dtor so postpone it to the async context
+ +       */
+ +      if (sig->oom_mm)
+ +              mmdrop_async(sig->oom_mm);
         kmem_cache_free(signal_cachep, sig);
   }
   
@@@ -424,7 -302,6 +424,7 @@@ int arch_task_struct_size __read_mostly
   
   void __init fork_init(void)
   {
+ +      int i;
   #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
   #ifndef ARCH_MIN_TASKALIGN
   #define ARCH_MIN_TASKALIGN    L1_CACHE_BYTES
@@@ -444,10 -321,6 +444,10 @@@
         init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
         init_task.signal->rlim[RLIMIT_SIGPENDING] =
                 init_task.signal->rlim[RLIMIT_NPROC];
+ +
+ +      for (i = 0; i < UCOUNT_COUNTS; i++) {
+ +              init_user_ns.ucount_max[i] = max_threads/2;
+ +      }
   }
   
   int __weak arch_dup_task_struct(struct task_struct *dst,
@@@ -469,7 -342,6 +469,7 @@@ static struct task_struct *dup_task_str
   {
         struct task_struct *tsk;
         unsigned long *stack;
+ +      struct vm_struct *stack_vm_area;
         int err;
   
         if (node == NUMA_NO_NODE)
@@@ -482,26 -354,11 +482,26 @@@
         if (!stack)
                 goto free_tsk;
   
+ +      stack_vm_area = task_stack_vm_area(tsk);
+ +
         err = arch_dup_task_struct(tsk, orig);
+ +
+ +      /*
+ +       * arch_dup_task_struct() clobbers the stack-related fields.  Make
+ +       * sure they're properly initialized before using any stack-related
+ +       * functions again.
+ +       */
+ +      tsk->stack = stack;
+ +#ifdef CONFIG_VMAP_STACK
+ +      tsk->stack_vm_area = stack_vm_area;
+ +#endif
+ +#ifdef CONFIG_THREAD_INFO_IN_TASK
+ +      atomic_set(&tsk->stack_refcount, 1);
+ +#endif
+ +
         if (err)
                 goto free_stack;
   
- -      tsk->stack = stack;
   #ifdef CONFIG_SECCOMP
         /*
          * We must handle setting up seccomp filters once we're under
@@@ -533,21 -390,22 +533,22 @@@
         tsk->task_frag.page = NULL;
         tsk->wake_q.next = NULL;
   
- -      account_kernel_stack(stack, 1);
+ +      account_kernel_stack(tsk, 1);
   
         kcov_task_init(tsk);
   
         return tsk;
   
   free_stack:
- -      free_thread_stack(stack);
+ +      free_thread_stack(tsk);
   free_tsk:
         free_task_struct(tsk);
         return NULL;
   }
   
   #ifdef CONFIG_MMU
- static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+ static __latent_entropy int dup_mmap(struct mm_struct *mm,
+                                       struct mm_struct *oldmm)
   {
         struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
         struct rb_node **rb_link, *rb_parent;
@@@ -854,7 -712,6 +855,7 @@@ static inline void __mmput(struct mm_st
         ksm_exit(mm);
         khugepaged_exit(mm); /* must run before exit_mmap */
         exit_mmap(mm);
+ +      mm_put_huge_zero_page(mm);
         set_mm_exe_file(mm, NULL);
         if (!list_empty(&mm->mmlist)) {
                 spin_lock(&mmlist_lock);
@@@ -863,7 -720,6 +864,7 @@@
         }
         if (mm->binfmt)
                 module_put(mm->binfmt->module);
+ +      set_bit(MMF_OOM_SKIP, &mm->flags);
         mmdrop(mm);
   }
   
@@@ -1441,7 -1297,8 +1442,8 @@@ init_task_pid(struct task_struct *task
    * parts of the process environment (as per the clone
    * flags). The actual kick-off is left to the caller.
    */
- static struct task_struct *copy_process(unsigned long clone_flags,
+ static __latent_entropy struct task_struct *copy_process(
+                                       unsigned long clone_flags,
                                         unsigned long stack_start,
                                         unsigned long stack_size,
                                         int __user *child_tidptr,
@@@ -1860,7 -1717,6 +1862,7 @@@ bad_fork_cleanup_count
         atomic_dec(&p->cred->user->processes);
         exit_creds(p);
   bad_fork_free:
+ +      put_task_stack(p);
         free_task(p);
   fork_out:
         return ERR_PTR(retval);
@@@ -1926,6 -1782,7 +1928,7 @@@ long _do_fork(unsigned long clone_flags
   
         p = copy_process(clone_flags, stack_start, stack_size,
                          child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+       add_latent_entropy();
         /*
          * Do this prior waking up the new thread - the thread pointer
          * might get invalid after that point, if the thread exits quickly.
diff --combined kernel/rcu/tree.c

index 7e2e038,e5164de..69a5611
--- 1/kernel/rcu/tree.c
--- 2/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@@ -41,6 -41,7 +41,6 @@@
   #include <linux/export.h>
   #include <linux/completion.h>
   #include <linux/moduleparam.h>
- -#include <linux/module.h>
   #include <linux/percpu.h>
   #include <linux/notifier.h>
   #include <linux/cpu.h>
@@@ -59,6 -60,7 +59,6 @@@
   #include "tree.h"
   #include "rcu.h"
   
- -MODULE_ALIAS("rcutree");
   #ifdef MODULE_PARAM_PREFIX
   #undef MODULE_PARAM_PREFIX
   #endif
@@@ -1846,7 -1848,6 +1846,7 @@@ static bool __note_gp_changes(struct rc
                               struct rcu_data *rdp)
   {
         bool ret;
+ +      bool need_gp;
   
         /* Handle the ends of any preceding grace periods first. */
         if (rdp->completed == rnp->completed &&
@@@ -1873,10 -1874,9 +1873,10 @@@
                  */
                 rdp->gpnum = rnp->gpnum;
                 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
- -              rdp->cpu_no_qs.b.norm = true;
+ +              need_gp = !!(rnp->qsmask & rdp->grpmask);
+ +              rdp->cpu_no_qs.b.norm = need_gp;
                 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
- -              rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
+ +              rdp->core_needs_qs = need_gp;
                 zero_cpu_stall_ticks(rdp);
                 WRITE_ONCE(rdp->gpwrap, false);
         }
@@@ -2344,7 -2344,7 +2344,7 @@@ static void rcu_report_qs_rsp(struct rc
         WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
         WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
         raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
- -      swake_up(&rsp->gp_wq);  /* Memory barrier implied by swake_up() path. */
+ +      rcu_gp_kthread_wake(rsp);
   }
   
   /*
@@@ -2970,7 -2970,7 +2970,7 @@@ static void force_quiescent_state(struc
         }
         WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
         raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
- -      swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
+ +      rcu_gp_kthread_wake(rsp);
   }
   
   /*
@@@ -3013,7 -3013,7 +3013,7 @@@ __rcu_process_callbacks(struct rcu_stat
   /*
    * Do RCU core processing for the current CPU.
    */
- static void rcu_process_callbacks(struct softirq_action *unused)
+ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
   {
         struct rcu_state *rsp;
   
@@@ -3792,6 -3792,8 +3792,6 @@@ rcu_init_percpu_data(int cpu, struct rc
         rnp = rdp->mynode;
         mask = rdp->grpmask;
         raw_spin_lock_rcu_node(rnp);            /* irqs already disabled. */
- -      rnp->qsmaskinitnext |= mask;
- -      rnp->expmaskinitnext |= mask;
         if (!rdp->beenonline)
                 WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
         rdp->beenonline = true;  /* We have now been online. */
@@@ -3858,32 -3860,6 +3858,32 @@@ int rcutree_dead_cpu(unsigned int cpu
         return 0;
   }
   
+ +/*
+ + * Mark the specified CPU as being online so that subsequent grace periods
+ + * (both expedited and normal) will wait on it.  Note that this means that
+ + * incoming CPUs are not allowed to use RCU read-side critical sections
+ + * until this function is called.  Failing to observe this restriction
+ + * will result in lockdep splats.
+ + */
+ +void rcu_cpu_starting(unsigned int cpu)
+ +{
+ +      unsigned long flags;
+ +      unsigned long mask;
+ +      struct rcu_data *rdp;
+ +      struct rcu_node *rnp;
+ +      struct rcu_state *rsp;
+ +
+ +      for_each_rcu_flavor(rsp) {
+ +              rdp = this_cpu_ptr(rsp->rda);
+ +              rnp = rdp->mynode;
+ +              mask = rdp->grpmask;
+ +              raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ +              rnp->qsmaskinitnext |= mask;
+ +              rnp->expmaskinitnext |= mask;
+ +              raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ +      }
+ +}
+ +
   #ifdef CONFIG_HOTPLUG_CPU
   /*
    * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
@@@ -4233,10 -4209,8 +4233,10 @@@ void __init rcu_init(void
          * or the scheduler are operational.
          */
         pm_notifier(rcu_pm_notify, 0);
- -      for_each_online_cpu(cpu)
+ +      for_each_online_cpu(cpu) {
                 rcutree_prepare_cpu(cpu);
+ +              rcu_cpu_starting(cpu);
+ +      }
   }
   
   #include "tree_exp.h"
diff --combined kernel/sched/fair.c

index 502e95a,004996d..2d4ad72
--- 1/kernel/sched/fair.c
--- 2/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@@ -114,12 -114,6 +114,12 @@@ unsigned int __read_mostly sysctl_sched
   unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
   #endif
   
+ +/*
+ + * The margin used when comparing utilization with CPU capacity:
+ + * util * 1024 < capacity * margin
+ + */
+ +unsigned int capacity_margin = 1280; /* ~20% */
+ +
   static inline void update_load_add(struct load_weight *lw, unsigned long inc)
   {
         lw->weight += inc;
@@@ -262,7 -256,9 +262,7 @@@ static inline struct rq *rq_of(struct c
   
   static inline struct task_struct *task_of(struct sched_entity *se)
   {
- -#ifdef CONFIG_SCHED_DEBUG
- -      WARN_ON_ONCE(!entity_is_task(se));
- -#endif
+ +      SCHED_WARN_ON(!entity_is_task(se));
         return container_of(se, struct task_struct, se);
   }
   
@@@ -460,23 -456,17 +460,23 @@@ static inline int entity_before(struct 
   
   static void update_min_vruntime(struct cfs_rq *cfs_rq)
   {
+ +      struct sched_entity *curr = cfs_rq->curr;
+ +
         u64 vruntime = cfs_rq->min_vruntime;
   
- -      if (cfs_rq->curr)
- -              vruntime = cfs_rq->curr->vruntime;
+ +      if (curr) {
+ +              if (curr->on_rq)
+ +                      vruntime = curr->vruntime;
+ +              else
+ +                      curr = NULL;
+ +      }
   
         if (cfs_rq->rb_leftmost) {
                 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
                                                    struct sched_entity,
                                                    run_node);
   
- -              if (!cfs_rq->curr)
+ +              if (!curr)
                         vruntime = se->vruntime;
                 else
                         vruntime = min_vruntime(vruntime, se->vruntime);
@@@ -666,7 -656,7 +666,7 @@@ static u64 sched_vslice(struct cfs_rq *
   }
   
   #ifdef CONFIG_SMP
- -static int select_idle_sibling(struct task_struct *p, int cpu);
+ +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
   static unsigned long task_h_load(struct task_struct *p);
   
   /*
@@@ -736,6 -726,7 +736,6 @@@ void post_init_entity_util_avg(struct s
         struct sched_avg *sa = &se->avg;
         long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
         u64 now = cfs_rq_clock_task(cfs_rq);
- -      int tg_update;
   
         if (cap > 0) {
                 if (cfs_rq->avg.util_avg != 0) {
@@@ -768,9 -759,10 +768,9 @@@
                 }
         }
   
- -      tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+ +      update_cfs_rq_load_avg(now, cfs_rq, false);
         attach_entity_load_avg(cfs_rq, se);
- -      if (tg_update)
- -              update_tg_load_avg(cfs_rq, false);
+ +      update_tg_load_avg(cfs_rq, false);
   }
   
   #else /* !CONFIG_SMP */
@@@ -807,7 -799,7 +807,7 @@@ static void update_curr(struct cfs_rq *
                       max(delta_exec, curr->statistics.exec_max));
   
         curr->sum_exec_runtime += delta_exec;
- -      schedstat_add(cfs_rq, exec_clock, delta_exec);
+ +      schedstat_add(cfs_rq->exec_clock, delta_exec);
   
         curr->vruntime += calc_delta_fair(delta_exec, curr);
         update_min_vruntime(cfs_rq);
@@@ -828,34 -820,26 +828,34 @@@ static void update_curr_fair(struct rq 
         update_curr(cfs_rq_of(&rq->curr->se));
   }
   
- -#ifdef CONFIG_SCHEDSTATS
   static inline void
   update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
   {
- -      u64 wait_start = rq_clock(rq_of(cfs_rq));
+ +      u64 wait_start, prev_wait_start;
+ +
+ +      if (!schedstat_enabled())
+ +              return;
+ +
+ +      wait_start = rq_clock(rq_of(cfs_rq));
+ +      prev_wait_start = schedstat_val(se->statistics.wait_start);
   
         if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
- -          likely(wait_start > se->statistics.wait_start))
- -              wait_start -= se->statistics.wait_start;
+ +          likely(wait_start > prev_wait_start))
+ +              wait_start -= prev_wait_start;
   
- -      se->statistics.wait_start = wait_start;
+ +      schedstat_set(se->statistics.wait_start, wait_start);
   }
   
- -static void
+ +static inline void
   update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
   {
         struct task_struct *p;
         u64 delta;
   
- -      delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+ +      if (!schedstat_enabled())
+ +              return;
+ +
+ +      delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
   
         if (entity_is_task(se)) {
                 p = task_of(se);
@@@ -865,114 -849,35 +865,114 @@@
                          * time stamp can be adjusted to accumulate wait time
                          * prior to migration.
                          */
- -                      se->statistics.wait_start = delta;
+ +                      schedstat_set(se->statistics.wait_start, delta);
                         return;
                 }
                 trace_sched_stat_wait(p, delta);
         }
   
- -      se->statistics.wait_max = max(se->statistics.wait_max, delta);
- -      se->statistics.wait_count++;
- -      se->statistics.wait_sum += delta;
- -      se->statistics.wait_start = 0;
+ +      schedstat_set(se->statistics.wait_max,
+ +                    max(schedstat_val(se->statistics.wait_max), delta));
+ +      schedstat_inc(se->statistics.wait_count);
+ +      schedstat_add(se->statistics.wait_sum, delta);
+ +      schedstat_set(se->statistics.wait_start, 0);
+ +}
+ +
+ +static inline void
+ +update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ +{
+ +      struct task_struct *tsk = NULL;
+ +      u64 sleep_start, block_start;
+ +
+ +      if (!schedstat_enabled())
+ +              return;
+ +
+ +      sleep_start = schedstat_val(se->statistics.sleep_start);
+ +      block_start = schedstat_val(se->statistics.block_start);
+ +
+ +      if (entity_is_task(se))
+ +              tsk = task_of(se);
+ +
+ +      if (sleep_start) {
+ +              u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
+ +
+ +              if ((s64)delta < 0)
+ +                      delta = 0;
+ +
+ +              if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
+ +                      schedstat_set(se->statistics.sleep_max, delta);
+ +
+ +              schedstat_set(se->statistics.sleep_start, 0);
+ +              schedstat_add(se->statistics.sum_sleep_runtime, delta);
+ +
+ +              if (tsk) {
+ +                      account_scheduler_latency(tsk, delta >> 10, 1);
+ +                      trace_sched_stat_sleep(tsk, delta);
+ +              }
+ +      }
+ +      if (block_start) {
+ +              u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
+ +
+ +              if ((s64)delta < 0)
+ +                      delta = 0;
+ +
+ +              if (unlikely(delta > schedstat_val(se->statistics.block_max)))
+ +                      schedstat_set(se->statistics.block_max, delta);
+ +
+ +              schedstat_set(se->statistics.block_start, 0);
+ +              schedstat_add(se->statistics.sum_sleep_runtime, delta);
+ +
+ +              if (tsk) {
+ +                      if (tsk->in_iowait) {
+ +                              schedstat_add(se->statistics.iowait_sum, delta);
+ +                              schedstat_inc(se->statistics.iowait_count);
+ +                              trace_sched_stat_iowait(tsk, delta);
+ +                      }
+ +
+ +                      trace_sched_stat_blocked(tsk, delta);
+ +
+ +                      /*
+ +                       * Blocking time is in units of nanosecs, so shift by
+ +                       * 20 to get a milliseconds-range estimation of the
+ +                       * amount of time that the task spent sleeping:
+ +                       */
+ +                      if (unlikely(prof_on == SLEEP_PROFILING)) {
+ +                              profile_hits(SLEEP_PROFILING,
+ +                                              (void *)get_wchan(tsk),
+ +                                              delta >> 20);
+ +                      }
+ +                      account_scheduler_latency(tsk, delta >> 10, 0);
+ +              }
+ +      }
   }
   
   /*
    * Task is being enqueued - update stats:
    */
   static inline void
- -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
   {
+ +      if (!schedstat_enabled())
+ +              return;
+ +
         /*
          * Are we enqueueing a waiting task? (for current tasks
          * a dequeue/enqueue event is a NOP)
          */
         if (se != cfs_rq->curr)
                 update_stats_wait_start(cfs_rq, se);
+ +
+ +      if (flags & ENQUEUE_WAKEUP)
+ +              update_stats_enqueue_sleeper(cfs_rq, se);
   }
   
   static inline void
   update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
   {
+ +
+ +      if (!schedstat_enabled())
+ +              return;
+ +
         /*
          * Mark the end of the wait period if dequeueing a
          * waiting task:
@@@ -980,18 -885,40 +980,18 @@@
         if (se != cfs_rq->curr)
                 update_stats_wait_end(cfs_rq, se);
   
- -      if (flags & DEQUEUE_SLEEP) {
- -              if (entity_is_task(se)) {
- -                      struct task_struct *tsk = task_of(se);
+ +      if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
+ +              struct task_struct *tsk = task_of(se);
   
- -                      if (tsk->state & TASK_INTERRUPTIBLE)
- -                              se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
- -                      if (tsk->state & TASK_UNINTERRUPTIBLE)
- -                              se->statistics.block_start = rq_clock(rq_of(cfs_rq));
- -              }
+ +              if (tsk->state & TASK_INTERRUPTIBLE)
+ +                      schedstat_set(se->statistics.sleep_start,
+ +                                    rq_clock(rq_of(cfs_rq)));
+ +              if (tsk->state & TASK_UNINTERRUPTIBLE)
+ +                      schedstat_set(se->statistics.block_start,
+ +                                    rq_clock(rq_of(cfs_rq)));
         }
- -
- -}
- -#else
- -static inline void
- -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
- -{
- -}
- -
- -static inline void
- -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
- -{
   }
   
- -static inline void
- -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
- -{
- -}
- -
- -static inline void
- -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- -{
- -}
- -#endif
- -
   /*
    * We are picking a new current task - update its stats:
    */
@@@ -1586,16 -1513,8 +1586,16 @@@ balance
          * One idle CPU per node is evaluated for a task numa move.
          * Call select_idle_sibling to maybe find a better one.
          */
- -      if (!cur)
- -              env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+ +      if (!cur) {
+ +              /*
+ +               * select_idle_sibling() uses a per-cpu cpumask that
+ +               * can be used from IRQ context.
+ +               */
+ +              local_irq_disable();
+ +              env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
+ +                                                 env->dst_cpu);
+ +              local_irq_enable();
+ +      }
   
   assign:
         task_numa_assign(env, cur, imp);
@@@ -2373,7 -2292,7 +2373,7 @@@ void task_numa_work(struct callback_hea
         unsigned long nr_pte_updates = 0;
         long pages, virtpages;
   
- -      WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+ +      SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
   
         work->next = work; /* protect against double add */
         /*
@@@ -2884,21 -2803,9 +2884,21 @@@ __update_load_avg(u64 now, int cpu, str
   }
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
- -/*
- - * Updating tg's load_avg is necessary before update_cfs_share (which is done)
- - * and effective_load (which is not done because it is too costly).
+ +/**
+ + * update_tg_load_avg - update the tg's load avg
+ + * @cfs_rq: the cfs_rq whose avg changed
+ + * @force: update regardless of how small the difference
+ + *
+ + * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
+ + * However, because tg->load_avg is a global value there are performance
+ + * considerations.
+ + *
+ + * In order to avoid having to look at the other cfs_rq's, we use a
+ + * differential update where we store the last value we propagated. This in
+ + * turn allows skipping updates if the differential is 'small'.
+ + *
+ + * Updating tg's load_avg is necessary before update_cfs_share() (which is
+ + * done) and effective_load() (which is not done because it is too costly).
    */
   static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
   {
@@@ -2968,7 -2875,12 +2968,7 @@@ static inline void update_tg_load_avg(s
   
   static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
   {
- -      struct rq *rq = rq_of(cfs_rq);
- -      int cpu = cpu_of(rq);
- -
- -      if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
- -              unsigned long max = rq->cpu_capacity_orig;
- -
+ +      if (&this_rq()->cfs == cfs_rq) {
                 /*
                  * There are a few boundary cases this might miss but it should
                  * get called often enough that that should (hopefully) not be
@@@ -2985,7 -2897,8 +2985,7 @@@
                  *
                  * See cpu_util().
                  */
- -              cpufreq_update_util(rq_clock(rq),
- -                                  min(cfs_rq->avg.util_avg, max), max);
+ +              cpufreq_update_util(rq_of(cfs_rq), 0);
         }
   }
   
@@@ -3018,10 -2931,10 +3018,10 @@@
    *
    * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
    *
- - * Returns true if the load decayed or we removed utilization. It is expected
- - * that one calls update_tg_load_avg() on this condition, but after you've
- - * modified the cfs_rq avg (attach/detach), such that we propagate the new
- - * avg up.
+ + * Returns true if the load decayed or we removed load.
+ + *
+ + * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ + * call update_tg_load_avg() when this function returns true.
    */
   static inline int
   update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@@ -3246,7 -3159,10 +3246,7 @@@ update_cfs_rq_load_avg(u64 now, struct 
   
   static inline void update_load_avg(struct sched_entity *se, int not_used)
   {
- -      struct cfs_rq *cfs_rq = cfs_rq_of(se);
- -      struct rq *rq = rq_of(cfs_rq);
- -
- -      cpufreq_trigger_update(rq_clock(rq));
+ +      cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
   }
   
   static inline void
@@@ -3267,6 -3183,68 +3267,6 @@@ static inline int idle_balance(struct r
   
   #endif /* CONFIG_SMP */
   
- -static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
- -{
- -#ifdef CONFIG_SCHEDSTATS
- -      struct task_struct *tsk = NULL;
- -
- -      if (entity_is_task(se))
- -              tsk = task_of(se);
- -
- -      if (se->statistics.sleep_start) {
- -              u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
- -
- -              if ((s64)delta < 0)
- -                      delta = 0;
- -
- -              if (unlikely(delta > se->statistics.sleep_max))
- -                      se->statistics.sleep_max = delta;
- -
- -              se->statistics.sleep_start = 0;
- -              se->statistics.sum_sleep_runtime += delta;
- -
- -              if (tsk) {
- -                      account_scheduler_latency(tsk, delta >> 10, 1);
- -                      trace_sched_stat_sleep(tsk, delta);
- -              }
- -      }
- -      if (se->statistics.block_start) {
- -              u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
- -
- -              if ((s64)delta < 0)
- -                      delta = 0;
- -
- -              if (unlikely(delta > se->statistics.block_max))
- -                      se->statistics.block_max = delta;
- -
- -              se->statistics.block_start = 0;
- -              se->statistics.sum_sleep_runtime += delta;
- -
- -              if (tsk) {
- -                      if (tsk->in_iowait) {
- -                              se->statistics.iowait_sum += delta;
- -                              se->statistics.iowait_count++;
- -                              trace_sched_stat_iowait(tsk, delta);
- -                      }
- -
- -                      trace_sched_stat_blocked(tsk, delta);
- -
- -                      /*
- -                       * Blocking time is in units of nanosecs, so shift by
- -                       * 20 to get a milliseconds-range estimation of the
- -                       * amount of time that the task spent sleeping:
- -                       */
- -                      if (unlikely(prof_on == SLEEP_PROFILING)) {
- -                              profile_hits(SLEEP_PROFILING,
- -                                              (void *)get_wchan(tsk),
- -                                              delta >> 20);
- -                      }
- -                      account_scheduler_latency(tsk, delta >> 10, 0);
- -              }
- -      }
- -#endif
- -}
- -
   static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
   {
   #ifdef CONFIG_SCHED_DEBUG
@@@ -3276,7 -3254,7 +3276,7 @@@
                 d = -d;
   
         if (d > 3*sysctl_sched_latency)
- -              schedstat_inc(cfs_rq, nr_spread_over);
+ +              schedstat_inc(cfs_rq->nr_spread_over);
   #endif
   }
   
@@@ -3393,12 -3371,17 +3393,12 @@@ enqueue_entity(struct cfs_rq *cfs_rq, s
         account_entity_enqueue(cfs_rq, se);
         update_cfs_shares(cfs_rq);
   
- -      if (flags & ENQUEUE_WAKEUP) {
+ +      if (flags & ENQUEUE_WAKEUP)
                 place_entity(cfs_rq, se, 0);
- -              if (schedstat_enabled())
- -                      enqueue_sleeper(cfs_rq, se);
- -      }
   
         check_schedstat_required();
- -      if (schedstat_enabled()) {
- -              update_stats_enqueue(cfs_rq, se);
- -              check_spread(cfs_rq, se);
- -      }
+ +      update_stats_enqueue(cfs_rq, se, flags);
+ +      check_spread(cfs_rq, se);
         if (!curr)
                 __enqueue_entity(cfs_rq, se);
         se->on_rq = 1;
@@@ -3465,7 -3448,8 +3465,7 @@@ dequeue_entity(struct cfs_rq *cfs_rq, s
         update_curr(cfs_rq);
         dequeue_entity_load_avg(cfs_rq, se);
   
- -      if (schedstat_enabled())
- -              update_stats_dequeue(cfs_rq, se, flags);
+ +      update_stats_dequeue(cfs_rq, se, flags);
   
         clear_buddies(cfs_rq, se);
   
@@@ -3475,10 -3459,9 +3475,10 @@@
         account_entity_dequeue(cfs_rq, se);
   
         /*
- -       * Normalize the entity after updating the min_vruntime because the
- -       * update can refer to the ->curr item and we need to reflect this
- -       * movement in our normalized position.
+ +       * Normalize after update_curr(); which will also have moved
+ +       * min_vruntime if @se is the one holding it back. But before doing
+ +       * update_min_vruntime() again, which will discount @se's position and
+ +       * can move min_vruntime forward still more.
          */
         if (!(flags & DEQUEUE_SLEEP))
                 se->vruntime -= cfs_rq->min_vruntime;
@@@ -3486,16 -3469,8 +3486,16 @@@
         /* return excess runtime on last dequeue */
         return_cfs_rq_runtime(cfs_rq);
   
- -      update_min_vruntime(cfs_rq);
         update_cfs_shares(cfs_rq);
+ +
+ +      /*
+ +       * Now advance min_vruntime if @se was the entity holding it back,
+ +       * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
+ +       * put back on, and if we advance min_vruntime, we'll be placed back
+ +       * further than we started -- ie. we'll be penalized.
+ +       */
+ +      if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
+ +              update_min_vruntime(cfs_rq);
   }
   
   /*
@@@ -3548,25 -3523,25 +3548,25 @@@ set_next_entity(struct cfs_rq *cfs_rq, 
                  * a CPU. So account for the time it spent waiting on the
                  * runqueue.
                  */
- -              if (schedstat_enabled())
- -                      update_stats_wait_end(cfs_rq, se);
+ +              update_stats_wait_end(cfs_rq, se);
                 __dequeue_entity(cfs_rq, se);
                 update_load_avg(se, 1);
         }
   
         update_stats_curr_start(cfs_rq, se);
         cfs_rq->curr = se;
- -#ifdef CONFIG_SCHEDSTATS
+ +
         /*
          * Track our maximum slice length, if the CPU's load is at
          * least twice that of our own weight (i.e. dont track it
          * when there are only lesser-weight tasks around):
          */
         if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
- -              se->statistics.slice_max = max(se->statistics.slice_max,
- -                      se->sum_exec_runtime - se->prev_sum_exec_runtime);
+ +              schedstat_set(se->statistics.slice_max,
+ +                      max((u64)schedstat_val(se->statistics.slice_max),
+ +                          se->sum_exec_runtime - se->prev_sum_exec_runtime));
         }
- -#endif
+ +
         se->prev_sum_exec_runtime = se->sum_exec_runtime;
   }
   
@@@ -3645,10 -3620,13 +3645,10 @@@ static void put_prev_entity(struct cfs_
         /* throttle cfs_rqs exceeding runtime */
         check_cfs_rq_runtime(cfs_rq);
   
- -      if (schedstat_enabled()) {
- -              check_spread(cfs_rq, prev);
- -              if (prev->on_rq)
- -                      update_stats_wait_start(cfs_rq, prev);
- -      }
+ +      check_spread(cfs_rq, prev);
   
         if (prev->on_rq) {
+ +              update_stats_wait_start(cfs_rq, prev);
                 /* Put 'current' back into the tree. */
                 __enqueue_entity(cfs_rq, prev);
                 /* in !on_rq case, update occurred at dequeue */
@@@ -4478,9 -4456,9 +4478,9 @@@ static void hrtick_start_fair(struct r
         struct sched_entity *se = &p->se;
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
   
- -      WARN_ON(task_rq(p) != rq);
+ +      SCHED_WARN_ON(task_rq(p) != rq);
   
- -      if (cfs_rq->nr_running > 1) {
+ +      if (rq->cfs.h_nr_running > 1) {
                 u64 slice = sched_slice(cfs_rq, se);
                 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
                 s64 delta = slice - ran;
@@@ -4531,14 -4509,6 +4531,14 @@@ enqueue_task_fair(struct rq *rq, struc
         struct cfs_rq *cfs_rq;
         struct sched_entity *se = &p->se;
   
+ +      /*
+ +       * If in_iowait is set, the code below may not trigger any cpufreq
+ +       * utilization updates, so do it here explicitly with the IOWAIT flag
+ +       * passed.
+ +       */
+ +      if (p->in_iowait)
+ +              cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+ +
         for_each_sched_entity(se) {
                 if (se->on_rq)
                         break;
@@@ -4635,11 -4605,6 +4635,11 @@@ static void dequeue_task_fair(struct r
   }
   
   #ifdef CONFIG_SMP
+ +
+ +/* Working cpumask for: load_balance, load_balance_newidle. */
+ +DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+ +DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+ +
   #ifdef CONFIG_NO_HZ_COMMON
   /*
    * per rq 'load' arrray crap; XXX kill this.
@@@ -5041,9 -5006,9 +5041,9 @@@ static long effective_load(struct task_
                  * wl = S * s'_i; see (2)
                  */
                 if (W > 0 && w < W)
- -                      wl = (w * (long)tg->shares) / W;
+ +                      wl = (w * (long)scale_load_down(tg->shares)) / W;
                 else
- -                      wl = tg->shares;
+ +                      wl = scale_load_down(tg->shares);
   
                 /*
                  * Per the above, wl is the new se->load.weight value; since
@@@ -5126,18 -5091,18 +5126,18 @@@ static int wake_wide(struct task_struc
         return 1;
   }
   
- -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+ +static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+ +                     int prev_cpu, int sync)
   {
         s64 this_load, load;
         s64 this_eff_load, prev_eff_load;
- -      int idx, this_cpu, prev_cpu;
+ +      int idx, this_cpu;
         struct task_group *tg;
         unsigned long weight;
         int balanced;
   
         idx       = sd->wake_idx;
         this_cpu  = smp_processor_id();
- -      prev_cpu  = task_cpu(p);
         load      = source_load(prev_cpu, idx);
         this_load = target_load(this_cpu, idx);
   
@@@ -5181,13 -5146,13 +5181,13 @@@
   
         balanced = this_eff_load <= prev_eff_load;
   
- -      schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
+ +      schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
   
         if (!balanced)
                 return 0;
   
- -      schedstat_inc(sd, ttwu_move_affine);
- -      schedstat_inc(p, se.statistics.nr_wakeups_affine);
+ +      schedstat_inc(sd->ttwu_move_affine);
+ +      schedstat_inc(p->se.statistics.nr_wakeups_affine);
   
         return 1;
   }
@@@ -5263,10 -5228,6 +5263,10 @@@ find_idlest_cpu(struct sched_group *gro
         int shallowest_idle_cpu = -1;
         int i;
   
+ +      /* Check if we have any choice: */
+ +      if (group->group_weight == 1)
+ +              return cpumask_first(sched_group_cpus(group));
+ +
         /* Traverse only the allowed CPUs */
         for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
                 if (idle_cpu(i)) {
@@@ -5304,237 -5265,64 +5304,237 @@@
   }
   
   /*
- - * Try and locate an idle CPU in the sched_domain.
+ + * Implement a for_each_cpu() variant that starts the scan at a given cpu
+ + * (@start), and wraps around.
+ + *
+ + * This is used to scan for idle CPUs; such that not all CPUs looking for an
+ + * idle CPU find the same CPU. The down-side is that tasks tend to cycle
+ + * through the LLC domain.
+ + *
+ + * Especially tbench is found sensitive to this.
+ + */
+ +
+ +static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
+ +{
+ +      int next;
+ +
+ +again:
+ +      next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
+ +
+ +      if (*wrapped) {
+ +              if (next >= start)      /* back at (or past) @start: the scan is complete */
+ +                      return nr_cpumask_bits;
+ +      } else {
+ +              if (next >= nr_cpumask_bits) {
+ +                      *wrapped = 1;
+ +                      n = -1;         /* n+1 == 0: restart the search at bit 0 */
+ +                      goto again;
+ +              }
+ +      }
+ +
+ +      return next;
+ +}
+ +
+ +#define for_each_cpu_wrap(cpu, mask, start, wrap)                             \
+ +      for ((wrap) = 0, (cpu) = (start)-1;                                     \
+ +              (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)),     \
+ +              (cpu) < nr_cpumask_bits; )
+ +
+ +#ifdef CONFIG_SCHED_SMT
+ +
+ +static inline void set_idle_cores(int cpu, int val)
+ +{
+ +      struct sched_domain_shared *sds;
+ +
+ +      sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ +      if (sds)        /* nothing to record when @cpu has no sd_llc_shared */
+ +              WRITE_ONCE(sds->has_idle_cores, val);
+ +}
+ +
+ +static inline bool test_idle_cores(int cpu, bool def)
+ +{
+ +      struct sched_domain_shared *sds;
+ +
+ +      sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ +      if (sds)
+ +              return READ_ONCE(sds->has_idle_cores);
+ +
+ +      return def;     /* no sd_llc_shared for @cpu: use the caller's default */
+ +}
+ +
+ +/*
+ + * Scans the local SMT mask to see if the entire core is idle, and records this
+ + * information in sd_llc_shared->has_idle_cores.
+ + *
+ + * Since SMT siblings share all cache levels, inspecting this limited remote
+ + * state should be fairly cheap.
+ + *
+ + * Only the siblings of @rq's CPU are tested with idle_cpu(); that CPU itself
+ + * is skipped. Nothing is written when has_idle_cores already reads as set
+ + * (or when there is no sd_llc_shared, since the test defaults to true), or
+ + * when any sibling is busy. The whole check runs under rcu_read_lock().
+ + */
+ +void __update_idle_core(struct rq *rq)
+ +{
+ +      int core = cpu_of(rq);
+ +      int cpu;
+ +
+ +      rcu_read_lock();
+ +      if (test_idle_cores(core, true))
+ +              goto unlock;
+ +
+ +      for_each_cpu(cpu, cpu_smt_mask(core)) {
+ +              if (cpu == core)
+ +                      continue;
+ +
+ +              if (!idle_cpu(cpu))
+ +                      goto unlock;
+ +      }
+ +
+ +      set_idle_cores(core, 1);
+ +unlock:
+ +      rcu_read_unlock();
+ +}
+ +
+ +/*
+ + * Scan the entire LLC domain for idle cores; this dynamically switches off if
+ + * there are no idle cores left in the system; tracked through
+ + * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ + *
+ + * Returns the first CPU (scanning from @target, with wrap-around) that is in
+ + * both @sd's span and @p's allowed mask and whose SMT mask is entirely idle.
+ + * Returns -1 when sched_smt_present is off, when has_idle_cores reads as
+ + * clear for @target, or when no such core is found; in the last case
+ + * has_idle_cores is cleared for @target.
+ + *
+ + * Every sibling of a visited core is removed from the per-cpu scratch mask
+ + * being iterated, so a core is not examined again through another sibling.
+ + */
+ +static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+ +{
+ +      struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ +      int core, cpu, wrap;
+ +
+ +      if (!static_branch_likely(&sched_smt_present))
+ +              return -1;
+ +
+ +      if (!test_idle_cores(target, false))
+ +              return -1;
+ +
+ +      cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+ +
+ +      for_each_cpu_wrap(core, cpus, target, wrap) {
+ +              bool idle = true;
+ +
+ +              for_each_cpu(cpu, cpu_smt_mask(core)) {
+ +                      cpumask_clear_cpu(cpu, cpus);
+ +                      if (!idle_cpu(cpu))
+ +                              idle = false;
+ +              }
+ +
+ +              if (idle)
+ +                      return core;
+ +      }
+ +
+ +      /*
+ +       * Failed to find an idle core; stop looking for one.
+ +       */
+ +      set_idle_cores(target, 0);
+ +
+ +      return -1;
+ +}
+ +
+ +/*
+ + * Scan the local SMT mask for idle CPUs.
+ + *
+ + * Returns the first idle CPU in @target's SMT mask that @p is allowed to run
+ + * on, or -1 when sched_smt_present is off or no such CPU is found. @sd is
+ + * not used by this function.
    */
- -static int select_idle_sibling(struct task_struct *p, int target)
+ +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+ +{
+ +      int cpu;
+ +
+ +      if (!static_branch_likely(&sched_smt_present))
+ +              return -1;
+ +
+ +      for_each_cpu(cpu, cpu_smt_mask(target)) {
+ +              if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ +                      continue;
+ +              if (idle_cpu(cpu))
+ +                      return cpu;
+ +      }
+ +
+ +      return -1;
+ +}
+ +
+ +#else /* CONFIG_SCHED_SMT */
+ +
+ +static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+ +{
+ +      return -1;      /* !CONFIG_SCHED_SMT stub: always reports "not found" */
+ +}
+ +
+ +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+ +{
+ +      return -1;      /* !CONFIG_SCHED_SMT stub: always reports "not found" */
+ +}
+ +
+ +#endif /* CONFIG_SCHED_SMT */
+ +
+ +/*
+ + * Scan the LLC domain for idle CPUs; this is dynamically regulated by
+ + * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
+ + * average idle time for this rq (as found in rq->avg_idle).
+ + *
+ + * Returns the first idle CPU found in @sd (scanning from @target, with
+ + * wrap-around) that @p is allowed to run on. Returns -1 when the scan is
+ + * skipped, and a value that is not below nr_cpumask_bits when the scan
+ + * completes without a hit; the caller must treat both as "not found".
+ + */
+ +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+ +{
+ +      struct sched_domain *this_sd;
+ +      u64 avg_cost, avg_idle = this_rq()->avg_idle;
+ +      u64 time, cost;
+ +      s64 delta;
+ +      int cpu, wrap;
+ +
+ +      /*
+ +       * This CPU's sd_llc pointer can be NULL (select_idle_sibling() makes
+ +       * the same check for @target's); without a domain there is no scan
+ +       * cost to compare against, so skip the scan instead of dereferencing.
+ +       */
+ +      this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+ +      if (!this_sd)
+ +              return -1;
+ +
+ +      avg_cost = this_sd->avg_scan_cost;
+ +
+ +      /*
+ +       * Due to large variance we need a large fuzz factor; hackbench in
+ +       * particular is sensitive here.
+ +       */
+ +      if ((avg_idle / 512) < avg_cost)
+ +              return -1;
+ +
+ +      time = local_clock();
+ +
+ +      for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+ +              if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ +                      continue;
+ +              if (idle_cpu(cpu))
+ +                      break;
+ +      }
+ +
+ +      /* Move avg_scan_cost 1/8 of the way towards this scan's duration. */
+ +      time = local_clock() - time;
+ +      cost = this_sd->avg_scan_cost;
+ +      delta = (s64)(time - cost) / 8;
+ +      this_sd->avg_scan_cost += delta;
+ +
+ +      return cpu;
+ +}
+ +
+ +/*
+ + * Try and locate an idle core/thread in the LLC cache domain.
+ + */
+ +static int select_idle_sibling(struct task_struct *p, int prev, int target)
   {
         struct sched_domain *sd;
- -      struct sched_group *sg;
- -      int i = task_cpu(p);
+ +      int i;
   
         if (idle_cpu(target))
                 return target;
   
         /*
- -       * If the prevous cpu is cache affine and idle, don't be stupid.
+ +       * If the previous cpu is cache affine and idle, don't be stupid.
          */
- -      if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- -              return i;
+ +      if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+ +              return prev;
   
- -      /*
- -       * Otherwise, iterate the domains and find an eligible idle cpu.
- -       *
- -       * A completely idle sched group at higher domains is more
- -       * desirable than an idle group at a lower level, because lower
- -       * domains have smaller groups and usually share hardware
- -       * resources which causes tasks to contend on them, e.g. x86
- -       * hyperthread siblings in the lowest domain (SMT) can contend
- -       * on the shared cpu pipeline.
- -       *
- -       * However, while we prefer idle groups at higher domains
- -       * finding an idle cpu at the lowest domain is still better than
- -       * returning 'target', which we've already established, isn't
- -       * idle.
- -       */
         sd = rcu_dereference(per_cpu(sd_llc, target));
- -      for_each_lower_domain(sd) {
- -              sg = sd->groups;
- -              do {
- -                      if (!cpumask_intersects(sched_group_cpus(sg),
- -                                              tsk_cpus_allowed(p)))
- -                              goto next;
- -
- -                      /* Ensure the entire group is idle */
- -                      for_each_cpu(i, sched_group_cpus(sg)) {
- -                              if (i == target || !idle_cpu(i))
- -                                      goto next;
- -                      }
+ +      if (!sd)
+ +              return target;
+ +
+ +      i = select_idle_core(p, sd, target);
+ +      if ((unsigned)i < nr_cpumask_bits)
+ +              return i;
+ +
+ +      i = select_idle_cpu(p, sd, target);
+ +      if ((unsigned)i < nr_cpumask_bits)
+ +              return i;
+ +
+ +      i = select_idle_smt(p, sd, target);
+ +      if ((unsigned)i < nr_cpumask_bits)
+ +              return i;
   
- -                      /*
- -                       * It doesn't matter which cpu we pick, the
- -                       * whole group is idle.
- -                       */
- -                      target = cpumask_first_and(sched_group_cpus(sg),
- -                                      tsk_cpus_allowed(p));
- -                      goto done;
- -next:
- -                      sg = sg->next;
- -              } while (sg != sd->groups);
- -      }
- -done:
         return target;
   }
   
@@@ -5572,32 -5360,6 +5572,32 @@@ static int cpu_util(int cpu
         return (util >= capacity) ? capacity : util;
   }
   
+ +static inline int task_util(struct task_struct *p)
+ +{
+ +      return p->se.avg.util_avg;      /* utilization average kept in @p's sched_entity */
+ +}
+ +
+ +/*
+ + * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
+ + * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
+ + *
+ + * In that case WAKE_AFFINE doesn't make sense and we'll let
+ + * BALANCE_WAKE sort things out.
+ + *
+ + * Returns 0 when the smaller of the two CPUs' original capacities is within
+ + * 1/8 of the root domain's max_cpu_capacity. Otherwise returns non-zero when
+ + * that smaller capacity, scaled by 1024, is below task_util(@p) scaled by
+ + * capacity_margin.
+ + */
+ +static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+ +{
+ +      long min_cap, max_cap;
+ +
+ +      min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
+ +      max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
+ +
+ +      /* Minimum capacity is close to max, no need to abort wake_affine */
+ +      if (max_cap - min_cap < max_cap >> 3)
+ +              return 0;
+ +
+ +      return min_cap * 1024 < task_util(p) * capacity_margin;
+ +}
+ +
   /*
    * select_task_rq_fair: Select target runqueue for the waking task in domains
    * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@@ -5621,8 -5383,7 +5621,8 @@@ select_task_rq_fair(struct task_struct 
   
         if (sd_flag & SD_BALANCE_WAKE) {
                 record_wakee(p);
- -              want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ +              want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
+ +                            && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
         }
   
         rcu_read_lock();
@@@ -5648,13 -5409,13 +5648,13 @@@
   
         if (affine_sd) {
                 sd = NULL; /* Prefer wake_affine over balance flags */
- -              if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ +              if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
                         new_cpu = cpu;
         }
   
         if (!sd) {
                 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
- -                      new_cpu = select_idle_sibling(p, new_cpu);
+ +                      new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
   
         } else while (sd) {
                 struct sched_group *group;
@@@ -6178,7 -5939,7 +6178,7 @@@ static bool yield_to_task_fair(struct r
    *
    * The adjacency matrix of the resulting graph is given by:
    *
- - *             log_2 n     
+ + *             log_2 n
    *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
    *             k = 0
    *
@@@ -6224,7 -5985,7 +6224,7 @@@
    *
    * [XXX write more on how we solve this.. _after_ merging pjt's patches that
    *      rewrite all of this once again.]
- - */ 
+ + */
   
   static unsigned long __read_mostly max_load_balance_interval = HZ/10;
   
@@@ -6372,7 -6133,7 +6372,7 @@@ int can_migrate_task(struct task_struc
         if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
                 int cpu;
   
- -              schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+ +              schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
   
                 env->flags |= LBF_SOME_PINNED;
   
@@@ -6403,7 -6164,7 +6403,7 @@@
         env->flags &= ~LBF_ALL_PINNED;
   
         if (task_running(env->src_rq, p)) {
- -              schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+ +              schedstat_inc(p->se.statistics.nr_failed_migrations_running);
                 return 0;
         }
   
@@@ -6420,13 -6181,13 +6420,13 @@@
         if (tsk_cache_hot <= 0 ||
             env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
                 if (tsk_cache_hot == 1) {
- -                      schedstat_inc(env->sd, lb_hot_gained[env->idle]);
- -                      schedstat_inc(p, se.statistics.nr_forced_migrations);
+ +                      schedstat_inc(env->sd->lb_hot_gained[env->idle]);
+ +                      schedstat_inc(p->se.statistics.nr_forced_migrations);
                 }
                 return 1;
         }
   
- -      schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+ +      schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
         return 0;
   }
   
@@@ -6466,7 -6227,7 +6466,7 @@@ static struct task_struct *detach_one_t
                  * so we can safely collect stats here rather than
                  * inside detach_tasks().
                  */
- -              schedstat_inc(env->sd, lb_gained[env->idle]);
+ +              schedstat_inc(env->sd->lb_gained[env->idle]);
                 return p;
         }
         return NULL;
@@@ -6558,7 -6319,7 +6558,7 @@@ next
          * so we can safely collect detach_one_task() stats here rather
          * than inside detach_one_task().
          */
- -      schedstat_add(env->sd, lb_gained[env->idle], detached);
+ +      schedstat_add(env->sd->lb_gained[env->idle], detached);
   
         return detached;
   }
@@@ -6886,7 -6647,7 +6886,7 @@@ void update_group_capacity(struct sched
                 /*
                  * !SD_OVERLAP domains can assume that child groups
                  * span the current group.
- -               */ 
+ +               */
   
                 group = child->groups;
                 do {
@@@ -7386,7 -7147,7 +7386,7 @@@ static inline void calculate_imbalance(
                 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
                 if (load_above_capacity > busiest->group_capacity) {
                         load_above_capacity -= busiest->group_capacity;
- -                      load_above_capacity *= NICE_0_LOAD;
+ +                      load_above_capacity *= scale_load_down(NICE_0_LOAD);
                         load_above_capacity /= busiest->group_capacity;
                 } else
                         load_above_capacity = ~0UL;
@@@ -7593,6 -7354,9 +7593,6 @@@ static struct rq *find_busiest_queue(st
    */
   #define MAX_PINNED_INTERVAL   512
   
- -/* Working cpumask for load_balance and load_balance_newidle. */
- -DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
- -
   static int need_active_balance(struct lb_env *env)
   {
         struct sched_domain *sd = env->sd;
@@@ -7696,7 -7460,7 +7696,7 @@@ static int load_balance(int this_cpu, s
   
         cpumask_copy(cpus, cpu_active_mask);
   
- -      schedstat_inc(sd, lb_count[idle]);
+ +      schedstat_inc(sd->lb_count[idle]);
   
   redo:
         if (!should_we_balance(&env)) {
@@@ -7706,19 -7470,19 +7706,19 @@@
   
         group = find_busiest_group(&env);
         if (!group) {
- -              schedstat_inc(sd, lb_nobusyg[idle]);
+ +              schedstat_inc(sd->lb_nobusyg[idle]);
                 goto out_balanced;
         }
   
         busiest = find_busiest_queue(&env, group);
         if (!busiest) {
- -              schedstat_inc(sd, lb_nobusyq[idle]);
+ +              schedstat_inc(sd->lb_nobusyq[idle]);
                 goto out_balanced;
         }
   
         BUG_ON(busiest == env.dst_rq);
   
- -      schedstat_add(sd, lb_imbalance[idle], env.imbalance);
+ +      schedstat_add(sd->lb_imbalance[idle], env.imbalance);
   
         env.src_cpu = busiest->cpu;
         env.src_rq = busiest;
@@@ -7825,7 -7589,7 +7825,7 @@@ more_balance
         }
   
         if (!ld_moved) {
- -              schedstat_inc(sd, lb_failed[idle]);
+ +              schedstat_inc(sd->lb_failed[idle]);
                 /*
                  * Increment the failure counter only on periodic balance.
                  * We do not want newidle balance, which can be very
@@@ -7908,7 -7672,7 +7908,7 @@@ out_all_pinned
          * we can't migrate them. Let the imbalance flag set so parent level
          * can try to migrate them.
          */
- -      schedstat_inc(sd, lb_balanced[idle]);
+ +      schedstat_inc(sd->lb_balanced[idle]);
   
         sd->nr_balance_failed = 0;
   
@@@ -7940,12 -7704,11 +7940,12 @@@ get_sd_balance_interval(struct sched_do
   }
   
   static inline void
- -update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+ +update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
   {
         unsigned long interval, next;
   
- -      interval = get_sd_balance_interval(sd, cpu_busy);
+ +      /* used by idle balance, so cpu_busy = 0 */
+ +      interval = get_sd_balance_interval(sd, 0);
         next = sd->last_balance + interval;
   
         if (time_after(*next_balance, next))
@@@ -7975,7 -7738,7 +7975,7 @@@ static int idle_balance(struct rq *this
                 rcu_read_lock();
                 sd = rcu_dereference_check_sched_domain(this_rq->sd);
                 if (sd)
- -                      update_next_balance(sd, 0, &next_balance);
+ +                      update_next_balance(sd, &next_balance);
                 rcu_read_unlock();
   
                 goto out;
@@@ -7993,7 -7756,7 +7993,7 @@@
                         continue;
   
                 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
- -                      update_next_balance(sd, 0, &next_balance);
+ +                      update_next_balance(sd, &next_balance);
                         break;
                 }
   
@@@ -8011,7 -7774,7 +8011,7 @@@
                         curr_cost += domain_cost;
                 }
   
- -              update_next_balance(sd, 0, &next_balance);
+ +              update_next_balance(sd, &next_balance);
   
                 /*
                  * Stop searching for tasks to pull if there are
@@@ -8101,15 -7864,15 +8101,15 @@@ static int active_load_balance_cpu_stop
                         .idle           = CPU_IDLE,
                 };
   
- -              schedstat_inc(sd, alb_count);
+ +              schedstat_inc(sd->alb_count);
   
                 p = detach_one_task(&env);
                 if (p) {
- -                      schedstat_inc(sd, alb_pushed);
+ +                      schedstat_inc(sd->alb_pushed);
                         /* Active balancing done, reset the failure counter. */
                         sd->nr_balance_failed = 0;
                 } else {
- -                      schedstat_inc(sd, alb_failed);
+ +                      schedstat_inc(sd->alb_failed);
                 }
         }
         rcu_read_unlock();
@@@ -8201,13 -7964,13 +8201,13 @@@ static inline void set_cpu_sd_state_bus
         int cpu = smp_processor_id();
   
         rcu_read_lock();
- -      sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ +      sd = rcu_dereference(per_cpu(sd_llc, cpu));
   
         if (!sd || !sd->nohz_idle)
                 goto unlock;
         sd->nohz_idle = 0;
   
- -      atomic_inc(&sd->groups->sgc->nr_busy_cpus);
+ +      atomic_inc(&sd->shared->nr_busy_cpus);
   unlock:
         rcu_read_unlock();
   }
@@@ -8218,13 -7981,13 +8218,13 @@@ void set_cpu_sd_state_idle(void
         int cpu = smp_processor_id();
   
         rcu_read_lock();
- -      sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ +      sd = rcu_dereference(per_cpu(sd_llc, cpu));
   
         if (!sd || sd->nohz_idle)
                 goto unlock;
         sd->nohz_idle = 1;
   
- -      atomic_dec(&sd->groups->sgc->nr_busy_cpus);
+ +      atomic_dec(&sd->shared->nr_busy_cpus);
   unlock:
         rcu_read_unlock();
   }
@@@ -8451,8 -8214,8 +8451,8 @@@ end
   static inline bool nohz_kick_needed(struct rq *rq)
   {
         unsigned long now = jiffies;
+ +      struct sched_domain_shared *sds;
         struct sched_domain *sd;
- -      struct sched_group_capacity *sgc;
         int nr_busy, cpu = rq->cpu;
         bool kick = false;
   
@@@ -8480,13 -8243,11 +8480,13 @@@
                 return true;
   
         rcu_read_lock();
- -      sd = rcu_dereference(per_cpu(sd_busy, cpu));
- -      if (sd) {
- -              sgc = sd->groups->sgc;
- -              nr_busy = atomic_read(&sgc->nr_busy_cpus);
- -
+ +      sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ +      if (sds) {
+ +              /*
+ +               * XXX: write a coherent comment on why we do this.
+ +               * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
+ +               */
+ +              nr_busy = atomic_read(&sds->nr_busy_cpus);
                 if (nr_busy > 1) {
                         kick = true;
                         goto unlock;
@@@ -8522,7 -8283,7 +8522,7 @@@ static void nohz_idle_balance(struct r
    * run_rebalance_domains is triggered when needed from the scheduler tick.
    * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
    */
- static void run_rebalance_domains(struct softirq_action *h)
+ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
   {
         struct rq *this_rq = this_rq();
         enum cpu_idle_type idle = this_rq->idle_balance ?
@@@ -8680,6 -8441,7 +8680,6 @@@ static void detach_task_cfs_rq(struct t
         struct sched_entity *se = &p->se;
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         u64 now = cfs_rq_clock_task(cfs_rq);
- -      int tg_update;
   
         if (!vruntime_normalized(p)) {
                 /*
@@@ -8691,9 -8453,10 +8691,9 @@@
         }
   
         /* Catch up with the cfs_rq and remove our load when we leave */
- -      tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+ +      update_cfs_rq_load_avg(now, cfs_rq, false);
         detach_entity_load_avg(cfs_rq, se);
- -      if (tg_update)
- -              update_tg_load_avg(cfs_rq, false);
+ +      update_tg_load_avg(cfs_rq, false);
   }
   
   static void attach_task_cfs_rq(struct task_struct *p)
@@@ -8701,6 -8464,7 +8701,6 @@@
         struct sched_entity *se = &p->se;
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         u64 now = cfs_rq_clock_task(cfs_rq);
- -      int tg_update;
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
         /*
@@@ -8711,9 -8475,10 +8711,9 @@@
   #endif
   
         /* Synchronize task with its cfs_rq */
- -      tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+ +      update_cfs_rq_load_avg(now, cfs_rq, false);
         attach_entity_load_avg(cfs_rq, se);
- -      if (tg_update)
- -              update_tg_load_avg(cfs_rq, false);
+ +      update_tg_load_avg(cfs_rq, false);
   
         if (!vruntime_normalized(p))
                 se->vruntime += cfs_rq->min_vruntime;
diff --combined kernel/softirq.c

index 6676264,34033fd..1bf81ef
--- 1/kernel/softirq.c
--- 2/kernel/softirq.c
+++ b/kernel/softirq.c
@@@ -77,17 -77,6 +77,17 @@@ static void wakeup_softirqd(void
                 wake_up_process(tsk);
   }
   
+ +/*
+ + * If ksoftirqd is scheduled, we do not want to process pending softirqs
+ + * right now. Let ksoftirqd handle this at its own rate, to get fairness.
+ + *
+ + * Returns true when this CPU's ksoftirqd task pointer is set and that
+ + * task's state is TASK_RUNNING.
+ + */
+ +static bool ksoftirqd_running(void)
+ +{
+ +      struct task_struct *tsk = __this_cpu_read(ksoftirqd);
+ +
+ +      return tsk && (tsk->state == TASK_RUNNING);
+ +}
+ +
   /*
    * preempt_count and SOFTIRQ_OFFSET usage:
    * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
@@@ -324,7 -313,7 +324,7 @@@ asmlinkage __visible void do_softirq(vo
   
         pending = local_softirq_pending();
   
- -      if (pending)
+ +      if (pending && !ksoftirqd_running())
                 do_softirq_own_stack();
   
         local_irq_restore(flags);
@@@ -351,9 -340,6 +351,9 @@@ void irq_enter(void
   
   static inline void invoke_softirq(void)
   {
+ +      if (ksoftirqd_running())
+ +              return;
+ +
         if (!force_irqthreads) {
   #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
                 /*
@@@ -496,7 -482,7 +496,7 @@@ void __tasklet_hi_schedule_first(struc
   }
   EXPORT_SYMBOL(__tasklet_hi_schedule_first);
   
- static void tasklet_action(struct softirq_action *a)
+ static __latent_entropy void tasklet_action(struct softirq_action *a)
   {
         struct tasklet_struct *list;
   
@@@ -532,7 -518,7 +532,7 @@@
         }
   }
   
- static void tasklet_hi_action(struct softirq_action *a)
+ static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
   {
         struct tasklet_struct *list;
   
@@@ -714,7 -700,7 +714,7 @@@ void tasklet_kill_immediate(struct task
         BUG();
   }
   
- -static void takeover_tasklets(unsigned int cpu)
+ +static int takeover_tasklets(unsigned int cpu)
   {
         /* CPU is dead, so no lock needed. */
         local_irq_disable();
@@@ -737,12 -723,27 +737,12 @@@
         raise_softirq_irqoff(HI_SOFTIRQ);
   
         local_irq_enable();
+ +      return 0;
   }
+ +#else
+ +#define takeover_tasklets     NULL
   #endif /* CONFIG_HOTPLUG_CPU */
   
- -static int cpu_callback(struct notifier_block *nfb, unsigned long action,
- -                      void *hcpu)
- -{
- -      switch (action) {
- -#ifdef CONFIG_HOTPLUG_CPU
- -      case CPU_DEAD:
- -      case CPU_DEAD_FROZEN:
- -              takeover_tasklets((unsigned long)hcpu);
- -              break;
- -#endif /* CONFIG_HOTPLUG_CPU */
- -      }
- -      return NOTIFY_OK;
- -}
- -
- -static struct notifier_block cpu_nfb = {
- -      .notifier_call = cpu_callback
- -};
- -
   static struct smp_hotplug_thread softirq_threads = {
         .store                  = &ksoftirqd,
         .thread_should_run      = ksoftirqd_should_run,
@@@ -752,8 -753,8 +752,8 @@@
   
   static __init int spawn_ksoftirqd(void)
   {
- -      register_cpu_notifier(&cpu_nfb);
- -
+ +      cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
+ +                                takeover_tasklets);
         BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
   
         return 0;
diff --combined lib/irq_poll.c

index 2be5569,63be749..1d6565e
--- 1/lib/irq_poll.c
--- 2/lib/irq_poll.c
+++ b/lib/irq_poll.c
@@@ -74,7 -74,7 +74,7 @@@ void irq_poll_complete(struct irq_poll 
   }
   EXPORT_SYMBOL(irq_poll_complete);
   
- static void irq_poll_softirq(struct softirq_action *h)
+ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
   {
         struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
         int rearm = 0, budget = irq_poll_budget;
@@@ -184,21 -184,30 +184,21 @@@ void irq_poll_init(struct irq_poll *iop
   }
   EXPORT_SYMBOL(irq_poll_init);
   
- -static int irq_poll_cpu_notify(struct notifier_block *self,
- -                               unsigned long action, void *hcpu)
+ +static int irq_poll_cpu_dead(unsigned int cpu)
   {
         /*
          * If a CPU goes away, splice its entries to the current CPU
          * and trigger a run of the softirq
          */
- -      if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- -              int cpu = (unsigned long) hcpu;
- -
- -              local_irq_disable();
- -              list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
- -                               this_cpu_ptr(&blk_cpu_iopoll));
- -              __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
- -              local_irq_enable();
- -      }
+ +      local_irq_disable();
+ +      list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
+ +                       this_cpu_ptr(&blk_cpu_iopoll));
+ +      __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
+ +      local_irq_enable();
   
- -      return NOTIFY_OK;
+ +      return 0;
   }
   
- -static struct notifier_block irq_poll_cpu_notifier = {
- -      .notifier_call  = irq_poll_cpu_notify,
- -};
- -
   static __init int irq_poll_setup(void)
   {
         int i;
@@@ -207,8 -216,7 +207,8 @@@
                 INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i));
   
         open_softirq(IRQ_POLL_SOFTIRQ, irq_poll_softirq);
- -      register_hotcpu_notifier(&irq_poll_cpu_notifier);
+ +      cpuhp_setup_state_nocalls(CPUHP_IRQ_POLL_DEAD, "irq_poll:dead", NULL,
+ +                                irq_poll_cpu_dead);
         return 0;
   }
   subsys_initcall(irq_poll_setup);
diff --combined lib/random32.c

index 915982b,a309235..fa594b1
--- 1/lib/random32.c
--- 2/lib/random32.c
+++ b/lib/random32.c
@@@ -47,7 -47,7 +47,7 @@@ static inline void prandom_state_selfte
   }
   #endif
   
- static DEFINE_PER_CPU(struct rnd_state, net_rand_state);
+ static DEFINE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy;
   
   /**
    *    prandom_u32_state - seeded pseudo-random number generator.
@@@ -81,7 -81,7 +81,7 @@@ u32 prandom_u32(void
         u32 res;
   
         res = prandom_u32_state(state);
- -      put_cpu_var(state);
+ +      put_cpu_var(net_rand_state);
   
         return res;
   }
@@@ -128,7 -128,7 +128,7 @@@ void prandom_bytes(void *buf, size_t by
         struct rnd_state *state = &get_cpu_var(net_rand_state);
   
         prandom_bytes_state(state, buf, bytes);
- -      put_cpu_var(state);
+ +      put_cpu_var(net_rand_state);
   }
   EXPORT_SYMBOL(prandom_bytes);
   
diff --combined mm/page_alloc.c

index ca423cc,901121a..2b3bf67
--- 1/mm/page_alloc.c
--- 2/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@@ -91,6 -91,11 +91,11 @@@ EXPORT_PER_CPU_SYMBOL(_numa_mem_)
   int _node_numa_mem_[MAX_NUMNODES];
   #endif
   
+ #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
+ volatile u64 latent_entropy __latent_entropy;
+ EXPORT_SYMBOL(latent_entropy);
+ #endif
+ 
   /*
    * Array of node states.
    */
@@@ -607,9 -612,6 +612,9 @@@ static bool need_debug_guardpage(void
         if (!debug_pagealloc_enabled())
                 return false;
   
+ +      if (!debug_guardpage_minorder())
+ +              return false;
+ +
         return true;
   }
   
@@@ -618,9 -620,6 +623,9 @@@ static void init_debug_guardpage(void
         if (!debug_pagealloc_enabled())
                 return;
   
+ +      if (!debug_guardpage_minorder())
+ +              return;
+ +
         _debug_guardpage_enabled = true;
   }
   
@@@ -641,22 -640,19 +646,22 @@@ static int __init debug_guardpage_minor
         pr_info("Setting debug_guardpage_minorder to %lu\n", res);
         return 0;
   }
- -__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+ +early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
   
- -static inline void set_page_guard(struct zone *zone, struct page *page,
+ +static inline bool set_page_guard(struct zone *zone, struct page *page,
                                 unsigned int order, int migratetype)
   {
         struct page_ext *page_ext;
   
         if (!debug_guardpage_enabled())
- -              return;
+ +              return false;
+ +
+ +      if (order >= debug_guardpage_minorder())
+ +              return false;
   
         page_ext = lookup_page_ext(page);
         if (unlikely(!page_ext))
- -              return;
+ +              return false;
   
         __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
   
@@@ -664,8 -660,6 +669,8 @@@
         set_page_private(page, order);
         /* Guard pages are not available for any usage */
         __mod_zone_freepage_state(zone, -(1 << order), migratetype);
+ +
+ +      return true;
   }
   
   static inline void clear_page_guard(struct zone *zone, struct page *page,
@@@ -687,9 -681,9 +692,9 @@@
                 __mod_zone_freepage_state(zone, (1 << order), migratetype);
   }
   #else
- -struct page_ext_operations debug_guardpage_ops = { NULL, };
- -static inline void set_page_guard(struct zone *zone, struct page *page,
- -                              unsigned int order, int migratetype) {}
+ +struct page_ext_operations debug_guardpage_ops;
+ +static inline bool set_page_guard(struct zone *zone, struct page *page,
+ +                      unsigned int order, int migratetype) { return false; }
   static inline void clear_page_guard(struct zone *zone, struct page *page,
                                 unsigned int order, int migratetype) {}
   #endif
@@@ -1404,18 -1398,15 +1409,18 @@@ static void __init deferred_free_range(
                 return;
   
         /* Free a large naturally-aligned chunk if possible */
- -      if (nr_pages == MAX_ORDER_NR_PAGES &&
- -          (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
+ +      if (nr_pages == pageblock_nr_pages &&
+ +          (pfn & (pageblock_nr_pages - 1)) == 0) {
                 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- -              __free_pages_boot_core(page, MAX_ORDER-1);
+ +              __free_pages_boot_core(page, pageblock_order);
                 return;
         }
   
- -      for (i = 0; i < nr_pages; i++, page++)
+ +      for (i = 0; i < nr_pages; i++, page++, pfn++) {
+ +              if ((pfn & (pageblock_nr_pages - 1)) == 0)
+ +                      set_pageblock_migratetype(page, MIGRATE_MOVABLE);
                 __free_pages_boot_core(page, 0);
+ +      }
   }
   
   /* Completion tracking for deferred_init_memmap() threads */
@@@ -1483,9 -1474,9 +1488,9 @@@ static int __init deferred_init_memmap(
   
                         /*
                          * Ensure pfn_valid is checked every
- -                       * MAX_ORDER_NR_PAGES for memory holes
+ +                       * pageblock_nr_pages for memory holes
                          */
- -                      if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
+ +                      if ((pfn & (pageblock_nr_pages - 1)) == 0) {
                                 if (!pfn_valid(pfn)) {
                                         page = NULL;
                                         goto free_range;
@@@ -1498,7 -1489,7 +1503,7 @@@
                         }
   
                         /* Minimise pfn page lookups and scheduler checks */
- -                      if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) {
+ +                      if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
                                 page++;
                         } else {
                                 nr_pages += nr_to_free;
@@@ -1534,9 -1525,6 +1539,9 @@@ free_range
                         free_base_page = NULL;
                         free_base_pfn = nr_to_free = 0;
                 }
+ +              /* Free the last block of pages to the allocator */
+ +              nr_pages += nr_to_free;
+ +              deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
   
                 first_init_pfn = max(end_pfn, first_init_pfn);
         }
@@@ -1633,15 -1621,18 +1638,15 @@@ static inline void expand(struct zone *
                 size >>= 1;
                 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
   
- -              if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
- -                      debug_guardpage_enabled() &&
- -                      high < debug_guardpage_minorder()) {
- -                      /*
- -                       * Mark as guard pages (or page), that will allow to
- -                       * merge back to allocator when buddy will be freed.
- -                       * Corresponding page table entries will not be touched,
- -                       * pages will stay not present in virtual address space
- -                       */
- -                      set_page_guard(zone, &page[size], high, migratetype);
+ +              /*
+ +               * Mark as guard pages (or page); this allows them to be
+ +               * merged back into the allocator when the buddy is freed.
+ +               * The corresponding page table entries are not touched, and
+ +               * the pages stay not present in the virtual address space.
+ +               */
+ +              if (set_page_guard(zone, &page[size], high, migratetype))
                         continue;
- -              }
+ +
                 list_add(&page[size].lru, &area->free_list[migratetype]);
                 area->nr_free++;
                 set_page_order(&page[size], high);
@@@ -2503,14 -2494,9 +2508,14 @@@ int __isolate_free_page(struct page *pa
         mt = get_pageblock_migratetype(page);
   
         if (!is_migrate_isolate(mt)) {
- -              /* Obey watermarks as if the page was being allocated */
- -              watermark = low_wmark_pages(zone) + (1 << order);
- -              if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ +              /*
+ +               * Obey watermarks as if the page was being allocated. We can
+ +               * emulate a high-order watermark check with a raised order-0
+ +               * watermark, because we already know our high-order page
+ +               * exists.
+ +               */
+ +              watermark = min_wmark_pages(zone) + (1UL << order);
+ +              if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
                         return 0;
   
                 __mod_zone_freepage_state(zone, -(1UL << order), mt);
@@@ -2979,11 -2965,9 +2984,11 @@@ static DEFINE_RATELIMIT_STATE(nopage_rs
                 DEFAULT_RATELIMIT_INTERVAL,
                 DEFAULT_RATELIMIT_BURST);
   
- -void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
+ +void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
   {
         unsigned int filter = SHOW_MEM_FILTER_NODES;
+ +      struct va_format vaf;
+ +      va_list args;
   
         if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
             debug_guardpage_minorder() > 0)
@@@ -3001,16 -2985,22 +3006,16 @@@
         if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
                 filter &= ~SHOW_MEM_FILTER_NODES;
   
- -      if (fmt) {
- -              struct va_format vaf;
- -              va_list args;
+ +      pr_warn("%s: ", current->comm);
   
- -              va_start(args, fmt);
+ +      va_start(args, fmt);
+ +      vaf.fmt = fmt;
+ +      vaf.va = &args;
+ +      pr_cont("%pV", &vaf);
+ +      va_end(args);
   
- -              vaf.fmt = fmt;
- -              vaf.va = &args;
+ +      pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);
   
- -              pr_warn("%pV", &vaf);
- -
- -              va_end(args);
- -      }
- -
- -      pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
- -              current->comm, order, gfp_mask, &gfp_mask);
         dump_stack();
         if (!should_suppress_show_mem())
                 show_mem(filter);
@@@ -3152,65 -3142,6 +3157,65 @@@ __alloc_pages_direct_compact(gfp_t gfp_
         return NULL;
   }
   
+ +static inline bool
+ +should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
+ +                   enum compact_result compact_result,
+ +                   enum compact_priority *compact_priority,
+ +                   int *compaction_retries)
+ +{
+ +      int max_retries = MAX_COMPACT_RETRIES;
+ +      int min_priority;
+ +
+ +      if (!order)
+ +              return false;
+ +
+ +      if (compaction_made_progress(compact_result))
+ +              (*compaction_retries)++;
+ +
+ +      /*
+ +       * Compaction considers all the zones as desperately out of memory,
+ +       * so it doesn't really make much sense to retry except when the
+ +       * failure could be caused by insufficient priority.
+ +       */
+ +      if (compaction_failed(compact_result))
+ +              goto check_priority;
+ +
+ +      /*
+ +       * Make sure the compaction wasn't deferred or didn't bail out early
+ +       * due to lock contention before we declare that we should give up.
+ +       * But do not retry if the given zonelist is not suitable for
+ +       * compaction.
+ +       */
+ +      if (compaction_withdrawn(compact_result))
+ +              return compaction_zonelist_suitable(ac, order, alloc_flags);
+ +
+ +      /*
+ +       * !costly requests are much more important than __GFP_REPEAT
+ +       * costly ones because they are de facto nofail and invoke OOM
+ +       * killer to move on while costly can fail and users are ready
+ +       * to cope with that. 1/4 retries is rather arbitrary but we
+ +       * would need much more detailed feedback from compaction to
+ +       * make a better decision.
+ +       */
+ +      if (order > PAGE_ALLOC_COSTLY_ORDER)
+ +              max_retries /= 4;
+ +      if (*compaction_retries <= max_retries)
+ +              return true;
+ +
+ +      /*
+ +       * Make sure there are attempts at the highest priority if we exhausted
+ +       * all retries or failed at the lower priorities. When the priority can
+ +       * still be stepped, the retry counter is reset so that the new
+ +       * priority gets its own retry budget.
+ +       */
+ +check_priority:
+ +      min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
+ +                      MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
+ +      if (*compact_priority > min_priority) {
+ +              (*compact_priority)--;
+ +              *compaction_retries = 0;
+ +              return true;
+ +      }
+ +      return false;
+ +}
   #else
   static inline struct page *
   __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
@@@ -3221,11 -3152,13 +3226,11 @@@
         return NULL;
   }
   
- -#endif /* CONFIG_COMPACTION */
- -
   static inline bool
   should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
                      enum compact_result compact_result,
                      enum compact_priority *compact_priority,
- -                   int compaction_retries)
+ +                   int *compaction_retries)
   {
         struct zone *zone;
         struct zoneref *z;
@@@ -3247,7 -3180,6 +3252,7 @@@
         }
         return false;
   }
+ +#endif /* CONFIG_COMPACTION */
   
   /* Perform direct synchronous page reclaim */
   static int
@@@ -3398,26 -3330,16 +3403,26 @@@ bool gfp_pfmemalloc_allowed(gfp_t gfp_m
   static inline bool
   should_reclaim_retry(gfp_t gfp_mask, unsigned order,
                      struct alloc_context *ac, int alloc_flags,
- -                   bool did_some_progress, int no_progress_loops)
+ +                   bool did_some_progress, int *no_progress_loops)
   {
         struct zone *zone;
         struct zoneref *z;
   
+ +      /*
+ +       * Costly allocations might have made progress, but this doesn't mean
+ +       * their order will become available, due to high fragmentation, so
+ +       * always increment the no-progress counter for them.
+ +       */
+ +      if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
+ +              *no_progress_loops = 0;
+ +      else
+ +              (*no_progress_loops)++;
+ +
         /*
          * Make sure we converge to OOM if we cannot make any progress
          * several times in a row.
          */
- -      if (no_progress_loops > MAX_RECLAIM_RETRIES)
+ +      if (*no_progress_loops > MAX_RECLAIM_RETRIES)
                 return false;
   
         /*
@@@ -3432,7 -3354,7 +3437,7 @@@
                 unsigned long reclaimable;
   
                 available = reclaimable = zone_reclaimable_pages(zone);
- -              available -= DIV_ROUND_UP(no_progress_loops * available,
+ +              available -= DIV_ROUND_UP((*no_progress_loops) * available,
                                           MAX_RECLAIM_RETRIES);
                 available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
   
@@@ -3493,8 -3415,6 +3498,8 @@@ __alloc_pages_slowpath(gfp_t gfp_mask, 
         enum compact_result compact_result;
         int compaction_retries = 0;
         int no_progress_loops = 0;
+ +      unsigned long alloc_start = jiffies;
+ +      unsigned int stall_timeout = 10 * HZ;
   
         /*
          * In the slowpath, we sanity check order to avoid ever trying to
@@@ -3639,6 -3559,9 +3644,6 @@@ retry
         if (page)
                 goto got_pg;
   
- -      if (order && compaction_made_progress(compact_result))
- -              compaction_retries++;
- -
         /* Do not loop if specifically requested */
         if (gfp_mask & __GFP_NORETRY)
                 goto nopage;
@@@ -3650,16 -3573,18 +3655,16 @@@
         if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
                 goto nopage;
   
- -      /*
- -       * Costly allocations might have made a progress but this doesn't mean
- -       * their order will become available due to high fragmentation so
- -       * always increment the no progress counter for them
- -       */
- -      if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
- -              no_progress_loops = 0;
- -      else
- -              no_progress_loops++;
+ +      /* Make sure we know about allocations which stall for too long */
+ +      if (time_after(jiffies, alloc_start + stall_timeout)) {
+ +              warn_alloc(gfp_mask,
+ +                      "page alloction stalls for %ums, order:%u\n",
+ +                      jiffies_to_msecs(jiffies-alloc_start), order);
+ +              stall_timeout += 10 * HZ;
+ +      }
   
         if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
- -                               did_some_progress > 0, no_progress_loops))
+ +                               did_some_progress > 0, &no_progress_loops))
                 goto retry;
   
         /*
@@@ -3671,7 -3596,7 +3676,7 @@@
         if (did_some_progress > 0 &&
                         should_compact_retry(ac, order, alloc_flags,
                                 compact_result, &compact_priority,
- -                              compaction_retries))
+ +                              &compaction_retries))
                 goto retry;
   
         /* Reclaim has failed us, start killing things */
@@@ -3686,8 -3611,7 +3691,8 @@@
         }
   
   nopage:
- -      warn_alloc_failed(gfp_mask, order, NULL);
+ +      warn_alloc(gfp_mask,
+ +                      "page allocation failure: order:%u", order);
   got_pg:
         return page;
   }
@@@ -4636,7 -4560,7 +4641,7 @@@ static void build_zonelists_in_node_ord
         int j;
         struct zonelist *zonelist;
   
- -      zonelist = &pgdat->node_zonelists[0];
+ +      zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
         for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
                 ;
         j = build_zonelists_node(NODE_DATA(node), zonelist, j);
@@@ -4652,7 -4576,7 +4657,7 @@@ static void build_thisnode_zonelists(pg
         int j;
         struct zonelist *zonelist;
   
- -      zonelist = &pgdat->node_zonelists[1];
+ +      zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
         j = build_zonelists_node(pgdat, zonelist, 0);
         zonelist->_zonerefs[j].zone = NULL;
         zonelist->_zonerefs[j].zone_idx = 0;
@@@ -4673,7 -4597,7 +4678,7 @@@ static void build_zonelists_in_zone_ord
         struct zone *z;
         struct zonelist *zonelist;
   
- -      zonelist = &pgdat->node_zonelists[0];
+ +      zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
         pos = 0;
         for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
                 for (j = 0; j < nr_nodes; j++) {
@@@ -4808,7 -4732,7 +4813,7 @@@ static void build_zonelists(pg_data_t *
   
         local_node = pgdat->node_id;
   
- -      zonelist = &pgdat->node_zonelists[0];
+ +      zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
         j = build_zonelists_node(pgdat, zonelist, 0);
   
         /*
@@@ -5080,6 -5004,15 +5085,6 @@@ void __meminit memmap_init_zone(unsigne
                         break;
   
   #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- -              /*
- -               * If not mirrored_kernelcore and ZONE_MOVABLE exists, range
- -               * from zone_movable_pfn[nid] to end of each node should be
- -               * ZONE_MOVABLE not ZONE_NORMAL. skip it.
- -               */
- -              if (!mirrored_kernelcore && zone_movable_pfn[nid])
- -                      if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
- -                              continue;
- -
                 /*
                  * Check given memblock attribute by firmware which can affect
                  * kernel memory layout.  If zone==ZONE_MOVABLE but memory is
@@@ -5523,12 -5456,6 +5528,12 @@@ static void __meminit adjust_zone_range
                         *zone_end_pfn = min(node_end_pfn,
                                 arch_zone_highest_possible_pfn[movable_zone]);
   
+ +              /* Adjust for ZONE_MOVABLE starting within this range */
+ +              } else if (!mirrored_kernelcore &&
+ +                      *zone_start_pfn < zone_movable_pfn[nid] &&
+ +                      *zone_end_pfn > zone_movable_pfn[nid]) {
+ +                      *zone_end_pfn = zone_movable_pfn[nid];
+ +
                 /* Check if this whole range is within ZONE_MOVABLE */
                 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
                         *zone_start_pfn = *zone_end_pfn;
@@@ -5632,23 -5559,28 +5637,23 @@@ static unsigned long __meminit zone_abs
          * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
          * and vice versa.
          */
- -      if (zone_movable_pfn[nid]) {
- -              if (mirrored_kernelcore) {
- -                      unsigned long start_pfn, end_pfn;
- -                      struct memblock_region *r;
- -
- -                      for_each_memblock(memory, r) {
- -                              start_pfn = clamp(memblock_region_memory_base_pfn(r),
- -                                                zone_start_pfn, zone_end_pfn);
- -                              end_pfn = clamp(memblock_region_memory_end_pfn(r),
- -                                              zone_start_pfn, zone_end_pfn);
- -
- -                              if (zone_type == ZONE_MOVABLE &&
- -                                  memblock_is_mirror(r))
- -                                      nr_absent += end_pfn - start_pfn;
- -
- -                              if (zone_type == ZONE_NORMAL &&
- -                                  !memblock_is_mirror(r))
- -                                      nr_absent += end_pfn - start_pfn;
- -                      }
- -              } else {
- -                      if (zone_type == ZONE_NORMAL)
- -                              nr_absent += node_end_pfn - zone_movable_pfn[nid];
+ +      if (mirrored_kernelcore && zone_movable_pfn[nid]) {
+ +              unsigned long start_pfn, end_pfn;
+ +              struct memblock_region *r;
+ +
+ +              for_each_memblock(memory, r) {
+ +                      start_pfn = clamp(memblock_region_memory_base_pfn(r),
+ +                                        zone_start_pfn, zone_end_pfn);
+ +                      end_pfn = clamp(memblock_region_memory_end_pfn(r),
+ +                                      zone_start_pfn, zone_end_pfn);
+ +
+ +                      if (zone_type == ZONE_MOVABLE &&
+ +                          memblock_is_mirror(r))
+ +                              nr_absent += end_pfn - start_pfn;
+ +
+ +                      if (zone_type == ZONE_NORMAL &&
+ +                          !memblock_is_mirror(r))
+ +                              nr_absent += end_pfn - start_pfn;
                 }
         }
   
@@@ -7002,17 -6934,6 +7007,17 @@@ static int __init set_hashdist(char *st
   __setup("hashdist=", set_hashdist);
   #endif
   
+ +#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
+ +/*
+ + * Returns the number of pages that the arch has reserved but that
+ + * are not known to alloc_large_system_hash().
+ + *
+ + * This is the fallback used when __HAVE_ARCH_RESERVED_KERNEL_PAGES is
+ + * not defined; it reports that there are no such pages.
+ + */
+ +static unsigned long __init arch_reserved_kernel_pages(void)
+ +{
+ +      return 0;
+ +}
+ +#endif
+ +
   /*
    * allocate a large system hash table from bootmem
    * - it is assumed that the hash table must contain an exact power-of-2
@@@ -7037,7 -6958,6 +7042,7 @@@ void *__init alloc_large_system_hash(co
         if (!numentries) {
                 /* round applicable memory size up to nearest megabyte */
                 numentries = nr_kernel_pages;
+ +              numentries -= arch_reserved_kernel_pages();
   
                 /* It isn't necessary when PAGE_SIZE >= 1MB */
                 if (PAGE_SHIFT < 20)
diff --combined net/core/dev.c

index f1fe26f,ee076c2..4bc19a1
--- 1/net/core/dev.c
--- 2/net/core/dev.c
+++ b/net/core/dev.c
@@@ -3355,6 -3355,16 +3355,6 @@@ static int __dev_queue_xmit(struct sk_b
         else
                 skb_dst_force(skb);
   
- -#ifdef CONFIG_NET_SWITCHDEV
- -      /* Don't forward if offload device already forwarded */
- -      if (skb->offload_fwd_mark &&
- -          skb->offload_fwd_mark == dev->offload_fwd_mark) {
- -              consume_skb(skb);
- -              rc = NET_XMIT_SUCCESS;
- -              goto out;
- -      }
- -#endif
- -
         txq = netdev_pick_tx(dev, skb, accel_priv);
         q = rcu_dereference_bh(txq->qdisc);
   
@@@ -3845,7 -3855,7 +3845,7 @@@ int netif_rx_ni(struct sk_buff *skb
   }
   EXPORT_SYMBOL(netif_rx_ni);
   
- static void net_tx_action(struct softirq_action *h)
+ static __latent_entropy void net_tx_action(struct softirq_action *h)
   {
         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
   
@@@ -3904,7 -3914,8 +3904,7 @@@
         }
   }
   
- -#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
- -    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
+ +#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
   /* This hook is defined here for ATM LANE */
   int (*br_fdb_test_addr_hook)(struct net_device *dev,
                              unsigned char *addr) __read_mostly;
@@@ -4055,17 -4066,12 +4055,17 @@@ static inline int nf_ingress(struct sk_
   {
   #ifdef CONFIG_NETFILTER_INGRESS
         if (nf_hook_ingress_active(skb)) {
+ +              int ingress_retval;
+ +
                 if (*pt_prev) {
                         *ret = deliver_skb(skb, *pt_prev, orig_dev);
                         *pt_prev = NULL;
                 }
   
- -              return nf_hook_ingress(skb);
+ +              rcu_read_lock();
+ +              ingress_retval = nf_hook_ingress(skb);
+ +              rcu_read_unlock();
+ +              return ingress_retval;
         }
   #endif /* CONFIG_NETFILTER_INGRESS */
         return 0;
@@@ -4302,53 -4308,32 +4302,53 @@@ int netif_receive_skb(struct sk_buff *s
   }
   EXPORT_SYMBOL(netif_receive_skb);
   
- -/* Network device is going away, flush any packets still pending
- - * Called with irqs disabled.
- - */
- -static void flush_backlog(void *arg)
+ +DEFINE_PER_CPU(struct work_struct, flush_works);
+ +
+ +/* Network device is going away, flush any packets still pending */
+ +static void flush_backlog(struct work_struct *work)
   {
- -      struct net_device *dev = arg;
- -      struct softnet_data *sd = this_cpu_ptr(&softnet_data);
         struct sk_buff *skb, *tmp;
+ +      struct softnet_data *sd;
+ +
+ +      local_bh_disable();
+ +      sd = this_cpu_ptr(&softnet_data);
   
+ +      local_irq_disable();
         rps_lock(sd);
         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
- -              if (skb->dev == dev) {
+ +              if (skb->dev->reg_state == NETREG_UNREGISTERING) {
                         __skb_unlink(skb, &sd->input_pkt_queue);
                         kfree_skb(skb);
                         input_queue_head_incr(sd);
                 }
         }
         rps_unlock(sd);
+ +      local_irq_enable();
   
         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
- -              if (skb->dev == dev) {
+ +              if (skb->dev->reg_state == NETREG_UNREGISTERING) {
                         __skb_unlink(skb, &sd->process_queue);
                         kfree_skb(skb);
                         input_queue_head_incr(sd);
                 }
         }
+ +      local_bh_enable();
+ +}
+ +
+ +static void flush_all_backlogs(void)
+ +{
+ +      unsigned int cpu;
+ +
+ +      get_online_cpus();
+ +
+ +      for_each_online_cpu(cpu)
+ +              queue_work_on(cpu, system_highpri_wq,
+ +                            per_cpu_ptr(&flush_works, cpu));
+ +
+ +      for_each_online_cpu(cpu)
+ +              flush_work(per_cpu_ptr(&flush_works, cpu));
+ +
+ +      put_online_cpus();
   }
   
   static int napi_gro_complete(struct sk_buff *skb)
@@@ -4836,9 -4821,8 +4836,9 @@@ static bool sd_has_rps_ipi_waiting(stru
   
   static int process_backlog(struct napi_struct *napi, int quota)
   {
- -      int work = 0;
         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
+ +      bool again = true;
+ +      int work = 0;
   
         /* Check if we have pending ipi, its better to send them now,
          * not waiting net_rx_action() end.
@@@ -4849,20 -4833,23 +4849,20 @@@
         }
   
         napi->weight = weight_p;
- -      local_irq_disable();
- -      while (1) {
+ +      while (again) {
                 struct sk_buff *skb;
   
                 while ((skb = __skb_dequeue(&sd->process_queue))) {
                         rcu_read_lock();
- -                      local_irq_enable();
                         __netif_receive_skb(skb);
                         rcu_read_unlock();
- -                      local_irq_disable();
                         input_queue_head_incr(sd);
- -                      if (++work >= quota) {
- -                              local_irq_enable();
+ +                      if (++work >= quota)
                                 return work;
- -                      }
+ +
                 }
   
+ +              local_irq_disable();
                 rps_lock(sd);
                 if (skb_queue_empty(&sd->input_pkt_queue)) {
                         /*
@@@ -4874,14 -4861,16 +4874,14 @@@
                          * and we dont need an smp_mb() memory barrier.
                          */
                         napi->state = 0;
- -                      rps_unlock(sd);
- -
- -                      break;
+ +                      again = false;
+ +              } else {
+ +                      skb_queue_splice_tail_init(&sd->input_pkt_queue,
+ +                                                 &sd->process_queue);
                 }
- -
- -              skb_queue_splice_tail_init(&sd->input_pkt_queue,
- -                                         &sd->process_queue);
                 rps_unlock(sd);
+ +              local_irq_enable();
         }
- -      local_irq_enable();
   
         return work;
   }
@@@ -5198,7 -5187,7 +5198,7 @@@ out_unlock
         return work;
   }
   
- static void net_rx_action(struct softirq_action *h)
+ static __latent_entropy void net_rx_action(struct softirq_action *h)
   {
         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
         unsigned long time_limit = jiffies + 2;
@@@ -5589,7 -5578,6 +5589,7 @@@ static inline bool netdev_adjacent_is_n
   
   static int __netdev_adjacent_dev_insert(struct net_device *dev,
                                         struct net_device *adj_dev,
+ +                                      u16 ref_nr,
                                         struct list_head *dev_list,
                                         void *private, bool master)
   {
@@@ -5599,7 -5587,7 +5599,7 @@@
         adj = __netdev_find_adj(adj_dev, dev_list);
   
         if (adj) {
- -              adj->ref_nr++;
+ +              adj->ref_nr += ref_nr;
                 return 0;
         }
   
@@@ -5609,7 -5597,7 +5609,7 @@@
   
         adj->dev = adj_dev;
         adj->master = master;
- -      adj->ref_nr = 1;
+ +      adj->ref_nr = ref_nr;
         adj->private = private;
         dev_hold(adj_dev);
   
@@@ -5648,7 -5636,6 +5648,7 @@@ free_adj
   
   static void __netdev_adjacent_dev_remove(struct net_device *dev,
                                          struct net_device *adj_dev,
+ +                                       u16 ref_nr,
                                          struct list_head *dev_list)
   {
         struct netdev_adjacent *adj;
@@@ -5661,10 -5648,10 +5661,10 @@@
                 BUG();
         }
   
- -      if (adj->ref_nr > 1) {
- -              pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
- -                       adj->ref_nr-1);
- -              adj->ref_nr--;
+ +      if (adj->ref_nr > ref_nr) {
+ +              pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
+ +                       ref_nr, adj->ref_nr-ref_nr);
+ +              adj->ref_nr -= ref_nr;
                 return;
         }
   
@@@ -5683,22 -5670,21 +5683,22 @@@
   
   static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
                                             struct net_device *upper_dev,
+ +                                          u16 ref_nr,
                                             struct list_head *up_list,
                                             struct list_head *down_list,
                                             void *private, bool master)
   {
         int ret;
   
- -      ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
- -                                         master);
+ +      ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
+ +                                         private, master);
         if (ret)
                 return ret;
   
- -      ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
- -                                         false);
+ +      ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
+ +                                         private, false);
         if (ret) {
- -              __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
+ +              __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
                 return ret;
         }
   
@@@ -5706,10 -5692,9 +5706,10 @@@
   }
   
   static int __netdev_adjacent_dev_link(struct net_device *dev,
- -                                    struct net_device *upper_dev)
+ +                                    struct net_device *upper_dev,
+ +                                    u16 ref_nr)
   {
- -      return __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ +      return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
                                                 &dev->all_adj_list.upper,
                                                 &upper_dev->all_adj_list.lower,
                                                 NULL, false);
@@@ -5717,19 -5702,17 +5717,19 @@@
   
   static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
                                                struct net_device *upper_dev,
+ +                                             u16 ref_nr,
                                                struct list_head *up_list,
                                                struct list_head *down_list)
   {
- -      __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
- -      __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
+ +      __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
+ +      __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
   }
   
   static void __netdev_adjacent_dev_unlink(struct net_device *dev,
- -                                       struct net_device *upper_dev)
+ +                                       struct net_device *upper_dev,
+ +                                       u16 ref_nr)
   {
- -      __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
+ +      __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
                                            &dev->all_adj_list.upper,
                                            &upper_dev->all_adj_list.lower);
   }
@@@ -5738,17 -5721,17 +5738,17 @@@ static int __netdev_adjacent_dev_link_n
                                                 struct net_device *upper_dev,
                                                 void *private, bool master)
   {
- -      int ret = __netdev_adjacent_dev_link(dev, upper_dev);
+ +      int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
   
         if (ret)
                 return ret;
   
- -      ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ +      ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
                                                &dev->adj_list.upper,
                                                &upper_dev->adj_list.lower,
                                                private, master);
         if (ret) {
- -              __netdev_adjacent_dev_unlink(dev, upper_dev);
+ +              __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
                 return ret;
         }
   
@@@ -5758,8 -5741,8 +5758,8 @@@
   static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
                                                    struct net_device *upper_dev)
   {
- -      __netdev_adjacent_dev_unlink(dev, upper_dev);
- -      __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
+ +      __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
+ +      __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
                                            &dev->adj_list.upper,
                                            &upper_dev->adj_list.lower);
   }
@@@ -5812,7 -5795,7 +5812,7 @@@ static int __netdev_upper_dev_link(stru
                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
                         pr_debug("Interlinking %s with %s, non-neighbour\n",
                                  i->dev->name, j->dev->name);
- -                      ret = __netdev_adjacent_dev_link(i->dev, j->dev);
+ +                      ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
                         if (ret)
                                 goto rollback_mesh;
                 }
@@@ -5822,7 -5805,7 +5822,7 @@@
         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
                 pr_debug("linking %s's upper device %s with %s\n",
                          upper_dev->name, i->dev->name, dev->name);
- -              ret = __netdev_adjacent_dev_link(dev, i->dev);
+ +              ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
                 if (ret)
                         goto rollback_upper_mesh;
         }
@@@ -5831,7 -5814,7 +5831,7 @@@
         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
                          i->dev->name, upper_dev->name);
- -              ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
+ +              ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
                 if (ret)
                         goto rollback_lower_mesh;
         }
@@@ -5849,7 -5832,7 +5849,7 @@@ rollback_lower_mesh
         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
                 if (i == to_i)
                         break;
- -              __netdev_adjacent_dev_unlink(i->dev, upper_dev);
+ +              __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
         }
   
         i = NULL;
@@@ -5859,7 -5842,7 +5859,7 @@@ rollback_upper_mesh
         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
                 if (i == to_i)
                         break;
- -              __netdev_adjacent_dev_unlink(dev, i->dev);
+ +              __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
         }
   
         i = j = NULL;
@@@ -5871,7 -5854,7 +5871,7 @@@ rollback_mesh
                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
                         if (i == to_i && j == to_j)
                                 break;
- -                      __netdev_adjacent_dev_unlink(i->dev, j->dev);
+ +                      __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
                 }
                 if (i == to_i)
                         break;
@@@ -5951,16 -5934,16 +5951,16 @@@ void netdev_upper_dev_unlink(struct net
          */
         list_for_each_entry(i, &dev->all_adj_list.lower, list)
                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
- -                      __netdev_adjacent_dev_unlink(i->dev, j->dev);
+ +                      __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
   
         /* remove also the devices itself from lower/upper device
          * list
          */
         list_for_each_entry(i, &dev->all_adj_list.lower, list)
- -              __netdev_adjacent_dev_unlink(i->dev, upper_dev);
+ +              __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
   
         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
- -              __netdev_adjacent_dev_unlink(dev, i->dev);
+ +              __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
   
         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
                                       &changeupper_info.info);
@@@ -6740,8 -6723,8 +6740,8 @@@ static void rollback_registered_many(st
                 unlist_netdevice(dev);
   
                 dev->reg_state = NETREG_UNREGISTERING;
- -              on_each_cpu(flush_backlog, dev, 1);
         }
+ +      flush_all_backlogs();
   
         synchronize_net();
   
@@@ -7658,9 -7641,6 +7658,9 @@@ struct net_device *alloc_netdev_mqs(in
         INIT_LIST_HEAD(&dev->all_adj_list.lower);
         INIT_LIST_HEAD(&dev->ptype_all);
         INIT_LIST_HEAD(&dev->ptype_specific);
+ +#ifdef CONFIG_NET_SCHED
+ +      hash_init(dev->qdisc_hash);
+ +#endif
         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
         setup(dev);
   
@@@ -8306,11 -8286,8 +8306,11 @@@ static int __init net_dev_init(void
          */
   
         for_each_possible_cpu(i) {
+ +              struct work_struct *flush = per_cpu_ptr(&flush_works, i);
                 struct softnet_data *sd = &per_cpu(softnet_data, i);
   
+ +              INIT_WORK(flush, flush_backlog);
+ +
                 skb_queue_head_init(&sd->input_pkt_queue);
                 skb_queue_head_init(&sd->process_queue);
                 INIT_LIST_HEAD(&sd->poll_list);
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 15 Oct 2016 17:03:15 +0000 (10:03 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 15 Oct 2016 17:03:15 +0000 (10:03 -0700)
		1	2
arch/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/kernel/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
block/blk-softirq.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/char/random.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/namespace.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/compiler.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/fdtable.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/init.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/random.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/rcu/tree.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/fair.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/softirq.c	patch \|	diff1 \|	diff2 \|	blob \| history
lib/irq_poll.c	patch \|	diff1 \|	diff2 \|	blob \| history
lib/random32.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/page_alloc.c	patch \|	diff1 \|	diff2 \|	blob \| history
net/core/dev.c	patch \|	diff1 \|	diff2 \|	blob \| history