Merge branch 'akpm' (patches from Andrew)
author     Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 8 Oct 2016 04:38:00 +0000 (21:38 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 8 Oct 2016 04:38:00 +0000 (21:38 -0700)
Merge updates from Andrew Morton:

 - fsnotify updates

 - ocfs2 updates

 - all of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (127 commits)
  console: don't prefer first registered if DT specifies stdout-path
  cred: simpler, 1D supplementary groups
  CREDITS: update Pavel's information, add GPG key, remove snail mail address
  mailmap: add Johan Hovold
  .gitattributes: set git diff driver for C source code files
  uprobes: remove function declarations from arch/{mips,s390}
  spelling.txt: "modeled" is spelt correctly
  nmi_backtrace: generate one-line reports for idle cpus
  arch/tile: adopt the new nmi_backtrace framework
  nmi_backtrace: do a local dump_stack() instead of a self-NMI
  nmi_backtrace: add more trigger_*_cpu_backtrace() methods
  min/max: remove sparse warnings when they're nested
  Documentation/filesystems/proc.txt: add more description for maps/smaps
  mm, proc: fix region lost in /proc/self/smaps
  proc: fix timerslack_ns CAP_SYS_NICE check when adjusting self
  proc: add LSM hook checks to /proc/<tid>/timerslack_ns
  proc: relax /proc/<tid>/timerslack_ns capability requirements
  meminfo: break apart a very long seq_printf with #ifdefs
  seq/proc: modify seq_put_decimal_[u]ll to take a const char *, not char
  proc: faster /proc/*/status
  ...

195 files changed:
.gitattributes [new file with mode: 0644]
.mailmap
CREDITS
Documentation/filesystems/proc.txt
arch/alpha/Kconfig
arch/alpha/kernel/vmlinux.lds.S
arch/arc/kernel/vmlinux.lds.S
arch/arm/Kconfig
arch/arm/include/asm/irq.h
arch/arm/kernel/smp.c
arch/arm/kernel/vmlinux-xip.lds.S
arch/arm/kernel/vmlinux.lds.S
arch/arm64/Kconfig
arch/arm64/kernel/vmlinux.lds.S
arch/avr32/kernel/vmlinux.lds.S
arch/blackfin/kernel/vmlinux.lds.S
arch/c6x/kernel/vmlinux.lds.S
arch/cris/kernel/vmlinux.lds.S
arch/frv/kernel/vmlinux.lds.S
arch/h8300/kernel/vmlinux.lds.S
arch/hexagon/kernel/vmlinux.lds.S
arch/ia64/include/asm/atomic.h
arch/ia64/kernel/vmlinux.lds.S
arch/m32r/kernel/vmlinux.lds.S
arch/m68k/kernel/vmlinux-nommu.lds
arch/m68k/kernel/vmlinux-std.lds
arch/m68k/kernel/vmlinux-sun3.lds
arch/metag/kernel/vmlinux.lds.S
arch/microblaze/kernel/vmlinux.lds.S
arch/mips/Kconfig
arch/mips/include/asm/irq.h
arch/mips/include/asm/pgtable.h
arch/mips/include/asm/uprobes.h
arch/mips/kernel/process.c
arch/mips/kernel/vmlinux.lds.S
arch/mn10300/kernel/vmlinux.lds.S
arch/nios2/kernel/vmlinux.lds.S
arch/openrisc/kernel/vmlinux.lds.S
arch/parisc/Kconfig
arch/parisc/kernel/vmlinux.lds.S
arch/powerpc/Kconfig
arch/powerpc/include/asm/mmzone.h
arch/powerpc/kernel/fadump.c
arch/powerpc/kernel/vmlinux.lds.S
arch/s390/Kconfig
arch/s390/include/asm/uprobes.h
arch/s390/kernel/compat_linux.c
arch/s390/kernel/vmlinux.lds.S
arch/score/kernel/vmlinux.lds.S
arch/sh/kernel/vmlinux.lds.S
arch/sparc/Kconfig
arch/sparc/include/asm/irq_64.h
arch/sparc/kernel/process_64.c
arch/sparc/kernel/vmlinux.lds.S
arch/tile/Kconfig
arch/tile/include/asm/irq.h
arch/tile/kernel/entry.S
arch/tile/kernel/pmc.c
arch/tile/kernel/process.c
arch/tile/kernel/traps.c
arch/tile/kernel/vmlinux.lds.S
arch/um/kernel/dyn.lds.S
arch/um/kernel/uml.lds.S
arch/unicore32/kernel/vmlinux.lds.S
arch/x86/Kconfig
arch/x86/include/asm/irq.h
arch/x86/include/asm/irqflags.h
arch/x86/include/asm/pgtable_types.h
arch/x86/kernel/acpi/cstate.c
arch/x86/kernel/apic/hw_nmi.c
arch/x86/kernel/process.c
arch/x86/kernel/vmlinux.lds.S
arch/xtensa/kernel/vmlinux.lds.S
drivers/acpi/processor_idle.c
drivers/base/memory.c
drivers/cpuidle/driver.c
drivers/idle/intel_idle.c
drivers/of/base.c
drivers/staging/lustre/lustre/ptlrpc/sec.c
fs/Kconfig
fs/dax.c
fs/ext2/file.c
fs/ext4/file.c
fs/hugetlbfs/inode.c
fs/nfs/internal.h
fs/nfs/pagelist.c
fs/nfs/read.c
fs/nfs/write.c
fs/nfsd/auth.c
fs/nfsd/nfs4state.c
fs/notify/fanotify/fanotify_user.c
fs/notify/group.c
fs/notify/inotify/inotify_user.c
fs/notify/notification.c
fs/ocfs2/cluster/tcp.c
fs/ocfs2/dlm/dlmdomain.c
fs/ocfs2/dlmfs/dlmfs.c
fs/ocfs2/inode.h
fs/ocfs2/super.c
fs/proc/array.c
fs/proc/base.c
fs/proc/meminfo.c
fs/proc/stat.c
fs/proc/task_mmu.c
fs/seq_file.c
fs/xfs/xfs_file.c
include/asm-generic/pgtable.h
include/asm-generic/vmlinux.lds.h
include/linux/bitops.h
include/linux/bootmem.h
include/linux/compaction.h
include/linux/console.h
include/linux/cpu.h
include/linux/cred.h
include/linux/fsnotify_backend.h
include/linux/huge_mm.h
include/linux/hugetlb.h
include/linux/jiffies.h
include/linux/kernel.h
include/linux/memblock.h
include/linux/memcontrol.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/nmi.h
include/linux/oom.h
include/linux/page_ext.h
include/linux/page_owner.h
include/linux/pagemap.h
include/linux/sched.h
include/linux/seq_file.h
include/linux/swap.h
include/linux/writeback.h
include/trace/events/compaction.h
kernel/exit.c
kernel/fork.c
kernel/groups.c
kernel/power/process.c
kernel/printk/printk.c
kernel/sched/idle.c
kernel/uid16.c
lib/Kconfig
lib/atomic64_test.c
lib/nmi_backtrace.c
mm/bootmem.c
mm/compaction.c
mm/debug.c
mm/filemap.c
mm/huge_memory.c
mm/hugetlb.c
mm/internal.h
mm/ksm.c
mm/memblock.c
mm/memcontrol.c
mm/memory.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/migrate.c
mm/mincore.c
mm/mlock.c
mm/mmap.c
mm/mprotect.c
mm/nobootmem.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
mm/page_ext.c
mm/page_io.c
mm/page_isolation.c
mm/page_owner.c
mm/shmem.c
mm/swap.c
mm/swap_state.c
mm/swapfile.c
mm/vmacache.c
mm/vmalloc.c
mm/vmscan.c
mm/vmstat.c
net/core/sock.c
net/ipv4/ping.c
net/ipv4/tcp.c
net/ipv4/tcp_ipv4.c
net/sunrpc/auth_generic.c
net/sunrpc/auth_gss/gss_rpc_xdr.c
net/sunrpc/auth_gss/svcauth_gss.c
net/sunrpc/auth_unix.c
net/sunrpc/svcauth_unix.c
scripts/mod/modpost.c
scripts/recordmcount.c
scripts/recordmcount.pl
scripts/spelling.txt
tools/testing/selftests/vm/.gitignore
tools/testing/selftests/vm/Makefile
tools/testing/selftests/vm/mlock-random-test.c [new file with mode: 0644]
tools/testing/selftests/vm/mlock2-tests.c
tools/testing/selftests/vm/mlock2.h [new file with mode: 0644]

diff --git a/.gitattributes b/.gitattributes
new file mode 100644 (file)
index 0000000..89c411b
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+*.c   diff=cpp
+*.h   diff=cpp
index 967f882..2408e56 100644 (file)
--- a/.mailmap
+++ b/.mailmap
@@ -75,6 +75,8 @@ Jean Tourrilhes <jt@hpl.hp.com>
 Jeff Garzik <jgarzik@pretzel.yyz.us>
 Jens Axboe <axboe@suse.de>
 Jens Osterkamp <Jens.Osterkamp@de.ibm.com>
+Johan Hovold <johan@kernel.org> <jhovold@gmail.com>
+Johan Hovold <johan@kernel.org> <johan@hovoldconsulting.com>
 John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
 John Stultz <johnstul@us.ibm.com>
 <josh@joshtriplett.org> <josh@freedesktop.org>
diff --git a/CREDITS b/CREDITS
index df0a50e..513aaa3 100644 (file)
--- a/CREDITS
+++ b/CREDITS
@@ -2296,11 +2296,11 @@ D: Initial implementation of VC's, pty's and select()
 
 N: Pavel Machek
 E: pavel@ucw.cz
-D: Softcursor for vga, hypertech cdrom support, vcsa bugfix, nbd
+P: 4096R/92DFCE96 4FA7 9EEF FCD4 C44F C585  B8C7 C060 2241 92DF CE96
+D: Softcursor for vga, hypertech cdrom support, vcsa bugfix, nbd,
 D: sun4/330 port, capabilities for elf, speedup for rm on ext2, USB,
-D: work on suspend-to-ram/disk, killing duplicates from ioctl32
-S: Volkova 1131
-S: 198 00 Praha 9
+D: work on suspend-to-ram/disk, killing duplicates from ioctl32,
+D: Altera SoCFPGA and Nokia N900 support.
 S: Czech Republic
 
 N: Paul Mackerras
index fcc1ac0..219ffd4 100644 (file)
@@ -515,6 +515,18 @@ be vanished or the reverse -- new added.
 This file is only present if the CONFIG_MMU kernel configuration option is
 enabled.
 
+Note: reading /proc/PID/maps or /proc/PID/smaps is inherently racy (consistent
+output can be achieved only in the single read call).
+This typically manifests when doing partial reads of these files while the
+memory map is being modified.  Despite the races, we do provide the following
+guarantees:
+
+1) The mapped addresses never go backwards, which implies no two
+   regions will ever overlap.
+2) If there is something at a given vaddr during the entirety of the
+   life of the smaps/maps walk, there will be some output for it.
+
+
 The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG
 bits on both physical and virtual pages associated with a process, and the
 soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details).
index 7f312d8..0e49d39 100644 (file)
@@ -15,7 +15,6 @@ config ALPHA
        select GENERIC_IRQ_SHOW
        select ARCH_WANT_IPC_PARSE_VERSION
        select ARCH_HAVE_NMI_SAFE_CMPXCHG
-       select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
        select AUDIT_ARCH
        select GENERIC_CLOCKEVENTS
        select GENERIC_SMP_IDLE_THREAD
index 647b84c..cebecfb 100644 (file)
@@ -22,6 +22,7 @@ SECTIONS
                HEAD_TEXT
                TEXT_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                *(.fixup)
                *(.gnu.warning)
index 3661107..f35ed57 100644 (file)
@@ -89,6 +89,7 @@ SECTIONS
                _text = .;
                TEXT_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                KPROBES_TEXT
                *(.fixup)
index 125657b..b5d529f 100644 (file)
@@ -2,7 +2,6 @@ config ARM
        bool
        default y
        select ARCH_CLOCKSOURCE_DATA
-       select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
        select ARCH_HAS_DEVMEM_IS_ALLOWED
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
index 1bd9510..e53638c 100644 (file)
@@ -36,8 +36,9 @@ extern void set_handle_irq(void (*handle_irq)(struct pt_regs *));
 #endif
 
 #ifdef CONFIG_SMP
-extern void arch_trigger_all_cpu_backtrace(bool);
-#define arch_trigger_all_cpu_backtrace(x) arch_trigger_all_cpu_backtrace(x)
+extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask,
+                                          bool exclude_self);
+#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
 #endif
 
 static inline int nr_legacy_irqs(void)
index 937c892..7dd14e8 100644 (file)
@@ -748,19 +748,10 @@ core_initcall(register_cpufreq_notifier);
 
 static void raise_nmi(cpumask_t *mask)
 {
-       /*
-        * Generate the backtrace directly if we are running in a calling
-        * context that is not preemptible by the backtrace IPI. Note
-        * that nmi_cpu_backtrace() automatically removes the current cpu
-        * from mask.
-        */
-       if (cpumask_test_cpu(smp_processor_id(), mask) && irqs_disabled())
-               nmi_cpu_backtrace(NULL);
-
        smp_cross_call(mask, IPI_CPU_BACKTRACE);
 }
 
-void arch_trigger_all_cpu_backtrace(bool include_self)
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
 {
-       nmi_trigger_all_cpu_backtrace(include_self, raise_nmi);
+       nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_nmi);
 }
index cba1ec8..7fa487e 100644 (file)
@@ -98,6 +98,7 @@ SECTIONS
                        IRQENTRY_TEXT
                        TEXT_TEXT
                        SCHED_TEXT
+                       CPUIDLE_TEXT
                        LOCK_TEXT
                        KPROBES_TEXT
                        *(.gnu.warning)
index d24e5dd..f7f55df 100644 (file)
@@ -111,6 +111,7 @@ SECTIONS
                        SOFTIRQENTRY_TEXT
                        TEXT_TEXT
                        SCHED_TEXT
+                       CPUIDLE_TEXT
                        LOCK_TEXT
                        HYPERVISOR_TEXT
                        KPROBES_TEXT
index 0160040..30398db 100644 (file)
@@ -8,9 +8,9 @@ config ARM64
        select ARCH_CLOCKSOURCE_DATA
        select ARCH_HAS_DEVMEM_IS_ALLOWED
        select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
-       select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_GCOV_PROFILE_ALL
+       select ARCH_HAS_GIGANTIC_PAGE
        select ARCH_HAS_KCOV
        select ARCH_HAS_SG_CHAIN
        select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
index 5ce9b29..1105aab 100644 (file)
@@ -122,6 +122,7 @@ SECTIONS
                        ENTRY_TEXT
                        TEXT_TEXT
                        SCHED_TEXT
+                       CPUIDLE_TEXT
                        LOCK_TEXT
                        KPROBES_TEXT
                        HYPERVISOR_TEXT
index a458917..17f2730 100644 (file)
@@ -52,6 +52,7 @@ SECTIONS
                KPROBES_TEXT
                TEXT_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                *(.fixup)
                *(.gnu.warning)
index d920b95..68069a1 100644 (file)
@@ -33,6 +33,7 @@ SECTIONS
 #ifndef CONFIG_SCHEDULE_L1
                SCHED_TEXT
 #endif
+               CPUIDLE_TEXT
                LOCK_TEXT
                IRQENTRY_TEXT
                SOFTIRQENTRY_TEXT
index 50bc10f..a1a5c16 100644 (file)
@@ -70,6 +70,7 @@ SECTIONS
                _stext = .;
                TEXT_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                IRQENTRY_TEXT
                SOFTIRQENTRY_TEXT
index 7552c25..9795862 100644 (file)
@@ -43,6 +43,7 @@ SECTIONS
                HEAD_TEXT
                TEXT_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                *(.fixup)
                *(.text.__*)
index 7e958d8..aa6e573 100644 (file)
@@ -63,6 +63,7 @@ SECTIONS
        *(.text..tlbmiss)
        TEXT_TEXT
        SCHED_TEXT
+       CPUIDLE_TEXT
        LOCK_TEXT
 #ifdef CONFIG_DEBUG_INFO
        INIT_TEXT
index cb5dfb0..7f11da1 100644 (file)
@@ -29,6 +29,7 @@ SECTIONS
        _stext = . ;
                TEXT_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
 #if defined(CONFIG_ROMKERNEL)
                *(.int_redirect)
index 5f268c1..ec87e67 100644 (file)
@@ -50,6 +50,7 @@ SECTIONS
                _text = .;
                TEXT_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                KPROBES_TEXT
                *(.fixup)
index f565ad3..65d4bb2 100644 (file)
@@ -269,6 +269,22 @@ static __inline__ long atomic64_add_unless(atomic64_t *v, long a, long u)
 
 #define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
 
+static __inline__ long atomic64_dec_if_positive(atomic64_t *v)
+{
+       long c, old, dec;
+       c = atomic64_read(v);
+       for (;;) {
+               dec = c - 1;
+               if (unlikely(dec < 0))
+                       break;
+               old = atomic64_cmpxchg((v), c, dec);
+               if (likely(old == c))
+                       break;
+               c = old;
+       }
+       return dec;
+}
+
 /*
  * Atomically add I to V and return TRUE if the resulting value is
  * negative.
index dc506b0..f89d20c 100644 (file)
@@ -46,6 +46,7 @@ SECTIONS {
                __end_ivt_text = .;
                TEXT_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                KPROBES_TEXT
                *(.gnu.linkonce.t*)
index 018e4a7..ad1fe56 100644 (file)
@@ -31,6 +31,7 @@ SECTIONS
        HEAD_TEXT
        TEXT_TEXT
        SCHED_TEXT
+       CPUIDLE_TEXT
        LOCK_TEXT
        *(.fixup)
        *(.gnu.warning)
index 06a763f..d2c8abf 100644 (file)
@@ -45,6 +45,7 @@ SECTIONS {
                HEAD_TEXT
                TEXT_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                *(.fixup)
                . = ALIGN(16);
index d099359..5b5ce1e 100644 (file)
@@ -16,6 +16,7 @@ SECTIONS
        HEAD_TEXT
        TEXT_TEXT
        SCHED_TEXT
+       CPUIDLE_TEXT
        LOCK_TEXT
        *(.fixup)
        *(.gnu.warning)
index 8080469..fe5ea19 100644 (file)
@@ -16,6 +16,7 @@ SECTIONS
        HEAD_TEXT
        TEXT_TEXT
        SCHED_TEXT
+       CPUIDLE_TEXT
        LOCK_TEXT
        *(.fixup)
        *(.gnu.warning)
index 150ace9..e6c700e 100644 (file)
@@ -21,6 +21,7 @@ SECTIONS
   .text : {
        TEXT_TEXT
        SCHED_TEXT
+       CPUIDLE_TEXT
        LOCK_TEXT
        KPROBES_TEXT
        IRQENTRY_TEXT
index 0a47f04..289d0e7 100644 (file)
@@ -33,6 +33,7 @@ SECTIONS {
                EXIT_TEXT
                EXIT_CALL
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                KPROBES_TEXT
                IRQENTRY_TEXT
index 212ff92..1a322c8 100644 (file)
@@ -30,7 +30,6 @@ config MIPS
        select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT
        select RTC_LIB if !MACH_LOONGSON64
        select GENERIC_ATOMIC64 if !64BIT
-       select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
        select HAVE_DMA_CONTIGUOUS
        select HAVE_DMA_API_DEBUG
        select GENERIC_IRQ_PROBE
index 15e0fec..6bf10e7 100644 (file)
@@ -51,7 +51,8 @@ extern int cp0_fdc_irq;
 
 extern int get_c0_fdc_int(void);
 
-void arch_trigger_all_cpu_backtrace(bool);
-#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
+void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
+                                   bool exclude_self);
+#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
 
 #endif /* _ASM_IRQ_H */
index 70128d3..9e9e944 100644 (file)
@@ -673,8 +673,6 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 struct file;
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                unsigned long size, pgprot_t vma_prot);
-int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
-               unsigned long size, pgprot_t *vma_prot);
 #endif
 
 /*
index 70a4a2f..b86d1ae 100644 (file)
@@ -42,16 +42,4 @@ struct arch_uprobe_task {
        unsigned long saved_trap_nr;
 };
 
-extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup,
-       struct mm_struct *mm, unsigned long addr);
-extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs);
-extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);
-extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk);
-extern int arch_uprobe_exception_notify(struct notifier_block *self,
-       unsigned long val, void *data);
-extern void arch_uprobe_abort_xol(struct arch_uprobe *aup,
-       struct pt_regs *regs);
-extern unsigned long arch_uretprobe_hijack_return_addr(
-       unsigned long trampoline_vaddr, struct pt_regs *regs);
-
 #endif /* __ASM_UPROBES_H */
index d2d0615..9514e5f 100644 (file)
@@ -569,9 +569,16 @@ static void arch_dump_stack(void *info)
        dump_stack();
 }
 
-void arch_trigger_all_cpu_backtrace(bool include_self)
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
 {
-       smp_call_function(arch_dump_stack, NULL, 1);
+       long this_cpu = get_cpu();
+
+       if (cpumask_test_cpu(this_cpu, mask) && !exclude_self)
+               dump_stack();
+
+       smp_call_function_many(mask, arch_dump_stack, NULL, 1);
+
+       put_cpu();
 }
 
 int mips_get_process_fp_mode(struct task_struct *task)
index a82c178..d5de675 100644 (file)
@@ -55,6 +55,7 @@ SECTIONS
        .text : {
                TEXT_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                KPROBES_TEXT
                IRQENTRY_TEXT
index 13c4814..2d5f1c3 100644 (file)
@@ -30,6 +30,7 @@ SECTIONS
        HEAD_TEXT
        TEXT_TEXT
        SCHED_TEXT
+       CPUIDLE_TEXT
        LOCK_TEXT
        KPROBES_TEXT
        *(.fixup)
index e23e895..6a8045b 100644 (file)
@@ -37,6 +37,7 @@ SECTIONS
        .text : {
                TEXT_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                IRQENTRY_TEXT
                SOFTIRQENTRY_TEXT
index d936de4..d68b9ed 100644 (file)
@@ -47,6 +47,7 @@ SECTIONS
           _stext = .;
          TEXT_TEXT
          SCHED_TEXT
+         CPUIDLE_TEXT
          LOCK_TEXT
          KPROBES_TEXT
          IRQENTRY_TEXT
index 2a0339a..71c4a3a 100644 (file)
@@ -16,7 +16,6 @@ config PARISC
        select BUILDTIME_EXTABLE_SORT
        select HAVE_PERF_EVENTS
        select GENERIC_ATOMIC64 if !64BIT
-       select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
        select GENERIC_IRQ_PROBE
        select GENERIC_PCI_IOMAP
        select ARCH_HAVE_NMI_SAFE_CMPXCHG
index 5b8fae8..b37787d 100644 (file)
@@ -69,6 +69,7 @@ SECTIONS
        .text ALIGN(PAGE_SIZE) : {
                TEXT_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                KPROBES_TEXT
                IRQENTRY_TEXT
index 569b4f9..65fba4c 100644 (file)
@@ -108,7 +108,6 @@ config PPC
        select HAVE_DEBUG_KMEMLEAK
        select ARCH_HAS_SG_CHAIN
        select GENERIC_ATOMIC64 if PPC32
-       select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
        select HAVE_PERF_EVENTS
        select HAVE_PERF_REGS
        select HAVE_PERF_USER_STACK_DUMP
index 7b58917..4d52ccf 100644 (file)
@@ -41,6 +41,9 @@ u64 memory_hotplug_max(void);
 #else
 #define memory_hotplug_max() memblock_end_of_DRAM()
 #endif /* CONFIG_NEED_MULTIPLE_NODES */
+#ifdef CONFIG_FA_DUMP
+#define __HAVE_ARCH_RESERVED_KERNEL_PAGES
+#endif
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_MMZONE_H_ */
index 963918e..8f0c7c5 100644 (file)
@@ -333,6 +333,11 @@ int __init fadump_reserve_mem(void)
        return 1;
 }
 
+unsigned long __init arch_reserved_kernel_pages(void)
+{
+       return memblock_reserved_size() / PAGE_SIZE;
+}
+
 /* Look for fadump= cmdline option. */
 static int __init early_fadump_param(char *p)
 {
index 2d1cfaf..8295f51 100644 (file)
@@ -99,6 +99,7 @@ SECTIONS
                /* careful! __ftr_alt_* sections need to be close to .text */
                *(.text .fixup __ftr_alt_* .ref.text)
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                KPROBES_TEXT
                IRQENTRY_TEXT
index deeadfa..426481d 100644 (file)
@@ -67,10 +67,10 @@ config DEBUG_RODATA
 
 config S390
        def_bool y
-       select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
        select ARCH_HAS_DEVMEM_IS_ALLOWED
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_GCOV_PROFILE_ALL
+       select ARCH_HAS_GIGANTIC_PAGE
        select ARCH_HAS_KCOV
        select ARCH_HAS_SG_CHAIN
        select ARCH_HAS_UBSAN_SANITIZE_ALL
index 1411dff..658393c 100644 (file)
@@ -29,14 +29,4 @@ struct arch_uprobe {
 struct arch_uprobe_task {
 };
 
-int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm,
-                            unsigned long addr);
-int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs);
-int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);
-bool arch_uprobe_xol_was_trapped(struct task_struct *tsk);
-int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
-                                void *data);
-void arch_uprobe_abort_xol(struct arch_uprobe *ap, struct pt_regs *regs);
-unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline,
-                                               struct pt_regs *regs);
 #endif /* _ASM_UPROBES_H */
index 437e611..0f9cd90 100644 (file)
@@ -189,7 +189,7 @@ static int groups16_to_user(u16 __user *grouplist, struct group_info *group_info
        kgid_t kgid;
 
        for (i = 0; i < group_info->ngroups; i++) {
-               kgid = GROUP_AT(group_info, i);
+               kgid = group_info->gid[i];
                group = (u16)from_kgid_munged(user_ns, kgid);
                if (put_user(group, grouplist+i))
                        return -EFAULT;
@@ -213,7 +213,7 @@ static int groups16_from_user(struct group_info *group_info, u16 __user *groupli
                if (!gid_valid(kgid))
                        return -EINVAL;
 
-               GROUP_AT(group_info, i) = kgid;
+               group_info->gid[i] = kgid;
        }
 
        return 0;
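
The GROUP_AT() conversions above come from the "cred: simpler, 1D supplementary groups" patch in this series: group_info now carries a flat gid[] array rather than the old two-level blocks[][] layout behind the GROUP_AT() macro.  A hedged, self-contained sketch of the resulting access pattern (simplified stand-in types and made-up values; not kernel code):

/* Illustration only: models the flattened group_info layout introduced by
 * "cred: simpler, 1D supplementary groups".  kgid_t and the values are
 * simplified stand-ins, not the kernel definitions. */
#include <stdio.h>
#include <stdlib.h>

typedef unsigned int kgid_t;		/* stand-in for the kernel type */

struct group_info {
	int ngroups;
	kgid_t gid[];			/* flat array; replaces GROUP_AT() over 2-D blocks */
};

int main(void)
{
	int i, n = 3;
	struct group_info *gi = malloc(sizeof(*gi) + n * sizeof(kgid_t));

	if (!gi)
		return 1;
	gi->ngroups = n;
	for (i = 0; i < n; i++)
		gi->gid[i] = 1000 + i;	/* was: GROUP_AT(gi, i) = ... */
	for (i = 0; i < n; i++)
		printf("%u\n", gi->gid[i]);
	free(gi);
	return 0;
}
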
index 429bfd1..000e6e9 100644 (file)
@@ -35,6 +35,7 @@ SECTIONS
                HEAD_TEXT
                TEXT_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                KPROBES_TEXT
                IRQENTRY_TEXT
index 7274b5c..4117890 100644 (file)
@@ -40,6 +40,7 @@ SECTIONS
                _text = .;      /* Text and read-only data */
                TEXT_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                KPROBES_TEXT
                *(.text.*)
index 235a410..5b9a3cc 100644 (file)
@@ -36,6 +36,7 @@ SECTIONS
                TEXT_TEXT
                EXTRA_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                KPROBES_TEXT
                IRQENTRY_TEXT
index f5d60f1..b23c76b 100644 (file)
@@ -22,7 +22,6 @@ config SPARC
        select HAVE_ARCH_TRACEHOOK
        select HAVE_EXIT_THREAD
        select SYSCTL_EXCEPTION_TRACE
-       select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
        select RTC_CLASS
        select RTC_DRV_M48T59
        select RTC_SYSTOHC
index 3f70f90..1d51a11 100644 (file)
@@ -86,8 +86,9 @@ static inline unsigned long get_softint(void)
        return retval;
 }
 
-void arch_trigger_all_cpu_backtrace(bool);
-#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
+void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
+                                   bool exclude_self);
+#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
 
 extern void *hardirq_stack[NR_CPUS];
 extern void *softirq_stack[NR_CPUS];
index fa14402..47ff558 100644 (file)
@@ -239,7 +239,7 @@ static void __global_reg_poll(struct global_reg_snapshot *gp)
        }
 }
 
-void arch_trigger_all_cpu_backtrace(bool include_self)
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
 {
        struct thread_info *tp = current_thread_info();
        struct pt_regs *regs = get_irq_regs();
@@ -255,15 +255,15 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
 
        memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot));
 
-       if (include_self)
+       if (cpumask_test_cpu(this_cpu, mask) && !exclude_self)
                __global_reg_self(tp, regs, this_cpu);
 
        smp_fetch_global_regs();
 
-       for_each_online_cpu(cpu) {
+       for_each_cpu(cpu, mask) {
                struct global_reg_snapshot *gp;
 
-               if (!include_self && cpu == this_cpu)
+               if (exclude_self && cpu == this_cpu)
                        continue;
 
                gp = &global_cpu_snapshot[cpu].reg;
@@ -300,7 +300,7 @@ void arch_trigger_all_cpu_backtrace(bool include_self)
 
 static void sysrq_handle_globreg(int key)
 {
-       arch_trigger_all_cpu_backtrace(true);
+       trigger_all_cpu_backtrace();
 }
 
 static struct sysrq_key_op sparc_globalreg_op = {
index d79b3b7..572db68 100644 (file)
@@ -49,6 +49,7 @@ SECTIONS
                HEAD_TEXT
                TEXT_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                KPROBES_TEXT
                IRQENTRY_TEXT
index 78da75b..4583c03 100644 (file)
@@ -3,7 +3,6 @@
 
 config TILE
        def_bool y
-       select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
        select ARCH_HAS_DEVMEM_IS_ALLOWED
        select ARCH_HAVE_NMI_SAFE_CMPXCHG
        select ARCH_WANT_FRAME_POINTERS
index 84a9240..1fa1f25 100644 (file)
@@ -79,8 +79,9 @@ void tile_irq_activate(unsigned int irq, int tile_irq_type);
 void setup_irq_regs(void);
 
 #ifdef __tilegx__
-void arch_trigger_all_cpu_backtrace(bool self);
-#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
+void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
+                                   bool exclude_self);
+#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
 #endif
 
 #endif /* _ASM_TILE_IRQ_H */
index 670a356..101de13 100644 (file)
@@ -50,7 +50,7 @@ STD_ENTRY(smp_nap)
  * When interrupted at _cpu_idle_nap, we bump the PC forward 8, and
  * as a result return to the function that called _cpu_idle().
  */
-STD_ENTRY(_cpu_idle)
+STD_ENTRY_SECTION(_cpu_idle, .cpuidle.text)
        movei r1, 1
        IRQ_ENABLE_LOAD(r2, r3)
        mtspr INTERRUPT_CRITICAL_SECTION, r1
index db62cc3..81cf874 100644 (file)
@@ -16,7 +16,6 @@
 #include <linux/spinlock.h>
 #include <linux/module.h>
 #include <linux/atomic.h>
-#include <linux/interrupt.h>
 
 #include <asm/processor.h>
 #include <asm/pmc.h>
@@ -29,9 +28,7 @@ int handle_perf_interrupt(struct pt_regs *regs, int fault)
        if (!perf_irq)
                panic("Unexpected PERF_COUNT interrupt %d\n", fault);
 
-       nmi_enter();
        retval = perf_irq(regs, fault);
-       nmi_exit();
        return retval;
 }
 
index a465d83..9f37106 100644 (file)
@@ -22,7 +22,7 @@
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/compat.h>
-#include <linux/hardirq.h>
+#include <linux/nmi.h>
 #include <linux/syscalls.h>
 #include <linux/kernel.h>
 #include <linux/tracehook.h>
@@ -594,66 +594,18 @@ void show_regs(struct pt_regs *regs)
        tile_show_stack(&kbt);
 }
 
-/* To ensure stack dump on tiles occurs one by one. */
-static DEFINE_SPINLOCK(backtrace_lock);
-/* To ensure no backtrace occurs before all of the stack dump are done. */
-static atomic_t backtrace_cpus;
-/* The cpu mask to avoid reentrance. */
-static struct cpumask backtrace_mask;
-
-void do_nmi_dump_stack(struct pt_regs *regs)
-{
-       int is_idle = is_idle_task(current) && !in_interrupt();
-       int cpu;
-
-       nmi_enter();
-       cpu = smp_processor_id();
-       if (WARN_ON_ONCE(!cpumask_test_and_clear_cpu(cpu, &backtrace_mask)))
-               goto done;
-
-       spin_lock(&backtrace_lock);
-       if (is_idle)
-               pr_info("CPU: %d idle\n", cpu);
-       else
-               show_regs(regs);
-       spin_unlock(&backtrace_lock);
-       atomic_dec(&backtrace_cpus);
-done:
-       nmi_exit();
-}
-
 #ifdef __tilegx__
-void arch_trigger_all_cpu_backtrace(bool self)
+void nmi_raise_cpu_backtrace(struct cpumask *in_mask)
 {
        struct cpumask mask;
        HV_Coord tile;
        unsigned int timeout;
        int cpu;
-       int ongoing;
        HV_NMI_Info info[NR_CPUS];
 
-       ongoing = atomic_cmpxchg(&backtrace_cpus, 0, num_online_cpus() - 1);
-       if (ongoing != 0) {
-               pr_err("Trying to do all-cpu backtrace.\n");
-               pr_err("But another all-cpu backtrace is ongoing (%d cpus left)\n",
-                      ongoing);
-               if (self) {
-                       pr_err("Reporting the stack on this cpu only.\n");
-                       dump_stack();
-               }
-               return;
-       }
-
-       cpumask_copy(&mask, cpu_online_mask);
-       cpumask_clear_cpu(smp_processor_id(), &mask);
-       cpumask_copy(&backtrace_mask, &mask);
-
-       /* Backtrace for myself first. */
-       if (self)
-               dump_stack();
-
        /* Tentatively dump stack on remote tiles via NMI. */
        timeout = 100;
+       cpumask_copy(&mask, in_mask);
        while (!cpumask_empty(&mask) && timeout) {
                for_each_cpu(cpu, &mask) {
                        tile.x = cpu_x(cpu);
@@ -664,12 +616,17 @@ void arch_trigger_all_cpu_backtrace(bool self)
                }
 
                mdelay(10);
+               touch_softlockup_watchdog();
                timeout--;
        }
 
-       /* Warn about cpus stuck in ICS and decrement their counts here. */
+       /* Warn about cpus stuck in ICS. */
        if (!cpumask_empty(&mask)) {
                for_each_cpu(cpu, &mask) {
+
+                       /* Clear the bit as if nmi_cpu_backtrace() ran. */
+                       cpumask_clear_cpu(cpu, in_mask);
+
                        switch (info[cpu].result) {
                        case HV_NMI_RESULT_FAIL_ICS:
                                pr_warn("Skipping stack dump of cpu %d in ICS at pc %#llx\n",
@@ -680,16 +637,20 @@ void arch_trigger_all_cpu_backtrace(bool self)
                                        cpu);
                                break;
                        case HV_ENOSYS:
-                               pr_warn("Hypervisor too old to allow remote stack dumps.\n");
-                               goto skip_for_each;
+                               WARN_ONCE(1, "Hypervisor too old to allow remote stack dumps.\n");
+                               break;
                        default:  /* should not happen */
                                pr_warn("Skipping stack dump of cpu %d [%d,%#llx]\n",
                                        cpu, info[cpu].result, info[cpu].pc);
                                break;
                        }
                }
-skip_for_each:
-               atomic_sub(cpumask_weight(&mask), &backtrace_cpus);
        }
 }
+
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
+{
+       nmi_trigger_cpumask_backtrace(mask, exclude_self,
+                                     nmi_raise_cpu_backtrace);
+}
 #endif /* __tilegx_ */
index 4d9651c..39f427b 100644 (file)
@@ -20,6 +20,8 @@
 #include <linux/reboot.h>
 #include <linux/uaccess.h>
 #include <linux/ptrace.h>
+#include <linux/hardirq.h>
+#include <linux/nmi.h>
 #include <asm/stack.h>
 #include <asm/traps.h>
 #include <asm/setup.h>
@@ -392,14 +394,17 @@ void __kprobes do_trap(struct pt_regs *regs, int fault_num,
 
 void do_nmi(struct pt_regs *regs, int fault_num, unsigned long reason)
 {
+       nmi_enter();
        switch (reason) {
+#ifdef arch_trigger_cpumask_backtrace
        case TILE_NMI_DUMP_STACK:
-               do_nmi_dump_stack(regs);
+               nmi_cpu_backtrace(regs);
                break;
+#endif
        default:
                panic("Unexpected do_nmi type %ld", reason);
-               return;
        }
+       nmi_exit();
 }
 
 /* Deprecated function currently only used here. */
index 9d449ca..e1baf09 100644 (file)
@@ -42,6 +42,7 @@ SECTIONS
   .text : AT (ADDR(.text) - LOAD_OFFSET) {
     HEAD_TEXT
     SCHED_TEXT
+    CPUIDLE_TEXT
     LOCK_TEXT
     KPROBES_TEXT
     IRQENTRY_TEXT
index adde088..4fdbcf9 100644 (file)
@@ -68,6 +68,7 @@ SECTIONS
     _stext = .;
     TEXT_TEXT
     SCHED_TEXT
+    CPUIDLE_TEXT
     LOCK_TEXT
     *(.fixup)
     *(.stub .text.* .gnu.linkonce.t.*)
index 6899195..1840f55 100644 (file)
@@ -28,6 +28,7 @@ SECTIONS
     _stext = .;
     TEXT_TEXT
     SCHED_TEXT
+    CPUIDLE_TEXT
     LOCK_TEXT
     *(.fixup)
     /* .gnu.warning sections are handled specially by elf32.em.  */
index 77e407e..56e788e 100644 (file)
@@ -37,6 +37,7 @@ SECTIONS
        .text : {               /* Real text segment */
                TEXT_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
 
                *(.fixup)
index 58bec8f..bada636 100644 (file)
@@ -23,11 +23,11 @@ config X86
        select ARCH_CLOCKSOURCE_DATA
        select ARCH_DISCARD_MEMBLOCK
        select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
-       select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
        select ARCH_HAS_DEVMEM_IS_ALLOWED
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_FAST_MULTIPLIER
        select ARCH_HAS_GCOV_PROFILE_ALL
+       select ARCH_HAS_GIGANTIC_PAGE           if X86_64
        select ARCH_HAS_KCOV                    if X86_64
        select ARCH_HAS_PMEM_API                if X86_64
        select ARCH_HAS_MMIO_FLUSH
index e7de5c9..16d3fa2 100644 (file)
@@ -50,8 +50,9 @@ extern int vector_used_by_percpu_irq(unsigned int vector);
 extern void init_ISA_irqs(void);
 
 #ifdef CONFIG_X86_LOCAL_APIC
-void arch_trigger_all_cpu_backtrace(bool);
-#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
+void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
+                                   bool exclude_self);
+#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
 #endif
 
 #endif /* _ASM_X86_IRQ_H */
index b77f5ed..ac7692d 100644 (file)
@@ -4,6 +4,10 @@
 #include <asm/processor-flags.h>
 
 #ifndef __ASSEMBLY__
+
+/* Provide __cpuidle; we can't safely include <linux/cpu.h> */
+#define __cpuidle __attribute__((__section__(".cpuidle.text")))
+
 /*
  * Interrupt control:
  */
@@ -44,12 +48,12 @@ static inline void native_irq_enable(void)
        asm volatile("sti": : :"memory");
 }
 
-static inline void native_safe_halt(void)
+static inline __cpuidle void native_safe_halt(void)
 {
        asm volatile("sti; hlt": : :"memory");
 }
 
-static inline void native_halt(void)
+static inline __cpuidle void native_halt(void)
 {
        asm volatile("hlt": : :"memory");
 }
@@ -86,7 +90,7 @@ static inline notrace void arch_local_irq_enable(void)
  * Used in the idle loop; sti takes one instruction cycle
  * to complete:
  */
-static inline void arch_safe_halt(void)
+static inline __cpuidle void arch_safe_halt(void)
 {
        native_safe_halt();
 }
@@ -95,7 +99,7 @@ static inline void arch_safe_halt(void)
  * Used when interrupts are already enabled or to
  * shutdown the processor:
  */
-static inline void halt(void)
+static inline __cpuidle void halt(void)
 {
        native_halt();
 }
index f1218f5..8b4de22 100644 (file)
@@ -439,8 +439,6 @@ extern pgprot_t pgprot_writethrough(pgprot_t prot);
 struct file;
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                               unsigned long size, pgprot_t vma_prot);
-int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
-                              unsigned long size, pgprot_t *vma_prot);
 
 /* Install a pte for a particular vaddr in kernel space. */
 void set_pte_vaddr(unsigned long vaddr, pte_t pte);
index bdfad64..af15f44 100644 (file)
@@ -152,7 +152,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
 }
 EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
 
-void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
+void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
 {
        unsigned int cpu = smp_processor_id();
        struct cstate_entry *percpu_entry;
index f29501e..c73c9fb 100644 (file)
@@ -26,32 +26,32 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
 }
 #endif
 
-#ifdef arch_trigger_all_cpu_backtrace
+#ifdef arch_trigger_cpumask_backtrace
 static void nmi_raise_cpu_backtrace(cpumask_t *mask)
 {
        apic->send_IPI_mask(mask, NMI_VECTOR);
 }
 
-void arch_trigger_all_cpu_backtrace(bool include_self)
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
 {
-       nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace);
+       nmi_trigger_cpumask_backtrace(mask, exclude_self,
+                                     nmi_raise_cpu_backtrace);
 }
 
-static int
-arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
+static int nmi_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
 {
        if (nmi_cpu_backtrace(regs))
                return NMI_HANDLED;
 
        return NMI_DONE;
 }
-NOKPROBE_SYMBOL(arch_trigger_all_cpu_backtrace_handler);
+NOKPROBE_SYMBOL(nmi_cpu_backtrace_handler);
 
-static int __init register_trigger_all_cpu_backtrace(void)
+static int __init register_nmi_cpu_backtrace_handler(void)
 {
-       register_nmi_handler(NMI_LOCAL, arch_trigger_all_cpu_backtrace_handler,
+       register_nmi_handler(NMI_LOCAL, nmi_cpu_backtrace_handler,
                                0, "arch_bt");
        return 0;
 }
-early_initcall(register_trigger_all_cpu_backtrace);
+early_initcall(register_nmi_cpu_backtrace_handler);
 #endif
index 4002b47..28cea78 100644 (file)
@@ -302,7 +302,7 @@ void arch_cpu_idle(void)
 /*
  * We use this if we don't have any better idle routine..
  */
-void default_idle(void)
+void __cpuidle default_idle(void)
 {
        trace_cpu_idle_rcuidle(1, smp_processor_id());
        safe_halt();
@@ -417,7 +417,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
  * with interrupts enabled and no flags, which is backwards compatible with the
  * original MWAIT implementation.
  */
-static void mwait_idle(void)
+static __cpuidle void mwait_idle(void)
 {
        if (!current_set_polling_and_test()) {
                trace_cpu_idle_rcuidle(1, smp_processor_id());
index 9297a00..dbf67f6 100644 (file)
@@ -97,6 +97,7 @@ SECTIONS
                _stext = .;
                TEXT_TEXT
                SCHED_TEXT
+               CPUIDLE_TEXT
                LOCK_TEXT
                KPROBES_TEXT
                ENTRY_TEXT
index 72cfe35..31411fc 100644 (file)
@@ -89,6 +89,9 @@ SECTIONS
     VMLINUX_SYMBOL(__sched_text_start) = .;
     *(.sched.literal .sched.text)
     VMLINUX_SYMBOL(__sched_text_end) = .;
+    VMLINUX_SYMBOL(__cpuidle_text_start) = .;
+    *(.cpuidle.literal .cpuidle.text)
+    VMLINUX_SYMBOL(__cpuidle_text_end) = .;
     VMLINUX_SYMBOL(__lock_text_start) = .;
     *(.spinlock.literal .spinlock.text)
     VMLINUX_SYMBOL(__lock_text_end) = .;
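
The linker-script hunks in this merge introduce a dedicated .cpuidle.text section (CPUIDLE_TEXT, bracketed by __cpuidle_text_start/__cpuidle_text_end), and idle entry points such as default_idle(), acpi_safe_halt() and intel_idle() are tagged __cpuidle so that the nmi_backtrace changes can recognise idle CPUs and print one-line reports for them.  A minimal sketch of the annotation pattern, compilable as a plain userspace program; my_idle() is a hypothetical stand-in, and the attribute definition mirrors the irqflags.h hunk above:

/* Illustration only: place a function in the .cpuidle.text section, mirroring
 * the __cpuidle annotation added in this merge.  my_idle() is hypothetical,
 * not a kernel function. */
#include <stdio.h>

#define __cpuidle __attribute__((__section__(".cpuidle.text")))

static void __cpuidle my_idle(void)
{
	/* the kernel's default_idle() executes a halt instruction here */
	puts("idling");
}

int main(void)
{
	my_idle();
	return 0;
}
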
index cea5252..2237d3f 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/sched.h>       /* need_resched() */
 #include <linux/tick.h>
 #include <linux/cpuidle.h>
+#include <linux/cpu.h>
 #include <acpi/processor.h>
 
 /*
@@ -115,7 +116,7 @@ static const struct dmi_system_id processor_power_dmi_table[] = {
  * Callers should disable interrupts before the call and enable
  * interrupts after return.
  */
-static void acpi_safe_halt(void)
+static void __cpuidle acpi_safe_halt(void)
 {
        if (!tif_need_resched()) {
                safe_halt();
@@ -645,7 +646,7 @@ static int acpi_idle_bm_check(void)
  *
  * Caller disables interrupt before call and enables interrupt after return.
  */
-static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
+static void __cpuidle acpi_idle_do_entry(struct acpi_processor_cx *cx)
 {
        if (cx->entry_method == ACPI_CSTATE_FFH) {
                /* Call into architectural FFH based C-state */
index dc75de9..62c63c0 100644 (file)
@@ -361,8 +361,11 @@ store_mem_state(struct device *dev,
 err:
        unlock_device_hotplug();
 
-       if (ret)
+       if (ret < 0)
                return ret;
+       if (ret)
+               return -EINVAL;
+
        return count;
 }
 
index 389ade4..ab264d3 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/cpuidle.h>
 #include <linux/cpumask.h>
 #include <linux/tick.h>
+#include <linux/cpu.h>
 
 #include "cpuidle.h"
 
@@ -178,8 +179,8 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv)
 }
 
 #ifdef CONFIG_ARCH_HAS_CPU_RELAX
-static int poll_idle(struct cpuidle_device *dev,
-               struct cpuidle_driver *drv, int index)
+static int __cpuidle poll_idle(struct cpuidle_device *dev,
+                              struct cpuidle_driver *drv, int index)
 {
        local_irq_enable();
        if (!current_set_polling_and_test()) {
index 67ec58f..4466a2f 100644 (file)
@@ -863,8 +863,8 @@ static struct cpuidle_state dnv_cstates[] = {
  *
  * Must be called under local_irq_disable().
  */
-static int intel_idle(struct cpuidle_device *dev,
-               struct cpuidle_driver *drv, int index)
+static __cpuidle int intel_idle(struct cpuidle_device *dev,
+                               struct cpuidle_driver *drv, int index)
 {
        unsigned long ecx = 1; /* break on interrupt flag */
        struct cpuidle_state *state = &drv->states[index];
index a0bccb5..d687e6d 100644 (file)
@@ -2077,6 +2077,8 @@ void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align))
                        name = of_get_property(of_aliases, "stdout", NULL);
                if (name)
                        of_stdout = of_find_node_opts_by_path(name, &of_stdout_options);
+               if (of_stdout)
+                       console_set_by_of();
        }
 
        if (!of_aliases)
index 5d3995d..a7416cd 100644 (file)
@@ -2220,7 +2220,7 @@ int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset)
        task_lock(current);
        if (pud->pud_ngroups > current_ngroups)
                pud->pud_ngroups = current_ngroups;
-       memcpy(pud->pud_groups, current_cred()->group_info->blocks[0],
+       memcpy(pud->pud_groups, current_cred()->group_info->gid,
               pud->pud_ngroups * sizeof(__u32));
        task_unlock(current);
 
index 3ef62ba..4bd03a2 100644 (file)
@@ -200,6 +200,9 @@ config HUGETLBFS
 config HUGETLB_PAGE
        def_bool HUGETLBFS
 
+config ARCH_HAS_GIGANTIC_PAGE
+       bool
+
 source "fs/configfs/Kconfig"
 source "fs/efivarfs/Kconfig"
 
index cc025f8..014defd 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1036,7 +1036,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
        if (!write && !buffer_mapped(&bh)) {
                spinlock_t *ptl;
                pmd_t entry;
-               struct page *zero_page = get_huge_zero_page();
+               struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);
 
                if (unlikely(!zero_page)) {
                        dax_pmd_dbg(&bh, address, "no zero page");
index 423cc01..0ca363d 100644 (file)
@@ -234,6 +234,7 @@ const struct file_operations ext2_file_operations = {
        .open           = dquot_file_open,
        .release        = ext2_release_file,
        .fsync          = ext2_fsync,
+       .get_unmapped_area = thp_get_unmapped_area,
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
 };
index 25f763f..36d49cf 100644 (file)
@@ -697,6 +697,7 @@ const struct file_operations ext4_file_operations = {
        .open           = ext4_file_open,
        .release        = ext4_release_file,
        .fsync          = ext4_sync_file,
+       .get_unmapped_area = thp_get_unmapped_area,
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
        .fallocate      = ext4_fallocate,
index 4ea71eb..7337cac 100644 (file)
@@ -416,7 +416,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
                for (i = 0; i < pagevec_count(&pvec); ++i) {
                        struct page *page = pvec.pages[i];
-                       bool rsv_on_error;
                        u32 hash;
 
                        /*
@@ -458,18 +457,17 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
                         * cache (remove_huge_page) BEFORE removing the
                         * region/reserve map (hugetlb_unreserve_pages).  In
                         * rare out of memory conditions, removal of the
-                        * region/reserve map could fail.  Before free'ing
-                        * the page, note PagePrivate which is used in case
-                        * of error.
+                        * region/reserve map could fail. Correspondingly,
+                        * the subpool and global reserve usage count can need
+                        * to be adjusted.
                         */
-                       rsv_on_error = !PagePrivate(page);
+                       VM_BUG_ON(PagePrivate(page));
                        remove_huge_page(page);
                        freed++;
                        if (!truncate_op) {
                                if (unlikely(hugetlb_unreserve_pages(inode,
                                                        next, next + 1, 1)))
-                                       hugetlb_fix_reserve_counts(inode,
-                                                               rsv_on_error);
+                                       hugetlb_fix_reserve_counts(inode);
                        }
 
                        unlock_page(page);
index d7b062b..4b308a1 100644 (file)
@@ -679,11 +679,11 @@ unsigned int nfs_page_length(struct page *page)
        loff_t i_size = i_size_read(page_file_mapping(page)->host);
 
        if (i_size > 0) {
-               pgoff_t page_index = page_file_index(page);
+               pgoff_t index = page_index(page);
                pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT;
-               if (page_index < end_index)
+               if (index < end_index)
                        return PAGE_SIZE;
-               if (page_index == end_index)
+               if (index == end_index)
                        return ((i_size - 1) & ~PAGE_MASK) + 1;
        }
        return 0;
index 174dd4c..965db47 100644 (file)
@@ -342,7 +342,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
         * update_nfs_request below if the region is not locked. */
        req->wb_page    = page;
        if (page) {
-               req->wb_index = page_file_index(page);
+               req->wb_index = page_index(page);
                get_page(page);
        }
        req->wb_offset  = offset;
index 572e5b3..defc923 100644 (file)
@@ -295,7 +295,7 @@ int nfs_readpage(struct file *file, struct page *page)
        int             error;
 
        dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
-               page, PAGE_SIZE, page_file_index(page));
+               page, PAGE_SIZE, page_index(page));
        nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
        nfs_add_stats(inode, NFSIOS_READPAGES, 1);
 
index 3a6724c..5321183 100644 (file)
@@ -151,7 +151,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c
        spin_lock(&inode->i_lock);
        i_size = i_size_read(inode);
        end_index = (i_size - 1) >> PAGE_SHIFT;
-       if (i_size > 0 && page_file_index(page) < end_index)
+       if (i_size > 0 && page_index(page) < end_index)
                goto out;
        end = page_file_offset(page) + ((loff_t)offset+count);
        if (i_size >= end)
@@ -603,7 +603,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
 {
        int ret;
 
-       nfs_pageio_cond_complete(pgio, page_file_index(page));
+       nfs_pageio_cond_complete(pgio, page_index(page));
        ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE,
                                   launder);
        if (ret == -EAGAIN) {
index 9d46a0b..62469c6 100644 (file)
@@ -55,10 +55,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
                        goto oom;
 
                for (i = 0; i < rqgi->ngroups; i++) {
-                       if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i)))
-                               GROUP_AT(gi, i) = exp->ex_anon_gid;
+                       if (gid_eq(GLOBAL_ROOT_GID, rqgi->gid[i]))
+                               gi->gid[i] = exp->ex_anon_gid;
                        else
-                               GROUP_AT(gi, i) = GROUP_AT(rqgi, i);
+                               gi->gid[i] = rqgi->gid[i];
                }
        } else {
                gi = get_group_info(rqgi);
index a204d7e..39bfaba 100644 (file)
@@ -1903,7 +1903,7 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2)
        if (g1->ngroups != g2->ngroups)
                return false;
        for (i=0; i<g1->ngroups; i++)
-               if (!gid_eq(GROUP_AT(g1, i), GROUP_AT(g2, i)))
+               if (!gid_eq(g1->gid[i], g2->gid[i]))
                        return false;
        return true;
 }
index a643138..7ebfca6 100644 (file)
@@ -49,12 +49,12 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
  * enough to fit in "count". Return an error pointer if the count
  * is not large enough.
  *
- * Called with the group->notification_mutex held.
+ * Called with the group->notification_lock held.
  */
 static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
                                            size_t count)
 {
-       BUG_ON(!mutex_is_locked(&group->notification_mutex));
+       assert_spin_locked(&group->notification_lock);
 
        pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
 
@@ -64,7 +64,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
        if (FAN_EVENT_METADATA_LEN > count)
                return ERR_PTR(-EINVAL);
 
-       /* held the notification_mutex the whole time, so this is the
+       /* held the notification_lock the whole time, so this is the
         * same event we peeked above */
        return fsnotify_remove_first_event(group);
 }
@@ -147,7 +147,7 @@ static struct fanotify_perm_event_info *dequeue_event(
 {
        struct fanotify_perm_event_info *event, *return_e = NULL;
 
-       spin_lock(&group->fanotify_data.access_lock);
+       spin_lock(&group->notification_lock);
        list_for_each_entry(event, &group->fanotify_data.access_list,
                            fae.fse.list) {
                if (event->fd != fd)
@@ -157,7 +157,7 @@ static struct fanotify_perm_event_info *dequeue_event(
                return_e = event;
                break;
        }
-       spin_unlock(&group->fanotify_data.access_lock);
+       spin_unlock(&group->notification_lock);
 
        pr_debug("%s: found return_re=%p\n", __func__, return_e);
 
@@ -244,10 +244,10 @@ static unsigned int fanotify_poll(struct file *file, poll_table *wait)
        int ret = 0;
 
        poll_wait(file, &group->notification_waitq, wait);
-       mutex_lock(&group->notification_mutex);
+       spin_lock(&group->notification_lock);
        if (!fsnotify_notify_queue_is_empty(group))
                ret = POLLIN | POLLRDNORM;
-       mutex_unlock(&group->notification_mutex);
+       spin_unlock(&group->notification_lock);
 
        return ret;
 }
@@ -268,9 +268,9 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
 
        add_wait_queue(&group->notification_waitq, &wait);
        while (1) {
-               mutex_lock(&group->notification_mutex);
+               spin_lock(&group->notification_lock);
                kevent = get_one_event(group, count);
-               mutex_unlock(&group->notification_mutex);
+               spin_unlock(&group->notification_lock);
 
                if (IS_ERR(kevent)) {
                        ret = PTR_ERR(kevent);
@@ -309,10 +309,10 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
                                wake_up(&group->fanotify_data.access_waitq);
                                break;
                        }
-                       spin_lock(&group->fanotify_data.access_lock);
+                       spin_lock(&group->notification_lock);
                        list_add_tail(&kevent->list,
                                      &group->fanotify_data.access_list);
-                       spin_unlock(&group->fanotify_data.access_lock);
+                       spin_unlock(&group->notification_lock);
 #endif
                }
                buf += ret;
@@ -371,7 +371,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
         * Process all permission events on access_list and notification queue
         * and simulate reply from userspace.
         */
-       spin_lock(&group->fanotify_data.access_lock);
+       spin_lock(&group->notification_lock);
        list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
                                 fae.fse.list) {
                pr_debug("%s: found group=%p event=%p\n", __func__, group,
@@ -380,22 +380,22 @@ static int fanotify_release(struct inode *ignored, struct file *file)
                list_del_init(&event->fae.fse.list);
                event->response = FAN_ALLOW;
        }
-       spin_unlock(&group->fanotify_data.access_lock);
 
        /*
         * Destroy all non-permission events. For permission events just
         * dequeue them and set the response. They will be freed once the
         * response is consumed and fanotify_get_response() returns.
         */
-       mutex_lock(&group->notification_mutex);
        while (!fsnotify_notify_queue_is_empty(group)) {
                fsn_event = fsnotify_remove_first_event(group);
-               if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS))
+               if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) {
+                       spin_unlock(&group->notification_lock);
                        fsnotify_destroy_event(group, fsn_event);
-               else
+                       spin_lock(&group->notification_lock);
+               } else
                        FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
        }
-       mutex_unlock(&group->notification_mutex);
+       spin_unlock(&group->notification_lock);
 
        /* Response for all permission events it set, wakeup waiters */
        wake_up(&group->fanotify_data.access_waitq);
@@ -421,10 +421,10 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
 
        switch (cmd) {
        case FIONREAD:
-               mutex_lock(&group->notification_mutex);
+               spin_lock(&group->notification_lock);
                list_for_each_entry(fsn_event, &group->notification_list, list)
                        send_len += FAN_EVENT_METADATA_LEN;
-               mutex_unlock(&group->notification_mutex);
+               spin_unlock(&group->notification_lock);
                ret = put_user(send_len, (int __user *) p);
                break;
        }
@@ -765,7 +765,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
                event_f_flags |= O_LARGEFILE;
        group->fanotify_data.f_flags = event_f_flags;
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
-       spin_lock_init(&group->fanotify_data.access_lock);
        init_waitqueue_head(&group->fanotify_data.access_waitq);
        INIT_LIST_HEAD(&group->fanotify_data.access_list);
 #endif
index b47f7cf..fbe3cbe 100644 (file)
@@ -45,9 +45,9 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
  */
 void fsnotify_group_stop_queueing(struct fsnotify_group *group)
 {
-       mutex_lock(&group->notification_mutex);
+       spin_lock(&group->notification_lock);
        group->shutdown = true;
-       mutex_unlock(&group->notification_mutex);
+       spin_unlock(&group->notification_lock);
 }
 
 /*
@@ -125,7 +125,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
        atomic_set(&group->refcnt, 1);
        atomic_set(&group->num_marks, 0);
 
-       mutex_init(&group->notification_mutex);
+       spin_lock_init(&group->notification_lock);
        INIT_LIST_HEAD(&group->notification_list);
        init_waitqueue_head(&group->notification_waitq);
        group->max_events = UINT_MAX;
index b8d08d0..69d1ea3 100644 (file)
@@ -115,10 +115,10 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
        int ret = 0;
 
        poll_wait(file, &group->notification_waitq, wait);
-       mutex_lock(&group->notification_mutex);
+       spin_lock(&group->notification_lock);
        if (!fsnotify_notify_queue_is_empty(group))
                ret = POLLIN | POLLRDNORM;
-       mutex_unlock(&group->notification_mutex);
+       spin_unlock(&group->notification_lock);
 
        return ret;
 }
@@ -138,7 +138,7 @@ static int round_event_name_len(struct fsnotify_event *fsn_event)
  * enough to fit in "count". Return an error pointer if
  * not large enough.
  *
- * Called with the group->notification_mutex held.
+ * Called with the group->notification_lock held.
  */
 static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
                                            size_t count)
@@ -157,7 +157,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
        if (event_size > count)
                return ERR_PTR(-EINVAL);
 
-       /* held the notification_mutex the whole time, so this is the
+       /* held the notification_lock the whole time, so this is the
         * same event we peeked above */
        fsnotify_remove_first_event(group);
 
@@ -234,9 +234,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 
        add_wait_queue(&group->notification_waitq, &wait);
        while (1) {
-               mutex_lock(&group->notification_mutex);
+               spin_lock(&group->notification_lock);
                kevent = get_one_event(group, count);
-               mutex_unlock(&group->notification_mutex);
+               spin_unlock(&group->notification_lock);
 
                pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent);
 
@@ -300,13 +300,13 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
 
        switch (cmd) {
        case FIONREAD:
-               mutex_lock(&group->notification_mutex);
+               spin_lock(&group->notification_lock);
                list_for_each_entry(fsn_event, &group->notification_list,
                                    list) {
                        send_len += sizeof(struct inotify_event);
                        send_len += round_event_name_len(fsn_event);
                }
-               mutex_unlock(&group->notification_mutex);
+               spin_unlock(&group->notification_lock);
                ret = put_user(send_len, (int __user *) p);
                break;
        }
index e455e83..66f85c6 100644 (file)
@@ -63,7 +63,7 @@ EXPORT_SYMBOL_GPL(fsnotify_get_cookie);
 /* return true if the notify queue is empty, false otherwise */
 bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
 {
-       BUG_ON(!mutex_is_locked(&group->notification_mutex));
+       assert_spin_locked(&group->notification_lock);
        return list_empty(&group->notification_list) ? true : false;
 }
 
@@ -73,8 +73,17 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
        /* Overflow events are per-group and we don't want to free them */
        if (!event || event->mask == FS_Q_OVERFLOW)
                return;
-       /* If the event is still queued, we have a problem... */
-       WARN_ON(!list_empty(&event->list));
+       /*
+        * If the event is still queued, we have a problem... Do an unreliable
+        * lockless check first to avoid locking in the common case. The
+        * locking may be necessary for permission events which got removed
+        * from the list by a different CPU than the one freeing the event.
+        */
+       if (!list_empty(&event->list)) {
+               spin_lock(&group->notification_lock);
+               WARN_ON(!list_empty(&event->list));
+               spin_unlock(&group->notification_lock);
+       }
        group->ops->free_event(event);
 }
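
The hunk above turns an unconditional WARN_ON() into a lockless list_empty() fast path with a locked recheck, since a permission event can be unlinked by a different CPU than the one freeing it. A sketch of the same check-then-lock idiom with a hypothetical structure:

#include <linux/bug.h>
#include <linux/list.h>
#include <linux/spinlock.h>

struct myqueue {
	spinlock_t lock;
	struct list_head events;
};

/* Hypothetical sanity check: cheap lockless test, locked recheck on failure. */
static void assert_unqueued(struct myqueue *q, struct list_head *entry)
{
	if (list_empty(entry))		/* unreliable but covers the common case */
		return;

	spin_lock(&q->lock);		/* serialize against a concurrent list_del_init() */
	WARN_ON(!list_empty(entry));
	spin_unlock(&q->lock);
}
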
 
@@ -95,10 +104,10 @@ int fsnotify_add_event(struct fsnotify_group *group,
 
        pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-       mutex_lock(&group->notification_mutex);
+       spin_lock(&group->notification_lock);
 
        if (group->shutdown) {
-               mutex_unlock(&group->notification_mutex);
+               spin_unlock(&group->notification_lock);
                return 2;
        }
 
@@ -106,7 +115,7 @@ int fsnotify_add_event(struct fsnotify_group *group,
                ret = 2;
                /* Queue overflow event only if it isn't already queued */
                if (!list_empty(&group->overflow_event->list)) {
-                       mutex_unlock(&group->notification_mutex);
+                       spin_unlock(&group->notification_lock);
                        return ret;
                }
                event = group->overflow_event;
@@ -116,7 +125,7 @@ int fsnotify_add_event(struct fsnotify_group *group,
        if (!list_empty(list) && merge) {
                ret = merge(list, event);
                if (ret) {
-                       mutex_unlock(&group->notification_mutex);
+                       spin_unlock(&group->notification_lock);
                        return ret;
                }
        }
@@ -124,7 +133,7 @@ int fsnotify_add_event(struct fsnotify_group *group,
 queue:
        group->q_len++;
        list_add_tail(&event->list, list);
-       mutex_unlock(&group->notification_mutex);
+       spin_unlock(&group->notification_lock);
 
        wake_up(&group->notification_waitq);
        kill_fasync(&group->fsn_fa, SIGIO, POLL_IN);
@@ -139,7 +148,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
 {
        struct fsnotify_event *event;
 
-       BUG_ON(!mutex_is_locked(&group->notification_mutex));
+       assert_spin_locked(&group->notification_lock);
 
        pr_debug("%s: group=%p\n", __func__, group);
 
@@ -161,7 +170,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
  */
 struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
 {
-       BUG_ON(!mutex_is_locked(&group->notification_mutex));
+       assert_spin_locked(&group->notification_lock);
 
        return list_first_entry(&group->notification_list,
                                struct fsnotify_event, list);
@@ -175,12 +184,14 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
 {
        struct fsnotify_event *event;
 
-       mutex_lock(&group->notification_mutex);
+       spin_lock(&group->notification_lock);
        while (!fsnotify_notify_queue_is_empty(group)) {
                event = fsnotify_remove_first_event(group);
+               spin_unlock(&group->notification_lock);
                fsnotify_destroy_event(group, event);
+               spin_lock(&group->notification_lock);
        }
-       mutex_unlock(&group->notification_mutex);
+       spin_unlock(&group->notification_lock);
 }
 
 /*
index 1d67fcb..8abab16 100644 (file)
@@ -2104,7 +2104,7 @@ int o2net_start_listening(struct o2nm_node *node)
        BUG_ON(o2net_listen_sock != NULL);
 
        mlog(ML_KTHREAD, "starting o2net thread...\n");
-       o2net_wq = create_singlethread_workqueue("o2net");
+       o2net_wq = alloc_ordered_workqueue("o2net", WQ_MEM_RECLAIM);
        if (o2net_wq == NULL) {
                mlog(ML_ERROR, "unable to launch o2net thread\n");
                return -ENOMEM; /* ? */
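
The switch from create_singlethread_workqueue() to alloc_ordered_workqueue(..., WQ_MEM_RECLAIM) keeps strictly ordered execution while guaranteeing a rescuer thread, so queued work can still make forward progress during memory reclaim. A minimal sketch of the allocation pattern with a hypothetical work item, not the o2net code:

#include <linux/errno.h>
#include <linux/workqueue.h>

static void example_work_fn(struct work_struct *work)
{
	/* hypothetical work handler */
}

static DECLARE_WORK(example_work, example_work_fn);

static int example_setup(void)
{
	struct workqueue_struct *wq;

	/* Ordered (at most one item executes at a time) and able to make
	 * progress under memory pressure thanks to the rescuer thread. */
	wq = alloc_ordered_workqueue("example_wq", WQ_MEM_RECLAIM);
	if (!wq)
		return -ENOMEM;

	queue_work(wq, &example_work);
	destroy_workqueue(wq);	/* drains any queued work before freeing */
	return 0;
}
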
index 533bd52..733e4e7 100644 (file)
@@ -1904,7 +1904,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
        }
 
        snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name);
-       dlm->dlm_worker = create_singlethread_workqueue(wq_name);
+       dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0);
        if (!dlm->dlm_worker) {
                status = -ENOMEM;
                mlog_errno(status);
index ef474cd..354cdf9 100644 (file)
@@ -646,7 +646,7 @@ static int __init init_dlmfs_fs(void)
        }
        cleanup_inode = 1;
 
-       user_dlm_worker = create_singlethread_workqueue("user_dlm");
+       user_dlm_worker = alloc_workqueue("user_dlm", WQ_MEM_RECLAIM, 0);
        if (!user_dlm_worker) {
                status = -ENOMEM;
                goto bail;
index 50cc550..5af68fc 100644 (file)
@@ -123,8 +123,6 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 #define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
 #define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
 
-extern struct kmem_cache *ocfs2_inode_cache;
-
 extern const struct address_space_operations ocfs2_aops;
 extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops;
 
index 603b28d..f56fe39 100644 (file)
@@ -2329,7 +2329,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
        }
        cleancache_init_shared_fs(sb);
 
-       osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
+       osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM);
        if (!osb->ocfs2_wq) {
                status = -ENOMEM;
                mlog_errno(status);
index 88c7de1..89600fd 100644 (file)
@@ -186,51 +186,45 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
        task_unlock(p);
        rcu_read_unlock();
 
-       seq_printf(m,
-               "State:\t%s\n"
-               "Tgid:\t%d\n"
-               "Ngid:\t%d\n"
-               "Pid:\t%d\n"
-               "PPid:\t%d\n"
-               "TracerPid:\t%d\n"
-               "Uid:\t%d\t%d\t%d\t%d\n"
-               "Gid:\t%d\t%d\t%d\t%d\n"
-               "FDSize:\t%d\nGroups:\t",
-               get_task_state(p),
-               tgid, ngid, pid_nr_ns(pid, ns), ppid, tpid,
-               from_kuid_munged(user_ns, cred->uid),
-               from_kuid_munged(user_ns, cred->euid),
-               from_kuid_munged(user_ns, cred->suid),
-               from_kuid_munged(user_ns, cred->fsuid),
-               from_kgid_munged(user_ns, cred->gid),
-               from_kgid_munged(user_ns, cred->egid),
-               from_kgid_munged(user_ns, cred->sgid),
-               from_kgid_munged(user_ns, cred->fsgid),
-               max_fds);
-
+       seq_printf(m, "State:\t%s", get_task_state(p));
+
+       seq_put_decimal_ull(m, "\nTgid:\t", tgid);
+       seq_put_decimal_ull(m, "\nNgid:\t", ngid);
+       seq_put_decimal_ull(m, "\nPid:\t", pid_nr_ns(pid, ns));
+       seq_put_decimal_ull(m, "\nPPid:\t", ppid);
+       seq_put_decimal_ull(m, "\nTracerPid:\t", tpid);
+       seq_put_decimal_ull(m, "\nUid:\t", from_kuid_munged(user_ns, cred->uid));
+       seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->euid));
+       seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->suid));
+       seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->fsuid));
+       seq_put_decimal_ull(m, "\nGid:\t", from_kgid_munged(user_ns, cred->gid));
+       seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->egid));
+       seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->sgid));
+       seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->fsgid));
+       seq_put_decimal_ull(m, "\nFDSize:\t", max_fds);
+
+       seq_puts(m, "\nGroups:\t");
        group_info = cred->group_info;
        for (g = 0; g < group_info->ngroups; g++)
-               seq_printf(m, "%d ",
-                          from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
+               seq_put_decimal_ull(m, g ? " " : "",
+                               from_kgid_munged(user_ns, group_info->gid[g]));
        put_cred(cred);
+       /* Trailing space shouldn't have been added in the first place. */
+       seq_putc(m, ' ');
 
 #ifdef CONFIG_PID_NS
        seq_puts(m, "\nNStgid:");
        for (g = ns->level; g <= pid->level; g++)
-               seq_printf(m, "\t%d",
-                       task_tgid_nr_ns(p, pid->numbers[g].ns));
+               seq_put_decimal_ull(m, "\t", task_tgid_nr_ns(p, pid->numbers[g].ns));
        seq_puts(m, "\nNSpid:");
        for (g = ns->level; g <= pid->level; g++)
-               seq_printf(m, "\t%d",
-                       task_pid_nr_ns(p, pid->numbers[g].ns));
+               seq_put_decimal_ull(m, "\t", task_pid_nr_ns(p, pid->numbers[g].ns));
        seq_puts(m, "\nNSpgid:");
        for (g = ns->level; g <= pid->level; g++)
-               seq_printf(m, "\t%d",
-                       task_pgrp_nr_ns(p, pid->numbers[g].ns));
+               seq_put_decimal_ull(m, "\t", task_pgrp_nr_ns(p, pid->numbers[g].ns));
        seq_puts(m, "\nNSsid:");
        for (g = ns->level; g <= pid->level; g++)
-               seq_printf(m, "\t%d",
-                       task_session_nr_ns(p, pid->numbers[g].ns));
+               seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns));
 #endif
        seq_putc(m, '\n');
 }
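
Each "Key:\tvalue" line is now built from a literal prefix passed to seq_put_decimal_ull(), avoiding seq_printf() format parsing on this hot path. A small sketch of the idiom, using the const char * delimiter form that the seq_file.c hunk further down introduces and hypothetical field names:

#include <linux/seq_file.h>

/* Hypothetical: emit "Widgets:\t<a>\t<b>\n" with the string-prefix helpers. */
static void show_widgets(struct seq_file *m, unsigned long a, unsigned long b)
{
	seq_put_decimal_ull(m, "Widgets:\t", a);
	seq_put_decimal_ull(m, "\t", b);
	seq_putc(m, '\n');
}
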
@@ -299,11 +293,12 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
                unlock_task_sighand(p, &flags);
        }
 
-       seq_printf(m, "Threads:\t%d\n", num_threads);
-       seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim);
+       seq_put_decimal_ull(m, "Threads:\t", num_threads);
+       seq_put_decimal_ull(m, "\nSigQ:\t", qsize);
+       seq_put_decimal_ull(m, "/", qlim);
 
        /* render them all */
-       render_sigset_t(m, "SigPnd:\t", &pending);
+       render_sigset_t(m, "\nSigPnd:\t", &pending);
        render_sigset_t(m, "ShdPnd:\t", &shpending);
        render_sigset_t(m, "SigBlk:\t", &blocked);
        render_sigset_t(m, "SigIgn:\t", &ignored);
@@ -348,17 +343,17 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
 static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 {
 #ifdef CONFIG_SECCOMP
-       seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode);
+       seq_put_decimal_ull(m, "Seccomp:\t", p->seccomp.mode);
+       seq_putc(m, '\n');
 #endif
 }
 
 static inline void task_context_switch_counts(struct seq_file *m,
                                                struct task_struct *p)
 {
-       seq_printf(m,   "voluntary_ctxt_switches:\t%lu\n"
-                       "nonvoluntary_ctxt_switches:\t%lu\n",
-                       p->nvcsw,
-                       p->nivcsw);
+       seq_put_decimal_ull(m, "voluntary_ctxt_switches:\t", p->nvcsw);
+       seq_put_decimal_ull(m, "\nnonvoluntary_ctxt_switches:\t", p->nivcsw);
+       seq_putc(m, '\n');
 }
 
 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
@@ -490,41 +485,41 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
        start_time = nsec_to_clock_t(task->real_start_time);
 
        seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
-       seq_put_decimal_ll(m, ' ', ppid);
-       seq_put_decimal_ll(m, ' ', pgid);
-       seq_put_decimal_ll(m, ' ', sid);
-       seq_put_decimal_ll(m, ' ', tty_nr);
-       seq_put_decimal_ll(m, ' ', tty_pgrp);
-       seq_put_decimal_ull(m, ' ', task->flags);
-       seq_put_decimal_ull(m, ' ', min_flt);
-       seq_put_decimal_ull(m, ' ', cmin_flt);
-       seq_put_decimal_ull(m, ' ', maj_flt);
-       seq_put_decimal_ull(m, ' ', cmaj_flt);
-       seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime));
-       seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime));
-       seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime));
-       seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime));
-       seq_put_decimal_ll(m, ' ', priority);
-       seq_put_decimal_ll(m, ' ', nice);
-       seq_put_decimal_ll(m, ' ', num_threads);
-       seq_put_decimal_ull(m, ' ', 0);
-       seq_put_decimal_ull(m, ' ', start_time);
-       seq_put_decimal_ull(m, ' ', vsize);
-       seq_put_decimal_ull(m, ' ', mm ? get_mm_rss(mm) : 0);
-       seq_put_decimal_ull(m, ' ', rsslim);
-       seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0);
-       seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0);
-       seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0);
-       seq_put_decimal_ull(m, ' ', esp);
-       seq_put_decimal_ull(m, ' ', eip);
+       seq_put_decimal_ll(m, " ", ppid);
+       seq_put_decimal_ll(m, " ", pgid);
+       seq_put_decimal_ll(m, " ", sid);
+       seq_put_decimal_ll(m, " ", tty_nr);
+       seq_put_decimal_ll(m, " ", tty_pgrp);
+       seq_put_decimal_ull(m, " ", task->flags);
+       seq_put_decimal_ull(m, " ", min_flt);
+       seq_put_decimal_ull(m, " ", cmin_flt);
+       seq_put_decimal_ull(m, " ", maj_flt);
+       seq_put_decimal_ull(m, " ", cmaj_flt);
+       seq_put_decimal_ull(m, " ", cputime_to_clock_t(utime));
+       seq_put_decimal_ull(m, " ", cputime_to_clock_t(stime));
+       seq_put_decimal_ll(m, " ", cputime_to_clock_t(cutime));
+       seq_put_decimal_ll(m, " ", cputime_to_clock_t(cstime));
+       seq_put_decimal_ll(m, " ", priority);
+       seq_put_decimal_ll(m, " ", nice);
+       seq_put_decimal_ll(m, " ", num_threads);
+       seq_put_decimal_ull(m, " ", 0);
+       seq_put_decimal_ull(m, " ", start_time);
+       seq_put_decimal_ull(m, " ", vsize);
+       seq_put_decimal_ull(m, " ", mm ? get_mm_rss(mm) : 0);
+       seq_put_decimal_ull(m, " ", rsslim);
+       seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->start_code : 1) : 0);
+       seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->end_code : 1) : 0);
+       seq_put_decimal_ull(m, " ", (permitted && mm) ? mm->start_stack : 0);
+       seq_put_decimal_ull(m, " ", esp);
+       seq_put_decimal_ull(m, " ", eip);
        /* The signal information here is obsolete.
         * It must be decimal for Linux 2.0 compatibility.
         * Use /proc/#/status for real-time signals.
         */
-       seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL);
-       seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
-       seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
-       seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
+       seq_put_decimal_ull(m, " ", task->pending.signal.sig[0] & 0x7fffffffUL);
+       seq_put_decimal_ull(m, " ", task->blocked.sig[0] & 0x7fffffffUL);
+       seq_put_decimal_ull(m, " ", sigign.sig[0] & 0x7fffffffUL);
+       seq_put_decimal_ull(m, " ", sigcatch.sig[0] & 0x7fffffffUL);
 
        /*
         * We used to output the absolute kernel address, but that's an
@@ -538,31 +533,31 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
        else
                seq_puts(m, " 0");
 
-       seq_put_decimal_ull(m, ' ', 0);
-       seq_put_decimal_ull(m, ' ', 0);
-       seq_put_decimal_ll(m, ' ', task->exit_signal);
-       seq_put_decimal_ll(m, ' ', task_cpu(task));
-       seq_put_decimal_ull(m, ' ', task->rt_priority);
-       seq_put_decimal_ull(m, ' ', task->policy);
-       seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task));
-       seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime));
-       seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime));
+       seq_put_decimal_ull(m, " ", 0);
+       seq_put_decimal_ull(m, " ", 0);
+       seq_put_decimal_ll(m, " ", task->exit_signal);
+       seq_put_decimal_ll(m, " ", task_cpu(task));
+       seq_put_decimal_ull(m, " ", task->rt_priority);
+       seq_put_decimal_ull(m, " ", task->policy);
+       seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task));
+       seq_put_decimal_ull(m, " ", cputime_to_clock_t(gtime));
+       seq_put_decimal_ll(m, " ", cputime_to_clock_t(cgtime));
 
        if (mm && permitted) {
-               seq_put_decimal_ull(m, ' ', mm->start_data);
-               seq_put_decimal_ull(m, ' ', mm->end_data);
-               seq_put_decimal_ull(m, ' ', mm->start_brk);
-               seq_put_decimal_ull(m, ' ', mm->arg_start);
-               seq_put_decimal_ull(m, ' ', mm->arg_end);
-               seq_put_decimal_ull(m, ' ', mm->env_start);
-               seq_put_decimal_ull(m, ' ', mm->env_end);
+               seq_put_decimal_ull(m, " ", mm->start_data);
+               seq_put_decimal_ull(m, " ", mm->end_data);
+               seq_put_decimal_ull(m, " ", mm->start_brk);
+               seq_put_decimal_ull(m, " ", mm->arg_start);
+               seq_put_decimal_ull(m, " ", mm->arg_end);
+               seq_put_decimal_ull(m, " ", mm->env_start);
+               seq_put_decimal_ull(m, " ", mm->env_end);
        } else
-               seq_printf(m, " 0 0 0 0 0 0 0");
+               seq_puts(m, " 0 0 0 0 0 0 0");
 
        if (permitted)
-               seq_put_decimal_ll(m, ' ', task->exit_code);
+               seq_put_decimal_ll(m, " ", task->exit_code);
        else
-               seq_put_decimal_ll(m, ' ', 0);
+               seq_puts(m, " 0");
 
        seq_putc(m, '\n');
        if (mm)
@@ -598,13 +593,13 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
         * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
         *               size, resident, shared, text, data);
         */
-       seq_put_decimal_ull(m, 0, size);
-       seq_put_decimal_ull(m, ' ', resident);
-       seq_put_decimal_ull(m, ' ', shared);
-       seq_put_decimal_ull(m, ' ', text);
-       seq_put_decimal_ull(m, ' ', 0);
-       seq_put_decimal_ull(m, ' ', data);
-       seq_put_decimal_ull(m, ' ', 0);
+       seq_put_decimal_ull(m, "", size);
+       seq_put_decimal_ull(m, " ", resident);
+       seq_put_decimal_ull(m, " ", shared);
+       seq_put_decimal_ull(m, " ", text);
+       seq_put_decimal_ull(m, " ", 0);
+       seq_put_decimal_ull(m, " ", data);
+       seq_put_decimal_ull(m, " ", 0);
        seq_putc(m, '\n');
 
        return 0;
index 3b792ab..dc7fe5f 100644 (file)
@@ -2280,16 +2280,27 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
        if (!p)
                return -ESRCH;
 
-       if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
-               task_lock(p);
-               if (slack_ns == 0)
-                       p->timer_slack_ns = p->default_timer_slack_ns;
-               else
-                       p->timer_slack_ns = slack_ns;
-               task_unlock(p);
-       } else
-               count = -EPERM;
+       if (p != current) {
+               if (!capable(CAP_SYS_NICE)) {
+                       count = -EPERM;
+                       goto out;
+               }
+
+               err = security_task_setscheduler(p);
+               if (err) {
+                       count = err;
+                       goto out;
+               }
+       }
+
+       task_lock(p);
+       if (slack_ns == 0)
+               p->timer_slack_ns = p->default_timer_slack_ns;
+       else
+               p->timer_slack_ns = slack_ns;
+       task_unlock(p);
 
+out:
        put_task_struct(p);
 
        return count;
@@ -2299,19 +2310,28 @@ static int timerslack_ns_show(struct seq_file *m, void *v)
 {
        struct inode *inode = m->private;
        struct task_struct *p;
-       int err =  0;
+       int err = 0;
 
        p = get_proc_task(inode);
        if (!p)
                return -ESRCH;
 
-       if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
-               task_lock(p);
-               seq_printf(m, "%llu\n", p->timer_slack_ns);
-               task_unlock(p);
-       } else
-               err = -EPERM;
+       if (p != current) {
+
+               if (!capable(CAP_SYS_NICE)) {
+                       err = -EPERM;
+                       goto out;
+               }
+               err = security_task_getscheduler(p);
+               if (err)
+                       goto out;
+       }
 
+       task_lock(p);
+       seq_printf(m, "%llu\n", p->timer_slack_ns);
+       task_unlock(p);
+
+out:
        put_task_struct(p);
 
        return err;
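
Both the show and write paths now apply the same rule: a task may always access its own timer slack, while touching another task requires CAP_SYS_NICE plus the matching scheduler LSM hook. A hedged sketch of the write-side check, factored into a hypothetical helper:

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/security.h>

/* Hypothetical helper mirroring the checks above for the write case. */
static int may_adjust_timerslack(struct task_struct *p)
{
	if (p == current)
		return 0;
	if (!capable(CAP_SYS_NICE))
		return -EPERM;
	return security_task_setscheduler(p);
}
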
index b9a8c81..8a42849 100644 (file)
@@ -23,6 +23,25 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
 {
 }
 
+static void show_val_kb(struct seq_file *m, const char *s, unsigned long num)
+{
+       char v[32];
+       static const char blanks[7] = {' ', ' ', ' ', ' ',' ', ' ', ' '};
+       int len;
+
+       len = num_to_str(v, sizeof(v), num << (PAGE_SHIFT - 10));
+
+       seq_write(m, s, 16);
+
+       if (len > 0) {
+               if (len < 8)
+                       seq_write(m, blanks, 8 - len);
+
+               seq_write(m, v, len);
+       }
+       seq_write(m, " kB\n", 4);
+}
+
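
show_val_kb() writes a fixed 16-byte label, the value in kilobytes right-aligned in an 8-column field, then " kB\n", preserving the old seq_printf() alignment without a format string. A userspace approximation of the resulting layout, assuming 4 KiB pages (the real helper uses seq_write() and num_to_str()):

#include <stdio.h>

/* Illustration only: 16-character label, value right-aligned in 8 columns,
 * then " kB". Assumes PAGE_SHIFT == 12, so pages are converted with << 2. */
static void show_val_kb_demo(const char *label16, unsigned long pages)
{
	printf("%s%8lu kB\n", label16, pages << (12 - 10));
}

int main(void)
{
	show_val_kb_demo("MemTotal:       ", 4096);	/* 4096 pages -> 16384 kB */
	return 0;
}
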
 static int meminfo_proc_show(struct seq_file *m, void *v)
 {
        struct sysinfo i;
@@ -32,10 +51,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
        unsigned long pages[NR_LRU_LISTS];
        int lru;
 
-/*
- * display in kilobytes.
- */
-#define K(x) ((x) << (PAGE_SHIFT - 10))
        si_meminfo(&i);
        si_swapinfo(&i);
        committed = percpu_counter_read_positive(&vm_committed_as);
@@ -50,136 +65,100 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 
        available = si_mem_available();
 
-       /*
-        * Tagged format, for easy grepping and expansion.
-        */
-       seq_printf(m,
-               "MemTotal:       %8lu kB\n"
-               "MemFree:        %8lu kB\n"
-               "MemAvailable:   %8lu kB\n"
-               "Buffers:        %8lu kB\n"
-               "Cached:         %8lu kB\n"
-               "SwapCached:     %8lu kB\n"
-               "Active:         %8lu kB\n"
-               "Inactive:       %8lu kB\n"
-               "Active(anon):   %8lu kB\n"
-               "Inactive(anon): %8lu kB\n"
-               "Active(file):   %8lu kB\n"
-               "Inactive(file): %8lu kB\n"
-               "Unevictable:    %8lu kB\n"
-               "Mlocked:        %8lu kB\n"
-#ifdef CONFIG_HIGHMEM
-               "HighTotal:      %8lu kB\n"
-               "HighFree:       %8lu kB\n"
-               "LowTotal:       %8lu kB\n"
-               "LowFree:        %8lu kB\n"
-#endif
-#ifndef CONFIG_MMU
-               "MmapCopy:       %8lu kB\n"
-#endif
-               "SwapTotal:      %8lu kB\n"
-               "SwapFree:       %8lu kB\n"
-               "Dirty:          %8lu kB\n"
-               "Writeback:      %8lu kB\n"
-               "AnonPages:      %8lu kB\n"
-               "Mapped:         %8lu kB\n"
-               "Shmem:          %8lu kB\n"
-               "Slab:           %8lu kB\n"
-               "SReclaimable:   %8lu kB\n"
-               "SUnreclaim:     %8lu kB\n"
-               "KernelStack:    %8lu kB\n"
-               "PageTables:     %8lu kB\n"
-#ifdef CONFIG_QUICKLIST
-               "Quicklists:     %8lu kB\n"
-#endif
-               "NFS_Unstable:   %8lu kB\n"
-               "Bounce:         %8lu kB\n"
-               "WritebackTmp:   %8lu kB\n"
-               "CommitLimit:    %8lu kB\n"
-               "Committed_AS:   %8lu kB\n"
-               "VmallocTotal:   %8lu kB\n"
-               "VmallocUsed:    %8lu kB\n"
-               "VmallocChunk:   %8lu kB\n"
-#ifdef CONFIG_MEMORY_FAILURE
-               "HardwareCorrupted: %5lu kB\n"
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-               "AnonHugePages:  %8lu kB\n"
-               "ShmemHugePages: %8lu kB\n"
-               "ShmemPmdMapped: %8lu kB\n"
-#endif
-#ifdef CONFIG_CMA
-               "CmaTotal:       %8lu kB\n"
-               "CmaFree:        %8lu kB\n"
-#endif
-               ,
-               K(i.totalram),
-               K(i.freeram),
-               K(available),
-               K(i.bufferram),
-               K(cached),
-               K(total_swapcache_pages()),
-               K(pages[LRU_ACTIVE_ANON]   + pages[LRU_ACTIVE_FILE]),
-               K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
-               K(pages[LRU_ACTIVE_ANON]),
-               K(pages[LRU_INACTIVE_ANON]),
-               K(pages[LRU_ACTIVE_FILE]),
-               K(pages[LRU_INACTIVE_FILE]),
-               K(pages[LRU_UNEVICTABLE]),
-               K(global_page_state(NR_MLOCK)),
+       show_val_kb(m, "MemTotal:       ", i.totalram);
+       show_val_kb(m, "MemFree:        ", i.freeram);
+       show_val_kb(m, "MemAvailable:   ", available);
+       show_val_kb(m, "Buffers:        ", i.bufferram);
+       show_val_kb(m, "Cached:         ", cached);
+       show_val_kb(m, "SwapCached:     ", total_swapcache_pages());
+       show_val_kb(m, "Active:         ", pages[LRU_ACTIVE_ANON] +
+                                          pages[LRU_ACTIVE_FILE]);
+       show_val_kb(m, "Inactive:       ", pages[LRU_INACTIVE_ANON] +
+                                          pages[LRU_INACTIVE_FILE]);
+       show_val_kb(m, "Active(anon):   ", pages[LRU_ACTIVE_ANON]);
+       show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]);
+       show_val_kb(m, "Active(file):   ", pages[LRU_ACTIVE_FILE]);
+       show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
+       show_val_kb(m, "Unevictable:    ", pages[LRU_UNEVICTABLE]);
+       show_val_kb(m, "Mlocked:        ", global_page_state(NR_MLOCK));
+
 #ifdef CONFIG_HIGHMEM
-               K(i.totalhigh),
-               K(i.freehigh),
-               K(i.totalram-i.totalhigh),
-               K(i.freeram-i.freehigh),
+       show_val_kb(m, "HighTotal:      ", i.totalhigh);
+       show_val_kb(m, "HighFree:       ", i.freehigh);
+       show_val_kb(m, "LowTotal:       ", i.totalram - i.totalhigh);
+       show_val_kb(m, "LowFree:        ", i.freeram - i.freehigh);
 #endif
+
 #ifndef CONFIG_MMU
-               K((unsigned long) atomic_long_read(&mmap_pages_allocated)),
+       show_val_kb(m, "MmapCopy:       ",
+                   (unsigned long)atomic_long_read(&mmap_pages_allocated));
 #endif
-               K(i.totalswap),
-               K(i.freeswap),
-               K(global_node_page_state(NR_FILE_DIRTY)),
-               K(global_node_page_state(NR_WRITEBACK)),
-               K(global_node_page_state(NR_ANON_MAPPED)),
-               K(global_node_page_state(NR_FILE_MAPPED)),
-               K(i.sharedram),
-               K(global_page_state(NR_SLAB_RECLAIMABLE) +
-                               global_page_state(NR_SLAB_UNRECLAIMABLE)),
-               K(global_page_state(NR_SLAB_RECLAIMABLE)),
-               K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
-               global_page_state(NR_KERNEL_STACK_KB),
-               K(global_page_state(NR_PAGETABLE)),
+
+       show_val_kb(m, "SwapTotal:      ", i.totalswap);
+       show_val_kb(m, "SwapFree:       ", i.freeswap);
+       show_val_kb(m, "Dirty:          ",
+                   global_node_page_state(NR_FILE_DIRTY));
+       show_val_kb(m, "Writeback:      ",
+                   global_node_page_state(NR_WRITEBACK));
+       show_val_kb(m, "AnonPages:      ",
+                   global_node_page_state(NR_ANON_MAPPED));
+       show_val_kb(m, "Mapped:         ",
+                   global_node_page_state(NR_FILE_MAPPED));
+       show_val_kb(m, "Shmem:          ", i.sharedram);
+       show_val_kb(m, "Slab:           ",
+                   global_page_state(NR_SLAB_RECLAIMABLE) +
+                   global_page_state(NR_SLAB_UNRECLAIMABLE));
+
+       show_val_kb(m, "SReclaimable:   ",
+                   global_page_state(NR_SLAB_RECLAIMABLE));
+       show_val_kb(m, "SUnreclaim:     ",
+                   global_page_state(NR_SLAB_UNRECLAIMABLE));
+       seq_printf(m, "KernelStack:    %8lu kB\n",
+                  global_page_state(NR_KERNEL_STACK_KB));
+       show_val_kb(m, "PageTables:     ",
+                   global_page_state(NR_PAGETABLE));
 #ifdef CONFIG_QUICKLIST
-               K(quicklist_total_size()),
+       show_val_kb(m, "Quicklists:     ", quicklist_total_size());
 #endif
-               K(global_node_page_state(NR_UNSTABLE_NFS)),
-               K(global_page_state(NR_BOUNCE)),
-               K(global_node_page_state(NR_WRITEBACK_TEMP)),
-               K(vm_commit_limit()),
-               K(committed),
-               (unsigned long)VMALLOC_TOTAL >> 10,
-               0ul, // used to be vmalloc 'used'
-               0ul  // used to be vmalloc 'largest_chunk'
+
+       show_val_kb(m, "NFS_Unstable:   ",
+                   global_node_page_state(NR_UNSTABLE_NFS));
+       show_val_kb(m, "Bounce:         ",
+                   global_page_state(NR_BOUNCE));
+       show_val_kb(m, "WritebackTmp:   ",
+                   global_node_page_state(NR_WRITEBACK_TEMP));
+       show_val_kb(m, "CommitLimit:    ", vm_commit_limit());
+       show_val_kb(m, "Committed_AS:   ", committed);
+       seq_printf(m, "VmallocTotal:   %8lu kB\n",
+                  (unsigned long)VMALLOC_TOTAL >> 10);
+       show_val_kb(m, "VmallocUsed:    ", 0ul);
+       show_val_kb(m, "VmallocChunk:   ", 0ul);
+
 #ifdef CONFIG_MEMORY_FAILURE
-               , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
+       seq_printf(m, "HardwareCorrupted: %5lu kB\n",
+                  atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10));
 #endif
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-               , K(global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR)
-               , K(global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR)
-               , K(global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)
+       show_val_kb(m, "AnonHugePages:  ",
+                   global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR);
+       show_val_kb(m, "ShmemHugePages: ",
+                   global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
+       show_val_kb(m, "ShmemPmdMapped: ",
+                   global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
 #endif
+
 #ifdef CONFIG_CMA
-               , K(totalcma_pages)
-               , K(global_page_state(NR_FREE_CMA_PAGES))
+       show_val_kb(m, "CmaTotal:       ", totalcma_pages);
+       show_val_kb(m, "CmaFree:        ",
+                   global_page_state(NR_FREE_CMA_PAGES));
 #endif
-               );
 
        hugetlb_report_meminfo(m);
 
        arch_report_meminfo(m);
 
        return 0;
-#undef K
 }
 
 static int meminfo_proc_open(struct inode *inode, struct file *file)
index 7907e45..d700c42 100644 (file)
@@ -115,17 +115,16 @@ static int show_stat(struct seq_file *p, void *v)
        }
        sum += arch_irq_stat();
 
-       seq_puts(p, "cpu ");
-       seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
-       seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
-       seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
-       seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
-       seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
-       seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
-       seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
-       seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
-       seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
-       seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
+       seq_put_decimal_ull(p, "cpu  ", cputime64_to_clock_t(user));
+       seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice));
+       seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system));
+       seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle));
+       seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait));
+       seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq));
+       seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq));
+       seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal));
+       seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest));
+       seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice));
        seq_putc(p, '\n');
 
        for_each_online_cpu(i) {
@@ -141,23 +140,23 @@ static int show_stat(struct seq_file *p, void *v)
                guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
                guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
                seq_printf(p, "cpu%d", i);
-               seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
-               seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
-               seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
-               seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
-               seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
-               seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
-               seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
-               seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
-               seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
-               seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
+               seq_put_decimal_ull(p, " ", cputime64_to_clock_t(user));
+               seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice));
+               seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system));
+               seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle));
+               seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait));
+               seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq));
+               seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq));
+               seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal));
+               seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest));
+               seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice));
                seq_putc(p, '\n');
        }
-       seq_printf(p, "intr %llu", (unsigned long long)sum);
+       seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
 
        /* sum again ? it could be updated? */
        for_each_irq_nr(j)
-               seq_put_decimal_ull(p, ' ', kstat_irqs_usr(j));
+               seq_put_decimal_ull(p, " ", kstat_irqs_usr(j));
 
        seq_printf(p,
                "\nctxt %llu\n"
@@ -171,10 +170,10 @@ static int show_stat(struct seq_file *p, void *v)
                nr_running(),
                nr_iowait());
 
-       seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);
+       seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq);
 
        for (i = 0; i < NR_SOFTIRQS; i++)
-               seq_put_decimal_ull(p, ' ', per_softirq_sums[i]);
+               seq_put_decimal_ull(p, " ", per_softirq_sums[i]);
        seq_putc(p, '\n');
 
        return 0;
index f6fa99e..6909582 100644 (file)
@@ -147,7 +147,7 @@ m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
 static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
 {
        if (m->count < m->size) /* vma is copied successfully */
-               m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
+               m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL;
 }
 
 static void *m_start(struct seq_file *m, loff_t *ppos)
@@ -175,8 +175,10 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
        priv->tail_vma = get_gate_vma(mm);
 
        if (last_addr) {
-               vma = find_vma(mm, last_addr);
-               if (vma && (vma = m_next_vma(priv, vma)))
+               vma = find_vma(mm, last_addr - 1);
+               if (vma && vma->vm_start <= last_addr)
+                       vma = m_next_vma(priv, vma);
+               if (vma)
                        return vma;
        }
 
@@ -1070,7 +1072,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
                        }
                        mmu_notifier_invalidate_range_start(mm, 0, -1);
                }
-               walk_page_range(0, ~0UL, &clear_refs_walk);
+               walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
                if (type == CLEAR_REFS_SOFT_DIRTY)
                        mmu_notifier_invalidate_range_end(mm, 0, -1);
                flush_tlb_mm(mm);
index 6dc4296..368bfb9 100644 (file)
@@ -679,11 +679,11 @@ EXPORT_SYMBOL(seq_puts);
 /*
  * A helper routine for putting decimal numbers without rich format of printf().
  * only 'unsigned long long' is supported.
- * This routine will put one byte delimiter + number into seq_file.
+ * This routine will put strlen(delimiter) + number into seq_file.
  * This routine is very quick when you show lots of numbers.
  * In usual cases, it will be better to use seq_printf(). It's easier to read.
  */
-void seq_put_decimal_ull(struct seq_file *m, char delimiter,
+void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
                         unsigned long long num)
 {
        int len;
@@ -691,8 +691,15 @@ void seq_put_decimal_ull(struct seq_file *m, char delimiter,
        if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */
                goto overflow;
 
-       if (delimiter)
-               m->buf[m->count++] = delimiter;
+       len = strlen(delimiter);
+       if (m->count + len >= m->size)
+               goto overflow;
+
+       memcpy(m->buf + m->count, delimiter, len);
+       m->count += len;
+
+       if (m->count + 1 >= m->size)
+               goto overflow;
 
        if (num < 10) {
                m->buf[m->count++] = num + '0';
@@ -702,6 +709,7 @@ void seq_put_decimal_ull(struct seq_file *m, char delimiter,
        len = num_to_str(m->buf + m->count, m->size - m->count, num);
        if (!len)
                goto overflow;
+
        m->count += len;
        return;
 
@@ -710,19 +718,42 @@ overflow:
 }
 EXPORT_SYMBOL(seq_put_decimal_ull);
 
-void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num)
+void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num)
 {
+       int len;
+
+       if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */
+               goto overflow;
+
+       len = strlen(delimiter);
+       if (m->count + len >= m->size)
+               goto overflow;
+
+       memcpy(m->buf + m->count, delimiter, len);
+       m->count += len;
+
+       if (m->count + 2 >= m->size)
+               goto overflow;
+
        if (num < 0) {
-               if (m->count + 3 >= m->size) {
-                       seq_set_overflow(m);
-                       return;
-               }
-               if (delimiter)
-                       m->buf[m->count++] = delimiter;
+               m->buf[m->count++] = '-';
                num = -num;
-               delimiter = '-';
        }
-       seq_put_decimal_ull(m, delimiter, num);
+
+       if (num < 10) {
+               m->buf[m->count++] = num + '0';
+               return;
+       }
+
+       len = num_to_str(m->buf + m->count, m->size - m->count, num);
+       if (!len)
+               goto overflow;
+
+       m->count += len;
+       return;
+
+overflow:
+       seq_set_overflow(m);
 }
 EXPORT_SYMBOL(seq_put_decimal_ll);
 
index f46b292..26acfbb 100644 (file)
@@ -1579,6 +1579,7 @@ const struct file_operations xfs_file_operations = {
        .open           = xfs_file_open,
        .release        = xfs_file_release,
        .fsync          = xfs_file_fsync,
+       .get_unmapped_area = thp_get_unmapped_area,
        .fallocate      = xfs_file_fallocate,
 };
 
index d4458b6..c4f8fd2 100644 (file)
@@ -800,6 +800,9 @@ static inline int pmd_clear_huge(pmd_t *pmd)
 #endif
 #endif
 
+struct file;
+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+                       unsigned long size, pgprot_t *vma_prot);
 #endif /* !__ASSEMBLY__ */
 
 #ifndef io_remap_pfn_range
index 2456397..3e42bcd 100644 (file)
                *(.spinlock.text)                                       \
                VMLINUX_SYMBOL(__lock_text_end) = .;
 
+#define CPUIDLE_TEXT                                                   \
+               ALIGN_FUNCTION();                                       \
+               VMLINUX_SYMBOL(__cpuidle_text_start) = .;               \
+               *(.cpuidle.text)                                        \
+               VMLINUX_SYMBOL(__cpuidle_text_end) = .;
+
 #define KPROBES_TEXT                                                   \
                ALIGN_FUNCTION();                                       \
                VMLINUX_SYMBOL(__kprobes_text_start) = .;               \
index 299e76b..a83c822 100644 (file)
@@ -65,16 +65,6 @@ static inline int get_bitmask_order(unsigned int count)
        return order;   /* We could be slightly more clever with -1 here... */
 }
 
-static inline int get_count_order(unsigned int count)
-{
-       int order;
-
-       order = fls(count) - 1;
-       if (count & (count - 1))
-               order++;
-       return order;
-}
-
 static __always_inline unsigned long hweight_long(unsigned long w)
 {
        return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
@@ -191,6 +181,32 @@ static inline unsigned fls_long(unsigned long l)
        return fls64(l);
 }
 
+static inline int get_count_order(unsigned int count)
+{
+       int order;
+
+       order = fls(count) - 1;
+       if (count & (count - 1))
+               order++;
+       return order;
+}
+
+/**
+ * get_count_order_long - get order after rounding @l up to power of 2
+ * @l: parameter
+ *
+ * It is the same as get_count_order() but with a long type parameter
+ */
+static inline int get_count_order_long(unsigned long l)
+{
+       if (l == 0UL)
+               return -1;
+       else if (l & (l - 1UL))
+               return (int)fls_long(l);
+       else
+               return (int)fls_long(l) - 1;
+}
+
 /**
  * __ffs64 - find first set bit in a 64 bit word
  * @word: The 64 bit word
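
get_count_order_long() gives the order of @l rounded up to the next power of two, and -1 for zero, extending get_count_order() to unsigned long. A userspace sketch of the same computation, with the compiler builtin standing in for fls_long():

#include <stdio.h>

/* Mirrors the header above: -1 for 0, fls_long(l) for non-powers of two,
 * fls_long(l) - 1 for exact powers of two. */
static int count_order_long(unsigned long l)
{
	int fls;

	if (l == 0UL)
		return -1;
	fls = (int)(sizeof(l) * 8) - __builtin_clzl(l);	/* fls_long(l) */
	return (l & (l - 1UL)) ? fls : fls - 1;
}

int main(void)
{
	printf("%d %d %d\n", count_order_long(1),
	       count_order_long(4096), count_order_long(4097));	/* 0 12 13 */
	return 0;
}
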
index f9be326..962164d 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/mmzone.h>
 #include <linux/mm_types.h>
 #include <asm/dma.h>
+#include <asm/processor.h>
 
 /*
  *  simple boot-time physical memory area allocator.
@@ -119,6 +120,10 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 #define BOOTMEM_LOW_LIMIT __pa(MAX_DMA_ADDRESS)
 #endif
 
+#ifndef ARCH_LOW_ADDRESS_LIMIT
+#define ARCH_LOW_ADDRESS_LIMIT  0xffffffffUL
+#endif
+
 #define alloc_bootmem(x) \
        __alloc_bootmem(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
 #define alloc_bootmem_align(x, align) \
@@ -180,10 +185,6 @@ static inline void * __init memblock_virt_alloc_nopanic(
                                                    NUMA_NO_NODE);
 }
 
-#ifndef ARCH_LOW_ADDRESS_LIMIT
-#define ARCH_LOW_ADDRESS_LIMIT  0xffffffffUL
-#endif
-
 static inline void * __init memblock_virt_alloc_low(
                                        phys_addr_t size, phys_addr_t align)
 {
index d4e106b..0d84158 100644 (file)
@@ -6,8 +6,10 @@
  * Lower value means higher priority, analogically to reclaim priority.
  */
 enum compact_priority {
+       COMPACT_PRIO_SYNC_FULL,
+       MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_FULL,
        COMPACT_PRIO_SYNC_LIGHT,
-       MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
+       MIN_COMPACT_COSTLY_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
        DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
        COMPACT_PRIO_ASYNC,
        INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC
@@ -49,14 +51,37 @@ enum compact_result {
        COMPACT_CONTENDED,
 
        /*
-        * direct compaction partially compacted a zone and there might be
-        * suitable pages
+        * direct compaction terminated after concluding that the allocation
+        * should now succeed
         */
-       COMPACT_PARTIAL,
+       COMPACT_SUCCESS,
 };
 
 struct alloc_context; /* in mm/internal.h */
 
+/*
+ * Number of free order-0 pages that should be available above given watermark
+ * to make sure compaction has reasonable chance of not running out of free
+ * pages that it needs to isolate as migration target during its work.
+ */
+static inline unsigned long compact_gap(unsigned int order)
+{
+       /*
+        * Although all the isolations for migration are temporary, compaction
+        * free scanner may have up to 1 << order pages on its list and then
+        * try to split an (order - 1) free page. At that point, a gap of
+        * 1 << order might not be enough, so it's safer to require twice that
+        * amount. Note that the number of pages on the list is also
+        * effectively limited by COMPACT_CLUSTER_MAX, as that's the maximum
+        * that the migrate scanner can have isolated on migrate list, and free
+        * scanner is only invoked when the number of isolated free pages is
+        * lower than that. But it's not worth complicating the formula here
+        * as a bigger gap for higher orders than strictly necessary can also
+        * improve chances of compaction success.
+        */
+       return 2UL << order;
+}
+
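
For example, with 4 KiB base pages an order-9 request (a 2 MiB THP) gets compact_gap(9) = 2UL << 9 = 1024 free base pages of headroom above the watermark, i.e. 4 MiB.
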
 #ifdef CONFIG_COMPACTION
 extern int sysctl_compact_memory;
 extern int sysctl_compaction_handler(struct ctl_table *table, int write,
@@ -70,7 +95,6 @@ extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
                unsigned int order, unsigned int alloc_flags,
                const struct alloc_context *ac, enum compact_priority prio);
-extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern enum compact_result compaction_suitable(struct zone *zone, int order,
                unsigned int alloc_flags, int classzone_idx);
@@ -89,7 +113,7 @@ static inline bool compaction_made_progress(enum compact_result result)
         * that the compaction successfully isolated and migrated some
         * pageblocks.
         */
-       if (result == COMPACT_PARTIAL)
+       if (result == COMPACT_SUCCESS)
                return true;
 
        return false;
@@ -154,10 +178,6 @@ extern void kcompactd_stop(int nid);
 extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
 
 #else
-static inline void compact_pgdat(pg_data_t *pgdat, int order)
-{
-}
-
 static inline void reset_isolation_suitable(pg_data_t *pgdat)
 {
 }
index d530c46..3672809 100644 (file)
@@ -173,6 +173,12 @@ static inline void console_sysfs_notify(void)
 #endif
 extern bool console_suspend_enabled;
 
+#ifdef CONFIG_OF
+extern void console_set_by_of(void);
+#else
+static inline void console_set_by_of(void) {}
+#endif
+
 /* Suspend and resume console messages over PM events */
 extern void suspend_console(void);
 extern void resume_console(void);
index 7572d9e..b886dc1 100644 (file)
@@ -231,6 +231,11 @@ void cpu_startup_entry(enum cpuhp_state state);
 
 void cpu_idle_poll_ctrl(bool enable);
 
+/* Attach to any functions which should be considered cpuidle. */
+#define __cpuidle      __attribute__((__section__(".cpuidle.text")))
+
+bool cpu_in_idle(unsigned long pc);
+
 void arch_cpu_idle(void);
 void arch_cpu_idle_prepare(void);
 void arch_cpu_idle_enter(void);
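
Functions tagged __cpuidle end up in the .cpuidle.text section, which is what lets cpu_in_idle() classify a saved program counter as idle so the NMI backtrace code can emit a one-line report for idle CPUs. A hypothetical use of the annotation:

#include <linux/cpu.h>

/* Hypothetical low-level idle helper, placed in .cpuidle.text. */
static void __cpuidle my_platform_wait(void)
{
	/* architecture-specific wait-for-interrupt would go here */
}
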
index 257db64..f0e70a1 100644 (file)
@@ -26,15 +26,10 @@ struct inode;
 /*
  * COW Supplementary groups list
  */
-#define NGROUPS_SMALL          32
-#define NGROUPS_PER_BLOCK      ((unsigned int)(PAGE_SIZE / sizeof(kgid_t)))
-
 struct group_info {
        atomic_t        usage;
        int             ngroups;
-       int             nblocks;
-       kgid_t          small_block[NGROUPS_SMALL];
-       kgid_t          *blocks[0];
+       kgid_t          gid[0];
 };
 
 /**
@@ -88,10 +83,6 @@ extern void set_groups(struct cred *, struct group_info *);
 extern int groups_search(const struct group_info *, kgid_t);
 extern bool may_setgroups(void);
 
-/* access the groups "array" with this macro */
-#define GROUP_AT(gi, i) \
-       ((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK])
-
 /*
  * The security context of a task
  *
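
With the block array gone, supplementary groups are a single flexible kgid_t array, so callers index group_info->gid[] directly instead of going through GROUP_AT() (the /proc/<pid>/status hunk earlier in this series does exactly that). A minimal sketch, assuming the updated layout:

#include <linux/cred.h>
#include <linux/uidgid.h>

/* Hypothetical linear membership test over the flat gid[] array. */
static bool is_supplementary_group(const struct group_info *gi, kgid_t gid)
{
	int i;

	for (i = 0; i < gi->ngroups; i++)
		if (gid_eq(gi->gid[i], gid))
			return true;
	return false;
}
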
index 7268ed0..79467b2 100644 (file)
@@ -135,7 +135,7 @@ struct fsnotify_group {
        const struct fsnotify_ops *ops; /* how this group handles things */
 
        /* needed to send notification to userspace */
-       struct mutex notification_mutex;        /* protect the notification_list */
+       spinlock_t notification_lock;           /* protect the notification_list */
        struct list_head notification_list;     /* list of event_holder this group needs to send to userspace */
        wait_queue_head_t notification_waitq;   /* read() on the notification file blocks on this waitq */
        unsigned int q_len;                     /* events on the queue */
@@ -177,7 +177,6 @@ struct fsnotify_group {
                struct fanotify_group_private_data {
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
                        /* allows a group to block waiting for a userspace response */
-                       spinlock_t access_lock;
                        struct list_head access_list;
                        wait_queue_head_t access_waitq;
 #endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
index 6f14de4..9b9f65d 100644 (file)
@@ -87,6 +87,10 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
 
 extern unsigned long transparent_hugepage_flags;
 
+extern unsigned long thp_get_unmapped_area(struct file *filp,
+               unsigned long addr, unsigned long len, unsigned long pgoff,
+               unsigned long flags);
+
 extern void prep_transhuge_page(struct page *page);
 extern void free_transhuge_page(struct page *page);
 
@@ -152,8 +156,8 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
        return is_huge_zero_page(pmd_page(pmd));
 }
 
-struct page *get_huge_zero_page(void);
-void put_huge_zero_page(void);
+struct page *mm_get_huge_zero_page(struct mm_struct *mm);
+void mm_put_huge_zero_page(struct mm_struct *mm);
 
 #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot))
 
@@ -169,6 +173,9 @@ void put_huge_zero_page(void);
 static inline void prep_transhuge_page(struct page *page) {}
 
 #define transparent_hugepage_flags 0UL
+
+#define thp_get_unmapped_area  NULL
+
 static inline int
 split_huge_page_to_list(struct page *page, struct list_head *list)
 {
@@ -213,9 +220,9 @@ static inline bool is_huge_zero_page(struct page *page)
        return false;
 }
 
-static inline void put_huge_zero_page(void)
+static inline void mm_put_huge_zero_page(struct mm_struct *mm)
 {
-       BUILD_BUG();
+       return;
 }
 
 static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
index c26d463..48c76d6 100644 (file)
@@ -90,7 +90,7 @@ int dequeue_hwpoisoned_huge_page(struct page *page);
 bool isolate_huge_page(struct page *page, struct list_head *list);
 void putback_active_hugepage(struct page *page);
 void free_huge_page(struct page *page);
-void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve);
+void hugetlb_fix_reserve_counts(struct inode *inode);
 extern struct mutex *hugetlb_fault_mutex_table;
 u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
                                struct vm_area_struct *vma,
@@ -450,8 +450,8 @@ static inline pgoff_t basepage_index(struct page *page)
        return __basepage_index(page);
 }
 
-extern void dissolve_free_huge_pages(unsigned long start_pfn,
-                                    unsigned long end_pfn);
+extern int dissolve_free_huge_pages(unsigned long start_pfn,
+                                   unsigned long end_pfn);
 static inline bool hugepage_migration_supported(struct hstate *h)
 {
 #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
@@ -518,7 +518,7 @@ static inline pgoff_t basepage_index(struct page *page)
 {
        return page->index;
 }
-#define dissolve_free_huge_pages(s, e) do {} while (0)
+#define dissolve_free_huge_pages(s, e) 0
 #define hugepage_migration_supported(h)        false
 
 static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
index 5fdc553..589d14e 100644 (file)
@@ -150,15 +150,19 @@ static inline u64 get_jiffies_64(void)
 
 /* time_is_before_jiffies(a) return true if a is before jiffies */
 #define time_is_before_jiffies(a) time_after(jiffies, a)
+#define time_is_before_jiffies64(a) time_after64(get_jiffies_64(), a)
 
 /* time_is_after_jiffies(a) return true if a is after jiffies */
 #define time_is_after_jiffies(a) time_before(jiffies, a)
+#define time_is_after_jiffies64(a) time_before64(get_jiffies_64(), a)
 
 /* time_is_before_eq_jiffies(a) return true if a is before or equal to jiffies*/
 #define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a)
+#define time_is_before_eq_jiffies64(a) time_after_eq64(get_jiffies_64(), a)
 
 /* time_is_after_eq_jiffies(a) return true if a is after or equal to jiffies*/
 #define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a)
+#define time_is_after_eq_jiffies64(a) time_before_eq64(get_jiffies_64(), a)
 
 /*
  * Have the 32 bit jiffies value wrap 5 minutes after boot
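
The new 64-bit variants simply apply the existing wraparound-safe time_after64()/time_before64() comparisons to get_jiffies_64(). A minimal userspace sketch of the underlying signed-difference trick (the my_* names are illustrative, not the kernel macros):

#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe "a is after b", the same signed-difference trick the
 * kernel's time_after64() uses: correct as long as the two values are
 * less than 2^63 apart, even across a counter wrap. */
#define my_time_after64(a, b)        ((int64_t)((b) - (a)) < 0)
#define my_time_is_before64(a, now)  my_time_after64(now, a)

int main(void)
{
    uint64_t now = UINT64_MAX - 5;   /* counter just about to wrap */
    uint64_t deadline = now + 10;    /* wraps around to a small value */

    printf("deadline after now:  %d\n", my_time_after64(deadline, now));     /* 1 */
    printf("deadline before now: %d\n", my_time_is_before64(deadline, now)); /* 0 */
    return 0;
}
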
index 74fd6f0..bc6ed52 100644 (file)
@@ -733,17 +733,25 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
  * strict type-checking.. See the
  * "unnecessary" pointer comparison.
  */
-#define min(x, y) ({                           \
-       typeof(x) _min1 = (x);                  \
-       typeof(y) _min2 = (y);                  \
-       (void) (&_min1 == &_min2);              \
-       _min1 < _min2 ? _min1 : _min2; })
-
-#define max(x, y) ({                           \
-       typeof(x) _max1 = (x);                  \
-       typeof(y) _max2 = (y);                  \
-       (void) (&_max1 == &_max2);              \
-       _max1 > _max2 ? _max1 : _max2; })
+#define __min(t1, t2, min1, min2, x, y) ({             \
+       t1 min1 = (x);                                  \
+       t2 min2 = (y);                                  \
+       (void) (&min1 == &min2);                        \
+       min1 < min2 ? min1 : min2; })
+#define min(x, y)                                      \
+       __min(typeof(x), typeof(y),                     \
+             __UNIQUE_ID(min1_), __UNIQUE_ID(min2_),   \
+             x, y)
+
+#define __max(t1, t2, max1, max2, x, y) ({             \
+       t1 max1 = (x);                                  \
+       t2 max2 = (y);                                  \
+       (void) (&max1 == &max2);                        \
+       max1 > max2 ? max1 : max2; })
+#define max(x, y)                                      \
+       __max(typeof(x), typeof(y),                     \
+             __UNIQUE_ID(max1_), __UNIQUE_ID(max2_),   \
+             x, y)
 
 #define min3(x, y, z) min((typeof(x))min(x, y), z)
 #define max3(x, y, z) max((typeof(x))max(x, y), z)
@@ -775,15 +783,15 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
  *
  * Or not use min/max/clamp at all, of course.
  */
-#define min_t(type, x, y) ({                   \
-       type __min1 = (x);                      \
-       type __min2 = (y);                      \
-       __min1 < __min2 ? __min1: __min2; })
-
-#define max_t(type, x, y) ({                   \
-       type __max1 = (x);                      \
-       type __max2 = (y);                      \
-       __max1 > __max2 ? __max1: __max2; })
+#define min_t(type, x, y)                              \
+       __min(type, type,                               \
+             __UNIQUE_ID(min1_), __UNIQUE_ID(min2_),   \
+             x, y)
+
+#define max_t(type, x, y)                              \
+       __max(type, type,                               \
+             __UNIQUE_ID(min1_), __UNIQUE_ID(min2_),   \
+             x, y)
 
 /**
  * clamp_t - return a value clamped to a given range using a given type
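
The rewritten min()/max() delegate to __min()/__max() and let __UNIQUE_ID() name the temporaries, so nesting the macros no longer redeclares _min1/_min2 and no longer trips sparse/shadow warnings. A self-contained sketch of the same trick built on __COUNTER__ (GNU C extensions, as the kernel itself uses; PASTE/UNIQ/MIN are local stand-ins, not the kernel's implementation):

#include <stdio.h>

/* Paste helpers so __COUNTER__ is expanded before concatenation. */
#define PASTE2(a, b) a##b
#define PASTE(a, b)  PASTE2(a, b)
#define UNIQ(prefix) PASTE(prefix, __COUNTER__)

/* Same shape as the kernel's new __min(): the caller supplies the names
 * of the temporaries, so nested uses never declare the same identifier
 * twice and there is nothing for sparse or -Wshadow to complain about. */
#define __MIN(t1, t2, n1, n2, x, y) ({          \
        t1 n1 = (x);                            \
        t2 n2 = (y);                            \
        (void)(&n1 == &n2);  /* type check */   \
        n1 < n2 ? n1 : n2; })

#define MIN(x, y) __MIN(__typeof__(x), __typeof__(y), \
                        UNIQ(min1_), UNIQ(min2_), x, y)

int main(void)
{
    int a = 3, b = 7, c = 5;

    /* Nested use: with fixed temporary names this would shadow them. */
    printf("%d\n", MIN(a, MIN(b, c)));   /* prints 3 */
    return 0;
}
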
index 2925da2..5b759c9 100644 (file)
@@ -328,6 +328,7 @@ phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
 phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
                                  phys_addr_t max_addr);
 phys_addr_t memblock_phys_mem_size(void);
+phys_addr_t memblock_reserved_size(void);
 phys_addr_t memblock_mem_size(unsigned long limit_pfn);
 phys_addr_t memblock_start_of_DRAM(void);
 phys_addr_t memblock_end_of_DRAM(void);
index 5d8ca6e..61d20c1 100644 (file)
@@ -366,6 +366,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
                                   struct mem_cgroup *,
                                   struct mem_cgroup_reclaim_cookie *);
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+int mem_cgroup_scan_tasks(struct mem_cgroup *,
+                         int (*)(struct task_struct *, void *), void *);
 
 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 {
@@ -446,6 +448,8 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 
 void mem_cgroup_handle_over_high(void);
 
+unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg);
+
 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
                                struct task_struct *p);
 
@@ -639,6 +643,12 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
 {
 }
 
+static inline int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+               int (*fn)(struct task_struct *, void *), void *arg)
+{
+       return 0;
+}
+
 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 {
        return 0;
@@ -669,6 +679,11 @@ mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
        return 0;
 }
 
+static inline unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
+{
+       return 0;
+}
+
 static inline void
 mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
@@ -758,13 +773,13 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
 #endif /* CONFIG_CGROUP_WRITEBACK */
 
 struct sock;
-void sock_update_memcg(struct sock *sk);
-void sock_release_memcg(struct sock *sk);
 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
 #ifdef CONFIG_MEMCG
 extern struct static_key_false memcg_sockets_enabled_key;
 #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)
+void mem_cgroup_sk_alloc(struct sock *sk);
+void mem_cgroup_sk_free(struct sock *sk);
 static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 {
        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure)
@@ -777,6 +792,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 }
 #else
 #define mem_cgroup_sockets_enabled 0
+static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
+static inline void mem_cgroup_sk_free(struct sock *sk) { };
 static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 {
        return false;
index 5f14534..e9caec6 100644 (file)
@@ -126,7 +126,7 @@ extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *,
 #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)
 
 /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
-#define PAGE_ALIGNED(addr)     IS_ALIGNED((unsigned long)addr, PAGE_SIZE)
+#define PAGE_ALIGNED(addr)     IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
 
 /*
  * Linux kernel virtual memory manager primitives.
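
Among the mm.h tweaks, the PAGE_ALIGNED() hunk above adds parentheses around the macro argument so the cast applies to the whole expression. A tiny standalone illustration of why that matters (local macro names, 4096 standing in for PAGE_SIZE):

#include <stdio.h>
#include <stdint.h>

#define MY_IS_ALIGNED(x, a)  (((x) & ((a) - 1)) == 0)

/* Without the parentheses the cast binds only to the first token of the
 * argument, so pointer arithmetic in the argument is converted byte-wise. */
#define PAGE_ALIGNED_OLD(addr)  MY_IS_ALIGNED((uintptr_t)addr, 4096)
#define PAGE_ALIGNED_NEW(addr)  MY_IS_ALIGNED((uintptr_t)(addr), 4096)

static int buf[2048] __attribute__((aligned(4096)));

int main(void)
{
    int *p = buf;

    /* p + 1024 is 1024 ints, i.e. 4096 bytes past a 4096-aligned base. */
    printf("old: %d\n", PAGE_ALIGNED_OLD(p + 1024));   /* 0 -- wrong */
    printf("new: %d\n", PAGE_ALIGNED_NEW(p + 1024));   /* 1 -- right */
    return 0;
}
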
@@ -1048,28 +1048,16 @@ struct address_space *page_file_mapping(struct page *page)
        return page->mapping;
 }
 
-/*
- * Return the pagecache index of the passed page.  Regular pagecache pages
- * use ->index whereas swapcache pages use ->private
- */
-static inline pgoff_t page_index(struct page *page)
-{
-       if (unlikely(PageSwapCache(page)))
-               return page_private(page);
-       return page->index;
-}
-
 extern pgoff_t __page_file_index(struct page *page);
 
 /*
- * Return the file index of the page. Regular pagecache pages use ->index
- * whereas swapcache pages use swp_offset(->private)
+ * Return the pagecache index of the passed page.  Regular pagecache pages
+ * use ->index whereas swapcache pages use swp_offset(->private)
  */
-static inline pgoff_t page_file_index(struct page *page)
+static inline pgoff_t page_index(struct page *page)
 {
        if (unlikely(PageSwapCache(page)))
                return __page_file_index(page);
-
        return page->index;
 }
 
@@ -1197,10 +1185,10 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
  * @pte_hole: if set, called for each hole at all levels
  * @hugetlb_entry: if set, called for each hugetlb entry
  * @test_walk: caller specific callback function to determine whether
- *             we walk over the current vma or not. A positive returned
+ *             we walk over the current vma or not. Returning 0
  *             value means "do page table walk over the current vma,"
  *             and a negative one means "abort current page table walk
- *             right now." 0 means "skip the current vma."
+ *             right now." 1 means "skip the current vma."
  * @mm:        mm_struct representing the target process of page table walk
  * @vma:       vma currently walked (NULL if walking outside vmas)
  * @private:   private data for callbacks' usage
@@ -1529,7 +1517,7 @@ static inline int pte_devmap(pte_t pte)
 }
 #endif
 
-int vma_wants_writenotify(struct vm_area_struct *vma);
+int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
 
 extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
                               spinlock_t **ptl);
@@ -1924,10 +1912,12 @@ extern void show_mem(unsigned int flags);
 extern long si_mem_available(void);
 extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
+#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES
+extern unsigned long arch_reserved_kernel_pages(void);
+#endif
 
-extern __printf(3, 4)
-void warn_alloc_failed(gfp_t gfp_mask, unsigned int order,
-               const char *fmt, ...);
+extern __printf(2, 3)
+void warn_alloc(gfp_t gfp_mask, const char *fmt, ...);
 
 extern void setup_per_cpu_pageset(void);
 
@@ -1977,8 +1967,14 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
 
 /* mmap.c */
 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
-extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
-       unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
+extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
+       unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
+       struct vm_area_struct *expand);
+static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
+       unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
+{
+       return __vma_adjust(vma, start, end, pgoff, insert, NULL);
+}
 extern struct vm_area_struct *vma_merge(struct mm_struct *,
        struct vm_area_struct *prev, unsigned long addr, unsigned long end,
        unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
index 903200f..4a8aced 100644 (file)
@@ -515,9 +515,7 @@ struct mm_struct {
 #ifdef CONFIG_HUGETLB_PAGE
        atomic_long_t hugetlb_usage;
 #endif
-#ifdef CONFIG_MMU
        struct work_struct async_put_work;
-#endif
 };
 
 static inline void mm_init_cpumask(struct mm_struct *mm)
index 4630eea..a78c35c 100644 (file)
@@ -35,21 +35,34 @@ static inline void hardlockup_detector_disable(void) {}
  * base function. Return whether such support was available,
  * to allow calling code to fall back to some other mechanism:
  */
-#ifdef arch_trigger_all_cpu_backtrace
+#ifdef arch_trigger_cpumask_backtrace
 static inline bool trigger_all_cpu_backtrace(void)
 {
-       arch_trigger_all_cpu_backtrace(true);
-
+       arch_trigger_cpumask_backtrace(cpu_online_mask, false);
        return true;
 }
+
 static inline bool trigger_allbutself_cpu_backtrace(void)
 {
-       arch_trigger_all_cpu_backtrace(false);
+       arch_trigger_cpumask_backtrace(cpu_online_mask, true);
+       return true;
+}
+
+static inline bool trigger_cpumask_backtrace(struct cpumask *mask)
+{
+       arch_trigger_cpumask_backtrace(mask, false);
+       return true;
+}
+
+static inline bool trigger_single_cpu_backtrace(int cpu)
+{
+       arch_trigger_cpumask_backtrace(cpumask_of(cpu), false);
        return true;
 }
 
 /* generic implementation */
-void nmi_trigger_all_cpu_backtrace(bool include_self,
+void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
+                                  bool exclude_self,
                                   void (*raise)(cpumask_t *mask));
 bool nmi_cpu_backtrace(struct pt_regs *regs);
 
@@ -62,6 +75,14 @@ static inline bool trigger_allbutself_cpu_backtrace(void)
 {
        return false;
 }
+static inline bool trigger_cpumask_backtrace(struct cpumask *mask)
+{
+       return false;
+}
+static inline bool trigger_single_cpu_backtrace(int cpu)
+{
+       return false;
+}
 #endif
 
 #ifdef CONFIG_LOCKUP_DETECTOR
index 5bc0457..b4e36e9 100644 (file)
@@ -34,23 +34,11 @@ struct oom_control {
         * for display purposes.
         */
        const int order;
-};
-
-/*
- * Types of limitations to the nodes from which allocations may occur
- */
-enum oom_constraint {
-       CONSTRAINT_NONE,
-       CONSTRAINT_CPUSET,
-       CONSTRAINT_MEMORY_POLICY,
-       CONSTRAINT_MEMCG,
-};
 
-enum oom_scan_t {
-       OOM_SCAN_OK,            /* scan thread and find its badness */
-       OOM_SCAN_CONTINUE,      /* do not consider thread for oom kill */
-       OOM_SCAN_ABORT,         /* abort the iteration and return */
-       OOM_SCAN_SELECT,        /* always select this thread first */
+       /* Used by oom implementation, do not set */
+       unsigned long totalpages;
+       struct task_struct *chosen;
+       unsigned long chosen_points;
 };
 
 extern struct mutex oom_lock;
@@ -70,45 +58,27 @@ static inline bool oom_task_origin(const struct task_struct *p)
        return p->signal->oom_flag_origin;
 }
 
-extern void mark_oom_victim(struct task_struct *tsk);
-
-#ifdef CONFIG_MMU
-extern void wake_oom_reaper(struct task_struct *tsk);
-#else
-static inline void wake_oom_reaper(struct task_struct *tsk)
+static inline bool tsk_is_oom_victim(struct task_struct * tsk)
 {
+       return tsk->signal->oom_mm;
 }
-#endif
 
 extern unsigned long oom_badness(struct task_struct *p,
                struct mem_cgroup *memcg, const nodemask_t *nodemask,
                unsigned long totalpages);
 
-extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
-                            unsigned int points, unsigned long totalpages,
-                            const char *message);
-
-extern void check_panic_on_oom(struct oom_control *oc,
-                              enum oom_constraint constraint);
-
-extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
-                                              struct task_struct *task);
-
 extern bool out_of_memory(struct oom_control *oc);
 
-extern void exit_oom_victim(struct task_struct *tsk);
+extern void exit_oom_victim(void);
 
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
 
-extern bool oom_killer_disabled;
-extern bool oom_killer_disable(void);
+extern bool oom_killer_disable(signed long timeout);
 extern void oom_killer_enable(void);
 
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
-bool task_will_free_mem(struct task_struct *task);
-
 /* sysctls */
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
index 03f2a3e..9298c39 100644 (file)
@@ -7,6 +7,8 @@
 
 struct pglist_data;
 struct page_ext_operations {
+       size_t offset;
+       size_t size;
        bool (*need)(void);
        void (*init)(void);
 };
@@ -42,12 +44,6 @@ enum page_ext_flags {
  */
 struct page_ext {
        unsigned long flags;
-#ifdef CONFIG_PAGE_OWNER
-       unsigned int order;
-       gfp_t gfp_mask;
-       int last_migrate_reason;
-       depot_stack_handle_t handle;
-#endif
 };
 
 extern void pgdat_page_ext_init(struct pglist_data *pgdat);
index 30583ab..2be728d 100644 (file)
@@ -14,6 +14,8 @@ extern void __split_page_owner(struct page *page, unsigned int order);
 extern void __copy_page_owner(struct page *oldpage, struct page *newpage);
 extern void __set_page_owner_migrate_reason(struct page *page, int reason);
 extern void __dump_page_owner(struct page *page);
+extern void pagetypeinfo_showmixedcount_print(struct seq_file *m,
+                                       pg_data_t *pgdat, struct zone *zone);
 
 static inline void reset_page_owner(struct page *page, unsigned int order)
 {
index 01e8443..794dbcb 100644 (file)
@@ -25,6 +25,8 @@ enum mapping_flags {
        AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */
        AS_UNEVICTABLE  = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */
        AS_EXITING      = __GFP_BITS_SHIFT + 4, /* final truncate in progress */
+       /* writeback related tags are not used */
+       AS_NO_WRITEBACK_TAGS = __GFP_BITS_SHIFT + 5,
 };
 
 static inline void mapping_set_error(struct address_space *mapping, int error)
@@ -64,6 +66,16 @@ static inline int mapping_exiting(struct address_space *mapping)
        return test_bit(AS_EXITING, &mapping->flags);
 }
 
+static inline void mapping_set_no_writeback_tags(struct address_space *mapping)
+{
+       set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
+}
+
+static inline int mapping_use_writeback_tags(struct address_space *mapping)
+{
+       return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
+}
+
 static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
 {
        return (__force gfp_t)mapping->flags & __GFP_BITS_MASK;
@@ -396,7 +408,7 @@ static inline loff_t page_offset(struct page *page)
 
 static inline loff_t page_file_offset(struct page *page)
 {
-       return ((loff_t)page_file_index(page)) << PAGE_SHIFT;
+       return ((loff_t)page_index(page)) << PAGE_SHIFT;
 }
 
 extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
index 7543a47..348f51b 100644 (file)
@@ -524,8 +524,9 @@ static inline int get_dumpable(struct mm_struct *mm)
 
 #define MMF_HAS_UPROBES                19      /* has uprobes */
 #define MMF_RECALC_UPROBES     20      /* MMF_HAS_UPROBES can be wrong */
-#define MMF_OOM_REAPED         21      /* mm has been already reaped */
-#define MMF_OOM_NOT_REAPABLE   22      /* mm couldn't be reaped */
+#define MMF_OOM_SKIP           21      /* mm is of no interest for the OOM killer */
+#define MMF_UNSTABLE           22      /* mm is unstable for copy_from_user */
+#define MMF_HUGE_ZERO_PAGE     23      /* mm has ever used the global huge zero page */
 
 #define MMF_INIT_MASK          (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
 
@@ -673,7 +674,6 @@ struct signal_struct {
        atomic_t                sigcnt;
        atomic_t                live;
        int                     nr_threads;
-       atomic_t oom_victims; /* # of TIF_MEDIE threads in this thread group */
        struct list_head        thread_head;
 
        wait_queue_head_t       wait_chldexit;  /* for wait4() */
@@ -806,6 +806,8 @@ struct signal_struct {
        short oom_score_adj;            /* OOM kill score adjustment */
        short oom_score_adj_min;        /* OOM kill score adjustment min value.
                                         * Only settable by CAP_SYS_RESOURCE. */
+       struct mm_struct *oom_mm;       /* recorded mm when the thread group got
+                                        * killed by the oom killer */
 
        struct mutex cred_guard_mutex;  /* guard against foreign influences on
                                         * credential calculations
@@ -2876,6 +2878,20 @@ static inline void mmdrop(struct mm_struct *mm)
                __mmdrop(mm);
 }
 
+static inline void mmdrop_async_fn(struct work_struct *work)
+{
+       struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+       __mmdrop(mm);
+}
+
+static inline void mmdrop_async(struct mm_struct *mm)
+{
+       if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
+               INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
+               schedule_work(&mm->async_put_work);
+       }
+}
+
 static inline bool mmget_not_zero(struct mm_struct *mm)
 {
        return atomic_inc_not_zero(&mm->mm_users);
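
mmdrop_async() is the familiar pattern of deferring teardown when the last reference is dropped in a context where freeing in place is unsafe (here, softirq on x86 because of pgd_dtor): the loser of atomic_dec_and_test() does nothing, the winner queues the mm via its embedded work_struct. A stripped-down sketch of that shape (userspace C with GCC atomics; the "deferred" hand-off is just a callback here, purely illustrative):

#include <stdio.h>

struct object {
    int refcount;
    void (*deferred_free)(struct object *);   /* stands in for async_put_work */
};

static void safe_context_free(struct object *obj)
{
    printf("tearing down %p outside the hot path\n", (void *)obj);
}

/* Last reference: don't free inline, hand the object to a deferred context. */
static void put_object_async(struct object *obj)
{
    if (__atomic_sub_fetch(&obj->refcount, 1, __ATOMIC_SEQ_CST) == 0)
        obj->deferred_free(obj);    /* kernel: INIT_WORK() + schedule_work() */
}

int main(void)
{
    struct object obj = { .refcount = 2, .deferred_free = safe_context_free };

    put_object_async(&obj);   /* one reference left: nothing happens */
    put_object_async(&obj);   /* last put triggers the deferred teardown */
    return 0;
}
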
index f3d45dd..e305b66 100644 (file)
@@ -117,9 +117,9 @@ __printf(2, 3)
 void seq_printf(struct seq_file *m, const char *fmt, ...);
 void seq_putc(struct seq_file *m, char c);
 void seq_puts(struct seq_file *m, const char *s);
-void seq_put_decimal_ull(struct seq_file *m, char delimiter,
+void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
                         unsigned long long num);
-void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num);
+void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num);
 void seq_escape(struct seq_file *m, const char *s, const char *esc);
 
 void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
index e1d7614..a56523c 100644 (file)
@@ -191,6 +191,11 @@ struct percpu_cluster {
        unsigned int next; /* Likely next allocation offset */
 };
 
+struct swap_cluster_list {
+       struct swap_cluster_info head;
+       struct swap_cluster_info tail;
+};
+
 /*
  * The in-memory structure used to track swap areas.
  */
@@ -203,8 +208,7 @@ struct swap_info_struct {
        unsigned int    max;            /* extent of the swap_map */
        unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
        struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
-       struct swap_cluster_info free_cluster_head; /* free cluster list head */
-       struct swap_cluster_info free_cluster_tail; /* free cluster list tail */
+       struct swap_cluster_list free_clusters; /* free clusters list */
        unsigned int lowest_bit;        /* index of first free in swap_map */
        unsigned int highest_bit;       /* index of last free in swap_map */
        unsigned int pages;             /* total of usable pages of swap */
@@ -235,8 +239,7 @@ struct swap_info_struct {
                                         * first.
                                         */
        struct work_struct discard_work; /* discard worker */
-       struct swap_cluster_info discard_cluster_head; /* list head of discard clusters */
-       struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
+       struct swap_cluster_list discard_clusters; /* discard clusters list */
 };
 
 /* linux/mm/workingset.c */
index fc1e16c..797100e 100644 (file)
@@ -319,7 +319,6 @@ void laptop_mode_timer_fn(unsigned long data);
 #else
 static inline void laptop_sync_completion(void) { }
 #endif
-void throttle_vm_writeout(gfp_t gfp_mask);
 bool node_dirty_ok(struct pglist_data *pgdat);
 int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
 #ifdef CONFIG_CGROUP_WRITEBACK
index c2ba402..cbdb90b 100644 (file)
@@ -13,7 +13,7 @@
        EM( COMPACT_SKIPPED,            "skipped")              \
        EM( COMPACT_DEFERRED,           "deferred")             \
        EM( COMPACT_CONTINUE,           "continue")             \
-       EM( COMPACT_PARTIAL,            "partial")              \
+       EM( COMPACT_SUCCESS,            "success")              \
        EM( COMPACT_PARTIAL_SKIPPED,    "partial_skipped")      \
        EM( COMPACT_COMPLETE,           "complete")             \
        EM( COMPACT_NO_SUITABLE_PAGE,   "no_suitable_page")     \
index 1e1d913..9d68c45 100644 (file)
@@ -511,7 +511,7 @@ static void exit_mm(struct task_struct *tsk)
        mm_update_next_owner(mm);
        mmput(mm);
        if (test_thread_flag(TIF_MEMDIE))
-               exit_oom_victim(tsk);
+               exit_oom_victim();
 }
 
 static struct task_struct *find_alive_thread(struct task_struct *p)
index 9a05bd9..6d42242 100644 (file)
@@ -359,6 +359,12 @@ static inline void free_signal_struct(struct signal_struct *sig)
 {
        taskstats_tgid_free(sig);
        sched_autogroup_exit(sig);
+       /*
+        * __mmdrop is not safe to call from softirq context on x86 due to
+        * pgd_dtor so postpone it to the async context
+        */
+       if (sig->oom_mm)
+               mmdrop_async(sig->oom_mm);
        kmem_cache_free(signal_cachep, sig);
 }
 
@@ -848,6 +854,7 @@ static inline void __mmput(struct mm_struct *mm)
        ksm_exit(mm);
        khugepaged_exit(mm); /* must run before exit_mmap */
        exit_mmap(mm);
+       mm_put_huge_zero_page(mm);
        set_mm_exe_file(mm, NULL);
        if (!list_empty(&mm->mmlist)) {
                spin_lock(&mmlist_lock);
@@ -856,6 +863,7 @@ static inline void __mmput(struct mm_struct *mm)
        }
        if (mm->binfmt)
                module_put(mm->binfmt->module);
+       set_bit(MMF_OOM_SKIP, &mm->flags);
        mmdrop(mm);
 }
 
index 74d431d..2fcadd6 100644 (file)
@@ -7,55 +7,31 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/user_namespace.h>
+#include <linux/vmalloc.h>
 #include <asm/uaccess.h>
 
 struct group_info *groups_alloc(int gidsetsize)
 {
-       struct group_info *group_info;
-       int nblocks;
-       int i;
-
-       nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
-       /* Make sure we always allocate at least one indirect block pointer */
-       nblocks = nblocks ? : 1;
-       group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
-       if (!group_info)
+       struct group_info *gi;
+       unsigned int len;
+
+       len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize;
+       gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY);
+       if (!gi)
+               gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL);
+       if (!gi)
                return NULL;
-       group_info->ngroups = gidsetsize;
-       group_info->nblocks = nblocks;
-       atomic_set(&group_info->usage, 1);
-
-       if (gidsetsize <= NGROUPS_SMALL)
-               group_info->blocks[0] = group_info->small_block;
-       else {
-               for (i = 0; i < nblocks; i++) {
-                       kgid_t *b;
-                       b = (void *)__get_free_page(GFP_USER);
-                       if (!b)
-                               goto out_undo_partial_alloc;
-                       group_info->blocks[i] = b;
-               }
-       }
-       return group_info;
 
-out_undo_partial_alloc:
-       while (--i >= 0) {
-               free_page((unsigned long)group_info->blocks[i]);
-       }
-       kfree(group_info);
-       return NULL;
+       atomic_set(&gi->usage, 1);
+       gi->ngroups = gidsetsize;
+       return gi;
 }
 
 EXPORT_SYMBOL(groups_alloc);
 
 void groups_free(struct group_info *group_info)
 {
-       if (group_info->blocks[0] != group_info->small_block) {
-               int i;
-               for (i = 0; i < group_info->nblocks; i++)
-                       free_page((unsigned long)group_info->blocks[i]);
-       }
-       kfree(group_info);
+       kvfree(group_info);
 }
 
 EXPORT_SYMBOL(groups_free);
@@ -70,7 +46,7 @@ static int groups_to_user(gid_t __user *grouplist,
 
        for (i = 0; i < count; i++) {
                gid_t gid;
-               gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i));
+               gid = from_kgid_munged(user_ns, group_info->gid[i]);
                if (put_user(gid, grouplist+i))
                        return -EFAULT;
        }
@@ -95,7 +71,7 @@ static int groups_from_user(struct group_info *group_info,
                if (!gid_valid(kgid))
                        return -EINVAL;
 
-               GROUP_AT(group_info, i) = kgid;
+               group_info->gid[i] = kgid;
        }
        return 0;
 }
@@ -115,15 +91,14 @@ static void groups_sort(struct group_info *group_info)
                for (base = 0; base < max; base++) {
                        int left = base;
                        int right = left + stride;
-                       kgid_t tmp = GROUP_AT(group_info, right);
+                       kgid_t tmp = group_info->gid[right];
 
-                       while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) {
-                               GROUP_AT(group_info, right) =
-                                   GROUP_AT(group_info, left);
+                       while (left >= 0 && gid_gt(group_info->gid[left], tmp)) {
+                               group_info->gid[right] = group_info->gid[left];
                                right = left;
                                left -= stride;
                        }
-                       GROUP_AT(group_info, right) = tmp;
+                       group_info->gid[right] = tmp;
                }
                stride /= 3;
        }
@@ -141,9 +116,9 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
        right = group_info->ngroups;
        while (left < right) {
                unsigned int mid = (left+right)/2;
-               if (gid_gt(grp, GROUP_AT(group_info, mid)))
+               if (gid_gt(grp, group_info->gid[mid]))
                        left = mid + 1;
-               else if (gid_lt(grp, GROUP_AT(group_info, mid)))
+               else if (gid_lt(grp, group_info->gid[mid]))
                        right = mid;
                else
                        return 1;
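
With the two-level block array gone, supplementary groups live in a single flat gid[] that is sorted once (the Shell sort above) and then binary-searched. A userspace analogue of groups_search() over a plain sorted array (unsigned int stands in for kgid_t; not the kernel code itself):

#include <stdio.h>

/* Binary search over a sorted array of group IDs, same control flow as
 * the patched groups_search(): shrink [left, right) until the probe
 * matches or the interval is empty. */
static int gid_search(const unsigned int *gid, unsigned int ngroups,
                      unsigned int grp)
{
    unsigned int left = 0, right = ngroups;

    while (left < right) {
        unsigned int mid = (left + right) / 2;

        if (grp > gid[mid])
            left = mid + 1;
        else if (grp < gid[mid])
            right = mid;
        else
            return 1;
    }
    return 0;
}

int main(void)
{
    unsigned int groups[] = { 4, 20, 24, 27, 1000 };   /* already sorted */

    printf("%d %d\n", gid_search(groups, 5, 27),
                      gid_search(groups, 5, 28));      /* prints "1 0" */
    return 0;
}
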
index 8f27d5a..2fba066 100644 (file)
@@ -144,23 +144,12 @@ int freeze_processes(void)
        /*
         * Now that the whole userspace is frozen we need to disable
         * the OOM killer to disallow any further interference with
-        * killable tasks.
+        * killable tasks. There is no guarantee oom victims will
+        * ever reach a point they go away, so we have to wait with a timeout.
         */
-       if (!error && !oom_killer_disable())
+       if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs)))
                error = -EBUSY;
 
-       /*
-        * There is a hard to fix race between oom_reaper kernel thread
-        * and oom_killer_disable. oom_reaper calls exit_oom_victim
-        * before the victim reaches exit_mm so try to freeze all the tasks
-        * again and catch such a left over task.
-        */
-       if (!error) {
-               pr_info("Double checking all user space processes after OOM killer disable... ");
-               error = try_to_freeze_tasks(true);
-               pr_cont("\n");
-       }
-
        if (error)
                thaw_processes();
        return error;
index eea6dbc..8019cc0 100644 (file)
@@ -253,6 +253,17 @@ static int preferred_console = -1;
 int console_set_on_cmdline;
 EXPORT_SYMBOL(console_set_on_cmdline);
 
+#ifdef CONFIG_OF
+static bool of_specified_console;
+
+void console_set_by_of(void)
+{
+       of_specified_console = true;
+}
+#else
+# define of_specified_console false
+#endif
+
 /* Flag: console code may call schedule() */
 static int console_may_schedule;
 
@@ -2647,7 +2658,7 @@ void register_console(struct console *newcon)
         *      didn't select a console we take the first one
         *      that registers here.
         */
-       if (preferred_console < 0) {
+       if (preferred_console < 0 && !of_specified_console) {
                if (newcon->index < 0)
                        newcon->index = 0;
                if (newcon->setup == NULL ||
index 9fb873c..1d8718d 100644 (file)
@@ -16,6 +16,9 @@
 
 #include "sched.h"
 
+/* Linker adds these: start and end of __cpuidle functions */
+extern char __cpuidle_text_start[], __cpuidle_text_end[];
+
 /**
  * sched_idle_set_state - Record idle state for the current CPU.
  * @idle_state: State to record.
@@ -53,7 +56,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused)
 __setup("hlt", cpu_idle_nopoll_setup);
 #endif
 
-static inline int cpu_idle_poll(void)
+static noinline int __cpuidle cpu_idle_poll(void)
 {
        rcu_idle_enter();
        trace_cpu_idle_rcuidle(0, smp_processor_id());
@@ -84,7 +87,7 @@ void __weak arch_cpu_idle(void)
  *
  * To use when the cpuidle framework cannot be used.
  */
-void default_idle_call(void)
+void __cpuidle default_idle_call(void)
 {
        if (current_clr_polling_and_test()) {
                local_irq_enable();
@@ -271,6 +274,12 @@ static void cpu_idle_loop(void)
        }
 }
 
+bool cpu_in_idle(unsigned long pc)
+{
+       return pc >= (unsigned long)__cpuidle_text_start &&
+               pc < (unsigned long)__cpuidle_text_end;
+}
+
 void cpu_startup_entry(enum cpuhp_state state)
 {
        /*
index d58cc4d..cc40793 100644 (file)
@@ -117,7 +117,7 @@ static int groups16_to_user(old_gid_t __user *grouplist,
        kgid_t kgid;
 
        for (i = 0; i < group_info->ngroups; i++) {
-               kgid = GROUP_AT(group_info, i);
+               kgid = group_info->gid[i];
                group = high2lowgid(from_kgid_munged(user_ns, kgid));
                if (put_user(group, grouplist+i))
                        return -EFAULT;
@@ -142,7 +142,7 @@ static int groups16_from_user(struct group_info *group_info,
                if (!gid_valid(kgid))
                        return -EINVAL;
 
-               GROUP_AT(group_info, i) = kgid;
+               group_info->gid[i] = kgid;
        }
 
        return 0;
index 942fb80..260a80e 100644 (file)
@@ -457,9 +457,6 @@ config NLATTR
 config GENERIC_ATOMIC64
        bool
 
-config ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
-       def_bool y if GENERIC_ATOMIC64
-
 config LRU_CACHE
        tristate
 
index dbb3691..4604290 100644 (file)
@@ -213,7 +213,6 @@ static __init void test_atomic64(void)
        r += one;
        BUG_ON(v.counter != r);
 
-#ifdef CONFIG_ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
        INIT(onestwos);
        BUG_ON(atomic64_dec_if_positive(&v) != (onestwos - 1));
        r -= one;
@@ -226,9 +225,6 @@ static __init void test_atomic64(void)
        INIT(-one);
        BUG_ON(atomic64_dec_if_positive(&v) != (-one - one));
        BUG_ON(v.counter != r);
-#else
-#warning Please implement atomic64_dec_if_positive for your architecture and select the above Kconfig symbol
-#endif
 
        INIT(onestwos);
        BUG_ON(!atomic64_inc_not_zero(&v));
index 26caf51..7555475 100644 (file)
 #include <linux/delay.h>
 #include <linux/kprobes.h>
 #include <linux/nmi.h>
+#include <linux/cpu.h>
 
-#ifdef arch_trigger_all_cpu_backtrace
+#ifdef arch_trigger_cpumask_backtrace
 /* For reliability, we're prepared to waste bits here. */
 static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
 
-/* "in progress" flag of arch_trigger_all_cpu_backtrace */
+/* "in progress" flag of arch_trigger_cpumask_backtrace */
 static unsigned long backtrace_flag;
 
 /*
- * When raise() is called it will be is passed a pointer to the
+ * When raise() is called it will be passed a pointer to the
  * backtrace_mask. Architectures that call nmi_cpu_backtrace()
  * directly from their raise() functions may rely on the mask
  * they are passed being updated as a side effect of this call.
  */
-void nmi_trigger_all_cpu_backtrace(bool include_self,
+void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
+                                  bool exclude_self,
                                   void (*raise)(cpumask_t *mask))
 {
        int i, this_cpu = get_cpu();
@@ -44,13 +46,22 @@ void nmi_trigger_all_cpu_backtrace(bool include_self,
                return;
        }
 
-       cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
-       if (!include_self)
+       cpumask_copy(to_cpumask(backtrace_mask), mask);
+       if (exclude_self)
                cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
 
+       /*
+        * Don't try to send an NMI to this cpu; it may work on some
+        * architectures, but on others it may not, and we'll get
+        * information at least as useful just by doing a dump_stack() here.
+        * Note that nmi_cpu_backtrace(NULL) will clear the cpu bit.
+        */
+       if (cpumask_test_cpu(this_cpu, to_cpumask(backtrace_mask)))
+               nmi_cpu_backtrace(NULL);
+
        if (!cpumask_empty(to_cpumask(backtrace_mask))) {
-               pr_info("Sending NMI to %s CPUs:\n",
-                       (include_self ? "all" : "other"));
+               pr_info("Sending NMI from CPU %d to CPUs %*pbl:\n",
+                       this_cpu, nr_cpumask_bits, to_cpumask(backtrace_mask));
                raise(to_cpumask(backtrace_mask));
        }
 
@@ -77,11 +88,16 @@ bool nmi_cpu_backtrace(struct pt_regs *regs)
        int cpu = smp_processor_id();
 
        if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
-               pr_warn("NMI backtrace for cpu %d\n", cpu);
-               if (regs)
-                       show_regs(regs);
-               else
-                       dump_stack();
+               if (regs && cpu_in_idle(instruction_pointer(regs))) {
+                       pr_warn("NMI backtrace for cpu %d skipped: idling at pc %#lx\n",
+                               cpu, instruction_pointer(regs));
+               } else {
+                       pr_warn("NMI backtrace for cpu %d\n", cpu);
+                       if (regs)
+                               show_regs(regs);
+                       else
+                               dump_stack();
+               }
                cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
                return true;
        }
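
The generic backtrace helper now takes an arbitrary cpumask, optionally drops the current CPU, and always handles the local CPU with a direct dump_stack() rather than a self-NMI. A toy sketch of that mask handling, with a plain bitmask standing in for cpumask_t (illustrative only, not kernel code):

#include <stdio.h>

#define NR_CPUS 8

/* Dump the local CPU directly; for every other CPU in the mask, pretend
 * to raise an NMI, mirroring the new nmi_trigger_cpumask_backtrace(). */
static void trigger_backtrace(unsigned int mask, int this_cpu, int exclude_self)
{
    int cpu;

    if (exclude_self)
        mask &= ~(1u << this_cpu);

    /* No point in NMI-ing ourselves: a local dump is at least as useful. */
    if (mask & (1u << this_cpu)) {
        printf("cpu %d: local backtrace\n", this_cpu);
        mask &= ~(1u << this_cpu);
    }

    for (cpu = 0; cpu < NR_CPUS; cpu++)
        if (mask & (1u << cpu))
            printf("cpu %d: send NMI\n", cpu);
}

int main(void)
{
    trigger_backtrace(0xffu, 2, 0);   /* all CPUs, include self */
    trigger_backtrace(0xffu, 2, 1);   /* all but self */
    return 0;
}
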
index 0aa7dda..a869f84 100644 (file)
 #include <linux/init.h>
 #include <linux/pfn.h>
 #include <linux/slab.h>
-#include <linux/bootmem.h>
 #include <linux/export.h>
 #include <linux/kmemleak.h>
 #include <linux/range.h>
-#include <linux/memblock.h>
 #include <linux/bug.h>
 #include <linux/io.h>
-
-#include <asm/processor.h>
+#include <linux/bootmem.h>
 
 #include "internal.h"
 
@@ -712,7 +709,7 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
        void *ptr;
 
        if (WARN_ON_ONCE(slab_is_available()))
-               return kzalloc(size, GFP_NOWAIT);
+               return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 again:
 
        /* do not panic in alloc_bootmem_bdata() */
@@ -738,9 +735,6 @@ again:
 void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
                                   unsigned long align, unsigned long goal)
 {
-       if (WARN_ON_ONCE(slab_is_available()))
-               return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
        return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
 }
 
@@ -812,10 +806,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
 
 }
 
-#ifndef ARCH_LOW_ADDRESS_LIMIT
-#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
-#endif
-
 /**
  * __alloc_bootmem_low - allocate low boot memory
  * @size: size of the request in bytes
index 9affb29..0409a4a 100644 (file)
@@ -997,8 +997,12 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
 #ifdef CONFIG_COMPACTION
 
 /* Returns true if the page is within a block suitable for migration to */
-static bool suitable_migration_target(struct page *page)
+static bool suitable_migration_target(struct compact_control *cc,
+                                                       struct page *page)
 {
+       if (cc->ignore_block_suitable)
+               return true;
+
        /* If the page is a large free page, then disallow migration */
        if (PageBuddy(page)) {
                /*
@@ -1083,7 +1087,7 @@ static void isolate_freepages(struct compact_control *cc)
                        continue;
 
                /* Check the block is suitable for migration */
-               if (!suitable_migration_target(page))
+               if (!suitable_migration_target(cc, page))
                        continue;
 
                /* If isolation recently failed, do not retry */
@@ -1316,7 +1320,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
                return COMPACT_CONTINUE;
 
        /* Compaction run is not finished if the watermark is not met */
-       watermark = low_wmark_pages(zone);
+       watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK];
 
        if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
                                                        cc->alloc_flags))
@@ -1329,13 +1333,13 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
 
                /* Job done if page is free of the right migratetype */
                if (!list_empty(&area->free_list[migratetype]))
-                       return COMPACT_PARTIAL;
+                       return COMPACT_SUCCESS;
 
 #ifdef CONFIG_CMA
                /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
                if (migratetype == MIGRATE_MOVABLE &&
                        !list_empty(&area->free_list[MIGRATE_CMA]))
-                       return COMPACT_PARTIAL;
+                       return COMPACT_SUCCESS;
 #endif
                /*
                 * Job done if allocation would steal freepages from
@@ -1343,7 +1347,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
                 */
                if (find_suitable_fallback(area, order, migratetype,
                                                true, &can_steal) != -1)
-                       return COMPACT_PARTIAL;
+                       return COMPACT_SUCCESS;
        }
 
        return COMPACT_NO_SUITABLE_PAGE;
@@ -1367,7 +1371,7 @@ static enum compact_result compact_finished(struct zone *zone,
  * compaction_suitable: Is this suitable to run compaction on this zone now?
  * Returns
  *   COMPACT_SKIPPED  - If there are too few free pages for compaction
- *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
+ *   COMPACT_SUCCESS  - If the allocation would succeed without compaction
  *   COMPACT_CONTINUE - If compaction should run now
  */
 static enum compact_result __compaction_suitable(struct zone *zone, int order,
@@ -1375,46 +1379,41 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
                                        int classzone_idx,
                                        unsigned long wmark_target)
 {
-       int fragindex;
        unsigned long watermark;
 
        if (is_via_compact_memory(order))
                return COMPACT_CONTINUE;
 
-       watermark = low_wmark_pages(zone);
+       watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
        /*
         * If watermarks for high-order allocation are already met, there
         * should be no need for compaction at all.
         */
        if (zone_watermark_ok(zone, order, watermark, classzone_idx,
                                                                alloc_flags))
-               return COMPACT_PARTIAL;
+               return COMPACT_SUCCESS;
 
        /*
-        * Watermarks for order-0 must be met for compaction. Note the 2UL.
-        * This is because during migration, copies of pages need to be
-        * allocated and for a short time, the footprint is higher
+        * Watermarks for order-0 must be met for compaction to be able to
+        * isolate free pages for migration targets. This means that the
+        * watermark and alloc_flags have to match, or be more pessimistic than
+        * the check in __isolate_free_page(). We don't use the direct
+        * compactor's alloc_flags, as they are not relevant for freepage
+        * isolation. We however do use the direct compactor's classzone_idx to
+        * skip over zones where lowmem reserves would prevent allocation even
+        * if compaction succeeds.
+        * For costly orders, we require low watermark instead of min for
+        * compaction to proceed to increase its chances.
+        * ALLOC_CMA is used, as pages in CMA pageblocks are considered
+        * suitable migration targets
         */
-       watermark += (2UL << order);
+       watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
+                               low_wmark_pages(zone) : min_wmark_pages(zone);
+       watermark += compact_gap(order);
        if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
-                                alloc_flags, wmark_target))
+                                               ALLOC_CMA, wmark_target))
                return COMPACT_SKIPPED;
 
-       /*
-        * fragmentation index determines if allocation failures are due to
-        * low memory or external fragmentation
-        *
-        * index of -1000 would imply allocations might succeed depending on
-        * watermarks, but we already failed the high-order watermark check
-        * index towards 0 implies failure is due to lack of memory
-        * index towards 1000 implies failure is due to fragmentation
-        *
-        * Only compact if a failure would be due to fragmentation.
-        */
-       fragindex = fragmentation_index(zone, order);
-       if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
-               return COMPACT_NOT_SUITABLE_ZONE;
-
        return COMPACT_CONTINUE;
 }
 
@@ -1423,9 +1422,32 @@ enum compact_result compaction_suitable(struct zone *zone, int order,
                                        int classzone_idx)
 {
        enum compact_result ret;
+       int fragindex;
 
        ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
                                    zone_page_state(zone, NR_FREE_PAGES));
+       /*
+        * fragmentation index determines if allocation failures are due to
+        * low memory or external fragmentation
+        *
+        * index of -1000 would imply allocations might succeed depending on
+        * watermarks, but we already failed the high-order watermark check
+        * index towards 0 implies failure is due to lack of memory
+        * index towards 1000 implies failure is due to fragmentation
+        *
+        * Only compact if a failure would be due to fragmentation. Also
+        * ignore fragindex for non-costly orders where the alternative to
+        * a successful reclaim/compaction is OOM. Fragindex and the
+        * vm.extfrag_threshold sysctl is meant as a heuristic to prevent
+        * excessive compaction for costly orders, but it should not be at the
+        * expense of system stability.
+        */
+       if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
+               fragindex = fragmentation_index(zone, order);
+               if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+                       ret = COMPACT_NOT_SUITABLE_ZONE;
+       }
+
        trace_mm_compaction_suitable(zone, order, ret);
        if (ret == COMPACT_NOT_SUITABLE_ZONE)
                ret = COMPACT_SKIPPED;
@@ -1458,8 +1480,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
                available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
                compact_result = __compaction_suitable(zone, order, alloc_flags,
                                ac_classzone_idx(ac), available);
-               if (compact_result != COMPACT_SKIPPED &&
-                               compact_result != COMPACT_NOT_SUITABLE_ZONE)
+               if (compact_result != COMPACT_SKIPPED)
                        return true;
        }
 
@@ -1477,7 +1498,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
        ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
                                                        cc->classzone_idx);
        /* Compaction is likely to fail */
-       if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED)
+       if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
                return ret;
 
        /* huh, compaction_suitable is returning something unexpected */
@@ -1492,23 +1513,29 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
 
        /*
         * Setup to move all movable pages to the end of the zone. Used cached
-        * information on where the scanners should start but check that it
-        * is initialised by ensuring the values are within zone boundaries.
+        * information on where the scanners should start (unless we explicitly
+        * want to compact the whole zone), but check that it is initialised
+        * by ensuring the values are within zone boundaries.
         */
-       cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
-       cc->free_pfn = zone->compact_cached_free_pfn;
-       if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
-               cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
-               zone->compact_cached_free_pfn = cc->free_pfn;
-       }
-       if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
+       if (cc->whole_zone) {
                cc->migrate_pfn = start_pfn;
-               zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
-               zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
-       }
+               cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
+       } else {
+               cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
+               cc->free_pfn = zone->compact_cached_free_pfn;
+               if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
+                       cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
+                       zone->compact_cached_free_pfn = cc->free_pfn;
+               }
+               if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
+                       cc->migrate_pfn = start_pfn;
+                       zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+                       zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
+               }
 
-       if (cc->migrate_pfn == start_pfn)
-               cc->whole_zone = true;
+               if (cc->migrate_pfn == start_pfn)
+                       cc->whole_zone = true;
+       }
 
        cc->last_migrated_pfn = 0;
 
@@ -1638,6 +1665,9 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
                .alloc_flags = alloc_flags,
                .classzone_idx = classzone_idx,
                .direct_compaction = true,
+               .whole_zone = (prio == MIN_COMPACT_PRIORITY),
+               .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
+               .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
        };
        INIT_LIST_HEAD(&cc.freepages);
        INIT_LIST_HEAD(&cc.migratepages);
@@ -1683,7 +1713,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
                                                                ac->nodemask) {
                enum compact_result status;
 
-               if (compaction_deferred(zone, order)) {
+               if (prio > MIN_COMPACT_PRIORITY
+                                       && compaction_deferred(zone, order)) {
                        rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
                        continue;
                }
@@ -1692,9 +1723,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
                                        alloc_flags, ac_classzone_idx(ac));
                rc = max(status, rc);
 
-               /* If a normal allocation would succeed, stop compacting */
-               if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
-                                       ac_classzone_idx(ac), alloc_flags)) {
+               /* The allocation should succeed, stop compacting */
+               if (status == COMPACT_SUCCESS) {
                        /*
                         * We think the allocation will succeed in this zone,
                         * but it is not certain, hence the false. The caller
@@ -1730,10 +1760,18 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 
 
 /* Compact all zones within a node */
-static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
+static void compact_node(int nid)
 {
+       pg_data_t *pgdat = NODE_DATA(nid);
        int zoneid;
        struct zone *zone;
+       struct compact_control cc = {
+               .order = -1,
+               .mode = MIGRATE_SYNC,
+               .ignore_skip_hint = true,
+               .whole_zone = true,
+       };
+
 
        for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
 
@@ -1741,60 +1779,19 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
                if (!populated_zone(zone))
                        continue;
 
-               cc->nr_freepages = 0;
-               cc->nr_migratepages = 0;
-               cc->zone = zone;
-               INIT_LIST_HEAD(&cc->freepages);
-               INIT_LIST_HEAD(&cc->migratepages);
-
-               /*
-                * When called via /proc/sys/vm/compact_memory
-                * this makes sure we compact the whole zone regardless of
-                * cached scanner positions.
-                */
-               if (is_via_compact_memory(cc->order))
-                       __reset_isolation_suitable(zone);
-
-               if (is_via_compact_memory(cc->order) ||
-                               !compaction_deferred(zone, cc->order))
-                       compact_zone(zone, cc);
-
-               VM_BUG_ON(!list_empty(&cc->freepages));
-               VM_BUG_ON(!list_empty(&cc->migratepages));
+               cc.nr_freepages = 0;
+               cc.nr_migratepages = 0;
+               cc.zone = zone;
+               INIT_LIST_HEAD(&cc.freepages);
+               INIT_LIST_HEAD(&cc.migratepages);
 
-               if (is_via_compact_memory(cc->order))
-                       continue;
+               compact_zone(zone, &cc);
 
-               if (zone_watermark_ok(zone, cc->order,
-                               low_wmark_pages(zone), 0, 0))
-                       compaction_defer_reset(zone, cc->order, false);
+               VM_BUG_ON(!list_empty(&cc.freepages));
+               VM_BUG_ON(!list_empty(&cc.migratepages));
        }
 }
 
-void compact_pgdat(pg_data_t *pgdat, int order)
-{
-       struct compact_control cc = {
-               .order = order,
-               .mode = MIGRATE_ASYNC,
-       };
-
-       if (!order)
-               return;
-
-       __compact_pgdat(pgdat, &cc);
-}
-
-static void compact_node(int nid)
-{
-       struct compact_control cc = {
-               .order = -1,
-               .mode = MIGRATE_SYNC,
-               .ignore_skip_hint = true,
-       };
-
-       __compact_pgdat(NODE_DATA(nid), &cc);
-}
-
 /* Compact all nodes in the system */
 static void compact_nodes(void)
 {
@@ -1900,8 +1897,6 @@ static void kcompactd_do_work(pg_data_t *pgdat)
                .ignore_skip_hint = true,
 
        };
-       bool success = false;
-
        trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
                                                        cc.classzone_idx);
        count_vm_event(KCOMPACTD_WAKE);
@@ -1930,9 +1925,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
                        return;
                status = compact_zone(zone, &cc);
 
-               if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
-                                               cc.classzone_idx, 0)) {
-                       success = true;
+               if (status == COMPACT_SUCCESS) {
                        compaction_defer_reset(zone, cc.order, false);
                } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
                        /*
index 74c7cae..9feb699 100644 (file)
@@ -42,6 +42,11 @@ const struct trace_print_flags vmaflag_names[] = {
 
 void __dump_page(struct page *page, const char *reason)
 {
+       /*
+        * Avoid VM_BUG_ON() in page_mapcount().
+        * page->_mapcount space in struct page is used by sl[aou]b pages to
+        * encode own info.
+        */
        int mapcount = PageSlab(page) ? 0 : page_mapcount(page);
 
        pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
index 68f1813..2f7b778 100644 (file)
@@ -1687,6 +1687,10 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
        unsigned int prev_offset;
        int error = 0;
 
+       if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
+               return -EINVAL;
+       iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
+
        index = *ppos >> PAGE_SHIFT;
        prev_index = ra->prev_pos >> PAGE_SHIFT;
        prev_offset = ra->prev_pos & (PAGE_SIZE-1);
@@ -1721,7 +1725,9 @@ find_page:
                         * wait_on_page_locked is used to avoid unnecessarily
                         * serialisations and why it's safe.
                         */
-                       wait_on_page_locked_killable(page);
+                       error = wait_on_page_locked_killable(page);
+                       if (unlikely(error))
+                               goto readpage_error;
                        if (PageUptodate(page))
                                goto page_ok;
 
index 283583f..cdcd25c 100644 (file)
@@ -59,7 +59,7 @@ static struct shrinker deferred_split_shrinker;
 static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 
-struct page *get_huge_zero_page(void)
+static struct page *get_huge_zero_page(void)
 {
        struct page *zero_page;
 retry:
@@ -86,7 +86,7 @@ retry:
        return READ_ONCE(huge_zero_page);
 }
 
-void put_huge_zero_page(void)
+static void put_huge_zero_page(void)
 {
        /*
         * Counter should never go to zero here. Only shrinker can put
@@ -95,6 +95,26 @@ void put_huge_zero_page(void)
        BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
+struct page *mm_get_huge_zero_page(struct mm_struct *mm)
+{
+       if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+               return READ_ONCE(huge_zero_page);
+
+       if (!get_huge_zero_page())
+               return NULL;
+
+       if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+               put_huge_zero_page();
+
+       return READ_ONCE(huge_zero_page);
+}
+
+void mm_put_huge_zero_page(struct mm_struct *mm)
+{
+       if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+               put_huge_zero_page();
+}
+
 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
                                        struct shrink_control *sc)
 {
@@ -469,6 +489,49 @@ void prep_transhuge_page(struct page *page)
        set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
 }
 
+unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
+               loff_t off, unsigned long flags, unsigned long size)
+{
+       unsigned long addr;
+       loff_t off_end = off + len;
+       loff_t off_align = round_up(off, size);
+       unsigned long len_pad;
+
+       if (off_end <= off_align || (off_end - off_align) < size)
+               return 0;
+
+       len_pad = len + size;
+       if (len_pad < len || (off + len_pad) < off)
+               return 0;
+
+       addr = current->mm->get_unmapped_area(filp, 0, len_pad,
+                                             off >> PAGE_SHIFT, flags);
+       if (IS_ERR_VALUE(addr))
+               return 0;
+
+       addr += (off - addr) & (size - 1);
+       return addr;
+}
+
+unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
+               unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+       loff_t off = (loff_t)pgoff << PAGE_SHIFT;
+
+       if (addr)
+               goto out;
+       if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
+               goto out;
+
+       addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
+       if (addr)
+               return addr;
+
+ out:
+       return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
+
 static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
                gfp_t gfp)
 {
@@ -601,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
                pgtable = pte_alloc_one(vma->vm_mm, haddr);
                if (unlikely(!pgtable))
                        return VM_FAULT_OOM;
-               zero_page = get_huge_zero_page();
+               zero_page = mm_get_huge_zero_page(vma->vm_mm);
                if (unlikely(!zero_page)) {
                        pte_free(vma->vm_mm, pgtable);
                        count_vm_event(THP_FAULT_FALLBACK);
@@ -623,10 +686,8 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
                        }
                } else
                        spin_unlock(fe->ptl);
-               if (!set) {
+               if (!set)
                        pte_free(vma->vm_mm, pgtable);
-                       put_huge_zero_page();
-               }
                return ret;
        }
        gfp = alloc_hugepage_direct_gfpmask(vma);
@@ -780,7 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                 * since we already have a zero page to copy. It just takes a
                 * reference.
                 */
-               zero_page = get_huge_zero_page();
+               zero_page = mm_get_huge_zero_page(dst_mm);
                set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
                                zero_page);
                ret = 0;
@@ -1038,7 +1099,6 @@ alloc:
                update_mmu_cache_pmd(vma, fe->address, fe->pmd);
                if (!page) {
                        add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-                       put_huge_zero_page();
                } else {
                        VM_BUG_ON_PAGE(!PageHead(page), page);
                        page_remove_rmap(page, true);
@@ -1499,7 +1559,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
        }
        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
-       put_huge_zero_page();
 }
 
 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
@@ -1522,8 +1581,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
        if (!vma_is_anonymous(vma)) {
                _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
-               if (is_huge_zero_pmd(_pmd))
-                       put_huge_zero_page();
                if (vma_is_dax(vma))
                        return;
                page = pmd_page(_pmd);
@@ -1563,7 +1620,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                        if (soft_dirty)
                                entry = pte_swp_mksoft_dirty(entry);
                } else {
-                       entry = mk_pte(page + i, vma->vm_page_prot);
+                       entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
                        entry = maybe_mkwrite(entry, vma);
                        if (!write)
                                entry = pte_wrprotect(entry);
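A note on the alignment arithmetic in the __thp_get_unmapped_area() hunk above: the requested length is padded by one PMD-sized unit, and the address returned by get_unmapped_area() is then nudged so that it is congruent to the file offset modulo the huge-page size, which is what allows the DAX fault path to install PMD mappings. The following is a minimal userspace sketch of just that arithmetic; PMD_SZ, thp_align() and the sample values are assumptions of the sketch, not kernel constants or APIs.

#include <assert.h>
#include <stdio.h>

#define PMD_SZ (2UL * 1024 * 1024)	/* assume 2 MiB huge pages */

/* same arithmetic as "addr += (off - addr) & (size - 1)" above */
static unsigned long thp_align(unsigned long addr, unsigned long off)
{
	return addr + ((off - addr) & (PMD_SZ - 1));
}

int main(void)
{
	unsigned long base = 0x7f0000001000UL;	  /* pretend mmap result */
	unsigned long off  = 5 * PMD_SZ + 0x3000; /* pretend file offset */
	unsigned long addr = thp_align(base, off);

	/* the address now shares the file offset's position within a PMD */
	assert((addr & (PMD_SZ - 1)) == (off & (PMD_SZ - 1)));
	/* and it still lies inside the one-PMD slack added to the length */
	assert(addr >= base && addr < base + PMD_SZ);
	printf("base=%#lx off=%#lx aligned=%#lx\n", base, off, addr);
	return 0;
}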
index 87e11d8..ec49d9e 100644 (file)
@@ -567,13 +567,13 @@ retry:
  * appear as a "reserved" entry instead of simply dangling with incorrect
  * counts.
  */
-void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
+void hugetlb_fix_reserve_counts(struct inode *inode)
 {
        struct hugepage_subpool *spool = subpool_inode(inode);
        long rsv_adjust;
 
        rsv_adjust = hugepage_subpool_get_pages(spool, 1);
-       if (restore_reserve && rsv_adjust) {
+       if (rsv_adjust) {
                struct hstate *h = hstate_inode(inode);
 
                hugetlb_acct_memory(h, 1);
@@ -1022,7 +1022,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
                ((node = hstate_next_node_to_free(hs, mask)) || 1);     \
                nr_nodes--)
 
-#if (defined(CONFIG_X86_64) || defined(CONFIG_S390)) && \
+#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \
        ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \
        defined(CONFIG_CMA))
 static void destroy_compound_gigantic_page(struct page *page,
@@ -1437,38 +1437,61 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
 
 /*
  * Dissolve a given free hugepage into free buddy pages. This function does
- * nothing for in-use (including surplus) hugepages.
+ * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the
+ * number of free hugepages would be reduced below the number of reserved
+ * hugepages.
  */
-static void dissolve_free_huge_page(struct page *page)
+static int dissolve_free_huge_page(struct page *page)
 {
+       int rc = 0;
+
        spin_lock(&hugetlb_lock);
        if (PageHuge(page) && !page_count(page)) {
-               struct hstate *h = page_hstate(page);
-               int nid = page_to_nid(page);
-               list_del(&page->lru);
+               struct page *head = compound_head(page);
+               struct hstate *h = page_hstate(head);
+               int nid = page_to_nid(head);
+               if (h->free_huge_pages - h->resv_huge_pages == 0) {
+                       rc = -EBUSY;
+                       goto out;
+               }
+               list_del(&head->lru);
                h->free_huge_pages--;
                h->free_huge_pages_node[nid]--;
                h->max_huge_pages--;
-               update_and_free_page(h, page);
+               update_and_free_page(h, head);
        }
+out:
        spin_unlock(&hugetlb_lock);
+       return rc;
 }
 
 /*
  * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
  * make specified memory blocks removable from the system.
- * Note that start_pfn should aligned with (minimum) hugepage size.
+ * Note that this will dissolve a free gigantic hugepage completely, if any
+ * part of it lies within the given range.
+ * Also note that if dissolve_free_huge_page() returns with an error, all
+ * free hugepages that were dissolved before that error are lost.
  */
-void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
+int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
 {
        unsigned long pfn;
+       struct page *page;
+       int rc = 0;
 
        if (!hugepages_supported())
-               return;
+               return rc;
+
+       for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
+               page = pfn_to_page(pfn);
+               if (PageHuge(page) && !page_count(page)) {
+                       rc = dissolve_free_huge_page(page);
+                       if (rc)
+                               break;
+               }
+       }
 
-       VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
-       for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
-               dissolve_free_huge_page(pfn_to_page(pfn));
+       return rc;
 }
 
 /*
index 1501304..537ac99 100644 (file)
@@ -178,8 +178,9 @@ struct compact_control {
        unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
        enum migrate_mode mode;         /* Async or sync migration mode */
        bool ignore_skip_hint;          /* Scan blocks even if marked skip */
+       bool ignore_block_suitable;     /* Scan blocks considered unsuitable */
        bool direct_compaction;         /* False from kcompactd or /proc/... */
-       bool whole_zone;                /* Whole zone has been scanned */
+       bool whole_zone;                /* Whole zone should/has been scanned */
        int order;                      /* order a direct compactor needs */
        const gfp_t gfp_mask;           /* gfp mask of a direct compactor */
        const unsigned int alloc_flags; /* alloc flags of a direct compactor */
index 5048083..9ae6011 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -299,7 +299,12 @@ static inline void free_rmap_item(struct rmap_item *rmap_item)
 
 static inline struct stable_node *alloc_stable_node(void)
 {
-       return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
+       /*
+        * The allocation can take too long with GFP_KERNEL when memory is under
+        * pressure, which may lead to hung task warnings.  Adding __GFP_HIGH
+        * grants access to memory reserves, helping to avoid this problem.
+        */
+       return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
 }
 
 static inline void free_stable_node(struct stable_node *stable_node)
index 483197e..c8dfa43 100644 (file)
@@ -1438,6 +1438,11 @@ phys_addr_t __init_memblock memblock_phys_mem_size(void)
        return memblock.memory.total_size;
 }
 
+phys_addr_t __init_memblock memblock_reserved_size(void)
+{
+       return memblock.reserved.total_size;
+}
+
 phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
 {
        unsigned long pages = 0;
index 4be518d..ae052b5 100644 (file)
@@ -920,6 +920,43 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
             iter != NULL;                              \
             iter = mem_cgroup_iter(NULL, iter, NULL))
 
+/**
+ * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
+ * @memcg: hierarchy root
+ * @fn: function to call for each task
+ * @arg: argument passed to @fn
+ *
+ * This function iterates over tasks attached to @memcg or to any of its
+ * descendants and calls @fn for each task. If @fn returns a non-zero
+ * value, the function breaks the iteration loop and returns the value.
+ * Otherwise, it will iterate over all tasks and return 0.
+ *
+ * This function must not be called for the root memory cgroup.
+ */
+int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+                         int (*fn)(struct task_struct *, void *), void *arg)
+{
+       struct mem_cgroup *iter;
+       int ret = 0;
+
+       BUG_ON(memcg == root_mem_cgroup);
+
+       for_each_mem_cgroup_tree(iter, memcg) {
+               struct css_task_iter it;
+               struct task_struct *task;
+
+               css_task_iter_start(&iter->css, &it);
+               while (!ret && (task = css_task_iter_next(&it)))
+                       ret = fn(task, arg);
+               css_task_iter_end(&it);
+               if (ret) {
+                       mem_cgroup_iter_break(memcg, iter);
+                       break;
+               }
+       }
+       return ret;
+}
+
 /**
  * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
  * @page: the page
@@ -1178,7 +1215,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
 /*
  * Return the memory (and swap, if configured) limit for a memcg.
  */
-static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
+unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
 {
        unsigned long limit;
 
@@ -1205,79 +1242,12 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                .gfp_mask = gfp_mask,
                .order = order,
        };
-       struct mem_cgroup *iter;
-       unsigned long chosen_points = 0;
-       unsigned long totalpages;
-       unsigned int points = 0;
-       struct task_struct *chosen = NULL;
+       bool ret;
 
        mutex_lock(&oom_lock);
-
-       /*
-        * If current has a pending SIGKILL or is exiting, then automatically
-        * select it.  The goal is to allow it to allocate so that it may
-        * quickly exit and free its memory.
-        */
-       if (task_will_free_mem(current)) {
-               mark_oom_victim(current);
-               wake_oom_reaper(current);
-               goto unlock;
-       }
-
-       check_panic_on_oom(&oc, CONSTRAINT_MEMCG);
-       totalpages = mem_cgroup_get_limit(memcg) ? : 1;
-       for_each_mem_cgroup_tree(iter, memcg) {
-               struct css_task_iter it;
-               struct task_struct *task;
-
-               css_task_iter_start(&iter->css, &it);
-               while ((task = css_task_iter_next(&it))) {
-                       switch (oom_scan_process_thread(&oc, task)) {
-                       case OOM_SCAN_SELECT:
-                               if (chosen)
-                                       put_task_struct(chosen);
-                               chosen = task;
-                               chosen_points = ULONG_MAX;
-                               get_task_struct(chosen);
-                               /* fall through */
-                       case OOM_SCAN_CONTINUE:
-                               continue;
-                       case OOM_SCAN_ABORT:
-                               css_task_iter_end(&it);
-                               mem_cgroup_iter_break(memcg, iter);
-                               if (chosen)
-                                       put_task_struct(chosen);
-                               /* Set a dummy value to return "true". */
-                               chosen = (void *) 1;
-                               goto unlock;
-                       case OOM_SCAN_OK:
-                               break;
-                       };
-                       points = oom_badness(task, memcg, NULL, totalpages);
-                       if (!points || points < chosen_points)
-                               continue;
-                       /* Prefer thread group leaders for display purposes */
-                       if (points == chosen_points &&
-                           thread_group_leader(chosen))
-                               continue;
-
-                       if (chosen)
-                               put_task_struct(chosen);
-                       chosen = task;
-                       chosen_points = points;
-                       get_task_struct(chosen);
-               }
-               css_task_iter_end(&it);
-       }
-
-       if (chosen) {
-               points = chosen_points * 1000 / totalpages;
-               oom_kill_process(&oc, chosen, points, totalpages,
-                                "Memory cgroup out of memory");
-       }
-unlock:
+       ret = out_of_memory(&oc);
        mutex_unlock(&oom_lock);
-       return chosen;
+       return ret;
 }
 
 #if MAX_NUMNODES > 1
@@ -1600,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
        if (!memcg)
                return false;
 
-       if (!handle || oom_killer_disabled)
+       if (!handle)
                goto cleanup;
 
        owait.memcg = memcg;
@@ -2969,16 +2939,16 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
                /*
                 * The active flag needs to be written after the static_key
                 * update. This is what guarantees that the socket activation
-                * function is the last one to run. See sock_update_memcg() for
-                * details, and note that we don't mark any socket as belonging
-                * to this memcg until that flag is up.
+                * function is the last one to run. See mem_cgroup_sk_alloc()
+                * for details, and note that we don't mark any socket as
+                * belonging to this memcg until that flag is up.
                 *
                 * We need to do this, because static_keys will span multiple
                 * sites, but we can't control their order. If we mark a socket
                 * as accounted, but the accounting functions are not patched in
                 * yet, we'll lose accounting.
                 *
-                * We never race with the readers in sock_update_memcg(),
+                * We never race with the readers in mem_cgroup_sk_alloc(),
                 * because when this value change, the code to process it is not
                 * patched in yet.
                 */
@@ -4092,11 +4062,13 @@ static DEFINE_IDR(mem_cgroup_idr);
 
 static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
 {
+       VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
        atomic_add(n, &memcg->id.ref);
 }
 
 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
 {
+       VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
        if (atomic_sub_and_test(n, &memcg->id.ref)) {
                idr_remove(&mem_cgroup_idr, memcg->id.id);
                memcg->id.id = 0;
@@ -4285,8 +4257,10 @@ fail:
 
 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 {
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
        /* Online state pins memcg ID, memcg ID pins CSS */
-       mem_cgroup_id_get(mem_cgroup_from_css(css));
+       atomic_set(&memcg->id.ref, 1);
        css_get(css);
        return 0;
 }
@@ -4434,7 +4408,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
         * Because lookup_swap_cache() updates some statistics counter,
         * we call find_get_page() with swapper_space directly.
         */
-       page = find_get_page(swap_address_space(ent), ent.val);
+       page = find_get_page(swap_address_space(ent), swp_offset(ent));
        if (do_memsw_account())
                entry->val = ent.val;
 
@@ -4472,7 +4446,8 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
                        swp_entry_t swp = radix_to_swp_entry(page);
                        if (do_memsw_account())
                                *entry = swp;
-                       page = find_get_page(swap_address_space(swp), swp.val);
+                       page = find_get_page(swap_address_space(swp),
+                                            swp_offset(swp));
                }
        } else
                page = find_get_page(mapping, pgoff);
@@ -4707,7 +4682,8 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
                .mm = mm,
        };
        down_read(&mm->mmap_sem);
-       walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
+       walk_page_range(0, mm->highest_vm_end,
+                       &mem_cgroup_count_precharge_walk);
        up_read(&mm->mmap_sem);
 
        precharge = mc.precharge;
@@ -4995,7 +4971,8 @@ retry:
         * When we have consumed all precharges and failed in doing
         * additional charge, the page walk just aborts.
         */
-       walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
+       walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
+
        up_read(&mc.mm->mmap_sem);
        atomic_dec(&mc.from->moving_account);
 }
@@ -5674,11 +5651,15 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
 EXPORT_SYMBOL(memcg_sockets_enabled_key);
 
-void sock_update_memcg(struct sock *sk)
+void mem_cgroup_sk_alloc(struct sock *sk)
 {
        struct mem_cgroup *memcg;
 
-       /* Socket cloning can throw us here with sk_cgrp already
+       if (!mem_cgroup_sockets_enabled)
+               return;
+
+       /*
+        * Socket cloning can throw us here with sk_memcg already
         * filled. It won't however, necessarily happen from
         * process context. So the test for root memcg given
         * the current task's memcg won't help us in this case.
@@ -5703,12 +5684,11 @@ void sock_update_memcg(struct sock *sk)
 out:
        rcu_read_unlock();
 }
-EXPORT_SYMBOL(sock_update_memcg);
 
-void sock_release_memcg(struct sock *sk)
+void mem_cgroup_sk_free(struct sock *sk)
 {
-       WARN_ON(!sk->sk_memcg);
-       css_put(&sk->sk_memcg->css);
+       if (sk->sk_memcg)
+               css_put(&sk->sk_memcg->css);
 }
 
 /**
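The mem_cgroup_scan_tasks() helper added above hands each task in the hierarchy to a visitor callback and stops as soon as the callback returns a non-zero value, propagating that value to the caller; the memcg OOM path relies on this to abort the scan. A small userspace model of that contract, with purely illustrative struct and function names, might look like this:

#include <stdio.h>
#include <string.h>

struct fake_task { const char *comm; int badness; };

/* stop and propagate as soon as the visitor returns non-zero */
static int scan_tasks(struct fake_task *tasks, int n,
		      int (*fn)(struct fake_task *, void *), void *arg)
{
	int ret = 0;

	for (int i = 0; i < n && !ret; i++)
		ret = fn(&tasks[i], arg);
	return ret;
}

/* pick the task with the highest badness; abort on an unkillable one */
static int evaluate(struct fake_task *t, void *arg)
{
	struct fake_task **chosen = arg;

	if (!strcmp(t->comm, "init"))
		return -1;		/* abort the whole scan */
	if (!*chosen || t->badness > (*chosen)->badness)
		*chosen = t;
	return 0;			/* keep scanning */
}

int main(void)
{
	struct fake_task tasks[] = {
		{ "worker", 10 }, { "hog", 900 }, { "shell", 40 },
	};
	struct fake_task *chosen = NULL;

	if (!scan_tasks(tasks, 3, evaluate, &chosen) && chosen)
		printf("would select %s\n", chosen->comm);
	return 0;
}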
index f1a6804..fc1987d 100644 (file)
@@ -1649,10 +1649,14 @@ EXPORT_SYMBOL(vm_insert_pfn_prot);
 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn)
 {
+       pgprot_t pgprot = vma->vm_page_prot;
+
        BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
 
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
+       if (track_pfn_insert(vma, &pgprot, pfn))
+               return -EINVAL;
 
        /*
         * If we don't have pte special, then we have to use the pfn_valid()
@@ -1670,9 +1674,9 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                 * result in pfn_t_has_page() == false.
                 */
                page = pfn_to_page(pfn_t_to_pfn(pfn));
-               return insert_page(vma, addr, page, vma->vm_page_prot);
+               return insert_page(vma, addr, page, pgprot);
        }
-       return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+       return insert_pfn(vma, addr, pfn, pgprot);
 }
 EXPORT_SYMBOL(vm_insert_mixed);
 
@@ -3658,6 +3662,19 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                         mem_cgroup_oom_synchronize(false);
        }
 
+       /*
+        * This mm has already been reaped by the oom reaper, so the
+        * refault cannot be trusted in general. An anonymous refault,
+        * for example, would lose data and return a zero page instead.
+        * This is especially a problem for use_mm(): a regular task
+        * would just die and the corrupted data would not be visible
+        * anywhere, while a kthread will outlive the oom victim and may
+        * propagate the data further.
+        */
+       if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
+                               && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
+               ret = VM_FAULT_SIGBUS;
+
        return ret;
 }
 EXPORT_SYMBOL_GPL(handle_mm_fault);
index 9d29ba0..9629273 100644 (file)
@@ -1945,7 +1945,9 @@ repeat:
         * dissolve free hugepages in the memory block before doing offlining
         * actually in order to make hugetlbfs's object counting consistent.
         */
-       dissolve_free_huge_pages(start_pfn, end_pfn);
+       ret = dissolve_free_huge_pages(start_pfn, end_pfn);
+       if (ret)
+               goto failed_removal;
        /* check again */
        offlined_pages = check_pages_isolated(start_pfn, end_pfn);
        if (offlined_pages < 0) {
index 2da72a5..ad1c96a 100644 (file)
@@ -1749,7 +1749,7 @@ unsigned int mempolicy_slab_node(void)
                 */
                struct zonelist *zonelist;
                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
-               zonelist = &NODE_DATA(node)->node_zonelists[0];
+               zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
                z = first_zones_zonelist(zonelist, highest_zoneidx,
                                                        &policy->v.nodes);
                return z->zone ? z->zone->node : node;
index f7ee04a..99250ae 100644 (file)
@@ -234,7 +234,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
                goto unlock;
 
        get_page(new);
-       pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
+       pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
        if (pte_swp_soft_dirty(*ptep))
                pte = pte_mksoft_dirty(pte);
 
index c0b5ba9..bfb8664 100644 (file)
@@ -66,7 +66,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
                 */
                if (radix_tree_exceptional_entry(page)) {
                        swp_entry_t swp = radix_to_swp_entry(page);
-                       page = find_get_page(swap_address_space(swp), swp.val);
+                       page = find_get_page(swap_address_space(swp),
+                                            swp_offset(swp));
                }
        } else
                page = find_get_page(mapping, pgoff);
@@ -150,7 +151,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                        } else {
 #ifdef CONFIG_SWAP
                                *vec = mincore_page(swap_address_space(entry),
-                                       entry.val);
+                                                   swp_offset(entry));
 #else
                                WARN_ON(1);
                                *vec = 1;
index 14645be..145a425 100644 (file)
@@ -516,6 +516,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
        int nr_pages;
        int ret = 0;
        int lock = !!(newflags & VM_LOCKED);
+       vm_flags_t old_flags = vma->vm_flags;
 
        if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
            is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
@@ -550,6 +551,8 @@ success:
        nr_pages = (end - start) >> PAGE_SHIFT;
        if (!lock)
                nr_pages = -nr_pages;
+       else if (old_flags & VM_LOCKED)
+               nr_pages = 0;
        mm->locked_vm += nr_pages;
 
        /*
@@ -617,6 +620,45 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
        return error;
 }
 
+/*
+ * Walk the vmas that overlap the given range and return the number of
+ * pages that are already mlocked there.
+ * Note that the deferred memory locking case (mlock2() with
+ * MLOCK_ONFAULT) is also counted.
+ */
+static int count_mm_mlocked_page_nr(struct mm_struct *mm,
+               unsigned long start, size_t len)
+{
+       struct vm_area_struct *vma;
+       int count = 0;
+
+       if (mm == NULL)
+               mm = current->mm;
+
+       vma = find_vma(mm, start);
+       if (vma == NULL)
+               vma = mm->mmap;
+
+       for (; vma ; vma = vma->vm_next) {
+               if (start >= vma->vm_end)
+                       continue;
+               if (start + len <=  vma->vm_start)
+                       break;
+               if (vma->vm_flags & VM_LOCKED) {
+                       if (start > vma->vm_start)
+                               count -= (start - vma->vm_start);
+                       if (start + len < vma->vm_end) {
+                               count += start + len - vma->vm_start;
+                               break;
+                       }
+                       count += vma->vm_end - vma->vm_start;
+               }
+       }
+
+       return count >> PAGE_SHIFT;
+}
+
 static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
 {
        unsigned long locked;
@@ -639,6 +681,16 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
                return -EINTR;
 
        locked += current->mm->locked_vm;
+       if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
+               /*
+                * The requested region may intersect areas that are already
+                * mlocked and hence already accounted in "mm->locked_vm".
+                * That part must not be counted towards the new mlock
+                * increment, so check and adjust the locked count here.
+                */
+               locked -= count_mm_mlocked_page_nr(current->mm,
+                               start, len);
+       }
 
        /* check against resource limits */
        if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
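The point of count_mm_mlocked_page_nr() in the hunk above is that pages already covered by VM_LOCKED vmas are already charged to mm->locked_vm and must not be charged against RLIMIT_MEMLOCK a second time. Below is a minimal sketch of the equivalent interval-intersection computation, using an illustrative region array in place of the sorted vma list; none of these names are kernel APIs.

#include <stdio.h>

struct region { unsigned long start, end; int locked; };

/* bytes of [start, start+len) already covered by locked regions */
static unsigned long already_locked(const struct region *r, int n,
				    unsigned long start, unsigned long len)
{
	unsigned long end = start + len, sum = 0;

	for (int i = 0; i < n; i++) {
		unsigned long lo = r[i].start > start ? r[i].start : start;
		unsigned long hi = r[i].end < end ? r[i].end : end;

		if (r[i].locked && lo < hi)
			sum += hi - lo;
	}
	return sum;		/* the kernel shifts this by PAGE_SHIFT */
}

int main(void)
{
	struct region vmas[] = {
		{ 0x1000, 0x5000, 1 },	/* already mlocked */
		{ 0x5000, 0x9000, 0 },
	};

	/* request overlaps 0x3000..0x5000 of the locked region */
	printf("%lu bytes already locked\n",
	       already_locked(vmas, 2, 0x3000, 0x4000));
	return 0;
}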
index 7a0707a..1af87c1 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -116,13 +116,15 @@ static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
 void vma_set_page_prot(struct vm_area_struct *vma)
 {
        unsigned long vm_flags = vma->vm_flags;
+       pgprot_t vm_page_prot;
 
-       vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
-       if (vma_wants_writenotify(vma)) {
+       vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
+       if (vma_wants_writenotify(vma, vm_page_prot)) {
                vm_flags &= ~VM_SHARED;
-               vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot,
-                                                    vm_flags);
+               vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
        }
+       /* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */
+       WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
 }
 
 /*
@@ -400,14 +402,8 @@ static inline void vma_rb_insert(struct vm_area_struct *vma,
        rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
 }
 
-static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
+static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
 {
-       /*
-        * All rb_subtree_gap values must be consistent prior to erase,
-        * with the possible exception of the vma being erased.
-        */
-       validate_mm_rb(root, vma);
-
        /*
         * Note rb_erase_augmented is a fairly large inline function,
         * so make sure we instantiate it only once with our desired
@@ -416,6 +412,32 @@ static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
        rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
 }
 
+static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
+                                               struct rb_root *root,
+                                               struct vm_area_struct *ignore)
+{
+       /*
+        * All rb_subtree_gap values must be consistent prior to erase,
+        * with the possible exception of the "next" vma being erased if
+        * next->vm_start was reduced.
+        */
+       validate_mm_rb(root, ignore);
+
+       __vma_rb_erase(vma, root);
+}
+
+static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
+                                        struct rb_root *root)
+{
+       /*
+        * All rb_subtree_gap values must be consistent prior to erase,
+        * with the possible exception of the vma being erased.
+        */
+       validate_mm_rb(root, vma);
+
+       __vma_rb_erase(vma, root);
+}
+
 /*
  * vma has some anon_vma assigned, and is already inserted on that
  * anon_vma's interval trees.
@@ -599,14 +621,25 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
        mm->map_count++;
 }
 
-static inline void
-__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
-               struct vm_area_struct *prev)
+static __always_inline void __vma_unlink_common(struct mm_struct *mm,
+                                               struct vm_area_struct *vma,
+                                               struct vm_area_struct *prev,
+                                               bool has_prev,
+                                               struct vm_area_struct *ignore)
 {
        struct vm_area_struct *next;
 
-       vma_rb_erase(vma, &mm->mm_rb);
-       prev->vm_next = next = vma->vm_next;
+       vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
+       next = vma->vm_next;
+       if (has_prev)
+               prev->vm_next = next;
+       else {
+               prev = vma->vm_prev;
+               if (prev)
+                       prev->vm_next = next;
+               else
+                       mm->mmap = next;
+       }
        if (next)
                next->vm_prev = prev;
 
@@ -614,6 +647,13 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
        vmacache_invalidate(mm);
 }
 
+static inline void __vma_unlink_prev(struct mm_struct *mm,
+                                    struct vm_area_struct *vma,
+                                    struct vm_area_struct *prev)
+{
+       __vma_unlink_common(mm, vma, prev, true, vma);
+}
+
 /*
  * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
  * is already present in an i_mmap tree without adjusting the tree.
@@ -621,11 +661,12 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
  * are necessary.  The "insert" vma (if any) is to be inserted
  * before we drop the necessary locks.
  */
-int vma_adjust(struct vm_area_struct *vma, unsigned long start,
-       unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
+int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
+       unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
+       struct vm_area_struct *expand)
 {
        struct mm_struct *mm = vma->vm_mm;
-       struct vm_area_struct *next = vma->vm_next;
+       struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
        struct address_space *mapping = NULL;
        struct rb_root *root = NULL;
        struct anon_vma *anon_vma = NULL;
@@ -641,9 +682,38 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
                        /*
                         * vma expands, overlapping all the next, and
                         * perhaps the one after too (mprotect case 6).
+                        * The only other cases that gets here are
+                        * case 1, case 7 and case 8.
                         */
-                       remove_next = 1 + (end > next->vm_end);
-                       end = next->vm_end;
+                       if (next == expand) {
+                               /*
+                                * The only case where we don't expand "vma"
+                                * and we expand "next" instead is case 8.
+                                */
+                               VM_WARN_ON(end != next->vm_end);
+                               /*
+                                * remove_next == 3 means we're
+                                * removing "vma" and that to do so we
+                                * swapped "vma" and "next".
+                                */
+                               remove_next = 3;
+                               VM_WARN_ON(file != next->vm_file);
+                               swap(vma, next);
+                       } else {
+                               VM_WARN_ON(expand != vma);
+                               /*
+                                * case 1, 6, 7, remove_next == 2 is case 6,
+                                * remove_next == 1 is case 1 or 7.
+                                */
+                               remove_next = 1 + (end > next->vm_end);
+                               VM_WARN_ON(remove_next == 2 &&
+                                          end != next->vm_next->vm_end);
+                               VM_WARN_ON(remove_next == 1 &&
+                                          end != next->vm_end);
+                               /* trim end to next, for case 6 first pass */
+                               end = next->vm_end;
+                       }
+
                        exporter = next;
                        importer = vma;
 
@@ -651,7 +721,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
                         * If next doesn't have anon_vma, import from vma after
                         * next, if the vma overlaps with it.
                         */
-                       if (remove_next == 2 && next && !next->anon_vma)
+                       if (remove_next == 2 && !next->anon_vma)
                                exporter = next->vm_next;
 
                } else if (end > next->vm_start) {
@@ -662,6 +732,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
                        adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
                        exporter = next;
                        importer = vma;
+                       VM_WARN_ON(expand != importer);
                } else if (end < vma->vm_end) {
                        /*
                         * vma shrinks, and !insert tells it's not
@@ -671,6 +742,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
                        adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
                        exporter = vma;
                        importer = next;
+                       VM_WARN_ON(expand != importer);
                }
 
                /*
@@ -688,7 +760,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
                }
        }
 again:
-       vma_adjust_trans_huge(vma, start, end, adjust_next);
+       vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
 
        if (file) {
                mapping = file->f_mapping;
@@ -714,8 +786,8 @@ again:
        if (!anon_vma && adjust_next)
                anon_vma = next->anon_vma;
        if (anon_vma) {
-               VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
-                         anon_vma != next->anon_vma, next);
+               VM_WARN_ON(adjust_next && next->anon_vma &&
+                          anon_vma != next->anon_vma);
                anon_vma_lock_write(anon_vma);
                anon_vma_interval_tree_pre_update_vma(vma);
                if (adjust_next)
@@ -755,7 +827,19 @@ again:
                 * vma_merge has merged next into vma, and needs
                 * us to remove next before dropping the locks.
                 */
-               __vma_unlink(mm, next, vma);
+               if (remove_next != 3)
+                       __vma_unlink_prev(mm, next, vma);
+               else
+                       /*
+                        * vma is not before next if they've been
+                        * swapped.
+                        *
+                        * pre-swap() next->vm_start was reduced so
+                        * tell validate_mm_rb to ignore pre-swap()
+                        * "next" (which is stored in post-swap()
+                        * "vma").
+                        */
+                       __vma_unlink_common(mm, next, NULL, false, vma);
                if (file)
                        __remove_shared_vm_struct(next, file, mapping);
        } else if (insert) {
@@ -807,7 +891,27 @@ again:
                 * we must remove another next too. It would clutter
                 * up the code too much to do both in one go.
                 */
-               next = vma->vm_next;
+               if (remove_next != 3) {
+                       /*
+                        * If "next" was removed and vma->vm_end was
+                        * expanded (up) over it, in turn
+                        * "next->vm_prev->vm_end" changed and the
+                        * "vma->vm_next" gap must be updated.
+                        */
+                       next = vma->vm_next;
+               } else {
+                       /*
+                        * For the scope of the comment "next" and
+                        * "vma" considered pre-swap(): if "vma" was
+                        * removed, next->vm_start was expanded (down)
+                        * over it and the "next" gap must be updated.
+                        * Because of the swap() the post-swap() "vma"
+                        * actually points to pre-swap() "next"
+                        * (post-swap() "next" as opposed is now a
+                        * dangling pointer).
+                        */
+                       next = vma;
+               }
                if (remove_next == 2) {
                        remove_next = 1;
                        end = next->vm_end;
@@ -815,8 +919,28 @@ again:
                }
                else if (next)
                        vma_gap_update(next);
-               else
-                       mm->highest_vm_end = end;
+               else {
+                       /*
+                        * If remove_next == 2 we obviously can't
+                        * reach this path.
+                        *
+                        * If remove_next == 3 we can't reach this
+                        * path because pre-swap() next is always not
+                        * NULL. pre-swap() "next" is not being
+                        * removed and its next->vm_end is not altered
+                        * (and furthermore "end" already matches
+                        * next->vm_end in remove_next == 3).
+                        *
+                        * We reach this only in the remove_next == 1
+                        * case if the "next" vma that was removed was
+                        * the highest vma of the mm. However in such
+                        * case next->vm_end == "end" and the extended
+                        * "vma" has vma->vm_end == next->vm_end so
+                        * mm->highest_vm_end doesn't need any update
+                        * in remove_next == 1 case.
+                        */
+                       VM_WARN_ON(mm->highest_vm_end != end);
+               }
        }
        if (insert && file)
                uprobe_mmap(insert);
@@ -936,13 +1060,24 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
  *    cannot merge    might become    might become    might become
  *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
  *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
- *    mremap move:                                    PPPPNNNNNNNN 8
+ *    mremap move:                                    PPPPXXXXXXXX 8
  *        AAAA
  *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
  *    might become    case 1 below    case 2 below    case 3 below
  *
- * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
- * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
+ * It is important for case 8 that the vma NNNN overlapping the
+ * region AAAA is never going to be extended over XXXX. Instead XXXX must
+ * be extended in region AAAA and NNNN must be removed. This way in
+ * all cases where vma_merge succeeds, the moment vma_adjust drops the
+ * rmap_locks, the properties of the merged vma will be already
+ * correct for the whole merged range. Some of those properties like
+ * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
+ * be correct for the whole merged range immediately after the
+ * rmap_locks are released. Otherwise if XXXX would be removed and
+ * NNNN would be extended over the XXXX range, remove_migration_ptes
+ * or other rmap walkers (if working on addresses beyond the "end"
+ * parameter) may establish ptes with the wrong permissions of NNNN
+ * instead of the right permissions of XXXX.
  */
 struct vm_area_struct *vma_merge(struct mm_struct *mm,
                        struct vm_area_struct *prev, unsigned long addr,
@@ -967,9 +1102,14 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
        else
                next = mm->mmap;
        area = next;
-       if (next && next->vm_end == end)                /* cases 6, 7, 8 */
+       if (area && area->vm_end == end)                /* cases 6, 7, 8 */
                next = next->vm_next;
 
+       /* verify some invariant that must be enforced by the caller */
+       VM_WARN_ON(prev && addr <= prev->vm_start);
+       VM_WARN_ON(area && end > area->vm_end);
+       VM_WARN_ON(addr >= end);
+
        /*
         * Can it merge with the predecessor?
         */
@@ -990,11 +1130,12 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                                is_mergeable_anon_vma(prev->anon_vma,
                                                      next->anon_vma, NULL)) {
                                                        /* cases 1, 6 */
-                       err = vma_adjust(prev, prev->vm_start,
-                               next->vm_end, prev->vm_pgoff, NULL);
+                       err = __vma_adjust(prev, prev->vm_start,
+                                        next->vm_end, prev->vm_pgoff, NULL,
+                                        prev);
                } else                                  /* cases 2, 5, 7 */
-                       err = vma_adjust(prev, prev->vm_start,
-                               end, prev->vm_pgoff, NULL);
+                       err = __vma_adjust(prev, prev->vm_start,
+                                        end, prev->vm_pgoff, NULL, prev);
                if (err)
                        return NULL;
                khugepaged_enter_vma_merge(prev, vm_flags);
@@ -1010,11 +1151,18 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                                             anon_vma, file, pgoff+pglen,
                                             vm_userfaultfd_ctx)) {
                if (prev && addr < prev->vm_end)        /* case 4 */
-                       err = vma_adjust(prev, prev->vm_start,
-                               addr, prev->vm_pgoff, NULL);
-               else                                    /* cases 3, 8 */
-                       err = vma_adjust(area, addr, next->vm_end,
-                               next->vm_pgoff - pglen, NULL);
+                       err = __vma_adjust(prev, prev->vm_start,
+                                        addr, prev->vm_pgoff, NULL, next);
+               else {                                  /* cases 3, 8 */
+                       err = __vma_adjust(area, addr, next->vm_end,
+                                        next->vm_pgoff - pglen, NULL, next);
+                       /*
+                        * In case 3 area is already equal to next and
+                        * this is a noop, but in case 8 "area" has
+                        * been removed and next was expanded over it.
+                        */
+                       area = next;
+               }
                if (err)
                        return NULL;
                khugepaged_enter_vma_merge(area, vm_flags);
@@ -1386,7 +1534,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
  * to the private version (using protection_map[] without the
  * VM_SHARED bit).
  */
-int vma_wants_writenotify(struct vm_area_struct *vma)
+int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
 {
        vm_flags_t vm_flags = vma->vm_flags;
        const struct vm_operations_struct *vm_ops = vma->vm_ops;
@@ -1401,8 +1549,8 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
 
        /* The open routine did something to the protections that pgprot_modify
         * won't preserve? */
-       if (pgprot_val(vma->vm_page_prot) !=
-           pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags)))
+       if (pgprot_val(vm_page_prot) !=
+           pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
                return 0;
 
        /* Do we need to track softdirty? */
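The WRITE_ONCE() added to vma_set_page_prot() above pairs with the READ_ONCE() users of vma->vm_page_prot introduced in the huge_memory.c and migrate.c hunks: the rmap side reads the field without mmap_sem, so both sides must avoid load/store tearing. A simplified userspace sketch of the pairing follows; the volatile-based macro stand-ins are an assumption of the sketch, not the kernel implementation (build with -pthread).

#include <pthread.h>
#include <stdio.h>

#define WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)       (*(volatile __typeof__(x) *)&(x))

static unsigned long page_prot;		/* stands in for vma->vm_page_prot */

static void *updater(void *arg)
{
	(void)arg;
	/* writer: publish the new protection value in a single store */
	WRITE_ONCE(page_prot, 0x25UL);
	return NULL;
}

static void *rmap_walker(void *arg)
{
	(void)arg;
	/* reader: load it once, without holding any mmap_sem equivalent */
	unsigned long prot = READ_ONCE(page_prot);

	printf("walker saw prot=%#lx\n", prot);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, updater, NULL);
	pthread_create(&b, NULL, rmap_walker, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}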
index a4830f0..ec91dfd 100644 (file)
@@ -304,6 +304,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
                           vma->vm_userfaultfd_ctx);
        if (*pprev) {
                vma = *pprev;
+               VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
                goto success;
        }
 
@@ -327,7 +328,7 @@ success:
         * held in write mode.
         */
        vma->vm_flags = newflags;
-       dirty_accountable = vma_wants_writenotify(vma);
+       dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
        vma_set_page_prot(vma);
 
        change_protection(vma, start, end, vma->vm_page_prot,
index bd05a70..ba609b6 100644 (file)
 #include <linux/init.h>
 #include <linux/pfn.h>
 #include <linux/slab.h>
-#include <linux/bootmem.h>
 #include <linux/export.h>
 #include <linux/kmemleak.h>
 #include <linux/range.h>
 #include <linux/memblock.h>
+#include <linux/bootmem.h>
 
 #include <asm/bug.h>
 #include <asm/io.h>
-#include <asm/processor.h>
 
 #include "internal.h"
 
+#ifndef CONFIG_HAVE_MEMBLOCK
+#error CONFIG_HAVE_MEMBLOCK not defined
+#endif
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 struct pglist_data __refdata contig_page_data;
 EXPORT_SYMBOL(contig_page_data);
@@ -134,6 +137,11 @@ static unsigned long __init free_low_memory_core_early(void)
        for_each_reserved_mem_region(i, &start, &end)
                reserve_bootmem_region(start, end);
 
+       /*
+        * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
+        * because in some cases (e.g. when Node0 has no RAM installed)
+        * the low ram will be on Node1.
+        */
        for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
                                NULL)
                count += __free_memory_core(start, end);
@@ -191,11 +199,6 @@ unsigned long __init free_all_bootmem(void)
 
        reset_all_zones_managed_pages();
 
-       /*
-        * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
-        *  because in some case like Node0 doesn't have RAM installed
-        *  low ram will be on Node1
-        */
        pages = free_low_memory_core_early();
        totalram_pages += pages;
 
@@ -395,9 +398,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
        return __alloc_bootmem_node(pgdat, size, align, goal);
 }
 
-#ifndef ARCH_LOW_ADDRESS_LIMIT
-#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
-#endif
 
 /**
  * __alloc_bootmem_low - allocate low boot memory
index d53a9aa..ec9f11d 100644 (file)
@@ -132,6 +132,11 @@ static inline bool is_sysrq_oom(struct oom_control *oc)
        return oc->order == -1;
 }
 
+static inline bool is_memcg_oom(struct oom_control *oc)
+{
+       return oc->memcg != NULL;
+}
+
 /* return true if the task is not adequate as candidate victim task. */
 static bool oom_unkillable_task(struct task_struct *p,
                struct mem_cgroup *memcg, const nodemask_t *nodemask)
@@ -181,7 +186,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
         */
        adj = (long)p->signal->oom_score_adj;
        if (adj == OOM_SCORE_ADJ_MIN ||
-                       test_bit(MMF_OOM_REAPED, &p->mm->flags) ||
+                       test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
                        in_vfork(p)) {
                task_unlock(p);
                return 0;
@@ -213,12 +218,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
        return points > 0 ? points : 1;
 }
 
+enum oom_constraint {
+       CONSTRAINT_NONE,
+       CONSTRAINT_CPUSET,
+       CONSTRAINT_MEMORY_POLICY,
+       CONSTRAINT_MEMCG,
+};
+
 /*
  * Determine the type of allocation constraint.
  */
-#ifdef CONFIG_NUMA
-static enum oom_constraint constrained_alloc(struct oom_control *oc,
-                                            unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc)
 {
        struct zone *zone;
        struct zoneref *z;
@@ -226,8 +236,16 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
        bool cpuset_limited = false;
        int nid;
 
+       if (is_memcg_oom(oc)) {
+               oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1;
+               return CONSTRAINT_MEMCG;
+       }
+
        /* Default to all available memory */
-       *totalpages = totalram_pages + total_swap_pages;
+       oc->totalpages = totalram_pages + total_swap_pages;
+
+       if (!IS_ENABLED(CONFIG_NUMA))
+               return CONSTRAINT_NONE;
 
        if (!oc->zonelist)
                return CONSTRAINT_NONE;
@@ -246,9 +264,9 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
         */
        if (oc->nodemask &&
            !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
-               *totalpages = total_swap_pages;
+               oc->totalpages = total_swap_pages;
                for_each_node_mask(nid, *oc->nodemask)
-                       *totalpages += node_spanned_pages(nid);
+                       oc->totalpages += node_spanned_pages(nid);
                return CONSTRAINT_MEMORY_POLICY;
        }
 
@@ -259,98 +277,84 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
                        cpuset_limited = true;
 
        if (cpuset_limited) {
-               *totalpages = total_swap_pages;
+               oc->totalpages = total_swap_pages;
                for_each_node_mask(nid, cpuset_current_mems_allowed)
-                       *totalpages += node_spanned_pages(nid);
+                       oc->totalpages += node_spanned_pages(nid);
                return CONSTRAINT_CPUSET;
        }
        return CONSTRAINT_NONE;
 }
-#else
-static enum oom_constraint constrained_alloc(struct oom_control *oc,
-                                            unsigned long *totalpages)
-{
-       *totalpages = totalram_pages + total_swap_pages;
-       return CONSTRAINT_NONE;
-}
-#endif
 
-enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
-                                       struct task_struct *task)
+static int oom_evaluate_task(struct task_struct *task, void *arg)
 {
+       struct oom_control *oc = arg;
+       unsigned long points;
+
        if (oom_unkillable_task(task, NULL, oc->nodemask))
-               return OOM_SCAN_CONTINUE;
+               goto next;
 
        /*
         * This task already has access to memory reserves and is being killed.
         * Don't allow any other task to have access to the reserves unless
-        * the task has MMF_OOM_REAPED because chances that it would release
+        * the task has MMF_OOM_SKIP because chances that it would release
         * any memory is quite low.
         */
-       if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) {
-               struct task_struct *p = find_lock_task_mm(task);
-               enum oom_scan_t ret = OOM_SCAN_ABORT;
-
-               if (p) {
-                       if (test_bit(MMF_OOM_REAPED, &p->mm->flags))
-                               ret = OOM_SCAN_CONTINUE;
-                       task_unlock(p);
-               }
-
-               return ret;
+       if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
+               if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
+                       goto next;
+               goto abort;
        }
 
        /*
         * If task is allocating a lot of memory and has been marked to be
         * killed first if it triggers an oom, then select it.
         */
-       if (oom_task_origin(task))
-               return OOM_SCAN_SELECT;
+       if (oom_task_origin(task)) {
+               points = ULONG_MAX;
+               goto select;
+       }
 
-       return OOM_SCAN_OK;
+       points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
+       if (!points || points < oc->chosen_points)
+               goto next;
+
+       /* Prefer thread group leaders for display purposes */
+       if (points == oc->chosen_points && thread_group_leader(oc->chosen))
+               goto next;
+select:
+       if (oc->chosen)
+               put_task_struct(oc->chosen);
+       get_task_struct(task);
+       oc->chosen = task;
+       oc->chosen_points = points;
+next:
+       return 0;
+abort:
+       if (oc->chosen)
+               put_task_struct(oc->chosen);
+       oc->chosen = (void *)-1UL;
+       return 1;
 }
 
 /*
- * Simple selection loop. We chose the process with the highest
- * number of 'points'.  Returns -1 on scan abort.
+ * Simple selection loop. We choose the process with the highest number of
+ * 'points'. In case scan was aborted, oc->chosen is set to -1.
  */
-static struct task_struct *select_bad_process(struct oom_control *oc,
-               unsigned int *ppoints, unsigned long totalpages)
+static void select_bad_process(struct oom_control *oc)
 {
-       struct task_struct *p;
-       struct task_struct *chosen = NULL;
-       unsigned long chosen_points = 0;
-
-       rcu_read_lock();
-       for_each_process(p) {
-               unsigned int points;
-
-               switch (oom_scan_process_thread(oc, p)) {
-               case OOM_SCAN_SELECT:
-                       chosen = p;
-                       chosen_points = ULONG_MAX;
-                       /* fall through */
-               case OOM_SCAN_CONTINUE:
-                       continue;
-               case OOM_SCAN_ABORT:
-                       rcu_read_unlock();
-                       return (struct task_struct *)(-1UL);
-               case OOM_SCAN_OK:
-                       break;
-               };
-               points = oom_badness(p, NULL, oc->nodemask, totalpages);
-               if (!points || points < chosen_points)
-                       continue;
+       if (is_memcg_oom(oc))
+               mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
+       else {
+               struct task_struct *p;
 
-               chosen = p;
-               chosen_points = points;
+               rcu_read_lock();
+               for_each_process(p)
+                       if (oom_evaluate_task(p, oc))
+                               break;
+               rcu_read_unlock();
        }
-       if (chosen)
-               get_task_struct(chosen);
-       rcu_read_unlock();
 
-       *ppoints = chosen_points * 1000 / totalpages;
-       return chosen;
+       oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
 }
 
 /**
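The reworked selection path above replaces the old OOM_SCAN_* switch with a single per-task callback, oom_evaluate_task(), driven either by mem_cgroup_scan_tasks() for memcg OOMs or by a plain for_each_process() walk otherwise. The control flow (keep the highest score, abort the whole scan when an exiting victim is found) can be sketched in ordinary user-space C; the task and selection types below are invented for illustration and are not kernel structures:

#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel's task list and oom_control. */
struct task { const char *name; unsigned long badness; int abort_scan; };

struct selection {
	struct task *chosen;
	unsigned long chosen_points;
};

/* Evaluate one task; return non-zero to abort the whole scan. */
static int evaluate_task(struct task *t, void *arg)
{
	struct selection *sel = arg;

	if (t->abort_scan) {		/* e.g. an exiting victim: give it time */
		sel->chosen = NULL;
		return 1;
	}
	if (t->badness <= sel->chosen_points)
		return 0;		/* keep the current choice */
	sel->chosen = t;
	sel->chosen_points = t->badness;
	return 0;
}

int main(void)
{
	struct task tasks[] = {
		{ "a", 10, 0 }, { "b", 42, 0 }, { "c", 7, 0 },
	};
	struct selection sel = { NULL, 0 };

	for (size_t i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++)
		if (evaluate_task(&tasks[i], &sel))
			break;

	printf("chosen: %s (%lu points)\n",
	       sel.chosen ? sel.chosen->name : "none", sel.chosen_points);
	return 0;
}

Returning non-zero from the callback is what lets mem_cgroup_scan_tasks() and the plain process-list walk share one abort convention.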
@@ -399,9 +403,14 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 
 static void dump_header(struct oom_control *oc, struct task_struct *p)
 {
-       pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
-               current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
+       nodemask_t *nm = (oc->nodemask) ? oc->nodemask : &cpuset_current_mems_allowed;
+
+       pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
+               current->comm, oc->gfp_mask, &oc->gfp_mask,
+               nodemask_pr_args(nm), oc->order,
                current->signal->oom_score_adj);
+       if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
+               pr_warn("COMPACTION is disabled!!!\n");
 
        cpuset_print_current_mems_allowed();
        dump_stack();
@@ -419,7 +428,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
 static atomic_t oom_victims = ATOMIC_INIT(0);
 static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
 
-bool oom_killer_disabled __read_mostly;
+static bool oom_killer_disabled __read_mostly;
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
 
@@ -452,12 +461,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
 static struct task_struct *oom_reaper_list;
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
-static bool __oom_reap_task(struct task_struct *tsk)
+static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 {
        struct mmu_gather tlb;
        struct vm_area_struct *vma;
-       struct mm_struct *mm = NULL;
-       struct task_struct *p;
        struct zap_details details = {.check_swap_entries = true,
                                      .ignore_dirty = true};
        bool ret = true;
@@ -465,7 +472,7 @@ static bool __oom_reap_task(struct task_struct *tsk)
        /*
         * We have to make sure to not race with the victim exit path
         * and cause premature new oom victim selection:
-        * __oom_reap_task              exit_mm
+        * __oom_reap_task_mm           exit_mm
         *   mmget_not_zero
         *                                mmput
         *                                  atomic_dec_and_test
@@ -478,22 +485,9 @@ static bool __oom_reap_task(struct task_struct *tsk)
         */
        mutex_lock(&oom_lock);
 
-       /*
-        * Make sure we find the associated mm_struct even when the particular
-        * thread has already terminated and cleared its mm.
-        * We might have race with exit path so consider our work done if there
-        * is no mm.
-        */
-       p = find_lock_task_mm(tsk);
-       if (!p)
-               goto unlock_oom;
-       mm = p->mm;
-       atomic_inc(&mm->mm_count);
-       task_unlock(p);
-
        if (!down_read_trylock(&mm->mmap_sem)) {
                ret = false;
-               goto mm_drop;
+               goto unlock_oom;
        }
 
        /*
@@ -503,9 +497,17 @@ static bool __oom_reap_task(struct task_struct *tsk)
         */
        if (!mmget_not_zero(mm)) {
                up_read(&mm->mmap_sem);
-               goto mm_drop;
+               goto unlock_oom;
        }
 
+       /*
+        * Tell all users of get_user/copy_from_user etc... that the content
+        * is no longer stable. No barriers really needed because unmapping
+        * should imply barriers already and the reader would hit a page fault
+        * if it stumbled over reaped memory.
+        */
+       set_bit(MMF_UNSTABLE, &mm->flags);
+
        tlb_gather_mmu(&tlb, mm, 0, -1);
        for (vma = mm->mmap ; vma; vma = vma->vm_next) {
                if (is_vm_hugetlb_page(vma))
@@ -540,19 +542,12 @@ static bool __oom_reap_task(struct task_struct *tsk)
                        K(get_mm_counter(mm, MM_SHMEMPAGES)));
        up_read(&mm->mmap_sem);
 
-       /*
-        * This task can be safely ignored because we cannot do much more
-        * to release its memory.
-        */
-       set_bit(MMF_OOM_REAPED, &mm->flags);
        /*
         * Drop our reference but make sure the mmput slow path is called from a
         * different context because we shouldn't risk we get stuck there and
         * put the oom_reaper out of the way.
         */
        mmput_async(mm);
-mm_drop:
-       mmdrop(mm);
 unlock_oom:
        mutex_unlock(&oom_lock);
        return ret;
@@ -562,44 +557,28 @@ unlock_oom:
 static void oom_reap_task(struct task_struct *tsk)
 {
        int attempts = 0;
+       struct mm_struct *mm = tsk->signal->oom_mm;
 
        /* Retry the down_read_trylock(mmap_sem) a few times */
-       while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk))
+       while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
                schedule_timeout_idle(HZ/10);
 
-       if (attempts > MAX_OOM_REAP_RETRIES) {
-               struct task_struct *p;
+       if (attempts <= MAX_OOM_REAP_RETRIES)
+               goto done;
 
-               pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
-                               task_pid_nr(tsk), tsk->comm);
 
-               /*
-                * If we've already tried to reap this task in the past and
-                * failed it probably doesn't make much sense to try yet again
-                * so hide the mm from the oom killer so that it can move on
-                * to another task with a different mm struct.
-                */
-               p = find_lock_task_mm(tsk);
-               if (p) {
-                       if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) {
-                               pr_info("oom_reaper: giving up pid:%d (%s)\n",
-                                               task_pid_nr(tsk), tsk->comm);
-                               set_bit(MMF_OOM_REAPED, &p->mm->flags);
-                       }
-                       task_unlock(p);
-               }
+       pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
+               task_pid_nr(tsk), tsk->comm);
+       debug_show_all_locks();
 
-               debug_show_all_locks();
-       }
+done:
+       tsk->oom_reaper_list = NULL;
 
        /*
-        * Clear TIF_MEMDIE because the task shouldn't be sitting on a
-        * reasonably reclaimable memory anymore or it is not a good candidate
-        * for the oom victim right now because it cannot release its memory
-        * itself nor by the oom reaper.
+        * Hide this mm from OOM killer because it has been either reaped or
+        * somebody can't call up_write(mmap_sem).
         */
-       tsk->oom_reaper_list = NULL;
-       exit_oom_victim(tsk);
+       set_bit(MMF_OOM_SKIP, &mm->flags);
 
        /* Drop a reference taken by wake_oom_reaper */
        put_task_struct(tsk);
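oom_reap_task() above now retries __oom_reap_task_mm() a bounded number of times, sleeping roughly HZ/10 between attempts, and finally sets MMF_OOM_SKIP whether or not reaping succeeded so the OOM killer can move on. A rough user-space analogue of that bounded trylock-and-back-off loop, with a pthread rwlock standing in for mmap_sem and invented names throughout:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define MAX_REAP_RETRIES 10

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Placeholder for the actual reaping work done under the read lock. */
static void reap_under_lock(void) { }

static bool try_reap_once(void)
{
	if (pthread_rwlock_tryrdlock(&map_lock) != 0)
		return false;		/* a writer holds it, try again later */
	reap_under_lock();
	pthread_rwlock_unlock(&map_lock);
	return true;
}

int main(void)
{
	struct timespec backoff = { 0, 100 * 1000 * 1000 };	/* ~100ms, like HZ/10 */
	int attempts = 0;
	bool reaped = false;

	while (attempts++ < MAX_REAP_RETRIES && !(reaped = try_reap_once()))
		nanosleep(&backoff, NULL);

	if (!reaped)
		fprintf(stderr, "unable to reap, skipping this mm\n");
	/* Either way, the target is marked so it is not retried forever. */
	return 0;
}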
@@ -607,8 +586,6 @@ static void oom_reap_task(struct task_struct *tsk)
 
 static int oom_reaper(void *unused)
 {
-       set_freezable();
-
        while (true) {
                struct task_struct *tsk = NULL;
 
@@ -627,7 +604,7 @@ static int oom_reaper(void *unused)
        return 0;
 }
 
-void wake_oom_reaper(struct task_struct *tsk)
+static void wake_oom_reaper(struct task_struct *tsk)
 {
        if (!oom_reaper_th)
                return;
@@ -656,7 +633,11 @@ static int __init oom_init(void)
        return 0;
 }
 subsys_initcall(oom_init)
-#endif
+#else
+static inline void wake_oom_reaper(struct task_struct *tsk)
+{
+}
+#endif /* CONFIG_MMU */
 
 /**
  * mark_oom_victim - mark the given task as OOM victim
@@ -664,14 +645,23 @@ subsys_initcall(oom_init)
  *
  * Has to be called with oom_lock held and never after
  * oom has been disabled already.
+ *
+ * tsk->mm has to be non NULL and caller has to guarantee it is stable (either
+ * under task_lock or operate on the current).
  */
-void mark_oom_victim(struct task_struct *tsk)
+static void mark_oom_victim(struct task_struct *tsk)
 {
+       struct mm_struct *mm = tsk->mm;
+
        WARN_ON(oom_killer_disabled);
        /* OOM killer might race with memcg OOM */
        if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
                return;
-       atomic_inc(&tsk->signal->oom_victims);
+
+       /* oom_mm is bound to the signal struct life time. */
+       if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
+               atomic_inc(&tsk->signal->oom_mm->mm_count);
+
        /*
         * Make sure that the task is woken up from uninterruptible sleep
         * if it is frozen because OOM killer wouldn't be able to free
@@ -685,21 +675,29 @@ void mark_oom_victim(struct task_struct *tsk)
 /**
  * exit_oom_victim - note the exit of an OOM victim
  */
-void exit_oom_victim(struct task_struct *tsk)
+void exit_oom_victim(void)
 {
-       if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
-               return;
-       atomic_dec(&tsk->signal->oom_victims);
+       clear_thread_flag(TIF_MEMDIE);
 
        if (!atomic_dec_return(&oom_victims))
                wake_up_all(&oom_victims_wait);
 }
 
+/**
+ * oom_killer_enable - enable OOM killer
+ */
+void oom_killer_enable(void)
+{
+       oom_killer_disabled = false;
+}
+
 /**
  * oom_killer_disable - disable OOM killer
+ * @timeout: maximum timeout to wait for oom victims in jiffies
  *
  * Forces all page allocations to fail rather than trigger OOM killer.
- * Will block and wait until all OOM victims are killed.
+ * Will block and wait until all OOM victims are killed or the given
+ * timeout expires.
  *
  * The function cannot be called when there are runnable user tasks because
  * the userspace would see unexpected allocation failures as a result. Any
@@ -708,8 +706,10 @@ void exit_oom_victim(struct task_struct *tsk)
  * Returns true if successful and false if the OOM killer cannot be
  * disabled.
  */
-bool oom_killer_disable(void)
+bool oom_killer_disable(signed long timeout)
 {
+       signed long ret;
+
        /*
         * Make sure to not race with an ongoing OOM killer. Check that the
         * current is not killed (possibly due to sharing the victim's memory).
@@ -719,19 +719,16 @@ bool oom_killer_disable(void)
        oom_killer_disabled = true;
        mutex_unlock(&oom_lock);
 
-       wait_event(oom_victims_wait, !atomic_read(&oom_victims));
+       ret = wait_event_interruptible_timeout(oom_victims_wait,
+                       !atomic_read(&oom_victims), timeout);
+       if (ret <= 0) {
+               oom_killer_enable();
+               return false;
+       }
 
        return true;
 }
 
-/**
- * oom_killer_enable - enable OOM killer
- */
-void oom_killer_enable(void)
-{
-       oom_killer_disabled = false;
-}
-
 static inline bool __task_will_free_mem(struct task_struct *task)
 {
        struct signal_struct *sig = task->signal;
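oom_killer_disable() above gains a timeout: it waits for the oom_victims count to drain, and if the wait is interrupted or times out it calls oom_killer_enable() and reports failure. The shape of that wait-with-deadline-and-undo can be modelled with a POSIX condition variable; this is only a sketch with assumed names, not the kernel's wait_event machinery:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t victims_drained = PTHREAD_COND_INITIALIZER;
static int oom_victims;			/* decremented elsewhere, broadcast at 0 */
static bool killer_disabled;

/* Returns true if all victims exited before the timeout, false otherwise. */
static bool oom_killer_disable_for(long timeout_sec)
{
	struct timespec deadline;
	bool ok = true;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += timeout_sec;

	pthread_mutex_lock(&lock);
	killer_disabled = true;
	while (oom_victims > 0) {
		if (pthread_cond_timedwait(&victims_drained, &lock,
					   &deadline) != 0) {
			killer_disabled = false;	/* give up: re-enable */
			ok = false;
			break;
		}
	}
	pthread_mutex_unlock(&lock);
	return ok;
}

int main(void)
{
	oom_victims = 0;		/* nothing to wait for in this demo */
	printf("%s\n", oom_killer_disable_for(1) ? "disabled" : "timed out");
	return 0;
}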
@@ -760,7 +757,7 @@ static inline bool __task_will_free_mem(struct task_struct *task)
  * Caller has to make sure that task->mm is stable (hold task_lock or
  * it operates on the current).
  */
-bool task_will_free_mem(struct task_struct *task)
+static bool task_will_free_mem(struct task_struct *task)
 {
        struct mm_struct *mm = task->mm;
        struct task_struct *p;
@@ -781,15 +778,16 @@ bool task_will_free_mem(struct task_struct *task)
         * This task has already been drained by the oom reaper so there are
         * only small chances it will free some more
         */
-       if (test_bit(MMF_OOM_REAPED, &mm->flags))
+       if (test_bit(MMF_OOM_SKIP, &mm->flags))
                return false;
 
        if (atomic_read(&mm->mm_users) <= 1)
                return true;
 
        /*
-        * This is really pessimistic but we do not have any reliable way
-        * to check that external processes share with our mm
+        * Make sure that all tasks which share the mm with the given task
+        * are also dying, so that a) nobody pins its mm and
+        * b) the task is also reapable by the oom reaper.
         */
        rcu_read_lock();
        for_each_process(p) {
@@ -806,14 +804,10 @@ bool task_will_free_mem(struct task_struct *task)
        return ret;
 }
 
-/*
- * Must be called while holding a reference to p, which will be released upon
- * returning.
- */
-void oom_kill_process(struct oom_control *oc, struct task_struct *p,
-                     unsigned int points, unsigned long totalpages,
-                     const char *message)
+static void oom_kill_process(struct oom_control *oc, const char *message)
 {
+       struct task_struct *p = oc->chosen;
+       unsigned int points = oc->chosen_points;
        struct task_struct *victim = p;
        struct task_struct *child;
        struct task_struct *t;
@@ -860,7 +854,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
                         * oom_badness() returns 0 if the thread is unkillable
                         */
                        child_points = oom_badness(child,
-                                       oc->memcg, oc->nodemask, totalpages);
+                               oc->memcg, oc->nodemask, oc->totalpages);
                        if (child_points > victim_points) {
                                put_task_struct(victim);
                                victim = child;
@@ -913,20 +907,20 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
                        continue;
                if (same_thread_group(p, victim))
                        continue;
-               if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) {
-                       /*
-                        * We cannot use oom_reaper for the mm shared by this
-                        * process because it wouldn't get killed and so the
-                        * memory might be still used. Hide the mm from the oom
-                        * killer to guarantee OOM forward progress.
-                        */
+               if (is_global_init(p)) {
                        can_oom_reap = false;
-                       set_bit(MMF_OOM_REAPED, &mm->flags);
+                       set_bit(MMF_OOM_SKIP, &mm->flags);
                        pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
                                        task_pid_nr(victim), victim->comm,
                                        task_pid_nr(p), p->comm);
                        continue;
                }
+               /*
+                * No use_mm() user needs to read from the userspace so we are
+                * ok to reap it.
+                */
+               if (unlikely(p->flags & PF_KTHREAD))
+                       continue;
                do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
        }
        rcu_read_unlock();
@@ -942,7 +936,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 /*
  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
  */
-void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint)
+static void check_panic_on_oom(struct oom_control *oc,
+                              enum oom_constraint constraint)
 {
        if (likely(!sysctl_panic_on_oom))
                return;
@@ -988,19 +983,18 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
  */
 bool out_of_memory(struct oom_control *oc)
 {
-       struct task_struct *p;
-       unsigned long totalpages;
        unsigned long freed = 0;
-       unsigned int uninitialized_var(points);
        enum oom_constraint constraint = CONSTRAINT_NONE;
 
        if (oom_killer_disabled)
                return false;
 
-       blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
-       if (freed > 0)
-               /* Got some memory back in the last second. */
-               return true;
+       if (!is_memcg_oom(oc)) {
+               blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
+               if (freed > 0)
+                       /* Got some memory back in the last second. */
+                       return true;
+       }
 
        /*
         * If current has a pending SIGKILL or is exiting, then automatically
@@ -1024,37 +1018,38 @@ bool out_of_memory(struct oom_control *oc)
 
        /*
         * Check if there were limitations on the allocation (only relevant for
-        * NUMA) that may require different handling.
+        * NUMA and memcg) that may require different handling.
         */
-       constraint = constrained_alloc(oc, &totalpages);
+       constraint = constrained_alloc(oc);
        if (constraint != CONSTRAINT_MEMORY_POLICY)
                oc->nodemask = NULL;
        check_panic_on_oom(oc, constraint);
 
-       if (sysctl_oom_kill_allocating_task && current->mm &&
-           !oom_unkillable_task(current, NULL, oc->nodemask) &&
+       if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
+           current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
            current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
                get_task_struct(current);
-               oom_kill_process(oc, current, 0, totalpages,
-                                "Out of memory (oom_kill_allocating_task)");
+               oc->chosen = current;
+               oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
                return true;
        }
 
-       p = select_bad_process(oc, &points, totalpages);
+       select_bad_process(oc);
        /* Found nothing?!?! Either we hang forever, or we panic. */
-       if (!p && !is_sysrq_oom(oc)) {
+       if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
                dump_header(oc, NULL);
                panic("Out of memory and no killable processes...\n");
        }
-       if (p && p != (void *)-1UL) {
-               oom_kill_process(oc, p, points, totalpages, "Out of memory");
+       if (oc->chosen && oc->chosen != (void *)-1UL) {
+               oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
+                                "Memory cgroup out of memory");
                /*
                 * Give the killed process a good chance to exit before trying
                 * to allocate memory again.
                 */
                schedule_timeout_killable(1);
        }
-       return true;
+       return !!oc->chosen;
 }
 
 /*
@@ -1077,16 +1072,6 @@ void pagefault_out_of_memory(void)
 
        if (!mutex_trylock(&oom_lock))
                return;
-
-       if (!out_of_memory(&oc)) {
-               /*
-                * There shouldn't be any user tasks runnable while the
-                * OOM killer is disabled, so the current task has to
-                * be a racing OOM victim for which oom_killer_disable()
-                * is waiting for.
-                */
-               WARN_ON(test_thread_flag(TIF_MEMDIE));
-       }
-
+       out_of_memory(&oc);
        mutex_unlock(&oom_lock);
 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 28d6f36..439cc63 100644 (file)
@@ -1965,36 +1965,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
        return false;
 }
 
-void throttle_vm_writeout(gfp_t gfp_mask)
-{
-       unsigned long background_thresh;
-       unsigned long dirty_thresh;
-
-        for ( ; ; ) {
-               global_dirty_limits(&background_thresh, &dirty_thresh);
-               dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh);
-
-                /*
-                 * Boost the allowable dirty threshold a bit for page
-                 * allocators so they don't get DoS'ed by heavy writers
-                 */
-                dirty_thresh += dirty_thresh / 10;      /* wheeee... */
-
-                if (global_node_page_state(NR_UNSTABLE_NFS) +
-                       global_node_page_state(NR_WRITEBACK) <= dirty_thresh)
-                               break;
-                congestion_wait(BLK_RW_ASYNC, HZ/10);
-
-               /*
-                * The caller might hold locks which can prevent IO completion
-                * or progress in the filesystem.  So we cannot just sit here
-                * waiting for IO to complete.
-                */
-               if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
-                       break;
-        }
-}
-
 /*
  * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
  */
@@ -2746,7 +2716,7 @@ int test_clear_page_writeback(struct page *page)
        int ret;
 
        lock_page_memcg(page);
-       if (mapping) {
+       if (mapping && mapping_use_writeback_tags(mapping)) {
                struct inode *inode = mapping->host;
                struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;
@@ -2789,7 +2759,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
        int ret;
 
        lock_page_memcg(page);
-       if (mapping) {
+       if (mapping && mapping_use_writeback_tags(mapping)) {
                struct inode *inode = mapping->host;
                struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a2214c6..ca423cc 100644 (file)
@@ -607,6 +607,9 @@ static bool need_debug_guardpage(void)
        if (!debug_pagealloc_enabled())
                return false;
 
+       if (!debug_guardpage_minorder())
+               return false;
+
        return true;
 }
 
@@ -615,6 +618,9 @@ static void init_debug_guardpage(void)
        if (!debug_pagealloc_enabled())
                return;
 
+       if (!debug_guardpage_minorder())
+               return;
+
        _debug_guardpage_enabled = true;
 }
 
@@ -635,19 +641,22 @@ static int __init debug_guardpage_minorder_setup(char *buf)
        pr_info("Setting debug_guardpage_minorder to %lu\n", res);
        return 0;
 }
-__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
 
-static inline void set_page_guard(struct zone *zone, struct page *page,
+static inline bool set_page_guard(struct zone *zone, struct page *page,
                                unsigned int order, int migratetype)
 {
        struct page_ext *page_ext;
 
        if (!debug_guardpage_enabled())
-               return;
+               return false;
+
+       if (order >= debug_guardpage_minorder())
+               return false;
 
        page_ext = lookup_page_ext(page);
        if (unlikely(!page_ext))
-               return;
+               return false;
 
        __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
 
@@ -655,6 +664,8 @@ static inline void set_page_guard(struct zone *zone, struct page *page,
        set_page_private(page, order);
        /* Guard pages are not available for any usage */
        __mod_zone_freepage_state(zone, -(1 << order), migratetype);
+
+       return true;
 }
 
 static inline void clear_page_guard(struct zone *zone, struct page *page,
@@ -676,9 +687,9 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
                __mod_zone_freepage_state(zone, (1 << order), migratetype);
 }
 #else
-struct page_ext_operations debug_guardpage_ops = { NULL, };
-static inline void set_page_guard(struct zone *zone, struct page *page,
-                               unsigned int order, int migratetype) {}
+struct page_ext_operations debug_guardpage_ops;
+static inline bool set_page_guard(struct zone *zone, struct page *page,
+                       unsigned int order, int migratetype) { return false; }
 static inline void clear_page_guard(struct zone *zone, struct page *page,
                                unsigned int order, int migratetype) {}
 #endif
@@ -1393,15 +1404,18 @@ static void __init deferred_free_range(struct page *page,
                return;
 
        /* Free a large naturally-aligned chunk if possible */
-       if (nr_pages == MAX_ORDER_NR_PAGES &&
-           (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
+       if (nr_pages == pageblock_nr_pages &&
+           (pfn & (pageblock_nr_pages - 1)) == 0) {
                set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-               __free_pages_boot_core(page, MAX_ORDER-1);
+               __free_pages_boot_core(page, pageblock_order);
                return;
        }
 
-       for (i = 0; i < nr_pages; i++, page++)
+       for (i = 0; i < nr_pages; i++, page++, pfn++) {
+               if ((pfn & (pageblock_nr_pages - 1)) == 0)
+                       set_pageblock_migratetype(page, MIGRATE_MOVABLE);
                __free_pages_boot_core(page, 0);
+       }
 }
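deferred_free_range() above now takes its large-chunk fast path per pageblock rather than per MAX_ORDER chunk, and stamps MIGRATE_MOVABLE at every pageblock boundary on the page-by-page path. Both checks rely on pageblock_nr_pages being a power of two so that a simple mask test detects the boundary; a tiny stand-alone demonstration (the block size below is just an example value):

#include <stdio.h>

/* Illustrative value; the real pageblock_nr_pages is architecture dependent. */
#define PAGEBLOCK_NR_PAGES 512UL	/* must be a power of two */

static int pageblock_aligned(unsigned long pfn)
{
	return (pfn & (PAGEBLOCK_NR_PAGES - 1)) == 0;
}

int main(void)
{
	unsigned long pfns[] = { 0, 511, 512, 1024, 1537 };

	for (unsigned long i = 0; i < sizeof(pfns) / sizeof(pfns[0]); i++)
		printf("pfn %lu %s at a pageblock boundary\n", pfns[i],
		       pageblock_aligned(pfns[i]) ? "is" : "is not");
	return 0;
}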
 
 /* Completion tracking for deferred_init_memmap() threads */
@@ -1469,9 +1483,9 @@ static int __init deferred_init_memmap(void *data)
 
                        /*
                         * Ensure pfn_valid is checked every
-                        * MAX_ORDER_NR_PAGES for memory holes
+                        * pageblock_nr_pages for memory holes
                         */
-                       if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
+                       if ((pfn & (pageblock_nr_pages - 1)) == 0) {
                                if (!pfn_valid(pfn)) {
                                        page = NULL;
                                        goto free_range;
@@ -1484,7 +1498,7 @@ static int __init deferred_init_memmap(void *data)
                        }
 
                        /* Minimise pfn page lookups and scheduler checks */
-                       if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) {
+                       if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
                                page++;
                        } else {
                                nr_pages += nr_to_free;
@@ -1520,6 +1534,9 @@ free_range:
                        free_base_page = NULL;
                        free_base_pfn = nr_to_free = 0;
                }
+               /* Free the last block of pages to allocator */
+               nr_pages += nr_to_free;
+               deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
 
                first_init_pfn = max(end_pfn, first_init_pfn);
        }
@@ -1616,18 +1633,15 @@ static inline void expand(struct zone *zone, struct page *page,
                size >>= 1;
                VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
 
-               if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
-                       debug_guardpage_enabled() &&
-                       high < debug_guardpage_minorder()) {
-                       /*
-                        * Mark as guard pages (or page), that will allow to
-                        * merge back to allocator when buddy will be freed.
-                        * Corresponding page table entries will not be touched,
-                        * pages will stay not present in virtual address space
-                        */
-                       set_page_guard(zone, &page[size], high, migratetype);
+               /*
+                * Mark as guard pages (or page), that will allow to
+                * merge back to allocator when buddy will be freed.
+                * Corresponding page table entries will not be touched,
+                * pages will stay not present in virtual address space
+                */
+               if (set_page_guard(zone, &page[size], high, migratetype))
                        continue;
-               }
+
                list_add(&page[size].lru, &area->free_list[migratetype]);
                area->nr_free++;
                set_page_order(&page[size], high);
@@ -2489,9 +2503,14 @@ int __isolate_free_page(struct page *page, unsigned int order)
        mt = get_pageblock_migratetype(page);
 
        if (!is_migrate_isolate(mt)) {
-               /* Obey watermarks as if the page was being allocated */
-               watermark = low_wmark_pages(zone) + (1 << order);
-               if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+               /*
+                * Obey watermarks as if the page was being allocated. We can
+                * emulate a high-order watermark check with a raised order-0
+                * watermark, because we already know our high-order page
+                * exists.
+                */
+               watermark = min_wmark_pages(zone) + (1UL << order);
+               if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
                        return 0;
 
                __mod_zone_freepage_state(zone, -(1UL << order), mt);
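__isolate_free_page() above emulates a high-order watermark check by raising the order-0 minimum watermark by 1 << order, which is safe because the high-order page is already known to exist. A toy version of that arithmetic, with made-up numbers and without reproducing zone_watermark_ok() itself:

#include <stdbool.h>
#include <stdio.h>

/*
 * Toy check: allow isolating a 2^order page only if the remaining free
 * pages would still cover the raised minimum watermark.
 */
static bool can_isolate(unsigned long free_pages, unsigned long min_wmark,
			unsigned int order)
{
	unsigned long watermark = min_wmark + (1UL << order);

	return free_pages >= watermark;
}

int main(void)
{
	/* Illustrative numbers only. */
	printf("%d\n", can_isolate(10000, 1024, 9));	/* 1: plenty of room */
	printf("%d\n", can_isolate(1200, 1024, 9));	/* 0: would dip below */
	return 0;
}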
@@ -2960,9 +2979,11 @@ static DEFINE_RATELIMIT_STATE(nopage_rs,
                DEFAULT_RATELIMIT_INTERVAL,
                DEFAULT_RATELIMIT_BURST);
 
-void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
+void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 {
        unsigned int filter = SHOW_MEM_FILTER_NODES;
+       struct va_format vaf;
+       va_list args;
 
        if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
            debug_guardpage_minorder() > 0)
@@ -2980,22 +3001,16 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
        if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
                filter &= ~SHOW_MEM_FILTER_NODES;
 
-       if (fmt) {
-               struct va_format vaf;
-               va_list args;
+       pr_warn("%s: ", current->comm);
 
-               va_start(args, fmt);
+       va_start(args, fmt);
+       vaf.fmt = fmt;
+       vaf.va = &args;
+       pr_cont("%pV", &vaf);
+       va_end(args);
 
-               vaf.fmt = fmt;
-               vaf.va = &args;
+       pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);
 
-               pr_warn("%pV", &vaf);
-
-               va_end(args);
-       }
-
-       pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
-               current->comm, order, gfp_mask, &gfp_mask);
        dump_stack();
        if (!should_suppress_show_mem())
                show_mem(filter);
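warn_alloc() above lets callers supply the message body as a printf-style format and stitches it together with the task name and gfp mask using the kernel's %pV/struct va_format extension. Plain C has no %pV, but the same forwarding can be sketched with vfprintf(); the function and values below are illustrative only:

#include <stdarg.h>
#include <stdio.h>

/*
 * warn_alloc()-style wrapper: prefix with a "task" name, let the caller
 * supply the body as a format string, then append a fixed trailer.
 * vfprintf() plays the role of the kernel's %pV/struct va_format here.
 */
static void warn_alloc_demo(const char *task, unsigned int gfp_mask,
			    const char *fmt, ...)
{
	va_list args;

	fprintf(stderr, "%s: ", task);
	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
	fprintf(stderr, ", mode:%#x\n", gfp_mask);
}

int main(void)
{
	warn_alloc_demo("kswapd0", 0x24000c0,
			"page allocation stalls for %ums, order:%u", 10048u, 2u);
	return 0;
}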
@@ -3137,6 +3152,65 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        return NULL;
 }
 
+static inline bool
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
+                    enum compact_result compact_result,
+                    enum compact_priority *compact_priority,
+                    int *compaction_retries)
+{
+       int max_retries = MAX_COMPACT_RETRIES;
+       int min_priority;
+
+       if (!order)
+               return false;
+
+       if (compaction_made_progress(compact_result))
+               (*compaction_retries)++;
+
+       /*
+        * compaction considers all the zone as desperately out of memory
+        * so it doesn't really make much sense to retry except when the
+        * failure could be caused by insufficient priority
+        */
+       if (compaction_failed(compact_result))
+               goto check_priority;
+
+       /*
+        * make sure the compaction wasn't deferred or didn't bail out early
+        * due to lock contention before we declare that we should give up.
+        * But do not retry if the given zonelist is not suitable for
+        * compaction.
+        */
+       if (compaction_withdrawn(compact_result))
+               return compaction_zonelist_suitable(ac, order, alloc_flags);
+
+       /*
+        * !costly requests are much more important than __GFP_REPEAT
+        * costly ones because they are de facto nofail and invoke OOM
+        * killer to move on while costly can fail and users are ready
+        * to cope with that. 1/4 retries is rather arbitrary but we
+        * would need much more detailed feedback from compaction to
+        * make a better decision.
+        */
+       if (order > PAGE_ALLOC_COSTLY_ORDER)
+               max_retries /= 4;
+       if (*compaction_retries <= max_retries)
+               return true;
+
+       /*
+        * Make sure there are attempts at the highest priority if we exhausted
+        * all retries or failed at the lower priorities.
+        */
+check_priority:
+       min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
+                       MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
+       if (*compact_priority > min_priority) {
+               (*compact_priority)--;
+               *compaction_retries = 0;
+               return true;
+       }
+       return false;
+}
 #else
 static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
@@ -3147,13 +3221,11 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        return NULL;
 }
 
-#endif /* CONFIG_COMPACTION */
-
 static inline bool
 should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
                     enum compact_result compact_result,
                     enum compact_priority *compact_priority,
-                    int compaction_retries)
+                    int *compaction_retries)
 {
        struct zone *zone;
        struct zoneref *z;
@@ -3175,6 +3247,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
        }
        return false;
 }
+#endif /* CONFIG_COMPACTION */
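The new should_compact_retry() above caps the number of retries (quartered for costly orders), and once they are exhausted it raises the compaction priority one step and resets the counter, guaranteeing at least one attempt at the highest priority. A stripped-down skeleton of that retry/escalation logic, with placeholder constants and without the kernel's compact_result handling (it also counts every call, whereas the kernel only counts attempts that made progress):

#include <stdbool.h>
#include <stdio.h>

#define MAX_COMPACT_RETRIES	16
#define COSTLY_ORDER		3
#define MIN_PRIORITY		0	/* highest priority in this toy model */

/*
 * Decide whether another compaction attempt is worthwhile: retry up to a
 * limit at the current priority, then bump the priority and start over.
 * Lower numbers mean higher priority, as in the kernel's enum.
 */
static bool should_retry(unsigned int order, int *priority, int *retries)
{
	int max_retries = MAX_COMPACT_RETRIES;

	if (!order)
		return false;

	if (order > COSTLY_ORDER)
		max_retries /= 4;	/* costly orders get fewer attempts */

	if (++(*retries) <= max_retries)
		return true;

	if (*priority > MIN_PRIORITY) {
		(*priority)--;		/* try harder next round */
		*retries = 0;
		return true;
	}
	return false;			/* give up */
}

int main(void)
{
	int priority = 2, retries = 0, rounds = 0;

	while (should_retry(4, &priority, &retries))
		rounds++;
	printf("gave up after %d rounds at priority %d\n", rounds, priority);
	return 0;
}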
 
 /* Perform direct synchronous page reclaim */
 static int
@@ -3325,16 +3398,26 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 static inline bool
 should_reclaim_retry(gfp_t gfp_mask, unsigned order,
                     struct alloc_context *ac, int alloc_flags,
-                    bool did_some_progress, int no_progress_loops)
+                    bool did_some_progress, int *no_progress_loops)
 {
        struct zone *zone;
        struct zoneref *z;
 
+       /*
+        * Costly allocations might have made progress, but this doesn't mean
+        * their order will become available due to high fragmentation, so
+        * always increment the no-progress counter for them.
+        */
+       if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
+               *no_progress_loops = 0;
+       else
+               (*no_progress_loops)++;
+
        /*
         * Make sure we converge to OOM if we cannot make any progress
         * several times in the row.
         */
-       if (no_progress_loops > MAX_RECLAIM_RETRIES)
+       if (*no_progress_loops > MAX_RECLAIM_RETRIES)
                return false;
 
        /*
@@ -3349,7 +3432,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
                unsigned long reclaimable;
 
                available = reclaimable = zone_reclaimable_pages(zone);
-               available -= DIV_ROUND_UP(no_progress_loops * available,
+               available -= DIV_ROUND_UP((*no_progress_loops) * available,
                                          MAX_RECLAIM_RETRIES);
                available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
 
@@ -3410,6 +3493,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        enum compact_result compact_result;
        int compaction_retries = 0;
        int no_progress_loops = 0;
+       unsigned long alloc_start = jiffies;
+       unsigned int stall_timeout = 10 * HZ;
 
        /*
         * In the slowpath, we sanity check order to avoid ever trying to
@@ -3554,9 +3639,6 @@ retry:
        if (page)
                goto got_pg;
 
-       if (order && compaction_made_progress(compact_result))
-               compaction_retries++;
-
        /* Do not loop if specifically requested */
        if (gfp_mask & __GFP_NORETRY)
                goto nopage;
@@ -3568,18 +3650,16 @@ retry:
        if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
                goto nopage;
 
-       /*
-        * Costly allocations might have made a progress but this doesn't mean
-        * their order will become available due to high fragmentation so
-        * always increment the no progress counter for them
-        */
-       if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
-               no_progress_loops = 0;
-       else
-               no_progress_loops++;
+       /* Make sure we know about allocations which stall for too long */
+       if (time_after(jiffies, alloc_start + stall_timeout)) {
+               warn_alloc(gfp_mask,
+                       "page allocation stalls for %ums, order:%u\n",
+                       jiffies_to_msecs(jiffies-alloc_start), order);
+               stall_timeout += 10 * HZ;
+       }
 
        if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
-                                did_some_progress > 0, no_progress_loops))
+                                did_some_progress > 0, &no_progress_loops))
                goto retry;
 
        /*
@@ -3591,7 +3671,7 @@ retry:
        if (did_some_progress > 0 &&
                        should_compact_retry(ac, order, alloc_flags,
                                compact_result, &compact_priority,
-                               compaction_retries))
+                               &compaction_retries))
                goto retry;
 
        /* Reclaim has failed us, start killing things */
@@ -3606,7 +3686,8 @@ retry:
        }
 
 nopage:
-       warn_alloc_failed(gfp_mask, order, NULL);
+       warn_alloc(gfp_mask,
+                       "page allocation failure: order:%u", order);
 got_pg:
        return page;
 }
@@ -4555,7 +4636,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
        int j;
        struct zonelist *zonelist;
 
-       zonelist = &pgdat->node_zonelists[0];
+       zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
        for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
                ;
        j = build_zonelists_node(NODE_DATA(node), zonelist, j);
@@ -4571,7 +4652,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
        int j;
        struct zonelist *zonelist;
 
-       zonelist = &pgdat->node_zonelists[1];
+       zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
        j = build_zonelists_node(pgdat, zonelist, 0);
        zonelist->_zonerefs[j].zone = NULL;
        zonelist->_zonerefs[j].zone_idx = 0;
@@ -4592,7 +4673,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
        struct zone *z;
        struct zonelist *zonelist;
 
-       zonelist = &pgdat->node_zonelists[0];
+       zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
        pos = 0;
        for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
                for (j = 0; j < nr_nodes; j++) {
@@ -4727,7 +4808,7 @@ static void build_zonelists(pg_data_t *pgdat)
 
        local_node = pgdat->node_id;
 
-       zonelist = &pgdat->node_zonelists[0];
+       zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
        j = build_zonelists_node(pgdat, zonelist, 0);
 
        /*
@@ -4999,15 +5080,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                        break;
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-               /*
-                * If not mirrored_kernelcore and ZONE_MOVABLE exists, range
-                * from zone_movable_pfn[nid] to end of each node should be
-                * ZONE_MOVABLE not ZONE_NORMAL. skip it.
-                */
-               if (!mirrored_kernelcore && zone_movable_pfn[nid])
-                       if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
-                               continue;
-
                /*
                 * Check given memblock attribute by firmware which can affect
                 * kernel memory layout.  If zone==ZONE_MOVABLE but memory is
@@ -5451,6 +5523,12 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
                        *zone_end_pfn = min(node_end_pfn,
                                arch_zone_highest_possible_pfn[movable_zone]);
 
+               /* Adjust for ZONE_MOVABLE starting within this range */
+               } else if (!mirrored_kernelcore &&
+                       *zone_start_pfn < zone_movable_pfn[nid] &&
+                       *zone_end_pfn > zone_movable_pfn[nid]) {
+                       *zone_end_pfn = zone_movable_pfn[nid];
+
                /* Check if this whole range is within ZONE_MOVABLE */
                } else if (*zone_start_pfn >= zone_movable_pfn[nid])
                        *zone_start_pfn = *zone_end_pfn;
@@ -5554,28 +5632,23 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
         * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
         * and vice versa.
         */
-       if (zone_movable_pfn[nid]) {
-               if (mirrored_kernelcore) {
-                       unsigned long start_pfn, end_pfn;
-                       struct memblock_region *r;
-
-                       for_each_memblock(memory, r) {
-                               start_pfn = clamp(memblock_region_memory_base_pfn(r),
-                                                 zone_start_pfn, zone_end_pfn);
-                               end_pfn = clamp(memblock_region_memory_end_pfn(r),
-                                               zone_start_pfn, zone_end_pfn);
-
-                               if (zone_type == ZONE_MOVABLE &&
-                                   memblock_is_mirror(r))
-                                       nr_absent += end_pfn - start_pfn;
-
-                               if (zone_type == ZONE_NORMAL &&
-                                   !memblock_is_mirror(r))
-                                       nr_absent += end_pfn - start_pfn;
-                       }
-               } else {
-                       if (zone_type == ZONE_NORMAL)
-                               nr_absent += node_end_pfn - zone_movable_pfn[nid];
+       if (mirrored_kernelcore && zone_movable_pfn[nid]) {
+               unsigned long start_pfn, end_pfn;
+               struct memblock_region *r;
+
+               for_each_memblock(memory, r) {
+                       start_pfn = clamp(memblock_region_memory_base_pfn(r),
+                                         zone_start_pfn, zone_end_pfn);
+                       end_pfn = clamp(memblock_region_memory_end_pfn(r),
+                                       zone_start_pfn, zone_end_pfn);
+
+                       if (zone_type == ZONE_MOVABLE &&
+                           memblock_is_mirror(r))
+                               nr_absent += end_pfn - start_pfn;
+
+                       if (zone_type == ZONE_NORMAL &&
+                           !memblock_is_mirror(r))
+                               nr_absent += end_pfn - start_pfn;
                }
        }
 
@@ -6929,6 +7002,17 @@ static int __init set_hashdist(char *str)
 __setup("hashdist=", set_hashdist);
 #endif
 
+#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
+/*
+ * Returns the number of pages that arch has reserved but
+ * is not known to alloc_large_system_hash().
+ */
+static unsigned long __init arch_reserved_kernel_pages(void)
+{
+       return 0;
+}
+#endif
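arch_reserved_kernel_pages() above uses the usual override pattern: the generic stub is compiled only when the architecture has not defined __HAVE_ARCH_RESERVED_KERNEL_PAGES and supplied its own version. A condensed illustration of that pattern outside the kernel, with an invented caller:

#include <stdio.h>

/*
 * An "arch" header would do:
 *   #define __HAVE_ARCH_RESERVED_KERNEL_PAGES
 *   unsigned long arch_reserved_kernel_pages(void);
 * before this point to override the default below.
 */
#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
static unsigned long arch_reserved_kernel_pages(void)
{
	return 0;	/* generic: nothing extra is reserved */
}
#endif

int main(void)
{
	unsigned long nr_kernel_pages = 1UL << 20;	/* illustrative */

	printf("hash sized from %lu pages\n",
	       nr_kernel_pages - arch_reserved_kernel_pages());
	return 0;
}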
+
 /*
  * allocate a large system hash table from bootmem
  * - it is assumed that the hash table must contain an exact power-of-2
@@ -6953,6 +7037,7 @@ void *__init alloc_large_system_hash(const char *tablename,
        if (!numentries) {
                /* round applicable memory size up to nearest megabyte */
                numentries = nr_kernel_pages;
+               numentries -= arch_reserved_kernel_pages();
 
                /* It isn't necessary when PAGE_SIZE >= 1MB */
                if (PAGE_SHIFT < 20)
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 44a4c02..121dcff 100644 (file)
  * and page extension core can skip to allocate memory. As result,
  * none of memory is wasted.
  *
+ * When need callback returns true, page_ext checks if there is a request for
+ * extra memory through size in struct page_ext_operations. If it is non-zero,
+ * extra space is allocated for each page_ext entry and offset is returned to
+ * user through offset in struct page_ext_operations.
+ *
  * The init callback is used to do proper initialization after page extension
  * is completely initialized. In sparse memory system, extra memory is
  * allocated some time later than memmap is allocated. In other words, lifetime
@@ -66,18 +71,24 @@ static struct page_ext_operations *page_ext_ops[] = {
 };
 
 static unsigned long total_usage;
+static unsigned long extra_mem;
 
 static bool __init invoke_need_callbacks(void)
 {
        int i;
        int entries = ARRAY_SIZE(page_ext_ops);
+       bool need = false;
 
        for (i = 0; i < entries; i++) {
-               if (page_ext_ops[i]->need && page_ext_ops[i]->need())
-                       return true;
+               if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
+                       page_ext_ops[i]->offset = sizeof(struct page_ext) +
+                                               extra_mem;
+                       extra_mem += page_ext_ops[i]->size;
+                       need = true;
+               }
        }
 
-       return false;
+       return need;
 }
 
 static void __init invoke_init_callbacks(void)
@@ -91,6 +102,16 @@ static void __init invoke_init_callbacks(void)
        }
 }
 
+static unsigned long get_entry_size(void)
+{
+       return sizeof(struct page_ext) + extra_mem;
+}
+
+static inline struct page_ext *get_entry(void *base, unsigned long index)
+{
+       return base + get_entry_size() * index;
+}
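With the extra-memory support above, a page_ext entry occupies sizeof(struct page_ext) plus whatever space the enabled clients asked for, entries are addressed through get_entry(base, index) rather than plain struct indexing, and each client reaches its data through the offset recorded in its page_ext_operations. A user-space model of that layout, with invented structure names standing in for the kernel types:

#include <stdio.h>
#include <stdlib.h>

struct entry_hdr { unsigned long flags; };	/* stands in for struct page_ext */
struct owner_data { unsigned int order; };	/* stands in for struct page_owner */

static size_t extra_mem;			/* accumulated client sizes */
static size_t owner_offset;			/* where owner_data lives in an entry */

static size_t entry_size(void) { return sizeof(struct entry_hdr) + extra_mem; }

static void *get_entry(void *base, unsigned long index)
{
	return (char *)base + entry_size() * index;
}

int main(void)
{
	unsigned long nr_entries = 8;
	void *base;

	/* "need" phase: the owner client reserves space and learns its offset. */
	owner_offset = sizeof(struct entry_hdr) + extra_mem;
	extra_mem += sizeof(struct owner_data);

	base = calloc(nr_entries, entry_size());
	if (!base)
		return 1;

	/* Store and read back the per-entry client data via the offset. */
	for (unsigned long i = 0; i < nr_entries; i++) {
		struct owner_data *od = (void *)((char *)get_entry(base, i) +
						 owner_offset);
		od->order = (unsigned int)i;
	}
	printf("entry 5 order = %u (entry size %zu)\n",
	       ((struct owner_data *)((char *)get_entry(base, 5) +
				      owner_offset))->order,
	       entry_size());
	free(base);
	return 0;
}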
+
 #if !defined(CONFIG_SPARSEMEM)
 
 
@@ -102,7 +123,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
 struct page_ext *lookup_page_ext(struct page *page)
 {
        unsigned long pfn = page_to_pfn(page);
-       unsigned long offset;
+       unsigned long index;
        struct page_ext *base;
 
        base = NODE_DATA(page_to_nid(page))->node_page_ext;
@@ -119,9 +140,9 @@ struct page_ext *lookup_page_ext(struct page *page)
        if (unlikely(!base))
                return NULL;
 #endif
-       offset = pfn - round_down(node_start_pfn(page_to_nid(page)),
+       index = pfn - round_down(node_start_pfn(page_to_nid(page)),
                                        MAX_ORDER_NR_PAGES);
-       return base + offset;
+       return get_entry(base, index);
 }
 
 static int __init alloc_node_page_ext(int nid)
@@ -143,7 +164,7 @@ static int __init alloc_node_page_ext(int nid)
                !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
                nr_pages += MAX_ORDER_NR_PAGES;
 
-       table_size = sizeof(struct page_ext) * nr_pages;
+       table_size = get_entry_size() * nr_pages;
 
        base = memblock_virt_alloc_try_nid_nopanic(
                        table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
@@ -196,7 +217,7 @@ struct page_ext *lookup_page_ext(struct page *page)
        if (!section->page_ext)
                return NULL;
 #endif
-       return section->page_ext + pfn;
+       return get_entry(section->page_ext, pfn);
 }
 
 static void *__meminit alloc_page_ext(size_t size, int nid)
@@ -229,7 +250,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid)
        if (section->page_ext)
                return 0;
 
-       table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
+       table_size = get_entry_size() * PAGES_PER_SECTION;
        base = alloc_page_ext(table_size, nid);
 
        /*
@@ -249,7 +270,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid)
         * we need to apply a mask.
         */
        pfn &= PAGE_SECTION_MASK;
-       section->page_ext = base - pfn;
+       section->page_ext = (void *)base - get_entry_size() * pfn;
        total_usage += table_size;
        return 0;
 }
@@ -262,7 +283,7 @@ static void free_page_ext(void *addr)
                struct page *page = virt_to_page(addr);
                size_t table_size;
 
-               table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
+               table_size = get_entry_size() * PAGES_PER_SECTION;
 
                BUG_ON(PageReserved(page));
                free_pages_exact(addr, table_size);
@@ -277,7 +298,7 @@ static void __free_page_ext(unsigned long pfn)
        ms = __pfn_to_section(pfn);
        if (!ms || !ms->page_ext)
                return;
-       base = ms->page_ext + pfn;
+       base = get_entry(ms->page_ext, pfn);
        free_page_ext(base);
        ms->page_ext = NULL;
 }
diff --git a/mm/page_io.c b/mm/page_io.c
index eafe5dd..a2651f5 100644 (file)
@@ -264,7 +264,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
        int ret;
        struct swap_info_struct *sis = page_swap_info(page);
 
-       BUG_ON(!PageSwapCache(page));
+       VM_BUG_ON_PAGE(!PageSwapCache(page), page);
        if (sis->flags & SWP_FILE) {
                struct kiocb kiocb;
                struct file *swap_file = sis->swap_file;
@@ -338,7 +338,7 @@ int swap_readpage(struct page *page)
        int ret = 0;
        struct swap_info_struct *sis = page_swap_info(page);
 
-       BUG_ON(!PageSwapCache(page));
+       VM_BUG_ON_PAGE(!PageSwapCache(page), page);
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(PageUptodate(page), page);
        if (frontswap_load(page) == 0) {
@@ -388,7 +388,8 @@ int swap_set_page_dirty(struct page *page)
 
        if (sis->flags & SWP_FILE) {
                struct address_space *mapping = sis->swap_file->f_mapping;
-               BUG_ON(!PageSwapCache(page));
+
+               VM_BUG_ON_PAGE(!PageSwapCache(page), page);
                return mapping->a_ops->set_page_dirty(page);
        } else {
                return __set_page_dirty_no_writeback(page);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 064b7fb..a5594bf 100644 (file)
@@ -55,7 +55,7 @@ static int set_migratetype_isolate(struct page *page,
                ret = 0;
 
        /*
-        * immobile means "not-on-lru" paes. If immobile is larger than
+        * immobile means "not-on-lru" pages. If immobile is larger than
         * removable-by-driver pages reported by notifier, we'll fail.
         */
 
diff --git a/mm/page_owner.c b/mm/page_owner.c
index ec6dc18..60634dc 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/jump_label.h>
 #include <linux/migrate.h>
 #include <linux/stackdepot.h>
+#include <linux/seq_file.h>
 
 #include "internal.h"
 
  */
 #define PAGE_OWNER_STACK_DEPTH (16)
 
+struct page_owner {
+       unsigned int order;
+       gfp_t gfp_mask;
+       int last_migrate_reason;
+       depot_stack_handle_t handle;
+};
+
 static bool page_owner_disabled = true;
 DEFINE_STATIC_KEY_FALSE(page_owner_inited);
 
@@ -85,10 +93,16 @@ static void init_page_owner(void)
 }
 
 struct page_ext_operations page_owner_ops = {
+       .size = sizeof(struct page_owner),
        .need = need_page_owner,
        .init = init_page_owner,
 };
 
+static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
+{
+       return (void *)page_ext + page_owner_ops.offset;
+}
+
 void __reset_page_owner(struct page *page, unsigned int order)
 {
        int i;
@@ -155,14 +169,16 @@ noinline void __set_page_owner(struct page *page, unsigned int order,
                                        gfp_t gfp_mask)
 {
        struct page_ext *page_ext = lookup_page_ext(page);
+       struct page_owner *page_owner;
 
        if (unlikely(!page_ext))
                return;
 
-       page_ext->handle = save_stack(gfp_mask);
-       page_ext->order = order;
-       page_ext->gfp_mask = gfp_mask;
-       page_ext->last_migrate_reason = -1;
+       page_owner = get_page_owner(page_ext);
+       page_owner->handle = save_stack(gfp_mask);
+       page_owner->order = order;
+       page_owner->gfp_mask = gfp_mask;
+       page_owner->last_migrate_reason = -1;
 
        __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
 }
@@ -170,21 +186,26 @@ noinline void __set_page_owner(struct page *page, unsigned int order,
 void __set_page_owner_migrate_reason(struct page *page, int reason)
 {
        struct page_ext *page_ext = lookup_page_ext(page);
+       struct page_owner *page_owner;
+
        if (unlikely(!page_ext))
                return;
 
-       page_ext->last_migrate_reason = reason;
+       page_owner = get_page_owner(page_ext);
+       page_owner->last_migrate_reason = reason;
 }
 
 void __split_page_owner(struct page *page, unsigned int order)
 {
        int i;
        struct page_ext *page_ext = lookup_page_ext(page);
+       struct page_owner *page_owner;
 
        if (unlikely(!page_ext))
                return;
 
-       page_ext->order = 0;
+       page_owner = get_page_owner(page_ext);
+       page_owner->order = 0;
        for (i = 1; i < (1 << order); i++)
                __copy_page_owner(page, page + i);
 }
@@ -193,14 +214,18 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
 {
        struct page_ext *old_ext = lookup_page_ext(oldpage);
        struct page_ext *new_ext = lookup_page_ext(newpage);
+       struct page_owner *old_page_owner, *new_page_owner;
 
        if (unlikely(!old_ext || !new_ext))
                return;
 
-       new_ext->order = old_ext->order;
-       new_ext->gfp_mask = old_ext->gfp_mask;
-       new_ext->last_migrate_reason = old_ext->last_migrate_reason;
-       new_ext->handle = old_ext->handle;
+       old_page_owner = get_page_owner(old_ext);
+       new_page_owner = get_page_owner(new_ext);
+       new_page_owner->order = old_page_owner->order;
+       new_page_owner->gfp_mask = old_page_owner->gfp_mask;
+       new_page_owner->last_migrate_reason =
+               old_page_owner->last_migrate_reason;
+       new_page_owner->handle = old_page_owner->handle;
 
        /*
         * We don't clear the bit on the oldpage as it's going to be freed
@@ -214,9 +239,88 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
        __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
 }
 
+void pagetypeinfo_showmixedcount_print(struct seq_file *m,
+                                      pg_data_t *pgdat, struct zone *zone)
+{
+       struct page *page;
+       struct page_ext *page_ext;
+       struct page_owner *page_owner;
+       unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
+       unsigned long end_pfn = pfn + zone->spanned_pages;
+       unsigned long count[MIGRATE_TYPES] = { 0, };
+       int pageblock_mt, page_mt;
+       int i;
+
+       /* Scan block by block. First and last block may be incomplete */
+       pfn = zone->zone_start_pfn;
+
+       /*
+        * Walk the zone in pageblock_nr_pages steps. If a page block spans
+        * a zone boundary, it will be double counted between zones. This does
+        * not matter as the mixed block count will still be correct
+        */
+       for (; pfn < end_pfn; ) {
+               if (!pfn_valid(pfn)) {
+                       pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+                       continue;
+               }
+
+               block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+               block_end_pfn = min(block_end_pfn, end_pfn);
+
+               page = pfn_to_page(pfn);
+               pageblock_mt = get_pageblock_migratetype(page);
+
+               for (; pfn < block_end_pfn; pfn++) {
+                       if (!pfn_valid_within(pfn))
+                               continue;
+
+                       page = pfn_to_page(pfn);
+
+                       if (page_zone(page) != zone)
+                               continue;
+
+                       if (PageBuddy(page)) {
+                               pfn += (1UL << page_order(page)) - 1;
+                               continue;
+                       }
+
+                       if (PageReserved(page))
+                               continue;
+
+                       page_ext = lookup_page_ext(page);
+                       if (unlikely(!page_ext))
+                               continue;
+
+                       if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+                               continue;
+
+                       page_owner = get_page_owner(page_ext);
+                       page_mt = gfpflags_to_migratetype(
+                                       page_owner->gfp_mask);
+                       if (pageblock_mt != page_mt) {
+                               if (is_migrate_cma(pageblock_mt))
+                                       count[MIGRATE_MOVABLE]++;
+                               else
+                                       count[pageblock_mt]++;
+
+                               pfn = block_end_pfn;
+                               break;
+                       }
+                       pfn += (1UL << page_owner->order) - 1;
+               }
+       }
+
+       /* Print counts */
+       seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+       for (i = 0; i < MIGRATE_TYPES; i++)
+               seq_printf(m, "%12lu ", count[i]);
+       seq_putc(m, '\n');
+}
+
 static ssize_t
 print_page_owner(char __user *buf, size_t count, unsigned long pfn,
-               struct page *page, struct page_ext *page_ext,
+               struct page *page, struct page_owner *page_owner,
                depot_stack_handle_t handle)
 {
        int ret;
@@ -236,15 +340,15 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
 
        ret = snprintf(kbuf, count,
                        "Page allocated via order %u, mask %#x(%pGg)\n",
-                       page_ext->order, page_ext->gfp_mask,
-                       &page_ext->gfp_mask);
+                       page_owner->order, page_owner->gfp_mask,
+                       &page_owner->gfp_mask);
 
        if (ret >= count)
                goto err;
 
        /* Print information relevant to grouping pages by mobility */
        pageblock_mt = get_pageblock_migratetype(page);
-       page_mt  = gfpflags_to_migratetype(page_ext->gfp_mask);
+       page_mt  = gfpflags_to_migratetype(page_owner->gfp_mask);
        ret += snprintf(kbuf + ret, count - ret,
                        "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
                        pfn,
@@ -261,10 +365,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
        if (ret >= count)
                goto err;
 
-       if (page_ext->last_migrate_reason != -1) {
+       if (page_owner->last_migrate_reason != -1) {
                ret += snprintf(kbuf + ret, count - ret,
                        "Page has been migrated, last migrate reason: %s\n",
-                       migrate_reason_names[page_ext->last_migrate_reason]);
+                       migrate_reason_names[page_owner->last_migrate_reason]);
                if (ret >= count)
                        goto err;
        }
@@ -287,6 +391,7 @@ err:
 void __dump_page_owner(struct page *page)
 {
        struct page_ext *page_ext = lookup_page_ext(page);
+       struct page_owner *page_owner;
        unsigned long entries[PAGE_OWNER_STACK_DEPTH];
        struct stack_trace trace = {
                .nr_entries = 0,
@@ -302,7 +407,9 @@ void __dump_page_owner(struct page *page)
                pr_alert("There is not page extension available.\n");
                return;
        }
-       gfp_mask = page_ext->gfp_mask;
+
+       page_owner = get_page_owner(page_ext);
+       gfp_mask = page_owner->gfp_mask;
        mt = gfpflags_to_migratetype(gfp_mask);
 
        if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
@@ -310,7 +417,7 @@ void __dump_page_owner(struct page *page)
                return;
        }
 
-       handle = READ_ONCE(page_ext->handle);
+       handle = READ_ONCE(page_owner->handle);
        if (!handle) {
                pr_alert("page_owner info is not active (free page?)\n");
                return;
@@ -318,12 +425,12 @@ void __dump_page_owner(struct page *page)
 
        depot_fetch_stack(handle, &trace);
        pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
-                page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask);
+                page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
        print_stack_trace(&trace, 0);
 
-       if (page_ext->last_migrate_reason != -1)
+       if (page_owner->last_migrate_reason != -1)
                pr_alert("page has been migrated, last migrate reason: %s\n",
-                       migrate_reason_names[page_ext->last_migrate_reason]);
+                       migrate_reason_names[page_owner->last_migrate_reason]);
 }
 
 static ssize_t
@@ -332,6 +439,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
        unsigned long pfn;
        struct page *page;
        struct page_ext *page_ext;
+       struct page_owner *page_owner;
        depot_stack_handle_t handle;
 
        if (!static_branch_unlikely(&page_owner_inited))
@@ -381,11 +489,13 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
                if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
                        continue;
 
+               page_owner = get_page_owner(page_ext);
+
                /*
-                * Access to page_ext->handle isn't synchronous so we should
-                * be careful to access it.
+                * Access to page_owner->handle isn't synchronized, so we
+                * should be careful when accessing it.
                 */
-               handle = READ_ONCE(page_ext->handle);
+               handle = READ_ONCE(page_owner->handle);
                if (!handle)
                        continue;
 
@@ -393,7 +503,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
                *ppos = (pfn - min_low_pfn) + 1;
 
                return print_page_owner(buf, count, pfn, page,
-                               page_ext, handle);
+                               page_owner, handle);
        }
 
        return 0;
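
The page_owner.c hunks above repeatedly call get_page_owner() to translate a page_ext into the page_owner record now stored in its extension area. A minimal sketch of what such a helper presumably looks like (the offset bookkeeping via page_owner_ops is an assumption based on how page_ext extensions are reserved):

	/*
	 * Sketch only: page_owner data is assumed to live at a fixed offset
	 * inside each per-page page_ext, reserved at boot via a
	 * struct page_ext_operations (here called page_owner_ops).
	 */
	static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
	{
		return (void *)page_ext + page_owner_ops.offset;
	}
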
index d86b5e4..0e9901e 100644 (file)
@@ -3965,7 +3965,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
 
 /* common code */
 
-static struct dentry_operations anon_ops = {
+static const struct dentry_operations anon_ops = {
        .d_dname = simple_dname
 };
 
index 75c63bb..4dcf852 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -748,10 +748,8 @@ void release_pages(struct page **pages, int nr, bool cold)
                        locked_pgdat = NULL;
                }
 
-               if (is_huge_zero_page(page)) {
-                       put_huge_zero_page();
+               if (is_huge_zero_page(page))
                        continue;
-               }
 
                page = compound_head(page);
                if (!put_page_testzero(page))
index c8310a3..35d7e0e 100644 (file)
@@ -37,6 +37,8 @@ struct address_space swapper_spaces[MAX_SWAPFILES] = {
                .page_tree      = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
                .i_mmap_writable = ATOMIC_INIT(0),
                .a_ops          = &swap_aops,
+               /* swap cache doesn't use writeback related tags */
+               .flags          = 1 << AS_NO_WRITEBACK_TAGS,
        }
 };
 
@@ -92,7 +94,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
        address_space = swap_address_space(entry);
        spin_lock_irq(&address_space->tree_lock);
        error = radix_tree_insert(&address_space->page_tree,
-                                       entry.val, page);
+                                 swp_offset(entry), page);
        if (likely(!error)) {
                address_space->nrpages++;
                __inc_node_page_state(page, NR_FILE_PAGES);
@@ -143,7 +145,7 @@ void __delete_from_swap_cache(struct page *page)
 
        entry.val = page_private(page);
        address_space = swap_address_space(entry);
-       radix_tree_delete(&address_space->page_tree, page_private(page));
+       radix_tree_delete(&address_space->page_tree, swp_offset(entry));
        set_page_private(page, 0);
        ClearPageSwapCache(page);
        address_space->nrpages--;
@@ -252,9 +254,7 @@ static inline void free_swap_cache(struct page *page)
 void free_page_and_swap_cache(struct page *page)
 {
        free_swap_cache(page);
-       if (is_huge_zero_page(page))
-               put_huge_zero_page();
-       else
+       if (!is_huge_zero_page(page))
                put_page(page);
 }
 
@@ -283,7 +283,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
 {
        struct page *page;
 
-       page = find_get_page(swap_address_space(entry), entry.val);
+       page = find_get_page(swap_address_space(entry), swp_offset(entry));
 
        if (page) {
                INC_CACHE_INFO(find_success);
@@ -310,7 +310,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                 * called after lookup_swap_cache() failed, re-calling
                 * that would confuse statistics.
                 */
-               found_page = find_get_page(swapper_space, entry.val);
+               found_page = find_get_page(swapper_space, swp_offset(entry));
                if (found_page)
                        break;
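
Taken together, the swap_state.c hunks switch the per-type swap cache radix tree key from the full swp_entry_t value to just the offset within the swap device; swap_address_space() already selects a separate mapping per swap type, so the offset alone is unique within it. A minimal sketch of the resulting lookup pattern (the wrapper name is hypothetical, shown only to condense the idiom):

	/* Sketch: look up a swap-cache page; the tree is indexed by offset alone. */
	static struct page *swap_cache_lookup(swp_entry_t entry)
	{
		struct address_space *mapping = swap_address_space(entry);

		return find_get_page(mapping, swp_offset(entry));
	}
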
 
index 2657acc..2210de2 100644 (file)
@@ -105,7 +105,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
        struct page *page;
        int ret = 0;
 
-       page = find_get_page(swap_address_space(entry), entry.val);
+       page = find_get_page(swap_address_space(entry), swp_offset(entry));
        if (!page)
                return 0;
        /*
@@ -257,6 +257,53 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
        info->data = 0;
 }
 
+static inline bool cluster_list_empty(struct swap_cluster_list *list)
+{
+       return cluster_is_null(&list->head);
+}
+
+static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
+{
+       return cluster_next(&list->head);
+}
+
+static void cluster_list_init(struct swap_cluster_list *list)
+{
+       cluster_set_null(&list->head);
+       cluster_set_null(&list->tail);
+}
+
+static void cluster_list_add_tail(struct swap_cluster_list *list,
+                                 struct swap_cluster_info *ci,
+                                 unsigned int idx)
+{
+       if (cluster_list_empty(list)) {
+               cluster_set_next_flag(&list->head, idx, 0);
+               cluster_set_next_flag(&list->tail, idx, 0);
+       } else {
+               unsigned int tail = cluster_next(&list->tail);
+
+               cluster_set_next(&ci[tail], idx);
+               cluster_set_next_flag(&list->tail, idx, 0);
+       }
+}
+
+static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
+                                          struct swap_cluster_info *ci)
+{
+       unsigned int idx;
+
+       idx = cluster_next(&list->head);
+       if (cluster_next(&list->tail) == idx) {
+               cluster_set_null(&list->head);
+               cluster_set_null(&list->tail);
+       } else
+               cluster_set_next_flag(&list->head,
+                                     cluster_next(&ci[idx]), 0);
+
+       return idx;
+}
+
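
The helpers above manage a simple FIFO of cluster indices whose "next" links are threaded through the cluster_info[] array itself. The list head and tail are presumably plain swap_cluster_info entries bundled into a small struct, roughly:

	/*
	 * Assumed layout: head/tail hold the index of the first/last cluster
	 * on the list; the intermediate links live in cluster_info[].
	 */
	struct swap_cluster_list {
		struct swap_cluster_info head;
		struct swap_cluster_info tail;
	};
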
 /* Add a cluster to discard list and schedule it to do discard */
 static void swap_cluster_schedule_discard(struct swap_info_struct *si,
                unsigned int idx)
@@ -270,17 +317,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
        memset(si->swap_map + idx * SWAPFILE_CLUSTER,
                        SWAP_MAP_BAD, SWAPFILE_CLUSTER);
 
-       if (cluster_is_null(&si->discard_cluster_head)) {
-               cluster_set_next_flag(&si->discard_cluster_head,
-                                               idx, 0);
-               cluster_set_next_flag(&si->discard_cluster_tail,
-                                               idx, 0);
-       } else {
-               unsigned int tail = cluster_next(&si->discard_cluster_tail);
-               cluster_set_next(&si->cluster_info[tail], idx);
-               cluster_set_next_flag(&si->discard_cluster_tail,
-                                               idx, 0);
-       }
+       cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
 
        schedule_work(&si->discard_work);
 }
@@ -296,15 +333,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)
 
        info = si->cluster_info;
 
-       while (!cluster_is_null(&si->discard_cluster_head)) {
-               idx = cluster_next(&si->discard_cluster_head);
-
-               cluster_set_next_flag(&si->discard_cluster_head,
-                                               cluster_next(&info[idx]), 0);
-               if (cluster_next(&si->discard_cluster_tail) == idx) {
-                       cluster_set_null(&si->discard_cluster_head);
-                       cluster_set_null(&si->discard_cluster_tail);
-               }
+       while (!cluster_list_empty(&si->discard_clusters)) {
+               idx = cluster_list_del_first(&si->discard_clusters, info);
                spin_unlock(&si->lock);
 
                discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
@@ -312,19 +342,7 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)
 
                spin_lock(&si->lock);
                cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
-               if (cluster_is_null(&si->free_cluster_head)) {
-                       cluster_set_next_flag(&si->free_cluster_head,
-                                               idx, 0);
-                       cluster_set_next_flag(&si->free_cluster_tail,
-                                               idx, 0);
-               } else {
-                       unsigned int tail;
-
-                       tail = cluster_next(&si->free_cluster_tail);
-                       cluster_set_next(&info[tail], idx);
-                       cluster_set_next_flag(&si->free_cluster_tail,
-                                               idx, 0);
-               }
+               cluster_list_add_tail(&si->free_clusters, info, idx);
                memset(si->swap_map + idx * SWAPFILE_CLUSTER,
                                0, SWAPFILE_CLUSTER);
        }
@@ -353,13 +371,8 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
        if (!cluster_info)
                return;
        if (cluster_is_free(&cluster_info[idx])) {
-               VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx);
-               cluster_set_next_flag(&p->free_cluster_head,
-                       cluster_next(&cluster_info[idx]), 0);
-               if (cluster_next(&p->free_cluster_tail) == idx) {
-                       cluster_set_null(&p->free_cluster_tail);
-                       cluster_set_null(&p->free_cluster_head);
-               }
+               VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx);
+               cluster_list_del_first(&p->free_clusters, cluster_info);
                cluster_set_count_flag(&cluster_info[idx], 0, 0);
        }
 
@@ -398,14 +411,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
                }
 
                cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
-               if (cluster_is_null(&p->free_cluster_head)) {
-                       cluster_set_next_flag(&p->free_cluster_head, idx, 0);
-                       cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
-               } else {
-                       unsigned int tail = cluster_next(&p->free_cluster_tail);
-                       cluster_set_next(&cluster_info[tail], idx);
-                       cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
-               }
+               cluster_list_add_tail(&p->free_clusters, cluster_info, idx);
        }
 }
 
@@ -421,8 +427,8 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
        bool conflict;
 
        offset /= SWAPFILE_CLUSTER;
-       conflict = !cluster_is_null(&si->free_cluster_head) &&
-               offset != cluster_next(&si->free_cluster_head) &&
+       conflict = !cluster_list_empty(&si->free_clusters) &&
+               offset != cluster_list_first(&si->free_clusters) &&
                cluster_is_free(&si->cluster_info[offset]);
 
        if (!conflict)
@@ -447,11 +453,11 @@ static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 new_cluster:
        cluster = this_cpu_ptr(si->percpu_cluster);
        if (cluster_is_null(&cluster->index)) {
-               if (!cluster_is_null(&si->free_cluster_head)) {
-                       cluster->index = si->free_cluster_head;
+               if (!cluster_list_empty(&si->free_clusters)) {
+                       cluster->index = si->free_clusters.head;
                        cluster->next = cluster_next(&cluster->index) *
                                        SWAPFILE_CLUSTER;
-               } else if (!cluster_is_null(&si->discard_cluster_head)) {
+               } else if (!cluster_list_empty(&si->discard_clusters)) {
                        /*
                         * we don't have free cluster but have some clusters in
                         * discarding, do discard now and reclaim them
@@ -999,7 +1005,7 @@ int free_swap_and_cache(swp_entry_t entry)
        if (p) {
                if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
                        page = find_get_page(swap_address_space(entry),
-                                               entry.val);
+                                            swp_offset(entry));
                        if (page && !trylock_page(page)) {
                                put_page(page);
                                page = NULL;
@@ -2292,10 +2298,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 
        nr_good_pages = maxpages - 1;   /* omit header page */
 
-       cluster_set_null(&p->free_cluster_head);
-       cluster_set_null(&p->free_cluster_tail);
-       cluster_set_null(&p->discard_cluster_head);
-       cluster_set_null(&p->discard_cluster_tail);
+       cluster_list_init(&p->free_clusters);
+       cluster_list_init(&p->discard_clusters);
 
        for (i = 0; i < swap_header->info.nr_badpages; i++) {
                unsigned int page_nr = swap_header->info.badpages[i];
@@ -2341,19 +2345,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
        for (i = 0; i < nr_clusters; i++) {
                if (!cluster_count(&cluster_info[idx])) {
                        cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
-                       if (cluster_is_null(&p->free_cluster_head)) {
-                               cluster_set_next_flag(&p->free_cluster_head,
-                                                               idx, 0);
-                               cluster_set_next_flag(&p->free_cluster_tail,
-                                                               idx, 0);
-                       } else {
-                               unsigned int tail;
-
-                               tail = cluster_next(&p->free_cluster_tail);
-                               cluster_set_next(&cluster_info[tail], idx);
-                               cluster_set_next_flag(&p->free_cluster_tail,
-                                                               idx, 0);
-                       }
+                       cluster_list_add_tail(&p->free_clusters, cluster_info,
+                                             idx);
                }
                idx++;
                if (idx == nr_clusters)
index fd09dc9..035fdeb 100644 (file)
@@ -87,11 +87,11 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
 {
        int i;
 
+       count_vm_vmacache_event(VMACACHE_FIND_CALLS);
+
        if (!vmacache_valid(mm))
                return NULL;
 
-       count_vm_vmacache_event(VMACACHE_FIND_CALLS);
-
        for (i = 0; i < VMACACHE_SIZE; i++) {
                struct vm_area_struct *vma = current->vmacache[i];
 
@@ -115,11 +115,11 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
 {
        int i;
 
+       count_vm_vmacache_event(VMACACHE_FIND_CALLS);
+
        if (!vmacache_valid(mm))
                return NULL;
 
-       count_vm_vmacache_event(VMACACHE_FIND_CALLS);
-
        for (i = 0; i < VMACACHE_SIZE; i++) {
                struct vm_area_struct *vma = current->vmacache[i];
 
index 91f44e7..f2481cb 100644 (file)
@@ -1359,14 +1359,14 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
        struct vm_struct *area;
 
        BUG_ON(in_interrupt());
-       if (flags & VM_IOREMAP)
-               align = 1ul << clamp_t(int, fls_long(size),
-                                      PAGE_SHIFT, IOREMAP_MAX_ORDER);
-
        size = PAGE_ALIGN(size);
        if (unlikely(!size))
                return NULL;
 
+       if (flags & VM_IOREMAP)
+               align = 1ul << clamp_t(int, get_count_order_long(size),
+                                      PAGE_SHIFT, IOREMAP_MAX_ORDER);
+
        area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
        if (unlikely(!area))
                return NULL;
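
This hunk computes the VM_IOREMAP alignment from the page-aligned size using get_count_order_long() instead of fls_long(). Assuming get_count_order_long() returns the order of the smallest power of two that is >= its argument, the practical difference shows up for exact power-of-two requests:

	/*
	 * Worked example (assumed semantics):
	 *   size = 8192:  1ul << fls_long(8192)            = 16384  (over-aligned)
	 *                 1ul << get_count_order_long(8192) =  8192
	 *   size = 12288: both expressions give 16384
	 */
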
@@ -1601,7 +1601,6 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                                 pgprot_t prot, int node)
 {
-       const int order = 0;
        struct page **pages;
        unsigned int nr_pages, array_size, i;
        const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
@@ -1629,9 +1628,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                struct page *page;
 
                if (node == NUMA_NO_NODE)
-                       page = alloc_pages(alloc_mask, order);
+                       page = alloc_page(alloc_mask);
                else
-                       page = alloc_pages_node(node, alloc_mask, order);
+                       page = alloc_pages_node(node, alloc_mask, 0);
 
                if (unlikely(!page)) {
                        /* Successfully allocated i pages, free them in __vunmap() */
@@ -1648,8 +1647,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
        return area->addr;
 
 fail:
-       warn_alloc_failed(gfp_mask, order,
-                         "vmalloc: allocation failure, allocated %ld of %ld bytes\n",
+       warn_alloc(gfp_mask,
+                         "vmalloc: allocation failure, allocated %ld of %ld bytes",
                          (area->nr_pages*PAGE_SIZE), area->size);
        vfree(area->addr);
        return NULL;
@@ -1710,9 +1709,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
        return addr;
 
 fail:
-       warn_alloc_failed(gfp_mask, 0,
-                         "vmalloc: allocation failure: %lu bytes\n",
-                         real_size);
+       warn_alloc(gfp_mask,
+                         "vmalloc: allocation failure: %lu bytes", real_size);
        return NULL;
 }
 
index 0fe8b71..744f926 100644 (file)
@@ -2418,8 +2418,6 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
        if (inactive_list_is_low(lruvec, false, sc))
                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                   sc, LRU_ACTIVE_ANON);
-
-       throttle_vm_writeout(sc->gfp_mask);
 }
 
 /* Use reclaim/compaction for costly allocs or under memory pressure */
@@ -2480,7 +2478,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
         * If we have not reclaimed enough pages for compaction and the
         * inactive lists are large enough, continue reclaiming
         */
-       pages_for_compaction = (2UL << sc->order);
+       pages_for_compaction = compact_gap(sc->order);
        inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
        if (get_nr_swap_pages() > 0)
                inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
@@ -2495,7 +2493,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
                        continue;
 
                switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
-               case COMPACT_PARTIAL:
+               case COMPACT_SUCCESS:
                case COMPACT_CONTINUE:
                        return false;
                default:
@@ -2598,38 +2596,35 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 }
 
 /*
- * Returns true if compaction should go ahead for a high-order request, or
- * the high-order allocation would succeed without compaction.
+ * Returns true if compaction should go ahead for a costly-order request, or
+ * the allocation would already succeed without compaction. Return false if we
+ * should reclaim first.
  */
 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
 {
        unsigned long watermark;
-       bool watermark_ok;
+       enum compact_result suitable;
 
-       /*
-        * Compaction takes time to run and there are potentially other
-        * callers using the pages just freed. Continue reclaiming until
-        * there is a buffer of free pages available to give compaction
-        * a reasonable chance of completing and allocating the page
-        */
-       watermark = high_wmark_pages(zone) + (2UL << sc->order);
-       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
-
-       /*
-        * If compaction is deferred, reclaim up to a point where
-        * compaction will have a chance of success when re-enabled
-        */
-       if (compaction_deferred(zone, sc->order))
-               return watermark_ok;
+       suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
+       if (suitable == COMPACT_SUCCESS)
+               /* Allocation should succeed already. Don't reclaim. */
+               return true;
+       if (suitable == COMPACT_SKIPPED)
+               /* Compaction cannot yet proceed. Do reclaim. */
+               return false;
 
        /*
-        * If compaction is not ready to start and allocation is not likely
-        * to succeed without it, then keep reclaiming.
+        * Compaction is already possible, but it takes time to run and there
+        * are potentially other callers using the pages just freed. So proceed
+        * with reclaim to make a buffer of free pages available to give
+        * compaction a reasonable chance of completing and allocating the page.
+        * Note that we won't actually reclaim the whole buffer in one attempt
+        * as the target watermark in should_continue_reclaim() is lower. But if
+        * we are already above the high+gap watermark, don't reclaim at all.
         */
-       if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED)
-               return false;
+       watermark = high_wmark_pages(zone) + compact_gap(sc->order);
 
-       return watermark_ok;
+       return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
 }
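
compaction_ready() and the reclaim heuristics above now share compact_gap() instead of open-coding the buffer of free pages kept ahead of compaction. Judging by the 2UL << sc->order expressions this series replaces, the helper presumably reduces to:

	/*
	 * Assumed definition, inferred from the expressions it replaces:
	 * leave twice the request size as headroom for compaction.
	 */
	static inline unsigned long compact_gap(unsigned int order)
	{
		return 2UL << order;
	}
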
 
 /*
@@ -3041,7 +3036,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
         */
        nid = mem_cgroup_select_victim_node(memcg);
 
-       zonelist = NODE_DATA(nid)->node_zonelists;
+       zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
 
        trace_mm_vmscan_memcg_reclaim_begin(0,
                                            sc.may_writepage,
@@ -3169,7 +3164,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
         * excessive reclaim. Assume that a process requested a high-order
         * can direct reclaim/compact.
         */
-       if (sc->order && sc->nr_reclaimed >= 2UL << sc->order)
+       if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
                sc->order = 0;
 
        return sc->nr_scanned >= sc->nr_to_reclaim;
index 89cec42..604f26a 100644 (file)
@@ -1254,85 +1254,6 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
        return 0;
 }
 
-#ifdef CONFIG_PAGE_OWNER
-static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
-                                                       pg_data_t *pgdat,
-                                                       struct zone *zone)
-{
-       struct page *page;
-       struct page_ext *page_ext;
-       unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
-       unsigned long end_pfn = pfn + zone->spanned_pages;
-       unsigned long count[MIGRATE_TYPES] = { 0, };
-       int pageblock_mt, page_mt;
-       int i;
-
-       /* Scan block by block. First and last block may be incomplete */
-       pfn = zone->zone_start_pfn;
-
-       /*
-        * Walk the zone in pageblock_nr_pages steps. If a page block spans
-        * a zone boundary, it will be double counted between zones. This does
-        * not matter as the mixed block count will still be correct
-        */
-       for (; pfn < end_pfn; ) {
-               if (!pfn_valid(pfn)) {
-                       pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
-                       continue;
-               }
-
-               block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
-               block_end_pfn = min(block_end_pfn, end_pfn);
-
-               page = pfn_to_page(pfn);
-               pageblock_mt = get_pageblock_migratetype(page);
-
-               for (; pfn < block_end_pfn; pfn++) {
-                       if (!pfn_valid_within(pfn))
-                               continue;
-
-                       page = pfn_to_page(pfn);
-
-                       if (page_zone(page) != zone)
-                               continue;
-
-                       if (PageBuddy(page)) {
-                               pfn += (1UL << page_order(page)) - 1;
-                               continue;
-                       }
-
-                       if (PageReserved(page))
-                               continue;
-
-                       page_ext = lookup_page_ext(page);
-                       if (unlikely(!page_ext))
-                               continue;
-
-                       if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
-                               continue;
-
-                       page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
-                       if (pageblock_mt != page_mt) {
-                               if (is_migrate_cma(pageblock_mt))
-                                       count[MIGRATE_MOVABLE]++;
-                               else
-                                       count[pageblock_mt]++;
-
-                               pfn = block_end_pfn;
-                               break;
-                       }
-                       pfn += (1UL << page_ext->order) - 1;
-               }
-       }
-
-       /* Print counts */
-       seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
-       for (i = 0; i < MIGRATE_TYPES; i++)
-               seq_printf(m, "%12lu ", count[i]);
-       seq_putc(m, '\n');
-}
-#endif /* CONFIG_PAGE_OWNER */
-
 /*
  * Print out the number of pageblocks for each migratetype that contain pages
  * of other types. This gives an indication of how well fallbacks are being
@@ -1592,7 +1513,10 @@ static int vmstat_show(struct seq_file *m, void *arg)
 {
        unsigned long *l = arg;
        unsigned long off = l - (unsigned long *)m->private;
-       seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
+
+       seq_puts(m, vmstat_text[off]);
+       seq_put_decimal_ull(m, " ", *l);
+       seq_putc(m, '\n');
        return 0;
 }
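
The rewritten vmstat_show() emits the counter through seq_put_decimal_ull(), which, judging by this call site, now takes a delimiter string printed before the number rather than a single character. The assumed prototype:

	/* Assumed prototype: print `delimiter`, then `num` in decimal. */
	void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
				 unsigned long long num);
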
 
@@ -1794,6 +1718,16 @@ static void __init start_shepherd_timer(void)
                round_jiffies_relative(sysctl_stat_interval));
 }
 
+static void __init init_cpu_node_state(void)
+{
+       int cpu;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu)
+               node_set_state(cpu_to_node(cpu), N_CPU);
+       put_online_cpus();
+}
+
 static void vmstat_cpu_dead(int node)
 {
        int cpu;
@@ -1851,6 +1785,7 @@ static int __init setup_vmstat(void)
 #ifdef CONFIG_SMP
        cpu_notifier_register_begin();
        __register_cpu_notifier(&vmstat_notifier);
+       init_cpu_node_state();
 
        start_shepherd_timer();
        cpu_notifier_register_done();
index 038e660..c73e28f 100644 (file)
@@ -1363,6 +1363,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
        slab = prot->slab;
 
        cgroup_sk_free(&sk->sk_cgrp_data);
+       mem_cgroup_sk_free(sk);
        security_sk_free(sk);
        if (slab != NULL)
                kmem_cache_free(slab, sk);
@@ -1399,6 +1400,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                sock_net_set(sk, net);
                atomic_set(&sk->sk_wmem_alloc, 1);
 
+               mem_cgroup_sk_alloc(sk);
                cgroup_sk_alloc(&sk->sk_cgrp_data);
                sock_update_classid(&sk->sk_cgrp_data);
                sock_update_netprioidx(&sk->sk_cgrp_data);
@@ -1545,6 +1547,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
                newsk->sk_incoming_cpu = raw_smp_processor_id();
                atomic64_set(&newsk->sk_cookie, 0);
 
+               mem_cgroup_sk_alloc(newsk);
                cgroup_sk_alloc(&newsk->sk_cgrp_data);
 
                /*
@@ -1569,9 +1572,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
                sk_set_socket(newsk, NULL);
                newsk->sk_wq = NULL;
 
-               if (mem_cgroup_sockets_enabled && sk->sk_memcg)
-                       sock_update_memcg(newsk);
-
                if (newsk->sk_prot->sockets_allocated)
                        sk_sockets_allocated_inc(newsk);
 
index 66ddcb6..7cf7d6e 100644 (file)
@@ -258,7 +258,7 @@ int ping_init_sock(struct sock *sk)
        struct net *net = sock_net(sk);
        kgid_t group = current_egid();
        struct group_info *group_info;
-       int i, j, count;
+       int i;
        kgid_t low, high;
        int ret = 0;
 
@@ -270,16 +270,11 @@ int ping_init_sock(struct sock *sk)
                return 0;
 
        group_info = get_current_groups();
-       count = group_info->ngroups;
-       for (i = 0; i < group_info->nblocks; i++) {
-               int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
-               for (j = 0; j < cp_count; j++) {
-                       kgid_t gid = group_info->blocks[i][j];
-                       if (gid_lte(low, gid) && gid_lte(gid, high))
-                               goto out_release_group;
-               }
+       for (i = 0; i < group_info->ngroups; i++) {
+               kgid_t gid = group_info->gid[i];
 
-               count -= cp_count;
+               if (gid_lte(low, gid) && gid_lte(gid, high))
+                       goto out_release_group;
        }
 
        ret = -EACCES;
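
This loop, like the sunrpc hunks further down, indexes group_info->gid[] directly instead of walking the old two-level blocks[][] layout, so supplementary groups are presumably now stored as one flat array, roughly:

	/* Assumed post-flattening shape: a single contiguous kgid_t array. */
	struct group_info {
		atomic_t	usage;
		int		ngroups;
		kgid_t		gid[0];
	};
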
index 2414b7c..3251fe7 100644 (file)
@@ -424,8 +424,6 @@ void tcp_init_sock(struct sock *sk)
        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
        local_bh_disable();
-       if (mem_cgroup_sockets_enabled)
-               sock_update_memcg(sk);
        sk_sockets_allocated_inc(sk);
        local_bh_enable();
 }
index 7ac37c3..bd5e8d1 100644 (file)
@@ -1871,9 +1871,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
        local_bh_disable();
        sk_sockets_allocated_dec(sk);
        local_bh_enable();
-
-       if (mem_cgroup_sockets_enabled && sk->sk_memcg)
-               sock_release_memcg(sk);
 }
 EXPORT_SYMBOL(tcp_v4_destroy_sock);
 
index 1682195..83dffea 100644 (file)
@@ -176,8 +176,8 @@ generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags)
        if (gcred->acred.group_info->ngroups != acred->group_info->ngroups)
                goto out_nomatch;
        for (i = 0; i < gcred->acred.group_info->ngroups; i++) {
-               if (!gid_eq(GROUP_AT(gcred->acred.group_info, i),
-                               GROUP_AT(acred->group_info, i)))
+               if (!gid_eq(gcred->acred.group_info->gid[i],
+                               acred->group_info->gid[i]))
                        goto out_nomatch;
        }
 out_match:
index eeeba5a..dc6fb79 100644 (file)
@@ -229,7 +229,7 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr,
                kgid = make_kgid(&init_user_ns, tmp);
                if (!gid_valid(kgid))
                        goto out_free_groups;
-               GROUP_AT(creds->cr_group_info, i) = kgid;
+               creds->cr_group_info->gid[i] = kgid;
        }
 
        return 0;
index d858202..d67f7e1 100644 (file)
@@ -479,7 +479,7 @@ static int rsc_parse(struct cache_detail *cd,
                        kgid = make_kgid(&init_user_ns, id);
                        if (!gid_valid(kgid))
                                goto out;
-                       GROUP_AT(rsci.cred.cr_group_info, i) = kgid;
+                       rsci.cred.cr_group_info->gid[i] = kgid;
                }
 
                /* mech name */
index a99278c..a1d768a 100644 (file)
@@ -79,7 +79,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t
 
        cred->uc_gid = acred->gid;
        for (i = 0; i < groups; i++)
-               cred->uc_gids[i] = GROUP_AT(acred->group_info, i);
+               cred->uc_gids[i] = acred->group_info->gid[i];
        if (i < NFS_NGROUPS)
                cred->uc_gids[i] = INVALID_GID;
 
@@ -127,7 +127,7 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags)
        if (groups > NFS_NGROUPS)
                groups = NFS_NGROUPS;
        for (i = 0; i < groups ; i++)
-               if (!gid_eq(cred->uc_gids[i], GROUP_AT(acred->group_info, i)))
+               if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i]))
                        return 0;
        if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups]))
                return 0;
index dfacdc9..64af4f0 100644 (file)
@@ -517,7 +517,7 @@ static int unix_gid_parse(struct cache_detail *cd,
                kgid = make_kgid(&init_user_ns, gid);
                if (!gid_valid(kgid))
                        goto out;
-               GROUP_AT(ug.gi, i) = kgid;
+               ug.gi->gid[i] = kgid;
        }
 
        ugp = unix_gid_lookup(cd, uid);
@@ -564,7 +564,7 @@ static int unix_gid_show(struct seq_file *m,
 
        seq_printf(m, "%u %d:", from_kuid_munged(user_ns, ug->uid), glen);
        for (i = 0; i < glen; i++)
-               seq_printf(m, " %d", from_kgid_munged(user_ns, GROUP_AT(ug->gi, i)));
+               seq_printf(m, " %d", from_kgid_munged(user_ns, ug->gi->gid[i]));
        seq_printf(m, "\n");
        return 0;
 }
@@ -817,7 +817,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
                return SVC_CLOSE;
        for (i = 0; i < slen; i++) {
                kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv));
-               GROUP_AT(cred->cr_group_info, i) = kgid;
+               cred->cr_group_info->gid[i] = kgid;
        }
        if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
                *authp = rpc_autherr_badverf;
index 48958d3..bd83497 100644 (file)
@@ -888,7 +888,7 @@ static void check_section(const char *modname, struct elf_info *elf,
 
 #define DATA_SECTIONS ".data", ".data.rel"
 #define TEXT_SECTIONS ".text", ".text.unlikely", ".sched.text", \
-               ".kprobes.text"
+               ".kprobes.text", ".cpuidle.text"
 #define OTHER_TEXT_SECTIONS ".ref.text", ".head.text", ".spinlock.text", \
                ".fixup", ".entry.text", ".exception.text", ".text.*", \
                ".coldtext"
index a68f031..5423a58 100644 (file)
@@ -365,6 +365,7 @@ is_mcounted_section_name(char const *const txtname)
                strcmp(".irqentry.text", txtname) == 0 ||
                strcmp(".softirqentry.text", txtname) == 0 ||
                strcmp(".kprobes.text", txtname) == 0 ||
+               strcmp(".cpuidle.text", txtname) == 0 ||
                strcmp(".text.unlikely", txtname) == 0;
 }
 
index 2d48011..faac4b1 100755 (executable)
@@ -136,6 +136,7 @@ my %text_sections = (
      ".irqentry.text" => 1,
      ".softirqentry.text" => 1,
      ".kprobes.text" => 1,
+     ".cpuidle.text" => 1,
      ".text.unlikely" => 1,
 );
 
index fa79c6d..163c720 100644 (file)
@@ -629,7 +629,6 @@ mispelt||misspelt
 miximum||maximum
 mmnemonic||mnemonic
 mnay||many
-modeled||modelled
 modulues||modules
 monochorome||monochrome
 monochromo||monochrome
index a937a9d..142c565 100644 (file)
@@ -7,3 +7,4 @@ mlock2-tests
 on-fault-limit
 transhuge-stress
 userfaultfd
+mlock-random-test
index e4bb1de..bbab7f4 100644 (file)
@@ -10,6 +10,7 @@ BINARIES += on-fault-limit
 BINARIES += thuge-gen
 BINARIES += transhuge-stress
 BINARIES += userfaultfd
+BINARIES += mlock-random-test
 
 all: $(BINARIES)
 %: %.c
@@ -17,6 +18,9 @@ all: $(BINARIES)
 userfaultfd: userfaultfd.c ../../../../usr/include/linux/kernel.h
        $(CC) $(CFLAGS) -O2 -o $@ $< -lpthread
 
+mlock-random-test: mlock-random-test.c
+       $(CC) $(CFLAGS) -o $@ $< -lcap
+
 ../../../../usr/include/linux/kernel.h:
        make -C ../../../.. headers_install
 
diff --git a/tools/testing/selftests/vm/mlock-random-test.c b/tools/testing/selftests/vm/mlock-random-test.c
new file mode 100644 (file)
index 0000000..83de4f5
--- /dev/null
@@ -0,0 +1,293 @@
+/*
+ * Test mlock()/mlock2() when they are invoked on randomly
+ * chosen memory regions.
+ */
+#include <unistd.h>
+#include <sys/resource.h>
+#include <sys/capability.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <time.h>
+#include "mlock2.h"
+
+#define CHUNK_UNIT (128 * 1024)
+#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2)
+#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT
+#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3)
+
+#define TEST_LOOP 100
+#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1))
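+/* e.g. PAGE_ALIGN(5000, 4096) == 8192; ps is assumed to be a power of two */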
+
+int set_cap_limits(rlim_t max)
+{
+       struct rlimit new;
+       cap_t cap = cap_init();
+
+       new.rlim_cur = max;
+       new.rlim_max = max;
+       if (setrlimit(RLIMIT_MEMLOCK, &new)) {
+               perror("setrlimit() returns error\n");
+               return -1;
+       }
+
+       /* drop capabilities including CAP_IPC_LOCK */
+       if (cap_set_proc(cap)) {
+               perror("cap_set_proc() returns error\n");
+               return -2;
+       }
+
+       return 0;
+}
+
+int get_proc_locked_vm_size(void)
+{
+       FILE *f;
+       int ret = -1;
+       char line[1024] = {0};
+       unsigned long lock_size = 0;
+
+       f = fopen("/proc/self/status", "r");
+       if (!f) {
+               perror("fopen");
+               return -1;
+       }
+
+       while (fgets(line, 1024, f)) {
+               if (strstr(line, "VmLck")) {
+                       ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size);
+                       if (ret <= 0) {
+                               printf("sscanf() on VmLck error: %s: %d\n",
+                                               line, ret);
+                               fclose(f);
+                               return -1;
+                       }
+                       fclose(f);
+                       return (int)(lock_size << 10);
+               }
+       }
+
+       perror("cann't parse VmLck in /proc/self/status\n");
+       fclose(f);
+       return -1;
+}
+
+/*
+ * Get the MMUPageSize of the memory region containing the given
+ * address, as reported by /proc/self/smaps.
+ *
+ * Return value: 0 on error; otherwise the page size (in bytes).
+ */
+int get_proc_page_size(unsigned long addr)
+{
+       FILE *smaps;
+       char *line;
+       unsigned long mmupage_size = 0;
+       size_t size;
+
+       smaps = seek_to_smaps_entry(addr);
+       if (!smaps) {
+               printf("Unable to parse /proc/self/smaps\n");
+               return 0;
+       }
+
+       while (getline(&line, &size, smaps) > 0) {
+               if (!strstr(line, "MMUPageSize")) {
+                       free(line);
+                       line = NULL;
+                       size = 0;
+                       continue;
+               }
+
+               /* found the MMUPageSize of this section */
+               if (sscanf(line, "MMUPageSize:    %8lu kB",
+                                       &mmupage_size) < 1) {
+                       printf("Unable to parse smaps entry for Size:%s\n",
+                                       line);
+                       break;
+               }
+
+       }
+       free(line);
+       if (smaps)
+               fclose(smaps);
+       return mmupage_size << 10;
+}
+
+/*
+ * Test mlock()/mlock2() on the provided memory chunk; every call is
+ * expected to succeed (we stay within the rlimit).
+ *
+ * Given the allocated chunk [p, p + alloc_size), this test picks
+ * start/len at random and locks the [start, start + len) range with
+ * mlock()/mlock2(); the range always lies within the chunk.
+ *
+ * Since alloc_size is within the rlimit, mlock()/mlock2() must always
+ * succeed.
+ *
+ * VmLck is assumed to be 0 before this test.
+ *
+ *    return value: 0 - success
+ *    else: failure
+ */
+int test_mlock_within_limit(char *p, int alloc_size)
+{
+       int i;
+       int ret = 0;
+       int locked_vm_size = 0;
+       struct rlimit cur;
+       int page_size = 0;
+
+       getrlimit(RLIMIT_MEMLOCK, &cur);
+       if (cur.rlim_cur < alloc_size) {
+               printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n",
+                               alloc_size, (unsigned int)cur.rlim_cur);
+               return -1;
+       }
+
+       srand(time(NULL));
+       for (i = 0; i < TEST_LOOP; i++) {
+               /*
+                * - choose mlock/mlock2 randomly
+                * - choose lock_size randomly but lock_size < alloc_size
+                * - choose start_offset randomly but p+start_offset+lock_size
+                *   < p+alloc_size
+                */
+               int is_mlock = !!(rand() % 2);
+               int lock_size = rand() % alloc_size;
+               int start_offset = rand() % (alloc_size - lock_size);
+
+               if (is_mlock)
+                       ret = mlock(p + start_offset, lock_size);
+               else
+                       ret = mlock2_(p + start_offset, lock_size,
+                                      MLOCK_ONFAULT);
+
+               if (ret) {
+                       printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n",
+                                       is_mlock ? "mlock" : "mlock2",
+                                       p, alloc_size,
+                                       p + start_offset, lock_size);
+                       return ret;
+               }
+       }
+
+       /*
+        * Check VmLck left by the tests.
+        */
+       locked_vm_size = get_proc_locked_vm_size();
+       page_size = get_proc_page_size((unsigned long)p);
+       if (page_size == 0) {
+               printf("cannot get proc MMUPageSize\n");
+               return -1;
+       }
+
+       if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) {
+               printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n",
+                               locked_vm_size, alloc_size);
+               return -1;
+       }
+
+       return 0;
+}
+
+
+/*
+ * We expect mlock()/mlock2() to fail (the requests exceed the rlimit).
+ *
+ * Given the allocated chunk [p, p + alloc_size), this test picks
+ * start/len at random and calls mlock()/mlock2() on the
+ * [start, start + len) range.
+ *
+ * Both alloc_size and the length to be locked are above the rlimit,
+ * so every mlock()/mlock2() call must fail, and the number of locked
+ * pages must not increase as a side effect.
+ *
+ *    return value: 0 - success
+ *    else: failure
+ */
+int test_mlock_outof_limit(char *p, int alloc_size)
+{
+       int i;
+       int ret = 0;
+       int locked_vm_size = 0, old_locked_vm_size = 0;
+       struct rlimit cur;
+
+       getrlimit(RLIMIT_MEMLOCK, &cur);
+       if (cur.rlim_cur >= alloc_size) {
+               printf("alloc_size[%d] >%u rlimit, violates test condition\n",
+                               alloc_size, (unsigned int)cur.rlim_cur);
+               return -1;
+       }
+
+       old_locked_vm_size = get_proc_locked_vm_size();
+       srand(time(NULL));
+       for (i = 0; i < TEST_LOOP; i++) {
+               int is_mlock = !!(rand() % 2);
+               int lock_size = (rand() % (alloc_size - cur.rlim_cur))
+                       + cur.rlim_cur;
+               int start_offset = rand() % (alloc_size - lock_size);
+
+               if (is_mlock)
+                       ret = mlock(p + start_offset, lock_size);
+               else
+                       ret = mlock2_(p + start_offset, lock_size,
+                                       MLOCK_ONFAULT);
+               if (ret == 0) {
+                       printf("%s() succeeds? on %p(%d) mlock%p(%d)\n",
+                                       is_mlock ? "mlock" : "mlock2",
+                                       p, alloc_size,
+                                       p + start_offset, lock_size);
+                       return -1;
+               }
+       }
+
+       locked_vm_size = get_proc_locked_vm_size();
+       if (locked_vm_size != old_locked_vm_size) {
+               printf("tests leads to new mlocked page: old[%d], new[%d]\n",
+                               old_locked_vm_size,
+                               locked_vm_size);
+               return -1;
+       }
+
+       return 0;
+}
+
+int main(int argc, char **argv)
+{
+       char *p = NULL;
+       int ret = 0;
+
+       if (set_cap_limits(MLOCK_RLIMIT_SIZE))
+               return -1;
+
+       p = malloc(MLOCK_WITHIN_LIMIT_SIZE);
+       if (p == NULL) {
+               perror("malloc() failure\n");
+               return -1;
+       }
+       ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE);
+       if (ret)
+               return ret;
+       munlock(p, MLOCK_WITHIN_LIMIT_SIZE);
+       free(p);
+
+
+       p = malloc(MLOCK_OUTOF_LIMIT_SIZE);
+       if (p == NULL) {
+               perror("malloc() failure\n");
+               return -1;
+       }
+       ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE);
+       if (ret)
+               return ret;
+       munlock(p, MLOCK_OUTOF_LIMIT_SIZE);
+       free(p);
+
+       return 0;
+}
index 02ca5e0..ff0cda2 100644 (file)
@@ -1,33 +1,12 @@
 #define _GNU_SOURCE
 #include <sys/mman.h>
 #include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
 #include <unistd.h>
 #include <string.h>
 #include <sys/time.h>
 #include <sys/resource.h>
-#include <syscall.h>
-#include <errno.h>
 #include <stdbool.h>
-
-#ifndef MLOCK_ONFAULT
-#define MLOCK_ONFAULT 1
-#endif
-
-#ifndef MCL_ONFAULT
-#define MCL_ONFAULT (MCL_FUTURE << 1)
-#endif
-
-static int mlock2_(void *start, size_t len, int flags)
-{
-#ifdef __NR_mlock2
-       return syscall(__NR_mlock2, start, len, flags);
-#else
-       errno = ENOSYS;
-       return -1;
-#endif
-}
+#include "mlock2.h"
 
 struct vm_boundaries {
        unsigned long start;
@@ -138,46 +117,6 @@ static uint64_t get_kpageflags(unsigned long pfn)
        return flags;
 }
 
-static FILE *seek_to_smaps_entry(unsigned long addr)
-{
-       FILE *file;
-       char *line = NULL;
-       size_t size = 0;
-       unsigned long start, end;
-       char perms[5];
-       unsigned long offset;
-       char dev[32];
-       unsigned long inode;
-       char path[BUFSIZ];
-
-       file = fopen("/proc/self/smaps", "r");
-       if (!file) {
-               perror("fopen smaps");
-               _exit(1);
-       }
-
-       while (getline(&line, &size, file) > 0) {
-               if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
-                          &start, &end, perms, &offset, dev, &inode, path) < 6)
-                       goto next;
-
-               if (start <= addr && addr < end)
-                       goto out;
-
-next:
-               free(line);
-               line = NULL;
-               size = 0;
-       }
-
-       fclose(file);
-       file = NULL;
-
-out:
-       free(line);
-       return file;
-}
-
 #define VMFLAGS "VmFlags:"
 
 static bool is_vmflag_set(unsigned long addr, const char *vmflag)
diff --git a/tools/testing/selftests/vm/mlock2.h b/tools/testing/selftests/vm/mlock2.h
new file mode 100644 (file)
index 0000000..7ee0629
--- /dev/null
@@ -0,0 +1,62 @@
+#include <syscall.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifndef MLOCK_ONFAULT
+#define MLOCK_ONFAULT 1
+#endif
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+static int mlock2_(void *start, size_t len, int flags)
+{
+#ifdef __NR_mlock2
+       return syscall(__NR_mlock2, start, len, flags);
+#else
+       errno = ENOSYS;
+       return -1;
+#endif
+}
+
+static FILE *seek_to_smaps_entry(unsigned long addr)
+{
+       FILE *file;
+       char *line = NULL;
+       size_t size = 0;
+       unsigned long start, end;
+       char perms[5];
+       unsigned long offset;
+       char dev[32];
+       unsigned long inode;
+       char path[BUFSIZ];
+
+       file = fopen("/proc/self/smaps", "r");
+       if (!file) {
+               perror("fopen smaps");
+               _exit(1);
+       }
+
+       while (getline(&line, &size, file) > 0) {
+               if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
+                          &start, &end, perms, &offset, dev, &inode, path) < 6)
+                       goto next;
+
+               if (start <= addr && addr < end)
+                       goto out;
+
+next:
+               free(line);
+               line = NULL;
+               size = 0;
+       }
+
+       fclose(file);
+       file = NULL;
+
+out:
+       free(line);
+       return file;
+}