[cascardo/linux.git] mm/page_alloc.c
index c1069ef..6903b69 100644
@@ -352,6 +352,106 @@ static inline bool update_defer_init(pg_data_t *pgdat,
 }
 #endif
 
+/* Return a pointer to the bitmap storing bits affecting a block of pages */
+static inline unsigned long *get_pageblock_bitmap(struct page *page,
+                                                       unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+       return __pfn_to_section(pfn)->pageblock_flags;
+#else
+       return page_zone(page)->pageblock_flags;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+       pfn &= (PAGES_PER_SECTION-1);
+       return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#else
+       pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
+       return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+/**
+ * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest to retrieve
+ * @mask: mask of bits that the caller is interested in
+ *
+ * Return: pageblock_bits flags
+ */
+static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
+                                       unsigned long pfn,
+                                       unsigned long end_bitidx,
+                                       unsigned long mask)
+{
+       unsigned long *bitmap;
+       unsigned long bitidx, word_bitidx;
+       unsigned long word;
+
+       bitmap = get_pageblock_bitmap(page, pfn);
+       bitidx = pfn_to_bitidx(page, pfn);
+       word_bitidx = bitidx / BITS_PER_LONG;
+       bitidx &= (BITS_PER_LONG-1);
+
+       word = bitmap[word_bitidx];
+       bitidx += end_bitidx;
+       return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
+}
+
+unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+                                       unsigned long end_bitidx,
+                                       unsigned long mask)
+{
+       return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
+}
+
+static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
+{
+       return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
+}
+
+/**
+ * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @flags: The flags to set
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest
+ * @mask: mask of bits that the caller is interested in
+ */
+void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
+                                       unsigned long pfn,
+                                       unsigned long end_bitidx,
+                                       unsigned long mask)
+{
+       unsigned long *bitmap;
+       unsigned long bitidx, word_bitidx;
+       unsigned long old_word, word;
+
+       BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+
+       bitmap = get_pageblock_bitmap(page, pfn);
+       bitidx = pfn_to_bitidx(page, pfn);
+       word_bitidx = bitidx / BITS_PER_LONG;
+       bitidx &= (BITS_PER_LONG-1);
+
+       VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+
+       bitidx += end_bitidx;
+       mask <<= (BITS_PER_LONG - bitidx - 1);
+       flags <<= (BITS_PER_LONG - bitidx - 1);
+
+       word = READ_ONCE(bitmap[word_bitidx]);
+       for (;;) {
+               old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
+               if (word == old_word)
+                       break;
+               word = old_word;
+       }
+}
 
 void set_pageblock_migratetype(struct page *page, int migratetype)
 {
@@ -422,12 +522,6 @@ static void bad_page(struct page *page, const char *reason,
        static unsigned long nr_shown;
        static unsigned long nr_unshown;
 
-       /* Don't complain about poisoned pages */
-       if (PageHWPoison(page)) {
-               page_mapcount_reset(page); /* remove PageBuddy */
-               return;
-       }
-
        /*
         * Allow a burst of 60 reports, then keep quiet for that minute;
         * or allow a steady drip of one report per second.
@@ -513,14 +607,7 @@ static int __init early_debug_pagealloc(char *buf)
 {
        if (!buf)
                return -EINVAL;
-
-       if (strcmp(buf, "on") == 0)
-               _debug_pagealloc_enabled = true;
-
-       if (strcmp(buf, "off") == 0)
-               _debug_pagealloc_enabled = false;
-
-       return 0;
+       return kstrtobool(buf, &_debug_pagealloc_enabled);
 }
 early_param("debug_pagealloc", early_debug_pagealloc);
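
The parsing above is delegated to kstrtobool(), which accepts spellings such as "1"/"0", "y"/"n", "yes"/"no" and "on"/"off" and returns -EINVAL for anything else, so an unrecognised debug_pagealloc= value now propagates an error instead of being silently ignored. A rough userspace approximation of that matching (my own simplification, not the kernel's lib/kstrtox.c):

#include <errno.h>
#include <stdbool.h>

/* Simplified stand-in for kstrtobool(): match on the leading character(s). */
static int parse_bool(const char *s, bool *res)
{
	if (!s)
		return -EINVAL;

	switch (s[0]) {
	case 'y': case 'Y': case '1':
		*res = true;
		return 0;
	case 'n': case 'N': case '0':
		*res = false;
		return 0;
	case 'o': case 'O':
		if (s[1] == 'n' || s[1] == 'N') {
			*res = true;
			return 0;
		}
		if (s[1] == 'f' || s[1] == 'F') {
			*res = false;
			return 0;
		}
		return -EINVAL;
	default:
		return -EINVAL;
	}
}
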
 
@@ -569,6 +656,9 @@ static inline void set_page_guard(struct zone *zone, struct page *page,
                return;
 
        page_ext = lookup_page_ext(page);
+       if (unlikely(!page_ext))
+               return;
+
        __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
 
        INIT_LIST_HEAD(&page->lru);
@@ -586,6 +676,9 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
                return;
 
        page_ext = lookup_page_ext(page);
+       if (unlikely(!page_ext))
+               return;
+
        __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
 
        set_page_private(page, 0);
@@ -784,17 +877,42 @@ out:
        zone->free_area[order].nr_free++;
 }
 
-static inline int free_pages_check(struct page *page)
+/*
+ * A bad page could be due to a number of fields. Instead of multiple branches,
+ * try and check multiple fields with one check. The caller must do a detailed
+ * check if necessary.
+ */
+static inline bool page_expected_state(struct page *page,
+                                       unsigned long check_flags)
 {
-       const char *bad_reason = NULL;
-       unsigned long bad_flags = 0;
+       if (unlikely(atomic_read(&page->_mapcount) != -1))
+               return false;
+
+       if (unlikely((unsigned long)page->mapping |
+                       page_ref_count(page) |
+#ifdef CONFIG_MEMCG
+                       (unsigned long)page->mem_cgroup |
+#endif
+                       (page->flags & check_flags)))
+               return false;
+
+       return true;
+}
+
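
The comment above page_expected_state() describes the trick: fields that must all be zero or NULL on the fast path are OR-ed together and tested once, so the common case costs a single well-predicted branch. A small illustration of the pattern with an invented structure:

#include <stdbool.h>

/* Toy structure; the field names are invented for the example. */
struct obj {
	void *owner;
	unsigned long refs;
	unsigned long flags;
};

/*
 * Fast path: one combined test. Any non-zero field (or any flag in
 * bad_flags) makes the OR non-zero, and only then is a slow,
 * field-by-field diagnostic path worth taking.
 */
static inline bool obj_expected_state(const struct obj *o,
				      unsigned long bad_flags)
{
	return ((unsigned long)o->owner | o->refs | (o->flags & bad_flags)) == 0;
}

In the patch itself, _mapcount is still tested separately because its expected value is -1 rather than 0, so it cannot join the OR.
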
+static void free_pages_check_bad(struct page *page)
+{
+       const char *bad_reason;
+       unsigned long bad_flags;
+
+       bad_reason = NULL;
+       bad_flags = 0;
 
        if (unlikely(atomic_read(&page->_mapcount) != -1))
                bad_reason = "nonzero mapcount";
        if (unlikely(page->mapping != NULL))
                bad_reason = "non-NULL mapping";
        if (unlikely(page_ref_count(page) != 0))
-               bad_reason = "nonzero _count";
+               bad_reason = "nonzero _refcount";
        if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
                bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
                bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
@@ -803,16 +921,146 @@ static inline int free_pages_check(struct page *page)
        if (unlikely(page->mem_cgroup))
                bad_reason = "page still charged to cgroup";
 #endif
-       if (unlikely(bad_reason)) {
-               bad_page(page, bad_reason, bad_flags);
-               return 1;
+       bad_page(page, bad_reason, bad_flags);
+}
+
+static inline int free_pages_check(struct page *page)
+{
+       if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
+               return 0;
+
+       /* Something has gone sideways, find it */
+       free_pages_check_bad(page);
+       return 1;
+}
+
+static int free_tail_pages_check(struct page *head_page, struct page *page)
+{
+       int ret = 1;
+
+       /*
+        * We rely page->lru.next never has bit 0 set, unless the page
+        * is PageTail(). Let's make sure that's true even for poisoned ->lru.
+        */
+       BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
+
+       if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
+               ret = 0;
+               goto out;
+       }
+       switch (page - head_page) {
+       case 1:
+               /* the first tail page: ->mapping is compound_mapcount() */
+               if (unlikely(compound_mapcount(page))) {
+                       bad_page(page, "nonzero compound_mapcount", 0);
+                       goto out;
+               }
+               break;
+       case 2:
+               /*
+                * the second tail page: ->mapping is
+                * page_deferred_list().next -- ignore value.
+                */
+               break;
+       default:
+               if (page->mapping != TAIL_MAPPING) {
+                       bad_page(page, "corrupted mapping in tail page", 0);
+                       goto out;
+               }
+               break;
+       }
+       if (unlikely(!PageTail(page))) {
+               bad_page(page, "PageTail not set", 0);
+               goto out;
+       }
+       if (unlikely(compound_head(page) != head_page)) {
+               bad_page(page, "compound_head not consistent", 0);
+               goto out;
        }
+       ret = 0;
+out:
+       page->mapping = NULL;
+       clear_compound_head(page);
+       return ret;
+}
+
+static __always_inline bool free_pages_prepare(struct page *page,
+                                       unsigned int order, bool check_free)
+{
+       int bad = 0;
+
+       VM_BUG_ON_PAGE(PageTail(page), page);
+
+       trace_mm_page_free(page, order);
+       kmemcheck_free_shadow(page, order);
+
+       /*
+        * Check tail pages before head page information is cleared to
+        * avoid checking PageCompound for order-0 pages.
+        */
+       if (unlikely(order)) {
+               bool compound = PageCompound(page);
+               int i;
+
+               VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
+
+               for (i = 1; i < (1 << order); i++) {
+                       if (compound)
+                               bad += free_tail_pages_check(page, page + i);
+                       if (unlikely(free_pages_check(page + i))) {
+                               bad++;
+                               continue;
+                       }
+                       (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+               }
+       }
+       if (PageAnonHead(page))
+               page->mapping = NULL;
+       if (check_free)
+               bad += free_pages_check(page);
+       if (bad)
+               return false;
+
        page_cpupid_reset_last(page);
-       if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
-               page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
-       return 0;
+       page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+       reset_page_owner(page, order);
+
+       if (!PageHighMem(page)) {
+               debug_check_no_locks_freed(page_address(page),
+                                          PAGE_SIZE << order);
+               debug_check_no_obj_freed(page_address(page),
+                                          PAGE_SIZE << order);
+       }
+       arch_free_page(page, order);
+       kernel_poison_pages(page, 1 << order, 0);
+       kernel_map_pages(page, 1 << order, 0);
+       kasan_free_pages(page, order);
+
+       return true;
+}
+
+#ifdef CONFIG_DEBUG_VM
+static inline bool free_pcp_prepare(struct page *page)
+{
+       return free_pages_prepare(page, 0, true);
+}
+
+static inline bool bulkfree_pcp_prepare(struct page *page)
+{
+       return false;
+}
+#else
+static bool free_pcp_prepare(struct page *page)
+{
+       return free_pages_prepare(page, 0, false);
 }
 
+static bool bulkfree_pcp_prepare(struct page *page)
+{
+       return free_pages_check(page);
+}
+#endif /* CONFIG_DEBUG_VM */
+
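
Together with check_pcp_refill()/check_new_pcp() added further down, the #ifdef pair above moves the page sanity checks to one side of the per-CPU lists or the other: with CONFIG_DEBUG_VM they run as pages enter and leave the pcp lists, close to the buggy caller; without it they are deferred to the bulk refill and drain paths, where the zone lock is already held. A toy restatement of that inversion, using a single "return true when the item is bad" convention (the kernel helpers mix conventions) and invented names:

#include <stdbool.h>

/* Toy stand-ins for struct page and the expensive state checks. */
struct item { unsigned long refs; };

static bool item_is_bad(struct item *it)	/* the free_pages_check() analogue */
{
	return it->refs != 0;
}

#ifdef DEBUG_BUILD
/* Debug: validate when an item is freed into the per-CPU cache. */
static inline bool cache_free_check(struct item *it)  { return item_is_bad(it); }
static inline bool cache_drain_check(struct item *it) { (void)it; return false; }
#else
/* Production: keep the hot free path short; validate during bulk drain. */
static inline bool cache_free_check(struct item *it)  { (void)it; return false; }
static inline bool cache_drain_check(struct item *it) { return item_is_bad(it); }
#endif
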
 /*
  * Frees a number of pages from the PCP lists
  * Assumes all pages on list are in same zone, and of same order.
@@ -829,15 +1077,16 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 {
        int migratetype = 0;
        int batch_free = 0;
-       int to_free = count;
        unsigned long nr_scanned;
+       bool isolated_pageblocks;
 
        spin_lock(&zone->lock);
+       isolated_pageblocks = has_isolate_pageblock(zone);
        nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
        if (nr_scanned)
                __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
 
-       while (to_free) {
+       while (count) {
                struct page *page;
                struct list_head *list;
 
@@ -857,7 +1106,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 
                /* This is the only non-empty list. Free them all. */
                if (batch_free == MIGRATE_PCPTYPES)
-                       batch_free = to_free;
+                       batch_free = count;
 
                do {
                        int mt; /* migratetype of the to-be-freed page */
@@ -870,12 +1119,15 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                        /* MIGRATE_ISOLATE page should not go to pcplists */
                        VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
                        /* Pageblock could have been isolated meanwhile */
-                       if (unlikely(has_isolate_pageblock(zone)))
+                       if (unlikely(isolated_pageblocks))
                                mt = get_pageblock_migratetype(page);
 
+                       if (bulkfree_pcp_prepare(page))
+                               continue;
+
                        __free_one_page(page, page_to_pfn(page), zone, 0, mt);
                        trace_mm_page_pcpu_drain(page, 0, mt);
-               } while (--to_free && --batch_free && !list_empty(list));
+               } while (--count && --batch_free && !list_empty(list));
        }
        spin_unlock(&zone->lock);
 }
@@ -899,56 +1151,6 @@ static void free_one_page(struct zone *zone,
        spin_unlock(&zone->lock);
 }
 
-static int free_tail_pages_check(struct page *head_page, struct page *page)
-{
-       int ret = 1;
-
-       /*
-        * We rely page->lru.next never has bit 0 set, unless the page
-        * is PageTail(). Let's make sure that's true even for poisoned ->lru.
-        */
-       BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
-
-       if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
-               ret = 0;
-               goto out;
-       }
-       switch (page - head_page) {
-       case 1:
-               /* the first tail page: ->mapping is compound_mapcount() */
-               if (unlikely(compound_mapcount(page))) {
-                       bad_page(page, "nonzero compound_mapcount", 0);
-                       goto out;
-               }
-               break;
-       case 2:
-               /*
-                * the second tail page: ->mapping is
-                * page_deferred_list().next -- ignore value.
-                */
-               break;
-       default:
-               if (page->mapping != TAIL_MAPPING) {
-                       bad_page(page, "corrupted mapping in tail page", 0);
-                       goto out;
-               }
-               break;
-       }
-       if (unlikely(!PageTail(page))) {
-               bad_page(page, "PageTail not set", 0);
-               goto out;
-       }
-       if (unlikely(compound_head(page) != head_page)) {
-               bad_page(page, "compound_head not consistent", 0);
-               goto out;
-       }
-       ret = 0;
-out:
-       page->mapping = NULL;
-       clear_compound_head(page);
-       return ret;
-}
-
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
                                unsigned long zone, int nid)
 {
@@ -1003,7 +1205,7 @@ static inline void init_reserved_page(unsigned long pfn)
  * marks the pages PageReserved. The remaining valid pages are later
  * sent to the buddy page allocator.
  */
-void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
+void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
 {
        unsigned long start_pfn = PFN_DOWN(start);
        unsigned long end_pfn = PFN_UP(end);
@@ -1022,51 +1224,13 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
        }
 }
 
-static bool free_pages_prepare(struct page *page, unsigned int order)
-{
-       bool compound = PageCompound(page);
-       int i, bad = 0;
-
-       VM_BUG_ON_PAGE(PageTail(page), page);
-       VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
-
-       trace_mm_page_free(page, order);
-       kmemcheck_free_shadow(page, order);
-       kasan_free_pages(page, order);
-
-       if (PageAnon(page))
-               page->mapping = NULL;
-       bad += free_pages_check(page);
-       for (i = 1; i < (1 << order); i++) {
-               if (compound)
-                       bad += free_tail_pages_check(page, page + i);
-               bad += free_pages_check(page + i);
-       }
-       if (bad)
-               return false;
-
-       reset_page_owner(page, order);
-
-       if (!PageHighMem(page)) {
-               debug_check_no_locks_freed(page_address(page),
-                                          PAGE_SIZE << order);
-               debug_check_no_obj_freed(page_address(page),
-                                          PAGE_SIZE << order);
-       }
-       arch_free_page(page, order);
-       kernel_poison_pages(page, 1 << order, 0);
-       kernel_map_pages(page, 1 << order, 0);
-
-       return true;
-}
-
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
        unsigned long flags;
        int migratetype;
        unsigned long pfn = page_to_pfn(page);
 
-       if (!free_pages_prepare(page, order))
+       if (!free_pages_prepare(page, order, true))
                return;
 
        migratetype = get_pfnblock_migratetype(page, pfn);
@@ -1076,8 +1240,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
        local_irq_restore(flags);
 }
 
-static void __init __free_pages_boot_core(struct page *page,
-                                       unsigned long pfn, unsigned int order)
+static void __init __free_pages_boot_core(struct page *page, unsigned int order)
 {
        unsigned int nr_pages = 1 << order;
        struct page *p = page;
@@ -1154,7 +1317,7 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
 {
        if (early_page_uninitialised(pfn))
                return;
-       return __free_pages_boot_core(page, pfn, order);
+       return __free_pages_boot_core(page, order);
 }
 
 /*
@@ -1239,12 +1402,12 @@ static void __init deferred_free_range(struct page *page,
        if (nr_pages == MAX_ORDER_NR_PAGES &&
            (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
                set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-               __free_pages_boot_core(page, pfn, MAX_ORDER-1);
+               __free_pages_boot_core(page, MAX_ORDER-1);
                return;
        }
 
-       for (i = 0; i < nr_pages; i++, page++, pfn++)
-               __free_pages_boot_core(page, pfn, 0);
+       for (i = 0; i < nr_pages; i++, page++)
+               __free_pages_boot_core(page, 0);
 }
 
 /* Completion tracking for deferred_init_memmap() threads */
@@ -1477,10 +1640,7 @@ static inline void expand(struct zone *zone, struct page *page,
        }
 }
 
-/*
- * This page is about to be returned from the page allocator
- */
-static inline int check_new_page(struct page *page)
+static void check_new_page_bad(struct page *page)
 {
        const char *bad_reason = NULL;
        unsigned long bad_flags = 0;
@@ -1494,6 +1654,9 @@ static inline int check_new_page(struct page *page)
        if (unlikely(page->flags & __PG_HWPOISON)) {
                bad_reason = "HWPoisoned (hardware-corrupted)";
                bad_flags = __PG_HWPOISON;
+               /* Don't complain about hwpoisoned pages */
+               page_mapcount_reset(page); /* remove PageBuddy */
+               return;
        }
        if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
                bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
@@ -1503,11 +1666,20 @@ static inline int check_new_page(struct page *page)
        if (unlikely(page->mem_cgroup))
                bad_reason = "page still charged to cgroup";
 #endif
-       if (unlikely(bad_reason)) {
-               bad_page(page, bad_reason, bad_flags);
-               return 1;
-       }
-       return 0;
+       bad_page(page, bad_reason, bad_flags);
+}
+
+/*
+ * This page is about to be returned from the page allocator
+ */
+static inline int check_new_page(struct page *page)
+{
+       if (likely(page_expected_state(page,
+                               PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
+               return 0;
+
+       check_new_page_bad(page);
+       return 1;
 }
 
 static inline bool free_pages_prezeroed(bool poisoned)
@@ -1516,16 +1688,48 @@ static inline bool free_pages_prezeroed(bool poisoned)
                page_poisoning_enabled() && poisoned;
 }
 
-static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
-                                                               int alloc_flags)
+#ifdef CONFIG_DEBUG_VM
+static bool check_pcp_refill(struct page *page)
+{
+       return false;
+}
+
+static bool check_new_pcp(struct page *page)
+{
+       return check_new_page(page);
+}
+#else
+static bool check_pcp_refill(struct page *page)
+{
+       return check_new_page(page);
+}
+static bool check_new_pcp(struct page *page)
+{
+       return false;
+}
+#endif /* CONFIG_DEBUG_VM */
+
+static bool check_new_pages(struct page *page, unsigned int order)
+{
+       int i;
+       for (i = 0; i < (1 << order); i++) {
+               struct page *p = page + i;
+
+               if (unlikely(check_new_page(p)))
+                       return true;
+       }
+
+       return false;
+}
+
+static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
+                                                       unsigned int alloc_flags)
 {
        int i;
        bool poisoned = true;
 
        for (i = 0; i < (1 << order); i++) {
                struct page *p = page + i;
-               if (unlikely(check_new_page(p)))
-                       return 1;
                if (poisoned)
                        poisoned &= page_is_poisoned(p);
        }
@@ -1557,8 +1761,6 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
                set_page_pfmemalloc(page);
        else
                clear_page_pfmemalloc(page);
-
-       return 0;
 }
 
 /*
@@ -1980,6 +2182,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                if (unlikely(page == NULL))
                        break;
 
+               if (unlikely(check_pcp_refill(page)))
+                       continue;
+
                /*
                 * Split buddy pages returned by expand() are received here
                 * in physical page order. The page is added to the callers and
@@ -2157,6 +2362,10 @@ void mark_free_pages(struct zone *zone)
        for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
                if (pfn_valid(pfn)) {
                        page = pfn_to_page(pfn);
+
+                       if (page_zone(page) != zone)
+                               continue;
+
                        if (!swsusp_page_is_forbidden(page))
                                swsusp_unset_page_free(page);
                }
@@ -2187,7 +2396,7 @@ void free_hot_cold_page(struct page *page, bool cold)
        unsigned long pfn = page_to_pfn(page);
        int migratetype;
 
-       if (!free_pages_prepare(page, 0))
+       if (!free_pcp_prepare(page))
                return;
 
        migratetype = get_pfnblock_migratetype(page, pfn);
@@ -2342,13 +2551,45 @@ int split_free_page(struct page *page)
        return nr_pages;
 }
 
+/*
+ * Update NUMA hit/miss statistics
+ *
+ * Must be called with interrupts disabled.
+ *
+ * When __GFP_OTHER_NODE is set assume the node of the preferred
+ * zone is the local node. This is useful for daemons who allocate
+ * memory on behalf of other processes.
+ */
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
+                                                               gfp_t flags)
+{
+#ifdef CONFIG_NUMA
+       int local_nid = numa_node_id();
+       enum zone_stat_item local_stat = NUMA_LOCAL;
+
+       if (unlikely(flags & __GFP_OTHER_NODE)) {
+               local_stat = NUMA_OTHER;
+               local_nid = preferred_zone->node;
+       }
+
+       if (z->node == local_nid) {
+               __inc_zone_state(z, NUMA_HIT);
+               __inc_zone_state(z, local_stat);
+       } else {
+               __inc_zone_state(z, NUMA_MISS);
+               __inc_zone_state(preferred_zone, NUMA_FOREIGN);
+       }
+#endif
+}
+
 /*
  * Allocate a page from the given zone. Use pcplists for order-0 allocations.
  */
 static inline
 struct page *buffered_rmqueue(struct zone *preferred_zone,
                        struct zone *zone, unsigned int order,
-                       gfp_t gfp_flags, int alloc_flags, int migratetype)
+                       gfp_t gfp_flags, unsigned int alloc_flags,
+                       int migratetype)
 {
        unsigned long flags;
        struct page *page;
@@ -2359,23 +2600,27 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
                struct list_head *list;
 
                local_irq_save(flags);
-               pcp = &this_cpu_ptr(zone->pageset)->pcp;
-               list = &pcp->lists[migratetype];
-               if (list_empty(list)) {
-                       pcp->count += rmqueue_bulk(zone, 0,
-                                       pcp->batch, list,
-                                       migratetype, cold);
-                       if (unlikely(list_empty(list)))
-                               goto failed;
-               }
+               do {
+                       pcp = &this_cpu_ptr(zone->pageset)->pcp;
+                       list = &pcp->lists[migratetype];
+                       if (list_empty(list)) {
+                               pcp->count += rmqueue_bulk(zone, 0,
+                                               pcp->batch, list,
+                                               migratetype, cold);
+                               if (unlikely(list_empty(list)))
+                                       goto failed;
+                       }
 
-               if (cold)
-                       page = list_last_entry(list, struct page, lru);
-               else
-                       page = list_first_entry(list, struct page, lru);
+                       if (cold)
+                               page = list_last_entry(list, struct page, lru);
+                       else
+                               page = list_first_entry(list, struct page, lru);
 
-               list_del(&page->lru);
-               pcp->count--;
+                       __dec_zone_state(zone, NR_ALLOC_BATCH);
+                       list_del(&page->lru);
+                       pcp->count--;
+
+               } while (check_new_pcp(page));
        } else {
                /*
                 * We most definitely don't want callers attempting to
@@ -2384,22 +2629,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
                WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
                spin_lock_irqsave(&zone->lock, flags);
 
-               page = NULL;
-               if (alloc_flags & ALLOC_HARDER) {
-                       page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
-                       if (page)
-                               trace_mm_page_alloc_zone_locked(page, order, migratetype);
-               }
-               if (!page)
-                       page = __rmqueue(zone, order, migratetype);
+               do {
+                       page = NULL;
+                       if (alloc_flags & ALLOC_HARDER) {
+                               page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+                               if (page)
+                                       trace_mm_page_alloc_zone_locked(page, order, migratetype);
+                       }
+                       if (!page)
+                               page = __rmqueue(zone, order, migratetype);
+               } while (page && check_new_pages(page, order));
                spin_unlock(&zone->lock);
                if (!page)
                        goto failed;
+               __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
                __mod_zone_freepage_state(zone, -(1 << order),
                                          get_pcppage_migratetype(page));
        }
 
-       __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
        if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
            !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
                set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
@@ -2500,13 +2747,13 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  * one free page of a suitable size. Checking now avoids taking the zone lock
  * to check in the allocation paths if no pages are free.
  */
-static bool __zone_watermark_ok(struct zone *z, unsigned int order,
-                       unsigned long mark, int classzone_idx, int alloc_flags,
-                       long free_pages)
+bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+                        int classzone_idx, unsigned int alloc_flags,
+                        long free_pages)
 {
        long min = mark;
        int o;
-       const int alloc_harder = (alloc_flags & ALLOC_HARDER);
+       const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
 
        /* free_pages may go negative - that's OK */
        free_pages -= (1 << order) - 1;
@@ -2569,12 +2816,38 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
 }
 
 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
-                     int classzone_idx, int alloc_flags)
+                     int classzone_idx, unsigned int alloc_flags)
 {
        return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
                                        zone_page_state(z, NR_FREE_PAGES));
 }
 
+static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
+               unsigned long mark, int classzone_idx, unsigned int alloc_flags)
+{
+       long free_pages = zone_page_state(z, NR_FREE_PAGES);
+       long cma_pages = 0;
+
+#ifdef CONFIG_CMA
+       /* If allocation can't use CMA areas don't use free CMA pages */
+       if (!(alloc_flags & ALLOC_CMA))
+               cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
+
+       /*
+        * Fast check for order-0 only. If this fails then the reserves
+        * need to be calculated. There is a corner case where the check
+        * passes but only the high-order atomic reserves are free. If
+        * the caller is !atomic then it'll uselessly search the free
+        * list. That corner case is then slower but it is harmless.
+        */
+       if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
+               return true;
+
+       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+                                       free_pages);
+}
+
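
A worked example of the order-0 fast path in zone_watermark_fast() above (all numbers invented):

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	long free_pages = 1200;     /* zone_page_state(z, NR_FREE_PAGES) */
	long cma_pages = 300;       /* free CMA pages this request cannot use */
	long mark = 800;            /* selected watermark */
	long lowmem_reserve = 50;   /* z->lowmem_reserve[classzone_idx] */

	/* (1200 - 300) > 800 + 50, i.e. 900 > 850, so the fast check passes. */
	bool fast_ok = (free_pages - cma_pages) > mark + lowmem_reserve;

	printf("order-0 fast path passes: %s\n", fast_ok ? "yes" : "no");
	return 0;
}

When the comparison fails, or for order > 0, the function falls back to __zone_watermark_ok() and its per-order free-list walk.
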
 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
                        unsigned long mark, int classzone_idx)
 {
@@ -2630,27 +2903,24 @@ static struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                                                const struct alloc_context *ac)
 {
-       struct zonelist *zonelist = ac->zonelist;
-       struct zoneref *z;
-       struct page *page = NULL;
+       struct zoneref *z = ac->preferred_zoneref;
        struct zone *zone;
-       int nr_fair_skipped = 0;
-       bool zonelist_rescan;
+       bool fair_skipped = false;
+       bool apply_fair = (alloc_flags & ALLOC_FAIR);
 
 zonelist_scan:
-       zonelist_rescan = false;
-
        /*
         * Scan zonelist, looking for a zone with enough free.
         * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
         */
-       for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
+       for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
                                                                ac->nodemask) {
+               struct page *page;
                unsigned long mark;
 
                if (cpusets_enabled() &&
                        (alloc_flags & ALLOC_CPUSET) &&
-                       !cpuset_zone_allowed(zone, gfp_mask))
+                       !__cpuset_zone_allowed(zone, gfp_mask))
                                continue;
                /*
                 * Distribute pages in proportion to the individual
@@ -2658,13 +2928,16 @@ zonelist_scan:
                 * page was allocated in should have no effect on the
                 * time the page has in memory before being reclaimed.
                 */
-               if (alloc_flags & ALLOC_FAIR) {
-                       if (!zone_local(ac->preferred_zone, zone))
-                               break;
+               if (apply_fair) {
                        if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
-                               nr_fair_skipped++;
+                               fair_skipped = true;
                                continue;
                        }
+                       if (!zone_local(ac->preferred_zoneref->zone, zone)) {
+                               if (fair_skipped)
+                                       goto reset_fair;
+                               apply_fair = false;
+                       }
                }
                /*
                 * When allocating a page cache page for writing, we
@@ -2696,8 +2969,8 @@ zonelist_scan:
                        continue;
 
                mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
-               if (!zone_watermark_ok(zone, order, mark,
-                                      ac->classzone_idx, alloc_flags)) {
+               if (!zone_watermark_fast(zone, order, mark,
+                                      ac_classzone_idx(ac), alloc_flags)) {
                        int ret;
 
                        /* Checked here to keep the fast path fast */
@@ -2706,7 +2979,7 @@ zonelist_scan:
                                goto try_this_zone;
 
                        if (zone_reclaim_mode == 0 ||
-                           !zone_allows_reclaim(ac->preferred_zone, zone))
+                           !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
                                continue;
 
                        ret = zone_reclaim(zone, gfp_mask, order);
@@ -2720,7 +2993,7 @@ zonelist_scan:
                        default:
                                /* did we reclaim enough */
                                if (zone_watermark_ok(zone, order, mark,
-                                               ac->classzone_idx, alloc_flags))
+                                               ac_classzone_idx(ac), alloc_flags))
                                        goto try_this_zone;
 
                                continue;
@@ -2728,11 +3001,10 @@ zonelist_scan:
                }
 
 try_this_zone:
-               page = buffered_rmqueue(ac->preferred_zone, zone, order,
+               page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
                                gfp_mask, alloc_flags, ac->migratetype);
                if (page) {
-                       if (prep_new_page(page, order, gfp_mask, alloc_flags))
-                               goto try_this_zone;
+                       prep_new_page(page, order, gfp_mask, alloc_flags);
 
                        /*
                         * If this is a high-order atomic allocation then check
@@ -2753,18 +3025,14 @@ try_this_zone:
         * include remote zones now, before entering the slowpath and waking
         * kswapd: prefer spilling to a remote zone over swapping locally.
         */
-       if (alloc_flags & ALLOC_FAIR) {
-               alloc_flags &= ~ALLOC_FAIR;
-               if (nr_fair_skipped) {
-                       zonelist_rescan = true;
-                       reset_alloc_batches(ac->preferred_zone);
-               }
-               if (nr_online_nodes > 1)
-                       zonelist_rescan = true;
-       }
-
-       if (zonelist_rescan)
+       if (fair_skipped) {
+reset_fair:
+               apply_fair = false;
+               fair_skipped = false;
+               reset_alloc_batches(ac->preferred_zoneref->zone);
+               z = ac->preferred_zoneref;
                goto zonelist_scan;
+       }
 
        return NULL;
 }
@@ -2872,22 +3140,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                /* The OOM killer does not needlessly kill tasks for lowmem */
                if (ac->high_zoneidx < ZONE_NORMAL)
                        goto out;
-               /* The OOM killer does not compensate for IO-less reclaim */
-               if (!(gfp_mask & __GFP_FS)) {
-                       /*
-                        * XXX: Page reclaim didn't yield anything,
-                        * and the OOM killer can't be invoked, but
-                        * keep looping as per tradition.
-                        *
-                        * But do not keep looping if oom_killer_disable()
-                        * was already called, for the system is trying to
-                        * enter a quiescent state during suspend.
-                        */
-                       *did_some_progress = !oom_killer_disabled;
-                       goto out;
-               }
                if (pm_suspended_storage())
                        goto out;
+               /*
+                * XXX: GFP_NOFS allocations should rather fail than rely on
+                * other requests to make forward progress.
+                * We are in an unfortunate situation where out_of_memory cannot
+                * do much for this context but let's try it to at least get
+                * access to memory reserves if the current task is killed (see
+                * out_of_memory). Once filesystems are ready to handle allocation
+                * failures more gracefully we should just bail out here.
+                */
+
                /* The OOM killer may not free memory on a specific node */
                if (gfp_mask & __GFP_THISNODE)
                        goto out;
@@ -2913,34 +3177,33 @@ out:
        return page;
 }
 
+
+/*
+ * Maximum number of compaction retries with progress before the OOM
+ * killer is considered the only way to move forward.
+ */
+#define MAX_COMPACT_RETRIES 16
+
 #ifdef CONFIG_COMPACTION
 /* Try memory compaction for high-order allocations before reclaim */
 static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
-               int alloc_flags, const struct alloc_context *ac,
-               enum migrate_mode mode, int *contended_compaction,
-               bool *deferred_compaction)
+               unsigned int alloc_flags, const struct alloc_context *ac,
+               enum migrate_mode mode, enum compact_result *compact_result)
 {
-       unsigned long compact_result;
        struct page *page;
+       int contended_compaction;
 
        if (!order)
                return NULL;
 
        current->flags |= PF_MEMALLOC;
-       compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
-                                               mode, contended_compaction);
+       *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
+                                               mode, &contended_compaction);
        current->flags &= ~PF_MEMALLOC;
 
-       switch (compact_result) {
-       case COMPACT_DEFERRED:
-               *deferred_compaction = true;
-               /* fall-through */
-       case COMPACT_SKIPPED:
+       if (*compact_result <= COMPACT_INACTIVE)
                return NULL;
-       default:
-               break;
-       }
 
        /*
         * At least in one zone compaction wasn't deferred or skipped, so let's
@@ -2966,19 +3229,112 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         */
        count_vm_event(COMPACTFAIL);
 
+       /*
+        * In all zones where compaction was attempted (and not
+        * deferred or skipped), lock contention has been detected.
+        * For THP allocation we do not want to disrupt the others
+        * so we fallback to base pages instead.
+        */
+       if (contended_compaction == COMPACT_CONTENDED_LOCK)
+               *compact_result = COMPACT_CONTENDED;
+
+       /*
+        * If compaction was aborted due to need_resched(), we do not
+        * want to further increase allocation latency, unless it is
+        * khugepaged trying to collapse.
+        */
+       if (contended_compaction == COMPACT_CONTENDED_SCHED
+               && !(current->flags & PF_KTHREAD))
+               *compact_result = COMPACT_CONTENDED;
+
        cond_resched();
 
        return NULL;
 }
+
+static inline bool
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
+                    enum compact_result compact_result, enum migrate_mode *migrate_mode,
+                    int compaction_retries)
+{
+       int max_retries = MAX_COMPACT_RETRIES;
+
+       if (!order)
+               return false;
+
+       /*
+        * compaction considers all zones as desperately out of memory
+        * so it doesn't really make much sense to retry except when the
+        * failure could be caused by weak migration mode.
+        */
+       if (compaction_failed(compact_result)) {
+               if (*migrate_mode == MIGRATE_ASYNC) {
+                       *migrate_mode = MIGRATE_SYNC_LIGHT;
+                       return true;
+               }
+               return false;
+       }
+
+       /*
+        * make sure the compaction wasn't deferred or didn't bail out early
+        * due to lock contention before we declare that we should give up.
+        * But do not retry if the given zonelist is not suitable for
+        * compaction.
+        */
+       if (compaction_withdrawn(compact_result))
+               return compaction_zonelist_suitable(ac, order, alloc_flags);
+
+       /*
+        * !costly requests are much more important than __GFP_REPEAT
+        * costly ones because they are de facto nofail and invoke OOM
+        * killer to move on while costly can fail and users are ready
+        * to cope with that. 1/4 retries is rather arbitrary but we
+        * would need much more detailed feedback from compaction to
+        * make a better decision.
+        */
+       if (order > PAGE_ALLOC_COSTLY_ORDER)
+               max_retries /= 4;
+       if (compaction_retries <= max_retries)
+               return true;
+
+       return false;
+}
 #else
 static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
-               int alloc_flags, const struct alloc_context *ac,
-               enum migrate_mode mode, int *contended_compaction,
-               bool *deferred_compaction)
+               unsigned int alloc_flags, const struct alloc_context *ac,
+               enum migrate_mode mode, enum compact_result *compact_result)
 {
+       *compact_result = COMPACT_SKIPPED;
        return NULL;
 }
+
+static inline bool
+should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
+                    enum compact_result compact_result,
+                    enum migrate_mode *migrate_mode,
+                    int compaction_retries)
+{
+       struct zone *zone;
+       struct zoneref *z;
+
+       if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
+               return false;
+
+       /*
+        * There are setups with compaction disabled which would prefer to loop
+        * inside the allocator rather than hit the oom killer prematurely.
+        * Let's give them a good hope and keep retrying while the order-0
+        * watermarks are OK.
+        */
+       for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+                                       ac->nodemask) {
+               if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
+                                       ac_classzone_idx(ac), alloc_flags))
+                       return true;
+       }
+       return false;
+}
 #endif /* CONFIG_COMPACTION */
 
 /* Perform direct synchronous page reclaim */
@@ -3013,7 +3369,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 /* The really slow allocator path where we enter direct reclaim */
 static inline struct page *
 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
-               int alloc_flags, const struct alloc_context *ac,
+               unsigned int alloc_flags, const struct alloc_context *ac,
                unsigned long *did_some_progress)
 {
        struct page *page = NULL;
@@ -3049,13 +3405,13 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
 
        for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
                                                ac->high_zoneidx, ac->nodemask)
-               wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone));
+               wakeup_kswapd(zone, order, ac_classzone_idx(ac));
 }
 
-static inline int
+static inline unsigned int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
-       int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+       unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
 
        /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
        BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -3110,18 +3466,113 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
        return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
 }
 
+/*
+ * Maximum number of reclaim retries without any progress before the OOM
+ * killer is considered the only way to move forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
+/*
+ * Checks whether it makes sense to retry the reclaim to make forward progress
+ * for the given allocation request.
+ * The reclaim feedback represented by did_some_progress (any progress during
+ * the last reclaim round) and no_progress_loops (number of reclaim rounds without
+ * any progress in a row) is considered as well as the reclaimable pages on the
+ * applicable zone list (with a backoff mechanism which is a function of
+ * no_progress_loops).
+ *
+ * Returns true if a retry is viable or false to enter the oom path.
+ */
+static inline bool
+should_reclaim_retry(gfp_t gfp_mask, unsigned order,
+                    struct alloc_context *ac, int alloc_flags,
+                    bool did_some_progress, int no_progress_loops)
+{
+       struct zone *zone;
+       struct zoneref *z;
+
+       /*
+        * Make sure we converge to OOM if we cannot make any progress
+        * several times in the row.
+        */
+       if (no_progress_loops > MAX_RECLAIM_RETRIES)
+               return false;
+
+       /*
+        * Keep reclaiming pages while there is a chance this will lead somewhere.
+        * If none of the target zones can satisfy our allocation request even
+        * if all reclaimable pages are considered then we are screwed and have
+        * to go OOM.
+        */
+       for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+                                       ac->nodemask) {
+               unsigned long available;
+               unsigned long reclaimable;
+
+               available = reclaimable = zone_reclaimable_pages(zone);
+               available -= DIV_ROUND_UP(no_progress_loops * available,
+                                         MAX_RECLAIM_RETRIES);
+               available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
+               /*
+                * Would the allocation succeed if we reclaimed the whole
+                * available?
+                */
+               if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
+                               ac_classzone_idx(ac), alloc_flags, available)) {
+                       /*
+                        * If we didn't make any progress and have a lot of
+                        * dirty + writeback pages then we should wait for
+                        * an IO to complete to slow down the reclaim and
+                        * prevent premature OOM
+                        */
+                       if (!did_some_progress) {
+                               unsigned long writeback;
+                               unsigned long dirty;
+
+                               writeback = zone_page_state_snapshot(zone,
+                                                                    NR_WRITEBACK);
+                               dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY);
+
+                               if (2*(writeback + dirty) > reclaimable) {
+                                       congestion_wait(BLK_RW_ASYNC, HZ/10);
+                                       return true;
+                               }
+                       }
+
+                       /*
+                        * Memory allocation/reclaim might be called from a WQ
+                        * context and the current implementation of the WQ
+                        * concurrency control doesn't recognize that
+                        * a particular WQ is congested if the worker thread is
+                        * looping without ever sleeping. Therefore we have to
+                        * do a short sleep here rather than calling
+                        * cond_resched().
+                        */
+                       if (current->flags & PF_WQ_WORKER)
+                               schedule_timeout_uninterruptible(1);
+                       else
+                               cond_resched();
+
+                       return true;
+               }
+       }
+
+       return false;
+}
+
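
The backoff in should_reclaim_retry() above discounts the reclaimable-page estimate linearly with no_progress_loops, so after MAX_RECLAIM_RETRIES (16) rounds without progress the estimate reaches zero and only the NR_FREE_PAGES snapshot is left to satisfy the watermark test. A standalone sketch of that decay, with an invented starting figure:

#include <stdio.h>

#define MAX_RECLAIM_RETRIES 16
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long reclaimable = 32000;	/* invented starting estimate */
	int loops;

	for (loops = 0; loops <= MAX_RECLAIM_RETRIES; loops++) {
		unsigned long available = reclaimable -
			DIV_ROUND_UP(loops * reclaimable, MAX_RECLAIM_RETRIES);

		/* e.g. loop 0 -> 32000, loop 8 -> 16000, loop 16 -> 0 */
		printf("loop %2d: credit %lu reclaimable pages\n",
		       loops, available);
	}
	return 0;
}
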
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                                struct alloc_context *ac)
 {
        bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
        struct page *page = NULL;
-       int alloc_flags;
-       unsigned long pages_reclaimed = 0;
+       unsigned int alloc_flags;
        unsigned long did_some_progress;
        enum migrate_mode migration_mode = MIGRATE_ASYNC;
-       bool deferred_compaction = false;
-       int contended_compaction = COMPACT_CONTENDED_NONE;
+       enum compact_result compact_result;
+       int compaction_retries = 0;
+       int no_progress_loops = 0;
 
        /*
         * In the slowpath, we sanity check order to avoid ever trying to
@@ -3154,14 +3605,14 @@ retry:
        alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
        /*
-        * Find the true preferred zone if the allocation is unconstrained by
-        * cpusets.
+        * Reset the zonelist iterators if memory policies can be ignored.
+        * These allocations are high priority and system rather than user
+        * orientated.
         */
-       if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) {
-               struct zoneref *preferred_zoneref;
-               preferred_zoneref = first_zones_zonelist(ac->zonelist,
-                               ac->high_zoneidx, NULL, &ac->preferred_zone);
-               ac->classzone_idx = zonelist_zone_idx(preferred_zoneref);
+       if ((alloc_flags & ALLOC_NO_WATERMARKS) || !(alloc_flags & ALLOC_CPUSET)) {
+               ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
+               ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+                                       ac->high_zoneidx, ac->nodemask);
        }
 
        /* This is the last chance, in general, before the goto nopage. */
@@ -3172,12 +3623,6 @@ retry:
 
        /* Allocate without watermarks if the context allows */
        if (alloc_flags & ALLOC_NO_WATERMARKS) {
-               /*
-                * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
-                * the allocation is high priority and these type of
-                * allocations are system rather than user orientated
-                */
-               ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
                page = get_page_from_freelist(gfp_mask, order,
                                                ALLOC_NO_WATERMARKS, ac);
                if (page)
@@ -3219,8 +3664,7 @@ retry:
         */
        page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
                                        migration_mode,
-                                       &contended_compaction,
-                                       &deferred_compaction);
+                                       &compact_result);
        if (page)
                goto got_pg;
 
@@ -3233,35 +3677,19 @@ retry:
                 * to heavily disrupt the system, so we fail the allocation
                 * instead of entering direct reclaim.
                 */
-               if (deferred_compaction)
-                       goto nopage;
-
-               /*
-                * In all zones where compaction was attempted (and not
-                * deferred or skipped), lock contention has been detected.
-                * For THP allocation we do not want to disrupt the others
-                * so we fallback to base pages instead.
-                */
-               if (contended_compaction == COMPACT_CONTENDED_LOCK)
+               if (compact_result == COMPACT_DEFERRED)
                        goto nopage;
 
                /*
-                * If compaction was aborted due to need_resched(), we do not
-                * want to further increase allocation latency, unless it is
-                * khugepaged trying to collapse.
+                * Compaction is contended so rather back off than cause
+                * excessive stalls.
                 */
-               if (contended_compaction == COMPACT_CONTENDED_SCHED
-                       && !(current->flags & PF_KTHREAD))
+               if (compact_result == COMPACT_CONTENDED)
                        goto nopage;
        }
 
-       /*
-        * It can become very expensive to allocate transparent hugepages at
-        * fault, so use asynchronous memory compaction for THP unless it is
-        * khugepaged trying to collapse.
-        */
-       if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD))
-               migration_mode = MIGRATE_SYNC_LIGHT;
+       if (order && compaction_made_progress(compact_result))
+               compaction_retries++;
 
        /* Try direct reclaim and then allocating */
        page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
@@ -3273,14 +3701,38 @@ retry:
        if (gfp_mask & __GFP_NORETRY)
                goto noretry;
 
-       /* Keep reclaiming pages as long as there is reasonable progress */
-       pages_reclaimed += did_some_progress;
-       if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
-           ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
-               /* Wait for some write requests to complete then retry */
-               wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
+       /*
+        * Do not retry costly high order allocations unless they are
+        * __GFP_REPEAT
+        */
+       if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
+               goto noretry;
+
+       /*
+        * Costly allocations might have made a progress but this doesn't mean
+        * their order will become available due to high fragmentation so
+        * always increment the no progress counter for them
+        */
+       if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
+               no_progress_loops = 0;
+       else
+               no_progress_loops++;
+
+       if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
+                                did_some_progress > 0, no_progress_loops))
+               goto retry;
+
+       /*
+        * It doesn't make any sense to retry compaction if the order-0
+        * reclaim is not able to make any progress, because the current
+        * implementation of compaction depends on a sufficient amount
+        * of free memory (see __compaction_suitable)
+        */
+       if (did_some_progress > 0 &&
+                       should_compact_retry(ac, order, alloc_flags,
+                               compact_result, &migration_mode,
+                               compaction_retries))
                goto retry;
-       }
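
Taken together, the new checks order the decisions as: give up on costly orders without __GFP_REPEAT, update the no-progress counter, ask should_reclaim_retry(), and only then consider a compaction retry, which is pointless without reclaim progress. A compact model of that ordering (the toy limits and predicates below are illustrative stand-ins, not the kernel helpers; the real should_reclaim_retry() additionally consults zone watermarks, which the toy predicate ignores):

#include <stdbool.h>

#define EXAMPLE_COSTLY_ORDER	3	/* stands in for PAGE_ALLOC_COSTLY_ORDER */

static bool example_reclaim_retry(int no_progress_loops)
{
	/* Toy policy: give up after 16 consecutive no-progress passes. */
	return no_progress_loops < 16;
}

static bool example_compact_retry(unsigned long pages_reclaimed,
				  int compaction_retries)
{
	/* Compaction needs free memory; pointless without reclaim progress. */
	return pages_reclaimed > 0 && compaction_retries < 16;
}

/* Decide whether one more reclaim/compaction pass is worthwhile. */
static bool example_should_retry(unsigned int order, bool gfp_repeat,
				 unsigned long pages_reclaimed,
				 int *no_progress_loops, int compaction_retries)
{
	/* Costly orders are retried only when the caller asked for it. */
	if (order > EXAMPLE_COSTLY_ORDER && !gfp_repeat)
		return false;

	/*
	 * Progress on a costly order does not mean that order will become
	 * available, so only non-costly progress resets the counter.
	 */
	if (pages_reclaimed && order <= EXAMPLE_COSTLY_ORDER)
		*no_progress_loops = 0;
	else
		(*no_progress_loops)++;

	if (example_reclaim_retry(*no_progress_loops))
		return true;

	return example_compact_retry(pages_reclaimed, compaction_retries);
}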
 
        /* Reclaim has failed us, start killing things */
        page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
@@ -3288,19 +3740,28 @@ retry:
                goto got_pg;
 
        /* Retry as long as the OOM killer is making progress */
-       if (did_some_progress)
+       if (did_some_progress) {
+               no_progress_loops = 0;
                goto retry;
+       }
 
 noretry:
        /*
-        * High-order allocations do not necessarily loop after
-        * direct reclaim and reclaim/compaction depends on compaction
-        * being called after reclaim so call directly if necessary
+        * High-order allocations do not necessarily loop after direct reclaim,
+        * and reclaim/compaction depends on compaction being called after
+        * reclaim, so call it directly if necessary.
+        * It can become very expensive to allocate transparent hugepages at
+        * fault, so use asynchronous memory compaction for THP unless it is
+        * khugepaged trying to collapse. All other requests should tolerate
+        * at least light sync migration.
         */
+       if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD))
+               migration_mode = MIGRATE_ASYNC;
+       else
+               migration_mode = MIGRATE_SYNC_LIGHT;
        page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
                                            ac, migration_mode,
-                                           &contended_compaction,
-                                           &deferred_compaction);
+                                           &compact_result);
        if (page)
                goto got_pg;
 nopage:
@@ -3316,17 +3777,24 @@ struct page *
 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                        struct zonelist *zonelist, nodemask_t *nodemask)
 {
-       struct zoneref *preferred_zoneref;
-       struct page *page = NULL;
+       struct page *page;
        unsigned int cpuset_mems_cookie;
-       int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
-       gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
+       unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
+       gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
        struct alloc_context ac = {
                .high_zoneidx = gfp_zone(gfp_mask),
+               .zonelist = zonelist,
                .nodemask = nodemask,
                .migratetype = gfpflags_to_migratetype(gfp_mask),
        };
 
+       if (cpusets_enabled()) {
+               alloc_mask |= __GFP_HARDWALL;
+               alloc_flags |= ALLOC_CPUSET;
+               if (!ac.nodemask)
+                       ac.nodemask = &cpuset_current_mems_allowed;
+       }
+
        gfp_mask &= gfp_allowed_mask;
 
        lockdep_trace_alloc(gfp_mask);
@@ -3350,49 +3818,58 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 retry_cpuset:
        cpuset_mems_cookie = read_mems_allowed_begin();
 
-       /* We set it here, as __alloc_pages_slowpath might have changed it */
-       ac.zonelist = zonelist;
-
        /* Dirty zone balancing only done in the fast path */
        ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
 
-       /* The preferred zone is used for statistics later */
-       preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
-                               ac.nodemask ? : &cpuset_current_mems_allowed,
-                               &ac.preferred_zone);
-       if (!ac.preferred_zone)
-               goto out;
-       ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
+       /*
+        * The preferred zone is used for statistics but crucially it is
+        * also used as the starting point for the zonelist iterator. It
+        * may get reset for allocations that ignore memory policies.
+        */
+       ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
+                                       ac.high_zoneidx, ac.nodemask);
+       if (!ac.preferred_zoneref) {
+               page = NULL;
+               goto no_zone;
+       }
 
        /* First allocation attempt */
-       alloc_mask = gfp_mask|__GFP_HARDWALL;
        page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
-       if (unlikely(!page)) {
-               /*
-                * Runtime PM, block IO and its error handling path
-                * can deadlock because I/O on the device might not
-                * complete.
-                */
-               alloc_mask = memalloc_noio_flags(gfp_mask);
-               ac.spread_dirty_pages = false;
-
-               page = __alloc_pages_slowpath(alloc_mask, order, &ac);
-       }
+       if (likely(page))
+               goto out;
 
-       if (kmemcheck_enabled && page)
-               kmemcheck_pagealloc_alloc(page, order, gfp_mask);
+       /*
+        * Runtime PM, block IO and its error handling path can deadlock
+        * because I/O on the device might not complete.
+        */
+       alloc_mask = memalloc_noio_flags(gfp_mask);
+       ac.spread_dirty_pages = false;
 
-       trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
+       /*
+        * Restore the original nodemask if it was potentially replaced with
+        * &cpuset_current_mems_allowed to optimize the fast-path attempt.
+        */
+       if (cpusets_enabled())
+               ac.nodemask = nodemask;
+       page = __alloc_pages_slowpath(alloc_mask, order, &ac);
 
-out:
+no_zone:
        /*
         * When updating a task's mems_allowed, it is possible to race with
         * parallel threads in such a way that an allocation can fail while
         * the mask is being updated. If a page allocation is about to fail,
         * check if the cpuset changed during allocation and if so, retry.
         */
-       if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+       if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) {
+               alloc_mask = gfp_mask;
                goto retry_cpuset;
+       }
+
+out:
+       if (kmemcheck_enabled && page)
+               kmemcheck_pagealloc_alloc(page, order, gfp_mask);
+
+       trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
 
        return page;
 }
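
The rewritten __alloc_pages_nodemask() keeps one overall shape: take a mems_allowed cookie, try the fast path (with the nodemask possibly tightened to the cpuset's), fall back to the slowpath with the caller's original nodemask and a NOIO-filtered mask, and restart only when the allocation failed while the cpuset changed underneath it. A minimal user-space sketch of that cookie/retry pattern, using a plain C11 atomic counter instead of the kernel's mems_allowed seqcount (all names below are made up), might look like:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

/* Hypothetical counter bumped whenever the allocation policy changes. */
static atomic_uint policy_seq;

static unsigned int policy_read_begin(void)
{
	return atomic_load_explicit(&policy_seq, memory_order_acquire);
}

static bool policy_read_retry(unsigned int cookie)
{
	/* Retry only if a writer changed the policy since we started. */
	return atomic_load_explicit(&policy_seq, memory_order_acquire) != cookie;
}

/* Stubs standing in for the fast and slow allocation paths. */
static void *try_fast_path(void) { return NULL; }
static void *try_slow_path(void) { return NULL; }

static void *allocate_with_policy(void)
{
	unsigned int cookie;
	void *obj;

retry:
	cookie = policy_read_begin();

	obj = try_fast_path();		/* cheap attempt under the current policy */
	if (obj)
		return obj;

	obj = try_slow_path();		/* expensive fallback */
	if (!obj && policy_read_retry(cookie))
		goto retry;		/* failed during a policy update: try again */

	return obj;
}

Note that the patch also resets alloc_mask back to the caller's gfp_mask before jumping to retry_cpuset, so the retried fast path starts from a clean mask.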
@@ -3790,6 +4267,8 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 {
        int zone_type;          /* needs to be signed */
        unsigned long managed_pages = 0;
+       unsigned long managed_highpages = 0;
+       unsigned long free_highpages = 0;
        pg_data_t *pgdat = NODE_DATA(nid);
 
        for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
@@ -3798,12 +4277,19 @@ void si_meminfo_node(struct sysinfo *val, int nid)
        val->sharedram = node_page_state(nid, NR_SHMEM);
        val->freeram = node_page_state(nid, NR_FREE_PAGES);
 #ifdef CONFIG_HIGHMEM
-       val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
-       val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
-                       NR_FREE_PAGES);
+       for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+               struct zone *zone = &pgdat->node_zones[zone_type];
+
+               if (is_highmem(zone)) {
+                       managed_highpages += zone->managed_pages;
+                       free_highpages += zone_page_state(zone, NR_FREE_PAGES);
+               }
+       }
+       val->totalhigh = managed_highpages;
+       val->freehigh = free_highpages;
 #else
-       val->totalhigh = 0;
-       val->freehigh = 0;
+       val->totalhigh = managed_highpages;
+       val->freehigh = free_highpages;
 #endif
        val->mem_unit = PAGE_SIZE;
 }
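
si_meminfo_node() now sums every zone of the node that is_highmem() classifies as highmem instead of hard-coding ZONE_HIGHMEM, and the !CONFIG_HIGHMEM branch simply reuses the zero-initialised accumulators. The system-wide counterparts of these counters are exposed through sysinfo(2); a small, hypothetical user-space check of the reported values would be:

#include <stdio.h>
#include <sys/sysinfo.h>

int main(void)
{
	struct sysinfo si;

	if (sysinfo(&si))
		return 1;

	/* totalhigh/freehigh are counted in units of si.mem_unit bytes. */
	printf("totalhigh: %llu kB\n",
	       (unsigned long long)si.totalhigh * si.mem_unit / 1024);
	printf("freehigh:  %llu kB\n",
	       (unsigned long long)si.freehigh * si.mem_unit / 1024);
	return 0;
}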
@@ -4390,13 +4876,12 @@ static void build_zonelists(pg_data_t *pgdat)
  */
 int local_memory_node(int node)
 {
-       struct zone *zone;
+       struct zoneref *z;
 
-       (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
+       z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
                                   gfp_zone(GFP_KERNEL),
-                                  NULL,
-                                  &zone);
-       return zone->node;
+                                  NULL);
+       return z->zone->node;
 }
 #endif
 
@@ -6394,49 +6879,6 @@ void setup_per_zone_wmarks(void)
        mutex_unlock(&zonelists_mutex);
 }
 
-/*
- * The inactive anon list should be small enough that the VM never has to
- * do too much work, but large enough that each inactive page has a chance
- * to be referenced again before it is swapped out.
- *
- * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
- * INACTIVE_ANON pages on this zone's LRU, maintained by the
- * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
- * the anonymous pages are kept on the inactive list.
- *
- * total     target    max
- * memory    ratio     inactive anon
- * -------------------------------------
- *   10MB       1         5MB
- *  100MB       1        50MB
- *    1GB       3       250MB
- *   10GB      10       0.9GB
- *  100GB      31         3GB
- *    1TB     101        10GB
- *   10TB     320        32GB
- */
-static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
-{
-       unsigned int gb, ratio;
-
-       /* Zone size in gigabytes */
-       gb = zone->managed_pages >> (30 - PAGE_SHIFT);
-       if (gb)
-               ratio = int_sqrt(10 * gb);
-       else
-               ratio = 1;
-
-       zone->inactive_ratio = ratio;
-}
-
-static void __meminit setup_per_zone_inactive_ratio(void)
-{
-       struct zone *zone;
-
-       for_each_zone(zone)
-               calculate_zone_inactive_ratio(zone);
-}
-
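
For reference, the table in the removed comment follows directly from ratio = int_sqrt(10 * gb), with gb the zone size in whole gigabytes and a fallback of 1 for zones smaller than 1GB. A throwaway program that reproduces the rows (with a simple integer square root standing in for the kernel's int_sqrt()):

#include <stdio.h>

/* Integer square root by bisection; stands in for the kernel's int_sqrt(). */
static unsigned long isqrt(unsigned long x)
{
	unsigned long lo = 0, hi = x / 2 + 1;

	while (lo < hi) {
		unsigned long mid = lo + (hi - lo + 1) / 2;

		if (mid <= x / mid)
			lo = mid;
		else
			hi = mid - 1;
	}
	return lo;
}

int main(void)
{
	/* Zone sizes in megabytes, one per row of the removed table. */
	static const unsigned long sizes_mb[] = {
		10, 100, 1024, 10240, 102400, 1048576, 10485760
	};
	unsigned int i;

	for (i = 0; i < sizeof(sizes_mb) / sizeof(sizes_mb[0]); i++) {
		unsigned long gb = sizes_mb[i] >> 10;	/* whole gigabytes */
		unsigned long ratio = gb ? isqrt(10 * gb) : 1;

		printf("%8lu MB -> inactive_ratio %lu\n", sizes_mb[i], ratio);
	}
	return 0;
}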
 /*
  * Initialise min_free_kbytes.
  *
@@ -6482,7 +6924,6 @@ int __meminit init_per_zone_wmark_min(void)
        setup_per_zone_wmarks();
        refresh_zone_stat_thresholds();
        setup_per_zone_lowmem_reserve();
-       setup_per_zone_inactive_ratio();
        return 0;
 }
 core_initcall(init_per_zone_wmark_min)
@@ -6725,98 +7166,6 @@ void *__init alloc_large_system_hash(const char *tablename,
        return table;
 }
 
-/* Return a pointer to the bitmap storing bits affecting a block of pages */
-static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
-                                                       unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
-       return __pfn_to_section(pfn)->pageblock_flags;
-#else
-       return zone->pageblock_flags;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
-       pfn &= (PAGES_PER_SECTION-1);
-       return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-#else
-       pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
-       return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-/**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @pfn: The target page frame number
- * @end_bitidx: The last bit of interest to retrieve
- * @mask: mask of bits that the caller is interested in
- *
- * Return: pageblock_bits flags
- */
-unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
-                                       unsigned long end_bitidx,
-                                       unsigned long mask)
-{
-       struct zone *zone;
-       unsigned long *bitmap;
-       unsigned long bitidx, word_bitidx;
-       unsigned long word;
-
-       zone = page_zone(page);
-       bitmap = get_pageblock_bitmap(zone, pfn);
-       bitidx = pfn_to_bitidx(zone, pfn);
-       word_bitidx = bitidx / BITS_PER_LONG;
-       bitidx &= (BITS_PER_LONG-1);
-
-       word = bitmap[word_bitidx];
-       bitidx += end_bitidx;
-       return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
-}
-
-/**
- * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @flags: The flags to set
- * @pfn: The target page frame number
- * @end_bitidx: The last bit of interest
- * @mask: mask of bits that the caller is interested in
- */
-void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
-                                       unsigned long pfn,
-                                       unsigned long end_bitidx,
-                                       unsigned long mask)
-{
-       struct zone *zone;
-       unsigned long *bitmap;
-       unsigned long bitidx, word_bitidx;
-       unsigned long old_word, word;
-
-       BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
-
-       zone = page_zone(page);
-       bitmap = get_pageblock_bitmap(zone, pfn);
-       bitidx = pfn_to_bitidx(zone, pfn);
-       word_bitidx = bitidx / BITS_PER_LONG;
-       bitidx &= (BITS_PER_LONG-1);
-
-       VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
-
-       bitidx += end_bitidx;
-       mask <<= (BITS_PER_LONG - bitidx - 1);
-       flags <<= (BITS_PER_LONG - bitidx - 1);
-
-       word = READ_ONCE(bitmap[word_bitidx]);
-       for (;;) {
-               old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
-               if (word == old_word)
-                       break;
-               word = old_word;
-       }
-}
-
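
The interesting part of the removed set_pfnblock_flags_mask() is the lock-free update of one bitmap word: read the word, compute the new value, and cmpxchg() it in, looping while some other CPU raced with the update. The same read-modify-write pattern expressed with C11 atomics (purely illustrative; @flags is assumed to be already shifted into the position selected by @mask):

#include <stdatomic.h>

/*
 * Replace the bits selected by @mask in *@word with @flags without taking a
 * lock, mirroring the cmpxchg() loop in set_pfnblock_flags_mask() above.
 */
static void atomic_update_bits(_Atomic unsigned long *word,
			       unsigned long mask, unsigned long flags)
{
	unsigned long old = atomic_load_explicit(word, memory_order_relaxed);

	/*
	 * On failure, compare_exchange_weak() reloads the current value into
	 * 'old', so each iteration recomputes the new word from fresh data
	 * until no other thread has modified it in between.
	 */
	while (!atomic_compare_exchange_weak(word, &old, (old & ~mask) | flags))
		;
}

The kernel's cmpxchg() returns the old value rather than a success flag, which is why the loop above in the patch compares word against old_word instead.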
 /*
  * This function checks whether pageblock includes unmovable pages or not.
  * If @count is not zero, it is okay to include less @count unmovable pages
@@ -6864,7 +7213,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
                 * We can't use page_count without pin a page
                 * because another CPU can free compound page.
                 * This check already skips compound tails of THP
-                * because their page->_count is zero at all time.
+                * because their page->_refcount is zero at all times.
                 */
                if (!page_ref_count(page)) {
                        if (PageBuddy(page))
@@ -7177,7 +7526,8 @@ void zone_pcp_reset(struct zone *zone)
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
- * All pages in the range must be isolated before calling this.
+ * All pages in the range must be in a single zone and isolated
+ * before calling this.
  */
 void
 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)