checkpatch: yet another commit id improvement

[cascardo/linux.git] / mm / vmscan.c
diff --git a/mm/vmscan.c b/mm/vmscan.c

index c4a2f45..374d95d 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -84,6 +84,9 @@ struct scan_control {
         /* Scan (total_size >> priority) pages at once */
         int priority;
  
+       /* The highest zone to isolate pages for reclaim from */
+       enum zone_type reclaim_idx;
+
         unsigned int may_writepage:1;
  
         /* Can mapped pages be reclaimed? */
@@ -191,26 +194,44 @@ static bool sane_reclaim(struct scan_control *sc)
  }
  #endif
  
+/*
+ * This misses isolated pages which are not accounted for to save counters.
+ * As the data only determines if reclaim or compaction continues, it is
+ * not expected that isolated pages will be a dominating factor.
+ */
  unsigned long zone_reclaimable_pages(struct zone *zone)
  {
         unsigned long nr;
  
-       nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) +
-            zone_page_state_snapshot(zone, NR_INACTIVE_FILE) +
-            zone_page_state_snapshot(zone, NR_ISOLATED_FILE);
+       nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
+               zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
+       if (get_nr_swap_pages() > 0)
+               nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
+                       zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
+
+       return nr;
+}
+
+unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
+{
+       unsigned long nr;
+
+       nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) +
+            node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) +
+            node_page_state_snapshot(pgdat, NR_ISOLATED_FILE);
  
         if (get_nr_swap_pages() > 0)
-               nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) +
-                     zone_page_state_snapshot(zone, NR_INACTIVE_ANON) +
-                     zone_page_state_snapshot(zone, NR_ISOLATED_ANON);
+               nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) +
+                     node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) +
+                     node_page_state_snapshot(pgdat, NR_ISOLATED_ANON);
  
         return nr;
  }
  
-bool zone_reclaimable(struct zone *zone)
+bool pgdat_reclaimable(struct pglist_data *pgdat)
  {
-       return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) <
-               zone_reclaimable_pages(zone) * 6;
+       return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) <
+               pgdat_reclaimable_pages(pgdat) * 6;
  }
  
  unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
@@ -218,7 +239,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
         if (!mem_cgroup_disabled())
                 return mem_cgroup_get_lru_size(lruvec, lru);
  
-       return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
+       return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
  }
  
  /*
@@ -593,7 +614,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
                         ClearPageReclaim(page);
                 }
                 trace_mm_vmscan_writepage(page);
-               inc_zone_page_state(page, NR_VMSCAN_WRITE);
+               inc_node_page_state(page, NR_VMSCAN_WRITE);
                 return PAGE_SUCCESS;
         }
  
@@ -877,7 +898,7 @@ static void page_check_dirty_writeback(struct page *page,
   * shrink_page_list() returns the number of reclaimed pages
   */
  static unsigned long shrink_page_list(struct list_head *page_list,
-                                     struct zone *zone,
+                                     struct pglist_data *pgdat,
                                       struct scan_control *sc,
                                       enum ttu_flags ttu_flags,
                                       unsigned long *ret_nr_dirty,
@@ -917,7 +938,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         goto keep;
  
                 VM_BUG_ON_PAGE(PageActive(page), page);
-               VM_BUG_ON_PAGE(page_zone(page) != zone, page);
  
                 sc->nr_scanned++;
  
@@ -996,7 +1016,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         /* Case 1 above */
                         if (current_is_kswapd() &&
                             PageReclaim(page) &&
-                           test_bit(ZONE_WRITEBACK, &zone->flags)) {
+                           test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
                                 nr_immediate++;
                                 goto keep_locked;
  
@@ -1055,8 +1075,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  
                         /* Adding to swap updated mapping */
                         mapping = page_mapping(page);
+               } else if (unlikely(PageTransHuge(page))) {
+                       /* Split file THP */
+                       if (split_huge_page_to_list(page, page_list))
+                               goto keep_locked;
                 }
  
+               VM_BUG_ON_PAGE(PageTransHuge(page), page);
+
                 /*
                  * The page is mapped into the page tables of one or more
                  * processes. Try to unmap it here.
@@ -1086,14 +1112,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                          */
                         if (page_is_file_cache(page) &&
                                         (!current_is_kswapd() ||
-                                        !test_bit(ZONE_DIRTY, &zone->flags))) {
+                                        !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
                                 /*
                                  * Immediately reclaim when written back.
                                  * Similar in principal to deactivate_page()
                                  * except we already have the page isolated
                                  * and know it's dirty
                                  */
-                               inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+                               inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
                                 SetPageReclaim(page);
  
                                 goto keep_locked;
@@ -1254,17 +1280,17 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
  
         list_for_each_entry_safe(page, next, page_list, lru) {
                 if (page_is_file_cache(page) && !PageDirty(page) &&
-                   !isolated_balloon_page(page)) {
+                   !__PageMovable(page)) {
                         ClearPageActive(page);
                         list_move(&page->lru, &clean_pages);
                 }
         }
  
-       ret = shrink_page_list(&clean_pages, zone, &sc,
+       ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
                         TTU_UNMAP|TTU_IGNORE_ACCESS,
                         &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
         list_splice(&clean_pages, page_list);
-       mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+       mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
         return ret;
  }
  
@@ -1342,8 +1368,31 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
         return ret;
  }
  
+
  /*
- * zone->lru_lock is heavily contended.  Some of the functions that
+ * Update LRU sizes after isolating pages. The LRU size updates must
+ * be complete before mem_cgroup_update_lru_size due to a sanity check.
+ */
+static __always_inline void update_lru_sizes(struct lruvec *lruvec,
+                       enum lru_list lru, unsigned long *nr_zone_taken,
+                       unsigned long nr_taken)
+{
+       int zid;
+
+       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+               if (!nr_zone_taken[zid])
+                       continue;
+
+               __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
+       }
+
+#ifdef CONFIG_MEMCG
+       mem_cgroup_update_lru_size(lruvec, lru, -nr_taken);
+#endif
+}
+
+/*
+ * zone_lru_lock is heavily contended.  Some of the functions that
   * shrink the lists perform better by taking out a batch of pages
   * and working on them outside the LRU lock.
   *
@@ -1369,10 +1418,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
  {
         struct list_head *src = &lruvec->lists[lru];
         unsigned long nr_taken = 0;
-       unsigned long scan;
+       unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
+       unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
+       unsigned long scan, nr_pages;
+       LIST_HEAD(pages_skipped);
  
         for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
-                                       !list_empty(src); scan++) {
+                                       !list_empty(src);) {
                 struct page *page;
  
                 page = lru_to_page(src);
@@ -1380,9 +1432,23 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
  
                 VM_BUG_ON_PAGE(!PageLRU(page), page);
  
+               if (page_zonenum(page) > sc->reclaim_idx) {
+                       list_move(&page->lru, &pages_skipped);
+                       nr_skipped[page_zonenum(page)]++;
+                       continue;
+               }
+
+               /*
+                * Account for scanned and skipped separately to avoid the pgdat
+                * being prematurely marked unreclaimable by pgdat_reclaimable.
+                */
+               scan++;
+
                 switch (__isolate_lru_page(page, mode)) {
                 case 0:
-                       nr_taken += hpage_nr_pages(page);
+                       nr_pages = hpage_nr_pages(page);
+                       nr_taken += nr_pages;
+                       nr_zone_taken[page_zonenum(page)] += nr_pages;
                         list_move(&page->lru, dst);
                         break;
  
@@ -1396,9 +1462,38 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 }
         }
  
+       /*
+        * Splice any skipped pages to the start of the LRU list. Note that
+        * this disrupts the LRU order when reclaiming for lower zones but
+        * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
+        * scanning would soon rescan the same pages to skip and put the
+        * system at risk of premature OOM.
+        */
+       if (!list_empty(&pages_skipped)) {
+               int zid;
+               unsigned long total_skipped = 0;
+
+               for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+                       if (!nr_skipped[zid])
+                               continue;
+
+                       __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
+                       total_skipped += nr_skipped[zid];
+               }
+
+               /*
+                * Account skipped pages as a partial scan as the pgdat may be
+                * close to unreclaimable. If the LRU list is empty, account
+                * skipped pages as a full scan.
+                */
+               scan += list_empty(src) ? total_skipped : total_skipped >> 2;
+
+               list_splice(&pages_skipped, src);
+       }
         *nr_scanned = scan;
-       trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
+       trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan,
                                     nr_taken, mode, is_file_lru(lru));
+       update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken);
         return nr_taken;
  }
  
@@ -1438,8 +1533,8 @@ int isolate_lru_page(struct page *page)
                 struct zone *zone = page_zone(page);
                 struct lruvec *lruvec;
  
-               spin_lock_irq(&zone->lru_lock);
-               lruvec = mem_cgroup_page_lruvec(page, zone);
+               spin_lock_irq(zone_lru_lock(zone));
+               lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
                 if (PageLRU(page)) {
                         int lru = page_lru(page);
                         get_page(page);
@@ -1447,7 +1542,7 @@ int isolate_lru_page(struct page *page)
                         del_page_from_lru_list(page, lruvec, lru);
                         ret = 0;
                 }
-               spin_unlock_irq(&zone->lru_lock);
+               spin_unlock_irq(zone_lru_lock(zone));
         }
         return ret;
  }
@@ -1459,7 +1554,7 @@ int isolate_lru_page(struct page *page)
   * the LRU list will go small and be scanned faster than necessary, leading to
   * unnecessary swapping, thrashing and OOM.
   */
-static int too_many_isolated(struct zone *zone, int file,
+static int too_many_isolated(struct pglist_data *pgdat, int file,
                 struct scan_control *sc)
  {
         unsigned long inactive, isolated;
@@ -1471,11 +1566,11 @@ static int too_many_isolated(struct zone *zone, int file,
                 return 0;
  
         if (file) {
-               inactive = zone_page_state(zone, NR_INACTIVE_FILE);
-               isolated = zone_page_state(zone, NR_ISOLATED_FILE);
+               inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
+               isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
         } else {
-               inactive = zone_page_state(zone, NR_INACTIVE_ANON);
-               isolated = zone_page_state(zone, NR_ISOLATED_ANON);
+               inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
+               isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
         }
  
         /*
@@ -1493,7 +1588,7 @@ static noinline_for_stack void
  putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
  {
         struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
-       struct zone *zone = lruvec_zone(lruvec);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
         LIST_HEAD(pages_to_free);
  
         /*
@@ -1506,13 +1601,13 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
                 VM_BUG_ON_PAGE(PageLRU(page), page);
                 list_del(&page->lru);
                 if (unlikely(!page_evictable(page))) {
-                       spin_unlock_irq(&zone->lru_lock);
+                       spin_unlock_irq(&pgdat->lru_lock);
                         putback_lru_page(page);
-                       spin_lock_irq(&zone->lru_lock);
+                       spin_lock_irq(&pgdat->lru_lock);
                         continue;
                 }
  
-               lruvec = mem_cgroup_page_lruvec(page, zone);
+               lruvec = mem_cgroup_page_lruvec(page, pgdat);
  
                 SetPageLRU(page);
                 lru = page_lru(page);
@@ -1529,10 +1624,10 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
                         del_page_from_lru_list(page, lruvec, lru);
  
                         if (unlikely(PageCompound(page))) {
-                               spin_unlock_irq(&zone->lru_lock);
+                               spin_unlock_irq(&pgdat->lru_lock);
                                 mem_cgroup_uncharge(page);
                                 (*get_compound_page_dtor(page))(page);
-                               spin_lock_irq(&zone->lru_lock);
+                               spin_lock_irq(&pgdat->lru_lock);
                         } else
                                 list_add(&page->lru, &pages_to_free);
                 }
@@ -1557,8 +1652,32 @@ static int current_may_throttle(void)
                 bdi_write_congested(current->backing_dev_info);
  }
  
+static bool inactive_reclaimable_pages(struct lruvec *lruvec,
+                               struct scan_control *sc, enum lru_list lru)
+{
+       int zid;
+       struct zone *zone;
+       int file = is_file_lru(lru);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+       if (!global_reclaim(sc))
+               return true;
+
+       for (zid = sc->reclaim_idx; zid >= 0; zid--) {
+               zone = &pgdat->node_zones[zid];
+               if (!populated_zone(zone))
+                       continue;
+
+               if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE +
+                               LRU_FILE * file) >= SWAP_CLUSTER_MAX)
+                       return true;
+       }
+
+       return false;
+}
+
  /*
- * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
+ * shrink_inactive_list() is a helper for shrink_node().  It returns the number
   * of reclaimed pages
   */
  static noinline_for_stack unsigned long
@@ -1576,10 +1695,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
         unsigned long nr_immediate = 0;
         isolate_mode_t isolate_mode = 0;
         int file = is_file_lru(lru);
-       struct zone *zone = lruvec_zone(lruvec);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
         struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
  
-       while (unlikely(too_many_isolated(zone, file, sc))) {
+       if (!inactive_reclaimable_pages(lruvec, sc, lru))
+               return 0;
+
+       while (unlikely(too_many_isolated(pgdat, file, sc))) {
                 congestion_wait(BLK_RW_ASYNC, HZ/10);
  
                 /* We are about to die and free our memory. Return now. */
@@ -1594,48 +1716,45 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
         if (!sc->may_writepage)
                 isolate_mode |= ISOLATE_CLEAN;
  
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(&pgdat->lru_lock);
  
         nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
                                      &nr_scanned, sc, isolate_mode, lru);
  
-       update_lru_size(lruvec, lru, -nr_taken);
-       __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+       __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
         reclaim_stat->recent_scanned[file] += nr_taken;
  
         if (global_reclaim(sc)) {
-               __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+               __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
                 if (current_is_kswapd())
-                       __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
+                       __count_vm_events(PGSCAN_KSWAPD, nr_scanned);
                 else
-                       __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
+                       __count_vm_events(PGSCAN_DIRECT, nr_scanned);
         }
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(&pgdat->lru_lock);
  
         if (nr_taken == 0)
                 return 0;
  
-       nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
+       nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
                                 &nr_dirty, &nr_unqueued_dirty, &nr_congested,
                                 &nr_writeback, &nr_immediate,
                                 false);
  
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(&pgdat->lru_lock);
  
         if (global_reclaim(sc)) {
                 if (current_is_kswapd())
-                       __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
-                                              nr_reclaimed);
+                       __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
                 else
-                       __count_zone_vm_events(PGSTEAL_DIRECT, zone,
-                                              nr_reclaimed);
+                       __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
         }
  
         putback_inactive_pages(lruvec, &page_list);
  
-       __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
+       __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
  
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(&pgdat->lru_lock);
  
         mem_cgroup_uncharge_list(&page_list);
         free_hot_cold_page_list(&page_list, true);
@@ -1655,7 +1774,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
          * are encountered in the nr_immediate check below.
          */
         if (nr_writeback && nr_writeback == nr_taken)
-               set_bit(ZONE_WRITEBACK, &zone->flags);
+               set_bit(PGDAT_WRITEBACK, &pgdat->flags);
  
         /*
          * Legacy memcg will stall in page writeback so avoid forcibly
@@ -1667,16 +1786,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
                  * backed by a congested BDI and wait_iff_congested will stall.
                  */
                 if (nr_dirty && nr_dirty == nr_congested)
-                       set_bit(ZONE_CONGESTED, &zone->flags);
+                       set_bit(PGDAT_CONGESTED, &pgdat->flags);
  
                 /*
                  * If dirty pages are scanned that are not queued for IO, it
                  * implies that flushers are not keeping up. In this case, flag
-                * the zone ZONE_DIRTY and kswapd will start writing pages from
+                * the pgdat PGDAT_DIRTY and kswapd will start writing pages from
                  * reclaim context.
                  */
                 if (nr_unqueued_dirty == nr_taken)
-                       set_bit(ZONE_DIRTY, &zone->flags);
+                       set_bit(PGDAT_DIRTY, &pgdat->flags);
  
                 /*
                  * If kswapd scans pages marked marked for immediate
@@ -1695,9 +1814,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
          */
         if (!sc->hibernation_mode && !current_is_kswapd() &&
             current_may_throttle())
-               wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+               wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);
  
-       trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed,
+       trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
+                       nr_scanned, nr_reclaimed,
                         sc->priority, file);
         return nr_reclaimed;
  }
@@ -1709,9 +1829,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
   * processes, from rmap.
   *
   * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold zone->lru_lock across the whole operation.  But if
+ * appropriate to hold zone_lru_lock across the whole operation.  But if
   * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop zone->lru_lock around each page.  It's impossible to balance
+ * should drop zone_lru_lock around each page.  It's impossible to balance
   * this, so instead we remove the pages from the LRU while processing them.
   * It is safe to rely on PG_active against the non-LRU pages in here because
   * nobody will play with that bit on a non-LRU page.
@@ -1725,20 +1845,20 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
                                      struct list_head *pages_to_free,
                                      enum lru_list lru)
  {
-       struct zone *zone = lruvec_zone(lruvec);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
         unsigned long pgmoved = 0;
         struct page *page;
         int nr_pages;
  
         while (!list_empty(list)) {
                 page = lru_to_page(list);
-               lruvec = mem_cgroup_page_lruvec(page, zone);
+               lruvec = mem_cgroup_page_lruvec(page, pgdat);
  
                 VM_BUG_ON_PAGE(PageLRU(page), page);
                 SetPageLRU(page);
  
                 nr_pages = hpage_nr_pages(page);
-               update_lru_size(lruvec, lru, nr_pages);
+               update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
                 list_move(&page->lru, &lruvec->lists[lru]);
                 pgmoved += nr_pages;
  
@@ -1748,10 +1868,10 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
                         del_page_from_lru_list(page, lruvec, lru);
  
                         if (unlikely(PageCompound(page))) {
-                               spin_unlock_irq(&zone->lru_lock);
+                               spin_unlock_irq(&pgdat->lru_lock);
                                 mem_cgroup_uncharge(page);
                                 (*get_compound_page_dtor(page))(page);
-                               spin_lock_irq(&zone->lru_lock);
+                               spin_lock_irq(&pgdat->lru_lock);
                         } else
                                 list_add(&page->lru, pages_to_free);
                 }
@@ -1777,7 +1897,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
         unsigned long nr_rotated = 0;
         isolate_mode_t isolate_mode = 0;
         int file = is_file_lru(lru);
-       struct zone *zone = lruvec_zone(lruvec);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
  
         lru_add_drain();
  
@@ -1786,20 +1906,19 @@ static void shrink_active_list(unsigned long nr_to_scan,
         if (!sc->may_writepage)
                 isolate_mode |= ISOLATE_CLEAN;
  
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(&pgdat->lru_lock);
  
         nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
                                      &nr_scanned, sc, isolate_mode, lru);
  
-       update_lru_size(lruvec, lru, -nr_taken);
-       __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+       __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
         reclaim_stat->recent_scanned[file] += nr_taken;
  
         if (global_reclaim(sc))
-               __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
-       __count_zone_vm_events(PGREFILL, zone, nr_scanned);
+               __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
+       __count_vm_events(PGREFILL, nr_scanned);
  
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(&pgdat->lru_lock);
  
         while (!list_empty(&l_hold)) {
                 cond_resched();
@@ -1844,7 +1963,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
         /*
          * Move pages back to the lru list.
          */
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(&pgdat->lru_lock);
         /*
          * Count referenced pages from currently used mappings as rotated,
          * even though only some of them are actually re-activated.  This
@@ -1855,8 +1974,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
  
         move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
         move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
-       __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
-       spin_unlock_irq(&zone->lru_lock);
+       __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
+       spin_unlock_irq(&pgdat->lru_lock);
  
         mem_cgroup_uncharge_list(&l_hold);
         free_hot_cold_page_list(&l_hold, true);
@@ -1888,12 +2007,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
   *    1TB     101        10GB
   *   10TB     320        32GB
   */
-static bool inactive_list_is_low(struct lruvec *lruvec, bool file)
+static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
+                                               struct scan_control *sc)
  {
         unsigned long inactive_ratio;
         unsigned long inactive;
         unsigned long active;
         unsigned long gb;
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+       int zid;
  
         /*
          * If we don't have swap space, anonymous page deactivation
@@ -1905,6 +2027,27 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file)
         inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
         active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
  
+       /*
+        * For zone-constrained allocations, it is necessary to check if
+        * deactivations are required for lowmem to be reclaimed. This
+        * calculates the inactive/active pages available in eligible zones.
+        */
+       for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) {
+               struct zone *zone = &pgdat->node_zones[zid];
+               unsigned long inactive_zone, active_zone;
+
+               if (!populated_zone(zone))
+                       continue;
+
+               inactive_zone = zone_page_state(zone,
+                               NR_ZONE_LRU_BASE + (file * LRU_FILE));
+               active_zone = zone_page_state(zone,
+                               NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE);
+
+               inactive -= min(inactive, inactive_zone);
+               active -= min(active, active_zone);
+       }
+
         gb = (inactive + active) >> (30 - PAGE_SHIFT);
         if (gb)
                 inactive_ratio = int_sqrt(10 * gb);
@@ -1918,7 +2061,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
                                  struct lruvec *lruvec, struct scan_control *sc)
  {
         if (is_active_lru(lru)) {
-               if (inactive_list_is_low(lruvec, is_file_lru(lru)))
+               if (inactive_list_is_low(lruvec, is_file_lru(lru), sc))
                         shrink_active_list(nr_to_scan, lruvec, sc, lru);
                 return 0;
         }
@@ -1950,7 +2093,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
         struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
         u64 fraction[2];
         u64 denominator = 0;    /* gcc */
-       struct zone *zone = lruvec_zone(lruvec);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
         unsigned long anon_prio, file_prio;
         enum scan_balance scan_balance;
         unsigned long anon, file;
@@ -1971,7 +2114,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
          * well.
          */
         if (current_is_kswapd()) {
-               if (!zone_reclaimable(zone))
+               if (!pgdat_reclaimable(pgdat))
                         force_scan = true;
                 if (!mem_cgroup_online(memcg))
                         force_scan = true;
@@ -2017,14 +2160,24 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
          * anon pages.  Try to detect this based on file LRU size.
          */
         if (global_reclaim(sc)) {
-               unsigned long zonefile;
-               unsigned long zonefree;
+               unsigned long pgdatfile;
+               unsigned long pgdatfree;
+               int z;
+               unsigned long total_high_wmark = 0;
+
+               pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+               pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
+                          node_page_state(pgdat, NR_INACTIVE_FILE);
  
-               zonefree = zone_page_state(zone, NR_FREE_PAGES);
-               zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
-                          zone_page_state(zone, NR_INACTIVE_FILE);
+               for (z = 0; z < MAX_NR_ZONES; z++) {
+                       struct zone *zone = &pgdat->node_zones[z];
+                       if (!populated_zone(zone))
+                               continue;
  
-               if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
+                       total_high_wmark += high_wmark_pages(zone);
+               }
+
+               if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
                         scan_balance = SCAN_ANON;
                         goto out;
                 }
@@ -2039,7 +2192,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
          * lruvec even if it has plenty of old anonymous pages unless the
          * system is under heavy pressure.
          */
-       if (!inactive_list_is_low(lruvec, true) &&
+       if (!inactive_list_is_low(lruvec, true, sc) &&
             lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
                 scan_balance = SCAN_FILE;
                 goto out;
@@ -2071,7 +2224,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
         file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
                 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
  
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(&pgdat->lru_lock);
         if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
                 reclaim_stat->recent_scanned[0] /= 2;
                 reclaim_stat->recent_rotated[0] /= 2;
@@ -2092,7 +2245,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
  
         fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
         fp /= reclaim_stat->recent_rotated[1] + 1;
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(&pgdat->lru_lock);
  
         fraction[0] = ap;
         fraction[1] = fp;
@@ -2168,12 +2321,12 @@ static inline void init_tlb_ubc(void)
  #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
  
  /*
- * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
+ * This is a basic per-node page freer.  Used by both kswapd and direct reclaim.
   */
-static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
+static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
                               struct scan_control *sc, unsigned long *lru_pages)
  {
-       struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+       struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
         unsigned long nr[NR_LRU_LISTS];
         unsigned long targets[NR_LRU_LISTS];
         unsigned long nr_to_scan;
@@ -2281,7 +2434,7 @@ static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
          * Even if we did not try to evict anon pages at all, we want to
          * rebalance the anon lru active/inactive ratio.
          */
-       if (inactive_list_is_low(lruvec, false))
+       if (inactive_list_is_low(lruvec, false, sc))
                 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                    sc, LRU_ACTIVE_ANON);
  
@@ -2306,13 +2459,14 @@ static bool in_reclaim_compaction(struct scan_control *sc)
   * calls try_to_compact_zone() that it will have enough free pages to succeed.
   * It will give up earlier than that if there is difficulty reclaiming pages.
   */
-static inline bool should_continue_reclaim(struct zone *zone,
+static inline bool should_continue_reclaim(struct pglist_data *pgdat,
                                         unsigned long nr_reclaimed,
                                         unsigned long nr_scanned,
                                         struct scan_control *sc)
  {
         unsigned long pages_for_compaction;
         unsigned long inactive_lru_pages;
+       int z;
  
         /* If not in reclaim/compaction mode, stop */
         if (!in_reclaim_compaction(sc))
@@ -2346,25 +2500,32 @@ static inline bool should_continue_reclaim(struct zone *zone,
          * inactive lists are large enough, continue reclaiming
          */
         pages_for_compaction = (2UL << sc->order);
-       inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
+       inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
         if (get_nr_swap_pages() > 0)
-               inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
+               inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
         if (sc->nr_reclaimed < pages_for_compaction &&
                         inactive_lru_pages > pages_for_compaction)
                 return true;
  
         /* If compaction would go ahead or the allocation would succeed, stop */
-       switch (compaction_suitable(zone, sc->order, 0, 0)) {
-       case COMPACT_PARTIAL:
-       case COMPACT_CONTINUE:
-               return false;
-       default:
-               return true;
+       for (z = 0; z <= sc->reclaim_idx; z++) {
+               struct zone *zone = &pgdat->node_zones[z];
+               if (!populated_zone(zone))
+                       continue;
+
+               switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
+               case COMPACT_PARTIAL:
+               case COMPACT_CONTINUE:
+                       return false;
+               default:
+                       /* check next zone */
+                       ;
+               }
         }
+       return true;
  }
  
-static bool shrink_zone(struct zone *zone, struct scan_control *sc,
-                       bool is_classzone)
+static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
  {
         struct reclaim_state *reclaim_state = current->reclaim_state;
         unsigned long nr_reclaimed, nr_scanned;
@@ -2373,10 +2534,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
         do {
                 struct mem_cgroup *root = sc->target_mem_cgroup;
                 struct mem_cgroup_reclaim_cookie reclaim = {
-                       .zone = zone,
+                       .pgdat = pgdat,
                         .priority = sc->priority,
                 };
-               unsigned long zone_lru_pages = 0;
+               unsigned long node_lru_pages = 0;
                 struct mem_cgroup *memcg;
  
                 nr_reclaimed = sc->nr_reclaimed;
@@ -2397,11 +2558,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
                         reclaimed = sc->nr_reclaimed;
                         scanned = sc->nr_scanned;
  
-                       shrink_zone_memcg(zone, memcg, sc, &lru_pages);
-                       zone_lru_pages += lru_pages;
+                       shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
+                       node_lru_pages += lru_pages;
  
-                       if (memcg && is_classzone)
-                               shrink_slab(sc->gfp_mask, zone_to_nid(zone),
+                       if (memcg)
+                               shrink_slab(sc->gfp_mask, pgdat->node_id,
                                             memcg, sc->nr_scanned - scanned,
                                             lru_pages);
  
@@ -2413,7 +2574,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
                         /*
                          * Direct reclaim and kswapd have to scan all memory
                          * cgroups to fulfill the overall scan target for the
-                        * zone.
+                        * node.
                          *
                          * Limit reclaim, on the other hand, only cares about
                          * nr_to_reclaim pages to be reclaimed and it will
@@ -2431,10 +2592,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
                  * Shrink the slab caches in the same proportion that
                  * the eligible LRU pages were scanned.
                  */
-               if (global_reclaim(sc) && is_classzone)
-                       shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
+               if (global_reclaim(sc))
+                       shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
                                     sc->nr_scanned - nr_scanned,
-                                   zone_lru_pages);
+                                   node_lru_pages);
  
                 if (reclaim_state) {
                         sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -2449,7 +2610,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
                 if (sc->nr_reclaimed - nr_reclaimed)
                         reclaimable = true;
  
-       } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
+       } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
                                          sc->nr_scanned - nr_scanned, sc));
  
         return reclaimable;
@@ -2459,9 +2620,9 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
   * Returns true if compaction should go ahead for a high-order request, or
   * the high-order allocation would succeed without compaction.
   */
-static inline bool compaction_ready(struct zone *zone, int order, int classzone_idx)
+static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
  {
-       unsigned long balance_gap, watermark;
+       unsigned long watermark;
         bool watermark_ok;
  
         /*
@@ -2470,23 +2631,21 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_
          * there is a buffer of free pages available to give compaction
          * a reasonable chance of completing and allocating the page
          */
-       balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
-                       zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
-       watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
-       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, classzone_idx);
+       watermark = high_wmark_pages(zone) + (2UL << sc->order);
+       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
  
         /*
          * If compaction is deferred, reclaim up to a point where
          * compaction will have a chance of success when re-enabled
          */
-       if (compaction_deferred(zone, order))
+       if (compaction_deferred(zone, sc->order))
                 return watermark_ok;
  
         /*
          * If compaction is not ready to start and allocation is not likely
          * to succeed without it, then keep reclaiming.
          */
-       if (compaction_suitable(zone, order, 0, classzone_idx) == COMPACT_SKIPPED)
+       if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED)
                 return false;
  
         return watermark_ok;
@@ -2497,14 +2656,6 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_
   * try to reclaim pages from zones which will satisfy the caller's allocation
   * request.
   *
- * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
- * Because:
- * a) The caller may be trying to free *extra* pages to satisfy a higher-order
- *    allocation or
- * b) The target zone may be at high_wmark_pages(zone) but the lower zones
- *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
- *    zone defense algorithm.
- *
   * If a zone is deemed to be full of pinned pages then just give it a light
   * scan then give up on it.
   */
@@ -2515,7 +2666,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
         unsigned long nr_soft_reclaimed;
         unsigned long nr_soft_scanned;
         gfp_t orig_mask;
-       enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
+       pg_data_t *last_pgdat = NULL;
  
         /*
          * If the number of buffer_heads in the machine exceeds the maximum
@@ -2523,21 +2674,13 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
          * highmem pages could be pinning lowmem pages storing buffer_heads
          */
         orig_mask = sc->gfp_mask;
-       if (buffer_heads_over_limit)
+       if (buffer_heads_over_limit) {
                 sc->gfp_mask |= __GFP_HIGHMEM;
+               sc->reclaim_idx = gfp_zone(sc->gfp_mask);
+       }
  
         for_each_zone_zonelist_nodemask(zone, z, zonelist,
-                                       gfp_zone(sc->gfp_mask), sc->nodemask) {
-               enum zone_type classzone_idx;
-
-               if (!populated_zone(zone))
-                       continue;
-
-               classzone_idx = requested_highidx;
-               while (!populated_zone(zone->zone_pgdat->node_zones +
-                                                       classzone_idx))
-                       classzone_idx--;
-
+                                       sc->reclaim_idx, sc->nodemask) {
                 /*
                  * Take care memory controller reclaiming has small influence
                  * to global LRU.
@@ -2548,7 +2691,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                                 continue;
  
                         if (sc->priority != DEF_PRIORITY &&
-                           !zone_reclaimable(zone))
+                           !pgdat_reclaimable(zone->zone_pgdat))
                                 continue;       /* Let kswapd poll it */
  
                         /*
@@ -2562,12 +2705,20 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                          */
                         if (IS_ENABLED(CONFIG_COMPACTION) &&
                             sc->order > PAGE_ALLOC_COSTLY_ORDER &&
-                           zonelist_zone_idx(z) <= requested_highidx &&
-                           compaction_ready(zone, sc->order, requested_highidx)) {
+                           compaction_ready(zone, sc)) {
                                 sc->compaction_ready = true;
                                 continue;
                         }
  
+                       /*
+                        * Shrink each node in the zonelist once. If the
+                        * zonelist is ordered by zone (not the default) then a
+                        * node may be shrunk multiple times but in that case
+                        * the user prefers lower zones being preserved.
+                        */
+                       if (zone->zone_pgdat == last_pgdat)
+                               continue;
+
                         /*
                          * This steals pages from memory cgroups over softlimit
                          * and returns the number of reclaimed pages and
@@ -2575,7 +2726,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                          * and balancing, not for a memcg's limit.
                          */
                         nr_soft_scanned = 0;
-                       nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+                       nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
                                                 sc->order, sc->gfp_mask,
                                                 &nr_soft_scanned);
                         sc->nr_reclaimed += nr_soft_reclaimed;
@@ -2583,7 +2734,11 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                         /* need some check for avoid more shrink_zone() */
                 }
  
-               shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
+               /* See comment about same check for global reclaim above */
+               if (zone->zone_pgdat == last_pgdat)
+                       continue;
+               last_pgdat = zone->zone_pgdat;
+               shrink_node(zone->zone_pgdat, sc);
         }
  
         /*
@@ -2619,7 +2774,7 @@ retry:
         delayacct_freepages_start();
  
         if (global_reclaim(sc))
-               count_vm_event(ALLOCSTALL);
+               __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
  
         do {
                 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
@@ -2686,7 +2841,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
         for (i = 0; i <= ZONE_NORMAL; i++) {
                 zone = &pgdat->node_zones[i];
                 if (!populated_zone(zone) ||
-                   zone_reclaimable_pages(zone) == 0)
+                   pgdat_reclaimable_pages(pgdat) == 0)
                         continue;
  
                 pfmemalloc_reserve += min_wmark_pages(zone);
@@ -2701,7 +2856,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
  
         /* kswapd must be awake if processes are being throttled */
         if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
-               pgdat->classzone_idx = min(pgdat->classzone_idx,
+               pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx,
                                                 (enum zone_type)ZONE_NORMAL);
                 wake_up_interruptible(&pgdat->kswapd_wait);
         }
@@ -2809,6 +2964,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
         struct scan_control sc = {
                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
                 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+               .reclaim_idx = gfp_zone(gfp_mask),
                 .order = order,
                 .nodemask = nodemask,
                 .priority = DEF_PRIORITY,
@@ -2827,7 +2983,8 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
  
         trace_mm_vmscan_direct_reclaim_begin(order,
                                 sc.may_writepage,
-                               gfp_mask);
+                               gfp_mask,
+                               sc.reclaim_idx);
  
         nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
  
@@ -2838,9 +2995,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
  
  #ifdef CONFIG_MEMCG
  
-unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
+unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
                                                 gfp_t gfp_mask, bool noswap,
-                                               struct zone *zone,
+                                               pg_data_t *pgdat,
                                                 unsigned long *nr_scanned)
  {
         struct scan_control sc = {
@@ -2848,6 +3005,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
                 .target_mem_cgroup = memcg,
                 .may_writepage = !laptop_mode,
                 .may_unmap = 1,
+               .reclaim_idx = MAX_NR_ZONES - 1,
                 .may_swap = !noswap,
         };
         unsigned long lru_pages;
@@ -2857,16 +3015,17 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
  
         trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
                                                       sc.may_writepage,
-                                                     sc.gfp_mask);
+                                                     sc.gfp_mask,
+                                                     sc.reclaim_idx);
  
         /*
          * NOTE: Although we can get the priority field, using it
          * here is not a good idea, since it limits the pages we can scan.
-        * if we don't reclaim here, the shrink_zone from balance_pgdat
+        * if we don't reclaim here, the shrink_node from balance_pgdat
          * will pick up pages from other mem cgroup's as well. We hack
          * the priority and make it zero.
          */
-       shrink_zone_memcg(zone, memcg, &sc, &lru_pages);
+       shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
  
         trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
  
@@ -2886,6 +3045,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
                 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                                 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
+               .reclaim_idx = MAX_NR_ZONES - 1,
                 .target_mem_cgroup = memcg,
                 .priority = DEF_PRIORITY,
                 .may_writepage = !laptop_mode,
@@ -2904,7 +3064,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
  
         trace_mm_vmscan_memcg_reclaim_begin(0,
                                             sc.may_writepage,
-                                           sc.gfp_mask);
+                                           sc.gfp_mask,
+                                           sc.reclaim_idx);
  
         nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
  
@@ -2914,7 +3075,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
  }
  #endif
  
-static void age_active_anon(struct zone *zone, struct scan_control *sc)
+static void age_active_anon(struct pglist_data *pgdat,
+                               struct scan_control *sc)
  {
         struct mem_cgroup *memcg;
  
@@ -2923,9 +3085,9 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
  
         memcg = mem_cgroup_iter(NULL, NULL, NULL);
         do {
-               struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+               struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
  
-               if (inactive_list_is_low(lruvec, false))
+               if (inactive_list_is_low(lruvec, false, sc))
                         shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                            sc, LRU_ACTIVE_ANON);
  
@@ -2933,82 +3095,21 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
         } while (memcg);
  }
  
-static bool zone_balanced(struct zone *zone, int order, bool highorder,
-                       unsigned long balance_gap, int classzone_idx)
+static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
  {
-       unsigned long mark = high_wmark_pages(zone) + balance_gap;
+       unsigned long mark = high_wmark_pages(zone);
+
+       if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx))
+               return false;
  
         /*
-        * When checking from pgdat_balanced(), kswapd should stop and sleep
-        * when it reaches the high order-0 watermark and let kcompactd take
-        * over. Other callers such as wakeup_kswapd() want to determine the
-        * true high-order watermark.
+        * If any eligible zone is balanced then the node is not considered
+        * to be congested or dirty
          */
-       if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) {
-               mark += (1UL << order);
-               order = 0;
-       }
-
-       return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
-}
-
-/*
- * pgdat_balanced() is used when checking if a node is balanced.
- *
- * For order-0, all zones must be balanced!
- *
- * For high-order allocations only zones that meet watermarks and are in a
- * zone allowed by the callers classzone_idx are added to balanced_pages. The
- * total of balanced pages must be at least 25% of the zones allowed by
- * classzone_idx for the node to be considered balanced. Forcing all zones to
- * be balanced for high orders can cause excessive reclaim when there are
- * imbalanced zones.
- * The choice of 25% is due to
- *   o a 16M DMA zone that is balanced will not balance a zone on any
- *     reasonable sized machine
- *   o On all other machines, the top zone must be at least a reasonable
- *     percentage of the middle zones. For example, on 32-bit x86, highmem
- *     would need to be at least 256M for it to be balance a whole node.
- *     Similarly, on x86-64 the Normal zone would need to be at least 1G
- *     to balance a node on its own. These seemed like reasonable ratios.
- */
-static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
-{
-       unsigned long managed_pages = 0;
-       unsigned long balanced_pages = 0;
-       int i;
+       clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
+       clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
  
-       /* Check the watermark levels */
-       for (i = 0; i <= classzone_idx; i++) {
-               struct zone *zone = pgdat->node_zones + i;
-
-               if (!populated_zone(zone))
-                       continue;
-
-               managed_pages += zone->managed_pages;
-
-               /*
-                * A special case here:
-                *
-                * balance_pgdat() skips over all_unreclaimable after
-                * DEF_PRIORITY. Effectively, it considers them balanced so
-                * they must be considered balanced here as well!
-                */
-               if (!zone_reclaimable(zone)) {
-                       balanced_pages += zone->managed_pages;
-                       continue;
-               }
-
-               if (zone_balanced(zone, order, false, 0, i))
-                       balanced_pages += zone->managed_pages;
-               else if (!order)
-                       return false;
-       }
-
-       if (order)
-               return balanced_pages >= (managed_pages >> 2);
-       else
-               return true;
+       return true;
  }
  
  /*
@@ -3017,12 +3118,9 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
   *
   * Returns true if kswapd is ready to sleep
   */
-static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
-                                       int classzone_idx)
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  {
-       /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
-       if (remaining)
-               return false;
+       int i;
  
         /*
          * The throttled processes are normally woken up in balance_pgdat() as
@@ -3040,91 +3138,81 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
         if (waitqueue_active(&pgdat->pfmemalloc_wait))
                 wake_up_all(&pgdat->pfmemalloc_wait);
  
-       return pgdat_balanced(pgdat, order, classzone_idx);
+       for (i = 0; i <= classzone_idx; i++) {
+               struct zone *zone = pgdat->node_zones + i;
+
+               if (!populated_zone(zone))
+                       continue;
+
+               if (!zone_balanced(zone, order, classzone_idx))
+                       return false;
+       }
+
+       return true;
  }
  
  /*
- * kswapd shrinks the zone by the number of pages required to reach
- * the high watermark.
+ * kswapd shrinks a node of pages that are at or below the highest usable
+ * zone that is currently unbalanced.
   *
   * Returns true if kswapd scanned at least the requested number of pages to
   * reclaim or if the lack of progress was due to pages under writeback.
   * This is used to determine if the scanning priority needs to be raised.
   */
-static bool kswapd_shrink_zone(struct zone *zone,
-                              int classzone_idx,
+static bool kswapd_shrink_node(pg_data_t *pgdat,
                                struct scan_control *sc)
  {
-       unsigned long balance_gap;
-       bool lowmem_pressure;
+       struct zone *zone;
+       int z;
  
-       /* Reclaim above the high watermark. */
-       sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
+       /* Reclaim a number of pages proportional to the number of zones */
+       sc->nr_to_reclaim = 0;
+       for (z = 0; z <= sc->reclaim_idx; z++) {
+               zone = pgdat->node_zones + z;
+               if (!populated_zone(zone))
+                       continue;
  
-       /*
-        * We put equal pressure on every zone, unless one zone has way too
-        * many pages free already. The "too many pages" is defined as the
-        * high wmark plus a "gap" where the gap is either the low
-        * watermark or 1% of the zone, whichever is smaller.
-        */
-       balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
-                       zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
+               sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
+       }
  
         /*
-        * If there is no low memory pressure or the zone is balanced then no
-        * reclaim is necessary
+        * Historically care was taken to put equal pressure on all zones but
+        * now pressure is applied based on node LRU order.
          */
-       lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
-       if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
-                                               balance_gap, classzone_idx))
-               return true;
-
-       shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
-
-       clear_bit(ZONE_WRITEBACK, &zone->flags);
+       shrink_node(pgdat, sc);
  
         /*
-        * If a zone reaches its high watermark, consider it to be no longer
-        * congested. It's possible there are dirty pages backed by congested
-        * BDIs but as pressure is relieved, speculatively avoid congestion
-        * waits.
+        * Fragmentation may mean that the system cannot be rebalanced for
+        * high-order allocations. If twice the allocation size has been
+        * reclaimed then recheck watermarks only at order-0 to prevent
+        * excessive reclaim. Assume that a process that requested a
+        * high-order allocation can direct reclaim/compact.
          */
-       if (zone_reclaimable(zone) &&
-           zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
-               clear_bit(ZONE_CONGESTED, &zone->flags);
-               clear_bit(ZONE_DIRTY, &zone->flags);
-       }
+       if (sc->order && sc->nr_reclaimed >= 2UL << sc->order)
+               sc->order = 0;
  
         return sc->nr_scanned >= sc->nr_to_reclaim;
  }
  
  /*
- * For kswapd, balance_pgdat() will work across all this node's zones until
- * they are all at high_wmark_pages(zone).
+ * For kswapd, balance_pgdat() will reclaim pages across a node from zones
+ * that are eligible for use by the caller until at least one zone is
+ * balanced.
   *
- * Returns the highest zone idx kswapd was reclaiming at
- *
- * There is special handling here for zones which are full of pinned pages.
- * This can happen if the pages are all mlocked, or if they are all used by
- * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
- * What we do is to detect the case where all pages in the zone have been
- * scanned twice and there has been zero successful reclaim.  Mark the zone as
- * dead and from now on, only perform a short scan.  Basically we're polling
- * the zone for when the problem goes away.
+ * Returns the order kswapd finished reclaiming at.
   *
   * kswapd scans the zones in the highmem->normal->dma direction.  It skips
   * zones which have free_pages > high_wmark_pages(zone), but once a zone is
- * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
- * lower zones regardless of the number of free pages in the lower zones. This
- * interoperates with the page allocator fallback scheme to ensure that aging
- * of pages is balanced across the zones.
+ * found to have free_pages <= high_wmark_pages(zone), any page in that zone
+ * or lower is eligible for reclaim until at least one usable zone is
+ * balanced.
   */
  static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
  {
         int i;
-       int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
         unsigned long nr_soft_reclaimed;
         unsigned long nr_soft_scanned;
+       struct zone *zone;
         struct scan_control sc = {
                 .gfp_mask = GFP_KERNEL,
                 .order = order,
@@ -3139,100 +3227,77 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 bool raise_priority = true;
  
                 sc.nr_reclaimed = 0;
+               sc.reclaim_idx = classzone_idx;
  
                 /*
-                * Scan in the highmem->dma direction for the highest
-                * zone which needs scanning
+                * If the number of buffer_heads exceeds the maximum allowed
+                * then consider reclaiming from all zones. This has a dual
+                * purpose -- on 64-bit systems it is expected that
+                * buffer_heads are stripped during active rotation. On 32-bit
+                * systems, highmem pages can pin lowmem memory and shrinking
+                * buffers can relieve lowmem pressure. Reclaim may still not
+                * go ahead if all eligible zones for the original allocation
+                * request are balanced to avoid excessive reclaim from kswapd.
                  */
-               for (i = pgdat->nr_zones - 1; i >= 0; i--) {
-                       struct zone *zone = pgdat->node_zones + i;
-
-                       if (!populated_zone(zone))
-                               continue;
-
-                       if (sc.priority != DEF_PRIORITY &&
-                           !zone_reclaimable(zone))
-                               continue;
-
-                       /*
-                        * Do some background aging of the anon list, to give
-                        * pages a chance to be referenced before reclaiming.
-                        */
-                       age_active_anon(zone, &sc);
+               if (buffer_heads_over_limit) {
+                       for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
+                               zone = pgdat->node_zones + i;
+                               if (!populated_zone(zone))
+                                       continue;
  
-                       /*
-                        * If the number of buffer_heads in the machine
-                        * exceeds the maximum allowed level and this node
-                        * has a highmem zone, force kswapd to reclaim from
-                        * it to relieve lowmem pressure.
-                        */
-                       if (buffer_heads_over_limit && is_highmem_idx(i)) {
-                               end_zone = i;
+                               sc.reclaim_idx = i;
                                 break;
                         }
+               }
  
-                       if (!zone_balanced(zone, order, false, 0, 0)) {
-                               end_zone = i;
-                               break;
-                       } else {
-                               /*
-                                * If balanced, clear the dirty and congested
-                                * flags
-                                */
-                               clear_bit(ZONE_CONGESTED, &zone->flags);
-                               clear_bit(ZONE_DIRTY, &zone->flags);
-                       }
+               /*
+                * Only reclaim if there are no eligible zones. Check from
+                * high to low zone as allocations prefer higher zones.
+                * Scanning from low to high zone would allow congestion to be
+                * cleared during a very small window when a small low
+                * zone was balanced even under extreme pressure when the
+                * overall node may be congested. Note that sc.reclaim_idx
+                * is not used as buffer_heads_over_limit may have adjusted
+                * it.
+                */
+               for (i = classzone_idx; i >= 0; i--) {
+                       zone = pgdat->node_zones + i;
+                       if (!populated_zone(zone))
+                               continue;
+
+                       if (zone_balanced(zone, sc.order, classzone_idx))
+                               goto out;
                 }
  
-               if (i < 0)
-                       goto out;
+               /*
+                * Do some background aging of the anon list, to give
+                * pages a chance to be referenced before reclaiming. All
+                * pages are rotated regardless of classzone as this is
+                * about consistent aging.
+                */
+               age_active_anon(pgdat, &sc);
  
                 /*
                  * If we're getting trouble reclaiming, start doing writepage
                  * even in laptop mode.
                  */
-               if (sc.priority < DEF_PRIORITY - 2)
+               if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
                         sc.may_writepage = 1;
  
+               /* Call soft limit reclaim before calling shrink_node. */
+               sc.nr_scanned = 0;
+               nr_soft_scanned = 0;
+               nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
+                                               sc.gfp_mask, &nr_soft_scanned);
+               sc.nr_reclaimed += nr_soft_reclaimed;
+
                 /*
-                * Now scan the zone in the dma->highmem direction, stopping
-                * at the last zone which needs scanning.
-                *
-                * We do this because the page allocator works in the opposite
-                * direction.  This prevents the page allocator from allocating
-                * pages behind kswapd's direction of progress, which would
-                * cause too much scanning of the lower zones.
+                * There should be no need to raise the scanning priority if
+                * enough pages are already being scanned that the high
+                * watermark would be met at 100% efficiency.
                  */
-               for (i = 0; i <= end_zone; i++) {
-                       struct zone *zone = pgdat->node_zones + i;
-
-                       if (!populated_zone(zone))
-                               continue;
-
-                       if (sc.priority != DEF_PRIORITY &&
-                           !zone_reclaimable(zone))
-                               continue;
-
-                       sc.nr_scanned = 0;
-
-                       nr_soft_scanned = 0;
-                       /*
-                        * Call soft limit reclaim before calling shrink_zone.
-                        */
-                       nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
-                                                       order, sc.gfp_mask,
-                                                       &nr_soft_scanned);
-                       sc.nr_reclaimed += nr_soft_reclaimed;
-
-                       /*
-                        * There should be no need to raise the scanning
-                        * priority if enough pages are already being scanned
-                        * that that high watermark would be met at 100%
-                        * efficiency.
-                        */
-                       if (kswapd_shrink_zone(zone, end_zone, &sc))
-                               raise_priority = false;
-               }
+               if (kswapd_shrink_node(pgdat, &sc))
+                       raise_priority = false;
  
                 /*
                  * If the low watermark is met there is no need for processes
@@ -3253,19 +3318,20 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                  */
                 if (raise_priority || !sc.nr_reclaimed)
                         sc.priority--;
-       } while (sc.priority >= 1 &&
-                       !pgdat_balanced(pgdat, order, classzone_idx));
+       } while (sc.priority >= 1);
  
  out:
         /*
-        * Return the highest zone idx we were reclaiming at so
-        * prepare_kswapd_sleep() makes the same decisions as here.
+        * Return the order kswapd stopped reclaiming at as
+        * prepare_kswapd_sleep() takes it into account. If another caller
+        * entered the allocator slow path while kswapd was awake, order will
+        * remain at the higher level.
          */
-       return end_zone;
+       return sc.order;
  }
  
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
-                               int classzone_idx, int balanced_classzone_idx)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
+                               unsigned int classzone_idx)
  {
         long remaining = 0;
         DEFINE_WAIT(wait);
@@ -3276,8 +3342,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
         prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
  
         /* Try to sleep for a short interval */
-       if (prepare_kswapd_sleep(pgdat, order, remaining,
-                                               balanced_classzone_idx)) {
+       if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
                 /*
                  * Compaction records what page blocks it recently failed to
                  * isolate pages from and skips them in the future scanning.
@@ -3290,9 +3355,20 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
                  * We have freed the memory, now we should compact it to make
                  * allocation of the requested order possible.
                  */
-               wakeup_kcompactd(pgdat, order, classzone_idx);
+               wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
  
                 remaining = schedule_timeout(HZ/10);
+
+               /*
+                * If woken prematurely then reset kswapd_classzone_idx and
+                * order. The values will either be from a wakeup request or
+                * the previous request that slept prematurely.
+                */
+               if (remaining) {
+                       pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+                       pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
+               }
+
                 finish_wait(&pgdat->kswapd_wait, &wait);
                 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
         }
@@ -3301,8 +3377,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
          * After a short sleep, check if it was a premature sleep. If not, then
          * go fully to sleep until explicitly woken up.
          */
-       if (prepare_kswapd_sleep(pgdat, order, remaining,
-                                               balanced_classzone_idx)) {
+       if (!remaining &&
+           prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
                 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
  
                 /*
@@ -3343,9 +3419,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
   */
  static int kswapd(void *p)
  {
-       unsigned long order, new_order;
-       int classzone_idx, new_classzone_idx;
-       int balanced_classzone_idx;
+       unsigned int alloc_order, reclaim_order, classzone_idx;
         pg_data_t *pgdat = (pg_data_t*)p;
         struct task_struct *tsk = current;
  
@@ -3375,38 +3449,20 @@ static int kswapd(void *p)
         tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
         set_freezable();
  
-       order = new_order = 0;
-       classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
-       balanced_classzone_idx = classzone_idx;
+       pgdat->kswapd_order = alloc_order = reclaim_order = 0;
+       pgdat->kswapd_classzone_idx = classzone_idx = 0;
         for ( ; ; ) {
                 bool ret;
  
-               /*
-                * While we were reclaiming, there might have been another
-                * wakeup, so check the values.
-                */
-               new_order = pgdat->kswapd_max_order;
-               new_classzone_idx = pgdat->classzone_idx;
-               pgdat->kswapd_max_order =  0;
-               pgdat->classzone_idx = pgdat->nr_zones - 1;
+kswapd_try_sleep:
+               kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
+                                       classzone_idx);
  
-               if (order < new_order || classzone_idx > new_classzone_idx) {
-                       /*
-                        * Don't sleep if someone wants a larger 'order'
-                        * allocation or has tigher zone constraints
-                        */
-                       order = new_order;
-                       classzone_idx = new_classzone_idx;
-               } else {
-                       kswapd_try_to_sleep(pgdat, order, classzone_idx,
-                                               balanced_classzone_idx);
-                       order = pgdat->kswapd_max_order;
-                       classzone_idx = pgdat->classzone_idx;
-                       new_order = order;
-                       new_classzone_idx = classzone_idx;
-                       pgdat->kswapd_max_order = 0;
-                       pgdat->classzone_idx = pgdat->nr_zones - 1;
-               }
+               /* Read the new order and classzone_idx */
+               alloc_order = reclaim_order = pgdat->kswapd_order;
+               classzone_idx = pgdat->kswapd_classzone_idx;
+               pgdat->kswapd_order = 0;
+               pgdat->kswapd_classzone_idx = 0;
  
                 ret = try_to_freeze();
                 if (kthread_should_stop())
@@ -3416,11 +3472,25 @@ static int kswapd(void *p)
                  * We can speed up thawing tasks if we don't call balance_pgdat
                  * after returning from the refrigerator
                  */
-               if (!ret) {
-                       trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-                       balanced_classzone_idx = balance_pgdat(pgdat, order,
-                                                               classzone_idx);
-               }
+               if (ret)
+                       continue;
+
+               /*
+                * Reclaim begins at the requested order but if a high-order
+                * reclaim fails then kswapd falls back to reclaiming for
+                * order-0. If that happens, kswapd will consider sleeping
+                * for the order it finished reclaiming at (reclaim_order)
+                * but kcompactd is woken to compact for the original
+                * request (alloc_order).
+                */
+               trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
+                                               alloc_order);
+               reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
+               if (reclaim_order < alloc_order)
+                       goto kswapd_try_sleep;
+
+               alloc_order = reclaim_order = pgdat->kswapd_order;
+               classzone_idx = pgdat->kswapd_classzone_idx;
         }
  
         tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
@@ -3436,6 +3506,7 @@ static int kswapd(void *p)
  void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
  {
         pg_data_t *pgdat;
+       int z;
  
         if (!populated_zone(zone))
                 return;
@@ -3443,14 +3514,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
         if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
                 return;
         pgdat = zone->zone_pgdat;
-       if (pgdat->kswapd_max_order < order) {
-               pgdat->kswapd_max_order = order;
-               pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
-       }
+       pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+       pgdat->kswapd_order = max(pgdat->kswapd_order, order);
         if (!waitqueue_active(&pgdat->kswapd_wait))
                 return;
-       if (zone_balanced(zone, order, true, 0, 0))
-               return;
+
+       /* Only wake kswapd if all zones are unbalanced */
+       for (z = 0; z <= classzone_idx; z++) {
+               zone = pgdat->node_zones + z;
+               if (!populated_zone(zone))
+                       continue;
+
+               if (zone_balanced(zone, order, classzone_idx))
+                       return;
+       }
  
         trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
         wake_up_interruptible(&pgdat->kswapd_wait);
@@ -3471,6 +3548,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
         struct scan_control sc = {
                 .nr_to_reclaim = nr_to_reclaim,
                 .gfp_mask = GFP_HIGHUSER_MOVABLE,
+               .reclaim_idx = MAX_NR_ZONES - 1,
                 .priority = DEF_PRIORITY,
                 .may_writepage = 1,
                 .may_unmap = 1,
@@ -3572,12 +3650,12 @@ module_init(kswapd_init)
  
  #ifdef CONFIG_NUMA
  /*
- * Zone reclaim mode
+ * Node reclaim mode
   *
- * If non-zero call zone_reclaim when the number of free pages falls below
+ * If non-zero call node_reclaim when the number of free pages falls below
   * the watermarks.
   */
-int zone_reclaim_mode __read_mostly;
+int node_reclaim_mode __read_mostly;
  
  #define RECLAIM_OFF 0
  #define RECLAIM_ZONE (1<<0)    /* Run shrink_inactive_list on the zone */
@@ -3585,14 +3663,14 @@ int zone_reclaim_mode __read_mostly;
  #define RECLAIM_UNMAP (1<<2)   /* Unmap pages during reclaim */
  
  /*
- * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * Priority for NODE_RECLAIM. This determines the fraction of pages
   * of a node considered for each zone_reclaim. 4 scans 1/16th of
   * a zone.
   */
-#define ZONE_RECLAIM_PRIORITY 4
+#define NODE_RECLAIM_PRIORITY 4
  
  /*
- * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * Percentage of pages in a zone that must be unmapped for node_reclaim to
   * occur.
   */
  int sysctl_min_unmapped_ratio = 1;
@@ -3603,11 +3681,11 @@ int sysctl_min_unmapped_ratio = 1;
   */
  int sysctl_min_slab_ratio = 5;
  
-static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
+static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
  {
-       unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
-       unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
-               zone_page_state(zone, NR_ACTIVE_FILE);
+       unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
+       unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
+               node_page_state(pgdat, NR_ACTIVE_FILE);
  
         /*
          * It's possible for there to be more file mapped pages than
@@ -3618,7 +3696,7 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
  }
  
  /* Work out how many page cache pages we can reclaim in this reclaim_mode */
-static unsigned long zone_pagecache_reclaimable(struct zone *zone)
+static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
  {
         unsigned long nr_pagecache_reclaimable;
         unsigned long delta = 0;
@@ -3626,17 +3704,17 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
         /*
          * If RECLAIM_UNMAP is set, then all file pages are considered
          * potentially reclaimable. Otherwise, we have to worry about
-        * pages like swapcache and zone_unmapped_file_pages() provides
+        * pages like swapcache and node_unmapped_file_pages() provides
          * a better estimate
          */
-       if (zone_reclaim_mode & RECLAIM_UNMAP)
-               nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
+       if (node_reclaim_mode & RECLAIM_UNMAP)
+               nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
         else
-               nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
+               nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
  
         /* If we can't clean pages, remove dirty pages from consideration */
-       if (!(zone_reclaim_mode & RECLAIM_WRITE))
-               delta += zone_page_state(zone, NR_FILE_DIRTY);
+       if (!(node_reclaim_mode & RECLAIM_WRITE))
+               delta += node_page_state(pgdat, NR_FILE_DIRTY);
  
         /* Watch for any possible underflows due to delta */
         if (unlikely(delta > nr_pagecache_reclaimable))
@@ -3646,22 +3724,24 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
  }
  
  /*
- * Try to free up some pages from this zone through reclaim.
+ * Try to free up some pages from this node through reclaim.
   */
-static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
  {
         /* Minimum pages needed in order to stay on node */
         const unsigned long nr_pages = 1 << order;
         struct task_struct *p = current;
         struct reclaim_state reclaim_state;
+       int classzone_idx = gfp_zone(gfp_mask);
         struct scan_control sc = {
                 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
                 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
                 .order = order,
-               .priority = ZONE_RECLAIM_PRIORITY,
-               .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
-               .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
+               .priority = NODE_RECLAIM_PRIORITY,
+               .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
+               .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
                 .may_swap = 1,
+               .reclaim_idx = classzone_idx,
         };
  
         cond_resched();
@@ -3675,13 +3755,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
         reclaim_state.reclaimed_slab = 0;
         p->reclaim_state = &reclaim_state;
  
-       if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
+       if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
                 /*
                  * Free memory by calling shrink zone with increasing
                  * priorities until we have enough memory freed.
                  */
                 do {
-                       shrink_zone(zone, &sc, true);
+                       shrink_node(pgdat, &sc);
                 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
         }
  
@@ -3691,49 +3771,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
         return sc.nr_reclaimed >= nr_pages;
  }
  
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
  {
-       int node_id;
         int ret;
  
         /*
-        * Zone reclaim reclaims unmapped file backed pages and
+        * Node reclaim reclaims unmapped file backed pages and
          * slab pages if we are over the defined limits.
          *
          * A small portion of unmapped file backed pages is needed for
          * file I/O otherwise pages read by file I/O will be immediately
-        * thrown out if the zone is overallocated. So we do not reclaim
-        * if less than a specified percentage of the zone is used by
+        * thrown out if the node is overallocated. So we do not reclaim
+        * if less than a specified percentage of the node is used by
          * unmapped file backed pages.
          */
-       if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
-           zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
-               return ZONE_RECLAIM_FULL;
+       if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
+           sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+               return NODE_RECLAIM_FULL;
  
-       if (!zone_reclaimable(zone))
-               return ZONE_RECLAIM_FULL;
+       if (!pgdat_reclaimable(pgdat))
+               return NODE_RECLAIM_FULL;
  
         /*
          * Do not scan if the allocation should not be delayed.
          */
         if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
-               return ZONE_RECLAIM_NOSCAN;
+               return NODE_RECLAIM_NOSCAN;
  
         /*
-        * Only run zone reclaim on the local zone or on zones that do not
+        * Only run node reclaim on the local node or on nodes that do not
          * have associated processors. This will favor the local processor
          * over remote processors and spread off node memory allocations
          * as wide as possible.
          */
-       node_id = zone_to_nid(zone);
-       if (node_state(node_id, N_CPU) && node_id != numa_node_id())
-               return ZONE_RECLAIM_NOSCAN;
+       if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
+               return NODE_RECLAIM_NOSCAN;
  
-       if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
-               return ZONE_RECLAIM_NOSCAN;
+       if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+               return NODE_RECLAIM_NOSCAN;
  
-       ret = __zone_reclaim(zone, gfp_mask, order);
-       clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
+       ret = __node_reclaim(pgdat, gfp_mask, order);
+       clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
  
         if (!ret)
                 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
@@ -3772,24 +3850,23 @@ int page_evictable(struct page *page)
  void check_move_unevictable_pages(struct page **pages, int nr_pages)
  {
         struct lruvec *lruvec;
-       struct zone *zone = NULL;
+       struct pglist_data *pgdat = NULL;
         int pgscanned = 0;
         int pgrescued = 0;
         int i;
  
         for (i = 0; i < nr_pages; i++) {
                 struct page *page = pages[i];
-               struct zone *pagezone;
+               struct pglist_data *pagepgdat = page_pgdat(page);
  
                 pgscanned++;
-               pagezone = page_zone(page);
-               if (pagezone != zone) {
-                       if (zone)
-                               spin_unlock_irq(&zone->lru_lock);
-                       zone = pagezone;
-                       spin_lock_irq(&zone->lru_lock);
+               if (pagepgdat != pgdat) {
+                       if (pgdat)
+                               spin_unlock_irq(&pgdat->lru_lock);
+                       pgdat = pagepgdat;
+                       spin_lock_irq(&pgdat->lru_lock);
                 }
-               lruvec = mem_cgroup_page_lruvec(page, zone);
+               lruvec = mem_cgroup_page_lruvec(page, pgdat);
  
                 if (!PageLRU(page) || !PageUnevictable(page))
                         continue;
@@ -3805,10 +3882,10 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
                 }
         }
  
-       if (zone) {
+       if (pgdat) {
                 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
                 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
-               spin_unlock_irq(&zone->lru_lock);
+               spin_unlock_irq(&pgdat->lru_lock);
         }
  }
  #endif /* CONFIG_SHMEM */