Merge branch 'linux-3.17' of git://anongit.freedesktop.org/git/nouveau/linux-2.6
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b7381d1..18cee0d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -680,9 +680,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
        int migratetype = 0;
        int batch_free = 0;
        int to_free = count;
+       unsigned long nr_scanned;
 
        spin_lock(&zone->lock);
-       zone->pages_scanned = 0;
+       nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+       if (nr_scanned)
+               __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
 
        while (to_free) {
                struct page *page;
@@ -731,8 +734,11 @@ static void free_one_page(struct zone *zone,
                                unsigned int order,
                                int migratetype)
 {
+       unsigned long nr_scanned;
        spin_lock(&zone->lock);
-       zone->pages_scanned = 0;
+       nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+       if (nr_scanned)
+               __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
 
        __free_one_page(page, pfn, zone, order, migratetype);
        if (unlikely(!is_migrate_isolate(migratetype)))
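
For reference: both hunks above retire the zone->pages_scanned field in favour of the NR_PAGES_SCANNED vmstat item, clearing it with the same read-then-subtract idiom under zone->lock. Purely as an illustration (not part of this patch), the idiom could be wrapped in a hypothetical helper built on the existing zone_page_state()/__mod_zone_page_state() accessors:

/* Hypothetical helper, for illustration only; callers hold zone->lock. */
static inline void zone_clear_pages_scanned(struct zone *zone)
{
        unsigned long nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);

        /* Skip the vmstat update entirely when the counter is already zero. */
        if (nr_scanned)
                __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
}
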
@@ -1606,6 +1612,9 @@ again:
        }
 
        __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+       if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0 &&
+           !zone_is_fair_depleted(zone))
+               zone_set_flag(zone, ZONE_FAIR_DEPLETED);
 
        __count_zone_vm_events(PGALLOC, zone, 1 << order);
        zone_statistics(preferred_zone, zone, gfp_flags);
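
ZONE_FAIR_DEPLETED, zone_is_fair_depleted() and zone_set_flag() come from the include/linux/mmzone.h side of this series and are not visible in this file's diff. Assuming the usual zone-flag plumbing, they are thin wrappers around the zone->flags bitmap, roughly:

/* Sketch of the mmzone.h helpers assumed above (not part of this file). */
static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
{
        set_bit(flag, &zone->flags);
}

static inline int zone_is_fair_depleted(const struct zone *zone)
{
        return test_bit(ZONE_FAIR_DEPLETED, &zone->flags);
}
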
@@ -1917,6 +1926,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 
 #endif /* CONFIG_NUMA */
 
+static void reset_alloc_batches(struct zone *preferred_zone)
+{
+       struct zone *zone = preferred_zone->zone_pgdat->node_zones;
+
+       do {
+               mod_zone_page_state(zone, NR_ALLOC_BATCH,
+                       high_wmark_pages(zone) - low_wmark_pages(zone) -
+                       atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+               zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
+       } while (zone++ != preferred_zone);
+}
+
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
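
The new reset_alloc_batches() no longer filters a zonelist through zone_local(); it walks the preferred node's node_zones array from the first zone up to and including preferred_zone, refilling each batch to (high_wmark - low_wmark) and clearing the depleted flag. The do/while with a post-increment is what makes the walk include the final element; a stand-alone userspace illustration of that idiom (nothing kernel-specific):

#include <stdio.h>

int main(void)
{
        int batches[4] = { 3, 1, 0, 2 };
        int *zone = batches;            /* node_zones analogue     */
        int *preferred = &batches[2];   /* preferred_zone analogue */

        /* Visits batches[0] .. batches[2], including the last element:
         * the old pointer value is compared before the increment. */
        do {
                printf("reset batch %ld (was %d)\n",
                       (long)(zone - batches), *zone);
                *zone = 3;              /* high - low watermark analogue */
        } while (zone++ != preferred);

        return 0;
}
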
@@ -1934,8 +1955,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
        int did_zlc_setup = 0;          /* just call zlc_setup() one time */
        bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
                                (gfp_mask & __GFP_WRITE);
+       int nr_fair_skipped = 0;
+       bool zonelist_rescan;
 
 zonelist_scan:
+       zonelist_rescan = false;
+
        /*
         * Scan zonelist, looking for a zone with enough free.
         * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
@@ -1959,9 +1984,11 @@ zonelist_scan:
                 */
                if (alloc_flags & ALLOC_FAIR) {
                        if (!zone_local(preferred_zone, zone))
+                               break;
+                       if (zone_is_fair_depleted(zone)) {
+                               nr_fair_skipped++;
                                continue;
-                       if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
-                               continue;
+                       }
                }
                /*
                 * When allocating a page cache page for writing, we
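
The fairness pass now breaks out of the zonelist at the first non-local zone instead of skipping it: with the default node-ordered zonelists the local node's zones come first, so every later entry is remote as well. zone_local() is an existing helper earlier in this file; under CONFIG_NUMA it is essentially a node-ID comparison (and trivially true otherwise), roughly:

/* Sketch of the existing helper relied on above (CONFIG_NUMA case). */
static bool zone_local(struct zone *local_zone, struct zone *zone)
{
        return local_zone->node == zone->node;
}
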
@@ -2067,13 +2094,7 @@ this_zone_full:
                        zlc_mark_zone_full(zonelist, z);
        }
 
-       if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
-               /* Disable zlc cache for second zonelist scan */
-               zlc_active = 0;
-               goto zonelist_scan;
-       }
-
-       if (page)
+       if (page) {
                /*
                 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
                 * necessary to allocate the page. The expectation is
@@ -2082,8 +2103,37 @@ this_zone_full:
                 * for !PFMEMALLOC purposes.
                 */
                page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+               return page;
+       }
 
-       return page;
+       /*
+        * The first pass makes sure allocations are spread fairly within the
+        * local node.  However, the local node might have free pages left
+        * after the fairness batches are exhausted, and remote zones haven't
+        * even been considered yet.  Try once more without fairness, and
+        * include remote zones now, before entering the slowpath and waking
+        * kswapd: prefer spilling to a remote zone over swapping locally.
+        */
+       if (alloc_flags & ALLOC_FAIR) {
+               alloc_flags &= ~ALLOC_FAIR;
+               if (nr_fair_skipped) {
+                       zonelist_rescan = true;
+                       reset_alloc_batches(preferred_zone);
+               }
+               if (nr_online_nodes > 1)
+                       zonelist_rescan = true;
+       }
+
+       if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
+               /* Disable zlc cache for second zonelist scan */
+               zlc_active = 0;
+               zonelist_rescan = true;
+       }
+
+       if (zonelist_rescan)
+               goto zonelist_scan;
+
+       return NULL;
 }
 
 /*
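
The caller's retry loop is folded into get_page_from_freelist() here: the first pass honours ALLOC_FAIR over local zones only and counts zones skipped as fair-depleted; if it comes up empty, ALLOC_FAIR is dropped, the local batches are reset when anything was skipped for fairness, and the zonelist is rescanned, now also reaching remote zones on multi-node systems, all before kswapd is woken. A toy userspace model of the batch/depleted behaviour (purely illustrative, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 3

static int batch[NR_ZONES] = { 2, 2, 2 };
static bool depleted[NR_ZONES];

/* Pick the first zone whose batch is not exhausted; once every zone is
 * depleted, refill all batches (the reset_alloc_batches() analogue) and
 * retry.  Over time this interleaves allocations in proportion to the
 * per-zone batches. */
static int alloc_fair(void)
{
        for (int pass = 0; pass < 2; pass++) {
                for (int z = 0; z < NR_ZONES; z++) {
                        if (depleted[z])
                                continue;
                        if (--batch[z] <= 0)
                                depleted[z] = true;
                        return z;
                }
                for (int z = 0; z < NR_ZONES; z++) {
                        batch[z] = 2;
                        depleted[z] = false;
                }
        }
        return -1;
}

int main(void)
{
        for (int i = 0; i < 8; i++)
                printf("allocation %d from zone %d\n", i, alloc_fair());
        return 0;
}
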
@@ -2196,8 +2246,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 {
        struct page *page;
 
-       /* Acquire the OOM killer lock for the zones in zonelist */
-       if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
+       /* Acquire the per-zone oom lock for each zone */
+       if (!oom_zonelist_trylock(zonelist, gfp_mask)) {
                schedule_timeout_uninterruptible(1);
                return NULL;
        }
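
try_set_zonelist_oom() becomes oom_zonelist_trylock() here (and clear_zonelist_oom() becomes oom_zonelist_unlock() in the next hunk); the renames come from the mm/oom_kill.c part of this series and the calling convention is unchanged: either every zone in the zonelist is marked OOM-locked, or none is and the caller sleeps a tick and bails out. A rough sketch of the assumed shape of the trylock side (oom_kill.c is not part of this diff):

/* Assumed shape of the renamed helper in mm/oom_kill.c (sketch only). */
bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
{
        struct zoneref *z;
        struct zone *zone;
        bool ret = true;

        spin_lock(&zone_scan_lock);
        for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
                if (zone_is_oom_locked(zone)) {
                        ret = false;
                        goto out;
                }

        /* All clear: mark every zone so parallel OOM kills back off. */
        for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
                zone_set_flag(zone, ZONE_OOM_LOCKED);
out:
        spin_unlock(&zone_scan_lock);
        return ret;
}
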
@@ -2235,7 +2285,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
        out_of_memory(zonelist, gfp_mask, order, nodemask, false);
 
 out:
-       clear_zonelist_oom(zonelist, gfp_mask);
+       oom_zonelist_unlock(zonelist, gfp_mask);
        return page;
 }
 
@@ -2404,28 +2454,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
        return page;
 }
 
-static void reset_alloc_batches(struct zonelist *zonelist,
-                               enum zone_type high_zoneidx,
-                               struct zone *preferred_zone)
-{
-       struct zoneref *z;
-       struct zone *zone;
-
-       for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-               /*
-                * Only reset the batches of zones that were actually
-                * considered in the fairness pass, we don't want to
-                * trash fairness information for zones that are not
-                * actually part of this zonelist's round-robin cycle.
-                */
-               if (!zone_local(preferred_zone, zone))
-                       continue;
-               mod_zone_page_state(zone, NR_ALLOC_BATCH,
-                       high_wmark_pages(zone) - low_wmark_pages(zone) -
-                       atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-       }
-}
-
 static void wake_all_kswapds(unsigned int order,
                             struct zonelist *zonelist,
                             enum zone_type high_zoneidx,
@@ -2610,14 +2638,6 @@ rebalance:
        if (page)
                goto got_pg;
 
-       /*
-        * It can become very expensive to allocate transparent hugepages at
-        * fault, so use asynchronous memory compaction for THP unless it is
-        * khugepaged trying to collapse.
-        */
-       if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
-               migration_mode = MIGRATE_SYNC_LIGHT;
-
        /*
         * If compaction is deferred for high-order allocations, it is because
         * sync compaction recently failed. If this is the case and the caller
@@ -2628,6 +2648,15 @@ rebalance:
                                                (gfp_mask & __GFP_NO_KSWAPD))
                goto nopage;
 
+       /*
+        * It can become very expensive to allocate transparent hugepages at
+        * fault, so use asynchronous memory compaction for THP unless it is
+        * khugepaged trying to collapse.
+        */
+       if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
+                                               (current->flags & PF_KTHREAD))
+               migration_mode = MIGRATE_SYNC_LIGHT;
+
        /* Try direct reclaim and then allocating */
        page = __alloc_pages_direct_reclaim(gfp_mask, order,
                                        zonelist, high_zoneidx,
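
The relocated block now upgrades everything except userspace THP faults to MIGRATE_SYNC_LIGHT: allocations that do not carry the full GFP_TRANSHUGE bit pattern, plus khugepaged (which runs with PF_KTHREAD), get light synchronous compaction, while page-fault THP stays asynchronous. Because GFP_TRANSHUGE is a multi-bit mask, the test has to compare the masked value against the whole constant rather than treat it as a single flag; a minimal stand-alone illustration of that idiom:

/* Stand-alone demo of testing whether a multi-bit mask is fully set. */
#include <stdio.h>

#define FLAG_A  0x1u
#define FLAG_B  0x2u
#define FLAG_C  0x4u
#define COMBO   (FLAG_A | FLAG_B)       /* GFP_TRANSHUGE analogue */

int main(void)
{
        unsigned int masks[] = { FLAG_A, COMBO, COMBO | FLAG_C };

        for (int i = 0; i < 3; i++) {
                unsigned int m = masks[i];
                /* True only when every bit of COMBO is present in m. */
                int is_combo = (m & COMBO) == COMBO;

                printf("mask 0x%x: combo fully set? %s\n",
                       m, is_combo ? "yes" : "no");
        }
        return 0;
}
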
@@ -2761,28 +2790,11 @@ retry_cpuset:
        if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
                alloc_flags |= ALLOC_CMA;
 #endif
-retry:
        /* First allocation attempt */
        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                        zonelist, high_zoneidx, alloc_flags,
                        preferred_zone, classzone_idx, migratetype);
        if (unlikely(!page)) {
-               /*
-                * The first pass makes sure allocations are spread
-                * fairly within the local node.  However, the local
-                * node might have free pages left after the fairness
-                * batches are exhausted, and remote zones haven't
-                * even been considered yet.  Try once more without
-                * fairness, and include remote zones now, before
-                * entering the slowpath and waking kswapd: prefer
-                * spilling to a remote zone over swapping locally.
-                */
-               if (alloc_flags & ALLOC_FAIR) {
-                       reset_alloc_batches(zonelist, high_zoneidx,
-                                           preferred_zone);
-                       alloc_flags &= ~ALLOC_FAIR;
-                       goto retry;
-               }
                /*
                 * Runtime PM, block IO and its error handling path
                 * can deadlock because I/O on the device might not
@@ -3248,7 +3260,7 @@ void show_free_areas(unsigned int filter)
                        K(zone_page_state(zone, NR_BOUNCE)),
                        K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
                        K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
-                       zone->pages_scanned,
+                       K(zone_page_state(zone, NR_PAGES_SCANNED)),
                        (!zone_reclaimable(zone) ? "yes" : "no")
                        );
                printk("lowmem_reserve[]:");
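
The show_free_areas() hunk reads the counter through the vmstat API and, like the neighbouring fields, runs it through the file-local K() macro, which converts a page count into kilobytes by shifting by (PAGE_SHIFT - 10). A tiny stand-alone example, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12                           /* 4 KiB pages, for illustration */
#define K(x) ((x) << (PAGE_SHIFT - 10))         /* pages -> KiB, as in page_alloc.c */

int main(void)
{
        unsigned long pages_scanned = 300;

        printf("pages_scanned:%lukB\n", K(pages_scanned));      /* 1200kB */
        return 0;
}
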