Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc
[cascardo/linux.git] / mm / vmscan.c
index 6f43df4..2836b53 100644 (file)
 #include <trace/events/vmscan.h>
 
 struct scan_control {
-       /* Incremented by the number of inactive pages that were scanned */
-       unsigned long nr_scanned;
-
-       /* Number of pages freed so far during a call to shrink_zones() */
-       unsigned long nr_reclaimed;
-
-       /* One of the zones is ready for compaction */
-       int compaction_ready;
-
        /* How many pages shrink_list() should reclaim */
        unsigned long nr_to_reclaim;
 
-       unsigned long hibernation_mode;
-
        /* This context's GFP mask */
        gfp_t gfp_mask;
 
-       int may_writepage;
-
-       /* Can mapped pages be reclaimed? */
-       int may_unmap;
-
-       /* Can pages be swapped as part of reclaim? */
-       int may_swap;
-
+       /* Allocation order */
        int order;
 
-       /* Scan (total_size >> priority) pages at once */
-       int priority;
-
-       /* anon vs. file LRUs scanning "ratio" */
-       int swappiness;
+       /*
+        * Nodemask of nodes allowed by the caller. If NULL, all nodes
+        * are scanned.
+        */
+       nodemask_t      *nodemask;
 
        /*
         * The memory cgroup that hit its limit and as a result is the
@@ -98,11 +80,27 @@ struct scan_control {
         */
        struct mem_cgroup *target_mem_cgroup;
 
-       /*
-        * Nodemask of nodes allowed by the caller. If NULL, all nodes
-        * are scanned.
-        */
-       nodemask_t      *nodemask;
+       /* Scan (total_size >> priority) pages at once */
+       int priority;
+
+       unsigned int may_writepage:1;
+
+       /* Can mapped pages be reclaimed? */
+       unsigned int may_unmap:1;
+
+       /* Can pages be swapped as part of reclaim? */
+       unsigned int may_swap:1;
+
+       unsigned int hibernation_mode:1;
+
+       /* One of the zones is ready for compaction */
+       unsigned int compaction_ready:1;
+
+       /* Incremented by the number of inactive pages that were scanned */
+       unsigned long nr_scanned;
+
+       /* Number of pages freed so far during a call to shrink_zones() */
+       unsigned long nr_reclaimed;
 };
 
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -139,7 +137,11 @@ struct scan_control {
  * From 0 .. 100.  Higher means more swappy.
  */
 int vm_swappiness = 60;
-unsigned long vm_total_pages;  /* The total number of pages which the VM controls */
+/*
+ * The total number of pages which are beyond the high watermark within all
+ * zones.
+ */
+unsigned long vm_total_pages;
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
@@ -172,7 +174,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
 
 bool zone_reclaimable(struct zone *zone)
 {
-       return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+       return zone_page_state(zone, NR_PAGES_SCANNED) <
+               zone_reclaimable_pages(zone) * 6;
 }
 
 static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
@@ -574,9 +577,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 
        if (PageSwapCache(page)) {
                swp_entry_t swap = { .val = page_private(page) };
+               mem_cgroup_swapout(page, swap);
                __delete_from_swap_cache(page);
                spin_unlock_irq(&mapping->tree_lock);
-               swapcache_free(swap, page);
+               swapcache_free(swap);
        } else {
                void (*freepage)(struct page *);
                void *shadow = NULL;
@@ -597,7 +601,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                        shadow = workingset_eviction(mapping, page);
                __delete_from_page_cache(page, shadow);
                spin_unlock_irq(&mapping->tree_lock);
-               mem_cgroup_uncharge_cache_page(page);
 
                if (freepage != NULL)
                        freepage(page);
@@ -819,7 +822,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
        cond_resched();
 
-       mem_cgroup_uncharge_start();
        while (!list_empty(page_list)) {
                struct address_space *mapping;
                struct page *page;
@@ -1130,11 +1132,12 @@ keep:
                VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
        }
 
+       mem_cgroup_uncharge_list(&free_pages);
        free_hot_cold_page_list(&free_pages, true);
 
        list_splice(&ret_pages, page_list);
        count_vm_events(PGACTIVATE, pgactivate);
-       mem_cgroup_uncharge_end();
+
        *ret_nr_dirty += nr_dirty;
        *ret_nr_congested += nr_congested;
        *ret_nr_unqueued_dirty += nr_unqueued_dirty;
@@ -1434,6 +1437,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
 
                        if (unlikely(PageCompound(page))) {
                                spin_unlock_irq(&zone->lru_lock);
+                               mem_cgroup_uncharge(page);
                                (*get_compound_page_dtor(page))(page);
                                spin_lock_irq(&zone->lru_lock);
                        } else
@@ -1506,7 +1510,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
 
        if (global_reclaim(sc)) {
-               zone->pages_scanned += nr_scanned;
+               __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
                if (current_is_kswapd())
                        __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
                else
@@ -1541,6 +1545,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 
        spin_unlock_irq(&zone->lru_lock);
 
+       mem_cgroup_uncharge_list(&page_list);
        free_hot_cold_page_list(&page_list, true);
 
        /*
@@ -1655,6 +1660,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
 
                        if (unlikely(PageCompound(page))) {
                                spin_unlock_irq(&zone->lru_lock);
+                               mem_cgroup_uncharge(page);
                                (*get_compound_page_dtor(page))(page);
                                spin_lock_irq(&zone->lru_lock);
                        } else
@@ -1696,7 +1702,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
        nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
                                     &nr_scanned, sc, isolate_mode, lru);
        if (global_reclaim(sc))
-               zone->pages_scanned += nr_scanned;
+               __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
 
        reclaim_stat->recent_scanned[file] += nr_taken;
 
@@ -1753,7 +1759,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
         * Count referenced pages from currently used mappings as rotated,
         * even though only some of them are actually re-activated.  This
         * helps balance scan pressure between file and anonymous pages in
-        * get_scan_ratio.
+        * get_scan_count.
         */
        reclaim_stat->recent_rotated[file] += nr_rotated;
 
@@ -1762,6 +1768,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
        __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
        spin_unlock_irq(&zone->lru_lock);
 
+       mem_cgroup_uncharge_list(&l_hold);
        free_hot_cold_page_list(&l_hold, true);
 }
 
@@ -1868,8 +1875,8 @@ enum scan_balance {
  * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
  * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
  */
-static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
-                          unsigned long *nr)
+static void get_scan_count(struct lruvec *lruvec, int swappiness,
+                          struct scan_control *sc, unsigned long *nr)
 {
        struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
        u64 fraction[2];
@@ -1912,7 +1919,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
         * using the memory controller's swap limit feature would be
         * too expensive.
         */
-       if (!global_reclaim(sc) && !sc->swappiness) {
+       if (!global_reclaim(sc) && !swappiness) {
                scan_balance = SCAN_FILE;
                goto out;
        }
@@ -1922,16 +1929,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
         * system is close to OOM, scan both anon and file equally
         * (unless the swappiness setting disagrees with swapping).
         */
-       if (!sc->priority && sc->swappiness) {
+       if (!sc->priority && swappiness) {
                scan_balance = SCAN_EQUAL;
                goto out;
        }
 
-       anon  = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
-               get_lru_size(lruvec, LRU_INACTIVE_ANON);
-       file  = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
-               get_lru_size(lruvec, LRU_INACTIVE_FILE);
-
        /*
         * Prevent the reclaimer from falling into the cache trap: as
         * cache pages start out inactive, every cache fault will tip
@@ -1942,9 +1944,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
         * anon pages.  Try to detect this based on file LRU size.
         */
        if (global_reclaim(sc)) {
-               unsigned long free = zone_page_state(zone, NR_FREE_PAGES);
+               unsigned long zonefile;
+               unsigned long zonefree;
+
+               zonefree = zone_page_state(zone, NR_FREE_PAGES);
+               zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
+                          zone_page_state(zone, NR_INACTIVE_FILE);
 
-               if (unlikely(file + free <= high_wmark_pages(zone))) {
+               if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
                        scan_balance = SCAN_ANON;
                        goto out;
                }
@@ -1965,7 +1972,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
         * With swappiness at 100, anonymous and file have the same priority.
         * This scanning priority is essentially the inverse of IO cost.
         */
-       anon_prio = sc->swappiness;
+       anon_prio = swappiness;
        file_prio = 200 - anon_prio;
 
        /*
@@ -1979,6 +1986,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
         *
         * anon in [0], file in [1]
         */
+
+       anon  = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
+               get_lru_size(lruvec, LRU_INACTIVE_ANON);
+       file  = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
+               get_lru_size(lruvec, LRU_INACTIVE_FILE);
+
        spin_lock_irq(&zone->lru_lock);
        if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
                reclaim_stat->recent_scanned[0] /= 2;
@@ -2055,7 +2068,8 @@ out:
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
-static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
+                         struct scan_control *sc)
 {
        unsigned long nr[NR_LRU_LISTS];
        unsigned long targets[NR_LRU_LISTS];
@@ -2066,7 +2080,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
        struct blk_plug plug;
        bool scan_adjusted;
 
-       get_scan_count(lruvec, sc, nr);
+       get_scan_count(lruvec, swappiness, sc, nr);
 
        /* Record the original scan target for proportional adjustments later */
        memcpy(targets, nr, sizeof(nr));
@@ -2244,9 +2258,10 @@ static inline bool should_continue_reclaim(struct zone *zone,
        }
 }
 
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
+static bool shrink_zone(struct zone *zone, struct scan_control *sc)
 {
        unsigned long nr_reclaimed, nr_scanned;
+       bool reclaimable = false;
 
        do {
                struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2262,11 +2277,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
                memcg = mem_cgroup_iter(root, NULL, &reclaim);
                do {
                        struct lruvec *lruvec;
+                       int swappiness;
 
                        lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+                       swappiness = mem_cgroup_swappiness(memcg);
 
-                       sc->swappiness = mem_cgroup_swappiness(memcg);
-                       shrink_lruvec(lruvec, sc);
+                       shrink_lruvec(lruvec, swappiness, sc);
 
                        /*
                         * Direct reclaim and kswapd have to scan all memory
@@ -2290,8 +2306,13 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
                           sc->nr_scanned - nr_scanned,
                           sc->nr_reclaimed - nr_reclaimed);
 
+               if (sc->nr_reclaimed - nr_reclaimed)
+                       reclaimable = true;
+
        } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
                                         sc->nr_scanned - nr_scanned, sc));
+
+       return reclaimable;
 }
 
 /* Returns true if compaction should go ahead for a high-order request */
@@ -2340,8 +2361,10 @@ static inline bool compaction_ready(struct zone *zone, int order)
  *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
+ *
+ * Returns true if a zone was reclaimable.
  */
-static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
+static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 {
        struct zoneref *z;
        struct zone *zone;
@@ -2354,6 +2377,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                .gfp_mask = sc->gfp_mask,
        };
        enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
+       bool reclaimable = false;
 
        /*
         * If the number of buffer_heads in the machine exceeds the maximum
@@ -2414,10 +2438,17 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                                                &nr_soft_scanned);
                        sc->nr_reclaimed += nr_soft_reclaimed;
                        sc->nr_scanned += nr_soft_scanned;
+                       if (nr_soft_reclaimed)
+                               reclaimable = true;
                        /* need some check for avoid more shrink_zone() */
                }
 
-               shrink_zone(zone, sc);
+               if (shrink_zone(zone, sc))
+                       reclaimable = true;
+
+               if (global_reclaim(sc) &&
+                   !reclaimable && zone_reclaimable(zone))
+                       reclaimable = true;
        }
 
        /*
@@ -2439,26 +2470,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
         * promoted it to __GFP_HIGHMEM.
         */
        sc->gfp_mask = orig_mask;
-}
 
-/* All zones in zonelist are unreclaimable? */
-static bool all_unreclaimable(struct zonelist *zonelist,
-               struct scan_control *sc)
-{
-       struct zoneref *z;
-       struct zone *zone;
-
-       for_each_zone_zonelist_nodemask(zone, z, zonelist,
-                       gfp_zone(sc->gfp_mask), sc->nodemask) {
-               if (!populated_zone(zone))
-                       continue;
-               if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-                       continue;
-               if (zone_reclaimable(zone))
-                       return false;
-       }
-
-       return true;
+       return reclaimable;
 }
 
 /*
@@ -2482,6 +2495,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 {
        unsigned long total_scanned = 0;
        unsigned long writeback_threshold;
+       bool zones_reclaimable;
 
        delayacct_freepages_start();
 
@@ -2492,7 +2506,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
                                sc->priority);
                sc->nr_scanned = 0;
-               shrink_zones(zonelist, sc);
+               zones_reclaimable = shrink_zones(zonelist, sc);
 
                total_scanned += sc->nr_scanned;
                if (sc->nr_reclaimed >= sc->nr_to_reclaim)
@@ -2532,8 +2546,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
        if (sc->compaction_ready)
                return 1;
 
-       /* top priority shrink_zones still had more to do? don't OOM, then */
-       if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
+       /* Any of the zones still reclaimable?  Don't OOM. */
+       if (zones_reclaimable)
                return 1;
 
        return 0;
@@ -2670,15 +2684,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 {
        unsigned long nr_reclaimed;
        struct scan_control sc = {
+               .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+               .order = order,
+               .nodemask = nodemask,
+               .priority = DEF_PRIORITY,
                .may_writepage = !laptop_mode,
-               .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .may_unmap = 1,
                .may_swap = 1,
-               .order = order,
-               .priority = DEF_PRIORITY,
-               .target_mem_cgroup = NULL,
-               .nodemask = nodemask,
        };
 
        /*
@@ -2708,17 +2721,14 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
                                                unsigned long *nr_scanned)
 {
        struct scan_control sc = {
-               .nr_scanned = 0,
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
+               .target_mem_cgroup = memcg,
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
                .may_swap = !noswap,
-               .order = 0,
-               .priority = 0,
-               .swappiness = mem_cgroup_swappiness(memcg),
-               .target_mem_cgroup = memcg,
        };
        struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+       int swappiness = mem_cgroup_swappiness(memcg);
 
        sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                        (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2734,7 +2744,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
         * will pick up pages from other mem cgroup's as well. We hack
         * the priority and make it zero.
         */
-       shrink_lruvec(lruvec, &sc);
+       shrink_lruvec(lruvec, swappiness, &sc);
 
        trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
@@ -2750,16 +2760,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
        unsigned long nr_reclaimed;
        int nid;
        struct scan_control sc = {
-               .may_writepage = !laptop_mode,
-               .may_unmap = 1,
-               .may_swap = !noswap,
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
-               .order = 0,
-               .priority = DEF_PRIORITY,
-               .target_mem_cgroup = memcg,
-               .nodemask = NULL, /* we don't care the placement */
                .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                                (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
+               .target_mem_cgroup = memcg,
+               .priority = DEF_PRIORITY,
+               .may_writepage = !laptop_mode,
+               .may_unmap = 1,
+               .may_swap = !noswap,
        };
 
        /*
@@ -3017,12 +3025,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
        unsigned long nr_soft_scanned;
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
+               .order = order,
                .priority = DEF_PRIORITY,
+               .may_writepage = !laptop_mode,
                .may_unmap = 1,
                .may_swap = 1,
-               .may_writepage = !laptop_mode,
-               .order = order,
-               .target_mem_cgroup = NULL,
        };
        count_vm_event(PAGEOUTRUN);
 
@@ -3403,14 +3410,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 {
        struct reclaim_state reclaim_state;
        struct scan_control sc = {
+               .nr_to_reclaim = nr_to_reclaim,
                .gfp_mask = GFP_HIGHUSER_MOVABLE,
-               .may_swap = 1,
-               .may_unmap = 1,
+               .priority = DEF_PRIORITY,
                .may_writepage = 1,
-               .nr_to_reclaim = nr_to_reclaim,
+               .may_unmap = 1,
+               .may_swap = 1,
                .hibernation_mode = 1,
-               .order = 0,
-               .priority = DEF_PRIORITY,
        };
        struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
        struct task_struct *p = current;
@@ -3590,13 +3596,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
        struct task_struct *p = current;
        struct reclaim_state reclaim_state;
        struct scan_control sc = {
-               .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
-               .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
-               .may_swap = 1,
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
                .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
                .order = order,
                .priority = ZONE_RECLAIM_PRIORITY,
+               .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
+               .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+               .may_swap = 1,
        };
        struct shrink_control shrink = {
                .gfp_mask = sc.gfp_mask,