diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2cff0d4..beb3577 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -146,6 +146,25 @@ static bool global_reclaim(struct scan_control *sc)
 }
 #endif
 
+unsigned long zone_reclaimable_pages(struct zone *zone)
+{
+       int nr;
+
+       nr = zone_page_state(zone, NR_ACTIVE_FILE) +
+            zone_page_state(zone, NR_INACTIVE_FILE);
+
+       if (get_nr_swap_pages() > 0)
+               nr += zone_page_state(zone, NR_ACTIVE_ANON) +
+                     zone_page_state(zone, NR_INACTIVE_ANON);
+
+       return nr;
+}
+
+bool zone_reclaimable(struct zone *zone)
+{
+       return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+}
+
 static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
        if (!mem_cgroup_disabled())
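
[Illustration only, not part of the patch: the new zone_reclaimable() helper is what the zone->all_unreclaimable tests further down in this diff are converted to use. A zone counts as reclaimable while fewer than six times its reclaimable LRU pages have been scanned, i.e. it is only written off after roughly six fruitless sweeps of its LRUs. A minimal standalone sketch of the same heuristic, with invented counter names:]

    #include <stdbool.h>

    /* Toy stand-ins for the zone counters used by zone_reclaimable() above. */
    struct toy_zone {
        unsigned long pages_scanned;     /* pages scanned since the last reclaim success */
        unsigned long reclaimable_pages; /* file LRU pages, plus anon when swap is available */
    };

    /* Same shape as the check above: give up only after ~6 full LRU sweeps. */
    static bool toy_zone_reclaimable(const struct toy_zone *z)
    {
        return z->pages_scanned < z->reclaimable_pages * 6;
    }
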
@@ -155,14 +174,31 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 }
 
 /*
- * Add a shrinker callback to be called from the vm
+ * Add a shrinker callback to be called from the vm.
  */
-void register_shrinker(struct shrinker *shrinker)
+int register_shrinker(struct shrinker *shrinker)
 {
-       atomic_long_set(&shrinker->nr_in_batch, 0);
+       size_t size = sizeof(*shrinker->nr_deferred);
+
+       /*
+        * If we only have one possible node in the system anyway, save
+        * ourselves the trouble and disable NUMA aware behavior. This way we
+        * will save memory and some small loop time later.
+        */
+       if (nr_node_ids == 1)
+               shrinker->flags &= ~SHRINKER_NUMA_AWARE;
+
+       if (shrinker->flags & SHRINKER_NUMA_AWARE)
+               size *= nr_node_ids;
+
+       shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
+       if (!shrinker->nr_deferred)
+               return -ENOMEM;
+
        down_write(&shrinker_rwsem);
        list_add_tail(&shrinker->list, &shrinker_list);
        up_write(&shrinker_rwsem);
+       return 0;
 }
 EXPORT_SYMBOL(register_shrinker);
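
[Illustration only, not part of the patch: register_shrinker() now allocates the nr_deferred counter array (one slot per node for SHRINKER_NUMA_AWARE shrinkers) and therefore returns an int, so callers must check for -ENOMEM. A hedged sketch of a caller using the count_objects/scan_objects interface after this change; the demo cache and its atomic counter are invented for the example:]

    #include <linux/kernel.h>
    #include <linux/module.h>
    #include <linux/shrinker.h>
    #include <linux/atomic.h>

    /* Invented stand-in for a real object cache. */
    static atomic_long_t demo_nr_objects = ATOMIC_LONG_INIT(0);

    static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
    {
        return atomic_long_read(&demo_nr_objects);
    }

    static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
    {
        long nr = min_t(long, sc->nr_to_scan,
                        atomic_long_read(&demo_nr_objects));

        if (nr <= 0)
            return SHRINK_STOP;
        atomic_long_sub(nr, &demo_nr_objects);
        return nr;              /* objects actually freed by this call */
    }

    static struct shrinker demo_shrinker = {
        .count_objects = demo_count,
        .scan_objects  = demo_scan,
        .seeks         = DEFAULT_SEEKS,
        .flags         = SHRINKER_NUMA_AWARE, /* one nr_deferred slot per node */
    };

    static int __init demo_init(void)
    {
        /* register_shrinker() can now fail with -ENOMEM. */
        return register_shrinker(&demo_shrinker);
    }
    module_init(demo_init);

[A matching module_exit handler would call unregister_shrinker(&demo_shrinker) before the module goes away.]
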
 
@@ -177,15 +213,102 @@ void unregister_shrinker(struct shrinker *shrinker)
 }
 EXPORT_SYMBOL(unregister_shrinker);
 
-static inline int do_shrinker_shrink(struct shrinker *shrinker,
-                                    struct shrink_control *sc,
-                                    unsigned long nr_to_scan)
-{
-       sc->nr_to_scan = nr_to_scan;
-       return (*shrinker->shrink)(shrinker, sc);
+#define SHRINK_BATCH 128
+
+static unsigned long
+shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+                unsigned long nr_pages_scanned, unsigned long lru_pages)
+{
+       unsigned long freed = 0;
+       unsigned long long delta;
+       long total_scan;
+       long max_pass;
+       long nr;
+       long new_nr;
+       int nid = shrinkctl->nid;
+       long batch_size = shrinker->batch ? shrinker->batch
+                                         : SHRINK_BATCH;
+
+       max_pass = shrinker->count_objects(shrinker, shrinkctl);
+       if (max_pass == 0)
+               return 0;
+
+       /*
+        * copy the current shrinker scan count into a local variable
+        * and zero it so that other concurrent shrinker invocations
+        * don't also do this scanning work.
+        */
+       nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+
+       total_scan = nr;
+       delta = (4 * nr_pages_scanned) / shrinker->seeks;
+       delta *= max_pass;
+       do_div(delta, lru_pages + 1);
+       total_scan += delta;
+       if (total_scan < 0) {
+               printk(KERN_ERR
+               "shrink_slab: %pF negative objects to delete nr=%ld\n",
+                      shrinker->scan_objects, total_scan);
+               total_scan = max_pass;
+       }
+
+       /*
+        * We need to avoid excessive windup on filesystem shrinkers
+        * due to large numbers of GFP_NOFS allocations causing the
+        * shrinkers to return -1 all the time. This results in a large
+        * nr being built up so when a shrink that can do some work
+        * comes along it empties the entire cache due to nr >>>
+        * max_pass.  This is bad for sustaining a working set in
+        * memory.
+        *
+        * Hence only allow the shrinker to scan the entire cache when
+        * a large delta change is calculated directly.
+        */
+       if (delta < max_pass / 4)
+               total_scan = min(total_scan, max_pass / 2);
+
+       /*
+        * Avoid risking looping forever due to too large nr value:
+        * never try to free more than twice the estimated number of
+        * freeable entries.
+        */
+       if (total_scan > max_pass * 2)
+               total_scan = max_pass * 2;
+
+       trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+                               nr_pages_scanned, lru_pages,
+                               max_pass, delta, total_scan);
+
+       while (total_scan >= batch_size) {
+               unsigned long ret;
+
+               shrinkctl->nr_to_scan = batch_size;
+               ret = shrinker->scan_objects(shrinker, shrinkctl);
+               if (ret == SHRINK_STOP)
+                       break;
+               freed += ret;
+
+               count_vm_events(SLABS_SCANNED, batch_size);
+               total_scan -= batch_size;
+
+               cond_resched();
+       }
+
+       /*
+        * move the unused scan count back into the shrinker in a
+        * manner that handles concurrent updates. If we exhausted the
+        * scan, there is no need to do an update.
+        */
+       if (total_scan > 0)
+               new_nr = atomic_long_add_return(total_scan,
+                                               &shrinker->nr_deferred[nid]);
+       else
+               new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+
+       trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+       return freed;
 }
 
-#define SHRINK_BATCH 128
 /*
  * Call the shrink functions to age shrinkable caches
  *
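
[Illustration only, not part of the patch: the pressure calculation in shrink_slab_node() above asks a cache to scan a number of objects proportional to the share of LRU pages just scanned, scaled down by the shrinker's seeks value. A small worked example of that arithmetic with made-up numbers, as plain userspace C:]

    #include <stdio.h>

    /*
     * Worked example of the delta computation in shrink_slab_node():
     * 1024 of 65536 LRU pages were scanned, the cache reports 8192
     * freeable objects (max_pass) and uses seeks == 2.
     */
    int main(void)
    {
        unsigned long long nr_pages_scanned = 1024, lru_pages = 65536;
        unsigned long long max_pass = 8192, seeks = 2;
        unsigned long long delta;

        delta = (4 * nr_pages_scanned) / seeks; /* 2048 */
        delta *= max_pass;                      /* 16777216 */
        delta /= lru_pages + 1;                 /* ~255 objects to ask for */

        printf("scan roughly %llu objects from this cache\n", delta);
        return 0;
    }

[On top of this, work deferred from earlier invocations that could not make progress (nr_deferred, e.g. built up under GFP_NOFS) is added in, and the windup checks above cap the total at twice the number of freeable objects.]
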
@@ -205,115 +328,45 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
  *
  * Returns the number of slab objects which we shrunk.
  */
-unsigned long shrink_slab(struct shrink_control *shrink,
+unsigned long shrink_slab(struct shrink_control *shrinkctl,
                          unsigned long nr_pages_scanned,
                          unsigned long lru_pages)
 {
        struct shrinker *shrinker;
-       unsigned long ret = 0;
+       unsigned long freed = 0;
 
        if (nr_pages_scanned == 0)
                nr_pages_scanned = SWAP_CLUSTER_MAX;
 
        if (!down_read_trylock(&shrinker_rwsem)) {
-               /* Assume we'll be able to shrink next time */
-               ret = 1;
+               /*
+                * If we would return 0, our callers would understand that we
+                * have nothing else to shrink and give up trying. By returning
+                * 1 we keep it going and assume we'll be able to shrink next
+                * time.
+                */
+               freed = 1;
                goto out;
        }
 
        list_for_each_entry(shrinker, &shrinker_list, list) {
-               unsigned long long delta;
-               long total_scan;
-               long max_pass;
-               int shrink_ret = 0;
-               long nr;
-               long new_nr;
-               long batch_size = shrinker->batch ? shrinker->batch
-                                                 : SHRINK_BATCH;
-
-               max_pass = do_shrinker_shrink(shrinker, shrink, 0);
-               if (max_pass <= 0)
-                       continue;
-
-               /*
-                * copy the current shrinker scan count into a local variable
-                * and zero it so that other concurrent shrinker invocations
-                * don't also do this scanning work.
-                */
-               nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
-
-               total_scan = nr;
-               delta = (4 * nr_pages_scanned) / shrinker->seeks;
-               delta *= max_pass;
-               do_div(delta, lru_pages + 1);
-               total_scan += delta;
-               if (total_scan < 0) {
-                       printk(KERN_ERR "shrink_slab: %pF negative objects to "
-                              "delete nr=%ld\n",
-                              shrinker->shrink, total_scan);
-                       total_scan = max_pass;
-               }
-
-               /*
-                * We need to avoid excessive windup on filesystem shrinkers
-                * due to large numbers of GFP_NOFS allocations causing the
-                * shrinkers to return -1 all the time. This results in a large
-                * nr being built up so when a shrink that can do some work
-                * comes along it empties the entire cache due to nr >>>
-                * max_pass.  This is bad for sustaining a working set in
-                * memory.
-                *
-                * Hence only allow the shrinker to scan the entire cache when
-                * a large delta change is calculated directly.
-                */
-               if (delta < max_pass / 4)
-                       total_scan = min(total_scan, max_pass / 2);
-
-               /*
-                * Avoid risking looping forever due to too large nr value:
-                * never try to free more than twice the estimated number of
-                * freeable entries.
-                */
-               if (total_scan > max_pass * 2)
-                       total_scan = max_pass * 2;
-
-               trace_mm_shrink_slab_start(shrinker, shrink, nr,
-                                       nr_pages_scanned, lru_pages,
-                                       max_pass, delta, total_scan);
-
-               while (total_scan >= batch_size) {
-                       int nr_before;
+               for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+                       if (!node_online(shrinkctl->nid))
+                               continue;
 
-                       nr_before = do_shrinker_shrink(shrinker, shrink, 0);
-                       shrink_ret = do_shrinker_shrink(shrinker, shrink,
-                                                       batch_size);
-                       if (shrink_ret == -1)
+                       if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
+                           (shrinkctl->nid != 0))
                                break;
-                       if (shrink_ret < nr_before)
-                               ret += nr_before - shrink_ret;
-                       count_vm_events(SLABS_SCANNED, batch_size);
-                       total_scan -= batch_size;
 
-                       cond_resched();
-               }
+                       freed += shrink_slab_node(shrinkctl, shrinker,
+                                nr_pages_scanned, lru_pages);
 
-               /*
-                * move the unused scan count back into the shrinker in a
-                * manner that handles concurrent updates. If we exhausted the
-                * scan, there is no need to do an update.
-                */
-               if (total_scan > 0)
-                       new_nr = atomic_long_add_return(total_scan,
-                                       &shrinker->nr_in_batch);
-               else
-                       new_nr = atomic_long_read(&shrinker->nr_in_batch);
-
-               trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
+               }
        }
        up_read(&shrinker_rwsem);
 out:
        cond_resched();
-       return ret;
+       return freed;
 }
 
 static inline int is_page_cache_freeable(struct page *page)
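
[Illustration only, not part of the patch: with the per-node loop in the new shrink_slab(), callers are expected to populate shrink_control->nodes_to_scan with the nodes they are reclaiming from; shrinkers that do not set SHRINKER_NUMA_AWARE are only called for the nid == 0 pass. The later hunks in this diff update do_try_to_free_pages(), kswapd_shrink_zone() and __zone_reclaim() to do exactly that; condensed into one sketch as it might look inside mm/vmscan.c:]

    /* Condensed from the call sites changed later in this diff. */
    static unsigned long demo_shrink_zone_slab(struct zone *zone,
                                               struct shrink_control *shrink,
                                               unsigned long nr_scanned)
    {
        unsigned long lru_pages = zone_reclaimable_pages(zone);

        nodes_clear(shrink->nodes_to_scan);
        node_set(zone_to_nid(zone), shrink->nodes_to_scan);

        return shrink_slab(shrink, nr_scanned, lru_pages);
    }
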
@@ -545,7 +598,7 @@ int remove_mapping(struct address_space *mapping, struct page *page)
  */
 void putback_lru_page(struct page *page)
 {
-       int lru;
+       bool is_unevictable;
        int was_unevictable = PageUnevictable(page);
 
        VM_BUG_ON(PageLRU(page));
@@ -560,14 +613,14 @@ redo:
                 * unevictable page on [in]active list.
                 * We know how to handle that.
                 */
-               lru = page_lru_base_type(page);
+               is_unevictable = false;
                lru_cache_add(page);
        } else {
                /*
                 * Put unevictable pages directly on zone's unevictable
                 * list.
                 */
-               lru = LRU_UNEVICTABLE;
+               is_unevictable = true;
                add_page_to_unevictable_list(page);
                /*
                 * When racing with an mlock or AS_UNEVICTABLE clearing
@@ -587,7 +640,7 @@ redo:
         * page is on the unevictable list, it will never be freed. To avoid that,
         * check after we added it to the list, again.
         */
-       if (lru == LRU_UNEVICTABLE && page_evictable(page)) {
+       if (is_unevictable && page_evictable(page)) {
                if (!isolate_lru_page(page)) {
                        put_page(page);
                        goto redo;
@@ -598,9 +651,9 @@ redo:
                 */
        }
 
-       if (was_unevictable && lru != LRU_UNEVICTABLE)
+       if (was_unevictable && !is_unevictable)
                count_vm_event(UNEVICTABLE_PGRESCUED);
-       else if (!was_unevictable && lru == LRU_UNEVICTABLE)
+       else if (!was_unevictable && is_unevictable)
                count_vm_event(UNEVICTABLE_PGCULLED);
 
        put_page(page);         /* drop ref from isolate */
@@ -1789,7 +1842,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
         * latencies, so it's better to scan a minimum amount there as
         * well.
         */
-       if (current_is_kswapd() && zone->all_unreclaimable)
+       if (current_is_kswapd() && !zone_reclaimable(zone))
                force_scan = true;
        if (!global_reclaim(sc))
                force_scan = true;
@@ -2244,8 +2297,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                if (global_reclaim(sc)) {
                        if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                                continue;
-                       if (zone->all_unreclaimable &&
-                                       sc->priority != DEF_PRIORITY)
+                       if (sc->priority != DEF_PRIORITY &&
+                           !zone_reclaimable(zone))
                                continue;       /* Let kswapd poll it */
                        if (IS_ENABLED(CONFIG_COMPACTION)) {
                                /*
@@ -2283,11 +2336,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
        return aborted_reclaim;
 }
 
-static bool zone_reclaimable(struct zone *zone)
-{
-       return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
-}
-
 /* All zones in zonelist are unreclaimable? */
 static bool all_unreclaimable(struct zonelist *zonelist,
                struct scan_control *sc)
@@ -2301,7 +2349,7 @@ static bool all_unreclaimable(struct zonelist *zonelist,
                        continue;
                if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                        continue;
-               if (!zone->all_unreclaimable)
+               if (zone_reclaimable(zone))
                        return false;
        }
 
@@ -2354,12 +2402,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                 */
                if (global_reclaim(sc)) {
                        unsigned long lru_pages = 0;
+
+                       nodes_clear(shrink->nodes_to_scan);
                        for_each_zone_zonelist(zone, z, zonelist,
                                        gfp_zone(sc->gfp_mask)) {
                                if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                                        continue;
 
                                lru_pages += zone_reclaimable_pages(zone);
+                               node_set(zone_to_nid(zone),
+                                        shrink->nodes_to_scan);
                        }
 
                        shrink_slab(shrink, sc->nr_scanned, lru_pages);
@@ -2712,7 +2764,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
                 * DEF_PRIORITY. Effectively, it considers them balanced so
                 * they must be considered balanced here as well!
                 */
-               if (zone->all_unreclaimable) {
+               if (!zone_reclaimable(zone)) {
                        balanced_pages += zone->managed_pages;
                        continue;
                }
@@ -2773,7 +2825,6 @@ static bool kswapd_shrink_zone(struct zone *zone,
                               unsigned long lru_pages,
                               unsigned long *nr_attempted)
 {
-       unsigned long nr_slab;
        int testorder = sc->order;
        unsigned long balance_gap;
        struct reclaim_state *reclaim_state = current->reclaim_state;
@@ -2816,17 +2867,16 @@ static bool kswapd_shrink_zone(struct zone *zone,
                return true;
 
        shrink_zone(zone, sc);
+       nodes_clear(shrink.nodes_to_scan);
+       node_set(zone_to_nid(zone), shrink.nodes_to_scan);
 
        reclaim_state->reclaimed_slab = 0;
-       nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
+       shrink_slab(&shrink, sc->nr_scanned, lru_pages);
        sc->nr_reclaimed += reclaim_state->reclaimed_slab;
 
        /* Account for the number of pages attempted to reclaim */
        *nr_attempted += sc->nr_to_reclaim;
 
-       if (nr_slab == 0 && !zone_reclaimable(zone))
-               zone->all_unreclaimable = 1;
-
        zone_clear_flag(zone, ZONE_WRITEBACK);
 
        /*
@@ -2835,7 +2885,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
         * BDIs but as pressure is relieved, speculatively avoid congestion
         * waits.
         */
-       if (!zone->all_unreclaimable &&
+       if (zone_reclaimable(zone) &&
            zone_balanced(zone, testorder, 0, classzone_idx)) {
                zone_clear_flag(zone, ZONE_CONGESTED);
                zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
@@ -2901,8 +2951,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                        if (!populated_zone(zone))
                                continue;
 
-                       if (zone->all_unreclaimable &&
-                           sc.priority != DEF_PRIORITY)
+                       if (sc.priority != DEF_PRIORITY &&
+                           !zone_reclaimable(zone))
                                continue;
 
                        /*
@@ -2980,8 +3030,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                        if (!populated_zone(zone))
                                continue;
 
-                       if (zone->all_unreclaimable &&
-                           sc.priority != DEF_PRIORITY)
+                       if (sc.priority != DEF_PRIORITY &&
+                           !zone_reclaimable(zone))
                                continue;
 
                        sc.nr_scanned = 0;
@@ -3237,7 +3287,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
        }
        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;
-       if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+       if (zone_balanced(zone, order, 0, 0))
                return;
 
        trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
@@ -3265,20 +3315,6 @@ unsigned long global_reclaimable_pages(void)
        return nr;
 }
 
-unsigned long zone_reclaimable_pages(struct zone *zone)
-{
-       int nr;
-
-       nr = zone_page_state(zone, NR_ACTIVE_FILE) +
-            zone_page_state(zone, NR_INACTIVE_FILE);
-
-       if (get_nr_swap_pages() > 0)
-               nr += zone_page_state(zone, NR_ACTIVE_ANON) +
-                     zone_page_state(zone, NR_INACTIVE_ANON);
-
-       return nr;
-}
-
 #ifdef CONFIG_HIBERNATION
 /*
  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
@@ -3524,10 +3560,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                 * number of slab pages and shake the slab until it is reduced
                 * by the same nr_pages that we used for reclaiming unmapped
                 * pages.
-                *
-                * Note that shrink_slab will free memory on all zones and may
-                * take a long time.
                 */
+               nodes_clear(shrink.nodes_to_scan);
+               node_set(zone_to_nid(zone), shrink.nodes_to_scan);
                for (;;) {
                        unsigned long lru_pages = zone_reclaimable_pages(zone);
 
@@ -3576,7 +3611,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
            zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
                return ZONE_RECLAIM_FULL;
 
-       if (zone->all_unreclaimable)
+       if (!zone_reclaimable(zone))
                return ZONE_RECLAIM_FULL;
 
        /*