Merge tag 'arm64-perf' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 08547a7..b934223 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -213,7 +213,7 @@ bool zone_reclaimable(struct zone *zone)
                zone_reclaimable_pages(zone) * 6;
 }
 
-static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
        if (!mem_cgroup_disabled())
                return mem_cgroup_get_lru_size(lruvec, lru);
@@ -382,9 +382,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  *
  * @memcg specifies the memory cgroup to target. If it is not NULL,
  * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
- * objects from the memory cgroup specified. Otherwise all shrinkers
- * are called, and memcg aware shrinkers are supposed to scan the
- * global list then.
+ * objects from the memory cgroup specified. Otherwise, only unaware
+ * shrinkers are called.
  *
  * @nr_scanned and @nr_eligible form a ratio that indicates how much of
  * the available objects should be scanned.  Page reclaim for example
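
A note on that ratio: shrinkers are asked to scan slab objects in the same
proportion as page reclaim scanned the LRU lists. Below is a minimal userspace
sketch of the proportionality, assuming only the shape of the do_shrink_slab()
calculation and none of its seek weighting or batching:

#include <stdio.h>

/* Sketch only: models the ratio idea, not the kernel arithmetic. */
static unsigned long slab_scan_target(unsigned long freeable,
                                      unsigned long nr_scanned,
                                      unsigned long nr_eligible)
{
        if (!nr_eligible)
                return 0;
        /* scan the same fraction of slab objects as of LRU pages */
        return freeable * nr_scanned / nr_eligible;
}

int main(void)
{
        /* reclaim scanned 1/8 of the eligible LRU pages... */
        printf("%lu\n", slab_scan_target(4096, 512, 4096)); /* ...so 512 */
        return 0;
}
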
@@ -404,7 +403,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
        struct shrinker *shrinker;
        unsigned long freed = 0;
 
-       if (memcg && !memcg_kmem_online(memcg))
+       if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
                return 0;
 
        if (nr_scanned == 0)
@@ -428,7 +427,13 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
                        .memcg = memcg,
                };
 
-               if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
+               /*
+                * If kernel memory accounting is disabled, we ignore the
+                * SHRINKER_MEMCG_AWARE flag and call all shrinkers,
+                * passing NULL for memcg.
+                */
+               if (memcg_kmem_enabled() &&
+                   !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE))
                        continue;
 
                if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
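
The new skip test is an XOR between "a memcg was passed" and "the shrinker is
memcg-aware", gated on kmem accounting being enabled. A standalone model of
the predicate (illustrative names, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define SHRINKER_MEMCG_AWARE 0x1

static bool skip_shrinker(bool kmem_enabled, bool have_memcg,
                          unsigned int flags)
{
        return kmem_enabled &&
               have_memcg != !!(flags & SHRINKER_MEMCG_AWARE);
}

int main(void)
{
        /* kmem on: the memcg walk skips unaware shrinkers and vice versa */
        printf("%d %d\n", skip_shrinker(true, true, 0),
                          skip_shrinker(true, false, SHRINKER_MEMCG_AWARE));
        /* kmem off: nothing is skipped and memcg stays NULL */
        printf("%d\n", skip_shrinker(false, true, 0));
        return 0;
}
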
@@ -603,12 +608,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                            bool reclaimed)
 {
        unsigned long flags;
-       struct mem_cgroup *memcg;
 
        BUG_ON(!PageLocked(page));
        BUG_ON(mapping != page_mapping(page));
 
-       memcg = lock_page_memcg(page);
        spin_lock_irqsave(&mapping->tree_lock, flags);
        /*
         * The non-racy check for a busy page.
@@ -635,11 +638,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
         * Note that if SetPageDirty is always performed via set_page_dirty,
         * and thus under tree_lock, then this ordering is not required.
         */
-       if (!page_freeze_refs(page, 2))
+       if (!page_ref_freeze(page, 2))
                goto cannot_free;
-       /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+       /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
        if (unlikely(PageDirty(page))) {
-               page_unfreeze_refs(page, 2);
+               page_ref_unfreeze(page, 2);
                goto cannot_free;
        }
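
For readers tracking the rename: page_ref_freeze() succeeds only when the
refcount equals the expected value and atomically drops it to zero, so no new
reference can be taken while the page is torn down; page_ref_unfreeze() backs
out. A hedged C11-atomics sketch of those semantics (the kernel primitives
differ in memory-ordering details and instrumentation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool ref_freeze(atomic_int *count, int expected)
{
        /* the cmpxchg is also the smp_rmb the kernel comment mentions */
        return atomic_compare_exchange_strong(count, &expected, 0);
}

static void ref_unfreeze(atomic_int *count, int value)
{
        atomic_store(count, value);
}

int main(void)
{
        atomic_int refs = 2;    /* page cache + isolation, as in reclaim */

        if (ref_freeze(&refs, 2)) {
                /* e.g. the page turned out to be dirty: back out */
                ref_unfreeze(&refs, 2);
        }
        printf("%d\n", atomic_load(&refs));     /* 2 */
        return 0;
}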
 
@@ -648,7 +651,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                mem_cgroup_swapout(page, swap);
                __delete_from_swap_cache(page);
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
-               unlock_page_memcg(memcg);
                swapcache_free(swap);
        } else {
                void (*freepage)(struct page *);
@@ -674,9 +676,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                if (reclaimed && page_is_file_cache(page) &&
                    !mapping_exiting(mapping) && !dax_mapping(mapping))
                        shadow = workingset_eviction(mapping, page);
-               __delete_from_page_cache(page, shadow, memcg);
+               __delete_from_page_cache(page, shadow);
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
-               unlock_page_memcg(memcg);
 
                if (freepage != NULL)
                        freepage(page);
@@ -686,7 +687,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 
 cannot_free:
        spin_unlock_irqrestore(&mapping->tree_lock, flags);
-       unlock_page_memcg(memcg);
        return 0;
 }
 
@@ -704,7 +704,7 @@ int remove_mapping(struct address_space *mapping, struct page *page)
                 * drops the pagecache ref for us without requiring another
                 * atomic operation.
                 */
-               page_unfreeze_refs(page, 1);
+               page_ref_unfreeze(page, 1);
                return 1;
        }
        return 0;
@@ -1923,8 +1923,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec)
        unsigned long inactive;
        unsigned long active;
 
-       inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
-       active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
+       inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+       active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
 
        return active > inactive;
 }
@@ -2063,7 +2063,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
         * system is under heavy pressure.
         */
        if (!inactive_file_is_low(lruvec) &&
-           get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+           lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
                scan_balance = SCAN_FILE;
                goto out;
        }
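
The "size >> sc->priority" idiom is worth spelling out: at DEF_PRIORITY
(12 in kernels of this era) a list only registers if it holds at least 2^12
pages, and every priority drop doubles the scan target. A tiny sketch of the
progression:

#include <stdio.h>

#define DEF_PRIORITY 12

int main(void)
{
        unsigned long inactive_file = 8192;
        int priority;

        for (priority = DEF_PRIORITY; priority >= 10; priority--)
                printf("prio %d: scan target %lu\n",
                       priority, inactive_file >> priority);
        /* prints 2, 4, 8: deeper priorities ask for more pages */
        return 0;
}
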
@@ -2089,10 +2089,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
         * anon in [0], file in [1]
         */
 
-       anon  = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
-               get_lru_size(lruvec, LRU_INACTIVE_ANON);
-       file  = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
-               get_lru_size(lruvec, LRU_INACTIVE_FILE);
+       anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
+               lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
+       file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
+               lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
 
        spin_lock_irq(&zone->lru_lock);
        if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2130,7 +2130,7 @@ out:
                        unsigned long size;
                        unsigned long scan;
 
-                       size = get_lru_size(lruvec, lru);
+                       size = lruvec_lru_size(lruvec, lru);
                        scan = size >> sc->priority;
 
                        if (!scan && pass && force_scan)
@@ -2973,18 +2973,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
        } while (memcg);
 }
 
-static bool zone_balanced(struct zone *zone, int order,
-                         unsigned long balance_gap, int classzone_idx)
+static bool zone_balanced(struct zone *zone, int order, bool highorder,
+                       unsigned long balance_gap, int classzone_idx)
 {
-       if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
-                                   balance_gap, classzone_idx))
-               return false;
+       unsigned long mark = high_wmark_pages(zone) + balance_gap;
 
-       if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
-                               order, 0, classzone_idx) == COMPACT_SKIPPED)
-               return false;
+       /*
+        * When checking from pgdat_balanced(), kswapd should stop and sleep
+        * when it reaches the high order-0 watermark and let kcompactd take
+        * over. Other callers such as wakeup_kswapd() want to determine the
+        * true high-order watermark.
+        */
+       if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) {
+               mark += (1UL << order);
+               order = 0;
+       }
 
-       return true;
+       return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
 }
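
For the pgdat_balanced() callers the high-order test therefore collapses to
an order-0 watermark boosted by one block of the requested order. A sketch of
just that arithmetic, assuming away zone_watermark_ok_safe()'s per-order
buddy-list accounting:

#include <stdbool.h>
#include <stdio.h>

static bool kswapd_view_balanced(unsigned long free_pages,
                                 unsigned long high_wmark,
                                 unsigned int order)
{
        /* order-0 check against a watermark boosted by 1UL << order */
        return free_pages >= high_wmark + (1UL << order);
}

int main(void)
{
        /* order-3 request against a high watermark of 128 pages */
        printf("%d\n", kswapd_view_balanced(140, 128, 3));      /* 1 */
        printf("%d\n", kswapd_view_balanced(130, 128, 3));      /* 0 */
        return 0;
}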
 
 /*
@@ -3034,7 +3039,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
                        continue;
                }
 
-               if (zone_balanced(zone, order, 0, i))
+               if (zone_balanced(zone, order, false, 0, i))
                        balanced_pages += zone->managed_pages;
                else if (!order)
                        return false;
@@ -3088,27 +3093,14 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
  */
 static bool kswapd_shrink_zone(struct zone *zone,
                               int classzone_idx,
-                              struct scan_control *sc,
-                              unsigned long *nr_attempted)
+                              struct scan_control *sc)
 {
-       int testorder = sc->order;
        unsigned long balance_gap;
        bool lowmem_pressure;
 
        /* Reclaim above the high watermark. */
        sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
 
-       /*
-        * Kswapd reclaims only single pages with compaction enabled. Trying
-        * too hard to reclaim until contiguous free pages have become
-        * available can hurt performance by evicting too much useful data
-        * from memory. Do not reclaim more than needed for compaction.
-        */
-       if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
-                       compaction_suitable(zone, sc->order, 0, classzone_idx)
-                                                       != COMPACT_SKIPPED)
-               testorder = 0;
-
        /*
         * We put equal pressure on every zone, unless one zone has way too
         * many pages free already. The "too many pages" is defined as the
@@ -3123,15 +3115,12 @@ static bool kswapd_shrink_zone(struct zone *zone,
         * reclaim is necessary
         */
        lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
-       if (!lowmem_pressure && zone_balanced(zone, testorder,
+       if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
                                                balance_gap, classzone_idx))
                return true;
 
        shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
 
-       /* Account for the number of pages attempted to reclaim */
-       *nr_attempted += sc->nr_to_reclaim;
-
        clear_bit(ZONE_WRITEBACK, &zone->flags);
 
        /*
@@ -3141,7 +3130,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
         * waits.
         */
        if (zone_reclaimable(zone) &&
-           zone_balanced(zone, testorder, 0, classzone_idx)) {
+           zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
                clear_bit(ZONE_CONGESTED, &zone->flags);
                clear_bit(ZONE_DIRTY, &zone->flags);
        }
@@ -3153,7 +3142,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at high_wmark_pages(zone).
  *
- * Returns the final order kswapd was reclaiming at
+ * Returns the highest zone idx kswapd was reclaiming at
  *
  * There is special handling here for zones which are full of pinned pages.
  * This can happen if the pages are all mlocked, or if they are all used by
@@ -3170,8 +3159,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
  * interoperates with the page allocator fallback scheme to ensure that aging
  * of pages is balanced across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
-                                                       int *classzone_idx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 {
        int i;
        int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
@@ -3188,9 +3176,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
        count_vm_event(PAGEOUTRUN);
 
        do {
-               unsigned long nr_attempted = 0;
                bool raise_priority = true;
-               bool pgdat_needs_compaction = (order > 0);
 
                sc.nr_reclaimed = 0;
 
@@ -3225,7 +3211,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                                break;
                        }
 
-                       if (!zone_balanced(zone, order, 0, 0)) {
+                       if (!zone_balanced(zone, order, false, 0, 0)) {
                                end_zone = i;
                                break;
                        } else {
@@ -3241,24 +3227,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                if (i < 0)
                        goto out;
 
-               for (i = 0; i <= end_zone; i++) {
-                       struct zone *zone = pgdat->node_zones + i;
-
-                       if (!populated_zone(zone))
-                               continue;
-
-                       /*
-                        * If any zone is currently balanced then kswapd will
-                        * not call compaction as it is expected that the
-                        * necessary pages are already available.
-                        */
-                       if (pgdat_needs_compaction &&
-                                       zone_watermark_ok(zone, order,
-                                               low_wmark_pages(zone),
-                                               *classzone_idx, 0))
-                               pgdat_needs_compaction = false;
-               }
-
                /*
                 * If we're having trouble reclaiming, start doing writepage
                 * even in laptop mode.
@@ -3302,8 +3270,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                         * that that high watermark would be met at 100%
                         * efficiency.
                         */
-                       if (kswapd_shrink_zone(zone, end_zone,
-                                              &sc, &nr_attempted))
+                       if (kswapd_shrink_zone(zone, end_zone, &sc))
                                raise_priority = false;
                }
 
@@ -3316,28 +3283,10 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                                pfmemalloc_watermark_ok(pgdat))
                        wake_up_all(&pgdat->pfmemalloc_wait);
 
-               /*
-                * Fragmentation may mean that the system cannot be rebalanced
-                * for high-order allocations in all zones. If twice the
-                * allocation size has been reclaimed and the zones are still
-                * not balanced then recheck the watermarks at order-0 to
-                * prevent kswapd reclaiming excessively. Assume that a
-                * process requested a high-order can direct reclaim/compact.
-                */
-               if (order && sc.nr_reclaimed >= 2UL << order)
-                       order = sc.order = 0;
-
                /* Check if kswapd should be suspending */
                if (try_to_freeze() || kthread_should_stop())
                        break;
 
-               /*
-                * Compact if necessary and kswapd is reclaiming at least the
-                * high watermark number of pages as requsted
-                */
-               if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
-                       compact_pgdat(pgdat, order);
-
                /*
                 * Raise priority if scanning rate is too low or there was no
                 * progress in reclaiming pages
@@ -3345,20 +3294,18 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                if (raise_priority || !sc.nr_reclaimed)
                        sc.priority--;
        } while (sc.priority >= 1 &&
-                !pgdat_balanced(pgdat, order, *classzone_idx));
+                       !pgdat_balanced(pgdat, order, classzone_idx));
 
 out:
        /*
-        * Return the order we were reclaiming at so prepare_kswapd_sleep()
-        * makes a decision on the order we were last reclaiming at. However,
-        * if another caller entered the allocator slow path while kswapd
-        * was awake, order will remain at the higher level
+        * Return the highest zone idx we were reclaiming at so
+        * prepare_kswapd_sleep() makes the same decisions as here.
         */
-       *classzone_idx = end_zone;
-       return order;
+       return end_zone;
 }
 
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
+                               int classzone_idx, int balanced_classzone_idx)
 {
        long remaining = 0;
        DEFINE_WAIT(wait);
@@ -3369,7 +3316,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
        /* Try to sleep for a short interval */
-       if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+       if (prepare_kswapd_sleep(pgdat, order, remaining,
+                                               balanced_classzone_idx)) {
                remaining = schedule_timeout(HZ/10);
                finish_wait(&pgdat->kswapd_wait, &wait);
                prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -3379,7 +3327,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
         * After a short sleep, check if it was a premature sleep. If not, then
         * go fully to sleep until explicitly woken up.
         */
-       if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+       if (prepare_kswapd_sleep(pgdat, order, remaining,
+                                               balanced_classzone_idx)) {
                trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
                /*
@@ -3400,6 +3349,12 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
                 */
                reset_isolation_suitable(pgdat);
 
+               /*
+                * We have freed the memory, now we should compact it to make
+                * allocation of the requested order possible.
+                */
+               wakeup_kcompactd(pgdat, order, classzone_idx);
+
                if (!kthread_should_stop())
                        schedule();
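
The ordering matters here: kswapd resets the isolation hints, wakes kcompactd,
and only then sleeps, so the freshly freed memory is defragmented while kswapd
idles. A hedged POSIX-threads sketch of the handoff (kcompactd's real
interface is per-node state plus a waitqueue, not a condition variable):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  kick = PTHREAD_COND_INITIALIZER;
static int compact_order;       /* 0 means "nothing to do" */

static void *kcompactd_like(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        while (!compact_order)
                pthread_cond_wait(&kick, &lock);
        printf("compacting for order %d\n", compact_order);
        compact_order = 0;
        pthread_mutex_unlock(&lock);
        return NULL;
}

static void wakeup_compactor(int order)
{
        pthread_mutex_lock(&lock);
        compact_order = order;
        pthread_cond_signal(&kick);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, kcompactd_like, NULL);
        wakeup_compactor(3);    /* reclaim done, hand over order-3 work */
        pthread_join(t, NULL);  /* kswapd would now schedule() */
        return 0;
}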
 
@@ -3429,7 +3384,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 static int kswapd(void *p)
 {
        unsigned long order, new_order;
-       unsigned balanced_order;
        int classzone_idx, new_classzone_idx;
        int balanced_classzone_idx;
        pg_data_t *pgdat = (pg_data_t*)p;
@@ -3462,24 +3416,19 @@ static int kswapd(void *p)
        set_freezable();
 
        order = new_order = 0;
-       balanced_order = 0;
        classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
        balanced_classzone_idx = classzone_idx;
        for ( ; ; ) {
                bool ret;
 
                /*
-                * If the last balance_pgdat was unsuccessful it's unlikely a
-                * new request of a similar or harder type will succeed soon
-                * so consider going to sleep on the basis we reclaimed at
+                * While we were reclaiming, there might have been another
+                * wakeup, so check the values.
                 */
-               if (balanced_classzone_idx >= new_classzone_idx &&
-                                       balanced_order == new_order) {
-                       new_order = pgdat->kswapd_max_order;
-                       new_classzone_idx = pgdat->classzone_idx;
-                       pgdat->kswapd_max_order =  0;
-                       pgdat->classzone_idx = pgdat->nr_zones - 1;
-               }
+               new_order = pgdat->kswapd_max_order;
+               new_classzone_idx = pgdat->classzone_idx;
+               pgdat->kswapd_max_order =  0;
+               pgdat->classzone_idx = pgdat->nr_zones - 1;
 
                if (order < new_order || classzone_idx > new_classzone_idx) {
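
The rework treats kswapd_max_order/classzone_idx as a mailbox: each pass
unconditionally consumes whatever requests accumulated while reclaim ran,
rather than guessing whether the previous pass already covered them. A
single-threaded sketch with illustrative names:

#include <stdio.h>

struct node_mailbox {
        int max_order;          /* highest order requested so far */
        int classzone_idx;      /* lowest zone index requested so far */
};

static void post_request(struct node_mailbox *mb, int order, int czidx)
{
        if (order > mb->max_order)
                mb->max_order = order;
        if (czidx < mb->classzone_idx)
                mb->classzone_idx = czidx;
}

static void consume_request(struct node_mailbox *mb, int *order, int *czidx,
                            int nr_zones)
{
        *order = mb->max_order;
        *czidx = mb->classzone_idx;
        mb->max_order = 0;              /* reset for the next wakeup */
        mb->classzone_idx = nr_zones - 1;
}

int main(void)
{
        struct node_mailbox mb = { 0, 3 };      /* idle 4-zone node */
        int order, czidx;

        post_request(&mb, 2, 1);        /* a wakeup arrives mid-reclaim */
        consume_request(&mb, &order, &czidx, 4);
        printf("order %d classzone %d\n", order, czidx);        /* 2 1 */
        return 0;
}
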
                        /*
@@ -3489,7 +3438,7 @@ static int kswapd(void *p)
                        order = new_order;
                        classzone_idx = new_classzone_idx;
                } else {
-                       kswapd_try_to_sleep(pgdat, balanced_order,
+                       kswapd_try_to_sleep(pgdat, order, classzone_idx,
                                                balanced_classzone_idx);
                        order = pgdat->kswapd_max_order;
                        classzone_idx = pgdat->classzone_idx;
@@ -3509,9 +3458,8 @@ static int kswapd(void *p)
                 */
                if (!ret) {
                        trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-                       balanced_classzone_idx = classzone_idx;
-                       balanced_order = balance_pgdat(pgdat, order,
-                                               &balanced_classzone_idx);
+                       balanced_classzone_idx = balance_pgdat(pgdat, order,
+                                                               classzone_idx);
                }
        }
 
@@ -3541,7 +3489,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
        }
        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;
-       if (zone_balanced(zone, order, 0, 0))
+       if (zone_balanced(zone, order, true, 0, 0))
                return;
 
        trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);