Merge tag 'arm64-perf' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 08547a7..b934223 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -213,7 +213,7 @@ bool zone_reclaimable(struct zone *zone)
                zone_reclaimable_pages(zone) * 6;
 }
 
-static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
        if (!mem_cgroup_disabled())
                return mem_cgroup_get_lru_size(lruvec, lru);
@@ -382,9 +382,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  *
  * @memcg specifies the memory cgroup to target. If it is not NULL,
  * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
- * objects from the memory cgroup specified. Otherwise all shrinkers
- * are called, and memcg aware shrinkers are supposed to scan the
- * global list then.
+ * objects from the memory cgroup specified. Otherwise, only unaware
+ * shrinkers are called.
  *
  * @nr_scanned and @nr_eligible form a ratio that indicates how much of
  * the available objects should be scanned.  Page reclaim for example
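
A note on that ratio: shrinkers are asked to scan slab objects in the same
proportion as page reclaim scanned the LRU lists. Below is a minimal userspace
sketch of the proportionality, assuming only the shape of the do_shrink_slab()
calculation and none of its seek weighting or batching:

#include <stdio.h>

/* Sketch only: models the ratio idea, not the kernel arithmetic. */
static unsigned long slab_scan_target(unsigned long freeable,
                                      unsigned long nr_scanned,
                                      unsigned long nr_eligible)
{
        if (!nr_eligible)
                return 0;
        /* scan the same fraction of slab objects as of LRU pages */
        return freeable * nr_scanned / nr_eligible;
}

int main(void)
{
        /* reclaim scanned 1/8 of the eligible LRU pages... */
        printf("%lu\n", slab_scan_target(4096, 512, 4096)); /* ...so 512 */
        return 0;
}
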
@@ -404,7 +403,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
        struct shrinker *shrinker;
        unsigned long freed = 0;
 
-       if (memcg && !memcg_kmem_online(memcg))
+       if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
                return 0;
 
        if (nr_scanned == 0)
@@ -428,7 +427,13 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
                        .memcg = memcg,
                };
 
-               if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
+               /*
+                * If kernel memory accounting is disabled, we ignore the
+                * SHRINKER_MEMCG_AWARE flag and call all shrinkers,
+                * passing NULL for memcg.
+                */
+               if (memcg_kmem_enabled() &&
+                   !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE))
                        continue;
 
                if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
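
The new skip test is an XOR between "a memcg was passed" and "the shrinker is
memcg-aware", gated on kmem accounting being enabled. A standalone model of
the predicate (illustrative names, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define SHRINKER_MEMCG_AWARE 0x1

static bool skip_shrinker(bool kmem_enabled, bool have_memcg,
                          unsigned int flags)
{
        return kmem_enabled &&
               have_memcg != !!(flags & SHRINKER_MEMCG_AWARE);
}

int main(void)
{
        /* kmem on: the memcg walk skips unaware shrinkers and vice versa */
        printf("%d %d\n", skip_shrinker(true, true, 0),
                          skip_shrinker(true, false, SHRINKER_MEMCG_AWARE));
        /* kmem off: nothing is skipped and memcg stays NULL */
        printf("%d\n", skip_shrinker(false, true, 0));
        return 0;
}
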
@@ -603,12 +608,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                            bool reclaimed)
 {
        unsigned long flags;
-       struct mem_cgroup *memcg;
 
        BUG_ON(!PageLocked(page));
        BUG_ON(mapping != page_mapping(page));
 
-       memcg = lock_page_memcg(page);
        spin_lock_irqsave(&mapping->tree_lock, flags);
        /*
         * The non-racy check for a busy page.
@@ -635,11 +638,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
         * Note that if SetPageDirty is always performed via set_page_dirty,
         * and thus under tree_lock, then this ordering is not required.
         */
-       if (!page_freeze_refs(page, 2))
+       if (!page_ref_freeze(page, 2))
                goto cannot_free;
-       /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+       /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
        if (unlikely(PageDirty(page))) {
-               page_unfreeze_refs(page, 2);
+               page_ref_unfreeze(page, 2);
                goto cannot_free;
        }
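
For readers tracking the rename: page_ref_freeze() succeeds only when the
refcount equals the expected value and atomically drops it to zero, so no new
reference can be taken while the page is torn down; page_ref_unfreeze() backs
out. A hedged C11-atomics sketch of those semantics (the kernel primitives
differ in memory-ordering details and instrumentation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool ref_freeze(atomic_int *count, int expected)
{
        /* the cmpxchg is also the smp_rmb the kernel comment mentions */
        return atomic_compare_exchange_strong(count, &expected, 0);
}

static void ref_unfreeze(atomic_int *count, int value)
{
        atomic_store(count, value);
}

int main(void)
{
        atomic_int refs = 2;    /* page cache + isolation, as in reclaim */

        if (ref_freeze(&refs, 2)) {
                /* e.g. the page turned out to be dirty: back out */
                ref_unfreeze(&refs, 2);
        }
        printf("%d\n", atomic_load(&refs));     /* 2 */
        return 0;
}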
 
@@ -648,7 +651,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                mem_cgroup_swapout(page, swap);
                __delete_from_swap_cache(page);
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
-               unlock_page_memcg(memcg);
                swapcache_free(swap);
        } else {
                void (*freepage)(struct page *);
@@ -674,9 +676,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                if (reclaimed && page_is_file_cache(page) &&
                    !mapping_exiting(mapping) && !dax_mapping(mapping))
                        shadow = workingset_eviction(mapping, page);
-               __delete_from_page_cache(page, shadow, memcg);
+               __delete_from_page_cache(page, shadow);
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
-               unlock_page_memcg(memcg);
 
                if (freepage != NULL)
                        freepage(page);
@@ -686,7 +687,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 
 cannot_free:
        spin_unlock_irqrestore(&mapping->tree_lock, flags);
-       unlock_page_memcg(memcg);
        return 0;
 }
 
@@ -704,7 +704,7 @@ int remove_mapping(struct address_space *mapping, struct page *page)
                 * drops the pagecache ref for us without requiring another
                 * atomic operation.
                 */
-               page_unfreeze_refs(page, 1);
+               page_ref_unfreeze(page, 1);
                return 1;
        }
        return 0;
@@ -1923,8 +1923,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec)
        unsigned long inactive;
        unsigned long active;
 
-       inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
-       active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
+       inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+       active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
 
        return active > inactive;
 }
@@ -2063,7 +2063,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
         * system is under heavy pressure.
         */
        if (!inactive_file_is_low(lruvec) &&
-           get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+           lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
                scan_balance = SCAN_FILE;
                goto out;
        }
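
The "size >> sc->priority" idiom is worth spelling out: at DEF_PRIORITY
(12 in kernels of this era) a list only registers if it holds at least 2^12
pages, and every priority drop doubles the scan target. A tiny sketch of the
progression:

#include <stdio.h>

#define DEF_PRIORITY 12

int main(void)
{
        unsigned long inactive_file = 8192;
        int priority;

        for (priority = DEF_PRIORITY; priority >= 10; priority--)
                printf("prio %d: scan target %lu\n",
                       priority, inactive_file >> priority);
        /* prints 2, 4, 8: deeper priorities ask for more pages */
        return 0;
}
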
@@ -2089,10 +2089,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
         * anon in [0], file in [1]
         */
 
-       anon  = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
-               get_lru_size(lruvec, LRU_INACTIVE_ANON);
-       file  = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
-               get_lru_size(lruvec, LRU_INACTIVE_FILE);
+       anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
+               lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
+       file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
+               lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
 
        spin_lock_irq(&zone->lru_lock);
        if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2130,7 +2130,7 @@ out:
                        unsigned long size;
                        unsigned long scan;
 
-                       size = get_lru_size(lruvec, lru);
+                       size = lruvec_lru_size(lruvec, lru);
                        scan = size >> sc->priority;
 
                        if (!scan && pass && force_scan)
@@ -2973,18 +2973,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
        } while (memcg);
 }
 
-static bool zone_balanced(struct zone *zone, int order,
-                         unsigned long balance_gap, int classzone_idx)
+static bool zone_balanced(struct zone *zone, int order, bool highorder,
+                       unsigned long balance_gap, int classzone_idx)
 {
-       if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
-                                   balance_gap, classzone_idx))
-               return false;
+       unsigned long mark = high_wmark_pages(zone) + balance_gap;
 
-       if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
-                               order, 0, classzone_idx) == COMPACT_SKIPPED)
-               return false;
+       /*
+        * When checking from pgdat_balanced(), kswapd should stop and sleep
+        * when it reaches the high order-0 watermark and let kcompactd take
+        * over. Other callers such as wakeup_kswapd() want to determine the
+        * true high-order watermark.
+        */
+       if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) {
+               mark += (1UL << order);
+               order = 0;
+       }
 
-       return true;
+       return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
 }
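
For the pgdat_balanced() callers the high-order test therefore collapses to
an order-0 watermark boosted by one block of the requested order. A sketch of
just that arithmetic, assuming away zone_watermark_ok_safe()'s per-order
buddy-list accounting:

#include <stdbool.h>
#include <stdio.h>

static bool kswapd_view_balanced(unsigned long free_pages,
                                 unsigned long high_wmark,
                                 unsigned int order)
{
        /* order-0 check against a watermark boosted by 1UL << order */
        return free_pages >= high_wmark + (1UL << order);
}

int main(void)
{
        /* order-3 request against a high watermark of 128 pages */
        printf("%d\n", kswapd_view_balanced(140, 128, 3));      /* 1 */
        printf("%d\n", kswapd_view_balanced(130, 128, 3));      /* 0 */
        return 0;
}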
 
 /*
@@ -3034,7 +3039,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
                        continue;
                }
 
-               if (zone_balanced(zone, order, 0, i))
+               if (zone_balanced(zone, order, false, 0, i))
                        balanced_pages += zone->managed_pages;
                else if (!order)
                        return false;
@@ -3088,27 +3093,14 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
  */
 static bool kswapd_shrink_zone(struct zone *zone,
                               int classzone_idx,
-                              struct scan_control *sc,
-                              unsigned long *nr_attempted)
+                              struct scan_control *sc)
 {
-       int testorder = sc->order;
        unsigned long balance_gap;
        bool lowmem_pressure;
 
        /* Reclaim above the high watermark. */
        sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
 
-       /*
-        * Kswapd reclaims only single pages with compaction enabled. Trying
-        * too hard to reclaim until contiguous free pages have become
-        * available can hurt performance by evicting too much useful data
-        * from memory. Do not reclaim more than needed for compaction.
-        */
-       if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
-                       compaction_suitable(zone, sc->order, 0, classzone_idx)
-                                                       != COMPACT_SKIPPED)
-               testorder = 0;
-
        /*
         * We put equal pressure on every zone, unless one zone has way too
         * many pages free already. The "too many pages" is defined as the
@@ -3123,15 +3115,12 @@ static bool kswapd_shrink_zone(struct zone *zone,
         * reclaim is necessary
         */
        lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
-       if (!lowmem_pressure && zone_balanced(zone, testorder,
+       if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
                                                balance_gap, classzone_idx))
                return true;
 
        shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
 
-       /* Account for the number of pages attempted to reclaim */
-       *nr_attempted += sc->nr_to_reclaim;
-
        clear_bit(ZONE_WRITEBACK, &zone->flags);
 
        /*
@@ -3141,7 +3130,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
         * waits.
         */
        if (zone_reclaimable(zone) &&
-           zone_balanced(zone, testorder, 0, classzone_idx)) {
+           zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
                clear_bit(ZONE_CONGESTED, &zone->flags);
                clear_bit(ZONE_DIRTY, &zone->flags);
        }
@@ -3153,7 +3142,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at high_wmark_pages(zone).
  *
- * Returns the final order kswapd was reclaiming at
+ * Returns the highest zone idx kswapd was reclaiming at
  *
  * There is special handling here for zones which are full of pinned pages.
  * This can happen if the pages are all mlocked, or if they are all used by
@@ -3170,8 +3159,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
  * interoperates with the page allocator fallback scheme to ensure that aging
  * of pages is balanced across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
-                                                       int *classzone_idx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 {
        int i;
        int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
@@ -3188,9 +3176,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
        count_vm_event(PAGEOUTRUN);
 
        do {
-               unsigned long nr_attempted = 0;
                bool raise_priority = true;
-               bool pgdat_needs_compaction = (order > 0);
 
                sc.nr_reclaimed = 0;
 
@@ -3225,7 +3211,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                                break;
                        }
 
-                       if (!zone_balanced(zone, order, 0, 0)) {
+                       if (!zone_balanced(zone, order, false, 0, 0)) {
                                end_zone = i;
                                break;
                        } else {
@@ -3241,24 +3227,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                if (i < 0)
                        goto out;
 
-               for (i = 0; i <= end_zone; i++) {
-                       struct zone *zone = pgdat->node_zones + i;
-
-                       if (!populated_zone(zone))
-                               continue;
-
-                       /*
-                        * If any zone is currently balanced then kswapd will
-                        * not call compaction as it is expected that the
-                        * necessary pages are already available.
-                        */
-                       if (pgdat_needs_compaction &&
-                                       zone_watermark_ok(zone, order,
-                                               low_wmark_pages(zone),
-                                               *classzone_idx, 0))
-                               pgdat_needs_compaction = false;
-               }
-
                /*
                 * If we're having trouble reclaiming, start doing writepage
                 * even in laptop mode.
@@ -3302,8 +3270,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                         * that that high watermark would be met at 100%
                         * efficiency.
                         */
-                       if (kswapd_shrink_zone(zone, end_zone,
-                                              &sc, &nr_attempted))
+                       if (kswapd_shrink_zone(zone, end_zone, &sc))
                                raise_priority = false;
                }
 
@@ -3316,28 +3283,10 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                                pfmemalloc_watermark_ok(pgdat))
                        wake_up_all(&pgdat->pfmemalloc_wait);
 
-               /*
-                * Fragmentation may mean that the system cannot be rebalanced
-                * for high-order allocations in all zones. If twice the
-                * allocation size has been reclaimed and the zones are still
-                * not balanced then recheck the watermarks at order-0 to
-                * prevent kswapd reclaiming excessively. Assume that a
-                * process requested a high-order can direct reclaim/compact.
-                */
-               if (order && sc.nr_reclaimed >= 2UL << order)
-                       order = sc.order = 0;
-
                /* Check if kswapd should be suspending */
                if (try_to_freeze() || kthread_should_stop())
                        break;
 
-               /*
-                * Compact if necessary and kswapd is reclaiming at least the
-                * high watermark number of pages as requsted
-                */
-               if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
-                       compact_pgdat(pgdat, order);
-
                /*
                 * Raise priority if scanning rate is too low or there was no
                 * progress in reclaiming pages
@@ -3345,20 +3294,18 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                if (raise_priority || !sc.nr_reclaimed)
                        sc.priority--;
        } while (sc.priority >= 1 &&
-                !pgdat_balanced(pgdat, order, *classzone_idx));
+                       !pgdat_balanced(pgdat, order, classzone_idx));
 
 out:
        /*
-        * Return the order we were reclaiming at so prepare_kswapd_sleep()
-        * makes a decision on the order we were last reclaiming at. However,
-        * if another caller entered the allocator slow path while kswapd
-        * was awake, order will remain at the higher level
+        * Return the highest zone idx we were reclaiming at so
+        * prepare_kswapd_sleep() makes the same decisions as here.
         */
-       *classzone_idx = end_zone;
-       return order;
+       return end_zone;
 }
 
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
+                               int classzone_idx, int balanced_classzone_idx)
 {
        long remaining = 0;
        DEFINE_WAIT(wait);
@@ -3369,7 +3316,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
        /* Try to sleep for a short interval */
-       if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+       if (prepare_kswapd_sleep(pgdat, order, remaining,
+                                               balanced_classzone_idx)) {
                remaining = schedule_timeout(HZ/10);
                finish_wait(&pgdat->kswapd_wait, &wait);
                prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -3379,7 +3327,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
         * After a short sleep, check if it was a premature sleep. If not, then
         * go fully to sleep until explicitly woken up.
         */
-       if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+       if (prepare_kswapd_sleep(pgdat, order, remaining,
+                                               balanced_classzone_idx)) {
                trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
                /*
@@ -3400,6 +3349,12 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
                 */
                reset_isolation_suitable(pgdat);
 
+               /*
+                * We have freed the memory, now we should compact it to make
+                * allocation of the requested order possible.
+                */
+               wakeup_kcompactd(pgdat, order, classzone_idx);
+
                if (!kthread_should_stop())
                        schedule();
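
The ordering matters here: kswapd resets the isolation hints, wakes kcompactd,
and only then sleeps, so the freshly freed memory is defragmented while kswapd
idles. A hedged POSIX-threads sketch of the handoff (kcompactd's real
interface is per-node state plus a waitqueue, not a condition variable):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  kick = PTHREAD_COND_INITIALIZER;
static int compact_order;       /* 0 means "nothing to do" */

static void *kcompactd_like(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        while (!compact_order)
                pthread_cond_wait(&kick, &lock);
        printf("compacting for order %d\n", compact_order);
        compact_order = 0;
        pthread_mutex_unlock(&lock);
        return NULL;
}

static void wakeup_compactor(int order)
{
        pthread_mutex_lock(&lock);
        compact_order = order;
        pthread_cond_signal(&kick);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, kcompactd_like, NULL);
        wakeup_compactor(3);    /* reclaim done, hand over order-3 work */
        pthread_join(t, NULL);  /* kswapd would now schedule() */
        return 0;
}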
 
@@ -3429,7 +3384,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 static int kswapd(void *p)
 {
        unsigned long order, new_order;
-       unsigned balanced_order;
        int classzone_idx, new_classzone_idx;
        int balanced_classzone_idx;
        pg_data_t *pgdat = (pg_data_t*)p;
@@ -3462,24 +3416,19 @@ static int kswapd(void *p)
        set_freezable();
 
        order = new_order = 0;
-       balanced_order = 0;
        classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
        balanced_classzone_idx = classzone_idx;
        for ( ; ; ) {
                bool ret;
 
                /*
-                * If the last balance_pgdat was unsuccessful it's unlikely a
-                * new request of a similar or harder type will succeed soon
-                * so consider going to sleep on the basis we reclaimed at
+                * While we were reclaiming, there might have been another
+                * wakeup, so check the values.
                 */
-               if (balanced_classzone_idx >= new_classzone_idx &&
-                                       balanced_order == new_order) {
-                       new_order = pgdat->kswapd_max_order;
-                       new_classzone_idx = pgdat->classzone_idx;
-                       pgdat->kswapd_max_order =  0;
-                       pgdat->classzone_idx = pgdat->nr_zones - 1;
-               }
+               new_order = pgdat->kswapd_max_order;
+               new_classzone_idx = pgdat->classzone_idx;
+               pgdat->kswapd_max_order =  0;
+               pgdat->classzone_idx = pgdat->nr_zones - 1;
 
                if (order < new_order || classzone_idx > new_classzone_idx) {
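
The rework treats kswapd_max_order/classzone_idx as a mailbox: each pass
unconditionally consumes whatever requests accumulated while reclaim ran,
rather than guessing whether the previous pass already covered them. A
single-threaded sketch with illustrative names:

#include <stdio.h>

struct node_mailbox {
        int max_order;          /* highest order requested so far */
        int classzone_idx;      /* lowest zone index requested so far */
};

static void post_request(struct node_mailbox *mb, int order, int czidx)
{
        if (order > mb->max_order)
                mb->max_order = order;
        if (czidx < mb->classzone_idx)
                mb->classzone_idx = czidx;
}

static void consume_request(struct node_mailbox *mb, int *order, int *czidx,
                            int nr_zones)
{
        *order = mb->max_order;
        *czidx = mb->classzone_idx;
        mb->max_order = 0;              /* reset for the next wakeup */
        mb->classzone_idx = nr_zones - 1;
}

int main(void)
{
        struct node_mailbox mb = { 0, 3 };      /* idle 4-zone node */
        int order, czidx;

        post_request(&mb, 2, 1);        /* a wakeup arrives mid-reclaim */
        consume_request(&mb, &order, &czidx, 4);
        printf("order %d classzone %d\n", order, czidx);        /* 2 1 */
        return 0;
}
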
                        /*
@@ -3489,7 +3438,7 @@ static int kswapd(void *p)
                        order = new_order;
                        classzone_idx = new_classzone_idx;
                } else {
-                       kswapd_try_to_sleep(pgdat, balanced_order,
+                       kswapd_try_to_sleep(pgdat, order, classzone_idx,
                                                balanced_classzone_idx);
                        order = pgdat->kswapd_max_order;
                        classzone_idx = pgdat->classzone_idx;
@@ -3509,9 +3458,8 @@ static int kswapd(void *p)
                 */
                if (!ret) {
                        trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-                       balanced_classzone_idx = classzone_idx;
-                       balanced_order = balance_pgdat(pgdat, order,
-                                               &balanced_classzone_idx);
+                       balanced_classzone_idx = balance_pgdat(pgdat, order,
+                                                               classzone_idx);
                }
        }
 
@@ -3541,7 +3489,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
        }
        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;
-       if (zone_balanced(zone, order, 0, 0))
+       if (zone_balanced(zone, order, true, 0, 0))
                return;
 
        trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);