Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d0e3d2f..736d8e1 100644
@@ -53,8 +53,6 @@
 #include <linux/kmemleak.h>
 #include <linux/compaction.h>
 #include <trace/events/kmem.h>
-#include <linux/ftrace_event.h>
-#include <linux/memcontrol.h>
 #include <linux/prefetch.h>
 #include <linux/mm_inline.h>
 #include <linux/migrate.h>
@@ -85,6 +83,7 @@ EXPORT_PER_CPU_SYMBOL(numa_node);
  */
 DEFINE_PER_CPU(int, _numa_mem_);               /* Kernel "local memory" node */
 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
+int _node_numa_mem_[MAX_NUMNODES];
 #endif
 
 /*
@@ -1014,7 +1013,7 @@ int move_freepages(struct zone *zone,
         * Remove at a later date when no bug reports exist related to
         * grouping pages by mobility
         */
-       BUG_ON(page_zone(start_page) != page_zone(end_page));
+       VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
 #endif
 
        for (page = start_page; page <= end_page;) {
@@ -1612,9 +1611,9 @@ again:
        }
 
        __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
-       if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
-           !zone_is_fair_depleted(zone))
-               zone_set_flag(zone, ZONE_FAIR_DEPLETED);
+       if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
+           !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
+               set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
 
        __count_zone_vm_events(PGALLOC, zone, 1 << order);
        zone_statistics(preferred_zone, zone, gfp_flags);
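
The hunk above drops the zone_is_fair_depleted()/zone_set_flag() wrappers in favour of generic bitops on zone->flags, and reads the NR_ALLOC_BATCH counter directly with a <= 0 test: the batch has just been decremented by 1 << order, so a high-order allocation can drive it below zero and an == 0 comparison would miss the depletion. A minimal sketch of the same check, factored into a hypothetical helper purely for illustration:

/*
 * Illustration only; the diff open-codes this in the allocation fast path.
 * Assumes zone->flags is an unsigned long bitmap and
 * zone->vm_stat[NR_ALLOC_BATCH] is the per-zone fairness batch counter,
 * as elsewhere in this diff.
 */
static inline void zone_mark_fair_depleted(struct zone *zone)
{
	if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
	    !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
		set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
}
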
@@ -1934,7 +1933,7 @@ static void reset_alloc_batches(struct zone *preferred_zone)
                mod_zone_page_state(zone, NR_ALLOC_BATCH,
                        high_wmark_pages(zone) - low_wmark_pages(zone) -
                        atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-               zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
+               clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
        } while (zone++ != preferred_zone);
 }
 
@@ -1985,7 +1984,7 @@ zonelist_scan:
                if (alloc_flags & ALLOC_FAIR) {
                        if (!zone_local(preferred_zone, zone))
                                break;
-                       if (zone_is_fair_depleted(zone)) {
+                       if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
                                nr_fair_skipped++;
                                continue;
                        }
@@ -2296,58 +2295,72 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
        int classzone_idx, int migratetype, enum migrate_mode mode,
-       bool *contended_compaction, bool *deferred_compaction,
-       unsigned long *did_some_progress)
+       int *contended_compaction, bool *deferred_compaction)
 {
-       if (!order)
-               return NULL;
+       struct zone *last_compact_zone = NULL;
+       unsigned long compact_result;
+       struct page *page;
 
-       if (compaction_deferred(preferred_zone, order)) {
-               *deferred_compaction = true;
+       if (!order)
                return NULL;
-       }
 
        current->flags |= PF_MEMALLOC;
-       *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+       compact_result = try_to_compact_pages(zonelist, order, gfp_mask,
                                                nodemask, mode,
-                                               contended_compaction);
+                                               contended_compaction,
+                                               &last_compact_zone);
        current->flags &= ~PF_MEMALLOC;
 
-       if (*did_some_progress != COMPACT_SKIPPED) {
-               struct page *page;
+       switch (compact_result) {
+       case COMPACT_DEFERRED:
+               *deferred_compaction = true;
+               /* fall-through */
+       case COMPACT_SKIPPED:
+               return NULL;
+       default:
+               break;
+       }
 
-               /* Page migration frees to the PCP lists but we want merging */
-               drain_pages(get_cpu());
-               put_cpu();
+       /*
+        * At least in one zone compaction wasn't deferred or skipped, so let's
+        * count a compaction stall
+        */
+       count_vm_event(COMPACTSTALL);
 
-               page = get_page_from_freelist(gfp_mask, nodemask,
-                               order, zonelist, high_zoneidx,
-                               alloc_flags & ~ALLOC_NO_WATERMARKS,
-                               preferred_zone, classzone_idx, migratetype);
-               if (page) {
-                       preferred_zone->compact_blockskip_flush = false;
-                       compaction_defer_reset(preferred_zone, order, true);
-                       count_vm_event(COMPACTSUCCESS);
-                       return page;
-               }
+       /* Page migration frees to the PCP lists but we want merging */
+       drain_pages(get_cpu());
+       put_cpu();
 
-               /*
-                * It's bad if compaction run occurs and fails.
-                * The most likely reason is that pages exist,
-                * but not enough to satisfy watermarks.
-                */
-               count_vm_event(COMPACTFAIL);
+       page = get_page_from_freelist(gfp_mask, nodemask,
+                       order, zonelist, high_zoneidx,
+                       alloc_flags & ~ALLOC_NO_WATERMARKS,
+                       preferred_zone, classzone_idx, migratetype);
 
-               /*
-                * As async compaction considers a subset of pageblocks, only
-                * defer if the failure was a sync compaction failure.
-                */
-               if (mode != MIGRATE_ASYNC)
-                       defer_compaction(preferred_zone, order);
+       if (page) {
+               struct zone *zone = page_zone(page);
 
-               cond_resched();
+               zone->compact_blockskip_flush = false;
+               compaction_defer_reset(zone, order, true);
+               count_vm_event(COMPACTSUCCESS);
+               return page;
        }
 
+       /*
+        * last_compact_zone is where try_to_compact_pages thought allocation
+        * should succeed, so it did not defer compaction. But here we know
+        * that it didn't succeed, so we do the defer.
+        */
+       if (last_compact_zone && mode != MIGRATE_ASYNC)
+               defer_compaction(last_compact_zone, order);
+
+       /*
+        * It's bad if compaction run occurs and fails. The most likely reason
+        * is that pages exist, but not enough to satisfy watermarks.
+        */
+       count_vm_event(COMPACTFAIL);
+
+       cond_resched();
+
        return NULL;
 }
 #else
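
In the rewritten __alloc_pages_direct_compact() above, try_to_compact_pages() reports deferral and skipping through its return value rather than through *did_some_progress, and it also returns the last zone where compaction expected the allocation to succeed, so only that zone is deferred when the retry fails. A hedged sketch of the result handling, lifted into a small predicate purely for illustration (COMPACT_DEFERRED and COMPACT_SKIPPED are the codes the switch above dispatches on):

/*
 * Illustration only; the hunk above open-codes this switch. Returns true
 * when compaction ran far enough that retrying the freelists is worthwhile,
 * false when the attempt was deferred or skipped entirely.
 */
static bool compact_result_worth_retry(unsigned long compact_result,
				       bool *deferred_compaction)
{
	switch (compact_result) {
	case COMPACT_DEFERRED:
		*deferred_compaction = true;
		/* fall through: a deferred run yields no free pages either */
	case COMPACT_SKIPPED:
		return false;
	default:
		return true;
	}
}
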
@@ -2355,9 +2368,8 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-       int classzone_idx, int migratetype,
-       enum migrate_mode mode, bool *contended_compaction,
-       bool *deferred_compaction, unsigned long *did_some_progress)
+       int classzone_idx, int migratetype, enum migrate_mode mode,
+       int *contended_compaction, bool *deferred_compaction)
 {
        return NULL;
 }
@@ -2457,12 +2469,14 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 static void wake_all_kswapds(unsigned int order,
                             struct zonelist *zonelist,
                             enum zone_type high_zoneidx,
-                            struct zone *preferred_zone)
+                            struct zone *preferred_zone,
+                            nodemask_t *nodemask)
 {
        struct zoneref *z;
        struct zone *zone;
 
-       for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+       for_each_zone_zonelist_nodemask(zone, z, zonelist,
+                                               high_zoneidx, nodemask)
                wakeup_kswapd(zone, order, zone_idx(preferred_zone));
 }
 
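
wake_all_kswapds() now takes the allocation's nodemask and walks the zonelist with for_each_zone_zonelist_nodemask(), so kswapd is only woken on nodes the request is actually allowed to use. A small usage sketch of that iterator, counting eligible zones instead of waking kswapd (illustration only; the helper name is made up):

/*
 * Illustration only: count the zones an allocation restricted by @nodemask
 * could use, walking the zonelist with the same iterator wake_all_kswapds()
 * uses above.
 */
static unsigned int count_allowed_zones(struct zonelist *zonelist,
					enum zone_type high_zoneidx,
					nodemask_t *nodemask)
{
	struct zoneref *z;
	struct zone *zone;
	unsigned int nr = 0;

	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask)
		nr++;

	return nr;
}
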
@@ -2509,7 +2523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
                        alloc_flags |= ALLOC_NO_WATERMARKS;
        }
 #ifdef CONFIG_CMA
-       if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+       if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
                alloc_flags |= ALLOC_CMA;
 #endif
        return alloc_flags;
@@ -2533,7 +2547,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        unsigned long did_some_progress;
        enum migrate_mode migration_mode = MIGRATE_ASYNC;
        bool deferred_compaction = false;
-       bool contended_compaction = false;
+       int contended_compaction = COMPACT_CONTENDED_NONE;
 
        /*
         * In the slowpath, we sanity check order to avoid ever trying to
@@ -2560,7 +2574,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 
 restart:
        if (!(gfp_mask & __GFP_NO_KSWAPD))
-               wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
+               wake_all_kswapds(order, zonelist, high_zoneidx,
+                               preferred_zone, nodemask);
 
        /*
         * OK, we're below the kswapd watermark and have kicked background
@@ -2633,20 +2648,40 @@ rebalance:
                                        preferred_zone,
                                        classzone_idx, migratetype,
                                        migration_mode, &contended_compaction,
-                                       &deferred_compaction,
-                                       &did_some_progress);
+                                       &deferred_compaction);
        if (page)
                goto got_pg;
 
-       /*
-        * If compaction is deferred for high-order allocations, it is because
-        * sync compaction recently failed. In this is the case and the caller
-        * requested a movable allocation that does not heavily disrupt the
-        * system then fail the allocation instead of entering direct reclaim.
-        */
-       if ((deferred_compaction || contended_compaction) &&
-                                               (gfp_mask & __GFP_NO_KSWAPD))
-               goto nopage;
+       /* Checks for THP-specific high-order allocations */
+       if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
+               /*
+                * If compaction is deferred for high-order allocations, it is
+                * because sync compaction recently failed. If this is the case
+                * and the caller requested a THP allocation, we do not want
+                * to heavily disrupt the system, so we fail the allocation
+                * instead of entering direct reclaim.
+                */
+               if (deferred_compaction)
+                       goto nopage;
+
+               /*
+                * In all zones where compaction was attempted (and not
+                * deferred or skipped), lock contention has been detected.
+                * For THP allocations we do not want to disrupt the rest of
+                * the system, so we fall back to base pages instead.
+                */
+               if (contended_compaction == COMPACT_CONTENDED_LOCK)
+                       goto nopage;
+
+               /*
+                * If compaction was aborted due to need_resched(), we do not
+                * want to further increase allocation latency, unless it is
+                * khugepaged trying to collapse.
+                */
+               if (contended_compaction == COMPACT_CONTENDED_SCHED
+                       && !(current->flags & PF_KTHREAD))
+                       goto nopage;
+       }
 
        /*
         * It can become very expensive to allocate transparent hugepages at
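
The single bail-out keyed on __GFP_NO_KSWAPD and a boolean contended flag becomes a set of THP-only checks that use the new integer contention codes to distinguish lock contention from need_resched() aborts. The same policy expressed as a predicate, as a sketch assuming the COMPACT_CONTENDED_* values this series introduces:

/*
 * Sketch only: the hunk above open-codes these checks in the slowpath.
 * Returns true when a THP allocation should give up rather than enter
 * direct reclaim.
 */
static bool thp_allocation_should_bail(gfp_t gfp_mask, bool deferred_compaction,
				       int contended_compaction)
{
	if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE)
		return false;	/* policy applies to THP allocations only */

	/* Sync compaction failed recently for this order */
	if (deferred_compaction)
		return true;

	/* Every zone that was attempted hit zone/LRU lock contention */
	if (contended_compaction == COMPACT_CONTENDED_LOCK)
		return true;

	/* Aborted by need_resched(); only khugepaged keeps pushing */
	if (contended_compaction == COMPACT_CONTENDED_SCHED &&
	    !(current->flags & PF_KTHREAD))
		return true;

	return false;
}
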
@@ -2726,8 +2761,7 @@ rebalance:
                                        preferred_zone,
                                        classzone_idx, migratetype,
                                        migration_mode, &contended_compaction,
-                                       &deferred_compaction,
-                                       &did_some_progress);
+                                       &deferred_compaction);
                if (page)
                        goto got_pg;
        }
@@ -2753,7 +2787,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
        struct zone *preferred_zone;
        struct zoneref *preferred_zoneref;
        struct page *page = NULL;
-       int migratetype = allocflags_to_migratetype(gfp_mask);
+       int migratetype = gfpflags_to_migratetype(gfp_mask);
        unsigned int cpuset_mems_cookie;
        int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
        int classzone_idx;
@@ -2775,6 +2809,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
        if (unlikely(!zonelist->_zonerefs->zone))
                return NULL;
 
+       if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE)
+               alloc_flags |= ALLOC_CMA;
+
 retry_cpuset:
        cpuset_mems_cookie = read_mems_allowed_begin();
 
@@ -2786,10 +2823,6 @@ retry_cpuset:
                goto out;
        classzone_idx = zonelist_zone_idx(preferred_zoneref);
 
-#ifdef CONFIG_CMA
-       if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
-               alloc_flags |= ALLOC_CMA;
-#endif
        /* First allocation attempt */
        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                        zonelist, high_zoneidx, alloc_flags,
@@ -3579,68 +3612,30 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
        zonelist->_zonerefs[pos].zone_idx = 0;
 }
 
+#if defined(CONFIG_64BIT)
+/*
+ * Devices that require DMA32/DMA are relatively rare and do not justify a
+ * penalty to every machine in case the specialised case applies. Default
+ * to Node-ordering on 64-bit NUMA machines
+ */
+static int default_zonelist_order(void)
+{
+       return ZONELIST_ORDER_NODE;
+}
+#else
+/*
+ * On 32-bit, the Normal zone needs to be preserved for allocations accessible
+ * by the kernel. If processes running on node 0 deplete the low memory zone
+ * then reclaim will occur more frequency increasing stalls and potentially
+ * be easier to OOM if a large percentage of the zone is under writeback or
+ * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
+ * Hence, default to zone ordering on 32-bit.
+ */
 static int default_zonelist_order(void)
 {
-       int nid, zone_type;
-       unsigned long low_kmem_size, total_size;
-       struct zone *z;
-       int average_size;
-       /*
-        * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
-        * If they are really small and used heavily, the system can fall
-        * into OOM very easily.
-        * This function detect ZONE_DMA/DMA32 size and configures zone order.
-        */
-       /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
-       low_kmem_size = 0;
-       total_size = 0;
-       for_each_online_node(nid) {
-               for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
-                       z = &NODE_DATA(nid)->node_zones[zone_type];
-                       if (populated_zone(z)) {
-                               if (zone_type < ZONE_NORMAL)
-                                       low_kmem_size += z->managed_pages;
-                               total_size += z->managed_pages;
-                       } else if (zone_type == ZONE_NORMAL) {
-                               /*
-                                * If any node has only lowmem, then node order
-                                * is preferred to allow kernel allocations
-                                * locally; otherwise, they can easily infringe
-                                * on other nodes when there is an abundance of
-                                * lowmem available to allocate from.
-                                */
-                               return ZONELIST_ORDER_NODE;
-                       }
-               }
-       }
-       if (!low_kmem_size ||  /* there are no DMA area. */
-           low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
-               return ZONELIST_ORDER_NODE;
-       /*
-        * look into each node's config.
-        * If there is a node whose DMA/DMA32 memory is very big area on
-        * local memory, NODE_ORDER may be suitable.
-        */
-       average_size = total_size /
-                               (nodes_weight(node_states[N_MEMORY]) + 1);
-       for_each_online_node(nid) {
-               low_kmem_size = 0;
-               total_size = 0;
-               for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
-                       z = &NODE_DATA(nid)->node_zones[zone_type];
-                       if (populated_zone(z)) {
-                               if (zone_type < ZONE_NORMAL)
-                                       low_kmem_size += z->present_pages;
-                               total_size += z->present_pages;
-                       }
-               }
-               if (low_kmem_size &&
-                   total_size > average_size && /* ignore small node */
-                   low_kmem_size > total_size * 70/100)
-                       return ZONELIST_ORDER_NODE;
-       }
        return ZONELIST_ORDER_ZONE;
 }
+#endif /* CONFIG_64BIT */
 
 static void set_zonelist_order(void)
 {
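
The runtime heuristic that sized DMA/DMA32 against total memory is gone; the ordering is now a pure compile-time property, node order on 64-bit and zone order on 32-bit where lowmem must be protected. The #ifdef pair is presumably kept so each policy can carry its own comment; a more compact equivalent, shown only to illustrate that the choice is compile-time, would be:

/* Illustration only: the same compile-time selection without the #ifdef pair. */
static int default_zonelist_order(void)
{
	return IS_ENABLED(CONFIG_64BIT) ? ZONELIST_ORDER_NODE
					: ZONELIST_ORDER_ZONE;
}
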
@@ -5703,9 +5698,8 @@ static void __setup_per_zone_wmarks(void)
                zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
 
                __mod_zone_page_state(zone, NR_ALLOC_BATCH,
-                                     high_wmark_pages(zone) -
-                                     low_wmark_pages(zone) -
-                                     zone_page_state(zone, NR_ALLOC_BATCH));
+                       high_wmark_pages(zone) - low_wmark_pages(zone) -
+                       atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
 
                setup_zone_migrate_reserve(zone);
                spin_unlock_irqrestore(&zone->lock, flags);
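
As in reset_alloc_batches() earlier in this diff, the vmstat counters only take relative updates, so "set NR_ALLOC_BATCH to high_wmark - low_wmark" is written as adding the difference between the target and the counter's current value. The same idiom in isolation, as a sketch with a made-up helper name:

/*
 * Illustration only: a vmstat "set" expressed as target minus current value.
 * The caller must provide the same exclusion as the hunk above (zone->lock
 * with IRQs disabled) since the __ variant is not preempt-safe on its own.
 */
static void zone_set_alloc_batch(struct zone *zone, long target)
{
	__mod_zone_page_state(zone, NR_ALLOC_BATCH,
		target - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
}
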
@@ -6280,8 +6274,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 
                if (list_empty(&cc->migratepages)) {
                        cc->nr_migratepages = 0;
-                       pfn = isolate_migratepages_range(cc->zone, cc,
-                                                        pfn, end, true);
+                       pfn = isolate_migratepages_range(cc, pfn, end);
                        if (!pfn) {
                                ret = -EINTR;
                                break;
@@ -6557,97 +6550,3 @@ bool is_free_buddy_page(struct page *page)
        return order < MAX_ORDER;
 }
 #endif
-
-static const struct trace_print_flags pageflag_names[] = {
-       {1UL << PG_locked,              "locked"        },
-       {1UL << PG_error,               "error"         },
-       {1UL << PG_referenced,          "referenced"    },
-       {1UL << PG_uptodate,            "uptodate"      },
-       {1UL << PG_dirty,               "dirty"         },
-       {1UL << PG_lru,                 "lru"           },
-       {1UL << PG_active,              "active"        },
-       {1UL << PG_slab,                "slab"          },
-       {1UL << PG_owner_priv_1,        "owner_priv_1"  },
-       {1UL << PG_arch_1,              "arch_1"        },
-       {1UL << PG_reserved,            "reserved"      },
-       {1UL << PG_private,             "private"       },
-       {1UL << PG_private_2,           "private_2"     },
-       {1UL << PG_writeback,           "writeback"     },
-#ifdef CONFIG_PAGEFLAGS_EXTENDED
-       {1UL << PG_head,                "head"          },
-       {1UL << PG_tail,                "tail"          },
-#else
-       {1UL << PG_compound,            "compound"      },
-#endif
-       {1UL << PG_swapcache,           "swapcache"     },
-       {1UL << PG_mappedtodisk,        "mappedtodisk"  },
-       {1UL << PG_reclaim,             "reclaim"       },
-       {1UL << PG_swapbacked,          "swapbacked"    },
-       {1UL << PG_unevictable,         "unevictable"   },
-#ifdef CONFIG_MMU
-       {1UL << PG_mlocked,             "mlocked"       },
-#endif
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
-       {1UL << PG_uncached,            "uncached"      },
-#endif
-#ifdef CONFIG_MEMORY_FAILURE
-       {1UL << PG_hwpoison,            "hwpoison"      },
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       {1UL << PG_compound_lock,       "compound_lock" },
-#endif
-};
-
-static void dump_page_flags(unsigned long flags)
-{
-       const char *delim = "";
-       unsigned long mask;
-       int i;
-
-       BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
-
-       printk(KERN_ALERT "page flags: %#lx(", flags);
-
-       /* remove zone id */
-       flags &= (1UL << NR_PAGEFLAGS) - 1;
-
-       for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
-
-               mask = pageflag_names[i].mask;
-               if ((flags & mask) != mask)
-                       continue;
-
-               flags &= ~mask;
-               printk("%s%s", delim, pageflag_names[i].name);
-               delim = "|";
-       }
-
-       /* check for left over flags */
-       if (flags)
-               printk("%s%#lx", delim, flags);
-
-       printk(")\n");
-}
-
-void dump_page_badflags(struct page *page, const char *reason,
-               unsigned long badflags)
-{
-       printk(KERN_ALERT
-              "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
-               page, atomic_read(&page->_count), page_mapcount(page),
-               page->mapping, page->index);
-       dump_page_flags(page->flags);
-       if (reason)
-               pr_alert("page dumped because: %s\n", reason);
-       if (page->flags & badflags) {
-               pr_alert("bad because of flags:\n");
-               dump_page_flags(page->flags & badflags);
-       }
-       mem_cgroup_print_bad_page(page);
-}
-
-void dump_page(struct page *page, const char *reason)
-{
-       dump_page_badflags(page, reason, 0);
-}
-EXPORT_SYMBOL(dump_page);