mm, page_alloc: avoid looking up the first zone in a zonelist twice
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 5e43004..f1a73bf 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -569,50 +569,19 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
 #endif
 
 #ifdef CONFIG_NUMA
-/*
- * zonelist = the list of zones passed to the allocator
- * z       = the zone from which the allocation occurred.
- *
- * Must be called with interrupts disabled.
- *
- * When __GFP_OTHER_NODE is set assume the node of the preferred
- * zone is the local node. This is useful for daemons who allocate
- * memory on behalf of other processes.
- */
-void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
-{
-       if (z->zone_pgdat == preferred_zone->zone_pgdat) {
-               __inc_zone_state(z, NUMA_HIT);
-       } else {
-               __inc_zone_state(z, NUMA_MISS);
-               __inc_zone_state(preferred_zone, NUMA_FOREIGN);
-       }
-       if (z->node == ((flags & __GFP_OTHER_NODE) ?
-                       preferred_zone->node : numa_node_id()))
-               __inc_zone_state(z, NUMA_LOCAL);
-       else
-               __inc_zone_state(z, NUMA_OTHER);
-}
-
 /*
  * Determine the per node value of a stat item.
  */
 unsigned long node_page_state(int node, enum zone_stat_item item)
 {
        struct zone *zones = NODE_DATA(node)->node_zones;
+       int i;
+       unsigned long count = 0;
 
-       return
-#ifdef CONFIG_ZONE_DMA
-               zone_page_state(&zones[ZONE_DMA], item) +
-#endif
-#ifdef CONFIG_ZONE_DMA32
-               zone_page_state(&zones[ZONE_DMA32], item) +
-#endif
-#ifdef CONFIG_HIGHMEM
-               zone_page_state(&zones[ZONE_HIGHMEM], item) +
-#endif
-               zone_page_state(&zones[ZONE_NORMAL], item) +
-               zone_page_state(&zones[ZONE_MOVABLE], item);
+       for (i = 0; i < MAX_NR_ZONES; i++)
+               count += zone_page_state(zones + i, item);
+
+       return count;
 }
 
 #endif
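
For context, the rewritten node_page_state() above relies on node_zones[] always having MAX_NR_ZONES entries, with unconfigured or empty zones contributing zero to the sum, so the loop yields the same totals as the old #ifdef chain. A minimal sketch of how a caller typically consumes it; show_node_free() is a hypothetical illustration and not part of this diff, while node_page_state() and NR_FREE_PAGES are existing symbols in this tree:

#include <linux/vmstat.h>       /* node_page_state(), enum zone_stat_item */
#include <linux/printk.h>       /* pr_info() */

/*
 * Hypothetical caller, for illustration only: report how many pages
 * are free on a given node by summing NR_FREE_PAGES across all of the
 * node's zones via the reworked node_page_state().
 */
static void show_node_free(int nid)
{
        unsigned long free = node_page_state(nid, NR_FREE_PAGES);

        pr_info("node %d: %lu pages free\n", nid, free);
}
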
@@ -1010,6 +979,9 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
                if (!memmap_valid_within(pfn, page, zone))
                        continue;
 
+               if (page_zone(page) != zone)
+                       continue;
+
                mtype = get_pageblock_migratetype(page);
 
                if (mtype < MIGRATE_TYPES)
@@ -1076,6 +1048,10 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
                                continue;
 
                        page = pfn_to_page(pfn);
+
+                       if (page_zone(page) != zone)
+                               continue;
+
                        if (PageBuddy(page)) {
                                pfn += (1UL << page_order(page)) - 1;
                                continue;
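
Both pagetypeinfo hunks above add the same guard: a zone's spanned pfn range can contain holes or pageblocks whose pages actually belong to a neighbouring zone, so the walk must skip any page whose zone differs from the one currently being reported. A minimal sketch of that check factored into a helper; pfn_in_walked_zone() is hypothetical, while pfn_valid(), pfn_to_page() and page_zone() are the existing helpers the hunks rely on:

#include <linux/mm.h>           /* pfn_valid(), pfn_to_page(), page_zone() */

/*
 * Hypothetical helper, sketching the guard added in the hunks above:
 * ignore pfns whose struct page belongs to a different zone than the
 * one being walked.
 */
static bool pfn_in_walked_zone(unsigned long pfn, struct zone *zone)
{
        struct page *page;

        if (!pfn_valid(pfn))
                return false;
        page = pfn_to_page(pfn);
        return page_zone(page) == zone;
}
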
@@ -1378,6 +1354,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
 int sysctl_stat_interval __read_mostly = HZ;
 static cpumask_var_t cpu_stat_off;
 
+#ifdef CONFIG_PROC_FS
+static void refresh_vm_stats(struct work_struct *work)
+{
+       refresh_cpu_vm_stats(true);
+}
+
+int vmstat_refresh(struct ctl_table *table, int write,
+                  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       long val;
+       int err;
+       int i;
+
+       /*
+        * The regular update, every sysctl_stat_interval, may come later
+        * than expected: leaving a significant amount in per_cpu buckets.
+        * This is particularly misleading when checking a quantity of HUGE
+        * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
+        * which can equally be echo'ed to or cat'ted from (by root),
+        * can be used to update the stats just before reading them.
+        *
+        * Oh, and since global_page_state() etc. are so careful to hide
+        * transiently negative values, report an error here if any of
+        * the stats is negative, so we know to go looking for imbalance.
+        */
+       err = schedule_on_each_cpu(refresh_vm_stats);
+       if (err)
+               return err;
+       for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+               val = atomic_long_read(&vm_stat[i]);
+               if (val < 0) {
+                       switch (i) {
+                       case NR_ALLOC_BATCH:
+                       case NR_PAGES_SCANNED:
+                               /*
+                                * These are often seen to go negative in
+                                * recent kernels, but not to go permanently
+                                * negative.  Whilst it would be nicer not to
+                                * have exceptions, rooting them out would be
+                                * another task, of rather low priority.
+                                */
+                               break;
+                       default:
+                               pr_warn("%s: %s %ld\n",
+                                       __func__, vmstat_text[i], val);
+                               err = -EINVAL;
+                               break;
+                       }
+               }
+       }
+       if (err)
+               return err;
+       if (write)
+               *ppos += *lenp;
+       else
+               *lenp = 0;
+       return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
 static void vmstat_update(struct work_struct *w)
 {
        if (refresh_cpu_vm_stats(true)) {
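
The comment in vmstat_refresh() above describes the intended use: write to or read from /proc/sys/vm/stat_refresh (as root) to fold per-cpu deltas into the global counters just before sampling them. A minimal user-space sketch of that, assuming a kernel carrying this change; the program itself is illustrative and not part of the patch:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Root only: any write (or read) triggers the refresh. */
        int fd = open("/proc/sys/vm/stat_refresh", O_WRONLY);

        if (fd < 0) {
                perror("open /proc/sys/vm/stat_refresh");
                return 1;
        }
        /*
         * Per the patch above, a failing write (EINVAL) hints that some
         * global counter went transiently negative.
         */
        if (write(fd, "1\n", 2) < 0)
                perror("write");
        close(fd);
        return 0;
}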