proc: much faster /proc/vmstat
[cascardo/linux.git] / mm / vmstat.c
index 7997f52..8857e0e 100644 (file)
@@ -86,8 +86,10 @@ void vm_events_fold_cpu(int cpu)
  *
  * vm_stat contains the global counters
  */
-atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
-EXPORT_SYMBOL(vm_stat);
+atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
+atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
+EXPORT_SYMBOL(vm_zone_stat);
+EXPORT_SYMBOL(vm_node_stat);
 
 #ifdef CONFIG_SMP
 
@@ -167,19 +169,36 @@ int calculate_normal_threshold(struct zone *zone)
  */
 void refresh_zone_stat_thresholds(void)
 {
+       struct pglist_data *pgdat;
        struct zone *zone;
        int cpu;
        int threshold;
 
+       /* Zero current pgdat thresholds */
+       for_each_online_pgdat(pgdat) {
+               for_each_online_cpu(cpu) {
+                       per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
+               }
+       }
+
        for_each_populated_zone(zone) {
+               struct pglist_data *pgdat = zone->zone_pgdat;
                unsigned long max_drift, tolerate_drift;
 
                threshold = calculate_normal_threshold(zone);
 
-               for_each_online_cpu(cpu)
+               for_each_online_cpu(cpu) {
+                       int pgdat_threshold;
+
                        per_cpu_ptr(zone->pageset, cpu)->stat_threshold
                                                        = threshold;
 
+                       /* Base nodestat threshold on the largest populated zone. */
+                       pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
+                       per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
+                               = max(threshold, pgdat_threshold);
+               }
+
                /*
                 * Only set percpu_drift_mark if there is a danger that
                 * NR_FREE_PAGES reports the low watermark is ok when in fact
@@ -238,6 +257,26 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 }
 EXPORT_SYMBOL(__mod_zone_page_state);
 
+void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+                               long delta)
+{
+       struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+       s8 __percpu *p = pcp->vm_node_stat_diff + item;
+       long x;
+       long t;
+
+       x = delta + __this_cpu_read(*p);
+
+       t = __this_cpu_read(pcp->stat_threshold);
+
+       if (unlikely(x > t || x < -t)) {
+               node_page_state_add(x, pgdat, item);
+               x = 0;
+       }
+       __this_cpu_write(*p, x);
+}
+EXPORT_SYMBOL(__mod_node_page_state);
+
 /*
  * Optimized increment and decrement functions.
  *
@@ -277,12 +316,34 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
        }
 }
 
+void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+       struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+       s8 __percpu *p = pcp->vm_node_stat_diff + item;
+       s8 v, t;
+
+       v = __this_cpu_inc_return(*p);
+       t = __this_cpu_read(pcp->stat_threshold);
+       if (unlikely(v > t)) {
+               s8 overstep = t >> 1;
+
+               node_page_state_add(v + overstep, pgdat, item);
+               __this_cpu_write(*p, -overstep);
+       }
+}
+
 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
 {
        __inc_zone_state(page_zone(page), item);
 }
 EXPORT_SYMBOL(__inc_zone_page_state);
 
+void __inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+       __inc_node_state(page_pgdat(page), item);
+}
+EXPORT_SYMBOL(__inc_node_page_state);
+
 void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
 {
        struct per_cpu_pageset __percpu *pcp = zone->pageset;
@@ -299,12 +360,34 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
        }
 }
 
+void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+       struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+       s8 __percpu *p = pcp->vm_node_stat_diff + item;
+       s8 v, t;
+
+       v = __this_cpu_dec_return(*p);
+       t = __this_cpu_read(pcp->stat_threshold);
+       if (unlikely(v < - t)) {
+               s8 overstep = t >> 1;
+
+               node_page_state_add(v - overstep, pgdat, item);
+               __this_cpu_write(*p, overstep);
+       }
+}
+
 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
 {
        __dec_zone_state(page_zone(page), item);
 }
 EXPORT_SYMBOL(__dec_zone_page_state);
 
+void __dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+       __dec_node_state(page_pgdat(page), item);
+}
+EXPORT_SYMBOL(__dec_node_page_state);
+
 #ifdef CONFIG_HAVE_CMPXCHG_LOCAL
 /*
  * If we have cmpxchg_local support then we do not need to incur the overhead
@@ -318,8 +401,8 @@ EXPORT_SYMBOL(__dec_zone_page_state);
  *     1       Overstepping half of threshold
  *     -1      Overstepping minus half of threshold
 */
-static inline void mod_state(struct zone *zone, enum zone_stat_item item,
-                            long delta, int overstep_mode)
+static inline void mod_zone_state(struct zone *zone,
+       enum zone_stat_item item, long delta, int overstep_mode)
 {
        struct per_cpu_pageset __percpu *pcp = zone->pageset;
        s8 __percpu *p = pcp->vm_stat_diff + item;
@@ -359,26 +442,83 @@ static inline void mod_state(struct zone *zone, enum zone_stat_item item,
 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
                         long delta)
 {
-       mod_state(zone, item, delta, 0);
+       mod_zone_state(zone, item, delta, 0);
 }
 EXPORT_SYMBOL(mod_zone_page_state);
 
-void inc_zone_state(struct zone *zone, enum zone_stat_item item)
-{
-       mod_state(zone, item, 1, 1);
-}
-
 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
 {
-       mod_state(page_zone(page), item, 1, 1);
+       mod_zone_state(page_zone(page), item, 1, 1);
 }
 EXPORT_SYMBOL(inc_zone_page_state);
 
 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 {
-       mod_state(page_zone(page), item, -1, -1);
+       mod_zone_state(page_zone(page), item, -1, -1);
 }
 EXPORT_SYMBOL(dec_zone_page_state);
+
+static inline void mod_node_state(struct pglist_data *pgdat,
+       enum node_stat_item item, int delta, int overstep_mode)
+{
+       struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+       s8 __percpu *p = pcp->vm_node_stat_diff + item;
+       long o, n, t, z;
+
+       do {
+               z = 0;  /* overflow to node counters */
+
+               /*
+                * The fetching of the stat_threshold is racy. We may apply
+                * a counter threshold to the wrong the cpu if we get
+                * rescheduled while executing here. However, the next
+                * counter update will apply the threshold again and
+                * therefore bring the counter under the threshold again.
+                *
+                * Most of the time the thresholds are the same anyways
+                * for all cpus in a node.
+                */
+               t = this_cpu_read(pcp->stat_threshold);
+
+               o = this_cpu_read(*p);
+               n = delta + o;
+
+               if (n > t || n < -t) {
+                       int os = overstep_mode * (t >> 1) ;
+
+                       /* Overflow must be added to node counters */
+                       z = n + os;
+                       n = -os;
+               }
+       } while (this_cpu_cmpxchg(*p, o, n) != o);
+
+       if (z)
+               node_page_state_add(z, pgdat, item);
+}
+
+void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+                                       long delta)
+{
+       mod_node_state(pgdat, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_node_page_state);
+
+void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+       mod_node_state(pgdat, item, 1, 1);
+}
+
+void inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+       mod_node_state(page_pgdat(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_node_page_state);
+
+void dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+       mod_node_state(page_pgdat(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_node_page_state);
 #else
 /*
  * Use interrupt disable to serialize counter updates
@@ -394,15 +534,6 @@ void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 }
 EXPORT_SYMBOL(mod_zone_page_state);
 
-void inc_zone_state(struct zone *zone, enum zone_stat_item item)
-{
-       unsigned long flags;
-
-       local_irq_save(flags);
-       __inc_zone_state(zone, item);
-       local_irq_restore(flags);
-}
-
 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
 {
        unsigned long flags;
@@ -424,21 +555,69 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL(dec_zone_page_state);
-#endif
 
+void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       __inc_node_state(pgdat, item);
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL(inc_node_state);
+
+void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+                                       long delta)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       __mod_node_page_state(pgdat, item, delta);
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_node_page_state);
+
+void inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+       unsigned long flags;
+       struct pglist_data *pgdat;
+
+       pgdat = page_pgdat(page);
+       local_irq_save(flags);
+       __inc_node_state(pgdat, item);
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL(inc_node_page_state);
+
+void dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       __dec_node_page_state(page, item);
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL(dec_node_page_state);
+#endif
 
 /*
  * Fold a differential into the global counters.
  * Returns the number of counters updated.
  */
-static int fold_diff(int *diff)
+static int fold_diff(int *zone_diff, int *node_diff)
 {
        int i;
        int changes = 0;
 
        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-               if (diff[i]) {
-                       atomic_long_add(diff[i], &vm_stat[i]);
+               if (zone_diff[i]) {
+                       atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
+                       changes++;
+       }
+
+       for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+               if (node_diff[i]) {
+                       atomic_long_add(node_diff[i], &vm_node_stat[i]);
                        changes++;
        }
        return changes;
@@ -462,9 +641,11 @@ static int fold_diff(int *diff)
  */
 static int refresh_cpu_vm_stats(bool do_pagesets)
 {
+       struct pglist_data *pgdat;
        struct zone *zone;
        int i;
-       int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+       int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+       int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
        int changes = 0;
 
        for_each_populated_zone(zone) {
@@ -477,7 +658,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
                        if (v) {
 
                                atomic_long_add(v, &zone->vm_stat[i]);
-                               global_diff[i] += v;
+                               global_zone_diff[i] += v;
 #ifdef CONFIG_NUMA
                                /* 3 seconds idle till flush */
                                __this_cpu_write(p->expire, 3);
@@ -516,7 +697,22 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
                }
 #endif
        }
-       changes += fold_diff(global_diff);
+
+       for_each_online_pgdat(pgdat) {
+               struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
+
+               for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+                       int v;
+
+                       v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
+                       if (v) {
+                               atomic_long_add(v, &pgdat->vm_stat[i]);
+                               global_node_diff[i] += v;
+                       }
+               }
+       }
+
+       changes += fold_diff(global_zone_diff, global_node_diff);
        return changes;
 }
 
@@ -527,9 +723,11 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
  */
 void cpu_vm_stats_fold(int cpu)
 {
+       struct pglist_data *pgdat;
        struct zone *zone;
        int i;
-       int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+       int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+       int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
 
        for_each_populated_zone(zone) {
                struct per_cpu_pageset *p;
@@ -543,11 +741,27 @@ void cpu_vm_stats_fold(int cpu)
                                v = p->vm_stat_diff[i];
                                p->vm_stat_diff[i] = 0;
                                atomic_long_add(v, &zone->vm_stat[i]);
-                               global_diff[i] += v;
+                               global_zone_diff[i] += v;
+                       }
+       }
+
+       for_each_online_pgdat(pgdat) {
+               struct per_cpu_nodestat *p;
+
+               p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+
+               for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+                       if (p->vm_node_stat_diff[i]) {
+                               int v;
+
+                               v = p->vm_node_stat_diff[i];
+                               p->vm_node_stat_diff[i] = 0;
+                               atomic_long_add(v, &pgdat->vm_stat[i]);
+                               global_node_diff[i] += v;
                        }
        }
 
-       fold_diff(global_diff);
+       fold_diff(global_zone_diff, global_node_diff);
 }
 
 /*
@@ -563,16 +777,19 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
                        int v = pset->vm_stat_diff[i];
                        pset->vm_stat_diff[i] = 0;
                        atomic_long_add(v, &zone->vm_stat[i]);
-                       atomic_long_add(v, &vm_stat[i]);
+                       atomic_long_add(v, &vm_zone_stat[i]);
                }
 }
 #endif
 
 #ifdef CONFIG_NUMA
 /*
- * Determine the per node value of a stat item.
+ * Determine the per node value of a stat item. This function
+ * is called frequently in a NUMA machine, so try to be as
+ * frugal as possible.
  */
-unsigned long node_page_state(int node, enum zone_stat_item item)
+unsigned long sum_zone_node_page_state(int node,
+                                enum zone_stat_item item)
 {
        struct zone *zones = NODE_DATA(node)->node_zones;
        int i;
@@ -584,6 +801,19 @@ unsigned long node_page_state(int node, enum zone_stat_item item)
        return count;
 }
 
+/*
+ * Determine the per node value of a stat item.
+ */
+unsigned long node_page_state(struct pglist_data *pgdat,
+                               enum node_stat_item item)
+{
+       long x = atomic_long_read(&pgdat->vm_stat[item]);
+#ifdef CONFIG_SMP
+       if (x < 0)
+               x = 0;
+#endif
+       return x;
+}
 #endif
 
 #ifdef CONFIG_COMPACTION
@@ -691,33 +921,18 @@ int fragmentation_index(struct zone *zone, unsigned int order)
 const char * const vmstat_text[] = {
        /* enum zone_stat_item countes */
        "nr_free_pages",
-       "nr_alloc_batch",
-       "nr_inactive_anon",
-       "nr_active_anon",
-       "nr_inactive_file",
-       "nr_active_file",
-       "nr_unevictable",
+       "nr_zone_inactive_anon",
+       "nr_zone_active_anon",
+       "nr_zone_inactive_file",
+       "nr_zone_active_file",
+       "nr_zone_unevictable",
+       "nr_zone_write_pending",
        "nr_mlock",
-       "nr_anon_pages",
-       "nr_mapped",
-       "nr_file_pages",
-       "nr_dirty",
-       "nr_writeback",
        "nr_slab_reclaimable",
        "nr_slab_unreclaimable",
        "nr_page_table_pages",
        "nr_kernel_stack",
-       "nr_unstable",
        "nr_bounce",
-       "nr_vmscan_write",
-       "nr_vmscan_immediate_reclaim",
-       "nr_writeback_temp",
-       "nr_isolated_anon",
-       "nr_isolated_file",
-       "nr_shmem",
-       "nr_dirtied",
-       "nr_written",
-       "nr_pages_scanned",
 #if IS_ENABLED(CONFIG_ZSMALLOC)
        "nr_zspages",
 #endif
@@ -729,13 +944,35 @@ const char * const vmstat_text[] = {
        "numa_local",
        "numa_other",
 #endif
+       "nr_free_cma",
+
+       /* Node-based counters */
+       "nr_inactive_anon",
+       "nr_active_anon",
+       "nr_inactive_file",
+       "nr_active_file",
+       "nr_unevictable",
+       "nr_isolated_anon",
+       "nr_isolated_file",
+       "nr_pages_scanned",
        "workingset_refault",
        "workingset_activate",
        "workingset_nodereclaim",
-       "nr_anon_transparent_hugepages",
+       "nr_anon_pages",
+       "nr_mapped",
+       "nr_file_pages",
+       "nr_dirty",
+       "nr_writeback",
+       "nr_writeback_temp",
+       "nr_shmem",
        "nr_shmem_hugepages",
        "nr_shmem_pmdmapped",
-       "nr_free_cma",
+       "nr_anon_transparent_hugepages",
+       "nr_unstable",
+       "nr_vmscan_write",
+       "nr_vmscan_immediate_reclaim",
+       "nr_dirtied",
+       "nr_written",
 
        /* enum writeback_stat_item counters */
        "nr_dirty_threshold",
@@ -749,6 +986,8 @@ const char * const vmstat_text[] = {
        "pswpout",
 
        TEXTS_FOR_ZONES("pgalloc")
+       TEXTS_FOR_ZONES("allocstall")
+       TEXTS_FOR_ZONES("pgskip")
 
        "pgfree",
        "pgactivate",
@@ -758,11 +997,11 @@ const char * const vmstat_text[] = {
        "pgmajfault",
        "pglazyfreed",
 
-       TEXTS_FOR_ZONES("pgrefill")
-       TEXTS_FOR_ZONES("pgsteal_kswapd")
-       TEXTS_FOR_ZONES("pgsteal_direct")
-       TEXTS_FOR_ZONES("pgscan_kswapd")
-       TEXTS_FOR_ZONES("pgscan_direct")
+       "pgrefill",
+       "pgsteal_kswapd",
+       "pgsteal_direct",
+       "pgscan_kswapd",
+       "pgscan_direct",
        "pgscan_direct_throttle",
 
 #ifdef CONFIG_NUMA
@@ -774,7 +1013,6 @@ const char * const vmstat_text[] = {
        "kswapd_low_wmark_hit_quickly",
        "kswapd_high_wmark_hit_quickly",
        "pageoutrun",
-       "allocstall",
 
        "pgrotated",
 
@@ -1016,85 +1254,6 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
        return 0;
 }
 
-#ifdef CONFIG_PAGE_OWNER
-static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
-                                                       pg_data_t *pgdat,
-                                                       struct zone *zone)
-{
-       struct page *page;
-       struct page_ext *page_ext;
-       unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
-       unsigned long end_pfn = pfn + zone->spanned_pages;
-       unsigned long count[MIGRATE_TYPES] = { 0, };
-       int pageblock_mt, page_mt;
-       int i;
-
-       /* Scan block by block. First and last block may be incomplete */
-       pfn = zone->zone_start_pfn;
-
-       /*
-        * Walk the zone in pageblock_nr_pages steps. If a page block spans
-        * a zone boundary, it will be double counted between zones. This does
-        * not matter as the mixed block count will still be correct
-        */
-       for (; pfn < end_pfn; ) {
-               if (!pfn_valid(pfn)) {
-                       pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
-                       continue;
-               }
-
-               block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
-               block_end_pfn = min(block_end_pfn, end_pfn);
-
-               page = pfn_to_page(pfn);
-               pageblock_mt = get_pageblock_migratetype(page);
-
-               for (; pfn < block_end_pfn; pfn++) {
-                       if (!pfn_valid_within(pfn))
-                               continue;
-
-                       page = pfn_to_page(pfn);
-
-                       if (page_zone(page) != zone)
-                               continue;
-
-                       if (PageBuddy(page)) {
-                               pfn += (1UL << page_order(page)) - 1;
-                               continue;
-                       }
-
-                       if (PageReserved(page))
-                               continue;
-
-                       page_ext = lookup_page_ext(page);
-                       if (unlikely(!page_ext))
-                               continue;
-
-                       if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
-                               continue;
-
-                       page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
-                       if (pageblock_mt != page_mt) {
-                               if (is_migrate_cma(pageblock_mt))
-                                       count[MIGRATE_MOVABLE]++;
-                               else
-                                       count[pageblock_mt]++;
-
-                               pfn = block_end_pfn;
-                               break;
-                       }
-                       pfn += (1UL << page_ext->order) - 1;
-               }
-       }
-
-       /* Print counts */
-       seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
-       for (i = 0; i < MIGRATE_TYPES; i++)
-               seq_printf(m, "%12lu ", count[i]);
-       seq_putc(m, '\n');
-}
-#endif /* CONFIG_PAGE_OWNER */
-
 /*
  * Print out the number of pageblocks for each migratetype that contain pages
  * of other types. This gives an indication of how well fallbacks are being
@@ -1180,17 +1339,41 @@ static const struct file_operations pagetypeinfo_file_ops = {
        .release        = seq_release,
 };
 
+static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
+{
+       int zid;
+
+       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+               struct zone *compare = &pgdat->node_zones[zid];
+
+               if (populated_zone(compare))
+                       return zone == compare;
+       }
+
+       /* The zone must be somewhere! */
+       WARN_ON_ONCE(1);
+       return false;
+}
+
 static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                                                        struct zone *zone)
 {
        int i;
        seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+       if (is_zone_first_populated(pgdat, zone)) {
+               seq_printf(m, "\n  per-node stats");
+               for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+                       seq_printf(m, "\n      %-12s %lu",
+                               vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+                               node_page_state(pgdat, i));
+               }
+       }
        seq_printf(m,
                   "\n  pages free     %lu"
                   "\n        min      %lu"
                   "\n        low      %lu"
                   "\n        high     %lu"
-                  "\n        scanned  %lu"
+                  "\n   node_scanned  %lu"
                   "\n        spanned  %lu"
                   "\n        present  %lu"
                   "\n        managed  %lu",
@@ -1198,13 +1381,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                   min_wmark_pages(zone),
                   low_wmark_pages(zone),
                   high_wmark_pages(zone),
-                  zone_page_state(zone, NR_PAGES_SCANNED),
+                  node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED),
                   zone->spanned_pages,
                   zone->present_pages,
                   zone->managed_pages);
 
        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-               seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
+               seq_printf(m, "\n      %-12s %lu", vmstat_text[i],
                                zone_page_state(zone, i));
 
        seq_printf(m,
@@ -1234,12 +1417,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 #endif
        }
        seq_printf(m,
-                  "\n  all_unreclaimable: %u"
-                  "\n  start_pfn:         %lu"
-                  "\n  inactive_ratio:    %u",
-                  !zone_reclaimable(zone),
+                  "\n  node_unreclaimable:  %u"
+                  "\n  start_pfn:           %lu"
+                  "\n  node_inactive_ratio: %u",
+                  !pgdat_reclaimable(zone->zone_pgdat),
                   zone->zone_start_pfn,
-                  zone->inactive_ratio);
+                  zone->zone_pgdat->inactive_ratio);
        seq_putc(m, '\n');
 }
 
@@ -1287,6 +1470,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
        if (*pos >= ARRAY_SIZE(vmstat_text))
                return NULL;
        stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
+                         NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
                          NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
 
 #ifdef CONFIG_VM_EVENT_COUNTERS
@@ -1301,6 +1485,10 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
                v[i] = global_page_state(i);
        v += NR_VM_ZONE_STAT_ITEMS;
 
+       for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+               v[i] = global_node_page_state(i);
+       v += NR_VM_NODE_STAT_ITEMS;
+
        global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
                            v + NR_DIRTY_THRESHOLD);
        v += NR_VM_WRITEBACK_STAT_ITEMS;
@@ -1326,7 +1514,9 @@ static int vmstat_show(struct seq_file *m, void *arg)
        unsigned long *l = arg;
        unsigned long off = l - (unsigned long *)m->private;
 
-       seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
+       seq_puts(m, vmstat_text[off]);
+       seq_put_decimal_ull(m, ' ', *l);
+       seq_putc(m, '\n');
        return 0;
 }
 
@@ -1390,13 +1580,12 @@ int vmstat_refresh(struct ctl_table *table, int write,
        if (err)
                return err;
        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
-               val = atomic_long_read(&vm_stat[i]);
+               val = atomic_long_read(&vm_zone_stat[i]);
                if (val < 0) {
                        switch (i) {
-                       case NR_ALLOC_BATCH:
                        case NR_PAGES_SCANNED:
                                /*
-                                * These are often seen to go negative in
+                                * This is often seen to go negative in
                                 * recent kernels, but not to go permanently
                                 * negative.  Whilst it would be nicer not to
                                 * have exceptions, rooting them out would be
@@ -1529,6 +1718,16 @@ static void __init start_shepherd_timer(void)
                round_jiffies_relative(sysctl_stat_interval));
 }
 
+static void __init init_cpu_node_state(void)
+{
+       int cpu;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu)
+               node_set_state(cpu_to_node(cpu), N_CPU);
+       put_online_cpus();
+}
+
 static void vmstat_cpu_dead(int node)
 {
        int cpu;
@@ -1586,6 +1785,7 @@ static int __init setup_vmstat(void)
 #ifdef CONFIG_SMP
        cpu_notifier_register_begin();
        __register_cpu_notifier(&vmstat_notifier);
+       init_cpu_node_state();
 
        start_shepherd_timer();
        cpu_notifier_register_done();