mm: memcontrol: convert reclaim iterator to simple css refcounting

[cascardo/linux.git] / mm / memcontrol.c
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index d6ac0e3..c3cd3bb 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,7 +25,7 @@
   * GNU General Public License for more details.
   */
  
-#include <linux/res_counter.h>
+#include <linux/page_counter.h>
  #include <linux/memcontrol.h>
  #include <linux/cgroup.h>
  #include <linux/mm.h>
@@ -143,14 +143,8 @@ struct mem_cgroup_stat_cpu {
         unsigned long targets[MEM_CGROUP_NTARGETS];
  };
  
-struct mem_cgroup_reclaim_iter {
-       /*
-        * last scanned hierarchy member. Valid only if last_dead_count
-        * matches memcg->dead_count of the hierarchy root group.
-        */
-       struct mem_cgroup *last_visited;
-       int last_dead_count;
-
+struct reclaim_iter {
+       struct mem_cgroup *position;
         /* scan generation, increased every round-trip */
         unsigned int generation;
  };
@@ -162,10 +156,10 @@ struct mem_cgroup_per_zone {
         struct lruvec           lruvec;
         unsigned long           lru_size[NR_LRU_LISTS];
  
-       struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+       struct reclaim_iter     iter[DEF_PRIORITY + 1];
  
         struct rb_node          tree_node;      /* RB tree node */
-       unsigned long long      usage_in_excess;/* Set to the value by which */
+       unsigned long           usage_in_excess;/* Set to the value by which */
                                                 /* the soft limit is exceeded*/
         bool                    on_tree;
         struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
@@ -198,7 +192,7 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly;
  
  struct mem_cgroup_threshold {
         struct eventfd_ctx *eventfd;
-       u64 threshold;
+       unsigned long threshold;
  };
  
  /* For threshold */
@@ -284,10 +278,13 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
   */
  struct mem_cgroup {
         struct cgroup_subsys_state css;
-       /*
-        * the counter to account for memory usage
-        */
-       struct res_counter res;
+
+       /* Accounted resources */
+       struct page_counter memory;
+       struct page_counter memsw;
+       struct page_counter kmem;
+
+       unsigned long soft_limit;
  
         /* vmpressure notifications */
         struct vmpressure vmpressure;
@@ -295,15 +292,6 @@ struct mem_cgroup {
         /* css_online() has been completed */
         int initialized;
  
-       /*
-        * the counter to account for mem+swap usage.
-        */
-       struct res_counter memsw;
-
-       /*
-        * the counter to account for kernel memory usage.
-        */
-       struct res_counter kmem;
         /*
          * Should the accounting and control be hierarchical, per subtree?
          */
@@ -352,7 +340,6 @@ struct mem_cgroup {
         struct mem_cgroup_stat_cpu nocpu_base;
         spinlock_t pcp_counter_lock;
  
-       atomic_t        dead_count;
  #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
         struct cg_proto tcp_mem;
  #endif
@@ -650,7 +637,7 @@ static void disarm_kmem_keys(struct mem_cgroup *memcg)
          * This check can't live in kmem destruction function,
          * since the charges will outlive the cgroup
          */
-       WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
+       WARN_ON(page_counter_read(&memcg->kmem));
  }
  #else
  static void disarm_kmem_keys(struct mem_cgroup *memcg)
@@ -706,7 +693,7 @@ soft_limit_tree_from_page(struct page *page)
  
  static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
                                          struct mem_cgroup_tree_per_zone *mctz,
-                                        unsigned long long new_usage_in_excess)
+                                        unsigned long new_usage_in_excess)
  {
         struct rb_node **p = &mctz->rb_root.rb_node;
         struct rb_node *parent = NULL;
@@ -755,10 +742,21 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
         spin_unlock_irqrestore(&mctz->lock, flags);
  }
  
+static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
+{
+       unsigned long nr_pages = page_counter_read(&memcg->memory);
+       unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit);
+       unsigned long excess = 0;
+
+       if (nr_pages > soft_limit)
+               excess = nr_pages - soft_limit;
+
+       return excess;
+}
  
  static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
  {
-       unsigned long long excess;
+       unsigned long excess;
         struct mem_cgroup_per_zone *mz;
         struct mem_cgroup_tree_per_zone *mctz;
  
@@ -769,7 +767,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
          */
         for (; memcg; memcg = parent_mem_cgroup(memcg)) {
                 mz = mem_cgroup_page_zoneinfo(memcg, page);
-               excess = res_counter_soft_limit_excess(&memcg->res);
+               excess = soft_limit_excess(memcg);
                 /*
                  * We have to update the tree if mz is on RB-tree or
                  * mem is over its softlimit.
@@ -825,7 +823,7 @@ retry:
          * position in the tree.
          */
         __mem_cgroup_remove_exceeded(mz, mctz);
-       if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+       if (!soft_limit_excess(mz->memcg) ||
             !css_tryget_online(&mz->memcg->css))
                 goto retry;
  done:
@@ -1062,122 +1060,6 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
         return memcg;
  }
  
-/*
- * Returns a next (in a pre-order walk) alive memcg (with elevated css
- * ref. count) or NULL if the whole root's subtree has been visited.
- *
- * helper function to be used by mem_cgroup_iter
- */
-static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-               struct mem_cgroup *last_visited)
-{
-       struct cgroup_subsys_state *prev_css, *next_css;
-
-       prev_css = last_visited ? &last_visited->css : NULL;
-skip_node:
-       next_css = css_next_descendant_pre(prev_css, &root->css);
-
-       /*
-        * Even if we found a group we have to make sure it is
-        * alive. css && !memcg means that the groups should be
-        * skipped and we should continue the tree walk.
-        * last_visited css is safe to use because it is
-        * protected by css_get and the tree walk is rcu safe.
-        *
-        * We do not take a reference on the root of the tree walk
-        * because we might race with the root removal when it would
-        * be the only node in the iterated hierarchy and mem_cgroup_iter
-        * would end up in an endless loop because it expects that at
-        * least one valid node will be returned. Root cannot disappear
-        * because caller of the iterator should hold it already so
-        * skipping css reference should be safe.
-        */
-       if (next_css) {
-               struct mem_cgroup *memcg = mem_cgroup_from_css(next_css);
-
-               if (next_css == &root->css)
-                       return memcg;
-
-               if (css_tryget_online(next_css)) {
-                       /*
-                        * Make sure the memcg is initialized:
-                        * mem_cgroup_css_online() orders the the
-                        * initialization against setting the flag.
-                        */
-                       if (smp_load_acquire(&memcg->initialized))
-                               return memcg;
-                       css_put(next_css);
-               }
-
-               prev_css = next_css;
-               goto skip_node;
-       }
-
-       return NULL;
-}
-
-static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
-{
-       /*
-        * When a group in the hierarchy below root is destroyed, the
-        * hierarchy iterator can no longer be trusted since it might
-        * have pointed to the destroyed group.  Invalidate it.
-        */
-       atomic_inc(&root->dead_count);
-}
-
-static struct mem_cgroup *
-mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
-                    struct mem_cgroup *root,
-                    int *sequence)
-{
-       struct mem_cgroup *position = NULL;
-       /*
-        * A cgroup destruction happens in two stages: offlining and
-        * release.  They are separated by a RCU grace period.
-        *
-        * If the iterator is valid, we may still race with an
-        * offlining.  The RCU lock ensures the object won't be
-        * released, tryget will fail if we lost the race.
-        */
-       *sequence = atomic_read(&root->dead_count);
-       if (iter->last_dead_count == *sequence) {
-               smp_rmb();
-               position = iter->last_visited;
-
-               /*
-                * We cannot take a reference to root because we might race
-                * with root removal and returning NULL would end up in
-                * an endless loop on the iterator user level when root
-                * would be returned all the time.
-                */
-               if (position && position != root &&
-                   !css_tryget_online(&position->css))
-                       position = NULL;
-       }
-       return position;
-}
-
-static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
-                                  struct mem_cgroup *last_visited,
-                                  struct mem_cgroup *new_position,
-                                  struct mem_cgroup *root,
-                                  int sequence)
-{
-       /* root reference counting symmetric to mem_cgroup_iter_load */
-       if (last_visited && last_visited != root)
-               css_put(&last_visited->css);
-       /*
-        * We store the sequence count from the time @last_visited was
-        * loaded successfully instead of rereading it here so that we
-        * don't lose destruction events in between.  We could have
-        * raced with the destruction of @new_position after all.
-        */
-       iter->last_visited = new_position;
-       smp_wmb();
-       iter->last_dead_count = sequence;
-}
-
  /**
   * mem_cgroup_iter - iterate over memory cgroup hierarchy
   * @root: hierarchy root
@@ -1199,8 +1081,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                                    struct mem_cgroup *prev,
                                    struct mem_cgroup_reclaim_cookie *reclaim)
  {
+       struct reclaim_iter *uninitialized_var(iter);
+       struct cgroup_subsys_state *css = NULL;
         struct mem_cgroup *memcg = NULL;
-       struct mem_cgroup *last_visited = NULL;
+       struct mem_cgroup *pos = NULL;
  
         if (mem_cgroup_disabled())
                 return NULL;
@@ -1209,50 +1093,101 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                 root = root_mem_cgroup;
  
         if (prev && !reclaim)
-               last_visited = prev;
+               pos = prev;
  
         if (!root->use_hierarchy && root != root_mem_cgroup) {
                 if (prev)
-                       goto out_css_put;
+                       goto out;
                 return root;
         }
  
         rcu_read_lock();
-       while (!memcg) {
-               struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
-               int uninitialized_var(seq);
-
-               if (reclaim) {
-                       struct mem_cgroup_per_zone *mz;
-
-                       mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
-                       iter = &mz->reclaim_iter[reclaim->priority];
-                       if (prev && reclaim->generation != iter->generation) {
-                               iter->last_visited = NULL;
-                               goto out_unlock;
-                       }
  
-                       last_visited = mem_cgroup_iter_load(iter, root, &seq);
+       if (reclaim) {
+               struct mem_cgroup_per_zone *mz;
+
+               mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
+               iter = &mz->iter[reclaim->priority];
+
+               if (prev && reclaim->generation != iter->generation)
+                       goto out_unlock;
+
+               do {
+                       pos = ACCESS_ONCE(iter->position);
+                       /*
+                        * A racing update may change the position and
+                        * put the last reference, hence css_tryget(),
+                        * or retry to see the updated position.
+                        */
+               } while (pos && !css_tryget(&pos->css));
+       }
+
+       if (pos)
+               css = &pos->css;
+
+       for (;;) {
+               css = css_next_descendant_pre(css, &root->css);
+               if (!css) {
+                       /*
+                        * Reclaimers share the hierarchy walk, and a
+                        * new one might jump in right at the end of
+                        * the hierarchy - make sure they see at least
+                        * one group and restart from the beginning.
+                        */
+                       if (!prev)
+                               continue;
+                       break;
                 }
  
-               memcg = __mem_cgroup_iter_next(root, last_visited);
+               /*
+                * Verify the css and acquire a reference.  The root
+                * is provided by the caller, so we know it's alive
+                * and kicking, and don't take an extra reference.
+                */
+               memcg = mem_cgroup_from_css(css);
  
-               if (reclaim) {
-                       mem_cgroup_iter_update(iter, last_visited, memcg, root,
-                                       seq);
+               if (css == &root->css)
+                       break;
  
-                       if (!memcg)
-                               iter->generation++;
-                       else if (!prev && memcg)
-                               reclaim->generation = iter->generation;
+               if (css_tryget_online(css)) {
+                       /*
+                        * Make sure the memcg is initialized:
+                        * mem_cgroup_css_online() orders the
+                        * initialization against setting the flag.
+                        */
+                       if (smp_load_acquire(&memcg->initialized))
+                               break;
+
+                       css_put(css);
                 }
  
-               if (prev && !memcg)
-                       goto out_unlock;
+               memcg = NULL;
         }
+
+       if (reclaim) {
+               if (cmpxchg(&iter->position, pos, memcg) == pos) {
+                       if (memcg)
+                               css_get(&memcg->css);
+                       if (pos)
+                               css_put(&pos->css);
+               }
+
+               /*
+                * pairs with css_tryget when dereferencing iter->position
+                * above.
+                */
+               if (pos)
+                       css_put(&pos->css);
+
+               if (!memcg)
+                       iter->generation++;
+               else if (!prev)
+                       reclaim->generation = iter->generation;
+       }
+
  out_unlock:
         rcu_read_unlock();
-out_css_put:
+out:
         if (prev && prev != root)
                 css_put(&prev->css);
  
@@ -1492,7 +1427,7 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
         return inactive * inactive_ratio < active;
  }
  
-#define mem_cgroup_from_res_counter(counter, member)   \
+#define mem_cgroup_from_counter(counter, member)       \
         container_of(counter, struct mem_cgroup, member)
  
  /**
@@ -1504,12 +1439,23 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
   */
  static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
  {
-       unsigned long long margin;
+       unsigned long margin = 0;
+       unsigned long count;
+       unsigned long limit;
  
-       margin = res_counter_margin(&memcg->res);
-       if (do_swap_account)
-               margin = min(margin, res_counter_margin(&memcg->memsw));
-       return margin >> PAGE_SHIFT;
+       count = page_counter_read(&memcg->memory);
+       limit = ACCESS_ONCE(memcg->memory.limit);
+       if (count < limit)
+               margin = limit - count;
+
+       if (do_swap_account) {
+               count = page_counter_read(&memcg->memsw);
+               limit = ACCESS_ONCE(memcg->memsw.limit);
+               if (count <= limit)
+                       margin = min(margin, limit - count);
+       }
+
+       return margin;
  }
  
  int mem_cgroup_swappiness(struct mem_cgroup *memcg)
@@ -1644,18 +1590,15 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
  
         rcu_read_unlock();
  
-       pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
-               res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
-               res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
-               res_counter_read_u64(&memcg->res, RES_FAILCNT));
-       pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
-               res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
-               res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
-               res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
-       pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
-               res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
-               res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
-               res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
+       pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
+               K((u64)page_counter_read(&memcg->memory)),
+               K((u64)memcg->memory.limit), memcg->memory.failcnt);
+       pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
+               K((u64)page_counter_read(&memcg->memsw)),
+               K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
+       pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
+               K((u64)page_counter_read(&memcg->kmem)),
+               K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
  
         for_each_mem_cgroup_tree(iter, memcg) {
                 pr_info("Memory cgroup stats for ");
@@ -1695,28 +1638,17 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
  /*
   * Return the memory (and swap, if configured) limit for a memcg.
   */
-static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
  {
-       u64 limit;
-
-       limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+       unsigned long limit;
  
-       /*
-        * Do not consider swap space if we cannot swap due to swappiness
-        */
+       limit = memcg->memory.limit;
         if (mem_cgroup_swappiness(memcg)) {
-               u64 memsw;
-
-               limit += total_swap_pages << PAGE_SHIFT;
-               memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+               unsigned long memsw_limit;
  
-               /*
-                * If memsw is finite and limits the amount of swap space
-                * available to this memcg, return that limit.
-                */
-               limit = min(limit, memsw);
+               memsw_limit = memcg->memsw.limit;
+               limit = min(limit + total_swap_pages, memsw_limit);
         }
-
         return limit;
  }
  
@@ -1740,7 +1672,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
         }
  
         check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
-       totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
+       totalpages = mem_cgroup_get_limit(memcg) ? : 1;
         for_each_mem_cgroup_tree(iter, memcg) {
                 struct css_task_iter it;
                 struct task_struct *task;
@@ -1943,7 +1875,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
                 .priority = 0,
         };
  
-       excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+       excess = soft_limit_excess(root_memcg);
  
         while (1) {
                 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
@@ -1974,7 +1906,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
                 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
                                                      zone, &nr_scanned);
                 *total_scanned += nr_scanned;
-               if (!res_counter_soft_limit_excess(&root_memcg->res))
+               if (!soft_limit_excess(root_memcg))
                         break;
         }
         mem_cgroup_iter_break(root_memcg, victim);
@@ -2316,33 +2248,31 @@ static DEFINE_MUTEX(percpu_charge_mutex);
  static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
  {
         struct memcg_stock_pcp *stock;
-       bool ret = true;
+       bool ret = false;
  
         if (nr_pages > CHARGE_BATCH)
-               return false;
+               return ret;
  
         stock = &get_cpu_var(memcg_stock);
-       if (memcg == stock->cached && stock->nr_pages >= nr_pages)
+       if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
                 stock->nr_pages -= nr_pages;
-       else /* need to call res_counter_charge */
-               ret = false;
+               ret = true;
+       }
         put_cpu_var(memcg_stock);
         return ret;
  }
  
  /*
- * Returns stocks cached in percpu to res_counter and reset cached information.
+ * Returns stocks cached in percpu to the page counters and resets the cache.
   */
  static void drain_stock(struct memcg_stock_pcp *stock)
  {
         struct mem_cgroup *old = stock->cached;
  
         if (stock->nr_pages) {
-               unsigned long bytes = stock->nr_pages * PAGE_SIZE;
-
-               res_counter_uncharge(&old->res, bytes);
+               page_counter_uncharge(&old->memory, stock->nr_pages);
                 if (do_swap_account)
-                       res_counter_uncharge(&old->memsw, bytes);
+                       page_counter_uncharge(&old->memsw, stock->nr_pages);
                 stock->nr_pages = 0;
         }
         stock->cached = NULL;
@@ -2371,7 +2301,7 @@ static void __init memcg_stock_init(void)
  }
  
  /*
- * Cache charges(val) which is from res_counter, to local per_cpu area.
+ * Cache charges(val) to local per_cpu area.
   * This will be consumed by consume_stock() function, later.
   */
  static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
@@ -2431,8 +2361,7 @@ out:
  /*
   * Tries to drain stocked charges in other cpus. This function is asynchronous
   * and just put a work per cpu for draining localy on each cpu. Caller can
- * expects some charges will be back to res_counter later but cannot wait for
- * it.
+ * expect some charges will be back later but cannot wait for it.
   */
  static void drain_all_stock_async(struct mem_cgroup *root_memcg)
  {
@@ -2506,9 +2435,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
         unsigned int batch = max(CHARGE_BATCH, nr_pages);
         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
         struct mem_cgroup *mem_over_limit;
-       struct res_counter *fail_res;
+       struct page_counter *counter;
         unsigned long nr_reclaimed;
-       unsigned long long size;
         bool may_swap = true;
         bool drained = false;
         int ret = 0;
@@ -2519,16 +2447,15 @@ retry:
         if (consume_stock(memcg, nr_pages))
                 goto done;
  
-       size = batch * PAGE_SIZE;
         if (!do_swap_account ||
-           !res_counter_charge(&memcg->memsw, size, &fail_res)) {
-               if (!res_counter_charge(&memcg->res, size, &fail_res))
+           !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
+               if (!page_counter_try_charge(&memcg->memory, batch, &counter))
                         goto done_restock;
                 if (do_swap_account)
-                       res_counter_uncharge(&memcg->memsw, size);
-               mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+                       page_counter_uncharge(&memcg->memsw, batch);
+               mem_over_limit = mem_cgroup_from_counter(counter, memory);
         } else {
-               mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+               mem_over_limit = mem_cgroup_from_counter(counter, memsw);
                 may_swap = false;
         }
  
@@ -2611,32 +2538,12 @@ done:
  
  static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
  {
-       unsigned long bytes = nr_pages * PAGE_SIZE;
-
         if (mem_cgroup_is_root(memcg))
                 return;
  
-       res_counter_uncharge(&memcg->res, bytes);
+       page_counter_uncharge(&memcg->memory, nr_pages);
         if (do_swap_account)
-               res_counter_uncharge(&memcg->memsw, bytes);
-}
-
-/*
- * Cancel chrages in this cgroup....doesn't propagate to parent cgroup.
- * This is useful when moving usage to parent cgroup.
- */
-static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
-                                       unsigned int nr_pages)
-{
-       unsigned long bytes = nr_pages * PAGE_SIZE;
-
-       if (mem_cgroup_is_root(memcg))
-               return;
-
-       res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
-       if (do_swap_account)
-               res_counter_uncharge_until(&memcg->memsw,
-                                               memcg->memsw.parent, bytes);
+               page_counter_uncharge(&memcg->memsw, nr_pages);
  }
  
  /*
@@ -2760,8 +2667,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
                 unlock_page_lru(page, isolated);
  }
  
-static DEFINE_MUTEX(set_limit_mutex);
-
  #ifdef CONFIG_MEMCG_KMEM
  /*
   * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
@@ -2804,16 +2709,17 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
  }
  #endif
  
-static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
+static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
+                            unsigned long nr_pages)
  {
-       struct res_counter *fail_res;
+       struct page_counter *counter;
         int ret = 0;
  
-       ret = res_counter_charge(&memcg->kmem, size, &fail_res);
-       if (ret)
+       ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
+       if (ret < 0)
                 return ret;
  
-       ret = try_charge(memcg, gfp, size >> PAGE_SHIFT);
+       ret = try_charge(memcg, gfp, nr_pages);
         if (ret == -EINTR)  {
                 /*
                  * try_charge() chose to bypass to root due to OOM kill or
@@ -2830,25 +2736,25 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
                  * when the allocation triggers should have been already
                  * directed to the root cgroup in memcontrol.h
                  */
-               res_counter_charge_nofail(&memcg->res, size, &fail_res);
+               page_counter_charge(&memcg->memory, nr_pages);
                 if (do_swap_account)
-                       res_counter_charge_nofail(&memcg->memsw, size,
-                                                 &fail_res);
+                       page_counter_charge(&memcg->memsw, nr_pages);
                 ret = 0;
         } else if (ret)
-               res_counter_uncharge(&memcg->kmem, size);
+               page_counter_uncharge(&memcg->kmem, nr_pages);
  
         return ret;
  }
  
-static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
+static void memcg_uncharge_kmem(struct mem_cgroup *memcg,
+                               unsigned long nr_pages)
  {
-       res_counter_uncharge(&memcg->res, size);
+       page_counter_uncharge(&memcg->memory, nr_pages);
         if (do_swap_account)
-               res_counter_uncharge(&memcg->memsw, size);
+               page_counter_uncharge(&memcg->memsw, nr_pages);
  
         /* Not down to 0 */
-       if (res_counter_uncharge(&memcg->kmem, size))
+       if (page_counter_uncharge(&memcg->kmem, nr_pages))
                 return;
  
         /*
@@ -3124,19 +3030,21 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
  
  int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
  {
+       unsigned int nr_pages = 1 << order;
         int res;
  
-       res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp,
-                               PAGE_SIZE << order);
+       res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
         if (!res)
-               atomic_add(1 << order, &cachep->memcg_params->nr_pages);
+               atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
         return res;
  }
  
  void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
  {
-       memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order);
-       atomic_sub(1 << order, &cachep->memcg_params->nr_pages);
+       unsigned int nr_pages = 1 << order;
+
+       memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
+       atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
  }
  
  /*
@@ -3257,7 +3165,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
                 return true;
         }
  
-       ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
+       ret = memcg_charge_kmem(memcg, gfp, 1 << order);
         if (!ret)
                 *_memcg = memcg;
  
@@ -3274,7 +3182,7 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
  
         /* The page allocation failed. Revert */
         if (!page) {
-               memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
+               memcg_uncharge_kmem(memcg, 1 << order);
                 return;
         }
         /*
@@ -3307,7 +3215,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
                 return;
  
         VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
-       memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
+       memcg_uncharge_kmem(memcg, 1 << order);
  }
  #else
  static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
@@ -3485,8 +3393,12 @@ static int mem_cgroup_move_parent(struct page *page,
  
         ret = mem_cgroup_move_account(page, nr_pages,
                                 pc, child, parent);
-       if (!ret)
-               __mem_cgroup_cancel_local_charge(child, nr_pages);
+       if (!ret) {
+               /* Take charge off the local counters */
+               page_counter_cancel(&child->memory, nr_pages);
+               if (do_swap_account)
+                       page_counter_cancel(&child->memsw, nr_pages);
+       }
  
         if (nr_pages > 1)
                 compound_unlock_irqrestore(page, flags);
@@ -3516,7 +3428,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
   *
   * Returns 0 on success, -EINVAL on failure.
   *
- * The caller must have charged to @to, IOW, called res_counter_charge() about
+ * The caller must have charged to @to, IOW, called page_counter_charge() about
   * both res and memsw, and called css_get().
   */
  static int mem_cgroup_move_swap_account(swp_entry_t entry,
@@ -3532,7 +3444,7 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
                 mem_cgroup_swap_statistics(to, true);
                 /*
                  * This function is only called from task migration context now.
-                * It postpones res_counter and refcount handling till the end
+                * It postpones page_counter and refcount handling till the end
                  * of task migration(mem_cgroup_clear_mc()) for performance
                  * improvement. But we cannot postpone css_get(to)  because if
                  * the process that has been moved to @to does swap-in, the
@@ -3590,60 +3502,57 @@ void mem_cgroup_print_bad_page(struct page *page)
  }
  #endif
  
+static DEFINE_MUTEX(memcg_limit_mutex);
+
  static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
-                               unsigned long long val)
+                                  unsigned long limit)
  {
+       unsigned long curusage;
+       unsigned long oldusage;
+       bool enlarge = false;
         int retry_count;
-       int ret = 0;
-       int children = mem_cgroup_count_children(memcg);
-       u64 curusage, oldusage;
-       int enlarge;
+       int ret;
  
         /*
          * For keeping hierarchical_reclaim simple, how long we should retry
          * is depends on callers. We set our retry-count to be function
          * of # of children which we should visit in this loop.
          */
-       retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
+       retry_count = MEM_CGROUP_RECLAIM_RETRIES *
+                     mem_cgroup_count_children(memcg);
  
-       oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
+       oldusage = page_counter_read(&memcg->memory);
  
-       enlarge = 0;
-       while (retry_count) {
+       do {
                 if (signal_pending(current)) {
                         ret = -EINTR;
                         break;
                 }
-               /*
-                * Rather than hide all in some function, I do this in
-                * open coded manner. You see what this really does.
-                * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
-                */
-               mutex_lock(&set_limit_mutex);
-               if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) {
+
+               mutex_lock(&memcg_limit_mutex);
+               if (limit > memcg->memsw.limit) {
+                       mutex_unlock(&memcg_limit_mutex);
                         ret = -EINVAL;
-                       mutex_unlock(&set_limit_mutex);
                         break;
                 }
-
-               if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val)
-                       enlarge = 1;
-
-               ret = res_counter_set_limit(&memcg->res, val);
-               mutex_unlock(&set_limit_mutex);
+               if (limit > memcg->memory.limit)
+                       enlarge = true;
+               ret = page_counter_limit(&memcg->memory, limit);
+               mutex_unlock(&memcg_limit_mutex);
  
                 if (!ret)
                         break;
  
                 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
  
-               curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
+               curusage = page_counter_read(&memcg->memory);
                 /* Usage is reduced ? */
                 if (curusage >= oldusage)
                         retry_count--;
                 else
                         oldusage = curusage;
-       }
+       } while (retry_count);
+
         if (!ret && enlarge)
                 memcg_oom_recover(memcg);
  
@@ -3651,52 +3560,53 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
  }
  
  static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
-                                       unsigned long long val)
+                                        unsigned long limit)  /* requested memsw limit, in pages */
  {
+       unsigned long curusage;
+       unsigned long oldusage;
+       bool enlarge = false;
         int retry_count;
-       u64 oldusage, curusage;
-       int children = mem_cgroup_count_children(memcg);
-       int ret = -EBUSY;
-       int enlarge = 0;
+       int ret;        /* set on every path: the do-while body runs at least once */
  
         /* see mem_cgroup_resize_res_limit */
-       retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
-       oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
-       while (retry_count) {
+       retry_count = MEM_CGROUP_RECLAIM_RETRIES *      /* budget scales with child count */
+                     mem_cgroup_count_children(memcg);
+
+       oldusage = page_counter_read(&memcg->memsw);
+
+       do {
                 if (signal_pending(current)) {
                         ret = -EINTR;
                         break;
                 }
-               /*
-                * Rather than hide all in some function, I do this in
-                * open coded manner. You see what this really does.
-                * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
-                */
-               mutex_lock(&set_limit_mutex);
-               if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) {
+
+               mutex_lock(&memcg_limit_mutex);  /* covers the limit checks and the update */
+               if (limit < memcg->memory.limit) {  /* memsw limit may not go below the memory limit */
+                       mutex_unlock(&memcg_limit_mutex);
                         ret = -EINVAL;
-                       mutex_unlock(&set_limit_mutex);
                         break;
                 }
-               if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val)
-                       enlarge = 1;
-               ret = res_counter_set_limit(&memcg->memsw, val);
-               mutex_unlock(&set_limit_mutex);
+               if (limit > memcg->memsw.limit)  /* the limit is being raised */
+                       enlarge = true;
+               ret = page_counter_limit(&memcg->memsw, limit);  /* nonzero: reclaim below, then retry */
+               mutex_unlock(&memcg_limit_mutex);
  
                 if (!ret)
                         break;
  
                 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
  
-               curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+               curusage = page_counter_read(&memcg->memsw);
                 /* Usage is reduced ? */
                 if (curusage >= oldusage)
                         retry_count--;
                 else
                         oldusage = curusage;
-       }
+       } while (retry_count);  /* budget drops only when usage did not fall */
+
         if (!ret && enlarge)
                 memcg_oom_recover(memcg);
+
         return ret;
  }
  
@@ -3709,7 +3619,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
         unsigned long reclaimed;
         int loop = 0;
         struct mem_cgroup_tree_per_zone *mctz;
-       unsigned long long excess;
+       unsigned long excess;
         unsigned long nr_scanned;
  
         if (order > 0)
@@ -3763,7 +3673,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
                         } while (1);
                 }
                 __mem_cgroup_remove_exceeded(mz, mctz);
-               excess = res_counter_soft_limit_excess(&mz->memcg->res);
+               excess = soft_limit_excess(mz->memcg);
                 /*
                  * One school of thought says that we should not add
                  * back the node to the tree if reclaim returns 0.
@@ -3856,7 +3766,6 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
  static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
  {
         int node, zid;
-       u64 usage;
  
         do {
                 /* This is for making all *used* pages to be on LRU. */
@@ -3888,9 +3797,8 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
                  * right after the check. RES_USAGE should be safe as we always
                  * charge before adding to the LRU.
                  */
-               usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
-                       res_counter_read_u64(&memcg->kmem, RES_USAGE);
-       } while (usage > 0);
+       } while (page_counter_read(&memcg->memory) -
+                page_counter_read(&memcg->kmem) > 0);
  }
  
  /*
@@ -3930,7 +3838,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
         /* we call try-to-free pages for make this cgroup empty */
         lru_add_drain_all();
         /* try to free all pages in this cgroup */
-       while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
+       while (nr_retries && page_counter_read(&memcg->memory)) {
                 int progress;
  
                 if (signal_pending(current))
@@ -4001,8 +3909,8 @@ out:
         return retval;
  }
  
-static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
-                                              enum mem_cgroup_stat_index idx)
+static unsigned long tree_stat(struct mem_cgroup *memcg,
+                              enum mem_cgroup_stat_index idx)
  {
         struct mem_cgroup *iter;
         long val = 0;
@@ -4020,55 +3928,72 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
  {
         u64 val;
  
-       if (!mem_cgroup_is_root(memcg)) {
+       if (mem_cgroup_is_root(memcg)) {
+               val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
+               val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
+               if (swap)
+                       val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
+       } else {
                 if (!swap)
-                       return res_counter_read_u64(&memcg->res, RES_USAGE);
+                       val = page_counter_read(&memcg->memory);
                 else
-                       return res_counter_read_u64(&memcg->memsw, RES_USAGE);
+                       val = page_counter_read(&memcg->memsw);
         }
-
-       /*
-        * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
-        * as well as in MEM_CGROUP_STAT_RSS_HUGE.
-        */
-       val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
-       val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
-
-       if (swap)
-               val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
-
         return val << PAGE_SHIFT;
  }
  
+enum {                 /* file attributes, decoded with MEMFILE_ATTR() */
+       RES_USAGE,      /* current usage */
+       RES_LIMIT,      /* counter->limit */
+       RES_MAX_USAGE,  /* counter->watermark */
+       RES_FAILCNT,    /* counter->failcnt */
+       RES_SOFT_LIMIT, /* memcg->soft_limit */
+};
  
  static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
                                 struct cftype *cft)
  {
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-       enum res_type type = MEMFILE_TYPE(cft->private);
-       int name = MEMFILE_ATTR(cft->private);
+       struct page_counter *counter;   /* counter selected by the file's type */
  
-       switch (type) {
+       switch (MEMFILE_TYPE(cft->private)) {   /* step 1: pick the counter */
         case _MEM:
-               if (name == RES_USAGE)
-                       return mem_cgroup_usage(memcg, false);
-               return res_counter_read_u64(&memcg->res, name);
+               counter = &memcg->memory;
+               break;
         case _MEMSWAP:
-               if (name == RES_USAGE)
-                       return mem_cgroup_usage(memcg, true);
-               return res_counter_read_u64(&memcg->memsw, name);
+               counter = &memcg->memsw;
+               break;
         case _KMEM:
-               return res_counter_read_u64(&memcg->kmem, name);
+               counter = &memcg->kmem;
                 break;
         default:
                 BUG();
         }
+
+       switch (MEMFILE_ATTR(cft->private)) {   /* step 2: report the requested attribute */
+       case RES_USAGE:
+               if (counter == &memcg->memory)
+                       return mem_cgroup_usage(memcg, false);  /* mem_cgroup_usage() special-cases the root memcg */
+               if (counter == &memcg->memsw)
+                       return mem_cgroup_usage(memcg, true);
+               return (u64)page_counter_read(counter) * PAGE_SIZE;  /* kmem: pages -> bytes */
+       case RES_LIMIT:
+               return (u64)counter->limit * PAGE_SIZE;  /* pages -> bytes */
+       case RES_MAX_USAGE:
+               return (u64)counter->watermark * PAGE_SIZE;  /* pages -> bytes */
+       case RES_FAILCNT:
+               return counter->failcnt;  /* returned as-is, not scaled by PAGE_SIZE */
+       case RES_SOFT_LIMIT:
+               return (u64)memcg->soft_limit * PAGE_SIZE;  /* read from the memcg, not from counter */
+       default:
+               BUG();
+       }
  }
  
  #ifdef CONFIG_MEMCG_KMEM
  /* should be called with activate_kmem_mutex held */
  static int __memcg_activate_kmem(struct mem_cgroup *memcg,
-                                unsigned long long limit)
+                                unsigned long nr_pages)
  {
         int err = 0;
         int memcg_id;
@@ -4115,7 +4040,7 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
          * We couldn't have accounted to this cgroup, because it hasn't got the
          * active bit set yet, so this should succeed.
          */
-       err = res_counter_set_limit(&memcg->kmem, limit);
+       err = page_counter_limit(&memcg->kmem, nr_pages);
         VM_BUG_ON(err);
  
         static_key_slow_inc(&memcg_kmem_enabled_key);
@@ -4131,25 +4056,27 @@ out:
  }
  
  static int memcg_activate_kmem(struct mem_cgroup *memcg,
-                              unsigned long long limit)
+                              unsigned long nr_pages)  /* kmem limit to install, in pages */
  {
         int ret;
  
         mutex_lock(&activate_kmem_mutex);
-       ret = __memcg_activate_kmem(memcg, limit);
+       ret = __memcg_activate_kmem(memcg, nr_pages);  /* helper must run with activate_kmem_mutex held */
         mutex_unlock(&activate_kmem_mutex);
         return ret;
  }
  
  static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
-                                  unsigned long long val)
+                                  unsigned long limit)  /* new kmem limit, in pages */
  {
         int ret;
  
+       mutex_lock(&memcg_limit_mutex);  /* same mutex the memory/memsw resize paths take */
         if (!memcg_kmem_is_active(memcg))
-               ret = memcg_activate_kmem(memcg, val);
+               ret = memcg_activate_kmem(memcg, limit);  /* takes activate_kmem_mutex inside memcg_limit_mutex */
         else
-               ret = res_counter_set_limit(&memcg->kmem, val);
+               ret = page_counter_limit(&memcg->kmem, limit);  /* already active: only adjust the limit */
+       mutex_unlock(&memcg_limit_mutex);
         return ret;
  }
  
@@ -4167,13 +4094,13 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
          * after this point, because it has at least one child already.
          */
         if (memcg_kmem_is_active(parent))
-               ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
+               ret = __memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
         mutex_unlock(&activate_kmem_mutex);
         return ret;
  }
  #else
  static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
-                                  unsigned long long val)
+                                  unsigned long limit)  /* ignored: stub for !CONFIG_MEMCG_KMEM builds */
  {
         return -EINVAL;
  }
@@ -4187,110 +4114,69 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
                                 char *buf, size_t nbytes, loff_t off)
  {
         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
-       enum res_type type;
-       int name;
-       unsigned long long val;
+       unsigned long nr_pages;
         int ret;
  
         buf = strstrip(buf);
-       type = MEMFILE_TYPE(of_cft(of)->private);
-       name = MEMFILE_ATTR(of_cft(of)->private);
+       ret = page_counter_memparse(buf, &nr_pages);
+       if (ret)
+               return ret;
  
-       switch (name) {
+       switch (MEMFILE_ATTR(of_cft(of)->private)) {
         case RES_LIMIT:
                 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
                         ret = -EINVAL;
                         break;
                 }
-               /* This function does all necessary parse...reuse it */
-               ret = res_counter_memparse_write_strategy(buf, &val);
-               if (ret)
+               switch (MEMFILE_TYPE(of_cft(of)->private)) {
+               case _MEM:
+                       ret = mem_cgroup_resize_limit(memcg, nr_pages);
                         break;
-               if (type == _MEM)
-                       ret = mem_cgroup_resize_limit(memcg, val);
-               else if (type == _MEMSWAP)
-                       ret = mem_cgroup_resize_memsw_limit(memcg, val);
-               else if (type == _KMEM)
-                       ret = memcg_update_kmem_limit(memcg, val);
-               else
-                       return -EINVAL;
-               break;
-       case RES_SOFT_LIMIT:
-               ret = res_counter_memparse_write_strategy(buf, &val);
-               if (ret)
+               case _MEMSWAP:
+                       ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
                         break;
-               /*
-                * For memsw, soft limits are hard to implement in terms
-                * of semantics, for now, we support soft limits for
-                * control without swap
-                */
-               if (type == _MEM)
-                       ret = res_counter_set_soft_limit(&memcg->res, val);
-               else
-                       ret = -EINVAL;
+               case _KMEM:
+                       ret = memcg_update_kmem_limit(memcg, nr_pages);
+                       break;
+               }
                 break;
-       default:
-               ret = -EINVAL; /* should be BUG() ? */
+       case RES_SOFT_LIMIT:
+               memcg->soft_limit = nr_pages;
+               ret = 0;
                 break;
         }
         return ret ?: nbytes;
  }
  
-static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
-               unsigned long long *mem_limit, unsigned long long *memsw_limit)
-{
-       unsigned long long min_limit, min_memsw_limit, tmp;
-
-       min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
-       min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
-       if (!memcg->use_hierarchy)
-               goto out;
-
-       while (memcg->css.parent) {
-               memcg = mem_cgroup_from_css(memcg->css.parent);
-               if (!memcg->use_hierarchy)
-                       break;
-               tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
-               min_limit = min(min_limit, tmp);
-               tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
-               min_memsw_limit = min(min_memsw_limit, tmp);
-       }
-out:
-       *mem_limit = min_limit;
-       *memsw_limit = min_memsw_limit;
-}
-
  static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
                                 size_t nbytes, loff_t off)
  {
         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
-       int name;
-       enum res_type type;
+       struct page_counter *counter;
  
-       type = MEMFILE_TYPE(of_cft(of)->private);
-       name = MEMFILE_ATTR(of_cft(of)->private);
+       switch (MEMFILE_TYPE(of_cft(of)->private)) {
+       case _MEM:
+               counter = &memcg->memory;
+               break;
+       case _MEMSWAP:
+               counter = &memcg->memsw;
+               break;
+       case _KMEM:
+               counter = &memcg->kmem;
+               break;
+       default:
+               BUG();
+       }
  
-       switch (name) {
+       switch (MEMFILE_ATTR(of_cft(of)->private)) {
         case RES_MAX_USAGE:
-               if (type == _MEM)
-                       res_counter_reset_max(&memcg->res);
-               else if (type == _MEMSWAP)
-                       res_counter_reset_max(&memcg->memsw);
-               else if (type == _KMEM)
-                       res_counter_reset_max(&memcg->kmem);
-               else
-                       return -EINVAL;
+               page_counter_reset_watermark(counter);
                 break;
         case RES_FAILCNT:
-               if (type == _MEM)
-                       res_counter_reset_failcnt(&memcg->res);
-               else if (type == _MEMSWAP)
-                       res_counter_reset_failcnt(&memcg->memsw);
-               else if (type == _KMEM)
-                       res_counter_reset_failcnt(&memcg->kmem);
-               else
-                       return -EINVAL;
+               counter->failcnt = 0;
                 break;
+       default:
+               BUG();
         }
  
         return nbytes;
@@ -4387,6 +4273,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
  static int memcg_stat_show(struct seq_file *m, void *v)
  {
         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+       unsigned long memory, memsw;
         struct mem_cgroup *mi;
         unsigned int i;
  
@@ -4406,14 +4293,16 @@ static int memcg_stat_show(struct seq_file *m, void *v)
                            mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
  
         /* Hierarchical information */
-       {
-               unsigned long long limit, memsw_limit;
-               memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
-               seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
-               if (do_swap_account)
-                       seq_printf(m, "hierarchical_memsw_limit %llu\n",
-                                  memsw_limit);
+       memory = memsw = PAGE_COUNTER_MAX;
+       for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
+               memory = min(memory, mi->memory.limit);
+               memsw = min(memsw, mi->memsw.limit);
         }
+       seq_printf(m, "hierarchical_memory_limit %llu\n",
+                  (u64)memory * PAGE_SIZE);
+       if (do_swap_account)
+               seq_printf(m, "hierarchical_memsw_limit %llu\n",
+                          (u64)memsw * PAGE_SIZE);
  
         for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
                 long long val = 0;
@@ -4497,7 +4386,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
  static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
  {
         struct mem_cgroup_threshold_ary *t;
-       u64 usage;
+       unsigned long usage;
         int i;
  
         rcu_read_lock();
@@ -4596,10 +4485,11 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
  {
         struct mem_cgroup_thresholds *thresholds;
         struct mem_cgroup_threshold_ary *new;
-       u64 threshold, usage;
+       unsigned long threshold;
+       unsigned long usage;
         int i, size, ret;
  
-       ret = res_counter_memparse_write_strategy(args, &threshold);
+       ret = page_counter_memparse(args, &threshold);
         if (ret)
                 return ret;
  
@@ -4689,7 +4579,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
  {
         struct mem_cgroup_thresholds *thresholds;
         struct mem_cgroup_threshold_ary *new;
-       u64 usage;
+       unsigned long usage;
         int i, j, size;
  
         mutex_lock(&memcg->thresholds_lock);
@@ -4883,7 +4773,7 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
  
         memcg_kmem_mark_dead(memcg);
  
-       if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
+       if (page_counter_read(&memcg->kmem))
                 return;
  
         if (memcg_kmem_test_and_clear_dead(memcg))
@@ -5363,9 +5253,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
   */
  struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
  {
-       if (!memcg->res.parent)
+       if (!memcg->memory.parent)  /* counter was initialized with a NULL parent */
                 return NULL;
-       return mem_cgroup_from_res_counter(memcg->res.parent, res);
+       return mem_cgroup_from_counter(memcg->memory.parent, memory);  /* parent counter -> owning memcg (presumably container_of; confirm) */
  }
  EXPORT_SYMBOL(parent_mem_cgroup);
  
@@ -5410,9 +5300,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
         /* root ? */
         if (parent_css == NULL) {
                 root_mem_cgroup = memcg;
-               res_counter_init(&memcg->res, NULL);
-               res_counter_init(&memcg->memsw, NULL);
-               res_counter_init(&memcg->kmem, NULL);
+               page_counter_init(&memcg->memory, NULL);
+               page_counter_init(&memcg->memsw, NULL);
+               page_counter_init(&memcg->kmem, NULL);
         }
  
         memcg->last_scanned_node = MAX_NUMNODES;
@@ -5451,18 +5341,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
         memcg->swappiness = mem_cgroup_swappiness(parent);
  
         if (parent->use_hierarchy) {
-               res_counter_init(&memcg->res, &parent->res);
-               res_counter_init(&memcg->memsw, &parent->memsw);
-               res_counter_init(&memcg->kmem, &parent->kmem);
+               page_counter_init(&memcg->memory, &parent->memory);
+               page_counter_init(&memcg->memsw, &parent->memsw);
+               page_counter_init(&memcg->kmem, &parent->kmem);
  
                 /*
                  * No need to take a reference to the parent because cgroup
                  * core guarantees its existence.
                  */
         } else {
-               res_counter_init(&memcg->res, NULL);
-               res_counter_init(&memcg->memsw, NULL);
-               res_counter_init(&memcg->kmem, NULL);
+               page_counter_init(&memcg->memory, NULL);
+               page_counter_init(&memcg->memsw, NULL);
+               page_counter_init(&memcg->kmem, NULL);
                 /*
                  * Deeper hierachy with use_hierarchy == false doesn't make
                  * much sense so let cgroup subsystem know about this
@@ -5487,24 +5377,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
         return 0;
  }
  
-/*
- * Announce all parents that a group from their hierarchy is gone.
- */
-static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
-{
-       struct mem_cgroup *parent = memcg;
-
-       while ((parent = parent_mem_cgroup(parent)))
-               mem_cgroup_iter_invalidate(parent);
-
-       /*
-        * if the root memcg is not hierarchical we have to check it
-        * explicitely.
-        */
-       if (!root_mem_cgroup->use_hierarchy)
-               mem_cgroup_iter_invalidate(root_mem_cgroup);
-}
-
  static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
  {
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5525,8 +5397,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
  
         kmem_cgroup_css_offline(memcg);
  
-       mem_cgroup_invalidate_reclaim_iterators(memcg);
-
         /*
          * This requires that offlining is serialized.  Right now that is
          * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
@@ -5544,7 +5414,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
         /*
          * XXX: css_offline() would be where we should reparent all
          * memory to prepare the cgroup for destruction.  However,
-        * memcg does not do css_tryget_online() and res_counter charging
+        * memcg does not do css_tryget_online() and page_counter charging
          * under the same RCU lock region, which means that charging
          * could race with offlining.  Offlining only happens to
          * cgroups with no tasks in them but charges can show up
@@ -5564,7 +5434,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
          * call_rcu()
          *   offline_css()
          *     reparent_charges()
-        *                           res_counter_charge()
+        *                           page_counter_try_charge()
          *                           css_put()
          *                             css_free()
          *                           pc->mem_cgroup = dead memcg
@@ -5599,10 +5469,10 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
  {
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
  
-       mem_cgroup_resize_limit(memcg, ULLONG_MAX);
-       mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX);
-       memcg_update_kmem_limit(memcg, ULLONG_MAX);
-       res_counter_set_soft_limit(&memcg->res, ULLONG_MAX);
+       mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
+       mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
+       memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
+       memcg->soft_limit = 0;
  }
  
  #ifdef CONFIG_MMU
@@ -5916,19 +5786,18 @@ static void __mem_cgroup_clear_mc(void)
         if (mc.moved_swap) {
                 /* uncharge swap account from the old cgroup */
                 if (!mem_cgroup_is_root(mc.from))
-                       res_counter_uncharge(&mc.from->memsw,
-                                            PAGE_SIZE * mc.moved_swap);
-
-               for (i = 0; i < mc.moved_swap; i++)
-                       css_put(&mc.from->css);
+                       page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
  
                 /*
-                * we charged both to->res and to->memsw, so we should
-                * uncharge to->res.
+                * we charged both to->memory and to->memsw, so we
+                * should uncharge to->memory.
                  */
                 if (!mem_cgroup_is_root(mc.to))
-                       res_counter_uncharge(&mc.to->res,
-                                            PAGE_SIZE * mc.moved_swap);
+                       page_counter_uncharge(&mc.to->memory, mc.moved_swap);
+
+               for (i = 0; i < mc.moved_swap; i++)
+                       css_put(&mc.from->css);
+
                 /* we've already done css_get(mc.to) */
                 mc.moved_swap = 0;
         }
@@ -6294,7 +6163,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
         memcg = mem_cgroup_lookup(id);
         if (memcg) {
                 if (!mem_cgroup_is_root(memcg))
-                       res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+                       page_counter_uncharge(&memcg->memsw, 1);
                 mem_cgroup_swap_statistics(memcg, false);
                 css_put(&memcg->css);
         }
@@ -6460,11 +6329,9 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
  
         if (!mem_cgroup_is_root(memcg)) {
                 if (nr_mem)
-                       res_counter_uncharge(&memcg->res,
-                                            nr_mem * PAGE_SIZE);
+                       page_counter_uncharge(&memcg->memory, nr_mem);
                 if (nr_memsw)
-                       res_counter_uncharge(&memcg->memsw,
-                                            nr_memsw * PAGE_SIZE);
+                       page_counter_uncharge(&memcg->memsw, nr_memsw);
                 memcg_oom_recover(memcg);
         }