diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0878ff7..d5ff3ce 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,7 +39,6 @@
 #include <linux/limits.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
-#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -85,26 +84,12 @@ static int really_do_swap_account __initdata = 0;
 #endif
 
 
-/*
- * Statistics for memory cgroup.
- */
-enum mem_cgroup_stat_index {
-       /*
-        * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
-        */
-       MEM_CGROUP_STAT_CACHE,          /* # of pages charged as cache */
-       MEM_CGROUP_STAT_RSS,            /* # of pages charged as anon rss */
-       MEM_CGROUP_STAT_RSS_HUGE,       /* # of pages charged as anon huge */
-       MEM_CGROUP_STAT_FILE_MAPPED,    /* # of pages charged as file rss */
-       MEM_CGROUP_STAT_SWAP,           /* # of pages, swapped out */
-       MEM_CGROUP_STAT_NSTATS,
-};
-
 static const char * const mem_cgroup_stat_names[] = {
        "cache",
        "rss",
        "rss_huge",
        "mapped_file",
+       "writeback",
        "swap",
 };
 
@@ -175,10 +160,6 @@ struct mem_cgroup_per_zone {
 
        struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 
-       struct rb_node          tree_node;      /* RB tree node */
-       unsigned long long      usage_in_excess;/* Set to the value by which */
-                                               /* the soft limit is exceeded*/
-       bool                    on_tree;
        struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
                                                /* use container_of        */
 };
@@ -187,26 +168,6 @@ struct mem_cgroup_per_node {
        struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
-/*
- * Cgroups above their limits are maintained in a RB-Tree, independent of
- * their hierarchy representation
- */
-
-struct mem_cgroup_tree_per_zone {
-       struct rb_root rb_root;
-       spinlock_t lock;
-};
-
-struct mem_cgroup_tree_per_node {
-       struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
-};
-
-struct mem_cgroup_tree {
-       struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
-};
-
-static struct mem_cgroup_tree soft_limit_tree __read_mostly;
-
 struct mem_cgroup_threshold {
        struct eventfd_ctx *eventfd;
        u64 threshold;
@@ -280,6 +241,7 @@ struct mem_cgroup {
 
        bool            oom_lock;
        atomic_t        under_oom;
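+       /* incremented on every OOM wakeup, used to detect missed wakeups */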
+       atomic_t        oom_wakeups;
 
        int     swappiness;
        /* OOM-Killer disable */
@@ -304,7 +266,7 @@ struct mem_cgroup {
         * Should we move charges of a task when a task is moved into this
         * mem_cgroup ? And what type of charges should we move ?
         */
-       unsigned long   move_charge_at_immigrate;
+       unsigned long move_charge_at_immigrate;
        /*
         * set > 0 if pages under this cgroup are moving to other cgroup.
         */
@@ -341,6 +303,22 @@ struct mem_cgroup {
        atomic_t        numainfo_events;
        atomic_t        numainfo_updating;
 #endif
+       /*
+        * Protects soft_contributed transitions.
+        * See mem_cgroup_update_soft_limit
+        */
+       spinlock_t soft_lock;
+
+       /*
+        * If true then this group has increased parents' children_in_excess
+        * when it got over the soft limit.
+        * When a group falls below the soft limit, parents' children_in_excess
+        * is decreased and soft_contributed is changed to false.
+        */
+       bool soft_contributed;
+
+       /* Number of children that are in soft limit excess */
+       atomic_t children_in_excess;
 
        struct mem_cgroup_per_node *nodeinfo[0];
        /* WARNING: nodeinfo must be the last member here */
@@ -444,7 +422,6 @@ static bool move_file(void)
  * limit reclaim to prevent infinite loops, if they ever occur.
  */
 #define        MEM_CGROUP_MAX_RECLAIM_LOOPS            100
-#define        MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
 
 enum charge_type {
        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -483,10 +460,9 @@ enum res_type {
  */
 static DEFINE_MUTEX(memcg_create_mutex);
 
-static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
 {
-       return container_of(s, struct mem_cgroup, css);
+       return s ? container_of(s, struct mem_cgroup, css) : NULL;
 }
 
 /* Some nice accessors for the vmpressure. */
@@ -672,164 +648,6 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
        return mem_cgroup_zoneinfo(memcg, nid, zid);
 }
 
-static struct mem_cgroup_tree_per_zone *
-soft_limit_tree_node_zone(int nid, int zid)
-{
-       return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
-}
-
-static struct mem_cgroup_tree_per_zone *
-soft_limit_tree_from_page(struct page *page)
-{
-       int nid = page_to_nid(page);
-       int zid = page_zonenum(page);
-
-       return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
-}
-
-static void
-__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
-                               struct mem_cgroup_per_zone *mz,
-                               struct mem_cgroup_tree_per_zone *mctz,
-                               unsigned long long new_usage_in_excess)
-{
-       struct rb_node **p = &mctz->rb_root.rb_node;
-       struct rb_node *parent = NULL;
-       struct mem_cgroup_per_zone *mz_node;
-
-       if (mz->on_tree)
-               return;
-
-       mz->usage_in_excess = new_usage_in_excess;
-       if (!mz->usage_in_excess)
-               return;
-       while (*p) {
-               parent = *p;
-               mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
-                                       tree_node);
-               if (mz->usage_in_excess < mz_node->usage_in_excess)
-                       p = &(*p)->rb_left;
-               /*
-                * We can't avoid mem cgroups that are over their soft
-                * limit by the same amount
-                */
-               else if (mz->usage_in_excess >= mz_node->usage_in_excess)
-                       p = &(*p)->rb_right;
-       }
-       rb_link_node(&mz->tree_node, parent, p);
-       rb_insert_color(&mz->tree_node, &mctz->rb_root);
-       mz->on_tree = true;
-}
-
-static void
-__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
-                               struct mem_cgroup_per_zone *mz,
-                               struct mem_cgroup_tree_per_zone *mctz)
-{
-       if (!mz->on_tree)
-               return;
-       rb_erase(&mz->tree_node, &mctz->rb_root);
-       mz->on_tree = false;
-}
-
-static void
-mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
-                               struct mem_cgroup_per_zone *mz,
-                               struct mem_cgroup_tree_per_zone *mctz)
-{
-       spin_lock(&mctz->lock);
-       __mem_cgroup_remove_exceeded(memcg, mz, mctz);
-       spin_unlock(&mctz->lock);
-}
-
-
-static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
-{
-       unsigned long long excess;
-       struct mem_cgroup_per_zone *mz;
-       struct mem_cgroup_tree_per_zone *mctz;
-       int nid = page_to_nid(page);
-       int zid = page_zonenum(page);
-       mctz = soft_limit_tree_from_page(page);
-
-       /*
-        * Necessary to update all ancestors when hierarchy is used.
-        * because their event counter is not touched.
-        */
-       for (; memcg; memcg = parent_mem_cgroup(memcg)) {
-               mz = mem_cgroup_zoneinfo(memcg, nid, zid);
-               excess = res_counter_soft_limit_excess(&memcg->res);
-               /*
-                * We have to update the tree if mz is on RB-tree or
-                * mem is over its softlimit.
-                */
-               if (excess || mz->on_tree) {
-                       spin_lock(&mctz->lock);
-                       /* if on-tree, remove it */
-                       if (mz->on_tree)
-                               __mem_cgroup_remove_exceeded(memcg, mz, mctz);
-                       /*
-                        * Insert again. mz->usage_in_excess will be updated.
-                        * If excess is 0, no tree ops.
-                        */
-                       __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
-                       spin_unlock(&mctz->lock);
-               }
-       }
-}
-
-static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
-{
-       int node, zone;
-       struct mem_cgroup_per_zone *mz;
-       struct mem_cgroup_tree_per_zone *mctz;
-
-       for_each_node(node) {
-               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-                       mz = mem_cgroup_zoneinfo(memcg, node, zone);
-                       mctz = soft_limit_tree_node_zone(node, zone);
-                       mem_cgroup_remove_exceeded(memcg, mz, mctz);
-               }
-       }
-}
-
-static struct mem_cgroup_per_zone *
-__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
-{
-       struct rb_node *rightmost = NULL;
-       struct mem_cgroup_per_zone *mz;
-
-retry:
-       mz = NULL;
-       rightmost = rb_last(&mctz->rb_root);
-       if (!rightmost)
-               goto done;              /* Nothing to reclaim from */
-
-       mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
-       /*
-        * Remove the node now but someone else can add it back,
-        * we will to add it back at the end of reclaim to its correct
-        * position in the tree.
-        */
-       __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
-       if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
-               !css_tryget(&mz->memcg->css))
-               goto retry;
-done:
-       return mz;
-}
-
-static struct mem_cgroup_per_zone *
-mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
-{
-       struct mem_cgroup_per_zone *mz;
-
-       spin_lock(&mctz->lock);
-       mz = __mem_cgroup_largest_soft_limit_node(mctz);
-       spin_unlock(&mctz->lock);
-       return mz;
-}
-
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
@@ -1003,6 +821,48 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
        return false;
 }
 
+/*
+ * Called from the rate-limited memcg_check_events when enough
+ * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated. It makes sure that
+ * all the parents up the hierarchy are notified when this group goes over
+ * its soft limit or falls back under it. memcg->soft_contributed makes the
+ * transition a single action whenever the state flips from one to the
+ * other.
+ */
+static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
+{
+       unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
+       struct mem_cgroup *parent = memcg;
+       int delta = 0;
+
+       spin_lock(&memcg->soft_lock);
+       if (excess) {
+               if (!memcg->soft_contributed) {
+                       delta = 1;
+                       memcg->soft_contributed = true;
+               }
+       } else {
+               if (memcg->soft_contributed) {
+                       delta = -1;
+                       memcg->soft_contributed = false;
+               }
+       }
+
+       /*
+        * Necessary to update all ancestors when hierarchy is used
+        * because their event counter is not touched.
+        * We track children even outside the hierarchy for the root
+        * cgroup because a tree walk starting at root should visit
+        * all cgroups and we want to avoid a pointless tree walk
+        * when no child is in excess.
+        */
+       while (delta && (parent = parent_mem_cgroup(parent)))
+               atomic_add(delta, &parent->children_in_excess);
+       if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
+               atomic_add(delta, &root_mem_cgroup->children_in_excess);
+       spin_unlock(&memcg->soft_lock);
+}
+
 /*
  * Check events in order.
  *
@@ -1026,7 +886,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 
                mem_cgroup_threshold(memcg);
                if (unlikely(do_softlimit))
-                       mem_cgroup_update_tree(memcg, page);
+                       mem_cgroup_update_soft_limit(memcg);
 #if MAX_NUMNODES > 1
                if (unlikely(do_numainfo))
                        atomic_inc(&memcg->numainfo_events);
@@ -1035,12 +895,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
                preempt_enable();
 }
 
-struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
-{
-       return mem_cgroup_from_css(
-               cgroup_subsys_state(cont, mem_cgroup_subsys_id));
-}
-
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 {
        /*
@@ -1051,7 +905,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
        if (unlikely(!p))
                return NULL;
 
-       return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
+       return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id));
 }
 
 struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
@@ -1075,6 +929,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
        return memcg;
 }
 
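+/*
+ * Apply the iterator filter @cond to @memcg under the hierarchy @root;
+ * a NULL @cond means every group is visited.
+ */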
+static enum mem_cgroup_filter_t
+mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
+               mem_cgroup_iter_filter cond)
+{
+       if (!cond)
+               return VISIT;
+       return cond(memcg, root);
+}
+
 /*
  * Returns a next (in a pre-order walk) alive memcg (with elevated css
  * ref. count) or NULL if the whole root's subtree has been visited.
@@ -1082,22 +945,13 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
  * helper function to be used by mem_cgroup_iter
  */
 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-               struct mem_cgroup *last_visited)
+               struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
 {
-       struct cgroup *prev_cgroup, *next_cgroup;
-
-       /*
-        * Root is not visited by cgroup iterators so it needs an
-        * explicit visit.
-        */
-       if (!last_visited)
-               return root;
+       struct cgroup_subsys_state *prev_css, *next_css;
 
-       prev_cgroup = (last_visited == root) ? NULL
-               : last_visited->css.cgroup;
+       prev_css = last_visited ? &last_visited->css : NULL;
 skip_node:
-       next_cgroup = cgroup_next_descendant_pre(
-                       prev_cgroup, root->css.cgroup);
+       next_css = css_next_descendant_pre(prev_css, &root->css);
 
        /*
         * Even if we found a group we have to make sure it is
@@ -1106,14 +960,34 @@ skip_node:
         * last_visited css is safe to use because it is
         * protected by css_get and the tree walk is rcu safe.
         */
-       if (next_cgroup) {
-               struct mem_cgroup *mem = mem_cgroup_from_cont(
-                               next_cgroup);
-               if (css_tryget(&mem->css))
-                       return mem;
-               else {
-                       prev_cgroup = next_cgroup;
+       if (next_css) {
+               struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
+
+               switch (mem_cgroup_filter(mem, root, cond)) {
+               case SKIP:
+                       prev_css = next_css;
                        goto skip_node;
+               case SKIP_TREE:
+                       if (mem == root)
+                               return NULL;
+                       /*
+                        * css_rightmost_descendant is not an optimal way to
+                        * skip through a subtree (especially for imbalanced
+                        * trees leaning to the right) but that's what we have
+                        * right now. A more effective solution would be to
+                        * traverse right-up to the first non-NULL node without
+                        * calling css_next_descendant_pre afterwards.
+                        */
+                       prev_css = css_rightmost_descendant(next_css);
+                       goto skip_node;
+               case VISIT:
+                       if (css_tryget(&mem->css))
+                               return mem;
+                       else {
+                               prev_css = next_css;
+                               goto skip_node;
+                       }
+                       break;
                }
        }
 
@@ -1177,6 +1051,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * @root: hierarchy root
  * @prev: previously returned memcg, NULL on first invocation
  * @reclaim: cookie for shared reclaim walks, NULL for full walks
+ * @cond: filter for visited nodes, NULL for no filter
  *
  * Returns references to children of the hierarchy below @root, or
  * @root itself, or %NULL after a full round-trip.
@@ -1189,15 +1064,18 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * divide up the memcgs in the hierarchy among all concurrent
  * reclaimers operating on the same zone and priority.
  */
-struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
                                   struct mem_cgroup *prev,
-                                  struct mem_cgroup_reclaim_cookie *reclaim)
+                                  struct mem_cgroup_reclaim_cookie *reclaim,
+                                  mem_cgroup_iter_filter cond)
 {
        struct mem_cgroup *memcg = NULL;
        struct mem_cgroup *last_visited = NULL;
 
-       if (mem_cgroup_disabled())
-               return NULL;
+       if (mem_cgroup_disabled()) {
+               /* first call must return non-NULL, second return NULL */
+               return (struct mem_cgroup *)(unsigned long)!prev;
+       }
 
        if (!root)
                root = root_mem_cgroup;
@@ -1208,7 +1086,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
        if (!root->use_hierarchy && root != root_mem_cgroup) {
                if (prev)
                        goto out_css_put;
-               return root;
+               if (mem_cgroup_filter(root, root, cond) == VISIT)
+                       return root;
+               return NULL;
        }
 
        rcu_read_lock();
@@ -1231,7 +1111,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                        last_visited = mem_cgroup_iter_load(iter, root, &seq);
                }
 
-               memcg = __mem_cgroup_iter_next(root, last_visited);
+               memcg = __mem_cgroup_iter_next(root, last_visited, cond);
 
                if (reclaim) {
                        mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1242,7 +1122,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                                reclaim->generation = iter->generation;
                }
 
-               if (prev && !memcg)
+               /*
+                * We have finished the whole tree walk or no group has been
+                * visited because the filter told us to skip the root node.
+                */
+               if (!memcg && (prev || (cond && !last_visited)))
                        goto out_unlock;
        }
 out_unlock:
@@ -1525,10 +1409,8 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
 
 int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 {
-       struct cgroup *cgrp = memcg->css.cgroup;
-
        /* root ? */
-       if (cgrp->parent == NULL)
+       if (!css_parent(&memcg->css))
                return vm_swappiness;
 
        return memcg->swappiness;
@@ -1805,12 +1687,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
        check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
        totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
        for_each_mem_cgroup_tree(iter, memcg) {
-               struct cgroup *cgroup = iter->css.cgroup;
-               struct cgroup_iter it;
+               struct css_task_iter it;
                struct task_struct *task;
 
-               cgroup_iter_start(cgroup, &it);
-               while ((task = cgroup_iter_next(cgroup, &it))) {
+               css_task_iter_start(&iter->css, &it);
+               while ((task = css_task_iter_next(&it))) {
                        switch (oom_scan_process_thread(task, totalpages, NULL,
                                                        false)) {
                        case OOM_SCAN_SELECT:
@@ -1823,7 +1704,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                        case OOM_SCAN_CONTINUE:
                                continue;
                        case OOM_SCAN_ABORT:
-                               cgroup_iter_end(cgroup, &it);
+                               css_task_iter_end(&it);
                                mem_cgroup_iter_break(memcg, iter);
                                if (chosen)
                                        put_task_struct(chosen);
@@ -1840,7 +1721,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                get_task_struct(chosen);
                        }
                }
-               cgroup_iter_end(cgroup, &it);
+               css_task_iter_end(&it);
        }
 
        if (!chosen)
@@ -1886,6 +1767,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
        return total;
 }
 
+#if MAX_NUMNODES > 1
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -1908,7 +1790,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
        return false;
 
 }
-#if MAX_NUMNODES > 1
 
 /*
  * Always updating the nodemask is not very good - even if we have an empty
@@ -1976,115 +1857,64 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
        return node;
 }
 
-/*
- * Check all nodes whether it contains reclaimable pages or not.
- * For quick scan, we make use of scan_nodes. This will allow us to skip
- * unused nodes. But scan_nodes is lazily updated and may not cotain
- * enough new information. We need to do double check.
- */
-static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
-{
-       int nid;
-
-       /*
-        * quick check...making use of scan_node.
-        * We can skip unused nodes.
-        */
-       if (!nodes_empty(memcg->scan_nodes)) {
-               for (nid = first_node(memcg->scan_nodes);
-                    nid < MAX_NUMNODES;
-                    nid = next_node(nid, memcg->scan_nodes)) {
-
-                       if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
-                               return true;
-               }
-       }
-       /*
-        * Check rest of nodes.
-        */
-       for_each_node_state(nid, N_MEMORY) {
-               if (node_isset(nid, memcg->scan_nodes))
-                       continue;
-               if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
-                       return true;
-       }
-       return false;
-}
-
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
        return 0;
 }
 
-static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
-{
-       return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
-}
 #endif
 
-static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
-                                  struct zone *zone,
-                                  gfp_t gfp_mask,
-                                  unsigned long *total_scanned)
-{
-       struct mem_cgroup *victim = NULL;
-       int total = 0;
-       int loop = 0;
-       unsigned long excess;
-       unsigned long nr_scanned;
-       struct mem_cgroup_reclaim_cookie reclaim = {
-               .zone = zone,
-               .priority = 0,
-       };
-
-       excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
-
-       while (1) {
-               victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
-               if (!victim) {
-                       loop++;
-                       if (loop >= 2) {
-                               /*
-                                * If we have not been able to reclaim
-                                * anything, it might because there are
-                                * no reclaimable pages under this hierarchy
-                                */
-                               if (!total)
-                                       break;
-                               /*
-                                * We want to do more targeted reclaim.
-                                * excess >> 2 is not to excessive so as to
-                                * reclaim too much, nor too less that we keep
-                                * coming back to reclaim from this cgroup
-                                */
-                               if (total >= (excess >> 2) ||
-                                       (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
-                                       break;
-                       }
-                       continue;
-               }
-               if (!mem_cgroup_reclaimable(victim, false))
-                       continue;
-               total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
-                                                    zone, &nr_scanned);
-               *total_scanned += nr_scanned;
-               if (!res_counter_soft_limit_excess(&root_memcg->res))
+/*
+ * A group is eligible for soft limit reclaim under the given root
+ * hierarchy if
+ *     a) it is over its soft limit, or
+ *     b) any parent up the hierarchy is over its soft limit
+ *
+ * If the given group doesn't have any children over the limit then it
+ * doesn't make any sense to iterate its subtree.
+ */
+enum mem_cgroup_filter_t
+mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+               struct mem_cgroup *root)
+{
+       struct mem_cgroup *parent;
+
+       if (!memcg)
+               memcg = root_mem_cgroup;
+       parent = memcg;
+
+       if (res_counter_soft_limit_excess(&memcg->res))
+               return VISIT;
+
+       /*
+        * If any parent up to the root in the hierarchy is over its soft limit
+        * then we have to obey and reclaim from this group as well.
+        */
+       while ((parent = parent_mem_cgroup(parent))) {
+               if (res_counter_soft_limit_excess(&parent->res))
+                       return VISIT;
+               if (parent == root)
                        break;
        }
-       mem_cgroup_iter_break(root_memcg, victim);
-       return total;
+
+       if (!atomic_read(&memcg->children_in_excess))
+               return SKIP_TREE;
+       return SKIP;
 }
 
+static DEFINE_SPINLOCK(memcg_oom_lock);
+
 /*
  * Check OOM-Killer is already running under our hierarchy.
  * If someone is running, return false.
- * Has to be called with memcg_oom_lock
  */
-static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
+static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
 {
        struct mem_cgroup *iter, *failed = NULL;
 
+       spin_lock(&memcg_oom_lock);
+
        for_each_mem_cgroup_tree(iter, memcg) {
                if (iter->oom_lock) {
                        /*
@@ -2098,33 +1928,33 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
                        iter->oom_lock = true;
        }
 
-       if (!failed)
-               return true;
-
-       /*
-        * OK, we failed to lock the whole subtree so we have to clean up
-        * what we set up to the failing subtree
-        */
-       for_each_mem_cgroup_tree(iter, memcg) {
-               if (iter == failed) {
-                       mem_cgroup_iter_break(memcg, iter);
-                       break;
+       if (failed) {
+               /*
+                * OK, we failed to lock the whole subtree so we have
+                * to clean up what we set up to the failing subtree
+                */
+               for_each_mem_cgroup_tree(iter, memcg) {
+                       if (iter == failed) {
+                               mem_cgroup_iter_break(memcg, iter);
+                               break;
+                       }
+                       iter->oom_lock = false;
                }
-               iter->oom_lock = false;
        }
-       return false;
+
+       spin_unlock(&memcg_oom_lock);
+
+       return !failed;
 }
 
-/*
- * Has to be called with memcg_oom_lock
- */
-static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
 {
        struct mem_cgroup *iter;
 
+       spin_lock(&memcg_oom_lock);
        for_each_mem_cgroup_tree(iter, memcg)
                iter->oom_lock = false;
-       return 0;
+       spin_unlock(&memcg_oom_lock);
 }
 
 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
@@ -2148,7 +1978,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
                atomic_add_unless(&iter->under_oom, -1, 0);
 }
 
-static DEFINE_SPINLOCK(memcg_oom_lock);
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
 
 struct oom_wait_info {
@@ -2178,6 +2007,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
 
 static void memcg_wakeup_oom(struct mem_cgroup *memcg)
 {
+       atomic_inc(&memcg->oom_wakeups);
        /* for filtering, pass "memcg" as argument. */
        __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 }
@@ -2189,56 +2019,136 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 }
 
 /*
- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ * try to call OOM killer
  */
-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
-                                 int order)
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-       struct oom_wait_info owait;
-       bool locked, need_to_kill;
+       bool locked;
+       int wakeups;
 
-       owait.memcg = memcg;
-       owait.wait.flags = 0;
-       owait.wait.func = memcg_oom_wake_function;
-       owait.wait.private = current;
-       INIT_LIST_HEAD(&owait.wait.task_list);
-       need_to_kill = true;
-       mem_cgroup_mark_under_oom(memcg);
+       if (!current->memcg_oom.may_oom)
+               return;
+
+       current->memcg_oom.in_memcg_oom = 1;
 
-       /* At first, try to OOM lock hierarchy under memcg.*/
-       spin_lock(&memcg_oom_lock);
-       locked = mem_cgroup_oom_lock(memcg);
        /*
-        * Even if signal_pending(), we can't quit charge() loop without
-        * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
-        * under OOM is always welcomed, use TASK_KILLABLE here.
+        * As with any blocking lock, a contender needs to start
+        * listening for wakeups before attempting the trylock,
+        * otherwise it can miss the wakeup from the unlock and sleep
+        * indefinitely.  This is just open-coded because our locking
+        * is so particular to memcg hierarchies.
         */
-       prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-       if (!locked || memcg->oom_kill_disable)
-               need_to_kill = false;
+       wakeups = atomic_read(&memcg->oom_wakeups);
+       mem_cgroup_mark_under_oom(memcg);
+
+       locked = mem_cgroup_oom_trylock(memcg);
+
        if (locked)
                mem_cgroup_oom_notify(memcg);
-       spin_unlock(&memcg_oom_lock);
 
-       if (need_to_kill) {
-               finish_wait(&memcg_oom_waitq, &owait.wait);
+       if (locked && !memcg->oom_kill_disable) {
+               mem_cgroup_unmark_under_oom(memcg);
                mem_cgroup_out_of_memory(memcg, mask, order);
+               mem_cgroup_oom_unlock(memcg);
+               /*
+                * There is no guarantee that an OOM-lock contender
+                * sees the wakeups triggered by the OOM kill
+                * uncharges.  Wake any sleepers explicitly.
+                */
+               memcg_oom_recover(memcg);
        } else {
-               schedule();
-               finish_wait(&memcg_oom_waitq, &owait.wait);
+               /*
+                * A system call can just return -ENOMEM, but if this
+                * is a page fault and somebody else is handling the
+                * OOM already, we need to sleep on the OOM waitqueue
+                * for this memcg until the situation is resolved.
+                * Which can take some time because it might be
+                * handled by a userspace task.
+                *
+                * However, this is the charge context, which means
+                * that we may sit on a large call stack and hold
+                * various filesystem locks, the mmap_sem etc. and we
+                * don't want the OOM handler to deadlock on them
+                * while we sit here and wait.  Store the current OOM
+                * context in the task_struct, then return -ENOMEM.
+                * At the end of the page fault handler, with the
+                * stack unwound, pagefault_out_of_memory() will check
+                * back with us by calling
+                * mem_cgroup_oom_synchronize(), possibly putting the
+                * task to sleep.
+                */
+               current->memcg_oom.oom_locked = locked;
+               current->memcg_oom.wakeups = wakeups;
+               css_get(&memcg->css);
+               current->memcg_oom.wait_on_memcg = memcg;
        }
-       spin_lock(&memcg_oom_lock);
-       if (locked)
-               mem_cgroup_oom_unlock(memcg);
-       memcg_wakeup_oom(memcg);
-       spin_unlock(&memcg_oom_lock);
+}
 
-       mem_cgroup_unmark_under_oom(memcg);
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ *
+ * This has to be called at the end of a page fault if the memcg
+ * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ *
+ * Memcg supports userspace OOM handling, so failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation.  Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to put the task to sleep and clean up the
+ * OOM state.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * finalized, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(void)
+{
+       struct oom_wait_info owait;
+       struct mem_cgroup *memcg;
 
-       if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+       /* OOM is global, do not handle */
+       if (!current->memcg_oom.in_memcg_oom)
                return false;
-       /* Give chance to dying process */
-       schedule_timeout_uninterruptible(1);
+
+       /*
+        * We invoked the OOM killer but there is a chance that a kill
+        * did not free up any charges.  Everybody else might already
+        * be sleeping, so restart the fault and keep the rampage
+        * going until some charges are released.
+        */
+       memcg = current->memcg_oom.wait_on_memcg;
+       if (!memcg)
+               goto out;
+
+       if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+               goto out_memcg;
+
+       owait.memcg = memcg;
+       owait.wait.flags = 0;
+       owait.wait.func = memcg_oom_wake_function;
+       owait.wait.private = current;
+       INIT_LIST_HEAD(&owait.wait.task_list);
+
+       prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+       /* Only sleep if we didn't miss any wakeups since OOM */
+       if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+               schedule();
+       finish_wait(&memcg_oom_waitq, &owait.wait);
+out_memcg:
+       mem_cgroup_unmark_under_oom(memcg);
+       if (current->memcg_oom.oom_locked) {
+               mem_cgroup_oom_unlock(memcg);
+               /*
+                * There is no guarantee that an OOM-lock contender
+                * sees the wakeups triggered by the OOM kill
+                * uncharges.  Wake any sleepers explicitly.
+                */
+               memcg_oom_recover(memcg);
+       }
+       css_put(&memcg->css);
+       current->memcg_oom.wait_on_memcg = NULL;
+out:
+       current->memcg_oom.in_memcg_oom = 0;
        return true;
 }
 
@@ -2307,7 +2217,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
 }
 
 void mem_cgroup_update_page_stat(struct page *page,
-                                enum mem_cgroup_page_stat_item idx, int val)
+                                enum mem_cgroup_stat_index idx, int val)
 {
        struct mem_cgroup *memcg;
        struct page_cgroup *pc = lookup_page_cgroup(page);
@@ -2316,18 +2226,11 @@ void mem_cgroup_update_page_stat(struct page *page,
        if (mem_cgroup_disabled())
                return;
 
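+       /* callers are expected to hold rcu_read_lock() */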
+       VM_BUG_ON(!rcu_read_lock_held());
        memcg = pc->mem_cgroup;
        if (unlikely(!memcg || !PageCgroupUsed(pc)))
                return;
 
-       switch (idx) {
-       case MEMCG_NR_FILE_MAPPED:
-               idx = MEM_CGROUP_STAT_FILE_MAPPED;
-               break;
-       default:
-               BUG();
-       }
-
        this_cpu_add(memcg->stat->count[idx], val);
 }
 
@@ -2469,7 +2372,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
                        flush_work(&stock->work);
        }
 out:
-       put_online_cpus();
+       put_online_cpus();
 }
 
 /*
@@ -2551,12 +2454,11 @@ enum {
        CHARGE_RETRY,           /* need to retry but retry is not bad */
        CHARGE_NOMEM,           /* we can't do more. return -ENOMEM */
        CHARGE_WOULDBLOCK,      /* GFP_WAIT wasn't set and no enough res. */
-       CHARGE_OOM_DIE,         /* the current is killed because of OOM */
 };
 
 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                unsigned int nr_pages, unsigned int min_pages,
-                               bool oom_check)
+                               bool invoke_oom)
 {
        unsigned long csize = nr_pages * PAGE_SIZE;
        struct mem_cgroup *mem_over_limit;
@@ -2613,14 +2515,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
        if (mem_cgroup_wait_acct_move(mem_over_limit))
                return CHARGE_RETRY;
 
-       /* If we don't need to call oom-killer at el, return immediately */
-       if (!oom_check)
-               return CHARGE_NOMEM;
-       /* check OOM */
-       if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
-               return CHARGE_OOM_DIE;
+       if (invoke_oom)
+               mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
 
-       return CHARGE_RETRY;
+       return CHARGE_NOMEM;
 }
 
 /*
@@ -2723,7 +2621,7 @@ again:
        }
 
        do {
-               bool oom_check;
+               bool invoke_oom = oom && !nr_oom_retries;
 
                /* If killed, bypass charge */
                if (fatal_signal_pending(current)) {
@@ -2731,14 +2629,8 @@ again:
                        goto bypass;
                }
 
-               oom_check = false;
-               if (oom && !nr_oom_retries) {
-                       oom_check = true;
-                       nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-               }
-
-               ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
-                   oom_check);
+               ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
+                                          nr_pages, invoke_oom);
                switch (ret) {
                case CHARGE_OK:
                        break;
@@ -2751,16 +2643,12 @@ again:
                        css_put(&memcg->css);
                        goto nomem;
                case CHARGE_NOMEM: /* OOM routine works */
-                       if (!oom) {
+                       if (!oom || invoke_oom) {
                                css_put(&memcg->css);
                                goto nomem;
                        }
-                       /* If oom, we never return -ENOMEM */
                        nr_oom_retries--;
                        break;
-               case CHARGE_OOM_DIE: /* Killed by OOM Killer */
-                       css_put(&memcg->css);
-                       goto bypass;
                }
        } while (ret != CHARGE_OK);
 
@@ -2901,7 +2789,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
         * is accessed after testing USED bit. To make pc->mem_cgroup visible
         * before USED bit, we need memory barrier here.
         * See mem_cgroup_add_lru_list(), etc.
-        */
+        */
        smp_wmb();
        SetPageCgroupUsed(pc);
 
@@ -2924,9 +2812,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
        unlock_page_cgroup(pc);
 
        /*
-        * "charge_statistics" updated event counter. Then, check it.
-        * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
-        * if they exceeds softlimit.
+        * "charge_statistics" updated event counter.
         */
        memcg_check_events(memcg, page);
 }
@@ -2954,10 +2840,10 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
 }
 
 #ifdef CONFIG_SLABINFO
-static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
-                                       struct seq_file *m)
+static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
+                                   struct cftype *cft, struct seq_file *m)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct memcg_cache_params *params;
 
        if (!memcg_can_account_kmem(memcg))
@@ -3140,7 +3026,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
                ssize_t size = memcg_caches_array_size(num_groups);
 
                size *= sizeof(void *);
-               size += sizeof(struct memcg_cache_params);
+               size += offsetof(struct memcg_cache_params, memcg_caches);
 
                s->memcg_params = kzalloc(size, GFP_KERNEL);
                if (!s->memcg_params) {
@@ -3183,13 +3069,16 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
 int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
                         struct kmem_cache *root_cache)
 {
-       size_t size = sizeof(struct memcg_cache_params);
+       size_t size;
 
        if (!memcg_kmem_enabled())
                return 0;
 
-       if (!memcg)
+       if (!memcg) {
+               size = offsetof(struct memcg_cache_params, memcg_caches);
                size += memcg_limited_groups_array_size * sizeof(void *);
+       } else
+               size = sizeof(struct memcg_cache_params);
 
        s->memcg_params = kzalloc(size, GFP_KERNEL);
        if (!s->memcg_params)
@@ -3642,9 +3531,9 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
         * the page allocator. Therefore, the following sequence when backed by
         * the SLUB allocator:
         *
-        *      memcg_stop_kmem_account();
-        *      kmalloc(<large_number>)
-        *      memcg_resume_kmem_account();
+        *      memcg_stop_kmem_account();
+        *      kmalloc(<large_number>)
+        *      memcg_resume_kmem_account();
         *
         * would effectively ignore the fact that we should skip accounting,
         * since it will drive us directly to this function without passing
@@ -3766,6 +3655,20 @@ void mem_cgroup_split_huge_fixup(struct page *head)
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
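+/*
+ * Move @nr_pages worth of the per-cpu statistic @idx from @from to @to.
+ */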
+static inline
+void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
+                                       struct mem_cgroup *to,
+                                       unsigned int nr_pages,
+                                       enum mem_cgroup_stat_index idx)
+{
+       /* Update stat data for mem_cgroup */
+       preempt_disable();
+       WARN_ON_ONCE(from->stat->count[idx] < nr_pages);
+       __this_cpu_add(from->stat->count[idx], -nr_pages);
+       __this_cpu_add(to->stat->count[idx], nr_pages);
+       preempt_enable();
+}
+
 /**
  * mem_cgroup_move_account - move account of the page
  * @page: the page
@@ -3811,13 +3714,14 @@ static int mem_cgroup_move_account(struct page *page,
 
        move_lock_mem_cgroup(from, &flags);
 
-       if (!anon && page_mapped(page)) {
-               /* Update mapped_file data for mem_cgroup */
-               preempt_disable();
-               __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-               __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-               preempt_enable();
-       }
+       if (!anon && page_mapped(page))
+               mem_cgroup_move_account_page_stat(from, to, nr_pages,
+                       MEM_CGROUP_STAT_FILE_MAPPED);
+
+       if (PageWriteback(page))
+               mem_cgroup_move_account_page_stat(from, to, nr_pages,
+                       MEM_CGROUP_STAT_WRITEBACK);
+
        mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
 
        /* caller should have done css_get */
@@ -4673,7 +4577,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
                                   MEM_CGROUP_RECLAIM_SHRINK);
                curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
                /* Usage is reduced ? */
-               if (curusage >= oldusage)
+               if (curusage >= oldusage)
                        retry_count--;
                else
                        oldusage = curusage;
@@ -4694,7 +4598,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
        int enlarge = 0;
 
        /* see mem_cgroup_resize_res_limit */
-       retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
+       retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
        oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
        while (retry_count) {
                if (signal_pending(current)) {
@@ -4743,98 +4647,6 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
        return ret;
 }
 
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-                                           gfp_t gfp_mask,
-                                           unsigned long *total_scanned)
-{
-       unsigned long nr_reclaimed = 0;
-       struct mem_cgroup_per_zone *mz, *next_mz = NULL;
-       unsigned long reclaimed;
-       int loop = 0;
-       struct mem_cgroup_tree_per_zone *mctz;
-       unsigned long long excess;
-       unsigned long nr_scanned;
-
-       if (order > 0)
-               return 0;
-
-       mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
-       /*
-        * This loop can run a while, specially if mem_cgroup's continuously
-        * keep exceeding their soft limit and putting the system under
-        * pressure
-        */
-       do {
-               if (next_mz)
-                       mz = next_mz;
-               else
-                       mz = mem_cgroup_largest_soft_limit_node(mctz);
-               if (!mz)
-                       break;
-
-               nr_scanned = 0;
-               reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
-                                                   gfp_mask, &nr_scanned);
-               nr_reclaimed += reclaimed;
-               *total_scanned += nr_scanned;
-               spin_lock(&mctz->lock);
-
-               /*
-                * If we failed to reclaim anything from this memory cgroup
-                * it is time to move on to the next cgroup
-                */
-               next_mz = NULL;
-               if (!reclaimed) {
-                       do {
-                               /*
-                                * Loop until we find yet another one.
-                                *
-                                * By the time we get the soft_limit lock
-                                * again, someone might have aded the
-                                * group back on the RB tree. Iterate to
-                                * make sure we get a different mem.
-                                * mem_cgroup_largest_soft_limit_node returns
-                                * NULL if no other cgroup is present on
-                                * the tree
-                                */
-                               next_mz =
-                               __mem_cgroup_largest_soft_limit_node(mctz);
-                               if (next_mz == mz)
-                                       css_put(&next_mz->memcg->css);
-                               else /* next_mz == NULL or other memcg */
-                                       break;
-                       } while (1);
-               }
-               __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
-               excess = res_counter_soft_limit_excess(&mz->memcg->res);
-               /*
-                * One school of thought says that we should not add
-                * back the node to the tree if reclaim returns 0.
-                * But our reclaim could return 0, simply because due
-                * to priority we are exposing a smaller subset of
-                * memory to reclaim from. Consider this as a longer
-                * term TODO.
-                */
-               /* If excess == 0, no tree ops */
-               __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
-               spin_unlock(&mctz->lock);
-               css_put(&mz->memcg->css);
-               loop++;
-               /*
-                * Could not reclaim anything and there are no more
-                * mem cgroups to try or we seem to be looping without
-                * reclaiming anything.
-                */
-               if (!nr_reclaimed &&
-                       (next_mz == NULL ||
-                       loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
-                       break;
-       } while (!nr_reclaimed);
-       if (next_mz)
-               css_put(&next_mz->memcg->css);
-       return nr_reclaimed;
-}
-
 /**
  * mem_cgroup_force_empty_list - clears LRU of a group
  * @memcg: group to clear
@@ -4943,10 +4755,10 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
  */
 static inline bool __memcg_has_children(struct mem_cgroup *memcg)
 {
-       struct cgroup *pos;
+       struct cgroup_subsys_state *pos;
 
        /* bounce at first found */
-       cgroup_for_each_child(pos, memcg->css.cgroup)
+       css_for_each_child(pos, &memcg->css)
                return true;
        return false;
 }
@@ -5002,36 +4814,28 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
        return 0;
 }
 
-static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
+static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css,
+                                       unsigned int event)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
-       int ret;
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
        if (mem_cgroup_is_root(memcg))
                return -EINVAL;
-       css_get(&memcg->css);
-       ret = mem_cgroup_force_empty(memcg);
-       css_put(&memcg->css);
-
-       return ret;
+       return mem_cgroup_force_empty(memcg);
 }
 
-
-static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
+static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
+                                    struct cftype *cft)
 {
-       return mem_cgroup_from_cont(cont)->use_hierarchy;
+       return mem_cgroup_from_css(css)->use_hierarchy;
 }
 
-static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
-                                       u64 val)
+static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
+                                     struct cftype *cft, u64 val)
 {
        int retval = 0;
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
-       struct cgroup *parent = cont->parent;
-       struct mem_cgroup *parent_memcg = NULL;
-
-       if (parent)
-               parent_memcg = mem_cgroup_from_cont(parent);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css));
 
        mutex_lock(&memcg_create_mutex);
 
@@ -5101,11 +4905,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
        return val << PAGE_SHIFT;
 }
 
-static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
-                              struct file *file, char __user *buf,
-                              size_t nbytes, loff_t *ppos)
+static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
+                              struct cftype *cft, struct file *file,
+                              char __user *buf, size_t nbytes, loff_t *ppos)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        char str[64];
        u64 val;
        int name, len;
@@ -5138,11 +4942,11 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
        return simple_read_from_buffer(buf, nbytes, ppos, str, len);
 }
 
-static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
+static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
 {
        int ret = -EINVAL;
 #ifdef CONFIG_MEMCG_KMEM
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        /*
         * For simplicity, we won't allow this to be disabled.  It also can't
         * be changed if the cgroup has children already, or if tasks had
@@ -5157,8 +4961,8 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
         */
        mutex_lock(&memcg_create_mutex);
        mutex_lock(&set_limit_mutex);
-       if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
-               if (cgroup_task_count(cont) || memcg_has_children(memcg)) {
+       if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) {
+               if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
                        ret = -EBUSY;
                        goto out;
                }
@@ -5167,7 +4971,7 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
 
                ret = memcg_update_cache_sizes(memcg);
                if (ret) {
-                       res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
+                       res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX);
                        goto out;
                }
                static_key_slow_inc(&memcg_kmem_enabled_key);
@@ -5228,10 +5032,10 @@ out:
  * The user of this function is...
  * RES_LIMIT.
  */
-static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
+static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
                            const char *buffer)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        enum res_type type;
        int name;
        unsigned long long val;
@@ -5255,7 +5059,7 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
                else if (type == _MEMSWAP)
                        ret = mem_cgroup_resize_memsw_limit(memcg, val);
                else if (type == _KMEM)
-                       ret = memcg_update_kmem_limit(cont, val);
+                       ret = memcg_update_kmem_limit(css, val);
                else
                        return -EINVAL;
                break;
@@ -5283,18 +5087,15 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
                unsigned long long *mem_limit, unsigned long long *memsw_limit)
 {
-       struct cgroup *cgroup;
        unsigned long long min_limit, min_memsw_limit, tmp;
 
        min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
        min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
-       cgroup = memcg->css.cgroup;
        if (!memcg->use_hierarchy)
                goto out;
 
-       while (cgroup->parent) {
-               cgroup = cgroup->parent;
-               memcg = mem_cgroup_from_cont(cgroup);
+       while (css_parent(&memcg->css)) {
+               memcg = mem_cgroup_from_css(css_parent(&memcg->css));
                if (!memcg->use_hierarchy)
                        break;
                tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -5307,9 +5108,9 @@ out:
        *memsw_limit = min_memsw_limit;
 }
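
memcg_get_hierarchical_limit() above shows the second half of the conversion: instead of walking struct cgroup ->parent pointers and mapping back with mem_cgroup_from_cont(), the loop asks the cgroup core for the parent css directly. A minimal sketch of the pattern, assuming css_parent() returns NULL for the root css; the helper name below is hypothetical and used only for illustration:

	/* Illustrative only: parent lookup via css_parent() instead of cgroup->parent. */
	static struct mem_cgroup *memcg_parent(struct mem_cgroup *memcg)
	{
		struct cgroup_subsys_state *parent_css = css_parent(&memcg->css);

		return parent_css ? mem_cgroup_from_css(parent_css) : NULL;	/* NULL at the root */
	}

The same NULL-at-root property is what lets the later swappiness and oom_control write hunks fold the old cgrp->parent == NULL test into a single !parent check.
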
 
-static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
+static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        int name;
        enum res_type type;
 
@@ -5342,17 +5143,17 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
        return 0;
 }
 
-static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
+static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
                                        struct cftype *cft)
 {
-       return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
+       return mem_cgroup_from_css(css)->move_charge_at_immigrate;
 }
 
 #ifdef CONFIG_MMU
-static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
+static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
                                        struct cftype *cft, u64 val)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
        if (val >= (1 << NR_MOVE_TYPE))
                return -EINVAL;
@@ -5367,7 +5168,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
        return 0;
 }
 #else
-static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
+static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
                                        struct cftype *cft, u64 val)
 {
        return -ENOSYS;
@@ -5375,13 +5176,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
 #endif
 
 #ifdef CONFIG_NUMA
-static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
-                                     struct seq_file *m)
+static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
+                               struct cftype *cft, struct seq_file *m)
 {
        int nid;
        unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
        unsigned long node_nr;
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
        total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
        seq_printf(m, "total=%lu", total_nr);
@@ -5426,10 +5227,10 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
        BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
 }
 
-static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
+static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft,
                                 struct seq_file *m)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup *mi;
        unsigned int i;
 
@@ -5513,27 +5314,23 @@ static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
        return 0;
 }
 
-static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
+static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
+                                     struct cftype *cft)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
        return mem_cgroup_swappiness(memcg);
 }
 
-static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
-                                      u64 val)
+static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
+                                      struct cftype *cft, u64 val)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
-       struct mem_cgroup *parent;
-
-       if (val > 100)
-               return -EINVAL;
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
 
-       if (cgrp->parent == NULL)
+       if (val > 100 || !parent)
                return -EINVAL;
 
-       parent = mem_cgroup_from_cont(cgrp->parent);
-
        mutex_lock(&memcg_create_mutex);
 
        /* If under hierarchy, only empty-root can set this value */
@@ -5616,7 +5413,13 @@ static int compare_thresholds(const void *a, const void *b)
        const struct mem_cgroup_threshold *_a = a;
        const struct mem_cgroup_threshold *_b = b;
 
-       return _a->threshold - _b->threshold;
+       if (_a->threshold > _b->threshold)
+               return 1;
+
+       if (_a->threshold < _b->threshold)
+               return -1;
+
+       return 0;
 }
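
The compare_thresholds() change above replaces a subtraction-based comparator with explicit comparisons. The old form returned the u64 difference truncated to int, so thresholds that are far apart could compare as equal or in the wrong order. A stand-alone userspace sketch of the failure mode (assuming 32-bit int and the usual wrap-around conversion used by common compilers):

	#include <stdio.h>

	/* Old style: u64 difference truncated to int on return. */
	static int cmp_by_subtraction(unsigned long long a, unsigned long long b)
	{
		return a - b;
	}

	/* New style: explicit comparisons, no truncation. */
	static int cmp_explicit(unsigned long long a, unsigned long long b)
	{
		if (a > b)
			return 1;
		if (a < b)
			return -1;
		return 0;
	}

	int main(void)
	{
		unsigned long long a = 1ULL << 32, b = 0;

		/* With 32-bit int, (int)(a - b) wraps to 0: the thresholds look "equal". */
		printf("subtraction: %d, explicit: %d\n",
		       cmp_by_subtraction(a, b), cmp_explicit(a, b));
		return 0;
	}
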
 
 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
@@ -5636,10 +5439,10 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
                mem_cgroup_oom_notify_cb(iter);
 }
 
-static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
+static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css,
        struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup_thresholds *thresholds;
        struct mem_cgroup_threshold_ary *new;
        enum res_type type = MEMFILE_TYPE(cft->private);
@@ -5719,10 +5522,10 @@ unlock:
        return ret;
 }
 
-static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
+static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css,
        struct cftype *cft, struct eventfd_ctx *eventfd)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup_thresholds *thresholds;
        struct mem_cgroup_threshold_ary *new;
        enum res_type type = MEMFILE_TYPE(cft->private);
@@ -5798,10 +5601,10 @@ unlock:
        mutex_unlock(&memcg->thresholds_lock);
 }
 
-static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
+static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
        struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup_eventfd_list *event;
        enum res_type type = MEMFILE_TYPE(cft->private);
 
@@ -5823,10 +5626,10 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
        return 0;
 }
 
-static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
+static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
        struct cftype *cft, struct eventfd_ctx *eventfd)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup_eventfd_list *ev, *tmp;
        enum res_type type = MEMFILE_TYPE(cft->private);
 
@@ -5844,10 +5647,10 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
        spin_unlock(&memcg_oom_lock);
 }
 
-static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
+static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css,
        struct cftype *cft,  struct cgroup_map_cb *cb)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
        cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
 
@@ -5858,18 +5661,16 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
        return 0;
 }
 
-static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
+static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
        struct cftype *cft, u64 val)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
-       struct mem_cgroup *parent;
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
 
        /* cannot set to root cgroup and only 0 and 1 are allowed */
-       if (!cgrp->parent || !((val == 0) || (val == 1)))
+       if (!parent || !((val == 0) || (val == 1)))
                return -EINVAL;
 
-       parent = mem_cgroup_from_cont(cgrp->parent);
-
        mutex_lock(&memcg_create_mutex);
        /* oom-kill-disable is a flag for subhierarchy. */
        if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
@@ -6110,8 +5911,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
                mz = &pn->zoneinfo[zone];
                lruvec_init(&mz->lruvec);
-               mz->usage_in_excess = 0;
-               mz->on_tree = false;
                mz->memcg = memcg;
        }
        memcg->nodeinfo[node] = pn;
@@ -6167,7 +5966,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
        int node;
        size_t size = memcg_size();
 
-       mem_cgroup_remove_from_trees(memcg);
        free_css_id(&mem_cgroup_subsys, &memcg->css);
 
        for_each_node(node)
@@ -6204,31 +6002,8 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
-static void __init mem_cgroup_soft_limit_tree_init(void)
-{
-       struct mem_cgroup_tree_per_node *rtpn;
-       struct mem_cgroup_tree_per_zone *rtpz;
-       int tmp, node, zone;
-
-       for_each_node(node) {
-               tmp = node;
-               if (!node_state(node, N_NORMAL_MEMORY))
-                       tmp = -1;
-               rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
-               BUG_ON(!rtpn);
-
-               soft_limit_tree.rb_tree_per_node[node] = rtpn;
-
-               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-                       rtpz = &rtpn->rb_tree_per_zone[zone];
-                       rtpz->rb_root = RB_ROOT;
-                       spin_lock_init(&rtpz->lock);
-               }
-       }
-}
-
 static struct cgroup_subsys_state * __ref
-mem_cgroup_css_alloc(struct cgroup *cont)
+mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
        struct mem_cgroup *memcg;
        long error = -ENOMEM;
@@ -6243,7 +6018,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
                        goto free_out;
 
        /* root ? */
-       if (cont->parent == NULL) {
+       if (parent_css == NULL) {
                root_mem_cgroup = memcg;
                res_counter_init(&memcg->res, NULL);
                res_counter_init(&memcg->memsw, NULL);
@@ -6256,6 +6031,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
        mutex_init(&memcg->thresholds_lock);
        spin_lock_init(&memcg->move_lock);
        vmpressure_init(&memcg->vmpressure);
+       spin_lock_init(&memcg->soft_lock);
 
        return &memcg->css;
 
@@ -6265,17 +6041,16 @@ free_out:
 }
 
 static int
-mem_cgroup_css_online(struct cgroup *cont)
+mem_cgroup_css_online(struct cgroup_subsys_state *css)
 {
-       struct mem_cgroup *memcg, *parent;
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
        int error = 0;
 
-       if (!cont->parent)
+       if (!parent)
                return 0;
 
        mutex_lock(&memcg_create_mutex);
-       memcg = mem_cgroup_from_cont(cont);
-       parent = mem_cgroup_from_cont(cont->parent);
 
        memcg->use_hierarchy = parent->use_hierarchy;
        memcg->oom_kill_disable = parent->oom_kill_disable;
@@ -6326,21 +6101,28 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
                mem_cgroup_iter_invalidate(root_mem_cgroup);
 }
 
-static void mem_cgroup_css_offline(struct cgroup *cont)
+static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
        kmem_cgroup_css_offline(memcg);
 
        mem_cgroup_invalidate_reclaim_iterators(memcg);
        mem_cgroup_reparent_charges(memcg);
+       if (memcg->soft_contributed) {
+               while ((memcg = parent_mem_cgroup(memcg)))
+                       atomic_dec(&memcg->children_in_excess);
+
+               if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
+                       atomic_dec(&root_mem_cgroup->children_in_excess);
+       }
        mem_cgroup_destroy_all_caches(memcg);
        vmpressure_cleanup(&memcg->vmpressure);
 }
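
One subtlety in the mem_cgroup_css_offline() hunk above: the while loop reuses the local memcg pointer, so once parent_mem_cgroup() returns NULL the pointer is NULL as well, and the trailing check effectively fires only when the root group has use_hierarchy disabled. A minimal sketch of the same ancestor walk with a dedicated iterator, illustrative only and not part of the patch, to make the intent explicit:

	/* Illustrative sketch: decrement children_in_excess in every ancestor. */
	static void memcg_dec_ancestors_in_excess(struct mem_cgroup *memcg)
	{
		struct mem_cgroup *iter = memcg;

		while ((iter = parent_mem_cgroup(iter)))
			atomic_dec(&iter->children_in_excess);
	}
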
 
-static void mem_cgroup_css_free(struct cgroup *cont)
+static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
        memcg_destroy_kmem(memcg);
        __mem_cgroup_free(memcg);
@@ -6710,12 +6492,12 @@ static void mem_cgroup_clear_mc(void)
        mem_cgroup_end_move(from);
 }
 
-static int mem_cgroup_can_attach(struct cgroup *cgroup,
+static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
                                 struct cgroup_taskset *tset)
 {
        struct task_struct *p = cgroup_taskset_first(tset);
        int ret = 0;
-       struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        unsigned long move_charge_at_immigrate;
 
        /*
@@ -6757,7 +6539,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
        return ret;
 }
 
-static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
+static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
                                     struct cgroup_taskset *tset)
 {
        mem_cgroup_clear_mc();
@@ -6905,7 +6687,7 @@ retry:
        up_read(&mm->mmap_sem);
 }
 
-static void mem_cgroup_move_task(struct cgroup *cont,
+static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
                                 struct cgroup_taskset *tset)
 {
        struct task_struct *p = cgroup_taskset_first(tset);
@@ -6920,16 +6702,16 @@ static void mem_cgroup_move_task(struct cgroup *cont,
                mem_cgroup_clear_mc();
 }
 #else  /* !CONFIG_MMU */
-static int mem_cgroup_can_attach(struct cgroup *cgroup,
+static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
                                 struct cgroup_taskset *tset)
 {
        return 0;
 }
-static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
+static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
                                     struct cgroup_taskset *tset)
 {
 }
-static void mem_cgroup_move_task(struct cgroup *cont,
+static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
                                 struct cgroup_taskset *tset)
 {
 }
@@ -6939,15 +6721,15 @@ static void mem_cgroup_move_task(struct cgroup *cont,
  * Cgroup retains root cgroups across [un]mount cycles making it necessary
  * to verify sane_behavior flag on each mount attempt.
  */
-static void mem_cgroup_bind(struct cgroup *root)
+static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
 {
        /*
         * use_hierarchy is forced with sane_behavior.  cgroup core
         * guarantees that @root doesn't have any children, so turning it
         * on for the root memcg is enough.
         */
-       if (cgroup_sane_behavior(root))
-               mem_cgroup_from_cont(root)->use_hierarchy = true;
+       if (cgroup_sane_behavior(root_css->cgroup))
+               mem_cgroup_from_css(root_css)->use_hierarchy = true;
 }
 
 struct cgroup_subsys mem_cgroup_subsys = {
@@ -7008,7 +6790,6 @@ static int __init mem_cgroup_init(void)
 {
        hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
        enable_swap_cgroup();
-       mem_cgroup_soft_limit_tree_init();
        memcg_stock_init();
        return 0;
 }