ath10k: convert ath10k_pci_reg_read/write32() to take struct ath10k
[cascardo/linux.git] / kernel / cgroup.c
index a7c9e6d..0e0b20b 100644 (file)
@@ -63,9 +63,6 @@
 
 #include <linux/atomic.h>
 
-/* css deactivation bias, makes css->refcnt negative to deny new trygets */
-#define CSS_DEACT_BIAS         INT_MIN
-
 /*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -99,16 +96,19 @@ static DEFINE_MUTEX(cgroup_root_mutex);
  */
 #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
 #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
-static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
+static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
 #include <linux/cgroup_subsys.h>
 };
 
 /*
- * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
- * subsystems that are otherwise unattached - it never has more than a
- * single cgroup, and all tasks are part of that cgroup.
+ * The dummy hierarchy, reserved for the subsystems that are otherwise
+ * unattached - it never has more than a single cgroup, and all tasks are
+ * part of that cgroup.
  */
-static struct cgroupfs_root rootnode;
+static struct cgroupfs_root cgroup_dummy_root;
+
+/* cgroup_dummy_top is a shorthand for the dummy hierarchy's top cgroup */
+static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
 
 /*
  * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
@@ -186,18 +186,28 @@ struct cgroup_event {
 
 /* The list of hierarchy roots */
 
-static LIST_HEAD(roots);
-static int root_count;
+static LIST_HEAD(cgroup_roots);
+static int cgroup_root_count;
 
-static DEFINE_IDA(hierarchy_ida);
-static int next_hierarchy_id;
-static DEFINE_SPINLOCK(hierarchy_id_lock);
-
-/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
-#define dummytop (&rootnode.top_cgroup)
+/*
+ * Hierarchy ID allocation and mapping.  It follows the same exclusion
+ * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
+ * writes, either for reads.
+ */
+static DEFINE_IDR(cgroup_hierarchy_idr);
 
 static struct cgroup_name root_cgroup_name = { .name = "/" };
 
+/*
+ * Assign a monotonically increasing serial number to cgroups.  It
+ * guarantees cgroups with bigger numbers are newer than those with smaller
+ * numbers.  Also, as cgroups are always appended to the parent's
+ * ->children list, it guarantees that sibling cgroups are always sorted in
+ * the ascending serial number order on the list.  Protected by
+ * cgroup_mutex.
+ */
+static u64 cgroup_serial_nr_next = 1;
+
 /* This flag indicates whether tasks in the fork and exit paths should
  * check for fork/exit handlers to call. This avoids us having to do
  * extra work in the fork/exit path if none of the subsystems need to
@@ -205,27 +215,15 @@ static struct cgroup_name root_cgroup_name = { .name = "/" };
  */
 static int need_forkexit_callback __read_mostly;
 
+static void cgroup_offline_fn(struct work_struct *work);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
                              struct cftype cfts[], bool is_add);
 
-static int css_unbias_refcnt(int refcnt)
-{
-       return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
-}
-
-/* the current nr of refs, always >= 0 whether @css is deactivated or not */
-static int css_refcnt(struct cgroup_subsys_state *css)
-{
-       int v = atomic_read(&css->refcnt);
-
-       return css_unbias_refcnt(v);
-}
-
 /* convenient tests for these bits */
-inline int cgroup_is_removed(const struct cgroup *cgrp)
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 {
-       return test_bit(CGRP_REMOVED, &cgrp->flags);
+       return test_bit(CGRP_DEAD, &cgrp->flags);
 }
 
 /**
@@ -261,16 +259,38 @@ static int notify_on_release(const struct cgroup *cgrp)
        return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 
-/*
- * for_each_subsys() allows you to iterate on each subsystem attached to
- * an active hierarchy
+/**
+ * for_each_subsys - iterate all loaded cgroup subsystems
+ * @ss: the iteration cursor
+ * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ *
+ * Should be called under cgroup_mutex.
  */
-#define for_each_subsys(_root, _ss) \
-list_for_each_entry(_ss, &_root->subsys_list, sibling)
+#define for_each_subsys(ss, i)                                         \
+       for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++)                 \
+               if (({ lockdep_assert_held(&cgroup_mutex);              \
+                      !((ss) = cgroup_subsys[i]); })) { }              \
+               else
+
+/**
+ * for_each_builtin_subsys - iterate all built-in cgroup subsystems
+ * @ss: the iteration cursor
+ * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
+ *
+ * Built-in subsystems are always present and iteration itself doesn't
+ * require any synchronization.
+ */
+#define for_each_builtin_subsys(ss, i)                                 \
+       for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT &&              \
+            (((ss) = cgroup_subsys[i]) || true); (i)++)
+
+/* iterate each subsystem attached to a hierarchy */
+#define for_each_root_subsys(root, ss)                                 \
+       list_for_each_entry((ss), &(root)->subsys_list, sibling)
 
-/* for_each_active_root() allows you to iterate across the active hierarchies */
-#define for_each_active_root(_root) \
-list_for_each_entry(_root, &roots, root_list)
+/* iterate across the active hierarchies */
+#define for_each_active_root(root)                                     \
+       list_for_each_entry((root), &cgroup_roots, root_list)
 
 static inline struct cgroup *__d_cgrp(struct dentry *dentry)
 {
@@ -297,7 +317,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 static bool cgroup_lock_live_group(struct cgroup *cgrp)
 {
        mutex_lock(&cgroup_mutex);
-       if (cgroup_is_removed(cgrp)) {
+       if (cgroup_is_dead(cgrp)) {
                mutex_unlock(&cgroup_mutex);
                return false;
        }
@@ -312,20 +332,24 @@ static void cgroup_release_agent(struct work_struct *work);
 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
 
-/* Link structure for associating css_set objects with cgroups */
-struct cg_cgroup_link {
-       /*
-        * List running through cg_cgroup_links associated with a
-        * cgroup, anchored on cgroup->css_sets
-        */
-       struct list_head cgrp_link_list;
-       struct cgroup *cgrp;
-       /*
-        * List running through cg_cgroup_links pointing at a
-        * single css_set object, anchored on css_set->cg_links
-        */
-       struct list_head cg_link_list;
-       struct css_set *cg;
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies.  In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+       /* the cgroup and css_set this link associates */
+       struct cgroup           *cgrp;
+       struct css_set          *cset;
+
+       /* list of cgrp_cset_links anchored at cgrp->cset_links */
+       struct list_head        cset_link;
+
+       /* list of cgrp_cset_links anchored at css_set->cgrp_links */
+       struct list_head        cgrp_link;
 };
 
 /* The default css_set - used by init and its children prior to any
@@ -336,7 +360,7 @@ struct cg_cgroup_link {
  */
 
 static struct css_set init_css_set;
-static struct cg_cgroup_link init_css_set_link;
+static struct cgrp_cset_link init_cgrp_cset_link;
 
 static int cgroup_init_idr(struct cgroup_subsys *ss,
                           struct cgroup_subsys_state *css);
@@ -357,10 +381,11 @@ static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 
 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 {
-       int i;
        unsigned long key = 0UL;
+       struct cgroup_subsys *ss;
+       int i;
 
-       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+       for_each_subsys(ss, i)
                key += (unsigned long)css[i];
        key = (key >> 16) ^ key;
 
@@ -373,90 +398,83 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
  * compiled into their kernel but not actually in use */
 static int use_task_css_set_links __read_mostly;
 
-static void __put_css_set(struct css_set *cg, int taskexit)
+static void __put_css_set(struct css_set *cset, int taskexit)
 {
-       struct cg_cgroup_link *link;
-       struct cg_cgroup_link *saved_link;
+       struct cgrp_cset_link *link, *tmp_link;
+
        /*
         * Ensure that the refcount doesn't hit zero while any readers
         * can see it. Similar to atomic_dec_and_lock(), but for an
         * rwlock
         */
-       if (atomic_add_unless(&cg->refcount, -1, 1))
+       if (atomic_add_unless(&cset->refcount, -1, 1))
                return;
        write_lock(&css_set_lock);
-       if (!atomic_dec_and_test(&cg->refcount)) {
+       if (!atomic_dec_and_test(&cset->refcount)) {
                write_unlock(&css_set_lock);
                return;
        }
 
        /* This css_set is dead. unlink it and release cgroup refcounts */
-       hash_del(&cg->hlist);
+       hash_del(&cset->hlist);
        css_set_count--;
 
-       list_for_each_entry_safe(link, saved_link, &cg->cg_links,
-                                cg_link_list) {
+       list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
                struct cgroup *cgrp = link->cgrp;
-               list_del(&link->cg_link_list);
-               list_del(&link->cgrp_link_list);
 
-               /*
-                * We may not be holding cgroup_mutex, and if cgrp->count is
-                * dropped to 0 the cgroup can be destroyed at any time, hence
-                * rcu_read_lock is used to keep it alive.
-                */
-               rcu_read_lock();
-               if (atomic_dec_and_test(&cgrp->count) &&
-                   notify_on_release(cgrp)) {
+               list_del(&link->cset_link);
+               list_del(&link->cgrp_link);
+
+               /* @cgrp can't go away while we're holding css_set_lock */
+               if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
                        if (taskexit)
                                set_bit(CGRP_RELEASABLE, &cgrp->flags);
                        check_for_release(cgrp);
                }
-               rcu_read_unlock();
 
                kfree(link);
        }
 
        write_unlock(&css_set_lock);
-       kfree_rcu(cg, rcu_head);
+       kfree_rcu(cset, rcu_head);
 }
 
 /*
  * refcounted get/put for css_set objects
  */
-static inline void get_css_set(struct css_set *cg)
+static inline void get_css_set(struct css_set *cset)
 {
-       atomic_inc(&cg->refcount);
+       atomic_inc(&cset->refcount);
 }
 
-static inline void put_css_set(struct css_set *cg)
+static inline void put_css_set(struct css_set *cset)
 {
-       __put_css_set(cg, 0);
+       __put_css_set(cset, 0);
 }
 
-static inline void put_css_set_taskexit(struct css_set *cg)
+static inline void put_css_set_taskexit(struct css_set *cset)
 {
-       __put_css_set(cg, 1);
+       __put_css_set(cset, 1);
 }
 
-/*
+/**
  * compare_css_sets - helper function for find_existing_css_set().
- * @cg: candidate css_set being tested
- * @old_cg: existing css_set for a task
+ * @cset: candidate css_set being tested
+ * @old_cset: existing css_set for a task
  * @new_cgrp: cgroup that's being entered by the task
  * @template: desired set of css pointers in css_set (pre-calculated)
  *
  * Returns true if "cg" matches "old_cg" except for the hierarchy
  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
  */
-static bool compare_css_sets(struct css_set *cg,
-                            struct css_set *old_cg,
+static bool compare_css_sets(struct css_set *cset,
+                            struct css_set *old_cset,
                             struct cgroup *new_cgrp,
                             struct cgroup_subsys_state *template[])
 {
        struct list_head *l1, *l2;
 
-       if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
+       if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
                /* Not all subsystems matched */
                return false;
        }
@@ -470,28 +488,28 @@ static bool compare_css_sets(struct css_set *cg,
         * candidates.
         */
 
-       l1 = &cg->cg_links;
-       l2 = &old_cg->cg_links;
+       l1 = &cset->cgrp_links;
+       l2 = &old_cset->cgrp_links;
        while (1) {
-               struct cg_cgroup_link *cgl1, *cgl2;
-               struct cgroup *cg1, *cg2;
+               struct cgrp_cset_link *link1, *link2;
+               struct cgroup *cgrp1, *cgrp2;
 
                l1 = l1->next;
                l2 = l2->next;
                /* See if we reached the end - both lists are equal length. */
-               if (l1 == &cg->cg_links) {
-                       BUG_ON(l2 != &old_cg->cg_links);
+               if (l1 == &cset->cgrp_links) {
+                       BUG_ON(l2 != &old_cset->cgrp_links);
                        break;
                } else {
-                       BUG_ON(l2 == &old_cg->cg_links);
+                       BUG_ON(l2 == &old_cset->cgrp_links);
                }
                /* Locate the cgroups associated with these links. */
-               cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
-               cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
-               cg1 = cgl1->cgrp;
-               cg2 = cgl2->cgrp;
+               link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
+               link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
+               cgrp1 = link1->cgrp;
+               cgrp2 = link2->cgrp;
                /* Hierarchies should be linked in the same order. */
-               BUG_ON(cg1->root != cg2->root);
+               BUG_ON(cgrp1->root != cgrp2->root);
 
                /*
                 * If this hierarchy is the hierarchy of the cgroup
@@ -500,46 +518,39 @@ static bool compare_css_sets(struct css_set *cg,
                 * hierarchy, then this css_set should point to the
                 * same cgroup as the old css_set.
                 */
-               if (cg1->root == new_cgrp->root) {
-                       if (cg1 != new_cgrp)
+               if (cgrp1->root == new_cgrp->root) {
+                       if (cgrp1 != new_cgrp)
                                return false;
                } else {
-                       if (cg1 != cg2)
+                       if (cgrp1 != cgrp2)
                                return false;
                }
        }
        return true;
 }
 
-/*
- * find_existing_css_set() is a helper for
- * find_css_set(), and checks to see whether an existing
- * css_set is suitable.
- *
- * oldcg: the cgroup group that we're using before the cgroup
- * transition
- *
- * cgrp: the cgroup that we're moving into
- *
- * template: location in which to build the desired set of subsystem
- * state objects for the new cgroup group
+/**
+ * find_existing_css_set - init css array and find the matching css_set
+ * @old_cset: the css_set that we're using before the cgroup transition
+ * @cgrp: the cgroup that we're moving into
+ * @template: out param for the new set of csses, should be clear on entry
  */
-static struct css_set *find_existing_css_set(
-       struct css_set *oldcg,
-       struct cgroup *cgrp,
-       struct cgroup_subsys_state *template[])
+static struct css_set *find_existing_css_set(struct css_set *old_cset,
+                                       struct cgroup *cgrp,
+                                       struct cgroup_subsys_state *template[])
 {
-       int i;
        struct cgroupfs_root *root = cgrp->root;
-       struct css_set *cg;
+       struct cgroup_subsys *ss;
+       struct css_set *cset;
        unsigned long key;
+       int i;
 
        /*
         * Build the set of subsystem state objects that we want to see in the
         * new css_set. while subsystems can change globally, the entries here
         * won't change, so no need for locking.
         */
-       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+       for_each_subsys(ss, i) {
                if (root->subsys_mask & (1UL << i)) {
                        /* Subsystem is in this hierarchy. So we want
                         * the subsystem state from the new
@@ -548,148 +559,152 @@ static struct css_set *find_existing_css_set(
                } else {
                        /* Subsystem is not in this hierarchy, so we
                         * don't want to change the subsystem state */
-                       template[i] = oldcg->subsys[i];
+                       template[i] = old_cset->subsys[i];
                }
        }
 
        key = css_set_hash(template);
-       hash_for_each_possible(css_set_table, cg, hlist, key) {
-               if (!compare_css_sets(cg, oldcg, cgrp, template))
+       hash_for_each_possible(css_set_table, cset, hlist, key) {
+               if (!compare_css_sets(cset, old_cset, cgrp, template))
                        continue;
 
                /* This css_set matches what we need */
-               return cg;
+               return cset;
        }
 
        /* No existing cgroup group matched */
        return NULL;
 }
 
-static void free_cg_links(struct list_head *tmp)
+static void free_cgrp_cset_links(struct list_head *links_to_free)
 {
-       struct cg_cgroup_link *link;
-       struct cg_cgroup_link *saved_link;
+       struct cgrp_cset_link *link, *tmp_link;
 
-       list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
-               list_del(&link->cgrp_link_list);
+       list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
+               list_del(&link->cset_link);
                kfree(link);
        }
 }
 
-/*
- * allocate_cg_links() allocates "count" cg_cgroup_link structures
- * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
- * success or a negative error
+/**
+ * allocate_cgrp_cset_links - allocate cgrp_cset_links
+ * @count: the number of links to allocate
+ * @tmp_links: list_head the allocated links are put on
+ *
+ * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
+ * through ->cset_link.  Returns 0 on success or -errno.
  */
-static int allocate_cg_links(int count, struct list_head *tmp)
+static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
 {
-       struct cg_cgroup_link *link;
+       struct cgrp_cset_link *link;
        int i;
-       INIT_LIST_HEAD(tmp);
+
+       INIT_LIST_HEAD(tmp_links);
+
        for (i = 0; i < count; i++) {
-               link = kmalloc(sizeof(*link), GFP_KERNEL);
+               link = kzalloc(sizeof(*link), GFP_KERNEL);
                if (!link) {
-                       free_cg_links(tmp);
+                       free_cgrp_cset_links(tmp_links);
                        return -ENOMEM;
                }
-               list_add(&link->cgrp_link_list, tmp);
+               list_add(&link->cset_link, tmp_links);
        }
        return 0;
 }
 
 /**
  * link_css_set - a helper function to link a css_set to a cgroup
- * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
- * @cg: the css_set to be linked
+ * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
+ * @cset: the css_set to be linked
  * @cgrp: the destination cgroup
  */
-static void link_css_set(struct list_head *tmp_cg_links,
-                        struct css_set *cg, struct cgroup *cgrp)
+static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
+                        struct cgroup *cgrp)
 {
-       struct cg_cgroup_link *link;
+       struct cgrp_cset_link *link;
 
-       BUG_ON(list_empty(tmp_cg_links));
-       link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
-                               cgrp_link_list);
-       link->cg = cg;
+       BUG_ON(list_empty(tmp_links));
+       link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
+       link->cset = cset;
        link->cgrp = cgrp;
-       atomic_inc(&cgrp->count);
-       list_move(&link->cgrp_link_list, &cgrp->css_sets);
+       list_move(&link->cset_link, &cgrp->cset_links);
        /*
         * Always add links to the tail of the list so that the list
         * is sorted by order of hierarchy creation
         */
-       list_add_tail(&link->cg_link_list, &cg->cg_links);
+       list_add_tail(&link->cgrp_link, &cset->cgrp_links);
 }
 
-/*
- * find_css_set() takes an existing cgroup group and a
- * cgroup object, and returns a css_set object that's
- * equivalent to the old group, but with the given cgroup
- * substituted into the appropriate hierarchy. Must be called with
- * cgroup_mutex held
+/**
+ * find_css_set - return a new css_set with one cgroup updated
+ * @old_cset: the baseline css_set
+ * @cgrp: the cgroup to be updated
+ *
+ * Return a new css_set that's equivalent to @old_cset, but with @cgrp
+ * substituted into the appropriate hierarchy.
  */
-static struct css_set *find_css_set(
-       struct css_set *oldcg, struct cgroup *cgrp)
+static struct css_set *find_css_set(struct css_set *old_cset,
+                                   struct cgroup *cgrp)
 {
-       struct css_set *res;
-       struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-
-       struct list_head tmp_cg_links;
-
-       struct cg_cgroup_link *link;
+       struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
+       struct css_set *cset;
+       struct list_head tmp_links;
+       struct cgrp_cset_link *link;
        unsigned long key;
 
+       lockdep_assert_held(&cgroup_mutex);
+
        /* First see if we already have a cgroup group that matches
         * the desired set */
        read_lock(&css_set_lock);
-       res = find_existing_css_set(oldcg, cgrp, template);
-       if (res)
-               get_css_set(res);
+       cset = find_existing_css_set(old_cset, cgrp, template);
+       if (cset)
+               get_css_set(cset);
        read_unlock(&css_set_lock);
 
-       if (res)
-               return res;
+       if (cset)
+               return cset;
 
-       res = kmalloc(sizeof(*res), GFP_KERNEL);
-       if (!res)
+       cset = kzalloc(sizeof(*cset), GFP_KERNEL);
+       if (!cset)
                return NULL;
 
-       /* Allocate all the cg_cgroup_link objects that we'll need */
-       if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
-               kfree(res);
+       /* Allocate all the cgrp_cset_link objects that we'll need */
+       if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
+               kfree(cset);
                return NULL;
        }
 
-       atomic_set(&res->refcount, 1);
-       INIT_LIST_HEAD(&res->cg_links);
-       INIT_LIST_HEAD(&res->tasks);
-       INIT_HLIST_NODE(&res->hlist);
+       atomic_set(&cset->refcount, 1);
+       INIT_LIST_HEAD(&cset->cgrp_links);
+       INIT_LIST_HEAD(&cset->tasks);
+       INIT_HLIST_NODE(&cset->hlist);
 
        /* Copy the set of subsystem state objects generated in
         * find_existing_css_set() */
-       memcpy(res->subsys, template, sizeof(res->subsys));
+       memcpy(cset->subsys, template, sizeof(cset->subsys));
 
        write_lock(&css_set_lock);
        /* Add reference counts and links from the new css_set. */
-       list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
+       list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
                struct cgroup *c = link->cgrp;
+
                if (c->root == cgrp->root)
                        c = cgrp;
-               link_css_set(&tmp_cg_links, res, c);
+               link_css_set(&tmp_links, cset, c);
        }
 
-       BUG_ON(!list_empty(&tmp_cg_links));
+       BUG_ON(!list_empty(&tmp_links));
 
        css_set_count++;
 
        /* Add this cgroup group to the hash table */
-       key = css_set_hash(res->subsys);
-       hash_add(css_set_table, &res->hlist, key);
+       key = css_set_hash(cset->subsys);
+       hash_add(css_set_table, &cset->hlist, key);
 
        write_unlock(&css_set_lock);
 
-       return res;
+       return cset;
 }
 
 /*
@@ -699,7 +714,7 @@ static struct css_set *find_css_set(
 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
                                            struct cgroupfs_root *root)
 {
-       struct css_set *css;
+       struct css_set *cset;
        struct cgroup *res = NULL;
 
        BUG_ON(!mutex_is_locked(&cgroup_mutex));
@@ -709,13 +724,15 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
         * task can't change groups, so the only thing that can happen
         * is that it exits and its css is set back to init_css_set.
         */
-       css = task->cgroups;
-       if (css == &init_css_set) {
+       cset = task_css_set(task);
+       if (cset == &init_css_set) {
                res = &root->top_cgroup;
        } else {
-               struct cg_cgroup_link *link;
-               list_for_each_entry(link, &css->cg_links, cg_link_list) {
+               struct cgrp_cset_link *link;
+
+               list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
                        struct cgroup *c = link->cgrp;
+
                        if (c->root == root) {
                                res = c;
                                break;
@@ -785,7 +802,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  */
 
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
-static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
                               unsigned long subsys_mask);
@@ -828,14 +844,14 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
 
 static void cgroup_free_fn(struct work_struct *work)
 {
-       struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+       struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
        struct cgroup_subsys *ss;
 
        mutex_lock(&cgroup_mutex);
        /*
         * Release the subsystem state objects.
         */
-       for_each_subsys(cgrp->root, ss)
+       for_each_root_subsys(cgrp->root, ss)
                ss->css_free(cgrp);
 
        cgrp->root->number_of_cgroups--;
@@ -873,7 +889,8 @@ static void cgroup_free_rcu(struct rcu_head *head)
 {
        struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 
-       schedule_work(&cgrp->free_work);
+       INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
+       schedule_work(&cgrp->destroy_work);
 }
 
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -882,7 +899,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
        if (S_ISDIR(inode->i_mode)) {
                struct cgroup *cgrp = dentry->d_fsdata;
 
-               BUG_ON(!(cgroup_is_removed(cgrp)));
+               BUG_ON(!(cgroup_is_dead(cgrp)));
                call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
        } else {
                struct cfent *cfe = __d_cfe(dentry);
@@ -950,7 +967,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
        struct cgroup *cgrp = __d_cgrp(dir);
        struct cgroup_subsys *ss;
 
-       for_each_subsys(cgrp->root, ss) {
+       for_each_root_subsys(cgrp->root, ss) {
                struct cftype_set *set;
                if (!test_bit(ss->subsys_id, &subsys_mask))
                        continue;
@@ -988,30 +1005,23 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
  * returns an error, no reference counts are touched.
  */
 static int rebind_subsystems(struct cgroupfs_root *root,
-                             unsigned long final_subsys_mask)
+                            unsigned long added_mask, unsigned removed_mask)
 {
-       unsigned long added_mask, removed_mask;
        struct cgroup *cgrp = &root->top_cgroup;
+       struct cgroup_subsys *ss;
        int i;
 
        BUG_ON(!mutex_is_locked(&cgroup_mutex));
        BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
 
-       removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
-       added_mask = final_subsys_mask & ~root->actual_subsys_mask;
        /* Check that any added subsystems are currently free */
-       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+       for_each_subsys(ss, i) {
                unsigned long bit = 1UL << i;
-               struct cgroup_subsys *ss = subsys[i];
+
                if (!(bit & added_mask))
                        continue;
-               /*
-                * Nobody should tell us to do a subsys that doesn't exist:
-                * parse_cgroupfs_options should catch that case and refcounts
-                * ensure that subsystems won't disappear once selected.
-                */
-               BUG_ON(ss == NULL);
-               if (ss->root != &rootnode) {
+
+               if (ss->root != &cgroup_dummy_root) {
                        /* Subsystem isn't free */
                        return -EBUSY;
                }
@@ -1025,38 +1035,41 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                return -EBUSY;
 
        /* Process each subsystem */
-       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-               struct cgroup_subsys *ss = subsys[i];
+       for_each_subsys(ss, i) {
                unsigned long bit = 1UL << i;
+
                if (bit & added_mask) {
                        /* We're binding this subsystem to this hierarchy */
-                       BUG_ON(ss == NULL);
                        BUG_ON(cgrp->subsys[i]);
-                       BUG_ON(!dummytop->subsys[i]);
-                       BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
-                       cgrp->subsys[i] = dummytop->subsys[i];
+                       BUG_ON(!cgroup_dummy_top->subsys[i]);
+                       BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top);
+
+                       cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
                        cgrp->subsys[i]->cgroup = cgrp;
                        list_move(&ss->sibling, &root->subsys_list);
                        ss->root = root;
                        if (ss->bind)
                                ss->bind(cgrp);
+
                        /* refcount was already taken, and we're keeping it */
+                       root->subsys_mask |= bit;
                } else if (bit & removed_mask) {
                        /* We're removing this subsystem */
-                       BUG_ON(ss == NULL);
-                       BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
+                       BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]);
                        BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+
                        if (ss->bind)
-                               ss->bind(dummytop);
-                       dummytop->subsys[i]->cgroup = dummytop;
+                               ss->bind(cgroup_dummy_top);
+                       cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;
                        cgrp->subsys[i] = NULL;
-                       subsys[i]->root = &rootnode;
-                       list_move(&ss->sibling, &rootnode.subsys_list);
+                       cgroup_subsys[i]->root = &cgroup_dummy_root;
+                       list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
+
                        /* subsystem is now free - drop reference on module */
                        module_put(ss->module);
-               } else if (bit & final_subsys_mask) {
+                       root->subsys_mask &= ~bit;
+               } else if (bit & root->subsys_mask) {
                        /* Subsystem state should already exist */
-                       BUG_ON(ss == NULL);
                        BUG_ON(!cgrp->subsys[i]);
                        /*
                         * a refcount was taken, but we already had one, so
@@ -1071,7 +1084,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                        BUG_ON(cgrp->subsys[i]);
                }
        }
-       root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
+
+       /*
+        * Mark that @root has finished binding subsystems.  @root->subsys_mask
+        * now matches the bound subsystems.
+        */
+       root->flags |= CGRP_ROOT_SUBSYS_BOUND;
 
        return 0;
 }
@@ -1082,7 +1100,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
        struct cgroup_subsys *ss;
 
        mutex_lock(&cgroup_root_mutex);
-       for_each_subsys(root, ss)
+       for_each_root_subsys(root, ss)
                seq_printf(seq, ",%s", ss->name);
        if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
                seq_puts(seq, ",sane_behavior");
@@ -1114,18 +1132,19 @@ struct cgroup_sb_opts {
 };
 
 /*
- * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
- * with cgroup_mutex held to protect the subsys[] array. This function takes
- * refcounts on subsystems to be used, unless it returns error, in which case
- * no refcounts are taken.
+ * Convert a hierarchy specifier into a bitmask of subsystems and
+ * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
+ * array. This function takes refcounts on subsystems to be used, unless it
+ * returns error, in which case no refcounts are taken.
  */
 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
        char *token, *o = data;
        bool all_ss = false, one_ss = false;
        unsigned long mask = (unsigned long)-1;
-       int i;
        bool module_pin_failed = false;
+       struct cgroup_subsys *ss;
+       int i;
 
        BUG_ON(!mutex_is_locked(&cgroup_mutex));
 
@@ -1202,10 +1221,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
                        continue;
                }
 
-               for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-                       struct cgroup_subsys *ss = subsys[i];
-                       if (ss == NULL)
-                               continue;
+               for_each_subsys(ss, i) {
                        if (strcmp(token, ss->name))
                                continue;
                        if (ss->disabled)
@@ -1228,16 +1244,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
         * otherwise if 'none', 'name=' and a subsystem name options
         * were not specified, let's default to 'all'
         */
-       if (all_ss || (!one_ss && !opts->none && !opts->name)) {
-               for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-                       struct cgroup_subsys *ss = subsys[i];
-                       if (ss == NULL)
-                               continue;
-                       if (ss->disabled)
-                               continue;
-                       set_bit(i, &opts->subsys_mask);
-               }
-       }
+       if (all_ss || (!one_ss && !opts->none && !opts->name))
+               for_each_subsys(ss, i)
+                       if (!ss->disabled)
+                               set_bit(i, &opts->subsys_mask);
 
        /* Consistency checks */
 
@@ -1281,12 +1291,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
         * take duplicate reference counts on a subsystem that's already used,
         * but rebind_subsystems handles this case.
         */
-       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-               unsigned long bit = 1UL << i;
-
-               if (!(bit & opts->subsys_mask))
+       for_each_subsys(ss, i) {
+               if (!(opts->subsys_mask & (1UL << i)))
                        continue;
-               if (!try_module_get(subsys[i]->module)) {
+               if (!try_module_get(cgroup_subsys[i]->module)) {
                        module_pin_failed = true;
                        break;
                }
@@ -1303,7 +1311,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 
                        if (!(bit & opts->subsys_mask))
                                continue;
-                       module_put(subsys[i]->module);
+                       module_put(cgroup_subsys[i]->module);
                }
                return -ENOENT;
        }
@@ -1313,14 +1321,14 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 
 static void drop_parsed_module_refcounts(unsigned long subsys_mask)
 {
+       struct cgroup_subsys *ss;
        int i;
-       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-               unsigned long bit = 1UL << i;
 
-               if (!(bit & subsys_mask))
-                       continue;
-               module_put(subsys[i]->module);
-       }
+       mutex_lock(&cgroup_mutex);
+       for_each_subsys(ss, i)
+               if (subsys_mask & (1UL << i))
+                       module_put(cgroup_subsys[i]->module);
+       mutex_unlock(&cgroup_mutex);
 }
 
 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
@@ -1345,7 +1353,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
        if (ret)
                goto out_unlock;
 
-       if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
+       if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
                pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
                           task_tgid_nr(current), current->comm);
 
@@ -1353,10 +1361,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
        removed_mask = root->subsys_mask & ~opts.subsys_mask;
 
        /* Don't allow flags or name to change at remount */
-       if (opts.flags != root->flags ||
+       if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
            (opts.name && strcmp(opts.name, root->name))) {
+               pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
+                      opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
+                      root->flags & CGRP_ROOT_OPTION_MASK, root->name);
                ret = -EINVAL;
-               drop_parsed_module_refcounts(opts.subsys_mask);
                goto out_unlock;
        }
 
@@ -1367,11 +1377,10 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
         */
        cgroup_clear_directory(cgrp->dentry, false, removed_mask);
 
-       ret = rebind_subsystems(root, opts.subsys_mask);
+       ret = rebind_subsystems(root, added_mask, removed_mask);
        if (ret) {
                /* rebind_subsystems failed, re-populate the removed files */
                cgroup_populate_dir(cgrp, false, removed_mask);
-               drop_parsed_module_refcounts(opts.subsys_mask);
                goto out_unlock;
        }
 
@@ -1386,6 +1395,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
        mutex_unlock(&cgroup_root_mutex);
        mutex_unlock(&cgroup_mutex);
        mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+       if (ret)
+               drop_parsed_module_refcounts(opts.subsys_mask);
        return ret;
 }
 
@@ -1401,11 +1412,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
        INIT_LIST_HEAD(&cgrp->sibling);
        INIT_LIST_HEAD(&cgrp->children);
        INIT_LIST_HEAD(&cgrp->files);
-       INIT_LIST_HEAD(&cgrp->css_sets);
-       INIT_LIST_HEAD(&cgrp->allcg_node);
+       INIT_LIST_HEAD(&cgrp->cset_links);
        INIT_LIST_HEAD(&cgrp->release_list);
        INIT_LIST_HEAD(&cgrp->pidlists);
-       INIT_WORK(&cgrp->free_work, cgroup_free_fn);
        mutex_init(&cgrp->pidlist_mutex);
        INIT_LIST_HEAD(&cgrp->event_list);
        spin_lock_init(&cgrp->event_list_lock);
@@ -1418,37 +1427,37 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 
        INIT_LIST_HEAD(&root->subsys_list);
        INIT_LIST_HEAD(&root->root_list);
-       INIT_LIST_HEAD(&root->allcg_list);
        root->number_of_cgroups = 1;
        cgrp->root = root;
-       cgrp->name = &root_cgroup_name;
+       RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
        init_cgroup_housekeeping(cgrp);
-       list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 }
 
-static bool init_root_id(struct cgroupfs_root *root)
+static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
 {
-       int ret = 0;
+       int id;
 
-       do {
-               if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
-                       return false;
-               spin_lock(&hierarchy_id_lock);
-               /* Try to allocate the next unused ID */
-               ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
-                                       &root->hierarchy_id);
-               if (ret == -ENOSPC)
-                       /* Try again starting from 0 */
-                       ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
-               if (!ret) {
-                       next_hierarchy_id = root->hierarchy_id + 1;
-               } else if (ret != -EAGAIN) {
-                       /* Can only get here if the 31-bit IDR is full ... */
-                       BUG_ON(ret);
-               }
-               spin_unlock(&hierarchy_id_lock);
-       } while (ret);
-       return true;
+       lockdep_assert_held(&cgroup_mutex);
+       lockdep_assert_held(&cgroup_root_mutex);
+
+       id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
+                             GFP_KERNEL);
+       if (id < 0)
+               return id;
+
+       root->hierarchy_id = id;
+       return 0;
+}
+
+static void cgroup_exit_root_id(struct cgroupfs_root *root)
+{
+       lockdep_assert_held(&cgroup_mutex);
+       lockdep_assert_held(&cgroup_root_mutex);
+
+       if (root->hierarchy_id) {
+               idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
+               root->hierarchy_id = 0;
+       }
 }
 
 static int cgroup_test_super(struct super_block *sb, void *data)
@@ -1482,12 +1491,16 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
        if (!root)
                return ERR_PTR(-ENOMEM);
 
-       if (!init_root_id(root)) {
-               kfree(root);
-               return ERR_PTR(-ENOMEM);
-       }
        init_cgroup_root(root);
 
+       /*
+        * We need to set @root->subsys_mask now so that @root can be
+        * matched by cgroup_test_super() before it finishes
+        * initialization; otherwise, competing mounts with the same
+        * options may try to bind the same subsystems instead of waiting
+        * for the first one, leading to unexpected mount errors.
+        * SUBSYS_BOUND will be set once actual binding is complete.
+        */
        root->subsys_mask = opts->subsys_mask;
        root->flags = opts->flags;
        ida_init(&root->cgroup_ida);
@@ -1500,17 +1513,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
        return root;
 }
 
-static void cgroup_drop_root(struct cgroupfs_root *root)
+static void cgroup_free_root(struct cgroupfs_root *root)
 {
-       if (!root)
-               return;
+       if (root) {
+               /* hierarchy ID should already have been released */
+               WARN_ON_ONCE(root->hierarchy_id);
 
-       BUG_ON(!root->hierarchy_id);
-       spin_lock(&hierarchy_id_lock);
-       ida_remove(&hierarchy_ida, root->hierarchy_id);
-       spin_unlock(&hierarchy_id_lock);
-       ida_destroy(&root->cgroup_ida);
-       kfree(root);
+               ida_destroy(&root->cgroup_ida);
+               kfree(root);
+       }
 }
 
 static int cgroup_set_super(struct super_block *sb, void *data)
@@ -1597,7 +1608,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
        sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
        if (IS_ERR(sb)) {
                ret = PTR_ERR(sb);
-               cgroup_drop_root(opts.new_root);
+               cgroup_free_root(opts.new_root);
                goto drop_modules;
        }
 
@@ -1605,12 +1616,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
        BUG_ON(!root);
        if (root == opts.new_root) {
                /* We used the new root structure, so this is a new hierarchy */
-               struct list_head tmp_cg_links;
+               struct list_head tmp_links;
                struct cgroup *root_cgrp = &root->top_cgroup;
                struct cgroupfs_root *existing_root;
                const struct cred *cred;
                int i;
-               struct css_set *cg;
+               struct css_set *cset;
 
                BUG_ON(sb->s_root != NULL);
 
@@ -1637,13 +1648,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 * that's us. The worst that can happen is that we
                 * have some link structures left over
                 */
-               ret = allocate_cg_links(css_set_count, &tmp_cg_links);
+               ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
+               if (ret)
+                       goto unlock_drop;
+
+               /* ID 0 is reserved for dummy root, 1 for unified hierarchy */
+               ret = cgroup_init_root_id(root, 2, 0);
                if (ret)
                        goto unlock_drop;
 
-               ret = rebind_subsystems(root, root->subsys_mask);
+               ret = rebind_subsystems(root, root->subsys_mask, 0);
                if (ret == -EBUSY) {
-                       free_cg_links(&tmp_cg_links);
+                       free_cgrp_cset_links(&tmp_links);
                        goto unlock_drop;
                }
                /*
@@ -1655,8 +1671,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                /* EBUSY should be the only error here */
                BUG_ON(ret);
 
-               list_add(&root->root_list, &roots);
-               root_count++;
+               list_add(&root->root_list, &cgroup_roots);
+               cgroup_root_count++;
 
                sb->s_root->d_fsdata = root_cgrp;
                root->top_cgroup.dentry = sb->s_root;
@@ -1664,11 +1680,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                /* Link the top cgroup in this hierarchy into all
                 * the css_set objects */
                write_lock(&css_set_lock);
-               hash_for_each(css_set_table, i, cg, hlist)
-                       link_css_set(&tmp_cg_links, cg, root_cgrp);
+               hash_for_each(css_set_table, i, cset, hlist)
+                       link_css_set(&tmp_links, cset, root_cgrp);
                write_unlock(&css_set_lock);
 
-               free_cg_links(&tmp_cg_links);
+               free_cgrp_cset_links(&tmp_links);
 
                BUG_ON(!list_empty(&root_cgrp->children));
                BUG_ON(root->number_of_cgroups != 1);
@@ -1684,9 +1700,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 * We re-used an existing hierarchy - the new root (if
                 * any) is not needed
                 */
-               cgroup_drop_root(opts.new_root);
+               cgroup_free_root(opts.new_root);
 
-               if (root->flags != opts.flags) {
+               if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
                        if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
                                pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
                                ret = -EINVAL;
@@ -1705,6 +1721,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
        return dget(sb->s_root);
 
  unlock_drop:
+       cgroup_exit_root_id(root);
        mutex_unlock(&cgroup_root_mutex);
        mutex_unlock(&cgroup_mutex);
        mutex_unlock(&inode->i_mutex);
@@ -1721,9 +1738,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 static void cgroup_kill_sb(struct super_block *sb) {
        struct cgroupfs_root *root = sb->s_fs_info;
        struct cgroup *cgrp = &root->top_cgroup;
+       struct cgrp_cset_link *link, *tmp_link;
        int ret;
-       struct cg_cgroup_link *link;
-       struct cg_cgroup_link *saved_link;
 
        BUG_ON(!root);
 
@@ -1734,36 +1750,39 @@ static void cgroup_kill_sb(struct super_block *sb) {
        mutex_lock(&cgroup_root_mutex);
 
        /* Rebind all subsystems back to the default hierarchy */
-       ret = rebind_subsystems(root, 0);
-       /* Shouldn't be able to fail ... */
-       BUG_ON(ret);
+       if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
+               ret = rebind_subsystems(root, 0, root->subsys_mask);
+               /* Shouldn't be able to fail ... */
+               BUG_ON(ret);
+       }
 
        /*
-        * Release all the links from css_sets to this hierarchy's
+        * Release all the cgrp_cset_links attached to this hierarchy's
         * root cgroup
         */
        write_lock(&css_set_lock);
 
-       list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
-                                cgrp_link_list) {
-               list_del(&link->cg_link_list);
-               list_del(&link->cgrp_link_list);
+       list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
+               list_del(&link->cset_link);
+               list_del(&link->cgrp_link);
                kfree(link);
        }
        write_unlock(&css_set_lock);
 
        if (!list_empty(&root->root_list)) {
                list_del(&root->root_list);
-               root_count--;
+               cgroup_root_count--;
        }
 
+       cgroup_exit_root_id(root);
+
        mutex_unlock(&cgroup_root_mutex);
        mutex_unlock(&cgroup_mutex);
 
        simple_xattrs_free(&cgrp->xattrs);
 
        kill_litter_super(sb);
-       cgroup_drop_root(root);
+       cgroup_free_root(root);
 }
 
 static struct file_system_type cgroup_fs_type = {
@@ -1825,6 +1844,38 @@ out:
 }
 EXPORT_SYMBOL_GPL(cgroup_path);
 
+/**
+ * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy
+ * @task: target task
+ * @hierarchy_id: the hierarchy to look up @task's cgroup from
+ * @buf: the buffer to write the path into
+ * @buflen: the length of the buffer
+ *
+ * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and
+ * copy its path into @buf.  This function grabs cgroup_mutex and shouldn't
+ * be used inside locks used by cgroup controller callbacks.
+ */
+int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id,
+                                   char *buf, size_t buflen)
+{
+       struct cgroupfs_root *root;
+       struct cgroup *cgrp = NULL;
+       int ret = -ENOENT;
+
+       mutex_lock(&cgroup_mutex);
+
+       root = idr_find(&cgroup_hierarchy_idr, hierarchy_id);
+       if (root) {
+               cgrp = task_cgroup_from_root(task, root);
+               ret = cgroup_path(cgrp, buf, buflen);
+       }
+
+       mutex_unlock(&cgroup_mutex);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy);
+
 /*
  * Control Group taskset
  */
@@ -1910,10 +1961,11 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
  *
  * Must be called with cgroup_mutex and threadgroup locked.
  */
-static void cgroup_task_migrate(struct cgroup *oldcgrp,
-                               struct task_struct *tsk, struct css_set *newcg)
+static void cgroup_task_migrate(struct cgroup *old_cgrp,
+                               struct task_struct *tsk,
+                               struct css_set *new_cset)
 {
-       struct css_set *oldcg;
+       struct css_set *old_cset;
 
        /*
         * We are synchronized through threadgroup_lock() against PF_EXITING
@@ -1921,25 +1973,25 @@ static void cgroup_task_migrate(struct cgroup *oldcgrp,
         * css_set to init_css_set and dropping the old one.
         */
        WARN_ON_ONCE(tsk->flags & PF_EXITING);
-       oldcg = tsk->cgroups;
+       old_cset = task_css_set(tsk);
 
        task_lock(tsk);
-       rcu_assign_pointer(tsk->cgroups, newcg);
+       rcu_assign_pointer(tsk->cgroups, new_cset);
        task_unlock(tsk);
 
        /* Update the css_set linked lists if we're using them */
        write_lock(&css_set_lock);
        if (!list_empty(&tsk->cg_list))
-               list_move(&tsk->cg_list, &newcg->tasks);
+               list_move(&tsk->cg_list, &new_cset->tasks);
        write_unlock(&css_set_lock);
 
        /*
-        * We just gained a reference on oldcg by taking it from the task. As
-        * trading it for newcg is protected by cgroup_mutex, we're safe to drop
-        * it here; it will be freed under RCU.
+        * We just gained a reference on old_cset by taking it from the
+        * task. As trading it for new_cset is protected by cgroup_mutex,
+        * we're safe to drop it here; it will be freed under RCU.
         */
-       set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
-       put_css_set(oldcg);
+       set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
+       put_css_set(old_cset);
 }
 
 /**
@@ -2029,7 +2081,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
        /*
         * step 1: check that we can legitimately attach to the cgroup.
         */
-       for_each_subsys(root, ss) {
+       for_each_root_subsys(root, ss) {
                if (ss->can_attach) {
                        retval = ss->can_attach(cgrp, &tset);
                        if (retval) {
@@ -2044,8 +2096,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
         * we use find_css_set, which allocates a new one if necessary.
         */
        for (i = 0; i < group_size; i++) {
+               struct css_set *old_cset;
+
                tc = flex_array_get(group, i);
-               tc->cg = find_css_set(tc->task->cgroups, cgrp);
+               old_cset = task_css_set(tc->task);
+               tc->cg = find_css_set(old_cset, cgrp);
                if (!tc->cg) {
                        retval = -ENOMEM;
                        goto out_put_css_set_refs;
@@ -2066,7 +2121,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
        /*
         * step 4: do subsystem attach callbacks.
         */
-       for_each_subsys(root, ss) {
+       for_each_root_subsys(root, ss) {
                if (ss->attach)
                        ss->attach(cgrp, &tset);
        }
@@ -2086,7 +2141,7 @@ out_put_css_set_refs:
        }
 out_cancel_attach:
        if (retval) {
-               for_each_subsys(root, ss) {
+               for_each_root_subsys(root, ss) {
                        if (ss == failed_ss)
                                break;
                        if (ss->cancel_attach)
@@ -2323,7 +2378,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
        struct cftype *cft = __d_cft(file->f_dentry);
        struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 
-       if (cgroup_is_removed(cgrp))
+       if (cgroup_is_dead(cgrp))
                return -ENODEV;
        if (cft->write)
                return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -2368,7 +2423,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
        struct cftype *cft = __d_cft(file->f_dentry);
        struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 
-       if (cgroup_is_removed(cgrp))
+       if (cgroup_is_dead(cgrp))
                return -ENODEV;
 
        if (cft->read)
@@ -2435,10 +2490,12 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
        cft = __d_cft(file->f_dentry);
 
        if (cft->read_map || cft->read_seq_string) {
-               struct cgroup_seqfile_state *state =
-                       kzalloc(sizeof(*state), GFP_USER);
+               struct cgroup_seqfile_state *state;
+
+               state = kzalloc(sizeof(*state), GFP_USER);
                if (!state)
                        return -ENOMEM;
+
                state->cft = cft;
                state->cgroup = __d_cgrp(file->f_dentry->d_parent);
                file->f_op = &cgroup_seqfile_operations;
@@ -2486,6 +2543,13 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
 
        cgrp = __d_cgrp(old_dentry);
 
+       /*
+        * This isn't a proper migration and its usefulness is very
+        * limited.  Disallow if sane_behavior.
+        */
+       if (cgroup_sane_behavior(cgrp))
+               return -EPERM;
+
        name = cgroup_alloc_name(new_dentry);
        if (!name)
                return -ENOMEM;
@@ -2496,7 +2560,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
                return ret;
        }
 
-       old_name = cgrp->name;
+       old_name = rcu_dereference_protected(cgrp->name, true);
        rcu_assign_pointer(cgrp->name, name);
 
        kfree_rcu(old_name, rcu_head);
@@ -2577,7 +2641,7 @@ static const struct inode_operations cgroup_file_inode_operations = {
 };
 
 static const struct inode_operations cgroup_dir_inode_operations = {
-       .lookup = cgroup_lookup,
+       .lookup = simple_lookup,
        .mkdir = cgroup_mkdir,
        .rmdir = cgroup_rmdir,
        .rename = cgroup_rename,
@@ -2587,14 +2651,6 @@ static const struct inode_operations cgroup_dir_inode_operations = {
        .removexattr = cgroup_removexattr,
 };
 
-static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
-{
-       if (dentry->d_name.len > NAME_MAX)
-               return ERR_PTR(-ENAMETOOLONG);
-       d_add(dentry, NULL);
-       return NULL;
-}
-
 /*
  * Check if a file is a control file
  */
@@ -2747,58 +2803,78 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
        return ret;
 }
 
-static DEFINE_MUTEX(cgroup_cft_mutex);
-
 static void cgroup_cfts_prepare(void)
-       __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
+       __acquires(&cgroup_mutex)
 {
        /*
         * Thanks to the entanglement with vfs inode locking, we can't walk
         * the existing cgroups under cgroup_mutex and create files.
-        * Instead, we increment reference on all cgroups and build list of
-        * them using @cgrp->cft_q_node.  Grab cgroup_cft_mutex to ensure
-        * exclusive access to the field.
+        * Instead, we use cgroup_for_each_descendant_pre() and drop RCU
+        * read lock before calling cgroup_addrm_files().
         */
-       mutex_lock(&cgroup_cft_mutex);
        mutex_lock(&cgroup_mutex);
 }
 
 static void cgroup_cfts_commit(struct cgroup_subsys *ss,
                               struct cftype *cfts, bool is_add)
-       __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
+       __releases(&cgroup_mutex)
 {
        LIST_HEAD(pending);
-       struct cgroup *cgrp, *n;
+       struct cgroup *cgrp, *root = &ss->root->top_cgroup;
+       struct super_block *sb = ss->root->sb;
+       struct dentry *prev = NULL;
+       struct inode *inode;
+       u64 update_before;
 
        /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
-       if (cfts && ss->root != &rootnode) {
-               list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
-                       dget(cgrp->dentry);
-                       list_add_tail(&cgrp->cft_q_node, &pending);
-               }
+       if (!cfts || ss->root == &cgroup_dummy_root ||
+           !atomic_inc_not_zero(&sb->s_active)) {
+               mutex_unlock(&cgroup_mutex);
+               return;
        }
 
-       mutex_unlock(&cgroup_mutex);
-
        /*
-        * All new cgroups will see @cfts update on @ss->cftsets.  Add/rm
-        * files for all cgroups which were created before.
+        * All cgroups which are created after we drop cgroup_mutex will
+        * have the updated set of files, so we only need to update the
+        * cgroups created before the current @cgroup_serial_nr_next.
         */
-       list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
-               struct inode *inode = cgrp->dentry->d_inode;
+       update_before = cgroup_serial_nr_next;
+
+       mutex_unlock(&cgroup_mutex);
+
+       /* @root always needs to be updated */
+       inode = root->dentry->d_inode;
+       mutex_lock(&inode->i_mutex);
+       mutex_lock(&cgroup_mutex);
+       cgroup_addrm_files(root, ss, cfts, is_add);
+       mutex_unlock(&cgroup_mutex);
+       mutex_unlock(&inode->i_mutex);
+
+       /* add/rm files for all cgroups created before */
+       rcu_read_lock();
+       cgroup_for_each_descendant_pre(cgrp, root) {
+               if (cgroup_is_dead(cgrp))
+                       continue;
+
+               inode = cgrp->dentry->d_inode;
+               dget(cgrp->dentry);
+               rcu_read_unlock();
+
+               dput(prev);
+               prev = cgrp->dentry;
 
                mutex_lock(&inode->i_mutex);
                mutex_lock(&cgroup_mutex);
-               if (!cgroup_is_removed(cgrp))
+               if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
                        cgroup_addrm_files(cgrp, ss, cfts, is_add);
                mutex_unlock(&cgroup_mutex);
                mutex_unlock(&inode->i_mutex);
 
-               list_del_init(&cgrp->cft_q_node);
-               dput(cgrp->dentry);
+               rcu_read_lock();
        }
-
-       mutex_unlock(&cgroup_cft_mutex);
+       rcu_read_unlock();
+       dput(prev);
+       deactivate_super(sb);
 }
 
 /**
@@ -2853,7 +2929,8 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 
        list_for_each_entry(set, &ss->cftsets, node) {
                if (set->cfts == cfts) {
-                       list_del_init(&set->node);
+                       list_del(&set->node);
+                       kfree(set);
                        cgroup_cfts_commit(ss, cfts, false);
                        return 0;
                }
@@ -2872,12 +2949,11 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 int cgroup_task_count(const struct cgroup *cgrp)
 {
        int count = 0;
-       struct cg_cgroup_link *link;
+       struct cgrp_cset_link *link;
 
        read_lock(&css_set_lock);
-       list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
-               count += atomic_read(&link->cg->refcount);
-       }
+       list_for_each_entry(link, &cgrp->cset_links, cset_link)
+               count += atomic_read(&link->cset->refcount);
        read_unlock(&css_set_lock);
        return count;
 }
@@ -2886,25 +2962,24 @@ int cgroup_task_count(const struct cgroup *cgrp)
  * Advance a list_head iterator.  The iterator should be positioned at
  * the start of a css_set
  */
-static void cgroup_advance_iter(struct cgroup *cgrp,
-                               struct cgroup_iter *it)
+static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it)
 {
-       struct list_head *l = it->cg_link;
-       struct cg_cgroup_link *link;
-       struct css_set *cg;
+       struct list_head *l = it->cset_link;
+       struct cgrp_cset_link *link;
+       struct css_set *cset;
 
        /* Advance to the next non-empty css_set */
        do {
                l = l->next;
-               if (l == &cgrp->css_sets) {
-                       it->cg_link = NULL;
+               if (l == &cgrp->cset_links) {
+                       it->cset_link = NULL;
                        return;
                }
-               link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
-               cg = link->cg;
-       } while (list_empty(&cg->tasks));
-       it->cg_link = l;
-       it->task = cg->tasks.next;
+               link = list_entry(l, struct cgrp_cset_link, cset_link);
+               cset = link->cset;
+       } while (list_empty(&cset->tasks));
+       it->cset_link = l;
+       it->task = cset->tasks.next;
 }
 
 /*
@@ -2934,13 +3009,63 @@ static void cgroup_enable_task_cg_lists(void)
                 * entry won't be deleted though the process has exited.
                 */
                if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
-                       list_add(&p->cg_list, &p->cgroups->tasks);
+                       list_add(&p->cg_list, &task_css_set(p)->tasks);
                task_unlock(p);
        } while_each_thread(g, p);
        read_unlock(&tasklist_lock);
        write_unlock(&css_set_lock);
 }
 
+/**
+ * cgroup_next_sibling - find the next sibling of a given cgroup
+ * @pos: the current cgroup
+ *
+ * This function returns the next sibling of @pos and should be called
+ * under RCU read lock.  The only requirement is that @pos is accessible.
+ * The next sibling is guaranteed to be returned regardless of @pos's
+ * state.
+ */
+struct cgroup *cgroup_next_sibling(struct cgroup *pos)
+{
+       struct cgroup *next;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       /*
+        * @pos could already have been removed.  Once a cgroup is removed,
+        * its ->sibling.next is no longer updated when its next sibling
+        * changes.  As CGRP_DEAD assertion is serialized and happens
+        * before the cgroup is taken off the ->sibling list, if we see it
+        * unasserted, it's guaranteed that the next sibling hasn't
+        * finished its grace period even if it's already removed, and thus
+        * safe to dereference from this RCU critical section.  If
+        * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
+        * to be visible as %true here.
+        */
+       if (likely(!cgroup_is_dead(pos))) {
+               next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
+               if (&next->sibling != &pos->parent->children)
+                       return next;
+               return NULL;
+       }
+
+       /*
+        * Can't dereference the next pointer.  Each cgroup is given a
+        * monotonically increasing unique serial number and always
+        * appended to the sibling list, so the next one can be found by
+        * walking the parent's children until we see a cgroup with higher
+        * serial number than @pos's.
+        *
+        * While this path can be slow, it's taken only when either the
+        * current cgroup is removed or iteration and removal race.
+        */
+       list_for_each_entry_rcu(next, &pos->parent->children, sibling)
+               if (next->serial_nr > pos->serial_nr)
+                       return next;
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_sibling);
+
 /**
  * cgroup_next_descendant_pre - find the next descendant for pre-order walk
  * @pos: the current position (%NULL to initiate traversal)
@@ -2948,6 +3073,11 @@ static void cgroup_enable_task_cg_lists(void)
  *
  * To be used by cgroup_for_each_descendant_pre().  Find the next
  * descendant to visit for pre-order traversal of @cgroup's descendants.
+ *
+ * While this function requires RCU read locking, it doesn't require the
+ * whole traversal to be contained in a single RCU critical section.  This
+ * function will return the correct next descendant as long as both @pos
+ * and @cgroup are accessible and @pos is a descendant of @cgroup.
  */
 struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
                                          struct cgroup *cgroup)
@@ -2967,11 +3097,9 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 
        /* no child, visit my or the closest ancestor's next sibling */
        while (pos != cgroup) {
-               next = list_entry_rcu(pos->sibling.next, struct cgroup,
-                                     sibling);
-               if (&next->sibling != &pos->parent->children)
+               next = cgroup_next_sibling(pos);
+               if (next)
                        return next;
-
                pos = pos->parent;
        }
 
@@ -2986,6 +3114,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
  * Return the rightmost descendant of @pos.  If there's no descendant,
  * @pos is returned.  This can be used during pre-order traversal to skip
  * subtree of @pos.
+ *
+ * While this function requires RCU read locking, it doesn't require the
+ * whole traversal to be contained in a single RCU critical section.  This
+ * function will return the correct rightmost descendant as long as @pos is
+ * accessible.
  */
 struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
 {
@@ -3025,6 +3158,11 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
  *
  * To be used by cgroup_for_each_descendant_post().  Find the next
  * descendant to visit for post-order traversal of @cgroup's descendants.
+ *
+ * While this function requires RCU read locking, it doesn't require the
+ * whole traversal to be contained in a single RCU critical section.  This
+ * function will return the correct next descendant as long as both @pos
+ * and @cgroup are accessible and @pos is a descendant of @cgroup.
  */
 struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
                                           struct cgroup *cgroup)
@@ -3040,8 +3178,8 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
        }
 
        /* if there's an unvisited sibling, visit its leftmost descendant */
-       next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
-       if (&next->sibling != &pos->parent->children)
+       next = cgroup_next_sibling(pos);
+       if (next)
                return cgroup_leftmost_descendant(next);
 
        /* no sibling left, visit parent */
@@ -3062,7 +3200,7 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
                cgroup_enable_task_cg_lists();
 
        read_lock(&css_set_lock);
-       it->cg_link = &cgrp->css_sets;
+       it->cset_link = &cgrp->cset_links;
        cgroup_advance_iter(cgrp, it);
 }
 
@@ -3071,16 +3209,16 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
 {
        struct task_struct *res;
        struct list_head *l = it->task;
-       struct cg_cgroup_link *link;
+       struct cgrp_cset_link *link;
 
        /* If the iterator cg is NULL, we have no tasks */
-       if (!it->cg_link)
+       if (!it->cset_link)
                return NULL;
        res = list_entry(l, struct task_struct, cg_list);
        /* Advance iterator to find next entry */
        l = l->next;
-       link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
-       if (l == &link->cg->tasks) {
+       link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
+       if (l == &link->cset->tasks) {
                /* We reached the end of this task list - move on to
                 * the next cg_cgroup_link */
                cgroup_advance_iter(cgrp, it);
@@ -3411,7 +3549,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
                }
        }
        /* entry not found; create a new one */
-       l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+       l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
        if (!l) {
                mutex_unlock(&cgrp->pidlist_mutex);
                return l;
@@ -3420,8 +3558,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
        down_write(&l->mutex);
        l->key.type = type;
        l->key.ns = get_pid_ns(ns);
-       l->use_count = 0; /* don't increment here */
-       l->list = NULL;
        l->owner = cgrp;
        list_add(&l->links, &cgrp->pidlists);
        mutex_unlock(&cgrp->pidlist_mutex);
@@ -3726,6 +3862,23 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
        return 0;
 }
 
+/*
+ * When dput() is called asynchronously, if umount has been done and
+ * then deactivate_super() in cgroup_free_fn() kills the superblock,
+ * there's a small window in which vfs sees the root dentry with non-zero
+ * refcnt and triggers BUG().
+ *
+ * That's why we hold an s_active reference across dput() and drop it after.
+ */
+static void cgroup_dput(struct cgroup *cgrp)
+{
+       struct super_block *sb = cgrp->root->sb;
+
+       atomic_inc(&sb->s_active);
+       dput(cgrp->dentry);
+       deactivate_super(sb);
+}
+
 /*
  * Unregister event and free resources.
  *
@@ -3746,7 +3899,7 @@ static void cgroup_event_remove(struct work_struct *work)
 
        eventfd_ctx_put(event->eventfd);
        kfree(event);
-       dput(cgrp->dentry);
+       cgroup_dput(cgrp);
 }
 
 /*
@@ -3933,33 +4086,16 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
        return 0;
 }
 
-/*
- * for the common functions, 'private' gives the type of file
- */
-/* for hysterical raisins, we can't put this on the older files */
-#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
-static struct cftype files[] = {
+static struct cftype cgroup_base_files[] = {
        {
-               .name = "tasks",
-               .open = cgroup_tasks_open,
-               .write_u64 = cgroup_tasks_write,
-               .release = cgroup_pidlist_release,
-               .mode = S_IRUGO | S_IWUSR,
-       },
-       {
-               .name = CGROUP_FILE_GENERIC_PREFIX "procs",
+               .name = "cgroup.procs",
                .open = cgroup_procs_open,
                .write_u64 = cgroup_procs_write,
                .release = cgroup_pidlist_release,
                .mode = S_IRUGO | S_IWUSR,
        },
        {
-               .name = "notify_on_release",
-               .read_u64 = cgroup_read_notify_on_release,
-               .write_u64 = cgroup_write_notify_on_release,
-       },
-       {
-               .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
+               .name = "cgroup.event_control",
                .write_string = cgroup_write_event_control,
                .mode = S_IWUGO,
        },
@@ -3974,9 +4110,29 @@ static struct cftype files[] = {
                .flags = CFTYPE_ONLY_ON_ROOT,
                .read_seq_string = cgroup_sane_behavior_show,
        },
+
+       /*
+        * Historical crazy stuff.  These don't have the "cgroup." prefix and
+        * don't exist if sane_behavior.  If you're depending on these, be
+        * prepared to be burned.
+        */
+       {
+               .name = "tasks",
+               .flags = CFTYPE_INSANE,         /* use "procs" instead */
+               .open = cgroup_tasks_open,
+               .write_u64 = cgroup_tasks_write,
+               .release = cgroup_pidlist_release,
+               .mode = S_IRUGO | S_IWUSR,
+       },
+       {
+               .name = "notify_on_release",
+               .flags = CFTYPE_INSANE,
+               .read_u64 = cgroup_read_notify_on_release,
+               .write_u64 = cgroup_write_notify_on_release,
+       },
        {
                .name = "release_agent",
-               .flags = CFTYPE_ONLY_ON_ROOT,
+               .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
                .read_seq_string = cgroup_release_agent_show,
                .write_string = cgroup_release_agent_write,
                .max_write_len = PATH_MAX,
@@ -3997,13 +4153,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
        struct cgroup_subsys *ss;
 
        if (base_files) {
-               err = cgroup_addrm_files(cgrp, NULL, files, true);
+               err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
                if (err < 0)
                        return err;
        }
 
        /* process cftsets of each subsystem */
-       for_each_subsys(cgrp->root, ss) {
+       for_each_root_subsys(cgrp->root, ss) {
                struct cftype_set *set;
                if (!test_bit(ss->subsys_id, &subsys_mask))
                        continue;
@@ -4013,15 +4169,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
        }
 
        /* This cgroup is ready now */
-       for_each_subsys(cgrp->root, ss) {
+       for_each_root_subsys(cgrp->root, ss) {
                struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+               struct css_id *id = rcu_dereference_protected(css->id, true);
+
                /*
                 * Update id->css pointer and make this css visible from
                 * CSS ID functions. This pointer will be dereferened
                 * from RCU-read-side without locks.
                 */
-               if (css->id)
-                       rcu_assign_pointer(css->id->css, css);
+               if (id)
+                       rcu_assign_pointer(id->css, css);
        }
 
        return 0;
@@ -4031,12 +4189,16 @@ static void css_dput_fn(struct work_struct *work)
 {
        struct cgroup_subsys_state *css =
                container_of(work, struct cgroup_subsys_state, dput_work);
-       struct dentry *dentry = css->cgroup->dentry;
-       struct super_block *sb = dentry->d_sb;
 
-       atomic_inc(&sb->s_active);
-       dput(dentry);
-       deactivate_super(sb);
+       cgroup_dput(css->cgroup);
+}
+
+static void css_release(struct percpu_ref *ref)
+{
+       struct cgroup_subsys_state *css =
+               container_of(ref, struct cgroup_subsys_state, refcnt);
+
+       schedule_work(&css->dput_work);
 }
 
 static void init_cgroup_css(struct cgroup_subsys_state *css,
@@ -4044,10 +4206,9 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
                               struct cgroup *cgrp)
 {
        css->cgroup = cgrp;
-       atomic_set(&css->refcnt, 1);
        css->flags = 0;
        css->id = NULL;
-       if (cgrp == dummytop)
+       if (cgrp == cgroup_dummy_top)
                css->flags |= CSS_ROOT;
        BUG_ON(cgrp->subsys[ss->subsys_id]);
        cgrp->subsys[ss->subsys_id] = css;
@@ -4157,7 +4318,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
        if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
                set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 
-       for_each_subsys(root, ss) {
+       for_each_root_subsys(root, ss) {
                struct cgroup_subsys_state *css;
 
                css = ss->css_alloc(cgrp);
@@ -4165,7 +4326,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                        err = PTR_ERR(css);
                        goto err_free_all;
                }
+
+               err = percpu_ref_init(&css->refcnt, css_release);
+               if (err)
+                       goto err_free_all;
+
                init_cgroup_css(css, ss, cgrp);
+
                if (ss->use_id) {
                        err = alloc_css_id(ss, parent, cgrp);
                        if (err)
@@ -4183,20 +4350,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                goto err_free_all;
        lockdep_assert_held(&dentry->d_inode->i_mutex);
 
+       cgrp->serial_nr = cgroup_serial_nr_next++;
+
        /* allocation complete, commit to creation */
-       list_add_tail(&cgrp->allcg_node, &root->allcg_list);
        list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
        root->number_of_cgroups++;
 
        /* each css holds a ref to the cgroup's dentry */
-       for_each_subsys(root, ss)
+       for_each_root_subsys(root, ss)
                dget(dentry);
 
        /* hold a ref to the parent's dentry */
        dget(parent->dentry);
 
        /* creation succeeded, notify subsystems */
-       for_each_subsys(root, ss) {
+       for_each_root_subsys(root, ss) {
                err = online_css(ss, cgrp);
                if (err)
                        goto err_destroy;
@@ -4221,9 +4389,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
        return 0;
 
 err_free_all:
-       for_each_subsys(root, ss) {
-               if (cgrp->subsys[ss->subsys_id])
+       for_each_root_subsys(root, ss) {
+               struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+               if (css) {
+                       percpu_ref_cancel_init(&css->refcnt);
                        ss->css_free(cgrp);
+               }
        }
        mutex_unlock(&cgroup_mutex);
        /* Release the reference count that we took on the superblock */
@@ -4251,63 +4423,120 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
        return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
 
+static void cgroup_css_killed(struct cgroup *cgrp)
+{
+       if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
+               return;
+
+       /* percpu ref's of all css's are killed, kick off the next step */
+       INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
+       schedule_work(&cgrp->destroy_work);
+}
+
+static void css_ref_killed_fn(struct percpu_ref *ref)
+{
+       struct cgroup_subsys_state *css =
+               container_of(ref, struct cgroup_subsys_state, refcnt);
+
+       cgroup_css_killed(css->cgroup);
+}
+
+/**
+ * cgroup_destroy_locked - the first stage of cgroup destruction
+ * @cgrp: cgroup to be destroyed
+ *
+ * css's make use of percpu refcnts whose killing latency shouldn't be
+ * exposed to userland and are RCU protected.  Also, cgroup core needs to
+ * guarantee that css_tryget() won't succeed by the time ->css_offline() is
+ * invoked.  To satisfy all the requirements, destruction is implemented in
+ * the following two steps.
+ *
+ * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
+ *     userland visible parts and start killing the percpu refcnts of
+ *     css's.  Set up so that the next stage will be kicked off once all
+ *     the percpu refcnts are confirmed to be killed.
+ *
+ * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
+ *     rest of destruction.  Once all cgroup references are gone, the
+ *     cgroup is RCU-freed.
+ *
+ * This function implements s1.  After this step, @cgrp is gone as far as
+ * the userland is concerned and a new cgroup with the same name may be
+ * created.  As cgroup doesn't care about the names internally, this
+ * doesn't cause any problem.
+ */
 static int cgroup_destroy_locked(struct cgroup *cgrp)
        __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
        struct dentry *d = cgrp->dentry;
-       struct cgroup *parent = cgrp->parent;
        struct cgroup_event *event, *tmp;
        struct cgroup_subsys *ss;
+       bool empty;
 
        lockdep_assert_held(&d->d_inode->i_mutex);
        lockdep_assert_held(&cgroup_mutex);
 
-       if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
+       /*
+        * css_set_lock synchronizes access to ->cset_links and prevents
+        * @cgrp from being removed while __put_css_set() is in progress.
+        */
+       read_lock(&css_set_lock);
+       empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children);
+       read_unlock(&css_set_lock);
+       if (!empty)
                return -EBUSY;
 
        /*
-        * Block new css_tryget() by deactivating refcnt and mark @cgrp
-        * removed.  This makes future css_tryget() and child creation
-        * attempts fail thus maintaining the removal conditions verified
-        * above.
+        * Block new css_tryget() by killing css refcnts.  cgroup core
+        * guarantees that, by the time ->css_offline() is invoked, no new
+        * css reference will be given out via css_tryget().  We can't
+        * simply call percpu_ref_kill() and proceed to offlining css's
+        * because percpu_ref_kill() doesn't guarantee that the ref is seen
+        * as killed on all CPUs on return.
+        *
+        * Use percpu_ref_kill_and_confirm() to get notifications as each
+        * css is confirmed to be seen as killed on all CPUs.  The
+        * notification callback keeps track of the number of css's to be
+        * killed and schedules cgroup_offline_fn() to perform the rest of
+        * destruction once the percpu refs of all css's are confirmed to
+        * be killed.
         */
-       for_each_subsys(cgrp->root, ss) {
+       atomic_set(&cgrp->css_kill_cnt, 1);
+       for_each_root_subsys(cgrp->root, ss) {
                struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 
-               WARN_ON(atomic_read(&css->refcnt) < 0);
-               atomic_add(CSS_DEACT_BIAS, &css->refcnt);
-       }
-       set_bit(CGRP_REMOVED, &cgrp->flags);
+               /*
+                * Killing would put the base ref, but we need to keep it
+                * alive until after ->css_offline.
+                */
+               percpu_ref_get(&css->refcnt);
 
-       /* tell subsystems to initate destruction */
-       for_each_subsys(cgrp->root, ss)
-               offline_css(ss, cgrp);
+               atomic_inc(&cgrp->css_kill_cnt);
+               percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
+       }
+       cgroup_css_killed(cgrp);
 
        /*
-        * Put all the base refs.  Each css holds an extra reference to the
-        * cgroup's dentry and cgroup removal proceeds regardless of css
-        * refs.  On the last put of each css, whenever that may be, the
-        * extra dentry ref is put so that dentry destruction happens only
-        * after all css's are released.
+        * Mark @cgrp dead.  This prevents further task migration and child
+        * creation by disabling cgroup_lock_live_group().  Note that
+        * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to
+        * resume iteration after dropping RCU read lock.  See
+        * cgroup_next_sibling() for details.
         */
-       for_each_subsys(cgrp->root, ss)
-               css_put(cgrp->subsys[ss->subsys_id]);
+       set_bit(CGRP_DEAD, &cgrp->flags);
 
+       /* CGRP_DEAD is set, remove from ->release_list for the last time */
        raw_spin_lock(&release_list_lock);
        if (!list_empty(&cgrp->release_list))
                list_del_init(&cgrp->release_list);
        raw_spin_unlock(&release_list_lock);
 
-       /* delete this cgroup from parent->children */
-       list_del_rcu(&cgrp->sibling);
-       list_del_init(&cgrp->allcg_node);
-
+       /*
+        * Remove @cgrp directory.  The removal puts the base ref but we
+        * aren't quite done with @cgrp yet, so hold onto it.
+        */
        dget(d);
        cgroup_d_remove_dir(d);
-       dput(d);
-
-       set_bit(CGRP_RELEASABLE, &parent->flags);
-       check_for_release(parent);
 
        /*
         * Unregister events and notify userspace.
@@ -4322,6 +4551,53 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
        spin_unlock(&cgrp->event_list_lock);
 
        return 0;
+};
+
+/**
+ * cgroup_offline_fn - the second step of cgroup destruction
+ * @work: cgroup->destroy_work
+ *
+ * This function is invoked from a work item for a cgroup which is being
+ * destroyed after the percpu refcnts of all css's are guaranteed to be
+ * seen as killed on all CPUs, and performs the rest of destruction.  This
+ * is the second step of destruction described in the comment above
+ * cgroup_destroy_locked().
+ */
+static void cgroup_offline_fn(struct work_struct *work)
+{
+       struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
+       struct cgroup *parent = cgrp->parent;
+       struct dentry *d = cgrp->dentry;
+       struct cgroup_subsys *ss;
+
+       mutex_lock(&cgroup_mutex);
+
+       /*
+        * css_tryget() is guaranteed to fail now.  Tell subsystems to
+        * initiate destruction.
+        */
+       for_each_root_subsys(cgrp->root, ss)
+               offline_css(ss, cgrp);
+
+       /*
+        * Put the css refs from cgroup_destroy_locked().  Each css holds
+        * an extra reference to the cgroup's dentry and cgroup removal
+        * proceeds regardless of css refs.  On the last put of each css,
+        * whenever that may be, the extra dentry ref is put so that dentry
+        * destruction happens only after all css's are released.
+        */
+       for_each_root_subsys(cgrp->root, ss)
+               css_put(cgrp->subsys[ss->subsys_id]);
+
+       /* delete this cgroup from parent->children */
+       list_del_rcu(&cgrp->sibling);
+
+       dput(d);
+
+       set_bit(CGRP_RELEASABLE, &parent->flags);
+       check_for_release(parent);
+
+       mutex_unlock(&cgroup_mutex);
 }
 
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4361,12 +4637,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
        cgroup_init_cftsets(ss);
 
        /* Create the top cgroup state for this subsystem */
-       list_add(&ss->sibling, &rootnode.subsys_list);
-       ss->root = &rootnode;
-       css = ss->css_alloc(dummytop);
+       list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
+       ss->root = &cgroup_dummy_root;
+       css = ss->css_alloc(cgroup_dummy_top);
        /* We don't handle early failures gracefully */
        BUG_ON(IS_ERR(css));
-       init_cgroup_css(css, ss, dummytop);
+       init_cgroup_css(css, ss, cgroup_dummy_top);
 
        /* Update the init_css_set to contain a subsys
         * pointer to this state - since the subsystem is
@@ -4381,7 +4657,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
         * need to invoke fork callbacks here. */
        BUG_ON(!list_empty(&init_task.tasks));
 
-       BUG_ON(online_css(ss, dummytop));
+       BUG_ON(online_css(ss, cgroup_dummy_top));
 
        mutex_unlock(&cgroup_mutex);
 
@@ -4404,7 +4680,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
        struct cgroup_subsys_state *css;
        int i, ret;
        struct hlist_node *tmp;
-       struct css_set *cg;
+       struct css_set *cset;
        unsigned long key;
 
        /* check name and function validity */
@@ -4427,7 +4703,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
         */
        if (ss->module == NULL) {
                /* a sanity check */
-               BUG_ON(subsys[ss->subsys_id] != ss);
+               BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
                return 0;
        }
 
@@ -4435,26 +4711,26 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
        cgroup_init_cftsets(ss);
 
        mutex_lock(&cgroup_mutex);
-       subsys[ss->subsys_id] = ss;
+       cgroup_subsys[ss->subsys_id] = ss;
 
        /*
         * no ss->css_alloc seems to need anything important in the ss
-        * struct, so this can happen first (i.e. before the rootnode
+        * struct, so this can happen first (i.e. before the dummy root
         * attachment).
         */
-       css = ss->css_alloc(dummytop);
+       css = ss->css_alloc(cgroup_dummy_top);
        if (IS_ERR(css)) {
-               /* failure case - need to deassign the subsys[] slot. */
-               subsys[ss->subsys_id] = NULL;
+               /* failure case - need to deassign the cgroup_subsys[] slot. */
+               cgroup_subsys[ss->subsys_id] = NULL;
                mutex_unlock(&cgroup_mutex);
                return PTR_ERR(css);
        }
 
-       list_add(&ss->sibling, &rootnode.subsys_list);
-       ss->root = &rootnode;
+       list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
+       ss->root = &cgroup_dummy_root;
 
        /* our new subsystem will be attached to the dummy hierarchy. */
-       init_cgroup_css(css, ss, dummytop);
+       init_cgroup_css(css, ss, cgroup_dummy_top);
        /* init_idr must be after init_cgroup_css because it sets css->id. */
        if (ss->use_id) {
                ret = cgroup_init_idr(ss, css);
@@ -4471,21 +4747,21 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
         * this is all done under the css_set_lock.
         */
        write_lock(&css_set_lock);
-       hash_for_each_safe(css_set_table, i, tmp, cg, hlist) {
+       hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
                /* skip entries that we already rehashed */
-               if (cg->subsys[ss->subsys_id])
+               if (cset->subsys[ss->subsys_id])
                        continue;
                /* remove existing entry */
-               hash_del(&cg->hlist);
+               hash_del(&cset->hlist);
                /* set new value */
-               cg->subsys[ss->subsys_id] = css;
+               cset->subsys[ss->subsys_id] = css;
                /* recompute hash and restore entry */
-               key = css_set_hash(cg->subsys);
-               hash_add(css_set_table, &cg->hlist, key);
+               key = css_set_hash(cset->subsys);
+               hash_add(css_set_table, &cset->hlist, key);
        }
        write_unlock(&css_set_lock);
 
-       ret = online_css(ss, dummytop);
+       ret = online_css(ss, cgroup_dummy_top);
        if (ret)
                goto err_unload;
 
@@ -4511,7 +4787,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
  */
 void cgroup_unload_subsys(struct cgroup_subsys *ss)
 {
-       struct cg_cgroup_link *link;
+       struct cgrp_cset_link *link;
 
        BUG_ON(ss->module == NULL);
 
@@ -4520,45 +4796,46 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
         * try_module_get in parse_cgroupfs_options should ensure that it
         * doesn't start being used while we're killing it off.
         */
-       BUG_ON(ss->root != &rootnode);
+       BUG_ON(ss->root != &cgroup_dummy_root);
 
        mutex_lock(&cgroup_mutex);
 
-       offline_css(ss, dummytop);
+       offline_css(ss, cgroup_dummy_top);
 
        if (ss->use_id)
                idr_destroy(&ss->idr);
 
        /* deassign the subsys_id */
-       subsys[ss->subsys_id] = NULL;
+       cgroup_subsys[ss->subsys_id] = NULL;
 
-       /* remove subsystem from rootnode's list of subsystems */
+       /* remove subsystem from the dummy root's list of subsystems */
        list_del_init(&ss->sibling);
 
        /*
-        * disentangle the css from all css_sets attached to the dummytop. as
-        * in loading, we need to pay our respects to the hashtable gods.
+        * disentangle the css from all css_sets attached to the dummy
+        * top. as in loading, we need to pay our respects to the hashtable
+        * gods.
         */
        write_lock(&css_set_lock);
-       list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
-               struct css_set *cg = link->cg;
+       list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
+               struct css_set *cset = link->cset;
                unsigned long key;
 
-               hash_del(&cg->hlist);
-               cg->subsys[ss->subsys_id] = NULL;
-               key = css_set_hash(cg->subsys);
-               hash_add(css_set_table, &cg->hlist, key);
+               hash_del(&cset->hlist);
+               cset->subsys[ss->subsys_id] = NULL;
+               key = css_set_hash(cset->subsys);
+               hash_add(css_set_table, &cset->hlist, key);
        }
        write_unlock(&css_set_lock);
 
        /*
-        * remove subsystem's css from the dummytop and free it - need to
-        * free before marking as null because ss->css_free needs the
-        * cgrp->subsys pointer to find their state. note that this also
-        * takes care of freeing the css_id.
+        * remove subsystem's css from the cgroup_dummy_top and free it -
+        * need to free before marking as null because ss->css_free needs
+        * the cgrp->subsys pointer to find their state. note that this
+        * also takes care of freeing the css_id.
         */
-       ss->css_free(dummytop);
-       dummytop->subsys[ss->subsys_id] = NULL;
+       ss->css_free(cgroup_dummy_top);
+       cgroup_dummy_top->subsys[ss->subsys_id] = NULL;
 
        mutex_unlock(&cgroup_mutex);
 }
@@ -4572,30 +4849,25 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
  */
 int __init cgroup_init_early(void)
 {
+       struct cgroup_subsys *ss;
        int i;
+
        atomic_set(&init_css_set.refcount, 1);
-       INIT_LIST_HEAD(&init_css_set.cg_links);
+       INIT_LIST_HEAD(&init_css_set.cgrp_links);
        INIT_LIST_HEAD(&init_css_set.tasks);
        INIT_HLIST_NODE(&init_css_set.hlist);
        css_set_count = 1;
-       init_cgroup_root(&rootnode);
-       root_count = 1;
-       init_task.cgroups = &init_css_set;
-
-       init_css_set_link.cg = &init_css_set;
-       init_css_set_link.cgrp = dummytop;
-       list_add(&init_css_set_link.cgrp_link_list,
-                &rootnode.top_cgroup.css_sets);
-       list_add(&init_css_set_link.cg_link_list,
-                &init_css_set.cg_links);
-
-       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-               struct cgroup_subsys *ss = subsys[i];
-
-               /* at bootup time, we don't worry about modular subsystems */
-               if (!ss || ss->module)
-                       continue;
+       init_cgroup_root(&cgroup_dummy_root);
+       cgroup_root_count = 1;
+       RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
 
+       init_cgrp_cset_link.cset = &init_css_set;
+       init_cgrp_cset_link.cgrp = cgroup_dummy_top;
+       list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links);
+       list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links);
+
+       /* at bootup time, we don't worry about modular subsystems */
+       for_each_builtin_subsys(ss, i) {
                BUG_ON(!ss->name);
                BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
                BUG_ON(!ss->css_alloc);
@@ -4620,30 +4892,33 @@ int __init cgroup_init_early(void)
  */
 int __init cgroup_init(void)
 {
-       int err;
-       int i;
+       struct cgroup_subsys *ss;
        unsigned long key;
+       int i, err;
 
        err = bdi_init(&cgroup_backing_dev_info);
        if (err)
                return err;
 
-       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-               struct cgroup_subsys *ss = subsys[i];
-
-               /* at bootup time, we don't worry about modular subsystems */
-               if (!ss || ss->module)
-                       continue;
+       for_each_builtin_subsys(ss, i) {
                if (!ss->early_init)
                        cgroup_init_subsys(ss);
                if (ss->use_id)
                        cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
        }
 
+       /* allocate id for the dummy hierarchy */
+       mutex_lock(&cgroup_mutex);
+       mutex_lock(&cgroup_root_mutex);
+
        /* Add init_css_set to the hash table */
        key = css_set_hash(init_css_set.subsys);
        hash_add(css_set_table, &init_css_set.hlist, key);
-       BUG_ON(!init_root_id(&rootnode));
+
+       BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
+
+       mutex_unlock(&cgroup_root_mutex);
+       mutex_unlock(&cgroup_mutex);
 
        cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
        if (!cgroup_kobj) {
@@ -4708,7 +4983,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
                int count = 0;
 
                seq_printf(m, "%d:", root->hierarchy_id);
-               for_each_subsys(root, ss)
+               for_each_root_subsys(root, ss)
                        seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
                if (strlen(root->name))
                        seq_printf(m, "%sname=%s", count ? "," : "",
@@ -4734,6 +5009,7 @@ out:
 /* Display information about each subsystem and each hierarchy */
 static int proc_cgroupstats_show(struct seq_file *m, void *v)
 {
+       struct cgroup_subsys *ss;
        int i;
 
        seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
@@ -4743,14 +5019,12 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
         * subsys/hierarchy state.
         */
        mutex_lock(&cgroup_mutex);
-       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-               struct cgroup_subsys *ss = subsys[i];
-               if (ss == NULL)
-                       continue;
+
+       for_each_subsys(ss, i)
                seq_printf(m, "%s\t%d\t%d\t%d\n",
                           ss->name, ss->root->hierarchy_id,
                           ss->root->number_of_cgroups, !ss->disabled);
-       }
+
        mutex_unlock(&cgroup_mutex);
        return 0;
 }
@@ -4786,8 +5060,8 @@ static const struct file_operations proc_cgroupstats_operations = {
 void cgroup_fork(struct task_struct *child)
 {
        task_lock(current);
+       get_css_set(task_css_set(current));
        child->cgroups = current->cgroups;
-       get_css_set(child->cgroups);
        task_unlock(current);
        INIT_LIST_HEAD(&child->cg_list);
 }
@@ -4804,6 +5078,7 @@ void cgroup_fork(struct task_struct *child)
  */
 void cgroup_post_fork(struct task_struct *child)
 {
+       struct cgroup_subsys *ss;
        int i;
 
        /*
@@ -4821,7 +5096,7 @@ void cgroup_post_fork(struct task_struct *child)
                write_lock(&css_set_lock);
                task_lock(child);
                if (list_empty(&child->cg_list))
-                       list_add(&child->cg_list, &child->cgroups->tasks);
+                       list_add(&child->cg_list, &task_css_set(child)->tasks);
                task_unlock(child);
                write_unlock(&css_set_lock);
        }
@@ -4840,12 +5115,9 @@ void cgroup_post_fork(struct task_struct *child)
                 * of the array can be freed at module unload, so we
                 * can't touch that.
                 */
-               for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
-                       struct cgroup_subsys *ss = subsys[i];
-
+               for_each_builtin_subsys(ss, i)
                        if (ss->fork)
                                ss->fork(child);
-               }
        }
 }
 
@@ -4886,7 +5158,8 @@ void cgroup_post_fork(struct task_struct *child)
  */
 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 {
-       struct css_set *cg;
+       struct cgroup_subsys *ss;
+       struct css_set *cset;
        int i;
 
        /*
@@ -4903,36 +5176,32 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 
        /* Reassign the task to the init_css_set. */
        task_lock(tsk);
-       cg = tsk->cgroups;
-       tsk->cgroups = &init_css_set;
+       cset = task_css_set(tsk);
+       RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
 
        if (run_callbacks && need_forkexit_callback) {
                /*
                 * fork/exit callbacks are supported only for builtin
                 * subsystems, see cgroup_post_fork() for details.
                 */
-               for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
-                       struct cgroup_subsys *ss = subsys[i];
-
+               for_each_builtin_subsys(ss, i) {
                        if (ss->exit) {
-                               struct cgroup *old_cgrp =
-                                       rcu_dereference_raw(cg->subsys[i])->cgroup;
+                               struct cgroup *old_cgrp = cset->subsys[i]->cgroup;
                                struct cgroup *cgrp = task_cgroup(tsk, i);
+
                                ss->exit(cgrp, old_cgrp, tsk);
                        }
                }
        }
        task_unlock(tsk);
 
-       put_css_set_taskexit(cg);
+       put_css_set_taskexit(cset);
 }
 
 static void check_for_release(struct cgroup *cgrp)
 {
-       /* All of these checks rely on RCU to keep the cgroup
-        * structure alive */
        if (cgroup_is_releasable(cgrp) &&
-           !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) {
+           list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
                /*
                 * Control Group is currently removeable. If it's not
                 * already queued for a userspace notification, queue
@@ -4941,7 +5210,7 @@ static void check_for_release(struct cgroup *cgrp)
                int need_schedule_work = 0;
 
                raw_spin_lock(&release_list_lock);
-               if (!cgroup_is_removed(cgrp) &&
+               if (!cgroup_is_dead(cgrp) &&
                    list_empty(&cgrp->release_list)) {
                        list_add(&cgrp->release_list, &release_list);
                        need_schedule_work = 1;
@@ -4952,34 +5221,6 @@ static void check_for_release(struct cgroup *cgrp)
        }
 }
 
-/* Caller must verify that the css is not for root cgroup */
-bool __css_tryget(struct cgroup_subsys_state *css)
-{
-       while (true) {
-               int t, v;
-
-               v = css_refcnt(css);
-               t = atomic_cmpxchg(&css->refcnt, v, v + 1);
-               if (likely(t == v))
-                       return true;
-               else if (t < 0)
-                       return false;
-               cpu_relax();
-       }
-}
-EXPORT_SYMBOL_GPL(__css_tryget);
-
-/* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css)
-{
-       int v;
-
-       v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
-       if (v == 0)
-               schedule_work(&css->dput_work);
-}
-EXPORT_SYMBOL_GPL(__css_put);
-
 /*
  * Notify userspace when a cgroup is released, by running the
  * configured release agent with the name of the cgroup (path
@@ -5054,23 +5295,19 @@ static void cgroup_release_agent(struct work_struct *work)
 
 static int __init cgroup_disable(char *str)
 {
-       int i;
+       struct cgroup_subsys *ss;
        char *token;
+       int i;
 
        while ((token = strsep(&str, ",")) != NULL) {
                if (!*token)
                        continue;
-               for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-                       struct cgroup_subsys *ss = subsys[i];
-
-                       /*
-                        * cgroup_disable, being at boot time, can't
-                        * know about module subsystems, so we don't
-                        * worry about them.
-                        */
-                       if (!ss || ss->module)
-                               continue;
 
+               /*
+                * cgroup_disable, being at boot time, can't know about
+                * module subsystems, so we don't worry about them.
+                */
+               for_each_builtin_subsys(ss, i) {
                        if (!strcmp(token, ss->name)) {
                                ss->disabled = 1;
                                printk(KERN_INFO "Disabling %s control group"
@@ -5087,9 +5324,7 @@ __setup("cgroup_disable=", cgroup_disable);
  * Functons for CSS ID.
  */
 
-/*
- *To get ID other than 0, this should be called when !cgroup_is_removed().
- */
+/* to get ID other than 0, this should be called when !cgroup_is_dead() */
 unsigned short css_id(struct cgroup_subsys_state *css)
 {
        struct css_id *cssid;
@@ -5099,7 +5334,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
         * on this or this is under rcu_read_lock(). Once css->id is allocated,
         * it's unchanged until freed.
         */
-       cssid = rcu_dereference_check(css->id, css_refcnt(css));
+       cssid = rcu_dereference_raw(css->id);
 
        if (cssid)
                return cssid->id;
@@ -5107,18 +5342,6 @@ unsigned short css_id(struct cgroup_subsys_state *css)
 }
 EXPORT_SYMBOL_GPL(css_id);
 
-unsigned short css_depth(struct cgroup_subsys_state *css)
-{
-       struct css_id *cssid;
-
-       cssid = rcu_dereference_check(css->id, css_refcnt(css));
-
-       if (cssid)
-               return cssid->depth;
-       return 0;
-}
-EXPORT_SYMBOL_GPL(css_depth);
-
 /**
  *  css_is_ancestor - test "root" css is an ancestor of "child"
  * @child: the css to be tested.
@@ -5153,7 +5376,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
 
 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
 {
-       struct css_id *id = css->id;
+       struct css_id *id = rcu_dereference_protected(css->id, true);
+
        /* When this is called before css_id initialization, id can be NULL */
        if (!id)
                return;
@@ -5219,8 +5443,8 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
                return PTR_ERR(newid);
 
        newid->stack[0] = newid->id;
-       newid->css = rootcss;
-       rootcss->id = newid;
+       RCU_INIT_POINTER(newid->css, rootcss);
+       RCU_INIT_POINTER(rootcss->id, newid);
        return 0;
 }
 
@@ -5234,7 +5458,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
        subsys_id = ss->subsys_id;
        parent_css = parent->subsys[subsys_id];
        child_css = child->subsys[subsys_id];
-       parent_id = parent_css->id;
+       parent_id = rcu_dereference_protected(parent_css->id, true);
        depth = parent_id->depth + 1;
 
        child_id = get_new_cssid(ss, depth);
@@ -5299,7 +5523,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
 }
 
 #ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
+static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
 {
        struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
 
@@ -5309,48 +5533,43 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
        return css;
 }
 
-static void debug_css_free(struct cgroup *cont)
-{
-       kfree(cont->subsys[debug_subsys_id]);
-}
-
-static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
+static void debug_css_free(struct cgroup *cgrp)
 {
-       return atomic_read(&cont->count);
+       kfree(cgrp->subsys[debug_subsys_id]);
 }
 
-static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
+static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft)
 {
-       return cgroup_task_count(cont);
+       return cgroup_task_count(cgrp);
 }
 
-static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
+static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft)
 {
        return (u64)(unsigned long)current->cgroups;
 }
 
-static u64 current_css_set_refcount_read(struct cgroup *cont,
-                                          struct cftype *cft)
+static u64 current_css_set_refcount_read(struct cgroup *cgrp,
+                                        struct cftype *cft)
 {
        u64 count;
 
        rcu_read_lock();
-       count = atomic_read(&current->cgroups->refcount);
+       count = atomic_read(&task_css_set(current)->refcount);
        rcu_read_unlock();
        return count;
 }
 
-static int current_css_set_cg_links_read(struct cgroup *cont,
+static int current_css_set_cg_links_read(struct cgroup *cgrp,
                                         struct cftype *cft,
                                         struct seq_file *seq)
 {
-       struct cg_cgroup_link *link;
-       struct css_set *cg;
+       struct cgrp_cset_link *link;
+       struct css_set *cset;
 
        read_lock(&css_set_lock);
        rcu_read_lock();
-       cg = rcu_dereference(current->cgroups);
-       list_for_each_entry(link, &cg->cg_links, cg_link_list) {
+       cset = rcu_dereference(current->cgroups);
+       list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
                struct cgroup *c = link->cgrp;
                const char *name;
 
@@ -5367,19 +5586,19 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
 }
 
 #define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct cgroup *cont,
+static int cgroup_css_links_read(struct cgroup *cgrp,
                                 struct cftype *cft,
                                 struct seq_file *seq)
 {
-       struct cg_cgroup_link *link;
+       struct cgrp_cset_link *link;
 
        read_lock(&css_set_lock);
-       list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
-               struct css_set *cg = link->cg;
+       list_for_each_entry(link, &cgrp->cset_links, cset_link) {
+               struct css_set *cset = link->cset;
                struct task_struct *task;
                int count = 0;
-               seq_printf(seq, "css_set %p\n", cg);
-               list_for_each_entry(task, &cg->tasks, cg_list) {
+               seq_printf(seq, "css_set %p\n", cset);
+               list_for_each_entry(task, &cset->tasks, cg_list) {
                        if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
                                seq_puts(seq, "  ...\n");
                                break;
@@ -5399,10 +5618,6 @@ static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
 }
 
 static struct cftype debug_files[] =  {
-       {
-               .name = "cgroup_refcount",
-               .read_u64 = cgroup_refcount_read,
-       },
        {
                .name = "taskcount",
                .read_u64 = debug_taskcount_read,