Merge branch 'for-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
author		Linus Torvalds <torvalds@linux-foundation.org>
		Sat, 19 Mar 2016 03:25:49 +0000 (20:25 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
		Sat, 19 Mar 2016 03:25:49 +0000 (20:25 -0700)
Pull cgroup updates from Tejun Heo:
 "cgroup changes for v4.6-rc1.  No userland visible behavior changes in
  this pull request.  I'll send out a separate pull request for the
  addition of cgroup namespace support.

   - The biggest change is the revamping of cgroup core task migration
     and controller handling logic.  There are quite a few places where
     controllers and tasks are manipulated.  Previously, many of those
     places implemented custom operations for each specific use case
     assuming specific starting conditions.  While this worked, it made
     the code fragile and difficult to follow.

     The bulk of this pull request restructures these operations so that
     most related operations are performed through common helpers which
     are recursive (subtrees are always processed consistently) and
     idempotent (they make the cgroup hierarchy converge to the target
     state rather than performing operations assuming specific starting
     conditions).  This makes the code a lot easier to understand,
     verify and extend.

   - Implicit controller support is added.  This is primarily for using
     perf_event on the v2 hierarchy so that perf can match cgroup v2
     paths without requiring the user to do anything special.  The
     kernel portion of the perf_event changes is acked but the userland
     changes are still pending review.

   - cgroup_no_v1= boot parameter added to ease testing cgroup v2 in
     certain environments.

   - There is a regression introduced during the v4.4 devel cycle where
     attempts to migrate zombie tasks can mess up internal object
     management.  This was fixed earlier this week and is included in
     this pull request with stable cc'd.

   - Misc non-critical fixes and improvements"

* 'for-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (44 commits)
  cgroup: avoid false positive gcc-6 warning
  cgroup: ignore css_sets associated with dead cgroups during migration
  Documentation: cgroup v2: Trivial heading correction.
  cgroup: implement cgroup_subsys->implicit_on_dfl
  cgroup: use css_set->mg_dst_cgrp for the migration target cgroup
  cgroup: make cgroup[_taskset]_migrate() take cgroup_root instead of cgroup
  cgroup: move migration destination verification out of cgroup_migrate_prepare_dst()
  cgroup: fix incorrect destination cgroup in cgroup_update_dfl_csses()
  cgroup: Trivial correction to reflect controller.
  cgroup: remove stale item in cgroup-v1 document INDEX file.
  cgroup: update css iteration in cgroup_update_dfl_csses()
  cgroup: allocate 2x cgrp_cset_links when setting up a new root
  cgroup: make cgroup_calc_subtree_ss_mask() take @this_ss_mask
  cgroup: reimplement rebind_subsystems() using cgroup_apply_control() and friends
  cgroup: use cgroup_apply_enable_control() in cgroup creation path
  cgroup: combine cgroup_mutex locking and offline css draining
  cgroup: factor out cgroup_{apply|finalize}_control() from cgroup_subtree_control_write()
  cgroup: introduce cgroup_{save|propagate|restore}_control()
  cgroup: make cgroup_drain_offline() and cgroup_apply_control_{disable|enable}() recursive
  cgroup: factor out cgroup_apply_control_enable() from cgroup_subtree_control_write()
  ...

Documentation/cgroup-v1/00-INDEX
Documentation/cgroup-v2.txt
Documentation/kernel-parameters.txt
include/linux/cgroup-defs.h
init/Kconfig
kernel/Makefile
kernel/cgroup.c
kernel/cpuset.c
kernel/sched/core.c
kernel/sched/cpuacct.c

index 6ad425f..106885a 100644
@@ -24,5 +24,3 @@ net_prio.txt
        - Network priority cgroups details and usages.
 pids.txt
        - Process number cgroups details and usages.
-unified-hierarchy.txt
-       - Description the new/next cgroup interface.
index 8f1329a..bdc6773 100644
@@ -132,6 +132,12 @@ strongly discouraged for production use.  It is recommended to decide
 the hierarchies and controller associations before starting using the
 controllers after system boot.
 
+During transition to v2, system management software might still
+automount the v1 cgroup filesystem and so hijack all controllers
+during boot, before manual intervention is possible. To make testing
+and experimenting easier, the kernel parameter cgroup_no_v1= allows
+disabling controllers in v1, making them always available in v2.
+
 
 2-2. Organizing Processes
 
@@ -915,7 +921,7 @@ PAGE_SIZE multiple when read back.
        limit, anonymous memory of the cgroup will not be swapped out.
 
 
-5-2-2. General Usage
+5-2-2. Usage Guidelines
 
 "memory.high" is the main mechanism to control memory usage.
 Over-committing on high limit (sum of high limits > available memory)
index 0ee46a8..eef242e 100644
@@ -614,6 +614,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        cut the overhead, others just disable the usage. So
                        only cgroup_disable=memory is actually worthy}
 
+       cgroup_no_v1=   [KNL] Disable one, multiple, or all cgroup controllers in v1
+                       Format: { controller[,controller...] | "all" }
+                       Like cgroup_disable, but only applies to cgroup v1;
+                       the blacklisted controllers remain available in cgroup2.
+
        cgroup.memory=  [KNL] Pass options to the cgroup memory controller.
                        Format: <string>
                        nosocket -- Disable socket memory accounting.
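
For instance, following the format documented above for cgroup_no_v1=
(values are illustrative), booting with

	cgroup_no_v1=memory,cpu

blacklists the memory and cpu controllers from v1 hierarchies while
keeping them available in cgroup2, and

	cgroup_no_v1=all

does so for every controller.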
index 789471d..3e39ae5 100644
@@ -45,6 +45,7 @@ enum {
        CSS_NO_REF      = (1 << 0), /* no reference counting for this css */
        CSS_ONLINE      = (1 << 1), /* between ->css_online() and ->css_offline() */
        CSS_RELEASED    = (1 << 2), /* refcnt reached zero, released */
+       CSS_VISIBLE     = (1 << 3), /* css is visible to userland */
 };
 
 /* bits in struct cgroup flags field */
@@ -190,12 +191,13 @@ struct css_set {
 
        /*
         * If this cset is acting as the source of migration the following
-        * two fields are set.  mg_src_cgrp is the source cgroup of the
-        * on-going migration and mg_dst_cset is the destination cset the
-        * target tasks on this cset should be migrated to.  Protected by
-        * cgroup_mutex.
+        * two fields are set.  mg_src_cgrp and mg_dst_cgrp are
+        * respectively the source and destination cgroups of the on-going
+        * migration.  mg_dst_cset is the destination cset the target tasks
+        * on this cset should be migrated to.  Protected by cgroup_mutex.
         */
        struct cgroup *mg_src_cgrp;
+       struct cgroup *mg_dst_cgrp;
        struct css_set *mg_dst_cset;
 
        /*
@@ -210,6 +212,9 @@ struct css_set {
        /* all css_task_iters currently walking this cset */
        struct list_head task_iters;
 
+       /* dead and being drained, ignore for migration */
+       bool dead;
+
        /* For RCU-protected deletion */
        struct rcu_head rcu_head;
 };
@@ -253,13 +258,14 @@ struct cgroup {
        /*
         * The bitmask of subsystems enabled on the child cgroups.
         * ->subtree_control is the one configured through
-        * "cgroup.subtree_control" while ->child_subsys_mask is the
-        * effective one which may have more subsystems enabled.
-        * Controller knobs are made available iff it's enabled in
-        * ->subtree_control.
+        * "cgroup.subtree_control" while ->child_ss_mask is the effective
+        * one which may have more subsystems enabled.  Controller knobs
+        * are made available iff it's enabled in ->subtree_control.
         */
-       unsigned int subtree_control;
-       unsigned int child_subsys_mask;
+       u16 subtree_control;
+       u16 subtree_ss_mask;
+       u16 old_subtree_control;
+       u16 old_subtree_ss_mask;
 
        /* Private pointers for each registered subsystem */
        struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
@@ -434,7 +440,6 @@ struct cgroup_subsys {
        void (*css_released)(struct cgroup_subsys_state *css);
        void (*css_free)(struct cgroup_subsys_state *css);
        void (*css_reset)(struct cgroup_subsys_state *css);
-       void (*css_e_css_changed)(struct cgroup_subsys_state *css);
 
        int (*can_attach)(struct cgroup_taskset *tset);
        void (*cancel_attach)(struct cgroup_taskset *tset);
@@ -446,7 +451,20 @@ struct cgroup_subsys {
        void (*free)(struct task_struct *task);
        void (*bind)(struct cgroup_subsys_state *root_css);
 
-       int early_init;
+       bool early_init:1;
+
+       /*
+        * If %true, the controller, on the default hierarchy, doesn't show
+        * up in "cgroup.controllers" or "cgroup.subtree_control", is
+        * implicitly enabled on all cgroups on the default hierarchy, and
+        * bypasses the "no internal process" constraint.  This is for
+        * utility-type controllers which are transparent to userland.
+        *
+        * An implicit controller can be stolen from the default hierarchy
+        * anytime and thus must be okay with offline csses from previous
+        * hierarchies coexisting with csses for the current one.
+        */
+       bool implicit_on_dfl:1;
 
        /*
         * If %false, this subsystem is properly hierarchical -
@@ -460,8 +478,8 @@ struct cgroup_subsys {
         * cases.  Eventually, all subsystems will be made properly
         * hierarchical and this will go away.
         */
-       bool broken_hierarchy;
-       bool warned_broken_hierarchy;
+       bool broken_hierarchy:1;
+       bool warned_broken_hierarchy:1;
 
        /* the following two fields are initialized automatically during boot */
        int id;
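
The flag members above move from full-width int/bool storage to
single-bit bitfields.  A minimal standalone sketch (hypothetical struct
names, not from this patch) of the layout effect:

	#include <stdbool.h>
	#include <stdio.h>

	/* sketch: before/after layout of the cgroup_subsys flag members */
	struct flags_old {
		int early_init;
		bool broken_hierarchy;
		bool warned_broken_hierarchy;
	};

	struct flags_new {
		bool early_init:1;
		bool implicit_on_dfl:1;
		bool broken_hierarchy:1;
		bool warned_broken_hierarchy:1;
	};

	int main(void)
	{
		/* typically prints "old: 8, new: 1" on x86-64 */
		printf("old: %zu, new: %zu\n",
		       sizeof(struct flags_old), sizeof(struct flags_new));
		return 0;
	}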
index 2d70c8c..e0d2616 100644
@@ -1047,10 +1047,10 @@ config CGROUP_PIDS
          is fairly trivial to reach PID exhaustion before you reach even a
          conservative kmemcg limit. As a result, it is possible to grind a
          system to halt without being limited by other cgroup policies. The
-         PIDs cgroup subsystem is designed to stop this from happening.
+         PIDs controller is designed to stop this from happening.
 
          It should be noted that organisational operations (such as attaching
-         to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
+         to a cgroup hierarchy) will *not* be blocked by the PIDs controller,
          since the PIDs limit only affects a process's ability to fork, not to
          attach to a cgroup.
 
index 53abf00..baa55e5 100644
@@ -14,8 +14,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 obj-$(CONFIG_MULTIUSER) += groups.o
 
 ifdef CONFIG_FUNCTION_TRACER
-# Do not trace debug files and internal ftrace files
-CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE)
+# Do not trace internal ftrace files
 CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
 endif
 
index d27904c..3fe02c1 100644
@@ -178,10 +178,16 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
  * The default hierarchy always exists but is hidden until mounted for the
  * first time.  This is for backward compatibility.
  */
-static bool cgrp_dfl_root_visible;
+static bool cgrp_dfl_visible;
+
+/* Controllers blocked by the commandline in v1 */
+static u16 cgroup_no_v1_mask;
 
 /* some controllers are not supported in the default hierarchy */
-static unsigned long cgrp_dfl_root_inhibit_ss_mask;
+static u16 cgrp_dfl_inhibit_ss_mask;
+
+/* some controllers are implicitly enabled on the default hierarchy */
+static unsigned long cgrp_dfl_implicit_ss_mask;
 
 /* The list of hierarchy roots */
 
@@ -205,23 +211,25 @@ static u64 css_serial_nr_next = 1;
  * fork/exit handlers to call. This avoids us having to do extra work in the
  * fork/exit path to check which subsystems have fork/exit callbacks.
  */
-static unsigned long have_fork_callback __read_mostly;
-static unsigned long have_exit_callback __read_mostly;
-static unsigned long have_free_callback __read_mostly;
+static u16 have_fork_callback __read_mostly;
+static u16 have_exit_callback __read_mostly;
+static u16 have_free_callback __read_mostly;
 
 /* Ditto for the can_fork callback. */
-static unsigned long have_canfork_callback __read_mostly;
+static u16 have_canfork_callback __read_mostly;
 
 static struct file_system_type cgroup2_fs_type;
 static struct cftype cgroup_dfl_base_files[];
 static struct cftype cgroup_legacy_base_files[];
 
-static int rebind_subsystems(struct cgroup_root *dst_root,
-                            unsigned long ss_mask);
+static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
+static int cgroup_apply_control(struct cgroup *cgrp);
+static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 static void css_task_iter_advance(struct css_task_iter *it);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
-                     bool visible);
+static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
+                                             struct cgroup_subsys *ss);
 static void css_release(struct percpu_ref *ref);
 static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
@@ -238,9 +246,17 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
  */
 static bool cgroup_ssid_enabled(int ssid)
 {
+       if (CGROUP_SUBSYS_COUNT == 0)
+               return false;
+
        return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
 }
 
+static bool cgroup_ssid_no_v1(int ssid)
+{
+       return cgroup_no_v1_mask & (1 << ssid);
+}
+
 /**
  * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
  * @cgrp: the cgroup of interest
@@ -339,6 +355,32 @@ static struct cgroup *cgroup_parent(struct cgroup *cgrp)
        return NULL;
 }
 
+/* subsystems visibly enabled on a cgroup */
+static u16 cgroup_control(struct cgroup *cgrp)
+{
+       struct cgroup *parent = cgroup_parent(cgrp);
+       u16 root_ss_mask = cgrp->root->subsys_mask;
+
+       if (parent)
+               return parent->subtree_control;
+
+       if (cgroup_on_dfl(cgrp))
+               root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
+                                 cgrp_dfl_implicit_ss_mask);
+       return root_ss_mask;
+}
+
+/* subsystems enabled on a cgroup */
+static u16 cgroup_ss_mask(struct cgroup *cgrp)
+{
+       struct cgroup *parent = cgroup_parent(cgrp);
+
+       if (parent)
+               return parent->subtree_ss_mask;
+
+       return cgrp->root->subsys_mask;
+}
+
 /**
  * cgroup_css - obtain a cgroup's css for the specified subsystem
  * @cgrp: the cgroup of interest
@@ -378,16 +420,15 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
        if (!ss)
                return &cgrp->self;
 
-       if (!(cgrp->root->subsys_mask & (1 << ss->id)))
-               return NULL;
-
        /*
         * This function is used while updating css associations and thus
-        * can't test the csses directly.  Use ->child_subsys_mask.
+        * can't test the csses directly.  Test ss_mask.
         */
-       while (cgroup_parent(cgrp) &&
-              !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
+       while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
                cgrp = cgroup_parent(cgrp);
+               if (!cgrp)
+                       return NULL;
+       }
 
        return cgroup_css(cgrp, ss);
 }
@@ -506,22 +547,28 @@ static int notify_on_release(const struct cgroup *cgrp)
             (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
 
 /**
- * for_each_subsys_which - filter for_each_subsys with a bitmask
+ * do_each_subsys_mask - filter for_each_subsys with a bitmask
  * @ss: the iteration cursor
  * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- * @ss_maskp: a pointer to the bitmask
+ * @ss_mask: the bitmask
  *
  * The block will only run for cases where the ssid-th bit (1 << ssid) of
- * mask is set to 1.
+ * @ss_mask is set.
  */
-#define for_each_subsys_which(ss, ssid, ss_maskp)                      \
-       if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */   \
+#define do_each_subsys_mask(ss, ssid, ss_mask) do {                    \
+       unsigned long __ss_mask = (ss_mask);                            \
+       if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \
                (ssid) = 0;                                             \
-       else                                                            \
-               for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT)   \
-                       if (((ss) = cgroup_subsys[ssid]) && false)      \
-                               break;                                  \
-                       else
+               break;                                                  \
+       }                                                               \
+       for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {       \
+               (ss) = cgroup_subsys[ssid];                             \
+               {
+
+#define while_each_subsys_mask()                                       \
+               }                                                       \
+       }                                                               \
+} while (false)
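
A minimal userspace model of the new iteration pair (simplified: a
plain loop stands in for for_each_set_bit(), and subsystem names stand
in for struct cgroup_subsys pointers; all names here are illustrative):

	#include <stdio.h>

	#define SUBSYS_COUNT 4
	static const char * const subsys_name[SUBSYS_COUNT] = {
		"cpu", "memory", "io", "pids"
	};

	#define do_each_subsys_mask(name, ssid, ss_mask) do {		\
		unsigned long __ss_mask = (ss_mask);			\
		for ((ssid) = 0; (ssid) < SUBSYS_COUNT; (ssid)++) {	\
			if (!(__ss_mask & (1UL << (ssid))))		\
				continue;				\
			(name) = subsys_name[(ssid)];			\
			{

	#define while_each_subsys_mask()				\
			}						\
		}							\
	} while (0)

	int main(void)
	{
		const char *name;
		int ssid;

		/* bits 1 and 3 set: body runs for "memory" and "pids" */
		do_each_subsys_mask(name, ssid, 0xa) {
			printf("ssid=%d name=%s\n", ssid, name);
		} while_each_subsys_mask();
		return 0;
	}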
 
 /* iterate across the hierarchies */
 #define for_each_root(root)                                            \
@@ -535,6 +582,24 @@ static int notify_on_release(const struct cgroup *cgrp)
                        ;                                               \
                else
 
+/* walk live descendants in preorder */
+#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)         \
+       css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))  \
+               if (({ lockdep_assert_held(&cgroup_mutex);              \
+                      (dsct) = (d_css)->cgroup;                        \
+                      cgroup_is_dead(dsct); }))                        \
+                       ;                                               \
+               else
+
+/* walk live descendants in postorder */
+#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)                \
+       css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
+               if (({ lockdep_assert_held(&cgroup_mutex);              \
+                      (dsct) = (d_css)->cgroup;                        \
+                      cgroup_is_dead(dsct); }))                        \
+                       ;                                               \
+               else
+
 static void cgroup_release_agent(struct work_struct *work);
 static void check_for_release(struct cgroup *cgrp);
 
@@ -665,6 +730,9 @@ static void css_set_move_task(struct task_struct *task,
 {
        lockdep_assert_held(&css_set_lock);
 
+       if (to_cset && !css_set_populated(to_cset))
+               css_set_update_populated(to_cset, true);
+
        if (from_cset) {
                struct css_task_iter *it, *pos;
 
@@ -698,8 +766,6 @@ static void css_set_move_task(struct task_struct *task,
                 */
                WARN_ON_ONCE(task->flags & PF_EXITING);
 
-               if (!css_set_populated(to_cset))
-                       css_set_update_populated(to_cset, true);
                rcu_assign_pointer(task->cgroups, to_cset);
                list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
                                                             &to_cset->tasks);
@@ -1102,13 +1168,13 @@ static void cgroup_destroy_root(struct cgroup_root *root)
        struct cgroup *cgrp = &root->cgrp;
        struct cgrp_cset_link *link, *tmp_link;
 
-       mutex_lock(&cgroup_mutex);
+       cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
 
        BUG_ON(atomic_read(&root->nr_cgrps));
        BUG_ON(!list_empty(&cgrp->self.children));
 
        /* Rebind all subsystems back to the default hierarchy */
-       rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
+       WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
 
        /*
         * Release all the links from cset_links to this hierarchy's
@@ -1248,46 +1314,40 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
 }
 
 /**
- * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
- * @cgrp: the target cgroup
+ * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
  * @subtree_control: the new subtree_control mask to consider
+ * @this_ss_mask: available subsystems
  *
  * On the default hierarchy, a subsystem may request other subsystems to be
  * enabled together through its ->depends_on mask.  In such cases, more
  * subsystems than specified in "cgroup.subtree_control" may be enabled.
  *
  * This function calculates which subsystems need to be enabled if
- * @subtree_control is to be applied to @cgrp.  The returned mask is always
- * a superset of @subtree_control and follows the usual hierarchy rules.
+ * @subtree_control is to be applied while restricted to @this_ss_mask.
  */
-static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
-                                                 unsigned long subtree_control)
+static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
 {
-       struct cgroup *parent = cgroup_parent(cgrp);
-       unsigned long cur_ss_mask = subtree_control;
+       u16 cur_ss_mask = subtree_control;
        struct cgroup_subsys *ss;
        int ssid;
 
        lockdep_assert_held(&cgroup_mutex);
 
-       if (!cgroup_on_dfl(cgrp))
-               return cur_ss_mask;
+       cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
 
        while (true) {
-               unsigned long new_ss_mask = cur_ss_mask;
+               u16 new_ss_mask = cur_ss_mask;
 
-               for_each_subsys_which(ss, ssid, &cur_ss_mask)
+               do_each_subsys_mask(ss, ssid, cur_ss_mask) {
                        new_ss_mask |= ss->depends_on;
+               } while_each_subsys_mask();
 
                /*
                 * Mask out subsystems which aren't available.  This can
                 * happen only if some depended-upon subsystems were bound
                 * to non-default hierarchies.
                 */
-               if (parent)
-                       new_ss_mask &= parent->child_subsys_mask;
-               else
-                       new_ss_mask &= cgrp->root->subsys_mask;
+               new_ss_mask &= this_ss_mask;
 
                if (new_ss_mask == cur_ss_mask)
                        break;
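
The loop above runs the dependency closure to a fixed point: it
repeatedly folds each enabled subsystem's ->depends_on into the mask,
clamps to @this_ss_mask, and stops when the mask no longer grows.  A
standalone sketch of the same computation (hypothetical dependency
table; the cgrp_dfl_implicit_ss_mask OR is omitted):

	#include <stdio.h>

	#define SUBSYS_COUNT 4

	/* hypothetical ->depends_on: subsys 0 pulls in 1, 1 pulls in 2 */
	static const unsigned int depends_on[SUBSYS_COUNT] = {
		1u << 1, 1u << 2, 0, 0
	};

	static unsigned int calc_subtree_ss_mask(unsigned int subtree_control,
						 unsigned int this_ss_mask)
	{
		unsigned int cur_ss_mask = subtree_control;

		for (;;) {
			unsigned int new_ss_mask = cur_ss_mask;
			int ssid;

			for (ssid = 0; ssid < SUBSYS_COUNT; ssid++)
				if (cur_ss_mask & (1u << ssid))
					new_ss_mask |= depends_on[ssid];

			/* mask out subsystems which aren't available */
			new_ss_mask &= this_ss_mask;

			if (new_ss_mask == cur_ss_mask)
				return cur_ss_mask;
			cur_ss_mask = new_ss_mask;
		}
	}

	int main(void)
	{
		/* enabling subsys 0 transitively enables 1 and 2: 0x7 */
		printf("0x%x\n", calc_subtree_ss_mask(1u << 0, 0xf));
		return 0;
	}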
@@ -1297,19 +1357,6 @@ static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
        return cur_ss_mask;
 }
 
-/**
- * cgroup_refresh_child_subsys_mask - update child_subsys_mask
- * @cgrp: the target cgroup
- *
- * Update @cgrp->child_subsys_mask according to the current
- * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
- */
-static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
-{
-       cgrp->child_subsys_mask =
-               cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
-}
-
 /**
  * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
  * @kn: the kernfs_node being serviced
@@ -1338,19 +1385,22 @@ static void cgroup_kn_unlock(struct kernfs_node *kn)
 /**
  * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
  * @kn: the kernfs_node being serviced
+ * @drain_offline: perform offline draining on the cgroup
  *
  * This helper is to be used by a cgroup kernfs method currently servicing
  * @kn.  It breaks the active protection, performs cgroup locking and
  * verifies that the associated cgroup is alive.  Returns the cgroup if
  * alive; otherwise, %NULL.  A successful return should be undone by a
- * matching cgroup_kn_unlock() invocation.
+ * matching cgroup_kn_unlock() invocation.  If @drain_offline is %true, the
+ * cgroup is drained of offlining csses before return.
  *
  * Any cgroup kernfs method implementation which requires locking the
  * associated cgroup should use this helper.  It avoids nesting cgroup
  * locking under kernfs active protection and allows all kernfs operations
  * including self-removal.
  */
-static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
+static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
+                                         bool drain_offline)
 {
        struct cgroup *cgrp;
 
@@ -1369,7 +1419,10 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
                return NULL;
        kernfs_break_active_protection(kn);
 
-       mutex_lock(&cgroup_mutex);
+       if (drain_offline)
+               cgroup_lock_and_drain_offline(cgrp);
+       else
+               mutex_lock(&cgroup_mutex);
 
        if (!cgroup_is_dead(cgrp))
                return cgrp;
@@ -1399,14 +1452,17 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 /**
  * css_clear_dir - remove subsys files in a cgroup directory
  * @css: target css
- * @cgrp_override: specify if target cgroup is different from css->cgroup
  */
-static void css_clear_dir(struct cgroup_subsys_state *css,
-                         struct cgroup *cgrp_override)
+static void css_clear_dir(struct cgroup_subsys_state *css)
 {
-       struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+       struct cgroup *cgrp = css->cgroup;
        struct cftype *cfts;
 
+       if (!(css->flags & CSS_VISIBLE))
+               return;
+
+       css->flags &= ~CSS_VISIBLE;
+
        list_for_each_entry(cfts, &css->ss->cfts, node)
                cgroup_addrm_files(css, cgrp, cfts, false);
 }
@@ -1414,17 +1470,18 @@ static void css_clear_dir(struct cgroup_subsys_state *css,
 /**
  * css_populate_dir - create subsys files in a cgroup directory
  * @css: target css
- * @cgrp_overried: specify if target cgroup is different from css->cgroup
  *
  * On failure, no file is added.
  */
-static int css_populate_dir(struct cgroup_subsys_state *css,
-                           struct cgroup *cgrp_override)
+static int css_populate_dir(struct cgroup_subsys_state *css)
 {
-       struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+       struct cgroup *cgrp = css->cgroup;
        struct cftype *cfts, *failed_cfts;
        int ret;
 
+       if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
+               return 0;
+
        if (!css->ss) {
                if (cgroup_on_dfl(cgrp))
                        cfts = cgroup_dfl_base_files;
@@ -1441,6 +1498,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css,
                        goto err;
                }
        }
+
+       css->flags |= CSS_VISIBLE;
+
        return 0;
 err:
        list_for_each_entry(cfts, &css->ss->cfts, node) {
@@ -1451,67 +1511,30 @@ err:
        return ret;
 }
 
-static int rebind_subsystems(struct cgroup_root *dst_root,
-                            unsigned long ss_mask)
+static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 {
        struct cgroup *dcgrp = &dst_root->cgrp;
        struct cgroup_subsys *ss;
-       unsigned long tmp_ss_mask;
        int ssid, i, ret;
 
        lockdep_assert_held(&cgroup_mutex);
 
-       for_each_subsys_which(ss, ssid, &ss_mask) {
-               /* if @ss has non-root csses attached to it, can't move */
-               if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
+       do_each_subsys_mask(ss, ssid, ss_mask) {
+               /*
+                * If @ss has non-root csses attached to it, can't move.
+                * If @ss is an implicit controller, it is exempt from this
+                * rule and can be stolen.
+                */
+               if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
+                   !ss->implicit_on_dfl)
                        return -EBUSY;
 
                /* can't move between two non-dummy roots either */
                if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
                        return -EBUSY;
-       }
-
-       /* skip creating root files on dfl_root for inhibited subsystems */
-       tmp_ss_mask = ss_mask;
-       if (dst_root == &cgrp_dfl_root)
-               tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
-
-       for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
-               struct cgroup *scgrp = &ss->root->cgrp;
-               int tssid;
-
-               ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
-               if (!ret)
-                       continue;
-
-               /*
-                * Rebinding back to the default root is not allowed to
-                * fail.  Using both default and non-default roots should
-                * be rare.  Moving subsystems back and forth even more so.
-                * Just warn about it and continue.
-                */
-               if (dst_root == &cgrp_dfl_root) {
-                       if (cgrp_dfl_root_visible) {
-                               pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
-                                       ret, ss_mask);
-                               pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
-                       }
-                       continue;
-               }
-
-               for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
-                       if (tssid == ssid)
-                               break;
-                       css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
-               }
-               return ret;
-       }
+       } while_each_subsys_mask();
 
-       /*
-        * Nothing can fail from this point on.  Remove files for the
-        * removed subsystems and rebind each subsystem.
-        */
-       for_each_subsys_which(ss, ssid, &ss_mask) {
+       do_each_subsys_mask(ss, ssid, ss_mask) {
                struct cgroup_root *src_root = ss->root;
                struct cgroup *scgrp = &src_root->cgrp;
                struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
@@ -1519,8 +1542,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
 
                WARN_ON(!css || cgroup_css(dcgrp, ss));
 
-               css_clear_dir(css, NULL);
+               /* disable from the source */
+               src_root->subsys_mask &= ~(1 << ssid);
+               WARN_ON(cgroup_apply_control(scgrp));
+               cgroup_finalize_control(scgrp, 0);
 
+               /* rebind */
                RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
                rcu_assign_pointer(dcgrp->subsys[ssid], css);
                ss->root = dst_root;
@@ -1532,23 +1559,23 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
                                       &dcgrp->e_csets[ss->id]);
                spin_unlock_bh(&css_set_lock);
 
-               src_root->subsys_mask &= ~(1 << ssid);
-               scgrp->subtree_control &= ~(1 << ssid);
-               cgroup_refresh_child_subsys_mask(scgrp);
-
                /* default hierarchy doesn't enable controllers by default */
                dst_root->subsys_mask |= 1 << ssid;
                if (dst_root == &cgrp_dfl_root) {
                        static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
                } else {
                        dcgrp->subtree_control |= 1 << ssid;
-                       cgroup_refresh_child_subsys_mask(dcgrp);
                        static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
                }
 
+               ret = cgroup_apply_control(dcgrp);
+               if (ret)
+                       pr_warn("partial failure to rebind %s controller (err=%d)\n",
+                               ss->name, ret);
+
                if (ss->bind)
                        ss->bind(css);
-       }
+       } while_each_subsys_mask();
 
        kernfs_activate(dcgrp->kn);
        return 0;
@@ -1584,7 +1611,7 @@ static int cgroup_show_options(struct seq_file *seq,
 }
 
 struct cgroup_sb_opts {
-       unsigned long subsys_mask;
+       u16 subsys_mask;
        unsigned int flags;
        char *release_agent;
        bool cpuset_clone_children;
@@ -1597,13 +1624,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
        char *token, *o = data;
        bool all_ss = false, one_ss = false;
-       unsigned long mask = -1UL;
+       u16 mask = U16_MAX;
        struct cgroup_subsys *ss;
        int nr_opts = 0;
        int i;
 
 #ifdef CONFIG_CPUSETS
-       mask = ~(1U << cpuset_cgrp_id);
+       mask = ~((u16)1 << cpuset_cgrp_id);
 #endif
 
        memset(opts, 0, sizeof(*opts));
@@ -1678,6 +1705,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
                                continue;
                        if (!cgroup_ssid_enabled(i))
                                continue;
+                       if (cgroup_ssid_no_v1(i))
+                               continue;
 
                        /* Mutually exclusive option 'all' + subsystem name */
                        if (all_ss)
@@ -1698,7 +1727,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
         */
        if (all_ss || (!one_ss && !opts->none && !opts->name))
                for_each_subsys(ss, i)
-                       if (cgroup_ssid_enabled(i))
+                       if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
                                opts->subsys_mask |= (1 << i);
 
        /*
@@ -1728,14 +1757,14 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
        int ret = 0;
        struct cgroup_root *root = cgroup_root_from_kf(kf_root);
        struct cgroup_sb_opts opts;
-       unsigned long added_mask, removed_mask;
+       u16 added_mask, removed_mask;
 
        if (root == &cgrp_dfl_root) {
                pr_err("remount is not allowed\n");
                return -EINVAL;
        }
 
-       mutex_lock(&cgroup_mutex);
+       cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
 
        /* See what subsystems are wanted */
        ret = parse_cgroupfs_options(data, &opts);
@@ -1768,7 +1797,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
        if (ret)
                goto out_unlock;
 
-       rebind_subsystems(&cgrp_dfl_root, removed_mask);
+       WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
 
        if (opts.release_agent) {
                spin_lock(&release_agent_path_lock);
@@ -1876,7 +1905,7 @@ static void init_cgroup_root(struct cgroup_root *root,
                set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }
 
-static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
+static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 {
        LIST_HEAD(tmp_links);
        struct cgroup *root_cgrp = &root->cgrp;
@@ -1899,10 +1928,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
        /*
         * We're accessing css_set_count without locking css_set_lock here,
         * but that's OK - it can only be increased by someone holding
-        * cgroup_lock, and that's us. The worst that can happen is that we
-        * have some link structures left over
+        * cgroup_lock, and that's us.  Later rebinding may disable
+        * controllers on the default hierarchy and thus create new csets,
+        * which can't be more than the existing ones.  Allocate 2x.
         */
-       ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
+       ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
        if (ret)
                goto cancel_ref;
 
@@ -1919,7 +1949,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
        }
        root_cgrp->kn = root->kf_root->kn;
 
-       ret = css_populate_dir(&root_cgrp->self, NULL);
+       ret = css_populate_dir(&root_cgrp->self);
        if (ret)
                goto destroy_root;
 
@@ -1992,13 +2022,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                        pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
                        return ERR_PTR(-EINVAL);
                }
-               cgrp_dfl_root_visible = true;
+               cgrp_dfl_visible = true;
                root = &cgrp_dfl_root;
                cgroup_get(&root->cgrp);
                goto out_mount;
        }
 
-       mutex_lock(&cgroup_mutex);
+       cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
 
        /* First find the desired set of subsystems */
        ret = parse_cgroupfs_options(data, &opts);
@@ -2338,38 +2368,38 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
 }
 
 /**
- * cgroup_taskset_migrate - migrate a taskset to a cgroup
+ * cgroup_taskset_migrate - migrate a taskset
  * @tset: target taskset
- * @dst_cgrp: destination cgroup
+ * @root: cgroup root the migration is taking place on
  *
- * Migrate tasks in @tset to @dst_cgrp.  This function fails iff one of the
- * ->can_attach callbacks fails and guarantees that either all or none of
- * the tasks in @tset are migrated.  @tset is consumed regardless of
- * success.
+ * Migrate tasks in @tset as setup by migration preparation functions.
+ * This function fails iff one of the ->can_attach callbacks fails and
+ * guarantees that either all or none of the tasks in @tset are migrated.
+ * @tset is consumed regardless of success.
  */
 static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
-                                 struct cgroup *dst_cgrp)
+                                 struct cgroup_root *root)
 {
-       struct cgroup_subsys_state *css, *failed_css = NULL;
+       struct cgroup_subsys *ss;
        struct task_struct *task, *tmp_task;
        struct css_set *cset, *tmp_cset;
-       int i, ret;
+       int ssid, failed_ssid, ret;
 
        /* methods shouldn't be called if no task is actually migrating */
        if (list_empty(&tset->src_csets))
                return 0;
 
        /* check that we can legitimately attach to the cgroup */
-       for_each_e_css(css, i, dst_cgrp) {
-               if (css->ss->can_attach) {
-                       tset->ssid = i;
-                       ret = css->ss->can_attach(tset);
+       do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+               if (ss->can_attach) {
+                       tset->ssid = ssid;
+                       ret = ss->can_attach(tset);
                        if (ret) {
-                               failed_css = css;
+                               failed_ssid = ssid;
                                goto out_cancel_attach;
                        }
                }
-       }
+       } while_each_subsys_mask();
 
        /*
         * Now that we're guaranteed success, proceed to move all tasks to
@@ -2396,25 +2426,25 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
         */
        tset->csets = &tset->dst_csets;
 
-       for_each_e_css(css, i, dst_cgrp) {
-               if (css->ss->attach) {
-                       tset->ssid = i;
-                       css->ss->attach(tset);
+       do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+               if (ss->attach) {
+                       tset->ssid = ssid;
+                       ss->attach(tset);
                }
-       }
+       } while_each_subsys_mask();
 
        ret = 0;
        goto out_release_tset;
 
 out_cancel_attach:
-       for_each_e_css(css, i, dst_cgrp) {
-               if (css == failed_css)
+       do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+               if (ssid == failed_ssid)
                        break;
-               if (css->ss->cancel_attach) {
-                       tset->ssid = i;
-                       css->ss->cancel_attach(tset);
+               if (ss->cancel_attach) {
+                       tset->ssid = ssid;
+                       ss->cancel_attach(tset);
                }
-       }
+       } while_each_subsys_mask();
 out_release_tset:
        spin_lock_bh(&css_set_lock);
        list_splice_init(&tset->dst_csets, &tset->src_csets);
@@ -2426,6 +2456,20 @@ out_release_tset:
        return ret;
 }
 
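
The restructuring above keeps cgroup_taskset_migrate()'s all-or-nothing
shape: every ->can_attach() is polled first, the ->attach() commits run
only once all have agreed, and on refusal ->cancel_attach() is invoked
for exactly the subsystems already polled.  A self-contained sketch of
that shape (callback bodies are made up):

	#include <stdio.h>

	#define NR_SS 3

	/* hypothetical per-subsystem callbacks; subsystem 2 refuses */
	static int can_attach(int ssid)	{ return ssid == 2 ? -1 : 0; }
	static void attach(int ssid)	{ printf("attach %d\n", ssid); }
	static void cancel_attach(int ssid)
	{
		printf("cancel_attach %d\n", ssid);
	}

	int main(void)
	{
		int ssid, failed_ssid = -1, ret = 0;

		/* phase 1: ask everyone; bail to rollback on refusal */
		for (ssid = 0; ssid < NR_SS; ssid++) {
			ret = can_attach(ssid);
			if (ret) {
				failed_ssid = ssid;
				goto out_cancel_attach;
			}
		}

		/* phase 2: success is guaranteed, commit the moves */
		for (ssid = 0; ssid < NR_SS; ssid++)
			attach(ssid);
		return 0;

	out_cancel_attach:
		/* undo only the subsystems that already said yes */
		for (ssid = 0; ssid < NR_SS; ssid++) {
			if (ssid == failed_ssid)
				break;
			cancel_attach(ssid);
		}
		return 1;
	}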
+/**
+ * cgroup_may_migrate_to - verify whether a cgroup can be a migration destination
+ * @dst_cgrp: destination cgroup to test
+ *
+ * On the default hierarchy, except for the root, subtree_control must be
+ * zero for migration destination cgroups with tasks so that child cgroups
+ * don't compete against tasks.
+ */
+static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+{
+       return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
+               !dst_cgrp->subtree_control;
+}
+
 /**
  * cgroup_migrate_finish - cleanup after attach
  * @preloaded_csets: list of preloaded css_sets
@@ -2442,6 +2486,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
        spin_lock_bh(&css_set_lock);
        list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
                cset->mg_src_cgrp = NULL;
+               cset->mg_dst_cgrp = NULL;
                cset->mg_dst_cset = NULL;
                list_del_init(&cset->mg_preload_node);
                put_css_set_locked(cset);
@@ -2474,58 +2519,56 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
        lockdep_assert_held(&cgroup_mutex);
        lockdep_assert_held(&css_set_lock);
 
+       /*
+        * If ->dead, @src_cset is associated with one or more dead cgroups
+        * and doesn't contain any migratable tasks.  Ignore it early so
+        * that the rest of the migration path doesn't get confused by it.
+        */
+       if (src_cset->dead)
+               return;
+
        src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
 
        if (!list_empty(&src_cset->mg_preload_node))
                return;
 
        WARN_ON(src_cset->mg_src_cgrp);
+       WARN_ON(src_cset->mg_dst_cgrp);
        WARN_ON(!list_empty(&src_cset->mg_tasks));
        WARN_ON(!list_empty(&src_cset->mg_node));
 
        src_cset->mg_src_cgrp = src_cgrp;
+       src_cset->mg_dst_cgrp = dst_cgrp;
        get_css_set(src_cset);
        list_add(&src_cset->mg_preload_node, preloaded_csets);
 }
 
 /**
  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
- * @dst_cgrp: the destination cgroup (may be %NULL)
  * @preloaded_csets: list of preloaded source css_sets
  *
- * Tasks are about to be moved to @dst_cgrp and all the source css_sets
- * have been preloaded to @preloaded_csets.  This function looks up and
- * pins all destination css_sets, links each to its source, and append them
- * to @preloaded_csets.  If @dst_cgrp is %NULL, the destination of each
- * source css_set is assumed to be its cgroup on the default hierarchy.
+ * Tasks are about to be moved and all the source css_sets have been
+ * preloaded to @preloaded_csets.  This function looks up and pins all
+ * destination css_sets, links each to its source, and appends them to
+ * @preloaded_csets.
  *
  * This function must be called after cgroup_migrate_add_src() has been
  * called on each migration source css_set.  After migration is performed
  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
  * @preloaded_csets.
  */
-static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
-                                     struct list_head *preloaded_csets)
+static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
 {
        LIST_HEAD(csets);
        struct css_set *src_cset, *tmp_cset;
 
        lockdep_assert_held(&cgroup_mutex);
 
-       /*
-        * Except for the root, child_subsys_mask must be zero for a cgroup
-        * with tasks so that child cgroups don't compete against tasks.
-        */
-       if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
-           dst_cgrp->child_subsys_mask)
-               return -EBUSY;
-
        /* look up the dst cset for each src cset and link it to src */
        list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
                struct css_set *dst_cset;
 
-               dst_cset = find_css_set(src_cset,
-                                       dst_cgrp ?: src_cset->dfl_cgrp);
+               dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
                if (!dst_cset)
                        goto err;
 
@@ -2538,6 +2581,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
                 */
                if (src_cset == dst_cset) {
                        src_cset->mg_src_cgrp = NULL;
+                       src_cset->mg_dst_cgrp = NULL;
                        list_del_init(&src_cset->mg_preload_node);
                        put_css_set(src_cset);
                        put_css_set(dst_cset);
@@ -2563,11 +2607,11 @@ err:
  * cgroup_migrate - migrate a process or task to a cgroup
  * @leader: the leader of the process or the task to migrate
  * @threadgroup: whether @leader points to the whole process or a single task
- * @cgrp: the destination cgroup
+ * @root: cgroup root migration is taking place on
  *
- * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
- * process, the caller must be holding cgroup_threadgroup_rwsem.  The
- * caller is also responsible for invoking cgroup_migrate_add_src() and
+ * Migrate a process or task denoted by @leader.  If migrating a process,
+ * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
+ * responsible for invoking cgroup_migrate_add_src() and
  * cgroup_migrate_prepare_dst() on the targets before invoking this
  * function and following up with cgroup_migrate_finish().
  *
@@ -2578,7 +2622,7 @@ err:
  * actually starting migrating.
  */
 static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
-                         struct cgroup *cgrp)
+                         struct cgroup_root *root)
 {
        struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
        struct task_struct *task;
@@ -2599,7 +2643,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
        rcu_read_unlock();
        spin_unlock_bh(&css_set_lock);
 
-       return cgroup_taskset_migrate(&tset, cgrp);
+       return cgroup_taskset_migrate(&tset, root);
 }
 
 /**
@@ -2617,6 +2661,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
        struct task_struct *task;
        int ret;
 
+       if (!cgroup_may_migrate_to(dst_cgrp))
+               return -EBUSY;
+
        /* look up all src csets */
        spin_lock_bh(&css_set_lock);
        rcu_read_lock();
@@ -2631,9 +2678,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
        spin_unlock_bh(&css_set_lock);
 
        /* prepare dst csets and commit */
-       ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
+       ret = cgroup_migrate_prepare_dst(&preloaded_csets);
        if (!ret)
-               ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
+               ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
 
        cgroup_migrate_finish(&preloaded_csets);
        return ret;
@@ -2696,7 +2743,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
        if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
                return -EINVAL;
 
-       cgrp = cgroup_kn_lock_live(of->kn);
+       cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENODEV;
 
@@ -2794,7 +2841,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
 
        BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
 
-       cgrp = cgroup_kn_lock_live(of->kn);
+       cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENODEV;
        spin_lock(&release_agent_path_lock);
@@ -2822,38 +2869,28 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
        return 0;
 }
 
-static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask)
+static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
 {
        struct cgroup_subsys *ss;
        bool printed = false;
        int ssid;
 
-       for_each_subsys_which(ss, ssid, &ss_mask) {
+       do_each_subsys_mask(ss, ssid, ss_mask) {
                if (printed)
                        seq_putc(seq, ' ');
                seq_printf(seq, "%s", ss->name);
                printed = true;
-       }
+       } while_each_subsys_mask();
        if (printed)
                seq_putc(seq, '\n');
 }
 
-/* show controllers which are currently attached to the default hierarchy */
-static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
-{
-       struct cgroup *cgrp = seq_css(seq)->cgroup;
-
-       cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
-                            ~cgrp_dfl_root_inhibit_ss_mask);
-       return 0;
-}
-
 /* show controllers which are enabled from the parent */
 static int cgroup_controllers_show(struct seq_file *seq, void *v)
 {
        struct cgroup *cgrp = seq_css(seq)->cgroup;
 
-       cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
+       cgroup_print_ss_mask(seq, cgroup_control(cgrp));
        return 0;
 }
 
@@ -2870,16 +2907,17 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
  * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
  * @cgrp: root of the subtree to update csses for
  *
- * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
- * css associations need to be updated accordingly.  This function looks up
- * all css_sets which are attached to the subtree, creates the matching
- * updated css_sets and migrates the tasks to the new ones.
+ * @cgrp's control masks have changed and its subtree's css associations
+ * need to be updated accordingly.  This function looks up all css_sets
+ * which are attached to the subtree, creates the matching updated css_sets
+ * and migrates the tasks to the new ones.
  */
 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 {
        LIST_HEAD(preloaded_csets);
        struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
-       struct cgroup_subsys_state *css;
+       struct cgroup_subsys_state *d_css;
+       struct cgroup *dsct;
        struct css_set *src_cset;
        int ret;
 
@@ -2889,21 +2927,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 
        /* look up all csses currently attached to @cgrp's subtree */
        spin_lock_bh(&css_set_lock);
-       css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
+       cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
                struct cgrp_cset_link *link;
 
-               /* self is not affected by child_subsys_mask change */
-               if (css->cgroup == cgrp)
-                       continue;
-
-               list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
-                       cgroup_migrate_add_src(link->cset, cgrp,
+               list_for_each_entry(link, &dsct->cset_links, cset_link)
+                       cgroup_migrate_add_src(link->cset, dsct,
                                               &preloaded_csets);
        }
        spin_unlock_bh(&css_set_lock);
 
-       /* NULL dst indicates self on default hierarchy */
-       ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
+       ret = cgroup_migrate_prepare_dst(&preloaded_csets);
        if (ret)
                goto out_finish;
 
@@ -2921,20 +2955,272 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
        }
        spin_unlock_bh(&css_set_lock);
 
-       ret = cgroup_taskset_migrate(&tset, cgrp);
+       ret = cgroup_taskset_migrate(&tset, cgrp->root);
 out_finish:
        cgroup_migrate_finish(&preloaded_csets);
        percpu_up_write(&cgroup_threadgroup_rwsem);
        return ret;
 }
 
+/**
+ * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
+ * @cgrp: root of the target subtree
+ *
+ * Because css offlining is asynchronous, userland may try to re-enable a
+ * controller while the previous css is still around.  This function grabs
+ * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
+ */
+static void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
+       __acquires(&cgroup_mutex)
+{
+       struct cgroup *dsct;
+       struct cgroup_subsys_state *d_css;
+       struct cgroup_subsys *ss;
+       int ssid;
+
+restart:
+       mutex_lock(&cgroup_mutex);
+
+       cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+               for_each_subsys(ss, ssid) {
+                       struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+                       DEFINE_WAIT(wait);
+
+                       if (!css || !percpu_ref_is_dying(&css->refcnt))
+                               continue;
+
+                       cgroup_get(dsct);
+                       prepare_to_wait(&dsct->offline_waitq, &wait,
+                                       TASK_UNINTERRUPTIBLE);
+
+                       mutex_unlock(&cgroup_mutex);
+                       schedule();
+                       finish_wait(&dsct->offline_waitq, &wait);
+
+                       cgroup_put(dsct);
+                       goto restart;
+               }
+       }
+}
+
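
The drop-lock/wait/retake-and-recheck shape above can be modeled in
userspace with a condition variable standing in for the offline
waitqueue (all names below are illustrative, not kernel API):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t offline_waitq = PTHREAD_COND_INITIALIZER;
	static bool css_dying = true;

	static void *async_offliner(void *arg)
	{
		(void)arg;
		pthread_mutex_lock(&big_lock);
		css_dying = false;	/* the old css finishes offlining */
		pthread_cond_broadcast(&offline_waitq);
		pthread_mutex_unlock(&big_lock);
		return NULL;
	}

	static void lock_and_drain_offline(void)
	{
	restart:
		pthread_mutex_lock(&big_lock);
		if (css_dying) {
			/* kernel: prepare_to_wait(), unlock, schedule() */
			pthread_cond_wait(&offline_waitq, &big_lock);
			pthread_mutex_unlock(&big_lock);
			goto restart;
		}
		/* returns with big_lock held and nothing left dying */
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, async_offliner, NULL);
		lock_and_drain_offline();
		printf("drained; lock held\n");
		pthread_mutex_unlock(&big_lock);
		pthread_join(t, NULL);
		return 0;
	}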
+/**
+ * cgroup_save_control - save control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * Save ->subtree_control and ->subtree_ss_mask to the respective old_
+ * prefixed fields for @cgrp's subtree including @cgrp itself.
+ */
+static void cgroup_save_control(struct cgroup *cgrp)
+{
+       struct cgroup *dsct;
+       struct cgroup_subsys_state *d_css;
+
+       cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+               dsct->old_subtree_control = dsct->subtree_control;
+               dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
+       }
+}
+
+/**
+ * cgroup_propagate_control - refresh control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
+ * ->subtree_control and propagate controller availability through the
+ * subtree so that descendants don't have unavailable controllers enabled.
+ */
+static void cgroup_propagate_control(struct cgroup *cgrp)
+{
+       struct cgroup *dsct;
+       struct cgroup_subsys_state *d_css;
+
+       cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+               dsct->subtree_control &= cgroup_control(dsct);
+               dsct->subtree_ss_mask =
+                       cgroup_calc_subtree_ss_mask(dsct->subtree_control,
+                                                   cgroup_ss_mask(dsct));
+       }
+}
+
+/**
+ * cgroup_restore_control - restore control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * Restore ->subtree_control and ->subtree_ss_mask from the respective old_
+ * prefixed fields for @cgrp's subtree including @cgrp itself.
+ */
+static void cgroup_restore_control(struct cgroup *cgrp)
+{
+       struct cgroup *dsct;
+       struct cgroup_subsys_state *d_css;
+
+       cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+               dsct->subtree_control = dsct->old_subtree_control;
+               dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
+       }
+}
+
+static bool css_visible(struct cgroup_subsys_state *css)
+{
+       struct cgroup_subsys *ss = css->ss;
+       struct cgroup *cgrp = css->cgroup;
+
+       if (cgroup_control(cgrp) & (1 << ss->id))
+               return true;
+       if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
+               return false;
+       return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
+}
+
+/**
+ * cgroup_apply_control_enable - enable or show csses according to control
+ * @cgrp: root of the target subtree
+ *
+ * Walk @cgrp's subtree and create new csses or make the existing ones
+ * visible.  A css is created invisible if it's being implicitly enabled
+ * through dependency.  An invisible css is made visible when the userland
+ * explicitly enables it.
+ *
+ * Returns 0 on success, -errno on failure.  On failure, csses which have
+ * been processed already aren't cleaned up.  The caller is responsible for
+ * cleaning up with cgroup_apply_control_disable().
+ */
+static int cgroup_apply_control_enable(struct cgroup *cgrp)
+{
+       struct cgroup *dsct;
+       struct cgroup_subsys_state *d_css;
+       struct cgroup_subsys *ss;
+       int ssid, ret;
+
+       cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+               for_each_subsys(ss, ssid) {
+                       struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+
+                       WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
+
+                       if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
+                               continue;
+
+                       if (!css) {
+                               css = css_create(dsct, ss);
+                               if (IS_ERR(css))
+                                       return PTR_ERR(css);
+                       }
+
+                       if (css_visible(css)) {
+                               ret = css_populate_dir(css);
+                               if (ret)
+                                       return ret;
+                       }
+               }
+       }
+
+       return 0;
+}
+
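For example, a controller pulled into cgroup_ss_mask() only through another controller's ->depends_on mask gets its css created here without interface files; the css_visible() check skips css_populate_dir() for it, and the files appear only once userland enables that controller in ->subtree_control explicitly.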
+/**
+ * cgroup_apply_control_disable - kill or hide csses according to control
+ * @cgrp: root of the target subtree
+ *
+ * Walk @cgrp's subtree and kill or hide csses so that they match
+ * cgroup_ss_mask() and css_visible().
+ *
+ * A css is hidden when userland requests it to be disabled while other
+ * subsystems still depend on it.  A hidden css must not actively control
+ * resources and must be in the vanilla state when made visible again later.
+ * Controllers which may be depended upon should provide ->css_reset() for
+ * this purpose.
+ */
+static void cgroup_apply_control_disable(struct cgroup *cgrp)
+{
+       struct cgroup *dsct;
+       struct cgroup_subsys_state *d_css;
+       struct cgroup_subsys *ss;
+       int ssid;
+
+       cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+               for_each_subsys(ss, ssid) {
+                       struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+
+                       WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
+
+                       if (!css)
+                               continue;
+
+                       if (css->parent &&
+                           !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
+                               kill_css(css);
+                       } else if (!css_visible(css)) {
+                               css_clear_dir(css);
+                               if (ss->css_reset)
+                                       ss->css_reset(css);
+                       }
+               }
+       }
+}
+
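Concretely: when userland turns off a controller that some other enabled controller still depends on, its css stays in cgroup_ss_mask() and is only hidden, so css_clear_dir() removes its interface files and ->css_reset() returns it to the vanilla state; a css whose bit has dropped out of cgroup_ss_mask() entirely is killed outright (root csses excepted).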
+/**
+ * cgroup_apply_control - apply control mask updates to the subtree
+ * @cgrp: root of the target subtree
+ *
+ * Subsystems can be enabled and disabled in a subtree using the following
+ * steps.
+ *
+ * 1. Call cgroup_save_control() to stash the current state.
+ * 2. Update ->subtree_control masks in the subtree as desired.
+ * 3. Call cgroup_apply_control() to apply the changes.
+ * 4. Optionally perform other related operations.
+ * 5. Call cgroup_finalize_control() to finish up.
+ *
+ * This function implements step 3 and propagates the mask changes
+ * throughout @cgrp's subtree, updates csses accordingly and performs
+ * process migrations.  See the sketch following cgroup_finalize_control()
+ * below for the full sequence.
+ */
+static int cgroup_apply_control(struct cgroup *cgrp)
+{
+       int ret;
+
+       cgroup_propagate_control(cgrp);
+
+       ret = cgroup_apply_control_enable(cgrp);
+       if (ret)
+               return ret;
+
+       /*
+        * At this point, cgroup_e_css() results reflect the new csses, so
+        * the following cgroup_update_dfl_csses() properly updates the css
+        * associations of all tasks in the subtree.
+        */
+       ret = cgroup_update_dfl_csses(cgrp);
+       if (ret)
+               return ret;
+
+       return 0;
+}
+
+/**
+ * cgroup_finalize_control - finalize control mask update
+ * @cgrp: root of the target subtree
+ * @ret: the result of the update
+ *
+ * Finalize control mask update.  See cgroup_apply_control() for more info.
+ */
+static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
+{
+       if (ret) {
+               cgroup_restore_control(cgrp);
+               cgroup_propagate_control(cgrp);
+       }
+
+       cgroup_apply_control_disable(cgrp);
+}
+
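Taken together, these helpers reduce every mask-changing path to one recursive, idempotent sequence. A sketch of a caller following the five steps (it mirrors the rewritten cgroup_subtree_control_write() below; example_toggle() is a hypothetical illustration, not part of the patch):

	static int example_toggle(struct cgroup *cgrp, u16 enable, u16 disable)
	{
		int ret;

		/* step 1: stash the current masks for a possible rollback */
		cgroup_save_control(cgrp);

		/* step 2: edit ->subtree_control as desired */
		cgrp->subtree_control |= enable;
		cgrp->subtree_control &= ~disable;

		/* step 3: propagate masks, create/show csses, migrate tasks */
		ret = cgroup_apply_control(cgrp);

		/* step 4: other related operations would go here */

		/* step 5: kill/hide csses; restores the old masks on failure */
		cgroup_finalize_control(cgrp, ret);

		return ret;
	}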
 /* change the enabled child controllers for a cgroup in the default hierarchy */
 static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                            char *buf, size_t nbytes,
                                            loff_t off)
 {
-       unsigned long enable = 0, disable = 0;
-       unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
+       u16 enable = 0, disable = 0;
        struct cgroup *cgrp, *child;
        struct cgroup_subsys *ss;
        char *tok;
@@ -2946,11 +3232,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
         */
        buf = strstrip(buf);
        while ((tok = strsep(&buf, " "))) {
-               unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask;
-
                if (tok[0] == '\0')
                        continue;
-               for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
+               do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
                        if (!cgroup_ssid_enabled(ssid) ||
                            strcmp(tok + 1, ss->name))
                                continue;
@@ -2965,12 +3249,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                return -EINVAL;
                        }
                        break;
-               }
+               } while_each_subsys_mask();
                if (ssid == CGROUP_SUBSYS_COUNT)
                        return -EINVAL;
        }
 
-       cgrp = cgroup_kn_lock_live(of->kn);
+       cgrp = cgroup_kn_lock_live(of->kn, true);
        if (!cgrp)
                return -ENODEV;
 
@@ -2981,10 +3265,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                continue;
                        }
 
-                       /* unavailable or not enabled on the parent? */
-                       if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
-                           (cgroup_parent(cgrp) &&
-                            !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
+                       if (!(cgroup_control(cgrp) & (1 << ssid))) {
                                ret = -ENOENT;
                                goto out_unlock;
                        }
@@ -3018,150 +3299,21 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                goto out_unlock;
        }
 
-       /*
-        * Update subsys masks and calculate what needs to be done.  More
-        * subsystems than specified may need to be enabled or disabled
-        * depending on subsystem dependencies.
-        */
-       old_sc = cgrp->subtree_control;
-       old_ss = cgrp->child_subsys_mask;
-       new_sc = (old_sc | enable) & ~disable;
-       new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
-
-       css_enable = ~old_ss & new_ss;
-       css_disable = old_ss & ~new_ss;
-       enable |= css_enable;
-       disable |= css_disable;
-
-       /*
-        * Because css offlining is asynchronous, userland might try to
-        * re-enable the same controller while the previous instance is
-        * still around.  In such cases, wait till it's gone using
-        * offline_waitq.
-        */
-       for_each_subsys_which(ss, ssid, &css_enable) {
-               cgroup_for_each_live_child(child, cgrp) {
-                       DEFINE_WAIT(wait);
-
-                       if (!cgroup_css(child, ss))
-                               continue;
-
-                       cgroup_get(child);
-                       prepare_to_wait(&child->offline_waitq, &wait,
-                                       TASK_UNINTERRUPTIBLE);
-                       cgroup_kn_unlock(of->kn);
-                       schedule();
-                       finish_wait(&child->offline_waitq, &wait);
-                       cgroup_put(child);
-
-                       return restart_syscall();
-               }
-       }
-
-       cgrp->subtree_control = new_sc;
-       cgrp->child_subsys_mask = new_ss;
-
-       /*
-        * Create new csses or make the existing ones visible.  A css is
-        * created invisible if it's being implicitly enabled through
-        * dependency.  An invisible css is made visible when the userland
-        * explicitly enables it.
-        */
-       for_each_subsys(ss, ssid) {
-               if (!(enable & (1 << ssid)))
-                       continue;
-
-               cgroup_for_each_live_child(child, cgrp) {
-                       if (css_enable & (1 << ssid))
-                               ret = create_css(child, ss,
-                                       cgrp->subtree_control & (1 << ssid));
-                       else
-                               ret = css_populate_dir(cgroup_css(child, ss),
-                                                      NULL);
-                       if (ret)
-                               goto err_undo_css;
-               }
-       }
-
-       /*
-        * At this point, cgroup_e_css() results reflect the new csses
-        * making the following cgroup_update_dfl_csses() properly update
-        * css associations of all tasks in the subtree.
-        */
-       ret = cgroup_update_dfl_csses(cgrp);
-       if (ret)
-               goto err_undo_css;
+       /* save and update control masks and prepare csses */
+       cgroup_save_control(cgrp);
 
-       /*
-        * All tasks are migrated out of disabled csses.  Kill or hide
-        * them.  A css is hidden when the userland requests it to be
-        * disabled while other subsystems are still depending on it.  The
-        * css must not actively control resources and be in the vanilla
-        * state if it's made visible again later.  Controllers which may
-        * be depended upon should provide ->css_reset() for this purpose.
-        */
-       for_each_subsys(ss, ssid) {
-               if (!(disable & (1 << ssid)))
-                       continue;
+       cgrp->subtree_control |= enable;
+       cgrp->subtree_control &= ~disable;
 
-               cgroup_for_each_live_child(child, cgrp) {
-                       struct cgroup_subsys_state *css = cgroup_css(child, ss);
+       ret = cgroup_apply_control(cgrp);
 
-                       if (css_disable & (1 << ssid)) {
-                               kill_css(css);
-                       } else {
-                               css_clear_dir(css, NULL);
-                               if (ss->css_reset)
-                                       ss->css_reset(css);
-                       }
-               }
-       }
-
-       /*
-        * The effective csses of all the descendants (excluding @cgrp) may
-        * have changed.  Subsystems can optionally subscribe to this event
-        * by implementing ->css_e_css_changed() which is invoked if any of
-        * the effective csses seen from the css's cgroup may have changed.
-        */
-       for_each_subsys(ss, ssid) {
-               struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
-               struct cgroup_subsys_state *css;
-
-               if (!ss->css_e_css_changed || !this_css)
-                       continue;
-
-               css_for_each_descendant_pre(css, this_css)
-                       if (css != this_css)
-                               ss->css_e_css_changed(css);
-       }
+       cgroup_finalize_control(cgrp, ret);
 
        kernfs_activate(cgrp->kn);
        ret = 0;
 out_unlock:
        cgroup_kn_unlock(of->kn);
        return ret ?: nbytes;
-
-err_undo_css:
-       cgrp->subtree_control = old_sc;
-       cgrp->child_subsys_mask = old_ss;
-
-       for_each_subsys(ss, ssid) {
-               if (!(enable & (1 << ssid)))
-                       continue;
-
-               cgroup_for_each_live_child(child, cgrp) {
-                       struct cgroup_subsys_state *css = cgroup_css(child, ss);
-
-                       if (!css)
-                               continue;
-
-                       if (css_enable & (1 << ssid))
-                               kill_css(css);
-                       else
-                               css_clear_dir(css, NULL);
-               }
-       }
-       goto out_unlock;
 }
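The conversions from for_each_subsys_which() above (and in the fork/exit paths further down) use the new do_each_subsys_mask()/while_each_subsys_mask() pair, which takes the mask by value rather than by pointer, so an expression like ~cgrp_dfl_inhibit_ss_mask can be passed directly. The definition isn't part of this hunk; roughly, it can look like this (a sketch, the in-tree macros may differ in detail):

	#define do_each_subsys_mask(ss, ssid, ss_mask) do {		\
		unsigned long __ss_mask = (ss_mask);			\
		for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \
			(ss) = cgroup_subsys[ssid];			\
			{

	#define while_each_subsys_mask()				\
			}						\
		}							\
	} while (false)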
 
 static int cgroup_events_show(struct seq_file *seq, void *v)
@@ -3359,7 +3511,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
                              bool is_add)
 {
        struct cftype *cft, *cft_end = NULL;
-       int ret;
+       int ret = 0;
 
        lockdep_assert_held(&cgroup_mutex);
 
@@ -3388,7 +3540,7 @@ restart:
                        cgroup_rm_file(cgrp, cft);
                }
        }
-       return 0;
+       return ret;
 }
 
 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
@@ -3405,7 +3557,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
        css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
                struct cgroup *cgrp = css->cgroup;
 
-               if (cgroup_is_dead(cgrp))
+               if (!(css->flags & CSS_VISIBLE))
                        continue;
 
                ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
@@ -4026,6 +4178,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
        struct task_struct *task;
        int ret;
 
+       if (!cgroup_may_migrate_to(to))
+               return -EBUSY;
+
        mutex_lock(&cgroup_mutex);
 
        /* all tasks in @from are being moved, all csets are source */
@@ -4034,7 +4189,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
                cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
        spin_unlock_bh(&css_set_lock);
 
-       ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
+       ret = cgroup_migrate_prepare_dst(&preloaded_csets);
        if (ret)
                goto out_err;
 
@@ -4050,7 +4205,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
                css_task_iter_end(&it);
 
                if (task) {
-                       ret = cgroup_migrate(task, false, to);
+                       ret = cgroup_migrate(task, false, to->root);
                        put_task_struct(task);
                }
        } while (task && !ret);
@@ -4557,12 +4712,6 @@ static struct cftype cgroup_dfl_base_files[] = {
        },
        {
                .name = "cgroup.controllers",
-               .flags = CFTYPE_ONLY_ON_ROOT,
-               .seq_show = cgroup_root_controllers_show,
-       },
-       {
-               .name = "cgroup.controllers",
-               .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = cgroup_controllers_show,
        },
        {
@@ -4731,7 +4880,9 @@ static void css_release_work_fn(struct work_struct *work)
                 * Those are supported by RCU protecting clearing of
                 * cgrp->kn->priv backpointer.
                 */
-               RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
+               if (cgrp->kn)
+                       RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
+                                        NULL);
        }
 
        mutex_unlock(&cgroup_mutex);
@@ -4802,6 +4953,9 @@ static void offline_css(struct cgroup_subsys_state *css)
        if (!(css->flags & CSS_ONLINE))
                return;
 
+       if (ss->css_reset)
+               ss->css_reset(css);
+
        if (ss->css_offline)
                ss->css_offline(css);
 
@@ -4812,17 +4966,16 @@ static void offline_css(struct cgroup_subsys_state *css)
 }
 
 /**
- * create_css - create a cgroup_subsys_state
+ * css_create - create a cgroup_subsys_state
  * @cgrp: the cgroup new css will be associated with
  * @ss: the subsys of new css
- * @visible: whether to create control knobs for the new css or not
  *
  * Create a new css associated with @cgrp - @ss pair.  On success, the new
- * css is online and installed in @cgrp with all interface files created if
- * @visible.  Returns 0 on success, -errno on failure.
+ * css is online and installed in @cgrp.  This function doesn't create the
+ * interface files.  Returns the new css on success, ERR_PTR(-errno) on
+ * failure.
  */
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
-                     bool visible)
+static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
+                                             struct cgroup_subsys *ss)
 {
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
@@ -4833,7 +4986,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 
        css = ss->css_alloc(parent_css);
        if (IS_ERR(css))
-               return PTR_ERR(css);
+               return css;
 
        init_and_link_css(css, ss, cgrp);
 
@@ -4846,12 +4999,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
                goto err_free_percpu_ref;
        css->id = err;
 
-       if (visible) {
-               err = css_populate_dir(css, NULL);
-               if (err)
-                       goto err_free_id;
-       }
-
        /* @css is ready to be brought online now, make it visible */
        list_add_tail_rcu(&css->sibling, &parent_css->children);
        cgroup_idr_replace(&ss->css_idr, css, css->id);
@@ -4869,47 +5016,30 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
                ss->warned_broken_hierarchy = true;
        }
 
-       return 0;
+       return css;
 
 err_list_del:
        list_del_rcu(&css->sibling);
-       css_clear_dir(css, NULL);
-err_free_id:
        cgroup_idr_remove(&ss->css_idr, css->id);
 err_free_percpu_ref:
        percpu_ref_exit(&css->refcnt);
 err_free_css:
        call_rcu(&css->rcu_head, css_free_rcu_fn);
-       return err;
+       return ERR_PTR(err);
 }
 
-static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
-                       umode_t mode)
+static struct cgroup *cgroup_create(struct cgroup *parent)
 {
-       struct cgroup *parent, *cgrp, *tcgrp;
-       struct cgroup_root *root;
-       struct cgroup_subsys *ss;
-       struct kernfs_node *kn;
-       int level, ssid, ret;
-
-       /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
-        */
-       if (strchr(name, '\n'))
-               return -EINVAL;
-
-       parent = cgroup_kn_lock_live(parent_kn);
-       if (!parent)
-               return -ENODEV;
-       root = parent->root;
-       level = parent->level + 1;
+       struct cgroup_root *root = parent->root;
+       struct cgroup *cgrp, *tcgrp;
+       int level = parent->level + 1;
+       int ret;
 
        /* allocate the cgroup and its ID, 0 is reserved for the root */
        cgrp = kzalloc(sizeof(*cgrp) +
                       sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
-       if (!cgrp) {
-               ret = -ENOMEM;
-               goto out_unlock;
-       }
+       if (!cgrp)
+               return ERR_PTR(-ENOMEM);
 
        ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
        if (ret)
@@ -4940,20 +5070,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
        if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
                set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 
-       /* create the directory */
-       kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
-       if (IS_ERR(kn)) {
-               ret = PTR_ERR(kn);
-               goto out_free_id;
-       }
-       cgrp->kn = kn;
-
-       /*
-        * This extra ref will be put in cgroup_free_fn() and guarantees
-        * that @cgrp->kn is always accessible.
-        */
-       kernfs_get(kn);
-
        cgrp->self.serial_nr = css_serial_nr_next++;
 
        /* allocation complete, commit to creation */
@@ -4967,51 +5083,90 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
         */
        cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
 
-       ret = cgroup_kn_set_ugid(kn);
-       if (ret)
-               goto out_destroy;
+       /*
+        * On the default hierarchy, a child doesn't automatically inherit
+        * subtree_control from the parent.  Each is configured manually.
+        */
+       if (!cgroup_on_dfl(cgrp))
+               cgrp->subtree_control = cgroup_control(cgrp);
 
-       ret = css_populate_dir(&cgrp->self, NULL);
+       cgroup_propagate_control(cgrp);
+
+       /* @cgrp doesn't have a dir yet so the following will only create csses */
+       ret = cgroup_apply_control_enable(cgrp);
        if (ret)
                goto out_destroy;
 
-       /* let's create and online css's */
-       for_each_subsys(ss, ssid) {
-               if (parent->child_subsys_mask & (1 << ssid)) {
-                       ret = create_css(cgrp, ss,
-                                        parent->subtree_control & (1 << ssid));
-                       if (ret)
-                               goto out_destroy;
-               }
+       return cgrp;
+
+out_cancel_ref:
+       percpu_ref_exit(&cgrp->self.refcnt);
+out_free_cgrp:
+       kfree(cgrp);
+       return ERR_PTR(ret);
+out_destroy:
+       cgroup_destroy_locked(cgrp);
+       return ERR_PTR(ret);
+}
+
+static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+                       umode_t mode)
+{
+       struct cgroup *parent, *cgrp;
+       struct kernfs_node *kn;
+       int ret;
+
+       /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
+       if (strchr(name, '\n'))
+               return -EINVAL;
+
+       parent = cgroup_kn_lock_live(parent_kn, false);
+       if (!parent)
+               return -ENODEV;
+
+       cgrp = cgroup_create(parent);
+       if (IS_ERR(cgrp)) {
+               ret = PTR_ERR(cgrp);
+               goto out_unlock;
+       }
+
+       /* create the directory */
+       kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+       if (IS_ERR(kn)) {
+               ret = PTR_ERR(kn);
+               goto out_destroy;
        }
+       cgrp->kn = kn;
 
        /*
-        * On the default hierarchy, a child doesn't automatically inherit
-        * subtree_control from the parent.  Each is configured manually.
+        * This extra ref will be put in cgroup_free_fn() and guarantees
+        * that @cgrp->kn is always accessible.
         */
-       if (!cgroup_on_dfl(cgrp)) {
-               cgrp->subtree_control = parent->subtree_control;
-               cgroup_refresh_child_subsys_mask(cgrp);
-       }
+       kernfs_get(kn);
+
+       ret = cgroup_kn_set_ugid(kn);
+       if (ret)
+               goto out_destroy;
+
+       ret = css_populate_dir(&cgrp->self);
+       if (ret)
+               goto out_destroy;
 
+       ret = cgroup_apply_control_enable(cgrp);
+       if (ret)
+               goto out_destroy;
+
+       /* make the new cgroup and its interface files visible to userland */
        kernfs_activate(kn);
 
        ret = 0;
        goto out_unlock;
 
-out_free_id:
-       cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
-out_cancel_ref:
-       percpu_ref_exit(&cgrp->self.refcnt);
-out_free_cgrp:
-       kfree(cgrp);
+out_destroy:
+       cgroup_destroy_locked(cgrp);
 out_unlock:
        cgroup_kn_unlock(parent_kn);
        return ret;
-
-out_destroy:
-       cgroup_destroy_locked(cgrp);
-       goto out_unlock;
 }
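The mkdir path is now split in two: cgroup_create() allocates the cgroup and brings up its csses, while cgroup_mkdir() owns the kernfs directory. That is why cgroup_apply_control_enable() runs twice above: the first call, before the directory exists, can only create csses; the second, after css_populate_dir(&cgrp->self), also creates the controllers' interface files.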
 
 /*
@@ -5065,7 +5220,7 @@ static void kill_css(struct cgroup_subsys_state *css)
         * This must happen before css is disassociated with its cgroup.
         * See seq_css() for details.
         */
-       css_clear_dir(css, NULL);
+       css_clear_dir(css);
 
        /*
         * Killing would put the base ref, but we need to keep it alive
@@ -5114,6 +5269,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
        __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
        struct cgroup_subsys_state *css;
+       struct cgrp_cset_link *link;
        int ssid;
 
        lockdep_assert_held(&cgroup_mutex);
@@ -5134,11 +5290,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
                return -EBUSY;
 
        /*
-        * Mark @cgrp dead.  This prevents further task migration and child
-        * creation by disabling cgroup_lock_live_group().
+        * Mark @cgrp and the associated csets dead.  The former prevents
+        * further task migration and child creation by disabling
+        * cgroup_lock_live_group().  The latter makes the migration path
+        * ignore the csets.
         */
        cgrp->self.flags &= ~CSS_ONLINE;
 
+       spin_lock_bh(&css_set_lock);
+       list_for_each_entry(link, &cgrp->cset_links, cset_link)
+               link->cset->dead = true;
+       spin_unlock_bh(&css_set_lock);
+
        /* initiate massacre of all css's */
        for_each_css(css, ssid, cgrp)
                kill_css(css);
@@ -5162,7 +5325,7 @@ static int cgroup_rmdir(struct kernfs_node *kn)
        struct cgroup *cgrp;
        int ret = 0;
 
-       cgrp = cgroup_kn_lock_live(kn);
+       cgrp = cgroup_kn_lock_live(kn, false);
        if (!cgrp)
                return 0;
 
@@ -5252,7 +5415,7 @@ int __init cgroup_init_early(void)
 
        for_each_subsys(ss, i) {
                WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
-                    "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
+                    "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
                     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
                     ss->id, ss->name);
                WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
@@ -5269,7 +5432,7 @@ int __init cgroup_init_early(void)
        return 0;
 }
 
-static unsigned long cgroup_disable_mask __initdata;
+static u16 cgroup_disable_mask __initdata;
 
 /**
  * cgroup_init - cgroup initialization
@@ -5280,18 +5443,21 @@ static unsigned long cgroup_disable_mask __initdata;
 int __init cgroup_init(void)
 {
        struct cgroup_subsys *ss;
-       unsigned long key;
        int ssid;
 
+       BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
        BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
        mutex_lock(&cgroup_mutex);
 
-       /* Add init_css_set to the hash table */
-       key = css_set_hash(init_css_set.subsys);
-       hash_add(css_set_table, &init_css_set.hlist, key);
+       /*
+        * Add init_css_set to the hash table so that dfl_root can link to
+        * it during init.
+        */
+       hash_add(css_set_table, &init_css_set.hlist,
+                css_set_hash(init_css_set.subsys));
 
        BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
 
@@ -5324,10 +5490,16 @@ int __init cgroup_init(void)
                        continue;
                }
 
+               if (cgroup_ssid_no_v1(ssid))
+                       printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
+                              ss->name);
+
                cgrp_dfl_root.subsys_mask |= 1 << ss->id;
 
-               if (!ss->dfl_cftypes)
-                       cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
+               if (ss->implicit_on_dfl)
+                       cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
+               else if (!ss->dfl_cftypes)
+                       cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
 
                if (ss->dfl_cftypes == ss->legacy_cftypes) {
                        WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
@@ -5340,6 +5512,11 @@ int __init cgroup_init(void)
                        ss->bind(init_css_set.subsys[ssid]);
        }
 
+       /* init_css_set.subsys[] has been updated, re-hash */
+       hash_del(&init_css_set.hlist);
+       hash_add(css_set_table, &init_css_set.hlist,
+                css_set_hash(init_css_set.subsys));
+
        WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
        WARN_ON(register_filesystem(&cgroup_fs_type));
        WARN_ON(register_filesystem(&cgroup2_fs_type));
@@ -5398,7 +5575,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                struct cgroup *cgrp;
                int ssid, count = 0;
 
-               if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
+               if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
                        continue;
 
                seq_printf(m, "%d:", root->hierarchy_id);
@@ -5513,11 +5690,11 @@ int cgroup_can_fork(struct task_struct *child)
        struct cgroup_subsys *ss;
        int i, j, ret;
 
-       for_each_subsys_which(ss, i, &have_canfork_callback) {
+       do_each_subsys_mask(ss, i, have_canfork_callback) {
                ret = ss->can_fork(child);
                if (ret)
                        goto out_revert;
-       }
+       } while_each_subsys_mask();
 
        return 0;
 
@@ -5602,8 +5779,9 @@ void cgroup_post_fork(struct task_struct *child)
         * css_set; otherwise, @child might change state between ->fork()
         * and addition to css_set.
         */
-       for_each_subsys_which(ss, i, &have_fork_callback)
+       do_each_subsys_mask(ss, i, have_fork_callback) {
                ss->fork(child);
+       } while_each_subsys_mask();
 }
 
 /**
@@ -5646,8 +5824,9 @@ void cgroup_exit(struct task_struct *tsk)
        }
 
        /* see cgroup_post_fork() for details */
-       for_each_subsys_which(ss, i, &have_exit_callback)
+       do_each_subsys_mask(ss, i, have_exit_callback) {
                ss->exit(tsk);
+       } while_each_subsys_mask();
 }
 
 void cgroup_free(struct task_struct *task)
@@ -5656,8 +5835,9 @@ void cgroup_free(struct task_struct *task)
        struct cgroup_subsys *ss;
        int ssid;
 
-       for_each_subsys_which(ss, ssid, &have_free_callback)
+       do_each_subsys_mask(ss, ssid, have_free_callback) {
                ss->free(task);
+       } while_each_subsys_mask();
 
        put_css_set(cset);
 }
@@ -5750,6 +5930,33 @@ static int __init cgroup_disable(char *str)
 }
 __setup("cgroup_disable=", cgroup_disable);
 
+static int __init cgroup_no_v1(char *str)
+{
+       struct cgroup_subsys *ss;
+       char *token;
+       int i;
+
+       while ((token = strsep(&str, ",")) != NULL) {
+               if (!*token)
+                       continue;
+
+               if (!strcmp(token, "all")) {
+                       cgroup_no_v1_mask = U16_MAX;
+                       break;
+               }
+
+               for_each_subsys(ss, i) {
+                       if (strcmp(token, ss->name) &&
+                           strcmp(token, ss->legacy_name))
+                               continue;
+
+                       cgroup_no_v1_mask |= 1 << i;
+               }
+       }
+       return 1;
+}
+__setup("cgroup_no_v1=", cgroup_no_v1);
+
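The parameter takes a comma-separated list of controller names (current or legacy) or the keyword "all". For example, booting with

	cgroup_no_v1=cpu,cpuacct

keeps those controllers off v1 hierarchies so they remain free for the v2 hierarchy, while cgroup_no_v1=all masks every controller out of v1 mounts.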
 /**
  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
  * @dentry: directory dentry of interest
@@ -5763,12 +5970,13 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
                                                       struct cgroup_subsys *ss)
 {
        struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+       struct file_system_type *s_type = dentry->d_sb->s_type;
        struct cgroup_subsys_state *css = NULL;
        struct cgroup *cgrp;
 
        /* is @dentry a cgroup dir? */
-       if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
-           kernfs_type(kn) != KERNFS_DIR)
+       if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
+           !kn || kernfs_type(kn) != KERNFS_DIR)
                return ERR_PTR(-EBADF);
 
        rcu_read_lock();
index 41989ab..9089983 100644
@@ -2089,7 +2089,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
        .attach         = cpuset_attach,
        .bind           = cpuset_bind,
        .legacy_cftypes = files,
-       .early_init     = 1,
+       .early_init     = true,
 };
 
 /**
index 4edecc1..4ee3ce7 100644
@@ -8441,7 +8441,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
        .legacy_cftypes = cpu_files,
-       .early_init     = 1,
+       .early_init     = true,
 };
 
 #endif /* CONFIG_CGROUP_SCHED */
index dd7cbb5..2ddaebf 100644
@@ -279,5 +279,5 @@ struct cgroup_subsys cpuacct_cgrp_subsys = {
        .css_alloc      = cpuacct_css_alloc,
        .css_free       = cpuacct_css_free,
        .legacy_cftypes = files,
-       .early_init     = 1,
+       .early_init     = true,
 };