Merge branch 'for-4.6-ns' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

author Linus Torvalds <torvalds@linux-foundation.org>

Mon, 21 Mar 2016 17:05:13 +0000 (10:05 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Mon, 21 Mar 2016 17:05:13 +0000 (10:05 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 21 Mar 2016 17:05:13 +0000 (10:05 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 21 Mar 2016 17:05:13 +0000 (10:05 -0700)
diff --combined Documentation/cgroup-v2.txt

index bdc6773,ee7917f..4cc07ce
--- 1/Documentation/cgroup-v2.txt
--- 2/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@@ -47,6 -47,11 +47,11 @@@ CONTENT
     5-3. IO
       5-3-1. IO Interface Files
       5-3-2. Writeback
+ 6. Namespace
+   6-1. Basics
+   6-2. The Root and Views
+   6-3. Migration and setns(2)
+   6-4. Interaction with Other Namespaces
   P. Information on Kernel Programming
     P-1. Filesystem Support for Writeback
   D. Deprecated v1 Core Features
@@@ -132,12 -137,6 +137,12 @@@ strongly discouraged for production use
   the hierarchies and controller associations before starting using the
   controllers after system boot.
   
+ +During transition to v2, system management software might still
+ +automount the v1 cgroup filesystem and so hijack all controllers
+ +during boot, before manual intervention is possible. To make testing
+ +and experimenting easier, the kernel parameter cgroup_no_v1= allows
+ +disabling controllers in v1 and makes them always available in v2.
+ +
   
   2-2. Organizing Processes
   
@@@ -849,19 -848,6 +854,19 @@@ PAGE_SIZE multiple when read back
                 Amount of memory used to cache filesystem data,
                 including tmpfs and shared memory.
   
+ +        kernel_stack
+ +
+ +              Amount of memory allocated to kernel stacks.
+ +
+ +        slab
+ +
+ +              Amount of memory used for storing in-kernel data
+ +              structures.
+ +
+ +        sock
+ +
+ +              Amount of memory used in network transmission buffers.
+ +
           file_mapped
   
                 Amount of cached filesystem data mapped with mmap()
@@@ -886,16 -872,6 +891,16 @@@
                 on the internal memory management lists used by the
                 page reclaim algorithm
   
+ +        slab_reclaimable
+ +
+ +              Part of "slab" that might be reclaimed, such as
+ +              dentries and inodes.
+ +
+ +        slab_unreclaimable
+ +
+ +              Part of "slab" that cannot be reclaimed on memory
+ +              pressure.
+ +
           pgfault
   
                 Total number of page faults incurred
@@@ -921,7 -897,7 +926,7 @@@
        limit, anonymous memory of the cgroup will not be swapped out.
   
   
- -5-2-2. General Usage
+ +5-2-2. Usage Guidelines
   
   "memory.high" is the main mechanism to control memory usage.
   Over-committing on high limit (sum of high limits > available memory)
@@@ -1114,6 -1090,148 +1119,148 @@@ writeback as follows
         vm.dirty[_background]_ratio.
   
   
+ 6. Namespace
+ 
+ 6-1. Basics
+ 
+ cgroup namespace provides a mechanism to virtualize the view of the
+ "/proc/$PID/cgroup" file and cgroup mounts.  The CLONE_NEWCGROUP clone
+ flag can be used with clone(2) and unshare(2) to create a new cgroup
+ namespace.  The process running inside the cgroup namespace will have
+ its "/proc/$PID/cgroup" output restricted to cgroupns root.  The
+ cgroupns root is the cgroup of the process at the time of creation of
+ the cgroup namespace.
+ 
+ Without cgroup namespace, the "/proc/$PID/cgroup" file shows the
+ complete path of the cgroup of a process.  In a container setup where
+ a set of cgroups and namespaces are intended to isolate processes the
+ "/proc/$PID/cgroup" file may leak potential system level information
+ to the isolated processes.  For example:
+ 
+   # cat /proc/self/cgroup
+   0::/batchjobs/container_id1
+ 
+ The path '/batchjobs/container_id1' can be considered as system-data
+ and undesirable to expose to the isolated processes.  cgroup namespace
+ can be used to restrict visibility of this path.  For example, before
+ creating a cgroup namespace, one would see:
+ 
+   # ls -l /proc/self/ns/cgroup
+   lrwxrwxrwx 1 root root 0 2014-07-15 10:37 /proc/self/ns/cgroup -> cgroup:[4026531835]
+   # cat /proc/self/cgroup
+   0::/batchjobs/container_id1
+ 
+ After unsharing a new namespace, the view changes.
+ 
+   # ls -l /proc/self/ns/cgroup
+   lrwxrwxrwx 1 root root 0 2014-07-15 10:35 /proc/self/ns/cgroup -> cgroup:[4026532183]
+   # cat /proc/self/cgroup
+   0::/
+ 
+ When some thread from a multi-threaded process unshares its cgroup
+ namespace, the new cgroupns gets applied to the entire process (all
+ the threads).  This is natural for the v2 hierarchy; however, for the
+ legacy hierarchies, this may be unexpected.
+ 
+ A cgroup namespace is alive as long as there are processes inside or
+ mounts pinning it.  When the last usage goes away, the cgroup
+ namespace is destroyed.  The cgroupns root and the actual cgroups
+ remain.
+ 
+ 
+ 6-2. The Root and Views
+ 
+ The 'cgroupns root' for a cgroup namespace is the cgroup in which the
+ process calling unshare(2) is running.  For example, if a process in
+ /batchjobs/container_id1 cgroup calls unshare, cgroup
+ /batchjobs/container_id1 becomes the cgroupns root.  For the
+ init_cgroup_ns, this is the real root ('/') cgroup.
+ 
+ The cgroupns root cgroup does not change even if the namespace creator
+ process later moves to a different cgroup.
+ 
+   # ~/unshare -c # unshare cgroupns in some cgroup
+   # cat /proc/self/cgroup
+   0::/
+   # mkdir sub_cgrp_1
+   # echo 0 > sub_cgrp_1/cgroup.procs
+   # cat /proc/self/cgroup
+   0::/sub_cgrp_1
+ 
+ Each process gets its namespace-specific view of "/proc/$PID/cgroup".
+ 
+ Processes running inside the cgroup namespace will be able to see
+ cgroup paths (in /proc/self/cgroup) only inside their root cgroup.
+ From within an unshared cgroupns:
+ 
+   # sleep 100000 &
+   [1] 7353
+   # echo 7353 > sub_cgrp_1/cgroup.procs
+   # cat /proc/7353/cgroup
+   0::/sub_cgrp_1
+ 
+ From the initial cgroup namespace, the real cgroup path will be
+ visible:
+ 
+   $ cat /proc/7353/cgroup
+   0::/batchjobs/container_id1/sub_cgrp_1
+ 
+ From a sibling cgroup namespace (that is, a namespace rooted at a
+ different cgroup), the cgroup path relative to its own cgroup
+ namespace root will be shown.  For instance, if PID 7353's cgroup
+ namespace root is at '/batchjobs/container_id2', then it will see
+ 
+   # cat /proc/7353/cgroup
+   0::/../container_id2/sub_cgrp_1
+ 
+ Note that the relative path always starts with '/' to indicate that
+ it is relative to the cgroup namespace root of the caller.
+ 
+ 
+ 6-3. Migration and setns(2)
+ 
+ Processes inside a cgroup namespace can move into and out of the
+ namespace root if they have proper access to external cgroups.  For
+ example, from inside a namespace with cgroupns root at
+ /batchjobs/container_id1, and assuming that the global hierarchy is
+ still accessible inside cgroupns:
+ 
+   # cat /proc/7353/cgroup
+   0::/sub_cgrp_1
+   # echo 7353 > batchjobs/container_id2/cgroup.procs
+   # cat /proc/7353/cgroup
+   0::/../container_id2
+ 
+ Note that this kind of setup is not encouraged.  A task inside cgroup
+ namespace should only be exposed to its own cgroupns hierarchy.
+ 
+ setns(2) to another cgroup namespace is allowed when:
+ 
+ (a) the process has CAP_SYS_ADMIN against its current user namespace
+ (b) the process has CAP_SYS_ADMIN against the target cgroup
+     namespace's userns
+ 
+ No implicit cgroup changes happen with attaching to another cgroup
+ namespace.  It is expected that someone moves the attaching
+ process under the target cgroup namespace root.
+ 
+ 
+ 6-4. Interaction with Other Namespaces
+ 
+ Namespace specific cgroup hierarchy can be mounted by a process
+ running inside a non-init cgroup namespace.
+ 
+   # mount -t cgroup2 none $MOUNT_POINT
+ 
+ This will mount the unified cgroup hierarchy with cgroupns root as the
+ filesystem root.  The process needs CAP_SYS_ADMIN against its user and
+ mount namespaces.
+ 
+ The virtualization of /proc/self/cgroup file combined with restricting
+ the view of cgroup hierarchy by namespace-private cgroupfs mount
+ provides a properly isolated cgroup view inside the container.
+ 
+ 
   P. Information on Kernel Programming
   
   This section contains kernel programming information in the areas
@@@ -1393,12 -1511,6 +1540,12 @@@ system than killing the group.  Otherwi
   limit this type of spillover and ultimately contain buggy or even
   malicious applications.
   
+ +Setting the original memory.limit_in_bytes below the current usage was
+ +subject to a race condition, where concurrent charges could cause the
+ +limit setting to fail. memory.max on the other hand will first set the
+ +limit to prevent new charges, and then reclaim and OOM kill until the
+ +new limit is met - or the task writing to memory.max is killed.
+ +
   The combined memory+swap accounting and limiting is replaced by real
   control over swap space.
   
diff --combined fs/kernfs/dir.c

index 118d033,25d71a5..03b688d
--- 1/fs/kernfs/dir.c
--- 2/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@@ -44,28 -44,122 +44,122 @@@ static int kernfs_name_locked(struct ke
         return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
   }
   
- static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
-                                             size_t buflen)
+ /* kernfs_depth - compute depth from @from to @to */
+ static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to)
   {
-       char *p = buf + buflen;
-       int len;
+       size_t depth = 0;
   
-       *--p = '\0';
+       while (to->parent && to != from) {
+               depth++;
+               to = to->parent;
+       }
+       return depth;
+ }
   
-       do {
-               len = strlen(kn->name);
-               if (p - buf < len + 1) {
-                       buf[0] = '\0';
-                       p = NULL;
-                       break;
-               }
-               p -= len;
-               memcpy(p, kn->name, len);
-               *--p = '/';
-               kn = kn->parent;
-       } while (kn && kn->parent);
+ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
+                                                 struct kernfs_node *b)
+ {
+       size_t da, db;
+       struct kernfs_root *ra = kernfs_root(a), *rb = kernfs_root(b);
+ 
+       if (ra != rb)
+               return NULL;
+ 
+       da = kernfs_depth(ra->kn, a);
+       db = kernfs_depth(rb->kn, b);
+ 
+       while (da > db) {
+               a = a->parent;
+               da--;
+       }
+       while (db > da) {
+               b = b->parent;
+               db--;
+       }
+ 
+       /* worst case b and a will be the same at root */
+       while (b != a) {
+               b = b->parent;
+               a = a->parent;
+       }
+ 
+       return a;
+ }
+ 
+ /**
+  * kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to,
+  * where kn_from is treated as root of the path.
+  * @kn_from: kernfs node which should be treated as root for the path
+  * @kn_to: kernfs node to which path is needed
+  * @buf: buffer to copy the path into
+  * @buflen: size of @buf
+  *
+  * We need to handle a couple of scenarios here:
+  * [1] when @kn_from is an ancestor of @kn_to at some level
+  * kn_from: /n1/n2/n3
+  * kn_to:   /n1/n2/n3/n4/n5
+  * result:  /n4/n5
+  *
+  * [2] when @kn_from is on a different hierarchy and we need to find common
+  * ancestor between @kn_from and @kn_to.
+  * kn_from: /n1/n2/n3/n4
+  * kn_to:   /n1/n2/n5
+  * result:  /../../n5
+  * OR
+  * kn_from: /n1/n2/n3/n4/n5   [depth=5]
+  * kn_to:   /n1/n2/n3         [depth=3]
+  * result:  /../..
+  *
+  * return value: length of the string.  If greater than or equal to
+  * buflen, then contents of buf are undefined.  On error, -1 is returned.
+  */
+ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
+                                       struct kernfs_node *kn_from,
+                                       char *buf, size_t buflen)
+ {
+       struct kernfs_node *kn, *common;
+       const char parent_str[] = "/..";
+       size_t depth_from, depth_to, len = 0, nlen = 0;
+       char *p;
+       int i;
+ 
+       if (!kn_from)
+               kn_from = kernfs_root(kn_to)->kn;
+ 
+       if (kn_from == kn_to)
+               return strlcpy(buf, "/", buflen);
+ 
+       common = kernfs_common_ancestor(kn_from, kn_to);
+       if (WARN_ON(!common))
+               return -1;
+ 
+       depth_to = kernfs_depth(common, kn_to);
+       depth_from = kernfs_depth(common, kn_from);
+ 
+       if (buf)
+               buf[0] = '\0';
+ 
+       for (i = 0; i < depth_from; i++)
+               len += strlcpy(buf + len, parent_str,
+                              len < buflen ? buflen - len : 0);
+ 
+       /* Calculate how many bytes we need for the rest */
+       for (kn = kn_to; kn != common; kn = kn->parent)
+               nlen += strlen(kn->name) + 1;
+ 
+       if (len + nlen >= buflen)
+               return len + nlen;
+ 
+       p = buf + len + nlen;
+       *p = '\0';
+       for (kn = kn_to; kn != common; kn = kn->parent) {
+               nlen = strlen(kn->name);
+               p -= nlen;
+               memcpy(p, kn->name, nlen);
+               *(--p) = '/';
+       }
   
-       return p;
+       return len + nlen;
   }
   
   /**
@@@ -114,6 -208,34 +208,34 @@@ size_t kernfs_path_len(struct kernfs_no
         return len;
   }
   
+ /**
+  * kernfs_path_from_node - build path of node @to relative to @from.
+  * @from: parent kernfs_node relative to which we need to build the path
+  * @to: kernfs_node of interest
+  * @buf: buffer to copy @to's path into
+  * @buflen: size of @buf
+  *
+  * Builds @to's path relative to @from in @buf. @from and @to must
+  * be on the same kernfs-root. If @from is not parent of @to, then a relative
+  * path (which includes '..'s) as needed to reach from @from to @to is
+  * returned.
+  *
+  * If @buf isn't long enough, the return value will be greater than or
+  * equal to @buflen and @buf contents are undefined.
+  */
+ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
+                         char *buf, size_t buflen)
+ {
+       unsigned long flags;
+       int ret;
+ 
+       spin_lock_irqsave(&kernfs_rename_lock, flags);
+       ret = kernfs_path_from_node_locked(to, from, buf, buflen);
+       spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+       return ret;
+ }
+ EXPORT_SYMBOL_GPL(kernfs_path_from_node);
+ 
   /**
    * kernfs_path - build full path of a given node
    * @kn: kernfs_node of interest
@@@ -127,13 -249,12 +249,12 @@@
    */
   char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
   {
-       unsigned long flags;
-       char *p;
+       int ret;
   
-       spin_lock_irqsave(&kernfs_rename_lock, flags);
-       p = kernfs_path_locked(kn, buf, buflen);
-       spin_unlock_irqrestore(&kernfs_rename_lock, flags);
-       return p;
+       ret = kernfs_path_from_node(kn, NULL, buf, buflen);
+       if (ret < 0 || ret >= buflen)
+               return NULL;
+       return buf;
   }
   EXPORT_SYMBOL_GPL(kernfs_path);
   
@@@ -164,17 -285,25 +285,25 @@@ void pr_cont_kernfs_name(struct kernfs_
   void pr_cont_kernfs_path(struct kernfs_node *kn)
   {
         unsigned long flags;
-       char *p;
+       int sz;
   
         spin_lock_irqsave(&kernfs_rename_lock, flags);
   
-       p = kernfs_path_locked(kn, kernfs_pr_cont_buf,
-                              sizeof(kernfs_pr_cont_buf));
-       if (p)
-               pr_cont("%s", p);
-       else
-               pr_cont("<name too long>");
+       sz = kernfs_path_from_node_locked(kn, NULL, kernfs_pr_cont_buf,
+                                         sizeof(kernfs_pr_cont_buf));
+       if (sz < 0) {
+               pr_cont("(error)");
+               goto out;
+       }
+ 
+       if (sz >= sizeof(kernfs_pr_cont_buf)) {
+               pr_cont("(name too long)");
+               goto out;
+       }
+ 
+       pr_cont("%s", kernfs_pr_cont_buf);
   
+ out:
         spin_unlock_irqrestore(&kernfs_rename_lock, flags);
   }
   
@@@ -691,22 -820,15 +820,22 @@@ static struct kernfs_node *kernfs_walk_
                                           const unsigned char *path,
                                           const void *ns)
   {
- -      static char path_buf[PATH_MAX]; /* protected by kernfs_mutex */
- -      size_t len = strlcpy(path_buf, path, PATH_MAX);
- -      char *p = path_buf;
- -      char *name;
+ +      size_t len;
+ +      char *p, *name;
   
         lockdep_assert_held(&kernfs_mutex);
   
- -      if (len >= PATH_MAX)
+ +      /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
+ +      spin_lock_irq(&kernfs_rename_lock);
+ +
+ +      len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
+ +
+ +      if (len >= sizeof(kernfs_pr_cont_buf)) {
+ +              spin_unlock_irq(&kernfs_rename_lock);
                 return NULL;
+ +      }
+ +
+ +      p = kernfs_pr_cont_buf;
   
         while ((name = strsep(&p, "/")) && parent) {
                 if (*name == '\0')
@@@ -714,8 -836,6 +843,8 @@@
                 parent = kernfs_find_ns(parent, name, ns);
         }
   
+ +      spin_unlock_irq(&kernfs_rename_lock);
+ +
         return parent;
   }
   
@@@ -1520,9 -1640,9 +1649,9 @@@ static loff_t kernfs_dir_fop_llseek(str
         struct inode *inode = file_inode(file);
         loff_t ret;
   
- -      mutex_lock(&inode->i_mutex);
+ +      inode_lock(inode);
         ret = generic_file_llseek(file, offset, whence);
- -      mutex_unlock(&inode->i_mutex);
+ +      inode_unlock(inode);
   
         return ret;
   }
diff --combined kernel/cgroup.c

index 3fe02c1,2c88149..671dc05
--- 1/kernel/cgroup.c
--- 2/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@@ -59,6 -59,9 +59,9 @@@
   #include <linux/delay.h>
   #include <linux/atomic.h>
   #include <linux/cpuset.h>
+ #include <linux/proc_ns.h>
+ #include <linux/nsproxy.h>
+ #include <linux/proc_ns.h>
   #include <net/sock.h>
   
   /*
@@@ -178,16 -181,13 +181,16 @@@ EXPORT_SYMBOL_GPL(cgrp_dfl_root)
    * The default hierarchy always exists but is hidden until mounted for the
    * first time.  This is for backward compatibility.
    */
- -static bool cgrp_dfl_root_visible;
+ +static bool cgrp_dfl_visible;
   
   /* Controllers blocked by the commandline in v1 */
- -static unsigned long cgroup_no_v1_mask;
+ +static u16 cgroup_no_v1_mask;
   
   /* some controllers are not supported in the default hierarchy */
- -static unsigned long cgrp_dfl_root_inhibit_ss_mask;
+ +static u16 cgrp_dfl_inhibit_ss_mask;
+ +
+ +/* some controllers are implicitly enabled on the default hierarchy */
+ +static unsigned long cgrp_dfl_implicit_ss_mask;
   
   /* The list of hierarchy roots */
   
@@@ -211,25 -211,32 +214,34 @@@ static u64 css_serial_nr_next = 1
    * fork/exit handlers to call. This avoids us having to do extra work in the
    * fork/exit path to check which subsystems have fork/exit callbacks.
    */
- -static unsigned long have_fork_callback __read_mostly;
- -static unsigned long have_exit_callback __read_mostly;
- -static unsigned long have_free_callback __read_mostly;
+ +static u16 have_fork_callback __read_mostly;
+ +static u16 have_exit_callback __read_mostly;
+ +static u16 have_free_callback __read_mostly;
   
+ /* cgroup namespace for init task */
+ struct cgroup_namespace init_cgroup_ns = {
+       .count          = { .counter = 2, },
+       .user_ns        = &init_user_ns,
+       .ns.ops         = &cgroupns_operations,
+       .ns.inum        = PROC_CGROUP_INIT_INO,
+       .root_cset      = &init_css_set,
+ };
+ 
   /* Ditto for the can_fork callback. */
- -static unsigned long have_canfork_callback __read_mostly;
+ +static u16 have_canfork_callback __read_mostly;
   
   static struct file_system_type cgroup2_fs_type;
   static struct cftype cgroup_dfl_base_files[];
   static struct cftype cgroup_legacy_base_files[];
   
- -static int rebind_subsystems(struct cgroup_root *dst_root,
- -                           unsigned long ss_mask);
+ +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+ +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
+ +static int cgroup_apply_control(struct cgroup *cgrp);
+ +static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
   static void css_task_iter_advance(struct css_task_iter *it);
   static int cgroup_destroy_locked(struct cgroup *cgrp);
- -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
- -                    bool visible);
+ +static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
+ +                                            struct cgroup_subsys *ss);
   static void css_release(struct percpu_ref *ref);
   static void kill_css(struct cgroup_subsys_state *css);
   static int cgroup_addrm_files(struct cgroup_subsys_state *css,
@@@ -246,9 -253,6 +258,9 @@@
    */
   static bool cgroup_ssid_enabled(int ssid)
   {
+ +      if (CGROUP_SUBSYS_COUNT == 0)
+ +              return false;
+ +
         return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
   }
   
@@@ -355,32 -359,6 +367,32 @@@ static struct cgroup *cgroup_parent(str
         return NULL;
   }
   
+ +/* subsystems visibly enabled on a cgroup */
+ +static u16 cgroup_control(struct cgroup *cgrp)
+ +{
+ +      struct cgroup *parent = cgroup_parent(cgrp);
+ +      u16 root_ss_mask = cgrp->root->subsys_mask;
+ +
+ +      if (parent)
+ +              return parent->subtree_control;
+ +
+ +      if (cgroup_on_dfl(cgrp))
+ +              root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
+ +                                cgrp_dfl_implicit_ss_mask);
+ +      return root_ss_mask;
+ +}
+ +
+ +/* subsystems enabled on a cgroup */
+ +static u16 cgroup_ss_mask(struct cgroup *cgrp)
+ +{
+ +      struct cgroup *parent = cgroup_parent(cgrp);
+ +
+ +      if (parent)
+ +              return parent->subtree_ss_mask;
+ +
+ +      return cgrp->root->subsys_mask;
+ +}
+ +
   /**
    * cgroup_css - obtain a cgroup's css for the specified subsystem
    * @cgrp: the cgroup of interest
@@@ -420,15 -398,16 +432,15 @@@ static struct cgroup_subsys_state *cgro
         if (!ss)
                 return &cgrp->self;
   
- -      if (!(cgrp->root->subsys_mask & (1 << ss->id)))
- -              return NULL;
- -
         /*
          * This function is used while updating css associations and thus
- -       * can't test the csses directly.  Use ->child_subsys_mask.
+ +       * can't test the csses directly.  Test ss_mask.
          */
- -      while (cgroup_parent(cgrp) &&
- -             !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
+ +      while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
                 cgrp = cgroup_parent(cgrp);
+ +              if (!cgrp)
+ +                      return NULL;
+ +      }
   
         return cgroup_css(cgrp, ss);
   }
@@@ -547,28 -526,22 +559,28 @@@ static int notify_on_release(const stru
              (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
   
   /**
- - * for_each_subsys_which - filter for_each_subsys with a bitmask
+ + * do_each_subsys_mask - filter for_each_subsys with a bitmask
    * @ss: the iteration cursor
    * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- - * @ss_maskp: a pointer to the bitmask
+ + * @ss_mask: the bitmask
    *
    * The block will only run for cases where the ssid-th bit (1 << ssid) of
- - * mask is set to 1.
+ + * @ss_mask is set.
    */
- -#define for_each_subsys_which(ss, ssid, ss_maskp)                     \
- -      if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */   \
+ +#define do_each_subsys_mask(ss, ssid, ss_mask) do {                   \
+ +      unsigned long __ss_mask = (ss_mask);                            \
+ +      if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \
                 (ssid) = 0;                                             \
- -      else                                                            \
- -              for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT)   \
- -                      if (((ss) = cgroup_subsys[ssid]) && false)      \
- -                              break;                                  \
- -                      else
+ +              break;                                                  \
+ +      }                                                               \
+ +      for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {       \
+ +              (ss) = cgroup_subsys[ssid];                             \
+ +              {
+ +
+ +#define while_each_subsys_mask()                                      \
+ +              }                                                       \
+ +      }                                                               \
+ +} while (false)
   
   /* iterate across the hierarchies */
   #define for_each_root(root)                                           \
@@@ -582,24 -555,6 +594,24 @@@
                         ;                                               \
                 else
   
+ +/* walk live descendants in preorder */
+ +#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)                \
+ +      css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))  \
+ +              if (({ lockdep_assert_held(&cgroup_mutex);              \
+ +                     (dsct) = (d_css)->cgroup;                        \
+ +                     cgroup_is_dead(dsct); }))                        \
+ +                      ;                                               \
+ +              else
+ +
+ +/* walk live descendants in postorder */
+ +#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)               \
+ +      css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
+ +              if (({ lockdep_assert_held(&cgroup_mutex);              \
+ +                     (dsct) = (d_css)->cgroup;                        \
+ +                     cgroup_is_dead(dsct); }))                        \
+ +                      ;                                               \
+ +              else
+ +
   static void cgroup_release_agent(struct work_struct *work);
   static void check_for_release(struct cgroup *cgrp);
   
@@@ -730,9 -685,6 +742,9 @@@ static void css_set_move_task(struct ta
   {
         lockdep_assert_held(&css_set_lock);
   
+ +      if (to_cset && !css_set_populated(to_cset))
+ +              css_set_update_populated(to_cset, true);
+ +
         if (from_cset) {
                 struct css_task_iter *it, *pos;
   
@@@ -766,6 -718,8 +778,6 @@@
                  */
                 WARN_ON_ONCE(task->flags & PF_EXITING);
   
- -              if (!css_set_populated(to_cset))
- -                      css_set_update_populated(to_cset, true);
                 rcu_assign_pointer(task->cgroups, to_cset);
                 list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
                                                              &to_cset->tasks);
@@@ -1168,13 -1122,13 +1180,13 @@@ static void cgroup_destroy_root(struct 
         struct cgroup *cgrp = &root->cgrp;
         struct cgrp_cset_link *link, *tmp_link;
   
- -      mutex_lock(&cgroup_mutex);
+ +      cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
   
         BUG_ON(atomic_read(&root->nr_cgrps));
         BUG_ON(!list_empty(&cgrp->self.children));
   
         /* Rebind all subsystems back to the default hierarchy */
- -      rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
+ +      WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
   
         /*
          * Release all the links from cset_links to this hierarchy's
@@@ -1314,40 -1268,46 +1326,40 @@@ static umode_t cgroup_file_mode(const s
   }
   
   /**
- - * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
- - * @cgrp: the target cgroup
+ + * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
    * @subtree_control: the new subtree_control mask to consider
+ + * @this_ss_mask: available subsystems
    *
    * On the default hierarchy, a subsystem may request other subsystems to be
    * enabled together through its ->depends_on mask.  In such cases, more
    * subsystems than specified in "cgroup.subtree_control" may be enabled.
    *
    * This function calculates which subsystems need to be enabled if
- - * @subtree_control is to be applied to @cgrp.  The returned mask is always
- - * a superset of @subtree_control and follows the usual hierarchy rules.
+ + * @subtree_control is to be applied while restricted to @this_ss_mask.
    */
- -static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
- -                                                unsigned long subtree_control)
+ +static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
   {
- -      struct cgroup *parent = cgroup_parent(cgrp);
- -      unsigned long cur_ss_mask = subtree_control;
+ +      u16 cur_ss_mask = subtree_control;
         struct cgroup_subsys *ss;
         int ssid;
   
         lockdep_assert_held(&cgroup_mutex);
   
- -      if (!cgroup_on_dfl(cgrp))
- -              return cur_ss_mask;
+ +      cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
   
         while (true) {
- -              unsigned long new_ss_mask = cur_ss_mask;
+ +              u16 new_ss_mask = cur_ss_mask;
   
- -              for_each_subsys_which(ss, ssid, &cur_ss_mask)
+ +              do_each_subsys_mask(ss, ssid, cur_ss_mask) {
                         new_ss_mask |= ss->depends_on;
+ +              } while_each_subsys_mask();
   
                 /*
                  * Mask out subsystems which aren't available.  This can
                  * happen only if some depended-upon subsystems were bound
                  * to non-default hierarchies.
                  */
- -              if (parent)
- -                      new_ss_mask &= parent->child_subsys_mask;
- -              else
- -                      new_ss_mask &= cgrp->root->subsys_mask;
+ +              new_ss_mask &= this_ss_mask;
   
                 if (new_ss_mask == cur_ss_mask)
                         break;
@@@ -1357,6 -1317,19 +1369,6 @@@
         return cur_ss_mask;
   }
   
- -/**
- - * cgroup_refresh_child_subsys_mask - update child_subsys_mask
- - * @cgrp: the target cgroup
- - *
- - * Update @cgrp->child_subsys_mask according to the current
- - * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
- - */
- -static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
- -{
- -      cgrp->child_subsys_mask =
- -              cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
- -}
- -
   /**
    * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
    * @kn: the kernfs_node being serviced
@@@ -1385,22 -1358,19 +1397,22 @@@ static void cgroup_kn_unlock(struct ker
   /**
    * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
    * @kn: the kernfs_node being serviced
+ + * @drain_offline: perform offline draining on the cgroup
    *
    * This helper is to be used by a cgroup kernfs method currently servicing
    * @kn.  It breaks the active protection, performs cgroup locking and
    * verifies that the associated cgroup is alive.  Returns the cgroup if
    * alive; otherwise, %NULL.  A successful return should be undone by a
- - * matching cgroup_kn_unlock() invocation.
+ + * matching cgroup_kn_unlock() invocation.  If @drain_offline is %true, the
+ + * cgroup is drained of offlining csses before return.
    *
    * Any cgroup kernfs method implementation which requires locking the
    * associated cgroup should use this helper.  It avoids nesting cgroup
    * locking under kernfs active protection and allows all kernfs operations
    * including self-removal.
    */
- -static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
+ +static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
+ +                                        bool drain_offline)
   {
         struct cgroup *cgrp;
   
@@@ -1419,10 -1389,7 +1431,10 @@@
                 return NULL;
         kernfs_break_active_protection(kn);
   
- -      mutex_lock(&cgroup_mutex);
+ +      if (drain_offline)
+ +              cgroup_lock_and_drain_offline(cgrp);
+ +      else
+ +              mutex_lock(&cgroup_mutex);
   
         if (!cgroup_is_dead(cgrp))
                 return cgrp;
@@@ -1452,17 -1419,14 +1464,17 @@@ static void cgroup_rm_file(struct cgrou
   /**
    * css_clear_dir - remove subsys files in a cgroup directory
    * @css: target css
- - * @cgrp_override: specify if target cgroup is different from css->cgroup
    */
- -static void css_clear_dir(struct cgroup_subsys_state *css,
- -                        struct cgroup *cgrp_override)
+ +static void css_clear_dir(struct cgroup_subsys_state *css)
   {
- -      struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+ +      struct cgroup *cgrp = css->cgroup;
         struct cftype *cfts;
   
+ +      if (!(css->flags & CSS_VISIBLE))
+ +              return;
+ +
+ +      css->flags &= ~CSS_VISIBLE;
+ +
         list_for_each_entry(cfts, &css->ss->cfts, node)
                 cgroup_addrm_files(css, cgrp, cfts, false);
   }
@@@ -1470,18 -1434,17 +1482,18 @@@
   /**
    * css_populate_dir - create subsys files in a cgroup directory
    * @css: target css
- - * @cgrp_overried: specify if target cgroup is different from css->cgroup
    *
    * On failure, no file is added.
    */
- -static int css_populate_dir(struct cgroup_subsys_state *css,
- -                          struct cgroup *cgrp_override)
+ +static int css_populate_dir(struct cgroup_subsys_state *css)
   {
- -      struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+ +      struct cgroup *cgrp = css->cgroup;
         struct cftype *cfts, *failed_cfts;
         int ret;
   
+ +      if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
+ +              return 0;
+ +
         if (!css->ss) {
                 if (cgroup_on_dfl(cgrp))
                         cfts = cgroup_dfl_base_files;
@@@ -1498,9 -1461,6 +1510,9 @@@
                         goto err;
                 }
         }
+ +
+ +      css->flags |= CSS_VISIBLE;
+ +
         return 0;
   err:
         list_for_each_entry(cfts, &css->ss->cfts, node) {
@@@ -1511,30 -1471,67 +1523,30 @@@
         return ret;
   }
   
- -static int rebind_subsystems(struct cgroup_root *dst_root,
- -                           unsigned long ss_mask)
+ +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
   {
         struct cgroup *dcgrp = &dst_root->cgrp;
         struct cgroup_subsys *ss;
- -      unsigned long tmp_ss_mask;
         int ssid, i, ret;
   
         lockdep_assert_held(&cgroup_mutex);
   
- -      for_each_subsys_which(ss, ssid, &ss_mask) {
- -              /* if @ss has non-root csses attached to it, can't move */
- -              if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
+ +      do_each_subsys_mask(ss, ssid, ss_mask) {
+ +              /*
+ +               * If @ss has non-root csses attached to it, can't move.
+ +               * If @ss is an implicit controller, it is exempt from this
+ +               * rule and can be stolen.
+ +               */
+ +              if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
+ +                  !ss->implicit_on_dfl)
                         return -EBUSY;
   
                 /* can't move between two non-dummy roots either */
                 if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
                         return -EBUSY;
- -      }
- -
- -      /* skip creating root files on dfl_root for inhibited subsystems */
- -      tmp_ss_mask = ss_mask;
- -      if (dst_root == &cgrp_dfl_root)
- -              tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
- -
- -      for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
- -              struct cgroup *scgrp = &ss->root->cgrp;
- -              int tssid;
- -
- -              ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
- -              if (!ret)
- -                      continue;
- -
- -              /*
- -               * Rebinding back to the default root is not allowed to
- -               * fail.  Using both default and non-default roots should
- -               * be rare.  Moving subsystems back and forth even more so.
- -               * Just warn about it and continue.
- -               */
- -              if (dst_root == &cgrp_dfl_root) {
- -                      if (cgrp_dfl_root_visible) {
- -                              pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
- -                                      ret, ss_mask);
- -                              pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
- -                      }
- -                      continue;
- -              }
+ +      } while_each_subsys_mask();
   
- -              for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
- -                      if (tssid == ssid)
- -                              break;
- -                      css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
- -              }
- -              return ret;
- -      }
- -
- -      /*
- -       * Nothing can fail from this point on.  Remove files for the
- -       * removed subsystems and rebind each subsystem.
- -       */
- -      for_each_subsys_which(ss, ssid, &ss_mask) {
+ +      do_each_subsys_mask(ss, ssid, ss_mask) {
                 struct cgroup_root *src_root = ss->root;
                 struct cgroup *scgrp = &src_root->cgrp;
                 struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
@@@ -1542,12 -1539,8 +1554,12 @@@
   
                 WARN_ON(!css || cgroup_css(dcgrp, ss));
   
- -              css_clear_dir(css, NULL);
+ +              /* disable from the source */
+ +              src_root->subsys_mask &= ~(1 << ssid);
+ +              WARN_ON(cgroup_apply_control(scgrp));
+ +              cgroup_finalize_control(scgrp, 0);
   
+ +              /* rebind */
                 RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
                 rcu_assign_pointer(dcgrp->subsys[ssid], css);
                 ss->root = dst_root;
@@@ -1559,23 -1552,23 +1571,23 @@@
                                        &dcgrp->e_csets[ss->id]);
                 spin_unlock_bh(&css_set_lock);
   
- -              src_root->subsys_mask &= ~(1 << ssid);
- -              scgrp->subtree_control &= ~(1 << ssid);
- -              cgroup_refresh_child_subsys_mask(scgrp);
- -
                 /* default hierarchy doesn't enable controllers by default */
                 dst_root->subsys_mask |= 1 << ssid;
                 if (dst_root == &cgrp_dfl_root) {
                         static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
                 } else {
                         dcgrp->subtree_control |= 1 << ssid;
- -                      cgroup_refresh_child_subsys_mask(dcgrp);
                         static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
                 }
   
+ +              ret = cgroup_apply_control(dcgrp);
+ +              if (ret)
+ +                      pr_warn("partial failure to rebind %s controller (err=%d)\n",
+ +                              ss->name, ret);
+ +
                 if (ss->bind)
                         ss->bind(css);
- -      }
+ +      } while_each_subsys_mask();
   
         kernfs_activate(dcgrp->kn);
         return 0;
@@@ -1611,7 -1604,7 +1623,7 @@@ static int cgroup_show_options(struct s
   }
   
   struct cgroup_sb_opts {
- -      unsigned long subsys_mask;
+ +      u16 subsys_mask;
         unsigned int flags;
         char *release_agent;
         bool cpuset_clone_children;
@@@ -1624,13 -1617,13 +1636,13 @@@ static int parse_cgroupfs_options(char 
   {
         char *token, *o = data;
         bool all_ss = false, one_ss = false;
- -      unsigned long mask = -1UL;
+ +      u16 mask = U16_MAX;
         struct cgroup_subsys *ss;
         int nr_opts = 0;
         int i;
   
   #ifdef CONFIG_CPUSETS
- -      mask = ~(1U << cpuset_cgrp_id);
+ +      mask = ~((u16)1 << cpuset_cgrp_id);
   #endif
   
         memset(opts, 0, sizeof(*opts));
@@@ -1757,14 -1750,14 +1769,14 @@@ static int cgroup_remount(struct kernfs
         int ret = 0;
         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
         struct cgroup_sb_opts opts;
- -      unsigned long added_mask, removed_mask;
+ +      u16 added_mask, removed_mask;
   
         if (root == &cgrp_dfl_root) {
                 pr_err("remount is not allowed\n");
                 return -EINVAL;
         }
   
- -      mutex_lock(&cgroup_mutex);
+ +      cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
   
         /* See what subsystems are wanted */
         ret = parse_cgroupfs_options(data, &opts);
@@@ -1797,7 -1790,7 +1809,7 @@@
         if (ret)
                 goto out_unlock;
   
- -      rebind_subsystems(&cgrp_dfl_root, removed_mask);
+ +      WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
   
         if (opts.release_agent) {
                 spin_lock(&release_agent_path_lock);
@@@ -1905,7 -1898,7 +1917,7 @@@ static void init_cgroup_root(struct cgr
                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
   }
   
- -static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
+ +static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
   {
         LIST_HEAD(tmp_links);
         struct cgroup *root_cgrp = &root->cgrp;
@@@ -1928,11 -1921,10 +1940,11 @@@
         /*
          * We're accessing css_set_count without locking css_set_lock here,
          * but that's OK - it can only be increased by someone holding
- -       * cgroup_lock, and that's us. The worst that can happen is that we
- -       * have some link structures left over
+ +       * cgroup_lock, and that's us.  Later rebinding may disable
+ +       * controllers on the default hierarchy and thus create new csets,
+ +       * which can't be more than the existing ones.  Allocate 2x.
          */
- -      ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
+ +      ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
         if (ret)
                 goto cancel_ref;
   
@@@ -1949,7 -1941,7 +1961,7 @@@
         }
         root_cgrp->kn = root->kf_root->kn;
   
- -      ret = css_populate_dir(&root_cgrp->self, NULL);
+ +      ret = css_populate_dir(&root_cgrp->self);
         if (ret)
                 goto destroy_root;
   
@@@ -2002,6 -1994,7 +2014,7 @@@ static struct dentry *cgroup_mount(stru
   {
         bool is_v2 = fs_type == &cgroup2_fs_type;
         struct super_block *pinned_sb = NULL;
+       struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
         struct cgroup_subsys *ss;
         struct cgroup_root *root;
         struct cgroup_sb_opts opts;
@@@ -2010,6 -2003,14 +2023,14 @@@
         int i;
         bool new_sb;
   
+       get_cgroup_ns(ns);
+ 
+       /* Check if the caller has permission to mount. */
+       if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
+               put_cgroup_ns(ns);
+               return ERR_PTR(-EPERM);
+       }
+ 
         /*
          * The first time anyone tries to mount a cgroup, enable the list
          * linking each css_set to its tasks and fix up all existing tasks.
@@@ -2020,15 -2021,16 +2041,16 @@@
         if (is_v2) {
                 if (data) {
                         pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+                       put_cgroup_ns(ns);
                         return ERR_PTR(-EINVAL);
                 }
- -              cgrp_dfl_root_visible = true;
+ +              cgrp_dfl_visible = true;
                 root = &cgrp_dfl_root;
                 cgroup_get(&root->cgrp);
                 goto out_mount;
         }
   
- -      mutex_lock(&cgroup_mutex);
+ +      cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
   
         /* First find the desired set of subsystems */
         ret = parse_cgroupfs_options(data, &opts);
@@@ -2125,6 -2127,16 +2147,16 @@@
                 goto out_unlock;
         }
   
+       /*
+        * We know this subsystem has not yet been bound.  Users in a non-init
+        * user namespace may only mount hierarchies with no bound subsystems,
+        * i.e. 'none,name=user1'
+        */
+       if (!opts.none && !capable(CAP_SYS_ADMIN)) {
+               ret = -EPERM;
+               goto out_unlock;
+       }
+ 
         root = kzalloc(sizeof(*root), GFP_KERNEL);
         if (!root) {
                 ret = -ENOMEM;
@@@ -2143,12 -2155,37 +2175,37 @@@ out_free
         kfree(opts.release_agent);
         kfree(opts.name);
   
-       if (ret)
+       if (ret) {
+               put_cgroup_ns(ns);
                 return ERR_PTR(ret);
+       }
   out_mount:
         dentry = kernfs_mount(fs_type, flags, root->kf_root,
                               is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
                               &new_sb);
+ 
+       /*
+        * In non-init cgroup namespace, instead of root cgroup's
+        * dentry, we return the dentry corresponding to the
+        * cgroupns->root_cgrp.
+        */
+       if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
+               struct dentry *nsdentry;
+               struct cgroup *cgrp;
+ 
+               mutex_lock(&cgroup_mutex);
+               spin_lock_bh(&css_set_lock);
+ 
+               cgrp = cset_cgroup_from_root(ns->root_cset, root);
+ 
+               spin_unlock_bh(&css_set_lock);
+               mutex_unlock(&cgroup_mutex);
+ 
+               nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
+               dput(dentry);
+               dentry = nsdentry;
+       }
+ 
         if (IS_ERR(dentry) || !new_sb)
                 cgroup_put(&root->cgrp);
   
@@@ -2161,6 -2198,7 +2218,7 @@@
                 deactivate_super(pinned_sb);
         }
   
+       put_cgroup_ns(ns);
         return dentry;
   }
   
@@@ -2189,14 -2227,45 +2247,45 @@@ static struct file_system_type cgroup_f
         .name = "cgroup",
         .mount = cgroup_mount,
         .kill_sb = cgroup_kill_sb,
+       .fs_flags = FS_USERNS_MOUNT,
   };
   
   static struct file_system_type cgroup2_fs_type = {
         .name = "cgroup2",
         .mount = cgroup_mount,
         .kill_sb = cgroup_kill_sb,
+       .fs_flags = FS_USERNS_MOUNT,
   };
   
+ static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+                                  struct cgroup_namespace *ns)
+ {
+       struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
+       int ret;
+ 
+       ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
+       if (ret < 0 || ret >= buflen)
+               return NULL;
+       return buf;
+ }
+ 
+ char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
+                    struct cgroup_namespace *ns)
+ {
+       char *ret;
+ 
+       mutex_lock(&cgroup_mutex);
+       spin_lock_bh(&css_set_lock);
+ 
+       ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
+ 
+       spin_unlock_bh(&css_set_lock);
+       mutex_unlock(&cgroup_mutex);
+ 
+       return ret;
+ }
+ EXPORT_SYMBOL_GPL(cgroup_path_ns);
+ 
   /**
    * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
    * @task: target task
@@@ -2224,7 -2293,7 +2313,7 @@@ char *task_cgroup_path(struct task_stru
   
         if (root) {
                 cgrp = task_cgroup_from_root(task, root);
-               path = cgroup_path(cgrp, buf, buflen);
+               path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
         } else {
                 /* if no hierarchy exists, everyone is in "/" */
                 if (strlcpy(buf, "/", buflen) < buflen)
@@@ -2368,38 -2437,38 +2457,38 @@@ struct task_struct *cgroup_taskset_next
   }
   
   /**
- - * cgroup_taskset_migrate - migrate a taskset to a cgroup
+ + * cgroup_taskset_migrate - migrate a taskset
    * @tset: target taskset
- - * @dst_cgrp: destination cgroup
+ + * @root: cgroup root the migration is taking place on
    *
- - * Migrate tasks in @tset to @dst_cgrp.  This function fails iff one of the
- - * ->can_attach callbacks fails and guarantees that either all or none of
- - * the tasks in @tset are migrated.  @tset is consumed regardless of
- - * success.
+ + * Migrate tasks in @tset as setup by migration preparation functions.
+ + * This function fails iff one of the ->can_attach callbacks fails and
+ + * guarantees that either all or none of the tasks in @tset are migrated.
+ + * @tset is consumed regardless of success.
    */
   static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
- -                                struct cgroup *dst_cgrp)
+ +                                struct cgroup_root *root)
   {
- -      struct cgroup_subsys_state *css, *failed_css = NULL;
+ +      struct cgroup_subsys *ss;
         struct task_struct *task, *tmp_task;
         struct css_set *cset, *tmp_cset;
- -      int i, ret;
+ +      int ssid, failed_ssid, ret;
   
         /* methods shouldn't be called if no task is actually migrating */
         if (list_empty(&tset->src_csets))
                 return 0;
   
         /* check that we can legitimately attach to the cgroup */
- -      for_each_e_css(css, i, dst_cgrp) {
- -              if (css->ss->can_attach) {
- -                      tset->ssid = i;
- -                      ret = css->ss->can_attach(tset);
+ +      do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ +              if (ss->can_attach) {
+ +                      tset->ssid = ssid;
+ +                      ret = ss->can_attach(tset);
                         if (ret) {
- -                              failed_css = css;
+ +                              failed_ssid = ssid;
                                 goto out_cancel_attach;
                         }
                 }
- -      }
+ +      } while_each_subsys_mask();
   
         /*
          * Now that we're guaranteed success, proceed to move all tasks to
@@@ -2426,25 -2495,25 +2515,25 @@@
          */
         tset->csets = &tset->dst_csets;
   
- -      for_each_e_css(css, i, dst_cgrp) {
- -              if (css->ss->attach) {
- -                      tset->ssid = i;
- -                      css->ss->attach(tset);
+ +      do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ +              if (ss->attach) {
+ +                      tset->ssid = ssid;
+ +                      ss->attach(tset);
                 }
- -      }
+ +      } while_each_subsys_mask();
   
         ret = 0;
         goto out_release_tset;
   
   out_cancel_attach:
- -      for_each_e_css(css, i, dst_cgrp) {
- -              if (css == failed_css)
+ +      do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ +              if (ssid == failed_ssid)
                         break;
- -              if (css->ss->cancel_attach) {
- -                      tset->ssid = i;
- -                      css->ss->cancel_attach(tset);
+ +              if (ss->cancel_attach) {
+ +                      tset->ssid = ssid;
+ +                      ss->cancel_attach(tset);
                 }
- -      }
+ +      } while_each_subsys_mask();
   out_release_tset:
         spin_lock_bh(&css_set_lock);
         list_splice_init(&tset->dst_csets, &tset->src_csets);
@@@ -2456,20 -2525,6 +2545,20 @@@
         return ret;
   }
   
+ +/**
+ + * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
+ + * @dst_cgrp: destination cgroup to test
+ + *
+ + * On the default hierarchy, except for the root, subtree_control must be
+ + * zero for migration destination cgroups with tasks so that child cgroups
+ + * don't compete against tasks.
+ + */
+ +static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+ +{
+ +      return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
+ +              !dst_cgrp->subtree_control;
+ +}
+ +
   /**
    * cgroup_migrate_finish - cleanup after attach
    * @preloaded_csets: list of preloaded css_sets
@@@ -2486,7 -2541,6 +2575,7 @@@ static void cgroup_migrate_finish(struc
         spin_lock_bh(&css_set_lock);
         list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
                 cset->mg_src_cgrp = NULL;
+ +              cset->mg_dst_cgrp = NULL;
                 cset->mg_dst_cset = NULL;
                 list_del_init(&cset->mg_preload_node);
                 put_css_set_locked(cset);
@@@ -2519,56 -2573,58 +2608,56 @@@ static void cgroup_migrate_add_src(stru
         lockdep_assert_held(&cgroup_mutex);
         lockdep_assert_held(&css_set_lock);
   
+ +      /*
+ +       * If ->dead, @src_cset is associated with one or more dead cgroups
+ +       * and doesn't contain any migratable tasks.  Ignore it early so
+ +       * that the rest of migration path doesn't get confused by it.
+ +       */
+ +      if (src_cset->dead)
+ +              return;
+ +
         src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
   
         if (!list_empty(&src_cset->mg_preload_node))
                 return;
   
         WARN_ON(src_cset->mg_src_cgrp);
+ +      WARN_ON(src_cset->mg_dst_cgrp);
         WARN_ON(!list_empty(&src_cset->mg_tasks));
         WARN_ON(!list_empty(&src_cset->mg_node));
   
         src_cset->mg_src_cgrp = src_cgrp;
+ +      src_cset->mg_dst_cgrp = dst_cgrp;
         get_css_set(src_cset);
         list_add(&src_cset->mg_preload_node, preloaded_csets);
   }
   
   /**
    * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
- - * @dst_cgrp: the destination cgroup (may be %NULL)
    * @preloaded_csets: list of preloaded source css_sets
    *
- - * Tasks are about to be moved to @dst_cgrp and all the source css_sets
- - * have been preloaded to @preloaded_csets.  This function looks up and
- - * pins all destination css_sets, links each to its source, and append them
- - * to @preloaded_csets.  If @dst_cgrp is %NULL, the destination of each
- - * source css_set is assumed to be its cgroup on the default hierarchy.
+ + * Tasks are about to be moved and all the source css_sets have been
+ + * preloaded to @preloaded_csets.  This function looks up and pins all
+ + * destination css_sets, links each to its source, and appends them to
+ + * @preloaded_csets.
    *
    * This function must be called after cgroup_migrate_add_src() has been
    * called on each migration source css_set.  After migration is performed
    * using cgroup_migrate(), cgroup_migrate_finish() must be called on
    * @preloaded_csets.
    */
- -static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
- -                                    struct list_head *preloaded_csets)
+ +static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
   {
         LIST_HEAD(csets);
         struct css_set *src_cset, *tmp_cset;
   
         lockdep_assert_held(&cgroup_mutex);
   
- -      /*
- -       * Except for the root, child_subsys_mask must be zero for a cgroup
- -       * with tasks so that child cgroups don't compete against tasks.
- -       */
- -      if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
- -          dst_cgrp->child_subsys_mask)
- -              return -EBUSY;
- -
         /* look up the dst cset for each src cset and link it to src */
         list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
                 struct css_set *dst_cset;
   
- -              dst_cset = find_css_set(src_cset,
- -                                      dst_cgrp ?: src_cset->dfl_cgrp);
+ +              dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
                 if (!dst_cset)
                         goto err;
   
@@@ -2581,7 -2637,6 +2670,7 @@@
                  */
                 if (src_cset == dst_cset) {
                         src_cset->mg_src_cgrp = NULL;
+ +                      src_cset->mg_dst_cgrp = NULL;
                         list_del_init(&src_cset->mg_preload_node);
                         put_css_set(src_cset);
                         put_css_set(dst_cset);
@@@ -2607,11 -2662,11 +2696,11 @@@ err
    * cgroup_migrate - migrate a process or task to a cgroup
    * @leader: the leader of the process or the task to migrate
    * @threadgroup: whether @leader points to the whole process or a single task
- - * @cgrp: the destination cgroup
+ + * @root: cgroup root migration is taking place on
    *
- - * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
- - * process, the caller must be holding cgroup_threadgroup_rwsem.  The
- - * caller is also responsible for invoking cgroup_migrate_add_src() and
+ + * Migrate a process or task denoted by @leader.  If migrating a process,
+ + * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
+ + * responsible for invoking cgroup_migrate_add_src() and
    * cgroup_migrate_prepare_dst() on the targets before invoking this
    * function and following up with cgroup_migrate_finish().
    *
@@@ -2622,7 -2677,7 +2711,7 @@@
    * actually starting migrating.
    */
   static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
- -                        struct cgroup *cgrp)
+ +                        struct cgroup_root *root)
   {
         struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
         struct task_struct *task;
@@@ -2643,7 -2698,7 +2732,7 @@@
         rcu_read_unlock();
         spin_unlock_bh(&css_set_lock);
   
- -      return cgroup_taskset_migrate(&tset, cgrp);
+ +      return cgroup_taskset_migrate(&tset, root);
   }
   
   /**
@@@ -2661,9 -2716,6 +2750,9 @@@ static int cgroup_attach_task(struct cg
         struct task_struct *task;
         int ret;
   
+ +      if (!cgroup_may_migrate_to(dst_cgrp))
+ +              return -EBUSY;
+ +
         /* look up all src csets */
         spin_lock_bh(&css_set_lock);
         rcu_read_lock();
@@@ -2678,9 -2730,9 +2767,9 @@@
         spin_unlock_bh(&css_set_lock);
   
         /* prepare dst csets and commit */
- -      ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
+ +      ret = cgroup_migrate_prepare_dst(&preloaded_csets);
         if (!ret)
- -              ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
+ +              ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
   
         cgroup_migrate_finish(&preloaded_csets);
         return ret;
@@@ -2743,7 -2795,7 +2832,7 @@@ static ssize_t __cgroup_procs_write(str
         if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
                 return -EINVAL;
   
- -      cgrp = cgroup_kn_lock_live(of->kn);
+ +      cgrp = cgroup_kn_lock_live(of->kn, false);
         if (!cgrp)
                 return -ENODEV;
   
@@@ -2841,7 -2893,7 +2930,7 @@@ static ssize_t cgroup_release_agent_wri
   
         BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
   
- -      cgrp = cgroup_kn_lock_live(of->kn);
+ +      cgrp = cgroup_kn_lock_live(of->kn, false);
         if (!cgrp)
                 return -ENODEV;
         spin_lock(&release_agent_path_lock);
@@@ -2869,28 -2921,38 +2958,28 @@@ static int cgroup_sane_behavior_show(st
         return 0;
   }
   
- -static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask)
+ +static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
   {
         struct cgroup_subsys *ss;
         bool printed = false;
         int ssid;
   
- -      for_each_subsys_which(ss, ssid, &ss_mask) {
+ +      do_each_subsys_mask(ss, ssid, ss_mask) {
                 if (printed)
                         seq_putc(seq, ' ');
                 seq_printf(seq, "%s", ss->name);
                 printed = true;
- -      }
+ +      } while_each_subsys_mask();
         if (printed)
                 seq_putc(seq, '\n');
   }
   
- -/* show controllers which are currently attached to the default hierarchy */
- -static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
- -{
- -      struct cgroup *cgrp = seq_css(seq)->cgroup;
- -
- -      cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
- -                           ~cgrp_dfl_root_inhibit_ss_mask);
- -      return 0;
- -}
- -
   /* show controllers which are enabled from the parent */
   static int cgroup_controllers_show(struct seq_file *seq, void *v)
   {
         struct cgroup *cgrp = seq_css(seq)->cgroup;
   
- -      cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
+ +      cgroup_print_ss_mask(seq, cgroup_control(cgrp));
         return 0;
   }
   
@@@ -2907,17 -2969,16 +2996,17 @@@ static int cgroup_subtree_control_show(
    * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
    * @cgrp: root of the subtree to update csses for
    *
- - * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
- - * css associations need to be updated accordingly.  This function looks up
- - * all css_sets which are attached to the subtree, creates the matching
- - * updated css_sets and migrates the tasks to the new ones.
+ + * @cgrp's control masks have changed and its subtree's css associations
+ + * need to be updated accordingly.  This function looks up all css_sets
+ + * which are attached to the subtree, creates the matching updated css_sets
+ + * and migrates the tasks to the new ones.
    */
   static int cgroup_update_dfl_csses(struct cgroup *cgrp)
   {
         LIST_HEAD(preloaded_csets);
         struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
- -      struct cgroup_subsys_state *css;
+ +      struct cgroup_subsys_state *d_css;
+ +      struct cgroup *dsct;
         struct css_set *src_cset;
         int ret;
   
@@@ -2927,17 -2988,21 +3016,17 @@@
   
         /* look up all csses currently attached to @cgrp's subtree */
         spin_lock_bh(&css_set_lock);
- -      css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
+ +      cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
                 struct cgrp_cset_link *link;
   
- -              /* self is not affected by child_subsys_mask change */
- -              if (css->cgroup == cgrp)
- -                      continue;
- -
- -              list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
- -                      cgroup_migrate_add_src(link->cset, cgrp,
+ +              list_for_each_entry(link, &dsct->cset_links, cset_link)
+ +                      cgroup_migrate_add_src(link->cset, dsct,
                                                &preloaded_csets);
         }
         spin_unlock_bh(&css_set_lock);
   
         /* dst of each cset is its own cgroup, as set via add_src above */
- -      ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
+ +      ret = cgroup_migrate_prepare_dst(&preloaded_csets);
         if (ret)
                 goto out_finish;
   
@@@ -2955,272 -3020,20 +3044,272 @@@
         }
         spin_unlock_bh(&css_set_lock);
   
- -      ret = cgroup_taskset_migrate(&tset, cgrp);
+ +      ret = cgroup_taskset_migrate(&tset, cgrp->root);
   out_finish:
         cgroup_migrate_finish(&preloaded_csets);
         percpu_up_write(&cgroup_threadgroup_rwsem);
         return ret;
   }
   
+ +/**
+ + * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
+ + * @cgrp: root of the target subtree
+ + *
+ + * Because css offlining is asynchronous, userland may try to re-enable a
+ + * controller while the previous css is still around.  This function grabs
+ + * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
+ + */
+ +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
+ +      __acquires(&cgroup_mutex)
+ +{
+ +      struct cgroup *dsct;
+ +      struct cgroup_subsys_state *d_css;
+ +      struct cgroup_subsys *ss;
+ +      int ssid;
+ +
+ +restart:
+ +      mutex_lock(&cgroup_mutex);
+ +
+ +      cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+ +              for_each_subsys(ss, ssid) {
+ +                      struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+ +                      DEFINE_WAIT(wait);
+ +
+ +                      if (!css || !percpu_ref_is_dying(&css->refcnt))
+ +                              continue;
+ +
+ +                      cgroup_get(dsct);
+ +                      prepare_to_wait(&dsct->offline_waitq, &wait,
+ +                                      TASK_UNINTERRUPTIBLE);
+ +
+ +                      mutex_unlock(&cgroup_mutex);
+ +                      schedule();
+ +                      finish_wait(&dsct->offline_waitq, &wait);
+ +
+ +                      cgroup_put(dsct);
+ +                      goto restart;
+ +              }
+ +      }
+ +}
+ +
+ +/**
+ + * cgroup_save_control - save control masks of a subtree
+ + * @cgrp: root of the target subtree
+ + *
+ + * Save ->subtree_control and ->subtree_ss_mask to the respective old_
+ + * prefixed fields for @cgrp's subtree including @cgrp itself.
+ + */
+ +static void cgroup_save_control(struct cgroup *cgrp)
+ +{
+ +      struct cgroup *dsct;
+ +      struct cgroup_subsys_state *d_css;
+ +
+ +      cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+ +              dsct->old_subtree_control = dsct->subtree_control;
+ +              dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
+ +      }
+ +}
+ +
+ +/**
+ + * cgroup_propagate_control - refresh control masks of a subtree
+ + * @cgrp: root of the target subtree
+ + *
+ + * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
+ + * ->subtree_control and propagate controller availability through the
+ + * subtree so that descendants don't have unavailable controllers enabled.
+ + */
+ +static void cgroup_propagate_control(struct cgroup *cgrp)
+ +{
+ +      struct cgroup *dsct;
+ +      struct cgroup_subsys_state *d_css;
+ +
+ +      cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+ +              dsct->subtree_control &= cgroup_control(dsct);
+ +              dsct->subtree_ss_mask =
+ +                      cgroup_calc_subtree_ss_mask(dsct->subtree_control,
+ +                                                  cgroup_ss_mask(dsct));
+ +      }
+ +}
+ +
+ +/**
+ + * cgroup_restore_control - restore control masks of a subtree
+ + * @cgrp: root of the target subtree
+ + *
+ + * Restore ->subtree_control and ->subtree_ss_mask from the respective old_
+ + * prefixed fields for @cgrp's subtree including @cgrp itself.
+ + */
+ +static void cgroup_restore_control(struct cgroup *cgrp)
+ +{
+ +      struct cgroup *dsct;
+ +      struct cgroup_subsys_state *d_css;
+ +
+ +      cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+ +              dsct->subtree_control = dsct->old_subtree_control;
+ +              dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
+ +      }
+ +}
+ +
+ +static bool css_visible(struct cgroup_subsys_state *css)
+ +{
+ +      struct cgroup_subsys *ss = css->ss;
+ +      struct cgroup *cgrp = css->cgroup;
+ +
+ +      if (cgroup_control(cgrp) & (1 << ss->id))
+ +              return true;
+ +      if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
+ +              return false;
+ +      return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
+ +}
+ +
+ +/**
+ + * cgroup_apply_control_enable - enable or show csses according to control
+ + * @cgrp: root of the target subtree
+ + *
+ + * Walk @cgrp's subtree and create new csses or make the existing ones
+ + * visible.  A css is created invisible if it's being implicitly enabled
+ + * through dependency.  An invisible css is made visible when the userland
+ + * explicitly enables it.
+ + *
+ + * Returns 0 on success, -errno on failure.  On failure, csses which have
+ + * been processed already aren't cleaned up.  The caller is responsible for
+ + * cleaning up with cgroup_apply_control_disable().
+ + */
+ +static int cgroup_apply_control_enable(struct cgroup *cgrp)
+ +{
+ +      struct cgroup *dsct;
+ +      struct cgroup_subsys_state *d_css;
+ +      struct cgroup_subsys *ss;
+ +      int ssid, ret;
+ +
+ +      cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+ +              for_each_subsys(ss, ssid) {
+ +                      struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+ +
+ +                      WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
+ +
+ +                      if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
+ +                              continue;
+ +
+ +                      if (!css) {
+ +                              css = css_create(dsct, ss);
+ +                              if (IS_ERR(css))
+ +                                      return PTR_ERR(css);
+ +                      }
+ +
+ +                      if (css_visible(css)) {
+ +                              ret = css_populate_dir(css);
+ +                              if (ret)
+ +                                      return ret;
+ +                      }
+ +              }
+ +      }
+ +
+ +      return 0;
+ +}
+ +
+ +/**
+ + * cgroup_apply_control_disable - kill or hide csses according to control
+ + * @cgrp: root of the target subtree
+ + *
+ + * Walk @cgrp's subtree and kill and hide csses so that they match
+ + * cgroup_ss_mask() and css_visible().
+ + *
+ + * A css is hidden when the userland requests it to be disabled while other
+ + * subsystems are still depending on it.  The css must not actively control
+ + * resources and be in the vanilla state if it's made visible again later.
+ + * Controllers which may be depended upon should provide ->css_reset() for
+ + * this purpose.
+ + */
+ +static void cgroup_apply_control_disable(struct cgroup *cgrp)
+ +{
+ +      struct cgroup *dsct;
+ +      struct cgroup_subsys_state *d_css;
+ +      struct cgroup_subsys *ss;
+ +      int ssid;
+ +
+ +      cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+ +              for_each_subsys(ss, ssid) {
+ +                      struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+ +
+ +                      WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
+ +
+ +                      if (!css)
+ +                              continue;
+ +
+ +                      if (css->parent &&
+ +                          !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
+ +                              kill_css(css);
+ +                      } else if (!css_visible(css)) {
+ +                              css_clear_dir(css);
+ +                              if (ss->css_reset)
+ +                                      ss->css_reset(css);
+ +                      }
+ +              }
+ +      }
+ +}
+ +
+ +/**
+ + * cgroup_apply_control - apply control mask updates to the subtree
+ + * @cgrp: root of the target subtree
+ + *
+ + * Subsystems can be enabled and disabled in a subtree using the following
+ + * steps.
+ + *
+ + * 1. Call cgroup_save_control() to stash the current state.
+ + * 2. Update ->subtree_control masks in the subtree as desired.
+ + * 3. Call cgroup_apply_control() to apply the changes.
+ + * 4. Optionally perform other related operations.
+ + * 5. Call cgroup_finalize_control() to finish up.
+ + *
+ + * This function implements step 3 and propagates the mask changes
+ + * throughout @cgrp's subtree, updates csses accordingly and performs
+ + * process migrations.
+ + */
+ +static int cgroup_apply_control(struct cgroup *cgrp)
+ +{
+ +      int ret;
+ +
+ +      cgroup_propagate_control(cgrp);
+ +
+ +      ret = cgroup_apply_control_enable(cgrp);
+ +      if (ret)
+ +              return ret;
+ +
+ +      /*
+ +       * At this point, cgroup_e_css() results reflect the new csses
+ +       * making the following cgroup_update_dfl_csses() properly update
+ +       * css associations of all tasks in the subtree.
+ +       */
+ +      ret = cgroup_update_dfl_csses(cgrp);
+ +      if (ret)
+ +              return ret;
+ +
+ +      return 0;
+ +}
+ +
+ +/**
+ + * cgroup_finalize_control - finalize control mask update
+ + * @cgrp: root of the target subtree
+ + * @ret: the result of the update
+ + *
+ + * Finalize control mask update.  See cgroup_apply_control() for more info.
+ + */
+ +static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
+ +{
+ +      if (ret) {
+ +              cgroup_restore_control(cgrp);
+ +              cgroup_propagate_control(cgrp);
+ +      }
+ +
+ +      cgroup_apply_control_disable(cgrp);
+ +}
+ +
   /* change the enabled child controllers for a cgroup in the default hierarchy */
   static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                             char *buf, size_t nbytes,
                                             loff_t off)
   {
- -      unsigned long enable = 0, disable = 0;
- -      unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
+ +      u16 enable = 0, disable = 0;
         struct cgroup *cgrp, *child;
         struct cgroup_subsys *ss;
         char *tok;
@@@ -3232,9 -3045,11 +3321,9 @@@
          */
         buf = strstrip(buf);
         while ((tok = strsep(&buf, " "))) {
- -              unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask;
- -
                 if (tok[0] == '\0')
                         continue;
- -              for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
+ +              do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
                         if (!cgroup_ssid_enabled(ssid) ||
                             strcmp(tok + 1, ss->name))
                                 continue;
@@@ -3249,12 -3064,12 +3338,12 @@@
                                 return -EINVAL;
                         }
                         break;
- -              }
+ +              } while_each_subsys_mask();
                 if (ssid == CGROUP_SUBSYS_COUNT)
                         return -EINVAL;
         }
   
- -      cgrp = cgroup_kn_lock_live(of->kn);
+ +      cgrp = cgroup_kn_lock_live(of->kn, true);
         if (!cgrp)
                 return -ENODEV;
   
@@@ -3265,7 -3080,10 +3354,7 @@@
                                 continue;
                         }
   
- -                      /* unavailable or not enabled on the parent? */
- -                      if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
- -                          (cgroup_parent(cgrp) &&
- -                           !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
+ +                      if (!(cgroup_control(cgrp) & (1 << ssid))) {
                                 ret = -ENOENT;
                                 goto out_unlock;
                         }
@@@ -3299,21 -3117,150 +3388,21 @@@
                 goto out_unlock;
         }
   
- -      /*
- -       * Update subsys masks and calculate what needs to be done.  More
- -       * subsystems than specified may need to be enabled or disabled
- -       * depending on subsystem dependencies.
- -       */
- -      old_sc = cgrp->subtree_control;
- -      old_ss = cgrp->child_subsys_mask;
- -      new_sc = (old_sc | enable) & ~disable;
- -      new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
- -
- -      css_enable = ~old_ss & new_ss;
- -      css_disable = old_ss & ~new_ss;
- -      enable |= css_enable;
- -      disable |= css_disable;
- -
- -      /*
- -       * Because css offlining is asynchronous, userland might try to
- -       * re-enable the same controller while the previous instance is
- -       * still around.  In such cases, wait till it's gone using
- -       * offline_waitq.
- -       */
- -      for_each_subsys_which(ss, ssid, &css_enable) {
- -              cgroup_for_each_live_child(child, cgrp) {
- -                      DEFINE_WAIT(wait);
- -
- -                      if (!cgroup_css(child, ss))
- -                              continue;
- -
- -                      cgroup_get(child);
- -                      prepare_to_wait(&child->offline_waitq, &wait,
- -                                      TASK_UNINTERRUPTIBLE);
- -                      cgroup_kn_unlock(of->kn);
- -                      schedule();
- -                      finish_wait(&child->offline_waitq, &wait);
- -                      cgroup_put(child);
- -
- -                      return restart_syscall();
- -              }
- -      }
+ +      /* save and update control masks and prepare csses */
+ +      cgroup_save_control(cgrp);
   
- -      cgrp->subtree_control = new_sc;
- -      cgrp->child_subsys_mask = new_ss;
+ +      cgrp->subtree_control |= enable;
+ +      cgrp->subtree_control &= ~disable;
   
- -      /*
- -       * Create new csses or make the existing ones visible.  A css is
- -       * created invisible if it's being implicitly enabled through
- -       * dependency.  An invisible css is made visible when the userland
- -       * explicitly enables it.
- -       */
- -      for_each_subsys(ss, ssid) {
- -              if (!(enable & (1 << ssid)))
- -                      continue;
+ +      ret = cgroup_apply_control(cgrp);
   
- -              cgroup_for_each_live_child(child, cgrp) {
- -                      if (css_enable & (1 << ssid))
- -                              ret = create_css(child, ss,
- -                                      cgrp->subtree_control & (1 << ssid));
- -                      else
- -                              ret = css_populate_dir(cgroup_css(child, ss),
- -                                                     NULL);
- -                      if (ret)
- -                              goto err_undo_css;
- -              }
- -      }
- -
- -      /*
- -       * At this point, cgroup_e_css() results reflect the new csses
- -       * making the following cgroup_update_dfl_csses() properly update
- -       * css associations of all tasks in the subtree.
- -       */
- -      ret = cgroup_update_dfl_csses(cgrp);
- -      if (ret)
- -              goto err_undo_css;
- -
- -      /*
- -       * All tasks are migrated out of disabled csses.  Kill or hide
- -       * them.  A css is hidden when the userland requests it to be
- -       * disabled while other subsystems are still depending on it.  The
- -       * css must not actively control resources and be in the vanilla
- -       * state if it's made visible again later.  Controllers which may
- -       * be depended upon should provide ->css_reset() for this purpose.
- -       */
- -      for_each_subsys(ss, ssid) {
- -              if (!(disable & (1 << ssid)))
- -                      continue;
- -
- -              cgroup_for_each_live_child(child, cgrp) {
- -                      struct cgroup_subsys_state *css = cgroup_css(child, ss);
- -
- -                      if (css_disable & (1 << ssid)) {
- -                              kill_css(css);
- -                      } else {
- -                              css_clear_dir(css, NULL);
- -                              if (ss->css_reset)
- -                                      ss->css_reset(css);
- -                      }
- -              }
- -      }
- -
- -      /*
- -       * The effective csses of all the descendants (excluding @cgrp) may
- -       * have changed.  Subsystems can optionally subscribe to this event
- -       * by implementing ->css_e_css_changed() which is invoked if any of
- -       * the effective csses seen from the css's cgroup may have changed.
- -       */
- -      for_each_subsys(ss, ssid) {
- -              struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
- -              struct cgroup_subsys_state *css;
- -
- -              if (!ss->css_e_css_changed || !this_css)
- -                      continue;
- -
- -              css_for_each_descendant_pre(css, this_css)
- -                      if (css != this_css)
- -                              ss->css_e_css_changed(css);
- -      }
+ +      cgroup_finalize_control(cgrp, ret);
   
         kernfs_activate(cgrp->kn);
         ret = 0;
   out_unlock:
         cgroup_kn_unlock(of->kn);
         return ret ?: nbytes;
- -
- -err_undo_css:
- -      cgrp->subtree_control = old_sc;
- -      cgrp->child_subsys_mask = old_ss;
- -
- -      for_each_subsys(ss, ssid) {
- -              if (!(enable & (1 << ssid)))
- -                      continue;
- -
- -              cgroup_for_each_live_child(child, cgrp) {
- -                      struct cgroup_subsys_state *css = cgroup_css(child, ss);
- -
- -                      if (!css)
- -                              continue;
- -
- -                      if (css_enable & (1 << ssid))
- -                              kill_css(css);
- -                      else
- -                              css_clear_dir(css, NULL);
- -              }
- -      }
- -      goto out_unlock;
   }
   
   static int cgroup_events_show(struct seq_file *seq, void *v)
@@@ -3511,7 -3458,7 +3600,7 @@@ static int cgroup_addrm_files(struct cg
                               bool is_add)
   {
         struct cftype *cft, *cft_end = NULL;
- -      int ret;
+ +      int ret = 0;
   
         lockdep_assert_held(&cgroup_mutex);
   
@@@ -3540,7 -3487,7 +3629,7 @@@ restart
                         cgroup_rm_file(cgrp, cft);
                 }
         }
- -      return 0;
+ +      return ret;
   }
   
   static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
@@@ -3557,7 -3504,7 +3646,7 @@@
         css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
                 struct cgroup *cgrp = css->cgroup;
   
- -              if (cgroup_is_dead(cgrp))
+ +              if (!(css->flags & CSS_VISIBLE))
                         continue;
   
                 ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
@@@ -4178,9 -4125,6 +4267,9 @@@ int cgroup_transfer_tasks(struct cgrou
         struct task_struct *task;
         int ret;
   
+ +      if (!cgroup_may_migrate_to(to))
+ +              return -EBUSY;
+ +
         mutex_lock(&cgroup_mutex);
   
         /* all tasks in @from are being moved, all csets are source */
@@@ -4189,7 -4133,7 +4278,7 @@@
                 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
         spin_unlock_bh(&css_set_lock);
   
- -      ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
+ +      ret = cgroup_migrate_prepare_dst(&preloaded_csets);
         if (ret)
                 goto out_err;
   
@@@ -4205,7 -4149,7 +4294,7 @@@
                 css_task_iter_end(&it);
   
                 if (task) {
- -                      ret = cgroup_migrate(task, false, to);
+ +                      ret = cgroup_migrate(task, false, to->root);
                         put_task_struct(task);
                 }
         } while (task && !ret);
@@@ -4712,6 -4656,12 +4801,6 @@@ static struct cftype cgroup_dfl_base_fi
         },
         {
                 .name = "cgroup.controllers",
- -              .flags = CFTYPE_ONLY_ON_ROOT,
- -              .seq_show = cgroup_root_controllers_show,
- -      },
- -      {
- -              .name = "cgroup.controllers",
- -              .flags = CFTYPE_NOT_ON_ROOT,
                 .seq_show = cgroup_controllers_show,
         },
         {
@@@ -4880,9 -4830,7 +4969,9 @@@ static void css_release_work_fn(struct 
                  * Those are supported by RCU protecting clearing of
                  * cgrp->kn->priv backpointer.
                  */
- -              RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
+ +              if (cgrp->kn)
+ +                      RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
+ +                                       NULL);
         }
   
         mutex_unlock(&cgroup_mutex);
@@@ -4953,9 -4901,6 +5042,9 @@@ static void offline_css(struct cgroup_s
         if (!(css->flags & CSS_ONLINE))
                 return;
   
+ +      if (ss->css_reset)
+ +              ss->css_reset(css);
+ +
         if (ss->css_offline)
                 ss->css_offline(css);
   
@@@ -4966,16 -4911,17 +5055,16 @@@
   }
   
   /**
- - * create_css - create a cgroup_subsys_state
+ + * css_create - create a cgroup_subsys_state
    * @cgrp: the cgroup new css will be associated with
    * @ss: the subsys of new css
- - * @visible: whether to create control knobs for the new css or not
    *
    * Create a new css associated with @cgrp - @ss pair.  On success, the new
- - * css is online and installed in @cgrp with all interface files created if
- - * @visible.  Returns 0 on success, -errno on failure.
+ + * css is online and installed in @cgrp.  This function doesn't create the
+ + * interface files.  Returns the new css on success, ERR_PTR() on failure.
    */
- -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
- -                    bool visible)
+ +static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
+ +                                            struct cgroup_subsys *ss)
   {
         struct cgroup *parent = cgroup_parent(cgrp);
         struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
@@@ -4986,7 -4932,7 +5075,7 @@@
   
         css = ss->css_alloc(parent_css);
         if (IS_ERR(css))
- -              return PTR_ERR(css);
+ +              return css;
   
         init_and_link_css(css, ss, cgrp);
   
@@@ -4999,6 -4945,12 +5088,6 @@@
                 goto err_free_percpu_ref;
         css->id = err;
   
- -      if (visible) {
- -              err = css_populate_dir(css, NULL);
- -              if (err)
- -                      goto err_free_id;
- -      }
- -
         /* @css is ready to be brought online now, make it visible */
         list_add_tail_rcu(&css->sibling, &parent_css->children);
         cgroup_idr_replace(&ss->css_idr, css, css->id);
@@@ -5016,30 -4968,47 +5105,30 @@@
                 ss->warned_broken_hierarchy = true;
         }
   
- -      return 0;
+ +      return css;
   
   err_list_del:
         list_del_rcu(&css->sibling);
- -      css_clear_dir(css, NULL);
- -err_free_id:
         cgroup_idr_remove(&ss->css_idr, css->id);
   err_free_percpu_ref:
         percpu_ref_exit(&css->refcnt);
   err_free_css:
         call_rcu(&css->rcu_head, css_free_rcu_fn);
- -      return err;
+ +      return ERR_PTR(err);
   }
   
- -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
- -                      umode_t mode)
+ +static struct cgroup *cgroup_create(struct cgroup *parent)
   {
- -      struct cgroup *parent, *cgrp, *tcgrp;
- -      struct cgroup_root *root;
- -      struct cgroup_subsys *ss;
- -      struct kernfs_node *kn;
- -      int level, ssid, ret;
- -
- -      /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
- -       */
- -      if (strchr(name, '\n'))
- -              return -EINVAL;
- -
- -      parent = cgroup_kn_lock_live(parent_kn);
- -      if (!parent)
- -              return -ENODEV;
- -      root = parent->root;
- -      level = parent->level + 1;
+ +      struct cgroup_root *root = parent->root;
+ +      struct cgroup *cgrp, *tcgrp;
+ +      int level = parent->level + 1;
+ +      int ret;
   
         /* allocate the cgroup and its ID, 0 is reserved for the root */
         cgrp = kzalloc(sizeof(*cgrp) +
                        sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
- -      if (!cgrp) {
- -              ret = -ENOMEM;
- -              goto out_unlock;
- -      }
+ +      if (!cgrp)
+ +              return ERR_PTR(-ENOMEM);
   
         ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
         if (ret)
@@@ -5070,6 -5039,20 +5159,6 @@@
         if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
   
- -      /* create the directory */
- -      kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
- -      if (IS_ERR(kn)) {
- -              ret = PTR_ERR(kn);
- -              goto out_free_id;
- -      }
- -      cgrp->kn = kn;
- -
- -      /*
- -       * This extra ref will be put in cgroup_free_fn() and guarantees
- -       * that @cgrp->kn is always accessible.
- -       */
- -      kernfs_get(kn);
- -
         cgrp->self.serial_nr = css_serial_nr_next++;
   
         /* allocation complete, commit to creation */
@@@ -5083,90 -5066,51 +5172,90 @@@
          */
         cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
   
- -      ret = cgroup_kn_set_ugid(kn);
- -      if (ret)
- -              goto out_destroy;
+ +      /*
+ +       * On the default hierarchy, a child doesn't automatically inherit
+ +       * subtree_control from the parent.  Each is configured manually.
+ +       */
+ +      if (!cgroup_on_dfl(cgrp))
+ +              cgrp->subtree_control = cgroup_control(cgrp);
+ +
+ +      cgroup_propagate_control(cgrp);
   
- -      ret = css_populate_dir(&cgrp->self, NULL);
+ +      /* @cgrp doesn't have dir yet so the following will only create csses */
+ +      ret = cgroup_apply_control_enable(cgrp);
         if (ret)
                 goto out_destroy;
   
- -      /* let's create and online css's */
- -      for_each_subsys(ss, ssid) {
- -              if (parent->child_subsys_mask & (1 << ssid)) {
- -                      ret = create_css(cgrp, ss,
- -                                       parent->subtree_control & (1 << ssid));
- -                      if (ret)
- -                              goto out_destroy;
- -              }
+ +      return cgrp;
+ +
+ +out_cancel_ref:
+ +      percpu_ref_exit(&cgrp->self.refcnt);
+ +out_free_cgrp:
+ +      kfree(cgrp);
+ +      return ERR_PTR(ret);
+ +out_destroy:
+ +      cgroup_destroy_locked(cgrp);
+ +      return ERR_PTR(ret);
+ +}
+ +
+ +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ +                      umode_t mode)
+ +{
+ +      struct cgroup *parent, *cgrp;
+ +      struct kernfs_node *kn;
+ +      int ret;
+ +
+ +      /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
+ +      if (strchr(name, '\n'))
+ +              return -EINVAL;
+ +
+ +      parent = cgroup_kn_lock_live(parent_kn, false);
+ +      if (!parent)
+ +              return -ENODEV;
+ +
+ +      cgrp = cgroup_create(parent);
+ +      if (IS_ERR(cgrp)) {
+ +              ret = PTR_ERR(cgrp);
+ +              goto out_unlock;
+ +      }
+ +
+ +      /* create the directory */
+ +      kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ +      if (IS_ERR(kn)) {
+ +              ret = PTR_ERR(kn);
+ +              goto out_destroy;
         }
+ +      cgrp->kn = kn;
   
         /*
- -       * On the default hierarchy, a child doesn't automatically inherit
- -       * subtree_control from the parent.  Each is configured manually.
+ +       * This extra ref will be put in cgroup_free_fn() and guarantees
+ +       * that @cgrp->kn is always accessible.
          */
- -      if (!cgroup_on_dfl(cgrp)) {
- -              cgrp->subtree_control = parent->subtree_control;
- -              cgroup_refresh_child_subsys_mask(cgrp);
- -      }
+ +      kernfs_get(kn);
   
+ +      ret = cgroup_kn_set_ugid(kn);
+ +      if (ret)
+ +              goto out_destroy;
+ +
+ +      ret = css_populate_dir(&cgrp->self);
+ +      if (ret)
+ +              goto out_destroy;
+ +
+ +      ret = cgroup_apply_control_enable(cgrp);
+ +      if (ret)
+ +              goto out_destroy;
+ +
+ +      /* csses and interface files are set up, activate the directory */
         kernfs_activate(kn);
   
         ret = 0;
         goto out_unlock;
   
- -out_free_id:
- -      cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
- -out_cancel_ref:
- -      percpu_ref_exit(&cgrp->self.refcnt);
- -out_free_cgrp:
- -      kfree(cgrp);
+ +out_destroy:
+ +      cgroup_destroy_locked(cgrp);
   out_unlock:
         cgroup_kn_unlock(parent_kn);
         return ret;
- -
- -out_destroy:
- -      cgroup_destroy_locked(cgrp);
- -      goto out_unlock;
   }
   
   /*
@@@ -5220,7 -5164,7 +5309,7 @@@ static void kill_css(struct cgroup_subs
          * This must happen before css is disassociated with its cgroup.
          * See seq_css() for details.
          */
- -      css_clear_dir(css, NULL);
+ +      css_clear_dir(css);
   
         /*
          * Killing would put the base ref, but we need to keep it alive
@@@ -5269,7 -5213,6 +5358,7 @@@ static int cgroup_destroy_locked(struc
         __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
   {
         struct cgroup_subsys_state *css;
+ +      struct cgrp_cset_link *link;
         int ssid;
   
         lockdep_assert_held(&cgroup_mutex);
@@@ -5290,18 -5233,11 +5379,18 @@@
                 return -EBUSY;
   
         /*
- -       * Mark @cgrp dead.  This prevents further task migration and child
- -       * creation by disabling cgroup_lock_live_group().
+ +       * Mark @cgrp and the associated csets dead.  The former prevents
+ +       * further task migration and child creation by disabling
+ +       * cgroup_lock_live_group().  The latter makes the csets ignored by
+ +       * the migration path.
          */
         cgrp->self.flags &= ~CSS_ONLINE;
   
+ +      spin_lock_bh(&css_set_lock);
+ +      list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ +              link->cset->dead = true;
+ +      spin_unlock_bh(&css_set_lock);
+ +
         /* initiate massacre of all css's */
         for_each_css(css, ssid, cgrp)
                 kill_css(css);
@@@ -5325,7 -5261,7 +5414,7 @@@ static int cgroup_rmdir(struct kernfs_n
         struct cgroup *cgrp;
         int ret = 0;
   
- -      cgrp = cgroup_kn_lock_live(kn);
+ +      cgrp = cgroup_kn_lock_live(kn, false);
         if (!cgrp)
                 return 0;
   
@@@ -5415,7 -5351,7 +5504,7 @@@ int __init cgroup_init_early(void
   
         for_each_subsys(ss, i) {
                 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
- -                   "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
+ +                   "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
                      i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
                      ss->id, ss->name);
                 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
@@@ -5432,7 -5368,7 +5521,7 @@@
         return 0;
   }
   
- -static unsigned long cgroup_disable_mask __initdata;
+ +static u16 cgroup_disable_mask __initdata;
   
   /**
    * cgroup_init - cgroup initialization
@@@ -5443,21 -5379,20 +5532,23 @@@
   int __init cgroup_init(void)
   {
         struct cgroup_subsys *ss;
- -      unsigned long key;
         int ssid;
   
+ +      BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
         BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
         BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
         BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
   
+       get_user_ns(init_cgroup_ns.user_ns);
+ 
         mutex_lock(&cgroup_mutex);
   
- -      /* Add init_css_set to the hash table */
- -      key = css_set_hash(init_css_set.subsys);
- -      hash_add(css_set_table, &init_css_set.hlist, key);
+ +      /*
+ +       * Add init_css_set to the hash table so that dfl_root can link to
+ +       * it during init.
+ +       */
+ +      hash_add(css_set_table, &init_css_set.hlist,
+ +               css_set_hash(init_css_set.subsys));
   
         BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
   
@@@ -5496,10 -5431,8 +5587,10 @@@
   
                 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
   
- -              if (!ss->dfl_cftypes)
- -                      cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
+ +              if (ss->implicit_on_dfl)
+ +                      cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
+ +              else if (!ss->dfl_cftypes)
+ +                      cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
   
                 if (ss->dfl_cftypes == ss->legacy_cftypes) {
                         WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
@@@ -5512,11 -5445,6 +5603,11 @@@
                         ss->bind(init_css_set.subsys[ssid]);
         }
   
+ +      /* init_css_set.subsys[] has been updated, re-hash */
+ +      hash_del(&init_css_set.hlist);
+ +      hash_add(css_set_table, &init_css_set.hlist,
+ +               css_set_hash(init_css_set.subsys));
+ +
         WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
         WARN_ON(register_filesystem(&cgroup_fs_type));
         WARN_ON(register_filesystem(&cgroup2_fs_type));
@@@ -5575,7 -5503,7 +5666,7 @@@ int proc_cgroup_show(struct seq_file *m
                 struct cgroup *cgrp;
                 int ssid, count = 0;
   
- -              if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
+ +              if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
                         continue;
   
                 seq_printf(m, "%d:", root->hierarchy_id);
@@@ -5601,7 -5529,8 +5692,8 @@@
                  * " (deleted)" is appended to the cgroup path.
                  */
                 if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
-                       path = cgroup_path(cgrp, buf, PATH_MAX);
+                       path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
+                                               current->nsproxy->cgroup_ns);
                         if (!path) {
                                 retval = -ENAMETOOLONG;
                                 goto out_unlock;
@@@ -5690,11 -5619,11 +5782,11 @@@ int cgroup_can_fork(struct task_struct 
         struct cgroup_subsys *ss;
         int i, j, ret;
   
- -      for_each_subsys_which(ss, i, &have_canfork_callback) {
+ +      do_each_subsys_mask(ss, i, have_canfork_callback) {
                 ret = ss->can_fork(child);
                 if (ret)
                         goto out_revert;
- -      }
+ +      } while_each_subsys_mask();
   
         return 0;
   
@@@ -5779,9 -5708,8 +5871,9 @@@ void cgroup_post_fork(struct task_struc
          * css_set; otherwise, @child might change state between ->fork()
          * and addition to css_set.
          */
- -      for_each_subsys_which(ss, i, &have_fork_callback)
+ +      do_each_subsys_mask(ss, i, have_fork_callback) {
                 ss->fork(child);
+ +      } while_each_subsys_mask();
   }
   
   /**
@@@ -5824,9 -5752,8 +5916,9 @@@ void cgroup_exit(struct task_struct *ts
         }
   
         /* see cgroup_post_fork() for details */
- -      for_each_subsys_which(ss, i, &have_exit_callback)
+ +      do_each_subsys_mask(ss, i, have_exit_callback) {
                 ss->exit(tsk);
+ +      } while_each_subsys_mask();
   }
   
   void cgroup_free(struct task_struct *task)
@@@ -5835,9 -5762,8 +5927,9 @@@
         struct cgroup_subsys *ss;
         int ssid;
   
- -      for_each_subsys_which(ss, ssid, &have_free_callback)
+ +      do_each_subsys_mask(ss, ssid, have_free_callback) {
                 ss->free(task);
+ +      } while_each_subsys_mask();
   
         put_css_set(cset);
   }
@@@ -5886,7 -5812,9 +5978,9 @@@ static void cgroup_release_agent(struc
         if (!pathbuf || !agentbuf)
                 goto out;
   
-       path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+       spin_lock_bh(&css_set_lock);
+       path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
+       spin_unlock_bh(&css_set_lock);
         if (!path)
                 goto out;
   
@@@ -5941,7 -5869,7 +6035,7 @@@ static int __init cgroup_no_v1(char *st
                         continue;
   
                 if (!strcmp(token, "all")) {
- -                      cgroup_no_v1_mask = ~0UL;
+ +                      cgroup_no_v1_mask = U16_MAX;
                         break;
                 }
   
@@@ -5970,13 -5898,12 +6064,13 @@@ struct cgroup_subsys_state *css_tryget_
                                                        struct cgroup_subsys *ss)
   {
         struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+ +      struct file_system_type *s_type = dentry->d_sb->s_type;
         struct cgroup_subsys_state *css = NULL;
         struct cgroup *cgrp;
   
         /* is @dentry a cgroup dir? */
- -      if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
- -          kernfs_type(kn) != KERNFS_DIR)
+ +      if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
+ +          !kn || kernfs_type(kn) != KERNFS_DIR)
                 return ERR_PTR(-EBADF);
   
         rcu_read_lock();
@@@ -6098,6 -6025,133 +6192,133 @@@ void cgroup_sk_free(struct sock_cgroup_
   
   #endif        /* CONFIG_SOCK_CGROUP_DATA */
   
+ /* cgroup namespaces */
+ 
+ static struct cgroup_namespace *alloc_cgroup_ns(void)
+ {
+       struct cgroup_namespace *new_ns;
+       int ret;
+ 
+       new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+       if (!new_ns)
+               return ERR_PTR(-ENOMEM);
+       ret = ns_alloc_inum(&new_ns->ns);
+       if (ret) {
+               kfree(new_ns);
+               return ERR_PTR(ret);
+       }
+       atomic_set(&new_ns->count, 1);
+       new_ns->ns.ops = &cgroupns_operations;
+       return new_ns;
+ }
+ 
+ void free_cgroup_ns(struct cgroup_namespace *ns)
+ {
+       put_css_set(ns->root_cset);
+       put_user_ns(ns->user_ns);
+       ns_free_inum(&ns->ns);
+       kfree(ns);
+ }
+ EXPORT_SYMBOL(free_cgroup_ns);
+ 
+ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+                                       struct user_namespace *user_ns,
+                                       struct cgroup_namespace *old_ns)
+ {
+       struct cgroup_namespace *new_ns;
+       struct css_set *cset;
+ 
+       BUG_ON(!old_ns);
+ 
+       if (!(flags & CLONE_NEWCGROUP)) {
+               get_cgroup_ns(old_ns);
+               return old_ns;
+       }
+ 
+       /* Allow only sysadmin to create cgroup namespace. */
+       if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+               return ERR_PTR(-EPERM);
+ 
+       mutex_lock(&cgroup_mutex);
+       spin_lock_bh(&css_set_lock);
+ 
+       cset = task_css_set(current);
+       get_css_set(cset);
+ 
+       spin_unlock_bh(&css_set_lock);
+       mutex_unlock(&cgroup_mutex);
+ 
+       new_ns = alloc_cgroup_ns();
+       if (IS_ERR(new_ns)) {
+               put_css_set(cset);
+               return new_ns;
+       }
+ 
+       new_ns->user_ns = get_user_ns(user_ns);
+       new_ns->root_cset = cset;
+ 
+       return new_ns;
+ }
+ 
+ static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
+ {
+       return container_of(ns, struct cgroup_namespace, ns);
+ }
+ 
+ static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+ {
+       struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
+ 
+       if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+           !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+               return -EPERM;
+ 
+       /* Don't need to do anything if we are attaching to our own cgroupns. */
+       if (cgroup_ns == nsproxy->cgroup_ns)
+               return 0;
+ 
+       get_cgroup_ns(cgroup_ns);
+       put_cgroup_ns(nsproxy->cgroup_ns);
+       nsproxy->cgroup_ns = cgroup_ns;
+ 
+       return 0;
+ }
+ 
+ static struct ns_common *cgroupns_get(struct task_struct *task)
+ {
+       struct cgroup_namespace *ns = NULL;
+       struct nsproxy *nsproxy;
+ 
+       task_lock(task);
+       nsproxy = task->nsproxy;
+       if (nsproxy) {
+               ns = nsproxy->cgroup_ns;
+               get_cgroup_ns(ns);
+       }
+       task_unlock(task);
+ 
+       return ns ? &ns->ns : NULL;
+ }
+ 
+ static void cgroupns_put(struct ns_common *ns)
+ {
+       put_cgroup_ns(to_cg_ns(ns));
+ }
+ 
+ const struct proc_ns_operations cgroupns_operations = {
+       .name           = "cgroup",
+       .type           = CLONE_NEWCGROUP,
+       .get            = cgroupns_get,
+       .put            = cgroupns_put,
+       .install        = cgroupns_install,
+ };
+ 
+ static __init int cgroup_namespaces_init(void)
+ {
+       return 0;
+ }
+ subsys_initcall(cgroup_namespaces_init);
+ 
   #ifdef CONFIG_CGROUP_DEBUG
   static struct cgroup_subsys_state *
   debug_css_alloc(struct cgroup_subsys_state *parent_css)
diff --combined kernel/cpuset.c

index 9089983,d393125..00ab5c2
--- 1/kernel/cpuset.c
--- 2/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@@ -2089,7 -2089,7 +2089,7 @@@ struct cgroup_subsys cpuset_cgrp_subsy
         .attach         = cpuset_attach,
         .bind           = cpuset_bind,
         .legacy_cftypes = files,
- -      .early_init     = 1,
+ +      .early_init     = true,
   };
   
   /**
@@@ -2714,10 -2714,10 +2714,10 @@@ int proc_cpuset_show(struct seq_file *m
                 goto out;
   
         retval = -ENAMETOOLONG;
-       rcu_read_lock();
-       css = task_css(tsk, cpuset_cgrp_id);
-       p = cgroup_path(css->cgroup, buf, PATH_MAX);
-       rcu_read_unlock();
+       css = task_get_css(tsk, cpuset_cgrp_id);
+       p = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
+                          current->nsproxy->cgroup_ns);
+       css_put(css);
         if (!p)
                 goto out_free;
         seq_puts(m, p);
diff --combined kernel/fork.c

index accb722,6611a62..5b8d1e7
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -164,20 -164,12 +164,20 @@@ static struct thread_info *alloc_thread
         struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
                                                   THREAD_SIZE_ORDER);
   
+ +      if (page)
+ +              memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
+ +                                          1 << THREAD_SIZE_ORDER);
+ +
         return page ? page_address(page) : NULL;
   }
   
   static inline void free_thread_info(struct thread_info *ti)
   {
- -      free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ +      struct page *page = virt_to_page(ti);
+ +
+ +      memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
+ +                                  -(1 << THREAD_SIZE_ORDER));
+ +      __free_kmem_pages(page, THREAD_SIZE_ORDER);
   }
   # else
   static struct kmem_cache *thread_info_cache;
@@@ -1892,7 -1884,7 +1892,7 @@@ static int check_unshare_flags(unsigne
         if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
                                 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
                                 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
-                               CLONE_NEWUSER|CLONE_NEWPID))
+                               CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
                 return -EINVAL;
         /*
          * Not implemented, but pretend it works if there is nothing
author	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 21 Mar 2016 17:05:13 +0000 (10:05 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 21 Mar 2016 17:05:13 +0000 (10:05 -0700)
		1	2
Documentation/cgroup-v2.txt	patch \|	diff1 \|	diff2 \|	blob \| history
fs/kernfs/dir.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/cgroup.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/cpuset.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history