Merge branch 'for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

author Linus Torvalds <torvalds@linux-foundation.org>

Wed, 13 Jan 2016 03:20:32 +0000 (19:20 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 13 Jan 2016 03:20:32 +0000 (19:20 -0800)
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 13 Jan 2016 03:20:32 +0000 (19:20 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 13 Jan 2016 03:20:32 +0000 (19:20 -0800)
diff --combined include/linux/cgroup-defs.h

index e5f4164,7f334c2..7f540f7
--- 1/include/linux/cgroup-defs.h
--- 2/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@@ -34,17 -34,12 +34,12 @@@ struct seq_file
   
   /* define the enumeration of all cgroup subsystems */
   #define SUBSYS(_x) _x ## _cgrp_id,
- #define SUBSYS_TAG(_t) CGROUP_ ## _t, \
-       __unused_tag_ ## _t = CGROUP_ ## _t - 1,
   enum cgroup_subsys_id {
   #include <linux/cgroup_subsys.h>
         CGROUP_SUBSYS_COUNT,
   };
- #undef SUBSYS_TAG
   #undef SUBSYS
   
- #define CGROUP_CANFORK_COUNT (CGROUP_CANFORK_END - CGROUP_CANFORK_START)
- 
   /* bits in struct cgroup_subsys_state flags field */
   enum {
         CSS_NO_REF      = (1 << 0), /* no reference counting for this css */
@@@ -66,7 -61,6 +61,6 @@@ enum 
   
   /* cgroup_root->flags */
   enum {
-       CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */
         CGRP_ROOT_NOPREFIX      = (1 << 1), /* mounted subsystems have no named prefix */
         CGRP_ROOT_XATTR         = (1 << 2), /* supports extended attributes */
   };
@@@ -439,9 -433,9 +433,9 @@@ struct cgroup_subsys 
         int (*can_attach)(struct cgroup_taskset *tset);
         void (*cancel_attach)(struct cgroup_taskset *tset);
         void (*attach)(struct cgroup_taskset *tset);
-       int (*can_fork)(struct task_struct *task, void **priv_p);
-       void (*cancel_fork)(struct task_struct *task, void *priv);
-       void (*fork)(struct task_struct *task, void *priv);
+       int (*can_fork)(struct task_struct *task);
+       void (*cancel_fork)(struct task_struct *task);
+       void (*fork)(struct task_struct *task);
         void (*exit)(struct task_struct *task);
         void (*free)(struct task_struct *task);
         void (*bind)(struct cgroup_subsys_state *root_css);
@@@ -527,7 -521,6 +521,6 @@@ static inline void cgroup_threadgroup_c
   
   #else /* CONFIG_CGROUPS */
   
- #define CGROUP_CANFORK_COUNT 0
   #define CGROUP_SUBSYS_COUNT 0
   
   static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
@@@ -535,116 -528,4 +528,116 @@@ static inline void cgroup_threadgroup_c
   
   #endif        /* CONFIG_CGROUPS */
   
+ +#ifdef CONFIG_SOCK_CGROUP_DATA
+ +
+ +/*
+ + * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
+ + * per-socket cgroup information except for memcg association.
+ + *
+ + * On legacy hierarchies, net_prio and net_cls controllers directly set
+ + * attributes on each sock which can then be tested by the network layer.
+ + * On the default hierarchy, each sock is associated with the cgroup it was
+ + * created in and the networking layer can match the cgroup directly.
+ + *
+ + * To avoid carrying all three cgroup related fields separately in sock,
+ + * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer.
+ + * On boot, sock_cgroup_data records the cgroup that the sock was created
+ + * in so that cgroup2 matches can be made; however, once either net_prio or
+ + * net_cls starts being used, the area is overridden to carry prioidx and/or
+ + * classid.  The two modes are distinguished by whether the lowest bit is
+ + * set.  Clear bit indicates cgroup pointer while set bit prioidx and
+ + * classid.
+ + *
+ + * While userland may start using net_prio or net_cls at any time, once
+ + * either is used, cgroup2 matching no longer works.  There is no reason to
+ + * mix the two and this is in line with how legacy and v2 compatibility is
+ + * handled.  On mode switch, cgroup references which are already being
+ + * pointed to by socks may be leaked.  While this can be remedied by adding
+ + * synchronization around sock_cgroup_data, given that the number of leaked
+ + * cgroups is bound and highly unlikely to be high, this seems to be the
+ + * better trade-off.
+ + */
+ +struct sock_cgroup_data {
+ +      union {
+ +#ifdef __LITTLE_ENDIAN
+ +              struct {
+ +                      u8      is_data;
+ +                      u8      padding;
+ +                      u16     prioidx;
+ +                      u32     classid;
+ +              } __packed;
+ +#else
+ +              struct {
+ +                      u32     classid;
+ +                      u16     prioidx;
+ +                      u8      padding;
+ +                      u8      is_data;
+ +              } __packed;
+ +#endif
+ +              u64             val;
+ +      };
+ +};
+ +
+ +/*
+ + * There's a theoretical window where the following accessors race with
+ + * updaters and return part of the previous pointer as the prioidx or
+ + * classid.  Such races are short-lived and the result isn't critical.
+ + */
+ +static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd)
+ +{
+ +      /* fallback to 1 which is always the ID of the root cgroup */
+ +      return (skcd->is_data & 1) ? skcd->prioidx : 1;
+ +}
+ +
+ +static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd)
+ +{
+ +      /* fallback to 0 which is the unconfigured default classid */
+ +      return (skcd->is_data & 1) ? skcd->classid : 0;
+ +}
+ +
+ +/*
+ + * If invoked concurrently, the updaters may clobber each other.  The
+ + * caller is responsible for synchronization.
+ + */
+ +static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
+ +                                         u16 prioidx)
+ +{
+ +      struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
+ +
+ +      if (sock_cgroup_prioidx(&skcd_buf) == prioidx)
+ +              return;
+ +
+ +      if (!(skcd_buf.is_data & 1)) {
+ +              skcd_buf.val = 0;
+ +              skcd_buf.is_data = 1;
+ +      }
+ +
+ +      skcd_buf.prioidx = prioidx;
+ +      WRITE_ONCE(skcd->val, skcd_buf.val);    /* see sock_cgroup_ptr() */
+ +}
+ +
+ +static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
+ +                                         u32 classid)
+ +{
+ +      struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
+ +
+ +      if (sock_cgroup_classid(&skcd_buf) == classid)
+ +              return;
+ +
+ +      if (!(skcd_buf.is_data & 1)) {
+ +              skcd_buf.val = 0;
+ +              skcd_buf.is_data = 1;
+ +      }
+ +
+ +      skcd_buf.classid = classid;
+ +      WRITE_ONCE(skcd->val, skcd_buf.val);    /* see sock_cgroup_ptr() */
+ +}
+ +
+ +#else /* CONFIG_SOCK_CGROUP_DATA */
+ +
+ +struct sock_cgroup_data {
+ +};
+ +
+ +#endif        /* CONFIG_SOCK_CGROUP_DATA */
+ +
   #endif        /* _LINUX_CGROUP_DEFS_H */
diff --combined include/linux/cgroup.h

index 322a284,9d70b48..2162dca
--- 1/include/linux/cgroup.h
--- 2/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@@ -97,12 -97,9 +97,9 @@@ int proc_cgroup_show(struct seq_file *m
                      struct pid *pid, struct task_struct *tsk);
   
   void cgroup_fork(struct task_struct *p);
- extern int cgroup_can_fork(struct task_struct *p,
-                          void *ss_priv[CGROUP_CANFORK_COUNT]);
- extern void cgroup_cancel_fork(struct task_struct *p,
-                              void *ss_priv[CGROUP_CANFORK_COUNT]);
- extern void cgroup_post_fork(struct task_struct *p,
-                            void *old_ss_priv[CGROUP_CANFORK_COUNT]);
+ extern int cgroup_can_fork(struct task_struct *p);
+ extern void cgroup_cancel_fork(struct task_struct *p);
+ extern void cgroup_post_fork(struct task_struct *p);
   void cgroup_exit(struct task_struct *p);
   void cgroup_free(struct task_struct *p);
   
@@@ -562,13 -559,9 +559,9 @@@ static inline int cgroupstats_build(str
                                     struct dentry *dentry) { return -EINVAL; }
   
   static inline void cgroup_fork(struct task_struct *p) {}
- static inline int cgroup_can_fork(struct task_struct *p,
-                                 void *ss_priv[CGROUP_CANFORK_COUNT])
- { return 0; }
- static inline void cgroup_cancel_fork(struct task_struct *p,
-                                     void *ss_priv[CGROUP_CANFORK_COUNT]) {}
- static inline void cgroup_post_fork(struct task_struct *p,
-                                   void *ss_priv[CGROUP_CANFORK_COUNT]) {}
+ static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
+ static inline void cgroup_cancel_fork(struct task_struct *p) {}
+ static inline void cgroup_post_fork(struct task_struct *p) {}
   static inline void cgroup_exit(struct task_struct *p) {}
   static inline void cgroup_free(struct task_struct *p) {}
   
@@@ -577,45 -570,4 +570,45 @@@ static inline int cgroup_init(void) { r
   
   #endif /* !CONFIG_CGROUPS */
   
+ +/*
+ + * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
+ + * definition in cgroup-defs.h.
+ + */
+ +#ifdef CONFIG_SOCK_CGROUP_DATA
+ +
+ +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+ +extern spinlock_t cgroup_sk_update_lock;
+ +#endif
+ +
+ +void cgroup_sk_alloc_disable(void);
+ +void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
+ +void cgroup_sk_free(struct sock_cgroup_data *skcd);
+ +
+ +static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
+ +{
+ +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+ +      unsigned long v;
+ +
+ +      /*
+ +       * @skcd->val is 64bit but the following is safe on 32bit too as we
+ +       * just need the lower ulong to be written and read atomically.
+ +       */
+ +      v = READ_ONCE(skcd->val);
+ +
+ +      if (v & 1)
+ +              return &cgrp_dfl_root.cgrp;
+ +
+ +      return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp;
+ +#else
+ +      return (struct cgroup *)(unsigned long)skcd->val;
+ +#endif
+ +}
+ +
+ +#else /* CONFIG_SOCK_CGROUP_DATA */
+ +
+ +static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {}
+ +static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}
+ +
+ +#endif        /* CONFIG_SOCK_CGROUP_DATA */
+ +
   #endif /* _LINUX_CGROUP_H */
diff --combined init/Kconfig

index 235c7a2,f8754f5..5481b49
--- 1/init/Kconfig
--- 2/init/Kconfig
+++ b/init/Kconfig
@@@ -940,95 -940,24 +940,24 @@@ menuconfig CGROUP
   
   if CGROUPS
   
- config CGROUP_DEBUG
-       bool "Example debug cgroup subsystem"
-       default n
-       help
-         This option enables a simple cgroup subsystem that
-         exports useful debugging information about the cgroups
-         framework.
- 
-         Say N if unsure.
- 
- config CGROUP_FREEZER
-       bool "Freezer cgroup subsystem"
-       help
-         Provides a way to freeze and unfreeze all tasks in a
-         cgroup.
- 
- config CGROUP_PIDS
-       bool "PIDs cgroup subsystem"
-       help
-         Provides enforcement of process number limits in the scope of a
-         cgroup. Any attempt to fork more processes than is allowed in the
-         cgroup will fail. PIDs are fundamentally a global resource because it
-         is fairly trivial to reach PID exhaustion before you reach even a
-         conservative kmemcg limit. As a result, it is possible to grind a
-         system to halt without being limited by other cgroup policies. The
-         PIDs cgroup subsystem is designed to stop this from happening.
- 
-         It should be noted that organisational operations (such as attaching
-         to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
-         since the PIDs limit only affects a process's ability to fork, not to
-         attach to a cgroup.
- 
- config CGROUP_DEVICE
-       bool "Device controller for cgroups"
-       help
-         Provides a cgroup implementing whitelists for devices which
-         a process in the cgroup can mknod or open.
- 
- config CPUSETS
-       bool "Cpuset support"
-       help
-         This option will let you create and manage CPUSETs which
-         allow dynamically partitioning a system into sets of CPUs and
-         Memory Nodes and assigning tasks to run only within those sets.
-         This is primarily useful on large SMP or NUMA systems.
- 
-         Say N if unsure.
- 
- config PROC_PID_CPUSET
-       bool "Include legacy /proc/<pid>/cpuset file"
-       depends on CPUSETS
-       default y
- 
- config CGROUP_CPUACCT
-       bool "Simple CPU accounting cgroup subsystem"
-       help
-         Provides a simple Resource Controller for monitoring the
-         total CPU consumed by the tasks in a cgroup.
- 
   config PAGE_COUNTER
          bool
   
   config MEMCG
-       bool "Memory Resource Controller for Control Groups"
+       bool "Memory controller"
         select PAGE_COUNTER
         select EVENTFD
         help
-         Provides a memory resource controller that manages both anonymous
-         memory and page cache. (See Documentation/cgroups/memory.txt)
+         Provides control over the memory footprint of tasks in a cgroup.
   
   config MEMCG_SWAP
-       bool "Memory Resource Controller Swap Extension"
+       bool "Swap controller"
         depends on MEMCG && SWAP
         help
-         Add swap management feature to memory resource controller. When you
-         enable this, you can limit mem+swap usage per cgroup. In other words,
-         when you disable this, memory resource controller has no cares to
-         usage of swap...a process can exhaust all of the swap. This extension
-         is useful when you want to avoid exhaustion swap but this itself
-         adds more overheads and consumes memory for remembering information.
-         Especially if you use 32bit system or small memory system, please
-         be careful about enabling this. When memory resource controller
-         is disabled by boot option, this will be automatically disabled and
-         there will be no overhead from this. Even when you set this config=y,
-         if boot option "swapaccount=0" is set, swap will not be accounted.
-         Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
-         size is 4096bytes, 512k per 1Gbytes of swap.
+         Provides control over the swap space consumed by tasks in a cgroup.
+ 
   config MEMCG_SWAP_ENABLED
-       bool "Memory Resource Controller Swap Extension enabled by default"
+       bool "Swap controller enabled by default"
         depends on MEMCG_SWAP
         default y
         help
@@@ -1052,34 -981,43 +981,43 @@@ config MEMCG_KME
           the kmem extension can use it to guarantee that no group of processes
           will ever exhaust kernel resources alone.
   
- config CGROUP_HUGETLB
-       bool "HugeTLB Resource Controller for Control Groups"
-       depends on HUGETLB_PAGE
-       select PAGE_COUNTER
+ config BLK_CGROUP
+       bool "IO controller"
+       depends on BLOCK
         default n
-       help
-         Provides a cgroup Resource Controller for HugeTLB pages.
-         When you enable this, you can put a per cgroup limit on HugeTLB usage.
-         The limit is enforced during page fault. Since HugeTLB doesn't
-         support page reclaim, enforcing the limit at page fault time implies
-         that, the application will get SIGBUS signal if it tries to access
-         HugeTLB pages beyond its limit. This requires the application to know
-         beforehand how much HugeTLB pages it would require for its use. The
-         control group is tracked in the third page lru pointer. This means
-         that we cannot use the controller with huge page less than 3 pages.
+       ---help---
+       Generic block IO controller cgroup interface. This is the common
+       cgroup interface which should be used by various IO controlling
+       policies.
   
- config CGROUP_PERF
-       bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
-       depends on PERF_EVENTS && CGROUPS
-       help
-         This option extends the per-cpu mode to restrict monitoring to
-         threads which belong to the cgroup specified and run on the
-         designated cpu.
+       Currently, CFQ IO scheduler uses it to recognize task groups and
+       control disk bandwidth allocation (proportional time slice allocation)
+       to such task groups. It is also used by bio throttling logic in
+       block layer to implement upper limit in IO rates on a device.
   
-         Say N if unsure.
+       This option only enables generic Block IO controller infrastructure.
+       One needs to also enable actual IO controlling logic/policy. For
+       enabling proportional weight division of disk bandwidth in CFQ, set
+       CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
+       CONFIG_BLK_DEV_THROTTLING=y.
+ 
+       See Documentation/cgroups/blkio-controller.txt for more information.
+ 
+ config DEBUG_BLK_CGROUP
+       bool "IO controller debugging"
+       depends on BLK_CGROUP
+       default n
+       ---help---
+       Enable some debugging help. Currently it exports additional stat
+       files in a cgroup which can be useful for debugging.
+ 
+ config CGROUP_WRITEBACK
+       bool
+       depends on MEMCG && BLK_CGROUP
+       default y
   
   menuconfig CGROUP_SCHED
-       bool "Group CPU scheduler"
+       bool "CPU controller"
         default n
         help
           This feature lets CPU scheduler recognize task groups and control CPU
@@@ -1116,40 -1054,89 +1054,89 @@@ config RT_GROUP_SCHE
   
   endif #CGROUP_SCHED
   
- config BLK_CGROUP
-       bool "Block IO controller"
-       depends on BLOCK
+ config CGROUP_PIDS
+       bool "PIDs controller"
+       help
+         Provides enforcement of process number limits in the scope of a
+         cgroup. Any attempt to fork more processes than is allowed in the
+         cgroup will fail. PIDs are fundamentally a global resource because it
+         is fairly trivial to reach PID exhaustion before you reach even a
+         conservative kmemcg limit. As a result, it is possible to grind a
+         system to halt without being limited by other cgroup policies. The
+         PIDs cgroup subsystem is designed to stop this from happening.
+ 
+         It should be noted that organisational operations (such as attaching
+         to a cgroup hierarchy) will *not* be blocked by the PIDs subsystem,
+         since the PIDs limit only affects a process's ability to fork, not to
+         attach to a cgroup.
+ 
+ config CGROUP_FREEZER
+       bool "Freezer controller"
+       help
+         Provides a way to freeze and unfreeze all tasks in a
+         cgroup.
+ 
+ config CGROUP_HUGETLB
+       bool "HugeTLB controller"
+       depends on HUGETLB_PAGE
+       select PAGE_COUNTER
         default n
-       ---help---
-       Generic block IO controller cgroup interface. This is the common
-       cgroup interface which should be used by various IO controlling
-       policies.
+       help
+         Provides a cgroup controller for HugeTLB pages.
+         When you enable this, you can put a per cgroup limit on HugeTLB usage.
+         The limit is enforced during page fault. Since HugeTLB doesn't
+         support page reclaim, enforcing the limit at page fault time implies
+         that, the application will get SIGBUS signal if it tries to access
+         HugeTLB pages beyond its limit. This requires the application to know
+         beforehand how much HugeTLB pages it would require for its use. The
+         control group is tracked in the third page lru pointer. This means
+         that we cannot use the controller with huge page less than 3 pages.
   
-       Currently, CFQ IO scheduler uses it to recognize task groups and
-       control disk bandwidth allocation (proportional time slice allocation)
-       to such task groups. It is also used by bio throttling logic in
-       block layer to implement upper limit in IO rates on a device.
+ config CPUSETS
+       bool "Cpuset controller"
+       help
+         This option will let you create and manage CPUSETs which
+         allow dynamically partitioning a system into sets of CPUs and
+         Memory Nodes and assigning tasks to run only within those sets.
+         This is primarily useful on large SMP or NUMA systems.
   
-       This option only enables generic Block IO controller infrastructure.
-       One needs to also enable actual IO controlling logic/policy. For
-       enabling proportional weight division of disk bandwidth in CFQ, set
-       CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
-       CONFIG_BLK_DEV_THROTTLING=y.
+         Say N if unsure.
   
-       See Documentation/cgroups/blkio-controller.txt for more information.
+ config PROC_PID_CPUSET
+       bool "Include legacy /proc/<pid>/cpuset file"
+       depends on CPUSETS
+       default y
   
- config DEBUG_BLK_CGROUP
-       bool "Enable Block IO controller debugging"
-       depends on BLK_CGROUP
+ config CGROUP_DEVICE
+       bool "Device controller"
+       help
+         Provides a cgroup controller implementing whitelists for
+         devices which a process in the cgroup can mknod or open.
+ 
+ config CGROUP_CPUACCT
+       bool "Simple CPU accounting controller"
+       help
+         Provides a simple controller for monitoring the
+         total CPU consumed by the tasks in a cgroup.
+ 
+ config CGROUP_PERF
+       bool "Perf controller"
+       depends on PERF_EVENTS
+       help
+         This option extends the perf per-cpu mode to restrict monitoring
+         to threads which belong to the cgroup specified and run on the
+         designated cpu.
+ 
+         Say N if unsure.
+ 
+ config CGROUP_DEBUG
+       bool "Example controller"
         default n
-       ---help---
-       Enable some debugging help. Currently it exports additional stat
-       files in a cgroup which can be useful for debugging.
+       help
+         This option enables a simple controller that exports
+         debugging information about the cgroups framework.
   
- config CGROUP_WRITEBACK
-       bool
-       depends on MEMCG && BLK_CGROUP
-       default y
+         Say N.
   
   endif # CGROUPS
   
@@@ -2030,6 -2017,13 +2017,6 @@@ config INIT_ALL_POSSIBL
           it was better to provide this option than to break all the archs
           and have several arch maintainers pursuing me down dark alleys.
   
- -config STOP_MACHINE
- -      bool
- -      default y
- -      depends on (SMP && MODULE_UNLOAD) || HOTPLUG_CPU
- -      help
- -        Need stop_machine() primitive.
- -
   source "block/Kconfig"
   
   config PREEMPT_NOTIFIERS
diff --combined kernel/cgroup.c

index fe95970,effb636..c03a640
--- 1/kernel/cgroup.c
--- 2/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@@ -57,8 -57,8 +57,8 @@@
   #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
   #include <linux/kthread.h>
   #include <linux/delay.h>
- -
   #include <linux/atomic.h>
+ +#include <net/sock.h>
   
   /*
    * pidlists linger the following amount before being destroyed.  The goal
@@@ -211,6 -211,7 +211,7 @@@ static unsigned long have_free_callbac
   /* Ditto for the can_fork callback. */
   static unsigned long have_canfork_callback __read_mostly;
   
+ static struct file_system_type cgroup2_fs_type;
   static struct cftype cgroup_dfl_base_files[];
   static struct cftype cgroup_legacy_base_files[];
   
@@@ -1623,10 -1624,6 +1624,6 @@@ static int parse_cgroupfs_options(char 
                         all_ss = true;
                         continue;
                 }
-               if (!strcmp(token, "__DEVEL__sane_behavior")) {
-                       opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
-                       continue;
-               }
                 if (!strcmp(token, "noprefix")) {
                         opts->flags |= CGRP_ROOT_NOPREFIX;
                         continue;
@@@ -1693,15 -1690,6 +1690,6 @@@
                         return -ENOENT;
         }
   
-       if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
-               pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
-               if (nr_opts != 1) {
-                       pr_err("sane_behavior: no other mount options allowed\n");
-                       return -EINVAL;
-               }
-               return 0;
-       }
- 
         /*
          * If the 'all' option was specified select all the subsystems,
          * otherwise if 'none', 'name=' and a subsystem name options were
@@@ -1981,6 -1969,7 +1969,7 @@@ static struct dentry *cgroup_mount(stru
                          int flags, const char *unused_dev_name,
                          void *data)
   {
+       bool is_v2 = fs_type == &cgroup2_fs_type;
         struct super_block *pinned_sb = NULL;
         struct cgroup_subsys *ss;
         struct cgroup_root *root;
@@@ -1997,6 -1986,17 +1986,17 @@@
         if (!use_task_css_set_links)
                 cgroup_enable_task_cg_lists();
   
+       if (is_v2) {
+               if (data) {
+                       pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+                       return ERR_PTR(-EINVAL);
+               }
+               cgrp_dfl_root_visible = true;
+               root = &cgrp_dfl_root;
+               cgroup_get(&root->cgrp);
+               goto out_mount;
+       }
+ 
         mutex_lock(&cgroup_mutex);
   
         /* First find the desired set of subsystems */
@@@ -2004,15 -2004,6 +2004,6 @@@
         if (ret)
                 goto out_unlock;
   
-       /* look for a matching existing root */
-       if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
-               cgrp_dfl_root_visible = true;
-               root = &cgrp_dfl_root;
-               cgroup_get(&root->cgrp);
-               ret = 0;
-               goto out_unlock;
-       }
- 
         /*
          * Destruction of cgroup root is asynchronous, so subsystems may
          * still be dying after the previous unmount.  Let's drain the
@@@ -2123,9 -2114,10 +2114,10 @@@ out_free
   
         if (ret)
                 return ERR_PTR(ret);
- 
+ out_mount:
         dentry = kernfs_mount(fs_type, flags, root->kf_root,
-                               CGROUP_SUPER_MAGIC, &new_sb);
+                             is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
+                             &new_sb);
         if (IS_ERR(dentry) || !new_sb)
                 cgroup_put(&root->cgrp);
   
@@@ -2168,6 -2160,12 +2160,12 @@@ static struct file_system_type cgroup_f
         .kill_sb = cgroup_kill_sb,
   };
   
+ static struct file_system_type cgroup2_fs_type = {
+       .name = "cgroup2",
+       .mount = cgroup_mount,
+       .kill_sb = cgroup_kill_sb,
+ };
+ 
   /**
    * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
    * @task: target task
@@@ -4039,7 -4037,7 +4037,7 @@@ int cgroup_transfer_tasks(struct cgrou
                 goto out_err;
   
         /*
-        * Migrate tasks one-by-one until @form is empty.  This fails iff
+        * Migrate tasks one-by-one until @from is empty.  This fails iff
          * ->can_attach() fails.
          */
         do {
@@@ -5171,7 -5169,7 +5169,7 @@@ static void __init cgroup_init_subsys(s
   {
         struct cgroup_subsys_state *css;
   
-       printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
+       pr_debug("Initializing cgroup subsys %s\n", ss->name);
   
         mutex_lock(&cgroup_mutex);
   
@@@ -5329,6 -5327,7 +5327,7 @@@ int __init cgroup_init(void
   
         WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
         WARN_ON(register_filesystem(&cgroup_fs_type));
+       WARN_ON(register_filesystem(&cgroup2_fs_type));
         WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
   
         return 0;
@@@ -5472,19 -5471,6 +5471,6 @@@ static const struct file_operations pro
         .release = single_release,
   };
   
- static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
- {
-       if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
-               return &ss_priv[i - CGROUP_CANFORK_START];
-       return NULL;
- }
- 
- static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
- {
-       void **private = subsys_canfork_priv_p(ss_priv, i);
-       return private ? *private : NULL;
- }
- 
   /**
    * cgroup_fork - initialize cgroup related fields during copy_process()
    * @child: pointer to task_struct of forking parent process.
@@@ -5507,14 -5493,13 +5493,13 @@@ void cgroup_fork(struct task_struct *ch
    * returns an error, the fork aborts with that error code. This allows for
    * a cgroup subsystem to conditionally allow or deny new forks.
    */
- int cgroup_can_fork(struct task_struct *child,
-                   void *ss_priv[CGROUP_CANFORK_COUNT])
+ int cgroup_can_fork(struct task_struct *child)
   {
         struct cgroup_subsys *ss;
         int i, j, ret;
   
         for_each_subsys_which(ss, i, &have_canfork_callback) {
-               ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
+               ret = ss->can_fork(child);
                 if (ret)
                         goto out_revert;
         }
@@@ -5526,7 -5511,7 +5511,7 @@@ out_revert
                 if (j >= i)
                         break;
                 if (ss->cancel_fork)
-                       ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
+                       ss->cancel_fork(child);
         }
   
         return ret;
@@@ -5539,15 -5524,14 +5524,14 @@@
    * This calls the cancel_fork() callbacks if a fork failed *after*
    * cgroup_can_fork() succeeded.
    */
- void cgroup_cancel_fork(struct task_struct *child,
-                       void *ss_priv[CGROUP_CANFORK_COUNT])
+ void cgroup_cancel_fork(struct task_struct *child)
   {
         struct cgroup_subsys *ss;
         int i;
   
         for_each_subsys(ss, i)
                 if (ss->cancel_fork)
-                       ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
+                       ss->cancel_fork(child);
   }
   
   /**
@@@ -5560,8 -5544,7 +5544,7 @@@
    * cgroup_task_iter_start() - to guarantee that the new task ends up on its
    * list.
    */
- void cgroup_post_fork(struct task_struct *child,
-                     void *old_ss_priv[CGROUP_CANFORK_COUNT])
+ void cgroup_post_fork(struct task_struct *child)
   {
         struct cgroup_subsys *ss;
         int i;
@@@ -5605,7 -5588,7 +5588,7 @@@
          * and addition to css_set.
          */
         for_each_subsys_which(ss, i, &have_fork_callback)
-               ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
+               ss->fork(child);
   }
   
   /**
@@@ -5839,59 -5822,6 +5822,59 @@@ struct cgroup *cgroup_get_from_path(con
   }
   EXPORT_SYMBOL_GPL(cgroup_get_from_path);
   
+ +/*
+ + * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
+ + * definition in cgroup-defs.h.
+ + */
+ +#ifdef CONFIG_SOCK_CGROUP_DATA
+ +
+ +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+ +
+ +DEFINE_SPINLOCK(cgroup_sk_update_lock);
+ +static bool cgroup_sk_alloc_disabled __read_mostly;
+ +
+ +void cgroup_sk_alloc_disable(void)
+ +{
+ +      if (cgroup_sk_alloc_disabled)
+ +              return;
+ +      pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
+ +      cgroup_sk_alloc_disabled = true;
+ +}
+ +
+ +#else
+ +
+ +#define cgroup_sk_alloc_disabled      false
+ +
+ +#endif
+ +
+ +void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
+ +{
+ +      if (cgroup_sk_alloc_disabled)
+ +              return;
+ +
+ +      rcu_read_lock();
+ +
+ +      while (true) {
+ +              struct css_set *cset;
+ +
+ +              cset = task_css_set(current);
+ +              if (likely(cgroup_tryget(cset->dfl_cgrp))) {
+ +                      skcd->val = (unsigned long)cset->dfl_cgrp;
+ +                      break;
+ +              }
+ +              cpu_relax();
+ +      }
+ +
+ +      rcu_read_unlock();
+ +}
+ +
+ +void cgroup_sk_free(struct sock_cgroup_data *skcd)
+ +{
+ +      cgroup_put(sock_cgroup_ptr(skcd));
+ +}
+ +
+ +#endif        /* CONFIG_SOCK_CGROUP_DATA */
+ +
   #ifdef CONFIG_CGROUP_DEBUG
   static struct cgroup_subsys_state *
   debug_css_alloc(struct cgroup_subsys_state *parent_css)
diff --combined kernel/fork.c

index 291b08c,ba7d1c0..6774e6b
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -380,7 -380,6 +380,7 @@@ static struct task_struct *dup_task_str
   #endif
         tsk->splice_pipe = NULL;
         tsk->task_frag.page = NULL;
+ +      tsk->wake_q.next = NULL;
   
         account_kernel_stack(ti, 1);
   
@@@ -1250,7 -1249,6 +1250,6 @@@ static struct task_struct *copy_process
   {
         int retval;
         struct task_struct *p;
-       void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
   
         if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                 return ERR_PTR(-EINVAL);
@@@ -1349,9 -1347,9 +1348,9 @@@
         prev_cputime_init(&p->prev_cputime);
   
   #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- -      seqlock_init(&p->vtime_seqlock);
+ +      seqcount_init(&p->vtime_seqcount);
         p->vtime_snap = 0;
- -      p->vtime_snap_whence = VTIME_SLEEPING;
+ +      p->vtime_snap_whence = VTIME_INACTIVE;
   #endif
   
   #if defined(SPLIT_RSS_COUNTING)
@@@ -1527,7 -1525,7 +1526,7 @@@
          * between here and cgroup_post_fork() if an organisation operation is in
          * progress.
          */
-       retval = cgroup_can_fork(p, cgrp_ss_priv);
+       retval = cgroup_can_fork(p);
         if (retval)
                 goto bad_fork_free_pid;
   
@@@ -1609,7 -1607,7 +1608,7 @@@
         write_unlock_irq(&tasklist_lock);
   
         proc_fork_connector(p);
-       cgroup_post_fork(p, cgrp_ss_priv);
+       cgroup_post_fork(p);
         threadgroup_change_end(current);
         perf_event_fork(p);
   
@@@ -1619,7 -1617,7 +1618,7 @@@
         return p;
   
   bad_fork_cancel_cgroup:
-       cgroup_cancel_fork(p, cgrp_ss_priv);
+       cgroup_cancel_fork(p);
   bad_fork_free_pid:
         if (pid != &init_struct_pid)
                 free_pid(pid);
diff --combined kernel/sched/core.c

index 77d97a6,b7d2271..44253ad
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -731,7 -731,7 +731,7 @@@ bool sched_can_stop_tick(void
         if (current->policy == SCHED_RR) {
                 struct sched_rt_entity *rt_se = &current->rt;
   
- -              return rt_se->run_list.prev == rt_se->run_list.next;
+ +              return list_is_singular(&rt_se->run_list);
         }
   
         /*
@@@ -823,8 -823,8 +823,8 @@@ static void set_load_weight(struct task
                 return;
         }
   
- -      load->weight = scale_load(prio_to_weight[prio]);
- -      load->inv_weight = prio_to_wmult[prio];
+ +      load->weight = scale_load(sched_prio_to_weight[prio]);
+ +      load->inv_weight = sched_prio_to_wmult[prio];
   }
   
   static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@@ -1071,8 -1071,8 +1071,8 @@@ static struct rq *move_queued_task(stru
   {
         lockdep_assert_held(&rq->lock);
   
- -      dequeue_task(rq, p, 0);
         p->on_rq = TASK_ON_RQ_MIGRATING;
+ +      dequeue_task(rq, p, 0);
         set_task_cpu(p, new_cpu);
         raw_spin_unlock(&rq->lock);
   
@@@ -1080,8 -1080,8 +1080,8 @@@
   
         raw_spin_lock(&rq->lock);
         BUG_ON(task_cpu(p) != new_cpu);
- -      p->on_rq = TASK_ON_RQ_QUEUED;
         enqueue_task(rq, p, 0);
+ +      p->on_rq = TASK_ON_RQ_QUEUED;
         check_preempt_curr(rq, p, 0);
   
         return rq;
@@@ -1274,15 -1274,6 +1274,15 @@@ void set_task_cpu(struct task_struct *p
         WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
                         !p->on_rq);
   
+ +      /*
+ +       * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
+ +       * because schedstat_wait_{start,end} rebase migrating task's wait_start
+ +       * time relying on p->on_rq.
+ +       */
+ +      WARN_ON_ONCE(p->state == TASK_RUNNING &&
+ +                   p->sched_class == &fair_sched_class &&
+ +                   (p->on_rq && !task_on_rq_migrating(p)));
+ +
   #ifdef CONFIG_LOCKDEP
         /*
          * The caller should hold either p->pi_lock or rq->lock, when changing
@@@ -1319,11 -1310,9 +1319,11 @@@ static void __migrate_swap_task(struct 
                 src_rq = task_rq(p);
                 dst_rq = cpu_rq(cpu);
   
+ +              p->on_rq = TASK_ON_RQ_MIGRATING;
                 deactivate_task(src_rq, p, 0);
                 set_task_cpu(p, cpu);
                 activate_task(dst_rq, p, 0);
+ +              p->on_rq = TASK_ON_RQ_QUEUED;
                 check_preempt_curr(dst_rq, p, 0);
         } else {
                 /*
@@@ -1916,97 -1905,6 +1916,97 @@@ static void ttwu_queue(struct task_stru
         raw_spin_unlock(&rq->lock);
   }
   
+ +/*
+ + * Notes on Program-Order guarantees on SMP systems.
+ + *
+ + *  MIGRATION
+ + *
+ + * The basic program-order guarantee on SMP systems is that when a task [t]
+ + * migrates, all its activity on its old cpu [c0] happens-before any subsequent
+ + * execution on its new cpu [c1].
+ + *
+ + * For migration (of runnable tasks) this is provided by the following means:
+ + *
+ + *  A) UNLOCK of the rq(c0)->lock scheduling out task t
+ + *  B) migration for t is required to synchronize *both* rq(c0)->lock and
+ + *     rq(c1)->lock (if not at the same time, then in that order).
+ + *  C) LOCK of the rq(c1)->lock scheduling in task t
+ + *
+ + * Transitivity guarantees that B happens after A and C after B.
+ + * Note: we only require RCpc transitivity.
+ + * Note: the cpu doing B need not be c0 or c1
+ + *
+ + * Example:
+ + *
+ + *   CPU0            CPU1            CPU2
+ + *
+ + *   LOCK rq(0)->lock
+ + *   sched-out X
+ + *   sched-in Y
+ + *   UNLOCK rq(0)->lock
+ + *
+ + *                                   LOCK rq(0)->lock // orders against CPU0
+ + *                                   dequeue X
+ + *                                   UNLOCK rq(0)->lock
+ + *
+ + *                                   LOCK rq(1)->lock
+ + *                                   enqueue X
+ + *                                   UNLOCK rq(1)->lock
+ + *
+ + *                   LOCK rq(1)->lock // orders against CPU2
+ + *                   sched-out Z
+ + *                   sched-in X
+ + *                   UNLOCK rq(1)->lock
+ + *
+ + *
+ + *  BLOCKING -- aka. SLEEP + WAKEUP
+ + *
+ + * For blocking we (obviously) need to provide the same guarantee as for
+ + * migration. However the means are completely different as there is no lock
+ + * chain to provide order. Instead we do:
+ + *
+ + *   1) smp_store_release(X->on_cpu, 0)
+ + *   2) smp_cond_acquire(!X->on_cpu)
+ + *
+ + * Example:
+ + *
+ + *   CPU0 (schedule)  CPU1 (try_to_wake_up) CPU2 (schedule)
+ + *
+ + *   LOCK rq(0)->lock LOCK X->pi_lock
+ + *   dequeue X
+ + *   sched-out X
+ + *   smp_store_release(X->on_cpu, 0);
+ + *
+ + *                    smp_cond_acquire(!X->on_cpu);
+ + *                    X->state = WAKING
+ + *                    set_task_cpu(X,2)
+ + *
+ + *                    LOCK rq(2)->lock
+ + *                    enqueue X
+ + *                    X->state = RUNNING
+ + *                    UNLOCK rq(2)->lock
+ + *
+ + *                                          LOCK rq(2)->lock // orders against CPU1
+ + *                                          sched-out Z
+ + *                                          sched-in X
+ + *                                          UNLOCK rq(2)->lock
+ + *
+ + *                    UNLOCK X->pi_lock
+ + *   UNLOCK rq(0)->lock
+ + *
+ + *
+ + * However; for wakeups there is a second guarantee we must provide, namely we
+ + * must observe the state that led to our wakeup. That is, not only must our
+ + * task observe its own prior state, it must also observe the stores prior to
+ + * its wakeup.
+ + *
+ + * This means that any means of doing remote wakeups must order the CPU doing
+ + * the wakeup against the CPU the task is going to end up running on. This,
+ + * however, is already required for the regular Program-Order guarantee above,
+ + * since the waking CPU is the one issuing the ACQUIRE (smp_cond_acquire).
+ + *
+ + */
+ +
   /**
    * try_to_wake_up - wake up a thread
    * @p: the thread to be awakened
@@@ -2049,34 -1947,15 +2049,34 @@@ try_to_wake_up(struct task_struct *p, u
   
   #ifdef CONFIG_SMP
         /*
- -       * If the owning (remote) cpu is still in the middle of schedule() with
- -       * this task as prev, wait until its done referencing the task.
+ +       * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
+ +       * possible to, falsely, observe p->on_cpu == 0.
+ +       *
+ +       * One must be running (->on_cpu == 1) in order to remove oneself
+ +       * from the runqueue.
+ +       *
+ +       *  [S] ->on_cpu = 1;   [L] ->on_rq
+ +       *      UNLOCK rq->lock
+ +       *                      RMB
+ +       *      LOCK   rq->lock
+ +       *  [S] ->on_rq = 0;    [L] ->on_cpu
+ +       *
+ +       * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
+ +       * from the consecutive calls to schedule(); the first switching to our
+ +       * task, the second putting it to sleep.
          */
- -      while (p->on_cpu)
- -              cpu_relax();
+ +      smp_rmb();
+ +
         /*
- -       * Pairs with the smp_wmb() in finish_lock_switch().
+ +       * If the owning (remote) cpu is still in the middle of schedule() with
+ +       * this task as prev, wait until it's done referencing the task.
+ +       *
+ +       * Pairs with the smp_store_release() in finish_lock_switch().
+ +       *
+ +       * This ensures that tasks getting woken will be fully ordered against
+ +       * their previous state and preserve Program Order.
          */
- -      smp_rmb();
+ +      smp_cond_acquire(!p->on_cpu);
   
         p->sched_contributes_to_load = !!task_contributes_to_load(p);
         p->state = TASK_WAKING;
@@@ -2160,6 -2039,7 +2160,6 @@@ out
    */
   int wake_up_process(struct task_struct *p)
   {
- -      WARN_ON(task_is_stopped_or_traced(p));
         return try_to_wake_up(p, TASK_NORMAL, 0);
   }
   EXPORT_SYMBOL(wake_up_process);
@@@ -2205,10 -2085,6 +2205,10 @@@ static void __sched_fork(unsigned long 
         p->se.vruntime                  = 0;
         INIT_LIST_HEAD(&p->se.group_node);
   
+ +#ifdef CONFIG_FAIR_GROUP_SCHED
+ +      p->se.cfs_rq                    = NULL;
+ +#endif
+ +
   #ifdef CONFIG_SCHEDSTATS
         memset(&p->se.statistics, 0, sizeof(p->se.statistics));
   #endif
@@@ -3209,6 -3085,7 +3209,6 @@@ static void __sched notrace __schedule(
   
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
- -      rcu_note_context_switch();
         prev = rq->curr;
   
         /*
@@@ -3227,16 -3104,13 +3227,16 @@@
         if (sched_feat(HRTICK))
                 hrtick_clear(rq);
   
+ +      local_irq_disable();
+ +      rcu_note_context_switch();
+ +
         /*
          * Make sure that signal_pending_state()->signal_pending() below
          * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
          * done by the caller to avoid the race with signal_wake_up().
          */
         smp_mb__before_spinlock();
- -      raw_spin_lock_irq(&rq->lock);
+ +      raw_spin_lock(&rq->lock);
         lockdep_pin_lock(&rq->lock);
   
         rq->clock_skip_update <<= 1; /* promote REQ to ACT */
@@@ -5973,13 -5847,13 +5973,13 @@@ static int init_rootdomain(struct root_
   {
         memset(rd, 0, sizeof(*rd));
   
- -      if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+ +      if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
                 goto out;
- -      if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+ +      if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
                 goto free_span;
- -      if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
+ +      if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
                 goto free_online;
- -      if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ +      if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
                 goto free_dlo_mask;
   
         init_dl_bw(&rd->dl_bw);
@@@ -7457,9 -7331,6 +7457,9 @@@ int in_sched_functions(unsigned long ad
    */
   struct task_group root_task_group;
   LIST_HEAD(task_groups);
+ +
+ +/* Cacheline aligned slab cache for task_group */
+ +static struct kmem_cache *task_group_cache __read_mostly;
   #endif
   
   DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
@@@ -7517,12 -7388,11 +7517,12 @@@ void __init sched_init(void
   #endif /* CONFIG_RT_GROUP_SCHED */
   
   #ifdef CONFIG_CGROUP_SCHED
+ +      task_group_cache = KMEM_CACHE(task_group, 0);
+ +
         list_add(&root_task_group.list, &task_groups);
         INIT_LIST_HEAD(&root_task_group.children);
         INIT_LIST_HEAD(&root_task_group.siblings);
         autogroup_init(&init_task);
- -
   #endif /* CONFIG_CGROUP_SCHED */
   
         for_each_possible_cpu(i) {
@@@ -7803,7 -7673,7 +7803,7 @@@ static void free_sched_group(struct tas
         free_fair_sched_group(tg);
         free_rt_sched_group(tg);
         autogroup_free(tg);
- -      kfree(tg);
+ +      kmem_cache_free(task_group_cache, tg);
   }
   
   /* allocate runqueue etc for a new task group */
@@@ -7811,7 -7681,7 +7811,7 @@@ struct task_group *sched_create_group(s
   {
         struct task_group *tg;
   
- -      tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+ +      tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
         if (!tg)
                 return ERR_PTR(-ENOMEM);
   
@@@ -8342,7 -8212,7 +8342,7 @@@ static void cpu_cgroup_css_offline(stru
         sched_offline_group(tg);
   }
   
- static void cpu_cgroup_fork(struct task_struct *task, void *private)
+ static void cpu_cgroup_fork(struct task_struct *task)
   {
         sched_move_task(task);
   }
@@@ -8716,44 -8586,3 +8716,44 @@@ void dump_cpu_task(int cpu
         pr_info("Task dump for CPU %d:\n", cpu);
         sched_show_task(cpu_curr(cpu));
   }
+ +
+ +/*
+ + * Nice levels are multiplicative, with a gentle 10% change for every
+ + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
+ + * nice 1, it will get ~10% less CPU time than another CPU-bound task
+ + * that remained on nice 0.
+ + *
+ + * The "10% effect" is relative and cumulative: from _any_ nice level,
+ + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
+ + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
+ + * If a task goes up by ~10% and another task goes down by ~10% then
+ + * the relative distance between them is ~25%.)
+ + */
+ +const int sched_prio_to_weight[40] = {
+ + /* -20 */     88761,     71755,     56483,     46273,     36291,
+ + /* -15 */     29154,     23254,     18705,     14949,     11916,
+ + /* -10 */      9548,      7620,      6100,      4904,      3906,
+ + /*  -5 */      3121,      2501,      1991,      1586,      1277,
+ + /*   0 */      1024,       820,       655,       526,       423,
+ + /*   5 */       335,       272,       215,       172,       137,
+ + /*  10 */       110,        87,        70,        56,        45,
+ + /*  15 */        36,        29,        23,        18,        15,
+ +};
+ +
+ +/*
+ + * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
+ + *
+ + * In cases where the weight does not change often, we can use the
+ + * precalculated inverse to speed up arithmetics by turning divisions
+ + * into multiplications:
+ + */
+ +const u32 sched_prio_to_wmult[40] = {
+ + /* -20 */     48388,     59856,     76040,     92818,    118348,
+ + /* -15 */    147320,    184698,    229616,    287308,    360437,
+ + /* -10 */    449829,    563644,    704093,    875809,   1099582,
+ + /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
+ + /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
+ + /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
+ + /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
+ + /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+ +};
diff --combined mm/memcontrol.c

index fc10620,7ca43eb..14cb1db
--- 1/mm/memcontrol.c
--- 2/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@@ -903,20 -903,14 +903,20 @@@ struct mem_cgroup *mem_cgroup_iter(stru
                 if (prev && reclaim->generation != iter->generation)
                         goto out_unlock;
   
- -              do {
+ +              while (1) {
                         pos = READ_ONCE(iter->position);
+ +                      if (!pos || css_tryget(&pos->css))
+ +                              break;
                         /*
- -                       * A racing update may change the position and
- -                       * put the last reference, hence css_tryget(),
- -                       * or retry to see the updated position.
+ +                       * css reference reached zero, so iter->position will
+ +                       * be cleared by ->css_released. However, we should not
+ +                       * rely on this happening soon, because ->css_released
+ +                       * is called from a work queue, and by busy-waiting we
+ +                       * might block it. So we clear iter->position right
+ +                       * away.
                          */
- -              } while (pos && !css_tryget(&pos->css));
+ +                      (void)cmpxchg(&iter->position, pos, NULL);
+ +              }
         }
   
         if (pos)
@@@ -962,13 -956,17 +962,13 @@@
         }
   
         if (reclaim) {
- -              if (cmpxchg(&iter->position, pos, memcg) == pos) {
- -                      if (memcg)
- -                              css_get(&memcg->css);
- -                      if (pos)
- -                              css_put(&pos->css);
- -              }
- -
                 /*
- -               * pairs with css_tryget when dereferencing iter->position
- -               * above.
+ +               * The position could have already been updated by a competing
+ +               * thread, so check that the value hasn't changed since we read
+ +               * it to avoid reclaiming from the same cgroup twice.
                  */
+ +              (void)cmpxchg(&iter->position, pos, memcg);
+ +
                 if (pos)
                         css_put(&pos->css);
   
@@@ -1001,28 -999,6 +1001,28 @@@ void mem_cgroup_iter_break(struct mem_c
                 css_put(&prev->css);
   }
   
+ +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+ +{
+ +      struct mem_cgroup *memcg = dead_memcg;
+ +      struct mem_cgroup_reclaim_iter *iter;
+ +      struct mem_cgroup_per_zone *mz;
+ +      int nid, zid;
+ +      int i;
+ +
+ +      while ((memcg = parent_mem_cgroup(memcg))) {
+ +              for_each_node(nid) {
+ +                      for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ +                              mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+ +                              for (i = 0; i <= DEF_PRIORITY; i++) {
+ +                                      iter = &mz->iter[i];
+ +                                      cmpxchg(&iter->position,
+ +                                              dead_memcg, NULL);
+ +                              }
+ +                      }
+ +              }
+ +      }
+ +}
+ +
   /*
    * Iteration constructs for visiting all cgroups (under a tree).  If
    * loops are exited prematurely (break), mem_cgroup_iter_break() must
@@@ -2152,7 -2128,7 +2152,7 @@@ done_restock
          */
         do {
                 if (page_counter_read(&memcg->memory) > memcg->high) {
- -                      current->memcg_nr_pages_over_high += nr_pages;
+ +                      current->memcg_nr_pages_over_high += batch;
                         set_notify_resume(current);
                         break;
                 }
@@@ -4348,13 -4324,6 +4348,13 @@@ static void mem_cgroup_css_offline(stru
         wb_memcg_offline(memcg);
   }
   
+ +static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
+ +{
+ +      struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ +
+ +      invalidate_reclaim_iterators(memcg);
+ +}
+ +
   static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
   {
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@@ -4813,7 -4782,7 +4813,7 @@@ static void mem_cgroup_clear_mc(void
   static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
   {
         struct cgroup_subsys_state *css;
-       struct mem_cgroup *memcg;
+       struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
         struct mem_cgroup *from;
         struct task_struct *leader, *p;
         struct mm_struct *mm;
@@@ -5216,7 -5185,6 +5216,7 @@@ struct cgroup_subsys memory_cgrp_subsy
         .css_alloc = mem_cgroup_css_alloc,
         .css_online = mem_cgroup_css_online,
         .css_offline = mem_cgroup_css_offline,
+ +      .css_released = mem_cgroup_css_released,
         .css_free = mem_cgroup_css_free,
         .css_reset = mem_cgroup_css_reset,
         .can_attach = mem_cgroup_can_attach,
@@@ -5544,11 -5512,11 +5544,11 @@@ void mem_cgroup_uncharge_list(struct li
    * mem_cgroup_replace_page - migrate a charge to another page
    * @oldpage: currently charged page
    * @newpage: page to transfer the charge to
- - * @lrucare: either or both pages might be on the LRU already
    *
    * Migrate the charge from @oldpage to @newpage.
    *
    * Both pages must be locked, @newpage->mapping must be set up.
+ + * Either or both pages might be on the LRU already.
    */
   void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
   {
author	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 13 Jan 2016 03:20:32 +0000 (19:20 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 13 Jan 2016 03:20:32 +0000 (19:20 -0800)
		1	2
include/linux/cgroup-defs.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/cgroup.h	patch \|	diff1 \|	diff2 \|	blob \| history
init/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/cgroup.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/memcontrol.c	patch \|	diff1 \|	diff2 \|	blob \| history