/* define the enumeration of all cgroup subsystems */
#define SUBSYS(_x) _x ## _cgrp_id,
- #define SUBSYS_TAG(_t) CGROUP_ ## _t, \
- __unused_tag_ ## _t = CGROUP_ ## _t - 1,
enum cgroup_subsys_id {
#include <linux/cgroup_subsys.h>
CGROUP_SUBSYS_COUNT,
};
- #undef SUBSYS_TAG
#undef SUBSYS
- #define CGROUP_CANFORK_COUNT (CGROUP_CANFORK_END - CGROUP_CANFORK_START)
-
/* bits in struct cgroup_subsys_state flags field */
enum {
CSS_NO_REF = (1 << 0), /* no reference counting for this css */
/* cgroup_root->flags */
enum {
- CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */
CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */
CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */
};
int (*can_attach)(struct cgroup_taskset *tset);
void (*cancel_attach)(struct cgroup_taskset *tset);
void (*attach)(struct cgroup_taskset *tset);
- int (*can_fork)(struct task_struct *task, void **priv_p);
- void (*cancel_fork)(struct task_struct *task, void *priv);
- void (*fork)(struct task_struct *task, void *priv);
+ int (*can_fork)(struct task_struct *task);
+ void (*cancel_fork)(struct task_struct *task);
+ void (*fork)(struct task_struct *task);
void (*exit)(struct task_struct *task);
void (*free)(struct task_struct *task);
void (*bind)(struct cgroup_subsys_state *root_css);
#else /* CONFIG_CGROUPS */
- #define CGROUP_CANFORK_COUNT 0
#define CGROUP_SUBSYS_COUNT 0
static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
#endif /* CONFIG_CGROUPS */
+#ifdef CONFIG_SOCK_CGROUP_DATA
+
+/*
+ * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
+ * per-socket cgroup information except for memcg association.
+ *
+ * On legacy hierarchies, net_prio and net_cls controllers directly set
+ * attributes on each sock which can then be tested by the network layer.
+ * On the default hierarchy, each sock is associated with the cgroup it was
+ * created in and the networking layer can match the cgroup directly.
+ *
+ * To avoid carrying all three cgroup related fields separately in sock,
+ * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer.
+ * On boot, sock_cgroup_data records the cgroup that the sock was created
+ * in so that cgroup2 matches can be made; however, once either net_prio or
+ * net_cls starts being used, the area is overridden to carry prioidx and/or
+ * classid. The two modes are distinguished by the lowest bit: clear
+ * indicates a cgroup pointer, set indicates prioidx and classid.
+ *
+ * While userland may start using net_prio or net_cls at any time, once
+ * either is used, cgroup2 matching no longer works. There is no reason to
+ * mix the two and this is in line with how legacy and v2 compatibility is
+ * handled. On mode switch, cgroup references already held by socks may
+ * be leaked. While this could be remedied by adding synchronization
+ * around sock_cgroup_data, given that the number of leaked cgroups is
+ * bounded and highly unlikely to be large, this seems to be the better
+ * trade-off.
+ */
+struct sock_cgroup_data {
+ union {
+#ifdef __LITTLE_ENDIAN
+ struct {
+ u8 is_data;
+ u8 padding;
+ u16 prioidx;
+ u32 classid;
+ } __packed;
+#else
+ struct {
+ u32 classid;
+ u16 prioidx;
+ u8 padding;
+ u8 is_data;
+ } __packed;
+#endif
+ u64 val;
+ };
+};
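+
+/*
+ * Note that both endian-specific layouts above place is_data so that it
+ * aliases the least significant byte of ->val; bit 0 of ->val is therefore
+ * the mode tag regardless of byte order. A cgroup pointer always has
+ * bit 0 clear thanks to natural alignment, which is what makes the
+ * overloading safe.
+ */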
+
+/*
+ * There's a theoretical window where the following accessors race with
+ * updaters and return part of the previous pointer as the prioidx or
+ * classid. Such races are short-lived and the result isn't critical.
+ */
+static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd)
+{
+ /* fallback to 1 which is always the ID of the root cgroup */
+ return (skcd->is_data & 1) ? skcd->prioidx : 1;
+}
+
+static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd)
+{
+ /* fallback to 0 which is the unconfigured default classid */
+ return (skcd->is_data & 1) ? skcd->classid : 0;
+}
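+
+/*
+ * For illustration (a sketch, not part of the interface): right after
+ * cgroup_sk_alloc(), ->val holds the cgroup pointer and the accessors
+ * above return their defaults (1 and 0). Once, say,
+ * sock_cgroup_set_prioidx(skcd, 5) has run, ->val is rebuilt as
+ * { .is_data = 1, .prioidx = 5, .classid = 0 }, so
+ * sock_cgroup_prioidx() returns 5 while sock_cgroup_classid() still
+ * returns 0 until net_cls sets it.
+ */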
+
+/*
+ * If invoked concurrently, the updaters may clobber each other. The
+ * caller is responsible for synchronization.
+ */
+static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
+ u16 prioidx)
+{
+ struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
+
+ if (sock_cgroup_prioidx(&skcd_buf) == prioidx)
+ return;
+
+ if (!(skcd_buf.is_data & 1)) {
+ skcd_buf.val = 0;
+ skcd_buf.is_data = 1;
+ }
+
+ skcd_buf.prioidx = prioidx;
+ WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
+}
+
+static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
+ u32 classid)
+{
+ struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
+
+ if (sock_cgroup_classid(&skcd_buf) == classid)
+ return;
+
+ if (!(skcd_buf.is_data & 1)) {
+ skcd_buf.val = 0;
+ skcd_buf.is_data = 1;
+ }
+
+ skcd_buf.classid = classid;
+ WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
+}
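+
+/*
+ * Usage sketch (an assumed caller, roughly how a net_prio / net_cls
+ * style update would be serialized):
+ *
+ *	spin_lock(&cgroup_sk_update_lock);
+ *	cgroup_sk_alloc_disable();
+ *	sock_cgroup_set_prioidx(&sk->sk_cgrp_data, prioidx);
+ *	spin_unlock(&cgroup_sk_update_lock);
+ */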
+
+#else /* CONFIG_SOCK_CGROUP_DATA */
+
+struct sock_cgroup_data {
+};
+
+#endif /* CONFIG_SOCK_CGROUP_DATA */
+
#endif /* _LINUX_CGROUP_DEFS_H */
struct pid *pid, struct task_struct *tsk);
void cgroup_fork(struct task_struct *p);
- extern int cgroup_can_fork(struct task_struct *p,
- void *ss_priv[CGROUP_CANFORK_COUNT]);
- extern void cgroup_cancel_fork(struct task_struct *p,
- void *ss_priv[CGROUP_CANFORK_COUNT]);
- extern void cgroup_post_fork(struct task_struct *p,
- void *old_ss_priv[CGROUP_CANFORK_COUNT]);
+ extern int cgroup_can_fork(struct task_struct *p);
+ extern void cgroup_cancel_fork(struct task_struct *p);
+ extern void cgroup_post_fork(struct task_struct *p);
void cgroup_exit(struct task_struct *p);
void cgroup_free(struct task_struct *p);
struct dentry *dentry) { return -EINVAL; }
static inline void cgroup_fork(struct task_struct *p) {}
- static inline int cgroup_can_fork(struct task_struct *p,
- void *ss_priv[CGROUP_CANFORK_COUNT])
- { return 0; }
- static inline void cgroup_cancel_fork(struct task_struct *p,
- void *ss_priv[CGROUP_CANFORK_COUNT]) {}
- static inline void cgroup_post_fork(struct task_struct *p,
- void *ss_priv[CGROUP_CANFORK_COUNT]) {}
+ static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
+ static inline void cgroup_cancel_fork(struct task_struct *p) {}
+ static inline void cgroup_post_fork(struct task_struct *p) {}
static inline void cgroup_exit(struct task_struct *p) {}
static inline void cgroup_free(struct task_struct *p) {}
#endif /* !CONFIG_CGROUPS */
+/*
+ * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
+ * definition in cgroup-defs.h.
+ */
+#ifdef CONFIG_SOCK_CGROUP_DATA
+
+#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+extern spinlock_t cgroup_sk_update_lock;
+#endif
+
+void cgroup_sk_alloc_disable(void);
+void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
+void cgroup_sk_free(struct sock_cgroup_data *skcd);
+
+static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
+{
+#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+ unsigned long v;
+
+ /*
+ * @skcd->val is 64bit but the following is safe on 32bit too as we
+ * just need the lower ulong to be written and read atomically.
+ */
+ v = READ_ONCE(skcd->val);
+
+ if (v & 1)
+ return &cgrp_dfl_root.cgrp;
+
+ return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp;
+#else
+ return (struct cgroup *)(unsigned long)skcd->val;
+#endif
+}
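+
+/*
+ * Illustrative use (a sketch of an assumed cgroup2 matching caller):
+ *
+ *	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ *
+ *	if (cgroup_is_descendant(cgrp, ancestor))
+ *		;	// @sk belongs to @ancestor's subtree
+ *
+ * Once net_prio or net_cls is in use this always resolves to the default
+ * root and cgroup2 matching no longer works, as described in
+ * cgroup-defs.h.
+ */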
+
+#else /* CONFIG_SOCK_CGROUP_DATA */
+
+static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {}
+static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}
+
+#endif /* CONFIG_SOCK_CGROUP_DATA */
+
#endif /* _LINUX_CGROUP_H */
if CGROUPS
- config CGROUP_DEBUG
- bool "Example debug cgroup subsystem"
- default n
- help
- This option enables a simple cgroup subsystem that
- exports useful debugging information about the cgroups
- framework.
-
- Say N if unsure.
-
- config CGROUP_FREEZER
- bool "Freezer cgroup subsystem"
- help
- Provides a way to freeze and unfreeze all tasks in a
- cgroup.
-
- config CGROUP_PIDS
- bool "PIDs cgroup subsystem"
- help
- Provides enforcement of process number limits in the scope of a
- cgroup. Any attempt to fork more processes than is allowed in the
- cgroup will fail. PIDs are fundamentally a global resource because it
- is fairly trivial to reach PID exhaustion before you reach even a
- conservative kmemcg limit. As a result, it is possible to grind a
- system to halt without being limited by other cgroup policies. The
- PIDs cgroup subsystem is designed to stop this from happening.
-
- It should be noted that organisational operations (such as attaching
- to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
- since the PIDs limit only affects a process's ability to fork, not to
- attach to a cgroup.
-
- config CGROUP_DEVICE
- bool "Device controller for cgroups"
- help
- Provides a cgroup implementing whitelists for devices which
- a process in the cgroup can mknod or open.
-
- config CPUSETS
- bool "Cpuset support"
- help
- This option will let you create and manage CPUSETs which
- allow dynamically partitioning a system into sets of CPUs and
- Memory Nodes and assigning tasks to run only within those sets.
- This is primarily useful on large SMP or NUMA systems.
-
- Say N if unsure.
-
- config PROC_PID_CPUSET
- bool "Include legacy /proc/<pid>/cpuset file"
- depends on CPUSETS
- default y
-
- config CGROUP_CPUACCT
- bool "Simple CPU accounting cgroup subsystem"
- help
- Provides a simple Resource Controller for monitoring the
- total CPU consumed by the tasks in a cgroup.
-
config PAGE_COUNTER
bool
config MEMCG
- bool "Memory Resource Controller for Control Groups"
+ bool "Memory controller"
select PAGE_COUNTER
select EVENTFD
help
- Provides a memory resource controller that manages both anonymous
- memory and page cache. (See Documentation/cgroups/memory.txt)
+ Provides control over the memory footprint of tasks in a cgroup.
config MEMCG_SWAP
- bool "Memory Resource Controller Swap Extension"
+ bool "Swap controller"
depends on MEMCG && SWAP
help
- Add swap management feature to memory resource controller. When you
- enable this, you can limit mem+swap usage per cgroup. In other words,
- when you disable this, memory resource controller has no cares to
- usage of swap...a process can exhaust all of the swap. This extension
- is useful when you want to avoid exhaustion swap but this itself
- adds more overheads and consumes memory for remembering information.
- Especially if you use 32bit system or small memory system, please
- be careful about enabling this. When memory resource controller
- is disabled by boot option, this will be automatically disabled and
- there will be no overhead from this. Even when you set this config=y,
- if boot option "swapaccount=0" is set, swap will not be accounted.
- Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
- size is 4096bytes, 512k per 1Gbytes of swap.
+ Provides control over the swap space consumed by tasks in a cgroup.
+
config MEMCG_SWAP_ENABLED
- bool "Memory Resource Controller Swap Extension enabled by default"
+ bool "Swap controller enabled by default"
depends on MEMCG_SWAP
default y
help
the kmem extension can use it to guarantee that no group of processes
will ever exhaust kernel resources alone.
- config CGROUP_HUGETLB
- bool "HugeTLB Resource Controller for Control Groups"
- depends on HUGETLB_PAGE
- select PAGE_COUNTER
+ config BLK_CGROUP
+ bool "IO controller"
+ depends on BLOCK
default n
- help
- Provides a cgroup Resource Controller for HugeTLB pages.
- When you enable this, you can put a per cgroup limit on HugeTLB usage.
- The limit is enforced during page fault. Since HugeTLB doesn't
- support page reclaim, enforcing the limit at page fault time implies
- that, the application will get SIGBUS signal if it tries to access
- HugeTLB pages beyond its limit. This requires the application to know
- beforehand how much HugeTLB pages it would require for its use. The
- control group is tracked in the third page lru pointer. This means
- that we cannot use the controller with huge page less than 3 pages.
+ ---help---
+ Generic block IO controller cgroup interface. This is the common
+ cgroup interface which should be used by various IO controlling
+ policies.
- config CGROUP_PERF
- bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
- depends on PERF_EVENTS && CGROUPS
- help
- This option extends the per-cpu mode to restrict monitoring to
- threads which belong to the cgroup specified and run on the
- designated cpu.
+ Currently, CFQ IO scheduler uses it to recognize task groups and
+ control disk bandwidth allocation (proportional time slice allocation)
+ to such task groups. It is also used by bio throttling logic in
+ block layer to implement upper limit in IO rates on a device.
- Say N if unsure.
+ This option only enables generic Block IO controller infrastructure.
+ One needs to also enable actual IO controlling logic/policy. For
+ enabling proportional weight division of disk bandwidth in CFQ, set
+ CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
+ CONFIG_BLK_DEV_THROTTLING=y.
+
+ See Documentation/cgroups/blkio-controller.txt for more information.
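+
+	  A minimal proportional-weight configuration, for example, would
+	  set:
+
+	    CONFIG_BLK_CGROUP=y
+	    CONFIG_CFQ_GROUP_IOSCHED=y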
+
+ config DEBUG_BLK_CGROUP
+ bool "IO controller debugging"
+ depends on BLK_CGROUP
+ default n
+ ---help---
+ Enable some debugging help. Currently it exports additional stat
+ files in a cgroup which can be useful for debugging.
+
+ config CGROUP_WRITEBACK
+ bool
+ depends on MEMCG && BLK_CGROUP
+ default y
menuconfig CGROUP_SCHED
- bool "Group CPU scheduler"
+ bool "CPU controller"
default n
help
This feature lets CPU scheduler recognize task groups and control CPU
endif #CGROUP_SCHED
- config BLK_CGROUP
- bool "Block IO controller"
- depends on BLOCK
+ config CGROUP_PIDS
+ bool "PIDs controller"
+ help
+ Provides enforcement of process number limits in the scope of a
+ cgroup. Any attempt to fork more processes than is allowed in the
+ cgroup will fail. PIDs are fundamentally a global resource because it
+ is fairly trivial to reach PID exhaustion before you reach even a
+ conservative kmemcg limit. As a result, it is possible to grind a
+ system to halt without being limited by other cgroup policies. The
+ PIDs cgroup subsystem is designed to stop this from happening.
+
+ It should be noted that organisational operations (such as attaching
+ to a cgroup hierarchy) will *not* be blocked by the PIDs subsystem,
+ since the PIDs limit only affects a process's ability to fork, not to
+ attach to a cgroup.
+
+ config CGROUP_FREEZER
+ bool "Freezer controller"
+ help
+ Provides a way to freeze and unfreeze all tasks in a
+ cgroup.
+
+ config CGROUP_HUGETLB
+ bool "HugeTLB controller"
+ depends on HUGETLB_PAGE
+ select PAGE_COUNTER
default n
- ---help---
- Generic block IO controller cgroup interface. This is the common
- cgroup interface which should be used by various IO controlling
- policies.
+ help
+ Provides a cgroup controller for HugeTLB pages.
+ When you enable this, you can put a per cgroup limit on HugeTLB usage.
+ The limit is enforced during page fault. Since HugeTLB doesn't
+ support page reclaim, enforcing the limit at page fault time implies
+ that the application will get a SIGBUS signal if it tries to access
+ HugeTLB pages beyond its limit. This requires the application to know
+ beforehand how many HugeTLB pages it would require for its use. The
+ control group is tracked in the third page lru pointer. This means
+ that we cannot use the controller with a huge page size smaller than
+ 3 pages.
- Currently, CFQ IO scheduler uses it to recognize task groups and
- control disk bandwidth allocation (proportional time slice allocation)
- to such task groups. It is also used by bio throttling logic in
- block layer to implement upper limit in IO rates on a device.
+ config CPUSETS
+ bool "Cpuset controller"
+ help
+ This option will let you create and manage CPUSETs which
+ allow dynamically partitioning a system into sets of CPUs and
+ Memory Nodes and assigning tasks to run only within those sets.
+ This is primarily useful on large SMP or NUMA systems.
- This option only enables generic Block IO controller infrastructure.
- One needs to also enable actual IO controlling logic/policy. For
- enabling proportional weight division of disk bandwidth in CFQ, set
- CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
- CONFIG_BLK_DEV_THROTTLING=y.
+ Say N if unsure.
- See Documentation/cgroups/blkio-controller.txt for more information.
+ config PROC_PID_CPUSET
+ bool "Include legacy /proc/<pid>/cpuset file"
+ depends on CPUSETS
+ default y
- config DEBUG_BLK_CGROUP
- bool "Enable Block IO controller debugging"
- depends on BLK_CGROUP
+ config CGROUP_DEVICE
+ bool "Device controller"
+ help
+ Provides a cgroup controller implementing whitelists for
+ devices which a process in the cgroup can mknod or open.
+
+ config CGROUP_CPUACCT
+ bool "Simple CPU accounting controller"
+ help
+ Provides a simple controller for monitoring the
+ total CPU consumed by the tasks in a cgroup.
+
+ config CGROUP_PERF
+ bool "Perf controller"
+ depends on PERF_EVENTS
+ help
+ This option extends the perf per-cpu mode to restrict monitoring
+ to threads which belong to the cgroup specified and run on the
+ designated cpu.
+
+ Say N if unsure.
+
+ config CGROUP_DEBUG
+ bool "Example controller"
default n
- ---help---
- Enable some debugging help. Currently it exports additional stat
- files in a cgroup which can be useful for debugging.
+ help
+ This option enables a simple controller that exports
+ debugging information about the cgroups framework.
- config CGROUP_WRITEBACK
- bool
- depends on MEMCG && BLK_CGROUP
- default y
+ Say N.
endif # CGROUPS
it was better to provide this option than to break all the archs
and have several arch maintainers pursuing me down dark alleys.
-config STOP_MACHINE
- bool
- default y
- depends on (SMP && MODULE_UNLOAD) || HOTPLUG_CPU
- help
- Need stop_machine() primitive.
-
source "block/Kconfig"
config PREEMPT_NOTIFIERS
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/kthread.h>
#include <linux/delay.h>
-
#include <linux/atomic.h>
+#include <net/sock.h>
/*
* pidlists linger the following amount before being destroyed. The goal
/* Ditto for the can_fork callback. */
static unsigned long have_canfork_callback __read_mostly;
+ static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
all_ss = true;
continue;
}
- if (!strcmp(token, "__DEVEL__sane_behavior")) {
- opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
- continue;
- }
if (!strcmp(token, "noprefix")) {
opts->flags |= CGRP_ROOT_NOPREFIX;
continue;
return -ENOENT;
}
- if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
- if (nr_opts != 1) {
- pr_err("sane_behavior: no other mount options allowed\n");
- return -EINVAL;
- }
- return 0;
- }
-
/*
* If the 'all' option was specified select all the subsystems,
* otherwise if 'none', 'name=' and a subsystem name options were
int flags, const char *unused_dev_name,
void *data)
{
+ bool is_v2 = fs_type == &cgroup2_fs_type;
struct super_block *pinned_sb = NULL;
struct cgroup_subsys *ss;
struct cgroup_root *root;
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
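+
+ /*
+  * cgroup2 takes no mount options and always maps to the default
+  * hierarchy root.
+  */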
+ if (is_v2) {
+ if (data) {
+ pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+ return ERR_PTR(-EINVAL);
+ }
+ cgrp_dfl_root_visible = true;
+ root = &cgrp_dfl_root;
+ cgroup_get(&root->cgrp);
+ goto out_mount;
+ }
+
mutex_lock(&cgroup_mutex);
/* First find the desired set of subsystems */
if (ret)
goto out_unlock;
- /* look for a matching existing root */
- if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
- cgrp_dfl_root_visible = true;
- root = &cgrp_dfl_root;
- cgroup_get(&root->cgrp);
- ret = 0;
- goto out_unlock;
- }
-
/*
* Destruction of cgroup root is asynchronous, so subsystems may
* still be dying after the previous unmount. Let's drain the
if (ret)
return ERR_PTR(ret);
-
+ out_mount:
dentry = kernfs_mount(fs_type, flags, root->kf_root,
- CGROUP_SUPER_MAGIC, &new_sb);
+ is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
+ &new_sb);
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
.kill_sb = cgroup_kill_sb,
};
+ static struct file_system_type cgroup2_fs_type = {
+ .name = "cgroup2",
+ .mount = cgroup_mount,
+ .kill_sb = cgroup_kill_sb,
+ };
+
/**
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
* @task: target task
goto out_err;
/*
- * Migrate tasks one-by-one until @form is empty. This fails iff
+ * Migrate tasks one-by-one until @from is empty. This fails iff
* ->can_attach() fails.
*/
do {
{
struct cgroup_subsys_state *css;
- printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
+ pr_debug("Initializing cgroup subsys %s\n", ss->name);
mutex_lock(&cgroup_mutex);
WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
WARN_ON(register_filesystem(&cgroup_fs_type));
+ WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
return 0;
.release = single_release,
};
- static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
- {
- if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
- return &ss_priv[i - CGROUP_CANFORK_START];
- return NULL;
- }
-
- static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
- {
- void **private = subsys_canfork_priv_p(ss_priv, i);
- return private ? *private : NULL;
- }
-
/**
* cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of the child process being forked.
* returns an error, the fork aborts with that error code. This allows for
* a cgroup subsystem to conditionally allow or deny new forks.
*/
- int cgroup_can_fork(struct task_struct *child,
- void *ss_priv[CGROUP_CANFORK_COUNT])
+ int cgroup_can_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i, j, ret;
for_each_subsys_which(ss, i, &have_canfork_callback) {
- ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
+ ret = ss->can_fork(child);
if (ret)
goto out_revert;
}
if (j >= i)
break;
if (ss->cancel_fork)
- ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
+ ss->cancel_fork(child);
}
return ret;
* This calls the cancel_fork() callbacks if a fork failed *after*
* cgroup_can_fork() succeeded.
*/
- void cgroup_cancel_fork(struct task_struct *child,
- void *ss_priv[CGROUP_CANFORK_COUNT])
+ void cgroup_cancel_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i;
for_each_subsys(ss, i)
if (ss->cancel_fork)
- ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
+ ss->cancel_fork(child);
}
/**
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
* list.
*/
- void cgroup_post_fork(struct task_struct *child,
- void *old_ss_priv[CGROUP_CANFORK_COUNT])
+ void cgroup_post_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i;
* and addition to css_set.
*/
for_each_subsys_which(ss, i, &have_fork_callback)
- ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
+ ss->fork(child);
}
/**
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
+/*
+ * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
+ * definition in cgroup-defs.h.
+ */
+#ifdef CONFIG_SOCK_CGROUP_DATA
+
+#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+
+DEFINE_SPINLOCK(cgroup_sk_update_lock);
+static bool cgroup_sk_alloc_disabled __read_mostly;
+
+void cgroup_sk_alloc_disable(void)
+{
+ if (cgroup_sk_alloc_disabled)
+ return;
+ pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
+ cgroup_sk_alloc_disabled = true;
+}
+
+#else
+
+#define cgroup_sk_alloc_disabled false
+
+#endif
+
+void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
+{
+ if (cgroup_sk_alloc_disabled)
+ return;
+
+ rcu_read_lock();
+
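+ /*
+  * The task's css_set, and with it the default hierarchy cgroup, may be
+  * going away concurrently; re-read and retry until we grab a reference
+  * to a live cgroup.
+  */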
+ while (true) {
+ struct css_set *cset;
+
+ cset = task_css_set(current);
+ if (likely(cgroup_tryget(cset->dfl_cgrp))) {
+ skcd->val = (unsigned long)cset->dfl_cgrp;
+ break;
+ }
+ cpu_relax();
+ }
+
+ rcu_read_unlock();
+}
+
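+/*
+ * Note that if the socket's data has been converted to prioidx/classid,
+ * sock_cgroup_ptr() resolves to the default root here and the reference
+ * to the cgroup recorded at allocation time is leaked, as described in
+ * cgroup-defs.h.
+ */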
+void cgroup_sk_free(struct sock_cgroup_data *skcd)
+{
+ cgroup_put(sock_cgroup_ptr(skcd));
+}
+
+#endif /* CONFIG_SOCK_CGROUP_DATA */
+
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
#endif
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
+ tsk->wake_q.next = NULL;
account_kernel_stack(ti, 1);
{
int retval;
struct task_struct *p;
- void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
prev_cputime_init(&p->prev_cputime);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- seqlock_init(&p->vtime_seqlock);
+ seqcount_init(&p->vtime_seqcount);
p->vtime_snap = 0;
- p->vtime_snap_whence = VTIME_SLEEPING;
+ p->vtime_snap_whence = VTIME_INACTIVE;
#endif
#if defined(SPLIT_RSS_COUNTING)
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
- retval = cgroup_can_fork(p, cgrp_ss_priv);
+ retval = cgroup_can_fork(p);
if (retval)
goto bad_fork_free_pid;
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p, cgrp_ss_priv);
+ cgroup_post_fork(p);
threadgroup_change_end(current);
perf_event_fork(p);
return p;
bad_fork_cancel_cgroup:
- cgroup_cancel_fork(p, cgrp_ss_priv);
+ cgroup_cancel_fork(p);
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
if (current->policy == SCHED_RR) {
struct sched_rt_entity *rt_se = ¤t->rt;
- return rt_se->run_list.prev == rt_se->run_list.next;
+ return list_is_singular(&rt_se->run_list);
}
/*
return;
}
- load->weight = scale_load(prio_to_weight[prio]);
- load->inv_weight = prio_to_wmult[prio];
+ load->weight = scale_load(sched_prio_to_weight[prio]);
+ load->inv_weight = sched_prio_to_wmult[prio];
}
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
lockdep_assert_held(&rq->lock);
- dequeue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ dequeue_task(rq, p, 0);
set_task_cpu(p, new_cpu);
raw_spin_unlock(&rq->lock);
raw_spin_lock(&rq->lock);
BUG_ON(task_cpu(p) != new_cpu);
- p->on_rq = TASK_ON_RQ_QUEUED;
enqueue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
return rq;
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
!p->on_rq);
+ /*
+ * A migrating fair-class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
+ * because schedstat_wait_{start,end} rebase the migrating task's
+ * wait_start time relying on p->on_rq.
+ */
+ WARN_ON_ONCE(p->state == TASK_RUNNING &&
+ p->sched_class == &fair_sched_class &&
+ (p->on_rq && !task_on_rq_migrating(p)));
+
#ifdef CONFIG_LOCKDEP
/*
* The caller should hold either p->pi_lock or rq->lock, when changing
src_rq = task_rq(p);
dst_rq = cpu_rq(cpu);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0);
} else {
/*
raw_spin_unlock(&rq->lock);
}
+/*
+ * Notes on Program-Order guarantees on SMP systems.
+ *
+ * MIGRATION
+ *
+ * The basic program-order guarantee on SMP systems is that when a task [t]
+ * migrates, all its activity on its old cpu [c0] happens-before any subsequent
+ * execution on its new cpu [c1].
+ *
+ * For migration (of runnable tasks) this is provided by the following means:
+ *
+ * A) UNLOCK of the rq(c0)->lock scheduling out task t
+ * B) migration for t is required to synchronize *both* rq(c0)->lock and
+ * rq(c1)->lock (if not at the same time, then in that order).
+ * C) LOCK of the rq(c1)->lock scheduling in task
+ *
+ * Transitivity guarantees that B happens after A and C after B.
+ * Note: we only require RCpc transitivity.
+ * Note: the cpu doing B need not be c0 or c1
+ *
+ * Example:
+ *
+ * CPU0 CPU1 CPU2
+ *
+ * LOCK rq(0)->lock
+ * sched-out X
+ * sched-in Y
+ * UNLOCK rq(0)->lock
+ *
+ * LOCK rq(0)->lock // orders against CPU0
+ * dequeue X
+ * UNLOCK rq(0)->lock
+ *
+ * LOCK rq(1)->lock
+ * enqueue X
+ * UNLOCK rq(1)->lock
+ *
+ * LOCK rq(1)->lock // orders against CPU2
+ * sched-out Z
+ * sched-in X
+ * UNLOCK rq(1)->lock
+ *
+ *
+ * BLOCKING -- aka. SLEEP + WAKEUP
+ *
+ * For blocking we (obviously) need to provide the same guarantee as for
+ * migration. However the means are completely different as there is no lock
+ * chain to provide order. Instead we do:
+ *
+ * 1) smp_store_release(X->on_cpu, 0)
+ * 2) smp_cond_acquire(!X->on_cpu)
+ *
+ * Example:
+ *
+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
+ *
+ * LOCK rq(0)->lock LOCK X->pi_lock
+ * dequeue X
+ * sched-out X
+ * smp_store_release(X->on_cpu, 0);
+ *
+ * smp_cond_acquire(!X->on_cpu);
+ * X->state = WAKING
+ * set_task_cpu(X,2)
+ *
+ * LOCK rq(2)->lock
+ * enqueue X
+ * X->state = RUNNING
+ * UNLOCK rq(2)->lock
+ *
+ * LOCK rq(2)->lock // orders against CPU1
+ * sched-out Z
+ * sched-in X
+ * UNLOCK rq(2)->lock
+ *
+ * UNLOCK X->pi_lock
+ * UNLOCK rq(0)->lock
+ *
+ *
+ * However, for wakeups there is a second guarantee we must provide, namely we
+ * must observe the state that led to our wakeup. That is, not only must our
+ * task observe its own prior state, it must also observe the stores prior to
+ * its wakeup.
+ *
+ * This means that any means of doing remote wakeups must order the CPU doing
+ * the wakeup against the CPU the task is going to end up running on. This,
+ * however, is already required for the regular Program-Order guarantee above,
+ * since the waking CPU is the one issuing the ACQUIRE (smp_cond_acquire).
+ *
+ */
+
/**
* try_to_wake_up - wake up a thread
* @p: the thread to be awakened
#ifdef CONFIG_SMP
/*
- * If the owning (remote) cpu is still in the middle of schedule() with
- * this task as prev, wait until its done referencing the task.
+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
+ * possible to, falsely, observe p->on_cpu == 0.
+ *
+ * One must be running (->on_cpu == 1) in order to remove oneself
+ * from the runqueue.
+ *
+ * [S] ->on_cpu = 1; [L] ->on_rq
+ * UNLOCK rq->lock
+ * RMB
+ * LOCK rq->lock
+ * [S] ->on_rq = 0; [L] ->on_cpu
+ *
+ * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
+ * from the consecutive calls to schedule(); the first switching to our
+ * task, the second putting it to sleep.
*/
- while (p->on_cpu)
- cpu_relax();
+ smp_rmb();
+
/*
- * Pairs with the smp_wmb() in finish_lock_switch().
+ * If the owning (remote) cpu is still in the middle of schedule() with
+ * this task as prev, wait until it's done referencing the task.
+ *
+ * Pairs with the smp_store_release() in finish_lock_switch().
+ *
+ * This ensures that tasks getting woken will be fully ordered against
+ * their previous state and preserve Program Order.
*/
- smp_rmb();
+ smp_cond_acquire(!p->on_cpu);
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
*/
int wake_up_process(struct task_struct *p)
{
- WARN_ON(task_is_stopped_or_traced(p));
return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.cfs_rq = NULL;
+#endif
+
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- rcu_note_context_switch();
prev = rq->curr;
/*
if (sched_feat(HRTICK))
hrtick_clear(rq);
+ local_irq_disable();
+ rcu_note_context_switch();
+
/*
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
*/
smp_mb__before_spinlock();
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_lock(&rq->lock);
lockdep_pin_lock(&rq->lock);
rq->clock_skip_update <<= 1; /* promote REQ to ACT */
{
memset(rd, 0, sizeof(*rd));
- if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
goto out;
- if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
goto free_span;
- if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
goto free_online;
- if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
goto free_dlo_mask;
init_dl_bw(&rd->dl_bw);
*/
struct task_group root_task_group;
LIST_HEAD(task_groups);
+
+/* Cacheline aligned slab cache for task_group */
+static struct kmem_cache *task_group_cache __read_mostly;
#endif
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
+ task_group_cache = KMEM_CACHE(task_group, 0);
+
list_add(&root_task_group.list, &task_groups);
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
autogroup_init(&init_task);
-
#endif /* CONFIG_CGROUP_SCHED */
for_each_possible_cpu(i) {
free_fair_sched_group(tg);
free_rt_sched_group(tg);
autogroup_free(tg);
- kfree(tg);
+ kmem_cache_free(task_group_cache, tg);
}
/* allocate runqueue etc for a new task group */
{
struct task_group *tg;
- tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
if (!tg)
return ERR_PTR(-ENOMEM);
sched_offline_group(tg);
}
- static void cpu_cgroup_fork(struct task_struct *task, void *private)
+ static void cpu_cgroup_fork(struct task_struct *task)
{
sched_move_task(task);
}
pr_info("Task dump for CPU %d:\n", cpu);
sched_show_task(cpu_curr(cpu));
}
+
+/*
+ * Nice levels are multiplicative, with a gentle 10% change for every
+ * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
+ * nice 1, it will get ~10% less CPU time than another CPU-bound task
+ * that remained on nice 0.
+ *
+ * The "10% effect" is relative and cumulative: from _any_ nice level,
+ * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
+ * it's +10% CPU usage. (To achieve that we use a multiplier of 1.25;
+ * if a task goes up by ~10% and another task goes down by ~10% then
+ * the relative distance between them is ~25%.)
+ */
+const int sched_prio_to_weight[40] = {
+ /* -20 */ 88761, 71755, 56483, 46273, 36291,
+ /* -15 */ 29154, 23254, 18705, 14949, 11916,
+ /* -10 */ 9548, 7620, 6100, 4904, 3906,
+ /* -5 */ 3121, 2501, 1991, 1586, 1277,
+ /* 0 */ 1024, 820, 655, 526, 423,
+ /* 5 */ 335, 272, 215, 172, 137,
+ /* 10 */ 110, 87, 70, 56, 45,
+ /* 15 */ 36, 29, 23, 18, 15,
+};
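+
+/*
+ * A quick worked example of the ~1.25 step: the nice 0 weight is 1024;
+ * one level "nicer" (nice 1) is 1024 / 1.25 ~= 820 and one level less
+ * nice (nice -1) is 1024 * 1.25 ~= 1277, matching the table entries
+ * above.
+ */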
+
+/*
+ * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
+ *
+ * In cases where the weight does not change often, we can use the
+ * precalculated inverse to speed up arithmetic by turning divisions
+ * into multiplications:
+ */
+const u32 sched_prio_to_wmult[40] = {
+ /* -20 */ 48388, 59856, 76040, 92818, 118348,
+ /* -15 */ 147320, 184698, 229616, 287308, 360437,
+ /* -10 */ 449829, 563644, 704093, 875809, 1099582,
+ /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
+ /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
+ /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
+ /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
+ /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+};
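+
+/*
+ * For example, the nice 0 entry is 2^32 / 1024 = 4194304, so
+ * "x / weight" can be computed as "(x * inv_weight) >> 32" without a
+ * division.
+ */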
if (prev && reclaim->generation != iter->generation)
goto out_unlock;
- do {
+ while (1) {
pos = READ_ONCE(iter->position);
+ if (!pos || css_tryget(&pos->css))
+ break;
/*
- * A racing update may change the position and
- * put the last reference, hence css_tryget(),
- * or retry to see the updated position.
+ * css reference reached zero, so iter->position will
+ * be cleared by ->css_released. However, we should not
+ * rely on this happening soon, because ->css_released
+ * is called from a work queue, and by busy-waiting we
+ * might block it. So we clear iter->position right
+ * away.
*/
- } while (pos && !css_tryget(&pos->css));
+ (void)cmpxchg(&iter->position, pos, NULL);
+ }
}
if (pos)
}
if (reclaim) {
- if (cmpxchg(&iter->position, pos, memcg) == pos) {
- if (memcg)
- css_get(&memcg->css);
- if (pos)
- css_put(&pos->css);
- }
-
/*
- * pairs with css_tryget when dereferencing iter->position
- * above.
+ * The position could have already been updated by a competing
+ * thread, so check that the value hasn't changed since we read
+ * it to avoid reclaiming from the same cgroup twice.
*/
+ (void)cmpxchg(&iter->position, pos, memcg);
+
if (pos)
css_put(&pos->css);
css_put(&prev->css);
}
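+/*
+ * When a memcg goes away, make sure no reclaim iterator in any of its
+ * ancestors is left pointing at it; otherwise a later mem_cgroup_iter()
+ * could dereference a stale pointer. Called from ->css_released.
+ */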
+static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+{
+ struct mem_cgroup *memcg = dead_memcg;
+ struct mem_cgroup_reclaim_iter *iter;
+ struct mem_cgroup_per_zone *mz;
+ int nid, zid;
+ int i;
+
+ while ((memcg = parent_mem_cgroup(memcg))) {
+ for_each_node(nid) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+ for (i = 0; i <= DEF_PRIORITY; i++) {
+ iter = &mz->iter[i];
+ cmpxchg(&iter->position,
+ dead_memcg, NULL);
+ }
+ }
+ }
+ }
+}
+
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
*/
do {
if (page_counter_read(&memcg->memory) > memcg->high) {
- current->memcg_nr_pages_over_high += nr_pages;
+ current->memcg_nr_pages_over_high += batch;
set_notify_resume(current);
break;
}
wb_memcg_offline(memcg);
}
+static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ invalidate_reclaim_iterators(memcg);
+}
+
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
struct mem_cgroup *from;
struct task_struct *leader, *p;
struct mm_struct *mm;
.css_alloc = mem_cgroup_css_alloc,
.css_online = mem_cgroup_css_online,
.css_offline = mem_cgroup_css_offline,
+ .css_released = mem_cgroup_css_released,
.css_free = mem_cgroup_css_free,
.css_reset = mem_cgroup_css_reset,
.can_attach = mem_cgroup_can_attach,
* mem_cgroup_replace_page - migrate a charge to another page
* @oldpage: currently charged page
* @newpage: page to transfer the charge to
- * @lrucare: either or both pages might be on the LRU already
*
* Migrate the charge from @oldpage to @newpage.
*
* Both pages must be locked, @newpage->mapping must be set up.
+ * Either or both pages might be on the LRU already.
*/
void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
{