sunrpc/ SUN Remote Procedure Call (NFS)
vm/ memory management tuning
buffer and cache management
+user/ Per user per user namespace limits
These are the subdirs I have on my system. There might be more
or other subdirs in another setup. If you see another dir, I'd
--- /dev/null
+Documentation for /proc/sys/user/* kernel version 4.9.0
+ (c) 2016 Eric Biederman <ebiederm@xmission.com>
+
+==============================================================
+
+This file contains the documentation for the sysctl files in
+/proc/sys/user.
+
+The files in this directory can be used to override the default
+limits on the number of namespaces and other objects that have
+per user per user namespace limits.
+
+The primary purpose of these limits is to stop programs that
+malfunction and attempt to create a ridiculous number of objects,
+before the malfunction becomes a system wide problem. It is the
+intention that the defaults of these limits are set high enough that
+no program in normal operation should run into these limits.
+
+The creation of per user per user namespace objects is charged to
+the user in the user namespace who created the object and
+verified to be below the per user limit in that user namespace.
+
+The creation of objects is also charged to all of the users
+who created user namespaces the creation of the object happens
+in (user namespaces can be nested) and verified to be below the per user
+limits in the user namespaces of those users.
+
+This recursive counting of created objects ensures that creating a
+user namespace does not allow a user to escape their current limits.
+
+Currently, these files are in /proc/sys/user:
+
+- max_cgroup_namespaces
+
+ The maximum number of cgroup namespaces that any user in the current
+ user namespace may create.
+
+- max_ipc_namespaces
+
+ The maximum number of ipc namespaces that any user in the current
+ user namespace may create.
+
+- max_mnt_namespaces
+
+ The maximum number of mount namespaces that any user in the current
+ user namespace may create.
+
+- max_net_namespaces
+
+ The maximum number of network namespaces that any user in the
+ current user namespace may create.
+
+- max_pid_namespaces
+
+ The maximum number of pid namespaces that any user in the current
+ user namespace may create.
+
+- max_user_namespaces
+
+ The maximum number of user namespaces that any user in the current
+ user namespace may create.
+
+- max_uts_namespaces
+
+ The maximum number of uts namespaces that any user in the current
+ user namespace may create.
struct dentry *root = sb->s_root;
struct pts_fs_info *fsi = DEVPTS_SB(sb);
struct pts_mount_opts *opts = &fsi->mount_opts;
- kuid_t root_uid;
- kgid_t root_gid;
-
- root_uid = make_kuid(current_user_ns(), 0);
- root_gid = make_kgid(current_user_ns(), 0);
- if (!uid_valid(root_uid) || !gid_valid(root_gid))
- return -EINVAL;
+ kuid_t ptmx_uid = current_fsuid();
+ kgid_t ptmx_gid = current_fsgid();
inode_lock(d_inode(root));
mode = S_IFCHR|opts->ptmxmode;
init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
- inode->i_uid = root_uid;
- inode->i_gid = root_gid;
+ inode->i_uid = ptmx_uid;
+ inode->i_gid = ptmx_gid;
d_add(dentry, inode);
struct pts_fs_info *fsi = DEVPTS_SB(sb);
struct pts_mount_opts *opts = &fsi->mount_opts;
- sync_filesystem(sb);
err = parse_mount_options(data, PARSE_REMOUNT, opts);
/*
devpts_fill_super(struct super_block *s, void *data, int silent)
{
struct inode *inode;
+ int error;
s->s_iflags &= ~SB_I_NODEV;
s->s_blocksize = 1024;
s->s_op = &devpts_sops;
s->s_time_gran = 1;
+ error = -ENOMEM;
s->s_fs_info = new_pts_fs_info(s);
if (!s->s_fs_info)
goto fail;
+ error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts);
+ if (error)
+ goto fail;
+
+ error = -ENOMEM;
inode = new_inode(s);
if (!inode)
goto fail;
set_nlink(inode, 2);
s->s_root = d_make_root(inode);
- if (s->s_root)
- return 0;
+ if (!s->s_root) {
+ pr_err("get root dentry failed\n");
+ goto fail;
+ }
- pr_err("get root dentry failed\n");
+ error = mknod_ptmx(s);
+ if (error)
+ goto fail_dput;
+ return 0;
+fail_dput:
+ dput(s->s_root);
+ s->s_root = NULL;
fail:
- return -ENOMEM;
+ return error;
}
/*
static struct dentry *devpts_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- int error;
- struct pts_mount_opts opts;
- struct super_block *s;
-
- error = parse_mount_options(data, PARSE_MOUNT, &opts);
- if (error)
- return ERR_PTR(error);
-
- s = sget(fs_type, NULL, set_anon_super, flags, NULL);
- if (IS_ERR(s))
- return ERR_CAST(s);
-
- if (!s->s_root) {
- error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
- if (error)
- goto out_undo_sget;
- s->s_flags |= MS_ACTIVE;
- }
-
- memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
-
- error = mknod_ptmx(s);
- if (error)
- goto out_undo_sget;
-
- return dget(s->s_root);
-
-out_undo_sget:
- deactivate_locked_super(s);
- return ERR_PTR(error);
+ return mount_nodev(fs_type, flags, data, devpts_fill_super);
}
static void devpts_kill_sb(struct super_block *sb)
{
struct pts_fs_info *fsi = DEVPTS_SB(sb);
- ida_destroy(&fsi->allocated_ptys);
+ if (fsi)
+ ida_destroy(&fsi->allocated_ptys);
kfree(fsi);
kill_litter_super(sb);
}
struct mount * root;
struct list_head list;
struct user_namespace *user_ns;
+ struct ucounts *ucounts;
u64 seq; /* Sequence number to prevent loops */
wait_queue_head_t poll;
u64 event;
return retval;
}
+/*
+ * Charge one mount namespace to the caller's effective uid in @ns.
+ * Returns the ucounts to hand to dec_mnt_namespaces() later, or NULL
+ * when inc_ucount() refuses the charge.
+ */
+static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
+}
+
+/* Undo a charge obtained from inc_mnt_namespaces(). */
+static void dec_mnt_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
+}
+
static void free_mnt_ns(struct mnt_namespace *ns)
{
ns_free_inum(&ns->ns);
+ dec_mnt_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
kfree(ns);
}
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
{
struct mnt_namespace *new_ns;
+ struct ucounts *ucounts;
int ret;
+ ucounts = inc_mnt_namespaces(user_ns);
+ if (!ucounts)
+ return ERR_PTR(-ENOSPC);
+
new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
- if (!new_ns)
+ if (!new_ns) {
+ dec_mnt_namespaces(ucounts);
return ERR_PTR(-ENOMEM);
+ }
ret = ns_alloc_inum(&new_ns->ns);
if (ret) {
kfree(new_ns);
+ dec_mnt_namespaces(ucounts);
return ERR_PTR(ret);
}
new_ns->ns.ops = &mntns_operations;
init_waitqueue_head(&new_ns->poll);
new_ns->event = 0;
new_ns->user_ns = get_user_ns(user_ns);
+ new_ns->ucounts = ucounts;
return new_ns;
}
static void drop_sysctl_table(struct ctl_table_header *header);
static int sysctl_follow_link(struct ctl_table_header **phead,
- struct ctl_table **pentry, struct nsproxy *namespaces);
+ struct ctl_table **pentry);
static int insert_links(struct ctl_table_header *head);
static void put_links(struct ctl_table_header *header);
}
static struct ctl_table_set *
-lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+lookup_header_set(struct ctl_table_root *root)
{
struct ctl_table_set *set = &root->default_set;
if (root->lookup)
- set = root->lookup(root, namespaces);
+ set = root->lookup(root);
return set;
}
goto out;
if (S_ISLNK(p->mode)) {
- ret = sysctl_follow_link(&h, &p, current->nsproxy);
+ ret = sysctl_follow_link(&h, &p);
err = ERR_PTR(ret);
if (ret)
goto out;
if (S_ISLNK(table->mode)) {
/* It is not an error if we can not follow the link ignore it */
- int err = sysctl_follow_link(&head, &table, current->nsproxy);
+ int err = sysctl_follow_link(&head, &table);
if (err)
goto out;
}
}
static int sysctl_follow_link(struct ctl_table_header **phead,
- struct ctl_table **pentry, struct nsproxy *namespaces)
+ struct ctl_table **pentry)
{
struct ctl_table_header *head;
struct ctl_table_root *root;
ret = 0;
spin_lock(&sysctl_lock);
root = (*pentry)->data;
- set = lookup_header_set(root, namespaces);
+ set = lookup_header_set(root);
dir = xlate_dir(set, (*phead)->parent);
if (IS_ERR(dir))
ret = PTR_ERR(dir);
atomic_t count;
struct ns_common ns;
struct user_namespace *user_ns;
+ struct ucounts *ucounts;
struct css_set *root_cset;
};
/* user_ns which owns the ipc ns */
struct user_namespace *user_ns;
+ struct ucounts *ucounts;
struct ns_common ns;
};
struct fs_pin *bacct;
#endif
struct user_namespace *user_ns;
+ struct ucounts *ucounts;
struct work_struct proc_work;
kgid_t pid_gid;
int hide_pid;
struct ctl_table_root {
struct ctl_table_set default_set;
- struct ctl_table_set *(*lookup)(struct ctl_table_root *root,
- struct nsproxy *namespaces);
+ struct ctl_table_set *(*lookup)(struct ctl_table_root *root);
int (*permissions)(struct ctl_table_header *head, struct ctl_table *table);
};
#define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
+struct ucounts;
+
+/*
+ * Kinds of objects that are counted per user per user namespace.
+ * Each value indexes ucount_max[] in struct user_namespace and
+ * ucount[] in struct ucounts.  UCOUNT_COUNTS is not a kind of object;
+ * it is the number of kinds and is used to size those arrays.
+ */
+enum ucount_type {
+ UCOUNT_USER_NAMESPACES,
+ UCOUNT_PID_NAMESPACES,
+ UCOUNT_UTS_NAMESPACES,
+ UCOUNT_IPC_NAMESPACES,
+ UCOUNT_NET_NAMESPACES,
+ UCOUNT_MNT_NAMESPACES,
+ UCOUNT_CGROUP_NAMESPACES,
+ UCOUNT_COUNTS,
+};
+
struct user_namespace {
struct uid_gid_map uid_map;
struct uid_gid_map gid_map;
struct key *persistent_keyring_register;
struct rw_semaphore persistent_keyring_register_sem;
#endif
+ struct work_struct work;
+#ifdef CONFIG_SYSCTL
+ struct ctl_table_set set;
+ struct ctl_table_header *sysctls;
+#endif
+ struct ucounts *ucounts;
+ int ucount_max[UCOUNT_COUNTS];
+};
+
+/*
+ * Accounting record for one (user namespace, uid) pair.  Records are
+ * looked up by that pair and carry one counter per enum ucount_type.
+ */
+struct ucounts {
+ struct hlist_node node; /* link in the ucounts hash bucket */
+ struct user_namespace *ns; /* namespace half of the lookup key */
+ kuid_t uid; /* uid half of the lookup key */
+ atomic_t count; /* references held on this record */
+ atomic_t ucount[UCOUNT_COUNTS]; /* objects charged, per type */
};
extern struct user_namespace init_user_ns;
+bool setup_userns_sysctls(struct user_namespace *ns);
+void retire_userns_sysctls(struct user_namespace *ns);
+struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
+void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
+
#ifdef CONFIG_USER_NS
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
extern int create_user_ns(struct cred *new);
extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred);
-extern void free_user_ns(struct user_namespace *ns);
+extern void __put_user_ns(struct user_namespace *ns);
static inline void put_user_ns(struct user_namespace *ns)
{
if (ns && atomic_dec_and_test(&ns->count))
- free_user_ns(ns);
+ __put_user_ns(ns);
}
struct seq_operations;
struct kref kref;
struct new_utsname name;
struct user_namespace *user_ns;
+ struct ucounts *ucounts;
struct ns_common ns;
};
extern struct uts_namespace init_uts_ns;
struct list_head exit_list; /* Use only net_mutex */
struct user_namespace *user_ns; /* Owning user namespace */
+ struct ucounts *ucounts;
spinlock_t nsid_lock;
struct idr netns_ids;
#include "util.h"
+/*
+ * Charge one ipc namespace to the caller's effective uid in @ns.
+ * Returns the ucounts to hand to dec_ipc_namespaces() later, or NULL
+ * when inc_ucount() refuses the charge.
+ */
+static struct ucounts *inc_ipc_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_IPC_NAMESPACES);
+}
+
+/* Undo a charge obtained from inc_ipc_namespaces(). */
+static void dec_ipc_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_IPC_NAMESPACES);
+}
+
static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
struct ipc_namespace *old_ns)
{
struct ipc_namespace *ns;
+ struct ucounts *ucounts;
int err;
+ err = -ENOSPC;
+ ucounts = inc_ipc_namespaces(user_ns);
+ if (!ucounts)
+ goto fail;
+
+ err = -ENOMEM;
ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL);
if (ns == NULL)
- return ERR_PTR(-ENOMEM);
+ goto fail_dec;
err = ns_alloc_inum(&ns->ns);
- if (err) {
- kfree(ns);
- return ERR_PTR(err);
- }
+ if (err)
+ goto fail_free;
ns->ns.ops = &ipcns_operations;
atomic_set(&ns->count, 1);
ns->user_ns = get_user_ns(user_ns);
+ ns->ucounts = ucounts;
err = mq_init_ns(ns);
- if (err) {
- put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
- kfree(ns);
- return ERR_PTR(err);
- }
+ if (err)
+ goto fail_put;
sem_init_ns(ns);
msg_init_ns(ns);
shm_init_ns(ns);
return ns;
+
+fail_put:
+ put_user_ns(ns->user_ns);
+ ns_free_inum(&ns->ns);
+fail_free:
+ kfree(ns);
+fail_dec:
+ dec_ipc_namespaces(ucounts);
+fail:
+ return ERR_PTR(err);
}
struct ipc_namespace *copy_ipcs(unsigned long flags,
msg_exit_ns(ns);
shm_exit_ns(ns);
+ dec_ipc_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
kfree(ns);
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o smpboot.o
+ async.o range.o smpboot.o ucount.o
obj-$(CONFIG_MULTIUSER) += groups.o
/* cgroup namespaces */
+/*
+ * Charge one cgroup namespace to the caller's effective uid in @ns.
+ * Returns the ucounts to hand to dec_cgroup_namespaces() later, or
+ * NULL when inc_ucount() refuses the charge.
+ */
+static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
+}
+
+/* Undo a charge obtained from inc_cgroup_namespaces(). */
+static void dec_cgroup_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
+}
+
static struct cgroup_namespace *alloc_cgroup_ns(void)
{
struct cgroup_namespace *new_ns;
void free_cgroup_ns(struct cgroup_namespace *ns)
{
put_css_set(ns->root_cset);
+ dec_cgroup_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
kfree(ns);
struct cgroup_namespace *old_ns)
{
struct cgroup_namespace *new_ns;
+ struct ucounts *ucounts;
struct css_set *cset;
BUG_ON(!old_ns);
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
+ ucounts = inc_cgroup_namespaces(user_ns);
+ if (!ucounts)
+ return ERR_PTR(-ENOSPC);
+
/* It is not safe to take cgroup_mutex here */
spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
new_ns = alloc_cgroup_ns();
if (IS_ERR(new_ns)) {
put_css_set(cset);
+ dec_cgroup_namespaces(ucounts);
return new_ns;
}
new_ns->user_ns = get_user_ns(user_ns);
+ new_ns->ucounts = ucounts;
new_ns->root_cset = cset;
return new_ns;
void __init fork_init(void)
{
+ int i;
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
init_task.signal->rlim[RLIMIT_SIGPENDING] =
init_task.signal->rlim[RLIMIT_NPROC];
+
+ for (i = 0; i < UCOUNT_COUNTS; i++) {
+ init_user_ns.ucount_max[i] = max_threads/2;
+ }
}
int __weak arch_dup_task_struct(struct task_struct *dst,
/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32
+/*
+ * Charge one pid namespace to the caller's effective uid in @ns.
+ * Returns the ucounts to hand to dec_pid_namespaces() later, or NULL
+ * when inc_ucount() refuses the charge.
+ */
+static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
+}
+
+/* Undo a charge obtained from inc_pid_namespaces(). */
+static void dec_pid_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
+}
+
static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
struct pid_namespace *parent_pid_ns)
{
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
+ struct ucounts *ucounts;
int i;
int err;
- if (level > MAX_PID_NS_LEVEL) {
- err = -EINVAL;
+ err = -ENOSPC;
+ if (level > MAX_PID_NS_LEVEL)
+ goto out;
+ ucounts = inc_pid_namespaces(user_ns);
+ if (!ucounts)
goto out;
- }
err = -ENOMEM;
ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
if (ns == NULL)
- goto out;
+ goto out_dec;
ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
if (!ns->pidmap[0].page)
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
+ ns->ucounts = ucounts;
ns->nr_hashed = PIDNS_HASH_ADDING;
INIT_WORK(&ns->proc_work, proc_cleanup_work);
kfree(ns->pidmap[0].page);
out_free:
kmem_cache_free(pid_ns_cachep, ns);
+out_dec:
+ dec_pid_namespaces(ucounts);
out:
return ERR_PTR(err);
}
ns_free_inum(&ns->ns);
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
+ dec_pid_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
call_rcu(&ns->rcu, delayed_free_pidns);
}
--- /dev/null
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/stat.h>
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/user_namespace.h>
+
+#define UCOUNTS_HASHTABLE_BITS 10
+static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
+static DEFINE_SPINLOCK(ucounts_lock);
+
+#define ucounts_hashfn(ns, uid) \
+ hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \
+ UCOUNTS_HASHTABLE_BITS)
+#define ucounts_hashentry(ns, uid) \
+ (ucounts_hashtable + ucounts_hashfn(ns, uid))
+
+
+#ifdef CONFIG_SYSCTL
+/*
+ * ->lookup hook of set_root: pick the sysctl table set embedded in the
+ * calling task's user namespace.  @root is not consulted.
+ */
+static struct ctl_table_set *
+set_lookup(struct ctl_table_root *root)
+{
+	return &current_user_ns()->set;
+}
+
+/*
+ * Visibility callback handed to setup_sysctl_set(): returns non-zero
+ * only when @set is the one embedded in the calling task's user
+ * namespace.
+ */
+static int set_is_seen(struct ctl_table_set *set)
+{
+	return &current_user_ns()->set == set;
+}
+
+/*
+ * ->permissions hook of set_root.
+ *
+ * Callers holding CAP_SYS_RESOURCE over the user namespace that owns
+ * the table get the table's owner permission bits; everybody else gets
+ * at most the "other" read bit.  The resulting rwx triplet is
+ * replicated into the user, group and other positions of the returned
+ * mode.
+ */
+static int set_permissions(struct ctl_table_header *head,
+			   struct ctl_table *table)
+{
+	struct user_namespace *owner_ns;
+	int bits;
+
+	owner_ns = container_of(head->set, struct user_namespace, set);
+
+	bits = ns_capable(owner_ns, CAP_SYS_RESOURCE) ?
+		(table->mode & S_IRWXU) >> 6 :
+		table->mode & S_IROTH;
+
+	return (bits << 6) | (bits << 3) | bits;
+}
+
+/*
+ * Sysctl root used for the per user namespace table sets: lookups
+ * resolve through set_lookup() and access modes are computed by
+ * set_permissions().
+ */
+static struct ctl_table_root set_root = {
+ .lookup = set_lookup,
+ .permissions = set_permissions,
+};
+
+/* Bounds passed to proc_dointvec_minmax through extra1/extra2. */
+static int zero = 0;
+static int int_max = INT_MAX;
+/*
+ * Template for one limit file: an int, mode 0644, accepted range
+ * [0, INT_MAX].  The .data pointer is left unset here; it is filled in
+ * per namespace by setup_userns_sysctls().
+ */
+#define UCOUNT_ENTRY(name) \
+ { \
+ .procname = name, \
+ .maxlen = sizeof(int), \
+ .mode = 0644, \
+ .proc_handler = proc_dointvec_minmax, \
+ .extra1 = &zero, \
+ .extra2 = &int_max, \
+ }
+/*
+ * Template table duplicated for every user namespace.  Entry i is
+ * bound to ucount_max[i], so the entries must stay in the same order
+ * as enum ucount_type.
+ */
+static struct ctl_table user_table[] = {
+ UCOUNT_ENTRY("max_user_namespaces"),
+ UCOUNT_ENTRY("max_pid_namespaces"),
+ UCOUNT_ENTRY("max_uts_namespaces"),
+ UCOUNT_ENTRY("max_ipc_namespaces"),
+ UCOUNT_ENTRY("max_net_namespaces"),
+ UCOUNT_ENTRY("max_mnt_namespaces"),
+ UCOUNT_ENTRY("max_cgroup_namespaces"),
+ { }
+};
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * Create the "user" sysctl directory for @ns.
+ *
+ * A private copy of user_table is made and each entry is pointed at the
+ * matching slot of ns->ucount_max[] before the copy is registered in
+ * the namespace's own table set.
+ *
+ * Returns true on success, and unconditionally when CONFIG_SYSCTL is
+ * not set.  Returns false, with the table set retired again, when the
+ * copy could not be allocated or registered.
+ */
+bool setup_userns_sysctls(struct user_namespace *ns)
+{
+#ifdef CONFIG_SYSCTL
+	struct ctl_table *copy;
+	int idx;
+
+	setup_sysctl_set(&ns->set, &set_root, set_is_seen);
+
+	copy = kmemdup(user_table, sizeof(user_table), GFP_KERNEL);
+	if (copy) {
+		for (idx = 0; idx < UCOUNT_COUNTS; idx++)
+			copy[idx].data = &ns->ucount_max[idx];
+		ns->sysctls = __register_sysctl_table(&ns->set, "user", copy);
+	}
+
+	if (!ns->sysctls) {
+		/* Covers both a failed kmemdup() and a failed registration. */
+		kfree(copy);
+		retire_sysctl_set(&ns->set);
+		return false;
+	}
+#endif
+	return true;
+}
+
+/*
+ * Tear down what setup_userns_sysctls() created for @ns: unregister
+ * the table, retire the table set and free the duplicated table.
+ * Does nothing when CONFIG_SYSCTL is not set.
+ */
+void retire_userns_sysctls(struct user_namespace *ns)
+{
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl;
+
+ /*
+ * Fetch the table pointer before unregistering.
+ * NOTE(review): presumably the header must not be dereferenced
+ * once unregister_sysctl_table() has run - confirm.
+ */
+ tbl = ns->sysctls->ctl_table_arg;
+ unregister_sysctl_table(ns->sysctls);
+ retire_sysctl_set(&ns->set);
+ kfree(tbl);
+#endif
+}
+
+/*
+ * Walk the hash bucket @hashent and return the entry recorded for the
+ * pair (@ns, @uid), or NULL when the bucket holds no such entry.  No
+ * reference is taken on the returned entry.
+ */
+static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent)
+{
+	struct ucounts *pos;
+
+	hlist_for_each_entry(pos, hashent, node) {
+		if (pos->ns != ns)
+			continue;
+		if (!uid_eq(pos->uid, uid))
+			continue;
+		return pos;
+	}
+	return NULL;
+}
+
+/*
+ * Find the ucounts entry for (@ns, @uid), creating and hashing it if
+ * it does not exist yet, and take a reference on it.
+ *
+ * Returns the entry, or NULL if a new entry could not be allocated or
+ * if the reference count already sits at INT_MAX.
+ */
+static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
+{
+ struct hlist_head *hashent = ucounts_hashentry(ns, uid);
+ struct ucounts *ucounts, *new;
+
+ spin_lock(&ucounts_lock);
+ ucounts = find_ucounts(ns, uid, hashent);
+ if (!ucounts) {
+ /* Not hashed yet: allocate a candidate with the lock dropped. */
+ spin_unlock(&ucounts_lock);
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ new->ns = ns;
+ new->uid = uid;
+ atomic_set(&new->count, 0);
+
+ /*
+ * The lock was released above, so look again: the same entry
+ * may have been inserted in the meantime, in which case the
+ * candidate is discarded.
+ */
+ spin_lock(&ucounts_lock);
+ ucounts = find_ucounts(ns, uid, hashent);
+ if (ucounts) {
+ kfree(new);
+ } else {
+ hlist_add_head(&new->node, hashent);
+ ucounts = new;
+ }
+ }
+ /* Take the reference while still holding the lock; refuse at INT_MAX. */
+ if (!atomic_add_unless(&ucounts->count, 1, INT_MAX))
+ ucounts = NULL;
+ spin_unlock(&ucounts_lock);
+ return ucounts;
+}
+
+/*
+ * Drop one reference on @ucounts; when it was the last one, unhash the
+ * entry and free it.
+ *
+ * The final decrement has to happen with ucounts_lock held.
+ * get_ucounts() finds entries and takes its reference under that lock,
+ * so letting the count reach zero outside of it would allow a
+ * concurrent get_ucounts() to pick up the still-hashed entry right
+ * before it is freed.  atomic_dec_and_lock() returns true only with
+ * the count at zero and the lock held.
+ */
+static void put_ucounts(struct ucounts *ucounts)
+{
+	if (!atomic_dec_and_lock(&ucounts->count, &ucounts_lock))
+		return;
+
+	hlist_del_init(&ucounts->node);
+	spin_unlock(&ucounts_lock);
+
+	kfree(ucounts);
+}
+
+/*
+ * Atomically increment @v, but only while its value is below @u.
+ *
+ * Returns true if the increment was performed, false if the value was
+ * observed at or above @u.  A lost cmpxchg race simply retries with the
+ * value that was found instead.
+ */
+static inline bool atomic_inc_below(atomic_t *v, int u)
+{
+	int cur = atomic_read(v);
+
+	while (likely(cur < u)) {
+		int seen = atomic_cmpxchg(v, cur, cur + 1);
+
+		if (likely(seen == cur))
+			return true;
+		cur = seen;
+	}
+	return false;
+}
+
+/*
+ * Charge one object of kind @type to @uid in @ns.
+ *
+ * The charge is applied to the ucounts of (@ns, @uid) and then, by
+ * following ns->ucounts, to the ucounts recorded in each enclosing
+ * user namespace in turn.  At every level the counter must stay below
+ * that namespace's ucount_max[@type].
+ *
+ * Returns the ucounts of (@ns, @uid) with a reference held; release
+ * the charge with dec_ucount().  Returns NULL when get_ucounts() fails
+ * or when any level is at its limit; in the latter case all increments
+ * already made are undone and the reference is dropped.
+ */
+struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
+ enum ucount_type type)
+{
+ struct ucounts *ucounts, *iter, *bad;
+ struct user_namespace *tns;
+ ucounts = get_ucounts(ns, uid);
+ /* A NULL ucounts skips the loop and is returned as is. */
+ for (iter = ucounts; iter; iter = tns->ucounts) {
+ int max;
+ tns = iter->ns;
+ max = READ_ONCE(tns->ucount_max[type]);
+ if (!atomic_inc_below(&iter->ucount[type], max))
+ goto fail;
+ }
+ return ucounts;
+fail:
+ /* Roll back every level that was incremented before the failing one. */
+ bad = iter;
+ for (iter = ucounts; iter != bad; iter = iter->ns->ucounts)
+ atomic_dec(&iter->ucount[type]);
+
+ put_ucounts(ucounts);
+ return NULL;
+}
+
+/*
+ * Release a charge of kind @type taken with inc_ucount().
+ *
+ * The counter is decremented on @ucounts and on every ucounts reached
+ * by following ns->ucounts upwards, warning once if a counter would
+ * have gone negative.  Finally the reference held on @ucounts is
+ * dropped.
+ */
+void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
+{
+	struct ucounts *level = ucounts;
+
+	while (level) {
+		int remaining = atomic_dec_if_positive(&level->ucount[type]);
+
+		WARN_ON_ONCE(remaining < 0);
+		level = level->ns->ucounts;
+	}
+	put_ucounts(ucounts);
+}
+
+/*
+ * Boot-time setup of the "user" sysctl directory: register an empty
+ * "user" table in the default set, then create the limit files for
+ * init_user_ns.  Either step failing is treated as fatal (BUG_ON).
+ * Always returns 0; does nothing when CONFIG_SYSCTL is not set.
+ */
+static __init int user_namespace_sysctl_init(void)
+{
+#ifdef CONFIG_SYSCTL
+ static struct ctl_table_header *user_header;
+ static struct ctl_table empty[1];
+ /*
+ * It is necessary to register the user directory in the
+ * default set so that registrations in the child sets work
+ * properly.
+ */
+ user_header = register_sysctl("user", empty);
+ BUG_ON(!user_header);
+ BUG_ON(!setup_userns_sysctls(&init_user_ns));
+#endif
+ return 0;
+}
+subsys_initcall(user_namespace_sysctl_init);
+
+
static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
struct uid_gid_map *map);
+static void free_user_ns(struct work_struct *work);
+
+/*
+ * Charge one user namespace to @uid in @ns.  Returns the ucounts to
+ * hand to dec_user_namespaces() later, or NULL when inc_ucount()
+ * refuses the charge.
+ */
+static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid)
+{
+ return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES);
+}
+
+/* Undo a charge obtained from inc_user_namespaces(). */
+static void dec_user_namespaces(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_USER_NAMESPACES);
+}
static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
{
struct user_namespace *ns, *parent_ns = new->user_ns;
kuid_t owner = new->euid;
kgid_t group = new->egid;
- int ret;
+ struct ucounts *ucounts;
+ int ret, i;
+ ret = -ENOSPC;
if (parent_ns->level > 32)
- return -EUSERS;
+ goto fail;
+
+ ucounts = inc_user_namespaces(parent_ns, owner);
+ if (!ucounts)
+ goto fail;
/*
* Verify that we can not violate the policy of which files
* by verifing that the root directory is at the root of the
* mount namespace which allows all files to be accessed.
*/
+ ret = -EPERM;
if (current_chrooted())
- return -EPERM;
+ goto fail_dec;
/* The creator needs a mapping in the parent user namespace
* or else we won't be able to reasonably tell userspace who
* created a user_namespace.
*/
+ ret = -EPERM;
if (!kuid_has_mapping(parent_ns, owner) ||
!kgid_has_mapping(parent_ns, group))
- return -EPERM;
+ goto fail_dec;
+ ret = -ENOMEM;
ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
if (!ns)
- return -ENOMEM;
+ goto fail_dec;
ret = ns_alloc_inum(&ns->ns);
- if (ret) {
- kmem_cache_free(user_ns_cachep, ns);
- return ret;
- }
+ if (ret)
+ goto fail_free;
ns->ns.ops = &userns_operations;
atomic_set(&ns->count, 1);
ns->level = parent_ns->level + 1;
ns->owner = owner;
ns->group = group;
+ INIT_WORK(&ns->work, free_user_ns);
+ for (i = 0; i < UCOUNT_COUNTS; i++) {
+ ns->ucount_max[i] = INT_MAX;
+ }
+ ns->ucounts = ucounts;
/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
mutex_lock(&userns_state_mutex);
ns->flags = parent_ns->flags;
mutex_unlock(&userns_state_mutex);
- set_cred_user_ns(new, ns);
-
#ifdef CONFIG_PERSISTENT_KEYRINGS
init_rwsem(&ns->persistent_keyring_register_sem);
#endif
+ ret = -ENOMEM;
+ if (!setup_userns_sysctls(ns))
+ goto fail_keyring;
+
+ set_cred_user_ns(new, ns);
return 0;
+fail_keyring:
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+ key_put(ns->persistent_keyring_register);
+#endif
+ ns_free_inum(&ns->ns);
+fail_free:
+ kmem_cache_free(user_ns_cachep, ns);
+fail_dec:
+ dec_user_namespaces(ucounts);
+fail:
+ return ret;
}
int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
return err;
}
-void free_user_ns(struct user_namespace *ns)
+static void free_user_ns(struct work_struct *work)
{
- struct user_namespace *parent;
+ struct user_namespace *parent, *ns =
+ container_of(work, struct user_namespace, work);
do {
+ struct ucounts *ucounts = ns->ucounts;
parent = ns->parent;
+ retire_userns_sysctls(ns);
#ifdef CONFIG_PERSISTENT_KEYRINGS
key_put(ns->persistent_keyring_register);
#endif
ns_free_inum(&ns->ns);
kmem_cache_free(user_ns_cachep, ns);
+ dec_user_namespaces(ucounts);
ns = parent;
} while (atomic_dec_and_test(&parent->count));
}
-EXPORT_SYMBOL(free_user_ns);
+
+/*
+ * Called by put_user_ns() once the reference count of @ns has dropped
+ * to zero.  The namespace is not torn down here; the work item set up
+ * in create_user_ns() (free_user_ns) is scheduled to do it.
+ */
+void __put_user_ns(struct user_namespace *ns)
+{
+ schedule_work(&ns->work);
+}
+EXPORT_SYMBOL(__put_user_ns);
static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
{
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
+/*
+ * Charge one uts namespace to the caller's effective uid in @ns.
+ * Returns the ucounts to hand to dec_uts_namespaces() later, or NULL
+ * when inc_ucount() refuses the charge.
+ */
+static struct ucounts *inc_uts_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES);
+}
+
+/* Undo a charge obtained from inc_uts_namespaces(). */
+static void dec_uts_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES);
+}
+
static struct uts_namespace *create_uts_ns(void)
{
struct uts_namespace *uts_ns;
struct uts_namespace *old_ns)
{
struct uts_namespace *ns;
+ struct ucounts *ucounts;
int err;
+ err = -ENOSPC;
+ ucounts = inc_uts_namespaces(user_ns);
+ if (!ucounts)
+ goto fail;
+
+ err = -ENOMEM;
ns = create_uts_ns();
if (!ns)
- return ERR_PTR(-ENOMEM);
+ goto fail_dec;
err = ns_alloc_inum(&ns->ns);
- if (err) {
- kfree(ns);
- return ERR_PTR(err);
- }
+ if (err)
+ goto fail_free;
+ ns->ucounts = ucounts;
ns->ns.ops = &utsns_operations;
down_read(&uts_sem);
ns->user_ns = get_user_ns(user_ns);
up_read(&uts_sem);
return ns;
+
+fail_free:
+ kfree(ns);
+fail_dec:
+ dec_uts_namespaces(ucounts);
+fail:
+ return ERR_PTR(err);
}
/*
struct uts_namespace *ns;
ns = container_of(kref, struct uts_namespace, kref);
+ dec_uts_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
kfree(ns);
return peer;
}
+/*
+ * Charge one network namespace to the caller's effective uid in @ns.
+ * Returns the ucounts to hand to dec_net_namespaces() later, or NULL
+ * when inc_ucount() refuses the charge.
+ */
+static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES);
+}
+
+/* Undo a charge obtained from inc_net_namespaces(). */
+static void dec_net_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
+}
+
/*
* setup_net runs the initializers for the network namespace object.
*/
struct net *copy_net_ns(unsigned long flags,
struct user_namespace *user_ns, struct net *old_net)
{
+ struct ucounts *ucounts;
struct net *net;
int rv;
if (!(flags & CLONE_NEWNET))
return get_net(old_net);
+ ucounts = inc_net_namespaces(user_ns);
+ if (!ucounts)
+ return ERR_PTR(-ENOSPC);
+
net = net_alloc();
- if (!net)
+ if (!net) {
+ dec_net_namespaces(ucounts);
return ERR_PTR(-ENOMEM);
+ }
get_user_ns(user_ns);
mutex_lock(&net_mutex);
+ net->ucounts = ucounts;
rv = setup_net(net, user_ns);
if (rv == 0) {
rtnl_lock();
}
mutex_unlock(&net_mutex);
if (rv < 0) {
+ dec_net_namespaces(ucounts);
put_user_ns(user_ns);
net_drop_ns(net);
return ERR_PTR(rv);
/* Finally it is safe to free my network namespace structure */
list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
list_del_init(&net->exit_list);
+ dec_net_namespaces(net->ucounts);
put_user_ns(net->user_ns);
net_drop_ns(net);
}
#endif
static struct ctl_table_set *
-net_ctl_header_lookup(struct ctl_table_root *root, struct nsproxy *namespaces)
+net_ctl_header_lookup(struct ctl_table_root *root)
{
- return &namespaces->net_ns->sysctls;
+ return ¤t->nsproxy->net_ns->sysctls;
}
static int is_seen(struct ctl_table_set *set)