Merge branch 'nsfs-ioctls' into HEAD

author Eric W. Biederman <ebiederm@xmission.com>

Fri, 23 Sep 2016 01:00:36 +0000 (20:00 -0500)

committer Eric W. Biederman <ebiederm@xmission.com>

Fri, 23 Sep 2016 01:00:36 +0000 (20:00 -0500)
author Eric W. Biederman <ebiederm@xmission.com>
Fri, 23 Sep 2016 01:00:36 +0000 (20:00 -0500)
committer Eric W. Biederman <ebiederm@xmission.com>
Fri, 23 Sep 2016 01:00:36 +0000 (20:00 -0500)
diff --git a/Documentation/sysctl/README b/Documentation/sysctl/README

index 8c3306e..91f54ff 100644 (file)
--- a/Documentation/sysctl/README
+++ b/Documentation/sysctl/README
@@ -69,6 +69,7 @@ proc/         <empty>
  sunrpc/                SUN Remote Procedure Call (NFS)
  vm/            memory management tuning
                 buffer and cache management
+user/          Per user per user namespace limits
  
  These are the subdirs I have on my system. There might be more
  or other subdirs in another setup. If you see another dir, I'd
diff --git a/Documentation/sysctl/user.txt b/Documentation/sysctl/user.txt

new file mode 100644 (file)

index 0000000..1291c49
--- /dev/null
+++ b/Documentation/sysctl/user.txt
@@ -0,0 +1,66 @@
+Documentation for /proc/sys/user/*     kernel version 4.9.0
+       (c) 2016                Eric Biederman <ebiederm@xmission.com>
+
+==============================================================
+
+This file contains the documentation for the sysctl files in
+/proc/sys/user.
+
+The files in this directory can be used to override the default
+limits on the number of namespaces and other objects that have
+per user per user namespace limits.
+
+The primary purpose of these limits is to stop programs that
+malfunction and attempt to create a ridiculous number of objects,
+before the malfunction becomes a system wide problem.  It is the
+intention that the defaults of these limits are set high enough that
+no program in normal operation should run into these limits.
+
+The creation of per user per user namespace objects is charged to
+the user in the user namespace who created the object and
+verified to be below the per user limit in that user namespace.
+
+The creation of objects is also charged to all of the users
+who created the user namespaces in which the object is being
+created (user namespaces can be nested) and verified to be below the
+per user limits in the user namespaces of those users.
+
+This recursive counting of created objects ensures that creating a
+user namespace does not allow a user to escape their current limits.
+
+Currently, these files are in /proc/sys/user:
+
+- max_cgroup_namespaces
+
+  The maximum number of cgroup namespaces that any user in the current
+  user namespace may create.
+
+- max_ipc_namespaces
+
+  The maximum number of ipc namespaces that any user in the current
+  user namespace may create.
+
+- max_mnt_namespaces
+
+  The maximum number of mount namespaces that any user in the current
+  user namespace may create.
+
+- max_net_namespaces
+
+  The maximum number of network namespaces that any user in the
+  current user namespace may create.
+
+- max_pid_namespaces
+
+  The maximum number of pid namespaces that any user in the current
+  user namespace may create.
+
+- max_user_namespaces
+
+  The maximum number of user namespaces that any user in the current
+  user namespace may create.
+
+- max_uts_namespaces
+
+  The maximum number of uts namespaces that any user in the current
+  user namespace may create.
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c

index d116453..154cc45 100644 (file)
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -272,13 +272,8 @@ static int mknod_ptmx(struct super_block *sb)
         struct dentry *root = sb->s_root;
         struct pts_fs_info *fsi = DEVPTS_SB(sb);
         struct pts_mount_opts *opts = &fsi->mount_opts;
-       kuid_t root_uid;
-       kgid_t root_gid;
-
-       root_uid = make_kuid(current_user_ns(), 0);
-       root_gid = make_kgid(current_user_ns(), 0);
-       if (!uid_valid(root_uid) || !gid_valid(root_gid))
-               return -EINVAL;
+       kuid_t ptmx_uid = current_fsuid();
+       kgid_t ptmx_gid = current_fsgid();
  
         inode_lock(d_inode(root));
  
@@ -309,8 +304,8 @@ static int mknod_ptmx(struct super_block *sb)
  
         mode = S_IFCHR|opts->ptmxmode;
         init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
-       inode->i_uid = root_uid;
-       inode->i_gid = root_gid;
+       inode->i_uid = ptmx_uid;
+       inode->i_gid = ptmx_gid;
  
         d_add(dentry, inode);
  
@@ -336,7 +331,6 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
         struct pts_fs_info *fsi = DEVPTS_SB(sb);
         struct pts_mount_opts *opts = &fsi->mount_opts;
  
-       sync_filesystem(sb);
         err = parse_mount_options(data, PARSE_REMOUNT, opts);
  
         /*
@@ -395,6 +389,7 @@ static int
  devpts_fill_super(struct super_block *s, void *data, int silent)
  {
         struct inode *inode;
+       int error;
  
         s->s_iflags &= ~SB_I_NODEV;
         s->s_blocksize = 1024;
@@ -403,10 +398,16 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
         s->s_op = &devpts_sops;
         s->s_time_gran = 1;
  
+       error = -ENOMEM;
         s->s_fs_info = new_pts_fs_info(s);
         if (!s->s_fs_info)
                 goto fail;
  
+       error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts);
+       if (error)
+               goto fail;
+
+       error = -ENOMEM;
         inode = new_inode(s);
         if (!inode)
                 goto fail;
@@ -418,13 +419,21 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
         set_nlink(inode, 2);
  
         s->s_root = d_make_root(inode);
-       if (s->s_root)
-               return 0;
+       if (!s->s_root) {
+               pr_err("get root dentry failed\n");
+               goto fail;
+       }
  
-       pr_err("get root dentry failed\n");
+       error = mknod_ptmx(s);
+       if (error)
+               goto fail_dput;
  
+       return 0;
+fail_dput:
+       dput(s->s_root);
+       s->s_root = NULL;
  fail:
-       return -ENOMEM;
+       return error;
  }
  
  /*
@@ -436,43 +445,15 @@ fail:
  static struct dentry *devpts_mount(struct file_system_type *fs_type,
         int flags, const char *dev_name, void *data)
  {
-       int error;
-       struct pts_mount_opts opts;
-       struct super_block *s;
-
-       error = parse_mount_options(data, PARSE_MOUNT, &opts);
-       if (error)
-               return ERR_PTR(error);
-
-       s = sget(fs_type, NULL, set_anon_super, flags, NULL);
-       if (IS_ERR(s))
-               return ERR_CAST(s);
-
-       if (!s->s_root) {
-               error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
-               if (error)
-                       goto out_undo_sget;
-               s->s_flags |= MS_ACTIVE;
-       }
-
-       memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
-
-       error = mknod_ptmx(s);
-       if (error)
-               goto out_undo_sget;
-
-       return dget(s->s_root);
-
-out_undo_sget:
-       deactivate_locked_super(s);
-       return ERR_PTR(error);
+       return mount_nodev(fs_type, flags, data, devpts_fill_super);
  }
  
  static void devpts_kill_sb(struct super_block *sb)
  {
         struct pts_fs_info *fsi = DEVPTS_SB(sb);
  
-       ida_destroy(&fsi->allocated_ptys);
+       if (fsi)
+               ida_destroy(&fsi->allocated_ptys);
         kfree(fsi);
         kill_litter_super(sb);
  }
diff --git a/fs/mount.h b/fs/mount.h

index 14db05d..e037981 100644 (file)
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -10,6 +10,7 @@ struct mnt_namespace {
         struct mount *  root;
         struct list_head        list;
         struct user_namespace   *user_ns;
+       struct ucounts          *ucounts;
         u64                     seq;    /* Sequence number to prevent loops */
         wait_queue_head_t poll;
         u64 event;
diff --git a/fs/namespace.c b/fs/namespace.c

index fea56f3..8a0e90e 100644 (file)
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2719,9 +2719,20 @@ dput_out:
         return retval;
  }
  
+static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
+{
+       return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
+}
+
+static void dec_mnt_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
+}
+
  static void free_mnt_ns(struct mnt_namespace *ns)
  {
         ns_free_inum(&ns->ns);
+       dec_mnt_namespaces(ns->ucounts);
         put_user_ns(ns->user_ns);
         kfree(ns);
  }
@@ -2738,14 +2749,22 @@ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
  static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
  {
         struct mnt_namespace *new_ns;
+       struct ucounts *ucounts;
         int ret;
  
+       ucounts = inc_mnt_namespaces(user_ns);
+       if (!ucounts)
+               return ERR_PTR(-ENOSPC);
+
         new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
-       if (!new_ns)
+       if (!new_ns) {
+               dec_mnt_namespaces(ucounts);
                 return ERR_PTR(-ENOMEM);
+       }
         ret = ns_alloc_inum(&new_ns->ns);
         if (ret) {
                 kfree(new_ns);
+               dec_mnt_namespaces(ucounts);
                 return ERR_PTR(ret);
         }
         new_ns->ns.ops = &mntns_operations;
@@ -2756,6 +2775,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
         init_waitqueue_head(&new_ns->poll);
         new_ns->event = 0;
         new_ns->user_ns = get_user_ns(user_ns);
+       new_ns->ucounts = ucounts;
         return new_ns;
  }
  
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c

index 1b93650..a80acdf 100644 (file)
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -72,7 +72,7 @@ static DEFINE_SPINLOCK(sysctl_lock);
  
  static void drop_sysctl_table(struct ctl_table_header *header);
  static int sysctl_follow_link(struct ctl_table_header **phead,
-       struct ctl_table **pentry, struct nsproxy *namespaces);
+       struct ctl_table **pentry);
  static int insert_links(struct ctl_table_header *head);
  static void put_links(struct ctl_table_header *header);
  
@@ -319,11 +319,11 @@ static void sysctl_head_finish(struct ctl_table_header *head)
  }
  
  static struct ctl_table_set *
-lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+lookup_header_set(struct ctl_table_root *root)
  {
         struct ctl_table_set *set = &root->default_set;
         if (root->lookup)
-               set = root->lookup(root, namespaces);
+               set = root->lookup(root);
         return set;
  }
  
@@ -491,7 +491,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
                 goto out;
  
         if (S_ISLNK(p->mode)) {
-               ret = sysctl_follow_link(&h, &p, current->nsproxy);
+               ret = sysctl_follow_link(&h, &p);
                 err = ERR_PTR(ret);
                 if (ret)
                         goto out;
@@ -659,7 +659,7 @@ static bool proc_sys_link_fill_cache(struct file *file,
  
         if (S_ISLNK(table->mode)) {
                 /* It is not an error if we can not follow the link ignore it */
-               int err = sysctl_follow_link(&head, &table, current->nsproxy);
+               int err = sysctl_follow_link(&head, &table);
                 if (err)
                         goto out;
         }
@@ -976,7 +976,7 @@ static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
  }
  
  static int sysctl_follow_link(struct ctl_table_header **phead,
-       struct ctl_table **pentry, struct nsproxy *namespaces)
+       struct ctl_table **pentry)
  {
         struct ctl_table_header *head;
         struct ctl_table_root *root;
@@ -988,7 +988,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead,
         ret = 0;
         spin_lock(&sysctl_lock);
         root = (*pentry)->data;
-       set = lookup_header_set(root, namespaces);
+       set = lookup_header_set(root);
         dir = xlate_dir(set, (*phead)->parent);
         if (IS_ERR(dir))
                 ret = PTR_ERR(dir);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h

index 984f73b..1ed9281 100644 (file)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -621,6 +621,7 @@ struct cgroup_namespace {
         atomic_t                count;
         struct ns_common        ns;
         struct user_namespace   *user_ns;
+       struct ucounts          *ucounts;
         struct css_set          *root_cset;
  };
  
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h

index d10e54f..848e579 100644 (file)
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -58,6 +58,7 @@ struct ipc_namespace {
  
         /* user_ns which owns the ipc ns */
         struct user_namespace *user_ns;
+       struct ucounts *ucounts;
  
         struct ns_common ns;
  };
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h

index 918b117..34cce96 100644 (file)
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -40,6 +40,7 @@ struct pid_namespace {
         struct fs_pin *bacct;
  #endif
         struct user_namespace *user_ns;
+       struct ucounts *ucounts;
         struct work_struct proc_work;
         kgid_t pid_gid;
         int hide_pid;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h

index 697e160..f166ca0 100644 (file)
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -155,8 +155,7 @@ struct ctl_table_set {
  
  struct ctl_table_root {
         struct ctl_table_set default_set;
-       struct ctl_table_set *(*lookup)(struct ctl_table_root *root,
-                                          struct nsproxy *namespaces);
+       struct ctl_table_set *(*lookup)(struct ctl_table_root *root);
         int (*permissions)(struct ctl_table_header *head, struct ctl_table *table);
  };
  
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h

index 190cf07..eb209d4 100644 (file)
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -22,6 +22,19 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */
  
  #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
  
+struct ucounts;
+
+enum ucount_type {
+       UCOUNT_USER_NAMESPACES,
+       UCOUNT_PID_NAMESPACES,
+       UCOUNT_UTS_NAMESPACES,
+       UCOUNT_IPC_NAMESPACES,
+       UCOUNT_NET_NAMESPACES,
+       UCOUNT_MNT_NAMESPACES,
+       UCOUNT_CGROUP_NAMESPACES,
+       UCOUNT_COUNTS,
+};
+
  struct user_namespace {
         struct uid_gid_map      uid_map;
         struct uid_gid_map      gid_map;
@@ -39,10 +52,30 @@ struct user_namespace {
         struct key              *persistent_keyring_register;
         struct rw_semaphore     persistent_keyring_register_sem;
  #endif
+       struct work_struct      work;
+#ifdef CONFIG_SYSCTL
+       struct ctl_table_set    set;
+       struct ctl_table_header *sysctls;
+#endif
+       struct ucounts          *ucounts;
+       int ucount_max[UCOUNT_COUNTS];
+};
+
+struct ucounts {
+       struct hlist_node node;
+       struct user_namespace *ns;
+       kuid_t uid;
+       atomic_t count;
+       atomic_t ucount[UCOUNT_COUNTS];
  };
  
  extern struct user_namespace init_user_ns;
  
+bool setup_userns_sysctls(struct user_namespace *ns);
+void retire_userns_sysctls(struct user_namespace *ns);
+struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
+void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
+
  #ifdef CONFIG_USER_NS
  
  static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
@@ -54,12 +87,12 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
  
  extern int create_user_ns(struct cred *new);
  extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred);
-extern void free_user_ns(struct user_namespace *ns);
+extern void __put_user_ns(struct user_namespace *ns);
  
  static inline void put_user_ns(struct user_namespace *ns)
  {
         if (ns && atomic_dec_and_test(&ns->count))
-               free_user_ns(ns);
+               __put_user_ns(ns);
  }
  
  struct seq_operations;
diff --git a/include/linux/utsname.h b/include/linux/utsname.h

index 5093f58..60f0bb8 100644 (file)
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -24,6 +24,7 @@ struct uts_namespace {
         struct kref kref;
         struct new_utsname name;
         struct user_namespace *user_ns;
+       struct ucounts *ucounts;
         struct ns_common ns;
  };
  extern struct uts_namespace init_uts_ns;
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h

index 0933c74..fc4f757 100644 (file)
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -60,6 +60,7 @@ struct net {
         struct list_head        exit_list;      /* Use only net_mutex */
  
         struct user_namespace   *user_ns;       /* Owning user namespace */
+       struct ucounts          *ucounts;
         spinlock_t              nsid_lock;
         struct idr              netns_ids;
  
diff --git a/ipc/namespace.c b/ipc/namespace.c

index 578d93b..0abdea4 100644 (file)
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -16,39 +16,61 @@
  
  #include "util.h"
  
+static struct ucounts *inc_ipc_namespaces(struct user_namespace *ns)
+{
+       return inc_ucount(ns, current_euid(), UCOUNT_IPC_NAMESPACES);
+}
+
+static void dec_ipc_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_IPC_NAMESPACES);
+}
+
  static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
                                            struct ipc_namespace *old_ns)
  {
         struct ipc_namespace *ns;
+       struct ucounts *ucounts;
         int err;
  
+       err = -ENOSPC;
+       ucounts = inc_ipc_namespaces(user_ns);
+       if (!ucounts)
+               goto fail;
+
+       err = -ENOMEM;
         ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL);
         if (ns == NULL)
-               return ERR_PTR(-ENOMEM);
+               goto fail_dec;
  
         err = ns_alloc_inum(&ns->ns);
-       if (err) {
-               kfree(ns);
-               return ERR_PTR(err);
-       }
+       if (err)
+               goto fail_free;
         ns->ns.ops = &ipcns_operations;
  
         atomic_set(&ns->count, 1);
         ns->user_ns = get_user_ns(user_ns);
+       ns->ucounts = ucounts;
  
         err = mq_init_ns(ns);
-       if (err) {
-               put_user_ns(ns->user_ns);
-               ns_free_inum(&ns->ns);
-               kfree(ns);
-               return ERR_PTR(err);
-       }
+       if (err)
+               goto fail_put;
  
         sem_init_ns(ns);
         msg_init_ns(ns);
         shm_init_ns(ns);
  
         return ns;
+
+fail_put:
+       put_user_ns(ns->user_ns);
+       ns_free_inum(&ns->ns);
+fail_free:
+       kfree(ns);
+fail_dec:
+       dec_ipc_namespaces(ucounts);
+fail:
+       return ERR_PTR(err);
  }
  
  struct ipc_namespace *copy_ipcs(unsigned long flags,
@@ -96,6 +118,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
         msg_exit_ns(ns);
         shm_exit_ns(ns);
  
+       dec_ipc_namespaces(ns->ucounts);
         put_user_ns(ns->user_ns);
         ns_free_inum(&ns->ns);
         kfree(ns);
diff --git a/kernel/Makefile b/kernel/Makefile

index e2ec54e..eb26e12 100644 (file)
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y     = fork.o exec_domain.o panic.o \
             extable.o params.o \
             kthread.o sys_ni.o nsproxy.o \
             notifier.o ksysfs.o cred.o reboot.o \
-           async.o range.o smpboot.o
+           async.o range.o smpboot.o ucount.o
  
  obj-$(CONFIG_MULTIUSER) += groups.o
  
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index 86b0e8b..d650433 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -6295,6 +6295,16 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
  
  /* cgroup namespaces */
  
+static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
+{
+       return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
+}
+
+static void dec_cgroup_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
+}
+
  static struct cgroup_namespace *alloc_cgroup_ns(void)
  {
         struct cgroup_namespace *new_ns;
@@ -6316,6 +6326,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
  void free_cgroup_ns(struct cgroup_namespace *ns)
  {
         put_css_set(ns->root_cset);
+       dec_cgroup_namespaces(ns->ucounts);
         put_user_ns(ns->user_ns);
         ns_free_inum(&ns->ns);
         kfree(ns);
@@ -6327,6 +6338,7 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
                                         struct cgroup_namespace *old_ns)
  {
         struct cgroup_namespace *new_ns;
+       struct ucounts *ucounts;
         struct css_set *cset;
  
         BUG_ON(!old_ns);
@@ -6340,6 +6352,10 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
         if (!ns_capable(user_ns, CAP_SYS_ADMIN))
                 return ERR_PTR(-EPERM);
  
+       ucounts = inc_cgroup_namespaces(user_ns);
+       if (!ucounts)
+               return ERR_PTR(-ENOSPC);
+
         /* It is not safe to take cgroup_mutex here */
         spin_lock_irq(&css_set_lock);
         cset = task_css_set(current);
@@ -6349,10 +6365,12 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
         new_ns = alloc_cgroup_ns();
         if (IS_ERR(new_ns)) {
                 put_css_set(cset);
+               dec_cgroup_namespaces(ucounts);
                 return new_ns;
         }
  
         new_ns->user_ns = get_user_ns(user_ns);
+       new_ns->ucounts = ucounts;
         new_ns->root_cset = cset;
  
         return new_ns;
diff --git a/kernel/fork.c b/kernel/fork.c

index 52e725d..3cb4853 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -302,6 +302,7 @@ int arch_task_struct_size __read_mostly;
  
  void __init fork_init(void)
  {
+       int i;
  #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
  #ifndef ARCH_MIN_TASKALIGN
  #define ARCH_MIN_TASKALIGN     L1_CACHE_BYTES
@@ -321,6 +322,10 @@ void __init fork_init(void)
         init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
         init_task.signal->rlim[RLIMIT_SIGPENDING] =
                 init_task.signal->rlim[RLIMIT_NPROC];
+
+       for (i = 0; i < UCOUNT_COUNTS; i++) {
+               init_user_ns.ucount_max[i] = max_threads/2;
+       }
  }
  
  int __weak arch_dup_task_struct(struct task_struct *dst,
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c

index 4fa2d56..df9e8e9 100644 (file)
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -79,23 +79,36 @@ static void proc_cleanup_work(struct work_struct *work)
  /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
  #define MAX_PID_NS_LEVEL 32
  
+static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
+{
+       return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
+}
+
+static void dec_pid_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
+}
+
  static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
         struct pid_namespace *parent_pid_ns)
  {
         struct pid_namespace *ns;
         unsigned int level = parent_pid_ns->level + 1;
+       struct ucounts *ucounts;
         int i;
         int err;
  
-       if (level > MAX_PID_NS_LEVEL) {
-               err = -EINVAL;
+       err = -ENOSPC;
+       if (level > MAX_PID_NS_LEVEL)
+               goto out;
+       ucounts = inc_pid_namespaces(user_ns);
+       if (!ucounts)
                 goto out;
-       }
  
         err = -ENOMEM;
         ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
         if (ns == NULL)
-               goto out;
+               goto out_dec;
  
         ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
         if (!ns->pidmap[0].page)
@@ -114,6 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
         ns->level = level;
         ns->parent = get_pid_ns(parent_pid_ns);
         ns->user_ns = get_user_ns(user_ns);
+       ns->ucounts = ucounts;
         ns->nr_hashed = PIDNS_HASH_ADDING;
         INIT_WORK(&ns->proc_work, proc_cleanup_work);
  
@@ -129,6 +143,8 @@ out_free_map:
         kfree(ns->pidmap[0].page);
  out_free:
         kmem_cache_free(pid_ns_cachep, ns);
+out_dec:
+       dec_pid_namespaces(ucounts);
  out:
         return ERR_PTR(err);
  }
@@ -146,6 +162,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
         ns_free_inum(&ns->ns);
         for (i = 0; i < PIDMAP_ENTRIES; i++)
                 kfree(ns->pidmap[i].page);
+       dec_pid_namespaces(ns->ucounts);
         put_user_ns(ns->user_ns);
         call_rcu(&ns->rcu, delayed_free_pidns);
  }
diff --git a/kernel/ucount.c b/kernel/ucount.c

new file mode 100644 (file)

index 0000000..9d20d5d
--- /dev/null
+++ b/kernel/ucount.c
@@ -0,0 +1,235 @@
+/*
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ */
+
+#include <linux/stat.h>
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/user_namespace.h>
+
+#define UCOUNTS_HASHTABLE_BITS 10
+static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
+static DEFINE_SPINLOCK(ucounts_lock);
+
+#define ucounts_hashfn(ns, uid)                                                \
+       hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \
+                 UCOUNTS_HASHTABLE_BITS)
+#define ucounts_hashentry(ns, uid)     \
+       (ucounts_hashtable + ucounts_hashfn(ns, uid))
+
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_set *
+set_lookup(struct ctl_table_root *root)
+{
+       return &current_user_ns()->set;
+}
+
+static int set_is_seen(struct ctl_table_set *set)
+{
+       return &current_user_ns()->set == set;
+}
+
+static int set_permissions(struct ctl_table_header *head,
+                                 struct ctl_table *table)
+{
+       struct user_namespace *user_ns =
+               container_of(head->set, struct user_namespace, set);
+       int mode;
+
+       /* Allow users with CAP_SYS_RESOURCE unrestrained access */
+       if (ns_capable(user_ns, CAP_SYS_RESOURCE))
+               mode = (table->mode & S_IRWXU) >> 6;
+       else
+       /* Allow all others at most read-only access */
+               mode = table->mode & S_IROTH;
+       return (mode << 6) | (mode << 3) | mode;
+}
+
+static struct ctl_table_root set_root = {
+       .lookup = set_lookup,
+       .permissions = set_permissions,
+};
+
+static int zero = 0;
+static int int_max = INT_MAX;
+#define UCOUNT_ENTRY(name)                             \
+       {                                               \
+               .procname       = name,                 \
+               .maxlen         = sizeof(int),          \
+               .mode           = 0644,                 \
+               .proc_handler   = proc_dointvec_minmax, \
+               .extra1         = &zero,                \
+               .extra2         = &int_max,             \
+       }
+static struct ctl_table user_table[] = {
+       UCOUNT_ENTRY("max_user_namespaces"),
+       UCOUNT_ENTRY("max_pid_namespaces"),
+       UCOUNT_ENTRY("max_uts_namespaces"),
+       UCOUNT_ENTRY("max_ipc_namespaces"),
+       UCOUNT_ENTRY("max_net_namespaces"),
+       UCOUNT_ENTRY("max_mnt_namespaces"),
+       UCOUNT_ENTRY("max_cgroup_namespaces"),
+       { }
+};
+#endif /* CONFIG_SYSCTL */
+
+bool setup_userns_sysctls(struct user_namespace *ns)
+{
+#ifdef CONFIG_SYSCTL
+       struct ctl_table *tbl;
+       setup_sysctl_set(&ns->set, &set_root, set_is_seen);
+       tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL);
+       if (tbl) {
+               int i;
+               for (i = 0; i < UCOUNT_COUNTS; i++) {
+                       tbl[i].data = &ns->ucount_max[i];
+               }
+               ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl);
+       }
+       if (!ns->sysctls) {
+               kfree(tbl);
+               retire_sysctl_set(&ns->set);
+               return false;
+       }
+#endif
+       return true;
+}
+
+void retire_userns_sysctls(struct user_namespace *ns)
+{
+#ifdef CONFIG_SYSCTL
+       struct ctl_table *tbl;
+
+       tbl = ns->sysctls->ctl_table_arg;
+       unregister_sysctl_table(ns->sysctls);
+       retire_sysctl_set(&ns->set);
+       kfree(tbl);
+#endif
+}
+
+static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent)
+{
+       struct ucounts *ucounts;
+
+       hlist_for_each_entry(ucounts, hashent, node) {
+               if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns))
+                       return ucounts;
+       }
+       return NULL;
+}
+
+static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
+{
+       struct hlist_head *hashent = ucounts_hashentry(ns, uid);
+       struct ucounts *ucounts, *new;
+
+       spin_lock(&ucounts_lock);
+       ucounts = find_ucounts(ns, uid, hashent);
+       if (!ucounts) {
+               spin_unlock(&ucounts_lock);
+
+               new = kzalloc(sizeof(*new), GFP_KERNEL);
+               if (!new)
+                       return NULL;
+
+               new->ns = ns;
+               new->uid = uid;
+               atomic_set(&new->count, 0);
+
+               spin_lock(&ucounts_lock);
+               ucounts = find_ucounts(ns, uid, hashent);
+               if (ucounts) {
+                       kfree(new);
+               } else {
+                       hlist_add_head(&new->node, hashent);
+                       ucounts = new;
+               }
+       }
+       if (!atomic_add_unless(&ucounts->count, 1, INT_MAX))
+               ucounts = NULL;
+       spin_unlock(&ucounts_lock);
+       return ucounts;
+}
+
+static void put_ucounts(struct ucounts *ucounts)
+{
+       if (atomic_dec_and_test(&ucounts->count)) {
+               spin_lock(&ucounts_lock);
+               hlist_del_init(&ucounts->node);
+               spin_unlock(&ucounts_lock);
+
+               kfree(ucounts);
+       }
+}
+
+static inline bool atomic_inc_below(atomic_t *v, int u)
+{
+       int c, old;
+       c = atomic_read(v);
+       for (;;) {
+               if (unlikely(c >= u))
+                       return false;
+               old = atomic_cmpxchg(v, c, c+1);
+               if (likely(old == c))
+                       return true;
+               c = old;
+       }
+}
+
+struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
+                          enum ucount_type type)
+{
+       struct ucounts *ucounts, *iter, *bad;
+       struct user_namespace *tns;
+       ucounts = get_ucounts(ns, uid);
+       for (iter = ucounts; iter; iter = tns->ucounts) {
+               int max;
+               tns = iter->ns;
+               max = READ_ONCE(tns->ucount_max[type]);
+               if (!atomic_inc_below(&iter->ucount[type], max))
+                       goto fail;
+       }
+       return ucounts;
+fail:
+       bad = iter;
+       for (iter = ucounts; iter != bad; iter = iter->ns->ucounts)
+               atomic_dec(&iter->ucount[type]);
+
+       put_ucounts(ucounts);
+       return NULL;
+}
+
+void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
+{
+       struct ucounts *iter;
+       for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+               int dec = atomic_dec_if_positive(&iter->ucount[type]);
+               WARN_ON_ONCE(dec < 0);
+       }
+       put_ucounts(ucounts);
+}
+
+static __init int user_namespace_sysctl_init(void)
+{
+#ifdef CONFIG_SYSCTL
+       static struct ctl_table_header *user_header;
+       static struct ctl_table empty[1];
+       /*
+        * It is necessary to register the user directory in the
+        * default set so that registrations in the child sets work
+        * properly.
+        */
+       user_header = register_sysctl("user", empty);
+       BUG_ON(!user_header);
+       BUG_ON(!setup_userns_sysctls(&init_user_ns));
+#endif
+       return 0;
+}
+subsys_initcall(user_namespace_sysctl_init);
+
+
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c

index a58a219..86b7854 100644 (file)
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -29,6 +29,17 @@ static DEFINE_MUTEX(userns_state_mutex);
  static bool new_idmap_permitted(const struct file *file,
                                 struct user_namespace *ns, int cap_setid,
                                 struct uid_gid_map *map);
+static void free_user_ns(struct work_struct *work);
+
+static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid)
+{
+       return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES);
+}
+
+static void dec_user_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); /* dec_ucount() is void: nothing to return */
+}
  
  static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
  {
@@ -62,10 +73,16 @@ int create_user_ns(struct cred *new)
         struct user_namespace *ns, *parent_ns = new->user_ns;
         kuid_t owner = new->euid;
         kgid_t group = new->egid;
-       int ret;
+       struct ucounts *ucounts;
+       int ret, i;
  
+       ret = -ENOSPC;
         if (parent_ns->level > 32)
-               return -EUSERS;
+               goto fail;
+
+       ucounts = inc_user_namespaces(parent_ns, owner);
+       if (!ucounts)
+               goto fail;
  
         /*
          * Verify that we can not violate the policy of which files
@@ -73,26 +90,27 @@ int create_user_ns(struct cred *new)
          * by verifing that the root directory is at the root of the
          * mount namespace which allows all files to be accessed.
          */
+       ret = -EPERM;
         if (current_chrooted())
-               return -EPERM;
+               goto fail_dec;
  
         /* The creator needs a mapping in the parent user namespace
          * or else we won't be able to reasonably tell userspace who
          * created a user_namespace.
          */
+       ret = -EPERM;
         if (!kuid_has_mapping(parent_ns, owner) ||
             !kgid_has_mapping(parent_ns, group))
-               return -EPERM;
+               goto fail_dec;
  
+       ret = -ENOMEM;
         ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
         if (!ns)
-               return -ENOMEM;
+               goto fail_dec;
  
         ret = ns_alloc_inum(&ns->ns);
-       if (ret) {
-               kmem_cache_free(user_ns_cachep, ns);
-               return ret;
-       }
+       if (ret)
+               goto fail_free;
         ns->ns.ops = &userns_operations;
  
         atomic_set(&ns->count, 1);
@@ -101,18 +119,37 @@ int create_user_ns(struct cred *new)
         ns->level = parent_ns->level + 1;
         ns->owner = owner;
         ns->group = group;
+       INIT_WORK(&ns->work, free_user_ns);
+       for (i = 0; i < UCOUNT_COUNTS; i++) {
+               ns->ucount_max[i] = INT_MAX;
+       }
+       ns->ucounts = ucounts;
  
         /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
         mutex_lock(&userns_state_mutex);
         ns->flags = parent_ns->flags;
         mutex_unlock(&userns_state_mutex);
  
-       set_cred_user_ns(new, ns);
-
  #ifdef CONFIG_PERSISTENT_KEYRINGS
         init_rwsem(&ns->persistent_keyring_register_sem);
  #endif
+       ret = -ENOMEM;
+       if (!setup_userns_sysctls(ns))
+               goto fail_keyring;
+
+       set_cred_user_ns(new, ns);
         return 0;
+fail_keyring:
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+       key_put(ns->persistent_keyring_register);
+#endif
+       ns_free_inum(&ns->ns);
+fail_free:
+       kmem_cache_free(user_ns_cachep, ns);
+fail_dec:
+       dec_user_namespaces(ucounts);
+fail:
+       return ret;
  }
  
  int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
@@ -135,21 +172,30 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
         return err;
  }
  
-void free_user_ns(struct user_namespace *ns)
+static void free_user_ns(struct work_struct *work)
  {
-       struct user_namespace *parent;
+       struct user_namespace *parent, *ns =
+               container_of(work, struct user_namespace, work);
  
         do {
+               struct ucounts *ucounts = ns->ucounts;
                 parent = ns->parent;
+               retire_userns_sysctls(ns);
  #ifdef CONFIG_PERSISTENT_KEYRINGS
                 key_put(ns->persistent_keyring_register);
  #endif
                 ns_free_inum(&ns->ns);
                 kmem_cache_free(user_ns_cachep, ns);
+               dec_user_namespaces(ucounts);
                 ns = parent;
         } while (atomic_dec_and_test(&parent->count));
  }
-EXPORT_SYMBOL(free_user_ns);
+
+void __put_user_ns(struct user_namespace *ns)
+{
+       schedule_work(&ns->work);
+}
+EXPORT_SYMBOL(__put_user_ns);
  
  static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
  {
diff --git a/kernel/utsname.c b/kernel/utsname.c

index e1211a8..6976cd4 100644 (file)
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -17,6 +17,16 @@
  #include <linux/user_namespace.h>
  #include <linux/proc_ns.h>
  
+static struct ucounts *inc_uts_namespaces(struct user_namespace *ns)
+{
+       return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES);
+}
+
+static void dec_uts_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES);
+}
+
  static struct uts_namespace *create_uts_ns(void)
  {
         struct uts_namespace *uts_ns;
@@ -36,18 +46,24 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
                                           struct uts_namespace *old_ns)
  {
         struct uts_namespace *ns;
+       struct ucounts *ucounts;
         int err;
  
+       err = -ENOSPC;
+       ucounts = inc_uts_namespaces(user_ns);
+       if (!ucounts)
+               goto fail;
+
+       err = -ENOMEM;
         ns = create_uts_ns();
         if (!ns)
-               return ERR_PTR(-ENOMEM);
+               goto fail_dec;
  
         err = ns_alloc_inum(&ns->ns);
-       if (err) {
-               kfree(ns);
-               return ERR_PTR(err);
-       }
+       if (err)
+               goto fail_free;
  
+       ns->ucounts = ucounts;
         ns->ns.ops = &utsns_operations;
  
         down_read(&uts_sem);
@@ -55,6 +71,13 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
         ns->user_ns = get_user_ns(user_ns);
         up_read(&uts_sem);
         return ns;
+
+fail_free:
+       kfree(ns);
+fail_dec:
+       dec_uts_namespaces(ucounts);
+fail:
+       return ERR_PTR(err);
  }
  
  /*
@@ -85,6 +108,7 @@ void free_uts_ns(struct kref *kref)
         struct uts_namespace *ns;
  
         ns = container_of(kref, struct uts_namespace, kref);
+       dec_uts_namespaces(ns->ucounts);
         put_user_ns(ns->user_ns);
         ns_free_inum(&ns->ns);
         kfree(ns);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c

index 861efa3..e8be581 100644 (file)
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -266,6 +266,16 @@ struct net *get_net_ns_by_id(struct net *net, int id)
         return peer;
  }
  
+static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
+{
+       return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES);
+}
+
+static void dec_net_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
+}
+
  /*
   * setup_net runs the initializers for the network namespace object.
   */
@@ -351,19 +361,27 @@ void net_drop_ns(void *p)
  struct net *copy_net_ns(unsigned long flags,
                         struct user_namespace *user_ns, struct net *old_net)
  {
+       struct ucounts *ucounts;
         struct net *net;
         int rv;
  
         if (!(flags & CLONE_NEWNET))
                 return get_net(old_net);
  
+       ucounts = inc_net_namespaces(user_ns);
+       if (!ucounts)
+               return ERR_PTR(-ENOSPC);
+
         net = net_alloc();
-       if (!net)
+       if (!net) {
+               dec_net_namespaces(ucounts);
                 return ERR_PTR(-ENOMEM);
+       }
  
         get_user_ns(user_ns);
  
         mutex_lock(&net_mutex);
+       net->ucounts = ucounts;
         rv = setup_net(net, user_ns);
         if (rv == 0) {
                 rtnl_lock();
@@ -372,6 +390,7 @@ struct net *copy_net_ns(unsigned long flags,
         }
         mutex_unlock(&net_mutex);
         if (rv < 0) {
+               dec_net_namespaces(ucounts);
                 put_user_ns(user_ns);
                 net_drop_ns(net);
                 return ERR_PTR(rv);
@@ -444,6 +463,7 @@ static void cleanup_net(struct work_struct *work)
         /* Finally it is safe to free my network namespace structure */
         list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
                 list_del_init(&net->exit_list);
+               dec_net_namespaces(net->ucounts);
                 put_user_ns(net->user_ns);
                 net_drop_ns(net);
         }
diff --git a/net/sysctl_net.c b/net/sysctl_net.c

index 46a71c7..ba9b5d1 100644 (file)
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -27,9 +27,9 @@
  #endif
  
  static struct ctl_table_set *
-net_ctl_header_lookup(struct ctl_table_root *root, struct nsproxy *namespaces)
+net_ctl_header_lookup(struct ctl_table_root *root)
  {
-       return &namespaces->net_ns->sysctls;
+       return &current->nsproxy->net_ns->sysctls;
  }
  
  static int is_seen(struct ctl_table_set *set)
author	Eric W. Biederman <ebiederm@xmission.com>
	Fri, 23 Sep 2016 01:00:36 +0000 (20:00 -0500)
committer	Eric W. Biederman <ebiederm@xmission.com>
	Fri, 23 Sep 2016 01:00:36 +0000 (20:00 -0500)
Documentation/sysctl/README		patch \| blob \| history
Documentation/sysctl/user.txt	[new file with mode: 0644]	patch \| blob
fs/devpts/inode.c		patch \| blob \| history
fs/mount.h		patch \| blob \| history
fs/namespace.c		patch \| blob \| history
fs/proc/proc_sysctl.c		patch \| blob \| history
include/linux/cgroup.h		patch \| blob \| history
include/linux/ipc_namespace.h		patch \| blob \| history
include/linux/pid_namespace.h		patch \| blob \| history
include/linux/sysctl.h		patch \| blob \| history
include/linux/user_namespace.h		patch \| blob \| history
include/linux/utsname.h		patch \| blob \| history
include/net/net_namespace.h		patch \| blob \| history
ipc/namespace.c		patch \| blob \| history
kernel/Makefile		patch \| blob \| history
kernel/cgroup.c		patch \| blob \| history
kernel/fork.c		patch \| blob \| history
kernel/pid_namespace.c		patch \| blob \| history
kernel/ucount.c	[new file with mode: 0644]	patch \| blob
kernel/user_namespace.c		patch \| blob \| history
kernel/utsname.c		patch \| blob \| history
net/core/net_namespace.c		patch \| blob \| history
net/sysctl_net.c		patch \| blob \| history